diff --git a/.claude/commands/opsx/apply.md b/.claude/commands/opsx/apply.md
deleted file mode 100644
index c3f1a50..0000000
--- a/.claude/commands/opsx/apply.md
+++ /dev/null
@@ -1,94 +0,0 @@
-# OPSX: Apply
-
-Implement tasks from an OpenSpec change.
-
-**Input**: Optionally specify a change name (e.g., `/opsx:apply add-auth`). If omitted, check if it can be inferred from conversation context. If vague or ambiguous you MUST prompt for available changes.
-
-**Steps**
-
-1. **Select the change**
-
-   If a name is provided, use it. Otherwise:
-   - Infer from conversation context if the user mentioned a change
-   - Auto-select if only one active change exists
-   - If ambiguous, list available changes from `openspec/changes/` and use **AskUserQuestion** to let the user select
-
-   Always announce: "Using change: <name>" and how to override.
-
-2. **Read change artifacts**
-
-   Read all available artifacts from `openspec/changes/<name>/`:
-   - proposal.md (what & why)
-   - specs/ (requirements delta)
-   - design.md (how)
-   - tasks.md (implementation checklist)
-
-3. **Show current progress**
-
-   Display:
-   - Change name and purpose
-   - Progress: "N/M tasks complete"
-   - Remaining tasks overview
-
-4. **Implement tasks (loop until done or blocked)**
-
-   For each pending task:
-   - Show which task is being worked on
-   - Make the code changes required
-   - Keep changes minimal and focused
-   - Mark task complete in tasks.md: `- [ ]` → `- [x]`
-   - Continue to next task
-
-   **Pause if:**
-   - Task is unclear → ask for clarification
-   - Implementation reveals a design issue → suggest updating artifacts
-   - Error or blocker encountered → report and wait for guidance
-   - User interrupts
-
-5. **On completion or pause, show status**
-
-   Display:
-   - Tasks completed this session
-   - Overall progress: "N/M tasks complete"
-   - If all done: suggest archive
-   - If paused: explain why and wait for guidance
-
-**Output During Implementation**
-
-```
-## Implementing: <change-name>
-
-Working on task 3/7: <task description>
-[...implementation happening...]
-✓ Task complete
-
-Working on task 4/7: <task description>
-[...implementation happening...]
-✓ Task complete
-```
-
-**Output On Completion**
-
-```
-## Implementation Complete
-
-**Change:** <change-name>
-**Progress:** 7/7 tasks complete ✓
-
-### Completed This Session
-- [x] Task 1
-- [x] Task 2
-...
-
-All tasks complete! You can archive this change with `/opsx:archive`.
-```
-
-**Guardrails**
-
-- Keep going through tasks until done or blocked
-- Always read context files before starting
-- If task is ambiguous, pause and ask before implementing
-- If implementation reveals issues, pause and suggest artifact updates
-- Keep code changes minimal and scoped to each task
-- Update task checkbox immediately after completing each task
-- Pause on errors, blockers, or unclear requirements - don't guess
diff --git a/.claude/commands/opsx/archive.md b/.claude/commands/opsx/archive.md
deleted file mode 100644
index d2ae873..0000000
--- a/.claude/commands/opsx/archive.md
+++ /dev/null
@@ -1,97 +0,0 @@
-# OPSX: Archive
-
-Archive a completed change in the OpenSpec workflow.
-
-**Input**: Optionally specify a change name after `/opsx:archive` (e.g., `/opsx:archive add-auth`). If omitted, check if it can be inferred from conversation context. If vague or ambiguous you MUST prompt for available changes.
-
-**Steps**
-
-1. **If no change name provided, prompt for selection**
-
-   List available changes from `openspec/changes/` (excluding archive directory).
-   Use **AskUserQuestion** to let the user select.
-
-   **IMPORTANT**: Do NOT guess or auto-select a change. Always let the user choose.
-
-2. **Check task completion status**
-
-   Read `openspec/changes/<name>/tasks.md` to check for incomplete tasks.
-
-   Count tasks marked with `- [ ]` (incomplete) vs `- [x]` (complete).
-
-   **If incomplete tasks found:**
-   - Display warning showing count of incomplete tasks
-   - Prompt user for confirmation to proceed
-   - Proceed if user confirms
-
-3. **Assess delta spec sync state**
-
-   Check for delta specs at `openspec/changes/<name>/specs/`. If none exist, proceed without sync prompt.
-
-   **If delta specs exist:**
-   - Compare each delta spec with its corresponding main spec at `openspec/specs/<capability>/`
-   - Determine what changes would be applied (adds, modifications, removals)
-   - Show a summary before prompting
-
-   **Prompt options:**
-   - "Sync now (recommended)" - Merge delta specs into main specs
-   - "Archive without syncing"
-
-   If user chooses sync, merge delta specs into main specs before archiving.
-
-4. **Perform the archive**
-
-   Create the archive directory if it doesn't exist:
-   ```bash
-   mkdir -p openspec/archive
-   ```
-
-   Generate target name using current date: `YYYY-MM-DD-<change-name>`
-
-   Move the change directory to archive:
-   ```bash
-   mv openspec/changes/<name> openspec/archive/YYYY-MM-DD-<name>
-   ```
-
-5. **Display summary**
-
-   Show archive completion summary including:
-   - Change name
-   - Archive location
-   - Whether specs were synced (if applicable)
-   - Note about any warnings (incomplete tasks)
-
-**Output On Success**
-
-```
-## Archive Complete
-
-**Change:** <change-name>
-**Archived to:** openspec/archive/YYYY-MM-DD-<name>/
-**Specs:** ✓ Synced to main specs (or "No delta specs" or "Sync skipped")
-
-All artifacts complete. All tasks complete.
-```
-
-**Output On Success With Warnings**
-
-```
-## Archive Complete (with warnings)
-
-**Change:** <change-name>
-**Archived to:** openspec/archive/YYYY-MM-DD-<name>/
-**Specs:** Sync skipped (user chose to skip)
-
-**Warnings:**
-- Archived with 3 incomplete tasks
-- Delta spec sync was skipped
-
-Review the archive if this was not intentional.
-```
-
-**Guardrails**
-
-- Always prompt for change selection if not provided
-- Don't block archive on warnings - just inform and confirm
-- Preserve all files when moving to archive
-- Show clear summary of what happened
diff --git a/.claude/commands/opsx/explore.md b/.claude/commands/opsx/explore.md
deleted file mode 100644
index 4dc3f34..0000000
--- a/.claude/commands/opsx/explore.md
+++ /dev/null
@@ -1,125 +0,0 @@
-# OPSX: Explore
-
-Enter explore mode. Think deeply. Visualize freely. Follow the conversation wherever it goes.
-
-**IMPORTANT: Explore mode is for thinking, not implementing.** You may read files, search code, and investigate the codebase, but you must NEVER write code or implement features. If the user asks you to implement something, remind them to exit explore mode first and create a change proposal. You MAY create OpenSpec artifacts (proposals, designs, specs) if the user asks—that's capturing thinking, not implementing.
-
-**This is a stance, not a workflow.** There are no fixed steps, no required sequence, no mandatory outputs. You're a thinking partner helping the user explore.
-
-**Input**: The argument after `/opsx:explore` is whatever the user wants to think about. Could be:
-- A vague idea: "real-time collaboration"
-- A specific problem: "the auth system is getting unwieldy"
-- A change name: "add-dark-mode" (to explore in context of that change)
-- A comparison: "postgres vs sqlite for this"
-- Nothing (just enter explore mode)
-
----
-
-## The Stance
-
-- **Curious, not prescriptive** - Ask questions that emerge naturally, don't follow a script
-- **Open threads, not interrogations** - Surface multiple interesting directions and let the user follow what resonates
-- **Visual** - Use ASCII diagrams liberally when they'd help clarify thinking
-- **Adaptive** - Follow interesting threads, pivot when new information emerges
-- **Patient** - Don't rush to conclusions, let the shape of the problem emerge
-- **Grounded** - Explore the actual codebase when relevant, don't just theorize
-
----
-
-## What You Might Do
-
-Depending on what the user brings, you might:
-
-**Explore the problem space**
-- Ask clarifying questions that emerge from what they said
-- Challenge assumptions
-- Reframe the problem
-- Find analogies
-
-**Investigate the codebase**
-- Map existing architecture relevant to the discussion
-- Find integration points
-- Identify patterns already in use
-- Surface hidden complexity
-
-**Compare options**
-- Brainstorm multiple approaches
-- Build comparison tables
-- Sketch tradeoffs
-- Recommend a path (if asked)
-
-**Visualize**
-```
-┌─────────────────────────────────────────┐
-│     Use ASCII diagrams liberally        │
-├─────────────────────────────────────────┤
-│                                         │
-│      ┌────────┐         ┌────────┐      │
-│      │ State  │────────▶│ State  │      │
-│      │   A    │         │   B    │      │
-│      └────────┘         └────────┘      │
-│                                         │
-│   System diagrams, state machines,      │
-│   data flows, architecture sketches,    │
-│   dependency graphs, comparison tables  │
-│                                         │
-└─────────────────────────────────────────┘
-```
-
-**Surface risks and unknowns**
-- Identify what could go wrong
-- Find gaps in understanding
-- Suggest spikes or investigations
-
----
-
-## OpenSpec Awareness
-
-You have full context of the OpenSpec system. Use it naturally, don't force it.
-
-### Check for context
-
-At the start, quickly check what exists:
-- List `openspec/changes/` for active changes
-- List `openspec/specs/` for existing specifications
-
-### When no change exists
-
-Think freely. When insights crystallize, you might offer:
-
-- "This feels solid enough to start a change. Want me to create a proposal?"
-- Or keep exploring - no pressure to formalize
-
-### When a change exists
-
-If the user mentions a change or you detect one is relevant:
-
-1. **Read existing artifacts for context**
-   - `openspec/changes/<name>/proposal.md`
-   - `openspec/changes/<name>/design.md`
-   - `openspec/changes/<name>/tasks.md`
-
-2. **Reference them naturally in conversation**
-
-3. **Offer to capture when decisions are made**
-
-    | Insight Type               | Where to Capture               |
-    |----------------------------|--------------------------------|
-    | New requirement discovered | `specs/<capability>/spec.md` |
-    | Requirement changed        | `specs/<capability>/spec.md` |
-    | Design decision made       | `design.md`                  |
-    | Scope changed              | `proposal.md`                |
-    | New work identified        | `tasks.md`                   |
-
----
-
-## Guardrails
-
-- **Don't implement** - Never write code or implement features
-- **Don't fake understanding** - If something is unclear, dig deeper
-- **Don't rush** - Discovery is thinking time, not task time
-- **Don't force structure** - Let patterns emerge naturally
-- **Don't auto-capture** - Offer to save insights, don't just do it
-- **Do visualize** - A good diagram is worth many paragraphs
-- **Do explore the codebase** - Ground discussions in reality
-- **Do question assumptions** - Including the user's and your own
diff --git a/.claude/commands/opsx/propose.md b/.claude/commands/opsx/propose.md
deleted file mode 100644
index 3de5cb2..0000000
--- a/.claude/commands/opsx/propose.md
+++ /dev/null
@@ -1,82 +0,0 @@
-# OPSX: Propose
-
-Propose a new change - create the change and generate all artifacts in one step.
-
-I'll create a change with artifacts:
-- proposal.md (what & why)
-- design.md (how)
-- tasks.md (implementation steps)
-
-When ready to implement, run /opsx:apply
-
----
-
-**Input**: The argument after `/opsx:propose` is the change name (kebab-case), OR a description of what the user wants to build.
-
-**Steps**
-
-1. **If no input provided, ask what they want to build**
-
-   Use the **AskUserQuestion tool** (open-ended, no preset options) to ask:
-   > "What change do you want to work on? Describe what you want to build or fix."
-
-   From their description, derive a kebab-case name (e.g., "add user authentication" → `add-user-auth`).
-
-   **IMPORTANT**: Do NOT proceed without understanding what the user wants to build.
-
-2. **Create the change directory**
-   ```bash
-   mkdir -p openspec/changes/<name>
-   ```
-   This creates a scaffolded change at `openspec/changes/<name>/`.
-
-3. **Read existing specs for context**
-   - Check `openspec/specs/` for related specifications
-   - Understand what capabilities already exist
-   - Identify which specs might be modified
-
-4. **Create artifacts in sequence**
-
-   a. **Create proposal.md**:
-      - Why: Problem or opportunity being addressed
-      - What Changes: Specific changes (new, modified, removed)
-      - Capabilities: Which specs affected
-      - Impact: Affected code, APIs, dependencies
-
-   b. **Create specs delta** (if modifying existing specs):
-      - Create `openspec/changes/<name>/specs/<capability>/spec.md`
-      - Use ADDED/MODIFIED/REMOVED format
-
-   c. **Create design.md**:
-      - Context: Background and current state
-      - Goals/Non-Goals: What this aims to achieve
-      - Decisions: Key technical choices with rationale
-      - Risks/Trade-offs: Known limitations
-
-   d. **Create tasks.md**:
-      - Group related tasks under numbered headings
-      - Each task as checkbox: `- [ ] X.Y Task description`
-      - Order by dependency
-
-5. **Show final status**
-
-   Summarize:
-   - Change name and location
-   - List of artifacts created
-   - What's ready: "All artifacts created! Ready for implementation."
-   - Prompt: "Run `/opsx:apply` to start implementing."
-
-**Artifact Creation Guidelines**
-
-- Follow OpenSpec schema for each artifact type
-- Read existing specs before creating delta specs
-- Keep proposal concise (1-2 pages)
-- Design should explain "how", not restate "what"
-- Tasks should be completable in one session
-
-**Guardrails**
-
-- Create ALL artifacts needed for implementation
-- Always read existing specs before proposing modifications
-- If context is critically unclear, ask the user
-- If a change with that name already exists, ask if user wants to continue or create new
diff --git a/.claude/commands/opsx/status.md b/.claude/commands/opsx/status.md
deleted file mode 100644
index 5a745f7..0000000
--- a/.claude/commands/opsx/status.md
+++ /dev/null
@@ -1,71 +0,0 @@
-# OPSX: Status
-
-Show the status of OpenSpec changes and specs.
-
-**Input**: Optionally specify a change name to see detailed status of that change.
-
-**Steps**
-
-1. **If no change name provided, show project overview**
-
-   List all active changes from `openspec/changes/`:
-   ```
-   ## OpenSpec Status
-
-   ### Active Changes
-   | Change | Artifacts | Tasks | Status |
-   |--------|-----------|-------|--------|
-   | add-auth | 4/4 | 5/8 | In Progress |
-   | fix-memory | 3/4 | 0/3 | Planning |
-
-   ### Specs
-   | Type | Count | Location |
-   |------|-------|----------|
-   | Product | 2 | openspec/specs/product/ |
-   | Architecture | 8 | openspec/specs/architecture/ |
-   | API | 2 | openspec/specs/api/ |
-   | Data | 2 | openspec/specs/data/ |
-   | Testing | 2 | openspec/specs/testing/ |
-
-   ### Archive
-   N changes archived
-   ```
-
-2. **If change name provided, show detailed status**
-
-   Read all artifacts from `openspec/changes/<name>/`:
-
-   ```
-   ## Change: <change-name>
-
-   ### Artifacts
-   - [x] proposal.md
-   - [x] specs/
-   - [x] design.md
-   - [ ] tasks.md
-
-   ### Tasks (3/7 complete)
-   - [x] 1.1 Setup database
-   - [x] 1.2 Create models
-   - [x] 2.1 Implement auth
-   - [ ] 2.2 Add tests
-   - [ ] 2.3 Update docs
-   - [ ] 3.1 Integration test
-   - [ ] 3.2 Deploy
-
-   ### Delta Specs
-   - user-auth/spec.md (ADDED: 3 requirements)
-   ```
-
-3. **Show next steps**
-
-   Based on current state, suggest:
-   - If artifacts incomplete: "Run `/opsx:propose` to complete artifacts"
-   - If tasks pending: "Run `/opsx:apply` to continue implementation"
-   - If all tasks complete: "Run `/opsx:archive` to archive this change"
-
-**Guardrails**
-
-- Provide clear, actionable next steps
-- Show progress visually (checkmarks, counts)
-- Highlight blockers if any
diff --git a/.claude/commands/opsx/verify.md b/.claude/commands/opsx/verify.md
deleted file mode 100644
index c0cb20d..0000000
--- a/.claude/commands/opsx/verify.md
+++ /dev/null
@@ -1,121 +0,0 @@
-# OPSX: Verify
-
-Verify that an implementation matches the change artifacts (specs, tasks, design).
-
-**Input**: Optionally specify a change name after `/opsx:verify` (e.g., `/opsx:verify add-auth`). If omitted, check if it can be inferred from conversation context. If vague or ambiguous you MUST prompt for available changes.
-
-**Steps**
-
-1. **If no change name provided, prompt for selection**
-
-   List available changes from `openspec/changes/`.
-   Use **AskUserQuestion** to let the user select.
-
-   **IMPORTANT**: Do NOT guess or auto-select a change. Always let the user choose.
-
-2. **Read change artifacts**
-
-   Read all available artifacts from `openspec/changes/<name>/`:
-   - proposal.md
-   - specs/ (delta specs)
-   - design.md
-   - tasks.md
-
-3. **Initialize verification report structure**
-
-   Create a report structure with three dimensions:
-   - **Completeness**: Track tasks and spec coverage
-   - **Correctness**: Track requirement implementation and scenario coverage
-   - **Coherence**: Track design adherence and pattern consistency
-
-   Each dimension can have CRITICAL, WARNING, or SUGGESTION issues.
-
-4. **Verify Completeness**
-
-   **Task Completion**:
-   - Read tasks.md
-   - Parse checkboxes: `- [ ]` (incomplete) vs `- [x]` (complete)
-   - Count complete vs total tasks
-   - If incomplete tasks exist:
-     - Add CRITICAL issue for each incomplete task
-     - Recommendation: "Complete task: <description>" or "Mark as done if already implemented"
-
-   **Spec Coverage**:
-   - If delta specs exist in `openspec/changes/<name>/specs/`:
-     - Extract all requirements (marked with "### Requirement:")
-     - For each requirement, search codebase for implementation evidence
-     - If requirements appear unimplemented:
-       - Add CRITICAL issue: "Requirement not found: <requirement name>"
-
-5. **Verify Correctness**
-
-   **Requirement Implementation Mapping**:
-   - For each requirement from delta specs:
-     - Search codebase for implementation evidence
-     - If found, note file paths and line ranges
-     - Assess if implementation matches requirement intent
-     - If divergence detected:
-       - Add WARNING: "Implementation may diverge from spec: <details>"
-
-   **Scenario Coverage**:
-   - For each scenario in delta specs (marked with "#### Scenario:"):
-     - Check if conditions are handled in code
-     - Check if tests exist covering the scenario
-     - If scenario appears uncovered:
-       - Add WARNING: "Scenario not covered: <scenario name>"
-
-6. **Verify Coherence**
-
-   **Design Adherence**:
-   - If design.md exists:
-     - Extract key decisions
-     - Verify implementation follows those decisions
-     - If contradiction detected:
-       - Add WARNING: "Design decision not followed: <decision>"
-
-   **Code Pattern Consistency**:
-   - Review new code for consistency with project patterns
-   - If significant deviations found:
-     - Add SUGGESTION: "Code pattern deviation: <details>"
-
-7. **Generate Verification Report**
-
-   **Summary Scorecard**:
-   ```
-   ## Verification Report: <change-name>
-
-   ### Summary
-   | Dimension    | Status           |
-   |--------------|------------------|
-   | Completeness | X/Y tasks, N reqs|
-   | Correctness  | M/N reqs covered |
-   | Coherence    | Followed/Issues  |
-   ```
-
-   **Issues by Priority**:
-
-   1. **CRITICAL** (Must fix before archive):
-      - Incomplete tasks
-      - Missing requirement implementations
-      - Each with specific, actionable recommendation
-
-   2. **WARNING** (Should fix):
-      - Spec/design divergences
-      - Missing scenario coverage
-
-   3. **SUGGESTION** (Nice to fix):
-      - Pattern inconsistencies
-      - Minor improvements
-
-   **Final Assessment**:
-   - If CRITICAL issues: "X critical issue(s) found. Fix before archiving."
-   - If only warnings: "No critical issues. Y warning(s) to consider. Ready for archive."
-   - If all clear: "All checks passed. Ready for archive."
-
-**Output Format**
-
-Use clear markdown with:
-- Table for summary scorecard
-- Grouped lists for issues (CRITICAL/WARNING/SUGGESTION)
-- Code references in format: `file.ts:123`
-- Specific, actionable recommendations
diff --git a/.claude/settings.json b/.claude/settings.json
deleted file mode 100644
index 76e3f89..0000000
--- a/.claude/settings.json
+++ /dev/null
@@ -1,17 +0,0 @@
-{
-  "hooks": {
-    "PostToolUse": [
-      {
-        "matcher": "Write|Edit",
-        "hooks": [
-          {
-            "type": "command",
-            "command": "jq -r '.tool_input.file_path // .tool_response.filePath' | { read -r f; ext=\"${f##*.}\"; case \"$ext\" in cu|cpp|h|cuh) clang-format --style=file -i \"$f\" 2>/dev/null || true;; esac; }",
-            "timeout": 30,
-            "statusMessage": "Formatting code..."
-          }
-        ]
-      }
-    ]
-  }
-}
diff --git a/.claude/settings.local.json b/.claude/settings.local.json
deleted file mode 100644
index 1403144..0000000
--- a/.claude/settings.local.json
+++ /dev/null
@@ -1,7 +0,0 @@
-{
-  "permissions": {
-    "allow": [
-      "Skill(update-config)"
-    ]
-  }
-}
diff --git a/.claude/skills/openspec/SKILL.md b/.claude/skills/openspec/SKILL.md
deleted file mode 100644
index d39aaca..0000000
--- a/.claude/skills/openspec/SKILL.md
+++ /dev/null
@@ -1,143 +0,0 @@
-# OpenSpec Workflow Skill
-
-Spec-Driven Development workflow using OpenSpec methodology.
-
-## Overview
-
-This skill provides a structured workflow for spec-driven development:
-
-```
-/opsx:explore → /opsx:propose → /opsx:apply → /opsx:verify → /opsx:archive
-```
-
-## Commands
-
-| Command | Purpose |
-|---------|---------|
-| `/opsx:explore` | Think through ideas, investigate problems, clarify requirements |
-| `/opsx:propose` | Create a new change with all artifacts (proposal, design, tasks) |
-| `/opsx:apply` | Implement tasks from a change |
-| `/opsx:verify` | Verify implementation matches artifacts before archiving |
-| `/opsx:archive` | Archive a completed change |
-| `/opsx:status` | Show status of changes and specs |
-
-## Directory Structure
-
-```
-openspec/
-├── config.yaml           # OpenSpec configuration
-├── specs/                # Source of truth (current system behavior)
-│   ├── product/          # Product requirements
-│   ├── architecture/     # Technical designs (RFCs)
-│   ├── api/              # API contracts
-│   ├── data/             # Data schemas
-│   └── testing/          # Test specifications
-├── changes/              # Active change proposals
-│   └── <change-name>/    # Each change has its own directory
-│       ├── proposal.md   # What & why
-│       ├── design.md     # How
-│       ├── tasks.md      # Implementation checklist
-│       └── specs/        # Delta specs (changes to specs)
-└── archive/              # Completed changes
-```
-
-## Workflow
-
-### 1. Explore (Optional)
-
-Use `/opsx:explore` to:
-- Think through a problem before committing to a change
-- Investigate the codebase
-- Compare options
-- Clarify requirements
-
-Explore mode is for thinking, not implementing.
-
-### 2. Propose
-
-Use `/opsx:propose <name>` to create a change with:
-- `proposal.md` - Why this change, what changes, which capabilities affected
-- `specs/` - Delta specs (ADDED/MODIFIED/REMOVED requirements)
-- `design.md` - Technical approach, decisions, tradeoffs
-- `tasks.md` - Implementation checklist with checkboxes
-
-### 3. Apply
-
-Use `/opsx:apply <name>` to:
-- Read all artifacts for context
-- Work through tasks in order
-- Mark tasks complete as you go
-- Pause on blockers or unclear requirements
-
-### 4. Verify
-
-Use `/opsx:verify <name>` to:
-- Check task completion
-- Verify spec coverage
-- Validate design adherence
-- Generate verification report
-
-### 5. Archive
-
-Use `/opsx:archive <name>` to:
-- Sync delta specs to main specs (optional)
-- Move change to archive with timestamp
-- Clean up active changes directory
-
-## Spec Format
-
-### Delta Specs
-
-Delta specs use section headers to indicate operations:
-
-```markdown
-## ADDED Requirements
-
-### Requirement: User authentication
-The system SHALL support user login via email/password.
-
-#### Scenario: Successful login
-- **WHEN** user provides valid credentials
-- **THEN** system creates a session and redirects to dashboard
-
-## MODIFIED Requirements
-
-### Requirement: Password strength
-<!-- Full updated requirement text -->
-
-## REMOVED Requirements
-
-### Requirement: Legacy auth
-**Reason**: Replaced by OAuth
-**Migration**: Use `/auth/oauth` endpoint
-```
-
-## Frontmatter
-
-All spec files should include OpenSpec frontmatter:
-
-```yaml
----
-openspec:
-  type: product|architecture|api|data|testing
-  status: active|deprecated
-  created: YYYY-MM-DD
-  tags: [tag1, tag2]
----
-```
-
-## Best Practices
-
-1. **Start with explore** - Don't jump straight to proposing
-2. **Read existing specs** - Before proposing modifications
-3. **Keep proposals concise** - Focus on "why" not "how"
-4. **Small tasks** - Each task should be completable in one session
-5. **Update as you learn** - Artifacts can be updated during implementation
-6. **Verify before archive** - Catch issues early
-
-## Integration with AGENTS.md
-
-This skill works alongside AGENTS.md:
-- AGENTS.md defines project-specific conventions
-- OpenSpec defines the change management workflow
-- Both should be followed during development
diff --git a/.claude/skills/verify/SKILL.md b/.claude/skills/verify/SKILL.md
deleted file mode 100644
index 4cdc09b..0000000
--- a/.claude/skills/verify/SKILL.md
+++ /dev/null
@@ -1,29 +0,0 @@
----
-name: verify
-description: Build the project and run tests to verify changes. Use after making code modifications to ensure nothing is broken.
----
-
-Build the CUDA inference engine and run tests.
-
-## Steps
-
-1. Configure with CMake (debug preset with tests):
-   ```bash
-   cmake --preset default
-   ```
-
-2. Build the project:
-   ```bash
-   cmake --build --preset default
-   ```
-
-3. Run tests (requires GPU):
-   ```bash
-   ctest --preset default
-   ```
-
-## Notes
-
-- Tests require an NVIDIA GPU with CUDA support
-- Target architectures: 75, 80, 86, 89, 90 (Turing through Blackwell)
-- If tests fail due to GPU unavailability, at minimum verify the build succeeds
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index ab80e7f..d91e07e 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -1,87 +1,2 @@
-# Code Owners
-
-This file defines the code ownership for the mini-inference-engine project.
-
-## Repository Owners
-
+# Default owner
 * @shane
-
-## Code Ownership by Area
-
-### CUDA Kernels
-```
-src/*.cu
-include/kernels.cuh
-include/half_gemm.cuh
-include/vectorized_gemm.cuh
-```
-
-### Inference Engine
-```
-src/inference_engine.cpp
-include/inference_engine.h
-include/tensor.h
-src/tensor.cu
-```
-
-### Memory Management
-```
-include/memory_pool.h
-include/stream_manager.h
-```
-
-### Performance & Profiling
-```
-include/autotuner.h
-include/profiler.h
-benchmarks/
-```
-
-### Configuration & Logging
-```
-include/config.h
-include/logger.h
-```
-
-### Quantization
-```
-include/quantization.h
-```
-
-### Batch Operations
-```
-include/batch_gemm.h
-```
-
-### Tests
-```
-tests/
-```
-
-### Documentation
-```
-docs/
-README.md
-README.zh-CN.md
-CHANGELOG.md
-AGENTS.md
-CLAUDE.md
-```
-
-### CI/CD
-```
-.github/
-CMakeLists.txt
-CMakePresets.json
-```
-
-### OpenSpec
-```
-openspec/
-```
-
-## Review Requirements
-
-- Changes to core CUDA kernels require review by kernel owners
-- Breaking API changes require review from all area owners
-- Documentation changes can be approved by any owner
diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md
index c4ee813..91eac0c 100644
--- a/.github/copilot-instructions.md
+++ b/.github/copilot-instructions.md
@@ -1,27 +1,27 @@
-# GitHub Copilot Instructions for Mini-Inference Engine
+# Mini-Inference Engine
 
-Mini-Inference Engine 是一个 C++17/CUDA GEMM 优化教程与迷你推理引擎。核心代码展示 7 级 GEMM 优化路径，并配套 Tensor、InferenceEngine、MemoryPool、StreamManager、AutoTuner、Profiler、benchmark、OpenSpec 和双语文档。
+Mini-Inference Engine 是一个 C++17/CUDA GEMM 优化教程与迷你推理引擎。仓库重点是渐进式 GEMM kernels、最小运行时组件、benchmark、测试和双语文档。
 
-## 编码约束
+## 代码约束
 
-- 先查 `openspec/specs/**`，再改需求、架构、API 或测试行为。
 - 使用 `.clang-format`；函数/变量 `snake_case`，类 `PascalCase`，常量与模板参数 `UPPER_SNAKE_CASE`。
 - CUDA API 必须用 `CUDA_CHECK()`，cuBLAS API 必须用 `CUBLAS_CHECK()`。
-- GPU 内存优先用 `DeviceMemory`/`PooledMemory`；不要新增裸指针生命周期。
-- 新源文件必须显式加入 `CMakeLists.txt`，不要使用 `GLOB_RECURSE`。
+- GPU 内存优先用 `DeviceMemory` 或 `PooledMemory`；不要随意扩散裸 `cudaMalloc` / `cudaFree` 生命周期。
+- 新源文件必须显式加入 `CMakeLists.txt`，不要使用递归 glob。
+- 优先删除重复实现和过时分支，不要叠加新的流程层。
 
 ## 构建与测试
 
 ```bash
-cmake --preset default
-cmake --build --preset default
-ctest --preset default --output-on-failure
+cmake --preset gcc-cuda
+cmake --build --preset gcc-cuda
+ctest --preset gcc-cuda
 ```
 
-`tests_host` 覆盖不需要 GPU 设备的工具测试；`tests_gpu` 覆盖 CUDA runtime/kernel 行为。没有 CUDA 设备时 GPU 测试应 skip，但配置和构建仍需要 CUDA Toolkit。
+`gcc-cuda` 是当前仓库优先的本地稳定 CUDA 构建路径，固定使用系统 GCC 12 / G++ 12。`tests_host` 覆盖不需要 GPU 设备的工具测试；`tests_gpu` 覆盖 CUDA runtime/kernel 行为。没有 CUDA 设备时 GPU 测试可跳过，但配置和构建仍需要 CUDA Toolkit。
 
 ## 文档口径
 
 - 性能统一写为“参考 RTX 3080 1024×1024 benchmark 中约 85% cuBLAS 级吞吐”。
-- GitHub Pages 是门户，不要把 README/CHANGELOG 整段搬过去。
-- 避免通用模板；所有 AI/文档/CI 配置都要体现 CUDA GEMM 与推理引擎上下文。
+- GitHub Pages 只放用户文档，不复制 changelog 或流程元文档。
+- 项目变更历史只保留根目录 `CHANGELOG.md`。
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index fc030a5..f96cb73 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -39,15 +39,14 @@ jobs:
       - name: Install CUDA Toolkit
         if: steps.cache-cuda.outputs.cache-hit != 'true'
         uses: Jimver/cuda-toolkit@v0.2.35
-        id: cuda-toolkit
         with:
           cuda: ${{ env.CUDA_VERSION }}
-          method: 'network'
+          method: network
 
       - name: Setup CUDA Environment
         run: |
-          echo "CUDA_PATH=/usr/local/cuda-${{ env.CUDA_VERSION }}" >> $GITHUB_ENV
-          echo "/usr/local/cuda-${{ env.CUDA_VERSION }}/bin" >> $GITHUB_PATH
+          echo "CUDA_PATH=/usr/local/cuda-${{ env.CUDA_VERSION }}" >> "$GITHUB_ENV"
+          echo "/usr/local/cuda-${{ env.CUDA_VERSION }}/bin" >> "$GITHUB_PATH"
 
       - name: CUDA Info
         run: |
@@ -70,17 +69,15 @@ jobs:
             -DCMAKE_CUDA_ARCHITECTURES="${{ env.CUDA_ARCHITECTURES }}"
 
       - name: Build
-        run: cmake --build build --parallel $(nproc)
+        run: cmake --build build --parallel "$(nproc)"
 
       - name: Discover tests
         run: ctest --test-dir build -N
 
-      # GPU tests require an NVIDIA device. Standard GitHub-hosted runners only
-      # validate configure/build/test discovery for this CUDA project.
-
   format-check:
     name: Code Format Check
     runs-on: ubuntu-latest
+
     steps:
       - name: Checkout
         uses: actions/checkout@v4
@@ -92,32 +89,37 @@ jobs:
 
       - name: Check formatting
         run: |
-          echo "Checking code formatting..."
-          FILES=$(find src include tests benchmarks -type f \( -name '*.cpp' -o -name '*.cu' -o -name '*.h' -o -name '*.cuh' \) 2>/dev/null || true)
-          if [ -n "$FILES" ]; then
-            echo "$FILES" | xargs clang-format-18 --dry-run --Werror
-            echo "✓ All files formatted correctly"
-          else
-            echo "No source files found to check"
+          set -euo pipefail
+          files=$(find src include tests benchmarks -type f \( -name '*.cpp' -o -name '*.cu' -o -name '*.h' -o -name '*.cuh' \))
+          if [ -n "$files" ]; then
+            echo "$files" | xargs clang-format-18 --dry-run --Werror
           fi
 
   docs-check:
     name: Documentation Check
     runs-on: ubuntu-latest
+
     steps:
       - name: Checkout
         uses: actions/checkout@v4
 
-      - name: Verify docs structure
+      - name: Verify documentation structure
         run: |
-          echo "=== Documentation Structure ==="
-          ls -la docs/
-          echo "=== Required Files ==="
-          test -f README.md && echo "✓ README.md"
-          test -f README.zh-CN.md && echo "✓ README.zh-CN.md"
-          test -f CHANGELOG.md && echo "✓ CHANGELOG.md"
-          test -f docs/en/CONTRIBUTING.md && echo "✓ docs/en/CONTRIBUTING.md"
-          test -f docs/zh/CONTRIBUTING.md && echo "✓ docs/zh/CONTRIBUTING.md"
-          test -f index.md && echo "✓ index.md"
-          test -f _config.yml && echo "✓ _config.yml"
-          test -f Gemfile && echo "✓ Gemfile"
+          set -euo pipefail
+          required=(
+            README.md
+            README.zh-CN.md
+            CHANGELOG.md
+            docs/package.json
+            docs/.vitepress/config.ts
+            docs/en/index.md
+            docs/zh/index.md
+            docs/en/guides/getting-started.md
+            docs/zh/guides/getting-started.md
+            docs/en/contributing.md
+            docs/zh/contributing.md
+          )
+          for path in "${required[@]}"; do
+            test -f "$path"
+            echo "✓ $path"
+          done
diff --git a/.gitignore b/.gitignore
index 1490378..c63b551 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,6 +2,7 @@
 build/
 build-release/
 build-debug/
+build-*/
 cmake-build-*/
 
 # IDE
@@ -42,15 +43,6 @@ tests_gpu
 *.weights
 /data/
 
-# Jekyll / GitHub Pages
-_site/
-.jekyll-cache/
-.jekyll-metadata
-.sass-cache/
-
-# Ruby / Bundler (Gemfile.lock MUST be tracked for GitHub Pages)
-.bundle/
-
 # Cache
 .cache/
 
@@ -63,15 +55,6 @@ Thumbs.db
 Testing/
 CTestTestfile.cmake
 
-# OpenSpec backup (keep migration backup local)
-specs.backup/
-
-# OpenSpec archive (optional - uncomment if you don't want to track archives)
-# openspec/archive/
-
-# OMC (oh-my-claudecode) local state
-.omc/
-
 # VitePress / Node.js
 docs/.vitepress/dist/
 docs/.vitepress/cache/
diff --git a/AGENTS.md b/AGENTS.md
deleted file mode 100644
index bad773a..0000000
--- a/AGENTS.md
+++ /dev/null
@@ -1,231 +0,0 @@
-# AGENTS.md - Mini-Inference Engine AI Workflow
-
-本文件是仓库唯一的 AI/工程协作总纲。平台专属入口（如 `CLAUDE.md`、`.github/copilot-instructions.md`）只做最小路由，不重复维护完整规则。
-
-## 1. 项目定位
-
-Mini-Inference Engine 是一个 CUDA GEMM 优化教程与迷你推理引擎。它用可编译、可测试、可 benchmark 的 C++17/CUDA 工程展示从朴素矩阵乘法到约 85% cuBLAS 级参考吞吐的优化路径，并提供轻量推理运行时组件。
-
-核心资产：
-
-| 层级 | 文件 | 责任 |
-| --- | --- | --- |
-| Kernel | `src/*gemm*.cu`, `include/kernels.cuh` | Naive、Tiled、Coalesced、Double Buffer、Register Blocked、Fused、Vectorized GEMM。 |
-| Runtime | `include/tensor.h`, `include/inference_engine.h`, `include/memory_pool.h`, `include/stream_manager.h` | Tensor、推理执行、GPU 内存池、CUDA stream 管理。 |
-| Tooling | `include/autotuner.h`, `include/profiler.h`, `benchmarks/` | kernel 选择、profiling、benchmark、MNIST demo。 |
-| Specs | `openspec/specs/**` | 产品需求、架构 RFC、API、数据与测试事实源。 |
-| Docs | `README*.md`, `index.md`, `docs/en`, `docs/zh` | 仓库入口、GitHub Pages 门户、双语教程。 |
-
-性能口径必须保守：只描述“参考 RTX 3080、1024×1024 benchmark 中最高优化 kernel 约 85% cuBLAS 级吞吐”。不要写成跨硬件、跨矩阵规模的承诺。
-
-## 2. 强制工作流
-
-1. **读规格**：涉及功能、架构、API、测试策略时，先查 `openspec/specs/**`。
-2. **定边界**：不做与当前目标无关的大迁移；优先修复真实腐化点。
-3. **TDD**：行为变更先写失败测试，再实现，再验证。
-4. **验证**：代码变更至少跑可用的构建/测试/格式检查；当前环境无 CUDA 时，明确记录限制并给出 GPU 环境必跑命令。
-5. **审查**：重大变更使用 code review；不要把自评当成审查。
-6. **归档**：完成后提交到主线，清理临时 branch/worktree。
-
-OpenSpec slash commands 位于 `.claude/commands/opsx/`，用于 `explore`、`propose`、`apply`、`verify`、`archive`、`status`。Superpowers skills 用于过程纪律：brainstorming、writing-plans、test-driven-development、systematic-debugging、subagent-driven-development、verification-before-completion。
-
-## 3. 构建与测试
-
-```bash
-cmake --preset default
-cmake --build --preset default
-ctest --preset default --output-on-failure
-
-cmake --preset release
-cmake --build --preset release
-./build-release/benchmark
-```
-
-测试目标：
-
-- `tests_host`：配置、日志、量化等不需要 GPU 设备的工具测试；项目配置和编译仍需要 CUDA Toolkit。
-- `tests_gpu`：Tensor、MemoryPool、StreamManager、InferenceEngine、GEMM kernel、fusion、batch GEMM 等 CUDA 行为测试。
-
-GPU 测试需要 CUDA Toolkit 与可用 NVIDIA GPU。无设备时测试应 skip；无 CUDA Toolkit 时 CMake 配置会失败，这是环境缺失而不是产品行为失败。
-
-## 4. 代码规范
-
-- C++/CUDA：C++17，`.clang-format`，4 空格，100 列。
-- 命名：类 `PascalCase`；函数/变量 `snake_case`；常量和模板参数 `UPPER_SNAKE_CASE`；成员变量 `snake_case_`。
-- CUDA API 用 `CUDA_CHECK()`；cuBLAS API 用 `CUBLAS_CHECK()`。
-- GPU 内存优先 `DeviceMemory` 或 `PooledMemory`；只有在边界 API 或测试误用场景中才直接触达 `cudaMalloc/cudaFree`。
-- 新源文件必须显式加入 `CMakeLists.txt`；不要使用 `GLOB_RECURSE`。
-- CUDA kernel launch 后要尽早检查 launch error，避免错误只在最终同步点暴露。
-
-## 5. 文档治理
-
-- README 是仓库入口，只保留定位、核心亮点、快速开始、文档导航。
-- GitHub Pages (`index.md`) 是项目门户，聚焦价值主张、学习路径、技术可信度和 CTA；不要搬运 README 或堆 Changelog。
-- `docs/en` 与 `docs/zh` 保留双语教程正文；重复索引页要尽量短。
-- `CHANGELOG.md` 是结构化变更事实源；Release 页面只做发布摘要与迁移提示。
-- 严禁散落对项目状态或维护责任的临时性口径。
-
-## 6. Git 与 CI
-
-- `master` 是唯一长期主线。
-- 短期 feature branch/worktree 只用于隔离开发，合并后删除。
-- CI 不创建版本分支，不自动推送分支。
-- 无 GPU runner 上 CI 只做 CUDA 编译、测试发现、格式和文档检查；完整 GPU 测试在有设备环境运行。
-- Pages workflow 保留，用于部署项目门户。
-
-## 7. MCP、CLI Skills 与本地脚本取舍
-
-- **OpenSpec**：存放需求、架构、API、数据、测试事实。
-- **Superpowers/CLI Skills**：约束工作方法和重复流程，节省上下文。
-- **MCP/GitHub CLI**：用于 GitHub Actions、Issues、PR、repo metadata 等外部系统读写。
-- **本地 shell/CMake/ctest**：用于构建、测试、格式、静态扫描；不要把本地可脚本化任务强行上 MCP。
-
-## 8. 收尾检查清单
-
-```bash
-git status --short --branch
-rg -n 'deprecated-status-marker|temporary-owner-marker|unsupported-performance-claim' .
-git diff --check
-cmake --preset default
-cmake --build --preset default
-ctest --preset default --output-on-failure
-```
-
-若当前机器没有 CUDA Toolkit 或 GPU，保留失败输出并在 GPU 环境补跑后三项 CUDA 验证。
-
-## 9. CUDA 调试指南
-
-### 常见 CUDA 错误排查
-
-#### 错误：cudaErrorInvalidDeviceFunction
-
-**原因**：内核未针对当前 GPU 架构编译
-
-**解决**：
-```bash
-# 检查当前 GPU 计算能力
-nvidia-smi --query-gpu=compute_cap --format=csv
-
-# 使用正确的架构重新编译
-cmake -B build -DCMAKE_CUDA_ARCHITECTURES=86  # 例如 RTX 30xx
-cmake --build build
-```
-
-#### 错误：cudaErrorMemoryAllocation
-
-**原因**：GPU 显存不足
-
-**解决**：
-```bash
-# 检查显存使用
-nvidia-smi
-
-# 减小批量大小或矩阵维度
-```
-
-#### 错误：cudaErrorLaunchFailure
-
-**原因**：内核执行越界或内存访问错误
-
-**调试步骤**：
-1. 使用 `cuda-memcheck` 检测内存错误：
-   ```bash
-   cuda-memcheck ./build/tests_gpu --gtest_filter=*specific_test*
-   ```
-2. 使用 `compute-sanitizer` 进行详细分析：
-   ```bash
-   compute-sanitizer ./build/tests_gpu
-   ```
-3. 检查内核边界条件：确保所有线程都在有效范围内访问
-
-### 性能调试工具
-
-#### Nsight Systems（时间线分析）
-```bash
-nsys profile --stats=true ./build/benchmark
-nsys-ui report.nsys-rep
-```
-
-#### Nsight Compute（内核分析）
-```bash
-ncu --set full ./build/benchmark -o profile.ncu-rep
-ncu-ui profile.ncu-rep
-```
-
-### 性能基准解读
-
-#### RTX 3080 参考性能（1024×1024 GEMM）
-
-| Kernel | 时间 (ms) | TFLOPS | vs cuBLAS |
-|--------|-----------|--------|-----------|
-| Naive | 15.2 | 0.14 | ~2% |
-| Tiled | 1.8 | 1.20 | ~15% |
-| Coalesced | 1.5 | 1.43 | ~18% |
-| Double Buffer | 0.9 | 2.39 | ~30% |
-| Register Blocked | 0.32 | 6.71 | ~85% |
-| cuBLAS | 0.27 | 8.01 | 100% |
-
-**注意**：这些数据仅供参考，实际性能取决于 GPU 型号、频率和内存带宽。
-
-#### 关键性能指标
-
-- **TFLOPS**：每秒万亿次浮点运算
-  - 峰值 = GPU 核心数 × 频率 × 2（FMA）
-  - 实际 TFLOPS = 2 × M × N × K / (时间_ms × 1e9)
-
-- **内存带宽利用率**：实际带宽 vs 峰值带宽
-  - GEMM 理论带宽 = (M×K + K×N + M×N) × 4 / 时间
-  - 峰值带宽参考：RTX 3080 = 760 GB/s
-
-- **计算强度**：FLOPs/Byte
-  - GEMM 理论强度 = 2×K / 12（float32）
-  - 高计算强度（>10）表示计算密集型
-
----
-
-## 10. 常见问题排查流程
-
-### 编译问题
-
-| 问题 | 检查命令 | 解决方案 |
-|------|----------|----------|
-| CUDA not found | `nvcc --version` | 设置 `CUDA_PATH` 环境变量 |
-| CMake 版本过低 | `cmake --version` | 升级到 3.18+ |
-| 编译错误 | `cmake --build build 2>&1 \| head -50` | 检查错误日志 |
-
-### 运行时问题
-
-| 问题 | 检查命令 | 解决方案 |
-|------|----------|----------|
-| GPU 不可见 | `nvidia-smi` | 检查驱动、CUDA 安装 |
-| 测试跳过 | 查看测试输出 | 正常行为（无 GPU 环境） |
-| 性能异常 | `nsys profile` | 使用 profiler 分析 |
-
-### 测试问题
-
-```bash
-# 运行单个测试
-./build/tests_gpu --gtest_filter=GemmTest.NaiveMatMulCorrectness
-
-# 详细输出
-ctest --preset default -V
-
-# 仅运行 host 测试
-ctest --test-dir build -L host
-```
-
----
-
-## 11. Agent Skills
-
-### Issue tracker
-
-Issues live in GitHub at `LessUp/mini-inference-engine`. Skills use the `gh` CLI to create and manage issues. See `docs/agents/issue-tracker.md`.
-
-### Triage labels
-
-Triage uses the default vocabulary: `needs-triage`, `needs-info`, `ready-for-agent`, `ready-for-human`, `wontfix`. See `docs/agents/triage-labels.md`.
-
-### Domain docs
-
-Single-context layout. The primary domain source is `AGENTS.md` (this file). A `CONTEXT.md` may be added at the repo root for a distilled domain glossary; ADRs live in `docs/adr/`. See `docs/agents/domain.md`.
diff --git a/CHANGELOG.md b/CHANGELOG.md
index b3ce8d9..c4f6d40 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -13,10 +13,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - None
 
 ### Changed
-- None
+- Removed the OpenSpec / Claude skill workflow layer and collapsed repository guidance to a smaller set of live docs.
+- Simplified the VitePress stack and fixed stale documentation and CI assumptions.
+- Standardized on the root `CHANGELOG.md` as the only change log surface.
 
 ### Fixed
-- None
+- Corrected stale documentation links and outdated workflow references.
 
 ---
 
diff --git a/CLAUDE.md b/CLAUDE.md
deleted file mode 100644
index 8cce9f1..0000000
--- a/CLAUDE.md
+++ /dev/null
@@ -1,78 +0,0 @@
-# CLAUDE.md
-
-Claude Code 的项目入口只保留必要路由；完整工程与 AI 协作规则见 [AGENTS.md](AGENTS.md)，需求/架构/API/测试事实源见 `openspec/specs/**`。
-
-## 项目定位
-
-Mini-Inference Engine 是 CUDA GEMM 优化教程与迷你推理引擎：用 C++17/CUDA/CMake 展示从 Naive GEMM 到 Tiled、Coalesced、Double Buffer、Register Blocked、Fused、Vectorized 的渐进优化，并提供 Tensor、InferenceEngine、MemoryPool、StreamManager、AutoTuner、Profiler 等轻量运行时组件。
-
-## Claude 工作规则
-
-- 先读 OpenSpec：涉及功能、架构、API、测试口径时，优先查 `openspec/specs/**`。
-- 保持单主线：`master` 是唯一长期分支；复杂变更用临时 worktree/branch，合并后清理。
-- 不夸大性能：统一表述为”参考 RTX 3080 1024×1024 benchmark 中约 85% cuBLAS 级吞吐”，不要写成跨硬件承诺。
-- 不新增泛化模板：文档和配置必须绑定本项目 CUDA GEMM/推理引擎业务，不写通用 boilerplate。
-- 代码变更遵循 TDD；CUDA/GPU 相关改动至少补充可在 GPU 环境执行的测试。
-
-## 常用命令
-
-```bash
-cmake --preset default
-cmake --build --preset default
-ctest --preset default --output-on-failure
-
-cmake --preset release
-cmake --build --preset release
-./build-release/benchmark
-```
-
-测试和 benchmark 需要 CUDA Toolkit 与可用 NVIDIA GPU；无 GPU 的环境只能完成静态检查、文档构建和部分配置验证。
-
-## 代码风格
-
-- C++/CUDA：C++17、`.clang-format`、4 空格、100 列。
-- 公共 CUDA API 调用使用 `CUDA_CHECK()`；cuBLAS 调用使用 `CUBLAS_CHECK()`。
-- GPU 内存优先使用 `DeviceMemory` 或 `PooledMemory`，避免裸 `cudaMalloc/cudaFree` 泄漏。
-
-## 错误处理规范
-
-### CUDA 错误
-
-所有 CUDA API 调用必须使用 `CUDA_CHECK()` 宏包装，错误时抛出 `CudaException`：
-
-```cpp
-CUDA_CHECK(cudaMalloc(&ptr, size));
-CUDA_CHECK(cudaStreamSynchronize(stream));
-```
-
-### 输入验证
-
-公共 API 必须验证输入参数：
-
-```cpp
-if (A == nullptr || B == nullptr || C == nullptr) {
-    throw std::invalid_argument(“Matrix pointers must not be null”);
-}
-if (M <= 0 || N <= 0 || K <= 0) {
-    throw std::invalid_argument(“Matrix dimensions must be positive”);
-}
-```
-
-### 边界检查
-
-使用 `checked_element_count()` 和 `checked_byte_size()` 防止整数溢出。
-
-## 测试分类
-
-| 标签 | 文件 | 需要 GPU | 说明 |
-|------|------|----------|------|
-| `host` | test_config.cpp, test_logger.cpp, test_quantization.cpp | 否 | 配置、日志、量化 |
-| `host` | test_autotuner.cpp, test_profiler.cpp | 否 | 调优、分析 |
-| `gpu` | test_gemm.cu, test_fusion.cu, test_advanced.cu | 是 | GEMM 内核 |
-| `gpu` | test_tensor.cpp, test_inference.cpp | 是 | 推理引擎 |
-| `gpu` | test_memory_pool.cpp, test_stream_manager.cpp | 是 | GPU 资源管理 |
-| `gpu` | test_batch_gemm.cpp, test_half_gemm.cu | 是 | 批量/FP16 |
-
-### 无 GPU 环境
-
-测试文件使用 `MINI_INFERENCE_REQUIRE_CUDA_DEVICE()` 宏自动跳过 GPU 测试。
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e1fc9f2..53384d4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -19,6 +19,7 @@ endif()
 
 # Find CUDA
 find_package(CUDAToolkit REQUIRED)
+find_path(MINI_INFERENCE_CUDA_INCLUDE_DIR cublas_v2.h REQUIRED)
 
 # Export compile_commands.json for IDE integration
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
@@ -72,7 +73,9 @@ set(LIB_HEADERS
 add_library(mini_inference STATIC ${LIB_SOURCES} ${LIB_HEADERS})
 
 target_include_directories(mini_inference
-    PUBLIC  ${CMAKE_SOURCE_DIR}/include
+    PUBLIC
+        ${CMAKE_SOURCE_DIR}/include
+        ${MINI_INFERENCE_CUDA_INCLUDE_DIR}
 )
 
 target_link_libraries(mini_inference PUBLIC CUDA::cudart CUDA::cublas)
@@ -158,9 +161,9 @@ if(BUILD_TESTS)
         PRIVATE
             ${CMAKE_SOURCE_DIR}/include
             ${CMAKE_SOURCE_DIR}/tests
-            ${CUDAToolkit_INCLUDE_DIRS}
+            ${MINI_INFERENCE_CUDA_INCLUDE_DIR}
     )
-    target_link_libraries(tests_host PRIVATE ${MINI_INFERENCE_GTEST_LIBS})
+    target_link_libraries(tests_host PRIVATE mini_inference ${MINI_INFERENCE_GTEST_LIBS})
 
     add_executable(tests_gpu ${GPU_TEST_SOURCES})
     target_include_directories(tests_gpu
diff --git a/CMakePresets.json b/CMakePresets.json
index fabf90b..4936ae2 100644
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -6,6 +6,15 @@
     "patch": 0
   },
   "configurePresets": [
+    {
+      "name": "system-gcc",
+      "hidden": true,
+      "cacheVariables": {
+        "CMAKE_C_COMPILER": "/usr/bin/gcc-12",
+        "CMAKE_CXX_COMPILER": "/usr/bin/g++-12",
+        "CMAKE_CUDA_HOST_COMPILER": "/usr/bin/g++-12"
+      }
+    },
     {
       "name": "default",
       "displayName": "Default (Debug)",
@@ -17,6 +26,15 @@
         "ENABLE_FAST_MATH": "OFF"
       }
     },
+    {
+      "name": "gcc-cuda",
+      "displayName": "System GCC + CUDA (Debug)",
+      "inherits": [
+        "default",
+        "system-gcc"
+      ],
+      "binaryDir": "${sourceDir}/build-gcc-cuda"
+    },
     {
       "name": "release",
       "displayName": "Release",
@@ -28,6 +46,15 @@
         "ENABLE_FAST_MATH": "ON"
       }
     },
+    {
+      "name": "release-gcc-cuda",
+      "displayName": "System GCC + CUDA (Release)",
+      "inherits": [
+        "release",
+        "system-gcc"
+      ],
+      "binaryDir": "${sourceDir}/build-release-gcc-cuda"
+    },
     {
       "name": "ci",
       "displayName": "CI",
@@ -45,10 +72,18 @@
       "name": "default",
       "configurePreset": "default"
     },
+    {
+      "name": "gcc-cuda",
+      "configurePreset": "gcc-cuda"
+    },
     {
       "name": "release",
       "configurePreset": "release"
     },
+    {
+      "name": "release-gcc-cuda",
+      "configurePreset": "release-gcc-cuda"
+    },
     {
       "name": "ci",
       "configurePreset": "ci"
@@ -62,6 +97,13 @@
         "outputOnFailure": true
       }
     },
+    {
+      "name": "gcc-cuda",
+      "configurePreset": "gcc-cuda",
+      "output": {
+        "outputOnFailure": true
+      }
+    },
     {
       "name": "ci",
       "configurePreset": "ci",
diff --git a/CONTEXT.md b/CONTEXT.md
deleted file mode 100644
index ef5da4f..0000000
--- a/CONTEXT.md
+++ /dev/null
@@ -1,26 +0,0 @@
-# CONTEXT.md — Mini-Inference Engine Domain Glossary
-
-## Core Concepts
-
-| Term | Definition |
-|------|-----------|
-| **GEMM Kernel** | A CUDA kernel implementing general matrix multiply C = A×B. Each optimization level (Naive, Tiled, Coalesced, DoubleBuffer, RegisterBlocked, Fused, Vectorized) is a distinct kernel. |
-| **Kernel Registry** | Singleton registry mapping `GemmKernelType` → launch function. Single point of truth for kernel dispatch; eliminates N-way switch duplication. |
-| **Kernel Selector** | Interface (`IKernelSelector`) for choosing which GEMM kernel to use for given (M,N,K). Adapters: `FixedKernelSelector`, `AutoTunedKernelSelector`, `ProfiledKernelSelector`. |
-| **GpuAllocator** | Interface (`IGpuAllocator`) for GPU memory allocation. Adapters: `DirectAllocator` (cudaMalloc), `PooledAllocator` (MemoryPool-backed). |
-| **MemoryPool** | GPU memory pool with free-list caching. Implements `IMemoryPool`. Can be instantiated directly (preferred) or accessed via legacy singleton. |
-| **StreamManager** | CUDA stream pool with round-robin dispatch. Implements `IStreamManager`. Can be instantiated directly (preferred) or accessed via legacy singleton. |
-| **Tensor** | N-dimensional device tensor with shape, strides, and optional memory pool backing. |
-| **InferenceEngine** | Layer-by-layer neural network forward pass engine. Accepts `IKernelSelector` for kernel strategy injection. |
-| **AutoTuner** | Benchmarks all registered kernels for given (M,N,K) and caches the best result. |
-| **Profiler** | Measures kernel execution time with GPU events; includes `RooflineAnalyzer`. |
-| **GemmPreset** | Named kernel tuning configuration (block sizes, thread counts, feature flags). |
-| **DeviceConfig** | GPU device settings (device ID, memory limits, feature flags, stream count). |
-| **InferenceConfig** | Engine-level settings (device, streams, memory pool, profiling, kernel preset). |
-
-## Architectural Decisions
-
-- **Kernel Registry over N-switch**: Adding a GEMM kernel now requires only: (1) kernel implementation, (2) `initialize_kernel_registry()` registration call. No more editing 7 switch statements.
-- **Interface + Adapters over Singleton**: `MemoryPool`, `StreamManager` now have interfaces (`IMemoryPool`, `IStreamManager`) and can be instantiated directly for dependency injection. Legacy `::instance()` retained for backward compatibility.
-- **Kernel Selector over hardcoded launch**: `InferenceEngine` accepts `IKernelSelector` to decouple kernel choice from engine logic. Default uses fused kernel directly.
-- **Config as value types, not singleton**: `DeviceConfig`, `InferenceConfig`, `GemmPreset` are plain structs constructed via factory functions. No global mutable state.
diff --git a/README.md b/README.md
index b56805b..2667eac 100644
--- a/README.md
+++ b/README.md
@@ -1,180 +1,71 @@
 <p align="center">
-  <img src="assets/images/logo.svg" alt="Mini-Inference Engine Logo" width="120" height="120">
+  <img src="assets/images/logo.svg" alt="Mini-Inference Engine logo" width="120" height="120">
 </p>
 
 <h1 align="center">Mini-Inference Engine</h1>
 
 <p align="center">
-  <b>CUDA GEMM 优化教程与迷你推理引擎</b><br>
-  <i>从朴素矩阵乘法到 ~85% cuBLAS 性能的 7 级渐进式优化路线</i>
+  <b>CUDA GEMM optimization tutorial and mini inference runtime</b><br>
+  <i>Compact C++17/CUDA codebase with a 7-stage kernel path and a conservative ~85% cuBLAS reference result on the RTX 3080 1024×1024 benchmark</i>
 </p>
 
 <p align="center">
   <a href="README.zh-CN.md">简体中文</a> ·
-  <a href="README.md">English</a> ·
-  <a href="https://lessup.github.io/mini-inference-engine/">在线文档</a>
+  <a href="https://lessup.github.io/mini-inference-engine/">Docs</a> ·
+  <a href="docs/en/guides/getting-started.md">Getting Started</a>
 </p>
 
-<p align="center">
-  <a href="https://github.com/LessUp/mini-inference-engine/actions/workflows/ci.yml">
-    <img src="https://github.com/LessUp/mini-inference-engine/actions/workflows/ci.yml/badge.svg" alt="CI">
-  </a>
-  <a href="https://github.com/LessUp/mini-inference-engine/actions/workflows/pages.yml">
-    <img src="https://github.com/LessUp/mini-inference-engine/actions/workflows/pages.yml/badge.svg" alt="Docs">
-  </a>
-  <img src="https://img.shields.io/badge/License-MIT-green.svg" alt="License">
-  <img src="https://img.shields.io/badge/CUDA-12.x-76B900?logo=nvidia&logoColor=white" alt="CUDA">
-  <img src="https://img.shields.io/badge/C%2B%2B-17-00599C?logo=c%2B%2B&logoColor=white" alt="C++17">
-  <img src="https://img.shields.io/badge/Docs-VitePress-16b1ff" alt="VitePress">
-</p>
-
----
-
-## TL;DR — 面试官速览
-
-| 技术领域 | 本项目覆盖 | 代码证据 |
-|----------|-----------|----------|
-| **CUDA 编程** | 7 级 GEMM 优化（Naive → Vectorized） | `src/*_gemm.cu` |
-| **内存优化** | 共享内存分块、双缓冲、寄存器分块 | `tiled_gemm.cu`, `double_buffer_gemm.cu` |
-| **性能调优** | AutoTuner 自动调参、Profiler 性能分析 | `include/autotuner.h`, `include/profiler.h` |
-| **系统设计** | 四层架构、RAII 资源管理、内存池 | `include/memory_pool.h`, `include/stream_manager.h` |
-| **工程实践** | CMake、GoogleTest、CI/CD、OpenSpec | `CMakePresets.json`, `.github/workflows/` |
-
----
-
-## 性能一览
-
-```
-Performance vs cuBLAS (RTX 3080, 1024×1024)
-━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
-L1 Naive        ████░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░  10%
-L2 Tiled        ████████░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░  20%
-L3 Coalesced    ██████████░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░  25%
-L4 Double Buf   ████████████████░░░░░░░░░░░░░░░░░░░░░░░░░░░░░  40%
-L5 Register     ████████████████████████████████████████████░  85%
-L6 Fused        ██████████████████████████████████████████░░░  80%
-L7 Vectorized   █████████████████████████████████████████████  89%
-━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
-```
-
-> 性能数字基于 RTX 3080、1024×1024 矩阵场景。本项目采用保守口径，用于教学对比而非跨硬件承诺。
-
----
+## What this repository contains
 
-## 与业界项目对比
+Mini-Inference Engine keeps the scope narrow:
 
-| 项目 | 定位 | 本项目差异 |
-|------|------|-----------|
-| **cuBLAS** | NVIDIA 官方 BLAS 库 | 本项目是教学版，逐级展示优化过程 |
-| **CUTLASS** | CUDA 模板库 | 本项目更简单，适合入门学习 |
-| **llama.cpp** | LLM 推理框架 | 本项目聚焦 GEMM 优化教学 |
-| **vLLM** | LLM 服务框架 | 本项目是底层 kernel 教学 |
+- **Progressive GEMM kernels**: naive, tiled, coalesced, double-buffered, register-blocked, fused, and vectorized CUDA implementations.
+- **Minimal runtime pieces**: `Tensor`, `InferenceEngine`, `MemoryPool`, `StreamManager`, `AutoTuner`, and `Profiler`.
+- **Benchmarks and tests**: buildable examples plus host/GPU test split.
+- **Bilingual documentation**: practical guides for building, profiling, and understanding the code.
 
-**推荐学习路径：**
-```
-本项目 (GEMM 基础) → CUTLASS (进阶) → cuBLAS (生产)
-```
+## Build and test
 
----
-
-## 快速开始
-
-**要求：** CUDA Toolkit 12.x、CMake 3.18+、C++17 编译器、NVIDIA GPU (SM 7.0+)
+The stable local CUDA path uses the system GCC 12 / G++ 12 toolchain:
 
 ```bash
-git clone https://github.com/LessUp/mini-inference-engine.git
-cd mini-inference-engine
-
-# Debug 构建 + 测试
-cmake --preset default
-cmake --build --preset default
-ctest --preset default --output-on-failure
-
-# Release 构建 + Benchmark
-cmake --preset release
-cmake --build --preset release
-./build-release/benchmark
-```
-
----
+cmake --preset gcc-cuda
+cmake --build --preset gcc-cuda
+ctest --preset gcc-cuda
 
-## 核心架构
-
-```
-┌─────────────────────────────────────────────────────────────┐
-│                    Application Layer                         │
-│         Benchmark / MNIST Demo / Tests                       │
-└─────────────────────────────────────────────────────────────┘
-                              │
-                              ▼
-┌─────────────────────────────────────────────────────────────┐
-│                      Engine Layer                            │
-│    InferenceEngine / Tensor / AutoTuner / Profiler           │
-└─────────────────────────────────────────────────────────────┘
-                              │
-                              ▼
-┌─────────────────────────────────────────────────────────────┐
-│                      Kernel Layer                            │
-│      7-Level GEMM / FP16 GEMM / Batch GEMM / cuBLAS          │
-└─────────────────────────────────────────────────────────────┘
-                              │
-                              ▼
-┌─────────────────────────────────────────────────────────────┐
-│                  Infrastructure Layer                        │
-│   MemoryPool / StreamManager / Logger / Config / Quantization│
-└─────────────────────────────────────────────────────────────┘
+cmake --preset release-gcc-cuda
+cmake --build --preset release-gcc-cuda
+./build-release-gcc-cuda/benchmark
 ```
 
----
-
-## 文档
-
-| 主题 | 中文 | English |
-|------|------|---------|
-| 快速开始 | [快速入门](docs/zh/QUICK_START.md) | [Quick Start](docs/en/QUICK_START.md) |
-| 架构设计 | [架构](docs/zh/ARCHITECTURE.md) | [Architecture](docs/en/ARCHITECTURE.md) |
-| GEMM 优化 | [优化详解](docs/zh/GEMM_OPTIMIZATION.md) | [Optimization](docs/en/GEMM_OPTIMIZATION.md) |
-| 性能调优 | [调优指南](docs/zh/PERFORMANCE_TUNING.md) | [Tuning](docs/en/PERFORMANCE_TUNING.md) |
-| API 参考 | [API](docs/zh/API_REFERENCE.md) | [API](docs/en/API_REFERENCE.md) |
-| 学习路径 | [学习计划](docs/zh/guides/learning-path.md) | [Learning Path](docs/en/guides/learning-path.md) |
-| FAQ | [常见问题](docs/zh/guides/faq.md) | [FAQ](docs/en/guides/faq.md) |
-
----
-
-## 学术论文引用
-
-本项目的优化技术来自以下经典论文：
-
-```bibtex
-@article{volkov2009better,
-  title={Better performance at lower occupancy},
-  author={Volkov, Vasily},
-  journal={GTC},
-  year={2009}
-}
-
-@inproceedings{hong2009analytic,
-  title={An analytical model for the GPU architecture},
-  author={Hong, Sunpyo and Kim, Hyesoon},
-  booktitle={ISPASS},
-  year={2009}
-}
-```
-
----
-
-## 工程规范
-
-- **规格驱动**：`openspec/specs/**` 作为唯一事实源
-- **构建系统**：显式源列表，禁止 globbing
-- **代码风格**：`.clang-format` (Google-based, 4-space)
-- **测试分类**：`tests_host` (无 GPU) + `tests_gpu` (需要 GPU)
-- **分支策略**：`master` 唯一长期分支
-
-详见 [AGENTS.md](AGENTS.md)。
-
----
-
-## License
-
-[MIT](LICENSE) © 2024-present
+If your shell is already using a clean system compiler, `default` and `release` remain available. `tests_host` covers utilities that do not need a GPU device. `tests_gpu` covers CUDA runtime and kernel behavior; they may skip without an available NVIDIA GPU, but configuration and compilation still require the CUDA Toolkit.
+
+## Repository layout
+
+| Area | Purpose |
+| --- | --- |
+| `src/` | CUDA kernels and runtime implementation |
+| `include/` | Public headers for kernels, runtime, and utilities |
+| `benchmarks/` | Benchmark and demo entry points |
+| `tests/` | Host tests and GPU-backed behavior tests |
+| `docs/` | GitHub Pages source and long-form documentation |
+| `CHANGELOG.md` | Single change log for the whole project |
+
+## Documentation
+
+| Topic | English | 中文 |
+| --- | --- | --- |
+| Getting started | [docs/en/guides/getting-started.md](docs/en/guides/getting-started.md) | [docs/zh/guides/getting-started.md](docs/zh/guides/getting-started.md) |
+| Architecture | [docs/en/architecture.md](docs/en/architecture.md) | [docs/zh/architecture.md](docs/zh/architecture.md) |
+| GEMM deep dive | [docs/en/deep-dive/gemm-optimization.md](docs/en/deep-dive/gemm-optimization.md) | [docs/zh/deep-dive/gemm-optimization.md](docs/zh/deep-dive/gemm-optimization.md) |
+| Performance tuning | [docs/en/performance-tuning.md](docs/en/performance-tuning.md) | [docs/zh/performance-tuning.md](docs/zh/performance-tuning.md) |
+| API reference | [docs/en/api-reference.md](docs/en/api-reference.md) | [docs/zh/api-reference.md](docs/zh/api-reference.md) |
+| Contributing | [docs/en/contributing.md](docs/en/contributing.md) | [docs/zh/contributing.md](docs/zh/contributing.md) |
+
+## Project rules
+
+- Use `.clang-format`; functions and variables use `snake_case`, classes use `PascalCase`, and constants/template parameters use `UPPER_SNAKE_CASE`.
+- Wrap CUDA API calls with `CUDA_CHECK()` and cuBLAS calls with `CUBLAS_CHECK()`.
+- Prefer `DeviceMemory` or `PooledMemory` over raw GPU allocation lifetimes.
+- Add new source files explicitly to `CMakeLists.txt`; do not rely on recursive globbing.
+- Keep GitHub Pages focused on documentation and keep all release history in the root `CHANGELOG.md`.
diff --git a/README.zh-CN.md b/README.zh-CN.md
index 18fa2ce..1300662 100644
--- a/README.zh-CN.md
+++ b/README.zh-CN.md
@@ -1,95 +1,71 @@
 <p align="center">
-  <img src="assets/images/logo.svg" alt="Mini-Inference Engine Logo" width="120" height="120">
+  <img src="assets/images/logo.svg" alt="Mini-Inference Engine 标志" width="120" height="120">
 </p>
 
 <h1 align="center">Mini-Inference Engine</h1>
 
 <p align="center">
-  <b>CUDA GEMM 优化教程与迷你推理引擎</b><br>
-  <i>从朴素矩阵乘法到参考 benchmark 中约 85% cuBLAS 级吞吐</i>
+  <b>CUDA GEMM 优化教程与迷你推理运行时</b><br>
+  <i>保持紧凑的 C++17/CUDA 代码库：7 级 GEMM 优化路径 + 最小推理引擎骨架；性能口径统一为参考 RTX 3080 1024×1024 benchmark 中约 85% cuBLAS 级吞吐</i>
 </p>
 
 <p align="center">
   <a href="README.md">English</a> ·
-  <a href="README.zh-CN.md">简体中文</a> ·
   <a href="https://lessup.github.io/mini-inference-engine/">在线文档</a> ·
-  <a href="docs/zh/QUICK_START.md">快速开始</a>
+  <a href="docs/zh/guides/getting-started.md">快速开始</a>
 </p>
 
-<p align="center">
-  <a href="https://github.com/LessUp/mini-inference-engine/actions/workflows/ci.yml">
-    <img src="https://github.com/LessUp/mini-inference-engine/actions/workflows/ci.yml/badge.svg" alt="CI">
-  </a>
-  <a href="https://github.com/LessUp/mini-inference-engine/actions/workflows/pages.yml">
-    <img src="https://github.com/LessUp/mini-inference-engine/actions/workflows/pages.yml/badge.svg" alt="Docs">
-  </a>
-  <a href="LICENSE">
-    <img src="https://img.shields.io/badge/许可证-MIT-green.svg" alt="License: MIT">
-  </a>
-  <img src="https://img.shields.io/badge/CUDA-11.0+-76B900?logo=nvidia&logoColor=white" alt="CUDA">
-  <img src="https://img.shields.io/badge/C%2B%2B-17-00599C?logo=c%2B%2B&logoColor=white" alt="C++17">
-</p>
-
----
-
-## 这个仓库有什么
-
-Mini-Inference Engine 是一个紧凑的 CUDA/C++17 项目，用真实推理引擎的工程形态讲清楚高性能 GEMM 优化。项目刻意保持小而完整：矩阵乘法 kernel、运行时组件、benchmark、测试、双语文档和 OpenSpec 规格都在同一个可追踪代码库中。
-
-**核心入口：**
-
-| 领域 | 建议查看 |
-| --- | --- |
-| GEMM kernels | `src/naive_matmul.cu` 到 `src/vectorized_gemm.cu` 展示完整优化路径。 |
-| 运行时组件 | `include/tensor.h`、`include/inference_engine.h`、`include/memory_pool.h`、`include/stream_manager.h`。 |
-| Benchmark | `benchmarks/benchmark.cpp`、`benchmarks/detailed_benchmark.cu`、`benchmarks/mnist_demo.cpp`。 |
-| 规格文档 | `openspec/specs/` 定义需求、架构、API、数据和测试约束。 |
-| 教程文档 | `docs/zh/` 与 `docs/en/` 提供教程、架构、API、调优指南。 |
+## 仓库保留的核心内容
 
-性能数字与硬件、矩阵规模和编译选项强相关。本项目统一采用保守口径：在文档记录的 RTX 3080、1024×1024 矩阵 benchmark 中，最高优化 kernel 可达到约 **85% cuBLAS 级吞吐**。
+Mini-Inference Engine 现在只保留长期有价值的主线内容：
 
----
+- **渐进式 GEMM kernels**：从 naive、tiled、coalesced 到 double buffer、register blocked、fused、vectorized。
+- **最小运行时组件**：`Tensor`、`InferenceEngine`、`MemoryPool`、`StreamManager`、`AutoTuner`、`Profiler`。
+- **基准与测试**：可构建的 benchmark/demo，以及 host/GPU 分层测试。
+- **双语文档**：围绕构建、调优、架构和 API 的实用文档。
 
-## 快速开始
+## 构建与测试
 
-环境要求：CUDA Toolkit 11.0+、CMake 3.18+、C++17 编译器，以及计算能力 7.0+ 的 NVIDIA GPU。
+本地最稳定的 CUDA 路径已经固化为系统 GCC 12 / G++ 12 预设：
 
 ```bash
-git clone https://github.com/LessUp/mini-inference-engine.git
-cd mini-inference-engine
+cmake --preset gcc-cuda
+cmake --build --preset gcc-cuda
+ctest --preset gcc-cuda
 
-cmake --preset default
-cmake --build --preset default
-ctest --preset default --output-on-failure
-
-cmake --preset release
-cmake --build --preset release
-./build-release/benchmark
+cmake --preset release-gcc-cuda
+cmake --build --preset release-gcc-cuda
+./build-release-gcc-cuda/benchmark
 ```
 
-没有 CUDA 设备时 GPU 测试会跳过，但构建仍需要 CUDA Toolkit，因为项目本身按 CUDA 工程编译。
+如果当前 shell 本身就是干净的系统编译器环境，`default` 和 `release` 预设仍可使用。`tests_host` 覆盖不依赖 GPU 设备的工具测试；`tests_gpu` 覆盖 CUDA runtime 与 kernel 行为，没有可用 NVIDIA GPU 时可跳过，但项目配置和编译仍需要 CUDA Toolkit。
+
+## 仓库结构
 
----
+| 目录 | 作用 |
+| --- | --- |
+| `src/` | CUDA kernels 与运行时实现 |
+| `include/` | kernels、运行时与工具头文件 |
+| `benchmarks/` | benchmark 与 demo 入口 |
+| `tests/` | host 测试与 GPU 行为测试 |
+| `docs/` | GitHub Pages 源文件与长文档 |
+| `CHANGELOG.md` | 项目唯一变更记录 |
 
-## 文档导航
+## 文档入口
 
 | 主题 | 中文 | English |
 | --- | --- | --- |
-| 快速开始 | [docs/zh/QUICK_START.md](docs/zh/QUICK_START.md) | [docs/en/QUICK_START.md](docs/en/QUICK_START.md) |
-| 架构设计 | [docs/zh/ARCHITECTURE.md](docs/zh/ARCHITECTURE.md) | [docs/en/ARCHITECTURE.md](docs/en/ARCHITECTURE.md) |
-| GEMM 优化 | [docs/zh/GEMM_OPTIMIZATION.md](docs/zh/GEMM_OPTIMIZATION.md) | [docs/en/GEMM_OPTIMIZATION.md](docs/en/GEMM_OPTIMIZATION.md) |
-| 性能调优 | [docs/zh/PERFORMANCE_TUNING.md](docs/zh/PERFORMANCE_TUNING.md) | [docs/en/PERFORMANCE_TUNING.md](docs/en/PERFORMANCE_TUNING.md) |
-| API 参考 | [docs/zh/API_REFERENCE.md](docs/zh/API_REFERENCE.md) | [docs/en/API_REFERENCE.md](docs/en/API_REFERENCE.md) |
-| 开发指南 | [docs/zh/CONTRIBUTING.md](docs/zh/CONTRIBUTING.md) | [docs/en/CONTRIBUTING.md](docs/en/CONTRIBUTING.md) |
-
----
-
-## 工程流程
-
-- 事实源：`openspec/specs/**`。
-- 构建系统：`CMakeLists.txt` 使用显式源文件列表，不使用递归 glob。
-- 代码格式：`.clang-format`，Google-based 4 空格风格。
-- 测试拆分：`tests_host` 覆盖不需要 GPU 设备的工具测试，`tests_gpu` 覆盖 CUDA runtime/kernel 行为。项目配置和编译仍需要 CUDA Toolkit。
-- 分支策略：`master` 是唯一长期主线；变更使用短期分支/worktree，合并后删除。
-
-完整 AI 与工程协作规范见 [AGENTS.md](AGENTS.md)。
+| 快速开始 | [docs/zh/guides/getting-started.md](docs/zh/guides/getting-started.md) | [docs/en/guides/getting-started.md](docs/en/guides/getting-started.md) |
+| 架构 | [docs/zh/architecture.md](docs/zh/architecture.md) | [docs/en/architecture.md](docs/en/architecture.md) |
+| GEMM 深入解析 | [docs/zh/deep-dive/gemm-optimization.md](docs/zh/deep-dive/gemm-optimization.md) | [docs/en/deep-dive/gemm-optimization.md](docs/en/deep-dive/gemm-optimization.md) |
+| 性能调优 | [docs/zh/performance-tuning.md](docs/zh/performance-tuning.md) | [docs/en/performance-tuning.md](docs/en/performance-tuning.md) |
+| API 参考 | [docs/zh/api-reference.md](docs/zh/api-reference.md) | [docs/en/api-reference.md](docs/en/api-reference.md) |
+| 贡献指南 | [docs/zh/contributing.md](docs/zh/contributing.md) | [docs/en/contributing.md](docs/en/contributing.md) |
+
+## 维护规则
+
+- 使用 `.clang-format`；函数/变量 `snake_case`，类 `PascalCase`，常量与模板参数 `UPPER_SNAKE_CASE`。
+- CUDA API 必须用 `CUDA_CHECK()`，cuBLAS API 必须用 `CUBLAS_CHECK()`。
+- GPU 内存优先使用 `DeviceMemory` 或 `PooledMemory`，避免新增裸分配生命周期。
+- 新源文件必须显式加入 `CMakeLists.txt`，不要依赖递归 glob。
+- GitHub Pages 只承载文档，项目变更历史只保留根目录 `CHANGELOG.md`。
diff --git a/docs/.vitepress/config.ts b/docs/.vitepress/config.ts
index 93ecd80..210d39f 100644
--- a/docs/.vitepress/config.ts
+++ b/docs/.vitepress/config.ts
@@ -1,8 +1,5 @@
 import { defineConfig } from 'vitepress'
-import { withMermaid } from 'vitepress-plugin-mermaid'
-import llmstxt from 'vitepress-plugin-llms'
 
-// Robust base path handling (like kimi-cli)
 const rawBase = process.env.VITEPRESS_BASE
 const base = rawBase
   ? rawBase.startsWith('/')
@@ -10,28 +7,21 @@ const base = rawBase
     : `/${rawBase}/`
   : '/'
 
-export default withMermaid(defineConfig({
+export default defineConfig({
   base,
   title: 'Mini-Inference Engine',
   description: 'CUDA GEMM optimization tutorial and mini inference engine',
-
-  // Ignore dead links (legacy Jekyll-era files reference paths that don't exist in VitePress)
-  ignoreDeadLinks: true,
-
   head: [
     ['link', { rel: 'icon', href: '/favicon.svg' }],
     ['meta', { name: 'theme-color', content: '#16b1ff' }],
-    // Open Graph
     ['meta', { property: 'og:type', content: 'website' }],
     ['meta', { property: 'og:title', content: 'Mini-Inference Engine' }],
     ['meta', { property: 'og:description', content: 'CUDA GEMM optimization tutorial and mini inference engine' }],
     ['meta', { property: 'og:site_name', content: 'Mini-Inference Engine' }],
-    // Twitter Card
     ['meta', { name: 'twitter:card', content: 'summary_large_image' }],
     ['meta', { name: 'twitter:title', content: 'Mini-Inference Engine' }],
     ['meta', { name: 'twitter:description', content: 'CUDA GEMM optimization tutorial and mini inference engine' }],
   ],
-
   locales: {
     zh: {
       label: '简体中文',
@@ -132,43 +122,13 @@ export default withMermaid(defineConfig({
       },
     },
   },
-
   themeConfig: {
     outline: { level: [2, 3] },
     search: { provider: 'local' },
-    socialLinks: [
-      { icon: 'github', link: 'https://github.com/LessUp/mini-inference-engine' },
-    ],
+    socialLinks: [{ icon: 'github', link: 'https://github.com/LessUp/mini-inference-engine' }],
     footer: {
       message: 'MIT License | CUDA GEMM optimization tutorial',
       copyright: 'Copyright © 2024-present',
     },
-    // i18n UI text for Chinese
-    docFooter: {
-      prev: '上一页',
-      next: '下一页',
-    },
-    lastUpdated: {
-      text: '最后更新',
-      formatOptions: {
-        dateStyle: 'short',
-      },
-    },
-  },
-
-  vite: {
-    plugins: [llmstxt()],
-  },
-
-  mermaid: {
-    theme: 'base',
-    themeVariables: {
-      primaryColor: '#16b1ff',
-      primaryTextColor: '#fff',
-      primaryBorderColor: '#488aff',
-      lineColor: '#8b949e',
-      secondaryColor: '#f6f8fa',
-      tertiaryColor: '#f6f8fa',
-    },
   },
-}))
+})
diff --git a/docs/INFRASTRUCTURE.md b/docs/INFRASTRUCTURE.md
deleted file mode 100644
index 5085999..0000000
--- a/docs/INFRASTRUCTURE.md
+++ /dev/null
@@ -1,173 +0,0 @@
-# 基础设施与开发环境
-
-本文档记录 Mini-Inference Engine 的构建环境要求、CUDA 版本依赖、CI 配置说明和开发流程。
-
-## 构建环境要求
-
-### 必需依赖
-
-| 依赖 | 最低版本 | 推荐版本 |
-|------|----------|----------|
-| CMake | 3.18 | 3.25+ |
-| CUDA Toolkit | 11.0 | 12.x |
-| C++ 编译器 | C++17 | GCC 11+ / MSVC 2022 |
-| Git | 2.20+ | 最新 |
-
-### 可选依赖
-
-| 依赖 | 用途 |
-|------|------|
-| Google Test 1.14+ | 测试框架（可自动 FetchContent） |
-| clang-format 18 | 代码格式化 |
-| clang-tidy | 静态分析 |
-
-## CUDA 版本说明
-
-### 支持的 CUDA 版本
-
-- **CUDA 11.x**: 支持 SM 75-86
-- **CUDA 12.x**: 支持 SM 75-90
-
-### 默认架构
-
-项目默认编译为 SM 75（RTX 20xx / T4）：
-```bash
-cmake -B build -DCMAKE_CUDA_ARCHITECTURES=75
-```
-
-### CI 构建配置
-
-GitHub Actions CI 使用 CUDA 12.8.0，架构 75。
-
-## 构建命令
-
-### Debug 构建
-```bash
-cmake --preset default
-cmake --build --preset default
-```
-
-### Release 构建
-```bash
-cmake --preset release
-cmake --build --preset release
-```
-
-### 运行测试
-```bash
-ctest --preset default --output-on-failure
-```
-
-### 仅运行 Host 测试
-```bash
-ctest --test-dir build -L host --output-on-failure
-```
-
-### 仅运行 GPU 测试
-```bash
-ctest --test-dir build -L gpu --output-on-failure
-```
-
-## CI 配置说明
-
-### 工作流文件
-
-| 文件 | 触发条件 | 功能 |
-|------|----------|------|
-| `.github/workflows/ci.yml` | push/PR 到 master | 编译、格式检查、测试发现 |
-| `.github/workflows/pages.yml` | push 到 master | 部署 GitHub Pages |
-
-### CI 限制
-
-由于 GitHub-hosted runners 没有 GPU，CI 仅执行：
-1. CUDA 编译验证
-2. 代码格式检查（clang-format 18）
-3. 测试发现（`ctest -N`）
-4. 文档结构检查
-
-**完整 GPU 测试需在本地或 self-hosted GPU runner 执行。**
-
-## 开发流程
-
-### 1. 创建功能分支
-```bash
-git checkout -b feature/your-feature master
-```
-
-### 2. 开发与测试
-```bash
-# 编译
-cmake --build --preset default
-
-# 运行测试（如有 GPU）
-ctest --preset default --output-on-failure
-
-# 格式检查
-clang-format --style=file -i src/*.cu include/*.h tests/*.cu
-```
-
-### 3. 提交变更
-```bash
-git add -A
-git commit -m "feat: your feature description"
-```
-
-### 4. 合并到主线
-```bash
-git checkout master
-git merge feature/your-feature
-git push origin master
-git branch -d feature/your-feature
-```
-
-## 测试分类
-
-| 标签 | 文件 | 需要 GPU |
-|------|------|----------|
-| `host` | test_config.cpp, test_logger.cpp, test_quantization.cpp | 否 |
-| `gpu` | test_gemm.cu, test_fusion.cu, test_tensor.cpp 等 | 是 |
-
-无 GPU 环境时，GPU 测试会自动跳过（使用 `MINI_INFERENCE_REQUIRE_CUDA_DEVICE()` 宏）。
-
-## 代码风格
-
-### clang-format 配置
-
-项目使用 `.clang-format` 配置文件，基于 Google Style：
-- 4 空格缩进
-- 100 字符行宽
-- 指针靠左对齐
-
-### 格式化命令
-```bash
-# 格式化单个文件
-clang-format --style=file -i path/to/file.cpp
-
-# 格式化所有源文件
-find src include tests benchmarks -name '*.cpp' -o -name '*.cu' -o -name '*.h' -o -name '*.cuh' | \
-xargs clang-format --style=file -i
-```
-
-## 常见问题
-
-### Q: 编译时找不到 CUDA？
-
-确保 `CUDA_PATH` 环境变量已设置：
-```bash
-export CUDA_PATH=/usr/local/cuda
-export PATH=$CUDA_PATH/bin:$PATH
-```
-
-### Q: 测试编译失败 "test_cuda_utils.h not found"？
-
-确保 `tests/test_cuda_utils.h` 存在。该文件提供 GPU 测试跳过宏。
-
-### Q: GPU 测试被跳过？
-
-这是正常行为。`MINI_INFERENCE_REQUIRE_CUDA_DEVICE()` 宏会在无 GPU 设备时自动跳过测试。
-
-## 相关文档
-
-- [AGENTS.md](https://github.com/LessUp/mini-inference-engine/blob/master/AGENTS.md) - AI 工作流与协作规则
-- [CLAUDE.md](https://github.com/LessUp/mini-inference-engine/blob/master/CLAUDE.md) - Claude Code 入口
-- [贡献指南](en/contributing.md) - 开发规范
diff --git a/docs/README.md b/docs/README.md
index 4f964e9..703c841 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -1,18 +1,21 @@
 # Documentation
 
-This directory contains the long-form documentation for Mini-Inference Engine. The GitHub Pages portal starts at [../index.md](https://github.com/LessUp/mini-inference-engine/blob/master/index.md); this file is only a compact navigation index.
+This directory is the single source for the GitHub Pages site. Keep it focused on user-facing guides and architecture notes; release history lives only in the repository root `CHANGELOG.md`.
 
-## Core guides
+## Primary entry points
 
 | Topic | English | 中文 |
 | --- | --- | --- |
-| Quick Start | [en/QUICK_START.md](en/quick-start.md) | [zh/QUICK_START.md](zh/quick-start.md) |
-| Architecture | [en/ARCHITECTURE.md](en/architecture.md) | [zh/ARCHITECTURE.md](zh/architecture.md) |
-| GEMM Optimization | [en/GEMM_OPTIMIZATION.md](en/deep-dive/gemm-optimization.md) | [zh/GEMM_OPTIMIZATION.md](zh/deep-dive/gemm-optimization.md) |
-| Performance Tuning | [en/PERFORMANCE_TUNING.md](en/performance-tuning.md) | [zh/PERFORMANCE_TUNING.md](zh/performance-tuning.md) |
-| API Reference | [en/API_REFERENCE.md](en/api-reference.md) | [zh/API_REFERENCE.md](zh/api-reference.md) |
-| Development Guide | [en/CONTRIBUTING.md](en/contributing.md) | [zh/CONTRIBUTING.md](zh/contributing.md) |
+| Home | [en/index.md](en/index.md) | [zh/index.md](zh/index.md) |
+| Getting started | [en/guides/getting-started.md](en/guides/getting-started.md) | [zh/guides/getting-started.md](zh/guides/getting-started.md) |
+| Architecture | [en/architecture.md](en/architecture.md) | [zh/architecture.md](zh/architecture.md) |
+| GEMM deep dive | [en/deep-dive/gemm-optimization.md](en/deep-dive/gemm-optimization.md) | [zh/deep-dive/gemm-optimization.md](zh/deep-dive/gemm-optimization.md) |
+| Performance tuning | [en/performance-tuning.md](en/performance-tuning.md) | [zh/performance-tuning.md](zh/performance-tuning.md) |
+| API reference | [en/api-reference.md](en/api-reference.md) | [zh/api-reference.md](zh/api-reference.md) |
+| Contributing | [en/contributing.md](en/contributing.md) | [zh/contributing.md](zh/contributing.md) |
 
-## Release notes
+## Documentation rules
 
-Detailed release notes live in [`releases/`](https://github.com/LessUp/mini-inference-engine/tree/master/docs/releases). `CHANGELOG.md` at the repository root remains the structured change log.
+- Keep pages aligned with the repository as it exists today; do not document removed workflow layers.
+- Prefer one canonical page per topic; avoid parallel “guide” and “quick start” copies for the same purpose.
+- Do not add changelog copies or release-note directories under `docs/`.
diff --git a/docs/agents/domain.md b/docs/agents/domain.md
deleted file mode 100644
index f1143cc..0000000
--- a/docs/agents/domain.md
+++ /dev/null
@@ -1,37 +0,0 @@
-# Domain Docs
-
-How the engineering skills should consume this repo's domain documentation when exploring the codebase.
-
-## Before exploring, read these
-
-- **`AGENTS.md`** at the repo root — the primary domain source for this project. It defines project positioning, core assets, workflow, code conventions, and CUDA-specific guidance.
-- **`CONTEXT.md`** at the repo root (if it exists) — a distilled domain glossary for quick reference.
-- **`docs/adr/`** — read ADRs that touch the area you're about to work in.
-
-If `CONTEXT.md` or `docs/adr/` don't exist, **proceed silently**. Don't flag their absence; don't suggest creating them upfront. The producer skill (`/grill-with-docs`) creates them lazily when terms or decisions actually get resolved.
-
-## File structure
-
-Single-context repo:
-
-```
-/
-├── AGENTS.md              ← primary domain source (this project)
-├── CONTEXT.md             ← optional distilled glossary
-├── docs/adr/              ← architectural decision records
-│   ├── 0001-*.md
-│   └── ...
-└── src/
-```
-
-## Use the glossary's vocabulary
-
-When your output names a domain concept (in an issue title, a refactor proposal, a hypothesis, a test name), use the term as defined in `AGENTS.md` or `CONTEXT.md`. Don't drift to synonyms the glossary explicitly avoids.
-
-If the concept you need isn't documented yet, that's a signal — either you're inventing language the project doesn't use (reconsider) or there's a real gap (note it for `/grill-with-docs`).
-
-## Flag ADR conflicts
-
-If your output contradicts an existing ADR, surface it explicitly rather than silently overriding:
-
-> _Contradicts ADR-0007 (...) — but worth reopening because…_
diff --git a/docs/agents/issue-tracker.md b/docs/agents/issue-tracker.md
deleted file mode 100644
index cce77ec..0000000
--- a/docs/agents/issue-tracker.md
+++ /dev/null
@@ -1,22 +0,0 @@
-# Issue tracker: GitHub
-
-Issues and PRDs for this repo live as GitHub issues. Use the `gh` CLI for all operations.
-
-## Conventions
-
-- **Create an issue**: `gh issue create --title "..." --body "..."`. Use a heredoc for multi-line bodies.
-- **Read an issue**: `gh issue view <number> --comments`, filtering comments by `jq` and also fetching labels.
-- **List issues**: `gh issue list --state open --json number,title,body,labels,comments --jq '[.[] | {number, title, body, labels: [.labels[].name], comments: [.comments[].body]}]'` with appropriate `--label` and `--state` filters.
-- **Comment on an issue**: `gh issue comment <number> --body "..."`
-- **Apply / remove labels**: `gh issue edit <number> --add-label "..."` / `--remove-label "..."`
-- **Close**: `gh issue close <number> --comment "..."`
-
-Infer the repo from `git remote -v` — `gh` does this automatically when run inside a clone.
-
-## When a skill says "publish to the issue tracker"
-
-Create a GitHub issue.
-
-## When a skill says "fetch the relevant ticket"
-
-Run `gh issue view <number> --comments`.
diff --git a/docs/agents/triage-labels.md b/docs/agents/triage-labels.md
deleted file mode 100644
index b716855..0000000
--- a/docs/agents/triage-labels.md
+++ /dev/null
@@ -1,15 +0,0 @@
-# Triage Labels
-
-The skills speak in terms of five canonical triage roles. This file maps those roles to the actual label strings used in this repo's issue tracker.
-
-| Label in mattpocock/skills | Label in our tracker | Meaning                                  |
-| -------------------------- | -------------------- | ---------------------------------------- |
-| `needs-triage`             | `needs-triage`       | Maintainer needs to evaluate this issue  |
-| `needs-info`               | `needs-info`         | Waiting on reporter for more information |
-| `ready-for-agent`          | `ready-for-agent`    | Fully specified, ready for an AFK agent  |
-| `ready-for-human`          | `ready-for-human`    | Requires human implementation            |
-| `wontfix`                  | `wontfix`            | Will not be actioned                     |
-
-When a skill mentions a role (e.g. "apply the AFK-ready triage label"), use the corresponding label string from this table.
-
-Edit the right-hand column to match whatever vocabulary you actually use.
diff --git a/docs/en/comparison/industry-analysis.md b/docs/en/comparison/industry-analysis.md
index 6eb502e..6f4fe3c 100644
--- a/docs/en/comparison/industry-analysis.md
+++ b/docs/en/comparison/industry-analysis.md
@@ -210,7 +210,7 @@ This project's optimization techniques come from these academic papers:
 1. **Progressive learning**: From Naive to ~85% cuBLAS, every step verifiable
 2. **Complete engineering**: Not isolated kernels, but complete inference engine skeleton
 3. **Bilingual documentation**: Full Chinese and English docs, suitable for Chinese learners
-4. **OpenSpec driven**: Specification-based development, AI-collaboration friendly
+4. **Focused repository**: Fewer workflow layers, easier to build, read, and maintain
 
 ### Recommended Learning Path
 
diff --git a/docs/en/contributing.md b/docs/en/contributing.md
index 9bdc31b..44c3b5c 100644
--- a/docs/en/contributing.md
+++ b/docs/en/contributing.md
@@ -1,320 +1,52 @@
----
-title: Contributing
----
+# Contributing
 
+Keep contributions narrow, buildable, and easy to review. This project is intentionally small: CUDA GEMM kernels, a lightweight runtime, benchmarks, tests, and bilingual docs.
 
-> **Language**: English | [简体中文](../zh/contributing.md)
+## What belongs here
 
----
+- CUDA kernel work that improves correctness, clarity, or measurable performance.
+- Runtime changes around `Tensor`, `InferenceEngine`, memory management, or profiling.
+- Build, test, benchmark, and documentation fixes tied to the real project layout.
 
-> **Project status:** The core CUDA GEMM path and mini inference runtime are stable.
-> Changes should stay focused, spec-backed, and reviewable through the single `master`
-> trunk workflow described below.
+## What does not belong here
 
----
+- New AI-control frameworks, generated governance layers, or workflow meta-systems.
+- Duplicate docs for the same topic.
+- Unused branches of implementation kept “just in case”.
 
-Thank you for your interest in Mini-Inference Engine! This document describes how to contribute to the project.
-
-## Table of Contents
-
-- [Code of Conduct](#code-of-conduct)
-- [How to Contribute](#how-to-contribute)
-- [Development Setup](#development-setup)
-- [Code Style](#code-style)
-- [Testing](#testing)
-- [Documentation](#documentation)
-- [Submission Process](#submission-process)
-
----
-
-## Code of Conduct
-
-- Respect all contributors
-- Maintain professional and constructive discussions
-- Accept constructive criticism
-- Act in the best interest of the project
-
----
-
-## How to Contribute
-
-### Reporting Bugs
-
-1. Check [Issues](https://github.com/LessUp/mini-inference-engine/issues) for existing reports
-2. Create a new issue including:
-   - **Description**: Clear problem description
-   - **Reproduction steps**: How to reproduce
-   - **Expected behavior**: What should happen
-   - **Actual behavior**: What actually happened
-   - **Environment**: GPU model, CUDA version, OS
-
-### Feature Requests
-
-1. Describe the feature need and use case
-2. Explain why this feature is valuable
-3. Provide possible implementation approach (optional)
-
-### Submitting Code
-
-#### 1. Fork the Repository
+## Development loop
 
 ```bash
-git clone https://github.com/<your-username>/mini-inference-engine.git
-cd mini-inference-engine
-git remote add upstream https://github.com/LessUp/mini-inference-engine.git
-```
-
-#### 2. Create a short-lived branch or worktree
+cmake --preset gcc-cuda
+cmake --build --preset gcc-cuda
+ctest --preset gcc-cuda
 
-```bash
-# Keep master as the only long-lived branch.
-git checkout -b chore/focused-change
+cmake --preset release-gcc-cuda
+cmake --build --preset release-gcc-cuda
 ```
 
-For larger work, prefer an isolated worktree and delete it after the change is merged.
-
-#### 3. Write Code
+GPU-backed tests may skip without an available NVIDIA device, but the project still expects the CUDA Toolkit for configuration and compilation.
 
-Follow [Code Style](#code-style).
+## Coding rules
 
-#### 4. Test
+- Use `.clang-format`.
+- Functions and variables use `snake_case`; classes use `PascalCase`; constants and template parameters use `UPPER_SNAKE_CASE`.
+- Wrap CUDA API calls with `CUDA_CHECK()` and cuBLAS calls with `CUBLAS_CHECK()`.
+- Prefer `DeviceMemory` or `PooledMemory` over manual `cudaMalloc` / `cudaFree` lifetimes.
+- Add new source files explicitly to `CMakeLists.txt`.
 
-```bash
-# Debug + tests
-cmake --preset default
-cmake --build --preset default
-ctest --preset default
+## Documentation rules
 
-# Release build
-cmake --preset release
-cmake --build --preset release
-```
-
-#### 5. Commit
-
-```bash
-git add .
-git commit -m "feat: add new feature"
-```
+- Keep GitHub Pages content under `docs/`.
+- Keep release history only in the root `CHANGELOG.md`.
+- Update docs when behavior, commands, or repository layout changes.
 
-Commit message format follows [Conventional Commits](https://www.conventionalcommits.org/):
+## Submission
 
-| Type | Description |
-|:---|:---|
-| `feat:` | New feature |
-| `fix:` | Bug fix |
-| `docs:` | Documentation update |
-| `perf:` | Performance improvement |
-| `refactor:` | Code refactoring |
-| `test:` | Test-related |
-| `chore:` | Build/tool changes |
-
-#### 6. Review, push, and merge back to master
-
-```bash
-git push origin chore/focused-change
-```
-
-Create a Pull Request, request review, merge to `master`, then delete the short-lived branch.
-
----
-
-## Development Setup
-
-### Requirements
-
-| Dependency | Minimum | Recommended |
-|:---|:---|:---|
-| CUDA Toolkit | 11.0 | 12.0+ |
-| CMake | 3.18 | 3.25+ |
-| C++ Compiler | GCC 9 / Clang 10 | GCC 11+ |
-| GPU | Compute Capability 7.0+ | 8.0+ |
-
-### Verify Environment
-
-```bash
-# Check CUDA
-nvcc --version
-nvidia-smi
-
-# Check CMake
-cmake --version
-
-# Check Compiler
-gcc --version
-```
-
----
-
-## Code Style
-
-### C++ Style
-
-```cpp
-// Naming conventions
-class ClassName;           // Class: PascalCase
-void function_name();      // Function: snake_case
-int variable_name;         // Variable: snake_case
-const int CONSTANT_NAME;   // Constant: UPPER_SNAKE_CASE
-int member_variable_;      // Member: snake_case + suffix
-
-// Indentation: 4 spaces
-void function() {
-    if (condition) {
-        // code
-    }
-}
-
-// Braces: same line
-if (condition) {
-    // code
-} else {
-    // code
-}
-```
+Use a short-lived branch or worktree, keep the diff focused, and submit a pull request against `master`.
 
-### CUDA Style
+Useful references:
 
-```cpp
-// Kernel naming: snake_case
-__global__ void my_kernel(...);
-
-// Template parameters: UPPER_SNAKE_CASE
-template<int BLOCK_SIZE, bool USE_SHARED>
-__global__ void templated_kernel(...);
-
-// Shared memory: s_ prefix
-__shared__ float s_data[256];
-
-// Register variables: r_ prefix
-float r_sum = 0.0f;
-```
-
-### Code Formatting
-
-Use clang-format:
-
-```bash
-clang-format --style=file -i <file>
-```
-
----
-
-## Testing
-
-### Unit Tests
-
-Every new feature needs tests:
-
-```cpp
-#include <gtest/gtest.h>
-#include "feature.h"
-
-class FeatureTest : public ::testing::Test {
-protected:
-    void SetUp() override {
-        CUDA_CHECK(cudaSetDevice(0));
-    }
-};
-
-TEST_F(FeatureTest, BasicFunctionality) {
-    EXPECT_EQ(expected, actual);
-}
-```
-
-### Performance Tests
-
-For performance-related changes:
-
-```cpp
-TEST_F(FeatureTest, Performance) {
-    GpuTimer timer;
-    
-    // Warmup
-    for (int i = 0; i < 5; i++) {
-        function_under_test();
-    }
-    
-    // Benchmark
-    timer.start();
-    for (int i = 0; i < 20; i++) {
-        function_under_test();
-    }
-    timer.stop();
-    
-    float avg_time = timer.elapsed_ms() / 20;
-    printf("Average time: %.3f ms\n", avg_time);
-}
-```
-
----
-
-## Documentation
-
-### Code Comments
-
-```cpp
-/// @brief Execute optimized GEMM operation
-/// @param A Input matrix A (M x K)
-/// @param B Input matrix B (K x N)
-/// @param C Output matrix C (M x N)
-/// @param M Rows of A
-/// @param N Columns of B
-/// @param K Columns of A / Rows of B
-/// @param stream CUDA stream (optional)
-void launch_optimized_gemm(const float* A, const float* B, float* C,
-                           int M, int N, int K, cudaStream_t stream = 0);
-```
-
-### README Updates
-
-When adding new features, update README.md:
-- Feature list
-- Usage examples
-- API documentation
-
----
-
-## Submission Process
-
-### Review Process
-
-1. **Automatic checks**
-   - Compilation passes
-   - Tests pass
-   - Code style check
-
-2. **Manual review**
-   - Code quality
-   - Design rationale
-   - Documentation completeness
-
-3. **Performance validation** (if applicable)
-   - No performance regression
-   - New optimizations are effective
-
-### Merge Requirements
-
-- All CI checks pass
-- At least 1 reviewer approval
-- No unresolved review comments
-
----
-
-## Contact
-
-- **Issues**: [GitHub Issues](https://github.com/LessUp/mini-inference-engine/issues)
-- **Discussions**: [GitHub Discussions](https://github.com/LessUp/mini-inference-engine/discussions)
-
----
-
-## Related Links
-
-- [简体中文](../zh/contributing.md)
-- [Quick Start](quick-start.md)
-- [Architecture Design](architecture.md)
+- [Getting Started](guides/getting-started.md)
+- [Architecture](architecture.md)
 - [API Reference](api-reference.md)
-
----
-
-*Last Updated: 2025-04-16 | Document Version: v1.1.0*
diff --git a/docs/en/guides/getting-started.md b/docs/en/guides/getting-started.md
index 2c7a0b1..f0663cd 100644
--- a/docs/en/guides/getting-started.md
+++ b/docs/en/guides/getting-started.md
@@ -26,20 +26,22 @@ cd mini-inference-engine
 
 ## Step 2: Debug Build + Tests
 
+Use the system GCC 12 / G++ 12 preset when your shell has Conda or another custom C++ toolchain active.
+
 ```bash
-# Configure Debug build
-cmake --preset default
+# Configure Debug build with system GCC 12 / G++ 12
+cmake --preset gcc-cuda
 
 # Build
-cmake --build --preset default
+cmake --build --preset gcc-cuda
 
 # Run tests
-ctest --preset default --output-on-failure
+ctest --preset gcc-cuda
 ```
 
 **Expected Output:**
 ```
-Test project /path/to/mini-inference-engine/build-default
+Test project /path/to/mini-inference-engine/build-gcc-cuda
     Start 1: test_config
 1/8 Test #1: test_config .....................   Passed    0.01 sec
     Start 2: test_logger
@@ -55,14 +57,14 @@ Test project /path/to/mini-inference-engine/build-default
 ## Step 3: Release Build + Benchmark
 
 ```bash
-# Configure Release build
-cmake --preset release
+# Configure Release build with system GCC 12 / G++ 12
+cmake --preset release-gcc-cuda
 
 # Build
-cmake --build --preset release
+cmake --build --preset release-gcc-cuda
 
 # Run benchmark
-./build-release/benchmark
+./build-release-gcc-cuda/benchmark
 ```
 
 **Expected Output:**
@@ -114,7 +116,7 @@ If no NVIDIA GPU is available, GPU tests will automatically skip. This is expect
 
 1. Check CUDA version compatibility
 2. Check C++ compiler supports C++17
-3. Check CMake version >= 3.18
+3. If Conda is active, prefer `cmake --preset gcc-cuda`
 
 ---
 
diff --git a/docs/en/index.md b/docs/en/index.md
index 92392d7..d2db6f2 100644
--- a/docs/en/index.md
+++ b/docs/en/index.md
@@ -82,8 +82,8 @@ features:
     <div class="perf-row">
       <span class="perf-label">L7 Vectorized</span>
       <div class="perf-bar-bg">
-        <div class="perf-bar perf-bar-animated" style="--target-width: 89%; background: var(--perf-vectorized);">
-          <span class="perf-value">89%</span>
+        <div class="perf-bar perf-bar-animated" style="--target-width: 85%; background: var(--perf-vectorized);">
+          <span class="perf-value">85%</span>
         </div>
       </div>
     </div>
@@ -99,8 +99,8 @@ features:
   <div class="quick-start-title">Requirements</div>
   <div class="quick-start-content">
     <ul>
-      <li>CUDA Toolkit 11.0+</li>
-      <li>CMake 3.20+</li>
+      <li>CUDA Toolkit 12.x</li>
+      <li>CMake 3.18+</li>
       <li>C++17 compatible compiler</li>
       <li>NVIDIA GPU (Compute Capability 7.0+)</li>
     </ul>
diff --git a/docs/en/quick-start.md b/docs/en/quick-start.md
deleted file mode 100644
index f5c7606..0000000
--- a/docs/en/quick-start.md
+++ /dev/null
@@ -1,399 +0,0 @@
----
-title: Quick Start
----
-
-
-> **Language**: English | [简体中文](../zh/quick-start.md)
-
----
-
-## Table of Contents
-
-
-<details open markdown="block">
-  <summary>
-    Sections
-  </summary>
-  {: .text-delta }
-  - TOC
-  {:toc}
-</details>
-
----
-
-## System Requirements
-
-### Hardware Requirements
-
-| Component | Minimum | Recommended |
-|:---|:---|:---|
-| GPU | NVIDIA GPU, Compute Capability 7.0+ | RTX 30 series or higher |
-| VRAM | 4 GB | 8 GB+ |
-| System Memory | 8 GB | 16 GB+ |
-| Operating System | Linux / Windows / macOS | Ubuntu 22.04 LTS |
-
-### Software Requirements
-
-| Dependency | Minimum | Recommended |
-|:---|:---|:---|
-| CUDA Toolkit | 11.0 | 12.0+ |
-| CMake | 3.18 | 3.25+ |
-| GCC | 9.0 | 11.0+ |
-| Python | 3.8+ | 3.10+ |
-
-### Verify Environment
-
-```bash
-# Check CUDA version
-nvcc --version
-
-# Check GPU info
-nvidia-smi
-
-# Check CMake version
-cmake --version
-
-# Check GCC version
-gcc --version
-```
-
----
-
-## Quick Start
-
-### 1. Clone the Repository
-
-```bash
-git clone https://github.com/LessUp/mini-inference-engine.git
-cd mini-inference-engine
-```
-
-### 2. Build the Project (Recommended)
-
-This project uses CMake Presets to simplify the build process:
-
-```bash
-# Debug build (includes tests, enables assertions)
-cmake --preset default
-cmake --build --preset default
-
-# Release build (optimized performance)
-cmake --preset release
-cmake --build --preset release
-```
-
-### 3. Verify Installation
-
-```bash
-# Run unit tests
-ctest --preset default
-
-# Run performance benchmarks
-./build-release/benchmark
-
-# Run MNIST demo (optional)
-./build-release/mnist_demo
-```
-
----
-
-## Build the Project
-
-### Using CMake Presets (Recommended)
-
-| Preset | Purpose | Configuration |
-|:---|:---|:---|
-| `default` | Development & debugging | Debug mode, enables tests |
-| `release` | Performance testing | Release mode, O3 optimization |
-| `ci` | Continuous integration | Strict warnings, test coverage |
-
-```bash
-# List available presets
-cmake --list-presets
-
-# Use specific preset
-cmake --preset <preset-name>
-cmake --build --preset <preset-name>
-```
-
-### Manual Build
-
-```bash
-mkdir build && cd build
-
-# Configure
-cmake .. -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTS=ON
-
-# Compile (using all available cores)
-make -j$(nproc)
-
-# Run tests
-ctest --output-on-failure
-```
-
-### Build Options
-
-| Option | Description | Default |
-|:---|:---|:---|
-| `BUILD_TESTS` | Build unit tests | `ON` |
-| `BUILD_BENCHMARKS` | Build benchmarks | `ON` |
-| `BUILD_MNIST_DEMO` | Build MNIST demo | `ON` |
-| `CMAKE_CUDA_ARCHITECTURES` | GPU architecture | Native architecture |
-
-```bash
-cmake .. -DBUILD_TESTS=ON -DBUILD_BENCHMARKS=ON
-```
-
----
-
-## Run Tests
-
-### Run All Tests
-
-```bash
-ctest --preset default
-```
-
-### Run Specific Tests
-
-```bash
-# Run GEMM-related tests
-./build/tests_gpu --gtest_filter="GemmTest*"
-
-# Run Tensor tests
-./build/tests_gpu --gtest_filter="TensorTest*"
-
-# Run specific test case
-./build/tests_gpu --gtest_filter="GemmTest.NaiveMatMulCorrectness"
-```
-
-### Test Coverage
-
-```bash
-# Generate coverage report (requires gcov/lcov)
-cmake --preset ci
-cmake --build --preset ci
-ctest --preset ci
-```
-
----
-
-## Your First Program
-
-### Basic GEMM Example
-
-Create file `first_gemm.cpp`:
-
-```cpp
-#include "common.h"
-#include "kernels.cuh"
-#include <iostream>
-#include <vector>
-
-int main() {
-    // Set GPU device
-    CUDA_CHECK(cudaSetDevice(0));
-    
-    // Define matrix dimensions
-    const int M = 1024, N = 1024, K = 1024;
-    
-    // Allocate GPU memory
-    DeviceMemory d_A(M * K * sizeof(float));
-    DeviceMemory d_B(K * N * sizeof(float));
-    DeviceMemory d_C(M * N * sizeof(float));
-    
-    // Create and initialize host data
-    std::vector<float> h_A(M * K), h_B(K * N);
-    random_init(h_A.data(), h_A.size());
-    random_init(h_B.data(), h_B.size());
-    
-    // Copy to GPU
-    d_A.copy_from_host(h_A.data(), M * K * sizeof(float));
-    d_B.copy_from_host(h_B.data(), K * N * sizeof(float));
-    
-    // Execute optimized GEMM
-    launch_optimized_gemm(d_A.get(), d_B.get(), d_C.get(), M, N, K);
-    
-    // Synchronize
-    CUDA_CHECK(cudaDeviceSynchronize());
-    
-    // Get results
-    std::vector<float> h_C(M * N);
-    d_C.copy_to_host(h_C.data(), M * N * sizeof(float));
-    
-    std::cout << "✓ GEMM completed! C[0] = " << h_C[0] << std::endl;
-    
-    return 0;
-}
-```
-
-### Compile and Run
-
-```bash
-# Add file to CMakeLists.txt as executable target
-# Or compile manually:
-nvcc -o first_gemm first_gemm.cpp \
-    -I./include -L./build -lmini_inference \
-    -lcudart -lcublas -std=c++17
-
-./first_gemm
-```
-
-### Verify Correctness
-
-```cpp
-#include "common.h"
-
-// Add verification code
-std::vector<float> h_C_ref(M * N);
-cpu_matmul(h_A.data(), h_B.data(), h_C_ref.data(), M, N, K);
-
-float max_error = compare_matrices(h_C.data(), h_C_ref.data(), M * N);
-std::cout << "Max error: " << max_error << std::endl;
-// Should be < 1e-4
-```
-
----
-
-## MNIST Demo
-
-MNIST demo shows how to use the inference engine for handwritten digit recognition.
-
-### Prepare Weights File
-
-```bash
-# Use Python script to export weights
-cd scripts
-python export_mnist_weights.py --output ../weights/mnist_model.bin
-```
-
-### Run Demo
-
-```bash
-./build-release/mnist_demo --weights weights/mnist_model.bin
-```
-
-### Expected Output
-
-```
-Loading weights from: weights/mnist_model.bin
-Network Info:
-  Layers: 3
-  Input dim: 784
-  Output dim: 10
-
-Running inference on batch of 32 samples...
-Sample 0: Predicted digit 7, Confidence 92.3%
-Sample 1: Predicted digit 2, Confidence 88.7%
-...
-Average inference time: 0.45 ms
-```
-
----
-
-## Performance Benchmarking
-
-### Run Benchmarks
-
-```bash
-# Run full benchmark
-./build-release/benchmark
-
-# Specify matrix size
-./build-release/benchmark --m 2048 --n 2048 --k 2048
-
-# Specify kernel type
-./build-release/benchmark --kernel optimized
-```
-
-### Expected Performance (RTX 3080, 1024×1024×1024)
-
-| Kernel | Time (ms) | GFLOPS | vs cuBLAS |
-|:---|---:|---:|---:|
-| cuBLAS | 0.31 | 6920 | 100% |
-| Naive | 3.10 | 694 | 10% |
-| Tiled | 1.55 | 1388 | 20% |
-| Coalesced | 1.03 | 2088 | 30% |
-| Double Buffer | 0.78 | 2768 | 40% |
-| Optimized | 0.44 | 4870 | 70% |
-| Fused | 0.38 | 5630 | 81% |
-| Vectorized | 0.35 | 6130 | 85% |
-
----
-
-## Troubleshooting
-
-### Compile Error "Unsupported gpu architecture"
-
-**Solution**: Modify GPU architecture setting in `CMakeLists.txt`:
-
-```cmake
-# Check GPU architecture
-nvidia-smi --query-gpu=compute_cap --format=csv
-
-# Set corresponding architecture
-set(CMAKE_CUDA_ARCHITECTURES 86)  # RTX 30 series
-set(CMAKE_CUDA_ARCHITECTURES 89)  # RTX 40 series
-```
-
-### Runtime Error "CUDA out of memory"
-
-**Solution**:
-
-```cpp
-// 1. Reduce matrix size
-const int M = 512, N = 512, K = 512;
-
-// 2. Clear memory pool cache
-MemoryPool::instance().clear_cache();
-
-// 3. Check GPU memory usage
-nvidia-smi
-```
-
-### Lower Than Expected Performance
-
-**Checklist**:
-
-- [ ] GPU power state is P0:
-  ```bash
-  nvidia-smi -q -d PERFORMANCE | grep "Performance State"
-  ```
-- [ ] No other programs using GPU
-- [ ] Built in Release mode
-- [ ] Matrix size is power of 2 (aligned)
-
-### Test Failures
-
-```bash
-# Run single test for detailed error
-./build/tests_gpu --gtest_filter="GemmTest.NaiveMatMulCorrectness" --gtest_also_run_disabled_tests
-
-# Use CUDA memory checker
-cuda-memcheck ./build/tests_gpu --gtest_filter="GemmTest*"
-```
-
----
-
-## Next Steps
-
-Congratulations! You've completed the quick start. Next you can:
-
-1. 📖 Read [Architecture Design](architecture.md) to understand system principles
-2. ⚡ Study [GEMM Optimization Guide](deep-dive/gemm-optimization.md) to master optimization techniques
-3. 🔧 Check [API Reference](api-reference.md) for complete interface documentation
-4. 📊 Read [Performance Tuning Guide](performance-tuning.md) for advanced optimization
-
----
-
-## Related Links
-
-- [简体中文](../zh/quick-start.md)
-- [API Reference](api-reference.md)
-- [Architecture Design](architecture.md)
-- [GEMM Optimization Guide](deep-dive/gemm-optimization.md)
-- [GitHub Issues](https://github.com/LessUp/mini-inference-engine/issues)
-
----
-
-*Last Updated: 2025-04-16 | Document Version: v1.1.0*
diff --git a/docs/package.json b/docs/package.json
index 507277b..ae03511 100644
--- a/docs/package.json
+++ b/docs/package.json
@@ -9,10 +9,5 @@
   },
   "devDependencies": {
     "vitepress": "^1.5.0"
-  },
-  "dependencies": {
-    "mermaid": "^11.12.2",
-    "vitepress-plugin-llms": "^1.10.0",
-    "vitepress-plugin-mermaid": "^2.0.17"
   }
-}
\ No newline at end of file
+}
diff --git a/docs/releases/README.md b/docs/releases/README.md
deleted file mode 100644
index cf9490d..0000000
--- a/docs/releases/README.md
+++ /dev/null
@@ -1,10 +0,0 @@
-# Release Notes Index
-
-`CHANGELOG.md` is the structured source of release history. This directory keeps human-readable release notes for published milestones.
-
-| Version | Date | Summary |
-| --- | --- | --- |
-| [v1.1.0](v1.1.0.md) | 2025-04-21 | Bilingual documentation structure and site polish. |
-| [v1.0.0](v1.0.0.md) | 2025-04-16 | Stable CUDA GEMM optimization path and mini inference runtime. |
-| [v0.2.0](v0.2.0.md) | 2025-03-15 | Advanced kernels and runtime infrastructure. |
-| [v0.1.0](v0.1.0.md) | 2025-01-01 | Initial GEMM kernels and project skeleton. |
diff --git a/docs/releases/v0.1.0.md b/docs/releases/v0.1.0.md
deleted file mode 100644
index ea0bcdb..0000000
--- a/docs/releases/v0.1.0.md
+++ /dev/null
@@ -1,7 +0,0 @@
----
-title: "v0.1.0 Release Notes"
----
-
-## 🔗 Links
-
-- [v1.1.0](v1.1.0.md) | [v1.0.0](v1.0.0.md) | [v0.2.0](v0.2.0.md)
diff --git a/docs/releases/v0.2.0.md b/docs/releases/v0.2.0.md
deleted file mode 100644
index b36e207..0000000
--- a/docs/releases/v0.2.0.md
+++ /dev/null
@@ -1,7 +0,0 @@
----
-title: "v0.2.0 Release Notes"
----
-
-## 🔗 Links
-
-- [v1.1.0](v1.1.0.md) | [v1.0.0](v1.0.0.md) | [v0.1.0](v0.1.0.md)
diff --git a/docs/releases/v1.0.0.md b/docs/releases/v1.0.0.md
deleted file mode 100644
index 0d37fe6..0000000
--- a/docs/releases/v1.0.0.md
+++ /dev/null
@@ -1,5 +0,0 @@
----
-title: "v1.0.0 Release Notes"
----
-
-*This release was published on April 16, 2025*
diff --git a/docs/releases/v1.1.0.md b/docs/releases/v1.1.0.md
deleted file mode 100644
index b35175a..0000000
--- a/docs/releases/v1.1.0.md
+++ /dev/null
@@ -1,5 +0,0 @@
----
-title: "v1.1.0 Release Notes"
----
-
-*This release was published on April 16, 2024*
diff --git a/docs/zh/comparison/industry-analysis.md b/docs/zh/comparison/industry-analysis.md
index 3627e27..3e77f27 100644
--- a/docs/zh/comparison/industry-analysis.md
+++ b/docs/zh/comparison/industry-analysis.md
@@ -210,7 +210,7 @@ TensorRT-LLM 是 NVIDIA 官方的 LLM 推理优化库：
 1. **渐进式学习**：从 Naive 到 ~85% cuBLAS，每一步都可验证
 2. **完整工程**：不是孤立 kernel，而是完整的推理引擎骨架
 3. **双语文档**：中英文档齐全，适合中文学习者
-4. **OpenSpec 驱动**：规格化开发，便于 AI 协作
+4. **聚焦型仓库**：流程层更少，更容易构建、阅读和维护
 
 ### 推荐学习路径
 
diff --git a/docs/zh/contributing.md b/docs/zh/contributing.md
index 2fc4a83..e54f4ed 100644
--- a/docs/zh/contributing.md
+++ b/docs/zh/contributing.md
@@ -1,344 +1,52 @@
----
-title: 贡献指南
----
+# 贡献指南
 
+贡献应保持聚焦、可构建、易审查。这个项目刻意维持小而完整：CUDA GEMM kernels、轻量运行时、benchmark、测试和双语文档。
 
-> **Language**: 简体中文 | [English](../en/contributing.md)
+## 适合进入仓库的改动
 
----
+- 改善正确性、可读性或可测性能的 CUDA kernel 变更。
+- 围绕 `Tensor`、`InferenceEngine`、内存管理或 profiling 的运行时改动。
+- 与真实仓库结构一致的构建、测试、benchmark 与文档修复。
 
-> **项目状态：** CUDA GEMM 主路径与迷你推理运行时已经稳定。
-> 后续变更应保持聚焦、可追溯到规格，并遵循本文描述的单 `master`
-> 主线流程完成审查与合并。
+## 不应继续引入的内容
 
----
+- 新的 AI 控制框架、元流程系统或生成式治理层。
+- 同一主题的重复文档。
+- 为了“以后可能用到”而保留的无效实现分支。
 
-感谢你对 Mini-Inference Engine 的兴趣！本文档详细说明如何为项目做出贡献。
-
-## 目录 (Table of Contents)
-
-- [行为准则](#行为准则)
-- [如何贡献](#如何贡献)
-- [开发环境设置](#开发环境设置)
-- [代码规范](#代码规范)
-- [测试要求](#测试要求)
-- [文档要求](#文档要求)
-- [提交流程](#提交流程)
-
----
-
-## 行为准则
-
-- 尊重所有贡献者
-- 保持专业和建设性的讨论
-- 接受建设性的批评
-- 以项目最佳利益为出发点
-
----
-
-## 如何贡献
-
-### 报告 Bug
-
-1. 检查 [Issues](https://github.com/LessUp/mini-inference-engine/issues) 是否已有相关问题
-2. 创建新 issue，包含以下信息：
-   - **问题描述**：清晰描述问题
-   - **复现步骤**：如何复现问题
-   - **预期行为**：应该发生什么
-   - **实际行为**：实际发生了什么
-   - **环境信息**：GPU 型号、CUDA 版本、操作系统
-
-### 提交功能请求
-
-1. 描述功能需求和使用场景
-2. 说明为什么这个功能对项目有价值
-3. 提供可能的实现方案（可选）
-
-### 提交代码
-
-#### 1. Fork 仓库
-
-```bash
-git clone https://github.com/<your-username>/mini-inference-engine.git
-cd mini-inference-engine
-git remote add upstream https://github.com/LessUp/mini-inference-engine.git
-```
-
-#### 2. 创建短期分支或 worktree
-
-```bash
-# master 是唯一长期分支
-git checkout -b chore/focused-change
-```
-
-较大的工作建议使用隔离 worktree，合并后删除临时分支和 worktree。
-
-#### 3. 编写代码
-
-遵循 [代码规范](#代码规范)。
-
-#### 4. 测试
-
-```bash
-# Debug + tests
-cmake --preset default
-cmake --build --preset default
-ctest --preset default
-
-# Release 构建
-cmake --preset release
-cmake --build --preset release
-```
-
-#### 5. 提交
-
-```bash
-git add .
-git commit -m "feat: add new feature"
-```
-
-提交信息格式遵循 [Conventional Commits](https://www.conventionalcommits.org/)：
-
-| 类型 | 说明 |
-|:---|:---|
-| `feat:` | 新功能 |
-| `fix:` | Bug 修复 |
-| `docs:` | 文档更新 |
-| `perf:` | 性能优化 |
-| `refactor:` | 代码重构 |
-| `test:` | 测试相关 |
-| `chore:` | 构建/工具变更 |
-
-#### 6. 审查、推送并合并回 master
-
-```bash
-git push origin chore/focused-change
-```
-
-随后创建 Pull Request，请求审查，合并到 `master` 后删除短期分支。
-
----
-
-## 开发环境设置
-
-### 环境要求
-
-| 依赖 | 最低版本 | 推荐版本 |
-|:---|:---|:---|
-| CUDA Toolkit | 11.0 | 12.0+ |
-| CMake | 3.18 | 3.25+ |
-| C++ 编译器 | GCC 9 / Clang 10 | GCC 11+ |
-| GPU | 计算能力 7.0+ | 8.0+ |
-
-### 验证环境
-
-```bash
-# 检查 CUDA
-nvcc --version
-nvidia-smi
-
-# 检查 CMake
-cmake --version
-
-# 检查编译器
-gcc --version
-```
-
----
-
-## 代码规范
-
-### C++ 代码风格
-
-```cpp
-// 命名规范
-class ClassName;           // 类名: PascalCase
-void function_name();      // 函数名: snake_case
-int variable_name;         // 变量名: snake_case
-const int CONSTANT_NAME;   // 常量: UPPER_SNAKE_CASE
-int member_variable_;      // 成员变量: snake_case + 下划线后缀
-
-// 缩进: 4 空格
-void function() {
-    if (condition) {
-        // code
-    }
-}
-
-// 大括号: 同行
-if (condition) {
-    // code
-} else {
-    // code
-}
-```
-
-### CUDA 代码风格
-
-```cpp
-// Kernel 命名: snake_case
-__global__ void my_kernel(...);
-
-// 模板参数: UPPER_SNAKE_CASE
-template<int BLOCK_SIZE, bool USE_SHARED>
-__global__ void templated_kernel(...);
-
-// 共享内存: 前缀 s_
-__shared__ float s_data[256];
-
-// 寄存器变量: 前缀 r_
-float r_sum = 0.0f;
-```
-
-### 文件组织
-
-```
-include/
-├── common.h          # 通用定义
-├── kernels.cuh       # Kernel 声明
-├── feature.h         # 功能模块头文件
-└── ...
-
-src/
-├── kernel_name.cu    # Kernel 实现
-├── feature.cpp       # 功能模块实现
-└── ...
-
-tests/
-├── test_feature.cpp  # 功能测试
-└── ...
-```
-
-### 代码格式化
-
-使用 clang-format 格式化代码：
+## 开发循环
 
 ```bash
-clang-format --style=file -i <file>
-```
-
----
+cmake --preset gcc-cuda
+cmake --build --preset gcc-cuda
+ctest --preset gcc-cuda
 
-## 测试要求
-
-### 单元测试
-
-每个新功能都需要测试：
-
-```cpp
-#include <gtest/gtest.h>
-#include "feature.h"
-
-class FeatureTest : public ::testing::Test {
-protected:
-    void SetUp() override {
-        CUDA_CHECK(cudaSetDevice(0));
-    }
-};
-
-TEST_F(FeatureTest, BasicFunctionality) {
-    // 测试基本功能
-    EXPECT_EQ(expected, actual);
-}
-
-TEST_F(FeatureTest, EdgeCases) {
-    // 测试边界情况
-}
+cmake --preset release-gcc-cuda
+cmake --build --preset release-gcc-cuda
 ```
 
-### 性能测试
-
-对于性能相关的更改：
-
-```cpp
-TEST_F(FeatureTest, Performance) {
-    GpuTimer timer;
-    
-    // Warmup
-    for (int i = 0; i < 5; i++) {
-        function_under_test();
-    }
-    
-    // Benchmark
-    timer.start();
-    for (int i = 0; i < 20; i++) {
-        function_under_test();
-    }
-    timer.stop();
-    
-    float avg_time = timer.elapsed_ms() / 20;
-    printf("Average time: %.3f ms\n", avg_time);
-}
-```
-
----
-
-## 文档要求
-
-### 代码注释
+没有可用 NVIDIA GPU 时，GPU 测试可以跳过；但项目配置和编译仍然需要 CUDA Toolkit。
 
-```cpp
-/// @brief 执行优化的 GEMM 运算
-/// @param A 输入矩阵 A (M x K)
-/// @param B 输入矩阵 B (K x N)
-/// @param C 输出矩阵 C (M x N)
-/// @param M 矩阵 A 的行数
-/// @param N 矩阵 B 的列数
-/// @param K 矩阵 A 的列数 / 矩阵 B 的行数
-/// @param stream CUDA 流（可选）
-void launch_optimized_gemm(const float* A, const float* B, float* C,
-                           int M, int N, int K, cudaStream_t stream = 0);
-```
+## 代码规则
 
-### README 更新
+- 使用 `.clang-format`。
+- 函数和变量使用 `snake_case`；类使用 `PascalCase`；常量与模板参数使用 `UPPER_SNAKE_CASE`。
+- CUDA API 必须用 `CUDA_CHECK()`，cuBLAS API 必须用 `CUBLAS_CHECK()`。
+- GPU 内存优先使用 `DeviceMemory` 或 `PooledMemory`，避免手写 `cudaMalloc` / `cudaFree` 生命周期。
+- 新源文件必须显式加入 `CMakeLists.txt`。
 
-如果添加新功能，更新 README.md：
-- 功能列表
-- 使用示例
-- API 说明
+## 文档规则
 
----
+- GitHub Pages 内容统一放在 `docs/`。
+- 变更历史只保留根目录 `CHANGELOG.md`。
+- 行为、命令或目录结构变化时同步更新文档。
 
 ## 提交流程
 
-### 审查流程
-
-1. **自动检查**
-   - 编译通过
-   - 测试通过
-   - 代码风格检查
-
-2. **人工审查**
-   - 代码质量
-   - 设计合理性
-   - 文档完整性
-
-3. **性能验证**（如适用）
-   - 不引入性能回归
-   - 新优化有效
-
-### 合并要求
-
-- 所有 CI 检查通过
-- 至少 1 个审查者批准
-- 无未解决的审查意见
+使用短期分支或 worktree，保持 diff 聚焦，并向 `master` 提交 Pull Request。
 
----
+参考入口：
 
-## 联系方式
-
-- **Issues**: [GitHub Issues](https://github.com/LessUp/mini-inference-engine/issues)
-- **Discussions**: [GitHub Discussions](https://github.com/LessUp/mini-inference-engine/discussions)
-
----
-
-## 相关链接
-
-- [English Version](../en/contributing.md)
-- [快速入门](quick-start.md)
+- [快速开始](guides/getting-started.md)
 - [架构设计](architecture.md)
 - [API 参考](api-reference.md)
-
----
-
-*最后更新：2025-04-16 | 文档版本：v1.1.0*
diff --git a/docs/zh/guides/getting-started.md b/docs/zh/guides/getting-started.md
index 5bec962..76d11b8 100644
--- a/docs/zh/guides/getting-started.md
+++ b/docs/zh/guides/getting-started.md
@@ -26,20 +26,22 @@ cd mini-inference-engine
 
 ## 步骤 2: Debug 构建 + 测试
 
+如果当前 shell 中激活了 Conda 或其他自定义 C++ 工具链，优先使用系统 GCC 12 / G++ 12 预设。
+
 ```bash
-# 配置 Debug 构建
-cmake --preset default
+# 使用系统 GCC 12 / G++ 12 配置 Debug 构建
+cmake --preset gcc-cuda
 
 # 编译
-cmake --build --preset default
+cmake --build --preset gcc-cuda
 
 # 运行测试
-ctest --preset default --output-on-failure
+ctest --preset gcc-cuda
 ```
 
 **预期输出：**
 ```
-Test project /path/to/mini-inference-engine/build-default
+Test project /path/to/mini-inference-engine/build-gcc-cuda
     Start 1: test_config
 1/8 Test #1: test_config .....................   Passed    0.01 sec
     Start 2: test_logger
@@ -55,14 +57,14 @@ Test project /path/to/mini-inference-engine/build-default
 ## 步骤 3: Release 构建 + Benchmark
 
 ```bash
-# 配置 Release 构建
-cmake --preset release
+# 使用系统 GCC 12 / G++ 12 配置 Release 构建
+cmake --preset release-gcc-cuda
 
 # 编译
-cmake --build --preset release
+cmake --build --preset release-gcc-cuda
 
 # 运行 benchmark
-./build-release/benchmark
+./build-release-gcc-cuda/benchmark
 ```
 
 **预期输出：**
@@ -114,7 +116,7 @@ export LD_LIBRARY_PATH=$CUDA_PATH/lib64:$LD_LIBRARY_PATH
 
 1. 检查 CUDA 版本是否兼容
 2. 检查 C++ 编译器是否支持 C++17
-3. 检查 CMake 版本是否 >= 3.18
+3. 如果 Conda 已激活，优先使用 `cmake --preset gcc-cuda`
 
 ---
 
diff --git a/docs/zh/index.md b/docs/zh/index.md
index 8b5e06e..9eb408a 100644
--- a/docs/zh/index.md
+++ b/docs/zh/index.md
@@ -82,8 +82,8 @@ features:
     <div class="perf-row">
       <span class="perf-label">L7 Vectorized</span>
       <div class="perf-bar-bg">
-        <div class="perf-bar perf-bar-animated" style="--target-width: 89%; background: var(--perf-vectorized);">
-          <span class="perf-value">89%</span>
+        <div class="perf-bar perf-bar-animated" style="--target-width: 85%; background: var(--perf-vectorized);">
+          <span class="perf-value">85%</span>
         </div>
       </div>
     </div>
@@ -99,8 +99,8 @@ features:
   <div class="quick-start-title">环境要求</div>
   <div class="quick-start-content">
     <ul>
-      <li>CUDA Toolkit 11.0+</li>
-      <li>CMake 3.20+</li>
+      <li>CUDA Toolkit 12.x</li>
+      <li>CMake 3.18+</li>
       <li>C++17 兼容编译器</li>
       <li>NVIDIA GPU (计算能力 7.0+)</li>
     </ul>
diff --git a/docs/zh/performance-tuning.md b/docs/zh/performance-tuning.md
index 2ba3bbf..0dddda2 100644
--- a/docs/zh/performance-tuning.md
+++ b/docs/zh/performance-tuning.md
@@ -494,7 +494,7 @@ __global__ void my_kernel(...) { ... }
 - [GEMM 优化详解](deep-dive/gemm-optimization.md)
 - [架构设计](architecture.md)
 - [API 参考](api-reference.md)
-- [快速入门](quick-start.md)
+- [快速开始](guides/getting-started.md)
 
 ---
 
diff --git a/docs/zh/quick-start.md b/docs/zh/quick-start.md
deleted file mode 100644
index 5d68632..0000000
--- a/docs/zh/quick-start.md
+++ /dev/null
@@ -1,226 +0,0 @@
----
-title: 快速入门
----
-
-# 快速入门指南 - Quick Start Guide
-
-> **Language**: 简体中文 | [English](../en/quick-start)
-
----
-
-## 目录 (Table of Contents)
-
-
-1. TOC
-
----
-
-## 环境要求
-
-### 硬件要求
-
-| 组件 | 最低要求 | 推荐配置 |
-|:---|:---|:---|
-| GPU | NVIDIA GPU，计算能力 7.0+ | RTX 30 系列或更高 |
-| 显存 | 4 GB | 8 GB+ |
-| 系统内存 | 8 GB | 16 GB+ |
-| 操作系统 | Linux / Windows / macOS | Ubuntu 22.04 LTS |
-
-### 软件要求
-
-| 依赖 | 最低版本 | 推荐版本 |
-|:---|:---|:---|
-| CUDA Toolkit | 11.0 | 12.0+ |
-| CMake | 3.18 | 3.25+ |
-| GCC | 9.0 | 11.0+ |
-| Python | 3.8+ | 3.10+ |
-
-### 验证环境
-
-<div class="terminal">
-  <span class="terminal-prompt">$</span> <span class="terminal-command">nvcc --version</span><br>
-  <span class="terminal-prompt">$</span> <span class="terminal-command">nvidia-smi</span><br>
-  <span class="terminal-prompt">$</span> <span class="terminal-command">cmake --version</span><br>
-  <span class="terminal-prompt">$</span> <span class="terminal-command">gcc --version</span>
-</div>
-
----
-
-## 快速开始
-
-### 1. 克隆仓库
-
-<div class="terminal">
-  <span class="terminal-prompt">$</span> <span class="terminal-command">git clone https://github.com/LessUp/mini-inference-engine.git</span><br>
-  <span class="terminal-prompt">$</span> <span class="terminal-command">cd mini-inference-engine</span>
-</div>
-
-### 2. 构建项目（推荐方式）
-
-本项目使用 CMake Presets 简化构建流程：
-
-<div class="terminal">
-  <span class="terminal-comment"># Debug 构建（包含测试）</span><br>
-  <span class="terminal-prompt">$</span> <span class="terminal-command">cmake --preset default && cmake --build --preset default</span><br>
-  <br>
-  <span class="terminal-comment"># 运行测试</span><br>
-  <span class="terminal-prompt">$</span> <span class="terminal-command">ctest --preset default</span><br>
-  <br>
-  <span class="terminal-comment"># Release 构建（性能测试）</span><br>
-  <span class="terminal-prompt">$</span> <span class="terminal-command">cmake --preset release && cmake --build --preset release</span>
-</div>
-
-### 3. 验证安装
-
-<div class="terminal">
-  <span class="terminal-comment"># 运行 benchmark</span><br>
-  <span class="terminal-prompt">$</span> <span class="terminal-command">./build-release/benchmark</span><br>
-  <br>
-  <span class="terminal-comment"># 运行 MNIST 演示</span><br>
-  <span class="terminal-prompt">$</span> <span class="terminal-command">./build-release/mnist_demo</span>
-</div>
-
----
-
-## 第一个程序
-
-### 基础 GEMM 示例
-
-```cpp
-#include "common.h"
-#include "kernels.cuh"
-#include <iostream>
-#include <vector>
-
-int main() {
-    // 设置 GPU 设备
-    CUDA_CHECK(cudaSetDevice(0));
-    
-    // 定义矩阵维度
-    const int M = 1024, N = 1024, K = 1024;
-    
-    // 分配 GPU 内存
-    DeviceMemory d_A(M * K * sizeof(float));
-    DeviceMemory d_B(K * N * sizeof(float));
-    DeviceMemory d_C(M * N * sizeof(float));
-    
-    // 创建并初始化主机数据
-    std::vector<float> h_A(M * K), h_B(K * N);
-    random_init(h_A.data(), h_A.size());
-    random_init(h_B.data(), h_B.size());
-    
-    // 拷贝到 GPU
-    d_A.copy_from_host(h_A.data(), M * K * sizeof(float));
-    d_B.copy_from_host(h_B.data(), K * N * sizeof(float));
-    
-    // 执行优化 GEMM
-    launch_optimized_gemm(d_A.get(), d_B.get(), d_C.get(), M, N, K);
-    
-    // 同步等待完成
-    CUDA_CHECK(cudaDeviceSynchronize());
-    
-    // 获取结果
-    std::vector<float> h_C(M * N);
-    d_C.copy_to_host(h_C.data(), M * N * sizeof(float));
-    
-    std::cout << "✓ GEMM 完成！C[0] = " << h_C[0] << std::endl;
-    
-    return 0;
-}
-```
-
-### 编译和运行
-
-<div class="terminal">
-  <span class="terminal-comment"># 使用 CMake（推荐）</span><br>
-  <span class="terminal-comment"># 或者手动编译：</span><br>
-  <span class="terminal-prompt">$</span> <span class="terminal-command">nvcc -o my_first_gemm my_first_gemm.cu \\</span><br>
-  <span class="terminal-command">    -I../include -L../build -lmini_inference \\</span><br>
-  <span class="terminal-command">    -lcudart -lcublas -std=c++17</span><br>
-  <span class="terminal-prompt">$</span> <span class="terminal-command">./my_first_gemm</span>
-</div>
-
-### 验证正确性
-
-```cpp
-#include "common.h"
-
-// CPU 参考实现
-std::vector<float> h_C_ref(M * N);
-cpu_matmul(h_A.data(), h_B.data(), h_C_ref.data(), M, N, K);
-
-// 比较误差
-float max_error = compare_matrices(h_C.data(), h_C_ref.data(), M * N);
-std::cout << "最大误差: " << max_error << std::endl;
-// 通常应该 < 1e-4
-```
-
----
-
-## 故障排除
-
-### 编译错误 "Unsupported gpu architecture"
-
-::: warning
-**解决方案**：修改 `CMakeLists.txt` 中的 GPU 架构设置：
-
-<div class="terminal">
-  <span class="terminal-comment"># 查看 GPU 架构</span><br>
-  <span class="terminal-prompt">$</span> <span class="terminal-command">nvidia-smi --query-gpu=compute_cap --format=csv</span><br>
-  <br>
-  <span class="terminal-comment"># 设置对应架构</span><br>
-  <span class="terminal-prompt">$</span> <span class="terminal-command">set(CMAKE_CUDA_ARCHITECTURES 86)</span>  <span class="terminal-comment"># RTX 30 系列</span><br>
-  <span class="terminal-prompt">$</span> <span class="terminal-command">set(CMAKE_CUDA_ARCHITECTURES 89)</span>  <span class="terminal-comment"># RTX 40 系列</span>
-</div>
-:::
-
-### 运行时错误 "CUDA out of memory"
-
-::: warning
-**解决方案**：
-
-1. 减小矩阵大小或 batch size
-2. 使用内存池清理缓存：
-```cpp
-MemoryPool::instance().clear_cache();
-```
-:::
-
-### 性能低于预期
-
-::: tip
-**检查清单**：
-
-- GPU 电源状态为 P0：
-  ```bash
-  nvidia-smi -q -d PERFORMANCE | grep "Performance State"
-  ```
-- GPU 没有被其他程序占用
-- 使用 Release 模式构建
-- 矩阵大小是 2 的倍数（对齐）
-:::
-
----
-
-## 下一步
-
-恭喜！你已完成快速入门。接下来可以：
-
-1. 📖 阅读 [架构设计](./architecture) 了解系统原理
-2. ⚡ 学习 [GEMM 优化详解](./deep-dive/gemm-optimization) 掌握优化技术
-3. 🔧 查看 [API 参考](./api-reference) 了解完整接口
-4. 📊 阅读 [性能调优指南](./performance-tuning) 进行深度优化
-
----
-
-## 相关链接
-
-- [English Version](../en/quick-start)
-- [API 参考](./api-reference)
-- [架构设计](./architecture)
-- [GEMM 优化详解](./deep-dive/gemm-optimization)
-- [GitHub Issues](https://github.com/LessUp/mini-inference-engine/issues)
-
----
-
-*最后更新：2025-04-16 | 文档版本：v1.1.0*
diff --git a/include/common.h b/include/common.h
index 6667155..fe37d1b 100644
--- a/include/common.h
+++ b/include/common.h
@@ -204,8 +204,15 @@ class DeviceMemory {
         }
     }
 
-    float* get() { return ptr_; }
-    const float* get() const { return ptr_; }
+    template <typename T = float>
+    T* get() {
+        return reinterpret_cast<T*>(ptr_);
+    }
+
+    template <typename T = float>
+    const T* get() const {
+        return reinterpret_cast<const T*>(ptr_);
+    }
     size_t size() const { return size_; }
     bool empty() const { return ptr_ == nullptr; }
 
diff --git a/include/tensor_core_gemm.cuh b/include/tensor_core_gemm.cuh
index 0088870..f03dc60 100644
--- a/include/tensor_core_gemm.cuh
+++ b/include/tensor_core_gemm.cuh
@@ -130,14 +130,7 @@ __global__ void tensor_core_gemm(const half* __restrict__ A,
         __syncthreads();
     }
 
-    // Store result to global memory
-    // Convert FP32 accumulator to FP16 output
-    fragment<matrix_c, WMMA_M, WMMA_N, WMMA_K, half, row_major> c_frag;
-    store_matrix_sync(&C_shared[0][0], acc_frag, WMMA_N, mem_row_major);
-    // Note: We need to handle shared memory properly here
-    // For simplicity, store directly with an offset calculation
-
-    // Store output tile
+    // Store output tile directly to global memory.
     for (int i = 0; i < acc_frag.num_elements; i += 32) {
         int elem_idx = lane_idx + i;
         if (elem_idx < WMMA_M * WMMA_N) {
diff --git a/index.md b/index.md
deleted file mode 100644
index fbf8f7c..0000000
--- a/index.md
+++ /dev/null
@@ -1,87 +0,0 @@
----
-layout: default
-title: Home
-nav_order: 1
-permalink: /
----
-
-# Mini-Inference Engine
-
-{: .fs-6 .fw-300 }
-一条从朴素 CUDA 矩阵乘法到接近 cuBLAS 的 GEMM 优化路线，也是一套可运行、可测试、可讲解的迷你推理引擎工程样板。
-
-<div class="cta-section">
-  <a href="./zh/QUICK_START" class="btn-primary">从中文快速开始</a>
-  <a href="./en/QUICK_START" class="btn-secondary">Start in English</a>
-  <a href="https://github.com/LessUp/mini-inference-engine" class="btn-secondary">View on GitHub</a>
-</div>
-
----
-
-## 为什么这个项目值得看
-
-<div class="features-grid">
-  <div class="feature-card">
-    <h3>7 级 GEMM 优化路径</h3>
-    <p>Naive → Tiled → Coalesced → Double Buffer → Register Blocked → Fused → Vectorized，每一级都有明确性能动机。</p>
-  </div>
-  <div class="feature-card">
-    <h3>真实 CUDA 工程骨架</h3>
-    <p>C++17、CUDA、cuBLAS、CMake、Google Test、Jekyll 文档站完整串联，不是孤立 kernel 片段。</p>
-  </div>
-  <div class="feature-card">
-    <h3>推理引擎核心组件</h3>
-    <p>Tensor、InferenceEngine、MemoryPool、StreamManager、AutoTuner、Profiler 覆盖轻量推理运行时关键面。</p>
-  </div>
-  <div class="feature-card">
-    <h3>规格驱动维护</h3>
-    <p>OpenSpec 记录产品需求、架构 RFC、API、数据与测试约束，让代码、文档和 AI 协作有同一事实源。</p>
-  </div>
-</div>
-
----
-
-## 你可以按目标进入
-
-| 目标 | 推荐入口 | 你会得到 |
-| --- | --- | --- |
-| 快速跑通项目 | [中文快速入门](./zh/QUICK_START) / [Quick Start](./en/QUICK_START) | CUDA/CMake 环境检查、Debug/Release 构建、测试与 benchmark 命令 |
-| 学 CUDA GEMM 优化 | [GEMM 优化详解](./zh/GEMM_OPTIMIZATION) / [GEMM Optimization](./en/GEMM_OPTIMIZATION) | 每一级 kernel 的瓶颈、实现方式、性能取舍 |
-| 理解工程架构 | [架构设计](./zh/ARCHITECTURE) / [Architecture](./en/ARCHITECTURE) | 四层架构、模块边界、数据流、运行时组件 |
-| 查 API | [API 参考](./zh/API_REFERENCE) / [API Reference](./en/API_REFERENCE) | Tensor、MemoryPool、InferenceEngine、GEMM launcher 用法 |
-| 调性能 | [性能调优](./zh/PERFORMANCE_TUNING) / [Performance Tuning](./en/PERFORMANCE_TUNING) | profiling、kernel 选择、矩阵规模和硬件相关建议 |
-
----
-
-## 优化路线一屏看懂
-
-| Level | Kernel | 核心技术 | 典型收益 |
-| :---: | --- | --- | --- |
-| 1 | Naive | 每线程计算一个输出元素 | 建立可验证 baseline |
-| 2 | Tiled | 共享内存分块 | 降低全局内存重复读取 |
-| 3 | Coalesced | 合并访存与 bank conflict 控制 | 提高内存吞吐 |
-| 4 | Double Buffer | 预取与计算重叠 | 隐藏内存延迟 |
-| 5 | Register Blocked | 每线程计算小 tile | 提升算术强度 |
-| 6 | Fused | GEMM + Bias + ReLU | 减少 kernel launch 与中间写回 |
-| 7 | Vectorized | `float4` 向量化加载 | 进一步逼近 cuBLAS 吞吐 |
-
-> 性能数字依赖 GPU、矩阵规模、驱动与编译选项。本项目文档统一采用保守口径：在 RTX 3080、1024×1024 矩阵场景下，最高优化 kernel 可达到约 85% cuBLAS 级别，用于教学对比而非跨硬件承诺。
-
----
-
-## 构建快照
-
-```bash
-git clone https://github.com/LessUp/mini-inference-engine.git
-cd mini-inference-engine
-
-cmake --preset default
-cmake --build --preset default
-ctest --preset default --output-on-failure
-
-cmake --preset release
-cmake --build --preset release
-./build-release/benchmark
-```
-
-测试和 benchmark 需要 CUDA Toolkit 与可用 NVIDIA GPU；无 GPU 的 CI 环境只做配置、编译、格式和文档验证。
diff --git a/openspec/archive/.gitkeep b/openspec/archive/.gitkeep
deleted file mode 100644
index e69de29..0000000
diff --git a/openspec/changes/.gitkeep b/openspec/changes/.gitkeep
deleted file mode 100644
index e69de29..0000000
diff --git a/openspec/config.yaml b/openspec/config.yaml
deleted file mode 100644
index c96985d..0000000
--- a/openspec/config.yaml
+++ /dev/null
@@ -1,40 +0,0 @@
-# OpenSpec Configuration
-# Mini-Inference Engine Project
-
-project:
-  name: mini-inference-engine
-  description: CUDA-based neural network inference engine with GEMM optimization
-
-# Workflow schema
-schema: spec-driven
-
-# Spec directories
-specs:
-  product:
-    path: specs/product
-    description: Product feature requirements and acceptance criteria
-  architecture:
-    path: specs/architecture
-    description: Technical design documents (RFCs)
-  api:
-    path: specs/api
-    description: API interface definitions
-  data:
-    path: specs/data
-    description: Data schemas and model definitions
-  testing:
-    path: specs/testing
-    description: BDD test specifications
-
-# Change management
-changes:
-  path: changes
-  archive_path: archive
-  id_format: "CHG-YYYY-MM-DD-NNN"
-
-# Artifact naming conventions
-artifacts:
-  proposal: proposal.md
-  design: design.md
-  tasks: tasks.md
-  spec: spec.md
diff --git a/openspec/specs/api/README.md b/openspec/specs/api/README.md
deleted file mode 100644
index ea32348..0000000
--- a/openspec/specs/api/README.md
+++ /dev/null
@@ -1,20 +0,0 @@
----
-openspec:
-  type: api
-  status: active
-  created: 2024-01-01
-  tags: [api]
----
-
-# API Specifications
-
-API 接口定义。完整文档见 [`engine-api.yaml`](engine-api.yaml)。
-
-## 文档链接
-
-- [API Reference (English)](../../docs/en/API_REFERENCE.md)
-- [API 参考 (中文)](../../docs/zh/API_REFERENCE.md)
-
----
-
-See [AGENTS.md](../../AGENTS.md) for development workflow.
diff --git a/openspec/specs/api/engine-api.yaml b/openspec/specs/api/engine-api.yaml
deleted file mode 100644
index 7147ab6..0000000
--- a/openspec/specs/api/engine-api.yaml
+++ /dev/null
@@ -1,163 +0,0 @@
----
-openspec:
-  type: api
-  status: active
-  created: 2024-01-01
-  tags: [api]
----
-
-# Mini-Inference Engine API Specification
-# Version: 1.1.0
-# Format: YAML-based API Contract
-
-openapi: "3.0.0"
-info:
-  title: Mini-Inference Engine C++ API
-  description: Core API for CUDA-accelerated neural network inference
-  version: "1.1.0"
-  contact:
-    name: Mini-Inference Engine Team
-    url: https://github.com/LessUp/mini-inference-engine
-
-paths:
-  /tensor:
-    post:
-      summary: Create tensor
-      description: Allocate device memory for a new tensor
-      operationId: Tensor::create
-      requestBody:
-        required: true
-        content:
-          application/json:
-            schema:
-              type: object
-              properties:
-                shape:
-                  type: array
-                  items: { type: integer }
-                  description: Tensor dimensions (max 4D)
-                dtype:
-                  type: string
-                  enum: [float32, float16, int8]
-                  default: float32
-      responses:
-        '200':
-          description: Tensor created successfully
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/TensorHandle'
-        '400':
-          description: Invalid shape or dtype
-        '500':
-          description: CUDA memory allocation failed
-
-  /tensor/{handle}/data:
-    get:
-      summary: Get tensor data pointer
-      operationId: Tensor::data
-      parameters:
-        - name: handle
-          in: path
-          required: true
-          schema: { type: string }
-      responses:
-        '200':
-          description: Device pointer to tensor data
-          content:
-            application/json:
-              schema:
-                type: object
-                properties:
-                  device_ptr: { type: string, description: "CUDA device memory address" }
-                  bytes: { type: integer, description: "Total bytes allocated" }
-
-  /inference/load:
-    post:
-      summary: Load model weights
-      operationId: InferenceEngine::load_weights
-      requestBody:
-        required: true
-        content:
-          application/json:
-            schema:
-              type: object
-              properties:
-                weights_path:
-                  type: string
-                  description: Path to .weights binary file
-      responses:
-        '200':
-          description: Weights loaded successfully
-        '404':
-          description: File not found
-        '500':
-          description: CUDA error or invalid format
-
-  /inference/forward:
-    post:
-      summary: Run inference forward pass
-      operationId: InferenceEngine::forward
-      requestBody:
-        required: true
-        content:
-          application/json:
-            schema:
-              type: object
-              properties:
-                input_ptr:
-                  type: string
-                  description: Device pointer to input tensor
-                batch_size:
-                  type: integer
-                  minimum: 1
-                  maximum: 512
-      responses:
-        '200':
-          description: Forward pass completed
-          content:
-            application/json:
-              schema:
-                type: object
-                properties:
-                  output_ptr: { type: string }
-                  execution_time_ms: { type: number }
-        '500':
-          description: CUDA kernel launch failed
-
-components:
-  schemas:
-    TensorHandle:
-      type: object
-      required: [handle, bytes]
-      properties:
-        handle:
-          type: string
-          description: Opaque tensor identifier
-        bytes:
-          type: integer
-          description: Total allocated bytes on device
-        shape:
-          type: array
-          items: { type: integer }
-          description: Tensor dimensions
-
-    Error:
-      type: object
-      properties:
-        code:
-          type: integer
-          enum: [0, 1, 2, 3, 4, 5]
-          description: |
-            0=SUCCESS, 1=INVALID_ARG, 2=OUT_OF_MEMORY,
-            3=CUDA_ERROR, 4=NOT_INITIALIZED, 5=FILE_NOT_FOUND
-        message:
-          type: string
-
-  errorResponses:
-    default:
-      description: Error response
-      content:
-        application/json:
-          schema:
-            $ref: '#/components/schemas/Error'
diff --git a/openspec/specs/architecture/0001-core-architecture.md b/openspec/specs/architecture/0001-core-architecture.md
deleted file mode 100644
index 152aa41..0000000
--- a/openspec/specs/architecture/0001-core-architecture.md
+++ /dev/null
@@ -1,301 +0,0 @@
----
-openspec:
-  type: architecture
-  status: active
-  created: 2024-01-01
-  tags: [architecture, rfc]
-  dependencies: []
-  affects: []
----
-
-# RFC: Mini-Inference Engine Architecture Design
-
-## Status
-
-**Status:** Accepted  
-**Created:** 2024  
-**Last Updated:** 2024  
-
-## Overview
-
-Mini-Inference Engine is a lightweight neural network inference engine focused on GEMM (General Matrix Multiply) optimization. Through progressive optimization strategies, the best optimized kernel reaches about 85% of cuBLAS-class throughput on the documented RTX 3080 1024×1024 reference benchmark.
-
-### Tech Stack
-
-| Component | Version |
-|-----------|---------|
-| CUDA C++ | Compute Capability 7.0+ |
-| CMake | 3.18+ |
-| CUDA Toolkit | 11.0+ |
-| cuBLAS | Used for performance comparison |
-
-## Architecture
-
-### System Architecture
-
-```
-┌─────────────────────────────────────────────────────────────────┐
-│                      Application Layer                          │
-│   Benchmark  │  MNIST Demo  │  Tests  │  User Application      │
-└─────────────────────────────────────────────────────────────────┘
-                              │
-┌─────────────────────────────────────────────────────────────────┐
-│                       Engine Layer                              │
-│   InferenceEngine  │  Tensor  │  AutoTuner  │  Profiler         │
-└─────────────────────────────────────────────────────────────────┘
-                              │
-┌─────────────────────────────────────────────────────────────────┐
-│                       Kernel Layer                              │
-│  Naive │ Tiled │ Coalesced │ DoubleBuffer │ Optimized │ Fused  │
-│  Vectorized │ Half-Precision │ Batched │ cuBLAS wrapper        │
-└─────────────────────────────────────────────────────────────────┘
-                              │
-┌─────────────────────────────────────────────────────────────────┐
-│                   Infrastructure Layer                          │
-│  MemoryPool │ StreamManager │ Logger │ Config │ Quantization   │
-└─────────────────────────────────────────────────────────────────┘
-```
-
-### Memory Hierarchy Optimization Strategy
-
-| Optimization Technique | Target Memory | Bandwidth | Optimization Effect |
-|------------------------|---------------|-----------|---------------------|
-| Tiling | Shared Memory | ~10 TB/s | Reduces 32x global memory accesses |
-| Coalescing | Global Memory | ~500 GB/s | Improves bandwidth utilization |
-| Double Buffer | Shared Memory | ~10 TB/s | Hides memory latency |
-| Register Blocking | Registers | ~100 TB/s | Maximizes compute density |
-
-## Components and Interfaces
-
-### Core Data Structures
-
-```cpp
-// Matrix descriptor
-struct MatrixDesc {
-    float* data;        // Device pointer
-    int rows;           // Row count M
-    int cols;           // Column count N
-    int ld;             // Leading dimension
-    bool is_transposed; // Whether transposed
-};
-
-// GEMM configuration
-struct GemmConfig {
-    int BLOCK_M;        // Tile row size
-    int BLOCK_N;        // Tile column size
-    int BLOCK_K;        // K dimension block size
-    int WARP_M;         // Warp-level M blocking
-    int WARP_N;         // Warp-level N blocking
-    bool use_double_buffer;
-    bool use_vectorized_load;
-};
-
-// Fusion operation configuration
-struct FusionConfig {
-    bool add_bias;
-    bool apply_relu;
-    float* bias;
-};
-
-// Performance statistics
-struct PerfStats {
-    float kernel_time_ms;
-    float gflops;
-    float memory_bandwidth_gb;
-    float cublas_ratio;
-};
-```
-
-### Kernel Interfaces
-
-```cpp
-// Naive MatMul: Each thread computes one output element
-__global__ void naive_matmul(
-    const float* A, const float* B, float* C,
-    int M, int N, int K
-);
-
-// Tiled GEMM: Uses shared memory tiling
-__global__ void tiled_gemm(
-    const float* A, const float* B, float* C,
-    int M, int N, int K
-);
-
-// Optimized GEMM: Full optimization version
-template<int BM, int BN, int BK, int TM, int TN>
-__global__ void optimized_gemm(
-    const float* A, const float* B, float* C,
-    int M, int N, int K
-);
-
-// Fused Kernel: MatMul + Bias + ReLU
-template<int BM, int BN, int BK, int TM, int TN,
-         bool ADD_BIAS, bool APPLY_RELU>
-__global__ void fused_gemm_bias_relu(
-    const float* A, const float* B, float* C,
-    const float* bias, int M, int N, int K
-);
-```
-
-### Engine Interface
-
-```cpp
-class InferenceEngine {
-public:
-    void init(int device_id = 0);
-    bool load_weights(const std::string& path);
-    void forward(const float* input, float* output, int batch_size);
-    void forward_with_timing(const float* input, float* output,
-                             int batch_size, std::vector<float>& times);
-    void cleanup();
-
-    size_t num_layers() const;
-    int input_dim() const;
-    int output_dim() const;
-
-private:
-    std::vector<LayerWeights> layers_;
-    cublasHandle_t cublas_handle_;
-    cudaStream_t stream_;
-};
-```
-
-## GEMM Optimization Details
-
-### Optimization Levels
-
-| Level | Technique | Performance (vs cuBLAS) | Key Optimization |
-|-------|-----------|-------------------------|------------------|
-| 1 | Naive | ~10% | Baseline implementation |
-| 2 | Tiled | ~20% | Shared memory tiling |
-| 3 | Coalesced | ~30% | Memory access coalescing |
-| 4 | Double Buffer | ~40% | Double buffering to hide latency |
-| 5 | Register Blocked | ~70% | Register blocking |
-| 6 | Fused | ~80% | Operator fusion |
-| 7 | Vectorized | ~85% | Vectorized loads |
-
-### Parameter Constraints
-
-```
-Constraints:
-─────────────────────────────────────────────────────────────
-1. Thread count: (BM / TM) × (BN / TN) ≤ 1024
-
-2. Shared memory: (BM × BK + BK × BN) × 4 ≤ 48KB
-
-3. Registers: TM × TN + TM + TN + overhead ≤ 255
-
-Recommended Configurations:
-─────────────────────────────────────────────────────────────
-Config        BM   BN   BK   TM   TN   Threads   Shared
-─────────────────────────────────────────────────────────────
-Small         64   64    8    4    4     256     4KB
-Medium       128  128    8    8    8     256     8KB
-Large        128  256   16    8    8     512    24KB
-─────────────────────────────────────────────────────────────
-```
-
-## Data Models
-
-### Weight File Format
-
-```
-+------------------+
-| Header (32 bytes)|
-|  - magic: 4B     |  (0x4D494E49 = "MINI")
-|  - version: 4B   |
-|  - num_layers: 4B|
-|  - reserved: 20B |
-+------------------+
-| Layer Meta       |
-|  - type: 4B      |
-|  - in_features   |
-|  - out_features  |
-|  - has_bias      |
-+------------------+
-| Layer Weights    |
-|  - W: float[]    |
-|  - bias: float[] |
-+------------------+
-```
-
-### Network Architecture (MNIST)
-
-```
-Input: 784 (28x28)
-    ↓
-Linear(784, 256) + ReLU
-    ↓
-Linear(256, 128) + ReLU
-    ↓
-Linear(128, 10)
-    ↓
-Output: 10 (logits)
-```
-
-## Error Handling
-
-### CUDA Error Handling
-
-```cpp
-#define CUDA_CHECK(call) do { \
-    cudaError_t err = call; \
-    if (err != cudaSuccess) { \
-        throw CudaException(err, __FILE__, __LINE__); \
-    } \
-} while(0)
-
-class CudaException : public std::exception {
-public:
-    CudaException(cudaError_t err, const char* file, int line);
-    const char* what() const noexcept override;
-    cudaError_t error() const;
-};
-```
-
-### RAII Resource Management
-
-```cpp
-class DeviceMemory {
-public:
-    explicit DeviceMemory(size_t bytes);
-    ~DeviceMemory();
-
-    DeviceMemory(const DeviceMemory&) = delete;
-    DeviceMemory& operator=(const DeviceMemory&) = delete;
-    DeviceMemory(DeviceMemory&&) noexcept;
-
-    float* get();
-    size_t size() const;
-};
-```
-
-## Testing Strategy
-
-### Test Framework
-
-| Type | Tool |
-|------|------|
-| Unit Testing | Google Test |
-| Property Testing | Custom random matrix generator |
-| Performance Testing | Custom benchmark framework |
-
-### Test Coverage
-
-| Test File | Coverage |
-|-----------|----------|
-| test_gemm.cpp | All GEMM kernels |
-| test_tensor.cpp | Tensor operations |
-| test_inference.cpp | InferenceEngine |
-| test_memory_pool.cpp | MemoryPool |
-| test_stream_manager.cpp | StreamManager |
-| test_config.cpp | Config |
-| test_logger.cpp | Logger |
-| test_quantization.cpp | INT8 quantization |
-| test_fusion.cpp | Fusion kernels |
-
-## References
-
-- [CUDA C++ Programming Guide](https://docs.nvidia.com/cuda/cuda-c-programming-guide/)
-- [CUDA Best Practices Guide](https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/)
-- [CUTLASS](https://github.com/NVIDIA/cutlass)
diff --git a/openspec/specs/architecture/0002-memory-pool.md b/openspec/specs/architecture/0002-memory-pool.md
deleted file mode 100644
index 45a0656..0000000
--- a/openspec/specs/architecture/0002-memory-pool.md
+++ /dev/null
@@ -1,149 +0,0 @@
----
-openspec:
-  type: architecture
-  status: active
-  created: 2024-01-01
-  tags: [architecture, rfc]
-  dependencies: []
-  affects: []
----
-
-# RFC 0002: Memory Pool Subsystem
-
-## Status
-
-**Status:** Accepted
-**Created:** 2024
-**Last Updated:** 2024
-**Supersedes:** None
-
-## Overview
-
-Design a memory pool subsystem for the Mini-Inference Engine to reduce CUDA memory allocation overhead, prevent memory fragmentation, and provide deterministic memory management for inference workloads.
-
-## Motivation
-
-Frequent `cudaMalloc`/`cudaFree` calls cause:
-1. High overhead (~10-50us per allocation)
-2. Memory fragmentation during variable tensor lifetimes
-3. Inability to track memory usage statistics
-4. No protection against out-of-memory conditions
-
-## Design
-
-### Architecture
-
-```
-MemoryPool
-├── Capacity: Fixed size (default 256MB)
-├── Free List: Sorted by address for coalescing
-├── Allocation Tracking: Used/Free statistics
-└── Block Header: Size, is_free, prev/next pointers
-```
-
-### Memory Block Layout
-
-```
-+---------------------------------------------------------------+
-| Block Header (16 bytes)                                       |
-+-------------------+-------------------+-----------------------+
-| size (8 bytes)    | is_free (1 byte)  | reserved (7 bytes)    |
-+-------------------+-------------------+-----------------------+
-| User Data (size bytes, 256-byte aligned)                      |
-+---------------------------------------------------------------+
-```
-
-### API Design
-
-```cpp
-class MemoryPool {
-public:
-    explicit MemoryPool(size_t capacity_bytes);
-    ~MemoryPool();
-
-    // Non-copyable, movable
-    MemoryPool(const MemoryPool&) = delete;
-    MemoryPool& operator=(const MemoryPool&) = delete;
-    MemoryPool(MemoryPool&&) noexcept;
-    MemoryPool& operator=(MemoryPool&&) noexcept;
-
-    // Allocation
-    float* allocate(size_t bytes);
-
-    // Deallocation
-    void deallocate(float* ptr);
-
-    // Statistics
-    size_t capacity() const;
-    size_t used() const;
-    size_t free_space() const;
-    float utilization() const;
-    size_t fragmentation_ratio() const;
-
-    // Diagnostics
-    void print_memory_map() const;
-    void validate_integrity() const;
-
-private:
-    struct Block;
-    Block* find_best_fit(size_t bytes);
-    void coalesce_adjacent_blocks();
-    void split_block(Block* block, size_t bytes);
-
-    void* pool_ptr_;       // CUDA memory pool base
-    size_t capacity_;
-    size_t used_;
-    std::vector<Block> blocks_;  // Free list (sorted by address)
-};
-```
-
-### Allocation Algorithm
-
-1. **Best-fit**: Find smallest free block that satisfies request
-2. **Splitting**: If block is significantly larger, split into allocated + remaining free
-3. **Coalescing**: Merge adjacent free blocks on deallocation
-4. **Alignment**: All allocations aligned to 256 bytes for CUDA coalescing
-
-### Error Handling
-
-| Condition | Behavior |
-|-----------|----------|
-| Request > capacity | Returns nullptr |
-| No contiguous block large enough | Returns nullptr |
-| Double free | Throws `std::runtime_error` |
-| Invalid pointer | Throws `std::invalid_argument` |
-
-## Memory Budget
-
-| Scenario | Pool Size | Notes |
-|----------|-----------|-------|
-| MNIST inference (batch=1) | 4 MB | Minimal workload |
-| MNIST inference (batch=256) | 128 MB | Production workload |
-| Large models | 512 MB+ | Configurable |
-
-## Testing Strategy
-
-1. **Allocation correctness**: Sequential, interleaved, random sizes
-2. **Coalescing**: Verify adjacent free blocks merge
-3. **Fragmentation**: Measure fragmentation ratio under various patterns
-4. **Edge cases**: Zero-size, max-size, boundary alignment
-5. **Performance**: Allocation/deallocation throughput vs raw `cudaMalloc`
-
-## Implementation Files
-
-- `include/memory_pool.h` - Header with PooledMemory class
-- `src/memory_pool.cu` - Implementation (inline in header for templates)
-- `tests/test_memory_pool.cpp` - Unit tests
-
-## Alternatives Considered
-
-| Alternative | Pros | Cons | Decision |
-|-------------|------|------|----------|
-| cudaMallocAsync | Built-in, fast | CUDA 11.2+ only, less control | Rejected (compatibility) |
-| bump allocator | Simplest, fastest | No deallocation support | Rejected (need reuse) |
-| slab allocator | Good for fixed sizes | Complex for variable tensors | Partial (used for small allocs) |
-
-## References
-
-- [CUDA Memory Management Best Practices](https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#memory-management)
-- [NVIDIA CUDA Memory Pool](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#memory-pool)
diff --git a/openspec/specs/architecture/0003-quantization.md b/openspec/specs/architecture/0003-quantization.md
deleted file mode 100644
index 0fb7f00..0000000
--- a/openspec/specs/architecture/0003-quantization.md
+++ /dev/null
@@ -1,156 +0,0 @@
----
-openspec:
-  type: architecture
-  status: active
-  created: 2024-01-01
-  tags: [architecture, rfc]
-  dependencies: []
-  affects: []
----
-
-# RFC 0003: Quantization System (INT8/FP16)
-
-## Status
-
-**Status:** Accepted
-**Created:** 2024
-**Last Updated:** 2024
-
-## Overview
-
-Design a quantization subsystem to support INT8 and FP16 inference, reducing memory bandwidth requirements and increasing throughput on modern GPUs with Tensor Core support.
-
-## Motivation
-
-1. **Memory bandwidth reduction**: INT8 uses 4x less memory than FP32
-2. **Throughput improvement**: Tensor Cores deliver 4x TOPS for INT8
-3. **Power efficiency**: Lower precision = less energy per operation
-4. **Edge deployment**: Enables inference on constrained devices
-
-## Design
-
-### Quantization Pipeline
-
-```
-FP32 Weights → Calibration → Scale Computation → Quantized Storage
-     ↓                                              ↓
-  Dequantize (runtime) ← FP16/INT8 Compute ← Dequantize (runtime)
-```
-
-### INT8 Quantization
-
-#### Symmetric Quantization
-
-```
-scale = max(|tensor|) / 127.0
-quantized = round(fp32_value / scale)
-dequantized = quantized * scale
-```
-
-**Constraints:**
-- Range: [-128, 127]
-- Zero point: always 0 (symmetric)
-- Per-tensor scale (not per-channel, for simplicity)
-
-#### FP16 Quantization
-
-```
-fp16_value = __float2half(fp32_value)
-fp32_value = __half2float(fp16_value)
-```
-
-**Constraints:**
-- Range: [6.10e-5, 65504]
-- Precision: ~3.3 decimal digits
-
-### API Design
-
-```cpp
-// Quantization types
-enum class QuantType {
-    NONE,    // FP32 passthrough
-    INT8,    // 8-bit symmetric quantization
-    FP16     // Half-precision
-};
-
-// Quantization parameters
-struct QuantParams {
-    QuantType type;
-    float scale;         // For INT8: scale factor
-    int8_t zero_point;   // For INT8: zero point (currently 0)
-};
-
-// Core functions
-class Quantizer {
-public:
-    // Quantize FP32 tensor to INT8/FP16
-    static void quantize(const float* src, void* dst,
-                         QuantType type, QuantParams& params,
-                         size_t elements);
-
-    // Dequantize INT8/FP16 back to FP32
-    static void dequantize(const void* src, float* dst,
-                           QuantType type, const QuantParams& params,
-                           size_t elements);
-
-    // Calibration: compute scale from FP32 tensor
-    static float compute_scale(const float* tensor, size_t elements);
-
-    // Quantized GEMM wrapper
-    static void quantized_gemm(const void* A, const void* B, float* C,
-                               QuantType type,
-                               int M, int N, int K,
-                               const QuantParams& a_params,
-                               const QuantParams& b_params,
-                               cudaStream_t stream);
-};
-```
-
-### Kernel Design
-
-#### INT8 GEMM with FP32 Accumulation
-
-```
-INT8 A × INT8 B → INT32 accumulation → FP32 C (dequantized)
-```
-
-#### FP16 GEMM with FP32 Accumulation
-
-```
-FP16 A × FP16 B → FP32 accumulation (using WMMA or half2) → FP32 C
-```
-
-### Accuracy Analysis
-
-| Model | FP32 Top-1 | INT8 Top-1 | FP16 Top-1 | INT8 Loss | FP16 Loss |
-|-------|-----------|-----------|-----------|-----------|-----------|
-| MNIST-MLP | 98.2% | 98.1% | 98.2% | -0.1% | 0.0% |
-| LeNet-5 | 99.1% | 98.9% | 99.1% | -0.2% | 0.0% |
-
-### Error Budget
-
-| Quantization | Max Error | Mean Error | Notes |
-|--------------|-----------|------------|-------|
-| INT8 (symmetric) | 0.5 × scale | 0.25 × scale | Bounded by step size |
-| FP16 | 2^-11 ≈ 4.9e-4 | 2^-12 ≈ 2.4e-4 | Bounded by mantissa |
-
-## Testing Strategy
-
-1. **Quantization accuracy**: Verify scale computation
-2. **Round-trip fidelity**: FP32 → INT8 → FP32 error within budget
-3. **GEMM correctness**: Quantized GEMM matches FP32 within tolerance
-4. **Performance**: Throughput comparison vs FP32 GEMM
-5. **Edge cases**: All-zeros tensor, extreme values, NaN handling
-
-## Implementation Files
-
-- `include/quantization.h` - Quantizer class and types
-- `src/quantization.cu` - Kernel implementations
-- `tests/test_quantization.cpp` - Quantization tests
-
-## Future Work
-
-- Per-channel quantization
-- Asymmetric quantization (with zero point)
-- Dynamic quantization (runtime calibration)
-- INT4 support for extreme compression
diff --git a/openspec/specs/architecture/0004-stream-manager.md b/openspec/specs/architecture/0004-stream-manager.md
deleted file mode 100644
index 234e493..0000000
--- a/openspec/specs/architecture/0004-stream-manager.md
+++ /dev/null
@@ -1,145 +0,0 @@
----
-openspec:
-  type: architecture
-  status: active
-  created: 2024-01-01
-  tags: [architecture, rfc]
-  dependencies: []
-  affects: []
----
-
-# RFC 0004: Stream Manager and Concurrency System
-
-## Status
-
-**Status:** Accepted
-**Created:** 2024
-**Last Updated:** 2024
-
-## Overview
-
-Design a stream management system for concurrent CUDA stream operations, enabling overlapping data transfers with computation and supporting multiple concurrent inference requests.
-
-## Motivation
-
-1. **Overlap transfer and compute**: While one batch computes, transfer next batch
-2. **Multi-request serving**: Handle multiple inference requests concurrently
-3. **Resource isolation**: Separate streams for different operations
-4. **Synchronization control**: Fine-grained control over execution order
-
-## Design
-
-### Stream Hierarchy
-
-```
-StreamManager
-├── compute_stream: Default GEMM execution
-├── copy_stream_0: H2D transfers (stream 1)
-├── copy_stream_1: D2H transfers (stream 2)
-└── extra_streams: User-created streams (up to 16)
-```
-
-### API Design
-
-```cpp
-enum class StreamPriority {
-    LOW,
-    NORMAL,
-    HIGH
-};
-
-class StreamManager {
-public:
-    explicit StreamManager(int max_streams = 8);
-    ~StreamManager();
-
-    // Stream creation
-    cudaStream_t create_stream(StreamPriority priority = StreamPriority::NORMAL);
-
-    // Stream access
-    cudaStream_t get_compute_stream() const;
-    cudaStream_t get_copy_stream(int index = 0) const;
-
-    // Synchronization
-    void sync_stream(cudaStream_t stream);
-    void sync_all_streams();
-
-    // Stream ordering
-    void insert_event(cudaStream_t stream, cudaEvent_t event);
-    void wait_event(cudaStream_t stream, cudaEvent_t event);
-
-    // Statistics
-    int active_streams() const;
-    size_t total_streams_created() const;
-
-    // Cleanup
-    void destroy_stream(cudaStream_t stream);
-
-private:
-    std::vector<cudaStream_t> streams_;
-    int max_streams_;
-    int next_stream_id_;
-};
-```
-
-### Concurrent Execution Pattern
-
-```
-Stream 0 (compute):  [GEMM batch 1] ---- [GEMM batch 2] ----
-Stream 1 (H2D):      [Copy batch 2] ---- [Copy batch 3] ----
-Stream 2 (D2H):      ---------------- [Results batch 1] ----
-
-Timeline →
-```
-
-### Stream Priority Mapping
-
-| Priority | CUDA Priority | Use Case |
-|----------|--------------|----------|
-| HIGH | cudaStreamHighPriority | Compute kernels |
-| NORMAL | cudaStreamDefaultPriority | Data transfers |
-| LOW | cudaStreamLowPriority | Logging, profiling |
-
-### Synchronization Guarantees
-
-| Operation | Blocking | Notes |
-|-----------|----------|-------|
-| `sync_stream()` | Yes | Waits for all work on stream |
-| `sync_all_streams()` | Yes | Waits for all managed streams |
-| `insert_event()` | No | Non-blocking event record |
-| `wait_event()` | Conditional | Blocks until event reached |
-
-### Error Handling
-
-| Condition | Behavior |
-|-----------|----------|
-| Max streams exceeded | Throws `std::runtime_error` |
-| Invalid stream handle | Throws `std::invalid_argument` |
-| CUDA error in stream creation | Throws `CudaException` |
-
-## Performance Targets
-
-| Scenario | Target | Metric |
-|----------|--------|--------|
-| Single stream | 1.0x | Baseline |
-| 2 streams (overlap) | 1.3-1.5x | Compute + transfer overlap |
-| 4 streams (batching) | 1.8-2.0x | Throughput vs single stream |
-
-## Testing Strategy
-
-1. **Concurrent execution**: Verify overlap actually occurs
-2. **Synchronization correctness**: No race conditions
-3. **Resource cleanup**: No leaked streams or events
-4. **Performance measurement**: Actual speedup from concurrency
-5. **Stress test**: Max streams, rapid create/destroy cycles
-
-## Implementation Files
-
-- `include/stream_manager.h` - StreamManager class
-- `src/stream_manager.cu` - Implementation
-- `tests/test_stream_manager.cpp` - Unit tests
-
-## References
-
-- [CUDA Streams Documentation](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#streams)
-- [CUDA Concurrency Best Practices](https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#concurrency)
diff --git a/openspec/specs/architecture/0005-auto-tuner.md b/openspec/specs/architecture/0005-auto-tuner.md
deleted file mode 100644
index e40df24..0000000
--- a/openspec/specs/architecture/0005-auto-tuner.md
+++ /dev/null
@@ -1,164 +0,0 @@
----
-openspec:
-  type: architecture
-  status: active
-  created: 2024-01-01
-  tags: [architecture, rfc]
-  dependencies: []
-  affects: []
----
-
-# RFC 0005: Auto-Tuner System
-
-## Status
-
-**Status:** Accepted
-**Created:** 2024
-**Last Updated:** 2024
-
-## Overview
-
-Design an auto-tuning system that automatically selects optimal GEMM kernel configurations for given matrix dimensions and GPU architectures, eliminating manual parameter tuning.
-
-## Motivation
-
-1. **Architecture diversity**: Different GPUs have different optimal block sizes
-2. **Dimension sensitivity**: Best config varies with M, N, K dimensions
-3. **Eliminate guesswork**: Automatic search replaces manual tuning
-4. **Adaptability**: Re-tune when workload characteristics change
-
-## Design
-
-### Tuning Space
-
-The auto-tuner searches over these parameters:
-
-| Parameter | Range | Description |
-|-----------|-------|-------------|
-| BLOCK_M | {32, 64, 128, 256} | Tile row size |
-| BLOCK_N | {32, 64, 128, 256} | Tile column size |
-| BLOCK_K | {8, 16, 32} | K dimension tile size |
-| THREADS_PER_BLOCK | {128, 256, 512} | Thread block size |
-| KERNEL_VARIANT | {naive, tiled, optimized, vectorized} | Which kernel |
-
-### API Design
-
-```cpp
-struct TuneResult {
-    int block_m;
-    int block_n;
-    int block_k;
-    int threads_per_block;
-    int kernel_variant;
-    float execution_time_ms;
-    float gflops;
-};
-
-class AutoTuner {
-public:
-    explicit AutoTuner(StreamManager* streams = nullptr);
-
-    // Tune for specific dimensions
-    TuneResult tune(int M, int N, int K,
-                    const std::vector<int>& candidates = {});
-
-    // Get cached result (if exists)
-    bool get_cached(int M, int N, int K, TuneResult& result);
-
-    // Cache a result manually
-    void cache(int M, int N, int K, const TuneResult& result);
-
-    // Statistics
-    size_t cache_size() const;
-    float cache_hit_rate() const;
-    size_t total_tunes() const;
-
-    // Clear cache
-    void clear_cache();
-
-private:
-    struct CacheKey {
-        int M, N, K;
-        bool operator<(const CacheKey& other) const;
-        bool operator==(const CacheKey& other) const;
-    };
-
-    std::map<CacheKey, TuneResult> cache_;
-    StreamManager* streams_;
-    size_t total_tunes_;
-    size_t cache_hits_;
-};
-```
-
-### Tuning Algorithm
-
-```
-1. Check cache for (M, N, K) → HIT: return immediately
-2. Generate candidate configurations
-3. For each candidate:
-   a. Allocate test buffers
-   b. Run kernel 10 times (warmup + measurement)
-   c. Record median execution time
-4. Select configuration with minimum time
-5. Store in cache and return
-```
-
-### Cache Key Normalization
-
-To maximize cache hits, dimensions are rounded to nearest power of 2:
-
-```
-normalize(M) = 2^ceil(log2(M))
-```
-
-### Candidate Generation Strategy
-
-| M, N, K Range | Strategy | Candidates |
-|---------------|----------|------------|
-| Small (<128) | Exhaustive | All 48 combos |
-| Medium (128-512) | Greedy | 12 most promising |
-| Large (>512) | Heuristic | 6 best-known configs |
-
-### Performance Budget
-
-| Operation | Budget | Notes |
-|-----------|--------|-------|
-| Tuning overhead | <500ms | For small matrices |
-| Cache lookup | <1μs | O(log N) map lookup |
-| Cache hit rate | >80% | After initial warmup |
-
-### Logging and Diagnostics
-
-```cpp
-// AutoTuner logs tuning decisions
-struct TuneLog {
-    int M, N, K;
-    int candidates_evaluated;
-    TuneResult best;
-    float time_spent_ms;
-};
-
-// Query tuning history
-std::vector<TuneLog> get_tuning_history() const;
-```
-
-## Testing Strategy
-
-1. **Correctness**: All candidates produce mathematically correct results
-2. **Cache behavior**: Hit/miss behavior under various access patterns
-3. **Performance**: Selected config within 5% of true optimum
-4. **Edge cases**: M=1, N=1, K=1; non-power-of-2; very large dimensions
-5. **Stability**: Repeated tuning of same config produces consistent results
-
-## Implementation Files
-
-- `include/autotuner.h` - AutoTuner class
-- `src/autotuner.cu` - Implementation
-- `tests/test_autotuner.cpp` - Unit tests (if added later)
-
-## Future Work
-
-- Persistent cache (save/load tuning database)
-- Online tuning (adjust parameters during execution)
-- Machine learning-based configuration selection
-- Transfer learning between GPUs
diff --git a/openspec/specs/architecture/0006-logger-config-profiler.md b/openspec/specs/architecture/0006-logger-config-profiler.md
deleted file mode 100644
index e337f94..0000000
--- a/openspec/specs/architecture/0006-logger-config-profiler.md
+++ /dev/null
@@ -1,265 +0,0 @@
----
-openspec:
-  type: architecture
-  status: active
-  created: 2024-01-01
-  tags: [architecture, rfc]
-  dependencies: []
-  affects: []
----
-
-# RFC 0006: Logger, Config, and Profiler Infrastructure
-
-## Status
-
-**Status:** Accepted
-**Created:** 2024
-**Last Updated:** 2024
-
-## Overview
-
-Design the infrastructure components for logging, configuration management, and performance profiling, providing observability and configurability for the inference engine.
-
-## Logger System
-
-### Design Goals
-
-1. **Thread-safe**: Multiple threads can log simultaneously
-2. **Level-filtered**: DEBUG, INFO, WARN, ERROR levels
-3. **Low overhead**: Minimal impact on production performance
-4. **Structured output**: Consistent formatting for parsing
-
-### Log Levels
-
-| Level | Value | Usage |
-|-------|-------|-------|
-| DEBUG | 0 | Development diagnostics |
-| INFO | 1 | Normal operation messages |
-| WARN | 2 | Warning conditions |
-| ERROR | 3 | Error conditions |
-
-### API Design
-
-```cpp
-enum class LogLevel {
-    DEBUG = 0,
-    INFO = 1,
-    WARN = 2,
-    ERROR = 3
-};
-
-class Logger {
-public:
-    // Singleton access
-    static Logger& instance();
-
-    // Configuration
-    void set_level(LogLevel level);
-    void set_output(std::ostream& stream);
-    void set_file(const std::string& path);
-
-    // Logging
-    void log(LogLevel level, const std::string& message,
-             const std::string& file = "", int line = 0);
-
-    // Convenience macros (defined in logger.h)
-    // LOG_DEBUG("message"), LOG_INFO("message"), etc.
-
-    // Statistics
-    size_t messages_logged() const;
-
-private:
-    Logger();
-    LogLevel level_;
-    std::mutex mutex_;
-    size_t count_;
-};
-```
-
-### Convenience Macros
-
-```cpp
-#define LOG_DEBUG(msg) Logger::instance().log(LogLevel::DEBUG, msg, __FILE__, __LINE__)
-#define LOG_INFO(msg)  Logger::instance().log(LogLevel::INFO, msg, __FILE__, __LINE__)
-#define LOG_WARN(msg)  Logger::instance().log(LogLevel::WARN, msg, __FILE__, __LINE__)
-#define LOG_ERROR(msg) Logger::instance().log(LogLevel::ERROR, msg, __FILE__, __LINE__)
-```
-
-## Configuration System
-
-### Design Goals
-
-1. **INI format**: Simple, human-readable configuration files
-2. **Typed access**: `get<int>()`, `get<float>()`, `get<std::string>()`
-3. **Defaults**: Provide default values for missing keys
-4. **Hot reload**: Ability to reload config at runtime
-
-### INI File Format
-
-```ini
-[engine]
-device_id = 0
-max_batch_size = 512
-memory_pool_size = 268435456  ; 256MB
-
-[gemm]
-kernel_variant = optimized
-enable_tuning = true
-cache_size = 1000
-
-[logging]
-level = INFO
-file = engine.log
-```
-
-### API Design
-
-```cpp
-class Config {
-public:
-    static Config& instance();
-
-    // Loading
-    bool load_from_file(const std::string& path);
-    bool load_from_string(const std::string& content);
-
-    // Typed access
-    template<typename T>
-    T get(const std::string& section, const std::string& key,
-          const T& default_value = T{});
-
-    // Existence check
-    bool has_key(const std::string& section, const std::string& key) const;
-
-    // Reloading
-    bool reload();
-
-    // Debugging
-    void print() const;
-
-private:
-    Config();
-    std::map<std::string, std::map<std::string, std::string>> data_;
-    std::string config_path_;
-};
-```
-
-### Parser Implementation
-
-- Single-pass parsing (read entire file into memory)
-- Comments: lines starting with `#` or `;`
-- Sections: `[section_name]`
-- Keys: `key = value` (whitespace stripped)
-- No nesting, no variable substitution (keep it simple)
-
-## Profiler System
-
-### Design Goals
-
-1. **Scoped timing**: RAII-based timing of code blocks
-2. **Hierarchical**: Nested timing with parent-child relationships
-3. **Export**: JSON/CSV output for analysis
-4. **Low overhead**: <1% performance impact when enabled
-
-### API Design
-
-```cpp
-struct ProfileRecord {
-    std::string name;
-    double total_time_ms;
-    double avg_time_ms;
-    double min_time_ms;
-    double max_time_ms;
-    int call_count;
-};
-
-class Profiler {
-public:
-    static Profiler& instance();
-
-    // Start/stop profiling
-    void start();
-    void stop();
-
-    // Scoped timer (RAII)
-    class ScopeTimer {
-    public:
-        ScopeTimer(const std::string& name);
-        ~ScopeTimer();
-    private:
-        std::string name_;
-        cudaEvent_t start_, end_;
-    };
-
-    // Results
-    std::vector<ProfileRecord> get_records() const;
-    void export_json(const std::string& path) const;
-    void export_csv(const std::string& path) const;
-    void print_summary() const;
-
-    // Reset
-    void reset();
-
-private:
-    Profiler();
-    bool enabled_;
-    std::map<std::string, ProfileRecord> records_;
-};
-```
-
-### Usage Pattern
-
-```cpp
-void inference_forward() {
-    Profiler::ScopeTimer timer("forward_pass");
-
-    // Layer 1
-    {
-        Profiler::ScopeTimer t("layer1_gemm");
-        launch_gemm(...);
-    }
-
-    // Layer 2
-    {
-        Profiler::ScopeTimer t("layer2_gemm");
-        launch_gemm(...);
-    }
-}
-```
-
-## Error Handling
-
-| Component | Error Condition | Behavior |
-|-----------|----------------|----------|
-| Logger | File cannot be opened | Fall back to stderr |
-| Config | File not found | Throw `std::runtime_error` |
-| Config | Invalid INI syntax | Throw `std::runtime_error` |
-| Profiler | CUDA event creation fails | Throw `CudaException` |
-
-## Testing Strategy
-
-### Logger Tests
-1. Level filtering: Only messages >= level are output
-2. Thread safety: Concurrent logging doesn't crash
-3. File output: Written content matches expected
-
-### Config Tests
-1. Section/key parsing: All entries extracted correctly
-2. Typed access: `get<int>()` returns correct values
-3. Missing keys: Default values returned
-4. Reload: Updated file content reflected after reload
-
-### Profiler Tests
-1. Timing accuracy: Measured time matches expected
-2. Nesting: Parent timers include child time
-3. Export: JSON/CSV format is valid and parseable
-4. Overhead: <1% performance impact
-
-## Implementation Files
-
-- `include/logger.h` - Logger class (header-only for simplicity)
-- `include/config.h` - Config class (header-only)
-- `include/profiler.h` - Profiler class
-- `src/profiler.cu` - Profiler implementation
-- `tests/test_logger.cpp` - Logger tests
-- `tests/test_config.cpp` - Config tests
diff --git a/openspec/specs/architecture/0007-half-precision-gemm.md b/openspec/specs/architecture/0007-half-precision-gemm.md
deleted file mode 100644
index 8a0bc10..0000000
--- a/openspec/specs/architecture/0007-half-precision-gemm.md
+++ /dev/null
@@ -1,142 +0,0 @@
----
-openspec:
-  type: architecture
-  status: active
-  created: 2024-01-01
-  tags: [architecture, rfc]
-  dependencies: []
-  affects: []
----
-
-# RFC 0007: Half-Precision (FP16) GEMM Support
-
-## Status
-
-**Status:** Accepted
-**Created:** 2024
-**Last Updated:** 2024
-
-## Overview
-
-Add FP16 (half-precision) GEMM kernel support to leverage Tensor Cores on Volta+ GPUs and reduce memory bandwidth by 50% compared to FP32.
-
-## Motivation
-
-1. **Tensor Core acceleration**: FP16 enables Tensor Core usage on CC 7.0+ GPUs
-2. **Memory bandwidth**: 50% reduction vs FP32 (2 bytes vs 4 bytes per element)
-3. **Power efficiency**: FP16 arithmetic consumes less energy
-4. **Model compatibility**: Many modern models trained in FP16 or mixed precision
-
-## GPU Support Matrix
-
-| Architecture | CC | Tensor Core | FP16 Performance |
-|--------------|----|-------------|-----------------|
-| Volta | 7.0 | Yes (1st gen) | 4x FP32 |
-| Turing | 7.5 | Yes (2nd gen) | 4x FP32 |
-| Ampere | 8.0, 8.6 | Yes (3rd gen) | 4x FP32 |
-| Ada Lovelace | 8.9 | Yes (4th gen) | 4x FP32 |
-| Hopper | 9.0 | Yes (5th gen) | 4x FP32 |
-
-## Design
-
-### FP16 GEMM Kernel
-
-```cpp
-// Basic FP16 GEMM (no Tensor Cores, scalar FP16 arithmetic)
-__global__ void half_gemm(
-    const half* A, const half* B, half* C,
-    int M, int N, int K
-);
-
-// FP16 GEMM with FP32 accumulation (improved accuracy)
-__global__ void half_gemm_fp32_accum(
-    const half* A, const half* B, float* C,
-    int M, int N, int K
-);
-```
-
-### Memory Layout
-
-FP16 tensors use the same layout as FP32, just 2 bytes per element:
-
-```
-FP32: [float0][float1][float2]...  (4 bytes each)
-FP16: [half0][half1][half2]...     (2 bytes each)
-```
-
-### Type Conversion API
-
-```cpp
-// FP32 → FP16 conversion
-void fp32_to_fp16(const float* src, half* dst, size_t elements);
-
-// FP16 → FP32 conversion
-void fp16_to_fp32(const half* src, float* dst, size_t elements);
-
-// Mixed-precision GEMM wrapper
-void mixed_precision_gemm(
-    const float* A_fp32, const float* B_fp32, float* C_fp32,
-    int M, int N, int K,
-    cudaStream_t stream = 0
-);
-```
-
-### Accuracy Considerations
-
-| Operation | FP16 Error | FP32 Error | Notes |
-|-----------|-----------|-----------|-------|
-| Addition | 2^-11 | 2^-24 | Half precision |
-| Multiplication | 2^-11 | 2^-24 | Half precision |
-| Accumulation (FP32) | 2^-24 | 2^-24 | FP32 accumulator |
-| Accumulation (FP16) | 2^-8 | 2^-24 | Accumulated rounding |
-
-**Recommendation**: Always use FP32 accumulation for GEMM to maintain accuracy.
-
-### Performance Targets
-
-| Configuration | FP32 TFLOPS | FP16 TFLOPS | Speedup |
-|---------------|-------------|-------------|---------|
-| RTX 3090 (Ampere) | ~15 | ~60 | 4x |
-| RTX 4090 (Ada) | ~20 | ~80 | 4x |
-| A100 (Ampere) | ~20 | ~80 | 4x |
-
-### Kernel Design Strategy
-
-#### Phase 1: Basic FP16 Kernel (Current)
-- Scalar FP16 arithmetic
-- FP32 accumulation for accuracy
-- Shared memory tiling with FP16 tiles
-
-#### Phase 2: Tensor Core Acceleration (Future)
-- WMMA (Warp-level Matrix Multiply Accumulate) API
-- 16x16x16 tile size for Tensor Cores
-- Requires CC 7.0+
-
-### Implementation Files
-
-- `include/half_gemm.cuh` - FP16 kernel declarations and helpers
-- `src/half_gemm.cu` - FP16 GEMM kernel implementation
-- `tests/test_half_gemm.cpp` - FP16 GEMM tests (if added)
-
-### Error Handling
-
-| Condition | Behavior |
-|-----------|----------|
-| FP16 overflow (>65504) | Clamp to max FP16 value |
-| FP16 underflow (<6.1e-5) | Flush to zero |
-| NaN in FP16 input | Propagate NaN to output |
-| Unsupported GPU | Fall back to FP32 kernel |
-
-## Testing Strategy
-
-1. **Conversion accuracy**: Round-trip FP32→FP16→FP32 within budget
-2. **GEMM correctness**: FP16 GEMM matches FP32 within 2^-8 tolerance
-3. **Performance**: FP16 achieves ≥2x speedup over FP32 on supported GPUs
-4. **Edge cases**: Overflow, underflow, NaN propagation
-5. **Compatibility**: Correct fallback on pre-Volta GPUs
-
-## References
-
-- [CUDA FP16 Header](https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__HALF.html)
-- [WMMA API Documentation](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#wmma)
-- [Tensor Core Performance Guide](https://docs.nvidia.com/deeplearning/performance/mixed-precision/index.html)
diff --git a/openspec/specs/architecture/0008-batch-gemm.md b/openspec/specs/architecture/0008-batch-gemm.md
deleted file mode 100644
index 18d79b1..0000000
--- a/openspec/specs/architecture/0008-batch-gemm.md
+++ /dev/null
@@ -1,152 +0,0 @@
----
-openspec:
-  type: architecture
-  status: active
-  created: 2024-01-01
-  tags: [architecture, rfc]
-  dependencies: []
-  affects: []
----
-
-# RFC 0008: Batch GEMM System
-
-## Status
-
-**Status:** Accepted
-**Created:** 2024
-**Last Updated:** 2024
-
-## Overview
-
-Design a batch GEMM system that efficiently processes multiple matrix multiplications in a single kernel launch, optimizing for inference workloads with repeated matrix operations across different layers or batches.
-
-## Motivation
-
-1. **Multi-layer inference**: Neural networks have multiple layers, each with GEMM
-2. **Batch processing**: Serve multiple requests with same model efficiently
-3. **Kernel launch overhead**: Amortize launch cost across many small matrices
-4. **Memory efficiency**: Coalesce memory accesses for small matrices
-
-## Design
-
-### Batch GEMM API
-
-```cpp
-class BatchGemmRunner {
-public:
-    // Add a GEMM to the batch
-    void add_gemm(const float* A, const float* B, float* C,
-                  int M, int N, int K);
-
-    // Execute all queued GEMMs
-    void execute(cudaStream_t stream = 0);
-
-    // Clear the batch without executing
-    void clear();
-
-    // Statistics
-    int gemm_count() const;
-    float estimated_time_ms() const;
-
-    // Configuration
-    void set_kernel_variant(int variant);
-    void set_max_batch_size(int size);
-
-private:
-    struct GemmTask {
-        const float* A;
-        const float* B;
-        float* C;
-        int M, N, K;
-    };
-
-    std::vector<GemmTask> tasks_;
-    int max_batch_size_;
-    int kernel_variant_;
-};
-```
-
-### Execution Strategy
-
-#### Strategy 1: Sequential Execution
-- Launch one kernel per GEMM
-- Pros: Simple, uses optimal kernel per task
-- Cons: High launch overhead for many small GEMMs
-
-#### Strategy 2: Batched Kernel Launch
-- Single kernel processes all GEMMs
-- Pros: One launch, coalesced memory access
-- Cons: All GEMMs must be same size
-
-#### Strategy 3: Grouped Execution
-- Group same-size GEMMs together
-- Execute each group with batched kernel
-- Pros: Balance of efficiency and flexibility
-- Cons: More complex scheduling
-
-**Decision**: Implement Strategy 3 (Grouped Execution) for production.
-
-### Kernel Design
-
-```cpp
-// Batched GEMM kernel: processes N GEMMs of same dimensions
-__global__ void batched_gemm(
-    const float** A_array, const float** B_array, float** C_array,
-    int M, int N, int K,
-    int batch_count
-);
-```
-
-### Memory Layout for Batched GEMM
-
-```
-GPU Memory:
-┌─────────────────────────────────────────────┐
-│ A_pointers[N]  → [A_0, A_1, ..., A_N-1]     │
-│ B_pointers[N]  → [B_0, B_1, ..., B_N-1]     │
-│ C_pointers[N]  → [C_0, C_1, ..., C_N-1]     │
-│                                            │
-│ A_0: [M×K matrix]                          │
-│ A_1: [M×K matrix]                          │
-│ ...                                        │
-│ B_0: [K×N matrix]                          │
-│ ...                                        │
-└─────────────────────────────────────────────┘
-```
-
-### Performance Targets
-
-| Scenario | Sequential | Batched | Speedup |
-|----------|-----------|---------|---------|
-| 10× 64×64 GEMMs | 0.5ms | 0.15ms | 3.3x |
-| 100× 64×64 GEMMs | 5.0ms | 0.4ms | 12.5x |
-| 10× 512×512 GEMMs | 1.0ms | 0.8ms | 1.25x |
-
-### Error Handling
-
-| Condition | Behavior |
-|-----------|----------|
-| Empty batch | No-op (no kernel launch) |
-| Exceed max_batch_size | Execute current batch, queue remaining |
-| Mixed dimensions | Group by dimensions, execute separately |
-| Invalid pointer | Throw `std::invalid_argument` |
-
-## Testing Strategy
-
-1. **Correctness**: Each GEMM produces correct output independently
-2. **Grouping**: Same-size GEMMs grouped correctly
-3. **Performance**: Meets speedup targets vs sequential
-4. **Edge cases**: Single GEMM batch, very large batch count
-5. **Memory safety**: No out-of-bounds accesses
-
-## Implementation Files
-
-- `include/batch_gemm.h` - BatchGemmRunner class
-- `src/batch_gemm.cu` - Batch GEMM kernel and runner
-- `tests/test_batch_gemm.cpp` - Unit tests
-
-## Future Work
-
-- Strided batched GEMM (cuBLAS-style)
-- Dynamic grouping based on runtime statistics
-- Integration with StreamManager for concurrent batches
diff --git a/openspec/specs/data/README.md b/openspec/specs/data/README.md
deleted file mode 100644
index b020ce8..0000000
--- a/openspec/specs/data/README.md
+++ /dev/null
@@ -1,25 +0,0 @@
----
-openspec:
-  type: data
-  status: active
-  created: 2024-01-01
-  tags: [data, schema]
----
-
-# Data Specifications
-
-数据模型与 Schema 定义。完整规范见 [`tensor-schema.yaml`](tensor-schema.yaml)。
-
-## 核心数据结构
-
-- **Tensor**: N 维张量，支持 GPU 内存池
-- **Weight File**: 自定义二进制权重格式
-
-## 文档链接
-
-- [API Reference (English)](../../docs/en/API_REFERENCE.md)
-- [API 参考 (中文)](../../docs/zh/API_REFERENCE.md)
-
----
-
-See [AGENTS.md](../../AGENTS.md) for development workflow.
diff --git a/openspec/specs/data/tensor-schema.yaml b/openspec/specs/data/tensor-schema.yaml
deleted file mode 100644
index 4deae9b..0000000
--- a/openspec/specs/data/tensor-schema.yaml
+++ /dev/null
@@ -1,145 +0,0 @@
----
-openspec:
-  type: data
-  status: active
-  created: 2024-01-01
-  tags: [data, schema]
----
-
-# Tensor Data Model Schema
-# Defines the structure for tensors in Mini-Inference Engine
-# Version: 1.0.0
-
-tensor_schema:
-  version: "1.0.0"
-  description: Core tensor data structure and memory layout definitions
-
-tensor:
-  type: object
-  required:
-    - shape
-    - dtype
-    - device
-  properties:
-    shape:
-      type: array
-      minItems: 1
-      maxItems: 4
-      items:
-        type: integer
-        minimum: 1
-      description: Tensor dimensions [N, C, H, W] or subsets
-      example: [32, 256, 14, 14]
-
-    dtype:
-      type: string
-      enum:
-        - float32
-        - float16
-        - int8
-      default: float32
-      description: Data type for tensor elements
-
-    device:
-      type: string
-      enum:
-        - cuda
-        - cpu
-      default: cuda
-      description: Memory location (CUDA device or host)
-
-    strides:
-      type: array
-      items:
-        type: integer
-        minimum: 0
-      description: |
-        Memory strides for each dimension.
-        Computed as cumulative product of shape dimensions in reverse order.
-        Example for [32, 256, 14, 14]: [50176, 196, 14, 1]
-
-    data_ptr:
-      type: string
-      description: Opaque pointer to tensor data (device or host address)
-      readOnly: true
-
-    bytes:
-      type: integer
-      minimum: 0
-      description: Total allocated bytes
-      readOnly: true
-      formula: "product(shape) * sizeof(dtype)"
-
-    is_contiguous:
-      type: boolean
-      default: true
-      description: Whether tensor memory is contiguous (row-major)
-
-    name:
-      type: string
-      maxLength: 128
-      description: Optional tensor identifier for debugging
-
-memory_layout:
-  description: Memory organization for tensors
-  layout_type: row_major
-  alignment: 256  # bytes, for CUDA memory coalescing
-
-  float32:
-    element_size: 4
-    alignment: 256
-    description: Standard 32-bit IEEE 754 floating point
-
-  float16:
-    element_size: 2
-    alignment: 256
-    description: Half-precision floating point (CUDA FP16)
-
-  int8:
-    element_size: 1
-    alignment: 256
-    description: 8-bit signed integer for quantized models
-
-weight_file_format:
-  description: Binary format for persisted model weights
-  magic_number: "MIE\x00"  # 4 bytes
-  version:
-    offset: 4
-    size: 4
-    type: uint32
-  layers:
-    offset: 8
-    count_size: 4
-    type: uint32
-  layer_entry:
-    name_size:
-      type: uint32
-      description: Length of layer name string
-    name:
-      type: string
-      encoding: UTF-8
-    weight_shape_size:
-      type: uint32
-      description: Number of dimensions
-    weight_shape:
-      type: array
-      items:
-        type: uint32
-    weight_dtype:
-      type: uint8
-      enum: { float32: 0, float16: 1, int8: 2 }
-    weight_bytes:
-      type: uint32
-      description: Size of weight data in bytes
-    weight_data:
-      type: binary
-      description: Raw weight tensor data
-
-error_codes:
-  SUCCESS: 0
-  INVALID_ARG: 1
-  OUT_OF_MEMORY: 2
-  CUDA_ERROR: 3
-  NOT_INITIALIZED: 4
-  FILE_NOT_FOUND: 5
-  INVALID_FORMAT: 6
diff --git a/openspec/specs/product/gemm-optimization-requirements.md b/openspec/specs/product/gemm-optimization-requirements.md
deleted file mode 100644
index fd587e1..0000000
--- a/openspec/specs/product/gemm-optimization-requirements.md
+++ /dev/null
@@ -1,171 +0,0 @@
----
-openspec:
-  type: product
-  status: active
-  created: 2024-01-01
-  tags: [requirements, gemm, optimization]
----
-
-# Requirements Document: Mini-Inference Engine
-
-## Introduction
-
-This project implements a lightweight neural network inference engine with a focus on GEMM (General Matrix Multiply) optimization techniques. The reference goal is for the best optimized kernel to reach about 85% of cuBLAS-class throughput on the documented RTX 3080 1024×1024 benchmark; results vary by GPU, matrix size, driver, and compiler options.
-
-## Glossary
-
-| Term | Definition |
-|------|------------|
-| Inference Engine | Neural network inference engine responsible for loading model weights and executing forward propagation |
-| GEMM | General Matrix Multiply, C = α·A·B + β·C |
-| Kernel | CUDA kernel function executed in parallel on GPU |
-| Tiling | Blocking technique that divides large matrices into smaller tiles to utilize shared memory |
-| Memory Coalescing | Memory optimization to maximize bandwidth utilization |
-| Double Buffering | Technique to hide memory latency through prefetching |
-| Register Blocking | Register optimization to improve compute density |
-| Kernel Fusion | Operator fusion that combines multiple operations into a single kernel |
-
-## Requirements
-
-### R1: Naive Matrix Multiplication
-
-**User Story:** As a developer, I want to implement a basic matrix multiplication kernel to establish a performance baseline.
-
-| ID | Acceptance Criteria |
-|----|---------------------|
-| R1.1 | Support arbitrary size matrix multiplication C = A × B |
-| R1.2 | Given input matrices A(M×K) and B(K×N), correctly output C(M×N) |
-| R1.3 | Each thread computes one output element |
-| R1.4 | Result matches CPU reference implementation with error < 1e-5 |
-| R1.5 | Record execution time and GFLOPS |
-
-### R2: Tiling Optimization
-
-**User Story:** As a developer, I want to use tiling technique to optimize matrix multiplication and reduce global memory accesses.
-
-| ID | Acceptance Criteria |
-|----|---------------------|
-| R2.1 | Divide input matrices into 32×32 tiles |
-| R2.2 | Load tile data from global memory to shared memory |
-| R2.3 | Compute using shared memory data |
-| R2.4 | Correctly handle boundary conditions |
-| R2.5 | Achieve ≥ 5x performance improvement over Naive |
-
-### R3: Memory Coalescing
-
-**User Story:** As a developer, I want to optimize memory access patterns to maximize bandwidth utilization.
-
-| ID | Acceptance Criteria |
-|----|---------------------|
-| R3.1 | Threads in the same warp access contiguous memory addresses |
-| R3.2 | Matrix A uses row-major access pattern |
-| R3.3 | Matrix B uses optimized access pattern |
-| R3.4 | Achieve ≥ 20% performance improvement |
-
-### R4: Double Buffering
-
-**User Story:** As a developer, I want to implement double buffering technique to hide memory latency.
-
-| ID | Acceptance Criteria |
-|----|---------------------|
-| R4.1 | Use two sets of shared memory buffers |
-| R4.2 | Prefetch next tile while computing current tile |
-| R4.3 | Achieve overlap between computation and data transfer |
-| R4.4 | Achieve ≥ 15% performance improvement |
-
-### R5: Register Blocking
-
-**User Story:** As a developer, I want to optimize register usage to improve compute density.
-
-| ID | Acceptance Criteria |
-|----|---------------------|
-| R5.1 | Use registers to store intermediate results |
-| R5.2 | Avoid shared memory bank conflicts |
-| R5.3 | Use vectorized loads (float4) |
-| R5.4 | Achieve approximately 85% of cuBLAS-class throughput on the reference benchmark |
-
-### R6: Kernel Fusion
-
-**User Story:** As a developer, I want to fuse MatMul + Bias + ReLU into a single kernel to reduce memory reads/writes.
-
-| ID | Acceptance Criteria |
-|----|---------------------|
-| R6.1 | Single kernel completes Y = ReLU(X × W + bias) |
-| R6.2 | Add bias directly in registers |
-| R6.3 | Apply ReLU directly in registers |
-| R6.4 | Reduce time by ≥ 30% compared to separate kernels |
-| R6.5 | Support optional bias and activation configuration |
-
-### R7: Weight Loading and Inference
-
-**User Story:** As a developer, I want to load pretrained weights and execute inference to validate engine correctness.
-
-| ID | Acceptance Criteria |
-|----|---------------------|
-| R7.1 | Support loading weights from binary files |
-| R7.2 | Support simple fully-connected network (MNIST) |
-| R7.3 | Correctly execute multi-layer forward propagation |
-| R7.4 | MNIST accuracy matches reference |
-| R7.5 | Report per-layer execution time |
-
-### R8: Performance Benchmarking
-
-**User Story:** As a developer, I want to run performance benchmarks to quantify optimization results.
-
-| ID | Acceptance Criteria |
-|----|---------------------|
-| R8.1 | Test matrix sizes: 256, 512, 1024, 2048, 4096 |
-| R8.2 | Record GFLOPS and cuBLAS performance ratio |
-| R8.3 | Generate performance comparison report |
-| R8.4 | Validate numerical correctness for all versions |
-| R8.5 | Report mean and standard deviation over multiple iterations |
-
-### R9: Error Handling and Resource Management
-
-**User Story:** As a developer, I want robust error handling to safely manage exceptional conditions.
-
-| ID | Acceptance Criteria |
-|----|---------------------|
-| R9.1 | CUDA API failures return descriptive errors |
-| R9.2 | Safely release resources on memory allocation failures |
-| R9.3 | Correctly release all GPU resources on program exit |
-| R9.4 | Detect and report dimension mismatches |
-| R9.5 | Convert CUDA error codes to human-readable messages |
-
-## Correctness Properties
-
-### P1: Matrix Multiplication Correctness
-
-For any matrices A(M×K) and B(K×N), GPU result C matches CPU reference within max error 1e-5.
-
-**Validates:** R1.1, R1.2, R1.4
-
-### P2: Optimized GEMM Equivalence
-
-For any matrices A and B, all optimized implementations produce equivalent results to Naive.
-
-**Validates:** R2.4, R8.4
-
-### P3: Kernel Fusion Correctness
-
-For any input X, weight W, bias b, fused output equals sequential MatMul + Bias + ReLU.
-
-**Validates:** R6.1, R6.5
-
-### P4: Weight Serialization Round-Trip
-
-For any valid weights, save and load produces bit-identical results.
-
-**Validates:** R7.1
-
-### P5: Multi-Layer Forward Pass Consistency
-
-For any input batch, forward pass produces equivalent results to sequential layer execution.
-
-**Validates:** R7.3
-
-### P6: Dimension Mismatch Detection
-
-For matrices where A.cols != B.rows, engine detects and reports error before computation.
-
-**Validates:** R9.4
diff --git a/openspec/specs/product/implementation-plan.md b/openspec/specs/product/implementation-plan.md
deleted file mode 100644
index 967205a..0000000
--- a/openspec/specs/product/implementation-plan.md
+++ /dev/null
@@ -1,271 +0,0 @@
----
-openspec:
-  type: product
-  status: active
-  created: 2024-01-01
-  tags: [implementation, roadmap]
----
-
-# Implementation Plan: Mini-Inference Engine
-
-## Overview
-
-This implementation plan breaks down the Mini-Inference Engine into progressive coding tasks, from project infrastructure to complete optimization paths.
-
-## Progress Summary
-
-| Phase | Status | Completion |
-|-------|--------|------------|
-| 1. Project Infrastructure | ✅ Complete | 100% |
-| 2-3. Naive MatMul | ✅ Complete | 100% |
-| 4-8. GEMM Optimization | ✅ Complete | 100% |
-| 9. Kernel Fusion | ✅ Complete | 100% |
-| 10-11. Inference Validation | ✅ Complete | 100% |
-| 12-13. Performance Testing | ✅ Complete | 100% |
-| 14-18. Engineering Enhments | ✅ Complete | 100% |
-
----
-
-## Phase 1: Project Infrastructure
-
-### Task 1.1: CMake Build System ✅
-
-- Create `CMakeLists.txt` with CUDA compilation configuration
-- Create directories: `src/`, `include/`, `tests/`, `benchmarks/`
-- Configure Google Test dependency
-
-**Requirements:** R9.1, R9.3
-
-### Task 1.2: Core Data Structures ✅
-
-- Implement `MatrixDesc`, `GemmConfig`, `FusionConfig`, `PerfStats`
-- Implement `CUDA_CHECK` macro and `CudaException` class
-- Implement `DeviceMemory` RAII wrapper class
-
-**Requirements:** R9.1, R9.2, R9.3, R9.5
-
-### Task 1.3: Input Validation and Reference Implementation ✅
-
-- Implement `validate_gemm_inputs()` function
-- Implement CPU version matrix multiplication
-- Implement matrix comparison function
-
-**Requirements:** R1.4, R9.4
-
----
-
-## Phase 2: Naive MatMul
-
-### Task 2.1: Naive MatMul Kernel ✅
-
-- Write `naive_matmul` CUDA kernel
-- Each thread computes one output element
-- Implement kernel launch wrapper function
-
-**Requirements:** R1.1, R1.2, R1.3
-
-### Task 2.2: Performance Measurement Tools ✅
-
-- Use CUDA Events for timing
-- Calculate GFLOPS: `2*M*N*K / time / 1e9`
-
-**Requirements:** R1.5
-
----
-
-## Phase 3: Tiled GEMM
-
-### Task 3.1: Tiled GEMM Kernel ✅
-
-- Write `tiled_gemm` kernel using shared memory
-- Implement 32×32 tile blocking strategy
-- Handle boundary conditions
-
-**Requirements:** R2.1, R2.2, R2.3, R2.4
-
-### Task 3.2: Performance Comparison ✅
-
-- Compare Naive vs Tiled performance
-- Verify ≥ 5x performance improvement
-
-**Requirements:** R2.5
-
----
-
-## Phase 4: Memory Coalescing
-
-### Task 4.1: Optimize Memory Access Patterns ✅
-
-- Modify Tiled GEMM to ensure coalesced memory access
-- Optimize row-major loading for matrix A
-- Optimize access pattern for matrix B
-
-**Requirements:** R3.1, R3.2, R3.3
-
-### Task 4.2: Performance Validation ✅
-
-- Verify ≥ 20% performance improvement
-
-**Requirements:** R3.4
-
----
-
-## Phase 5: Double Buffering
-
-### Task 5.1: Double Buffering GEMM Kernel ✅
-
-- Implement two sets of shared memory buffers
-- Implement overlap between computation and data prefetching
-- Use asynchronous memory operations
-
-**Requirements:** R4.1, R4.2, R4.3
-
-### Task 5.2: Performance Validation ✅
-
-- Verify ≥ 15% performance improvement
-
-**Requirements:** R4.4
-
----
-
-## Phase 6: Register Blocking
-
-### Task 6.1: Register Blocked GEMM Kernel ✅
-
-- Implement templated `optimized_gemm<BM, BN, BK, TM, TN>`
-- Each thread computes TM×TN output block
-- Use vectorized loads (float4)
-- Avoid shared memory bank conflicts
-
-**Requirements:** R5.1, R5.2, R5.3
-
-### Task 6.2: cuBLAS Performance Comparison ✅
-
-- Integrate cuBLAS reference
-- Verify approximately 85% cuBLAS-class throughput on the reference benchmark
-
-**Requirements:** R5.4
-
----
-
-## Phase 7: Kernel Fusion
-
-### Task 7.1: Fusion Kernel ✅
-
-- Write `fused_gemm_bias_relu` template kernel
-- Support optional bias addition
-- Support optional ReLU activation
-
-**Requirements:** R6.1, R6.2, R6.3, R6.5
-
-### Task 7.2: Fusion Performance Validation ✅
-
-- Compare fused vs separate kernels
-- Verify ≥ 30% time reduction
-
-**Requirements:** R6.4
-
----
-
-## Phase 8: Weight Loading and Inference
-
-### Task 8.1: Weight File Format ✅
-
-- Define binary weight file format
-- Implement `load_weights()` / `save_weights()`
-
-**Requirements:** R7.1
-
-### Task 8.2: InferenceEngine ✅
-
-- Implement `init()`, `cleanup()` lifecycle
-- Implement `forward()` multi-layer forward propagation
-- Support MNIST network architecture
-
-**Requirements:** R7.2, R7.3
-
----
-
-## Phase 9: MNIST Validation
-
-### Task 9.1: Data Preparation ✅
-
-- Create weight export script
-- Prepare test image data
-
-**Requirements:** R7.4
-
-### Task 9.2: End-to-End Testing ✅
-
-- Validate inference accuracy
-- Report per-layer execution time
-
-**Requirements:** R7.4, R7.5
-
----
-
-## Phase 10: Performance Benchmarking
-
-### Task 10.1: Benchmark Framework ✅
-
-- Test matrix sizes: 256, 512, 1024, 2048, 4096
-- Report mean and standard deviation over multiple iterations
-- Generate performance comparison report
-
-**Requirements:** R8.1, R8.2, R8.3, R8.5
-
-### Task 10.2: Correctness Validation ✅
-
-- Verify all optimized versions produce consistent results
-
-**Requirements:** R8.4
-
----
-
-## Phase 11: Engineering Enhments (Complete)
-
-### Task 11.1: Logging System ✅
-
-- Multiple log levels
-- Console and file output
-- Colored output
-
-### Task 11.2: Configuration Management ✅
-
-- Load configuration from file
-- Support environment variables
-- GEMM preset configurations
-
-### Task 11.3: GPU Memory Pool ✅
-
-- Cached allocation
-- Thread safety
-- Statistics
-
-### Task 11.4: Tensor Class ✅
-
-- GPU storage
-- Shape management
-- Mathematical operations
-
-### Task 11.5: Advanced Features ✅
-
-- Vectorized GEMM
-- Half-precision GEMM
-- Profiler
-- Auto-tuner
-- Stream manager
-- Batched GEMM
-- INT8 quantization
-
----
-
-## Notes
-
-- Tasks marked ✅ are complete
-- Each task references specific requirements for traceability
-- Property testing tasks are optional and can be skipped to accelerate development
-
-## Next Steps
-
-See [Roadmap](../../CHANGELOG.md#roadmap) for future planned features.
diff --git a/openspec/specs/testing/README.md b/openspec/specs/testing/README.md
deleted file mode 100644
index 98eba86..0000000
--- a/openspec/specs/testing/README.md
+++ /dev/null
@@ -1,36 +0,0 @@
----
-openspec:
-  type: testing
-  status: active
-  created: 2024-01-01
-  tags: [testing, bdd]
----
-
-# Test Specifications
-
-BDD 测试规范。完整规范见 [`gemm-tensor-inference-tests.yaml`](gemm-tensor-inference-tests.yaml)。
-
-## 正确性属性
-
-| 属性 | 描述 |
-|------|------|
-| P1 | 矩阵乘法正确性：GPU 结果与 CPU 参考误差 < 1e-5 |
-| P2 | 优化 GEMM 等价性：所有优化实现与 Naive 结果一致 |
-| P3 | 融合内核正确性：融合输出等于顺序计算 |
-| P4 | 权重序列化往返：保存/加载结果位相同 |
-| P5 | 多层前向一致性：与前向传播结果一致 |
-| P6 | 维度不匹配检测：错误报告 |
-
-## 测试文件
-
-| 文件 | 覆盖范围 |
-|------|----------|
-| `test_gemm.cu` | GEMM 内核 |
-| `test_tensor.cpp` | Tensor 操作 |
-| `test_inference.cpp` | InferenceEngine |
-| `test_memory_pool.cpp` | MemoryPool |
-| `test_stream_manager.cpp` | StreamManager |
-
----
-
-See [AGENTS.md](../../AGENTS.md) for development workflow.
diff --git a/openspec/specs/testing/gemm-tensor-inference-tests.yaml b/openspec/specs/testing/gemm-tensor-inference-tests.yaml
deleted file mode 100644
index d0502c1..0000000
--- a/openspec/specs/testing/gemm-tensor-inference-tests.yaml
+++ /dev/null
@@ -1,228 +0,0 @@
----
-openspec:
-  type: testing
-  status: active
-  created: 2024-01-01
-  tags: [testing, bdd]
----
-
-# BDD Test Specifications
-# Mini-Inference Engine Test Cases
-# Version: 1.0.0
-
-test_suite: gemm_operations
-  description: Matrix multiplication (GEMM) kernel tests
-  source: specs/product/gemm-optimization-requirements.md
-
-  test_cases:
-    - name: "GEMM naive matmul correctness"
-      description: Verify naive matmul produces correct results
-      given:
-        - matrix_A: "128x128 random float32"
-        - matrix_B: "128x128 random float32"
-      when: "launch_naive_matmul(A, B, C)"
-      then:
-        - "C[i,j] == sum(A[i,k] * B[k,j]) for all i,j"
-        - "max_error < 1e-5"
-      tags: [gemm, correctness, small-matrix]
-
-    - name: "GEMM tiled matmul correctness"
-      description: Verify tiled matmul with shared memory optimization
-      given:
-        - matrix_A: "1024x1024 random float32"
-        - matrix_B: "1024x1024 random float32"
-        - tile_size: 32
-      when: "launch_tiled_gemm(A, B, C, tile_size=32)"
-      then:
-        - "results match naive matmul within tolerance"
-        - "memory bandwidth utilization > 70%"
-      tags: [gemm, tiled, shared-memory]
-
-    - name: "GEMM performance benchmark"
-      description: Verify GEMM achieves target performance
-      given:
-        - M: 4096
-        - N: 4096
-        - K: 4096
-      when: "launch_optimized_gemm(A, B, C, M, N, K)"
-      then:
-        - "execution_time < 10ms on RTX 4090"
-        - "TFLOPS > 80% of theoretical peak"
-      tags: [gemm, performance, benchmark]
-
-    - name: "GEMM edge cases"
-      description: Test boundary conditions
-      cases:
-        - "M=1, N=1, K=1 (minimum dimensions)"
-        - "M=1, N=4096, K=4096 (vector-matrix)"
-        - "M=4096, N=1, K=4096 (matrix-vector)"
-        - "non-multiple of tile_size dimensions"
-      then:
-        - "no CUDA errors"
-        - "results mathematically correct"
-      tags: [gemm, edge-cases, boundary]
-
-test_suite: tensor_operations
-  description: Tensor creation, manipulation, and memory management
-  source: specs/db/tensor-schema.yaml
-
-  test_cases:
-    - name: "Tensor creation and destruction"
-      given:
-        - shape: "[32, 256]"
-        - dtype: float32
-      when: "Tensor::create(shape, dtype)"
-      then:
-        - "tensor is not null"
-        - "tensor.bytes == 32 * 256 * 4"
-        - "tensor.data_ptr is valid CUDA pointer"
-      tags: [tensor, lifecycle]
-
-    - name: "Tensor shape validation"
-      description: Reject invalid shapes
-      when:
-        - "Tensor::create([], float32) (empty shape)"
-        - "Tensor::create([0, 128], float32) (zero dimension)"
-        - "Tensor::create([-1, 128], float32) (negative dimension)"
-      then:
-        - "throw std::invalid_argument"
-      tags: [tensor, validation, error-handling]
-
-    - name: "Tensor contiguous memory"
-      given: "Tensor with shape [4, 3, 32, 32]"
-      when: "access tensor elements sequentially"
-      then:
-        - "stride[3] == 1"
-        - "stride[2] == 32"
-        - "stride[1] == 1024"
-        - "stride[0] == 3072"
-        - "memory is row-major contiguous"
-      tags: [tensor, memory-layout, strides]
-
-test_suite: inference_pipeline
-  description: End-to-end inference engine tests
-  source: specs/api/engine-api.yaml
-
-  test_cases:
-    - name: "Inference engine initialization"
-      when: "InferenceEngine::init(device_id=0)"
-      then:
-        - "CUDA context is created"
-        - "memory pool is initialized"
-        - "subsequent init() calls throw already_initialized"
-      tags: [inference, initialization]
-
-    - name: "Weight loading and validation"
-      given: "valid .weights file with known content"
-      when: "engine.load_weights(path)"
-      then:
-        - "all layers are loaded"
-        - "weight values match expected within 1e-6"
-        - "memory usage matches file size"
-      tags: [inference, weights, loading]
-
-    - name: "Forward pass correctness"
-      given:
-        - "loaded MNIST model (2-layer MLP)"
-        - "input: known MNIST image tensor"
-        - "expected_output: known classification result"
-      when: "engine.forward(input, output, batch_size=1)"
-      then:
-        - "output matches expected classification"
-        - "argmax(output) == expected_label"
-      tags: [inference, forward, correctness, mnist]
-
-    - name: "Batch inference"
-      given: "inference engine with loaded model"
-      when:
-        - "batch_size=1, forward(input_1)"
-        - "batch_size=32, forward(input_32)"
-        - "batch_size=512, forward(input_512)"
-      then:
-        - "all batches complete without errors"
-        - "per-sample results match batch_size=1 results"
-        - "larger batches show improved throughput"
-      tags: [inference, batch, scalability]
-
-test_suite: memory_management
-  description: Memory pool and allocation tests
-  source: include/memory_pool.h
-
-  test_cases:
-    - name: "Memory pool allocation"
-      given: "MemoryPool with 256MB capacity"
-      when: "pool.allocate(1MB)"
-      then:
-        - "returns valid CUDA pointer"
-        - "pool.used() == 1MB"
-        - "pool.free_space() == 255MB"
-      tags: [memory, pool, allocation]
-
-    - name: "Memory pool fragmentation"
-      given: "MemoryPool with multiple allocations"
-      when:
-        - "allocate block_A: 10MB"
-        - "allocate block_B: 5MB"
-        - "free block_A"
-        - "allocate block_C: 8MB"
-      then:
-        - "block_C reuses block_A's space"
-        - "pool fragmentation < 20%"
-      tags: [memory, pool, fragmentation]
-
-    - name: "Memory pool overflow"
-      given: "MemoryPool with 10MB capacity"
-      when: "allocate 11MB block"
-      then:
-        - "returns nullptr or throws OUT_OF_MEMORY"
-      tags: [memory, pool, error-handling]
-
-test_suite: quantization
-  description: INT8/FP16 quantization tests
-  source: include/quantization.h
-
-  test_cases:
-    - name: "FP32 to INT8 quantization"
-      given: "float32 tensor with values [-128.0, 127.0]"
-      when: "quantize_to_int8(tensor)"
-      then:
-        - "INT8 values match round(float32 values)"
-        - "scale factor is computed correctly"
-        - "max_quantization_error < 0.5"
-      tags: [quantization, int8, correctness]
-
-    - name: "FP32 to FP16 quantization"
-      given: "float32 tensor with various values"
-      when: "quantize_to_fp16(tensor)"
-      then:
-        - "FP16 values match FP32 within FP16 precision"
-        - "no overflow or underflow"
-      tags: [quantization, fp16, precision]
-
-test_suite: configuration
-  description: Configuration file loading and validation
-  source: include/config.h
-
-  test_cases:
-    - name: "Load valid configuration"
-      given: "config/default.ini with valid INI content"
-      when: "Config::load(path)"
-      then:
-        - "all sections are parsed"
-        - "key-value pairs are accessible"
-        - "get<T>() returns typed values"
-      tags: [config, loading, parsing]
-
-    - name: "Load missing configuration"
-      given: "nonexistent file path"
-      when: "Config::load(nonexistent_path)"
-      then:
-        - "throws FILE_NOT_FOUND"
-      tags: [config, error-handling]
-
-acceptance_criteria:
-  all_tests:
-    - "pass on CUDA 11.0+ compatible GPUs"
-    - "pass with both Debug and Release builds"
-    - "no memory leaks (verified by CUDA memory checker)"
-    - "execution time within specified benchmarks"
diff --git a/src/tensor_core_gemm.cu b/src/tensor_core_gemm.cu
index bf0a265..42e4d18 100644
--- a/src/tensor_core_gemm.cu
+++ b/src/tensor_core_gemm.cu
@@ -161,12 +161,12 @@ __global__ void tensor_core_gemm_kernel(const half* __restrict__ A,
             // Load A fragment from shared memory
             // A_shared is [K][M], we want row-major 16x16
             // For warp at (warp_m, warp_n), load A at column warp_m * WMMA_M
-            load_matrix_sync(&A_shared[k][warp_m * WMMA_M], a_frag, BLOCK_M);
+            load_matrix_sync(a_frag, &A_shared[k][warp_m * WMMA_M], BLOCK_M);
 
             // Load B fragment from shared memory
             // B_shared is [K][N], we want row-major 16x16
             // For warp at (warp_m, warp_n), load B at column warp_n * WMMA_N
-            load_matrix_sync(&B_shared[k][warp_n * WMMA_N], b_frag, BLOCK_N);
+            load_matrix_sync(b_frag, &B_shared[k][warp_n * WMMA_N], BLOCK_N);
 
             // Tensor Core multiply-accumulate
             mma_sync(acc_frag, a_frag, b_frag, acc_frag);
@@ -248,8 +248,8 @@ __global__ void tensor_core_gemm_fp32_out_kernel(const half* __restrict__ A,
         __syncthreads();
 
         for (int k = 0; k < BLOCK_K; k += WMMA_K) {
-            load_matrix_sync(&A_shared[k][warp_m * WMMA_M], a_frag, BLOCK_M);
-            load_matrix_sync(&B_shared[k][warp_n * WMMA_N], b_frag, BLOCK_N);
+            load_matrix_sync(a_frag, &A_shared[k][warp_m * WMMA_M], BLOCK_M);
+            load_matrix_sync(b_frag, &B_shared[k][warp_n * WMMA_N], BLOCK_N);
             mma_sync(acc_frag, a_frag, b_frag, acc_frag);
         }
 
diff --git a/tests/test_memory_pool.cpp b/tests/test_memory_pool.cpp
index 9ed1c8c..f1528dd 100644
--- a/tests/test_memory_pool.cpp
+++ b/tests/test_memory_pool.cpp
@@ -99,12 +99,12 @@ TEST_F(MemoryPoolTest, ClearCacheRemovesOnlyCachedBlocks) {
     EXPECT_EQ(MemoryPool::instance().cached_block_count(), 0u);
 }
 
-TEST_F(MemoryPoolTest, ClearAllKeepsLiveAllocationOwned) {
+TEST_F(MemoryPoolTest, ClearCacheKeepsLiveAllocationOwned) {
     void* cached = MemoryPool::instance().allocate(4096);
     void* live = MemoryPool::instance().allocate(8192);
     MemoryPool::instance().deallocate(cached);
 
-    MemoryPool::instance().clear_all();
+    MemoryPool::instance().clear_cache();
 
     EXPECT_EQ(MemoryPool::instance().get_stats().cached_size, 0u);
     EXPECT_TRUE(MemoryPool::instance().owns(live));
@@ -112,20 +112,20 @@ TEST_F(MemoryPoolTest, ClearAllKeepsLiveAllocationOwned) {
     MemoryPool::instance().deallocate(live);
 }
 
-TEST_F(MemoryPoolTest, ClearAllKeepsLivePooledMemoryValid) {
+TEST_F(MemoryPoolTest, ClearCacheKeepsLivePooledMemoryValid) {
     PooledMemory mem(1024 * sizeof(float));
     float* ptr = mem.get();
     ASSERT_NE(ptr, nullptr);
-    MemoryPool::instance().clear_all();
+    MemoryPool::instance().clear_cache();
     EXPECT_TRUE(MemoryPool::instance().owns(ptr));
     EXPECT_EQ(MemoryPool::instance().cached_block_count(), 0u);
 }
 
-TEST_F(MemoryPoolTest, PooledMemoryDestructionAfterClearAllDoesNotDoubleFree) {
+TEST_F(MemoryPoolTest, PooledMemoryDestructionAfterClearCacheDoesNotDoubleFree) {
     {
         PooledMemory mem(1024 * sizeof(float));
         ASSERT_NE(mem.get(), nullptr);
-        MemoryPool::instance().clear_all();
+        MemoryPool::instance().clear_cache();
     }
     SUCCEED();
 }
diff --git a/tests/test_tensor.cpp b/tests/test_tensor.cpp
index d6f4177..eef616d 100644
--- a/tests/test_tensor.cpp
+++ b/tests/test_tensor.cpp
@@ -335,7 +335,7 @@ TEST_F(TensorTest, FusedLinearReluRejectsInvalidRanks) {
 }
 
 TEST_F(TensorTest, MemoryPoolBasic) {
-    MemoryPool::instance().clear_all();
+    MemoryPool::instance().clear_cache();
     {
         PooledMemory mem(1024 * sizeof(float));
         EXPECT_NE(mem.get(), nullptr);
@@ -351,7 +351,7 @@ TEST_F(TensorTest, MemoryPoolBasic) {
 }
 
 TEST_F(TensorTest, MemoryPoolMultiple) {
-    MemoryPool::instance().clear_all();
+    MemoryPool::instance().clear_cache();
     std::vector<PooledMemory> mems;
     for (int i = 0; i < 10; i++) {
         mems.emplace_back(1024 * sizeof(float));