keith2018 · keith2018 · Apr 4, 2026 · Apr 3, 2026 · Apr 3, 2026 · Apr 3, 2026
diff --git a/.github/workflows/cmake_linux.yml b/.github/workflows/cmake_linux.yml
@@ -28,8 +28,8 @@ jobs:
       - name: Test
         run: cd ${{github.workspace}}/build && ctest
 
-      # - name: Demo
-      #   run: cd ${{github.workspace}}/demo/bin && ./TinyTorch_demo
+      # - name: Run MNIST Example
+      #   run: cd ${{github.workspace}}/examples/mnist/bin && ./tinytorch_example_mnist
 
   # build_linux_gpu:
   #   name: build_linux_gpu
@@ -58,5 +58,5 @@ jobs:
   #     - name: Test
   #       run: cd ${{github.workspace}}/build && ctest
 
-  #     - name: Demo
-  #       run: cd ${{github.workspace}}/demo/bin && ./TinyTorch_demo
+  #     - name: Run MNIST Example
+  #       run: cd ${{github.workspace}}/examples/mnist/bin && ./tinytorch_example_mnist
diff --git a/.github/workflows/cmake_macos.yml b/.github/workflows/cmake_macos.yml
@@ -28,5 +28,5 @@ jobs:
       - name: Test
         run: cd ${{github.workspace}}/build && ctest
 
-      - name: Demo
-        run: cd ${{github.workspace}}/demo/bin && ./TinyTorch_demo
+      - name: Run MNIST Example
+        run: cd ${{github.workspace}}/examples/mnist/bin && ./tinytorch_example_mnist
diff --git a/.github/workflows/cmake_windows.yml b/.github/workflows/cmake_windows.yml
@@ -28,8 +28,8 @@ jobs:
       - name: Test
         run: cd ${{github.workspace}}/build && ctest
 
-      # - name: Demo
-      #   run: cd ${{github.workspace}}/demo/bin/${{env.BUILD_TYPE}} && ./TinyTorch_demo.exe
+      # - name: Run MNIST Example
+      #   run: cd ${{github.workspace}}/examples/mnist/bin/${{env.BUILD_TYPE}} && ./tinytorch_example_mnist.exe
 
   # build_windows_gpu:
   #   name: build_windows_gpu
@@ -59,5 +59,5 @@ jobs:
   #     - name: Test
   #       run: cd ${{github.workspace}}/build && ctest
 
-  #     - name: Demo
-  #       run: cd ${{github.workspace}}/demo/bin/${{env.BUILD_TYPE}} && ./TinyTorch_demo.exe
+  #     - name: Run MNIST Example
+  #       run: cd ${{github.workspace}}/examples/mnist/bin/${{env.BUILD_TYPE}} && ./tinytorch_example_mnist.exe
diff --git a/.gitignore b/.gitignore
@@ -1,7 +1,7 @@
 .DS_Store
 .idea/
 .vs/
-/demo/bin
+/examples/*/bin
 out
 build
 cmake-build-*/

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -1,7 +1,7 @@
 cmake_minimum_required(VERSION 3.10)
 project(TinyTorch)
 
-option(TINYTORCH_BUILD_DEMO "Whether or not to build demo" ON)
+option(TINYTORCH_BUILD_EXAMPLES "Whether or not to build examples" ON)
 option(TINYTORCH_BUILD_TEST "Whether or not to build the tests" OFF)
 
 option(TINYTORCH_USE_CUDA "Use CUDA" ON)
@@ -15,7 +15,7 @@ if (NOT TINYTORCH_USE_CUDA OR APPLE OR MSVC)
     set(TINYTORCH_USE_NCCL OFF)
 endif ()
 
-message(STATUS "TINYTORCH_BUILD_DEMO ${TINYTORCH_BUILD_DEMO}")
+message(STATUS "TINYTORCH_BUILD_EXAMPLES ${TINYTORCH_BUILD_EXAMPLES}")
 message(STATUS "TINYTORCH_BUILD_TEST ${TINYTORCH_BUILD_TEST}")
 message(STATUS "TINYTORCH_USE_CUDA ${TINYTORCH_USE_CUDA}")
 message(STATUS "TINYTORCH_USE_NCCL ${TINYTORCH_USE_NCCL}")
@@ -30,8 +30,8 @@ endif ()
 
 add_subdirectory(src)
 
-if (TINYTORCH_BUILD_DEMO)
-    add_subdirectory(demo)
+if (TINYTORCH_BUILD_EXAMPLES)
+    add_subdirectory(examples)
 endif ()
 
 if (TINYTORCH_BUILD_TEST)

diff --git a/README.md b/README.md
@@ -1,97 +1,105 @@
 # TinyTorch
 
-**TinyTorch** is a lightweight deep learning training framework implemented from scratch in C++.
+A lightweight deep learning training framework implemented from scratch in C++, featuring a PyTorch-style API.
 
-For more details, please refer to my blog post: [Write a nn training framework from scratch](https://robot9.me/write-nn-framework-from-scratch-tinytorch/)
+For more details, please refer to the blog post: [Write a nn training framework from scratch](https://robot9.me/write-nn-framework-from-scratch-tinytorch/)
 
 [![CMake Linux](https://github.com/keith2018/TinyTorch/actions/workflows/cmake_linux.yml/badge.svg)](https://github.com/keith2018/TinyTorch/actions/workflows/cmake_linux.yml)
 [![CMake MacOS](https://github.com/keith2018/TinyTorch/actions/workflows/cmake_macos.yml/badge.svg)](https://github.com/keith2018/TinyTorch/actions/workflows/cmake_macos.yml)
 [![CMake Windows](https://github.com/keith2018/TinyTorch/actions/workflows/cmake_windows.yml/badge.svg)](https://github.com/keith2018/TinyTorch/actions/workflows/cmake_windows.yml)
 
 ## Key Features
 
-* **PyTorch-Style API**: Similar naming conventions as PyTorch (`Tensor`, `Functions`, `nn.Module`, `Optimizer`).
-* **Pure C++ Implementation**: No dependency on external deep learning libraries.
-* **CPU & CUDA Support**: Runs on both CPU and CUDA-enabled GPUs.
-* **Mixed Precision**: Supports FP16, FP32, BF16.
-* **Distributed**: Multi-machine, multi-GPU training & inference.
-* **LLM Inference**: Supports inference for llama/qwen/mistral models: [https://github.com/keith2018/TinyGPT](https://github.com/keith2018/TinyGPT)
+- **PyTorch-style API** &mdash; Familiar naming conventions (`Tensor`, `nn.Module`, `Optimizer`, `DataLoader`).
+- **Pure C++ implementation** &mdash; No dependency on external deep learning libraries; only a C++17 compiler is required.
+- **CPU & CUDA** &mdash; Runs on both CPU (with BLAS acceleration) and CUDA-enabled GPUs.
+- **Mixed precision** &mdash; Supports FP16, FP32 and BF16.
+- **Distributed training** &mdash; Multi-machine, multi-GPU training & inference via NCCL.
+- **LLM inference** &mdash; Supports inference for LLaMA / Qwen / Mistral models: [TinyGPT](https://github.com/keith2018/TinyGPT).
 
-## Implemented Operators and Components
+## Architecture
 
-### Activation Functions
-* `relu`, `gelu`, `silu`
-* `softmax`, `logSoftmax`
+TinyTorch implements automatic differentiation by building a dynamic computation graph. Each operation on a `Tensor` creates a `Function` node that records both the forward computation and the backward gradient rule. These nodes are linked via `nextFunctions`, forming a DAG. Calling `backward()` traverses this graph in reverse topological order, propagating gradients via the chain rule.
 
-### Mathematical Operations
-* `add`, `sub`, `mul`, `div`, `matmul`
-* `sin`, `cos`, `sqrt`, `pow`
-* `maximum`, `minimum`
+<img src="doc/AD.png" width="600">
 
-### Comparison and Logical Operations
-* `lt`, `le`, `gt`, `ge`, `eq`, `ne`
-* `logicNot`, `logicAnd`, `logicOr`
+## Project Structure
 
-### Statistical and Reduction Operations
-* `min`, `argmin`, `max`, `argmax`
-* `sum`, `mean`, `var`
+```
+TinyTorch/
+├── src/            # Core library (Tensor, Function, nn.Module, Optimizer, ...)
+├── examples/       # Standalone example programs
+│   ├── autograd/   # Automatic differentiation basics
+│   ├── module/     # Building models with nn.Module
+│   ├── optimizer/  # Using built-in optimizers
+│   ├── mnist/      # Full MNIST training pipeline
+│   ├── nccl/       # NCCL collective communication
+│   └── ddp/        # Distributed data-parallel training
+├── test/           # Unit tests
+└── third_party/    # Third-party dependencies
+```
 
-### Tensor Shape and Indexing Operations
-* `reshape`, `view`, `permute`, `transpose`
-* `flatten`, `unflatten`, `squeeze`, `unsqueeze`
-* `split`, `concat`, `stack`, `hstack`, `vstack`, `narrow`
-* `topk`, `sort`, `cumsum`
-* `gather`, `scatter`
+## Getting Started
 
-### Neural Network Layers and Loss Functions
-* `linear`
-* `dropout`
-* `maxPool2d`
-* `conv2d`
-* `embedding`
-* `layerNorm`
-* `rmsNorm`
-* `sdpAttention`
-* `mseLoss`
-* `nllLoss`
+### Prerequisites
 
-### Optimizers
-* `SGD`, `Adagrad`, `RMSprop`, `AdaDelta`, `Adam`, `AdamW`
+- CMake 3.10+
+- C++17 compatible compiler
+- CUDA Toolkit 11.0+ *(optional, for GPU support)*
+- NCCL *(optional, for distributed training)*
 
-### Other
-* `Dataset`, `DataLoader`, `data.Transform`
+### Build
 
-## Automatic differentiation
+```bash
+mkdir build
+cmake -B ./build -DCMAKE_BUILD_TYPE=Release
+cmake --build ./build --config Release
+```
 
-TinyTorch's automatic differentiation (AD) is implemented by building a computation graph. Each operation on a `Tensor` is represented by a `Function` object, which is responsible for both the forward and backward passes. The `Function` nodes are connected via a `nextFunctions` field, creating the dependency graph. During the `backward()` call, the framework traverses this graph in reverse order, computing and propagating gradients using the chain rule.
+#### CMake Options
 
-<img src=doc/AD.png width="400">
+| Option | Default | Description |
+|--------|---------|-------------|
+| `TINYTORCH_BUILD_EXAMPLES` | `ON` | Build example programs |
+| `TINYTORCH_BUILD_TEST` | `OFF` | Build unit tests |
+| `TINYTORCH_USE_CUDA` | `ON` | Enable CUDA support |
+| `TINYTORCH_USE_NCCL` | `ON` | Enable NCCL support (forced `OFF` when CUDA is disabled, and on Apple and MSVC builds) |
 
-## Getting Started
+### Run Examples
 
-### Prerequisites
-* CMake
-* C++17 or a more recent compiler
-* CUDA Toolkit 11.0+ (optional)
+Each example is an independent executable:
 
-### Build
 ```bash
-mkdir build
-cmake -B ./build -DCMAKE_BUILD_TYPE=Release
-cmake --build ./build --config Release
+# Autograd basics (each command runs in a subshell, so start from the repository root)
+(cd examples/autograd/bin && ./tinytorch_example_autograd)
+
+# nn.Module usage
+(cd examples/module/bin && ./tinytorch_example_module)
+
+# Optimizer usage
+(cd examples/optimizer/bin && ./tinytorch_example_optimizer)
+
+# MNIST training
+(cd examples/mnist/bin && ./tinytorch_example_mnist)
 ```
 
-### Run `MNIST` Demo
+The distributed examples require NCCL and multiple GPUs:
+
 ```bash
-cd demo/bin
-./TinyTorch_demo
+# NCCL all-reduce (each command runs in a subshell, so start from the repository root)
+(cd examples/nccl/bin && ./tinytorch_example_nccl <local_rank> <rank> <world_size>)
+
+# Distributed data-parallel training
+(cd examples/ddp/bin && ./tinytorch_example_ddp <local_rank> <rank> <world_size>)
 ```
 
 ### Run Tests
+
 ```bash
 cd build
 ctest
 ```
 
 ## License
+
 This code is licensed under the MIT License (see [LICENSE](LICENSE)).
diff --git a/demo/demo.h b/demo/demo.h
diff --git a/demo/main.cpp b/demo/main.cpp
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
@@ -0,0 +1,9 @@
+# These example directories are added unconditionally, in this order.
+foreach (tinytorch_example_dir autograd module optimizer mnist)
+    add_subdirectory(${tinytorch_example_dir})
+endforeach ()
+# The nccl and ddp examples are added only when TINYTORCH_USE_NCCL is enabled.
+if (TINYTORCH_USE_NCCL)
+    add_subdirectory(nccl)
+    add_subdirectory(ddp)
+endif ()
diff --git a/examples/autograd/CMakeLists.txt b/examples/autograd/CMakeLists.txt
@@ -0,0 +1,16 @@
+# Autograd example: one executable built from main.cpp and linked against TinyTorch_lib.
+cmake_minimum_required(VERSION 3.10)
+project(tinytorch_example_autograd)
+
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+add_executable(${PROJECT_NAME} main.cpp)
+target_include_directories(${PROJECT_NAME} PRIVATE
+        ${CMAKE_CURRENT_SOURCE_DIR}/../../src
+        ${CMAKE_CURRENT_SOURCE_DIR}/../../third_party
+)
+target_link_libraries(${PROJECT_NAME} PRIVATE TinyTorch_lib)
+# Emit the binary into ./bin next to this file (multi-config generators append the configuration name).
+set_target_properties(${PROJECT_NAME} PROPERTIES
+        RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/bin)
diff --git a/demo/demo_autograd.cpp → examples/autograd/main.cpp b/demo/demo_autograd.cpp → examples/autograd/main.cpp
@@ -11,8 +11,8 @@
 using namespace tinytorch;
 
 // https://pytorch.org/tutorials/beginner/pytorch_with_examples.html#pytorch-tensors-and-autograd
-void demo_autograd() {
-  LOGD("demo_autograd ...");
+int main() {
+  LOGD("autograd example ...");
   Timer timer;
   timer.start();
 
@@ -53,4 +53,6 @@ void demo_autograd() {
 
   timer.mark();
   LOGD("Time cost: %lld ms", timer.elapseMillis());
+
+  return 0;
 }
diff --git a/examples/ddp/CMakeLists.txt b/examples/ddp/CMakeLists.txt
@@ -0,0 +1,26 @@
+# DDP example: one executable built from main.cpp, plus a copy of ../mnist/data next to the binary.
+cmake_minimum_required(VERSION 3.10)
+project(tinytorch_example_ddp)
+
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+add_executable(${PROJECT_NAME} main.cpp)
+# Define DEBUG per configuration: CMAKE_BUILD_TYPE is empty under multi-config
+# generators, and a target-scoped definition cannot leak to other targets.
+target_compile_definitions(${PROJECT_NAME} PRIVATE $<$<CONFIG:Debug>:DEBUG>)
+target_include_directories(${PROJECT_NAME} PRIVATE
+        ${CMAKE_CURRENT_SOURCE_DIR}/../../src
+        ${CMAKE_CURRENT_SOURCE_DIR}/../../third_party
+)
+target_link_libraries(${PROJECT_NAME} PRIVATE TinyTorch_lib)
+# Emit the binary into ./bin next to this file (multi-config generators append the configuration name).
+set_target_properties(${PROJECT_NAME} PROPERTIES
+        RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/bin)
+
+# Copy assets: drop any existing data directory beside the binary, then copy ../mnist/data there.
+add_custom_command(TARGET ${PROJECT_NAME} POST_BUILD
+        COMMAND ${CMAKE_COMMAND} -E remove_directory $<TARGET_FILE_DIR:${PROJECT_NAME}>/data
+        COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_CURRENT_SOURCE_DIR}/../mnist/data $<TARGET_FILE_DIR:${PROJECT_NAME}>/data
+        VERBATIM
+)