Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions .github/workflows/cmake_linux.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@ jobs:
- name: Test
run: cd ${{github.workspace}}/build && ctest

# - name: Demo
# run: cd ${{github.workspace}}/demo/bin && ./TinyTorch_demo
# - name: Run MNIST Example
# run: cd ${{github.workspace}}/examples/mnist/bin && ./tinytorch_example_mnist

# build_linux_gpu:
# name: build_linux_gpu
Expand Down Expand Up @@ -58,5 +58,5 @@ jobs:
# - name: Test
# run: cd ${{github.workspace}}/build && ctest

# - name: Demo
# run: cd ${{github.workspace}}/demo/bin && ./TinyTorch_demo
# - name: Run MNIST Example
# run: cd ${{github.workspace}}/examples/mnist/bin && ./tinytorch_example_mnist
4 changes: 2 additions & 2 deletions .github/workflows/cmake_macos.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,5 +28,5 @@ jobs:
- name: Test
run: cd ${{github.workspace}}/build && ctest

- name: Demo
run: cd ${{github.workspace}}/demo/bin && ./TinyTorch_demo
- name: Run MNIST Example
run: cd ${{github.workspace}}/examples/mnist/bin && ./tinytorch_example_mnist
8 changes: 4 additions & 4 deletions .github/workflows/cmake_windows.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@ jobs:
- name: Test
run: cd ${{github.workspace}}/build && ctest

# - name: Demo
# run: cd ${{github.workspace}}/demo/bin/${{env.BUILD_TYPE}} && ./TinyTorch_demo.exe
# - name: Run MNIST Example
# run: cd ${{github.workspace}}/examples/mnist/bin/${{env.BUILD_TYPE}} && ./tinytorch_example_mnist.exe

# build_windows_gpu:
# name: build_windows_gpu
Expand Down Expand Up @@ -59,5 +59,5 @@ jobs:
# - name: Test
# run: cd ${{github.workspace}}/build && ctest

# - name: Demo
# run: cd ${{github.workspace}}/demo/bin/${{env.BUILD_TYPE}} && ./TinyTorch_demo.exe
# - name: Run MNIST Example
# run: cd ${{github.workspace}}/examples/mnist/bin/${{env.BUILD_TYPE}} && ./tinytorch_example_mnist.exe
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
.DS_Store
.idea/
.vs/
/demo/bin
/examples/*/bin
out
build
cmake-build-*/
Expand Down
8 changes: 4 additions & 4 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
cmake_minimum_required(VERSION 3.10)
project(TinyTorch)

option(TINYTORCH_BUILD_DEMO "Whether or not to build demo" ON)
option(TINYTORCH_BUILD_EXAMPLES "Whether or not to build examples" ON)
option(TINYTORCH_BUILD_TEST "Whether or not to build the tests" OFF)

option(TINYTORCH_USE_CUDA "Use CUDA" ON)
Expand All @@ -15,7 +15,7 @@ if (NOT TINYTORCH_USE_CUDA OR APPLE OR MSVC)
set(TINYTORCH_USE_NCCL OFF)
endif ()

message(STATUS "TINYTORCH_BUILD_DEMO ${TINYTORCH_BUILD_DEMO}")
message(STATUS "TINYTORCH_BUILD_EXAMPLES ${TINYTORCH_BUILD_EXAMPLES}")
message(STATUS "TINYTORCH_BUILD_TEST ${TINYTORCH_BUILD_TEST}")
message(STATUS "TINYTORCH_USE_CUDA ${TINYTORCH_USE_CUDA}")
message(STATUS "TINYTORCH_USE_NCCL ${TINYTORCH_USE_NCCL}")
Expand All @@ -30,8 +30,8 @@ endif ()

add_subdirectory(src)

if (TINYTORCH_BUILD_DEMO)
add_subdirectory(demo)
if (TINYTORCH_BUILD_EXAMPLES)
add_subdirectory(examples)
endif ()

if (TINYTORCH_BUILD_TEST)
Expand Down
124 changes: 66 additions & 58 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,97 +1,105 @@
# TinyTorch

**TinyTorch** is a lightweight deep learning training framework implemented from scratch in C++.
A lightweight deep learning training framework implemented from scratch in C++, featuring a PyTorch-style API.

For more details, please refer to my blog post: [Write a nn training framework from scratch](https://robot9.me/write-nn-framework-from-scratch-tinytorch/)
For more details, please refer to the blog post: [Write a nn training framework from scratch](https://robot9.me/write-nn-framework-from-scratch-tinytorch/)

[![CMake Linux](https://github.com/keith2018/TinyTorch/actions/workflows/cmake_linux.yml/badge.svg)](https://github.com/keith2018/TinyTorch/actions/workflows/cmake_linux.yml)
[![CMake MacOS](https://github.com/keith2018/TinyTorch/actions/workflows/cmake_macos.yml/badge.svg)](https://github.com/keith2018/TinyTorch/actions/workflows/cmake_macos.yml)
[![CMake Windows](https://github.com/keith2018/TinyTorch/actions/workflows/cmake_windows.yml/badge.svg)](https://github.com/keith2018/TinyTorch/actions/workflows/cmake_windows.yml)

## Key Features

* **PyTorch-Style API**: Similar naming conventions as PyTorch (`Tensor`, `Functions`, `nn.Module`, `Optimizer`).
* **Pure C++ Implementation**: No dependency on external deep learning libraries.
* **CPU & CUDA Support**: Runs on both CPU and CUDA-enabled GPUs.
* **Mixed Precision**: Supports FP16, FP32, BF16.
* **Distributed**: Multi-machine, multi-GPU training & inference.
* **LLM Inference**: Supports inference for llama/qwen/mistral models: [https://github.com/keith2018/TinyGPT](https://github.com/keith2018/TinyGPT)
- **PyTorch-style API** — Familiar naming conventions (`Tensor`, `nn.Module`, `Optimizer`, `DataLoader`).
- **Pure C++ implementation** — No dependency on external deep learning libraries, C++17 only.
- **CPU & CUDA** — Runs on both CPU (with BLAS acceleration) and CUDA-enabled GPUs.
- **Mixed precision** — Supports FP16, FP32 and BF16.
- **Distributed training** — Multi-machine, multi-GPU training & inference via NCCL.
- **LLM inference** — Supports inference for LLaMA / Qwen / Mistral models: [TinyGPT](https://github.com/keith2018/TinyGPT).

## Implemented Operators and Components
## Architecture

### Activation Functions
* `relu`, `gelu`, `silu`
* `softmax`, `logSoftmax`
TinyTorch implements automatic differentiation by building a dynamic computation graph. Each operation on a `Tensor` creates a `Function` node that records both the forward computation and the backward gradient rule. These nodes are linked via `nextFunctions`, forming a DAG. Calling `backward()` traverses this graph in reverse topological order, propagating gradients via the chain rule.

### Mathematical Operations
* `add`, `sub`, `mul`, `div`, `matmul`
* `sin`, `cos`, `sqrt`, `pow`
* `maximum`, `minimum`
<img src="doc/AD.png" width="600">

### Comparison and Logical Operations
* `lt`, `le`, `gt`, `ge`, `eq`, `ne`
* `logicNot`, `logicAnd`, `logicOr`
## Project Structure

### Statistical and Reduction Operations
* `min`, `argmin`, `max`, `argmax`
* `sum`, `mean`, `var`
```
TinyTorch/
├── src/ # Core library (Tensor, Function, nn.Module, Optimizer, ...)
├── examples/ # Standalone example programs
│ ├── autograd/ # Automatic differentiation basics
│ ├── module/ # Building models with nn.Module
│ ├── optimizer/ # Using built-in optimizers
│ ├── mnist/ # Full MNIST training pipeline
│ ├── nccl/ # NCCL collective communication
│ └── ddp/ # Distributed data-parallel training
├── test/ # Unit tests
└── third_party/ # Third-party dependencies
```

### Tensor Shape and Indexing Operations
* `reshape`, `view`, `permute`, `transpose`
* `flatten`, `unflatten`, `squeeze`, `unsqueeze`
* `split`, `concat`, `stack`, `hstack`, `vstack`, `narrow`
* `topk`, `sort`, `cumsum`
* `gather`, `scatter`
## Getting Started

### Neural Network Layers and Loss Functions
* `linear`
* `dropout`
* `maxPool2d`
* `conv2d`
* `embedding`
* `layerNorm`
* `rmsNorm`
* `sdpAttention`
* `mseLoss`
* `nllLoss`
### Prerequisites

### Optimizers
* `SGD`, `Adagrad`, `RMSprop`, `AdaDelta`, `Adam`, `AdamW`
- CMake 3.10+
- C++17 compatible compiler
- CUDA Toolkit 11.0+ *(optional, for GPU support)*
- NCCL *(optional, for distributed training)*

### Other
* `Dataset`, `DataLoader`, `data.Transform`
### Build

## Automatic differentiation
```bash
mkdir build
cmake -B ./build -DCMAKE_BUILD_TYPE=Release
cmake --build ./build --config Release
```

TinyTorch's automatic differentiation (AD) is implemented by building a computation graph. Each operation on a `Tensor` is represented by a `Function` object, which is responsible for both the forward and backward passes. The `Function` nodes are connected via a `nextFunctions` field, creating the dependency graph. During the `backward()` call, the framework traverses this graph in reverse order, computing and propagating gradients using the chain rule.
#### CMake Options

<img src=doc/AD.png width="400">
| Option | Default | Description |
|--------|---------|-------------|
| `TINYTORCH_BUILD_EXAMPLES` | `ON` | Build example programs |
| `TINYTORCH_BUILD_TEST` | `OFF` | Build unit tests |
| `TINYTORCH_USE_CUDA` | `ON` | Enable CUDA support |
| `TINYTORCH_USE_NCCL` | `ON` | Enable NCCL support |

## Getting Started
### Run Examples

### Prerequisites
* CMake
* C++17 or a more recent compiler
* CUDA Toolkit 11.0+ (optional)
Each example is an independent executable:

### Build
```bash
mkdir build
cmake -B ./build -DCMAKE_BUILD_TYPE=Release
cmake --build ./build --config Release
# Autograd basics
cd examples/autograd/bin && ./tinytorch_example_autograd

# nn.Module usage
cd examples/module/bin && ./tinytorch_example_module

# Optimizer usage
cd examples/optimizer/bin && ./tinytorch_example_optimizer

# MNIST training
cd examples/mnist/bin && ./tinytorch_example_mnist
```

### Run `MNIST` Demo
The distributed examples require NCCL and multiple GPUs:

```bash
cd demo/bin
./TinyTorch_demo
# NCCL all-reduce
cd examples/nccl/bin && ./tinytorch_example_nccl <local_rank> <rank> <world_size>

# Distributed data-parallel training
cd examples/ddp/bin && ./tinytorch_example_ddp <local_rank> <rank> <world_size>
```

### Run Tests

```bash
cd build
ctest
```

## License

This code is licensed under the MIT License (see [LICENSE](LICENSE)).
17 changes: 0 additions & 17 deletions demo/demo.h

This file was deleted.

21 changes: 0 additions & 21 deletions demo/main.cpp

This file was deleted.

9 changes: 9 additions & 0 deletions examples/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Examples that are configured unconditionally.
foreach (tinytorch_example IN ITEMS autograd module optimizer mnist)
    add_subdirectory(${tinytorch_example})
endforeach ()

# The nccl and ddp examples are configured only when TINYTORCH_USE_NCCL is enabled.
if (TINYTORCH_USE_NCCL)
    foreach (tinytorch_example IN ITEMS nccl ddp)
        add_subdirectory(${tinytorch_example})
    endforeach ()
endif ()
16 changes: 16 additions & 0 deletions examples/autograd/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Build script for the autograd example: a single executable built from main.cpp
# and linked against the TinyTorch_lib target.
cmake_minimum_required(VERSION 3.10)
project(tinytorch_example_autograd)

add_executable(${PROJECT_NAME} main.cpp)

# Target-scoped settings:
#  - require C++17 for this executable;
#  - emit the binary into ./bin next to the sources. RUNTIME_OUTPUT_DIRECTORY
#    supersedes the legacy EXECUTABLE_OUTPUT_PATH variable; multi-config
#    generators still append a per-configuration subdirectory (e.g. bin/Release).
set_target_properties(${PROJECT_NAME} PROPERTIES
    CXX_STANDARD 17
    CXX_STANDARD_REQUIRED ON
    RUNTIME_OUTPUT_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/bin"
)

# Headers are taken from the library sources and the bundled third-party tree.
target_include_directories(${PROJECT_NAME} PRIVATE
    "${CMAKE_CURRENT_SOURCE_DIR}/../../src"
    "${CMAKE_CURRENT_SOURCE_DIR}/../../third_party"
)

# Use the keyword signature; the keyword-less form has legacy link-interface semantics.
target_link_libraries(${PROJECT_NAME} PRIVATE TinyTorch_lib)
6 changes: 4 additions & 2 deletions demo/demo_autograd.cpp → examples/autograd/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@
using namespace tinytorch;

// https://pytorch.org/tutorials/beginner/pytorch_with_examples.html#pytorch-tensors-and-autograd
void demo_autograd() {
LOGD("demo_autograd ...");
int main() {
LOGD("autograd example ...");
Timer timer;
timer.start();

Expand Down Expand Up @@ -53,4 +53,6 @@ void demo_autograd() {

timer.mark();
LOGD("Time cost: %lld ms", timer.elapseMillis());

return 0;
}
26 changes: 26 additions & 0 deletions examples/ddp/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Build script for the ddp example: a single executable built from main.cpp
# and linked against the TinyTorch_lib target, with the MNIST data directory
# copied next to the binary after each build.
cmake_minimum_required(VERSION 3.10)
project(tinytorch_example_ddp)

add_executable(${PROJECT_NAME} main.cpp)

# Define DEBUG for Debug configurations only. A generator expression is used
# instead of testing CMAKE_BUILD_TYPE, which is empty under multi-config
# generators, and the definition is scoped to this target rather than the directory.
target_compile_definitions(${PROJECT_NAME} PRIVATE
    $<$<CONFIG:Debug>:DEBUG>
)

# Target-scoped settings:
#  - require C++17 for this executable;
#  - emit the binary into ./bin next to the sources. RUNTIME_OUTPUT_DIRECTORY
#    supersedes the legacy EXECUTABLE_OUTPUT_PATH variable; multi-config
#    generators still append a per-configuration subdirectory (e.g. bin/Release).
set_target_properties(${PROJECT_NAME} PROPERTIES
    CXX_STANDARD 17
    CXX_STANDARD_REQUIRED ON
    RUNTIME_OUTPUT_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/bin"
)

# Headers are taken from the library sources and the bundled third-party tree.
target_include_directories(${PROJECT_NAME} PRIVATE
    "${CMAKE_CURRENT_SOURCE_DIR}/../../src"
    "${CMAKE_CURRENT_SOURCE_DIR}/../../third_party"
)

# Use the keyword signature; the keyword-less form has legacy link-interface semantics.
target_link_libraries(${PROJECT_NAME} PRIVATE TinyTorch_lib)

# Copy assets: after every build, replace the "data" directory beside the
# executable with a fresh copy of ../mnist/data. The old copy is removed first
# so files deleted from the source directory do not linger.
add_custom_command(TARGET ${PROJECT_NAME} POST_BUILD
    COMMAND ${CMAKE_COMMAND} -E remove_directory "$<TARGET_FILE_DIR:${PROJECT_NAME}>/data"
    COMMAND ${CMAKE_COMMAND} -E copy_directory "${CMAKE_CURRENT_SOURCE_DIR}/../mnist/data" "$<TARGET_FILE_DIR:${PROJECT_NAME}>/data"
    VERBATIM
)
Loading
Loading