Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ Please note that this repository is an evolving work in progress, reflecting the
|-----------------------------|--------------------------------|---------------------|-----------------|--------------|
| Dropout | `dropout-sched-exp` | | [View preregistration](https://github.com/LeonGuertler/SuperTinyLanguageModels/blob/main/pre_reports/dropout_prereport.pdf) | In progress |
| Knowledge distillation | `feature/knowledge-distillation/replace-teacher-tokenizer`| | [View preregistration](https://github.com/LeonGuertler/SuperTinyLanguageModels/blob/feature/knowledge-distillation/replace-teacher-tokenizer/reports/preregistration-knowledgedistillation.pdf) | In progress |
| Weight Tying | `ffn-sharing` | | [View preregistration](pre_reports/weight_tying_prereport.pdf) | In progress |
| Byte Level | | | In progress | In progress |
| Self Play Evals | | [Join the room](https://discord.gg/hgVhe6Hp) | In progress | In progress |
| Optimizers | `optimizers` | [Join the room](https://discord.gg/S5Q2ZmWY) | In progress | In progress |
Expand Down
4 changes: 2 additions & 2 deletions configs/full_configs/baseline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -75,11 +75,11 @@ trainer:
name: cross_entropy
general:
logging:
wandb_log: false
wandb_log: true
wandb_project: SuperTinyLanguageModels
paths:
output_dir: outputs
data_dir: data
data_dir: "data"
checkpoint_dir: checkpoints
seed: 489
device: cuda
87 changes: 87 additions & 0 deletions configs/full_configs/good_boi.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
# good_boi: full run config for a `weight_sharing` core model
# (12 layers, k_interior_layers 2, lora_rank 64) with trainer dataset
# `openwebtext`, 150000 iterations.
# NOTE(review): the nesting below was rebuilt from key order because the
# indentation had been flattened (which left repeated keys such as
# `normalization`, `bias` and `name` at a single level). Confirm the layout
# against the schema the config loader expects.
model:
  k_interior_layers: 2
  lora_rank: 64
  core_model:
    core_model_type: weight_sharing
    num_layers: 12
    ffn:
      ffn_type: swiglu
      ffn_dim: 1440
      normalization: rms_norm
      bias: false
    attn:
      attn_type: generic
      num_heads: 16
      normalization: rms_norm
      group_size: 4
      bias: false
      is_causal: true
  embedder:
    tokenizer_type: gpt2
    embedding_model_type: generic
    # NOTE(review): differs from trainer.dataset (openwebtext) - confirm
    # that this is intentional.
    dataset_name: simple_en_wiki
  lm_head:
    normalization: rms_norm
    bias: false
    lm_head_type: generic
  hidden_dim: 512
  context_window: 512
  vocab_size: 50257
  model_shell_type: standard
  embedding_weight_tying: true
  positional_encoding_type: rope

trainer:
  dropout_scheduler:
    dropout_type: constant
    dropout: 0
  dataset: openwebtext
  training:
    trainer_type: base_trainer
    batch_size: 24
    gradient_accumulation_steps: 20
    max_iters: 150000
    lr_decay_iters: 150000  # same value as max_iters
    warmup_iters: 5000
    eval_interval: 2000
    log_interval: 10
    eval_iters: 500
    checkpoint_interval: 10000
    run_profiler: false
  # Two evaluator entries: "mcq" over the listed benchmarks, then "prog".
  eval:
    - benchmarks:
        - "winograd"
        - "hellaswag"
        - "arc"
        - "mmlu"
        - "blimp"
      num_samples: 1000
      evaluator: "mcq"
    - evaluator: "prog"
  optimizer:
    name: nanoGPTadamW
    lr: 0.0006
    min_lr: 6.0e-05
    weight_decay: 0.1
    beta1: 0.9
    beta2: 0.95
    grad_clip: 1.0
    decay_lr: true
    # Also set under trainer.training; presumably both must agree -
    # confirm which one the trainer actually reads.
    warmup_iters: 5000
  lr_scheduler:
    name: cosine
  dataloader:
    name: standard
  datasampling:
    name: standard
  loss_fn:
    name: cross_entropy

general:
  logging:
    wandb_log: true
    wandb_project: SuperTinyLanguageModels
  paths:
    output_dir: outputs
    data_dir: "data"
    checkpoint_dir: checkpoints
  seed: 489
  device: cuda
85 changes: 85 additions & 0 deletions configs/full_configs/shared_baseline.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
# shared_baseline: full run config for a `generic` core model
# (8 layers, hidden_dim 528) with trainer dataset `openwebtext`,
# 30000 iterations.
# NOTE(review): the nesting below was rebuilt from key order because the
# indentation had been flattened (which left repeated keys such as
# `normalization`, `bias` and `name` at a single level). Confirm the layout
# against the schema the config loader expects.
model:
  core_model:
    core_model_type: generic
    num_layers: 8
    ffn:
      ffn_type: swiglu
      ffn_dim: 1361
      normalization: rms_norm
      bias: false
    attn:
      attn_type: generic
      num_heads: 16
      normalization: rms_norm
      group_size: 4
      bias: false
      is_causal: true
  embedder:
    tokenizer_type: gpt2
    embedding_model_type: generic
    dataset_name: stlm
  lm_head:
    normalization: rms_norm
    bias: false
    lm_head_type: generic
  # NOTE(review): 528 / 16 heads = 33 per head, an odd size. Rotary
  # position encodings commonly require an even per-head size - confirm
  # that this works with `positional_encoding_type: rope`.
  hidden_dim: 528
  context_window: 512
  vocab_size: 50257
  model_shell_type: standard
  embedding_weight_tying: true
  positional_encoding_type: rope

trainer:
  dropout_scheduler:
    dropout_type: constant
    dropout: 0
  dataset: openwebtext
  training:
    trainer_type: base_trainer
    batch_size: 24
    gradient_accumulation_steps: 20
    max_iters: 30000
    lr_decay_iters: 30000  # same value as max_iters
    warmup_iters: 5000
    eval_interval: 2000
    log_interval: 10
    eval_iters: 500
    # Integer step count (was the float 1000000000.0). The value exceeds
    # max_iters, so presumably no periodic checkpoint is written - confirm.
    checkpoint_interval: 1000000000
    run_profiler: false
  # Two evaluator entries: "mcq" over the listed benchmarks, then "prog".
  eval:
    - benchmarks:
        - "winograd"
        - "hellaswag"
        - "arc"
        - "mmlu"
        - "blimp"
      num_samples: 1000
      evaluator: "mcq"
    - evaluator: "prog"
  optimizer:
    name: nanoGPTadamW
    lr: 0.0006
    min_lr: 6.0e-05
    weight_decay: 0.1
    beta1: 0.9
    beta2: 0.95
    grad_clip: 1.0
    decay_lr: true
    # Also set under trainer.training; presumably both must agree -
    # confirm which one the trainer actually reads.
    warmup_iters: 5000
  lr_scheduler:
    name: cosine
  dataloader:
    name: standard
  datasampling:
    name: standard
  loss_fn:
    name: cross_entropy

general:
  logging:
    wandb_log: true
    wandb_project: SuperTinyLanguageModels
  paths:
    output_dir: outputs
    data_dir: "data"
    checkpoint_dir: checkpoints
  seed: 489
  device: cuda
87 changes: 87 additions & 0 deletions configs/full_configs/shared_deep.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
# shared_deep: full run config for a `weight_sharing` core model
# (14 layers, k_interior_layers 0, no lora_rank, hidden_dim 608) with
# trainer dataset `openwebtext`, 30000 iterations.
# NOTE(review): the nesting below was rebuilt from key order because the
# indentation had been flattened (which left repeated keys such as
# `normalization`, `bias` and `name` at a single level). Confirm the layout
# against the schema the config loader expects.
model:
  k_interior_layers: 0
  lora_rank: null
  core_model:
    core_model_type: weight_sharing
    num_layers: 14
    ffn:
      ffn_type: swiglu
      ffn_dim: 1568
      normalization: rms_norm
      bias: false
    attn:
      attn_type: generic
      num_heads: 16
      normalization: rms_norm
      group_size: 4
      bias: false
      is_causal: true
  embedder:
    tokenizer_type: gpt2
    embedding_model_type: generic
    dataset_name: stlm
  lm_head:
    normalization: rms_norm
    bias: false
    lm_head_type: generic
  hidden_dim: 608
  context_window: 512
  vocab_size: 50257
  model_shell_type: standard
  embedding_weight_tying: true
  positional_encoding_type: rope

trainer:
  dropout_scheduler:
    dropout_type: constant
    dropout: 0
  dataset: openwebtext
  training:
    trainer_type: base_trainer
    batch_size: 24
    gradient_accumulation_steps: 20
    max_iters: 30000
    lr_decay_iters: 30000  # same value as max_iters
    warmup_iters: 5000
    eval_interval: 2000
    log_interval: 10
    eval_iters: 500
    # Integer step count (was the float 1000000000.0). The value exceeds
    # max_iters, so presumably no periodic checkpoint is written - confirm.
    checkpoint_interval: 1000000000
    run_profiler: false
  # Two evaluator entries: "mcq" over the listed benchmarks, then "prog".
  eval:
    - benchmarks:
        - "winograd"
        - "hellaswag"
        - "arc"
        - "mmlu"
        - "blimp"
      num_samples: 1000
      evaluator: "mcq"
    - evaluator: "prog"
  optimizer:
    name: nanoGPTadamW
    lr: 0.0006
    min_lr: 6.0e-05
    weight_decay: 0.1
    beta1: 0.9
    beta2: 0.95
    grad_clip: 1.0
    decay_lr: true
    # Also set under trainer.training; presumably both must agree -
    # confirm which one the trainer actually reads.
    warmup_iters: 5000
  lr_scheduler:
    name: cosine
  dataloader:
    name: standard
  datasampling:
    name: standard
  loss_fn:
    name: cross_entropy

general:
  logging:
    wandb_log: true
    wandb_project: SuperTinyLanguageModels
  paths:
    output_dir: outputs
    data_dir: "data"
    checkpoint_dir: checkpoints
  seed: 489
  device: cuda
87 changes: 87 additions & 0 deletions configs/full_configs/shared_k_1_wide.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
# shared_k_1_wide: full run config for a `weight_sharing` core model
# (8 layers, k_interior_layers 1, no lora_rank, hidden_dim 608) with
# trainer dataset `openwebtext`, 30000 iterations.
# NOTE(review): the nesting below was rebuilt from key order because the
# indentation had been flattened (which left repeated keys such as
# `normalization`, `bias` and `name` at a single level). Confirm the layout
# against the schema the config loader expects.
model:
  k_interior_layers: 1
  lora_rank: null
  core_model:
    core_model_type: weight_sharing
    num_layers: 8
    ffn:
      ffn_type: swiglu
      ffn_dim: 1568
      normalization: rms_norm
      bias: false
    attn:
      attn_type: generic
      num_heads: 16
      normalization: rms_norm
      group_size: 4
      bias: false
      is_causal: true
  embedder:
    tokenizer_type: gpt2
    embedding_model_type: generic
    dataset_name: stlm
  lm_head:
    normalization: rms_norm
    bias: false
    lm_head_type: generic
  hidden_dim: 608
  context_window: 512
  vocab_size: 50257
  model_shell_type: standard
  embedding_weight_tying: true
  positional_encoding_type: rope

trainer:
  dropout_scheduler:
    dropout_type: constant
    dropout: 0
  dataset: openwebtext
  training:
    trainer_type: base_trainer
    batch_size: 24
    gradient_accumulation_steps: 20
    max_iters: 30000
    lr_decay_iters: 30000  # same value as max_iters
    warmup_iters: 5000
    eval_interval: 2000
    log_interval: 10
    eval_iters: 500
    # Integer step count (was the float 1000000000.0). The value exceeds
    # max_iters, so presumably no periodic checkpoint is written - confirm.
    checkpoint_interval: 1000000000
    run_profiler: false
  # Two evaluator entries: "mcq" over the listed benchmarks, then "prog".
  eval:
    - benchmarks:
        - "winograd"
        - "hellaswag"
        - "arc"
        - "mmlu"
        - "blimp"
      num_samples: 1000
      evaluator: "mcq"
    - evaluator: "prog"
  optimizer:
    name: nanoGPTadamW
    lr: 0.0006
    min_lr: 6.0e-05
    weight_decay: 0.1
    beta1: 0.9
    beta2: 0.95
    grad_clip: 1.0
    decay_lr: true
    # Also set under trainer.training; presumably both must agree -
    # confirm which one the trainer actually reads.
    warmup_iters: 5000
  lr_scheduler:
    name: cosine
  dataloader:
    name: standard
  datasampling:
    name: standard
  loss_fn:
    name: cross_entropy

general:
  logging:
    wandb_log: true
    wandb_project: SuperTinyLanguageModels
  paths:
    output_dir: outputs
    data_dir: "data"
    checkpoint_dir: checkpoints
  seed: 489
  device: cuda
Loading