LeonGuertler · DylanASHillier · Aug 30, 2024 · Aug 30, 2024 · Sep 5, 2024 · Sep 5, 2024
diff --git a/README.md b/README.md
@@ -16,6 +16,7 @@ Please note that this repository is an evolving work in progress, reflecting the
 |-----------------------------|--------------------------------|---------------------|-----------------|--------------|
 | Dropout | `dropout-sched-exp`   |                | [View preregistration](https://github.com/LeonGuertler/SuperTinyLanguageModels/blob/main/pre_reports/dropout_prereport.pdf) | In progress |
 | Knowledge distillation  | `feature/knowledge-distillation/replace-teacher-tokenizer`|                  | [View preregistration](https://github.com/LeonGuertler/SuperTinyLanguageModels/blob/feature/knowledge-distillation/replace-teacher-tokenizer/reports/preregistration-knowledgedistillation.pdf)  | In progress  |
+| Weight Tying | `ffn-sharing` |  | [View preregistration](pre_reports/weight_tying_prereport.pdf) | In progress |
 | Byte Level             |        |                       | In progress  | In progress  |
 | Self Play Evals        |            |   [Join the room](https://discord.gg/hgVhe6Hp)                 | In progress| In progress  |
 | Optimizers   | `optimizers` | [Join the room](https://discord.gg/S5Q2ZmWY) | In progress | In progress

diff --git a/configs/full_configs/baseline.yaml b/configs/full_configs/baseline.yaml
@@ -75,11 +75,11 @@ trainer:
     name: cross_entropy
 general:
   logging:
-    wandb_log: false
+    wandb_log: true
     wandb_project: SuperTinyLanguageModels
   paths:
     output_dir: outputs
-    data_dir: data
+    data_dir: "data"
     checkpoint_dir: checkpoints
   seed: 489
   device: cuda
diff --git a/configs/full_configs/good_boi.yaml b/configs/full_configs/good_boi.yaml
@@ -0,0 +1,87 @@
+model:
+  k_interior_layers: 2
+  lora_rank: 64
+  core_model:
+    core_model_type: weight_sharing
+    num_layers: 12
+    ffn:
+      ffn_type: swiglu
+      ffn_dim: 1440
+      normalization: rms_norm
+      bias: false
+    attn:
+      attn_type: generic
+      num_heads: 16
+      normalization: rms_norm
+      group_size: 4
+      bias: false
+      is_causal: true
+  embedder:
+    tokenizer_type: gpt2
+    embedding_model_type: generic
+    dataset_name: simple_en_wiki
+  lm_head:
+    normalization: rms_norm
+    bias: false
+    lm_head_type: generic
+  hidden_dim: 512
+  context_window: 512
+  vocab_size: 50257
+  model_shell_type: standard
+  embedding_weight_tying: true
+  positional_encoding_type: rope
+trainer:
+  dropout_scheduler:
+    dropout_type: constant
+    dropout: 0
+  dataset: openwebtext
+  training:
+    trainer_type: base_trainer
+    batch_size: 24
+    gradient_accumulation_steps: 20
+    max_iters: 150000
+    lr_decay_iters: 150000
+    warmup_iters: 5000
+    eval_interval: 2000
+    log_interval: 10
+    eval_iters: 500
+    checkpoint_interval: 10000
+    run_profiler: false
+  eval:
+    - benchmarks:
+        - "winograd"
+        - "hellaswag"
+        - "arc"
+        - "mmlu"
+        - "blimp"
+      num_samples: 1000
+      evaluator: "mcq"
+    - evaluator: "prog"
+  optimizer:
+    name: nanoGPTadamW
+    lr: 0.0006
+    min_lr: 6.0e-05
+    weight_decay: 0.1
+    beta1: 0.9
+    beta2: 0.95
+    grad_clip: 1.0
+    decay_lr: true
+    warmup_iters: 5000
+  lr_scheduler:
+    name: cosine
+  dataloader:
+    name: standard
+  datasampling:
+    name: standard
+  loss_fn:
+    name: cross_entropy
+general:
+  logging:
+    wandb_log: true
+    wandb_project: SuperTinyLanguageModels
+  paths:
+    output_dir: outputs
+    data_dir: "data"
+    checkpoint_dir: checkpoints
+  seed: 489
+  device: cuda
diff --git a/configs/full_configs/shared_baseline.yaml b/configs/full_configs/shared_baseline.yaml
@@ -0,0 +1,85 @@
+model:
+  core_model:
+    core_model_type: generic
+    num_layers: 8
+    ffn:
+      ffn_type: swiglu
+      ffn_dim: 1361
+      normalization: rms_norm
+      bias: false
+    attn:
+      attn_type: generic
+      num_heads: 16
+      normalization: rms_norm
+      group_size: 4
+      bias: false
+      is_causal: true
+  embedder:
+    tokenizer_type: gpt2
+    embedding_model_type: generic
+    dataset_name: stlm
+  lm_head:
+    normalization: rms_norm
+    bias: false
+    lm_head_type: generic
+  hidden_dim: 528
+  context_window: 512
+  vocab_size: 50257
+  model_shell_type: standard
+  embedding_weight_tying: true
+  positional_encoding_type: rope
+trainer:
+  dropout_scheduler:
+    dropout_type: constant
+    dropout: 0
+  dataset: openwebtext
+  training:
+    trainer_type: base_trainer
+    batch_size: 24
+    gradient_accumulation_steps: 20
+    max_iters: 30000
+    lr_decay_iters: 30000
+    warmup_iters: 5000
+    eval_interval: 2000
+    log_interval: 10
+    eval_iters: 500
+    checkpoint_interval: 1000000000
+    run_profiler: false
+  eval:
+    - benchmarks:
+        - "winograd"
+        - "hellaswag"
+        - "arc"
+        - "mmlu"
+        - "blimp"
+      num_samples: 1000
+      evaluator: "mcq"
+    - evaluator: "prog"
+  optimizer:
+    name: nanoGPTadamW
+    lr: 0.0006
+    min_lr: 6.0e-05
+    weight_decay: 0.1
+    beta1: 0.9
+    beta2: 0.95
+    grad_clip: 1.0
+    decay_lr: true
+    warmup_iters: 5000
+  lr_scheduler:
+    name: cosine
+  dataloader:
+    name: standard
+  datasampling:
+    name: standard
+  loss_fn:
+    name: cross_entropy
+general:
+  logging:
+    wandb_log: true
+    wandb_project: SuperTinyLanguageModels
+  paths:
+    output_dir: outputs
+    data_dir: "data"
+    checkpoint_dir: checkpoints
+  seed: 489
+  device: cuda
diff --git a/configs/full_configs/shared_deep.yaml b/configs/full_configs/shared_deep.yaml
@@ -0,0 +1,87 @@
+model:
+  k_interior_layers: 0
+  lora_rank: null
+  core_model:
+    core_model_type: weight_sharing
+    num_layers: 14
+    ffn:
+      ffn_type: swiglu
+      ffn_dim: 1568
+      normalization: rms_norm
+      bias: false
+    attn:
+      attn_type: generic
+      num_heads: 16
+      normalization: rms_norm
+      group_size: 4
+      bias: false
+      is_causal: true
+  embedder:
+    tokenizer_type: gpt2
+    embedding_model_type: generic
+    dataset_name: stlm
+  lm_head:
+    normalization: rms_norm
+    bias: false
+    lm_head_type: generic
+  hidden_dim: 608
+  context_window: 512
+  vocab_size: 50257
+  model_shell_type: standard
+  embedding_weight_tying: true
+  positional_encoding_type: rope
+trainer:
+  dropout_scheduler:
+    dropout_type: constant
+    dropout: 0
+  dataset: openwebtext
+  training:
+    trainer_type: base_trainer
+    batch_size: 24
+    gradient_accumulation_steps: 20
+    max_iters: 30000
+    lr_decay_iters: 30000
+    warmup_iters: 5000
+    eval_interval: 2000
+    log_interval: 10
+    eval_iters: 500
+    checkpoint_interval: 1000000000
+    run_profiler: false
+  eval:
+    - benchmarks:
+        - "winograd"
+        - "hellaswag"
+        - "arc"
+        - "mmlu"
+        - "blimp"
+      num_samples: 1000
+      evaluator: "mcq"
+    - evaluator: "prog"
+  optimizer:
+    name: nanoGPTadamW
+    lr: 0.0006
+    min_lr: 6.0e-05
+    weight_decay: 0.1
+    beta1: 0.9
+    beta2: 0.95
+    grad_clip: 1.0
+    decay_lr: true
+    warmup_iters: 5000
+  lr_scheduler:
+    name: cosine
+  dataloader:
+    name: standard
+  datasampling:
+    name: standard
+  loss_fn:
+    name: cross_entropy
+general:
+  logging:
+    wandb_log: true
+    wandb_project: SuperTinyLanguageModels
+  paths:
+    output_dir: outputs
+    data_dir: "data"
+    checkpoint_dir: checkpoints
+  seed: 489
+  device: cuda
diff --git a/configs/full_configs/shared_k_1_wide.yaml b/configs/full_configs/shared_k_1_wide.yaml
@@ -0,0 +1,87 @@
+model:
+  k_interior_layers: 1
+  lora_rank: null
+  core_model:
+    core_model_type: weight_sharing
+    num_layers: 8
+    ffn:
+      ffn_type: swiglu
+      ffn_dim: 1568
+      normalization: rms_norm
+      bias: false
+    attn:
+      attn_type: generic
+      num_heads: 16
+      normalization: rms_norm
+      group_size: 4
+      bias: false
+      is_causal: true
+  embedder:
+    tokenizer_type: gpt2
+    embedding_model_type: generic
+    dataset_name: stlm
+  lm_head:
+    normalization: rms_norm
+    bias: false
+    lm_head_type: generic
+  hidden_dim: 608
+  context_window: 512
+  vocab_size: 50257
+  model_shell_type: standard
+  embedding_weight_tying: true
+  positional_encoding_type: rope
+trainer:
+  dropout_scheduler:
+    dropout_type: constant
+    dropout: 0
+  dataset: openwebtext
+  training:
+    trainer_type: base_trainer
+    batch_size: 24
+    gradient_accumulation_steps: 20
+    max_iters: 30000
+    lr_decay_iters: 30000
+    warmup_iters: 5000
+    eval_interval: 2000
+    log_interval: 10
+    eval_iters: 500
+    checkpoint_interval: 1000000000
+    run_profiler: false
+  eval:
+    - benchmarks:
+        - "winograd"
+        - "hellaswag"
+        - "arc"
+        - "mmlu"
+        - "blimp"
+      num_samples: 1000
+      evaluator: "mcq"
+    - evaluator: "prog"
+  optimizer:
+    name: nanoGPTadamW
+    lr: 0.0006
+    min_lr: 6.0e-05
+    weight_decay: 0.1
+    beta1: 0.9
+    beta2: 0.95
+    grad_clip: 1.0
+    decay_lr: true
+    warmup_iters: 5000
+  lr_scheduler:
+    name: cosine
+  dataloader:
+    name: standard
+  datasampling:
+    name: standard
+  loss_fn:
+    name: cross_entropy
+general:
+  logging:
+    wandb_log: true
+    wandb_project: SuperTinyLanguageModels
+  paths:
+    output_dir: outputs
+    data_dir: "data"
+    checkpoint_dir: checkpoints
+  seed: 489
+  device: cuda