diff --git a/README.md b/README.md index 8c98bbe2..ac872927 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,7 @@ Please note that this repository is an evolving work in progress, reflecting the |-----------------------------|--------------------------------|---------------------|-----------------|--------------| | Dropout | `dropout-sched-exp` | | [View preregistration](https://github.com/LeonGuertler/SuperTinyLanguageModels/blob/main/pre_reports/dropout_prereport.pdf) | In progress | | Knowledge distillation | `feature/knowledge-distillation/replace-teacher-tokenizer`| | [View preregistration](https://github.com/LeonGuertler/SuperTinyLanguageModels/blob/feature/knowledge-distillation/replace-teacher-tokenizer/reports/preregistration-knowledgedistillation.pdf) | In progress | +| Weight Tying | `ffn-sharing` | | [View preregistration](pre_reports/weight_tying_prereport.pdf)| In progress | | Byte Level | | | In progress | In progress | | Self Play Evals | | [Join the room](https://discord.gg/hgVhe6Hp) | In progress| In progress | | Optimizers | `optimizers` | [Join the room](https://discord.gg/S5Q2ZmWY) | In progress | In progress diff --git a/configs/full_configs/baseline.yaml b/configs/full_configs/baseline.yaml index 85b5e908..cb40ed5a 100644 --- a/configs/full_configs/baseline.yaml +++ b/configs/full_configs/baseline.yaml @@ -75,11 +75,11 @@ trainer: name: cross_entropy general: logging: - wandb_log: false + wandb_log: true wandb_project: SuperTinyLanguageModels paths: output_dir: outputs - data_dir: data + data_dir: "data" checkpoint_dir: checkpoints seed: 489 device: cuda diff --git a/configs/full_configs/good_boi.yaml b/configs/full_configs/good_boi.yaml new file mode 100644 index 00000000..bdd954ec --- /dev/null +++ b/configs/full_configs/good_boi.yaml @@ -0,0 +1,87 @@ +model: + k_interior_layers: 2 + lora_rank: 64 + core_model: + core_model_type: weight_sharing + num_layers: 12 + ffn: + ffn_type: swiglu + ffn_dim: 1440 + normalization: rms_norm + bias: 
false + attn: + attn_type: generic + num_heads: 16 + normalization: rms_norm + group_size: 4 + bias: false + is_causal: true + embedder: + tokenizer_type: gpt2 + embedding_model_type: generic + dataset_name: simple_en_wiki + lm_head: + normalization: rms_norm + bias: false + lm_head_type: generic + hidden_dim: 512 + context_window: 512 + vocab_size: 50257 + model_shell_type: standard + embedding_weight_tying: true + positional_encoding_type: rope +trainer: + dropout_scheduler: + dropout_type: constant + dropout: 0 + dataset: openwebtext + training: + trainer_type: base_trainer + batch_size: 24 + gradient_accumulation_steps: 20 + max_iters: 150000 + lr_decay_iters: 150000 + warmup_iters: 5000 + eval_interval: 2000 + log_interval: 10 + eval_iters: 500 + checkpoint_interval: 10000 + run_profiler: false + eval: + - benchmarks: + - "winograd" + - "hellaswag" + - "arc" + - "mmlu" + - "blimp" + num_samples: 1000 + evaluator: "mcq" + - evaluator: "prog" + optimizer: + name: nanoGPTadamW + lr: 0.0006 + min_lr: 6.0e-05 + weight_decay: 0.1 + beta1: 0.9 + beta2: 0.95 + grad_clip: 1.0 + decay_lr: true + warmup_iters: 5000 + lr_scheduler: + name: cosine + dataloader: + name: standard + datasampling: + name: standard + loss_fn: + name: cross_entropy +general: + logging: + wandb_log: true + wandb_project: SuperTinyLanguageModels + paths: + output_dir: outputs + data_dir: "data" + checkpoint_dir: checkpoints + seed: 489 + device: cuda diff --git a/configs/full_configs/shared_baseline.yaml b/configs/full_configs/shared_baseline.yaml new file mode 100644 index 00000000..607f4db0 --- /dev/null +++ b/configs/full_configs/shared_baseline.yaml @@ -0,0 +1,85 @@ +model: + core_model: + core_model_type: generic + num_layers: 8 + ffn: + ffn_type: swiglu + ffn_dim: 1361 + normalization: rms_norm + bias: false + attn: + attn_type: generic + num_heads: 16 + normalization: rms_norm + group_size: 4 + bias: false + is_causal: true + embedder: + tokenizer_type: gpt2 + embedding_model_type: generic 
+ dataset_name: stlm + lm_head: + normalization: rms_norm + bias: false + lm_head_type: generic + hidden_dim: 528 + context_window: 512 + vocab_size: 50257 + model_shell_type: standard + embedding_weight_tying: true + positional_encoding_type: rope +trainer: + dropout_scheduler: + dropout_type: constant + dropout: 0 + dataset: openwebtext + training: + trainer_type: base_trainer + batch_size: 24 + gradient_accumulation_steps: 20 + max_iters: 30000 + lr_decay_iters: 30000 + warmup_iters: 5000 + eval_interval: 2000 + log_interval: 10 + eval_iters: 500 + checkpoint_interval: 1000000000.0 + run_profiler: false + eval: + - benchmarks: + - "winograd" + - "hellaswag" + - "arc" + - "mmlu" + - "blimp" + num_samples: 1000 + evaluator: "mcq" + - evaluator: "prog" + optimizer: + name: nanoGPTadamW + lr: 0.0006 + min_lr: 6.0e-05 + weight_decay: 0.1 + beta1: 0.9 + beta2: 0.95 + grad_clip: 1.0 + decay_lr: true + warmup_iters: 5000 + lr_scheduler: + name: cosine + dataloader: + name: standard + datasampling: + name: standard + loss_fn: + name: cross_entropy +general: + logging: + wandb_log: true + wandb_project: SuperTinyLanguageModels + paths: + output_dir: outputs + data_dir: "data" + checkpoint_dir: checkpoints + seed: 489 + device: cuda diff --git a/configs/full_configs/shared_deep.yaml b/configs/full_configs/shared_deep.yaml new file mode 100644 index 00000000..91bb002b --- /dev/null +++ b/configs/full_configs/shared_deep.yaml @@ -0,0 +1,87 @@ +model: + k_interior_layers: 0 + lora_rank: null + core_model: + core_model_type: weight_sharing + num_layers: 14 + ffn: + ffn_type: swiglu + ffn_dim: 1568 + normalization: rms_norm + bias: false + attn: + attn_type: generic + num_heads: 16 + normalization: rms_norm + group_size: 4 + bias: false + is_causal: true + embedder: + tokenizer_type: gpt2 + embedding_model_type: generic + dataset_name: stlm + lm_head: + normalization: rms_norm + bias: false + lm_head_type: generic + hidden_dim: 608 + context_window: 512 + vocab_size: 50257 + 
model_shell_type: standard + embedding_weight_tying: true + positional_encoding_type: rope +trainer: + dropout_scheduler: + dropout_type: constant + dropout: 0 + dataset: openwebtext + training: + trainer_type: base_trainer + batch_size: 24 + gradient_accumulation_steps: 20 + max_iters: 30000 + lr_decay_iters: 30000 + warmup_iters: 5000 + eval_interval: 2000 + log_interval: 10 + eval_iters: 500 + checkpoint_interval: 1000000000.0 + run_profiler: false + eval: + - benchmarks: + - "winograd" + - "hellaswag" + - "arc" + - "mmlu" + - "blimp" + num_samples: 1000 + evaluator: "mcq" + - evaluator: "prog" + optimizer: + name: nanoGPTadamW + lr: 0.0006 + min_lr: 6.0e-05 + weight_decay: 0.1 + beta1: 0.9 + beta2: 0.95 + grad_clip: 1.0 + decay_lr: true + warmup_iters: 5000 + lr_scheduler: + name: cosine + dataloader: + name: standard + datasampling: + name: standard + loss_fn: + name: cross_entropy +general: + logging: + wandb_log: true + wandb_project: SuperTinyLanguageModels + paths: + output_dir: outputs + data_dir: "data" + checkpoint_dir: checkpoints + seed: 489 + device: cuda diff --git a/configs/full_configs/shared_k_1_wide.yaml b/configs/full_configs/shared_k_1_wide.yaml new file mode 100644 index 00000000..eed263a3 --- /dev/null +++ b/configs/full_configs/shared_k_1_wide.yaml @@ -0,0 +1,87 @@ +model: + k_interior_layers: 1 + lora_rank: null + core_model: + core_model_type: weight_sharing + num_layers: 8 + ffn: + ffn_type: swiglu + ffn_dim: 1568 + normalization: rms_norm + bias: false + attn: + attn_type: generic + num_heads: 16 + normalization: rms_norm + group_size: 4 + bias: false + is_causal: true + embedder: + tokenizer_type: gpt2 + embedding_model_type: generic + dataset_name: stlm + lm_head: + normalization: rms_norm + bias: false + lm_head_type: generic + hidden_dim: 608 + context_window: 512 + vocab_size: 50257 + model_shell_type: standard + embedding_weight_tying: true + positional_encoding_type: rope +trainer: + dropout_scheduler: + dropout_type: constant + 
dropout: 0 + dataset: openwebtext + training: + trainer_type: base_trainer + batch_size: 24 + gradient_accumulation_steps: 20 + max_iters: 30000 + lr_decay_iters: 30000 + warmup_iters: 5000 + eval_interval: 2000 + log_interval: 10 + eval_iters: 500 + checkpoint_interval: 1000000000.0 + run_profiler: false + eval: + - benchmarks: + - "winograd" + - "hellaswag" + - "arc" + - "mmlu" + - "blimp" + num_samples: 1000 + evaluator: "mcq" + - evaluator: "prog" + optimizer: + name: nanoGPTadamW + lr: 0.0006 + min_lr: 6.0e-05 + weight_decay: 0.1 + beta1: 0.9 + beta2: 0.95 + grad_clip: 1.0 + decay_lr: true + warmup_iters: 5000 + lr_scheduler: + name: cosine + dataloader: + name: standard + datasampling: + name: standard + loss_fn: + name: cross_entropy +general: + logging: + wandb_log: true + wandb_project: SuperTinyLanguageModels + paths: + output_dir: outputs + data_dir: "data" + checkpoint_dir: checkpoints + seed: 489 + device: cuda diff --git a/configs/full_configs/shared_k_2_deep.yaml b/configs/full_configs/shared_k_2_deep.yaml new file mode 100644 index 00000000..d75c5a87 --- /dev/null +++ b/configs/full_configs/shared_k_2_deep.yaml @@ -0,0 +1,87 @@ +model: + k_interior_layers: 2 + lora_rank: null + core_model: + core_model_type: weight_sharing + num_layers: 12 + ffn: + ffn_type: swiglu + ffn_dim: 1440 + normalization: rms_norm + bias: false + attn: + attn_type: generic + num_heads: 16 + normalization: rms_norm + group_size: 4 + bias: false + is_causal: true + embedder: + tokenizer_type: gpt2 + embedding_model_type: generic + dataset_name: stlm + lm_head: + normalization: rms_norm + bias: false + lm_head_type: generic + hidden_dim: 544 + context_window: 512 + vocab_size: 50257 + model_shell_type: standard + embedding_weight_tying: true + positional_encoding_type: rope +trainer: + dropout_scheduler: + dropout_type: constant + dropout: 0 + dataset: openwebtext + training: + trainer_type: base_trainer + batch_size: 24 + gradient_accumulation_steps: 20 + max_iters: 30000 + 
lr_decay_iters: 30000 + warmup_iters: 5000 + eval_interval: 2000 + log_interval: 10 + eval_iters: 500 + checkpoint_interval: 1000000000.0 + run_profiler: false + eval: + - benchmarks: + - "winograd" + - "hellaswag" + - "arc" + - "mmlu" + - "blimp" + num_samples: 1000 + evaluator: "mcq" + - evaluator: "prog" + optimizer: + name: nanoGPTadamW + lr: 0.0006 + min_lr: 6.0e-05 + weight_decay: 0.1 + beta1: 0.9 + beta2: 0.95 + grad_clip: 1.0 + decay_lr: true + warmup_iters: 5000 + lr_scheduler: + name: cosine + dataloader: + name: standard + datasampling: + name: standard + loss_fn: + name: cross_entropy +general: + logging: + wandb_log: true + wandb_project: SuperTinyLanguageModels + paths: + output_dir: outputs + data_dir: "data" + checkpoint_dir: checkpoints + seed: 489 + device: cuda diff --git a/configs/full_configs/shared_k_2_wide.yaml b/configs/full_configs/shared_k_2_wide.yaml new file mode 100644 index 00000000..f7bd51bc --- /dev/null +++ b/configs/full_configs/shared_k_2_wide.yaml @@ -0,0 +1,87 @@ +model: + k_interior_layers: 2 + lora_rank: null + core_model: + core_model_type: weight_sharing + num_layers: 8 + ffn: + ffn_type: swiglu + ffn_dim: 1444 + normalization: rms_norm + bias: false + attn: + attn_type: generic + num_heads: 16 + normalization: rms_norm + group_size: 4 + bias: false + is_causal: true + embedder: + tokenizer_type: gpt2 + embedding_model_type: generic + dataset_name: stlm + lm_head: + normalization: rms_norm + bias: false + lm_head_type: generic + hidden_dim: 560 + context_window: 512 + vocab_size: 50257 + model_shell_type: standard + embedding_weight_tying: true + positional_encoding_type: rope +trainer: + dropout_scheduler: + dropout_type: constant + dropout: 0 + dataset: openwebtext + training: + trainer_type: base_trainer + batch_size: 24 + gradient_accumulation_steps: 20 + max_iters: 30000 + lr_decay_iters: 30000 + warmup_iters: 5000 + eval_interval: 2000 + log_interval: 10 + eval_iters: 500 + checkpoint_interval: 1000000000.0 + 
run_profiler: false + eval: + - benchmarks: + - "winograd" + - "hellaswag" + - "arc" + - "mmlu" + - "blimp" + num_samples: 1000 + evaluator: "mcq" + - evaluator: "prog" + optimizer: + name: nanoGPTadamW + lr: 0.0006 + min_lr: 6.0e-05 + weight_decay: 0.1 + beta1: 0.9 + beta2: 0.95 + grad_clip: 1.0 + decay_lr: true + warmup_iters: 5000 + lr_scheduler: + name: cosine + dataloader: + name: standard + datasampling: + name: standard + loss_fn: + name: cross_entropy +general: + logging: + wandb_log: true + wandb_project: SuperTinyLanguageModels + paths: + output_dir: outputs + data_dir: "data" + checkpoint_dir: checkpoints + seed: 489 + device: cuda diff --git a/configs/full_configs/shared_r_64.yaml b/configs/full_configs/shared_r_64.yaml new file mode 100644 index 00000000..16ea6543 --- /dev/null +++ b/configs/full_configs/shared_r_64.yaml @@ -0,0 +1,87 @@ +model: + k_interior_layers: 0 + lora_rank: 64 + core_model: + core_model_type: weight_sharing + num_layers: 8 + ffn: + ffn_type: swiglu + ffn_dim: 1650 + normalization: rms_norm + bias: false + attn: + attn_type: generic + num_heads: 16 + normalization: rms_norm + group_size: 4 + bias: false + is_causal: true + embedder: + tokenizer_type: gpt2 + embedding_model_type: generic + dataset_name: stlm + lm_head: + normalization: rms_norm + bias: false + lm_head_type: generic + hidden_dim: 640 + context_window: 512 + vocab_size: 50257 + model_shell_type: standard + embedding_weight_tying: true + positional_encoding_type: rope +trainer: + dropout_scheduler: + dropout_type: constant + dropout: 0 + dataset: openwebtext + training: + trainer_type: base_trainer + batch_size: 24 + gradient_accumulation_steps: 20 + max_iters: 30000 + lr_decay_iters: 30000 + warmup_iters: 5000 + eval_interval: 2000 + log_interval: 10 + eval_iters: 500 + checkpoint_interval: 1000000000.0 + run_profiler: false + eval: + - benchmarks: + - "winograd" + - "hellaswag" + - "arc" + - "mmlu" + - "blimp" + num_samples: 1000 + evaluator: "mcq" + - evaluator: 
"prog" + optimizer: + name: nanoGPTadamW + lr: 0.0006 + min_lr: 6.0e-05 + weight_decay: 0.1 + beta1: 0.9 + beta2: 0.95 + grad_clip: 1.0 + decay_lr: true + warmup_iters: 5000 + lr_scheduler: + name: cosine + dataloader: + name: standard + datasampling: + name: standard + loss_fn: + name: cross_entropy +general: + logging: + wandb_log: true + wandb_project: SuperTinyLanguageModels + paths: + output_dir: outputs + data_dir: "data" + checkpoint_dir: checkpoints + seed: 489 + device: cuda diff --git a/configs/full_configs/shared_r_64_k_1.yaml b/configs/full_configs/shared_r_64_k_1.yaml new file mode 100644 index 00000000..3424d75f --- /dev/null +++ b/configs/full_configs/shared_r_64_k_1.yaml @@ -0,0 +1,87 @@ +model: + k_interior_layers: 1 + lora_rank: 64 + core_model: + core_model_type: weight_sharing + num_layers: 8 + ffn: + ffn_type: swiglu + ffn_dim: 1527 + normalization: rms_norm + bias: false + attn: + attn_type: generic + num_heads: 16 + normalization: rms_norm + group_size: 4 + bias: false + is_causal: true + embedder: + tokenizer_type: gpt2 + embedding_model_type: generic + dataset_name: stlm + lm_head: + normalization: rms_norm + bias: false + lm_head_type: generic + hidden_dim: 592 + context_window: 512 + vocab_size: 50257 + model_shell_type: standard + embedding_weight_tying: true + positional_encoding_type: rope +trainer: + dropout_scheduler: + dropout_type: constant + dropout: 0 + dataset: openwebtext + training: + trainer_type: base_trainer + batch_size: 24 + gradient_accumulation_steps: 20 + max_iters: 30000 + lr_decay_iters: 30000 + warmup_iters: 5000 + eval_interval: 2000 + log_interval: 10 + eval_iters: 500 + checkpoint_interval: 1000000000.0 + run_profiler: false + eval: + - benchmarks: + - "winograd" + - "hellaswag" + - "arc" + - "mmlu" + - "blimp" + num_samples: 1000 + evaluator: "mcq" + - evaluator: "prog" + optimizer: + name: nanoGPTadamW + lr: 0.0006 + min_lr: 6.0e-05 + weight_decay: 0.1 + beta1: 0.9 + beta2: 0.95 + grad_clip: 1.0 + decay_lr: 
true + warmup_iters: 5000 + lr_scheduler: + name: cosine + dataloader: + name: standard + datasampling: + name: standard + loss_fn: + name: cross_entropy +general: + logging: + wandb_log: true + wandb_project: SuperTinyLanguageModels + paths: + output_dir: outputs + data_dir: "data" + checkpoint_dir: checkpoints + seed: 489 + device: cuda diff --git a/configs/full_configs/shared_r_64_k_2_long.yaml b/configs/full_configs/shared_r_64_k_2_long.yaml new file mode 100644 index 00000000..406b6a34 --- /dev/null +++ b/configs/full_configs/shared_r_64_k_2_long.yaml @@ -0,0 +1,87 @@ +model: + k_interior_layers: 2 + lora_rank: 64 + core_model: + core_model_type: weight_sharing + num_layers: 12 + ffn: + ffn_type: swiglu + ffn_dim: 1440 + normalization: rms_norm + bias: false + attn: + attn_type: generic + num_heads: 16 + normalization: rms_norm + group_size: 4 + bias: false + is_causal: true + embedder: + tokenizer_type: gpt2 + embedding_model_type: generic + dataset_name: stlm + lm_head: + normalization: rms_norm + bias: false + lm_head_type: generic + hidden_dim: 512 + context_window: 512 + vocab_size: 50257 + model_shell_type: standard + embedding_weight_tying: true + positional_encoding_type: rope +trainer: + dropout_scheduler: + dropout_type: constant + dropout: 0 + dataset: openwebtext + training: + trainer_type: base_trainer + batch_size: 24 + gradient_accumulation_steps: 20 + max_iters: 30000 + lr_decay_iters: 30000 + warmup_iters: 5000 + eval_interval: 2000 + log_interval: 10 + eval_iters: 500 + checkpoint_interval: 1000000000.0 + run_profiler: false + eval: + - benchmarks: + - "winograd" + - "hellaswag" + - "arc" + - "mmlu" + - "blimp" + num_samples: 1000 + evaluator: "mcq" + - evaluator: "prog" + optimizer: + name: nanoGPTadamW + lr: 0.0006 + min_lr: 6.0e-05 + weight_decay: 0.1 + beta1: 0.9 + beta2: 0.95 + grad_clip: 1.0 + decay_lr: true + warmup_iters: 5000 + lr_scheduler: + name: cosine + dataloader: + name: standard + datasampling: + name: standard + loss_fn: + 
name: cross_entropy +general: + logging: + wandb_log: true + wandb_project: SuperTinyLanguageModels + paths: + output_dir: outputs + data_dir: "data" + checkpoint_dir: checkpoints + seed: 489 + device: cuda diff --git a/configs/full_configs/shared_r_64_long.yaml b/configs/full_configs/shared_r_64_long.yaml new file mode 100644 index 00000000..e7d69707 --- /dev/null +++ b/configs/full_configs/shared_r_64_long.yaml @@ -0,0 +1,87 @@ +model: + k_interior_layers: 0 + lora_rank: 64 + core_model: + core_model_type: weight_sharing + num_layers: 12 + ffn: + ffn_type: swiglu + ffn_dim: 1527 + normalization: rms_norm + bias: false + attn: + attn_type: generic + num_heads: 16 + normalization: rms_norm + group_size: 4 + bias: false + is_causal: true + embedder: + tokenizer_type: gpt2 + embedding_model_type: generic + dataset_name: stlm + lm_head: + normalization: rms_norm + bias: false + lm_head_type: generic + hidden_dim: 592 + context_window: 512 + vocab_size: 50257 + model_shell_type: standard + embedding_weight_tying: true + positional_encoding_type: rope +trainer: + dropout_scheduler: + dropout_type: constant + dropout: 0 + dataset: openwebtext + training: + trainer_type: base_trainer + batch_size: 24 + gradient_accumulation_steps: 20 + max_iters: 30000 + lr_decay_iters: 30000 + warmup_iters: 5000 + eval_interval: 2000 + log_interval: 10 + eval_iters: 500 + checkpoint_interval: 1000000000.0 + run_profiler: false + eval: + - benchmarks: + - "winograd" + - "hellaswag" + - "arc" + - "mmlu" + - "blimp" + num_samples: 1000 + evaluator: "mcq" + - evaluator: "prog" + optimizer: + name: nanoGPTadamW + lr: 0.0006 + min_lr: 6.0e-05 + weight_decay: 0.1 + beta1: 0.9 + beta2: 0.95 + grad_clip: 1.0 + decay_lr: true + warmup_iters: 5000 + lr_scheduler: + name: cosine + dataloader: + name: standard + datasampling: + name: standard + loss_fn: + name: cross_entropy +general: + logging: + wandb_log: true + wandb_project: SuperTinyLanguageModels + paths: + output_dir: outputs + data_dir: 
"data" + checkpoint_dir: checkpoints + seed: 489 + device: cuda diff --git a/configs/full_configs/shared_untied_inputoutput.yaml b/configs/full_configs/shared_untied_inputoutput.yaml new file mode 100644 index 00000000..c4b888d2 --- /dev/null +++ b/configs/full_configs/shared_untied_inputoutput.yaml @@ -0,0 +1,85 @@ +model: + core_model: + core_model_type: generic + num_layers: 8 + ffn: + ffn_type: swiglu + ffn_dim: 1361 + normalization: rms_norm + bias: false + attn: + attn_type: generic + num_heads: 16 + normalization: rms_norm + group_size: 4 + bias: false + is_causal: true + embedder: + tokenizer_type: gpt2 + embedding_model_type: generic + dataset_name: stlm + lm_head: + normalization: rms_norm + bias: false + lm_head_type: generic + hidden_dim: 528 + context_window: 512 + vocab_size: 50257 + model_shell_type: standard + embedding_weight_tying: false + positional_encoding_type: rope +trainer: + dropout_scheduler: + dropout_type: constant + dropout: 0 + dataset: openwebtext + training: + trainer_type: base_trainer + batch_size: 24 + gradient_accumulation_steps: 20 + max_iters: 30000 + lr_decay_iters: 30000 + warmup_iters: 5000 + eval_interval: 2000 + log_interval: 10 + eval_iters: 500 + checkpoint_interval: 1000000000.0 + run_profiler: false + eval: + - benchmarks: + - "winograd" + - "hellaswag" + - "arc" + - "mmlu" + - "blimp" + num_samples: 1000 + evaluator: "mcq" + - evaluator: "prog" + optimizer: + name: nanoGPTadamW + lr: 0.0006 + min_lr: 6.0e-05 + weight_decay: 0.1 + beta1: 0.9 + beta2: 0.95 + grad_clip: 1.0 + decay_lr: true + warmup_iters: 5000 + lr_scheduler: + name: cosine + dataloader: + name: standard + datasampling: + name: standard + loss_fn: + name: cross_entropy +general: + logging: + wandb_log: true + wandb_project: SuperTinyLanguageModels + paths: + output_dir: outputs + data_dir: "data" + checkpoint_dir: checkpoints + seed: 489 + device: cuda diff --git a/configs/full_configs/shared_wide.yaml b/configs/full_configs/shared_wide.yaml new file 
mode 100644 index 00000000..8ccb4a2c --- /dev/null +++ b/configs/full_configs/shared_wide.yaml @@ -0,0 +1,87 @@ +model: + k_interior_layers: 0 + lora_rank: null + core_model: + core_model_type: weight_sharing + num_layers: 8 + ffn: + ffn_type: swiglu + ffn_dim: 1733 + normalization: rms_norm + bias: false + attn: + attn_type: generic + num_heads: 16 + normalization: rms_norm + group_size: 4 + bias: false + is_causal: true + embedder: + tokenizer_type: gpt2 + embedding_model_type: generic + dataset_name: stlm + lm_head: + normalization: rms_norm + bias: false + lm_head_type: generic + hidden_dim: 672 + context_window: 512 + vocab_size: 50257 + model_shell_type: standard + embedding_weight_tying: true + positional_encoding_type: rope +trainer: + dropout_scheduler: + dropout_type: constant + dropout: 0 + dataset: openwebtext + training: + trainer_type: base_trainer + batch_size: 24 + gradient_accumulation_steps: 20 + max_iters: 30000 + lr_decay_iters: 30000 + warmup_iters: 5000 + eval_interval: 2000 + log_interval: 10 + eval_iters: 500 + checkpoint_interval: 1000000000.0 + run_profiler: false + eval: + - benchmarks: + - "winograd" + - "hellaswag" + - "arc" + - "mmlu" + - "blimp" + num_samples: 1000 + evaluator: "mcq" + - evaluator: "prog" + optimizer: + name: nanoGPTadamW + lr: 0.0006 + min_lr: 6.0e-05 + weight_decay: 0.1 + beta1: 0.9 + beta2: 0.95 + grad_clip: 1.0 + decay_lr: true + warmup_iters: 5000 + lr_scheduler: + name: cosine + dataloader: + name: standard + datasampling: + name: standard + loss_fn: + name: cross_entropy +general: + logging: + wandb_log: true + wandb_project: SuperTinyLanguageModels + paths: + output_dir: outputs + data_dir: "data" + checkpoint_dir: checkpoints + seed: 489 + device: cuda diff --git a/configs/general/default.yaml b/configs/general/default.yaml index 9467d1c2..9cf8c88f 100644 --- a/configs/general/default.yaml +++ b/configs/general/default.yaml @@ -4,7 +4,7 @@ logging: paths: output_dir: "outputs" - data_dir: "data" + 
data_dir: "data" checkpoint_dir: "checkpoints" seed: 489 diff --git a/configs/generate.yaml b/configs/generate.yaml index 1a9cbf40..2d71d087 100644 --- a/configs/generate.yaml +++ b/configs/generate.yaml @@ -1,4 +1,4 @@ defaults: - generator: baseline -model_ckpt: "checkpoints/...pt" +model_ckpt: "outputs/2024-09-09/09-52-58/checkpoints/ckpt_149999.pt" diff --git a/configs/generator/baseline.yaml b/configs/generator/baseline.yaml index a62d72b7..5af4ea73 100644 --- a/configs/generator/baseline.yaml +++ b/configs/generator/baseline.yaml @@ -1,4 +1,5 @@ temperature: 0.8 -top_k: 10 +top_k: 200 max_new_tokens: 300 -input_text: "Earth is the third planet from the Sun and the only astronomical object known to harbor life. This is enabled by Earth being an ocean world, the only one in the Solar System sustaining liquid surface water." +# input_text: "Earth is the third planet from the Sun and the only astronomical object known to harbor life. This is enabled by Earth being an ocean world, the only one in the Solar System sustaining liquid surface water." 
+input_text: "Recipe for fish and chips:" \ No newline at end of file diff --git a/models/build_models.py b/models/build_models.py index 36236091..7eeff965 100644 --- a/models/build_models.py +++ b/models/build_models.py @@ -12,6 +12,7 @@ from models.experimental.next_thought.embedding_models import HierarchicalEncoder from models.experimental.next_thought.model_heads import VariableLengthLatentDecoder from models.experimental.next_thought.core_models import BaselineCoreModel, Conv1dCoreModel +from models.experimental import weight_sharing from models.model_heads import AutoregressiveLMHead from models.model_shell import ModelShell @@ -70,7 +71,8 @@ def build_embedding_model(model_cfg): "generic_ffn_sharing": GenericFFNSharedTransfomer, "hf_core": HFTransformerCore, "next_thought_baseline": BaselineCoreModel, - "conv": Conv1dCoreModel + "conv": Conv1dCoreModel, + "weight_sharing": weight_sharing.SharedInteriorFFNLora, } diff --git a/models/components/layers/attention.py b/models/components/layers/attention.py index 41b06b37..21f94a6e 100644 --- a/models/components/layers/attention.py +++ b/models/components/layers/attention.py @@ -21,15 +21,16 @@ def __init__( group_size, ): super().__init__() - assert hidden_dim % num_heads == 0, "Hidden dim must be divisible by num heads" - + self.head_dim = 2 * ((hidden_dim // num_heads) // 2) # make sure it's even + self.effective_hidden_dim = self.head_dim * num_heads # different for certain head_dims + self.group_dim = self.head_dim * (num_heads // group_size) # key, query, value projections for all heads self.c_attn = torch.nn.Linear( - hidden_dim, hidden_dim + 2 * hidden_dim // group_size, bias=bias + hidden_dim, self.effective_hidden_dim + 2 * self.group_dim, bias=bias ) # output projection - self.c_proj = torch.nn.Linear(hidden_dim, hidden_dim, bias=bias) + self.c_proj = torch.nn.Linear(self.effective_hidden_dim, hidden_dim, bias=bias) # attention dropout self.attn_dropout = torch.nn.Dropout() @@ -43,7 +44,7 @@ def __init__( if 
self.use_rope: assert context_window % 2 == 0 self.freqs_cis = compute_freqs_cis( - seq_len=context_window, head_dim=hidden_dim // num_heads + seq_len=context_window, head_dim=self.head_dim ) def forward(self, x, attention_mask=None): @@ -51,16 +52,15 @@ def forward(self, x, attention_mask=None): Forward pass """ assert attention_mask is None, "Not implemented yet" - B, S, H = x.size() + B, S, _ = x.size() num_grouped_heads = self.num_heads // self.group_size - group_hidden_dim = H // self.group_size # calculate query, key, values for all heads in batch # move head forward to be the batch dim - q, k, v = self.c_attn(x).split([H, group_hidden_dim, group_hidden_dim], dim=-1) - k = k.view(B, S, num_grouped_heads, H // self.num_heads) # (B, T, nh, hs) - q = q.view(B, S, self.num_heads, H // self.num_heads) # (B, T, nh, hs) - v = v.view(B, S, num_grouped_heads, H // self.num_heads).transpose( + q, k, v = self.c_attn(x).split([self.effective_hidden_dim, self.group_dim, self.group_dim], dim=-1) + k = k.view(B, S, num_grouped_heads, self.head_dim) # (B, T, nh, hs) + q = q.view(B, S, self.num_heads, self.head_dim) # (B, T, nh, hs) + v = v.view(B, S, num_grouped_heads, self.head_dim).transpose( 1, 2 ) # (B, nh, T, hs) @@ -86,11 +86,11 @@ def forward(self, x, attention_mask=None): ) # pylint: enable=not-callable y = ( - y.transpose(1, 2).contiguous().view(B, S, H) + y.transpose(1, 2).contiguous().view(B, S, self.effective_hidden_dim) ) # re-assemble all head outputs side by side # output projection - y = self.attn_dropout(self.c_proj(y)) # is this really necessary? 
+ y = self.attn_dropout(self.c_proj(y)) # Reshape to original dim return y diff --git a/models/experimental/weight_sharing.py b/models/experimental/weight_sharing.py new file mode 100644 index 00000000..f27fb576 --- /dev/null +++ b/models/experimental/weight_sharing.py @@ -0,0 +1,55 @@ +import torch.nn as nn +from models.core_models import GenericTransformer + +class LoRA(nn.Module): + def __init__(self, linear_layer, lora_rank): + """Wraps the linear layer with LoRA""" + super().__init__() + self.linear_layer = linear_layer + self.lora_rank = lora_rank + self.U = nn.Linear(linear_layer.in_features, lora_rank) + self.V = nn.Linear(lora_rank, linear_layer.out_features) + + def forward(self, x): + """Forward pass through the linear layer with LoRA""" + return self.linear_layer(x) + self.V(self.U(x)) + +class SharedInteriorFFNLora(GenericTransformer): + def __init__(self, model_cfg): + super().__init__(model_cfg) + self.k_interior_layers = model_cfg["k_interior_layers"] + self.lora_rank = model_cfg["lora_rank"] + + self._apply_weight_sharing_and_lora( + start_layer=1 + self.k_interior_layers, + end_layer=len(self.transformer.h) - self.k_interior_layers, + module_name='ffn' + ) + + def _apply_weight_sharing_and_lora(self, start_layer: int, end_layer: int, module_name: str): + base_module = getattr(self.transformer.h[start_layer], module_name) + shared_weights = {name: module.weight for name, module in base_module.named_modules() if isinstance(module, nn.Linear)} + + for i in range(start_layer, end_layer): + target_module = getattr(self.transformer.h[i], module_name) + for name, module in target_module.named_modules(): + if isinstance(module, nn.Linear): + module.weight = shared_weights[name] + if self.lora_rank is not None: + lora_module = LoRA(module, self.lora_rank) + setattr(target_module, name, lora_module) + + + +class SharedInteriorFFNLoraAndCProj(SharedInteriorFFNLora): + def __init__(self, model_cfg): + super().__init__(model_cfg) + + # now strictly share the 
c_proj weights w/o lora + for i in range(1 + self.k_interior_layers, len(self.transformer.h) - self.k_interior_layers): + base_cproj = self.transformer.h[1 + self.k_interior_layers].attn.c_proj + shared_cproj_weights = {name: module.weight for name, module in base_cproj.named_modules() if isinstance(module, nn.Linear)} + target_cproj = self.transformer.h[i].attn.c_proj + for name, module in target_cproj.named_modules(): + if isinstance(module, nn.Linear): + module.weight = shared_cproj_weights[name] \ No newline at end of file diff --git a/pre_reports/weight_tying_prereport.pdf b/pre_reports/weight_tying_prereport.pdf new file mode 100644 index 00000000..0c16c7d7 Binary files /dev/null and b/pre_reports/weight_tying_prereport.pdf differ