diff --git a/README.md b/README.md
index 8c98bbe2..ac872927 100644
--- a/README.md
+++ b/README.md
@@ -16,6 +16,7 @@ Please note that this repository is an evolving work in progress, reflecting the
 |-----------------------------|--------------------------------|---------------------|-----------------|--------------|
 | Dropout | `dropout-sched-exp`   |                | [View preregistration](https://github.com/LeonGuertler/SuperTinyLanguageModels/blob/main/pre_reports/dropout_prereport.pdf) | In progress |
 | Knowledge distillation  | `feature/knowledge-distillation/replace-teacher-tokenizer`|                  | [View preregistration](https://github.com/LeonGuertler/SuperTinyLanguageModels/blob/feature/knowledge-distillation/replace-teacher-tokenizer/reports/preregistration-knowledgedistillation.pdf)  | In progress  |
+| Weight Tying | `ffn-sharing` |  | [View preregistration](pre_reports/weight_tying_prereport.pdf)| In progress |
 | Byte Level             |        |                       | In progress  | In progress  |
 | Self Play Evals        |            |   [Join the room](https://discord.gg/hgVhe6Hp)                 | In progress| In progress  |
 | Optimizers   | `optimizers` | [Join the room](https://discord.gg/S5Q2ZmWY) | In progress | In progress
diff --git a/configs/full_configs/baseline.yaml b/configs/full_configs/baseline.yaml
index 85b5e908..cb40ed5a 100644
--- a/configs/full_configs/baseline.yaml
+++ b/configs/full_configs/baseline.yaml
@@ -75,11 +75,11 @@ trainer:
     name: cross_entropy
 general:
   logging:
-    wandb_log: false
+    wandb_log: true
     wandb_project: SuperTinyLanguageModels
   paths:
     output_dir: outputs
-    data_dir: data
+    data_dir: "data"
     checkpoint_dir: checkpoints
   seed: 489
   device: cuda
diff --git a/configs/full_configs/good_boi.yaml b/configs/full_configs/good_boi.yaml
new file mode 100644
index 00000000..bdd954ec
--- /dev/null
+++ b/configs/full_configs/good_boi.yaml
@@ -0,0 +1,87 @@
+model:
+  k_interior_layers: 2
+  lora_rank: 64
+  core_model:
+    core_model_type: weight_sharing
+    num_layers: 12
+    ffn:
+      ffn_type: swiglu
+      ffn_dim: 1440
+      normalization: rms_norm
+      bias: false
+    attn:
+      attn_type: generic
+      num_heads: 16
+      normalization: rms_norm
+      group_size: 4
+      bias: false
+      is_causal: true
+  embedder:
+    tokenizer_type: gpt2
+    embedding_model_type: generic
+    dataset_name: simple_en_wiki
+  lm_head:
+    normalization: rms_norm
+    bias: false
+    lm_head_type: generic
+  hidden_dim: 512
+  context_window: 512
+  vocab_size: 50257
+  model_shell_type: standard
+  embedding_weight_tying: true
+  positional_encoding_type: rope
+trainer:
+  dropout_scheduler:
+    dropout_type: constant
+    dropout: 0
+  dataset: openwebtext
+  training:
+    trainer_type: base_trainer
+    batch_size: 24
+    gradient_accumulation_steps: 20
+    max_iters: 150000
+    lr_decay_iters: 150000
+    warmup_iters: 5000
+    eval_interval: 2000
+    log_interval: 10
+    eval_iters: 500
+    checkpoint_interval: 10000
+    run_profiler: false
+  eval:
+    - benchmarks:
+        - "winograd"
+        - "hellaswag"
+        - "arc"
+        - "mmlu"
+        - "blimp"
+      num_samples: 1000
+      evaluator: "mcq"
+    - evaluator: "prog"
+  optimizer:
+    name: nanoGPTadamW
+    lr: 0.0006
+    min_lr: 6.0e-05
+    weight_decay: 0.1
+    beta1: 0.9
+    beta2: 0.95
+    grad_clip: 1.0
+    decay_lr: true
+    warmup_iters: 5000
+  lr_scheduler:
+    name: cosine
+  dataloader:
+    name: standard
+  datasampling:
+    name: standard
+  loss_fn:
+    name: cross_entropy
+general:
+  logging:
+    wandb_log: true
+    wandb_project: SuperTinyLanguageModels
+  paths:
+    output_dir: outputs
+    data_dir: "data"
+    checkpoint_dir: checkpoints
+  seed: 489
+  device: cuda
diff --git a/configs/full_configs/shared_baseline.yaml b/configs/full_configs/shared_baseline.yaml
new file mode 100644
index 00000000..607f4db0
--- /dev/null
+++ b/configs/full_configs/shared_baseline.yaml
@@ -0,0 +1,85 @@
+model:
+  core_model:
+    core_model_type: generic
+    num_layers: 8
+    ffn:
+      ffn_type: swiglu
+      ffn_dim: 1361
+      normalization: rms_norm
+      bias: false
+    attn:
+      attn_type: generic
+      num_heads: 16
+      normalization: rms_norm
+      group_size: 4
+      bias: false
+      is_causal: true
+  embedder:
+    tokenizer_type: gpt2
+    embedding_model_type: generic
+    dataset_name: stlm
+  lm_head:
+    normalization: rms_norm
+    bias: false
+    lm_head_type: generic
+  hidden_dim: 528
+  context_window: 512
+  vocab_size: 50257
+  model_shell_type: standard
+  embedding_weight_tying: true
+  positional_encoding_type: rope
+trainer:
+  dropout_scheduler:
+    dropout_type: constant
+    dropout: 0
+  dataset: openwebtext
+  training:
+    trainer_type: base_trainer
+    batch_size: 24
+    gradient_accumulation_steps: 20
+    max_iters: 30000
+    lr_decay_iters: 30000
+    warmup_iters: 5000
+    eval_interval: 2000
+    log_interval: 10
+    eval_iters: 500
+    checkpoint_interval: 1000000000.0
+    run_profiler: false
+  eval:
+    - benchmarks:
+        - "winograd"
+        - "hellaswag"
+        - "arc"
+        - "mmlu"
+        - "blimp"
+      num_samples: 1000
+      evaluator: "mcq"
+    - evaluator: "prog"
+  optimizer:
+    name: nanoGPTadamW
+    lr: 0.0006
+    min_lr: 6.0e-05
+    weight_decay: 0.1
+    beta1: 0.9
+    beta2: 0.95
+    grad_clip: 1.0
+    decay_lr: true
+    warmup_iters: 5000
+  lr_scheduler:
+    name: cosine
+  dataloader:
+    name: standard
+  datasampling:
+    name: standard
+  loss_fn:
+    name: cross_entropy
+general:
+  logging:
+    wandb_log: true
+    wandb_project: SuperTinyLanguageModels
+  paths:
+    output_dir: outputs
+    data_dir: "data"
+    checkpoint_dir: checkpoints
+  seed: 489
+  device: cuda
diff --git a/configs/full_configs/shared_deep.yaml b/configs/full_configs/shared_deep.yaml
new file mode 100644
index 00000000..91bb002b
--- /dev/null
+++ b/configs/full_configs/shared_deep.yaml
@@ -0,0 +1,87 @@
+model:
+  k_interior_layers: 0
+  lora_rank: null
+  core_model:
+    core_model_type: weight_sharing
+    num_layers: 14
+    ffn:
+      ffn_type: swiglu
+      ffn_dim: 1568
+      normalization: rms_norm
+      bias: false
+    attn:
+      attn_type: generic
+      num_heads: 16
+      normalization: rms_norm
+      group_size: 4
+      bias: false
+      is_causal: true
+  embedder:
+    tokenizer_type: gpt2
+    embedding_model_type: generic
+    dataset_name: stlm
+  lm_head:
+    normalization: rms_norm
+    bias: false
+    lm_head_type: generic
+  hidden_dim: 608
+  context_window: 512
+  vocab_size: 50257
+  model_shell_type: standard
+  embedding_weight_tying: true
+  positional_encoding_type: rope
+trainer:
+  dropout_scheduler:
+    dropout_type: constant
+    dropout: 0
+  dataset: openwebtext
+  training:
+    trainer_type: base_trainer
+    batch_size: 24
+    gradient_accumulation_steps: 20
+    max_iters: 30000
+    lr_decay_iters: 30000
+    warmup_iters: 5000
+    eval_interval: 2000
+    log_interval: 10
+    eval_iters: 500
+    checkpoint_interval: 1000000000.0
+    run_profiler: false
+  eval:
+    - benchmarks:
+        - "winograd"
+        - "hellaswag"
+        - "arc"
+        - "mmlu"
+        - "blimp"
+      num_samples: 1000
+      evaluator: "mcq"
+    - evaluator: "prog"
+  optimizer:
+    name: nanoGPTadamW
+    lr: 0.0006
+    min_lr: 6.0e-05
+    weight_decay: 0.1
+    beta1: 0.9
+    beta2: 0.95
+    grad_clip: 1.0
+    decay_lr: true
+    warmup_iters: 5000
+  lr_scheduler:
+    name: cosine
+  dataloader:
+    name: standard
+  datasampling:
+    name: standard
+  loss_fn:
+    name: cross_entropy
+general:
+  logging:
+    wandb_log: true
+    wandb_project: SuperTinyLanguageModels
+  paths:
+    output_dir: outputs
+    data_dir: "data"
+    checkpoint_dir: checkpoints
+  seed: 489
+  device: cuda
diff --git a/configs/full_configs/shared_k_1_wide.yaml b/configs/full_configs/shared_k_1_wide.yaml
new file mode 100644
index 00000000..eed263a3
--- /dev/null
+++ b/configs/full_configs/shared_k_1_wide.yaml
@@ -0,0 +1,87 @@
+model:
+  k_interior_layers: 1
+  lora_rank: null
+  core_model:
+    core_model_type: weight_sharing
+    num_layers: 8
+    ffn:
+      ffn_type: swiglu
+      ffn_dim: 1568
+      normalization: rms_norm
+      bias: false
+    attn:
+      attn_type: generic
+      num_heads: 16
+      normalization: rms_norm
+      group_size: 4
+      bias: false
+      is_causal: true
+  embedder:
+    tokenizer_type: gpt2
+    embedding_model_type: generic
+    dataset_name: stlm
+  lm_head:
+    normalization: rms_norm
+    bias: false
+    lm_head_type: generic
+  hidden_dim: 608
+  context_window: 512
+  vocab_size: 50257
+  model_shell_type: standard
+  embedding_weight_tying: true
+  positional_encoding_type: rope
+trainer:
+  dropout_scheduler:
+    dropout_type: constant
+    dropout: 0
+  dataset: openwebtext
+  training:
+    trainer_type: base_trainer
+    batch_size: 24
+    gradient_accumulation_steps: 20
+    max_iters: 30000
+    lr_decay_iters: 30000
+    warmup_iters: 5000
+    eval_interval: 2000
+    log_interval: 10
+    eval_iters: 500
+    checkpoint_interval: 1000000000.0
+    run_profiler: false
+  eval:
+    - benchmarks:
+        - "winograd"
+        - "hellaswag"
+        - "arc"
+        - "mmlu"
+        - "blimp"
+      num_samples: 1000
+      evaluator: "mcq"
+    - evaluator: "prog"
+  optimizer:
+    name: nanoGPTadamW
+    lr: 0.0006
+    min_lr: 6.0e-05
+    weight_decay: 0.1
+    beta1: 0.9
+    beta2: 0.95
+    grad_clip: 1.0
+    decay_lr: true
+    warmup_iters: 5000
+  lr_scheduler:
+    name: cosine
+  dataloader:
+    name: standard
+  datasampling:
+    name: standard
+  loss_fn:
+    name: cross_entropy
+general:
+  logging:
+    wandb_log: true
+    wandb_project: SuperTinyLanguageModels
+  paths:
+    output_dir: outputs
+    data_dir: "data"
+    checkpoint_dir: checkpoints
+  seed: 489
+  device: cuda
diff --git a/configs/full_configs/shared_k_2_deep.yaml b/configs/full_configs/shared_k_2_deep.yaml
new file mode 100644
index 00000000..d75c5a87
--- /dev/null
+++ b/configs/full_configs/shared_k_2_deep.yaml
@@ -0,0 +1,87 @@
+model:
+  k_interior_layers: 2
+  lora_rank: null
+  core_model:
+    core_model_type: weight_sharing
+    num_layers: 12
+    ffn:
+      ffn_type: swiglu
+      ffn_dim: 1440
+      normalization: rms_norm
+      bias: false
+    attn:
+      attn_type: generic
+      num_heads: 16
+      normalization: rms_norm
+      group_size: 4
+      bias: false
+      is_causal: true
+  embedder:
+    tokenizer_type: gpt2
+    embedding_model_type: generic
+    dataset_name: stlm
+  lm_head:
+    normalization: rms_norm
+    bias: false
+    lm_head_type: generic
+  hidden_dim: 544
+  context_window: 512
+  vocab_size: 50257
+  model_shell_type: standard
+  embedding_weight_tying: true
+  positional_encoding_type: rope
+trainer:
+  dropout_scheduler:
+    dropout_type: constant
+    dropout: 0
+  dataset: openwebtext
+  training:
+    trainer_type: base_trainer
+    batch_size: 24
+    gradient_accumulation_steps: 20
+    max_iters: 30000
+    lr_decay_iters: 30000
+    warmup_iters: 5000
+    eval_interval: 2000
+    log_interval: 10
+    eval_iters: 500
+    checkpoint_interval: 1000000000.0
+    run_profiler: false
+  eval:
+    - benchmarks:
+        - "winograd"
+        - "hellaswag"
+        - "arc"
+        - "mmlu"
+        - "blimp"
+      num_samples: 1000
+      evaluator: "mcq"
+    - evaluator: "prog"
+  optimizer:
+    name: nanoGPTadamW
+    lr: 0.0006
+    min_lr: 6.0e-05
+    weight_decay: 0.1
+    beta1: 0.9
+    beta2: 0.95
+    grad_clip: 1.0
+    decay_lr: true
+    warmup_iters: 5000
+  lr_scheduler:
+    name: cosine
+  dataloader:
+    name: standard
+  datasampling:
+    name: standard
+  loss_fn:
+    name: cross_entropy
+general:
+  logging:
+    wandb_log: true
+    wandb_project: SuperTinyLanguageModels
+  paths:
+    output_dir: outputs
+    data_dir: "data"
+    checkpoint_dir: checkpoints
+  seed: 489
+  device: cuda
diff --git a/configs/full_configs/shared_k_2_wide.yaml b/configs/full_configs/shared_k_2_wide.yaml
new file mode 100644
index 00000000..f7bd51bc
--- /dev/null
+++ b/configs/full_configs/shared_k_2_wide.yaml
@@ -0,0 +1,87 @@
+model:
+  k_interior_layers: 2
+  lora_rank: null
+  core_model:
+    core_model_type: weight_sharing
+    num_layers: 8
+    ffn:
+      ffn_type: swiglu
+      ffn_dim: 1444
+      normalization: rms_norm
+      bias: false
+    attn:
+      attn_type: generic
+      num_heads: 16
+      normalization: rms_norm
+      group_size: 4
+      bias: false
+      is_causal: true
+  embedder:
+    tokenizer_type: gpt2
+    embedding_model_type: generic
+    dataset_name: stlm
+  lm_head:
+    normalization: rms_norm
+    bias: false
+    lm_head_type: generic
+  hidden_dim: 560
+  context_window: 512
+  vocab_size: 50257
+  model_shell_type: standard
+  embedding_weight_tying: true
+  positional_encoding_type: rope
+trainer:
+  dropout_scheduler:
+    dropout_type: constant
+    dropout: 0
+  dataset: openwebtext
+  training:
+    trainer_type: base_trainer
+    batch_size: 24
+    gradient_accumulation_steps: 20
+    max_iters: 30000
+    lr_decay_iters: 30000
+    warmup_iters: 5000
+    eval_interval: 2000
+    log_interval: 10
+    eval_iters: 500
+    checkpoint_interval: 1000000000.0
+    run_profiler: false
+  eval:
+    - benchmarks:
+        - "winograd"
+        - "hellaswag"
+        - "arc"
+        - "mmlu"
+        - "blimp"
+      num_samples: 1000
+      evaluator: "mcq"
+    - evaluator: "prog"
+  optimizer:
+    name: nanoGPTadamW
+    lr: 0.0006
+    min_lr: 6.0e-05
+    weight_decay: 0.1
+    beta1: 0.9
+    beta2: 0.95
+    grad_clip: 1.0
+    decay_lr: true
+    warmup_iters: 5000
+  lr_scheduler:
+    name: cosine
+  dataloader:
+    name: standard
+  datasampling:
+    name: standard
+  loss_fn:
+    name: cross_entropy
+general:
+  logging:
+    wandb_log: true
+    wandb_project: SuperTinyLanguageModels
+  paths:
+    output_dir: outputs
+    data_dir: "data"
+    checkpoint_dir: checkpoints
+  seed: 489
+  device: cuda
diff --git a/configs/full_configs/shared_r_64.yaml b/configs/full_configs/shared_r_64.yaml
new file mode 100644
index 00000000..16ea6543
--- /dev/null
+++ b/configs/full_configs/shared_r_64.yaml
@@ -0,0 +1,87 @@
+model:
+  k_interior_layers: 0
+  lora_rank: 64
+  core_model:
+    core_model_type: weight_sharing
+    num_layers: 8
+    ffn:
+      ffn_type: swiglu
+      ffn_dim: 1650
+      normalization: rms_norm
+      bias: false
+    attn:
+      attn_type: generic
+      num_heads: 16
+      normalization: rms_norm
+      group_size: 4
+      bias: false
+      is_causal: true
+  embedder:
+    tokenizer_type: gpt2
+    embedding_model_type: generic
+    dataset_name: stlm
+  lm_head:
+    normalization: rms_norm
+    bias: false
+    lm_head_type: generic
+  hidden_dim: 640
+  context_window: 512
+  vocab_size: 50257
+  model_shell_type: standard
+  embedding_weight_tying: true
+  positional_encoding_type: rope
+trainer:
+  dropout_scheduler:
+    dropout_type: constant
+    dropout: 0
+  dataset: openwebtext
+  training:
+    trainer_type: base_trainer
+    batch_size: 24
+    gradient_accumulation_steps: 20
+    max_iters: 30000
+    lr_decay_iters: 30000
+    warmup_iters: 5000
+    eval_interval: 2000
+    log_interval: 10
+    eval_iters: 500
+    checkpoint_interval: 1000000000.0
+    run_profiler: false
+  eval:
+    - benchmarks:
+        - "winograd"
+        - "hellaswag"
+        - "arc"
+        - "mmlu"
+        - "blimp"
+      num_samples: 1000
+      evaluator: "mcq"
+    - evaluator: "prog"
+  optimizer:
+    name: nanoGPTadamW
+    lr: 0.0006
+    min_lr: 6.0e-05
+    weight_decay: 0.1
+    beta1: 0.9
+    beta2: 0.95
+    grad_clip: 1.0
+    decay_lr: true
+    warmup_iters: 5000
+  lr_scheduler:
+    name: cosine
+  dataloader:
+    name: standard
+  datasampling:
+    name: standard
+  loss_fn:
+    name: cross_entropy
+general:
+  logging:
+    wandb_log: true
+    wandb_project: SuperTinyLanguageModels
+  paths:
+    output_dir: outputs
+    data_dir: "data"
+    checkpoint_dir: checkpoints
+  seed: 489
+  device: cuda
diff --git a/configs/full_configs/shared_r_64_k_1.yaml b/configs/full_configs/shared_r_64_k_1.yaml
new file mode 100644
index 00000000..3424d75f
--- /dev/null
+++ b/configs/full_configs/shared_r_64_k_1.yaml
@@ -0,0 +1,87 @@
+model:
+  k_interior_layers: 1
+  lora_rank: 64
+  core_model:
+    core_model_type: weight_sharing
+    num_layers: 8
+    ffn:
+      ffn_type: swiglu
+      ffn_dim: 1527
+      normalization: rms_norm
+      bias: false
+    attn:
+      attn_type: generic
+      num_heads: 16
+      normalization: rms_norm
+      group_size: 4
+      bias: false
+      is_causal: true
+  embedder:
+    tokenizer_type: gpt2
+    embedding_model_type: generic
+    dataset_name: stlm
+  lm_head:
+    normalization: rms_norm
+    bias: false
+    lm_head_type: generic
+  hidden_dim: 592
+  context_window: 512
+  vocab_size: 50257
+  model_shell_type: standard
+  embedding_weight_tying: true
+  positional_encoding_type: rope
+trainer:
+  dropout_scheduler:
+    dropout_type: constant
+    dropout: 0
+  dataset: openwebtext
+  training:
+    trainer_type: base_trainer
+    batch_size: 24
+    gradient_accumulation_steps: 20
+    max_iters: 30000
+    lr_decay_iters: 30000
+    warmup_iters: 5000
+    eval_interval: 2000
+    log_interval: 10
+    eval_iters: 500
+    checkpoint_interval: 1000000000.0
+    run_profiler: false
+  eval:
+    - benchmarks:
+        - "winograd"
+        - "hellaswag"
+        - "arc"
+        - "mmlu"
+        - "blimp"
+      num_samples: 1000
+      evaluator: "mcq"
+    - evaluator: "prog"
+  optimizer:
+    name: nanoGPTadamW
+    lr: 0.0006
+    min_lr: 6.0e-05
+    weight_decay: 0.1
+    beta1: 0.9
+    beta2: 0.95
+    grad_clip: 1.0
+    decay_lr: true
+    warmup_iters: 5000
+  lr_scheduler:
+    name: cosine
+  dataloader:
+    name: standard
+  datasampling:
+    name: standard
+  loss_fn:
+    name: cross_entropy
+general:
+  logging:
+    wandb_log: true
+    wandb_project: SuperTinyLanguageModels
+  paths:
+    output_dir: outputs
+    data_dir: "data"
+    checkpoint_dir: checkpoints
+  seed: 489
+  device: cuda
diff --git a/configs/full_configs/shared_r_64_k_2_long.yaml b/configs/full_configs/shared_r_64_k_2_long.yaml
new file mode 100644
index 00000000..406b6a34
--- /dev/null
+++ b/configs/full_configs/shared_r_64_k_2_long.yaml
@@ -0,0 +1,87 @@
+model:
+  k_interior_layers: 2
+  lora_rank: 64
+  core_model:
+    core_model_type: weight_sharing
+    num_layers: 12
+    ffn:
+      ffn_type: swiglu
+      ffn_dim: 1440
+      normalization: rms_norm
+      bias: false
+    attn:
+      attn_type: generic
+      num_heads: 16
+      normalization: rms_norm
+      group_size: 4
+      bias: false
+      is_causal: true
+  embedder:
+    tokenizer_type: gpt2
+    embedding_model_type: generic
+    dataset_name: stlm
+  lm_head:
+    normalization: rms_norm
+    bias: false
+    lm_head_type: generic
+  hidden_dim: 512
+  context_window: 512
+  vocab_size: 50257
+  model_shell_type: standard
+  embedding_weight_tying: true
+  positional_encoding_type: rope
+trainer:
+  dropout_scheduler:
+    dropout_type: constant
+    dropout: 0
+  dataset: openwebtext
+  training:
+    trainer_type: base_trainer
+    batch_size: 24
+    gradient_accumulation_steps: 20
+    max_iters: 30000
+    lr_decay_iters: 30000
+    warmup_iters: 5000
+    eval_interval: 2000
+    log_interval: 10
+    eval_iters: 500
+    checkpoint_interval: 1000000000.0
+    run_profiler: false
+  eval:
+    - benchmarks:
+        - "winograd"
+        - "hellaswag"
+        - "arc"
+        - "mmlu"
+        - "blimp"
+      num_samples: 1000
+      evaluator: "mcq"
+    - evaluator: "prog"
+  optimizer:
+    name: nanoGPTadamW
+    lr: 0.0006
+    min_lr: 6.0e-05
+    weight_decay: 0.1
+    beta1: 0.9
+    beta2: 0.95
+    grad_clip: 1.0
+    decay_lr: true
+    warmup_iters: 5000
+  lr_scheduler:
+    name: cosine
+  dataloader:
+    name: standard
+  datasampling:
+    name: standard
+  loss_fn:
+    name: cross_entropy
+general:
+  logging:
+    wandb_log: true
+    wandb_project: SuperTinyLanguageModels
+  paths:
+    output_dir: outputs
+    data_dir: "data"
+    checkpoint_dir: checkpoints
+  seed: 489
+  device: cuda
diff --git a/configs/full_configs/shared_r_64_long.yaml b/configs/full_configs/shared_r_64_long.yaml
new file mode 100644
index 00000000..e7d69707
--- /dev/null
+++ b/configs/full_configs/shared_r_64_long.yaml
@@ -0,0 +1,87 @@
+model:
+  k_interior_layers: 0
+  lora_rank: 64
+  core_model:
+    core_model_type: weight_sharing
+    num_layers: 12
+    ffn:
+      ffn_type: swiglu
+      ffn_dim: 1527
+      normalization: rms_norm
+      bias: false
+    attn:
+      attn_type: generic
+      num_heads: 16
+      normalization: rms_norm
+      group_size: 4
+      bias: false
+      is_causal: true
+  embedder:
+    tokenizer_type: gpt2
+    embedding_model_type: generic
+    dataset_name: stlm
+  lm_head:
+    normalization: rms_norm
+    bias: false
+    lm_head_type: generic
+  hidden_dim: 592
+  context_window: 512
+  vocab_size: 50257
+  model_shell_type: standard
+  embedding_weight_tying: true
+  positional_encoding_type: rope
+trainer:
+  dropout_scheduler:
+    dropout_type: constant
+    dropout: 0
+  dataset: openwebtext
+  training:
+    trainer_type: base_trainer
+    batch_size: 24
+    gradient_accumulation_steps: 20
+    max_iters: 30000
+    lr_decay_iters: 30000
+    warmup_iters: 5000
+    eval_interval: 2000
+    log_interval: 10
+    eval_iters: 500
+    checkpoint_interval: 1000000000.0
+    run_profiler: false
+  eval:
+    - benchmarks:
+        - "winograd"
+        - "hellaswag"
+        - "arc"
+        - "mmlu"
+        - "blimp"
+      num_samples: 1000
+      evaluator: "mcq"
+    - evaluator: "prog"
+  optimizer:
+    name: nanoGPTadamW
+    lr: 0.0006
+    min_lr: 6.0e-05
+    weight_decay: 0.1
+    beta1: 0.9
+    beta2: 0.95
+    grad_clip: 1.0
+    decay_lr: true
+    warmup_iters: 5000
+  lr_scheduler:
+    name: cosine
+  dataloader:
+    name: standard
+  datasampling:
+    name: standard
+  loss_fn:
+    name: cross_entropy
+general:
+  logging:
+    wandb_log: true
+    wandb_project: SuperTinyLanguageModels
+  paths:
+    output_dir: outputs
+    data_dir: "data"
+    checkpoint_dir: checkpoints
+  seed: 489
+  device: cuda
diff --git a/configs/full_configs/shared_untied_inputoutput.yaml b/configs/full_configs/shared_untied_inputoutput.yaml
new file mode 100644
index 00000000..c4b888d2
--- /dev/null
+++ b/configs/full_configs/shared_untied_inputoutput.yaml
@@ -0,0 +1,85 @@
+model:
+  core_model:
+    core_model_type: generic
+    num_layers: 8
+    ffn:
+      ffn_type: swiglu
+      ffn_dim: 1361
+      normalization: rms_norm
+      bias: false
+    attn:
+      attn_type: generic
+      num_heads: 16
+      normalization: rms_norm
+      group_size: 4
+      bias: false
+      is_causal: true
+  embedder:
+    tokenizer_type: gpt2
+    embedding_model_type: generic
+    dataset_name: stlm
+  lm_head:
+    normalization: rms_norm
+    bias: false
+    lm_head_type: generic
+  hidden_dim: 528
+  context_window: 512
+  vocab_size: 50257
+  model_shell_type: standard
+  embedding_weight_tying: false
+  positional_encoding_type: rope
+trainer:
+  dropout_scheduler:
+    dropout_type: constant
+    dropout: 0
+  dataset: openwebtext
+  training:
+    trainer_type: base_trainer
+    batch_size: 24
+    gradient_accumulation_steps: 20
+    max_iters: 30000
+    lr_decay_iters: 30000
+    warmup_iters: 5000
+    eval_interval: 2000
+    log_interval: 10
+    eval_iters: 500
+    checkpoint_interval: 1000000000.0
+    run_profiler: false
+  eval:
+    - benchmarks:
+        - "winograd"
+        - "hellaswag"
+        - "arc"
+        - "mmlu"
+        - "blimp"
+      num_samples: 1000
+      evaluator: "mcq"
+    - evaluator: "prog"
+  optimizer:
+    name: nanoGPTadamW
+    lr: 0.0006
+    min_lr: 6.0e-05
+    weight_decay: 0.1
+    beta1: 0.9
+    beta2: 0.95
+    grad_clip: 1.0
+    decay_lr: true
+    warmup_iters: 5000
+  lr_scheduler:
+    name: cosine
+  dataloader:
+    name: standard
+  datasampling:
+    name: standard
+  loss_fn:
+    name: cross_entropy
+general:
+  logging:
+    wandb_log: true
+    wandb_project: SuperTinyLanguageModels
+  paths:
+    output_dir: outputs
+    data_dir: "data"
+    checkpoint_dir: checkpoints
+  seed: 489
+  device: cuda
diff --git a/configs/full_configs/shared_wide.yaml b/configs/full_configs/shared_wide.yaml
new file mode 100644
index 00000000..8ccb4a2c
--- /dev/null
+++ b/configs/full_configs/shared_wide.yaml
@@ -0,0 +1,87 @@
+model:
+  k_interior_layers: 0
+  lora_rank: null
+  core_model:
+    core_model_type: weight_sharing
+    num_layers: 8
+    ffn:
+      ffn_type: swiglu
+      ffn_dim: 1733
+      normalization: rms_norm
+      bias: false
+    attn:
+      attn_type: generic
+      num_heads: 16
+      normalization: rms_norm
+      group_size: 4
+      bias: false
+      is_causal: true
+  embedder:
+    tokenizer_type: gpt2
+    embedding_model_type: generic
+    dataset_name: stlm
+  lm_head:
+    normalization: rms_norm
+    bias: false
+    lm_head_type: generic
+  hidden_dim: 672
+  context_window: 512
+  vocab_size: 50257
+  model_shell_type: standard
+  embedding_weight_tying: true
+  positional_encoding_type: rope
+trainer:
+  dropout_scheduler:
+    dropout_type: constant
+    dropout: 0
+  dataset: openwebtext
+  training:
+    trainer_type: base_trainer
+    batch_size: 24
+    gradient_accumulation_steps: 20
+    max_iters: 30000
+    lr_decay_iters: 30000
+    warmup_iters: 5000
+    eval_interval: 2000
+    log_interval: 10
+    eval_iters: 500
+    checkpoint_interval: 1000000000.0
+    run_profiler: false
+  eval:
+    - benchmarks:
+        - "winograd"
+        - "hellaswag"
+        - "arc"
+        - "mmlu"
+        - "blimp"
+      num_samples: 1000
+      evaluator: "mcq"
+    - evaluator: "prog"
+  optimizer:
+    name: nanoGPTadamW
+    lr: 0.0006
+    min_lr: 6.0e-05
+    weight_decay: 0.1
+    beta1: 0.9
+    beta2: 0.95
+    grad_clip: 1.0
+    decay_lr: true
+    warmup_iters: 5000
+  lr_scheduler:
+    name: cosine
+  dataloader:
+    name: standard
+  datasampling:
+    name: standard
+  loss_fn:
+    name: cross_entropy
+general:
+  logging:
+    wandb_log: true
+    wandb_project: SuperTinyLanguageModels
+  paths:
+    output_dir: outputs
+    data_dir: "data"
+    checkpoint_dir: checkpoints
+  seed: 489
+  device: cuda
diff --git a/configs/general/default.yaml b/configs/general/default.yaml
index 9467d1c2..9cf8c88f 100644
--- a/configs/general/default.yaml
+++ b/configs/general/default.yaml
@@ -4,7 +4,7 @@ logging:
 
 paths:
   output_dir: "outputs"
-  data_dir: "data"
+  data_dir: "data"
   checkpoint_dir: "checkpoints"
 
 seed: 489
diff --git a/configs/generate.yaml b/configs/generate.yaml
index 1a9cbf40..2d71d087 100644
--- a/configs/generate.yaml
+++ b/configs/generate.yaml
@@ -1,4 +1,4 @@
 defaults:
   - generator: baseline
-model_ckpt: "checkpoints/...pt"
+model_ckpt: "outputs/2024-09-09/09-52-58/checkpoints/ckpt_149999.pt"  # NOTE(review): looks like a run-specific local path - confirm before merging
 
diff --git a/configs/generator/baseline.yaml b/configs/generator/baseline.yaml
index a62d72b7..5af4ea73 100644
--- a/configs/generator/baseline.yaml
+++ b/configs/generator/baseline.yaml
@@ -1,4 +1,5 @@
 temperature: 0.8
-top_k: 10
+top_k: 200
 max_new_tokens: 300
-input_text: "Earth is the third planet from the Sun and the only astronomical object known to harbor life. This is enabled by Earth being an ocean world, the only one in the Solar System sustaining liquid surface water."
+# input_text: "Earth is the third planet from the Sun and the only astronomical object known to harbor life. This is enabled by Earth being an ocean world, the only one in the Solar System sustaining liquid surface water."
+input_text: "Recipe for fish and chips:"
diff --git a/models/build_models.py b/models/build_models.py
index 36236091..7eeff965 100644
--- a/models/build_models.py
+++ b/models/build_models.py
@@ -12,6 +12,7 @@
 from models.experimental.next_thought.embedding_models import HierarchicalEncoder
 from models.experimental.next_thought.model_heads import VariableLengthLatentDecoder
 from models.experimental.next_thought.core_models import BaselineCoreModel, Conv1dCoreModel
+from models.experimental import weight_sharing
 from models.model_heads import AutoregressiveLMHead
 from models.model_shell import ModelShell
 
@@ -70,7 +71,8 @@ def build_embedding_model(model_cfg):
     "generic_ffn_sharing": GenericFFNSharedTransfomer,
     "hf_core": HFTransformerCore,
     "next_thought_baseline": BaselineCoreModel,
-    "conv": Conv1dCoreModel
+    "conv": Conv1dCoreModel,
+    "weight_sharing": weight_sharing.SharedInteriorFFNLora,
 }
 
 
diff --git a/models/components/layers/attention.py b/models/components/layers/attention.py
index 41b06b37..21f94a6e 100644
--- a/models/components/layers/attention.py
+++ b/models/components/layers/attention.py
@@ -21,15 +21,16 @@ def __init__(
         group_size,
     ):
         super().__init__()
-        assert hidden_dim % num_heads == 0, "Hidden dim must be divisible by num heads"
-
+        self.head_dim = 2 * ((hidden_dim // num_heads) // 2) # make sure it's even
+        self.effective_hidden_dim = self.head_dim * num_heads # different for certain head_dims
+        self.group_dim = self.head_dim * (num_heads // group_size)  # k/v width: one head_dim per grouped (kv) head
         # key, query, value projections for all heads
         self.c_attn = torch.nn.Linear(
-            hidden_dim, hidden_dim + 2 * hidden_dim // group_size, bias=bias
+            hidden_dim, self.effective_hidden_dim + 2 * self.group_dim, bias=bias
         )
 
         # output projection
-        self.c_proj = torch.nn.Linear(hidden_dim, hidden_dim, bias=bias)
+        self.c_proj = torch.nn.Linear(self.effective_hidden_dim, hidden_dim, bias=bias)
 
         # attention dropout
         self.attn_dropout = torch.nn.Dropout()
@@ -43,7 +44,7 @@ def __init__(
         if self.use_rope:
             assert context_window % 2 == 0
             self.freqs_cis = compute_freqs_cis(
-                seq_len=context_window, head_dim=hidden_dim // num_heads
+                seq_len=context_window, head_dim=self.head_dim
             )
 
     def forward(self, x, attention_mask=None):
@@ -51,16 +52,15 @@ def forward(self, x, attention_mask=None):
         Forward pass
         """
         assert attention_mask is None, "Not implemented yet"
-        B, S, H = x.size()
+        B, S, _ = x.size()
         num_grouped_heads = self.num_heads // self.group_size
-        group_hidden_dim = H // self.group_size
 
         # calculate query, key, values for all heads in batch
         # move head forward to be the batch dim
-        q, k, v = self.c_attn(x).split([H, group_hidden_dim, group_hidden_dim], dim=-1)
-        k = k.view(B, S, num_grouped_heads, H // self.num_heads)  # (B, T, nh, hs)
-        q = q.view(B, S, self.num_heads, H // self.num_heads)  # (B, T, nh, hs)
-        v = v.view(B, S, num_grouped_heads, H // self.num_heads).transpose(
+        q, k, v = self.c_attn(x).split([self.effective_hidden_dim, self.group_dim, self.group_dim], dim=-1)
+        k = k.view(B, S, num_grouped_heads, self.head_dim)  # (B, T, nh, hs)
+        q = q.view(B, S, self.num_heads, self.head_dim)  # (B, T, nh, hs)
+        v = v.view(B, S, num_grouped_heads, self.head_dim).transpose(
             1, 2
         )  # (B, nh, T, hs)
 
@@ -86,11 +86,11 @@ def forward(self, x, attention_mask=None):
         )
         # pylint: enable=not-callable
         y = (
-            y.transpose(1, 2).contiguous().view(B, S, H)
+            y.transpose(1, 2).contiguous().view(B, S, self.effective_hidden_dim)
         )  # re-assemble all head outputs side by side
 
         # output projection
-        y = self.attn_dropout(self.c_proj(y))  # is this really necessary?
+        y = self.attn_dropout(self.c_proj(y)) # Reshape to original dim
 
         return y
 
diff --git a/models/experimental/weight_sharing.py b/models/experimental/weight_sharing.py
new file mode 100644
index 00000000..f27fb576
--- /dev/null
+++ b/models/experimental/weight_sharing.py
@@ -0,0 +1,101 @@
+import torch.nn as nn
+from models.core_models import GenericTransformer
+
+
+class LoRA(nn.Module):
+    """Adds a low-rank correction on top of a wrapped linear layer."""
+
+    def __init__(self, linear_layer, lora_rank):
+        """Wraps the linear layer with LoRA
+
+        Args:
+            linear_layer: layer whose output is corrected; its in_features
+                and out_features size the low-rank pair.
+            lora_rank: inner dimension of the low-rank U/V pair.
+        """
+        super().__init__()
+        self.linear_layer = linear_layer
+        self.lora_rank = lora_rank
+        # NOTE(review): U and V keep the nn.Linear defaults (bias=True,
+        # non-zero init), so the correction is active from step 0 and adds
+        # bias terms even if the wrapped layer has none - confirm intended.
+        self.U = nn.Linear(linear_layer.in_features, lora_rank)
+        self.V = nn.Linear(lora_rank, linear_layer.out_features)
+
+    def forward(self, x):
+        """Forward pass through the linear layer with LoRA"""
+        return self.linear_layer(x) + self.V(self.U(x))
+
+
+class SharedInteriorFFNLora(GenericTransformer):
+    """GenericTransformer with FFN weights tied across interior layers.
+
+    Layers in range(1 + k_interior_layers, num_layers - k_interior_layers)
+    reuse the nn.Linear weights of the first layer in that range. When
+    lora_rank is not None, each tied linear layer gets its own LoRA wrapper.
+    """
+
+    def __init__(self, model_cfg):
+        super().__init__(model_cfg)
+        self.k_interior_layers = model_cfg["k_interior_layers"]
+        self.lora_rank = model_cfg["lora_rank"]
+
+        # NOTE(review): the range skips 1 + k layers at the start but only
+        # k at the end - confirm the asymmetry is intended.
+        self._apply_weight_sharing_and_lora(
+            start_layer=1 + self.k_interior_layers,
+            end_layer=len(self.transformer.h) - self.k_interior_layers,
+            module_name='ffn'
+        )
+
+    def _apply_weight_sharing_and_lora(self, start_layer: int, end_layer: int, module_name: str):
+        """Tie nn.Linear weights of module_name over layers [start_layer, end_layer).
+
+        Layer start_layer provides the shared weights. Biases are not tied.
+        """
+        base_module = getattr(self.transformer.h[start_layer], module_name)
+        shared_weights = {
+            name: module.weight
+            for name, module in base_module.named_modules()
+            if isinstance(module, nn.Linear)
+        }
+
+        for i in range(start_layer, end_layer):
+            target_module = getattr(self.transformer.h[i], module_name)
+            # Snapshot first: the loop body replaces children of target_module.
+            for name, module in list(target_module.named_modules()):
+                if not isinstance(module, nn.Linear):
+                    continue
+                module.weight = shared_weights[name]
+                if self.lora_rank is None:
+                    continue
+                # named_modules() yields dotted paths for nested children, so
+                # walk to the direct parent before swapping in the wrapper.
+                *parent_path, child_name = name.split(".")
+                parent = target_module
+                for part in parent_path:
+                    parent = getattr(parent, part)
+                setattr(parent, child_name, LoRA(module, self.lora_rank))
+
+
+class SharedInteriorFFNLoraAndCProj(SharedInteriorFFNLora):
+    """SharedInteriorFFNLora that also ties attn.c_proj weights, without LoRA."""
+
+    def __init__(self, model_cfg):
+        super().__init__(model_cfg)
+
+        # now strictly share the c_proj weights w/o lora
+        start_layer = 1 + self.k_interior_layers
+        end_layer = len(self.transformer.h) - self.k_interior_layers
+        # The source layer is loop-invariant, so collect its weights once.
+        base_cproj = self.transformer.h[start_layer].attn.c_proj
+        shared_cproj_weights = {
+            name: module.weight
+            for name, module in base_cproj.named_modules()
+            if isinstance(module, nn.Linear)
+        }
+        for i in range(start_layer, end_layer):
+            target_cproj = self.transformer.h[i].attn.c_proj
+            for name, module in target_cproj.named_modules():
+                if isinstance(module, nn.Linear):
+                    module.weight = shared_cproj_weights[name]
diff --git a/pre_reports/weight_tying_prereport.pdf b/pre_reports/weight_tying_prereport.pdf
new file mode 100644
index 00000000..0c16c7d7
Binary files /dev/null and b/pre_reports/weight_tying_prereport.pdf differ