Goreg12345 · Ky-Ng · Apr 29, 2026 · Apr 29, 2026 · Apr 29, 2026 · Apr 29, 2026
diff --git a/config/molt-5090.yaml b/config/molt-5090.yaml
@@ -0,0 +1,137 @@
+# Single-GPU configuration for RTX 5090 (32 GB VRAM, 31 GB /dev/shm).
+# Derived from molt.yaml with the following classes of changes:
+#   1. All components moved to cuda:0 (only one GPU available).
+#   2. buffer_size shrunk so the activation ring buffer fits in /dev/shm.
+
+seed_everything: 42
+
+trainer:
+  max_steps: 20_000  # 20_000 steps x data batch_size 1000 = 20M tokens
+  val_check_interval: 1000000000  # effectively disables validation: the replacement-model metric doesn't make sense for a single layer
+  limit_val_batches: 1
+  check_val_every_n_epoch: null
+  enable_checkpointing: false  # We use custom end-of-training checkpoint
+  num_sanity_val_steps: 0  # Can't run replacement model before standardizers are initialized
+  accelerator: "gpu"
+  devices: [0]  # Only one GPU on this machine
+  accumulate_grad_batches: 1
+  logger:
+    class_path: lightning.pytorch.loggers.WandbLogger
+    init_args:
+      project: "debug-molt"
+      name: "molt-5090-0_0005-1550-128k-jumprelu-float32-20M"
+      save_dir: "./wandb"
+  callbacks:
+    - class_path: crosslayer_transcoder.utils.callbacks.EndOfTrainingCheckpointCallback
+      init_args:
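+        checkpoint_dir: "checkpoints"  # NOTE(review): shared default dir; molt-5090_20M_tokens_0_00001.yaml uses checkpoints/lam_0_00001 — confirm runs cannot overwrite each other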
+
+model:
+  class_path: crosslayer_transcoder.model.clt_lightning.MoltModule
+  init_args:
+    model:
+      class_path: crosslayer_transcoder.model.molt.Molt
+      init_args:
+        d_acts: 768
+        N: 50
+        nonlinearity:
+          class_path: crosslayer_transcoder.model.jumprelu.JumpReLU
+          init_args:
+            theta: 0.03
+            bandwidth: 1.0
+            n_layers: 1
+            d_features: 1550
+
+        input_standardizer:
+          class_path: crosslayer_transcoder.model.standardize.DimensionwiseInputStandardizer
+          init_args:
+            n_layers: 12
+            activation_dim: 768
+
+        output_standardizer:
+          class_path: crosslayer_transcoder.model.standardize.DimensionwiseOutputStandardizer
+          init_args:
+            n_layers: 12
+            activation_dim: 768
+
+    # Validation is gated to never fire (val_check_interval=1e9), so this metric
+    # would never run anyway.
+    replacement_model: null
+
+    dead_features:
+      class_path: crosslayer_transcoder.metrics.dead_features.DeadFeatures
+      init_args:
+        n_features: 120
+        n_layers: 12
+        return_per_layer: true
+        return_log_freqs: true
+        return_neuron_indices: true
+
+    learning_rate: 2e-4
+    compile: true
+    lr_decay_step: 1000000000
+    lr_decay_factor: 0.1
+
+    lambda_sparsity: 0.0005
+    c_sparsity: 100
+    use_tanh: true
+
+    # NOTE: For MoltModule the two compute_dead_features* settings below are
+    # inert. MOLT's dead-feature path is hardcoded — it logs `metrics/dead_features`
+    # every step via update_dead_features(gate) regardless of these flags. The
+    # DeadFeatures torchmetrics instance configured above is also unused for MOLT.
+    # TODO: wire MoltModule.training_step into the configurable path.
+    compute_dead_features: true
+    compute_dead_features_every: 500
+
+data:
+  class_path: crosslayer_transcoder.data.datamodule.ActivationDataModule
+  init_args:
+    # Buffer settings
+    # 350k * 12 layers * 2 (in/out) * 768 dim * 4 bytes (float32) ≈ 25.8 GB (≈ 24.0 GiB) — fits in /dev/shm (31 GB cap).
+    # Original config used 1_000_000 which would need ~74 GB.
+    buffer_size: 350_000
+    n_in_out: 2
+    n_layers: 12
+    activation_dim: 768
+    dtype: "float32"
+    max_batch_size: 50000
+
+    model_name: "openai-community/gpt2"
+    model_dtype: "float32"
+
+    # Dataset settings
+    dataset_name: "Skylion007/openwebtext"
+    dataset_split: "train"
+    max_sequence_length: 1024
+
+    generation_batch_size: 10
+    refresh_interval: 0.1
+
+    shared_memory_name: "activation_buffer"
+    timeout_seconds: 30
+
+    batch_size: 1000
+    num_workers: 10
+    prefetch_factor: 2
+    shuffle: true
+    persistent_workers: true
+    pin_memory: true
+
+    minimum_fill_threshold: 0.01
+
+    use_shared_memory: true
+
+    device_map: "cuda:0"  # was cuda:0 already — unchanged
+    deployment_policy: "gpu_only"
+
+    wandb_logging:
+      enabled: true
+      project: "debug-molt"
+      group: null
+      run_name: "data-generator"
+      tags: ["data-generation"]
+      save_dir: "./wandb"
+      log_interval: 5.0
+
+ckpt_path: null
diff --git a/config/molt-5090_20M_tokens_0_00001.yaml b/config/molt-5090_20M_tokens_0_00001.yaml
@@ -0,0 +1,137 @@
+# Single-GPU configuration for RTX 5090 (32 GB VRAM, 31 GB /dev/shm).
+# Derived from molt.yaml with the following classes of changes:
+#   1. All components moved to cuda:0 (only one GPU available).
+#   2. buffer_size shrunk so the activation ring buffer fits in /dev/shm.
+
+seed_everything: 42
+
+trainer:
+  max_steps: 20_000  # 20_000 steps x data batch_size 1000 = 20M tokens
+  val_check_interval: 1000000000  # effectively disables validation: the replacement-model metric doesn't make sense for a single layer
+  limit_val_batches: 1
+  check_val_every_n_epoch: null
+  enable_checkpointing: false  # We use custom end-of-training checkpoint
+  num_sanity_val_steps: 0  # Can't run replacement model before standardizers are initialized
+  accelerator: "gpu"
+  devices: [0]  # Only one GPU on this machine
+  accumulate_grad_batches: 1
+  logger:
+    class_path: lightning.pytorch.loggers.WandbLogger
+    init_args:
+      project: "debug-molt"
+      name: "molt-5090-0_00001-1550-128k-jumprelu-float32-20M"
+      save_dir: "./wandb"
+  callbacks:
+    - class_path: crosslayer_transcoder.utils.callbacks.EndOfTrainingCheckpointCallback
+      init_args:
+        checkpoint_dir: "checkpoints/lam_0_00001"
+
+model:
+  class_path: crosslayer_transcoder.model.clt_lightning.MoltModule
+  init_args:
+    model:
+      class_path: crosslayer_transcoder.model.molt.Molt
+      init_args:
+        d_acts: 768
+        N: 50
+        nonlinearity:
+          class_path: crosslayer_transcoder.model.jumprelu.JumpReLU
+          init_args:
+            theta: 0.03
+            bandwidth: 1.0
+            n_layers: 1
+            d_features: 1550
+
+        input_standardizer:
+          class_path: crosslayer_transcoder.model.standardize.DimensionwiseInputStandardizer
+          init_args:
+            n_layers: 12
+            activation_dim: 768
+
+        output_standardizer:
+          class_path: crosslayer_transcoder.model.standardize.DimensionwiseOutputStandardizer
+          init_args:
+            n_layers: 12
+            activation_dim: 768
+
+    # Validation is gated to never fire (val_check_interval=1e9), so this metric
+    # would never run anyway.
+    replacement_model: null
+
+    dead_features:
+      class_path: crosslayer_transcoder.metrics.dead_features.DeadFeatures
+      init_args:
+        n_features: 120
+        n_layers: 12
+        return_per_layer: true
+        return_log_freqs: true
+        return_neuron_indices: true
+
+    learning_rate: 2e-4
+    compile: true
+    lr_decay_step: 1000000000
+    lr_decay_factor: 0.1
+
+    lambda_sparsity: 0.00001
+    c_sparsity: 100
+    use_tanh: true
+
+    # NOTE: For MoltModule the two compute_dead_features* settings below are
+    # inert. MOLT's dead-feature path is hardcoded — it logs `metrics/dead_features`
+    # every step via update_dead_features(gate) regardless of these flags. The
+    # DeadFeatures torchmetrics instance configured above is also unused for MOLT.
+    # TODO: wire MoltModule.training_step into the configurable path.
+    compute_dead_features: true
+    compute_dead_features_every: 500
+
+data:
+  class_path: crosslayer_transcoder.data.datamodule.ActivationDataModule
+  init_args:
+    # Buffer settings
+    # 350k * 12 layers * 2 (in/out) * 768 dim * 4 bytes (float32) ≈ 25.8 GB (≈ 24.0 GiB) — fits in /dev/shm (31 GB cap).
+    # Original config used 1_000_000 which would need ~74 GB.
+    buffer_size: 350_000
+    n_in_out: 2
+    n_layers: 12
+    activation_dim: 768
+    dtype: "float32"
+    max_batch_size: 50000
+
+    model_name: "openai-community/gpt2"
+    model_dtype: "float32"
+
+    # Dataset settings
+    dataset_name: "Skylion007/openwebtext"
+    dataset_split: "train"
+    max_sequence_length: 1024
+
+    generation_batch_size: 10
+    refresh_interval: 0.1
+
+    shared_memory_name: "activation_buffer"
+    timeout_seconds: 30
+
+    batch_size: 1000
+    num_workers: 10
+    prefetch_factor: 2
+    shuffle: true
+    persistent_workers: true
+    pin_memory: true
+
+    minimum_fill_threshold: 0.01
+
+    use_shared_memory: true
+
+    device_map: "cuda:0"  # was cuda:0 already — unchanged
+    deployment_policy: "gpu_only"
+
+    wandb_logging:
+      enabled: true
+      project: "debug-molt"
+      group: null
+      run_name: "data-generator"
+      tags: ["data-generation"]
+      save_dir: "./wandb"
+      log_interval: 5.0
+
+ckpt_path: null