Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
137 changes: 137 additions & 0 deletions config/molt-5090.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
# Single-GPU configuration for RTX 5090 (32 GB VRAM, 31 GB /dev/shm).
# Derived from molt.yaml with the following classes of changes:
# 1. All components moved to cuda:0 (only one GPU available).
# 2. buffer_size shrunk so the activation ring buffer fits in /dev/shm.

# Fixed random seed for the run (presumably consumed by the CLI's
# seed_everything option — confirm).
seed_everything: 42

# Trainer settings: step budget, validation gating, W&B logger and the
# end-of-training checkpoint callback.
trainer:
# Step budget. The 20M-token figure equals 20_000 steps x batch_size 1000
# (data section) if one sample is one token — TODO confirm.
max_steps: 20_000 # 20M tokens
# The huge interval keeps validation from ever firing within max_steps.
val_check_interval: 1000000000 # replacement model for single layer doesn't make sense
limit_val_batches: 1
check_val_every_n_epoch: null
enable_checkpointing: false # We use custom end-of-training checkpoint
num_sanity_val_steps: 0 # Can't run replacement model before standardizers are initialized
accelerator: "gpu"
devices: [0] # Only one GPU on this machine
accumulate_grad_batches: 1
logger:
class_path: lightning.pytorch.loggers.WandbLogger
init_args:
project: "debug-molt"
# Run name appears to encode lambda_sparsity (0_0005), d_features (1550),
# the nonlinearity, dtype and token budget — keep in sync when those change.
name: "molt-5090-0_0005-1550-128k-jumprelu-float32-20M"
save_dir: "./wandb"
callbacks:
- class_path: crosslayer_transcoder.utils.callbacks.EndOfTrainingCheckpointCallback
init_args:
# NOTE(review): shared top-level directory; runs with a different
# lambda_sparsity may want their own subdirectory so checkpoints are not
# overwritten — confirm how the callback names its files.
checkpoint_dir: "checkpoints"

# Module under training: the Molt model plus its input/output standardizers,
# metric objects and the optimisation / sparsity hyperparameters.
model:
class_path: crosslayer_transcoder.model.clt_lightning.MoltModule
init_args:
model:
class_path: crosslayer_transcoder.model.molt.Molt
init_args:
# Same value as activation_dim (768) in the standardizers and data section.
d_acts: 768
N: 50
nonlinearity:
class_path: crosslayer_transcoder.model.jumprelu.JumpReLU
init_args:
theta: 0.03
bandwidth: 1.0
# NOTE(review): n_layers is 1 here but 12 in the standardizers and the data
# buffer — presumably intentional for a single-layer model; confirm.
n_layers: 1
# NOTE(review): 1550 = 31 x N (N: 50) — presumably derived from N; confirm
# before changing either value independently.
d_features: 1550

input_standardizer:
class_path: crosslayer_transcoder.model.standardize.DimensionwiseInputStandardizer
init_args:
n_layers: 12
activation_dim: 768

output_standardizer:
class_path: crosslayer_transcoder.model.standardize.DimensionwiseOutputStandardizer
init_args:
n_layers: 12
activation_dim: 768

# Validation is gated to never fire (val_check_interval=1e9), so this metric
# would never run anyway.
replacement_model: null

# NOTE(review): n_features (120) does not match d_features (1550); per the
# NOTE further down this metric instance is unused for MOLT, so the mismatch
# is presumably harmless — confirm.
dead_features:
class_path: crosslayer_transcoder.metrics.dead_features.DeadFeatures
init_args:
n_features: 120
n_layers: 12
return_per_layer: true
return_log_freqs: true
return_neuron_indices: true

# NOTE(review): `2e-4` has no decimal point; strict YAML 1.1 loaders (e.g.
# plain PyYAML) read such scalars as strings. Fine if the config loader
# coerces it to float — confirm.
learning_rate: 2e-4
compile: true
# lr_decay_step is far larger than max_steps (20_000), so the decay
# presumably never triggers in this run.
lr_decay_step: 1000000000
lr_decay_factor: 0.1

# Sparsity-penalty settings; the lambda value matches the "0_0005" in the
# W&B run name.
lambda_sparsity: 0.0005
c_sparsity: 100
use_tanh: true

# NOTE: For MoltModule these settings are inert. MOLT's dead-feature path
# is hardcoded — it logs `metrics/dead_features` every step via
# update_dead_features(gate) regardless of these flags. The DeadFeatures
# torchmetrics instance configured above is also unused for MOLT.
# TODO: wire MoltModule.training_step into the configurable path.
compute_dead_features: true
compute_dead_features_every: 500

# Data module: generates activations from the model named below and serves
# them to the trainer through a shared-memory buffer.
data:
class_path: crosslayer_transcoder.data.datamodule.ActivationDataModule
init_args:
# Buffer settings
# 350k * 12 layers * 2 (in/out) * 768 dim * 4 bytes ≈ 25.8 GB — fits in /dev/shm (31 GB cap).
# (25.8 GB is decimal: 25_804_800_000 bytes, i.e. about 24.0 GiB.)
# Original config used 1_000_000 which would need ~74 GB.
# (~74 GB is about 68.7 GiB.)
buffer_size: 350_000
n_in_out: 2
n_layers: 12
activation_dim: 768
# 4 bytes per element, as assumed in the size estimate above.
dtype: "float32"
max_batch_size: 50000

# Model whose activations are collected.
model_name: "openai-community/gpt2"
model_dtype: "float32"

# Dataset settings
dataset_name: "Skylion007/openwebtext"
dataset_split: "train"
max_sequence_length: 1024

generation_batch_size: 10
refresh_interval: 0.1

# NOTE(review): fixed segment name; two runs on the same machine would
# presumably share/collide on this buffer — confirm before running in parallel.
shared_memory_name: "activation_buffer"
timeout_seconds: 30

# DataLoader settings. batch_size 1000 x max_steps 20_000 gives the
# "20M tokens" quoted in the trainer section.
batch_size: 1000
num_workers: 10
prefetch_factor: 2
shuffle: true
persistent_workers: true
pin_memory: true

# Presumably the fraction of the buffer that must be filled before batches
# are served — confirm against ActivationDataModule.
minimum_fill_threshold: 0.01

use_shared_memory: true

device_map: "cuda:0" # was cuda:0 already — unchanged
deployment_policy: "gpu_only"

# W&B settings for the data-generation side, logged under its own run name
# in the same project as the trainer logger.
wandb_logging:
enabled: true
project: "debug-molt"
group: null
run_name: "data-generator"
tags: ["data-generation"]
save_dir: "./wandb"
log_interval: 5.0

# No checkpoint path is set (presumably means training starts from scratch
# rather than resuming — confirm).
ckpt_path: null
137 changes: 137 additions & 0 deletions config/molt-5090_20M_tokens_0_00001.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
# Single-GPU configuration for RTX 5090 (32 GB VRAM, 31 GB /dev/shm).
# Derived from molt.yaml with the following classes of changes:
# 1. All components moved to cuda:0 (only one GPU available).
# 2. buffer_size shrunk so the activation ring buffer fits in /dev/shm.

# Fixed random seed for the run (presumably consumed by the CLI's
# seed_everything option — confirm).
seed_everything: 42

# Trainer settings: step budget, validation gating, W&B logger and the
# end-of-training checkpoint callback.
trainer:
# Step budget. The 20M-token figure equals 20_000 steps x batch_size 1000
# (data section) if one sample is one token — TODO confirm.
max_steps: 20_000 # 20M tokens
# The huge interval keeps validation from ever firing within max_steps.
val_check_interval: 1000000000 # replacement model for single layer doesn't make sense
limit_val_batches: 1
check_val_every_n_epoch: null
enable_checkpointing: false # We use custom end-of-training checkpoint
num_sanity_val_steps: 0 # Can't run replacement model before standardizers are initialized
accelerator: "gpu"
devices: [0] # Only one GPU on this machine
accumulate_grad_batches: 1
logger:
class_path: lightning.pytorch.loggers.WandbLogger
init_args:
project: "debug-molt"
# Run name appears to encode lambda_sparsity (0_00001), d_features (1550),
# the nonlinearity, dtype and token budget — keep in sync when those change.
name: "molt-5090-0_00001-1550-128k-jumprelu-float32-20M"
save_dir: "./wandb"
callbacks:
- class_path: crosslayer_transcoder.utils.callbacks.EndOfTrainingCheckpointCallback
init_args:
# Per-lambda subdirectory (matches lambda_sparsity: 0.00001), presumably so
# this run's checkpoint does not overwrite those of other runs — confirm.
checkpoint_dir: "checkpoints/lam_0_00001"

# Module under training: the Molt model plus its input/output standardizers,
# metric objects and the optimisation / sparsity hyperparameters.
model:
class_path: crosslayer_transcoder.model.clt_lightning.MoltModule
init_args:
model:
class_path: crosslayer_transcoder.model.molt.Molt
init_args:
# Same value as activation_dim (768) in the standardizers and data section.
d_acts: 768
N: 50
nonlinearity:
class_path: crosslayer_transcoder.model.jumprelu.JumpReLU
init_args:
theta: 0.03
bandwidth: 1.0
# NOTE(review): n_layers is 1 here but 12 in the standardizers and the data
# buffer — presumably intentional for a single-layer model; confirm.
n_layers: 1
# NOTE(review): 1550 = 31 x N (N: 50) — presumably derived from N; confirm
# before changing either value independently.
d_features: 1550

input_standardizer:
class_path: crosslayer_transcoder.model.standardize.DimensionwiseInputStandardizer
init_args:
n_layers: 12
activation_dim: 768

output_standardizer:
class_path: crosslayer_transcoder.model.standardize.DimensionwiseOutputStandardizer
init_args:
n_layers: 12
activation_dim: 768

# Validation is gated to never fire (val_check_interval=1e9), so this metric
# would never run anyway.
replacement_model: null

# NOTE(review): n_features (120) does not match d_features (1550); per the
# NOTE further down this metric instance is unused for MOLT, so the mismatch
# is presumably harmless — confirm.
dead_features:
class_path: crosslayer_transcoder.metrics.dead_features.DeadFeatures
init_args:
n_features: 120
n_layers: 12
return_per_layer: true
return_log_freqs: true
return_neuron_indices: true

# NOTE(review): `2e-4` has no decimal point; strict YAML 1.1 loaders (e.g.
# plain PyYAML) read such scalars as strings. Fine if the config loader
# coerces it to float — confirm.
learning_rate: 2e-4
compile: true
# lr_decay_step is far larger than max_steps (20_000), so the decay
# presumably never triggers in this run.
lr_decay_step: 1000000000
lr_decay_factor: 0.1

# Sparsity-penalty settings; the lambda value matches the "0_00001" in the
# W&B run name and the checkpoint directory.
lambda_sparsity: 0.00001
c_sparsity: 100
use_tanh: true

# NOTE: For MoltModule these settings are inert. MOLT's dead-feature path
# is hardcoded — it logs `metrics/dead_features` every step via
# update_dead_features(gate) regardless of these flags. The DeadFeatures
# torchmetrics instance configured above is also unused for MOLT.
# TODO: wire MoltModule.training_step into the configurable path.
compute_dead_features: true
compute_dead_features_every: 500

# Data module: generates activations from the model named below and serves
# them to the trainer through a shared-memory buffer.
data:
class_path: crosslayer_transcoder.data.datamodule.ActivationDataModule
init_args:
# Buffer settings
# 350k * 12 layers * 2 (in/out) * 768 dim * 4 bytes ≈ 25.8 GB — fits in /dev/shm (31 GB cap).
# (25.8 GB is decimal: 25_804_800_000 bytes, i.e. about 24.0 GiB.)
# Original config used 1_000_000 which would need ~74 GB.
# (~74 GB is about 68.7 GiB.)
buffer_size: 350_000
n_in_out: 2
n_layers: 12
activation_dim: 768
# 4 bytes per element, as assumed in the size estimate above.
dtype: "float32"
max_batch_size: 50000

# Model whose activations are collected.
model_name: "openai-community/gpt2"
model_dtype: "float32"

# Dataset settings
dataset_name: "Skylion007/openwebtext"
dataset_split: "train"
max_sequence_length: 1024

generation_batch_size: 10
refresh_interval: 0.1

# NOTE(review): fixed segment name; two runs on the same machine would
# presumably share/collide on this buffer — confirm before running in parallel.
shared_memory_name: "activation_buffer"
timeout_seconds: 30

# DataLoader settings. batch_size 1000 x max_steps 20_000 gives the
# "20M tokens" quoted in the trainer section.
batch_size: 1000
num_workers: 10
prefetch_factor: 2
shuffle: true
persistent_workers: true
pin_memory: true

# Presumably the fraction of the buffer that must be filled before batches
# are served — confirm against ActivationDataModule.
minimum_fill_threshold: 0.01

use_shared_memory: true

device_map: "cuda:0" # was cuda:0 already — unchanged
deployment_policy: "gpu_only"

# W&B settings for the data-generation side, logged under its own run name
# in the same project as the trainer logger.
wandb_logging:
enabled: true
project: "debug-molt"
group: null
run_name: "data-generator"
tags: ["data-generation"]
save_dir: "./wandb"
log_interval: 5.0

# No checkpoint path is set (presumably means training starts from scratch
# rather than resuming — confirm).
ckpt_path: null
Loading
Loading