Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 13 additions & 12 deletions configs/full_configs/baseline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,29 +32,30 @@ trainer:
dropout_scheduler:
dropout_type: constant
dropout: 0.1
dataset: openwebtext
training:
dataset: openwebtext
trainer_type: base_trainer
batch_size: 24
gradient_accumulation_steps: 20
max_iters: 30000
lr_decay_iters: 30000
warmup_iters: 5000
eval_interval: 2000
log_interval: 10
eval_iters: 500
checkpoint_interval: 1000000000.0
run_profiler: false
eval:
- benchmarks:
- "winograd"
- "hellaswag"
- "arc"
- "mmlu"
- "blimp"
num_samples: 1000
evaluator: "mcq"
- evaluator: "prog"
eval_iters: 500
eval_interval: 2000
evaluators:
- benchmarks:
- "winograd"
- "hellaswag"
- "arc"
- "mmlu"
- "blimp"
num_samples: 1000
evaluator: "mcq"
- evaluator: "prog"
optimizer:
name: nanoGPTadamW
lr: 0.0006
Expand Down
191 changes: 129 additions & 62 deletions models/build_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,24 @@
core model, lm head and the model shell.
"""

from models import model_shell
from models.cast_configs import ModelShellConfigMap
from models.core_models import GenericFFNSharedTransfomer, GenericTransformer
from models.embedding_models import GenericEmbedder
from models.experimental.byte_level.byte_model_shell import (
ByteModelShell,
ByteShellConfig,
)
from models.experimental.byte_level.embedding_model import ByteLevelEmbedder
from models.experimental.byte_level.model_heads import ByteLevelDecoder
from models.experimental.byte_level.byte_model_shell import ByteModelShell
from models.experimental.hugging_face import HFEmbedder, HFLMHead, HFTransformerCore
from models.experimental.next_thought.core_models import (
BaselineCoreModel,
Conv1dCoreModel,
)
from models.experimental.next_thought.embedding_models import HierarchicalEncoder
from models.experimental.next_thought.model_heads import VariableLengthLatentDecoder
from models.experimental.next_thought.core_models import BaselineCoreModel, Conv1dCoreModel
from models.model_heads import AutoregressiveLMHead
from models.model_shell import ModelShell


def build_model(model_cfg=None, checkpoint=None):
Expand Down Expand Up @@ -44,125 +51,185 @@ def build_model(model_cfg=None, checkpoint=None):
return model


# Registry mapping each ``embedding_model_type`` config value to the embedder
# class that implements it.
EMBEDDER_DICT = {
    "generic": GenericEmbedder,
    "hf_embedder": HFEmbedder,
    "nt_embedder": HierarchicalEncoder,
    "byte_embedder": ByteLevelEmbedder,
}

# Previous name of the registry, kept as an alias so references to it still
# resolve.
EMBEDDING_MODEL_DICT = EMBEDDER_DICT


def build_embedding_model(model_cfg):
def build_embedding_model(
model_cfg: ModelShellConfigMap | ByteModelShell,
) -> GenericEmbedder:
"""
Given the embedding model config, build it.
Args:
model_cfg: model_cfg
Returns:
embedding_model: embedding_model_instance
"""
return EMBEDDING_MODEL_DICT[model_cfg["embedder"]["embedding_model_type"]](
model_cfg=model_cfg
)
embedder_cfg = model_cfg.embedding_model
embedder_type = model_cfg.embedding_model.embedding_model_type
match embedder_type:
case "byte_embedder":
return ByteLevelEmbedder(
embedder_cfg=embedder_cfg,
byte_cfg=model_cfg,
hidden_dim=model_cfg.hidden_dim,
vocab_size=model_cfg.vocab_size,
)
case "hf_embedder":
return HFEmbedder(model_cfg=embedder_cfg)
case "nt_embedder":
return HierarchicalEncoder(
embedder_cfg=embedder_cfg,
vocab_size=model_cfg.vocab_size,
hidden_dim=model_cfg.hidden_dim,
context_window=model_cfg.context_window,
positional_encoding_type=model_cfg.positional_encoding_type,
)
case "generic":
return GenericEmbedder(
embedder_cfg=embedder_cfg,
vocab_size=model_cfg.vocab_size,
hidden_dim=model_cfg.hidden_dim,
context_window=model_cfg.context_window,
positional_encoding_type=model_cfg.positional_encoding_type,
)


# Registry mapping each ``core_model_type`` config value to the core model
# class that implements it.
CORE_MODEL_DICT = {
    "generic": GenericTransformer,
    "generic_ffn_sharing": GenericFFNSharedTransfomer,
    "hf_core": HFTransformerCore,
    "next_thought_baseline": BaselineCoreModel,
    "conv": Conv1dCoreModel,
}


def build_core_model(model_cfg):
def build_core_model(
model_cfg: model_shell.ModelShellConfig | ByteModelShell,
) -> GenericTransformer:
"""
Given the core model config, build it.
Args:
model_cfg: model_cfg
Returns:
core_model: core_model_instance
"""
return CORE_MODEL_DICT[model_cfg["core_model"]["core_model_type"]](
model_cfg=model_cfg
)
core_model_cfg = model_cfg.core_model
core_model_type = core_model_cfg.core_model_type
match core_model_type:
case "generic":
return GenericTransformer(
hidden_dim=model_cfg.hidden_dim,
context_window=model_cfg.context_window,
core_model_cfg=core_model_cfg,
)
case "generic_ffn_sharing":
return GenericFFNSharedTransfomer(
hidden_dim=model_cfg.hidden_dim,
context_window=model_cfg.context_window,
core_model_cfg=core_model_cfg,
)
case "hf_core":
return HFTransformerCore(model_cfg=core_model_cfg)
case "next_thought_baseline":
return BaselineCoreModel(model_cfg=core_model_cfg)
case "conv":
return Conv1dCoreModel()


# Registry mapping each ``model_head_type`` config value to the model head
# class that implements it.
MODEL_HEAD_DICT = {
    "hf_lm_head": HFLMHead,
    "nt_lm_head": VariableLengthLatentDecoder,
    "byte_lm_head": ByteLevelDecoder,
    "generic": AutoregressiveLMHead,
}


def build_model_head(model_cfg, embedding_model=None):
def build_model_head(model_cfg: ModelShellConfigMap, embedding_model: GenericEmbedder):
"""
Given the lm head config, build it.
Given the model head config, build it.
Args:
model_cfg: model_cfg
embedding_model: embedding_model_instance
Returns:
model_head: model_head_instance
"""
return MODEL_HEAD_DICT[model_cfg["lm_head"]["lm_head_type"]](
model_cfg=model_cfg,
embedding_model=embedding_model
)


MODEL_SHELL_DICT = {
"standard": ModelShell,
"byte_shell": ByteModelShell
}


def build_model_shell(model_cfg, embedding_model, core_model, model_head):
model_head_cfg = model_cfg.model_head
model_head_type = model_head_cfg.model_head_type
match model_head_type:
case "hf_lm_head":
return HFLMHead(model_cfg=model_head_cfg)
case "nt_lm_head":
return VariableLengthLatentDecoder(
model_cfg=model_head_cfg, embedding_model=embedding_model
)
case "generic":
return AutoregressiveLMHead(
hidden_dim=model_cfg.hidden_dim,
vocab_size=model_cfg.vocab_size,
lm_head_cfg=model_head_cfg,
)


# Registry mapping each ``model_shell_type`` config value to its shell class.
MODEL_SHELL_DICT = {"standard": model_shell.ModelShell, "byte_shell": ByteModelShell}


def build_model_shell(
model_cfg: model_shell.ModelShellConfig | ByteShellConfig,
):
"""
Given the model shell config, build it.
Args:
model_cfg: model_cfg
Returns:
model_shell: model_shell_instance
"""
return MODEL_SHELL_DICT[model_cfg["model_shell_type"]](
embedding_model=embedding_model, core_model=core_model, model_head=model_head
)
model_shell_type = model_cfg.model_shell_type
# build the embedding model
embedding_model = build_embedding_model(model_cfg=model_cfg)

# build the core model
core_model = build_core_model(model_cfg=model_cfg)

def initialize_model(model_cfg):
# build the model head
model_head = build_model_head(model_cfg=model_cfg, embedding_model=embedding_model)
match model_shell_type:
case "standard":
return model_shell.ModelShell(
embedding_model=embedding_model,
core_model=core_model,
model_head=model_head,
)
case "byte_shell":
return ByteModelShell(
embedding_model=embedding_model,
core_model=core_model,
model_head=model_head,
)


def initialize_model(model_dict: dict):
    """
    Initialize the model given the configuration.

    Args:
        model_dict: raw model configuration; its entries are passed as
            keyword arguments to ModelShellConfigMap
    Returns:
        model: model_instance
    """
    # NOTE(review): the config is always parsed as ModelShellConfigMap --
    # confirm byte-shell configs are meant to be built through another path.
    model_cfg = ModelShellConfigMap(**model_dict)

    # build the model shell (embedding model, core model and model head)
    model = build_model_shell(model_cfg=model_cfg)

    # check if embedding model weights are to be shared with the model head
    if model_cfg.embedding_weight_tying:
        # share the weights between the token embeddings and the final
        # logit layer, following: https://paperswithcode.com/method/weight-tying
        model.embedding_model.token_embedder.weight = model.model_head.linear.weight

    return model
30 changes: 30 additions & 0 deletions models/cast_configs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
"""Pseudo Configs for Model Building/Casting"""

from typing import Literal

from models import core_models, embedding_models, model_heads, model_shell
from models.experimental import hugging_face
from models.experimental.next_thought import core_models as nt_core_models
from models.experimental.next_thought import embedding_models as nt_embedding_models


class ModelShellConfigMap(model_shell.ModelShellConfig):
    """Configuration for the standard model shell and its three sub-models."""

    # annotated so that "standard" is the only shell type for this config
    model_shell_type: Literal["standard"]

    # per-component settings; each field accepts any of the listed config types
    core_model: (
        hugging_face.CoreModelConfig
        | nt_core_models.CoreModelConfig
        | core_models.CoreModelConfig
    )
    embedding_model: (
        nt_embedding_models.HierarchicalEncoderConfig
        | hugging_face.HFEmbedderConfig
        | embedding_models.GenericEmbedderConfig
    )
    model_head: hugging_face.HFLMHeadConfig | model_heads.LMHeadConfig

    # settings declared at the shell level rather than on a single component
    hidden_dim: int
    context_window: int
    vocab_size: int
    embedding_weight_tying: bool
    positional_encoding_type: str
Loading