Closed
207 commits
5a95cab
apo initial
wensun Jun 18, 2025
d89cd64
.
wensun Jun 18, 2025
1d925d2
vllm fix
jdchang1 Jun 18, 2025
e9dbad9
critic free
jdchang1 Jun 18, 2025
934e864
cleanup
jdchang1 Jun 20, 2025
b7cb34e
local run
jdchang1 Jun 20, 2025
28014a9
added timeout in rlvr utils
wensun Jun 21, 2025
ed4e554
added timeout in rlvr utils
wensun Jun 21, 2025
de6400c
.
wensun Jun 21, 2025
6b4c385
fix some comments issues in local yaml
wensun Jun 23, 2025
b2ae469
.
wensun Jun 23, 2025
4b51437
Update compose_rl/algorithms/online/callback.py
jdchang1 Jun 23, 2025
d4ffc59
clean comments
jdchang1 Jun 23, 2025
1fd7e2d
comment cleanup
jdchang1 Jun 23, 2025
c769975
cleanup of callback
jdchang1 Jun 23, 2025
0d275ec
undo rlvr update
jdchang1 Jun 23, 2025
d6c24d0
Merge branch 'main' into wensun/apo
jdchang1 Jun 23, 2025
202b0a2
fix
jdchang1 Jun 23, 2025
fc2f835
Merge branch 'wensun/apo' of https://github.com/databricks/compose-rl…
jdchang1 Jun 23, 2025
d6d81ca
apo offline initial implementaiton
wensun Jun 29, 2025
dd04bff
apo offline initial implementaiton
wensun Jun 29, 2025
01d28c7
updates
wensun Jun 30, 2025
1c62fdb
add vstar entries
wensun Jun 30, 2025
a5fa391
.
wensun Jun 30, 2025
725e3a2
.
wensun Jun 30, 2025
a0b0209
.
wensun Jun 30, 2025
39377ae
.
wensun Jun 30, 2025
ce6d18a
added ope estimator in apo
wensun Jun 30, 2025
4f0456e
.
wensun Jun 30, 2025
06ceb47
.
wensun Jun 30, 2025
ea680a5
add multimodal support for models
jdchang1 Jul 1, 2025
a7e86cc
linting
jdchang1 Jul 1, 2025
08952f7
multimodal handling for gemma3
jdchang1 Jul 1, 2025
bcc273b
lint
jdchang1 Jul 1, 2025
949a8bd
bce loss
wensun Jul 2, 2025
0368483
bce
wensun Jul 2, 2025
634e151
fix multimodal preference loading
jdchang1 Jul 3, 2025
d35ed90
fix collator
jdchang1 Jul 3, 2025
8bea0e3
debug
jdchang1 Jul 3, 2025
b305fde
debug
jdchang1 Jul 3, 2025
b631ccc
debug
jdchang1 Jul 3, 2025
fabb4a8
change pixel values from being bytes to ndarray or pil
jdchang1 Jul 3, 2025
e2880b6
.
wensun Jul 4, 2025
b040daa
.
wensun Jul 4, 2025
62769ed
.
wensun Jul 4, 2025
649a113
.
wensun Jul 4, 2025
d49d2a2
.
wensun Jul 4, 2025
e7be405
.
wensun Jul 4, 2025
9a2cc92
.
wensun Jul 5, 2025
ece883a
.
wensun Jul 5, 2025
0701099
merge main
jdchang1 Jul 7, 2025
4e469e2
added single offline data track
jdchang1 Jul 7, 2025
3059bc4
cleanup
jdchang1 Jul 7, 2025
a3a2e71
bce fix
jdchang1 Jul 7, 2025
dd578ec
merge conflict
jdchang1 Jul 7, 2025
9ecb7d0
linkt
jdchang1 Jul 7, 2025
829b732
update
jdchang1 Jul 8, 2025
d687b02
update
jdchang1 Jul 8, 2025
22ae655
bug fix for attention mask
jdchang1 Jul 8, 2025
a112dc3
sequence len typo fix
jdchang1 Jul 8, 2025
d6bd975
update
jdchang1 Jul 8, 2025
b38736a
update
jdchang1 Jul 8, 2025
15aabfd
fix
jdchang1 Jul 8, 2025
6672e5d
shape fix
jdchang1 Jul 8, 2025
3d57b0a
fix
jdchang1 Jul 8, 2025
7b04254
fix
jdchang1 Jul 8, 2025
11df0ad
fix
jdchang1 Jul 8, 2025
36d0120
pre-commit
jdchang1 Jul 8, 2025
3a33026
precommit isort
jdchang1 Jul 8, 2025
b06871b
Merge branch 'main' into wensun/offline_apo
jdchang1 Jul 8, 2025
d44610e
support ndarray typing
jdchang1 Jul 8, 2025
aed3e3a
Merge branch 'wensun/offline_apo' of https://github.com/databricks/co…
jdchang1 Jul 8, 2025
e0e015a
support ndarray typing
jdchang1 Jul 8, 2025
2c1c0d4
PIL image support
jdchang1 Jul 8, 2025
15ca2c6
numpy support bug fix
jdchang1 Jul 8, 2025
375d257
pixel values into lists
jdchang1 Jul 8, 2025
ebf43fa
logging fix
jdchang1 Jul 8, 2025
daae699
fix
jdchang1 Jul 8, 2025
82e2f1d
change back to tensor
jdchang1 Jul 8, 2025
5a1891d
Update offline_data.py
jdchang1 Jul 8, 2025
6af3c99
Update offline_data.py
jdchang1 Jul 8, 2025
dbeec36
nd array for pixel_values
jdchang1 Jul 8, 2025
e521e72
fix
jdchang1 Jul 8, 2025
83d1e3b
fix
jdchang1 Jul 8, 2025
b85e4f4
Update compose_rl/algorithms/offline/model_methods.py
jdchang1 Jul 8, 2025
26c0ab8
Update compose_rl/data/offline_data.py
jdchang1 Jul 8, 2025
7650f55
Merge branch 'main' into jchang/multimodal
jdchang1 Jul 9, 2025
99165b0
remove vstar from preference dat
jdchang1 Jul 9, 2025
7cd408e
merge multimodal into
jdchang1 Jul 9, 2025
b68ca4b
add pixel values to forward pass
jdchang1 Jul 9, 2025
ba9e453
offline single stream multimodal support
jdchang1 Jul 9, 2025
06d6a2f
temperature scaling
jdchang1 Jul 9, 2025
c2da2f6
add processor to Dataset to ensure proper HF checkpointing
jdchang1 Jul 9, 2025
952a976
quick test for shape
wensun Jul 10, 2025
4f5fb8b
convert it back
wensun Jul 10, 2025
ddd0ad7
add another metric to track batch advantage
wensun Jul 11, 2025
7eb0951
.
wensun Jul 11, 2025
0a3b8cd
.
wensun Jul 11, 2025
5d0c53f
quick fix
jdchang1 Jul 11, 2025
9b4b9b9
Merge branch 'wensun/offline_apo' of https://github.com/databricks/co…
jdchang1 Jul 11, 2025
144d80e
added bce
wensun Jul 12, 2025
ba2d9e8
temporally just set bce to be true
wensun Jul 13, 2025
b94b594
.
wensun Jul 13, 2025
94359e6
.
wensun Jul 13, 2025
844470d
computation
jdchang1 Jul 13, 2025
8e25f81
model compatibility
jdchang1 Jul 13, 2025
171b0f1
multistep computation
jdchang1 Jul 13, 2025
26ec9a4
multimodal fix
jdchang1 Jul 14, 2025
331ae42
batch advantage computation fix
jdchang1 Jul 14, 2025
2997533
qrpo
jdchang1 Jul 14, 2025
aaef5f8
Update model_methods.py
jdchang1 Jul 14, 2025
204df50
fix
jdchang1 Jul 14, 2025
bc61d5b
Merge branch 'wensun/offline_apo' of https://github.com/databricks/co…
jdchang1 Jul 14, 2025
2e50aff
fix
jdchang1 Jul 14, 2025
fdf8ec2
remove the need for preprocessed image inputs
jdchang1 Jul 17, 2025
43b4324
add token type ids back in
jdchang1 Jul 17, 2025
da38dee
initial set up for dealing with multi turn dataformat
wensun Jul 20, 2025
51912d3
.
wensun Jul 20, 2025
a48305b
.
wensun Jul 20, 2025
1e9b855
.
wensun Jul 20, 2025
0ac2d29
.
wensun Jul 20, 2025
00304a0
.
wensun Jul 21, 2025
2078427
.
wensun Jul 21, 2025
1019e74
.
wensun Jul 21, 2025
254ca00
.
wensun Jul 21, 2025
a449286
.
wensun Jul 21, 2025
6637844
.
wensun Jul 21, 2025
dddc4b3
working version of optimizing tool call in traj level
wensun Jul 21, 2025
9e4395d
exclude gold
jdchang1 Jul 21, 2025
c6d9c64
fixed a seq len bug
wensun Jul 22, 2025
3f08ed1
.
wensun Jul 22, 2025
7292533
.
wensun Jul 22, 2025
49bcae8
testing new collator
wensun Jul 22, 2025
3942181
testing new collator
wensun Jul 22, 2025
0842b91
changed it back to the original collator, test for the new one looks …
wensun Jul 22, 2025
74a8842
.
wensun Jul 22, 2025
76df519
.
wensun Jul 22, 2025
076994f
.
wensun Jul 23, 2025
3e9c9b0
.
wensun Jul 23, 2025
9656354
.
wensun Jul 23, 2025
5cce120
.
wensun Jul 23, 2025
0812e79
tested adding bonus
wensun Jul 23, 2025
5b803e4
.
wensun Jul 23, 2025
9739fc2
.
wensun Jul 23, 2025
a099f47
.
wensun Jul 23, 2025
6e7befe
.
wensun Jul 23, 2025
096bb76
.
wensun Jul 23, 2025
32931f5
.
wensun Jul 23, 2025
f28bea7
.
wensun Jul 23, 2025
7561a25
.
wensun Jul 23, 2025
f593881
.
wensun Jul 23, 2025
19531c6
.
wensun Jul 23, 2025
091a3ce
.
wensun Jul 23, 2025
0dfccdd
.
wensun Jul 23, 2025
e8d04b4
.
wensun Jul 23, 2025
8f5063c
.
wensun Jul 23, 2025
917723c
.
wensun Jul 23, 2025
bf40318
.
wensun Jul 23, 2025
f40974b
rgb
jdchang1 Jul 24, 2025
64ca7a7
tracking sequence entropies in offline rl forward
abaheti95 Jul 27, 2025
5e9c33b
excluding gold
jdchang1 Jul 28, 2025
9268ede
merging branch
jdchang1 Jul 28, 2025
21695dd
.
wensun Aug 20, 2025
b67fbe6
add sequence id
jdchang1 Aug 26, 2025
de597f4
pad token id fix
wensun Aug 28, 2025
e9e3378
Merge branch 'wensun/offline_apo' of github.com:databricks/compose-rl…
wensun Aug 28, 2025
517d2b9
add reference model loading
wensun Sep 3, 2025
4093fbb
.
wensun Sep 3, 2025
f246ea9
.
wensun Sep 3, 2025
5d14693
.
wensun Sep 3, 2025
306dbd9
first version of a unified dataloader
wensun Sep 5, 2025
d0d7c95
dataloader
wensun Sep 5, 2025
d522ea6
tokenizer
wensun Sep 5, 2025
fca744d
.
wensun Sep 5, 2025
e9470b4
test tool loading and message loading
wensun Sep 6, 2025
0511355
.
wensun Sep 6, 2025
23576c4
.
wensun Sep 6, 2025
acda258
.
wensun Sep 6, 2025
ffce2f2
.
wensun Sep 6, 2025
2bfe9c4
.
wensun Sep 6, 2025
09c3ae4
.
wensun Sep 6, 2025
230f5f4
.
wensun Sep 6, 2025
dddbdd4
.
wensun Sep 6, 2025
969d182
.
wensun Sep 6, 2025
2f96310
new data formt
wensun Sep 6, 2025
d6f31a7
test tool
wensun Sep 6, 2025
6cdb619
test tool
wensun Sep 6, 2025
2382630
test tool
wensun Sep 6, 2025
42cf662
message working
wensun Sep 6, 2025
2642698
implemented flatten messages
wensun Sep 6, 2025
efe8965
test non flatten message
wensun Sep 6, 2025
46a74ce
test non flatten message
wensun Sep 6, 2025
0819de1
first version of value function integration using flatten_message
wensun Sep 9, 2025
8cbbc40
testing value function integration
wensun Sep 9, 2025
11efde5
.
wensun Sep 9, 2025
2bc485c
.
wensun Sep 9, 2025
003ec73
.
wensun Sep 9, 2025
91914db
.
wensun Sep 9, 2025
f5e99c3
testing apo critic with reward and vstar
wensun Sep 9, 2025
f176ec5
repupose the apo critic code for the standard apo code just for testing
wensun Sep 9, 2025
485abf8
.
wensun Sep 9, 2025
983fc88
.
wensun Sep 9, 2025
bf1e617
.
wensun Sep 9, 2025
8172e57
.
wensun Sep 10, 2025
6566e0e
use reward at the last step
wensun Sep 10, 2025
03af246
use vstar at the beginning
wensun Sep 11, 2025
0379786
Add value learning into offline RL (#151)
Owen-Oertell Sep 11, 2025
10 changes: 9 additions & 1 deletion compose_rl/algorithms/offline/__init__.py
@@ -1,14 +1,22 @@
 # Copyright 2024 MosaicML ComposeRL authors
 # SPDX-License-Identifier: Apache-2.0
 
-from compose_rl.algorithms.offline.callback import ReferencePolicyCallback
+from compose_rl.algorithms.offline.callback import (
+    PairwiseReferencePolicyCallback,
+    ReferencePolicyCallback,
+)
 from compose_rl.algorithms.offline.model import (
+    ComposerHFOfflinePolicyLM,
     ComposerHFPairwiseOfflinePolicyLM,
+    ComposerMPTOfflinePolicyLM,
     ComposerMPTPairwiseOfflinePolicyLM,
 )
 
 __all__ = [
+    'ComposerHFOfflinePolicyLM',
+    'ComposerMPTOfflinePolicyLM',
     'ComposerMPTPairwiseOfflinePolicyLM',
     'ComposerHFPairwiseOfflinePolicyLM',
+    'PairwiseReferencePolicyCallback',
     'ReferencePolicyCallback',
 ]
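The expanded `__all__` above gates what `from compose_rl.algorithms.offline import *` re-exports. A minimal stdlib sketch of that mechanism (the `offline_demo` module and its classes are illustrative stand-ins, not Compose RL code):

```python
import sys
import types

# Build a tiny stand-in module to show how __all__ gates star-imports,
# mirroring the expanded export list in the __init__.py diff above.
mod = types.ModuleType("offline_demo")
exec(
    "class ComposerHFOfflinePolicyLM: pass\n"
    "class _PrivateHelper: pass\n"
    "__all__ = ['ComposerHFOfflinePolicyLM']\n",
    mod.__dict__,
)
sys.modules["offline_demo"] = mod

ns = {}
exec("from offline_demo import *", ns)

# Only names listed in __all__ are re-exported; _PrivateHelper stays hidden.
print(sorted(k for k in ns if not k.startswith("__")))  # ['ComposerHFOfflinePolicyLM']
```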
66 changes: 65 additions & 1 deletion compose_rl/algorithms/offline/callback.py
@@ -31,14 +31,20 @@ def __init__(
     ):
         self.train_config = copy.deepcopy(train_config)
         self.reference_model = None
+        self.auxiliary_model = None  # Add auxiliary model
 
     def after_load(self, state: State, logger: Logger) -> None:
-        model_config = self.train_config['model']
+        #model_config = self.train_config['model']
+        model_config = self.train_config['variables']['reference_model']
         init_context = process_init_device(
             model_config,
             self.train_config.get('fsdp_config'),
         )
         name = model_config.pop('name')
+        print("################################################")
+        print("reference model config:")
+        print(model_config)
+        print("################################################")
         self.reference_model = build_composer_model(
             name=name,
             cfg=model_config,
@@ -75,6 +81,64 @@ def after_load(self, state: State, logger: Logger) -> None:
             callbacks=load_checkpoint_callbacks,
         )
 
+        # Load auxiliary model following the same pattern
+        if 'auxiliary_model' in self.train_config.get('variables', {}):
+            aux_model_config = self.train_config['variables']['auxiliary_model']
+            aux_init_context = process_init_device(
+                aux_model_config,
+                self.train_config.get('fsdp_config'),
+            )
+            aux_name = aux_model_config.pop('name')
+            print("################################################")
+            print("auxiliary model config:")
+            print(aux_model_config)
+            print("################################################")
+            self.auxiliary_model = build_composer_model(
+                name=aux_name,
+                cfg=aux_model_config,
+                tokenizer=state.model.tokenizer,  # type: ignore
+                init_context=aux_init_context,
+                master_weights_dtype=aux_model_config.get('master_weights_dtype', None),
+            )
+
+            # Load auxiliary model with same checkpoint loading procedure
+            _ = Trainer(
+                model=self.auxiliary_model,
+                parallelism_config={'fsdp': state.fsdp_config},
+                precision=state.precision,
+                load_weights_only=True,
+                load_strict_model_weights=False,
+                load_path=original_load_path,
+                callbacks=load_checkpoint_callbacks,
+            )
+
+    def before_forward(self, state: State, logger: Logger) -> Optional[int]:
+        # Before every batch we need to do a forwards pass over the reference model
+        with get_precision_context(state.precision):
+            with torch.no_grad():
+                assert self.reference_model is not None
+                reference_outputs = self.reference_model(state.batch)
+                state.batch.update({
+                    'ref_logp': reference_outputs['policy_logp'],
+                    'ref_token_policy_logps': reference_outputs['token_policy_logps'],
+                })
+
+                # Add auxiliary model forward pass if available
+                if self.auxiliary_model is not None:
+                    auxiliary_outputs = self.auxiliary_model(state.batch)
+                    state.batch.update({
+                        'aux_first_num_bins_logits': auxiliary_outputs['first_num_bins_logits'],
+                    })
+
+
+class PairwiseReferencePolicyCallback(ReferencePolicyCallback):
+    """Callback to run reference policy in pairwise offline RL.
+
+    Args:
+        train_config (dict): Training config passed to callback via foundry train.py as
+            callback is registered under callbacks_with_config registry.
+    """
+
     def before_forward(self, state: State, logger: Logger) -> Optional[int]:
         # Before every batch we need to do a forwards pass over the reference model
         with get_precision_context(state.precision):
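The `before_forward` hook in `callback.py` runs the frozen reference (and optional auxiliary) model under `no_grad` and stashes the outputs into the batch, so the policy's loss can read `ref_logp` without re-running the reference. A dependency-free sketch of that pattern (the toy `reference_model` and the `token_probs` field are illustrative stand-ins, not Compose RL APIs):

```python
import math

def reference_model(batch):
    # Stand-in for the frozen reference model built in after_load: turn
    # per-token probabilities into per-token and summed sequence log-probs.
    token_logps = [math.log(p) for p in batch["token_probs"]]
    return {"policy_logp": sum(token_logps),
            "token_policy_logps": token_logps}

def before_forward(batch):
    """Mimic ReferencePolicyCallback.before_forward: enrich the batch in
    place so the downstream loss can consume the reference outputs."""
    ref_out = reference_model(batch)
    batch.update({
        "ref_logp": ref_out["policy_logp"],
        "ref_token_policy_logps": ref_out["token_policy_logps"],
    })
    return batch

batch = {"token_probs": [0.5, 0.25]}
before_forward(batch)
print(round(batch["ref_logp"], 4))  # log(0.5) + log(0.25) = log(0.125) -> -2.0794
```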
130 changes: 129 additions & 1 deletion compose_rl/algorithms/offline/model.py
@@ -1,7 +1,7 @@
 # Copyright 2024 MosaicML ComposeRL authors
 # SPDX-License-Identifier: Apache-2.0
 
-"""Pairwise Offline RL Composer Implementation."""
+"""Offline RL Composer Implementation."""
 
 from __future__ import annotations
 
@@ -12,9 +12,19 @@
 from llmfoundry.models import ComposerHFCausalLM, ComposerMPTCausalLM
 from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
 from transformers.modeling_outputs import CausalLMOutputWithPast
+from compose_rl.metrics.offline_learning_metrics import (
+    TestEstimatedRewardLossMetric,
+    TestImplicitRewardsLossMetric,
+    TestKLDivergenceLossMetric,
+    TestSequenceEntropiesLossMetric,
+    TestTotalLossMetric,
+)
 
 from compose_rl.algorithms.offline.model_methods import (
+    RegressionOfflineEnum,
     PairwiseOfflineEnum,
+    offline_forward,
+    offline_loss,
     pairwise_offline_forward,
     pairwise_offline_loss,
 )
@@ -24,6 +34,124 @@
 log = logging.getLogger(__name__)
 
 
+class ComposerMPTOfflinePolicyLM(ComposerMPTCausalLM):
+    """MPT model wrapper for offline rl model."""
+
+    def __init__(
+        self,
+        loss_type: str = 'apo',
+        beta1: float = 0.5,
+        beta2: float = 0.1,
+        eta: float = 0.5,
+        multistep: bool = False,
+        average_log_prob: bool = False,
+        temperature: float = 1.0,
+        **kwargs: Any,
+    ):
+        self.loss_type = RegressionOfflineEnum(loss_type)
+        self.beta1 = beta1
+        self.beta2 = beta2
+        self.eta = eta
+        self.multistep = multistep
+        self.average_log_prob = average_log_prob
+        self.temperature = temperature
+
+        super().__init__(**kwargs)
+        self.train_metrics = None  # DPOLM does not support eval_forward
+
+    def forward(self, batch: MutableMapping) -> dict[str, torch.Tensor]:
+        assert self.tokenizer is not None
+        return offline_forward(
+            model=self.model,
+            batch=batch,
+            average_log_prob=self.average_log_prob,
+            policy_model_config=self.config,
+        )
+
+    def eval_forward(
+        self,
+        batch: MutableMapping,
+        outputs: CausalLMOutputWithPast,
+    ) -> None:
+        raise ValueError('Eval forward is not implemented for ComposerDPOLM.')
+
+    def loss(self, outputs: CausalLMOutputWithPast,
+             batch: Mapping) -> dict[str, torch.Tensor]:
+        return offline_loss(
+            outputs=outputs,
+            batch=batch,
+            loss_type=self.loss_type,
+            beta1=self.beta1,
+            beta2=self.beta2,
+            eta=self.eta,
+            multistep=self.multistep,
+        )
+
+
+class ComposerHFOfflinePolicyLM(ComposerHFCausalLM):
+    """HF class wrapper for offline rl model."""
+
+    def __init__(
+        self,
+        loss_type: str = 'apo',
+        beta1: float = 0.5,
+        beta2: float = 0.1,
+        eta: float = 0.5,
+        multistep: bool = False,
+        average_log_prob: bool = False,
+        temperature: float = 1.0,
+        num_bins: int = 1,  # Add num_bins parameter
+        distributional_value_learning: bool = True,
+        **kwargs: Any,
+    ):
+        self.loss_type = RegressionOfflineEnum(loss_type)
+        self.beta1 = beta1
+        self.beta2 = beta2
+        self.eta = eta
+        self.multistep = multistep
+        self.average_log_prob = average_log_prob
+        self.temperature = temperature
+        self.num_bins = num_bins  # Store num_bins
+        self.distributional_value_learning = distributional_value_learning
+
+        super().__init__(**kwargs)
+        self.train_metrics = None  # DPOLM does not support eval_forward
+        self.val_metrics = {
+            metric.__class__.__name__: metric for metric in [
+                TestEstimatedRewardLossMetric(),
+                TestImplicitRewardsLossMetric(),
+                TestKLDivergenceLossMetric(),
+                TestSequenceEntropiesLossMetric(),
+                TestTotalLossMetric(),
+            ]
+        }
+
+    def forward(self, batch: MutableMapping) -> dict[str, torch.Tensor]:
+        assert self.tokenizer is not None
+        return offline_forward(
+            model=self.model,
+            batch=batch,
+            average_log_prob=self.average_log_prob,
+            temperature=self.temperature,
+            num_bins=self.num_bins,  # Pass num_bins to offline_forward
+        )
+
+    def eval_forward(
+        self,
+        batch: MutableMapping,
+        outputs: CausalLMOutputWithPast | None = None,
+    ) -> dict[str, torch.Tensor]:
+        with torch.no_grad():
+            fwd = self.forward(batch)
+            loss = self.loss(fwd, batch)
+
+        return loss
+
+    def loss(self, outputs: CausalLMOutputWithPast,
+             batch: Mapping) -> dict[str, torch.Tensor]:
+        return offline_loss(
+            outputs=outputs,
+            batch=batch,
+            loss_type=self.loss_type,
+            beta1=self.beta1,
+            beta2=self.beta2,
+            eta=self.eta,
+            multistep=self.multistep,
+            distributional_value_learning=self.distributional_value_learning,
+        )
+
+
 class ComposerMPTPairwiseOfflinePolicyLM(ComposerMPTCausalLM):
     """MPT model wrapper for DPO model."""
 
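The actual `offline_loss` lives in `model_methods.py` and is not part of this diff, so the following is only a hypothetical illustration of the quantities the new metric names suggest (implicit rewards, KL divergence) and how `ref_logp` from the callback would feed them; the function names and the squared-error form are assumptions, not Compose RL's implementation:

```python
def implicit_reward(policy_logp: float, ref_logp: float, beta: float) -> float:
    # DPO-style implicit reward: a scaled log-ratio of policy vs. reference.
    return beta * (policy_logp - ref_logp)

def k1_kl_estimate(token_logps: list, ref_token_logps: list) -> float:
    # Per-sequence KL(policy || ref) via the naive k1 estimator:
    # the mean of per-token log-ratios.
    diffs = [p - r for p, r in zip(token_logps, ref_token_logps)]
    return sum(diffs) / len(diffs)

def regression_loss(policy_logp: float, ref_logp: float,
                    reward: float, vstar: float, beta1: float) -> float:
    # Hypothetical regression-style objective: pull the implicit reward
    # toward the advantage (reward - vstar), echoing the commit trail
    # ("add vstar entries", "added bce"). The squared error is an assumption.
    return (implicit_reward(policy_logp, ref_logp, beta1) - (reward - vstar)) ** 2

print(implicit_reward(-1.0, -2.0, 0.5))  # 0.5 * 1.0 = 0.5
print(regression_loss(-1.0, -2.0, reward=1.0, vstar=0.75, beta1=0.5))  # (0.5 - 0.25)**2 = 0.0625
```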