Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 37 additions & 4 deletions astraflow/dataflow/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,14 @@ def _create_rollout_dataloader(self, agent_config: AgentConfig) -> Any:
"""
ds_cfg = agent_config.rollout_dataset
tokenizer = self._load_tokenizer(agent_config)
dataset = _create_dataset_from_config(ds_cfg, tokenizer)
# For offline_dir derivation, the rollout's logical name is
# ``dataset_name`` if specified, else the dataset_fn module name
# (e.g. ``deepscaler`` from
# ``astraflow.dataflow.dataset.deepscaler:get_deepscaler_rl_dataset``).
name = ds_cfg.get("dataset_name") or _module_basename(ds_cfg.get("dataset_fn", ""))
dataset = _create_dataset_from_config(
ds_cfg, tokenizer, data_root=agent_config.data_root, name=name,
)

batch_size = ds_cfg.get("batch_size", 1)
return _create_dataloader(dataset, batch_size=batch_size)
Expand Down Expand Up @@ -294,7 +301,9 @@ def _create_eval_datasets(
f"and no legacy eval_workflow_specs fallback is available"
)

dataset = _create_dataset_from_config(ds_cfg, tokenizer)
dataset = _create_dataset_from_config(
ds_cfg, tokenizer, data_root=agent_config.data_root, name=name,
)
eval_datasets[name] = (dataset, repeat, wf)

return eval_datasets
Expand Down Expand Up @@ -1542,13 +1551,30 @@ def _import_function(import_path: str) -> Any:
return getattr(module, func_name)


def _create_dataset_from_config(ds_cfg: dict[str, Any], tokenizer: Any) -> Any:
def _module_basename(dataset_fn_path: str) -> str | None:
"""Return the last module component of a ``module.path:fn`` import path."""
module_path, _, _ = dataset_fn_path.rpartition(":")
if not module_path:
return None
return module_path.rsplit(".", 1)[-1]


def _create_dataset_from_config(
ds_cfg: dict[str, Any],
tokenizer: Any,
data_root: str | None = None,
name: str | None = None,
) -> Any:
"""Create a dataset from a config dict using ``dataset_fn``.

The ``dataset_fn`` field is a Python import path like
``"astraflow.dataflow.dataset.deepscaler:get_deepscaler_rl_dataset"``.
Extra fields in ``ds_cfg`` are forwarded as kwargs when supported by
the target dataset function.

If ``data_root`` is set and ``ds_cfg`` does not specify ``offline_dir``,
one is auto-derived as ``f"{data_root}/{name}"`` — making it easy to
flip a recipe between online and offline by setting a single env var.
"""
dataset_fn_path = ds_cfg.get("dataset_fn")
if dataset_fn_path is None:
Expand All @@ -1564,6 +1590,13 @@ def _create_dataset_from_config(ds_cfg: dict[str, Any], tokenizer: Any) -> Any:
}
kwargs.setdefault("tokenizer", tokenizer)

if data_root and name and "offline_dir" not in kwargs:
kwargs["offline_dir"] = f"{data_root}/{name}"
logger.info(
"Auto-derived offline_dir for dataset %r: %s",
name, kwargs["offline_dir"],
)

sig = inspect.signature(dataset_fn)
accepts_var_kwargs = any(
p.kind == inspect.Parameter.VAR_KEYWORD for p in sig.parameters.values()
Expand All @@ -1572,7 +1605,7 @@ def _create_dataset_from_config(ds_cfg: dict[str, Any], tokenizer: Any) -> Any:
return dataset_fn(**kwargs)

filtered_kwargs = {
name: kwargs[name] for name in sig.parameters if name in kwargs
pname: kwargs[pname] for pname in sig.parameters if pname in kwargs
}
return dataset_fn(**filtered_kwargs)

Expand Down
13 changes: 13 additions & 0 deletions astraflow/dataflow/service_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,19 @@ class AgentConfig:
tokenizer_path: str | None = None
"""Path to tokenizer (HuggingFace model name or local path)."""

data_root: str | None = None
"""Root directory for pre-downloaded datasets (offline mode).

When set, every entry in ``rollout_dataset`` and ``eval_datasets``
that does not already specify ``offline_dir`` gets one auto-derived
as ``f"{data_root}/{name}"`` — where ``name`` is the dict key for
eval datasets, and the value of ``dataset_name`` (falling back to
the dataset_fn module name) for the rollout dataset.

Use ``examples/math/offline/download_math_datasets.py`` to populate
this directory.
"""

rollout_dataset: dict[str, Any] | None = None
"""Dataset config for rollout data acquisition.

Expand Down
1 change: 1 addition & 0 deletions docs/en/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ on distributed GPU clusters.
:caption: Recipes

recipes/math
recipes/math-offline
recipes/code
recipes/multi-agent
recipes/agentbench
Expand Down
74 changes: 74 additions & 0 deletions docs/en/recipes/math-offline.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# Math (Offline)

Run the math RL recipe on a node with **no internet access** by pre-downloading every training and evaluation dataset to a local directory.

**Recipe**: [`examples/math/offline/qwen3-8b-m2po-full-offline/`](https://github.com/Infini-AI-Lab/astraflow/tree/main/examples/math/offline/qwen3-8b-m2po-full-offline)

**Downloader**: [`examples/math/offline/download_math_datasets.py`](https://github.com/Infini-AI-Lab/astraflow/tree/main/examples/math/offline/download_math_datasets.py)

This is the same Qwen3-8B / M2PO / TCP recipe as [Math](math.md), with one difference: at startup the AstraFlow service loads every dataset from disk instead of fetching from the HuggingFace Hub.

## 1. One-time prep — download datasets

From the repo root:

```bash
python examples/math/offline/download_math_datasets.py --root data-data/math
```

This writes 8 dataset directories under `data-data/math/` (~400 MB total) plus a `MANIFEST.json`:

| Directory | HF source | Split | Use |
|-----------------|---------------------------------------------|-------|---------|
| `deepscaler` | `agentica-org/DeepScaleR-Preview-Dataset` | train | rollout |
| `dapo_filter` | `aaabiao/dapo_filter` | train | rollout |
| `aime24` | `HuggingFaceH4/aime_2024` | train | eval |
| `aime25` | `math-ai/aime25` | test | eval |
| `amc` | `rawsh/2024_AMC12` | train | eval |
| `math500` | `HuggingFaceH4/MATH-500` | test | eval |
| `minerva` | `math-ai/minervamath` | test | eval |
| `olympiadbench` | `math-ai/olympiadbench` | test | eval |

Re-running is idempotent (skips populated dirs). Useful flags:

- `--force` — re-download even if a directory exists
- `--only deepscaler,aime24` — partial subset
- `--verify` — skip download; just load each from disk and assert non-empty

## 2. Run training

```bash
bash examples/math/offline/qwen3-8b-m2po-full-offline/scripts/run_qwen3-8b-m2po-full-offline.sh
```

You can confirm the offline path is active by looking for these lines in the AstraFlow service log:

```text
Auto-derived offline_dir for dataset 'deepscaler': data-data/math/deepscaler
Loading DeepScaleR dataset from offline path: data-data/math/deepscaler
Auto-derived offline_dir for dataset 'aime24': data-data/math/aime24
... (same for aime25, amc, minerva, math500)
```

## How it works

The recipe's `experiment.yaml` sets a single field under `dataflow`:

```yaml
dataflow:
data_root: data-data/math
```

At startup `astraflow.dataflow.service` walks every entry in `rollout_dataset` and `eval_datasets`; for each one that does not already specify `offline_dir`, it auto-derives `offline_dir = f"{data_root}/{name}"`. The `name` is:

- the **dict key** for eval datasets (`aime24`, `aime25`, `amc`, `minerva`, `math500`)
- for the rollout dataset, the value of **`dataset_name`** if set, otherwise the **`dataset_fn` module basename** (`deepscaler` from `astraflow.dataflow.dataset.deepscaler:get_deepscaler_rl_dataset`)

The downloader uses the same naming convention, so the two sides stay in sync. To opt a single dataset out — e.g. point one eval at a different snapshot — just set `offline_dir:` explicitly on that entry; explicit values always win.

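To convert any other recipe to offline mode, add the same `dataflow.data_root` field; no other changes are required, provided each dataset function accepts an `offline_dir` argument (otherwise the derived value is not passed to it) and the directories under `data_root` follow the naming convention above.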

## Caveats

- **Model and tokenizer weights are *not* covered** by the dataset downloader. `model_path` / `tokenizer_path` still point at `Qwen/Qwen3-8B` and resolve via the HuggingFace cache. For a fully air-gapped run, pre-fetch them with `huggingface-cli download Qwen/Qwen3-8B --local-dir /local/models/Qwen3-8B` and edit the two paths in `experiment.yaml`.
- The downloader needs internet at prep time. Once `data-data/math/` is populated, training itself works with `HF_HUB_OFFLINE=1` / `HF_DATASETS_OFFLINE=1`.
61 changes: 61 additions & 0 deletions examples/math/offline/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# Offline math datasets

Pre-download every dataset used by the math recipes so training can run on
a node with no internet access.

## 1. Download (one-time)

From the repo root:

```bash
python examples/math/offline/download_math_datasets.py --root data-data/math
```

This writes 8 dataset directories under `data-data/math/` and a
`MANIFEST.json` summary. Re-running is a no-op (skips populated dirs);
pass `--force` to re-download, or `--only deepscaler,aime24` for a subset.

| dir | HF source | split | use |
|------------------|---------------------------------------------|-------|---------|
| `deepscaler` | `agentica-org/DeepScaleR-Preview-Dataset` | train | rollout |
| `dapo_filter` | `aaabiao/dapo_filter` | train | rollout |
| `aime24` | `HuggingFaceH4/aime_2024` | train | eval |
| `aime25` | `math-ai/aime25` | test | eval |
| `amc` | `rawsh/2024_AMC12` | train | eval |
| `math500` | `HuggingFaceH4/MATH-500` | test | eval |
| `minerva` | `math-ai/minervamath` | test | eval |
| `olympiadbench` | `math-ai/olympiadbench` | test | eval |

## 2. Verify

```bash
python examples/math/offline/download_math_datasets.py --verify
```

Loads every directory with `load_from_disk` and prints row counts; exits
non-zero if any dataset is missing or empty.

## 3. Run training with offline data

The matching recipe is `examples/math/offline/qwen3-8b-m2po-full-offline/`. Its
`experiment.yaml` sets `dataflow.data_root: data-data/math`, which causes
`astraflow.dataflow.service` to auto-derive each loader's `offline_dir`
as `data-data/math/<name>` (the dict key for evals; for the rollout,
`dataset_name` if set, otherwise the `dataset_fn` module name). No
per-entry edits required.

```bash
bash examples/math/offline/qwen3-8b-m2po-full-offline/scripts/run_qwen3-8b-m2po-full-offline.sh
```

## Notes

- **Model weights are *not* covered.** `model_path` / `tokenizer_path`
still point at `Qwen/Qwen3-8B` and will be pulled from HF Hub on first
use. Either let HF cache them once, or pre-fetch with
`huggingface-cli download Qwen/Qwen3-8B` and point the YAML at the
local snapshot for a fully air-gapped run.
- Convention: each dataset directory name under `--root` must match the
`name` used by `_create_dataset_from_config` (eval dict key; for the
rollout, `dataset_name` if set, otherwise the `dataset_fn` module
basename). The download script and the service use the same
`MATH_DATASETS` table / derivation, so they stay in sync.
Loading
Loading