From ed8240e44c4e657f73649f30e76dedf0c9813153 Mon Sep 17 00:00:00 2001 From: mfwolffe Date: Tue, 28 Apr 2026 18:29:53 -0400 Subject: [PATCH 01/46] Promote cli/commands.py to cli/commands/ package --- src/dlm/cli/{commands.py => commands/__init__.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/dlm/cli/{commands.py => commands/__init__.py} (100%) diff --git a/src/dlm/cli/commands.py b/src/dlm/cli/commands/__init__.py similarity index 100% rename from src/dlm/cli/commands.py rename to src/dlm/cli/commands/__init__.py From 40eb9133a3640b5eeb94e17ccd727754449b87e2 Mon Sep 17 00:00:00 2001 From: mfwolffe Date: Tue, 28 Apr 2026 18:32:10 -0400 Subject: [PATCH 02/46] Extract doctor_cmd into cli/commands/doctor.py --- src/dlm/cli/commands/__init__.py | 17 ++--------------- src/dlm/cli/commands/doctor.py | 22 ++++++++++++++++++++++ 2 files changed, 24 insertions(+), 15 deletions(-) create mode 100644 src/dlm/cli/commands/doctor.py diff --git a/src/dlm/cli/commands/__init__.py b/src/dlm/cli/commands/__init__.py index 6bac022..fe051ae 100644 --- a/src/dlm/cli/commands/__init__.py +++ b/src/dlm/cli/commands/__init__.py @@ -17,6 +17,8 @@ import typer +from dlm.cli.commands.doctor import doctor_cmd as doctor_cmd + if TYPE_CHECKING: from datetime import timedelta @@ -2718,21 +2720,6 @@ def _parse_since_arg(since: str, console: object) -> timedelta: raise typer.Exit(code=2) -def doctor_cmd( - json_out: Annotated[bool, typer.Option("--json", help="Emit machine-readable output.")] = False, -) -> None: - """Inspect hardware and print the resolved training plan.""" - import json - - from dlm.hardware import doctor, render_text - - result = doctor() - if json_out: - typer.echo(json.dumps(result.to_dict(), indent=2, default=str)) - else: - typer.echo(render_text(result)) - - def show_cmd( path: Annotated[Path, typer.Argument(help=".dlm file to inspect.")], json_out: Annotated[bool, typer.Option("--json", help="Emit machine-readable JSON.")] = False, diff --git 
a/src/dlm/cli/commands/doctor.py b/src/dlm/cli/commands/doctor.py new file mode 100644 index 0000000..5caac88 --- /dev/null +++ b/src/dlm/cli/commands/doctor.py @@ -0,0 +1,22 @@ +"""`dlm doctor` — inspect hardware and print the resolved training plan.""" + +from __future__ import annotations + +from typing import Annotated + +import typer + + +def doctor_cmd( + json_out: Annotated[bool, typer.Option("--json", help="Emit machine-readable output.")] = False, +) -> None: + """Inspect hardware and print the resolved training plan.""" + import json + + from dlm.hardware import doctor, render_text + + result = doctor() + if json_out: + typer.echo(json.dumps(result.to_dict(), indent=2, default=str)) + else: + typer.echo(render_text(result)) From a7181e830d6ef9e3f35b0cba8db05511e488ef07 Mon Sep 17 00:00:00 2001 From: mfwolffe Date: Tue, 28 Apr 2026 18:33:15 -0400 Subject: [PATCH 03/46] Extract migrate_cmd into cli/commands/migrate.py --- src/dlm/cli/commands/__init__.py | 40 +-------------------------- src/dlm/cli/commands/migrate.py | 47 ++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 39 deletions(-) create mode 100644 src/dlm/cli/commands/migrate.py diff --git a/src/dlm/cli/commands/__init__.py b/src/dlm/cli/commands/__init__.py index fe051ae..557a85d 100644 --- a/src/dlm/cli/commands/__init__.py +++ b/src/dlm/cli/commands/__init__.py @@ -18,6 +18,7 @@ import typer from dlm.cli.commands.doctor import doctor_cmd as doctor_cmd +from dlm.cli.commands.migrate import migrate_cmd as migrate_cmd if TYPE_CHECKING: from datetime import timedelta @@ -3209,45 +3210,6 @@ def _render_training_sources_text(console: object, records: list[dict[str, objec console.print(f" {path} {fc} file(s), {_human_size(size)}") -def migrate_cmd( - path: Annotated[Path, typer.Argument(help=".dlm file to migrate.")], - dry_run: Annotated[bool, typer.Option("--dry-run")] = False, - no_backup: Annotated[bool, typer.Option("--no-backup")] = False, -) -> None: - """Migrate a .dlm 
frontmatter to the current schema version.""" - from rich.console import Console - - from dlm.doc.errors import DlmParseError - from dlm.doc.migrate import migrate_file - - console = Console(stderr=True) - - try: - result = migrate_file(path, dry_run=dry_run, no_backup=no_backup) - except DlmParseError as exc: - console.print(f"[red]migrate:[/red] {exc}") - raise typer.Exit(code=1) from exc - - if not result.applied: - console.print( - f"[green]migrate:[/green] {path} already at v{result.target_version} " - "(no migrations needed)." - ) - return - - applied_str = " → ".join(f"v{v}" for v in (*result.applied, result.target_version)) - if dry_run: - console.print( - f"[yellow]dry-run:[/yellow] {path} would migrate {applied_str} " - "(re-run without --dry-run to apply)." - ) - return - - if result.backup_path is not None: - console.print(f"[dim]backup:[/dim] {result.backup_path}") - console.print(f"[green]migrated:[/green] {path} {applied_str}") - - def templates_list_cmd( json_out: Annotated[ bool, diff --git a/src/dlm/cli/commands/migrate.py b/src/dlm/cli/commands/migrate.py new file mode 100644 index 0000000..bc1f035 --- /dev/null +++ b/src/dlm/cli/commands/migrate.py @@ -0,0 +1,47 @@ +"""`dlm migrate` — migrate a .dlm frontmatter to the current schema version.""" + +from __future__ import annotations + +from pathlib import Path +from typing import Annotated + +import typer + + +def migrate_cmd( + path: Annotated[Path, typer.Argument(help=".dlm file to migrate.")], + dry_run: Annotated[bool, typer.Option("--dry-run")] = False, + no_backup: Annotated[bool, typer.Option("--no-backup")] = False, +) -> None: + """Migrate a .dlm frontmatter to the current schema version.""" + from rich.console import Console + + from dlm.doc.errors import DlmParseError + from dlm.doc.migrate import migrate_file + + console = Console(stderr=True) + + try: + result = migrate_file(path, dry_run=dry_run, no_backup=no_backup) + except DlmParseError as exc: + 
console.print(f"[red]migrate:[/red] {exc}") + raise typer.Exit(code=1) from exc + + if not result.applied: + console.print( + f"[green]migrate:[/green] {path} already at v{result.target_version} " + "(no migrations needed)." + ) + return + + applied_str = " → ".join(f"v{v}" for v in (*result.applied, result.target_version)) + if dry_run: + console.print( + f"[yellow]dry-run:[/yellow] {path} would migrate {applied_str} " + "(re-run without --dry-run to apply)." + ) + return + + if result.backup_path is not None: + console.print(f"[dim]backup:[/dim] {result.backup_path}") + console.print(f"[green]migrated:[/green] {path} {applied_str}") From 1d3b779d7d9470d73b93517fe44bed919820fdd8 Mon Sep 17 00:00:00 2001 From: mfwolffe Date: Tue, 28 Apr 2026 18:34:02 -0400 Subject: [PATCH 04/46] Extract pack_cmd into cli/commands/pack.py --- src/dlm/cli/commands/__init__.py | 65 +--------------------------- src/dlm/cli/commands/pack.py | 72 ++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+), 64 deletions(-) create mode 100644 src/dlm/cli/commands/pack.py diff --git a/src/dlm/cli/commands/__init__.py b/src/dlm/cli/commands/__init__.py index 557a85d..c2399c9 100644 --- a/src/dlm/cli/commands/__init__.py +++ b/src/dlm/cli/commands/__init__.py @@ -19,6 +19,7 @@ from dlm.cli.commands.doctor import doctor_cmd as doctor_cmd from dlm.cli.commands.migrate import migrate_cmd as migrate_cmd +from dlm.cli.commands.pack import pack_cmd as pack_cmd if TYPE_CHECKING: from datetime import timedelta @@ -2192,70 +2193,6 @@ def _verbose_runner(cmd: Sequence[str]) -> object: console.print(f"smoke: {result.smoke_output_first_line}") -def pack_cmd( - path: Annotated[Path, typer.Argument(help=".dlm file to pack.")], - out: Annotated[ - Path | None, - typer.Option("--out", help="Output .dlm.pack path."), - ] = None, - include_exports: Annotated[ - bool, - typer.Option("--include-exports", help="Bundle all GGUF exports into the pack."), - ] = False, - include_base: Annotated[ - bool, - 
typer.Option( - "--include-base", - help="Bundle the base model snapshot (license rules still apply).", - ), - ] = False, - include_logs: Annotated[ - bool, - typer.Option("--include-logs", help="Bundle per-run JSONL logs."), - ] = False, - licensee: Annotated[ - str | None, - typer.Option( - "--i-am-the-licensee", - help="URL acknowledging separate acceptance of a non-redistributable base (required for --include-base on gated models).", - ), - ] = None, -) -> None: - """Produce a portable .dlm.pack bundle.""" - from rich.console import Console - - from dlm.doc.errors import DlmParseError - from dlm.pack.errors import BaseLicenseRefusedError, PackError - from dlm.pack.packer import pack - - console = Console(stderr=True) - - try: - result = pack( - path, - out=out, - include_exports=include_exports, - include_base=include_base, - include_logs=include_logs, - licensee_acceptance_url=licensee, - ) - except BaseLicenseRefusedError as exc: - console.print(f"[red]pack:[/red] {exc}") - raise typer.Exit(code=1) from exc - except PackError as exc: - console.print(f"[red]pack:[/red] {exc}") - raise typer.Exit(code=1) from exc - except DlmParseError as exc: - console.print(f"[red]parse:[/red] {exc}") - raise typer.Exit(code=1) from exc - - size_mb = result.bytes_written / (1024 * 1024) - console.print( - f"[green]packed:[/green] {result.path} " - f"({size_mb:.2f} MB, content_type={result.content_type})" - ) - - def unpack_cmd( path: Annotated[Path, typer.Argument(help=".dlm.pack to install.")], force: Annotated[ diff --git a/src/dlm/cli/commands/pack.py b/src/dlm/cli/commands/pack.py new file mode 100644 index 0000000..78f033b --- /dev/null +++ b/src/dlm/cli/commands/pack.py @@ -0,0 +1,72 @@ +"""`dlm pack` — produce a portable .dlm.pack bundle.""" + +from __future__ import annotations + +from pathlib import Path +from typing import Annotated + +import typer + + +def pack_cmd( + path: Annotated[Path, typer.Argument(help=".dlm file to pack.")], + out: Annotated[ + Path | None, 
+ typer.Option("--out", help="Output .dlm.pack path."), + ] = None, + include_exports: Annotated[ + bool, + typer.Option("--include-exports", help="Bundle all GGUF exports into the pack."), + ] = False, + include_base: Annotated[ + bool, + typer.Option( + "--include-base", + help="Bundle the base model snapshot (license rules still apply).", + ), + ] = False, + include_logs: Annotated[ + bool, + typer.Option("--include-logs", help="Bundle per-run JSONL logs."), + ] = False, + licensee: Annotated[ + str | None, + typer.Option( + "--i-am-the-licensee", + help="URL acknowledging separate acceptance of a non-redistributable base (required for --include-base on gated models).", + ), + ] = None, +) -> None: + """Produce a portable .dlm.pack bundle.""" + from rich.console import Console + + from dlm.doc.errors import DlmParseError + from dlm.pack.errors import BaseLicenseRefusedError, PackError + from dlm.pack.packer import pack + + console = Console(stderr=True) + + try: + result = pack( + path, + out=out, + include_exports=include_exports, + include_base=include_base, + include_logs=include_logs, + licensee_acceptance_url=licensee, + ) + except BaseLicenseRefusedError as exc: + console.print(f"[red]pack:[/red] {exc}") + raise typer.Exit(code=1) from exc + except PackError as exc: + console.print(f"[red]pack:[/red] {exc}") + raise typer.Exit(code=1) from exc + except DlmParseError as exc: + console.print(f"[red]parse:[/red] {exc}") + raise typer.Exit(code=1) from exc + + size_mb = result.bytes_written / (1024 * 1024) + console.print( + f"[green]packed:[/green] {result.path} " + f"({size_mb:.2f} MB, content_type={result.content_type})" + ) From 7b71500f566ae254e90736942fed4c6c3920e90c Mon Sep 17 00:00:00 2001 From: mfwolffe Date: Tue, 28 Apr 2026 18:34:36 -0400 Subject: [PATCH 05/46] Extract unpack_cmd into cli/commands/unpack.py --- src/dlm/cli/commands/__init__.py | 48 +--------------------------- src/dlm/cli/commands/unpack.py | 55 ++++++++++++++++++++++++++++++++ 2 
files changed, 56 insertions(+), 47 deletions(-) create mode 100644 src/dlm/cli/commands/unpack.py diff --git a/src/dlm/cli/commands/__init__.py b/src/dlm/cli/commands/__init__.py index c2399c9..e1ca038 100644 --- a/src/dlm/cli/commands/__init__.py +++ b/src/dlm/cli/commands/__init__.py @@ -20,6 +20,7 @@ from dlm.cli.commands.doctor import doctor_cmd as doctor_cmd from dlm.cli.commands.migrate import migrate_cmd as migrate_cmd from dlm.cli.commands.pack import pack_cmd as pack_cmd +from dlm.cli.commands.unpack import unpack_cmd as unpack_cmd if TYPE_CHECKING: from datetime import timedelta @@ -2193,53 +2194,6 @@ def _verbose_runner(cmd: Sequence[str]) -> object: console.print(f"smoke: {result.smoke_output_first_line}") -def unpack_cmd( - path: Annotated[Path, typer.Argument(help=".dlm.pack to install.")], - force: Annotated[ - bool, - typer.Option("--force", help="Overwrite an existing store with the same dlm_id."), - ] = False, - out: Annotated[ - Path | None, - typer.Option( - "--out", help="Directory to place the restored .dlm (default: alongside the pack)." 
- ), - ] = None, -) -> None: - """Install a .dlm.pack into the local store.""" - from rich.console import Console - - from dlm.pack.errors import ( - PackFormatVersionError, - PackIntegrityError, - PackLayoutError, - ) - from dlm.pack.unpacker import unpack - - console = Console(stderr=True) - - try: - result = unpack(path, force=force, out_dir=out) - except PackFormatVersionError as exc: - console.print(f"[red]unpack:[/red] {exc}") - raise typer.Exit(code=1) from exc - except PackIntegrityError as exc: - console.print(f"[red]unpack:[/red] {exc}") - raise typer.Exit(code=1) from exc - except PackLayoutError as exc: - console.print(f"[red]unpack:[/red] {exc}") - raise typer.Exit(code=1) from exc - - console.print(f"[green]unpacked:[/green] {result.dlm_path}") - console.print(f" store: {result.store_path}") - console.print(f" dlm_id: {result.dlm_id}") - if result.applied_migrations: - steps = " → ".join( - f"v{v}" for v in (*result.applied_migrations, result.header.pack_format_version + 1) - ) - console.print(f" migrated: {steps}") - - def verify_cmd( path: Annotated[Path, typer.Argument(help=".dlm.pack to verify.")], trust_on_first_use: Annotated[ diff --git a/src/dlm/cli/commands/unpack.py b/src/dlm/cli/commands/unpack.py new file mode 100644 index 0000000..df71939 --- /dev/null +++ b/src/dlm/cli/commands/unpack.py @@ -0,0 +1,55 @@ +"""`dlm unpack` — install a .dlm.pack into the local store.""" + +from __future__ import annotations + +from pathlib import Path +from typing import Annotated + +import typer + + +def unpack_cmd( + path: Annotated[Path, typer.Argument(help=".dlm.pack to install.")], + force: Annotated[ + bool, + typer.Option("--force", help="Overwrite an existing store with the same dlm_id."), + ] = False, + out: Annotated[ + Path | None, + typer.Option( + "--out", help="Directory to place the restored .dlm (default: alongside the pack)." 
+ ), + ] = None, +) -> None: + """Install a .dlm.pack into the local store.""" + from rich.console import Console + + from dlm.pack.errors import ( + PackFormatVersionError, + PackIntegrityError, + PackLayoutError, + ) + from dlm.pack.unpacker import unpack + + console = Console(stderr=True) + + try: + result = unpack(path, force=force, out_dir=out) + except PackFormatVersionError as exc: + console.print(f"[red]unpack:[/red] {exc}") + raise typer.Exit(code=1) from exc + except PackIntegrityError as exc: + console.print(f"[red]unpack:[/red] {exc}") + raise typer.Exit(code=1) from exc + except PackLayoutError as exc: + console.print(f"[red]unpack:[/red] {exc}") + raise typer.Exit(code=1) from exc + + console.print(f"[green]unpacked:[/green] {result.dlm_path}") + console.print(f" store: {result.store_path}") + console.print(f" dlm_id: {result.dlm_id}") + if result.applied_migrations: + steps = " → ".join( + f"v{v}" for v in (*result.applied_migrations, result.header.pack_format_version + 1) + ) + console.print(f" migrated: {steps}") From 3a21000f76ce07a96bb5ed36effa9bd9833d998a Mon Sep 17 00:00:00 2001 From: mfwolffe Date: Tue, 28 Apr 2026 18:35:23 -0400 Subject: [PATCH 06/46] Extract verify_cmd into cli/commands/verify.py --- src/dlm/cli/commands/__init__.py | 106 +---------------------------- src/dlm/cli/commands/verify.py | 113 +++++++++++++++++++++++++++++++ 2 files changed, 114 insertions(+), 105 deletions(-) create mode 100644 src/dlm/cli/commands/verify.py diff --git a/src/dlm/cli/commands/__init__.py b/src/dlm/cli/commands/__init__.py index e1ca038..a1e6aa0 100644 --- a/src/dlm/cli/commands/__init__.py +++ b/src/dlm/cli/commands/__init__.py @@ -21,6 +21,7 @@ from dlm.cli.commands.migrate import migrate_cmd as migrate_cmd from dlm.cli.commands.pack import pack_cmd as pack_cmd from dlm.cli.commands.unpack import unpack_cmd as unpack_cmd +from dlm.cli.commands.verify import verify_cmd as verify_cmd if TYPE_CHECKING: from datetime import timedelta @@ -2194,111 
+2195,6 @@ def _verbose_runner(cmd: Sequence[str]) -> object: console.print(f"smoke: {result.smoke_output_first_line}") -def verify_cmd( - path: Annotated[Path, typer.Argument(help=".dlm.pack to verify.")], - trust_on_first_use: Annotated[ - bool, - typer.Option( - "--trust-on-first-use", - help=( - "Record the signer's public key under ~/.dlm/trusted-keys/ " - "on first verify. Without this flag an unknown signer is " - "rejected with exit code 2." - ), - ), - ] = False, - trusted_keys_dir: Annotated[ - Path | None, - typer.Option( - "--trusted-keys-dir", - help="Override ~/.dlm/trusted-keys/ (useful for scripted verify).", - hidden=True, - ), - ] = None, -) -> None: - """Verify a .dlm.pack's provenance chain. - - Exit codes: 0 verified, 1 broken chain (or missing provenance), - 2 untrusted signer, 3 signature rejected. - """ - from rich.console import Console - - from dlm.pack.errors import PackLayoutError - from dlm.pack.layout import PROVENANCE_FILENAME - from dlm.pack.unpacker import read_pack_member_bytes - from dlm.share.errors import ShareError - from dlm.share.provenance import ( - ProvenanceChainBroken, - ProvenanceSchemaError, - UnknownSignerError, - load_provenance_json, - verify_provenance, - ) - - console = Console(stderr=True) - keys_dir = trusted_keys_dir or (Path.home() / ".dlm" / "trusted-keys") - - try: - payload = read_pack_member_bytes(path, PROVENANCE_FILENAME) - except PackLayoutError as exc: - console.print(f"[red]verify:[/red] {exc}") - raise typer.Exit(code=1) from exc - except OSError as exc: - console.print(f"[red]verify:[/red] cannot read {path}: {exc}") - raise typer.Exit(code=1) from exc - - if payload is None: - console.print(f"[red]verify:[/red] {path} is unsigned — no {PROVENANCE_FILENAME} inside.") - raise typer.Exit(code=1) - - # Write the in-pack JSON to a temp file so `load_provenance_json` - # can use its normal filesystem path. 
Keeps the parser single- - # sourced and the error messages consistent with the filesystem - # call-site. - import tempfile - - with tempfile.NamedTemporaryFile("wb", suffix=".json", delete=False) as fh: - fh.write(payload) - tmp_path = Path(fh.name) - try: - provenance = load_provenance_json(tmp_path) - except ProvenanceSchemaError as exc: - console.print(f"[red]verify:[/red] malformed provenance.json: {exc}") - raise typer.Exit(code=1) from exc - finally: - tmp_path.unlink(missing_ok=True) - - try: - result = verify_provenance( - provenance, - trusted_keys_dir=keys_dir, - tofu=trust_on_first_use, - ) - except UnknownSignerError as exc: - console.print(f"[red]verify:[/red] {exc}") - raise typer.Exit(code=2) from exc - except ProvenanceChainBroken as exc: - console.print(f"[red]verify:[/red] chain broken: {exc}") - raise typer.Exit(code=1) from exc - except ShareError as exc: - console.print(f"[red]verify:[/red] signature rejected: {exc}") - raise typer.Exit(code=3) from exc - - out = Console() - out.print(f"[green]verified:[/green] {path.name}") - out.print(f" signer: {result.signer_fingerprint}") - out.print(f" trusted-key: {result.trusted_key_path}") - out.print(f" adapter_sha256: {provenance.adapter_sha256[:12]}...") - out.print(f" base_revision: {provenance.base_revision}") - out.print(f" corpus_root: {provenance.corpus_root_sha256[:12]}...") - out.print(f" signed_at: {provenance.signed_at}") - if result.tofu_recorded: - out.print( - f"[yellow]note:[/yellow] recorded new trust entry " - f"at {result.trusted_key_path}; subsequent verifies use strict mode." 
- ) - - def repl_cmd( path: Annotated[Path, typer.Argument(help=".dlm file to start a REPL against.")], adapter: Annotated[ diff --git a/src/dlm/cli/commands/verify.py b/src/dlm/cli/commands/verify.py new file mode 100644 index 0000000..67074aa --- /dev/null +++ b/src/dlm/cli/commands/verify.py @@ -0,0 +1,113 @@ +"""`dlm verify` — verify a .dlm.pack's provenance chain.""" + +from __future__ import annotations + +from pathlib import Path +from typing import Annotated + +import typer + + +def verify_cmd( + path: Annotated[Path, typer.Argument(help=".dlm.pack to verify.")], + trust_on_first_use: Annotated[ + bool, + typer.Option( + "--trust-on-first-use", + help=( + "Record the signer's public key under ~/.dlm/trusted-keys/ " + "on first verify. Without this flag an unknown signer is " + "rejected with exit code 2." + ), + ), + ] = False, + trusted_keys_dir: Annotated[ + Path | None, + typer.Option( + "--trusted-keys-dir", + help="Override ~/.dlm/trusted-keys/ (useful for scripted verify).", + hidden=True, + ), + ] = None, +) -> None: + """Verify a .dlm.pack's provenance chain. + + Exit codes: 0 verified, 1 broken chain (or missing provenance), + 2 untrusted signer, 3 signature rejected. 
+ """ + from rich.console import Console + + from dlm.pack.errors import PackLayoutError + from dlm.pack.layout import PROVENANCE_FILENAME + from dlm.pack.unpacker import read_pack_member_bytes + from dlm.share.errors import ShareError + from dlm.share.provenance import ( + ProvenanceChainBroken, + ProvenanceSchemaError, + UnknownSignerError, + load_provenance_json, + verify_provenance, + ) + + console = Console(stderr=True) + keys_dir = trusted_keys_dir or (Path.home() / ".dlm" / "trusted-keys") + + try: + payload = read_pack_member_bytes(path, PROVENANCE_FILENAME) + except PackLayoutError as exc: + console.print(f"[red]verify:[/red] {exc}") + raise typer.Exit(code=1) from exc + except OSError as exc: + console.print(f"[red]verify:[/red] cannot read {path}: {exc}") + raise typer.Exit(code=1) from exc + + if payload is None: + console.print(f"[red]verify:[/red] {path} is unsigned — no {PROVENANCE_FILENAME} inside.") + raise typer.Exit(code=1) + + # Write the in-pack JSON to a temp file so `load_provenance_json` + # can use its normal filesystem path. Keeps the parser single- + # sourced and the error messages consistent with the filesystem + # call-site. 
+ import tempfile + + with tempfile.NamedTemporaryFile("wb", suffix=".json", delete=False) as fh: + fh.write(payload) + tmp_path = Path(fh.name) + try: + provenance = load_provenance_json(tmp_path) + except ProvenanceSchemaError as exc: + console.print(f"[red]verify:[/red] malformed provenance.json: {exc}") + raise typer.Exit(code=1) from exc + finally: + tmp_path.unlink(missing_ok=True) + + try: + result = verify_provenance( + provenance, + trusted_keys_dir=keys_dir, + tofu=trust_on_first_use, + ) + except UnknownSignerError as exc: + console.print(f"[red]verify:[/red] {exc}") + raise typer.Exit(code=2) from exc + except ProvenanceChainBroken as exc: + console.print(f"[red]verify:[/red] chain broken: {exc}") + raise typer.Exit(code=1) from exc + except ShareError as exc: + console.print(f"[red]verify:[/red] signature rejected: {exc}") + raise typer.Exit(code=3) from exc + + out = Console() + out.print(f"[green]verified:[/green] {path.name}") + out.print(f" signer: {result.signer_fingerprint}") + out.print(f" trusted-key: {result.trusted_key_path}") + out.print(f" adapter_sha256: {provenance.adapter_sha256[:12]}...") + out.print(f" base_revision: {provenance.base_revision}") + out.print(f" corpus_root: {provenance.corpus_root_sha256[:12]}...") + out.print(f" signed_at: {provenance.signed_at}") + if result.tofu_recorded: + out.print( + f"[yellow]note:[/yellow] recorded new trust entry " + f"at {result.trusted_key_path}; subsequent verifies use strict mode." 
+ ) From c88e02bf2eea63cee5a27618eaa675e027cae49d Mon Sep 17 00:00:00 2001 From: mfwolffe Date: Tue, 28 Apr 2026 18:41:21 -0400 Subject: [PATCH 07/46] Extract templates_list_cmd into cli/commands/templates.py --- src/dlm/cli/commands/__init__.py | 82 +--------------------------- src/dlm/cli/commands/templates.py | 88 +++++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+), 81 deletions(-) create mode 100644 src/dlm/cli/commands/templates.py diff --git a/src/dlm/cli/commands/__init__.py b/src/dlm/cli/commands/__init__.py index a1e6aa0..f7bef33 100644 --- a/src/dlm/cli/commands/__init__.py +++ b/src/dlm/cli/commands/__init__.py @@ -21,6 +21,7 @@ from dlm.cli.commands.migrate import migrate_cmd as migrate_cmd from dlm.cli.commands.pack import pack_cmd as pack_cmd from dlm.cli.commands.unpack import unpack_cmd as unpack_cmd +from dlm.cli.commands.templates import templates_list_cmd as templates_list_cmd from dlm.cli.commands.verify import verify_cmd as verify_cmd if TYPE_CHECKING: @@ -2997,87 +2998,6 @@ def _render_training_sources_text(console: object, records: list[dict[str, objec console.print(f" {path} {fc} file(s), {_human_size(size)}") -def templates_list_cmd( - json_out: Annotated[ - bool, - typer.Option("--json", help="Emit a JSON array of template metadata."), - ] = False, - refresh: Annotated[ - bool, - typer.Option( - "--refresh", - help=( - "Refresh from the upstream template gallery. Currently a no-op — " - "upstream repo + signing key are deferred." - ), - ), - ] = False, - accept_unsigned: Annotated[ - bool, - typer.Option( - "--accept-unsigned", - help=( - "Bypass signed-tag verification on --refresh. Reserved; takes effect " - "once the upstream gallery signs its releases." 
- ), - ), - ] = False, -) -> None: - """List the bundled (and, one day, remote) template gallery.""" - - import json as _json - - from rich.console import Console - - from dlm.templates import list_bundled - - console_out = Console() - console_err = Console(stderr=True) - - if refresh: - from dlm.templates.fetcher import RemoteFetchUnavailable, cache_dir, fetch_all - - try: - fetch_all(cache_dir(), remote="") - except RemoteFetchUnavailable as exc: - console_err.print( - f"[yellow]templates:[/yellow] {exc} Falling back to the bundled gallery." - ) - # --accept-unsigned is reserved for when the live fetcher lands; - # touching it here silences ARG001 without ceremony. - _ = accept_unsigned - - templates = list_bundled() - - if json_out: - payload = [ - { - "name": t.name, - "title": t.meta.title, - "domain_tags": list(t.meta.domain_tags), - "recommended_base": t.meta.recommended_base, - "expected_steps": t.meta.expected_steps, - "expected_duration": dict(t.meta.expected_duration), - "summary": t.meta.summary, - "sample_prompts": list(t.meta.sample_prompts), - } - for t in templates - ] - console_out.print_json(_json.dumps(payload)) - return - - if not templates: - console_err.print("[yellow]templates:[/yellow] no bundled templates found.") - raise typer.Exit(code=1) - - name_width = max(len(t.name) for t in templates) - for t in templates: - console_out.print( - f"[bold]{t.name:<{name_width}}[/bold] {t.meta.title} " - f"[dim]({t.meta.recommended_base})[/dim]" - ) - - def push_cmd( path: Annotated[Path, typer.Argument(help=".dlm or .dlm.pack to push.")], to: Annotated[ diff --git a/src/dlm/cli/commands/templates.py b/src/dlm/cli/commands/templates.py new file mode 100644 index 0000000..b57b2ae --- /dev/null +++ b/src/dlm/cli/commands/templates.py @@ -0,0 +1,88 @@ +"""`dlm templates list` — list the bundled template gallery.""" + +from __future__ import annotations + +from typing import Annotated + +import typer + + +def templates_list_cmd( + json_out: Annotated[ + 
bool, + typer.Option("--json", help="Emit a JSON array of template metadata."), + ] = False, + refresh: Annotated[ + bool, + typer.Option( + "--refresh", + help=( + "Refresh from the upstream template gallery. Currently a no-op — " + "upstream repo + signing key are deferred." + ), + ), + ] = False, + accept_unsigned: Annotated[ + bool, + typer.Option( + "--accept-unsigned", + help=( + "Bypass signed-tag verification on --refresh. Reserved; takes effect " + "once the upstream gallery signs its releases." + ), + ), + ] = False, +) -> None: + """List the bundled (and, one day, remote) template gallery.""" + + import json as _json + + from rich.console import Console + + from dlm.templates import list_bundled + + console_out = Console() + console_err = Console(stderr=True) + + if refresh: + from dlm.templates.fetcher import RemoteFetchUnavailable, cache_dir, fetch_all + + try: + fetch_all(cache_dir(), remote="") + except RemoteFetchUnavailable as exc: + console_err.print( + f"[yellow]templates:[/yellow] {exc} Falling back to the bundled gallery." + ) + # --accept-unsigned is reserved for when the live fetcher lands; + # touching it here silences ARG001 without ceremony. 
+ _ = accept_unsigned + + templates = list_bundled() + + if json_out: + payload = [ + { + "name": t.name, + "title": t.meta.title, + "domain_tags": list(t.meta.domain_tags), + "recommended_base": t.meta.recommended_base, + "expected_steps": t.meta.expected_steps, + "expected_duration": dict(t.meta.expected_duration), + "summary": t.meta.summary, + "sample_prompts": list(t.meta.sample_prompts), + } + for t in templates + ] + console_out.print_json(_json.dumps(payload)) + return + + if not templates: + console_err.print("[yellow]templates:[/yellow] no bundled templates found.") + raise typer.Exit(code=1) + + name_width = max(len(t.name) for t in templates) + for t in templates: + console_out.print( + f"[bold]{t.name:<{name_width}}[/bold] {t.meta.title} " + f"[dim]({t.meta.recommended_base})[/dim]" + ) From c2ca4733afbd01f98fe3e7ba1d6992541422f038 Mon Sep 17 00:00:00 2001 From: mfwolffe Date: Tue, 28 Apr 2026 18:41:36 -0400 Subject: [PATCH 08/46] Sort cli/commands/__init__.py re-export imports --- src/dlm/cli/commands/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dlm/cli/commands/__init__.py b/src/dlm/cli/commands/__init__.py index f7bef33..78969f0 100644 --- a/src/dlm/cli/commands/__init__.py +++ b/src/dlm/cli/commands/__init__.py @@ -20,8 +20,8 @@ from dlm.cli.commands.doctor import doctor_cmd as doctor_cmd from dlm.cli.commands.migrate import migrate_cmd as migrate_cmd from dlm.cli.commands.pack import pack_cmd as pack_cmd -from dlm.cli.commands.unpack import unpack_cmd as unpack_cmd from dlm.cli.commands.templates import templates_list_cmd as templates_list_cmd +from dlm.cli.commands.unpack import unpack_cmd as unpack_cmd from dlm.cli.commands.verify import verify_cmd as verify_cmd if TYPE_CHECKING: From 9d4f38595b9f9bee7b1fbf75b826d0e4dbdf78f6 Mon Sep 17 00:00:00 2001 From: mfwolffe Date: Tue, 28 Apr 2026 18:43:24 -0400 Subject: [PATCH 09/46] Move _previously_accepted + _human_size to cli/commands/_shared.py --- 
src/dlm/cli/commands/__init__.py | 34 ++--------------------- src/dlm/cli/commands/_shared.py | 47 ++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 32 deletions(-) create mode 100644 src/dlm/cli/commands/_shared.py diff --git a/src/dlm/cli/commands/__init__.py b/src/dlm/cli/commands/__init__.py index 78969f0..d1c5635 100644 --- a/src/dlm/cli/commands/__init__.py +++ b/src/dlm/cli/commands/__init__.py @@ -17,6 +17,8 @@ import typer +from dlm.cli.commands._shared import _human_size as _human_size +from dlm.cli.commands._shared import _previously_accepted as _previously_accepted from dlm.cli.commands.doctor import doctor_cmd as doctor_cmd from dlm.cli.commands.migrate import migrate_cmd as migrate_cmd from dlm.cli.commands.pack import pack_cmd as pack_cmd @@ -283,30 +285,6 @@ def init_cmd( console.print(f"[green]init:[/green] wrote {path}") -def _previously_accepted(store_manifest_path: Path) -> bool: - """Return True iff the store manifest already holds a LicenseAcceptance. - - `dlm prompt` and `dlm export` operate on an already-trained - adapter; the gated-base license was accepted during training and - persisted into `manifest.license_acceptance`. Replaying that - acceptance here is correct; silently hardcoding - `accept_license=True` is not — it would let a never-accepted - gated base slip through. - """ - if not store_manifest_path.exists(): - return False - from dlm.store.errors import ManifestCorruptError - from dlm.store.manifest import load_manifest - - try: - manifest = load_manifest(store_manifest_path) - except (ManifestCorruptError, OSError): - # Narrow from bare `Exception` so programmer bugs propagate - # instead of being silently treated as "no acceptance." - return False - return manifest.license_acceptance is not None - - def _prompt_accept_license(console: object, base: str, license_url: str | None) -> bool: """Interactive y/N prompt for gated base-model license acceptance. 
@@ -2694,14 +2672,6 @@ def _render_inspection_text(console: object, path: Path, inspection: object) -> console.print(" [yellow]orphaned:[/yellow] source .dlm is missing or mismatched") -def _human_size(n: int) -> str: - for unit in ("B", "KB", "MB", "GB", "TB"): - if n < 1024: - return f"{n:.1f} {unit}" if unit != "B" else f"{n} B" - n //= 1024 - return f"{n} PB" - - def _summarize_training_sources(parsed: object, base_path: Path) -> list[dict[str, object]] | None: """Best-effort resolution of `training.sources` for `dlm show`. diff --git a/src/dlm/cli/commands/_shared.py b/src/dlm/cli/commands/_shared.py new file mode 100644 index 0000000..acdf907 --- /dev/null +++ b/src/dlm/cli/commands/_shared.py @@ -0,0 +1,47 @@ +"""Helpers shared across more than one cli/commands submodule. + +Anything used by exactly one command lives in that command's own +submodule. This file is the explicit destination for true cross-command +helpers — keeping it small means the package's public surface (the +`__init__.py` re-exports) stays focused on commands themselves. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from pathlib import Path + + +def _previously_accepted(store_manifest_path: Path) -> bool: + """Return True iff the store manifest already holds a LicenseAcceptance. + + `dlm prompt`, `dlm export`, and `dlm repl` operate on an + already-trained adapter; the gated-base license was accepted during + training and persisted into `manifest.license_acceptance`. Replaying + that acceptance here is correct; silently hardcoding + `accept_license=True` is not — it would let a never-accepted gated + base slip through. 
+ """ + if not store_manifest_path.exists(): + return False + from dlm.store.errors import ManifestCorruptError + from dlm.store.manifest import load_manifest + + try: + manifest = load_manifest(store_manifest_path) + except (ManifestCorruptError, OSError): + # Narrow from bare `Exception` so programmer bugs propagate + # instead of being silently treated as "no acceptance." + return False + return manifest.license_acceptance is not None + + +def _human_size(n: int) -> str: + """Render a byte count as a 1-decimal human string (B / KB / MB / …).""" + for unit in ("B", "KB", "MB", "GB", "TB"): + if n < 1024: + return f"{n:.1f} {unit}" if unit != "B" else f"{n} B" + n //= 1024 + return f"{n} PB" From 7a4ff3de28d8e951eff21bd37c29201f4e8db867 Mon Sep 17 00:00:00 2001 From: mfwolffe Date: Tue, 28 Apr 2026 18:44:22 -0400 Subject: [PATCH 10/46] Extract repl_cmd into cli/commands/repl.py --- src/dlm/cli/commands/__init__.py | 101 +--------------------------- src/dlm/cli/commands/repl.py | 110 +++++++++++++++++++++++++++++++ 2 files changed, 111 insertions(+), 100 deletions(-) create mode 100644 src/dlm/cli/commands/repl.py diff --git a/src/dlm/cli/commands/__init__.py b/src/dlm/cli/commands/__init__.py index d1c5635..b08a66c 100644 --- a/src/dlm/cli/commands/__init__.py +++ b/src/dlm/cli/commands/__init__.py @@ -22,6 +22,7 @@ from dlm.cli.commands.doctor import doctor_cmd as doctor_cmd from dlm.cli.commands.migrate import migrate_cmd as migrate_cmd from dlm.cli.commands.pack import pack_cmd as pack_cmd +from dlm.cli.commands.repl import repl_cmd as repl_cmd from dlm.cli.commands.templates import templates_list_cmd as templates_list_cmd from dlm.cli.commands.unpack import unpack_cmd as unpack_cmd from dlm.cli.commands.verify import verify_cmd as verify_cmd @@ -2174,106 +2175,6 @@ def _verbose_runner(cmd: Sequence[str]) -> object: console.print(f"smoke: {result.smoke_output_first_line}") -def repl_cmd( - path: Annotated[Path, typer.Argument(help=".dlm file to start a REPL 
against.")], - adapter: Annotated[ - str | None, - typer.Option( - "--adapter", - help=( - "Named adapter to load. Required on multi-adapter " - "documents; rejected on single-adapter documents." - ), - ), - ] = None, - backend: Annotated[ - str, - typer.Option( - "--backend", - help="Inference backend: `auto`, `pytorch`, or `mlx`.", - ), - ] = "auto", -) -> None: - """Interactive REPL against the trained adapter.""" - from rich.console import Console - - from dlm.base_models import GatedModelError - from dlm.base_models import resolve as resolve_base_model - from dlm.doc.parser import parse_file - from dlm.hardware import doctor - from dlm.inference import AdapterNotFoundError - from dlm.inference.backends import ( - UnsupportedBackendError, - build_backend, - select_backend, - ) - from dlm.repl.session import ReplSession - from dlm.store.paths import for_dlm - - console = Console(stderr=True) - - if backend not in ("auto", "pytorch", "mlx"): - console.print( - f"[red]repl:[/red] --backend must be `auto`, `pytorch`, or `mlx` (got {backend!r})." - ) - raise typer.Exit(code=2) - - parsed = parse_file(path) - declared = parsed.frontmatter.training.adapters - if adapter is not None: - if declared is None: - console.print( - "[red]repl:[/red] --adapter is only valid on multi-adapter " - "documents (this doc does not declare `training.adapters`)." - ) - raise typer.Exit(code=2) - if adapter not in declared: - console.print( - f"[red]repl:[/red] --adapter {adapter!r} is not declared " - f"(declared: {sorted(declared)!r})." - ) - raise typer.Exit(code=2) - - store = for_dlm(parsed.frontmatter.dlm_id) - already_accepted = _previously_accepted(store.manifest) - try: - spec = resolve_base_model(parsed.frontmatter.base_model, accept_license=already_accepted) - except GatedModelError as exc: - console.print( - f"[red]license:[/red] base {parsed.frontmatter.base_model!r} is gated and has " - "no recorded acceptance in this store; run `dlm train --i-accept-license` first." 
- ) - raise typer.Exit(code=1) from exc - caps = doctor().capabilities - - try: - backend_name = select_backend(backend, caps) # type: ignore[arg-type] - except UnsupportedBackendError as exc: - console.print(f"[red]repl:[/red] {exc}") - raise typer.Exit(code=2) from exc - backend_obj = build_backend(backend_name, caps) - - try: - backend_obj.load(spec, store, adapter_name=adapter) - except AdapterNotFoundError as exc: - console.print(f"[red]repl:[/red] {exc}") - raise typer.Exit(code=1) from exc - - tokenizer = getattr(backend_obj, "_loaded", None) - tokenizer = tokenizer.tokenizer if tokenizer is not None else None - - session = ReplSession( - backend=backend_obj, - tokenizer=tokenizer, - active_adapter=adapter, - declared_adapters=tuple(sorted(declared)) if declared else (), - ) - - from dlm.repl.app import run_repl - - raise typer.Exit(code=run_repl(session, console=console)) - - def metrics_cmd( path: Annotated[Path, typer.Argument(help=".dlm file whose store we query.")], json_out: Annotated[bool, typer.Option("--json", help="Emit JSON.")] = False, diff --git a/src/dlm/cli/commands/repl.py b/src/dlm/cli/commands/repl.py new file mode 100644 index 0000000..07fe157 --- /dev/null +++ b/src/dlm/cli/commands/repl.py @@ -0,0 +1,110 @@ +"""`dlm repl` — interactive REPL against the trained adapter.""" + +from __future__ import annotations + +from pathlib import Path +from typing import Annotated + +import typer + +from dlm.cli.commands._shared import _previously_accepted + + +def repl_cmd( + path: Annotated[Path, typer.Argument(help=".dlm file to start a REPL against.")], + adapter: Annotated[ + str | None, + typer.Option( + "--adapter", + help=( + "Named adapter to load. Required on multi-adapter " + "documents; rejected on single-adapter documents." 
+ ), + ), + ] = None, + backend: Annotated[ + str, + typer.Option( + "--backend", + help="Inference backend: `auto`, `pytorch`, or `mlx`.", + ), + ] = "auto", +) -> None: + """Interactive REPL against the trained adapter.""" + from rich.console import Console + + from dlm.base_models import GatedModelError + from dlm.base_models import resolve as resolve_base_model + from dlm.doc.parser import parse_file + from dlm.hardware import doctor + from dlm.inference import AdapterNotFoundError + from dlm.inference.backends import ( + UnsupportedBackendError, + build_backend, + select_backend, + ) + from dlm.repl.session import ReplSession + from dlm.store.paths import for_dlm + + console = Console(stderr=True) + + if backend not in ("auto", "pytorch", "mlx"): + console.print( + f"[red]repl:[/red] --backend must be `auto`, `pytorch`, or `mlx` (got {backend!r})." + ) + raise typer.Exit(code=2) + + parsed = parse_file(path) + declared = parsed.frontmatter.training.adapters + if adapter is not None: + if declared is None: + console.print( + "[red]repl:[/red] --adapter is only valid on multi-adapter " + "documents (this doc does not declare `training.adapters`)." + ) + raise typer.Exit(code=2) + if adapter not in declared: + console.print( + f"[red]repl:[/red] --adapter {adapter!r} is not declared " + f"(declared: {sorted(declared)!r})." + ) + raise typer.Exit(code=2) + + store = for_dlm(parsed.frontmatter.dlm_id) + already_accepted = _previously_accepted(store.manifest) + try: + spec = resolve_base_model(parsed.frontmatter.base_model, accept_license=already_accepted) + except GatedModelError as exc: + console.print( + f"[red]license:[/red] base {parsed.frontmatter.base_model!r} is gated and has " + "no recorded acceptance in this store; run `dlm train --i-accept-license` first." 
+ ) + raise typer.Exit(code=1) from exc + caps = doctor().capabilities + + try: + backend_name = select_backend(backend, caps) # type: ignore[arg-type] + except UnsupportedBackendError as exc: + console.print(f"[red]repl:[/red] {exc}") + raise typer.Exit(code=2) from exc + backend_obj = build_backend(backend_name, caps) + + try: + backend_obj.load(spec, store, adapter_name=adapter) + except AdapterNotFoundError as exc: + console.print(f"[red]repl:[/red] {exc}") + raise typer.Exit(code=1) from exc + + tokenizer = getattr(backend_obj, "_loaded", None) + tokenizer = tokenizer.tokenizer if tokenizer is not None else None + + session = ReplSession( + backend=backend_obj, + tokenizer=tokenizer, + active_adapter=adapter, + declared_adapters=tuple(sorted(declared)) if declared else (), + ) + + from dlm.repl.app import run_repl + + raise typer.Exit(code=run_repl(session, console=console)) From 3b251e9bdb1e46502918ab875cfea714a9dd74eb Mon Sep 17 00:00:00 2001 From: mfwolffe Date: Tue, 28 Apr 2026 18:45:06 -0400 Subject: [PATCH 11/46] Extract push_cmd into cli/commands/push.py --- src/dlm/cli/commands/__init__.py | 61 +--------------------------- src/dlm/cli/commands/push.py | 68 ++++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+), 60 deletions(-) create mode 100644 src/dlm/cli/commands/push.py diff --git a/src/dlm/cli/commands/__init__.py b/src/dlm/cli/commands/__init__.py index b08a66c..63af34f 100644 --- a/src/dlm/cli/commands/__init__.py +++ b/src/dlm/cli/commands/__init__.py @@ -22,6 +22,7 @@ from dlm.cli.commands.doctor import doctor_cmd as doctor_cmd from dlm.cli.commands.migrate import migrate_cmd as migrate_cmd from dlm.cli.commands.pack import pack_cmd as pack_cmd +from dlm.cli.commands.push import push_cmd as push_cmd from dlm.cli.commands.repl import repl_cmd as repl_cmd from dlm.cli.commands.templates import templates_list_cmd as templates_list_cmd from dlm.cli.commands.unpack import unpack_cmd as unpack_cmd @@ -2869,66 +2870,6 @@ def 
_render_training_sources_text(console: object, records: list[dict[str, objec console.print(f" {path} {fc} file(s), {_human_size(size)}") -def push_cmd( - path: Annotated[Path, typer.Argument(help=".dlm or .dlm.pack to push.")], - to: Annotated[ - str, - typer.Option( - "--to", - help=( - "Destination. `hf:/` for HuggingFace Hub, " - "`https://...` for a generic HTTPS endpoint, or a local path." - ), - ), - ], - sign: Annotated[ - bool, - typer.Option("--sign", help="Sign the pack with minisign before upload."), - ] = False, - include_exports: Annotated[bool, typer.Option("--include-exports")] = False, - include_base: Annotated[bool, typer.Option("--include-base")] = False, - include_logs: Annotated[bool, typer.Option("--include-logs")] = False, - licensee: Annotated[ - str | None, - typer.Option( - "--i-am-the-licensee", - help="URL ack for --include-base on non-redistributable bases.", - ), - ] = None, -) -> None: - """Upload a .dlm or .dlm.pack to an HF repo, URL endpoint, or local path.""" - from rich.console import Console - - from dlm.share import ShareError, push - from dlm.share.signing import MinisignNotAvailableError - - console = Console(stderr=True) - - try: - result = push( - path, - to, - sign=sign, - include_exports=include_exports, - include_base=include_base, - include_logs=include_logs, - licensee_acceptance_url=licensee, - ) - except MinisignNotAvailableError as exc: - console.print(f"[red]push:[/red] {exc}") - raise typer.Exit(code=1) from exc - except ShareError as exc: - console.print(f"[red]push:[/red] {exc}") - raise typer.Exit(code=1) from exc - - size_mb = result.bytes_sent / (1024 * 1024) - console.print(f"[green]pushed:[/green] {result.destination} ({size_mb:.2f} MB)") - if result.sink_kind.value == "hf": - console.print(f"[dim]install:[/dim] dlm pull {result.destination}") - if result.detail: - console.print(f"[dim]{result.detail}[/dim]") - - def pull_cmd( source: Annotated[ str, diff --git a/src/dlm/cli/commands/push.py 
b/src/dlm/cli/commands/push.py new file mode 100644 index 0000000..0f2e6ac --- /dev/null +++ b/src/dlm/cli/commands/push.py @@ -0,0 +1,68 @@ +"""`dlm push` — upload a .dlm or .dlm.pack to an HF repo, URL, or local path.""" + +from __future__ import annotations + +from pathlib import Path +from typing import Annotated + +import typer + + +def push_cmd( + path: Annotated[Path, typer.Argument(help=".dlm or .dlm.pack to push.")], + to: Annotated[ + str, + typer.Option( + "--to", + help=( + "Destination. `hf:/` for HuggingFace Hub, " + "`https://...` for a generic HTTPS endpoint, or a local path." + ), + ), + ], + sign: Annotated[ + bool, + typer.Option("--sign", help="Sign the pack with minisign before upload."), + ] = False, + include_exports: Annotated[bool, typer.Option("--include-exports")] = False, + include_base: Annotated[bool, typer.Option("--include-base")] = False, + include_logs: Annotated[bool, typer.Option("--include-logs")] = False, + licensee: Annotated[ + str | None, + typer.Option( + "--i-am-the-licensee", + help="URL ack for --include-base on non-redistributable bases.", + ), + ] = None, +) -> None: + """Upload a .dlm or .dlm.pack to an HF repo, URL endpoint, or local path.""" + from rich.console import Console + + from dlm.share import ShareError, push + from dlm.share.signing import MinisignNotAvailableError + + console = Console(stderr=True) + + try: + result = push( + path, + to, + sign=sign, + include_exports=include_exports, + include_base=include_base, + include_logs=include_logs, + licensee_acceptance_url=licensee, + ) + except MinisignNotAvailableError as exc: + console.print(f"[red]push:[/red] {exc}") + raise typer.Exit(code=1) from exc + except ShareError as exc: + console.print(f"[red]push:[/red] {exc}") + raise typer.Exit(code=1) from exc + + size_mb = result.bytes_sent / (1024 * 1024) + console.print(f"[green]pushed:[/green] {result.destination} ({size_mb:.2f} MB)") + if result.sink_kind.value == "hf": + 
console.print(f"[dim]install:[/dim] dlm pull {result.destination}") + if result.detail: + console.print(f"[dim]{result.detail}[/dim]") From cf75e7629f79193fbfd5ecddb58816bc360bfb38 Mon Sep 17 00:00:00 2001 From: mfwolffe Date: Tue, 28 Apr 2026 18:45:47 -0400 Subject: [PATCH 12/46] Extract pull_cmd into cli/commands/pull.py --- src/dlm/cli/commands/__init__.py | 56 +--------------------------- src/dlm/cli/commands/pull.py | 63 ++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+), 55 deletions(-) create mode 100644 src/dlm/cli/commands/pull.py diff --git a/src/dlm/cli/commands/__init__.py b/src/dlm/cli/commands/__init__.py index 63af34f..39464e4 100644 --- a/src/dlm/cli/commands/__init__.py +++ b/src/dlm/cli/commands/__init__.py @@ -22,6 +22,7 @@ from dlm.cli.commands.doctor import doctor_cmd as doctor_cmd from dlm.cli.commands.migrate import migrate_cmd as migrate_cmd from dlm.cli.commands.pack import pack_cmd as pack_cmd +from dlm.cli.commands.pull import pull_cmd as pull_cmd from dlm.cli.commands.push import push_cmd as push_cmd from dlm.cli.commands.repl import repl_cmd as repl_cmd from dlm.cli.commands.templates import templates_list_cmd as templates_list_cmd @@ -2870,61 +2871,6 @@ def _render_training_sources_text(console: object, records: list[dict[str, objec console.print(f" {path} {fc} file(s), {_human_size(size)}") -def pull_cmd( - source: Annotated[ - str, - typer.Argument( - help=( - "Source: `hf:/`, `https://...`, " - "`peer://host:port/?token=...`, or a local path." 
- ) - ), - ], - out: Annotated[ - Path | None, - typer.Option("--out", help="Directory for the restored .dlm (default: CWD)."), - ] = None, - force: Annotated[ - bool, - typer.Option("--force", help="Overwrite an existing store with the same dlm_id."), - ] = False, -) -> None: - """Download + verify + unpack a .dlm.pack from a remote source.""" - from rich.console import Console - - from dlm.pack.errors import PackError - from dlm.share import ShareError, pull - from dlm.share.signing import VerifyStatus - - console = Console(stderr=True) - - try: - result = pull(source, out_dir=out, force=force) - except ShareError as exc: - console.print(f"[red]pull:[/red] {exc}") - raise typer.Exit(code=1) from exc - except PackError as exc: - console.print(f"[red]pull:[/red] pack integrity: {exc}") - raise typer.Exit(code=1) from exc - - size_mb = result.bytes_received / (1024 * 1024) - console.print(f"[green]pulled:[/green] {result.source} → {result.dlm_path} ({size_mb:.2f} MB)") - - status = result.verification.status - if status == VerifyStatus.VERIFIED: - console.print( - f"[green]verified:[/green] signature matches " - f"[bold]{result.verification.key_path}[/bold]" - ) - elif status == VerifyStatus.UNVERIFIED: - console.print( - f"[yellow]unverified:[/yellow] signature present but " - f"not matched ({result.verification.detail}); sha256 still validated" - ) - else: - console.print("[dim]unsigned[/dim] (sha256 integrity still validated)") - - def serve_cmd( path: Annotated[Path, typer.Argument(help=".dlm file to serve.")], port: Annotated[int, typer.Option("--port")] = 7337, diff --git a/src/dlm/cli/commands/pull.py b/src/dlm/cli/commands/pull.py new file mode 100644 index 0000000..62ccee8 --- /dev/null +++ b/src/dlm/cli/commands/pull.py @@ -0,0 +1,63 @@ +"""`dlm pull` — download + verify + unpack a .dlm.pack from a remote source.""" + +from __future__ import annotations + +from pathlib import Path +from typing import Annotated + +import typer + + +def pull_cmd( + source: 
Annotated[ + str, + typer.Argument( + help=( + "Source: `hf:/`, `https://...`, " + "`peer://host:port/?token=...`, or a local path." + ) + ), + ], + out: Annotated[ + Path | None, + typer.Option("--out", help="Directory for the restored .dlm (default: CWD)."), + ] = None, + force: Annotated[ + bool, + typer.Option("--force", help="Overwrite an existing store with the same dlm_id."), + ] = False, +) -> None: + """Download + verify + unpack a .dlm.pack from a remote source.""" + from rich.console import Console + + from dlm.pack.errors import PackError + from dlm.share import ShareError, pull + from dlm.share.signing import VerifyStatus + + console = Console(stderr=True) + + try: + result = pull(source, out_dir=out, force=force) + except ShareError as exc: + console.print(f"[red]pull:[/red] {exc}") + raise typer.Exit(code=1) from exc + except PackError as exc: + console.print(f"[red]pull:[/red] pack integrity: {exc}") + raise typer.Exit(code=1) from exc + + size_mb = result.bytes_received / (1024 * 1024) + console.print(f"[green]pulled:[/green] {result.source} → {result.dlm_path} ({size_mb:.2f} MB)") + + status = result.verification.status + if status == VerifyStatus.VERIFIED: + console.print( + f"[green]verified:[/green] signature matches " + f"[bold]{result.verification.key_path}[/bold]" + ) + elif status == VerifyStatus.UNVERIFIED: + console.print( + f"[yellow]unverified:[/yellow] signature present but " + f"not matched ({result.verification.detail}); sha256 still validated" + ) + else: + console.print("[dim]unsigned[/dim] (sha256 integrity still validated)") From 95ad10e61ba87c2c485ed27546d9592eb0dae5c2 Mon Sep 17 00:00:00 2001 From: mfwolffe Date: Tue, 28 Apr 2026 18:46:35 -0400 Subject: [PATCH 13/46] Extract serve_cmd into cli/commands/serve.py --- src/dlm/cli/commands/__init__.py | 87 +---------------------------- src/dlm/cli/commands/serve.py | 94 ++++++++++++++++++++++++++++++++ 2 files changed, 95 insertions(+), 86 deletions(-) create mode 100644 
src/dlm/cli/commands/serve.py diff --git a/src/dlm/cli/commands/__init__.py b/src/dlm/cli/commands/__init__.py index 39464e4..4016ae7 100644 --- a/src/dlm/cli/commands/__init__.py +++ b/src/dlm/cli/commands/__init__.py @@ -25,6 +25,7 @@ from dlm.cli.commands.pull import pull_cmd as pull_cmd from dlm.cli.commands.push import push_cmd as push_cmd from dlm.cli.commands.repl import repl_cmd as repl_cmd +from dlm.cli.commands.serve import serve_cmd as serve_cmd from dlm.cli.commands.templates import templates_list_cmd as templates_list_cmd from dlm.cli.commands.unpack import unpack_cmd as unpack_cmd from dlm.cli.commands.verify import verify_cmd as verify_cmd @@ -2871,92 +2872,6 @@ def _render_training_sources_text(console: object, records: list[dict[str, objec console.print(f" {path} {fc} file(s), {_human_size(size)}") -def serve_cmd( - path: Annotated[Path, typer.Argument(help=".dlm file to serve.")], - port: Annotated[int, typer.Option("--port")] = 7337, - public: Annotated[ - bool, - typer.Option( - "--public", - help="Bind 0.0.0.0 (requires --i-know-this-is-public); otherwise 127.0.0.1.", - ), - ] = False, - i_know_public: Annotated[ - bool, - typer.Option( - "--i-know-this-is-public", - help="Confirm binding 0.0.0.0 is safe on this network.", - ), - ] = False, - max_concurrency: Annotated[ - int, - typer.Option("--max-concurrency", help="Max concurrent connections per token."), - ] = 4, - rate_limit: Annotated[ - int, - typer.Option("--rate-limit", help="Max requests per minute per token."), - ] = 30, - token_ttl_minutes: Annotated[ - int, typer.Option("--token-ttl-minutes", help="Token lifetime in minutes.") - ] = 15, -) -> None: - """Serve a .dlm's pack over LAN for peers to pull.""" - from rich.console import Console - - from dlm.doc.parser import parse_file - from dlm.pack.packer import pack as pack_fn - from dlm.share import ServeOptions, serve - from dlm.store.paths import for_dlm - - console = Console(stderr=True) - - parsed = parse_file(path) - dlm_id = 
parsed.frontmatter.dlm_id - - # pack() calls load_manifest(), which crashes with an unhelpful - # "store manifest corrupt" error on a .dlm that's never been - # trained. Surface the true cause instead. - store = for_dlm(dlm_id) - if not store.manifest.exists(): - console.print( - f"[red]serve:[/red] no training state for {dlm_id} — run [bold]dlm train[/bold] first." - ) - raise typer.Exit(code=1) - - # Pack into a temp file that lives as long as the server does. - import tempfile - - tmp_dir = Path(tempfile.mkdtemp(prefix="dlm-serve-")) - tmp_pack = tmp_dir / f"{path.stem}.dlm.pack" - pack_fn(path, out=tmp_pack) - console.print(f"[dim]packed:[/dim] {tmp_pack} ({tmp_pack.stat().st_size} bytes)") - - opts = ServeOptions( - port=port, - public=public, - i_know_this_is_public=i_know_public, - max_concurrency=max_concurrency, - rate_limit_per_min=rate_limit, - token_ttl_seconds=token_ttl_minutes * 60, - ) - handle = serve(dlm_id, tmp_pack, opts) - - console.print( - f"[green]serving:[/green] {path.name} (dlm_id {dlm_id}) on " - f"[bold]http://{handle.bind_host}:{handle.port}/{dlm_id}[/bold]" - ) - console.print(f"[bold]peer URL:[/bold] {handle.peer_url}") - console.print(f"[dim]token valid for {token_ttl_minutes} min. 
Ctrl-C to stop.[/dim]") - - try: - handle.wait_shutdown() - finally: - import shutil - - shutil.rmtree(tmp_dir, ignore_errors=True) - console.print("[dim]stopped.[/dim]") - - # ---- Cache Commands -------------------------------------------------- diff --git a/src/dlm/cli/commands/serve.py b/src/dlm/cli/commands/serve.py new file mode 100644 index 0000000..7ffb60e --- /dev/null +++ b/src/dlm/cli/commands/serve.py @@ -0,0 +1,94 @@ +"""`dlm serve` — serve a .dlm's pack over LAN for peers to pull.""" + +from __future__ import annotations + +from pathlib import Path +from typing import Annotated + +import typer + + +def serve_cmd( + path: Annotated[Path, typer.Argument(help=".dlm file to serve.")], + port: Annotated[int, typer.Option("--port")] = 7337, + public: Annotated[ + bool, + typer.Option( + "--public", + help="Bind 0.0.0.0 (requires --i-know-this-is-public); otherwise 127.0.0.1.", + ), + ] = False, + i_know_public: Annotated[ + bool, + typer.Option( + "--i-know-this-is-public", + help="Confirm binding 0.0.0.0 is safe on this network.", + ), + ] = False, + max_concurrency: Annotated[ + int, + typer.Option("--max-concurrency", help="Max concurrent connections per token."), + ] = 4, + rate_limit: Annotated[ + int, + typer.Option("--rate-limit", help="Max requests per minute per token."), + ] = 30, + token_ttl_minutes: Annotated[ + int, typer.Option("--token-ttl-minutes", help="Token lifetime in minutes.") + ] = 15, +) -> None: + """Serve a .dlm's pack over LAN for peers to pull.""" + from rich.console import Console + + from dlm.doc.parser import parse_file + from dlm.pack.packer import pack as pack_fn + from dlm.share import ServeOptions, serve + from dlm.store.paths import for_dlm + + console = Console(stderr=True) + + parsed = parse_file(path) + dlm_id = parsed.frontmatter.dlm_id + + # pack() calls load_manifest(), which crashes with an unhelpful + # "store manifest corrupt" error on a .dlm that's never been + # trained. Surface the true cause instead. 
+ store = for_dlm(dlm_id) + if not store.manifest.exists(): + console.print( + f"[red]serve:[/red] no training state for {dlm_id} — run [bold]dlm train[/bold] first." + ) + raise typer.Exit(code=1) + + # Pack into a temp file that lives as long as the server does. + import tempfile + + tmp_dir = Path(tempfile.mkdtemp(prefix="dlm-serve-")) + tmp_pack = tmp_dir / f"{path.stem}.dlm.pack" + pack_fn(path, out=tmp_pack) + console.print(f"[dim]packed:[/dim] {tmp_pack} ({tmp_pack.stat().st_size} bytes)") + + opts = ServeOptions( + port=port, + public=public, + i_know_this_is_public=i_know_public, + max_concurrency=max_concurrency, + rate_limit_per_min=rate_limit, + token_ttl_seconds=token_ttl_minutes * 60, + ) + handle = serve(dlm_id, tmp_pack, opts) + + console.print( + f"[green]serving:[/green] {path.name} (dlm_id {dlm_id}) on " + f"[bold]http://{handle.bind_host}:{handle.port}/{dlm_id}[/bold]" + ) + console.print(f"[bold]peer URL:[/bold] {handle.peer_url}") + console.print(f"[dim]token valid for {token_ttl_minutes} min. 
Ctrl-C to stop.[/dim]") + + try: + handle.wait_shutdown() + finally: + import shutil + + shutil.rmtree(tmp_dir, ignore_errors=True) + console.print("[dim]stopped.[/dim]") From fd7b5e6e569a469cab3f2128a6f7c742b1186cb4 Mon Sep 17 00:00:00 2001 From: mfwolffe Date: Tue, 28 Apr 2026 19:00:38 -0400 Subject: [PATCH 14/46] Extract cache_show/prune/clear into cli/commands/cache.py --- src/dlm/cli/commands/__init__.py | 174 +----------------------------- src/dlm/cli/commands/cache.py | 178 +++++++++++++++++++++++++++++++ 2 files changed, 181 insertions(+), 171 deletions(-) create mode 100644 src/dlm/cli/commands/cache.py diff --git a/src/dlm/cli/commands/__init__.py b/src/dlm/cli/commands/__init__.py index 4016ae7..0a6a4ce 100644 --- a/src/dlm/cli/commands/__init__.py +++ b/src/dlm/cli/commands/__init__.py @@ -19,6 +19,9 @@ from dlm.cli.commands._shared import _human_size as _human_size from dlm.cli.commands._shared import _previously_accepted as _previously_accepted +from dlm.cli.commands.cache import cache_clear_cmd as cache_clear_cmd +from dlm.cli.commands.cache import cache_prune_cmd as cache_prune_cmd +from dlm.cli.commands.cache import cache_show_cmd as cache_show_cmd from dlm.cli.commands.doctor import doctor_cmd as doctor_cmd from dlm.cli.commands.migrate import migrate_cmd as migrate_cmd from dlm.cli.commands.pack import pack_cmd as pack_cmd @@ -2872,177 +2875,6 @@ def _render_training_sources_text(console: object, records: list[dict[str, objec console.print(f" {path} {fc} file(s), {_human_size(size)}") -# ---- Cache Commands -------------------------------------------------- - - -def cache_show_cmd( - path: Annotated[Path, typer.Argument(help=".dlm file to inspect the cache for.")], - json_out: Annotated[bool, typer.Option("--json", help="Emit machine-readable JSON.")] = False, -) -> None: - """Show tokenized-section cache size, entry count, last-run hit rate.""" - import json as _json - import sys as _sys - - from rich.console import Console - - from 
dlm.directives.cache import TokenizedCache - from dlm.doc.errors import DlmParseError - from dlm.doc.parser import parse_file - from dlm.metrics import queries as _queries - from dlm.store.paths import for_dlm - - console = Console(stderr=True) - out_console = Console() - - try: - parsed = parse_file(path) - except (DlmParseError, OSError) as exc: - console.print(f"[red]cache:[/red] {exc}") - raise typer.Exit(code=1) from exc - - store = for_dlm(parsed.frontmatter.dlm_id) - cache = TokenizedCache.open(store.tokenized_cache_dir) - last = _queries.latest_tokenization(store.root) - - payload: dict[str, object] = { - "dlm_id": parsed.frontmatter.dlm_id, - "cache_path": str(store.tokenized_cache_dir), - "entry_count": cache.entry_count, - "bytes": cache.total_bytes, - "last_run_hit_rate": last.hit_rate if last else None, - "last_run_id": last.run_id if last else None, - } - if json_out: - _sys.stdout.write(_json.dumps(payload, indent=2) + "\n") - return - - out_console.print(f"[bold]Cache for {parsed.frontmatter.dlm_id}[/bold]") - out_console.print(f" path: {store.tokenized_cache_dir}") - out_console.print(f" entries: {cache.entry_count}") - out_console.print(f" size: {_human_size(cache.total_bytes)}") - if last is not None: - out_console.print( - f" last-run hit rate: {last.hit_rate:.1%} " - f"({last.cache_hits}/{last.cache_hits + last.cache_misses})" - ) - else: - out_console.print(" last-run hit rate: [dim]no tokenization runs yet[/dim]") - - -def cache_prune_cmd( - path: Annotated[Path, typer.Argument(help=".dlm file to prune the cache for.")], - older_than: Annotated[ - str | None, - typer.Option( - "--older-than", - help=( - "Drop entries not accessed in this duration. " - "Format: `30d`, `12h`, `45m`. When omitted, defaults to " - "the document's `training.cache.prune_older_than_days` " - "(90d pre-v9 docs inherit)." 
- ), - ), - ] = None, -) -> None: - """Remove tokenized-cache entries not accessed within a cutoff.""" - from rich.console import Console - - from dlm.directives.cache import TokenizedCache - from dlm.doc.errors import DlmParseError - from dlm.doc.parser import parse_file - from dlm.store.paths import for_dlm - - console = Console(stderr=True) - - # Parse the doc first — we need it either way (for dlm_id) AND - # for the frontmatter default when --older-than is absent. - try: - parsed = parse_file(path) - except (DlmParseError, OSError) as exc: - console.print(f"[red]cache:[/red] {exc}") - raise typer.Exit(code=1) from exc - - if older_than is not None: - seconds = _parse_duration(older_than) - if seconds is None: - console.print( - f"[red]cache:[/red] invalid --older-than {older_than!r} " - "(expected e.g. 30d, 12h, 45m)" - ) - raise typer.Exit(code=2) - cutoff_label = older_than - else: - # Fall back to the frontmatter's per-doc default. Pre-v9 docs - # get the CacheConfig default of 90 days via the Pydantic - # factory on parse. 
- days = parsed.frontmatter.training.cache.prune_older_than_days - seconds = float(days) * 86400.0 - cutoff_label = f"{days}d" - - store = for_dlm(parsed.frontmatter.dlm_id) - cache = TokenizedCache.open(store.tokenized_cache_dir) - removed = cache.prune(older_than_seconds=seconds) - cache.save_manifest() - console.print(f"[green]cache:[/green] pruned {removed} entr(y/ies) older than {cutoff_label}") - - -def cache_clear_cmd( - path: Annotated[Path, typer.Argument(help=".dlm file to wipe the cache for.")], - force: Annotated[ - bool, - typer.Option("--force", help="Skip the confirmation prompt."), - ] = False, -) -> None: - """Wipe every entry in the tokenized-section cache for this store.""" - from rich.console import Console - - from dlm.directives.cache import TokenizedCache - from dlm.doc.errors import DlmParseError - from dlm.doc.parser import parse_file - from dlm.store.paths import for_dlm - - console = Console(stderr=True) - - try: - parsed = parse_file(path) - except (DlmParseError, OSError) as exc: - console.print(f"[red]cache:[/red] {exc}") - raise typer.Exit(code=1) from exc - - store = for_dlm(parsed.frontmatter.dlm_id) - cache = TokenizedCache.open(store.tokenized_cache_dir) - - if not force and cache.entry_count > 0: - confirmed = typer.confirm( - f"wipe {cache.entry_count} entries ({_human_size(cache.total_bytes)})?" - ) - if not confirmed: - console.print("[yellow]cache:[/yellow] clear cancelled") - raise typer.Exit(code=0) - - removed = cache.clear() - cache.save_manifest() - console.print(f"[green]cache:[/green] cleared {removed} entr(y/ies)") - - -def _parse_duration(spec: str) -> float | None: - """Parse a duration like `30d`, `12h`, `45m` → seconds. 
None on - malformed input.""" - if not spec or not spec[:-1].isdigit(): - return None - n = int(spec[:-1]) - unit = spec[-1].lower() - if unit == "s": - return float(n) - if unit == "m": - return float(n) * 60 - if unit == "h": - return float(n) * 3600 - if unit == "d": - return float(n) * 86400 - return None - - # --- preference ----------------------------------------------------------- diff --git a/src/dlm/cli/commands/cache.py b/src/dlm/cli/commands/cache.py new file mode 100644 index 0000000..7dc0296 --- /dev/null +++ b/src/dlm/cli/commands/cache.py @@ -0,0 +1,178 @@ +"""`dlm cache` — show / prune / clear the tokenized-section cache.""" + +from __future__ import annotations + +from pathlib import Path +from typing import Annotated + +import typer + +from dlm.cli.commands._shared import _human_size + + +def cache_show_cmd( + path: Annotated[Path, typer.Argument(help=".dlm file to inspect the cache for.")], + json_out: Annotated[bool, typer.Option("--json", help="Emit machine-readable JSON.")] = False, +) -> None: + """Show tokenized-section cache size, entry count, last-run hit rate.""" + import json as _json + import sys as _sys + + from rich.console import Console + + from dlm.directives.cache import TokenizedCache + from dlm.doc.errors import DlmParseError + from dlm.doc.parser import parse_file + from dlm.metrics import queries as _queries + from dlm.store.paths import for_dlm + + console = Console(stderr=True) + out_console = Console() + + try: + parsed = parse_file(path) + except (DlmParseError, OSError) as exc: + console.print(f"[red]cache:[/red] {exc}") + raise typer.Exit(code=1) from exc + + store = for_dlm(parsed.frontmatter.dlm_id) + cache = TokenizedCache.open(store.tokenized_cache_dir) + last = _queries.latest_tokenization(store.root) + + payload: dict[str, object] = { + "dlm_id": parsed.frontmatter.dlm_id, + "cache_path": str(store.tokenized_cache_dir), + "entry_count": cache.entry_count, + "bytes": cache.total_bytes, + "last_run_hit_rate": 
last.hit_rate if last else None, + "last_run_id": last.run_id if last else None, + } + if json_out: + _sys.stdout.write(_json.dumps(payload, indent=2) + "\n") + return + + out_console.print(f"[bold]Cache for {parsed.frontmatter.dlm_id}[/bold]") + out_console.print(f" path: {store.tokenized_cache_dir}") + out_console.print(f" entries: {cache.entry_count}") + out_console.print(f" size: {_human_size(cache.total_bytes)}") + if last is not None: + out_console.print( + f" last-run hit rate: {last.hit_rate:.1%} " + f"({last.cache_hits}/{last.cache_hits + last.cache_misses})" + ) + else: + out_console.print(" last-run hit rate: [dim]no tokenization runs yet[/dim]") + + +def cache_prune_cmd( + path: Annotated[Path, typer.Argument(help=".dlm file to prune the cache for.")], + older_than: Annotated[ + str | None, + typer.Option( + "--older-than", + help=( + "Drop entries not accessed in this duration. " + "Format: `30d`, `12h`, `45m`. When omitted, defaults to " + "the document's `training.cache.prune_older_than_days` " + "(90d pre-v9 docs inherit)." + ), + ), + ] = None, +) -> None: + """Remove tokenized-cache entries not accessed within a cutoff.""" + from rich.console import Console + + from dlm.directives.cache import TokenizedCache + from dlm.doc.errors import DlmParseError + from dlm.doc.parser import parse_file + from dlm.store.paths import for_dlm + + console = Console(stderr=True) + + # Parse the doc first — we need it either way (for dlm_id) AND + # for the frontmatter default when --older-than is absent. + try: + parsed = parse_file(path) + except (DlmParseError, OSError) as exc: + console.print(f"[red]cache:[/red] {exc}") + raise typer.Exit(code=1) from exc + + if older_than is not None: + seconds = _parse_duration(older_than) + if seconds is None: + console.print( + f"[red]cache:[/red] invalid --older-than {older_than!r} " + "(expected e.g. 
30d, 12h, 45m)" + ) + raise typer.Exit(code=2) + cutoff_label = older_than + else: + # Fall back to the frontmatter's per-doc default. Pre-v9 docs + # get the CacheConfig default of 90 days via the Pydantic + # factory on parse. + days = parsed.frontmatter.training.cache.prune_older_than_days + seconds = float(days) * 86400.0 + cutoff_label = f"{days}d" + + store = for_dlm(parsed.frontmatter.dlm_id) + cache = TokenizedCache.open(store.tokenized_cache_dir) + removed = cache.prune(older_than_seconds=seconds) + cache.save_manifest() + console.print(f"[green]cache:[/green] pruned {removed} entr(y/ies) older than {cutoff_label}") + + +def cache_clear_cmd( + path: Annotated[Path, typer.Argument(help=".dlm file to wipe the cache for.")], + force: Annotated[ + bool, + typer.Option("--force", help="Skip the confirmation prompt."), + ] = False, +) -> None: + """Wipe every entry in the tokenized-section cache for this store.""" + from rich.console import Console + + from dlm.directives.cache import TokenizedCache + from dlm.doc.errors import DlmParseError + from dlm.doc.parser import parse_file + from dlm.store.paths import for_dlm + + console = Console(stderr=True) + + try: + parsed = parse_file(path) + except (DlmParseError, OSError) as exc: + console.print(f"[red]cache:[/red] {exc}") + raise typer.Exit(code=1) from exc + + store = for_dlm(parsed.frontmatter.dlm_id) + cache = TokenizedCache.open(store.tokenized_cache_dir) + + if not force and cache.entry_count > 0: + confirmed = typer.confirm( + f"wipe {cache.entry_count} entries ({_human_size(cache.total_bytes)})?" + ) + if not confirmed: + console.print("[yellow]cache:[/yellow] clear cancelled") + raise typer.Exit(code=0) + + removed = cache.clear() + cache.save_manifest() + console.print(f"[green]cache:[/green] cleared {removed} entr(y/ies)") + + +def _parse_duration(spec: str) -> float | None: + """Parse a duration like `30d`, `12h`, `45m` → seconds. 
None on + malformed input.""" + if not spec or not spec[:-1].isdigit(): + return None + n = int(spec[:-1]) + unit = spec[-1].lower() + if unit == "s": + return float(n) + if unit == "m": + return float(n) * 60 + if unit == "h": + return float(n) * 3600 + if unit == "d": + return float(n) * 86400 + return None From 340df56fd3a145bd35e7b9cd5f6f1064cfb8fed2 Mon Sep 17 00:00:00 2001 From: mfwolffe Date: Tue, 28 Apr 2026 19:01:07 -0400 Subject: [PATCH 15/46] Re-export _parse_duration for test compatibility --- src/dlm/cli/commands/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/dlm/cli/commands/__init__.py b/src/dlm/cli/commands/__init__.py index 0a6a4ce..0466b64 100644 --- a/src/dlm/cli/commands/__init__.py +++ b/src/dlm/cli/commands/__init__.py @@ -19,6 +19,7 @@ from dlm.cli.commands._shared import _human_size as _human_size from dlm.cli.commands._shared import _previously_accepted as _previously_accepted +from dlm.cli.commands.cache import _parse_duration as _parse_duration from dlm.cli.commands.cache import cache_clear_cmd as cache_clear_cmd from dlm.cli.commands.cache import cache_prune_cmd as cache_prune_cmd from dlm.cli.commands.cache import cache_show_cmd as cache_show_cmd From 7598f931eb1b5081842a4fcdba0f577f08fdb024 Mon Sep 17 00:00:00 2001 From: mfwolffe Date: Tue, 28 Apr 2026 19:02:45 -0400 Subject: [PATCH 16/46] Extract harvest_cmd into cli/commands/harvest.py --- src/dlm/cli/commands/__init__.py | 140 +---------------------------- src/dlm/cli/commands/harvest.py | 149 +++++++++++++++++++++++++++++++ 2 files changed, 150 insertions(+), 139 deletions(-) create mode 100644 src/dlm/cli/commands/harvest.py diff --git a/src/dlm/cli/commands/__init__.py b/src/dlm/cli/commands/__init__.py index 0466b64..5ed2edf 100644 --- a/src/dlm/cli/commands/__init__.py +++ b/src/dlm/cli/commands/__init__.py @@ -24,6 +24,7 @@ from dlm.cli.commands.cache import cache_prune_cmd as cache_prune_cmd from dlm.cli.commands.cache import cache_show_cmd as 
cache_show_cmd from dlm.cli.commands.doctor import doctor_cmd as doctor_cmd +from dlm.cli.commands.harvest import harvest_cmd as harvest_cmd from dlm.cli.commands.migrate import migrate_cmd as migrate_cmd from dlm.cli.commands.pack import pack_cmd as pack_cmd from dlm.cli.commands.pull import pull_cmd as pull_cmd @@ -3698,142 +3699,3 @@ def _synth_prompt_summary(content: str, *, section_id: str) -> str: # --- harvest -------------------------------------------------------------- -def harvest_cmd( - path: Annotated[Path, typer.Argument(help=".dlm file to harvest into.")], - sway_json: Annotated[ - Path | None, - typer.Option( - "--sway-json", - help="Path to a sway JSON report. Required unless --revert is set.", - ), - ] = None, - apply: Annotated[ - bool, - typer.Option( - "--apply", - help="Write harvested sections to the .dlm. Default is dry-run (review only).", - ), - ] = False, - tag: Annotated[ - str, - typer.Option( - "--tag", - help="Prefix for the synthesized section's harvest_source metadata.", - ), - ] = "auto-harvest", - min_confidence: Annotated[ - float, - typer.Option( - "--min-confidence", - help="Drop candidates whose sway evidence.confidence is below this.", - min=0.0, - max=1.0, - ), - ] = 0.0, - strict: Annotated[ - bool, - typer.Option( - "--strict/--lax", - help=( - "Strict (default): refuse if any failing probe lacks a " - "reference. Lax: log a warning and skip those probes." - ), - ), - ] = True, - revert: Annotated[ - bool, - typer.Option( - "--revert", - help=( - "Strip every auto-harvested section from the document. " - "Mutually exclusive with --sway-json / --apply." - ), - ), - ] = False, -) -> None: - """Adversarial replay: harvest failing sway probes back into the .dlm. - - Default mode is `--dry-run`-style preview; pass `--apply` to write. 
- """ - from rich.console import Console - - from dlm.doc.errors import DlmParseError - from dlm.doc.parser import parse_file - from dlm.harvest import ( - HarvestError, - MalformedSwayReportError, - NoReferenceError, - apply_plan, - build_plan, - read_sway_report, - render_plan, - revert_all_auto_harvests, - ) - - console = Console(stderr=True) - out_console = Console() - - if revert and (sway_json is not None or apply): - console.print( - "[red]harvest:[/red] --revert is mutually exclusive with --sway-json / --apply" - ) - raise typer.Exit(code=1) - if not revert and sway_json is None: - console.print( - "[red]harvest:[/red] --sway-json is required (or pass --revert " - "to strip auto-harvested sections)" - ) - raise typer.Exit(code=1) - - try: - parsed = parse_file(path) - except (DlmParseError, OSError) as exc: - console.print(f"[red]harvest:[/red] {exc}") - raise typer.Exit(code=1) from exc - - if revert: - summary = revert_all_auto_harvests(parsed, target=path) - out_console.print( - f"[green]harvest:[/green] stripped {len(summary.added_section_ids)} " - f"auto-harvested section(s) from {path} (all harvest runs, not just last)" - ) - return - - assert sway_json is not None # narrowed by the check above - try: - candidates = read_sway_report( - sway_json, - strict=strict, - min_confidence=min_confidence, - ) - except MalformedSwayReportError as exc: - console.print(f"[red]harvest:[/red] {exc}") - raise typer.Exit(code=1) from exc - except NoReferenceError as exc: - console.print(f"[red]harvest:[/red] {exc}") - console.print(" Pass [bold]--lax[/bold] to skip probes without references instead.") - raise typer.Exit(code=1) from exc - except HarvestError as exc: - console.print(f"[red]harvest:[/red] {exc}") - raise typer.Exit(code=1) from exc - - plan = build_plan(parsed, candidates, tag=tag) - out_console.print(render_plan(plan)) - - if not plan.additions: - out_console.print( - "\n[yellow]no candidates to harvest[/yellow] — either the sway " - "report had no 
failing probes with references, or all matched " - "sections already exist in the document." - ) - raise typer.Exit(code=2) - - if not apply: - out_console.print("\n[dim]dry-run — re-run with [bold]--apply[/bold] to write.[/dim]") - return - - summary = apply_plan(parsed, plan, target=path) - out_console.print( - f"\n[green]harvest:[/green] wrote {summary.added} section(s) to {path} " - f"({summary.skipped} skipped)" - ) diff --git a/src/dlm/cli/commands/harvest.py b/src/dlm/cli/commands/harvest.py new file mode 100644 index 0000000..d284423 --- /dev/null +++ b/src/dlm/cli/commands/harvest.py @@ -0,0 +1,149 @@ +"""`dlm harvest` — adversarial replay: harvest failing sway probes into a .dlm.""" + +from __future__ import annotations + +from pathlib import Path +from typing import Annotated + +import typer + + +def harvest_cmd( + path: Annotated[Path, typer.Argument(help=".dlm file to harvest into.")], + sway_json: Annotated[ + Path | None, + typer.Option( + "--sway-json", + help="Path to a sway JSON report. Required unless --revert is set.", + ), + ] = None, + apply: Annotated[ + bool, + typer.Option( + "--apply", + help="Write harvested sections to the .dlm. Default is dry-run (review only).", + ), + ] = False, + tag: Annotated[ + str, + typer.Option( + "--tag", + help="Prefix for the synthesized section's harvest_source metadata.", + ), + ] = "auto-harvest", + min_confidence: Annotated[ + float, + typer.Option( + "--min-confidence", + help="Drop candidates whose sway evidence.confidence is below this.", + min=0.0, + max=1.0, + ), + ] = 0.0, + strict: Annotated[ + bool, + typer.Option( + "--strict/--lax", + help=( + "Strict (default): refuse if any failing probe lacks a " + "reference. Lax: log a warning and skip those probes." + ), + ), + ] = True, + revert: Annotated[ + bool, + typer.Option( + "--revert", + help=( + "Strip every auto-harvested section from the document. " + "Mutually exclusive with --sway-json / --apply." 
+ ), + ), + ] = False, +) -> None: + """Adversarial replay: harvest failing sway probes back into the .dlm. + + Default mode is `--dry-run`-style preview; pass `--apply` to write. + """ + from rich.console import Console + + from dlm.doc.errors import DlmParseError + from dlm.doc.parser import parse_file + from dlm.harvest import ( + HarvestError, + MalformedSwayReportError, + NoReferenceError, + apply_plan, + build_plan, + read_sway_report, + render_plan, + revert_all_auto_harvests, + ) + + console = Console(stderr=True) + out_console = Console() + + if revert and (sway_json is not None or apply): + console.print( + "[red]harvest:[/red] --revert is mutually exclusive with --sway-json / --apply" + ) + raise typer.Exit(code=1) + if not revert and sway_json is None: + console.print( + "[red]harvest:[/red] --sway-json is required (or pass --revert " + "to strip auto-harvested sections)" + ) + raise typer.Exit(code=1) + + try: + parsed = parse_file(path) + except (DlmParseError, OSError) as exc: + console.print(f"[red]harvest:[/red] {exc}") + raise typer.Exit(code=1) from exc + + if revert: + summary = revert_all_auto_harvests(parsed, target=path) + out_console.print( + f"[green]harvest:[/green] stripped {len(summary.added_section_ids)} " + f"auto-harvested section(s) from {path} (all harvest runs, not just last)" + ) + return + + assert sway_json is not None # narrowed by the check above + try: + candidates = read_sway_report( + sway_json, + strict=strict, + min_confidence=min_confidence, + ) + except MalformedSwayReportError as exc: + console.print(f"[red]harvest:[/red] {exc}") + raise typer.Exit(code=1) from exc + except NoReferenceError as exc: + console.print(f"[red]harvest:[/red] {exc}") + console.print(" Pass [bold]--lax[/bold] to skip probes without references instead.") + raise typer.Exit(code=1) from exc + except HarvestError as exc: + console.print(f"[red]harvest:[/red] {exc}") + raise typer.Exit(code=1) from exc + + plan = build_plan(parsed, candidates, 
tag=tag) + out_console.print(render_plan(plan)) + + if not plan.additions: + out_console.print( + "\n[yellow]no candidates to harvest[/yellow] — either the sway " + "report had no failing probes with references, or all matched " + "sections already exist in the document." + ) + raise typer.Exit(code=2) + + if not apply: + out_console.print("\n[dim]dry-run — re-run with [bold]--apply[/bold] to write.[/dim]") + return + + summary = apply_plan(parsed, plan, target=path) + out_console.print( + f"\n[green]harvest:[/green] wrote {summary.added} section(s) to {path} " + f"({summary.skipped} skipped)" + ) From 125f56eb17e0ee15ef7644d6b76793b3c1209007 Mon Sep 17 00:00:00 2001 From: mfwolffe Date: Tue, 28 Apr 2026 19:05:15 -0400 Subject: [PATCH 17/46] Extract synth_instructions/revert/list into cli/commands/synth.py --- src/dlm/cli/commands/__init__.py | 379 +------------------------------ src/dlm/cli/commands/synth.py | 377 ++++++++++++++++++++++++++++++ 2 files changed, 381 insertions(+), 375 deletions(-) create mode 100644 src/dlm/cli/commands/synth.py diff --git a/src/dlm/cli/commands/__init__.py b/src/dlm/cli/commands/__init__.py index 5ed2edf..0b2d9d3 100644 --- a/src/dlm/cli/commands/__init__.py +++ b/src/dlm/cli/commands/__init__.py @@ -13,7 +13,7 @@ import os from collections.abc import Sequence from pathlib import Path -from typing import TYPE_CHECKING, Annotated, Any, Literal, cast +from typing import TYPE_CHECKING, Annotated, Any, Literal import typer @@ -31,6 +31,9 @@ from dlm.cli.commands.push import push_cmd as push_cmd from dlm.cli.commands.repl import repl_cmd as repl_cmd from dlm.cli.commands.serve import serve_cmd as serve_cmd +from dlm.cli.commands.synth import synth_instructions_cmd as synth_instructions_cmd +from dlm.cli.commands.synth import synth_list_cmd as synth_list_cmd +from dlm.cli.commands.synth import synth_revert_cmd as synth_revert_cmd from dlm.cli.commands.templates import templates_list_cmd as templates_list_cmd from dlm.cli.commands.unpack 
import unpack_cmd as unpack_cmd from dlm.cli.commands.verify import verify_cmd as verify_cmd @@ -3325,377 +3328,3 @@ def _preference_prompt_summary(content: str, *, section_id: str) -> str: return prompt or "" -# --- synth ----------------------------------------------------------------- - - -def synth_instructions_cmd( - path: Annotated[ - Path, typer.Argument(help=".dlm file to synthesize instruction sections from.") - ], - teacher: Annotated[ - str, - typer.Option( - "--teacher", - help=( - "Teacher selector: self, hf:, openai:, " - "anthropic:, or vllm-server:." - ), - ), - ] = "self", - per_section: Annotated[ - int, - typer.Option( - "--per-section", - help="Instruction pairs to generate per prose section.", - min=1, - ), - ] = 3, - strategy: Annotated[ - str, - typer.Option( - "--strategy", - help="Synthesis strategy: extraction, expansion, or both.", - ), - ] = "extraction", - filter_kind: Annotated[ - str, - typer.Option( - "--filter", - help="Filter pipeline: sway, none, or dedup-only.", - ), - ] = "sway", - threshold: Annotated[ - float | None, - typer.Option( - "--threshold", - help="Optional minimum sway-judge margin when --filter=sway.", - min=0.0, - ), - ] = None, - max_pairs: Annotated[ - int | None, - typer.Option( - "--max-pairs", - help="Maximum accepted synth pairs to keep from this run.", - min=1, - ), - ] = None, - max_new_tokens: Annotated[ - int, - typer.Option( - "--max-new-tokens", - help="Maximum new tokens the teacher may emit per prompt.", - min=1, - ), - ] = 512, - temp: Annotated[ - float, - typer.Option("--temp", help="Teacher sampling temperature.", min=0.0), - ] = 0.0, - top_p: Annotated[ - float | None, - typer.Option( - "--top-p", - help="Optional top-p cutoff for teacher sampling.", - min=0.0, - max=1.0, - ), - ] = None, - seed: Annotated[ - int | None, - typer.Option("--seed", help="Optional teacher sampling seed."), - ] = None, - apply: Annotated[ - bool, - typer.Option( - "--apply", - help="Write accepted auto-synth sections 
directly to the .dlm.", - ), - ] = False, - dry_run: Annotated[ - bool, - typer.Option( - "--dry-run", - help="Preview the synth plan without staging or writing anything.", - ), - ] = False, -) -> None: - """Generate, stage, or apply auto-synth instruction sections.""" - from rich.console import Console - - from dlm.doc.errors import DlmParseError - from dlm.doc.parser import parse_file - from dlm.preference import JudgeUnavailableError, build_judge - from dlm.store.paths import for_dlm - from dlm.synth import ( - InvalidTeacherSpecError, - TeacherInvocationError, - TeacherUnavailableError, - build_synth_plan, - build_teacher, - clear_pending_plan, - filter_synth_plan, - render_filter_report, - render_synth_plan, - save_pending_plan, - ) - from dlm.synth import ( - apply_plan as apply_synth_plan, - ) - from dlm.synth import ( - build_apply_plan as build_synth_apply_plan, - ) - from dlm.synth import ( - render_apply_plan as render_synth_apply_plan, - ) - - console = Console(stderr=True) - out_console = Console() - - if strategy not in ("extraction", "expansion", "both"): - console.print( - "[red]synth:[/red] --strategy must be one of extraction|expansion|both " - f"(got {strategy!r})." - ) - raise typer.Exit(code=2) - if filter_kind not in ("sway", "none", "dedup-only"): - console.print( - f"[red]synth:[/red] --filter must be one of sway|none|dedup-only (got {filter_kind!r})." 
- ) - raise typer.Exit(code=2) - if apply and dry_run: - console.print("[red]synth:[/red] --apply and --dry-run are mutually exclusive.") - raise typer.Exit(code=2) - if threshold is not None and filter_kind != "sway": - console.print("[red]synth:[/red] --threshold is only valid when --filter is `sway`.") - raise typer.Exit(code=2) - - try: - parsed = parse_file(path) - except (DlmParseError, OSError) as exc: - console.print(f"[red]synth:[/red] {exc}") - raise typer.Exit(code=1) from exc - - store = for_dlm(parsed.frontmatter.dlm_id) - - try: - strategy_value = cast(Literal["extraction", "expansion", "both"], strategy) - teacher_obj = build_teacher(teacher, dlm_path=path) - plan = build_synth_plan( - parsed, - teacher_obj, - per_section=per_section, - strategy=strategy_value, - max_pairs=max_pairs, - max_new_tokens=max_new_tokens, - temperature=temp, - top_p=top_p, - seed=seed, - ) - except InvalidTeacherSpecError as exc: - console.print(f"[red]synth:[/red] {exc}") - raise typer.Exit(code=2) from exc - except TeacherUnavailableError as exc: - console.print(f"[red]synth:[/red] {exc}") - raise typer.Exit(code=1) from exc - except TeacherInvocationError as exc: - console.print(f"[red]synth:[/red] {exc}") - raise typer.Exit(code=1) from exc - except ValueError as exc: - console.print(f"[red]synth:[/red] {exc}") - raise typer.Exit(code=2) from exc - - judge_obj = None - if filter_kind == "sway": - try: - judge_obj = build_judge("sway", dlm_path=path) - except JudgeUnavailableError as exc: - console.print(f"[red]synth:[/red] {exc}") - raise typer.Exit(code=1) from exc - - try: - filter_value = cast(Literal["sway", "none", "dedup-only"], filter_kind) - filtered = filter_synth_plan( - plan, - filter_kind=filter_value, - judge=judge_obj, - threshold=threshold, - ) - except ValueError as exc: - console.print(f"[red]synth:[/red] {exc}") - raise typer.Exit(code=2) from exc - - out_console.print(render_synth_plan(plan)) - out_console.print("") - 
out_console.print(render_filter_report(filtered)) - - if not filtered.additions: - if not dry_run: - clear_pending_plan(store) - out_console.print( - "\n[yellow]no synth additions accepted[/yellow] — either generation " - "yielded no valid pairs, dedup removed them, or the filter rejected them." - ) - raise typer.Exit(code=2) - - sections = [addition.addition.section for addition in filtered.additions] - - if apply: - apply_plan = build_synth_apply_plan(parsed, sections) - out_console.print("") - out_console.print(render_synth_apply_plan(apply_plan)) - summary = apply_synth_plan(parsed, apply_plan, target=path) - clear_pending_plan(store) - out_console.print( - f"\n[green]synth:[/green] wrote {summary.added} section(s) to {path} " - f"({summary.skipped} skipped)" - ) - return - - if dry_run: - out_console.print("\n[green]synth:[/green] dry-run only — nothing staged.") - return - - pending = save_pending_plan(store, source_path=path.resolve(), sections=sections) - out_console.print( - f"\n[green]synth:[/green] staged {len(pending.sections)} auto-synth instruction " - f"section(s). Run [bold]dlm synth list {path}[/bold] to inspect them." 
- ) - - -def synth_revert_cmd( - path: Annotated[Path, typer.Argument(help=".dlm file to strip auto-synth instructions from.")], -) -> None: - """Remove every `auto_synth: true` instruction section from the `.dlm`.""" - from rich.console import Console - - from dlm.doc.errors import DlmParseError - from dlm.doc.parser import parse_file - from dlm.synth import revert_all_auto_synth - - console = Console(stderr=True) - out_console = Console() - - try: - parsed = parse_file(path) - except (DlmParseError, OSError) as exc: - console.print(f"[red]synth:[/red] {exc}") - raise typer.Exit(code=1) from exc - - summary = revert_all_auto_synth(parsed, target=path) - out_console.print( - f"[green]synth:[/green] stripped {len(summary.added_section_ids)} " - f"auto-synth instruction section(s) from {path}" - ) - - -def synth_list_cmd( - path: Annotated[Path, typer.Argument(help=".dlm file whose auto-synth instructions we list.")], -) -> None: - """List applied + staged auto-synth instruction sections.""" - from rich.console import Console - - from dlm.doc.errors import DlmParseError - from dlm.doc.parser import parse_file - from dlm.doc.sections import SectionType - from dlm.store.paths import for_dlm - from dlm.synth import PendingSynthPlanError, load_pending_plan - - console = Console(stderr=True) - out_console = Console() - - try: - parsed = parse_file(path) - except (DlmParseError, OSError) as exc: - console.print(f"[red]synth:[/red] {exc}") - raise typer.Exit(code=1) from exc - - store = for_dlm(parsed.frontmatter.dlm_id) - try: - pending = load_pending_plan(store) - except PendingSynthPlanError as exc: - console.print(f"[red]synth:[/red] {exc}") - raise typer.Exit(code=1) from exc - - applied = [ - section - for section in parsed.sections - if section.type is SectionType.INSTRUCTION and section.auto_synth - ] - - out_console.print(f"[bold]{path}[/bold]") - out_console.print(f" applied auto-synth: {len(applied)}") - out_console.print(f" staged pending: {len(pending.sections) 
if pending else 0}") - - if not applied and pending is None: - out_console.print(" [dim]no auto-synth instruction sections yet[/dim]") - return - - if applied: - _render_synth_listing(out_console, "Applied", applied) - if pending is not None: - _render_synth_listing(out_console, "Pending", pending.sections) - - -def _render_synth_listing( - out_console: object, - heading: str, - sections: Sequence[object], -) -> None: - from collections import Counter - - from rich.console import Console - - from dlm.doc.sections import Section - - assert isinstance(out_console, Console) - typed_sections = [section for section in sections if isinstance(section, Section)] - - out_console.print(f"\n[bold]{heading}[/bold]") - - teacher_counts = Counter(section.synth_teacher or "unknown" for section in typed_sections) - strategy_counts = Counter(section.synth_strategy or "unknown" for section in typed_sections) - source_counts = Counter(section.source_section_id or "unknown" for section in typed_sections) - - out_console.print(" by teacher:") - for teacher_name in sorted(teacher_counts): - out_console.print(f" - {teacher_name}: {teacher_counts[teacher_name]}") - - out_console.print(" by strategy:") - for strategy_name in sorted(strategy_counts): - out_console.print(f" - {strategy_name}: {strategy_counts[strategy_name]}") - - out_console.print(" by source section:") - for source_id in sorted(source_counts): - out_console.print(f" - {source_id}: {source_counts[source_id]}") - - out_console.print(" sections:") - for section in typed_sections: - prompt = _synth_prompt_summary(section.content, section_id=section.section_id) - out_console.print( - " - " - f"{section.section_id} teacher={section.synth_teacher or 'unknown'} " - f"strategy={section.synth_strategy or 'unknown'} " - f"source={section.source_section_id or 'unknown'} " - f"prompt={prompt}" - ) - - -def _synth_prompt_summary(content: str, *, section_id: str) -> str: - """Best-effort prompt summary for `synth list`.""" - from 
dlm.data.errors import InstructionParseError - from dlm.data.instruction_parser import parse_instruction_body - - try: - pairs = parse_instruction_body(content, section_id=section_id) - except InstructionParseError: - return "" - if not pairs: - return "" - prompt = pairs[0].question.splitlines()[0].strip() - return prompt or "" - - -# --- harvest -------------------------------------------------------------- - - diff --git a/src/dlm/cli/commands/synth.py b/src/dlm/cli/commands/synth.py new file mode 100644 index 0000000..a00deee --- /dev/null +++ b/src/dlm/cli/commands/synth.py @@ -0,0 +1,377 @@ +"""`dlm synth` — generate / list / revert auto-synth instruction sections.""" + +from __future__ import annotations + +from collections.abc import Sequence +from pathlib import Path +from typing import Annotated, Literal, cast + +import typer + + +def synth_instructions_cmd( + path: Annotated[ + Path, typer.Argument(help=".dlm file to synthesize instruction sections from.") + ], + teacher: Annotated[ + str, + typer.Option( + "--teacher", + help=( + "Teacher selector: self, hf:, openai:, " + "anthropic:, or vllm-server:." 
+ ), + ), + ] = "self", + per_section: Annotated[ + int, + typer.Option( + "--per-section", + help="Instruction pairs to generate per prose section.", + min=1, + ), + ] = 3, + strategy: Annotated[ + str, + typer.Option( + "--strategy", + help="Synthesis strategy: extraction, expansion, or both.", + ), + ] = "extraction", + filter_kind: Annotated[ + str, + typer.Option( + "--filter", + help="Filter pipeline: sway, none, or dedup-only.", + ), + ] = "sway", + threshold: Annotated[ + float | None, + typer.Option( + "--threshold", + help="Optional minimum sway-judge margin when --filter=sway.", + min=0.0, + ), + ] = None, + max_pairs: Annotated[ + int | None, + typer.Option( + "--max-pairs", + help="Maximum accepted synth pairs to keep from this run.", + min=1, + ), + ] = None, + max_new_tokens: Annotated[ + int, + typer.Option( + "--max-new-tokens", + help="Maximum new tokens the teacher may emit per prompt.", + min=1, + ), + ] = 512, + temp: Annotated[ + float, + typer.Option("--temp", help="Teacher sampling temperature.", min=0.0), + ] = 0.0, + top_p: Annotated[ + float | None, + typer.Option( + "--top-p", + help="Optional top-p cutoff for teacher sampling.", + min=0.0, + max=1.0, + ), + ] = None, + seed: Annotated[ + int | None, + typer.Option("--seed", help="Optional teacher sampling seed."), + ] = None, + apply: Annotated[ + bool, + typer.Option( + "--apply", + help="Write accepted auto-synth sections directly to the .dlm.", + ), + ] = False, + dry_run: Annotated[ + bool, + typer.Option( + "--dry-run", + help="Preview the synth plan without staging or writing anything.", + ), + ] = False, +) -> None: + """Generate, stage, or apply auto-synth instruction sections.""" + from rich.console import Console + + from dlm.doc.errors import DlmParseError + from dlm.doc.parser import parse_file + from dlm.preference import JudgeUnavailableError, build_judge + from dlm.store.paths import for_dlm + from dlm.synth import ( + InvalidTeacherSpecError, + TeacherInvocationError, + 
TeacherUnavailableError, + build_synth_plan, + build_teacher, + clear_pending_plan, + filter_synth_plan, + render_filter_report, + render_synth_plan, + save_pending_plan, + ) + from dlm.synth import ( + apply_plan as apply_synth_plan, + ) + from dlm.synth import ( + build_apply_plan as build_synth_apply_plan, + ) + from dlm.synth import ( + render_apply_plan as render_synth_apply_plan, + ) + + console = Console(stderr=True) + out_console = Console() + + if strategy not in ("extraction", "expansion", "both"): + console.print( + "[red]synth:[/red] --strategy must be one of extraction|expansion|both " + f"(got {strategy!r})." + ) + raise typer.Exit(code=2) + if filter_kind not in ("sway", "none", "dedup-only"): + console.print( + f"[red]synth:[/red] --filter must be one of sway|none|dedup-only (got {filter_kind!r})." + ) + raise typer.Exit(code=2) + if apply and dry_run: + console.print("[red]synth:[/red] --apply and --dry-run are mutually exclusive.") + raise typer.Exit(code=2) + if threshold is not None and filter_kind != "sway": + console.print("[red]synth:[/red] --threshold is only valid when --filter is `sway`.") + raise typer.Exit(code=2) + + try: + parsed = parse_file(path) + except (DlmParseError, OSError) as exc: + console.print(f"[red]synth:[/red] {exc}") + raise typer.Exit(code=1) from exc + + store = for_dlm(parsed.frontmatter.dlm_id) + + try: + strategy_value = cast(Literal["extraction", "expansion", "both"], strategy) + teacher_obj = build_teacher(teacher, dlm_path=path) + plan = build_synth_plan( + parsed, + teacher_obj, + per_section=per_section, + strategy=strategy_value, + max_pairs=max_pairs, + max_new_tokens=max_new_tokens, + temperature=temp, + top_p=top_p, + seed=seed, + ) + except InvalidTeacherSpecError as exc: + console.print(f"[red]synth:[/red] {exc}") + raise typer.Exit(code=2) from exc + except TeacherUnavailableError as exc: + console.print(f"[red]synth:[/red] {exc}") + raise typer.Exit(code=1) from exc + except TeacherInvocationError as 
exc: + console.print(f"[red]synth:[/red] {exc}") + raise typer.Exit(code=1) from exc + except ValueError as exc: + console.print(f"[red]synth:[/red] {exc}") + raise typer.Exit(code=2) from exc + + judge_obj = None + if filter_kind == "sway": + try: + judge_obj = build_judge("sway", dlm_path=path) + except JudgeUnavailableError as exc: + console.print(f"[red]synth:[/red] {exc}") + raise typer.Exit(code=1) from exc + + try: + filter_value = cast(Literal["sway", "none", "dedup-only"], filter_kind) + filtered = filter_synth_plan( + plan, + filter_kind=filter_value, + judge=judge_obj, + threshold=threshold, + ) + except ValueError as exc: + console.print(f"[red]synth:[/red] {exc}") + raise typer.Exit(code=2) from exc + + out_console.print(render_synth_plan(plan)) + out_console.print("") + out_console.print(render_filter_report(filtered)) + + if not filtered.additions: + if not dry_run: + clear_pending_plan(store) + out_console.print( + "\n[yellow]no synth additions accepted[/yellow] — either generation " + "yielded no valid pairs, dedup removed them, or the filter rejected them." + ) + raise typer.Exit(code=2) + + sections = [addition.addition.section for addition in filtered.additions] + + if apply: + apply_plan = build_synth_apply_plan(parsed, sections) + out_console.print("") + out_console.print(render_synth_apply_plan(apply_plan)) + summary = apply_synth_plan(parsed, apply_plan, target=path) + clear_pending_plan(store) + out_console.print( + f"\n[green]synth:[/green] wrote {summary.added} section(s) to {path} " + f"({summary.skipped} skipped)" + ) + return + + if dry_run: + out_console.print("\n[green]synth:[/green] dry-run only — nothing staged.") + return + + pending = save_pending_plan(store, source_path=path.resolve(), sections=sections) + out_console.print( + f"\n[green]synth:[/green] staged {len(pending.sections)} auto-synth instruction " + f"section(s). Run [bold]dlm synth list {path}[/bold] to inspect them." 
+ ) + + +def synth_revert_cmd( + path: Annotated[Path, typer.Argument(help=".dlm file to strip auto-synth instructions from.")], +) -> None: + """Remove every `auto_synth: true` instruction section from the `.dlm`.""" + from rich.console import Console + + from dlm.doc.errors import DlmParseError + from dlm.doc.parser import parse_file + from dlm.synth import revert_all_auto_synth + + console = Console(stderr=True) + out_console = Console() + + try: + parsed = parse_file(path) + except (DlmParseError, OSError) as exc: + console.print(f"[red]synth:[/red] {exc}") + raise typer.Exit(code=1) from exc + + summary = revert_all_auto_synth(parsed, target=path) + out_console.print( + f"[green]synth:[/green] stripped {len(summary.added_section_ids)} " + f"auto-synth instruction section(s) from {path}" + ) + + +def synth_list_cmd( + path: Annotated[Path, typer.Argument(help=".dlm file whose auto-synth instructions we list.")], +) -> None: + """List applied + staged auto-synth instruction sections.""" + from rich.console import Console + + from dlm.doc.errors import DlmParseError + from dlm.doc.parser import parse_file + from dlm.doc.sections import SectionType + from dlm.store.paths import for_dlm + from dlm.synth import PendingSynthPlanError, load_pending_plan + + console = Console(stderr=True) + out_console = Console() + + try: + parsed = parse_file(path) + except (DlmParseError, OSError) as exc: + console.print(f"[red]synth:[/red] {exc}") + raise typer.Exit(code=1) from exc + + store = for_dlm(parsed.frontmatter.dlm_id) + try: + pending = load_pending_plan(store) + except PendingSynthPlanError as exc: + console.print(f"[red]synth:[/red] {exc}") + raise typer.Exit(code=1) from exc + + applied = [ + section + for section in parsed.sections + if section.type is SectionType.INSTRUCTION and section.auto_synth + ] + + out_console.print(f"[bold]{path}[/bold]") + out_console.print(f" applied auto-synth: {len(applied)}") + out_console.print(f" staged pending: {len(pending.sections) 
if pending else 0}") + + if not applied and pending is None: + out_console.print(" [dim]no auto-synth instruction sections yet[/dim]") + return + + if applied: + _render_synth_listing(out_console, "Applied", applied) + if pending is not None: + _render_synth_listing(out_console, "Pending", pending.sections) + + +def _render_synth_listing( + out_console: object, + heading: str, + sections: Sequence[object], +) -> None: + from collections import Counter + + from rich.console import Console + + from dlm.doc.sections import Section + + assert isinstance(out_console, Console) + typed_sections = [section for section in sections if isinstance(section, Section)] + + out_console.print(f"\n[bold]{heading}[/bold]") + + teacher_counts = Counter(section.synth_teacher or "unknown" for section in typed_sections) + strategy_counts = Counter(section.synth_strategy or "unknown" for section in typed_sections) + source_counts = Counter(section.source_section_id or "unknown" for section in typed_sections) + + out_console.print(" by teacher:") + for teacher_name in sorted(teacher_counts): + out_console.print(f" - {teacher_name}: {teacher_counts[teacher_name]}") + + out_console.print(" by strategy:") + for strategy_name in sorted(strategy_counts): + out_console.print(f" - {strategy_name}: {strategy_counts[strategy_name]}") + + out_console.print(" by source section:") + for source_id in sorted(source_counts): + out_console.print(f" - {source_id}: {source_counts[source_id]}") + + out_console.print(" sections:") + for section in typed_sections: + prompt = _synth_prompt_summary(section.content, section_id=section.section_id) + out_console.print( + " - " + f"{section.section_id} teacher={section.synth_teacher or 'unknown'} " + f"strategy={section.synth_strategy or 'unknown'} " + f"source={section.source_section_id or 'unknown'} " + f"prompt={prompt}" + ) + + +def _synth_prompt_summary(content: str, *, section_id: str) -> str: + """Best-effort prompt summary for `synth list`.""" + from 
dlm.data.errors import InstructionParseError + from dlm.data.instruction_parser import parse_instruction_body + + try: + pairs = parse_instruction_body(content, section_id=section_id) + except InstructionParseError: + return "" + if not pairs: + return "" + prompt = pairs[0].question.splitlines()[0].strip() + return prompt or "" From 29a8b6e236dcc3612c15162f0dfccd2a1c58ab1e Mon Sep 17 00:00:00 2001 From: mfwolffe Date: Tue, 28 Apr 2026 19:07:02 -0400 Subject: [PATCH 18/46] Extract preference_mine/apply/revert/list into cli/commands/preference.py --- src/dlm/cli/commands/__init__.py | 451 +--------------------------- src/dlm/cli/commands/preference.py | 455 +++++++++++++++++++++++++++++ 2 files changed, 459 insertions(+), 447 deletions(-) create mode 100644 src/dlm/cli/commands/preference.py diff --git a/src/dlm/cli/commands/__init__.py b/src/dlm/cli/commands/__init__.py index 0b2d9d3..b918b7a 100644 --- a/src/dlm/cli/commands/__init__.py +++ b/src/dlm/cli/commands/__init__.py @@ -27,6 +27,10 @@ from dlm.cli.commands.harvest import harvest_cmd as harvest_cmd from dlm.cli.commands.migrate import migrate_cmd as migrate_cmd from dlm.cli.commands.pack import pack_cmd as pack_cmd +from dlm.cli.commands.preference import preference_apply_cmd as preference_apply_cmd +from dlm.cli.commands.preference import preference_list_cmd as preference_list_cmd +from dlm.cli.commands.preference import preference_mine_cmd as preference_mine_cmd +from dlm.cli.commands.preference import preference_revert_cmd as preference_revert_cmd from dlm.cli.commands.pull import pull_cmd as pull_cmd from dlm.cli.commands.push import push_cmd as push_cmd from dlm.cli.commands.repl import repl_cmd as repl_cmd @@ -2880,451 +2884,4 @@ def _render_training_sources_text(console: object, records: list[dict[str, objec console.print(f" {path} {fc} file(s), {_human_size(size)}") -# --- preference ----------------------------------------------------------- - - -def preference_mine_cmd( - path: Annotated[Path, 
typer.Argument(help=".dlm file to mine preferences from.")], - samples: Annotated[ - int, - typer.Option("--samples", help="Candidate responses to sample per prompt.", min=2), - ] = 4, - judge: Annotated[ - str, - typer.Option( - "--judge", - help="Judge selector: sway, hf:, or cli:.", - ), - ] = "sway", - threshold: Annotated[ - float | None, - typer.Option( - "--threshold", - help="Minimum chosen-vs-rejected score margin. Defaults to the judge's native threshold.", - min=0.0, - ), - ] = None, - max_pairs: Annotated[ - int | None, - typer.Option( - "--max-pairs", - help="Maximum mined preference pairs to keep from this run.", - min=1, - ), - ] = None, - temp: Annotated[ - float, - typer.Option("--temp", help="Sampling temperature for candidate generation.", min=0.0), - ] = 0.7, - top_p: Annotated[ - float | None, - typer.Option( - "--top-p", - help="Optional nucleus-sampling cutoff for candidate generation.", - min=0.0, - max=1.0, - ), - ] = None, - backend: Annotated[ - str, - typer.Option( - "--backend", - help="Generation backend: auto, pytorch, or mlx.", - ), - ] = "auto", - adapter: Annotated[ - str | None, - typer.Option( - "--adapter", - help=( - "Named adapter to mine from on multi-adapter documents. " - "Required there; invalid on single-adapter documents." - ), - ), - ] = None, - apply: Annotated[ - bool, - typer.Option( - "--apply", - help=( - "Write mined preference sections directly to the .dlm. " - "Default stages them for `dlm preference apply`." 
- ), - ), - ] = False, -) -> None: - """Sample + stage auto-mined preference sections from the current adapter.""" - from rich.console import Console - - from dlm.base_models import GatedModelError - from dlm.base_models import resolve as resolve_base_model - from dlm.doc.errors import DlmParseError - from dlm.doc.parser import parse_file - from dlm.hardware import doctor - from dlm.inference import AdapterNotFoundError - from dlm.inference.backends import ( - UnsupportedBackendError, - build_backend, - select_backend, - ) - from dlm.metrics import MetricsRecorder, PreferenceMineEvent - from dlm.metrics.events import PreferenceMineWriteMode - from dlm.modality import modality_for - from dlm.preference import ( - InvalidJudgeSpecError, - JudgeUnavailableError, - build_apply_plan, - build_judge, - build_mine_plan, - render_apply_plan, - render_mine_plan, - ) - from dlm.preference.apply import apply_plan as apply_preference_plan - from dlm.preference.pending import clear_pending_plan, save_pending_plan - from dlm.store.paths import for_dlm - - console = Console(stderr=True) - out_console = Console() - - if backend not in ("auto", "pytorch", "mlx"): - console.print( - f"[red]preference:[/red] --backend must be `auto`, `pytorch`, or `mlx` (got {backend!r})." - ) - raise typer.Exit(code=2) - - try: - parsed = parse_file(path) - except (DlmParseError, OSError) as exc: - console.print(f"[red]preference:[/red] {exc}") - raise typer.Exit(code=1) from exc - - adapters_declared = parsed.frontmatter.training.adapters - if adapter is not None: - if adapters_declared is None: - console.print( - "[red]preference:[/red] --adapter is only valid on multi-adapter " - "documents (this doc does not declare `training.adapters`)." - ) - raise typer.Exit(code=2) - if adapter not in adapters_declared: - declared = sorted(adapters_declared) - console.print( - f"[red]preference:[/red] --adapter {adapter!r} is not declared " - f"(declared: {declared})." 
- ) - raise typer.Exit(code=2) - elif adapters_declared is not None: - console.print( - "[red]preference:[/red] multi-adapter documents require --adapter " - "so mining knows which adapter to sample." - ) - raise typer.Exit(code=2) - - judge_kind = judge.split(":", 1)[0].strip() - if adapter is not None and judge_kind == "sway": - console.print( - "[red]preference:[/red] --judge sway is not yet wired for named adapters; " - "use `hf:` or `cli:` for multi-adapter mining." - ) - raise typer.Exit(code=2) - - store = for_dlm(parsed.frontmatter.dlm_id) - run_id = _latest_training_run_id(store) - if run_id is None: - console.print( - "[red]preference:[/red] mining requires a prior training run (run `dlm train` first)." - ) - raise typer.Exit(code=1) - - already_accepted = _previously_accepted(store.manifest) - try: - spec = resolve_base_model(parsed.frontmatter.base_model, accept_license=already_accepted) - except GatedModelError as exc: - console.print( - f"[red]license:[/red] base {parsed.frontmatter.base_model!r} is gated and has " - "no recorded acceptance in this store; run `dlm train --i-accept-license` first." - ) - raise typer.Exit(code=1) from exc - - dispatch = modality_for(spec) - if dispatch.accepts_images or dispatch.accepts_audio: - console.print( - f"[red]preference:[/red] preference mining currently supports text bases only; " - f"base {spec.key!r} is modality='{spec.modality}'." 
- ) - raise typer.Exit(code=2) - - caps = doctor().capabilities - try: - backend_name = select_backend(backend, caps) # type: ignore[arg-type] - except UnsupportedBackendError as exc: - console.print(f"[red]preference:[/red] {exc}") - raise typer.Exit(code=2) from exc - backend_obj = build_backend(backend_name, caps) - - try: - backend_obj.load(spec, store, adapter_name=adapter) - except AdapterNotFoundError as exc: - console.print(f"[red]preference:[/red] {exc}") - raise typer.Exit(code=1) from exc - - try: - judge_obj = build_judge(judge, dlm_path=path) - plan = build_mine_plan( - parsed, - backend_obj, - judge_obj, - mined_run_id=run_id, - samples=samples, - max_pairs=max_pairs, - threshold=threshold, - temperature=temp, - top_p=top_p, - ) - except InvalidJudgeSpecError as exc: - console.print(f"[red]preference:[/red] {exc}") - raise typer.Exit(code=2) from exc - except JudgeUnavailableError as exc: - console.print(f"[red]preference:[/red] {exc}") - raise typer.Exit(code=1) from exc - except ValueError as exc: - console.print(f"[red]preference:[/red] {exc}") - raise typer.Exit(code=2) from exc - finally: - backend_obj.unload() - - recorder = MetricsRecorder(store.root) - - def _record_preference_mine(write_mode: PreferenceMineWriteMode) -> None: - recorder.record_preference_mine( - PreferenceMineEvent( - run_id=run_id, - judge_name=judge_obj.name, - sample_count=samples, - mined_pairs=len(plan.additions), - skipped_prompts=len(plan.skipped), - write_mode=write_mode, - ) - ) - - out_console.print(render_mine_plan(plan)) - - if not plan.additions: - clear_pending_plan(store) - _record_preference_mine("empty") - out_console.print( - "\n[yellow]no candidates to mine[/yellow] — either instruction prompts " - "did not yield a confident pair, or the matching preference sections " - "already exist in the document." 
- ) - raise typer.Exit(code=2) - - sections = [addition.section for addition in plan.additions] - - if apply: - apply_plan = build_apply_plan(parsed, sections) - out_console.print("") - out_console.print(render_apply_plan(apply_plan)) - summary = apply_preference_plan(parsed, apply_plan, target=path) - clear_pending_plan(store) - _record_preference_mine("applied") - out_console.print( - f"\n[green]preference:[/green] wrote {summary.added} section(s) to {path} " - f"({summary.skipped} skipped)" - ) - return - - pending = save_pending_plan(store, source_path=path.resolve(), sections=sections) - _record_preference_mine("staged") - out_console.print( - f"\n[green]preference:[/green] staged {len(pending.sections)} mined preference " - f"section(s). Run [bold]dlm preference apply {path}[/bold] to write them." - ) - - -def preference_apply_cmd( - path: Annotated[Path, typer.Argument(help=".dlm file to apply staged preferences into.")], -) -> None: - """Write the staged preference-mine plan into the `.dlm`.""" - from rich.console import Console - - from dlm.doc.errors import DlmParseError - from dlm.doc.parser import parse_file - from dlm.preference import build_apply_plan, render_apply_plan - from dlm.preference.apply import apply_plan as apply_preference_plan - from dlm.preference.pending import ( - PendingPreferencePlanError, - clear_pending_plan, - load_pending_plan, - ) - from dlm.store.paths import for_dlm - - console = Console(stderr=True) - out_console = Console() - - try: - parsed = parse_file(path) - except (DlmParseError, OSError) as exc: - console.print(f"[red]preference:[/red] {exc}") - raise typer.Exit(code=1) from exc - - store = for_dlm(parsed.frontmatter.dlm_id) - try: - pending = load_pending_plan(store) - except PendingPreferencePlanError as exc: - console.print(f"[red]preference:[/red] {exc}") - raise typer.Exit(code=1) from exc - - if pending is None: - console.print( - "[red]preference:[/red] no staged mined preferences found; " - "run `dlm preference 
mine` first." - ) - raise typer.Exit(code=1) - - plan = build_apply_plan(parsed, list(pending.sections)) - out_console.print(render_apply_plan(plan)) - - if not plan.additions: - clear_pending_plan(store) - out_console.print( - "\n[yellow]no staged preferences to write[/yellow] — the pending plan was " - "already present in the document." - ) - raise typer.Exit(code=2) - - summary = apply_preference_plan(parsed, plan, target=path) - clear_pending_plan(store) - out_console.print( - f"\n[green]preference:[/green] wrote {summary.added} section(s) to {path} " - f"({summary.skipped} skipped)" - ) - - -def preference_revert_cmd( - path: Annotated[Path, typer.Argument(help=".dlm file to strip auto-mined preferences from.")], -) -> None: - """Remove every `auto_mined: true` preference section from the `.dlm`.""" - from rich.console import Console - - from dlm.doc.errors import DlmParseError - from dlm.doc.parser import parse_file - from dlm.preference import revert_all_auto_mined - - console = Console(stderr=True) - out_console = Console() - - try: - parsed = parse_file(path) - except (DlmParseError, OSError) as exc: - console.print(f"[red]preference:[/red] {exc}") - raise typer.Exit(code=1) from exc - - summary = revert_all_auto_mined(parsed, target=path) - out_console.print( - f"[green]preference:[/green] stripped {len(summary.added_section_ids)} " - f"auto-mined preference section(s) from {path}" - ) - - -def preference_list_cmd( - path: Annotated[Path, typer.Argument(help=".dlm file whose auto-mined preferences we list.")], -) -> None: - """List applied + staged auto-mined preference sections.""" - from rich.console import Console - - from dlm.doc.errors import DlmParseError - from dlm.doc.parser import parse_file - from dlm.doc.sections import SectionType - from dlm.preference.pending import PendingPreferencePlanError, load_pending_plan - from dlm.store.paths import for_dlm - - console = Console(stderr=True) - out_console = Console() - - try: - parsed = 
parse_file(path) - except (DlmParseError, OSError) as exc: - console.print(f"[red]preference:[/red] {exc}") - raise typer.Exit(code=1) from exc - - store = for_dlm(parsed.frontmatter.dlm_id) - try: - pending = load_pending_plan(store) - except PendingPreferencePlanError as exc: - console.print(f"[red]preference:[/red] {exc}") - raise typer.Exit(code=1) from exc - - applied = [ - section - for section in parsed.sections - if section.type is SectionType.PREFERENCE and section.auto_mined - ] - - out_console.print(f"[bold]{path}[/bold]") - out_console.print(f" applied auto-mined: {len(applied)}") - out_console.print(f" staged pending: {len(pending.sections) if pending else 0}") - - if not applied and pending is None: - out_console.print(" [dim]no auto-mined preference sections yet[/dim]") - return - - if applied: - out_console.print("\n[bold]Applied[/bold]") - for section in applied: - prompt = _preference_prompt_summary(section.content, section_id=section.section_id) - judge_name = section.judge_name or "unknown" - run_id = section.mined_run_id if section.mined_run_id is not None else "?" - out_console.print( - f" - {section.section_id} judge={judge_name} run={run_id} prompt={prompt}" - ) - - if pending is not None: - out_console.print("\n[bold]Pending[/bold]") - for section in pending.sections: - prompt = _preference_prompt_summary(section.content, section_id=section.section_id) - judge_name = section.judge_name or "unknown" - run_id = section.mined_run_id if section.mined_run_id is not None else "?" 
- out_console.print( - f" - {section.section_id} judge={judge_name} run={run_id} prompt={prompt}" - ) - - -def _latest_training_run_id(store: object) -> int | None: - """Most recent run id from metrics DB or manifest.""" - from dlm.metrics.queries import latest_run_id - from dlm.store.errors import ManifestCorruptError - from dlm.store.manifest import load_manifest - from dlm.store.paths import StorePath - - assert isinstance(store, StorePath) - - metrics_run_id = latest_run_id(store.root) - if metrics_run_id is not None: - return metrics_run_id - if not store.manifest.exists(): - return None - try: - manifest = load_manifest(store.manifest) - except (ManifestCorruptError, OSError): - return None - if not manifest.training_runs: - return None - return max(run.run_id for run in manifest.training_runs) - - -def _preference_prompt_summary(content: str, *, section_id: str) -> str: - """Best-effort prompt summary for `preference list`.""" - from dlm.data.errors import PreferenceParseError - from dlm.data.preference_parser import parse_preference_body - - try: - triples = parse_preference_body(content, section_id=section_id) - except PreferenceParseError: - return "" - if not triples: - return "" - prompt = triples[0].prompt.splitlines()[0].strip() - return prompt or "" - diff --git a/src/dlm/cli/commands/preference.py b/src/dlm/cli/commands/preference.py new file mode 100644 index 0000000..21f3fce --- /dev/null +++ b/src/dlm/cli/commands/preference.py @@ -0,0 +1,455 @@ +"""`dlm preference` — mine / apply / revert / list preference sections.""" + +from __future__ import annotations + +from pathlib import Path +from typing import Annotated + +import typer + +from dlm.cli.commands._shared import _previously_accepted + + +def preference_mine_cmd( + path: Annotated[Path, typer.Argument(help=".dlm file to mine preferences from.")], + samples: Annotated[ + int, + typer.Option("--samples", help="Candidate responses to sample per prompt.", min=2), + ] = 4, + judge: Annotated[ + 
str, + typer.Option( + "--judge", + help="Judge selector: sway, hf:, or cli:.", + ), + ] = "sway", + threshold: Annotated[ + float | None, + typer.Option( + "--threshold", + help="Minimum chosen-vs-rejected score margin. Defaults to the judge's native threshold.", + min=0.0, + ), + ] = None, + max_pairs: Annotated[ + int | None, + typer.Option( + "--max-pairs", + help="Maximum mined preference pairs to keep from this run.", + min=1, + ), + ] = None, + temp: Annotated[ + float, + typer.Option("--temp", help="Sampling temperature for candidate generation.", min=0.0), + ] = 0.7, + top_p: Annotated[ + float | None, + typer.Option( + "--top-p", + help="Optional nucleus-sampling cutoff for candidate generation.", + min=0.0, + max=1.0, + ), + ] = None, + backend: Annotated[ + str, + typer.Option( + "--backend", + help="Generation backend: auto, pytorch, or mlx.", + ), + ] = "auto", + adapter: Annotated[ + str | None, + typer.Option( + "--adapter", + help=( + "Named adapter to mine from on multi-adapter documents. " + "Required there; invalid on single-adapter documents." + ), + ), + ] = None, + apply: Annotated[ + bool, + typer.Option( + "--apply", + help=( + "Write mined preference sections directly to the .dlm. " + "Default stages them for `dlm preference apply`." 
+ ), + ), + ] = False, +) -> None: + """Sample + stage auto-mined preference sections from the current adapter.""" + from rich.console import Console + + from dlm.base_models import GatedModelError + from dlm.base_models import resolve as resolve_base_model + from dlm.doc.errors import DlmParseError + from dlm.doc.parser import parse_file + from dlm.hardware import doctor + from dlm.inference import AdapterNotFoundError + from dlm.inference.backends import ( + UnsupportedBackendError, + build_backend, + select_backend, + ) + from dlm.metrics import MetricsRecorder, PreferenceMineEvent + from dlm.metrics.events import PreferenceMineWriteMode + from dlm.modality import modality_for + from dlm.preference import ( + InvalidJudgeSpecError, + JudgeUnavailableError, + build_apply_plan, + build_judge, + build_mine_plan, + render_apply_plan, + render_mine_plan, + ) + from dlm.preference.apply import apply_plan as apply_preference_plan + from dlm.preference.pending import clear_pending_plan, save_pending_plan + from dlm.store.paths import for_dlm + + console = Console(stderr=True) + out_console = Console() + + if backend not in ("auto", "pytorch", "mlx"): + console.print( + f"[red]preference:[/red] --backend must be `auto`, `pytorch`, or `mlx` (got {backend!r})." + ) + raise typer.Exit(code=2) + + try: + parsed = parse_file(path) + except (DlmParseError, OSError) as exc: + console.print(f"[red]preference:[/red] {exc}") + raise typer.Exit(code=1) from exc + + adapters_declared = parsed.frontmatter.training.adapters + if adapter is not None: + if adapters_declared is None: + console.print( + "[red]preference:[/red] --adapter is only valid on multi-adapter " + "documents (this doc does not declare `training.adapters`)." + ) + raise typer.Exit(code=2) + if adapter not in adapters_declared: + declared = sorted(adapters_declared) + console.print( + f"[red]preference:[/red] --adapter {adapter!r} is not declared " + f"(declared: {declared})." 
+ ) + raise typer.Exit(code=2) + elif adapters_declared is not None: + console.print( + "[red]preference:[/red] multi-adapter documents require --adapter " + "so mining knows which adapter to sample." + ) + raise typer.Exit(code=2) + + judge_kind = judge.split(":", 1)[0].strip() + if adapter is not None and judge_kind == "sway": + console.print( + "[red]preference:[/red] --judge sway is not yet wired for named adapters; " + "use `hf:` or `cli:` for multi-adapter mining." + ) + raise typer.Exit(code=2) + + store = for_dlm(parsed.frontmatter.dlm_id) + run_id = _latest_training_run_id(store) + if run_id is None: + console.print( + "[red]preference:[/red] mining requires a prior training run (run `dlm train` first)." + ) + raise typer.Exit(code=1) + + already_accepted = _previously_accepted(store.manifest) + try: + spec = resolve_base_model(parsed.frontmatter.base_model, accept_license=already_accepted) + except GatedModelError as exc: + console.print( + f"[red]license:[/red] base {parsed.frontmatter.base_model!r} is gated and has " + "no recorded acceptance in this store; run `dlm train --i-accept-license` first." + ) + raise typer.Exit(code=1) from exc + + dispatch = modality_for(spec) + if dispatch.accepts_images or dispatch.accepts_audio: + console.print( + f"[red]preference:[/red] preference mining currently supports text bases only; " + f"base {spec.key!r} is modality='{spec.modality}'." 
+ ) + raise typer.Exit(code=2) + + caps = doctor().capabilities + try: + backend_name = select_backend(backend, caps) # type: ignore[arg-type] + except UnsupportedBackendError as exc: + console.print(f"[red]preference:[/red] {exc}") + raise typer.Exit(code=2) from exc + backend_obj = build_backend(backend_name, caps) + + try: + backend_obj.load(spec, store, adapter_name=adapter) + except AdapterNotFoundError as exc: + console.print(f"[red]preference:[/red] {exc}") + raise typer.Exit(code=1) from exc + + try: + judge_obj = build_judge(judge, dlm_path=path) + plan = build_mine_plan( + parsed, + backend_obj, + judge_obj, + mined_run_id=run_id, + samples=samples, + max_pairs=max_pairs, + threshold=threshold, + temperature=temp, + top_p=top_p, + ) + except InvalidJudgeSpecError as exc: + console.print(f"[red]preference:[/red] {exc}") + raise typer.Exit(code=2) from exc + except JudgeUnavailableError as exc: + console.print(f"[red]preference:[/red] {exc}") + raise typer.Exit(code=1) from exc + except ValueError as exc: + console.print(f"[red]preference:[/red] {exc}") + raise typer.Exit(code=2) from exc + finally: + backend_obj.unload() + + recorder = MetricsRecorder(store.root) + + def _record_preference_mine(write_mode: PreferenceMineWriteMode) -> None: + recorder.record_preference_mine( + PreferenceMineEvent( + run_id=run_id, + judge_name=judge_obj.name, + sample_count=samples, + mined_pairs=len(plan.additions), + skipped_prompts=len(plan.skipped), + write_mode=write_mode, + ) + ) + + out_console.print(render_mine_plan(plan)) + + if not plan.additions: + clear_pending_plan(store) + _record_preference_mine("empty") + out_console.print( + "\n[yellow]no candidates to mine[/yellow] — either instruction prompts " + "did not yield a confident pair, or the matching preference sections " + "already exist in the document." 
+ ) + raise typer.Exit(code=2) + + sections = [addition.section for addition in plan.additions] + + if apply: + apply_plan = build_apply_plan(parsed, sections) + out_console.print("") + out_console.print(render_apply_plan(apply_plan)) + summary = apply_preference_plan(parsed, apply_plan, target=path) + clear_pending_plan(store) + _record_preference_mine("applied") + out_console.print( + f"\n[green]preference:[/green] wrote {summary.added} section(s) to {path} " + f"({summary.skipped} skipped)" + ) + return + + pending = save_pending_plan(store, source_path=path.resolve(), sections=sections) + _record_preference_mine("staged") + out_console.print( + f"\n[green]preference:[/green] staged {len(pending.sections)} mined preference " + f"section(s). Run [bold]dlm preference apply {path}[/bold] to write them." + ) + + +def preference_apply_cmd( + path: Annotated[Path, typer.Argument(help=".dlm file to apply staged preferences into.")], +) -> None: + """Write the staged preference-mine plan into the `.dlm`.""" + from rich.console import Console + + from dlm.doc.errors import DlmParseError + from dlm.doc.parser import parse_file + from dlm.preference import build_apply_plan, render_apply_plan + from dlm.preference.apply import apply_plan as apply_preference_plan + from dlm.preference.pending import ( + PendingPreferencePlanError, + clear_pending_plan, + load_pending_plan, + ) + from dlm.store.paths import for_dlm + + console = Console(stderr=True) + out_console = Console() + + try: + parsed = parse_file(path) + except (DlmParseError, OSError) as exc: + console.print(f"[red]preference:[/red] {exc}") + raise typer.Exit(code=1) from exc + + store = for_dlm(parsed.frontmatter.dlm_id) + try: + pending = load_pending_plan(store) + except PendingPreferencePlanError as exc: + console.print(f"[red]preference:[/red] {exc}") + raise typer.Exit(code=1) from exc + + if pending is None: + console.print( + "[red]preference:[/red] no staged mined preferences found; " + "run `dlm preference 
mine` first." + ) + raise typer.Exit(code=1) + + plan = build_apply_plan(parsed, list(pending.sections)) + out_console.print(render_apply_plan(plan)) + + if not plan.additions: + clear_pending_plan(store) + out_console.print( + "\n[yellow]no staged preferences to write[/yellow] — the pending plan was " + "already present in the document." + ) + raise typer.Exit(code=2) + + summary = apply_preference_plan(parsed, plan, target=path) + clear_pending_plan(store) + out_console.print( + f"\n[green]preference:[/green] wrote {summary.added} section(s) to {path} " + f"({summary.skipped} skipped)" + ) + + +def preference_revert_cmd( + path: Annotated[Path, typer.Argument(help=".dlm file to strip auto-mined preferences from.")], +) -> None: + """Remove every `auto_mined: true` preference section from the `.dlm`.""" + from rich.console import Console + + from dlm.doc.errors import DlmParseError + from dlm.doc.parser import parse_file + from dlm.preference import revert_all_auto_mined + + console = Console(stderr=True) + out_console = Console() + + try: + parsed = parse_file(path) + except (DlmParseError, OSError) as exc: + console.print(f"[red]preference:[/red] {exc}") + raise typer.Exit(code=1) from exc + + summary = revert_all_auto_mined(parsed, target=path) + out_console.print( + f"[green]preference:[/green] stripped {len(summary.added_section_ids)} " + f"auto-mined preference section(s) from {path}" + ) + + +def preference_list_cmd( + path: Annotated[Path, typer.Argument(help=".dlm file whose auto-mined preferences we list.")], +) -> None: + """List applied + staged auto-mined preference sections.""" + from rich.console import Console + + from dlm.doc.errors import DlmParseError + from dlm.doc.parser import parse_file + from dlm.doc.sections import SectionType + from dlm.preference.pending import PendingPreferencePlanError, load_pending_plan + from dlm.store.paths import for_dlm + + console = Console(stderr=True) + out_console = Console() + + try: + parsed = 
parse_file(path) + except (DlmParseError, OSError) as exc: + console.print(f"[red]preference:[/red] {exc}") + raise typer.Exit(code=1) from exc + + store = for_dlm(parsed.frontmatter.dlm_id) + try: + pending = load_pending_plan(store) + except PendingPreferencePlanError as exc: + console.print(f"[red]preference:[/red] {exc}") + raise typer.Exit(code=1) from exc + + applied = [ + section + for section in parsed.sections + if section.type is SectionType.PREFERENCE and section.auto_mined + ] + + out_console.print(f"[bold]{path}[/bold]") + out_console.print(f" applied auto-mined: {len(applied)}") + out_console.print(f" staged pending: {len(pending.sections) if pending else 0}") + + if not applied and pending is None: + out_console.print(" [dim]no auto-mined preference sections yet[/dim]") + return + + if applied: + out_console.print("\n[bold]Applied[/bold]") + for section in applied: + prompt = _preference_prompt_summary(section.content, section_id=section.section_id) + judge_name = section.judge_name or "unknown" + run_id = section.mined_run_id if section.mined_run_id is not None else "?" + out_console.print( + f" - {section.section_id} judge={judge_name} run={run_id} prompt={prompt}" + ) + + if pending is not None: + out_console.print("\n[bold]Pending[/bold]") + for section in pending.sections: + prompt = _preference_prompt_summary(section.content, section_id=section.section_id) + judge_name = section.judge_name or "unknown" + run_id = section.mined_run_id if section.mined_run_id is not None else "?" 
+ out_console.print( + f" - {section.section_id} judge={judge_name} run={run_id} prompt={prompt}" + ) + + +def _latest_training_run_id(store: object) -> int | None: + """Most recent run id from metrics DB or manifest.""" + from dlm.metrics.queries import latest_run_id + from dlm.store.errors import ManifestCorruptError + from dlm.store.manifest import load_manifest + from dlm.store.paths import StorePath + + assert isinstance(store, StorePath) + + metrics_run_id = latest_run_id(store.root) + if metrics_run_id is not None: + return metrics_run_id + if not store.manifest.exists(): + return None + try: + manifest = load_manifest(store.manifest) + except (ManifestCorruptError, OSError): + return None + if not manifest.training_runs: + return None + return max(run.run_id for run in manifest.training_runs) + + +def _preference_prompt_summary(content: str, *, section_id: str) -> str: + """Best-effort prompt summary for `preference list`.""" + from dlm.data.errors import PreferenceParseError + from dlm.data.preference_parser import parse_preference_body + + try: + triples = parse_preference_body(content, section_id=section_id) + except PreferenceParseError: + return "" + if not triples: + return "" + prompt = triples[0].prompt.splitlines()[0].strip() + return prompt or "" From c4cc7afd995d47bc8298e1082b0c89677c7b51b8 Mon Sep 17 00:00:00 2001 From: mfwolffe Date: Tue, 28 Apr 2026 19:08:40 -0400 Subject: [PATCH 19/46] Extract metrics_cmd + metrics_watch_cmd into cli/commands/metrics.py --- src/dlm/cli/commands/__init__.py | 223 +------------------------------ src/dlm/cli/commands/metrics.py | 220 ++++++++++++++++++++++++++++++ 2 files changed, 223 insertions(+), 220 deletions(-) create mode 100644 src/dlm/cli/commands/metrics.py diff --git a/src/dlm/cli/commands/__init__.py b/src/dlm/cli/commands/__init__.py index b918b7a..1c1af6b 100644 --- a/src/dlm/cli/commands/__init__.py +++ b/src/dlm/cli/commands/__init__.py @@ -13,7 +13,7 @@ import os from collections.abc import 
Sequence from pathlib import Path -from typing import TYPE_CHECKING, Annotated, Any, Literal +from typing import Annotated, Any, Literal import typer @@ -25,6 +25,8 @@ from dlm.cli.commands.cache import cache_show_cmd as cache_show_cmd from dlm.cli.commands.doctor import doctor_cmd as doctor_cmd from dlm.cli.commands.harvest import harvest_cmd as harvest_cmd +from dlm.cli.commands.metrics import metrics_cmd as metrics_cmd +from dlm.cli.commands.metrics import metrics_watch_cmd as metrics_watch_cmd from dlm.cli.commands.migrate import migrate_cmd as migrate_cmd from dlm.cli.commands.pack import pack_cmd as pack_cmd from dlm.cli.commands.preference import preference_apply_cmd as preference_apply_cmd @@ -42,9 +44,6 @@ from dlm.cli.commands.unpack import unpack_cmd as unpack_cmd from dlm.cli.commands.verify import verify_cmd as verify_cmd -if TYPE_CHECKING: - from datetime import timedelta - def _stub(sprint: str, subject: str) -> None: """Raise a clear unimplemented error pointing to the owning sprint.""" @@ -2190,219 +2189,6 @@ def _verbose_runner(cmd: Sequence[str]) -> object: console.print(f"smoke: {result.smoke_output_first_line}") -def metrics_cmd( - path: Annotated[Path, typer.Argument(help=".dlm file whose store we query.")], - json_out: Annotated[bool, typer.Option("--json", help="Emit JSON.")] = False, - csv_out: Annotated[bool, typer.Option("--csv", help="Emit CSV.")] = False, - run_id: Annotated[ - int | None, - typer.Option("--run-id", help="Only show this run (drill-down)."), - ] = None, - phase: Annotated[ - str | None, - typer.Option("--phase", help="Filter by phase: sft|dpo|orpo|cpt."), - ] = None, - since: Annotated[ - str | None, - typer.Option( - "--since", - help="Time window (e.g. `24h`, `7d`, `30m`). 
Filters `started_at`.", - ), - ] = None, - limit: Annotated[int, typer.Option("--limit")] = 20, -) -> None: - """Query the per-store metrics database.""" - import csv - import json - import sys - - from rich.console import Console - - from dlm.doc.parser import parse_file - from dlm.metrics.queries import ( - evals_for_run, - evals_to_dict, - preference_mining_for_run, - preference_mining_to_dict, - recent_runs, - runs_to_dict, - steps_for_run, - steps_to_dict, - ) - from dlm.store.paths import for_dlm - - console = Console(stderr=True) - - if json_out and csv_out: - console.print("[red]metrics:[/red] --json and --csv are mutually exclusive") - raise typer.Exit(code=2) - - since_delta = _parse_since_arg(since, console) if since else None - - parsed = parse_file(path) - store = for_dlm(parsed.frontmatter.dlm_id) - - runs = recent_runs(store.root, limit=limit, phase=phase, since=since_delta, run_id=run_id) - - if run_id is not None: - # Drill-down: show this run's steps + evals. - if not runs: - console.print(f"[red]metrics:[/red] no run with run_id={run_id}") - raise typer.Exit(code=1) - run = runs[0] - steps = steps_for_run(store.root, run_id) - evals = evals_for_run(store.root, run_id) - preference_rows = preference_mining_for_run(store.root, run_id) - - if json_out: - payload = { - "run": runs_to_dict([run])[0], - "steps": steps_to_dict(steps), - "evals": evals_to_dict(evals), - "preference_mining": preference_mining_to_dict(preference_rows), - } - sys.stdout.write(json.dumps(payload, indent=2) + "\n") - return - if csv_out: - writer = csv.writer(sys.stdout) - writer.writerow(["step", "loss", "lr", "grad_norm", "val_loss"]) - eval_by_step = {e.step: e.val_loss for e in evals} - for s in steps: - writer.writerow([s.step, s.loss, s.lr, s.grad_norm, eval_by_step.get(s.step)]) - return - console.print( - f"[green]run_id={run.run_id}[/green] phase={run.phase} " - f"seed={run.seed} status={run.status} steps={len(steps)} " - f"evals={len(evals)}" - ) - if evals: - last 
= evals[-1] - console.print( - f" last eval: step={last.step} val_loss={last.val_loss} " - f"perplexity={last.perplexity}" - ) - if preference_rows: - last_pref = preference_rows[-1] - console.print( - " preference mining: " - f"events={len(preference_rows)} " - f"mined_pairs={sum(row.mined_pairs for row in preference_rows)} " - f"skipped_prompts={sum(row.skipped_prompts for row in preference_rows)} " - f"last_mode={last_pref.write_mode} " - f"judge={last_pref.judge_name}" - ) - return - - # Top-level: list runs. - if json_out: - sys.stdout.write(json.dumps({"runs": runs_to_dict(runs)}, indent=2) + "\n") - return - if csv_out: - writer = csv.writer(sys.stdout) - writer.writerow(["run_id", "phase", "seed", "status", "started_at", "ended_at"]) - for r in runs: - writer.writerow([r.run_id, r.phase, r.seed, r.status, r.started_at, r.ended_at]) - return - - if not runs: - console.print("[dim]metrics:[/dim] no runs found (hint: train first, or adjust filters)") - return - console.print(f"[bold]Runs: {len(runs)}[/bold]") - for r in runs: - console.print( - f" run_id={r.run_id} phase={r.phase} seed={r.seed} " - f"status={r.status} started={r.started_at}" - ) - - -def metrics_watch_cmd( - path: Annotated[Path, typer.Argument(help=".dlm file whose store we tail.")], - poll_seconds: Annotated[ - float, - typer.Option("--poll-seconds", help="How often to re-read the metrics DB."), - ] = 1.0, -) -> None: - """Tail the metrics DB: print new steps/evals as they land.""" - import time - - from rich.console import Console - - from dlm.doc.parser import parse_file - from dlm.metrics.queries import evals_for_run, latest_run_id, steps_for_run - from dlm.store.paths import for_dlm - - console = Console() - - parsed = parse_file(path) - store = for_dlm(parsed.frontmatter.dlm_id) - - console.print( - f"[dim]metrics watch:[/dim] polling {store.root} every {poll_seconds}s (Ctrl-C to exit)" - ) - - current_run: int | None = None - last_step_seen = 0 - last_eval_step_seen = 0 - try: - while 
True: - run_id = latest_run_id(store.root) - if run_id is None: - time.sleep(poll_seconds) - continue - if run_id != current_run: - current_run = run_id - last_step_seen = 0 - last_eval_step_seen = 0 - console.print(f"[green]→ following run_id={run_id}[/green]") - - new_steps = steps_for_run(store.root, run_id, since_step=last_step_seen) - for s in new_steps: - console.print( - f" step {s.step:>5} loss={s.loss} lr={s.lr} grad_norm={s.grad_norm}" - ) - last_step_seen = s.step - - new_evals = evals_for_run(store.root, run_id, since_step=last_eval_step_seen) - for e in new_evals: - console.print( - f" [yellow]eval @ step {e.step}[/yellow] " - f"val_loss={e.val_loss} perplexity={e.perplexity}" - ) - last_eval_step_seen = e.step - - time.sleep(poll_seconds) - except KeyboardInterrupt: - console.print("[dim]metrics watch:[/dim] bye") - - -def _parse_since_arg(since: str, console: object) -> timedelta: - """Parse `24h` / `7d` / `30m` / `10s` into a timedelta.""" - from datetime import timedelta - - from rich.console import Console - - assert isinstance(console, Console) - - if not since: - raise typer.Exit(code=2) - unit = since[-1].lower() - try: - value = int(since[:-1]) - except ValueError: - console.print(f"[red]metrics:[/red] --since {since!r} not an integer+unit") - raise typer.Exit(code=2) from None - if unit == "s": - return timedelta(seconds=value) - if unit == "m": - return timedelta(minutes=value) - if unit == "h": - return timedelta(hours=value) - if unit == "d": - return timedelta(days=value) - console.print(f"[red]metrics:[/red] --since {since!r} unit must be s/m/h/d") - raise typer.Exit(code=2) - - def show_cmd( path: Annotated[Path, typer.Argument(help=".dlm file to inspect.")], json_out: Annotated[bool, typer.Option("--json", help="Emit machine-readable JSON.")] = False, @@ -2882,6 +2668,3 @@ def _render_training_sources_text(console: object, records: list[dict[str, objec else: size = int(tb) if isinstance(tb, int) else 0 console.print(f" {path} {fc} 
file(s), {_human_size(size)}") - - - diff --git a/src/dlm/cli/commands/metrics.py b/src/dlm/cli/commands/metrics.py new file mode 100644 index 0000000..ebbbd73 --- /dev/null +++ b/src/dlm/cli/commands/metrics.py @@ -0,0 +1,220 @@ +"""`dlm metrics` — query / watch the per-store metrics database.""" + +from __future__ import annotations + +from datetime import timedelta +from pathlib import Path +from typing import Annotated + +import typer + + +def metrics_cmd( + path: Annotated[Path, typer.Argument(help=".dlm file whose store we query.")], + json_out: Annotated[bool, typer.Option("--json", help="Emit JSON.")] = False, + csv_out: Annotated[bool, typer.Option("--csv", help="Emit CSV.")] = False, + run_id: Annotated[ + int | None, + typer.Option("--run-id", help="Only show this run (drill-down)."), + ] = None, + phase: Annotated[ + str | None, + typer.Option("--phase", help="Filter by phase: sft|dpo|orpo|cpt."), + ] = None, + since: Annotated[ + str | None, + typer.Option( + "--since", + help="Time window (e.g. `24h`, `7d`, `30m`). 
Filters `started_at`.", + ), + ] = None, + limit: Annotated[int, typer.Option("--limit")] = 20, +) -> None: + """Query the per-store metrics database.""" + import csv + import json + import sys + + from rich.console import Console + + from dlm.doc.parser import parse_file + from dlm.metrics.queries import ( + evals_for_run, + evals_to_dict, + preference_mining_for_run, + preference_mining_to_dict, + recent_runs, + runs_to_dict, + steps_for_run, + steps_to_dict, + ) + from dlm.store.paths import for_dlm + + console = Console(stderr=True) + + if json_out and csv_out: + console.print("[red]metrics:[/red] --json and --csv are mutually exclusive") + raise typer.Exit(code=2) + + since_delta = _parse_since_arg(since, console) if since else None + + parsed = parse_file(path) + store = for_dlm(parsed.frontmatter.dlm_id) + + runs = recent_runs(store.root, limit=limit, phase=phase, since=since_delta, run_id=run_id) + + if run_id is not None: + # Drill-down: show this run's steps + evals. + if not runs: + console.print(f"[red]metrics:[/red] no run with run_id={run_id}") + raise typer.Exit(code=1) + run = runs[0] + steps = steps_for_run(store.root, run_id) + evals = evals_for_run(store.root, run_id) + preference_rows = preference_mining_for_run(store.root, run_id) + + if json_out: + payload = { + "run": runs_to_dict([run])[0], + "steps": steps_to_dict(steps), + "evals": evals_to_dict(evals), + "preference_mining": preference_mining_to_dict(preference_rows), + } + sys.stdout.write(json.dumps(payload, indent=2) + "\n") + return + if csv_out: + writer = csv.writer(sys.stdout) + writer.writerow(["step", "loss", "lr", "grad_norm", "val_loss"]) + eval_by_step = {e.step: e.val_loss for e in evals} + for s in steps: + writer.writerow([s.step, s.loss, s.lr, s.grad_norm, eval_by_step.get(s.step)]) + return + console.print( + f"[green]run_id={run.run_id}[/green] phase={run.phase} " + f"seed={run.seed} status={run.status} steps={len(steps)} " + f"evals={len(evals)}" + ) + if evals: + last 
= evals[-1] + console.print( + f" last eval: step={last.step} val_loss={last.val_loss} " + f"perplexity={last.perplexity}" + ) + if preference_rows: + last_pref = preference_rows[-1] + console.print( + " preference mining: " + f"events={len(preference_rows)} " + f"mined_pairs={sum(row.mined_pairs for row in preference_rows)} " + f"skipped_prompts={sum(row.skipped_prompts for row in preference_rows)} " + f"last_mode={last_pref.write_mode} " + f"judge={last_pref.judge_name}" + ) + return + + # Top-level: list runs. + if json_out: + sys.stdout.write(json.dumps({"runs": runs_to_dict(runs)}, indent=2) + "\n") + return + if csv_out: + writer = csv.writer(sys.stdout) + writer.writerow(["run_id", "phase", "seed", "status", "started_at", "ended_at"]) + for r in runs: + writer.writerow([r.run_id, r.phase, r.seed, r.status, r.started_at, r.ended_at]) + return + + if not runs: + console.print("[dim]metrics:[/dim] no runs found (hint: train first, or adjust filters)") + return + console.print(f"[bold]Runs: {len(runs)}[/bold]") + for r in runs: + console.print( + f" run_id={r.run_id} phase={r.phase} seed={r.seed} " + f"status={r.status} started={r.started_at}" + ) + + +def metrics_watch_cmd( + path: Annotated[Path, typer.Argument(help=".dlm file whose store we tail.")], + poll_seconds: Annotated[ + float, + typer.Option("--poll-seconds", help="How often to re-read the metrics DB."), + ] = 1.0, +) -> None: + """Tail the metrics DB: print new steps/evals as they land.""" + import time + + from rich.console import Console + + from dlm.doc.parser import parse_file + from dlm.metrics.queries import evals_for_run, latest_run_id, steps_for_run + from dlm.store.paths import for_dlm + + console = Console() + + parsed = parse_file(path) + store = for_dlm(parsed.frontmatter.dlm_id) + + console.print( + f"[dim]metrics watch:[/dim] polling {store.root} every {poll_seconds}s (Ctrl-C to exit)" + ) + + current_run: int | None = None + last_step_seen = 0 + last_eval_step_seen = 0 + try: + while 
True: + run_id = latest_run_id(store.root) + if run_id is None: + time.sleep(poll_seconds) + continue + if run_id != current_run: + current_run = run_id + last_step_seen = 0 + last_eval_step_seen = 0 + console.print(f"[green]→ following run_id={run_id}[/green]") + + new_steps = steps_for_run(store.root, run_id, since_step=last_step_seen) + for s in new_steps: + console.print( + f" step {s.step:>5} loss={s.loss} lr={s.lr} grad_norm={s.grad_norm}" + ) + last_step_seen = s.step + + new_evals = evals_for_run(store.root, run_id, since_step=last_eval_step_seen) + for e in new_evals: + console.print( + f" [yellow]eval @ step {e.step}[/yellow] " + f"val_loss={e.val_loss} perplexity={e.perplexity}" + ) + last_eval_step_seen = e.step + + time.sleep(poll_seconds) + except KeyboardInterrupt: + console.print("[dim]metrics watch:[/dim] bye") + + +def _parse_since_arg(since: str, console: object) -> timedelta: + """Parse `24h` / `7d` / `30m` / `10s` into a timedelta.""" + from rich.console import Console + + assert isinstance(console, Console) + + if not since: + raise typer.Exit(code=2) + unit = since[-1].lower() + try: + value = int(since[:-1]) + except ValueError: + console.print(f"[red]metrics:[/red] --since {since!r} not an integer+unit") + raise typer.Exit(code=2) from None + if unit == "s": + return timedelta(seconds=value) + if unit == "m": + return timedelta(minutes=value) + if unit == "h": + return timedelta(hours=value) + if unit == "d": + return timedelta(days=value) + console.print(f"[red]metrics:[/red] --since {since!r} unit must be s/m/h/d") + raise typer.Exit(code=2) From a84adeae392130dcd545042541847a956af84852 Mon Sep 17 00:00:00 2001 From: mfwolffe Date: Tue, 28 Apr 2026 19:11:16 -0400 Subject: [PATCH 20/46] Extract show_cmd + 9 show helpers into cli/commands/show.py --- src/dlm/cli/commands/__init__.py | 480 +----------------------------- src/dlm/cli/commands/show.py | 491 +++++++++++++++++++++++++++++++ 2 files changed, 492 insertions(+), 479 deletions(-) 
create mode 100644 src/dlm/cli/commands/show.py diff --git a/src/dlm/cli/commands/__init__.py b/src/dlm/cli/commands/__init__.py index 1c1af6b..8348f5e 100644 --- a/src/dlm/cli/commands/__init__.py +++ b/src/dlm/cli/commands/__init__.py @@ -37,6 +37,7 @@ from dlm.cli.commands.push import push_cmd as push_cmd from dlm.cli.commands.repl import repl_cmd as repl_cmd from dlm.cli.commands.serve import serve_cmd as serve_cmd +from dlm.cli.commands.show import show_cmd as show_cmd from dlm.cli.commands.synth import synth_instructions_cmd as synth_instructions_cmd from dlm.cli.commands.synth import synth_list_cmd as synth_list_cmd from dlm.cli.commands.synth import synth_revert_cmd as synth_revert_cmd @@ -2189,482 +2190,3 @@ def _verbose_runner(cmd: Sequence[str]) -> object: console.print(f"smoke: {result.smoke_output_first_line}") -def show_cmd( - path: Annotated[Path, typer.Argument(help=".dlm file to inspect.")], - json_out: Annotated[bool, typer.Option("--json", help="Emit machine-readable JSON.")] = False, -) -> None: - """Show training history, exports, and adapter state.""" - import json as _json - import sys - - from rich.console import Console - - from dlm.doc.errors import DlmParseError - from dlm.doc.parser import parse_file - from dlm.store.errors import ManifestCorruptError - from dlm.store.inspect import inspect_store - from dlm.store.paths import for_dlm - - console = Console(stderr=True) - out_console = Console() - - try: - parsed = parse_file(path) - except (DlmParseError, OSError) as exc: - console.print(f"[red]show:[/red] {exc}") - raise typer.Exit(code=1) from exc - - store = for_dlm(parsed.frontmatter.dlm_id) - training_sources, discovered_configs = _summarize_training_sources_and_discovered( - parsed, path.resolve().parent - ) - # The per-document cache config comes from frontmatter, not on-disk - # state — report it on both the pre-train and initialized-store paths - # so authors can sanity-check the knobs before `dlm train` runs. 
- cache_cfg = parsed.frontmatter.training.cache - training_cache_config: dict[str, object] = { - "enabled": cache_cfg.enabled, - "max_bytes": cache_cfg.max_bytes, - "prune_older_than_days": cache_cfg.prune_older_than_days, - } - - # Store may not exist yet (no `dlm train` run). Treat that as an - # informational state rather than an error — useful after `dlm init`. - if not store.manifest.exists(): - if json_out: - payload: dict[str, object] = { - "dlm_id": parsed.frontmatter.dlm_id, - "base_model": parsed.frontmatter.base_model, - "store_initialized": False, - "source_path": str(path.resolve()), - "training_cache_config": training_cache_config, - } - if training_sources is not None: - payload["training_sources"] = training_sources - if discovered_configs: - payload["discovered_training_configs"] = discovered_configs - sys.stdout.write(_json.dumps(payload, indent=2) + "\n") - else: - out_console.print(f"[bold]{path}[/bold]") - out_console.print(f" dlm_id: {parsed.frontmatter.dlm_id}") - out_console.print(f" base_model: {parsed.frontmatter.base_model}") - out_console.print(" store: [dim]not yet initialized (run `dlm train`)[/dim]") - if training_sources: - _render_training_sources_text(out_console, training_sources) - return - - try: - inspection = inspect_store(store, source_path=path.resolve()) - except ManifestCorruptError as exc: - console.print(f"[red]show:[/red] {exc}") - raise typer.Exit(code=1) from exc - - training_cache = _summarize_training_cache(store.tokenized_cache_dir, store.root) - gate = _summarize_gate(store) - preference_mining = _summarize_preference_mining(store.root) - base_security = _summarize_base_security(parsed.frontmatter.base_model) - - if json_out: - payload_full = _inspection_to_dict(inspection) - if training_sources is not None: - payload_full["training_sources"] = training_sources - if discovered_configs: - payload_full["discovered_training_configs"] = discovered_configs - if training_cache is not None: - 
payload_full["training_cache"] = training_cache - payload_full["training_cache_config"] = training_cache_config - if gate is not None: - payload_full["gate"] = gate - if preference_mining is not None: - payload_full["preference_mining"] = preference_mining - payload_full["preference_mining_runs"] = preference_mining["run_count"] - payload_full["total_auto_mined_pairs"] = preference_mining["total_mined_pairs"] - if base_security is not None: - payload_full["base_security"] = base_security - # Write JSON to raw stdout — Rich's Console wraps lines at the - # terminal width and would corrupt the JSON. - sys.stdout.write(_json.dumps(payload_full, indent=2, default=str) + "\n") - return - - _render_inspection_text(out_console, path, inspection) - if training_sources: - _render_training_sources_text(out_console, training_sources) - if training_cache is not None and training_cache.get("entry_count", 0): - _render_training_cache_text(out_console, training_cache) - if gate is not None: - _render_gate_text(out_console, gate) - if base_security is not None and base_security.get("trust_remote_code"): - _render_base_security_text(out_console, base_security) - - -def _inspection_to_dict(inspection: object) -> dict[str, object]: - """Flatten a StoreInspection into a JSON-safe dict. - - Schema is the v1 contract for `dlm show --json`; any reshape is a - version bump (recorded in tests/golden/cli-json/). 
- """ - from dlm.store.inspect import StoreInspection - - assert isinstance(inspection, StoreInspection) - return { - "dlm_id": inspection.dlm_id, - "path": str(inspection.path), - "base_model": inspection.base_model, - "base_model_revision": inspection.base_model_revision, - "adapter_version": inspection.adapter_version, - "training_runs": inspection.training_runs, - "last_trained_at": inspection.last_trained_at, - "has_adapter_current": inspection.has_adapter_current, - "replay_size_bytes": inspection.replay_size_bytes, - "total_size_bytes": inspection.total_size_bytes, - "source_path": str(inspection.source_path) if inspection.source_path else None, - "orphaned": inspection.orphaned, - "exports": [e.model_dump(mode="json") for e in inspection.exports], - "content_hashes": dict(inspection.content_hashes), - "pinned_versions": dict(inspection.pinned_versions), - "named_adapters": [ - { - "name": a.name, - "has_current": a.has_current, - "latest_version": a.latest_version, - } - for a in inspection.named_adapters - ], - } - - -def _render_inspection_text(console: object, path: Path, inspection: object) -> None: - """Human-readable `dlm show` output.""" - from rich.console import Console - - from dlm.store.inspect import StoreInspection - - assert isinstance(console, Console) - assert isinstance(inspection, StoreInspection) - - console.print(f"[bold]{path}[/bold]") - console.print(f" dlm_id: {inspection.dlm_id}") - rev = inspection.base_model_revision - rev_str = f" (revision {rev[:7]})" if rev else "" - console.print(f" base_model: {inspection.base_model}{rev_str}") - console.print( - f" store: {inspection.path} ({_human_size(inspection.total_size_bytes)})" - ) - if inspection.named_adapters: - # Multi-adapter store: render the per-adapter pointers rather - # than the flat field (which stays 0 on multi-adapter docs). 
- console.print(" adapters:") - for adapter in inspection.named_adapters: - if adapter.has_current: - console.print(f" {adapter.name:16}v{adapter.latest_version:04d}") - else: - console.print(f" {adapter.name:16}[dim]no current pointer[/dim]") - elif inspection.has_adapter_current: - console.print(f" adapter: v{inspection.adapter_version:04d}") - else: - console.print(" adapter: [dim]none (no `dlm train` yet)[/dim]") - last = inspection.last_trained_at - last_str = f" — last {last.isoformat(timespec='seconds')}" if last else "" - console.print(f" training runs: {inspection.training_runs}{last_str}") - console.print(f" exports: {len(inspection.exports)}") - for exp in inspection.exports: - tag = f" — {exp.ollama_name}" if exp.ollama_name else "" - console.print(f" {exp.quant}{tag}") - if inspection.orphaned: - console.print(" [yellow]orphaned:[/yellow] source .dlm is missing or mismatched") - - -def _summarize_training_sources(parsed: object, base_path: Path) -> list[dict[str, object]] | None: - """Best-effort resolution of `training.sources` for `dlm show`. - - Returns None when the frontmatter declares no directives; returns - a list of per-source dicts otherwise. Failures to expand (missing - paths, policy escapes) fall back to declared-only records so the - show output stays useful for debugging a misconfigured directive. - """ - records, _ = _summarize_training_sources_and_discovered(parsed, base_path) - return records - - -def _summarize_training_sources_and_discovered( - parsed: object, base_path: Path -) -> tuple[list[dict[str, object]] | None, list[dict[str, object]]]: - """Like `_summarize_training_sources` but also returns the per-anchor - `.dlm/training.yaml` + `.dlm/ignore` discovery records. - - Returns `(training_sources, discovered_configs)`. `discovered_configs` - is always a list (empty when nothing was found or the expansion - failed); `training_sources` matches the single-value helper's - contract. 
- """ - from dlm.directives import DirectiveError, expand_sources - from dlm.doc.parser import ParsedDlm - - assert isinstance(parsed, ParsedDlm) - directives = parsed.frontmatter.training.sources - if not directives: - return None, [] - - declared: list[dict[str, object]] = [ - { - "path": d.path, - "include": list(d.include), - "exclude": list(d.exclude), - "max_files": d.max_files, - "max_bytes_per_file": d.max_bytes_per_file, - } - for d in directives - ] - - try: - result = expand_sources(parsed, base_path=base_path) - except (DirectiveError, OSError): - return declared, [] - - records: list[dict[str, object]] = [] - for decl, prov in zip(declared, result.provenance, strict=False): - records.append( - { - **decl, - "file_count": prov.file_count, - "total_bytes": prov.total_bytes, - "skipped_binary": prov.skipped_binary, - "skipped_encoding": prov.skipped_encoding, - "skipped_over_size": prov.skipped_over_size, - } - ) - # If the expander returned fewer entries than declared (shouldn't - # happen on success but defensive), pad with declared-only. - if len(records) < len(declared): - records.extend(declared[len(records) :]) - - discovered_records: list[dict[str, object]] = [] - for dc in result.discovered: - discovered_records.append( - { - "anchor": str(dc.anchor), - "has_training_yaml": dc.config is not None, - "has_ignore": bool(dc.ignore_rules), - "include": list(dc.config.include) if dc.config else [], - "exclude": list(dc.config.exclude) if dc.config else [], - "exclude_defaults": (dc.config.exclude_defaults if dc.config else True), - "metadata": dict(dc.config.metadata) if dc.config else {}, - "ignore_rules": len(dc.ignore_rules), - } - ) - return records, discovered_records - - -def _summarize_training_cache(cache_dir: Path, store_root: Path) -> dict[str, object] | None: - """Return a JSON-friendly snapshot of the tokenized-section cache. - - None when the cache dir doesn't exist (store never trained with - the cache, or pre-Sprint-31 layout). 
Cheap — reads the manifest - only, not the entry files. - """ - if not cache_dir.is_dir(): - return None - from dlm.directives.cache import TokenizedCache - from dlm.metrics import queries as _queries - - cache = TokenizedCache.open(cache_dir) - last = _queries.latest_tokenization(store_root) - return { - "path": str(cache_dir), - "entry_count": cache.entry_count, - "bytes": cache.total_bytes, - "last_run_hit_rate": last.hit_rate if last else None, - "last_run_id": last.run_id if last else None, - } - - -def _summarize_gate(store: object) -> dict[str, object] | None: - """Return a JSON-friendly snapshot of the learned adapter gate. - - None when the store has no gate config (pre-Sprint-34 runs, or - `training.gate.enabled` was false). Reads two sources: the - on-disk `gate_config.json` for mode + adapter order, and the - metrics `gate_events` table for per-adapter mean weight from the - most recent run that recorded a gate. - """ - import json as _json - - from dlm.store.paths import StorePath - from dlm.train.gate.paths import gate_config_path - - assert isinstance(store, StorePath) - cfg_path = gate_config_path(store) - - from dlm.metrics import queries as _queries - from dlm.train.gate.module import GateMetadata - - events = _queries.latest_gate_events(store.root) - # Divergence path: training raised before writing a config, but we - # still emit one GateEvent per adapter with mode="diverged" so - # operators can see the failure. Surface it even when the config - # file is absent. 
- if not cfg_path.exists(): - if events and events[0].mode == "diverged": - return { - "mode": "diverged", - "adapter_names": [e.adapter_name for e in events], - "input_dim": None, - "hidden_proj_dim": None, - "last_run_id": events[0].run_id, - "per_adapter": [ - { - "adapter_name": e.adapter_name, - "mean_weight": e.mean_weight, - "sample_count": e.sample_count, - "mode": e.mode, - } - for e in events - ], - } - return None - - raw = _json.loads(cfg_path.read_text(encoding="utf-8")) - meta = GateMetadata.from_json(raw) - per_adapter: list[dict[str, object]] = [] - run_id: int | None = None - if events: - run_id = events[0].run_id - per_adapter = [ - { - "adapter_name": e.adapter_name, - "mean_weight": e.mean_weight, - "sample_count": e.sample_count, - "mode": e.mode, - } - for e in events - ] - else: - # No recorded events yet; fall back to the config so `dlm show` - # still reports that a gate exists and in which mode. - per_adapter = [{"adapter_name": name} for name in meta.adapter_names] - return { - "mode": meta.mode, - "adapter_names": list(meta.adapter_names), - "input_dim": meta.input_dim, - "hidden_proj_dim": meta.hidden_proj_dim, - "last_run_id": run_id, - "per_adapter": per_adapter, - } - - -def _summarize_preference_mining(store_root: Path) -> dict[str, object] | None: - """Return the latest preference-mine summary for `dlm show --json`.""" - from dlm.metrics import queries as _queries - - totals = _queries.preference_mining_totals(store_root) - if totals is None: - return None - last = _queries.latest_preference_mining(store_root) - assert last is not None - rows = _queries.preference_mining_for_run(store_root, last.run_id) - return { - "run_count": totals.run_count, - "event_count": totals.event_count, - "total_mined_pairs": totals.total_mined_pairs, - "total_skipped_prompts": totals.total_skipped_prompts, - "last_run_id": last.run_id, - "last_run_event_count": len(rows), - "last_event": _queries.preference_mining_to_dict([last])[0], - } - - -def 
_summarize_base_security(base_model_key: str) -> dict[str, object] | None: - """Surface security-sensitive base-model flags for `dlm show`. - - Today that's just `trust_remote_code` — a flag that causes the HF - loader to execute Python from the model repo. We resolve the spec - out of the in-process registry (no network: the resolver reads a - frozen Python dict) so users can see which bases opt in without - grepping source. Returns None when the key doesn't resolve (an - `hf:...` escape hatch that isn't in the registry); the caller - silently skips in that case. - """ - from dlm.base_models import resolve as resolve_base_model - from dlm.base_models.errors import BaseModelError - - try: - spec = resolve_base_model(base_model_key, accept_license=True) - except BaseModelError: - return None - return { - "base_model": spec.key, - "architecture": spec.architecture, - "trust_remote_code": bool(spec.trust_remote_code), - } - - -def _render_base_security_text(console: object, snap: dict[str, object]) -> None: - from rich.console import Console - - assert isinstance(console, Console) - arch = snap.get("architecture", "?") - console.print( - f" [yellow]security:[/yellow] base uses [red]trust_remote_code=True[/red] " - f"(arch={arch}) — HF loader will execute Python from the model repo" - ) - - -def _render_gate_text(console: object, snap: dict[str, object]) -> None: - from rich.console import Console - - assert isinstance(console, Console) - mode = snap.get("mode", "?") - if mode == "diverged": - console.print(" adapter gate ([red]diverged[/red]):") - console.print( - " [yellow]gate training produced a non-finite loss; " - "store fell back to gate-less routing[/yellow]" - ) - else: - console.print(f" adapter gate ({mode}):") - per_adapter = snap.get("per_adapter", []) - if isinstance(per_adapter, list): - for entry in per_adapter: - if not isinstance(entry, dict): - continue - name = entry.get("adapter_name", "?") - weight = entry.get("mean_weight") - count = 
entry.get("sample_count") - if weight is None: - console.print(f" {name} [dim](no recorded events)[/dim]") - else: - w = float(weight) if isinstance(weight, (int, float)) else 0.0 - c = count if isinstance(count, int) else 0 - console.print(f" {name:<16} weight={w:.3f} samples={c}") - - -def _render_training_cache_text(console: object, snap: dict[str, object]) -> None: - from rich.console import Console - - assert isinstance(console, Console) - ec_raw = snap.get("entry_count", 0) - by_raw = snap.get("bytes", 0) - entry_count = ec_raw if isinstance(ec_raw, int) else 0 - byte_count = by_raw if isinstance(by_raw, int) else 0 - console.print(" tokenized cache:") - console.print(f" entries: {entry_count}") - console.print(f" size: {_human_size(byte_count)}") - rate = snap.get("last_run_hit_rate") - if isinstance(rate, (int, float)): - console.print(f" last hit rate: {float(rate):.1%}") - - -def _render_training_sources_text(console: object, records: list[dict[str, object]]) -> None: - from rich.console import Console - - assert isinstance(console, Console) - console.print(" training sources:") - for rec in records: - path = rec["path"] - fc = rec.get("file_count") - tb = rec.get("total_bytes") - if fc is None: - console.print(f" {path} [dim](not expanded)[/dim]") - else: - size = int(tb) if isinstance(tb, int) else 0 - console.print(f" {path} {fc} file(s), {_human_size(size)}") diff --git a/src/dlm/cli/commands/show.py b/src/dlm/cli/commands/show.py new file mode 100644 index 0000000..639c9ce --- /dev/null +++ b/src/dlm/cli/commands/show.py @@ -0,0 +1,491 @@ +"""`dlm show` — show training history, exports, and adapter state.""" + +from __future__ import annotations + +from pathlib import Path +from typing import Annotated + +import typer + +from dlm.cli.commands._shared import _human_size + + +def show_cmd( + path: Annotated[Path, typer.Argument(help=".dlm file to inspect.")], + json_out: Annotated[bool, typer.Option("--json", help="Emit machine-readable JSON.")] = 
False, +) -> None: + """Show training history, exports, and adapter state.""" + import json as _json + import sys + + from rich.console import Console + + from dlm.doc.errors import DlmParseError + from dlm.doc.parser import parse_file + from dlm.store.errors import ManifestCorruptError + from dlm.store.inspect import inspect_store + from dlm.store.paths import for_dlm + + console = Console(stderr=True) + out_console = Console() + + try: + parsed = parse_file(path) + except (DlmParseError, OSError) as exc: + console.print(f"[red]show:[/red] {exc}") + raise typer.Exit(code=1) from exc + + store = for_dlm(parsed.frontmatter.dlm_id) + training_sources, discovered_configs = _summarize_training_sources_and_discovered( + parsed, path.resolve().parent + ) + # The per-document cache config comes from frontmatter, not on-disk + # state — report it on both the pre-train and initialized-store paths + # so authors can sanity-check the knobs before `dlm train` runs. + cache_cfg = parsed.frontmatter.training.cache + training_cache_config: dict[str, object] = { + "enabled": cache_cfg.enabled, + "max_bytes": cache_cfg.max_bytes, + "prune_older_than_days": cache_cfg.prune_older_than_days, + } + + # Store may not exist yet (no `dlm train` run). Treat that as an + # informational state rather than an error — useful after `dlm init`. 
+ if not store.manifest.exists(): + if json_out: + payload: dict[str, object] = { + "dlm_id": parsed.frontmatter.dlm_id, + "base_model": parsed.frontmatter.base_model, + "store_initialized": False, + "source_path": str(path.resolve()), + "training_cache_config": training_cache_config, + } + if training_sources is not None: + payload["training_sources"] = training_sources + if discovered_configs: + payload["discovered_training_configs"] = discovered_configs + sys.stdout.write(_json.dumps(payload, indent=2) + "\n") + else: + out_console.print(f"[bold]{path}[/bold]") + out_console.print(f" dlm_id: {parsed.frontmatter.dlm_id}") + out_console.print(f" base_model: {parsed.frontmatter.base_model}") + out_console.print(" store: [dim]not yet initialized (run `dlm train`)[/dim]") + if training_sources: + _render_training_sources_text(out_console, training_sources) + return + + try: + inspection = inspect_store(store, source_path=path.resolve()) + except ManifestCorruptError as exc: + console.print(f"[red]show:[/red] {exc}") + raise typer.Exit(code=1) from exc + + training_cache = _summarize_training_cache(store.tokenized_cache_dir, store.root) + gate = _summarize_gate(store) + preference_mining = _summarize_preference_mining(store.root) + base_security = _summarize_base_security(parsed.frontmatter.base_model) + + if json_out: + payload_full = _inspection_to_dict(inspection) + if training_sources is not None: + payload_full["training_sources"] = training_sources + if discovered_configs: + payload_full["discovered_training_configs"] = discovered_configs + if training_cache is not None: + payload_full["training_cache"] = training_cache + payload_full["training_cache_config"] = training_cache_config + if gate is not None: + payload_full["gate"] = gate + if preference_mining is not None: + payload_full["preference_mining"] = preference_mining + payload_full["preference_mining_runs"] = preference_mining["run_count"] + payload_full["total_auto_mined_pairs"] = 
preference_mining["total_mined_pairs"] + if base_security is not None: + payload_full["base_security"] = base_security + # Write JSON to raw stdout — Rich's Console wraps lines at the + # terminal width and would corrupt the JSON. + sys.stdout.write(_json.dumps(payload_full, indent=2, default=str) + "\n") + return + + _render_inspection_text(out_console, path, inspection) + if training_sources: + _render_training_sources_text(out_console, training_sources) + if training_cache is not None and training_cache.get("entry_count", 0): + _render_training_cache_text(out_console, training_cache) + if gate is not None: + _render_gate_text(out_console, gate) + if base_security is not None and base_security.get("trust_remote_code"): + _render_base_security_text(out_console, base_security) + + +def _inspection_to_dict(inspection: object) -> dict[str, object]: + """Flatten a StoreInspection into a JSON-safe dict. + + Schema is the v1 contract for `dlm show --json`; any reshape is a + version bump (recorded in tests/golden/cli-json/). 
+ """ + from dlm.store.inspect import StoreInspection + + assert isinstance(inspection, StoreInspection) + return { + "dlm_id": inspection.dlm_id, + "path": str(inspection.path), + "base_model": inspection.base_model, + "base_model_revision": inspection.base_model_revision, + "adapter_version": inspection.adapter_version, + "training_runs": inspection.training_runs, + "last_trained_at": inspection.last_trained_at, + "has_adapter_current": inspection.has_adapter_current, + "replay_size_bytes": inspection.replay_size_bytes, + "total_size_bytes": inspection.total_size_bytes, + "source_path": str(inspection.source_path) if inspection.source_path else None, + "orphaned": inspection.orphaned, + "exports": [e.model_dump(mode="json") for e in inspection.exports], + "content_hashes": dict(inspection.content_hashes), + "pinned_versions": dict(inspection.pinned_versions), + "named_adapters": [ + { + "name": a.name, + "has_current": a.has_current, + "latest_version": a.latest_version, + } + for a in inspection.named_adapters + ], + } + + +def _render_inspection_text(console: object, path: Path, inspection: object) -> None: + """Human-readable `dlm show` output.""" + from rich.console import Console + + from dlm.store.inspect import StoreInspection + + assert isinstance(console, Console) + assert isinstance(inspection, StoreInspection) + + console.print(f"[bold]{path}[/bold]") + console.print(f" dlm_id: {inspection.dlm_id}") + rev = inspection.base_model_revision + rev_str = f" (revision {rev[:7]})" if rev else "" + console.print(f" base_model: {inspection.base_model}{rev_str}") + console.print( + f" store: {inspection.path} ({_human_size(inspection.total_size_bytes)})" + ) + if inspection.named_adapters: + # Multi-adapter store: render the per-adapter pointers rather + # than the flat field (which stays 0 on multi-adapter docs). 
+ console.print(" adapters:") + for adapter in inspection.named_adapters: + if adapter.has_current: + console.print(f" {adapter.name:16}v{adapter.latest_version:04d}") + else: + console.print(f" {adapter.name:16}[dim]no current pointer[/dim]") + elif inspection.has_adapter_current: + console.print(f" adapter: v{inspection.adapter_version:04d}") + else: + console.print(" adapter: [dim]none (no `dlm train` yet)[/dim]") + last = inspection.last_trained_at + last_str = f" — last {last.isoformat(timespec='seconds')}" if last else "" + console.print(f" training runs: {inspection.training_runs}{last_str}") + console.print(f" exports: {len(inspection.exports)}") + for exp in inspection.exports: + tag = f" — {exp.ollama_name}" if exp.ollama_name else "" + console.print(f" {exp.quant}{tag}") + if inspection.orphaned: + console.print(" [yellow]orphaned:[/yellow] source .dlm is missing or mismatched") + + +def _summarize_training_sources(parsed: object, base_path: Path) -> list[dict[str, object]] | None: + """Best-effort resolution of `training.sources` for `dlm show`. + + Returns None when the frontmatter declares no directives; returns + a list of per-source dicts otherwise. Failures to expand (missing + paths, policy escapes) fall back to declared-only records so the + show output stays useful for debugging a misconfigured directive. + """ + records, _ = _summarize_training_sources_and_discovered(parsed, base_path) + return records + + +def _summarize_training_sources_and_discovered( + parsed: object, base_path: Path +) -> tuple[list[dict[str, object]] | None, list[dict[str, object]]]: + """Like `_summarize_training_sources` but also returns the per-anchor + `.dlm/training.yaml` + `.dlm/ignore` discovery records. + + Returns `(training_sources, discovered_configs)`. `discovered_configs` + is always a list (empty when nothing was found or the expansion + failed); `training_sources` matches the single-value helper's + contract. 
+ """ + from dlm.directives import DirectiveError, expand_sources + from dlm.doc.parser import ParsedDlm + + assert isinstance(parsed, ParsedDlm) + directives = parsed.frontmatter.training.sources + if not directives: + return None, [] + + declared: list[dict[str, object]] = [ + { + "path": d.path, + "include": list(d.include), + "exclude": list(d.exclude), + "max_files": d.max_files, + "max_bytes_per_file": d.max_bytes_per_file, + } + for d in directives + ] + + try: + result = expand_sources(parsed, base_path=base_path) + except (DirectiveError, OSError): + return declared, [] + + records: list[dict[str, object]] = [] + for decl, prov in zip(declared, result.provenance, strict=False): + records.append( + { + **decl, + "file_count": prov.file_count, + "total_bytes": prov.total_bytes, + "skipped_binary": prov.skipped_binary, + "skipped_encoding": prov.skipped_encoding, + "skipped_over_size": prov.skipped_over_size, + } + ) + # If the expander returned fewer entries than declared (shouldn't + # happen on success but defensive), pad with declared-only. + if len(records) < len(declared): + records.extend(declared[len(records) :]) + + discovered_records: list[dict[str, object]] = [] + for dc in result.discovered: + discovered_records.append( + { + "anchor": str(dc.anchor), + "has_training_yaml": dc.config is not None, + "has_ignore": bool(dc.ignore_rules), + "include": list(dc.config.include) if dc.config else [], + "exclude": list(dc.config.exclude) if dc.config else [], + "exclude_defaults": (dc.config.exclude_defaults if dc.config else True), + "metadata": dict(dc.config.metadata) if dc.config else {}, + "ignore_rules": len(dc.ignore_rules), + } + ) + return records, discovered_records + + +def _summarize_training_cache(cache_dir: Path, store_root: Path) -> dict[str, object] | None: + """Return a JSON-friendly snapshot of the tokenized-section cache. + + None when the cache dir doesn't exist (store never trained with + the cache, or pre-Sprint-31 layout). 
Cheap — reads the manifest + only, not the entry files. + """ + if not cache_dir.is_dir(): + return None + from dlm.directives.cache import TokenizedCache + from dlm.metrics import queries as _queries + + cache = TokenizedCache.open(cache_dir) + last = _queries.latest_tokenization(store_root) + return { + "path": str(cache_dir), + "entry_count": cache.entry_count, + "bytes": cache.total_bytes, + "last_run_hit_rate": last.hit_rate if last else None, + "last_run_id": last.run_id if last else None, + } + + +def _summarize_gate(store: object) -> dict[str, object] | None: + """Return a JSON-friendly snapshot of the learned adapter gate. + + None when the store has no gate config (pre-Sprint-34 runs, or + `training.gate.enabled` was false). Reads two sources: the + on-disk `gate_config.json` for mode + adapter order, and the + metrics `gate_events` table for per-adapter mean weight from the + most recent run that recorded a gate. + """ + import json as _json + + from dlm.store.paths import StorePath + from dlm.train.gate.paths import gate_config_path + + assert isinstance(store, StorePath) + cfg_path = gate_config_path(store) + + from dlm.metrics import queries as _queries + from dlm.train.gate.module import GateMetadata + + events = _queries.latest_gate_events(store.root) + # Divergence path: training raised before writing a config, but we + # still emit one GateEvent per adapter with mode="diverged" so + # operators can see the failure. Surface it even when the config + # file is absent. 
+ if not cfg_path.exists(): + if events and events[0].mode == "diverged": + return { + "mode": "diverged", + "adapter_names": [e.adapter_name for e in events], + "input_dim": None, + "hidden_proj_dim": None, + "last_run_id": events[0].run_id, + "per_adapter": [ + { + "adapter_name": e.adapter_name, + "mean_weight": e.mean_weight, + "sample_count": e.sample_count, + "mode": e.mode, + } + for e in events + ], + } + return None + + raw = _json.loads(cfg_path.read_text(encoding="utf-8")) + meta = GateMetadata.from_json(raw) + per_adapter: list[dict[str, object]] = [] + run_id: int | None = None + if events: + run_id = events[0].run_id + per_adapter = [ + { + "adapter_name": e.adapter_name, + "mean_weight": e.mean_weight, + "sample_count": e.sample_count, + "mode": e.mode, + } + for e in events + ] + else: + # No recorded events yet; fall back to the config so `dlm show` + # still reports that a gate exists and in which mode. + per_adapter = [{"adapter_name": name} for name in meta.adapter_names] + return { + "mode": meta.mode, + "adapter_names": list(meta.adapter_names), + "input_dim": meta.input_dim, + "hidden_proj_dim": meta.hidden_proj_dim, + "last_run_id": run_id, + "per_adapter": per_adapter, + } + + +def _summarize_preference_mining(store_root: Path) -> dict[str, object] | None: + """Return the latest preference-mine summary for `dlm show --json`.""" + from dlm.metrics import queries as _queries + + totals = _queries.preference_mining_totals(store_root) + if totals is None: + return None + last = _queries.latest_preference_mining(store_root) + assert last is not None + rows = _queries.preference_mining_for_run(store_root, last.run_id) + return { + "run_count": totals.run_count, + "event_count": totals.event_count, + "total_mined_pairs": totals.total_mined_pairs, + "total_skipped_prompts": totals.total_skipped_prompts, + "last_run_id": last.run_id, + "last_run_event_count": len(rows), + "last_event": _queries.preference_mining_to_dict([last])[0], + } + + +def 
_summarize_base_security(base_model_key: str) -> dict[str, object] | None: + """Surface security-sensitive base-model flags for `dlm show`. + + Today that's just `trust_remote_code` — a flag that causes the HF + loader to execute Python from the model repo. We resolve the spec + out of the in-process registry (no network: the resolver reads a + frozen Python dict) so users can see which bases opt in without + grepping source. Returns None when the key doesn't resolve (an + `hf:...` escape hatch that isn't in the registry); the caller + silently skips in that case. + """ + from dlm.base_models import resolve as resolve_base_model + from dlm.base_models.errors import BaseModelError + + try: + spec = resolve_base_model(base_model_key, accept_license=True) + except BaseModelError: + return None + return { + "base_model": spec.key, + "architecture": spec.architecture, + "trust_remote_code": bool(spec.trust_remote_code), + } + + +def _render_base_security_text(console: object, snap: dict[str, object]) -> None: + from rich.console import Console + + assert isinstance(console, Console) + arch = snap.get("architecture", "?") + console.print( + f" [yellow]security:[/yellow] base uses [red]trust_remote_code=True[/red] " + f"(arch={arch}) — HF loader will execute Python from the model repo" + ) + + +def _render_gate_text(console: object, snap: dict[str, object]) -> None: + from rich.console import Console + + assert isinstance(console, Console) + mode = snap.get("mode", "?") + if mode == "diverged": + console.print(" adapter gate ([red]diverged[/red]):") + console.print( + " [yellow]gate training produced a non-finite loss; " + "store fell back to gate-less routing[/yellow]" + ) + else: + console.print(f" adapter gate ({mode}):") + per_adapter = snap.get("per_adapter", []) + if isinstance(per_adapter, list): + for entry in per_adapter: + if not isinstance(entry, dict): + continue + name = entry.get("adapter_name", "?") + weight = entry.get("mean_weight") + count = 
entry.get("sample_count") + if weight is None: + console.print(f" {name} [dim](no recorded events)[/dim]") + else: + w = float(weight) if isinstance(weight, (int, float)) else 0.0 + c = count if isinstance(count, int) else 0 + console.print(f" {name:<16} weight={w:.3f} samples={c}") + + +def _render_training_cache_text(console: object, snap: dict[str, object]) -> None: + from rich.console import Console + + assert isinstance(console, Console) + ec_raw = snap.get("entry_count", 0) + by_raw = snap.get("bytes", 0) + entry_count = ec_raw if isinstance(ec_raw, int) else 0 + byte_count = by_raw if isinstance(by_raw, int) else 0 + console.print(" tokenized cache:") + console.print(f" entries: {entry_count}") + console.print(f" size: {_human_size(byte_count)}") + rate = snap.get("last_run_hit_rate") + if isinstance(rate, (int, float)): + console.print(f" last hit rate: {float(rate):.1%}") + + +def _render_training_sources_text(console: object, records: list[dict[str, object]]) -> None: + from rich.console import Console + + assert isinstance(console, Console) + console.print(" training sources:") + for rec in records: + path = rec["path"] + fc = rec.get("file_count") + tb = rec.get("total_bytes") + if fc is None: + console.print(f" {path} [dim](not expanded)[/dim]") + else: + size = int(tb) if isinstance(tb, int) else 0 + console.print(f" {path} {fc} file(s), {_human_size(size)}") From dc211cfc644d52018f0ca2830bc7d96b8013a0a4 Mon Sep 17 00:00:00 2001 From: mfwolffe Date: Tue, 28 Apr 2026 19:13:46 -0400 Subject: [PATCH 21/46] Extract init_cmd + 4 init helpers into cli/commands/init.py --- src/dlm/cli/commands/__init__.py | 376 +---------------------------- src/dlm/cli/commands/init.py | 383 ++++++++++++++++++++++++++++++ tests/unit/cli/test_init_edges.py | 5 +- 3 files changed, 389 insertions(+), 375 deletions(-) create mode 100644 src/dlm/cli/commands/init.py diff --git a/src/dlm/cli/commands/__init__.py b/src/dlm/cli/commands/__init__.py index 8348f5e..616786a 100644 --- 
a/src/dlm/cli/commands/__init__.py +++ b/src/dlm/cli/commands/__init__.py @@ -25,6 +25,8 @@ from dlm.cli.commands.cache import cache_show_cmd as cache_show_cmd from dlm.cli.commands.doctor import doctor_cmd as doctor_cmd from dlm.cli.commands.harvest import harvest_cmd as harvest_cmd +from dlm.cli.commands.init import _prompt_accept_license as _prompt_accept_license +from dlm.cli.commands.init import init_cmd as init_cmd from dlm.cli.commands.metrics import metrics_cmd as metrics_cmd from dlm.cli.commands.metrics import metrics_watch_cmd as metrics_watch_cmd from dlm.cli.commands.migrate import migrate_cmd as migrate_cmd @@ -53,380 +55,6 @@ def _stub(sprint: str, subject: str) -> None: ) -def init_cmd( - path: Annotated[Path, typer.Argument(help="Target .dlm path to create.")], - base: Annotated[ - str, typer.Option("--base", help="Base model key or hf:org/name.") - ] = "qwen2.5-1.5b", - template: Annotated[ - str | None, - typer.Option( - "--template", - help="Start from a named gallery template (see `dlm templates list`).", - ), - ] = None, - i_accept_license: Annotated[ - bool, - typer.Option("--i-accept-license", help="Accept gated base-model license."), - ] = False, - force: Annotated[ - bool, - typer.Option("--force", help="Overwrite an existing .dlm at path."), - ] = False, - skip_export_probes: Annotated[ - bool, - typer.Option( - "--skip-export-probes", - help=( - "Skip the llama.cpp / GGUF-conversion probes so brand-new " - "architectures (not yet in our vendored llama.cpp) can still " - "be used for training + HF inference. Forfeits `dlm export` " - "to Ollama until the vendored copy catches up." - ), - ), - ] = False, - multimodal: Annotated[ - bool, - typer.Option( - "--multimodal", - help=( - "Scaffold a vision-language .dlm with an `::image::` section. " - "Defaults --base to paligemma-3b-mix-224 and skips GGUF " - "export probes because current GGUF export does not " - "support vision-language bases." 
- ), - ), - ] = False, - audio: Annotated[ - bool, - typer.Option( - "--audio", - help=( - "Scaffold an audio-language .dlm with an `::audio::` section. " - "Defaults --base to qwen2-audio-7b-instruct and skips GGUF " - "export probes (audio archs are not on llama.cpp's roadmap)." - ), - ), - ] = False, -) -> None: - """Bootstrap a new .dlm file with sensible defaults.""" - - from rich.console import Console - - from dlm.base_models import ( - GatedModelError, - UnknownBaseModelError, - is_gated, - require_acceptance, - ) - from dlm.base_models import resolve as resolve_base_model - from dlm.io.ulid import mint_ulid - - console = Console(stderr=True) - - if path.exists() and not force: - console.print( - f"[red]init:[/red] {path} already exists. " - "Re-run with [bold]--force[/bold] to overwrite." - ) - raise typer.Exit(code=1) - - # --multimodal / --audio are mutually exclusive with each other and - # with --template (templates pin their own base + body shape; v1 - # doesn't ship media templates yet). - if multimodal and audio: - console.print( - "[red]init:[/red] --multimodal and --audio are mutually exclusive " - "(each targets a different modality)." - ) - raise typer.Exit(code=2) - if multimodal and template is not None: - console.print( - "[red]init:[/red] --multimodal and --template are mutually exclusive; " - "v1 doesn't ship a VL template (see `dlm templates list`)." - ) - raise typer.Exit(code=2) - if audio and template is not None: - console.print( - "[red]init:[/red] --audio and --template are mutually exclusive; " - "v1 doesn't ship an audio template (see `dlm templates list`)." - ) - raise typer.Exit(code=2) - - # --multimodal / --audio override the text-first --base default. A - # user who wants a different media base passes --base explicitly; - # we verify the pick is the right modality below. 
- if multimodal and base == "qwen2.5-1.5b": - base = "paligemma-3b-mix-224" - if audio and base == "qwen2.5-1.5b": - base = "qwen2-audio-7b-instruct" - - # --template resolves the base from the template's meta.yaml; the - # --base default is kept for the no-template path only. Users who - # pass both a template and an explicit --base get a warning but the - # template still wins (the template body was authored against its - # recommended base). - if template is not None: - from dlm.templates import load_template - - # Peek at the template's recommended base WITHOUT writing - # anything yet, so we can handle the license prompt against the - # right base (the template's, not `--base`) before committing. - try: - resolved_base = load_template(template).meta.recommended_base - except Exception as exc: - console.print(f"[red]init:[/red] {exc}") - raise typer.Exit(code=1) from exc - if base != "qwen2.5-1.5b" and base != resolved_base: - console.print( - f"[yellow]init:[/yellow] --base {base} ignored; template " - f"{template!r} uses {resolved_base}." - ) - else: - resolved_base = base - - # Media bases can't clear the GGUF-conversion probes. Force-skip - # them so the probe suite doesn't false-fail the init. - if multimodal or audio: - skip_export_probes = True - - try: - spec = resolve_base_model( - resolved_base, - accept_license=i_accept_license, - skip_export_probes=skip_export_probes, - ) - except UnknownBaseModelError as exc: - console.print(f"[red]init:[/red] {exc}") - raise typer.Exit(code=1) from exc - except GatedModelError as exc: - # Gated + user didn't pass --i-accept-license up-front. Prompt - # interactively if we have a TTY; otherwise refuse with the flag - # pointer (audit F22 non-interactive path). - if not _prompt_accept_license(console, resolved_base, exc.license_url): - console.print( - "[red]license:[/red] refused. Re-run with " - "[bold]--i-accept-license[/bold] to accept non-interactively." 
- ) - raise typer.Exit(code=1) from exc - spec = resolve_base_model( - resolved_base, - accept_license=True, - skip_export_probes=skip_export_probes, - ) - - # NOW apply the template — license has already been accepted - # (either by --i-accept-license or interactive prompt), so pass - # the acceptance through. apply_template enforces the license - # contract at its boundary. - applied_result = None - if template is not None: - from dlm.templates import TemplateError, apply_template - - try: - applied_result = apply_template(template, path, force=force, accept_license=True) - except TemplateError as exc: - console.print(f"[red]init:[/red] {exc}") - raise typer.Exit(code=1) from exc - - # Record the license acceptance (or None for non-gated specs). We - # know `resolve_base_model` already validated the flag/prompt chain - # — `accept_license=True` means either the user passed the flag or - # answered the interactive prompt. Either path is a real - # acceptance; persist the record now so subsequent `dlm train` / - # `dlm export` don't re-prompt. - acceptance_via: Literal["cli_flag", "interactive"] = ( - "cli_flag" if i_accept_license else "interactive" - ) - acceptance = ( - require_acceptance(spec, accept_license=True, via=acceptance_via) - if is_gated(spec) - else None - ) - - # Media flags require a matching-modality base. Check after resolve - # so users pointing at an unknown or wrong-modality hf:org/name get - # a clear explanation rather than a schema error deep in parse time. - if multimodal and spec.modality != "vision-language": - console.print( - f"[red]init:[/red] --multimodal requires a vision-language base; " - f"{spec.key!r} is modality='{spec.modality}'. " - "Pick --base paligemma-3b-mix-224 or drop --multimodal." - ) - raise typer.Exit(code=2) - if audio and spec.modality != "audio-language": - console.print( - f"[red]init:[/red] --audio requires an audio-language base; " - f"{spec.key!r} is modality='{spec.modality}'. 
" - "Pick --base qwen2-audio-7b-instruct or drop --audio." - ) - raise typer.Exit(code=2) - - if applied_result is not None: - dlm_id = applied_result.dlm_id - else: - dlm_id = mint_ulid() - if multimodal: - _write_init_scaffold_multimodal(path, spec.key, dlm_id) - elif audio: - _write_init_scaffold_audio(path, spec.key, dlm_id) - else: - _write_init_scaffold(path, spec.key, dlm_id) - - # Create the store + write the initial manifest so `dlm show` sees - # the license record and `dlm train` has a prior manifest to diff - # against. - from dlm.store.manifest import Manifest, save_manifest - from dlm.store.paths import for_dlm - - store = for_dlm(dlm_id) - store.ensure_layout() - save_manifest( - store.manifest, - Manifest( - dlm_id=dlm_id, - base_model=spec.key, - base_model_revision=spec.revision, - source_path=path.resolve(), - license_acceptance=acceptance, - ), - ) - if applied_result is not None: - meta = applied_result.template.meta - console.print( - f"[green]init:[/green] wrote {path} from template " - f"[bold]{meta.name}[/bold] ({meta.title}) — base {spec.key}." - ) - else: - console.print(f"[green]init:[/green] wrote {path}") - - -def _prompt_accept_license(console: object, base: str, license_url: str | None) -> bool: - """Interactive y/N prompt for gated base-model license acceptance. - - Non-interactive runs (no TTY) return False; the caller surfaces the - `--i-accept-license` flag pointer in that case. - """ - import sys - - from rich.console import Console - - assert isinstance(console, Console) - - if not sys.stdin.isatty(): - return False - - console.print( - f"[yellow]This base model ({base}) requires accepting the upstream license.[/yellow]" - ) - if license_url: - console.print(f" Review the license at: {license_url}") - console.print("Accept and continue? 
[y/N]: ", end="") - try: - answer = input().strip().lower() - except EOFError: - return False - return answer in ("y", "yes") - - -def _write_init_scaffold(path: Path, base_model_key: str, dlm_id: str) -> None: - """Write a minimal-but-valid .dlm file at `path`. - - Body has one PROSE paragraph + a commented instruction section so - users see both section shapes on first open. - """ - scaffold = f"""--- -dlm_id: {dlm_id} -dlm_version: 1 -base_model: {base_model_key} ---- - -# Your document title - -Write prose here. It will train via continued pretraining (CPT) loss. - -::instruction:: - -### Q -Your example question. - -### A -Your example answer. -""" - path.write_text(scaffold, encoding="utf-8") - - -def _write_init_scaffold_multimodal(path: Path, base_model_key: str, dlm_id: str) -> None: - """Write a VL-shaped .dlm file at `path`. - - Body shows the `::image::` attribute fence + a caption so users - see the v10 grammar on first open. The placeholder path - `figures/your-image.png` is deliberately non-existent — first - `dlm train` will refuse with a clear file-missing error, prompting - the user to drop a real image in. This is friendlier than - committing an inert sample that users might not notice isn't theirs. - - `dlm_version: 10` because IMAGE sections require schema v10. - """ - scaffold = f"""--- -dlm_id: {dlm_id} -dlm_version: 10 -base_model: {base_model_key} ---- - -# Your document title - -Write prose here. It will train via continued pretraining (CPT) loss. - -::image path="figures/your-image.png" alt="short description":: -Caption text describing the image. Training rows bundle the image -with this caption as `\\n`. - -::instruction:: - -### Q -What is in this image? - -### A -Describe what the image shows. -""" - path.write_text(scaffold, encoding="utf-8") - - -def _write_init_scaffold_audio(path: Path, base_model_key: str, dlm_id: str) -> None: - """Write an audio-shaped .dlm file at `path`. 
- - Body shows the `::audio::` attribute fence with the sibling- - transcript-friendly `transcript="..."` form so users see the v11 - grammar on first open. The placeholder path `clips/your-clip.wav` - is deliberately non-existent — first `dlm train` refuses with a - clear file-missing error rather than silently training on an inert - sample. - - `dlm_version: 11` because AUDIO sections require schema v11. - """ - scaffold = f"""--- -dlm_id: {dlm_id} -dlm_version: 11 -base_model: {base_model_key} ---- - -# Your document title - -Write prose here. It will train via continued pretraining (CPT) loss. - -::audio path="clips/your-clip.wav" transcript="Transcript of the audio clip.":: - -::instruction:: - -### Q -What was said in this recording? - -### A -Describe what you hear in the audio. -""" - path.write_text(scaffold, encoding="utf-8") - def train_cmd( path: Annotated[ diff --git a/src/dlm/cli/commands/init.py b/src/dlm/cli/commands/init.py new file mode 100644 index 0000000..27c3db9 --- /dev/null +++ b/src/dlm/cli/commands/init.py @@ -0,0 +1,383 @@ +"""`dlm init` — bootstrap a new .dlm file with sensible defaults.""" + +from __future__ import annotations + +from pathlib import Path +from typing import Annotated, Literal + +import typer + + +def init_cmd( + path: Annotated[Path, typer.Argument(help="Target .dlm path to create.")], + base: Annotated[ + str, typer.Option("--base", help="Base model key or hf:org/name.") + ] = "qwen2.5-1.5b", + template: Annotated[ + str | None, + typer.Option( + "--template", + help="Start from a named gallery template (see `dlm templates list`).", + ), + ] = None, + i_accept_license: Annotated[ + bool, + typer.Option("--i-accept-license", help="Accept gated base-model license."), + ] = False, + force: Annotated[ + bool, + typer.Option("--force", help="Overwrite an existing .dlm at path."), + ] = False, + skip_export_probes: Annotated[ + bool, + typer.Option( + "--skip-export-probes", + help=( + "Skip the llama.cpp / GGUF-conversion 
probes so brand-new " + "architectures (not yet in our vendored llama.cpp) can still " + "be used for training + HF inference. Forfeits `dlm export` " + "to Ollama until the vendored copy catches up." + ), + ), + ] = False, + multimodal: Annotated[ + bool, + typer.Option( + "--multimodal", + help=( + "Scaffold a vision-language .dlm with an `::image::` section. " + "Defaults --base to paligemma-3b-mix-224 and skips GGUF " + "export probes because current GGUF export does not " + "support vision-language bases." + ), + ), + ] = False, + audio: Annotated[ + bool, + typer.Option( + "--audio", + help=( + "Scaffold an audio-language .dlm with an `::audio::` section. " + "Defaults --base to qwen2-audio-7b-instruct and skips GGUF " + "export probes (audio archs are not on llama.cpp's roadmap)." + ), + ), + ] = False, +) -> None: + """Bootstrap a new .dlm file with sensible defaults.""" + + from rich.console import Console + + from dlm.base_models import ( + GatedModelError, + UnknownBaseModelError, + is_gated, + require_acceptance, + ) + from dlm.base_models import resolve as resolve_base_model + from dlm.io.ulid import mint_ulid + + console = Console(stderr=True) + + if path.exists() and not force: + console.print( + f"[red]init:[/red] {path} already exists. " + "Re-run with [bold]--force[/bold] to overwrite." + ) + raise typer.Exit(code=1) + + # --multimodal / --audio are mutually exclusive with each other and + # with --template (templates pin their own base + body shape; v1 + # doesn't ship media templates yet). + if multimodal and audio: + console.print( + "[red]init:[/red] --multimodal and --audio are mutually exclusive " + "(each targets a different modality)." + ) + raise typer.Exit(code=2) + if multimodal and template is not None: + console.print( + "[red]init:[/red] --multimodal and --template are mutually exclusive; " + "v1 doesn't ship a VL template (see `dlm templates list`)." 
+ ) + raise typer.Exit(code=2) + if audio and template is not None: + console.print( + "[red]init:[/red] --audio and --template are mutually exclusive; " + "v1 doesn't ship an audio template (see `dlm templates list`)." + ) + raise typer.Exit(code=2) + + # --multimodal / --audio override the text-first --base default. A + # user who wants a different media base passes --base explicitly; + # we verify the pick is the right modality below. + if multimodal and base == "qwen2.5-1.5b": + base = "paligemma-3b-mix-224" + if audio and base == "qwen2.5-1.5b": + base = "qwen2-audio-7b-instruct" + + # --template resolves the base from the template's meta.yaml; the + # --base default is kept for the no-template path only. Users who + # pass both a template and an explicit --base get a warning but the + # template still wins (the template body was authored against its + # recommended base). + if template is not None: + from dlm.templates import load_template + + # Peek at the template's recommended base WITHOUT writing + # anything yet, so we can handle the license prompt against the + # right base (the template's, not `--base`) before committing. + try: + resolved_base = load_template(template).meta.recommended_base + except Exception as exc: + console.print(f"[red]init:[/red] {exc}") + raise typer.Exit(code=1) from exc + if base != "qwen2.5-1.5b" and base != resolved_base: + console.print( + f"[yellow]init:[/yellow] --base {base} ignored; template " + f"{template!r} uses {resolved_base}." + ) + else: + resolved_base = base + + # Media bases can't clear the GGUF-conversion probes. Force-skip + # them so the probe suite doesn't false-fail the init. 
+ if multimodal or audio: + skip_export_probes = True + + try: + spec = resolve_base_model( + resolved_base, + accept_license=i_accept_license, + skip_export_probes=skip_export_probes, + ) + except UnknownBaseModelError as exc: + console.print(f"[red]init:[/red] {exc}") + raise typer.Exit(code=1) from exc + except GatedModelError as exc: + # Gated + user didn't pass --i-accept-license up-front. Prompt + # interactively if we have a TTY; otherwise refuse with the flag + # pointer. + if not _prompt_accept_license(console, resolved_base, exc.license_url): + console.print( + "[red]license:[/red] refused. Re-run with " + "[bold]--i-accept-license[/bold] to accept non-interactively." + ) + raise typer.Exit(code=1) from exc + spec = resolve_base_model( + resolved_base, + accept_license=True, + skip_export_probes=skip_export_probes, + ) + + # NOW apply the template — license has already been accepted + # (either by --i-accept-license or interactive prompt), so pass + # the acceptance through. apply_template enforces the license + # contract at its boundary. + applied_result = None + if template is not None: + from dlm.templates import TemplateError, apply_template + + try: + applied_result = apply_template(template, path, force=force, accept_license=True) + except TemplateError as exc: + console.print(f"[red]init:[/red] {exc}") + raise typer.Exit(code=1) from exc + + # Record the license acceptance (or None for non-gated specs). We + # know `resolve_base_model` already validated the flag/prompt chain + # — `accept_license=True` means either the user passed the flag or + # answered the interactive prompt. Either path is a real + # acceptance; persist the record now so subsequent `dlm train` / + # `dlm export` don't re-prompt. 
+ acceptance_via: Literal["cli_flag", "interactive"] = ( + "cli_flag" if i_accept_license else "interactive" + ) + acceptance = ( + require_acceptance(spec, accept_license=True, via=acceptance_via) + if is_gated(spec) + else None + ) + + # Media flags require a matching-modality base. Check after resolve + # so users pointing at an unknown or wrong-modality hf:org/name get + # a clear explanation rather than a schema error deep in parse time. + if multimodal and spec.modality != "vision-language": + console.print( + f"[red]init:[/red] --multimodal requires a vision-language base; " + f"{spec.key!r} is modality='{spec.modality}'. " + "Pick --base paligemma-3b-mix-224 or drop --multimodal." + ) + raise typer.Exit(code=2) + if audio and spec.modality != "audio-language": + console.print( + f"[red]init:[/red] --audio requires an audio-language base; " + f"{spec.key!r} is modality='{spec.modality}'. " + "Pick --base qwen2-audio-7b-instruct or drop --audio." + ) + raise typer.Exit(code=2) + + if applied_result is not None: + dlm_id = applied_result.dlm_id + else: + dlm_id = mint_ulid() + if multimodal: + _write_init_scaffold_multimodal(path, spec.key, dlm_id) + elif audio: + _write_init_scaffold_audio(path, spec.key, dlm_id) + else: + _write_init_scaffold(path, spec.key, dlm_id) + + # Create the store + write the initial manifest so `dlm show` sees + # the license record and `dlm train` has a prior manifest to diff + # against. + from dlm.store.manifest import Manifest, save_manifest + from dlm.store.paths import for_dlm + + store = for_dlm(dlm_id) + store.ensure_layout() + save_manifest( + store.manifest, + Manifest( + dlm_id=dlm_id, + base_model=spec.key, + base_model_revision=spec.revision, + source_path=path.resolve(), + license_acceptance=acceptance, + ), + ) + if applied_result is not None: + meta = applied_result.template.meta + console.print( + f"[green]init:[/green] wrote {path} from template " + f"[bold]{meta.name}[/bold] ({meta.title}) — base {spec.key}." 
+ ) + else: + console.print(f"[green]init:[/green] wrote {path}") + + +def _prompt_accept_license(console: object, base: str, license_url: str | None) -> bool: + """Interactive y/N prompt for gated base-model license acceptance. + + Non-interactive runs (no TTY) return False; the caller surfaces the + `--i-accept-license` flag pointer in that case. + """ + import sys + + from rich.console import Console + + assert isinstance(console, Console) + + if not sys.stdin.isatty(): + return False + + console.print( + f"[yellow]This base model ({base}) requires accepting the upstream license.[/yellow]" + ) + if license_url: + console.print(f" Review the license at: {license_url}") + console.print("Accept and continue? [y/N]: ", end="") + try: + answer = input().strip().lower() + except EOFError: + return False + return answer in ("y", "yes") + + +def _write_init_scaffold(path: Path, base_model_key: str, dlm_id: str) -> None: + """Write a minimal-but-valid .dlm file at `path`. + + Body has one PROSE paragraph + a commented instruction section so + users see both section shapes on first open. + """ + scaffold = f"""--- +dlm_id: {dlm_id} +dlm_version: 1 +base_model: {base_model_key} +--- + +# Your document title + +Write prose here. It will train via continued pretraining (CPT) loss. + +::instruction:: + +### Q +Your example question. + +### A +Your example answer. +""" + path.write_text(scaffold, encoding="utf-8") + + +def _write_init_scaffold_multimodal(path: Path, base_model_key: str, dlm_id: str) -> None: + """Write a VL-shaped .dlm file at `path`. + + Body shows the `::image::` attribute fence + a caption so users + see the v10 grammar on first open. The placeholder path + `figures/your-image.png` is deliberately non-existent — first + `dlm train` will refuse with a clear file-missing error, prompting + the user to drop a real image in. This is friendlier than + committing an inert sample that users might not notice isn't theirs. 
+ + `dlm_version: 10` because IMAGE sections require schema v10. + """ + scaffold = f"""--- +dlm_id: {dlm_id} +dlm_version: 10 +base_model: {base_model_key} +--- + +# Your document title + +Write prose here. It will train via continued pretraining (CPT) loss. + +::image path="figures/your-image.png" alt="short description":: +Caption text describing the image. Training rows bundle the image +with this caption as `\\n`. + +::instruction:: + +### Q +What is in this image? + +### A +Describe what the image shows. +""" + path.write_text(scaffold, encoding="utf-8") + + +def _write_init_scaffold_audio(path: Path, base_model_key: str, dlm_id: str) -> None: + """Write an audio-shaped .dlm file at `path`. + + Body shows the `::audio::` attribute fence with the sibling- + transcript-friendly `transcript="..."` form so users see the v11 + grammar on first open. The placeholder path `clips/your-clip.wav` + is deliberately non-existent — first `dlm train` refuses with a + clear file-missing error rather than silently training on an inert + sample. + + `dlm_version: 11` because AUDIO sections require schema v11. + """ + scaffold = f"""--- +dlm_id: {dlm_id} +dlm_version: 11 +base_model: {base_model_key} +--- + +# Your document title + +Write prose here. It will train via continued pretraining (CPT) loss. + +::audio path="clips/your-clip.wav" transcript="Transcript of the audio clip.":: + +::instruction:: + +### Q +What was said in this recording? + +### A +Describe what you hear in the audio. 
+""" + path.write_text(scaffold, encoding="utf-8") diff --git a/tests/unit/cli/test_init_edges.py b/tests/unit/cli/test_init_edges.py index d6e2e7c..c80a673 100644 --- a/tests/unit/cli/test_init_edges.py +++ b/tests/unit/cli/test_init_edges.py @@ -107,7 +107,10 @@ def _fake_resolve( monkeypatch.setattr(base_models, "resolve", _fake_resolve) monkeypatch.setattr(base_models, "is_gated", lambda spec: False) - monkeypatch.setattr(commands, "_prompt_accept_license", lambda console, base, url: True) + monkeypatch.setattr( + "dlm.cli.commands.init._prompt_accept_license", + lambda console, base, url: True, + ) result = runner.invoke( app, From 41cac0a1da1ca1a0869632871ebfe9033bafeb7e Mon Sep 17 00:00:00 2001 From: mfwolffe Date: Tue, 28 Apr 2026 19:17:44 -0400 Subject: [PATCH 22/46] Extract train_cmd + multi-GPU helpers into cli/commands/train.py --- src/dlm/cli/commands/__init__.py | 727 +---------------- src/dlm/cli/commands/train.py | 731 ++++++++++++++++++ tests/unit/cli/test_train_validation_edges.py | 2 +- 3 files changed, 736 insertions(+), 724 deletions(-) create mode 100644 src/dlm/cli/commands/train.py diff --git a/src/dlm/cli/commands/__init__.py b/src/dlm/cli/commands/__init__.py index 616786a..4cf0620 100644 --- a/src/dlm/cli/commands/__init__.py +++ b/src/dlm/cli/commands/__init__.py @@ -10,10 +10,9 @@ from __future__ import annotations -import os from collections.abc import Sequence from pathlib import Path -from typing import Annotated, Any, Literal +from typing import Annotated, Any import typer @@ -44,6 +43,9 @@ from dlm.cli.commands.synth import synth_list_cmd as synth_list_cmd from dlm.cli.commands.synth import synth_revert_cmd as synth_revert_cmd from dlm.cli.commands.templates import templates_list_cmd as templates_list_cmd +from dlm.cli.commands.train import _maybe_dispatch_multi_gpu as _maybe_dispatch_multi_gpu +from dlm.cli.commands.train import _strip_gpus_from_argv as _strip_gpus_from_argv +from dlm.cli.commands.train import train_cmd as 
train_cmd from dlm.cli.commands.unpack import unpack_cmd as unpack_cmd from dlm.cli.commands.verify import verify_cmd as verify_cmd @@ -56,727 +58,6 @@ def _stub(sprint: str, subject: str) -> None: -def train_cmd( - path: Annotated[ - Path, - typer.Argument( - help=( - ".dlm file to train. Or a directory — when passed a directory, " - "`dlm train` auto-scaffolds `/.dlm/corpus.dlm` on first run " - "(with --base) and reuses it on subsequent runs." - ), - ), - ], - resume: Annotated[bool, typer.Option("--resume", help="Resume from last checkpoint.")] = False, - fresh: Annotated[bool, typer.Option("--fresh", help="Discard prior adapter state.")] = False, - seed: Annotated[int | None, typer.Option("--seed", help="Override training seed.")] = None, - max_steps: Annotated[int | None, typer.Option("--max-steps", help="Cap step count.")] = None, - phase: Annotated[ - str, - typer.Option( - "--phase", - help=( - "Which training phases to run: 'sft' (supervised only), " - "'preference' (DPO/ORPO only — requires a prior SFT " - "adapter), or 'all' (SFT then preference when enabled). " - "The preference method (dpo / orpo) comes from " - "training.preference.method in the frontmatter." 
- ), - ), - ] = "all", - i_accept_license: Annotated[ - bool, - typer.Option( - "--i-accept-license", - help="Accept the base model's license (required for gated bases like llama-3.2).", - ), - ] = False, - strict_lock: Annotated[ - bool, - typer.Option( - "--strict-lock", - help="Fail on any dlm.lock drift, including version warns.", - ), - ] = False, - update_lock: Annotated[ - bool, - typer.Option( - "--update-lock", - help="Overwrite dlm.lock without validating prior entries.", - ), - ] = False, - ignore_lock: Annotated[ - bool, - typer.Option( - "--ignore-lock", - help="Skip dlm.lock validation and don't write a new lock.", - ), - ] = False, - strict_metrics: Annotated[ - bool, - typer.Option( - "--strict-metrics", - help="Promote metrics SQLite write failures to hard errors.", - ), - ] = False, - no_mined: Annotated[ - bool, - typer.Option( - "--no-mined", - help=( - "Exclude auto-mined preference sections from the preference " - "phase, including replay-sampled mined pairs. Hand-authored " - "`::preference::` sections still train normally." - ), - ), - ] = False, - gpus: Annotated[ - str | None, - typer.Option( - "--gpus", - help=( - "Multi-GPU training. `all` uses every visible CUDA device; " - "`N` uses the first N; `0,1` selects exact device ids. " - "Dispatches to `accelerate launch` when >1 device is " - "selected. Omit for single-process training." - ), - ), - ] = None, - watch: Annotated[ - bool, - typer.Option( - "--watch", - help=( - "Save-to-train mode. After an initial train, block on " - "filesystem events and run incremental retrains " - "(mode=resume, step-capped) on each settled save. Ctrl-C " - "exits cleanly between cycles." - ), - ), - ] = False, - watch_max_steps: Annotated[ - int, - typer.Option( - "--watch-max-steps", - help="Per-cycle step cap for --watch. 
Default 100 keeps cycles responsive.", - ), - ] = 100, - watch_debounce_ms: Annotated[ - int, - typer.Option( - "--watch-debounce-ms", - help="Quiet interval (ms) before a burst of saves triggers a retrain.", - ), - ] = 400, - watch_repl: Annotated[ - bool, - typer.Option( - "--repl", - help=( - "With --watch: also open the REPL so prompts reflect the " - "latest adapter. **Scaffolded** — threading integration " - "is untestable without a two-process harness; emit a " - "not-yet-implemented refusal and exit 2." - ), - ), - ] = False, - base: Annotated[ - str | None, - typer.Option( - "--base", - help=( - "Base model key for auto-scaffold. Required on first run when " - "`path` is a directory without an existing .dlm/ config. " - "Accepts registry keys (smollm2-135m, qwen2.5-coder-1.5b, ...) " - "or `hf:/` for off-registry models." - ), - ), - ] = None, - include: Annotated[ - list[str] | None, - typer.Option( - "--include", - help=( - "Glob pattern for files to train on (auto-scaffold only). " - "Repeatable. Default: '**/*' with --recursive, '*' without. " - "Examples: '**/*.py', '**/*.f90', '**/*.{md,rst}'." - ), - ), - ] = None, - exclude: Annotated[ - list[str] | None, - typer.Option( - "--exclude", - help=( - "Glob pattern for files to skip (auto-scaffold only). " - "Repeatable. Defaults (secrets, VCS, lockfiles, binaries) " - "apply on top via the descent protocol." - ), - ), - ] = None, - recursive: Annotated[ - bool, - typer.Option( - "--recursive/--no-recursive", - "-r/-R", - help=( - "Auto-scaffold include patterns descend into subdirectories. " - "Default True. --no-recursive limits the default include to " - "top-level files only." - ), - ), - ] = True, - name: Annotated[ - str, - typer.Option( - "--name", - help=( - "Adapter name for auto-scaffold → `/.dlm/.dlm`. " - "Default 'corpus'. Lets a single tree host multiple adapters." 
- ), - ), - ] = "corpus", - policy: Annotated[ - str, - typer.Option( - "--policy", - help=( - "Auto-scaffold sources_policy: 'strict' (default; confines " - "training to the target directory) or 'permissive' (allows " - "absolute paths anywhere)." - ), - ), - ] = "strict", - rescaffold: Annotated[ - bool, - typer.Option( - "--rescaffold", - help=( - "Rewrite an existing scaffolded .dlm in place with the new " - "--base/--include/--exclude/--policy flags. Keeps the same " - "dlm_id (store stays intact). Without it, re-running with " - "frontmatter-editing flags refuses to shadow-edit." - ), - ), - ] = False, - listen_rpc: Annotated[ - str | None, - typer.Option( - "--listen-rpc", - help=( - "Open a JSON-RPC endpoint at (e.g. `127.0.0.1:7429`) " - "that accepts `inject_probe` pushes from sway-style eval " - "harnesses. Probes enter the queue and drain at the next " - "training-cycle boundary. Requires --watch or --max-cycles. " - "Bearer token from DLM_PROBE_TOKEN." - ), - ), - ] = None, - max_cycles: Annotated[ - int, - typer.Option( - "--max-cycles", - help=( - "Convergence stop for --listen-rpc without --watch: cap the " - "probe-driven retrain loop at N cycles. Ignored without " - "--listen-rpc." - ), - ), - ] = 0, - no_cache: Annotated[ - bool, - typer.Option( - "--no-cache", - help=( - "Opt out of the tokenized-section cache for this run. By " - "default, `dlm train` pre-tokenizes directive-sourced rows " - "via ~/.dlm/store//tokenized-cache/ so subsequent runs " - "on the same corpus skip re-tokenization. Use this to " - "bypass the cache for debugging or to compare cached vs " - "uncached training determinism." - ), - ), - ] = False, - skip_export_probes: Annotated[ - bool, - typer.Option( - "--skip-export-probes", - help=( - "Skip the llama.cpp / GGUF-conversion probes so brand-new " - "architectures (not yet in our vendored llama.cpp) can still " - "be used for training + HF inference. 
Forfeits `dlm export` " - "to Ollama until the vendored copy catches up. Mirrors the " - "flag of the same name on `dlm init`." - ), - ), - ] = False, -) -> None: - """Train / retrain a .dlm against its base model.""" - import sqlite3 - import sys - - from rich.console import Console - - from dlm.base_models import GatedModelError - from dlm.base_models import resolve as resolve_base_model - from dlm.doc.errors import DlmParseError - from dlm.doc.parser import parse_file - from dlm.hardware import doctor - from dlm.lock import LockMode, LockValidationError - from dlm.store.paths import for_dlm - from dlm.train import ( - DiskSpaceError, - OOMError, - ResumeIntegrityError, - TrainingError, - ) - from dlm.train.preference import ( - DpoPhaseError, - NoPreferenceContentError, - PriorAdapterRequiredError, - ) - from dlm.train.preference.phase_orchestrator import Phase, run_phases - - console = Console(stderr=True) - - if phase not in ("sft", "preference", "all"): - console.print(f"[red]error:[/red] --phase must be one of sft|preference|all, got {phase!r}") - raise typer.Exit(code=2) - phase_literal: Phase = phase # type: ignore[assignment] - - if resume and fresh: - console.print("[red]error:[/red] --resume and --fresh are mutually exclusive") - raise typer.Exit(code=2) - mode: Literal["fresh", "resume"] = "resume" if resume else "fresh" - - # --gpus dispatches to accelerate launch when >1 device is - # selected. The single-GPU path falls through to the existing - # in-process trainer; a bare `--gpus 1` is a no-op (users can use - # it to lock the visible device set via CUDA_VISIBLE_DEVICES - # without spawning a subprocess). - if gpus is not None: - # Resolve mixed_precision from capabilities so bf16-incapable - # CUDA GPUs (SM<8.0) don't trip the `accelerate launch` - # default. `probe()` is cheap and runs in the launcher-side - # process only; each rank re-probes via `doctor()` later. 
- from dlm.hardware.capabilities import probe as _probe_caps - - _caps = _probe_caps() - _mp = "bf16" if _caps.supports_bf16 else "fp16" - exit_code = _maybe_dispatch_multi_gpu(gpus, sys.argv, console, mixed_precision=_mp) - if exit_code is not None: - raise typer.Exit(code=exit_code) - - # Mutual-exclusion gate for the three lock flags. Exactly one (or - # zero) may be set — silently ignoring a conflicting pair would - # mask operator intent. - lock_flag_count = sum((strict_lock, update_lock, ignore_lock)) - if lock_flag_count > 1: - console.print( - "[red]error:[/red] --strict-lock / --update-lock / --ignore-lock " - "are mutually exclusive", - ) - raise typer.Exit(code=2) - lock_mode: LockMode = "default" - if strict_lock: - lock_mode = "strict" - elif update_lock: - lock_mode = "update" - elif ignore_lock: - lock_mode = "ignore" - - # `--no-cache` bypasses the tokenized-section cache for this run. - # Plumbed as an env var because the trainer's pre-tokenize helper - # already reads one — the CLI flag is a discoverable surface over - # the same switch. Rolling the flag into `TrainingPlan` is a - # deferred refactor; the env var is sufficient for the user-facing - # contract and survives `accelerate launch` re-invocations. - if no_cache: - from dlm.train.cache import set_disable_flag - - set_disable_flag("--no-cache") - - if policy not in ("permissive", "strict"): - console.print( - f"[red]error:[/red] --policy must be 'permissive' or 'strict', got {policy!r}" - ) - raise typer.Exit(code=2) - policy_literal: Literal["permissive", "strict"] = policy # type: ignore[assignment] - - # --listen-rpc requires a loop to drain the queue — either --watch - # (file-change cycles) or --max-cycles N (bounded retrain loop). - # Without one, the server would accept probes that never train. We - # also need the bearer token up front so the user sees the refusal - # before we spend time downloading weights. 
- rpc_config: tuple[str, int, str] | None = None - if listen_rpc is not None: - if not watch and max_cycles <= 0: - console.print( - "[red]error:[/red] --listen-rpc requires --watch or --max-cycles N " - "(the probe queue needs a drain cadence)" - ) - raise typer.Exit(code=2) - token = os.environ.get("DLM_PROBE_TOKEN", "").strip() - if not token: - console.print( - "[red]error:[/red] --listen-rpc needs a bearer token; " - "export DLM_PROBE_TOKEN=" - ) - raise typer.Exit(code=2) - host, _, port_s = listen_rpc.rpartition(":") - if not host or not port_s: - console.print(f"[red]error:[/red] --listen-rpc expects host:port, got {listen_rpc!r}") - raise typer.Exit(code=2) - try: - port = int(port_s) - except ValueError: - console.print(f"[red]error:[/red] --listen-rpc port must be an integer, got {port_s!r}") - raise typer.Exit(code=2) from None - rpc_config = (host, port, token) - - # Directory targets auto-scaffold `/.dlm/corpus.dlm` (or - # reuse an existing one). After this block, `path` always points - # at an actual `.dlm` file that the rest of the flow can parse. 
- if path.is_dir(): - from dlm.cli.scaffold import ScaffoldError, scaffold_train_target - - try: - scaffold_result = scaffold_train_target( - path, - base=base, - include=tuple(include or ()), - exclude=tuple(exclude or ()), - recursive=recursive, - name=name, - policy=policy_literal, - rescaffold=rescaffold, - ) - except ScaffoldError as exc: - console.print(f"[red]scaffold:[/red] {exc.message}") - raise typer.Exit(code=1) from exc - - if scaffold_result.scaffolded: - console.print( - f"[cyan]scaffolded:[/cyan] {scaffold_result.dlm_path} " - f"(dlm_id={scaffold_result.dlm_id})" - ) - path = scaffold_result.dlm_path - - try: - parsed = parse_file(path) - except (DlmParseError, OSError) as exc: - console.print(f"[red]error:[/red] {exc}") - raise typer.Exit(code=1) from exc - try: - spec = resolve_base_model( - parsed.frontmatter.base_model, - accept_license=i_accept_license, - skip_export_probes=skip_export_probes, - ) - except GatedModelError as exc: - console.print(f"[red]license:[/red] base model {parsed.frontmatter.base_model!r} is gated.") - if exc.license_url: - console.print(f" review the license at: {exc.license_url}") - console.print( - " re-run with [bold]--i-accept-license[/bold] once you have accepted. " - "Acceptance will be persisted in the store manifest." - ) - raise typer.Exit(code=1) from exc - # Detect the DDP world_size set by `accelerate launch` - # (WORLD_SIZE env var) and thread it into the doctor so the plan's - # effective_batch_size reflects the rank count. Single-process - # runs read 1 and the plan math is unchanged. - from dlm.train.distributed import detect_world_size - - ws = detect_world_size() - doctor_result = doctor( - training_config=parsed.frontmatter.training, - base_params=spec.params, - seq_len=min(parsed.frontmatter.training.sequence_len, spec.effective_context_length), - world_size=ws, - ) - plan = doctor_result.plan - if plan is None: - console.print( - "[red]doctor:[/red] no viable training plan for this host. 
" - "Run `dlm doctor` for details." - ) - raise typer.Exit(code=1) - - store = for_dlm(parsed.frontmatter.dlm_id) - store.ensure_layout() - - # `dlm init` writes a manifest as part of store provisioning. Mirror - # that manifest write here when the store layout exists but has no - # manifest yet — covers two flows: - # - auto-scaffold via `dlm train ` on a fresh directory - # - hand-authored .dlm with a fresh ULID that never went through - # `dlm init` (e.g. authored via the LSP / VSCode extension) - # License acceptance has already been validated upstream by this - # point, so we just record it. - if not store.manifest.exists(): - from dlm.base_models import is_gated - from dlm.base_models.license import require_acceptance - from dlm.store.manifest import Manifest, save_manifest - - acceptance = ( - require_acceptance(spec, accept_license=True, via="cli_flag") - if is_gated(spec) - else None - ) - save_manifest( - store.manifest, - Manifest( - dlm_id=parsed.frontmatter.dlm_id, - base_model=spec.key, - base_model_revision=spec.revision, - source_path=path.resolve(), - license_acceptance=acceptance, - ), - ) - - from dlm.modality import ModalityError - - try: - phase_results = run_phases( - store, - parsed, - spec, - plan, - phase=phase_literal, - mode=mode, - seed=seed, - max_steps=max_steps, - lock_mode=lock_mode, - capabilities=doctor_result.capabilities, - world_size=ws, - strict_metrics=strict_metrics, - include_auto_mined=not no_mined, - ) - except sqlite3.Error as exc: - console.print(f"[red]metrics:[/red] {exc}") - raise typer.Exit(code=1) from exc - except LockValidationError as exc: - console.print(f"[red]lock:[/red] {exc}") - console.print( - " Re-run with [bold]--update-lock[/bold] to accept the drift or " - "[bold]--ignore-lock[/bold] to continue without persisting a new lock." 
- ) - raise typer.Exit(code=1) from exc - except DiskSpaceError as exc: - console.print(f"[red]disk:[/red] {exc}") - raise typer.Exit(code=1) from exc - except OOMError as exc: - from dlm.train import format_oom_message - - console.print( - format_oom_message( - step=exc.step, - peak_bytes=exc.peak_bytes, - free_at_start_bytes=exc.free_at_start_bytes, - current_grad_accum=exc.current_grad_accum, - recommended_grad_accum=exc.recommended_grad_accum, - ) - ) - raise typer.Exit(code=1) from exc - except ResumeIntegrityError as exc: - console.print(f"[red]resume:[/red] {exc}") - raise typer.Exit(code=1) from exc - except (NoPreferenceContentError, PriorAdapterRequiredError) as exc: - console.print(f"[red]dpo:[/red] {exc}") - raise typer.Exit(code=1) from exc - except DpoPhaseError as exc: - console.print(f"[red]dpo:[/red] {exc}") - raise typer.Exit(code=1) from exc - except TrainingError as exc: - console.print(f"[red]training:[/red] {exc}") - raise typer.Exit(code=1) from exc - except ModalityError as exc: - console.print(f"[red]training:[/red] {exc}") - raise typer.Exit(code=1) from exc - - if not phase_results: - console.print( - "[yellow]no-op:[/yellow] nothing to train for the requested phase. " - "Check that the document has the section types the phase consumes " - "(prose/instruction for SFT, preference for DPO)." - ) - raise typer.Exit(code=0) - - for pr in phase_results: - result = pr.result - console.print( - f"[green]{pr.phase}:[/green] v{result.adapter_version:04d} " - f"({result.steps} steps, seed={result.seed}, " - f"determinism={result.determinism.class_})" - ) - console.print(f"adapter: {result.adapter_path}") - console.print(f"log: {result.log_path}") - # Final-train-loss stdout line mirrors the last phase so existing - # downstream scripts keep working. 
- result = phase_results[-1].result - if result.final_train_loss is not None: - sys.stdout.write(f"{result.final_train_loss}\n") - - # --watch keeps the training context alive and re-runs incremental - # cycles on file change. Entered AFTER the initial train so the - # loop resumes from a real committed adapter. - if watch: - if watch_repl: - console.print( - "[red]train:[/red] --watch --repl is scaffolded but not yet " - "implemented. The threaded REPL bridge needs a test " - "harness we don't have in CI today." - ) - raise typer.Exit(code=2) - - from dlm.watch.loop import run_watch - from dlm.watch.status import WatchStatus, render_status - - status = WatchStatus(doc_path=str(path), sections=len(parsed.sections)) - - # Start the probe-RPC server if --listen-rpc was requested. The - # queue is exposed; end-to-end flow into `build_dataset` at the - # next cycle boundary is the follow-up consumer task — for now - # the server accepts and buffers probes so sway sinks can be - # wired + tested against a live endpoint. 
- rpc_server = None - probe_queue = None - if rpc_config is not None: - from dlm.train.inject import InjectedProbeQueue - from dlm.train.rpc import ProbeRpcServer - - rpc_host, rpc_port, rpc_token = rpc_config - probe_queue = InjectedProbeQueue() - rpc_server = ProbeRpcServer( - host=rpc_host, port=rpc_port, token=rpc_token, queue=probe_queue - ) - rpc_server.start() - bound_host, bound_port = rpc_server.address - console.print( - f"[dim]rpc:[/dim] listening on {bound_host}:{bound_port} " - f"(queue capacity {probe_queue.capacity})" - ) - - console.print( - f"[dim]watch:[/dim] {render_status(status)}; " - f"max_steps={watch_max_steps}, debounce_ms={watch_debounce_ms}" - ) - - def _log_cycle(result_: object) -> None: - from dlm.watch.loop import CycleResult - - assert isinstance(result_, CycleResult) - if result_.ran and result_.run_result is not None: - status.mark_cycle_done( - train_loss=result_.run_result.final_train_loss, - val_loss=result_.run_result.final_val_loss, - steps=result_.run_result.steps, - coalesced=1, - ) - console.print(f"[dim]watch:[/dim] {render_status(status)}") - else: - console.print("[dim]watch:[/dim] no new content, skipping retrain") - - try: - exit_code = run_watch( - doc_path=path, - store=store, - spec=spec, - plan=plan, - max_steps=watch_max_steps, - debounce_ms=watch_debounce_ms, - on_cycle=_log_cycle, - drain_probes=probe_queue.drain if probe_queue is not None else None, - ) - except KeyboardInterrupt: - if rpc_server is not None: - rpc_server.stop() - console.print("[dim]watch:[/dim] Ctrl-C received, exiting") - raise typer.Exit(code=0) # noqa: B904 - finally: - if rpc_server is not None: - rpc_server.stop() - raise typer.Exit(code=exit_code) - - # --max-cycles without --watch: the bounded-loop cycle driver is - # the next consumer-side integration step. Accept the flags, refuse - # execution until the loop lands. 
- if rpc_config is not None and not watch: - console.print( - "[red]train:[/red] --listen-rpc --max-cycles (without --watch) is " - "scaffolded; the bounded cycle loop is the follow-up. Use " - "--watch for now." - ) - raise typer.Exit(code=2) - - -def _maybe_dispatch_multi_gpu( - gpus_flag: str, - argv: list[str], - console: object, - *, - mixed_precision: str = "bf16", -) -> int | None: - """Resolve `--gpus`; if multi-GPU, spawn accelerate launch and return its exit code. - - Returns None when the resolved world_size is 1 — caller falls - through to the in-process trainer. Returns an int exit code when - the launcher ran, so the caller can `raise typer.Exit(code=...)`. - """ - from rich.console import Console - - from dlm.train.distributed import UnsupportedGpuSpecError, launch_multi_gpu, parse_gpus - - assert isinstance(console, Console) - - try: - spec = parse_gpus(gpus_flag) - except UnsupportedGpuSpecError as exc: - console.print(f"[red]train:[/red] {exc}") - return 2 - - try: - import torch - - device_count = int(torch.cuda.device_count()) - except Exception: # pragma: no cover - torch probing has many failure modes - device_count = 0 - - try: - device_ids = spec.resolve(device_count) - except UnsupportedGpuSpecError as exc: - console.print(f"[red]train:[/red] {exc}") - return 2 - - if len(device_ids) < 2: - # Single-GPU (or --gpus 1) — no subprocess needed. Caller - # continues with the in-process path. - return None - - # Forward the original argv minus `--gpus` / `--gpus=...`; the - # worker entry strips it defensively too, but we drop it here so - # the launched accelerate cmd carries exactly the intended args. 
- cli_args = _strip_gpus_from_argv(argv) - console.print( - f"[dim]train:[/dim] dispatching to accelerate launch on devices {list(device_ids)} " - f"(mixed_precision={mixed_precision})" - ) - return launch_multi_gpu(device_ids, cli_args, mixed_precision=mixed_precision) - - -def _strip_gpus_from_argv(argv: list[str]) -> list[str]: - """Drop `--gpus ` / `--gpus=` from raw sys.argv (launcher side). - - Skips argv[0] (script path) — `accelerate launch -m ` - provides the rank entrypoint separately, so the launcher forwards - argv[1:] minus the multi-GPU flag. Delegates to the shared - `strip_gpus_flag` helper. - """ - from dlm.train.distributed.gpus import strip_gpus_flag - - return strip_gpus_flag(argv, skip_argv0=True) - def prompt_cmd( ctx: typer.Context, diff --git a/src/dlm/cli/commands/train.py b/src/dlm/cli/commands/train.py new file mode 100644 index 0000000..3d73676 --- /dev/null +++ b/src/dlm/cli/commands/train.py @@ -0,0 +1,731 @@ +"""`dlm train` — train / retrain a .dlm against its base model.""" + +from __future__ import annotations + +import os +from pathlib import Path +from typing import Annotated, Literal + +import typer + + +def train_cmd( + path: Annotated[ + Path, + typer.Argument( + help=( + ".dlm file to train. Or a directory — when passed a directory, " + "`dlm train` auto-scaffolds `/.dlm/corpus.dlm` on first run " + "(with --base) and reuses it on subsequent runs." 
+ ), + ), + ], + resume: Annotated[bool, typer.Option("--resume", help="Resume from last checkpoint.")] = False, + fresh: Annotated[bool, typer.Option("--fresh", help="Discard prior adapter state.")] = False, + seed: Annotated[int | None, typer.Option("--seed", help="Override training seed.")] = None, + max_steps: Annotated[int | None, typer.Option("--max-steps", help="Cap step count.")] = None, + phase: Annotated[ + str, + typer.Option( + "--phase", + help=( + "Which training phases to run: 'sft' (supervised only), " + "'preference' (DPO/ORPO only — requires a prior SFT " + "adapter), or 'all' (SFT then preference when enabled). " + "The preference method (dpo / orpo) comes from " + "training.preference.method in the frontmatter." + ), + ), + ] = "all", + i_accept_license: Annotated[ + bool, + typer.Option( + "--i-accept-license", + help="Accept the base model's license (required for gated bases like llama-3.2).", + ), + ] = False, + strict_lock: Annotated[ + bool, + typer.Option( + "--strict-lock", + help="Fail on any dlm.lock drift, including version warns.", + ), + ] = False, + update_lock: Annotated[ + bool, + typer.Option( + "--update-lock", + help="Overwrite dlm.lock without validating prior entries.", + ), + ] = False, + ignore_lock: Annotated[ + bool, + typer.Option( + "--ignore-lock", + help="Skip dlm.lock validation and don't write a new lock.", + ), + ] = False, + strict_metrics: Annotated[ + bool, + typer.Option( + "--strict-metrics", + help="Promote metrics SQLite write failures to hard errors.", + ), + ] = False, + no_mined: Annotated[ + bool, + typer.Option( + "--no-mined", + help=( + "Exclude auto-mined preference sections from the preference " + "phase, including replay-sampled mined pairs. Hand-authored " + "`::preference::` sections still train normally." + ), + ), + ] = False, + gpus: Annotated[ + str | None, + typer.Option( + "--gpus", + help=( + "Multi-GPU training. 
`all` uses every visible CUDA device; " + "`N` uses the first N; `0,1` selects exact device ids. " + "Dispatches to `accelerate launch` when >1 device is " + "selected. Omit for single-process training." + ), + ), + ] = None, + watch: Annotated[ + bool, + typer.Option( + "--watch", + help=( + "Save-to-train mode. After an initial train, block on " + "filesystem events and run incremental retrains " + "(mode=resume, step-capped) on each settled save. Ctrl-C " + "exits cleanly between cycles." + ), + ), + ] = False, + watch_max_steps: Annotated[ + int, + typer.Option( + "--watch-max-steps", + help="Per-cycle step cap for --watch. Default 100 keeps cycles responsive.", + ), + ] = 100, + watch_debounce_ms: Annotated[ + int, + typer.Option( + "--watch-debounce-ms", + help="Quiet interval (ms) before a burst of saves triggers a retrain.", + ), + ] = 400, + watch_repl: Annotated[ + bool, + typer.Option( + "--repl", + help=( + "With --watch: also open the REPL so prompts reflect the " + "latest adapter. **Scaffolded** — threading integration " + "is untestable without a two-process harness; emit a " + "not-yet-implemented refusal and exit 2." + ), + ), + ] = False, + base: Annotated[ + str | None, + typer.Option( + "--base", + help=( + "Base model key for auto-scaffold. Required on first run when " + "`path` is a directory without an existing .dlm/ config. " + "Accepts registry keys (smollm2-135m, qwen2.5-coder-1.5b, ...) " + "or `hf:/` for off-registry models." + ), + ), + ] = None, + include: Annotated[ + list[str] | None, + typer.Option( + "--include", + help=( + "Glob pattern for files to train on (auto-scaffold only). " + "Repeatable. Default: '**/*' with --recursive, '*' without. " + "Examples: '**/*.py', '**/*.f90', '**/*.{md,rst}'." + ), + ), + ] = None, + exclude: Annotated[ + list[str] | None, + typer.Option( + "--exclude", + help=( + "Glob pattern for files to skip (auto-scaffold only). " + "Repeatable. 
Defaults (secrets, VCS, lockfiles, binaries) " + "apply on top via the descent protocol." + ), + ), + ] = None, + recursive: Annotated[ + bool, + typer.Option( + "--recursive/--no-recursive", + "-r/-R", + help=( + "Auto-scaffold include patterns descend into subdirectories. " + "Default True. --no-recursive limits the default include to " + "top-level files only." + ), + ), + ] = True, + name: Annotated[ + str, + typer.Option( + "--name", + help=( + "Adapter name for auto-scaffold → `/.dlm/.dlm`. " + "Default 'corpus'. Lets a single tree host multiple adapters." + ), + ), + ] = "corpus", + policy: Annotated[ + str, + typer.Option( + "--policy", + help=( + "Auto-scaffold sources_policy: 'strict' (default; confines " + "training to the target directory) or 'permissive' (allows " + "absolute paths anywhere)." + ), + ), + ] = "strict", + rescaffold: Annotated[ + bool, + typer.Option( + "--rescaffold", + help=( + "Rewrite an existing scaffolded .dlm in place with the new " + "--base/--include/--exclude/--policy flags. Keeps the same " + "dlm_id (store stays intact). Without it, re-running with " + "frontmatter-editing flags refuses to shadow-edit." + ), + ), + ] = False, + listen_rpc: Annotated[ + str | None, + typer.Option( + "--listen-rpc", + help=( + "Open a JSON-RPC endpoint at (e.g. `127.0.0.1:7429`) " + "that accepts `inject_probe` pushes from sway-style eval " + "harnesses. Probes enter the queue and drain at the next " + "training-cycle boundary. Requires --watch or --max-cycles. " + "Bearer token from DLM_PROBE_TOKEN." + ), + ), + ] = None, + max_cycles: Annotated[ + int, + typer.Option( + "--max-cycles", + help=( + "Convergence stop for --listen-rpc without --watch: cap the " + "probe-driven retrain loop at N cycles. Ignored without " + "--listen-rpc." + ), + ), + ] = 0, + no_cache: Annotated[ + bool, + typer.Option( + "--no-cache", + help=( + "Opt out of the tokenized-section cache for this run. 
By " + "default, `dlm train` pre-tokenizes directive-sourced rows " + "via ~/.dlm/store//tokenized-cache/ so subsequent runs " + "on the same corpus skip re-tokenization. Use this to " + "bypass the cache for debugging or to compare cached vs " + "uncached training determinism." + ), + ), + ] = False, + skip_export_probes: Annotated[ + bool, + typer.Option( + "--skip-export-probes", + help=( + "Skip the llama.cpp / GGUF-conversion probes so brand-new " + "architectures (not yet in our vendored llama.cpp) can still " + "be used for training + HF inference. Forfeits `dlm export` " + "to Ollama until the vendored copy catches up. Mirrors the " + "flag of the same name on `dlm init`." + ), + ), + ] = False, +) -> None: + """Train / retrain a .dlm against its base model.""" + import sqlite3 + import sys + + from rich.console import Console + + from dlm.base_models import GatedModelError + from dlm.base_models import resolve as resolve_base_model + from dlm.doc.errors import DlmParseError + from dlm.doc.parser import parse_file + from dlm.hardware import doctor + from dlm.lock import LockMode, LockValidationError + from dlm.store.paths import for_dlm + from dlm.train import ( + DiskSpaceError, + OOMError, + ResumeIntegrityError, + TrainingError, + ) + from dlm.train.preference import ( + DpoPhaseError, + NoPreferenceContentError, + PriorAdapterRequiredError, + ) + from dlm.train.preference.phase_orchestrator import Phase, run_phases + + console = Console(stderr=True) + + if phase not in ("sft", "preference", "all"): + console.print(f"[red]error:[/red] --phase must be one of sft|preference|all, got {phase!r}") + raise typer.Exit(code=2) + phase_literal: Phase = phase # type: ignore[assignment] + + if resume and fresh: + console.print("[red]error:[/red] --resume and --fresh are mutually exclusive") + raise typer.Exit(code=2) + mode: Literal["fresh", "resume"] = "resume" if resume else "fresh" + + # --gpus dispatches to accelerate launch when >1 device is + # selected. 
The single-GPU path falls through to the existing + # in-process trainer; a bare `--gpus 1` is a no-op (users can use + # it to lock the visible device set via CUDA_VISIBLE_DEVICES + # without spawning a subprocess). + if gpus is not None: + # Resolve mixed_precision from capabilities so bf16-incapable + # CUDA GPUs (SM<8.0) don't trip the `accelerate launch` + # default. `probe()` is cheap and runs in the launcher-side + # process only; each rank re-probes via `doctor()` later. + from dlm.hardware.capabilities import probe as _probe_caps + + _caps = _probe_caps() + _mp = "bf16" if _caps.supports_bf16 else "fp16" + exit_code = _maybe_dispatch_multi_gpu(gpus, sys.argv, console, mixed_precision=_mp) + if exit_code is not None: + raise typer.Exit(code=exit_code) + + # Mutual-exclusion gate for the three lock flags. Exactly one (or + # zero) may be set — silently ignoring a conflicting pair would + # mask operator intent. + lock_flag_count = sum((strict_lock, update_lock, ignore_lock)) + if lock_flag_count > 1: + console.print( + "[red]error:[/red] --strict-lock / --update-lock / --ignore-lock " + "are mutually exclusive", + ) + raise typer.Exit(code=2) + lock_mode: LockMode = "default" + if strict_lock: + lock_mode = "strict" + elif update_lock: + lock_mode = "update" + elif ignore_lock: + lock_mode = "ignore" + + # `--no-cache` bypasses the tokenized-section cache for this run. + # Plumbed as an env var because the trainer's pre-tokenize helper + # already reads one — the CLI flag is a discoverable surface over + # the same switch. Rolling the flag into `TrainingPlan` is a + # deferred refactor; the env var is sufficient for the user-facing + # contract and survives `accelerate launch` re-invocations. 
+ if no_cache: + from dlm.train.cache import set_disable_flag + + set_disable_flag("--no-cache") + + if policy not in ("permissive", "strict"): + console.print( + f"[red]error:[/red] --policy must be 'permissive' or 'strict', got {policy!r}" + ) + raise typer.Exit(code=2) + policy_literal: Literal["permissive", "strict"] = policy # type: ignore[assignment] + + # --listen-rpc requires a loop to drain the queue — either --watch + # (file-change cycles) or --max-cycles N (bounded retrain loop). + # Without one, the server would accept probes that never train. We + # also need the bearer token up front so the user sees the refusal + # before we spend time downloading weights. + rpc_config: tuple[str, int, str] | None = None + if listen_rpc is not None: + if not watch and max_cycles <= 0: + console.print( + "[red]error:[/red] --listen-rpc requires --watch or --max-cycles N " + "(the probe queue needs a drain cadence)" + ) + raise typer.Exit(code=2) + token = os.environ.get("DLM_PROBE_TOKEN", "").strip() + if not token: + console.print( + "[red]error:[/red] --listen-rpc needs a bearer token; " + "export DLM_PROBE_TOKEN=" + ) + raise typer.Exit(code=2) + host, _, port_s = listen_rpc.rpartition(":") + if not host or not port_s: + console.print(f"[red]error:[/red] --listen-rpc expects host:port, got {listen_rpc!r}") + raise typer.Exit(code=2) + try: + port = int(port_s) + except ValueError: + console.print(f"[red]error:[/red] --listen-rpc port must be an integer, got {port_s!r}") + raise typer.Exit(code=2) from None + rpc_config = (host, port, token) + + # Directory targets auto-scaffold `/.dlm/corpus.dlm` (or + # reuse an existing one). After this block, `path` always points + # at an actual `.dlm` file that the rest of the flow can parse. 
+ if path.is_dir(): + from dlm.cli.scaffold import ScaffoldError, scaffold_train_target + + try: + scaffold_result = scaffold_train_target( + path, + base=base, + include=tuple(include or ()), + exclude=tuple(exclude or ()), + recursive=recursive, + name=name, + policy=policy_literal, + rescaffold=rescaffold, + ) + except ScaffoldError as exc: + console.print(f"[red]scaffold:[/red] {exc.message}") + raise typer.Exit(code=1) from exc + + if scaffold_result.scaffolded: + console.print( + f"[cyan]scaffolded:[/cyan] {scaffold_result.dlm_path} " + f"(dlm_id={scaffold_result.dlm_id})" + ) + path = scaffold_result.dlm_path + + try: + parsed = parse_file(path) + except (DlmParseError, OSError) as exc: + console.print(f"[red]error:[/red] {exc}") + raise typer.Exit(code=1) from exc + try: + spec = resolve_base_model( + parsed.frontmatter.base_model, + accept_license=i_accept_license, + skip_export_probes=skip_export_probes, + ) + except GatedModelError as exc: + console.print(f"[red]license:[/red] base model {parsed.frontmatter.base_model!r} is gated.") + if exc.license_url: + console.print(f" review the license at: {exc.license_url}") + console.print( + " re-run with [bold]--i-accept-license[/bold] once you have accepted. " + "Acceptance will be persisted in the store manifest." + ) + raise typer.Exit(code=1) from exc + # Detect the DDP world_size set by `accelerate launch` + # (WORLD_SIZE env var) and thread it into the doctor so the plan's + # effective_batch_size reflects the rank count. Single-process + # runs read 1 and the plan math is unchanged. + from dlm.train.distributed import detect_world_size + + ws = detect_world_size() + doctor_result = doctor( + training_config=parsed.frontmatter.training, + base_params=spec.params, + seq_len=min(parsed.frontmatter.training.sequence_len, spec.effective_context_length), + world_size=ws, + ) + plan = doctor_result.plan + if plan is None: + console.print( + "[red]doctor:[/red] no viable training plan for this host. 
" + "Run `dlm doctor` for details." + ) + raise typer.Exit(code=1) + + store = for_dlm(parsed.frontmatter.dlm_id) + store.ensure_layout() + + # `dlm init` writes a manifest as part of store provisioning. Mirror + # that manifest write here when the store layout exists but has no + # manifest yet — covers two flows: + # - auto-scaffold via `dlm train ` on a fresh directory + # - hand-authored .dlm with a fresh ULID that never went through + # `dlm init` (e.g. authored via the LSP / VSCode extension) + # License acceptance has already been validated upstream by this + # point, so we just record it. + if not store.manifest.exists(): + from dlm.base_models import is_gated + from dlm.base_models.license import require_acceptance + from dlm.store.manifest import Manifest, save_manifest + + acceptance = ( + require_acceptance(spec, accept_license=True, via="cli_flag") + if is_gated(spec) + else None + ) + save_manifest( + store.manifest, + Manifest( + dlm_id=parsed.frontmatter.dlm_id, + base_model=spec.key, + base_model_revision=spec.revision, + source_path=path.resolve(), + license_acceptance=acceptance, + ), + ) + + from dlm.modality import ModalityError + + try: + phase_results = run_phases( + store, + parsed, + spec, + plan, + phase=phase_literal, + mode=mode, + seed=seed, + max_steps=max_steps, + lock_mode=lock_mode, + capabilities=doctor_result.capabilities, + world_size=ws, + strict_metrics=strict_metrics, + include_auto_mined=not no_mined, + ) + except sqlite3.Error as exc: + console.print(f"[red]metrics:[/red] {exc}") + raise typer.Exit(code=1) from exc + except LockValidationError as exc: + console.print(f"[red]lock:[/red] {exc}") + console.print( + " Re-run with [bold]--update-lock[/bold] to accept the drift or " + "[bold]--ignore-lock[/bold] to continue without persisting a new lock." 
+ ) + raise typer.Exit(code=1) from exc + except DiskSpaceError as exc: + console.print(f"[red]disk:[/red] {exc}") + raise typer.Exit(code=1) from exc + except OOMError as exc: + from dlm.train import format_oom_message + + console.print( + format_oom_message( + step=exc.step, + peak_bytes=exc.peak_bytes, + free_at_start_bytes=exc.free_at_start_bytes, + current_grad_accum=exc.current_grad_accum, + recommended_grad_accum=exc.recommended_grad_accum, + ) + ) + raise typer.Exit(code=1) from exc + except ResumeIntegrityError as exc: + console.print(f"[red]resume:[/red] {exc}") + raise typer.Exit(code=1) from exc + except (NoPreferenceContentError, PriorAdapterRequiredError) as exc: + console.print(f"[red]dpo:[/red] {exc}") + raise typer.Exit(code=1) from exc + except DpoPhaseError as exc: + console.print(f"[red]dpo:[/red] {exc}") + raise typer.Exit(code=1) from exc + except TrainingError as exc: + console.print(f"[red]training:[/red] {exc}") + raise typer.Exit(code=1) from exc + except ModalityError as exc: + console.print(f"[red]training:[/red] {exc}") + raise typer.Exit(code=1) from exc + + if not phase_results: + console.print( + "[yellow]no-op:[/yellow] nothing to train for the requested phase. " + "Check that the document has the section types the phase consumes " + "(prose/instruction for SFT, preference for DPO)." + ) + raise typer.Exit(code=0) + + for pr in phase_results: + result = pr.result + console.print( + f"[green]{pr.phase}:[/green] v{result.adapter_version:04d} " + f"({result.steps} steps, seed={result.seed}, " + f"determinism={result.determinism.class_})" + ) + console.print(f"adapter: {result.adapter_path}") + console.print(f"log: {result.log_path}") + # Final-train-loss stdout line mirrors the last phase so existing + # downstream scripts keep working. 
+ result = phase_results[-1].result + if result.final_train_loss is not None: + sys.stdout.write(f"{result.final_train_loss}\n") + + # --watch keeps the training context alive and re-runs incremental + # cycles on file change. Entered AFTER the initial train so the + # loop resumes from a real committed adapter. + if watch: + if watch_repl: + console.print( + "[red]train:[/red] --watch --repl is scaffolded but not yet " + "implemented. The threaded REPL bridge needs a test " + "harness we don't have in CI today." + ) + raise typer.Exit(code=2) + + from dlm.watch.loop import run_watch + from dlm.watch.status import WatchStatus, render_status + + status = WatchStatus(doc_path=str(path), sections=len(parsed.sections)) + + # Start the probe-RPC server if --listen-rpc was requested. The + # queue is exposed; end-to-end flow into `build_dataset` at the + # next cycle boundary is the follow-up consumer task — for now + # the server accepts and buffers probes so sway sinks can be + # wired + tested against a live endpoint. 
+ rpc_server = None + probe_queue = None + if rpc_config is not None: + from dlm.train.inject import InjectedProbeQueue + from dlm.train.rpc import ProbeRpcServer + + rpc_host, rpc_port, rpc_token = rpc_config + probe_queue = InjectedProbeQueue() + rpc_server = ProbeRpcServer( + host=rpc_host, port=rpc_port, token=rpc_token, queue=probe_queue + ) + rpc_server.start() + bound_host, bound_port = rpc_server.address + console.print( + f"[dim]rpc:[/dim] listening on {bound_host}:{bound_port} " + f"(queue capacity {probe_queue.capacity})" + ) + + console.print( + f"[dim]watch:[/dim] {render_status(status)}; " + f"max_steps={watch_max_steps}, debounce_ms={watch_debounce_ms}" + ) + + def _log_cycle(result_: object) -> None: + from dlm.watch.loop import CycleResult + + assert isinstance(result_, CycleResult) + if result_.ran and result_.run_result is not None: + status.mark_cycle_done( + train_loss=result_.run_result.final_train_loss, + val_loss=result_.run_result.final_val_loss, + steps=result_.run_result.steps, + coalesced=1, + ) + console.print(f"[dim]watch:[/dim] {render_status(status)}") + else: + console.print("[dim]watch:[/dim] no new content, skipping retrain") + + try: + exit_code = run_watch( + doc_path=path, + store=store, + spec=spec, + plan=plan, + max_steps=watch_max_steps, + debounce_ms=watch_debounce_ms, + on_cycle=_log_cycle, + drain_probes=probe_queue.drain if probe_queue is not None else None, + ) + except KeyboardInterrupt: + if rpc_server is not None: + rpc_server.stop() + console.print("[dim]watch:[/dim] Ctrl-C received, exiting") + raise typer.Exit(code=0) # noqa: B904 + finally: + if rpc_server is not None: + rpc_server.stop() + raise typer.Exit(code=exit_code) + + # --max-cycles without --watch: the bounded-loop cycle driver is + # the next consumer-side integration step. Accept the flags, refuse + # execution until the loop lands. 
+ if rpc_config is not None and not watch: + console.print( + "[red]train:[/red] --listen-rpc --max-cycles (without --watch) is " + "scaffolded; the bounded cycle loop is the follow-up. Use " + "--watch for now." + ) + raise typer.Exit(code=2) + + +def _maybe_dispatch_multi_gpu( + gpus_flag: str, + argv: list[str], + console: object, + *, + mixed_precision: str = "bf16", +) -> int | None: + """Resolve `--gpus`; if multi-GPU, spawn accelerate launch and return its exit code. + + Returns None when the resolved world_size is 1 — caller falls + through to the in-process trainer. Returns an int exit code when + the launcher ran, so the caller can `raise typer.Exit(code=...)`. + """ + from rich.console import Console + + from dlm.train.distributed import UnsupportedGpuSpecError, launch_multi_gpu, parse_gpus + + assert isinstance(console, Console) + + try: + spec = parse_gpus(gpus_flag) + except UnsupportedGpuSpecError as exc: + console.print(f"[red]train:[/red] {exc}") + return 2 + + try: + import torch + + device_count = int(torch.cuda.device_count()) + except Exception: # pragma: no cover - torch probing has many failure modes + device_count = 0 + + try: + device_ids = spec.resolve(device_count) + except UnsupportedGpuSpecError as exc: + console.print(f"[red]train:[/red] {exc}") + return 2 + + if len(device_ids) < 2: + # Single-GPU (or --gpus 1) — no subprocess needed. Caller + # continues with the in-process path. + return None + + # Forward the original argv minus `--gpus` / `--gpus=...`; the + # worker entry strips it defensively too, but we drop it here so + # the launched accelerate cmd carries exactly the intended args. 
+ cli_args = _strip_gpus_from_argv(argv) + console.print( + f"[dim]train:[/dim] dispatching to accelerate launch on devices {list(device_ids)} " + f"(mixed_precision={mixed_precision})" + ) + return launch_multi_gpu(device_ids, cli_args, mixed_precision=mixed_precision) + + +def _strip_gpus_from_argv(argv: list[str]) -> list[str]: + """Drop `--gpus ` / `--gpus=` from raw sys.argv (launcher side). + + Skips argv[0] (script path) — `accelerate launch -m ` + provides the rank entrypoint separately, so the launcher forwards + argv[1:] minus the multi-GPU flag. Delegates to the shared + `strip_gpus_flag` helper. + """ + from dlm.train.distributed.gpus import strip_gpus_flag + + return strip_gpus_flag(argv, skip_argv0=True) diff --git a/tests/unit/cli/test_train_validation_edges.py b/tests/unit/cli/test_train_validation_edges.py index e43e818..15cb0cb 100644 --- a/tests/unit/cli/test_train_validation_edges.py +++ b/tests/unit/cli/test_train_validation_edges.py @@ -115,7 +115,7 @@ def _fake_dispatch( seen["mixed_precision"] = mixed_precision return 17 - monkeypatch.setattr("dlm.cli.commands._maybe_dispatch_multi_gpu", _fake_dispatch) + monkeypatch.setattr("dlm.cli.commands.train._maybe_dispatch_multi_gpu", _fake_dispatch) runner = CliRunner() result = runner.invoke( From f6d4999f88427389332407af4bea79c11a8cae2d Mon Sep 17 00:00:00 2001 From: mfwolffe Date: Tue, 28 Apr 2026 19:20:11 -0400 Subject: [PATCH 23/46] Extract prompt_cmd + VL/audio dispatch helpers into cli/commands/prompt.py --- src/dlm/cli/commands/__init__.py | 415 +-------------------------- src/dlm/cli/commands/prompt.py | 421 ++++++++++++++++++++++++++++ tests/unit/cli/test_prompt_edges.py | 4 +- 3 files changed, 427 insertions(+), 413 deletions(-) create mode 100644 src/dlm/cli/commands/prompt.py diff --git a/src/dlm/cli/commands/__init__.py b/src/dlm/cli/commands/__init__.py index 4cf0620..5ab699f 100644 --- a/src/dlm/cli/commands/__init__.py +++ b/src/dlm/cli/commands/__init__.py @@ -12,7 +12,7 @@ from 
collections.abc import Sequence from pathlib import Path -from typing import Annotated, Any +from typing import Annotated import typer @@ -34,6 +34,9 @@ from dlm.cli.commands.preference import preference_list_cmd as preference_list_cmd from dlm.cli.commands.preference import preference_mine_cmd as preference_mine_cmd from dlm.cli.commands.preference import preference_revert_cmd as preference_revert_cmd +from dlm.cli.commands.prompt import _dispatch_audio_prompt as _dispatch_audio_prompt +from dlm.cli.commands.prompt import _dispatch_vl_prompt as _dispatch_vl_prompt +from dlm.cli.commands.prompt import prompt_cmd as prompt_cmd from dlm.cli.commands.pull import pull_cmd as pull_cmd from dlm.cli.commands.push import push_cmd as push_cmd from dlm.cli.commands.repl import repl_cmd as repl_cmd @@ -59,416 +62,6 @@ def _stub(sprint: str, subject: str) -> None: -def prompt_cmd( - ctx: typer.Context, - path: Annotated[Path, typer.Argument(help=".dlm file to query.")], - query: Annotated[str | None, typer.Argument(help="One-shot prompt (omit for stdin).")] = None, - max_tokens: Annotated[ - int, - typer.Option("--max-tokens", help="Max new tokens to generate."), - ] = 256, - temp: Annotated[ - float, - typer.Option("--temp", help="Sampling temperature. `0.0` = greedy decoding."), - ] = 0.7, - top_p: Annotated[ - float | None, - typer.Option( - "--top-p", - help="Top-p sampling cutoff. Omit to disable nucleus sampling.", - ), - ] = None, - verbose: Annotated[bool, typer.Option("--verbose", help="Log resolved InferencePlan.")] = False, - adapter: Annotated[ - str | None, - typer.Option( - "--adapter", - help=( - "Named adapter to prompt against. Required on multi-adapter " - "documents; rejected on single-adapter documents." - ), - ), - ] = None, - gate: Annotated[ - str, - typer.Option( - "--gate", - help=( - "Learned adapter gate. `auto` (default) uses the " - "gate when one exists in the store; `off` forces uniform " - "weights across declared adapters. 
Ignored when --adapter " - "explicitly pins a single adapter." - ), - ), - ] = "auto", - backend: Annotated[ - str, - typer.Option( - "--backend", - help=( - "Inference backend: `auto` (default) picks MLX on Apple " - "Silicon, else PyTorch. Force with `pytorch` or `mlx`. " - "MLX requires `uv sync --extra mlx` on darwin-arm64." - ), - ), - ] = "auto", - image: Annotated[ - list[Path] | None, - typer.Option( - "--image", - help=( - "Attach an image file to the prompt. Repeat for multiple " - "images; each expands to the base's image-token placeholder. " - "Requires a vision-language base." - ), - ), - ] = None, - audio: Annotated[ - list[Path] | None, - typer.Option( - "--audio", - help=( - "Attach an audio file (.wav/.flac/.ogg) to the prompt. " - "Repeat for multiple clips; each expands to the base's " - "audio-token placeholder. Requires an audio-language base " - "(for example Qwen2-Audio-7B-Instruct)." - ), - ), - ] = None, -) -> None: - """Run inference against the trained adapter.""" - import sys - - from rich.console import Console - - from dlm.base_models import resolve as resolve_base_model - from dlm.doc.parser import parse_file - from dlm.hardware import doctor - from dlm.inference import AdapterNotFoundError - from dlm.inference.backends import ( - UnsupportedBackendError, - build_backend, - select_backend, - ) - from dlm.store.paths import for_dlm - - console = Console(stderr=True) - - if backend not in ("auto", "pytorch", "mlx"): - console.print( - f"[red]prompt:[/red] --backend must be `auto`, `pytorch`, or `mlx` (got {backend!r})." - ) - raise typer.Exit(code=2) - - # Typer passes None when the option was never given; normalize early so - # downstream branching can just check truthiness + len(). - image_paths: list[Path] = list(image or []) - audio_paths: list[Path] = list(audio or []) - if image_paths and audio_paths: - console.print( - "[red]prompt:[/red] --image and --audio cannot be combined " - "(each targets a different modality)." 
- ) - raise typer.Exit(code=2) - - from dlm.base_models import GatedModelError - - parsed = parse_file(path) - adapters_declared = parsed.frontmatter.training.adapters - if adapter is not None: - if adapters_declared is None: - console.print( - "[red]prompt:[/red] --adapter is only valid on multi-adapter " - "documents (this doc does not declare `training.adapters`)." - ) - raise typer.Exit(code=2) - if adapter not in adapters_declared: - declared = sorted(adapters_declared) - console.print( - f"[red]prompt:[/red] --adapter {adapter!r} is not declared (declared: {declared})." - ) - raise typer.Exit(code=2) - - if gate not in ("auto", "off"): - console.print(f"[red]prompt:[/red] --gate must be `auto` or `off`, got {gate!r}.") - raise typer.Exit(code=2) - # --adapter explicitly pins a single adapter — gate routing is moot. - # We silently ignore --gate in that case (the flag has a non-default - # value only when the user cares, and pairing it with --adapter is - # not an error, just a no-op). - - store = for_dlm(parsed.frontmatter.dlm_id) - already_accepted = _previously_accepted(store.manifest) - try: - spec = resolve_base_model(parsed.frontmatter.base_model, accept_license=already_accepted) - except GatedModelError as exc: - console.print( - f"[red]license:[/red] base {parsed.frontmatter.base_model!r} is gated and has " - "no recorded acceptance in this store; run `dlm train --i-accept-license` first." - ) - raise typer.Exit(code=1) from exc - caps = doctor().capabilities - - # --- VL path ------------------------------------------------------- - # The VL branch has its own model / processor / adapter loader and - # its own generate function. `--image` and vision-language bases - # must appear together; each alone is a usage error. 
- from dlm.modality import modality_for - - dispatch = modality_for(spec) - from click.core import ParameterSource - - if ctx.get_parameter_source("temp") == ParameterSource.DEFAULT: - temp = spec.suggested_prompt_temperature - if image_paths and not dispatch.accepts_images: - console.print( - f"[red]prompt:[/red] --image is only valid with vision-language bases; " - f"base {spec.key!r} is modality='{spec.modality}'." - ) - raise typer.Exit(code=2) - if dispatch.accepts_images and not image_paths: - console.print( - f"[red]prompt:[/red] base {spec.key!r} is vision-language; " - "pass at least one --image PATH to prompt it." - ) - raise typer.Exit(code=2) - if dispatch.accepts_images: - _dispatch_vl_prompt( - console=console, - spec=spec, - store=store, - caps=caps, - adapter_name=adapter, - image_paths=image_paths, - query=query, - max_tokens=max_tokens, - temp=temp, - top_p=top_p, - verbose=verbose, - ) - return - - # --- Audio path ---------------------------------------------------- - if audio_paths and not dispatch.accepts_audio: - console.print( - f"[red]prompt:[/red] --audio is only valid with audio-language bases; " - f"base {spec.key!r} is modality='{spec.modality}'." - ) - raise typer.Exit(code=2) - if dispatch.accepts_audio and not audio_paths: - console.print( - f"[red]prompt:[/red] base {spec.key!r} is audio-language; " - "pass at least one --audio PATH to prompt it." 
- ) - raise typer.Exit(code=2) - if dispatch.accepts_audio: - _dispatch_audio_prompt( - console=console, - spec=spec, - store=store, - caps=caps, - adapter_name=adapter, - audio_paths=audio_paths, - query=query, - max_tokens=max_tokens, - temp=temp, - top_p=top_p, - verbose=verbose, - auto_resample=parsed.frontmatter.training.audio.auto_resample, - ) - return - - try: - backend_name = select_backend(backend, caps) # type: ignore[arg-type] - except UnsupportedBackendError as exc: - console.print(f"[red]prompt:[/red] {exc}") - raise typer.Exit(code=2) from exc - backend_obj = build_backend(backend_name, caps) - - if verbose: - console.print(f"[dim]backend:[/dim] {backend_name}") - - try: - backend_obj.load(spec, store, adapter_name=adapter) - except AdapterNotFoundError as exc: - console.print(f"[red]prompt:[/red] {exc}") - raise typer.Exit(code=1) from exc - - if query is None: - query = sys.stdin.read().strip() - if not query: - console.print("[red]prompt:[/red] empty query (pass a string or pipe on stdin)") - raise typer.Exit(code=2) - - response = backend_obj.generate( - query, - max_new_tokens=max_tokens, - temperature=temp, - top_p=top_p, - ) - sys.stdout.write(response + "\n") - - -def _dispatch_vl_prompt( # pragma: no cover - *, - console: Any, - spec: Any, - store: Any, - caps: Any, - adapter_name: str | None, - image_paths: list[Path], - query: str | None, - max_tokens: int, - temp: float, - top_p: float | None, - verbose: bool, -) -> None: - """Run the VL generate path. Keeps `prompt_cmd` readable. - - Pragma'd from unit coverage because it calls the VL HF stack. - Covered by the slow-marked vision-language integration test (T12). 
- """ - import sys - - import typer - - from dlm.inference import ( - AdapterNotFoundError, - generate_vl, - load_for_vl_inference, - load_images, - ) - from dlm.modality import ProcessorContractError - - if verbose: - console.print("[dim]vl-backend:[/dim] pytorch (AutoModelForImageTextToText)") - - try: - loaded = load_for_vl_inference(store, spec, caps, adapter_name=adapter_name) - except AdapterNotFoundError as exc: - console.print(f"[red]prompt:[/red] {exc}") - raise typer.Exit(code=1) from exc - except ProcessorContractError as exc: - console.print(f"[red]prompt:[/red] {exc}") - raise typer.Exit(code=1) from exc - - try: - images = load_images(image_paths) - except FileNotFoundError as exc: - console.print(f"[red]prompt:[/red] {exc}") - raise typer.Exit(code=2) from exc - - if query is None: - query = sys.stdin.read().strip() - if not query: - console.print("[red]prompt:[/red] empty query (pass a string or pipe on stdin)") - raise typer.Exit(code=2) - - # Every VL spec in the registry must declare a preprocessor plan - # (schema validator); the fallback is defensive for the hf: escape - # hatch, which could in principle skip one. - image_token = "" - if spec.vl_preprocessor_plan is not None: - image_token = spec.vl_preprocessor_plan.image_token - - response = generate_vl( - loaded.model, - loaded.processor, - query, - images, - image_token=image_token, - max_new_tokens=max_tokens, - temperature=temp, - top_p=top_p, - ) - sys.stdout.write(response + "\n") - - -def _dispatch_audio_prompt( # pragma: no cover - *, - console: Any, - spec: Any, - store: Any, - caps: Any, - adapter_name: str | None, - audio_paths: list[Path], - query: str | None, - max_tokens: int, - temp: float, - top_p: float | None, - verbose: bool, - auto_resample: bool = False, -) -> None: - """Run the audio-LM generate path. Keeps `prompt_cmd` readable. - - Pragma'd from unit coverage because it calls the audio HF stack. - Covered by the slow-marked audio integration test (T12). 
- """ - import sys - - import typer - - from dlm.inference import ( - AdapterNotFoundError, - generate_audio, - load_audios, - load_for_audio_inference, - ) - - if verbose: - console.print(f"[dim]audio-backend:[/dim] pytorch ({spec.architecture})") - - try: - loaded = load_for_audio_inference(store, spec, caps, adapter_name=adapter_name) - except AdapterNotFoundError as exc: - console.print(f"[red]prompt:[/red] {exc}") - raise typer.Exit(code=1) from exc - - if spec.audio_preprocessor_plan is None: - # Defensive — every registry audio spec carries the plan, but - # the hf: escape hatch could skip it. - console.print( - f"[red]prompt:[/red] base {spec.key!r} is audio-language " - "but has no audio_preprocessor_plan; cannot resolve sample rate." - ) - raise typer.Exit(code=2) - - target_sr = spec.audio_preprocessor_plan.sample_rate - try: - waveforms = load_audios( - audio_paths, - target_sample_rate=target_sr, - auto_resample=auto_resample, - ) - except FileNotFoundError as exc: - console.print(f"[red]prompt:[/red] {exc}") - raise typer.Exit(code=2) from exc - except ValueError as exc: - # Sample-rate mismatch — surface the actionable ffmpeg hint. 
- console.print(f"[red]prompt:[/red] {exc}") - raise typer.Exit(code=2) from exc - - if query is None: - query = sys.stdin.read().strip() - if not query: - console.print("[red]prompt:[/red] empty query (pass a string or pipe on stdin)") - raise typer.Exit(code=2) - - audio_token = spec.audio_preprocessor_plan.audio_token - - response = generate_audio( - loaded.model, - loaded.processor, - query, - waveforms, - audio_token=audio_token, - sample_rate=target_sr, - max_new_tokens=max_tokens, - temperature=temp, - top_p=top_p, - ) - sys.stdout.write(response + "\n") - def export_cmd( path: Annotated[Path, typer.Argument(help=".dlm file to export.")], diff --git a/src/dlm/cli/commands/prompt.py b/src/dlm/cli/commands/prompt.py new file mode 100644 index 0000000..1f7501c --- /dev/null +++ b/src/dlm/cli/commands/prompt.py @@ -0,0 +1,421 @@ +"""`dlm prompt` — run inference against the trained adapter.""" + +from __future__ import annotations + +from pathlib import Path +from typing import Annotated, Any + +import typer + +from dlm.cli.commands._shared import _previously_accepted + + +def prompt_cmd( + ctx: typer.Context, + path: Annotated[Path, typer.Argument(help=".dlm file to query.")], + query: Annotated[str | None, typer.Argument(help="One-shot prompt (omit for stdin).")] = None, + max_tokens: Annotated[ + int, + typer.Option("--max-tokens", help="Max new tokens to generate."), + ] = 256, + temp: Annotated[ + float, + typer.Option("--temp", help="Sampling temperature. `0.0` = greedy decoding."), + ] = 0.7, + top_p: Annotated[ + float | None, + typer.Option( + "--top-p", + help="Top-p sampling cutoff. Omit to disable nucleus sampling.", + ), + ] = None, + verbose: Annotated[bool, typer.Option("--verbose", help="Log resolved InferencePlan.")] = False, + adapter: Annotated[ + str | None, + typer.Option( + "--adapter", + help=( + "Named adapter to prompt against. Required on multi-adapter " + "documents; rejected on single-adapter documents." 
+ ), + ), + ] = None, + gate: Annotated[ + str, + typer.Option( + "--gate", + help=( + "Learned adapter gate. `auto` (default) uses the " + "gate when one exists in the store; `off` forces uniform " + "weights across declared adapters. Ignored when --adapter " + "explicitly pins a single adapter." + ), + ), + ] = "auto", + backend: Annotated[ + str, + typer.Option( + "--backend", + help=( + "Inference backend: `auto` (default) picks MLX on Apple " + "Silicon, else PyTorch. Force with `pytorch` or `mlx`. " + "MLX requires `uv sync --extra mlx` on darwin-arm64." + ), + ), + ] = "auto", + image: Annotated[ + list[Path] | None, + typer.Option( + "--image", + help=( + "Attach an image file to the prompt. Repeat for multiple " + "images; each expands to the base's image-token placeholder. " + "Requires a vision-language base." + ), + ), + ] = None, + audio: Annotated[ + list[Path] | None, + typer.Option( + "--audio", + help=( + "Attach an audio file (.wav/.flac/.ogg) to the prompt. " + "Repeat for multiple clips; each expands to the base's " + "audio-token placeholder. Requires an audio-language base " + "(for example Qwen2-Audio-7B-Instruct)." + ), + ), + ] = None, +) -> None: + """Run inference against the trained adapter.""" + import sys + + from rich.console import Console + + from dlm.base_models import resolve as resolve_base_model + from dlm.doc.parser import parse_file + from dlm.hardware import doctor + from dlm.inference import AdapterNotFoundError + from dlm.inference.backends import ( + UnsupportedBackendError, + build_backend, + select_backend, + ) + from dlm.store.paths import for_dlm + + console = Console(stderr=True) + + if backend not in ("auto", "pytorch", "mlx"): + console.print( + f"[red]prompt:[/red] --backend must be `auto`, `pytorch`, or `mlx` (got {backend!r})." + ) + raise typer.Exit(code=2) + + # Typer passes None when the option was never given; normalize early so + # downstream branching can just check truthiness + len(). 
+ image_paths: list[Path] = list(image or []) + audio_paths: list[Path] = list(audio or []) + if image_paths and audio_paths: + console.print( + "[red]prompt:[/red] --image and --audio cannot be combined " + "(each targets a different modality)." + ) + raise typer.Exit(code=2) + + from dlm.base_models import GatedModelError + + parsed = parse_file(path) + adapters_declared = parsed.frontmatter.training.adapters + if adapter is not None: + if adapters_declared is None: + console.print( + "[red]prompt:[/red] --adapter is only valid on multi-adapter " + "documents (this doc does not declare `training.adapters`)." + ) + raise typer.Exit(code=2) + if adapter not in adapters_declared: + declared = sorted(adapters_declared) + console.print( + f"[red]prompt:[/red] --adapter {adapter!r} is not declared (declared: {declared})." + ) + raise typer.Exit(code=2) + + if gate not in ("auto", "off"): + console.print(f"[red]prompt:[/red] --gate must be `auto` or `off`, got {gate!r}.") + raise typer.Exit(code=2) + # --adapter explicitly pins a single adapter — gate routing is moot. + # We silently ignore --gate in that case (the flag has a non-default + # value only when the user cares, and pairing it with --adapter is + # not an error, just a no-op). + + store = for_dlm(parsed.frontmatter.dlm_id) + already_accepted = _previously_accepted(store.manifest) + try: + spec = resolve_base_model(parsed.frontmatter.base_model, accept_license=already_accepted) + except GatedModelError as exc: + console.print( + f"[red]license:[/red] base {parsed.frontmatter.base_model!r} is gated and has " + "no recorded acceptance in this store; run `dlm train --i-accept-license` first." + ) + raise typer.Exit(code=1) from exc + caps = doctor().capabilities + + # --- VL path ------------------------------------------------------- + # The VL branch has its own model / processor / adapter loader and + # its own generate function. 
`--image` and vision-language bases + # must appear together; each alone is a usage error. + from dlm.modality import modality_for + + dispatch = modality_for(spec) + from click.core import ParameterSource + + if ctx.get_parameter_source("temp") == ParameterSource.DEFAULT: + temp = spec.suggested_prompt_temperature + if image_paths and not dispatch.accepts_images: + console.print( + f"[red]prompt:[/red] --image is only valid with vision-language bases; " + f"base {spec.key!r} is modality='{spec.modality}'." + ) + raise typer.Exit(code=2) + if dispatch.accepts_images and not image_paths: + console.print( + f"[red]prompt:[/red] base {spec.key!r} is vision-language; " + "pass at least one --image PATH to prompt it." + ) + raise typer.Exit(code=2) + if dispatch.accepts_images: + _dispatch_vl_prompt( + console=console, + spec=spec, + store=store, + caps=caps, + adapter_name=adapter, + image_paths=image_paths, + query=query, + max_tokens=max_tokens, + temp=temp, + top_p=top_p, + verbose=verbose, + ) + return + + # --- Audio path ---------------------------------------------------- + if audio_paths and not dispatch.accepts_audio: + console.print( + f"[red]prompt:[/red] --audio is only valid with audio-language bases; " + f"base {spec.key!r} is modality='{spec.modality}'." + ) + raise typer.Exit(code=2) + if dispatch.accepts_audio and not audio_paths: + console.print( + f"[red]prompt:[/red] base {spec.key!r} is audio-language; " + "pass at least one --audio PATH to prompt it." 
+ ) + raise typer.Exit(code=2) + if dispatch.accepts_audio: + _dispatch_audio_prompt( + console=console, + spec=spec, + store=store, + caps=caps, + adapter_name=adapter, + audio_paths=audio_paths, + query=query, + max_tokens=max_tokens, + temp=temp, + top_p=top_p, + verbose=verbose, + auto_resample=parsed.frontmatter.training.audio.auto_resample, + ) + return + + try: + backend_name = select_backend(backend, caps) # type: ignore[arg-type] + except UnsupportedBackendError as exc: + console.print(f"[red]prompt:[/red] {exc}") + raise typer.Exit(code=2) from exc + backend_obj = build_backend(backend_name, caps) + + if verbose: + console.print(f"[dim]backend:[/dim] {backend_name}") + + try: + backend_obj.load(spec, store, adapter_name=adapter) + except AdapterNotFoundError as exc: + console.print(f"[red]prompt:[/red] {exc}") + raise typer.Exit(code=1) from exc + + if query is None: + query = sys.stdin.read().strip() + if not query: + console.print("[red]prompt:[/red] empty query (pass a string or pipe on stdin)") + raise typer.Exit(code=2) + + response = backend_obj.generate( + query, + max_new_tokens=max_tokens, + temperature=temp, + top_p=top_p, + ) + sys.stdout.write(response + "\n") + + +def _dispatch_vl_prompt( # pragma: no cover + *, + console: Any, + spec: Any, + store: Any, + caps: Any, + adapter_name: str | None, + image_paths: list[Path], + query: str | None, + max_tokens: int, + temp: float, + top_p: float | None, + verbose: bool, +) -> None: + """Run the VL generate path. Keeps `prompt_cmd` readable. + + Pragma'd from unit coverage because it calls the VL HF stack. + Covered by the slow-marked vision-language integration test. 
+ """ + import sys + + import typer + + from dlm.inference import ( + AdapterNotFoundError, + generate_vl, + load_for_vl_inference, + load_images, + ) + from dlm.modality import ProcessorContractError + + if verbose: + console.print("[dim]vl-backend:[/dim] pytorch (AutoModelForImageTextToText)") + + try: + loaded = load_for_vl_inference(store, spec, caps, adapter_name=adapter_name) + except AdapterNotFoundError as exc: + console.print(f"[red]prompt:[/red] {exc}") + raise typer.Exit(code=1) from exc + except ProcessorContractError as exc: + console.print(f"[red]prompt:[/red] {exc}") + raise typer.Exit(code=1) from exc + + try: + images = load_images(image_paths) + except FileNotFoundError as exc: + console.print(f"[red]prompt:[/red] {exc}") + raise typer.Exit(code=2) from exc + + if query is None: + query = sys.stdin.read().strip() + if not query: + console.print("[red]prompt:[/red] empty query (pass a string or pipe on stdin)") + raise typer.Exit(code=2) + + # Every VL spec in the registry must declare a preprocessor plan + # (schema validator); the fallback is defensive for the hf: escape + # hatch, which could in principle skip one. + image_token = "" + if spec.vl_preprocessor_plan is not None: + image_token = spec.vl_preprocessor_plan.image_token + + response = generate_vl( + loaded.model, + loaded.processor, + query, + images, + image_token=image_token, + max_new_tokens=max_tokens, + temperature=temp, + top_p=top_p, + ) + sys.stdout.write(response + "\n") + + +def _dispatch_audio_prompt( # pragma: no cover + *, + console: Any, + spec: Any, + store: Any, + caps: Any, + adapter_name: str | None, + audio_paths: list[Path], + query: str | None, + max_tokens: int, + temp: float, + top_p: float | None, + verbose: bool, + auto_resample: bool = False, +) -> None: + """Run the audio-LM generate path. Keeps `prompt_cmd` readable. + + Pragma'd from unit coverage because it calls the audio HF stack. + Covered by the slow-marked audio integration test. 
+ """ + import sys + + import typer + + from dlm.inference import ( + AdapterNotFoundError, + generate_audio, + load_audios, + load_for_audio_inference, + ) + + if verbose: + console.print(f"[dim]audio-backend:[/dim] pytorch ({spec.architecture})") + + try: + loaded = load_for_audio_inference(store, spec, caps, adapter_name=adapter_name) + except AdapterNotFoundError as exc: + console.print(f"[red]prompt:[/red] {exc}") + raise typer.Exit(code=1) from exc + + if spec.audio_preprocessor_plan is None: + # Defensive — every registry audio spec carries the plan, but + # the hf: escape hatch could skip it. + console.print( + f"[red]prompt:[/red] base {spec.key!r} is audio-language " + "but has no audio_preprocessor_plan; cannot resolve sample rate." + ) + raise typer.Exit(code=2) + + target_sr = spec.audio_preprocessor_plan.sample_rate + try: + waveforms = load_audios( + audio_paths, + target_sample_rate=target_sr, + auto_resample=auto_resample, + ) + except FileNotFoundError as exc: + console.print(f"[red]prompt:[/red] {exc}") + raise typer.Exit(code=2) from exc + except ValueError as exc: + # Sample-rate mismatch — surface the actionable ffmpeg hint. 
+ console.print(f"[red]prompt:[/red] {exc}") + raise typer.Exit(code=2) from exc + + if query is None: + query = sys.stdin.read().strip() + if not query: + console.print("[red]prompt:[/red] empty query (pass a string or pipe on stdin)") + raise typer.Exit(code=2) + + audio_token = spec.audio_preprocessor_plan.audio_token + + response = generate_audio( + loaded.model, + loaded.processor, + query, + waveforms, + audio_token=audio_token, + sample_rate=target_sr, + max_new_tokens=max_tokens, + temperature=temp, + top_p=top_p, + ) + sys.stdout.write(response + "\n") diff --git a/tests/unit/cli/test_prompt_edges.py b/tests/unit/cli/test_prompt_edges.py index 295c0b3..f85678f 100644 --- a/tests/unit/cli/test_prompt_edges.py +++ b/tests/unit/cli/test_prompt_edges.py @@ -245,7 +245,7 @@ def test_vision_language_dispatch_branch_invokes_helper( spec=_spec(key="vl-demo", modality="vision-language"), ) monkeypatch.setattr( - "dlm.cli.commands._dispatch_vl_prompt", + "dlm.cli.commands.prompt._dispatch_vl_prompt", lambda **kwargs: captured.update(kwargs), ) @@ -276,7 +276,7 @@ def test_audio_dispatch_branch_invokes_helper( spec=_spec(key="audio-demo", modality="audio-language"), ) monkeypatch.setattr( - "dlm.cli.commands._dispatch_audio_prompt", + "dlm.cli.commands.prompt._dispatch_audio_prompt", lambda **kwargs: captured.update(kwargs), ) From 2b94a4315497c5272f1bcb95eab9bb86d30be4b7 Mon Sep 17 00:00:00 2001 From: mfwolffe Date: Tue, 28 Apr 2026 19:24:05 -0400 Subject: [PATCH 24/46] Extract export_cmd into cli/commands/export.py + drop stale ARG001 ignore --- pyproject.toml | 4 - src/dlm/cli/commands/__init__.py | 668 +------------------------------ src/dlm/cli/commands/export.py | 640 +++++++++++++++++++++++++++++ 3 files changed, 659 insertions(+), 653 deletions(-) create mode 100644 src/dlm/cli/commands/export.py diff --git a/pyproject.toml b/pyproject.toml index 755e04c..15f3c9a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -163,10 +163,6 @@ ignore = [ # into a 
sequence of readers; the nested form is clearer than the # comma-separated one. "tests/**/*.py" = ["ARG", "PT011", "SIM117"] -# Typer stub subcommands accept every CLI arg the real implementation -# will take so `--help` reflects the shipping surface — even though the -# stub body discards them. -"src/dlm/cli/commands.py" = ["ARG001"] # HuggingFace Trainer callbacks MUST accept `args`/`state`/`control` # positionally even when the implementation only reads some of them — # HF dispatches them by position. ARG002 for these wrappers is noise. diff --git a/src/dlm/cli/commands/__init__.py b/src/dlm/cli/commands/__init__.py index 5ab699f..a288711 100644 --- a/src/dlm/cli/commands/__init__.py +++ b/src/dlm/cli/commands/__init__.py @@ -1,21 +1,19 @@ -"""Subcommand stubs for the v1.0 CLI surface. - -Every stub raises `NotImplementedError` with the sprint number that will -implement it. This makes `dlm --help` self-documenting about project -progress. Arguments are accepted so `--help` renders the real eventual -surface; they're unused until each subcommand's owning sprint lands, -which is why `src/dlm/cli/commands.py` has a ruff per-file-ignore for -`ARG001` in `pyproject.toml`. +"""Re-exports for the `dlm.cli.commands` package. + +Each `*_cmd` function lives in its own submodule +(`dlm.cli.commands.`); this `__init__.py` re-exports them so +`from dlm.cli.commands import _cmd` and `commands._cmd` +keep working from `dlm.cli.app` and any test fixtures that bind +helpers by their pre-package name. + +Private helpers (`_*`) are also re-exported when test fixtures +monkeypatch them through `dlm.cli.commands.` — keeping the +import path stable here lets callers stay agnostic about which +submodule owns the helper. 
""" from __future__ import annotations -from collections.abc import Sequence -from pathlib import Path -from typing import Annotated - -import typer - from dlm.cli.commands._shared import _human_size as _human_size from dlm.cli.commands._shared import _previously_accepted as _previously_accepted from dlm.cli.commands.cache import _parse_duration as _parse_duration @@ -23,6 +21,7 @@ from dlm.cli.commands.cache import cache_prune_cmd as cache_prune_cmd from dlm.cli.commands.cache import cache_show_cmd as cache_show_cmd from dlm.cli.commands.doctor import doctor_cmd as doctor_cmd +from dlm.cli.commands.export import export_cmd as export_cmd from dlm.cli.commands.harvest import harvest_cmd as harvest_cmd from dlm.cli.commands.init import _prompt_accept_license as _prompt_accept_license from dlm.cli.commands.init import init_cmd as init_cmd @@ -54,641 +53,12 @@ def _stub(sprint: str, subject: str) -> None: - """Raise a clear unimplemented error pointing to the owning sprint.""" + """Raise a clear unimplemented error for any v1 subcommand still pending. + + Kept around because a unit test asserts the error message shape, and + because future v2/v3 subcommand wiring may want a stable stub + helper to register a not-yet-implemented surface in `--help`. + """ raise NotImplementedError( f"`{subject}` is not implemented yet (owned by Sprint {sprint}).", ) - - - - - -def export_cmd( - path: Annotated[Path, typer.Argument(help=".dlm file to export.")], - target: Annotated[ - str, - typer.Option( - "--target", - help="Export destination. 
Currently supported: ollama, llama-server, vllm, mlx-serve.", - ), - ] = "ollama", - quant: Annotated[ - str | None, - typer.Option("--quant", help="GGUF quant level (defaults to frontmatter)."), - ] = None, - merged: Annotated[ - bool, - typer.Option("--merged", help="Merge the adapter into the base before export."), - ] = False, - dequantize: Annotated[ - bool, - typer.Option( - "--dequantize", - help="Dequantize a QLoRA base to fp16 before merging.", - ), - ] = False, - name: Annotated[str | None, typer.Option("--name", help="Ollama model name.")] = None, - no_template: Annotated[ - bool, - typer.Option("--no-template", help="Skip writing TEMPLATE into the Modelfile."), - ] = False, - no_smoke: Annotated[ - bool, - typer.Option("--no-smoke", help="Register the export but skip the smoke prompt."), - ] = False, - no_imatrix: Annotated[ - bool, - typer.Option( - "--no-imatrix", - help=( - "Skip importance-matrix calibration. Default uses the " - "replay corpus to calibrate k-quant quantization." - ), - ), - ] = False, - draft: Annotated[ - str | None, - typer.Option( - "--draft", - help=( - "Speculative-decoding draft model Ollama tag " - "(e.g. qwen2.5:0.5b). Default uses the registered pair " - "for this base; override here to pick a custom draft." - ), - ), - ] = None, - no_draft: Annotated[ - bool, - typer.Option( - "--no-draft", - help="Suppress PARAMETER draft_model emission even when a pair is registered.", - ), - ] = False, - skip_ollama: Annotated[ - bool, - typer.Option( - "--skip-ollama", - help="Emit GGUFs + manifest only; do not touch the Ollama binary.", - ), - ] = False, - adapter: Annotated[ - str | None, - typer.Option( - "--adapter", - help=( - "Named adapter to export. Required on multi-adapter " - "documents; rejected on single-adapter documents." - ), - ), - ] = None, - adapter_mix: Annotated[ - str | None, - typer.Option( - "--adapter-mix", - help=( - "Weighted composition of named adapters, e.g. " - "`knowledge:1.0,tone:0.5`. 
Mutually exclusive with --adapter. " - "Multi-adapter docs only. LoRA-only; QLoRA requires " - "--dequantize." - ), - ), - ] = None, - adapter_mix_method: Annotated[ - str, - typer.Option( - "--adapter-mix-method", - help=( - "PEFT combination strategy for --adapter-mix. `linear` " - "(default) sums LoRA deltas; `svd` recomposes via SVD " - "(higher fidelity, heavier compute). Only meaningful " - "with --adapter-mix." - ), - ), - ] = "linear", - verbose: Annotated[ - bool, - typer.Option("--verbose", help="Log each subprocess command as it launches."), - ] = False, - emit_sway_json: Annotated[ - bool, - typer.Option( - "--emit-sway-json", - help=( - "After the export, also write a ready-to-run sway.yaml " - "(via dlm-sway autogen) into the export dir. Requires the " - "[sway] extra: pip install 'dlm[sway]'." - ), - ), - ] = False, -) -> None: - """Export the adapter to a runtime target.""" - - from rich.console import Console - - from dlm.base_models import GatedModelError, download_spec - from dlm.base_models import resolve as resolve_base_model - from dlm.doc.parser import parse_file - from dlm.export import ( - ExportError, - PreflightError, - SubprocessError, - UnknownExportTargetError, - UnsafeMergeError, - VendoringError, - resolve_export_plan, - run_export, - ) - from dlm.export.ollama import ( - OllamaBinaryNotFoundError, - OllamaCreateError, - OllamaError, - OllamaSmokeError, - OllamaVersionError, - ) - from dlm.export.quantize import run_checked - from dlm.export.targets import ( - finalize_mlx_serve_export, - finalize_vllm_export, - prepare_llama_server_export, - prepare_mlx_serve_export, - prepare_vllm_export, - resolve_target, - ) - from dlm.store.paths import for_dlm - - console = Console(stderr=True) - - if draft is not None and no_draft: - console.print("[red]error:[/red] --draft and --no-draft are mutually exclusive; pick one.") - raise typer.Exit(code=2) - if adapter is not None and adapter_mix is not None: - console.print( - "[red]export:[/red] 
--adapter and --adapter-mix are mutually exclusive; pick one." - ) - raise typer.Exit(code=2) - try: - resolved_target = resolve_target(target) - except UnknownExportTargetError as exc: - console.print(f"[red]export:[/red] {exc}") - raise typer.Exit(code=2) from exc - parsed = parse_file(path) - adapters_declared = parsed.frontmatter.training.adapters - if adapter is not None: - if adapters_declared is None: - console.print( - "[red]export:[/red] --adapter is only valid on multi-adapter " - "documents (this doc does not declare `training.adapters`)." - ) - raise typer.Exit(code=2) - if adapter not in adapters_declared: - declared = sorted(adapters_declared) - console.print( - f"[red]export:[/red] --adapter {adapter!r} is not declared (declared: {declared})." - ) - raise typer.Exit(code=2) - - mix_entries: list[tuple[str, float]] | None = None - if adapter_mix is not None: - from dlm.export.weighted_merge import ( - InvalidMixSpecError, - parse_mix_spec, - validate_mix_against_declared, - ) - - if adapters_declared is None: - console.print( - "[red]export:[/red] --adapter-mix is only valid on multi-adapter " - "documents (this doc does not declare `training.adapters`)." - ) - raise typer.Exit(code=2) - if adapter_mix_method not in ("linear", "svd"): - console.print( - f"[red]export:[/red] --adapter-mix-method must be " - f"`linear` or `svd`, got {adapter_mix_method!r}." - ) - raise typer.Exit(code=2) - try: - entries = parse_mix_spec(adapter_mix) - validate_mix_against_declared(entries, set(adapters_declared)) - except InvalidMixSpecError as exc: - console.print(f"[red]export:[/red] {exc}") - raise typer.Exit(code=2) from exc - mix_entries = [(e.name, e.weight) for e in entries] - - store = for_dlm(parsed.frontmatter.dlm_id) - - # Gate-driven static mix: when the doc has an enabled gate and the - # user didn't pass --adapter-mix / --adapter, freeze the learned - # gate to per-adapter weights for the GGUF export path. 
Dynamic - # routing only lives in the `dlm prompt` flow; the runtime can't - # evaluate the torch gate, so we substitute the prior here. A CLI - # --adapter-mix wins — users who know what they want get full - # control. - if mix_entries is None and adapter is None: - from dlm.export.gate_fallback import resolve_and_announce - - resolution = resolve_and_announce(store, parsed) - if resolution.entries is not None: - mix_entries = resolution.entries - for line in resolution.banner_lines: - console.print(line) - - already_accepted = _previously_accepted(store.manifest) - try: - spec = resolve_base_model(parsed.frontmatter.base_model, accept_license=already_accepted) - except GatedModelError as exc: - console.print(f"[red]license:[/red] base model {parsed.frontmatter.base_model!r} is gated.") - if exc.license_url: - console.print(f" review the license at: {exc.license_url}") - console.print(" accept via `dlm train --i-accept-license` before exporting.") - raise typer.Exit(code=1) from exc - - # Audio bases take HF-snapshot unconditionally — llama.cpp has no - # audio-arch roadmap at our pinned tag — so branch early without - # resolving a GGUF plan. - from dlm.modality import modality_for - - export_dispatch = modality_for(spec) - if resolved_target.name == "vllm" and export_dispatch.accepts_audio: - console.print( - "[red]export:[/red] --target vllm is not wired for audio-language " - "documents yet; the current vllm export path only supports text bases." - ) - raise typer.Exit(code=2) - if resolved_target.name == "mlx-serve" and export_dispatch.accepts_audio: - console.print( - "[red]export:[/red] --target mlx-serve is not wired for audio-language " - "documents yet; the current mlx-serve export path only supports text bases." 
- ) - raise typer.Exit(code=2) - if export_dispatch.accepts_audio: - try: - dispatch_result = export_dispatch.dispatch_export( - store=store, - spec=spec, - adapter_name=adapter, - quant=quant, - merged=merged, - adapter_mix_raw=adapter_mix, - ) - except ExportError as exc: - console.print(f"[red]export:[/red] {exc}") - raise typer.Exit(code=1) from exc - assert dispatch_result is not None # audio modality always returns a result - for line in dispatch_result.banner_lines: - console.print(line) - return - - try: - plan = resolve_export_plan( - cli_quant=quant, - cli_merged=merged, - cli_dequantize=dequantize, - cli_no_template=no_template, - cli_ollama_name=name, - cli_no_imatrix=no_imatrix, - frontmatter_default_quant=parsed.frontmatter.export.default_quant, - ) - except ValueError as exc: - console.print(f"[red]export:[/red] {exc}") - raise typer.Exit(code=2) from exc - - store.ensure_layout() - - # VL bases: arch-probe + try single-file GGUF on SUPPORTED (with - # fallback to HF-snapshot on refusal or subprocess failure). A - # missing local base snapshot should not hard-fail the whole - # export — the dispatcher can still emit the HF-snapshot path - # without GGUF context. - if resolved_target.name == "vllm" and export_dispatch.accepts_images: - console.print( - "[red]export:[/red] --target vllm is not wired for vision-language " - "documents yet; the current vllm export path only supports text bases." - ) - raise typer.Exit(code=2) - if resolved_target.name == "mlx-serve" and export_dispatch.accepts_images: - console.print( - "[red]export:[/red] --target mlx-serve is not wired for vision-language " - "documents yet; the current mlx-serve export path only supports text bases." 
- ) - raise typer.Exit(code=2) - if export_dispatch.accepts_images: - gguf_emission_context = None - try: - cached_vl = download_spec(spec, local_files_only=True) - except RuntimeError as exc: - _ = exc - else: - gguf_emission_context = { - "plan": plan, - "cached_base_dir": cached_vl.path, - "source_dlm_path": path.resolve(), - "training_sequence_len": parsed.frontmatter.training.sequence_len, - "dlm_version": f"v{parsed.frontmatter.dlm_version}", - } - try: - dispatch_result = export_dispatch.dispatch_export( - store=store, - spec=spec, - adapter_name=adapter, - quant=quant, - merged=merged, - adapter_mix_raw=adapter_mix, - gguf_emission_context=gguf_emission_context, - ) - except ExportError as exc: - console.print(f"[red]export:[/red] {exc}") - raise typer.Exit(code=1) from exc - assert dispatch_result is not None # VL modality always returns a result - for line in dispatch_result.banner_lines: - console.print(line) - return - - try: - cached = download_spec(spec, local_files_only=True) - except RuntimeError as exc: - console.print( - f"[red]export:[/red] base model not in local cache — run `dlm train` first.\n {exc}" - ) - raise typer.Exit(code=1) from exc - - def _verbose_runner(cmd: Sequence[str]) -> object: - console.print(f"[dim]$ {' '.join(cmd)}[/dim]") - return run_checked(cmd) - - adapter_path_override = None - if mix_entries is not None: # pragma: no cover - heavy path - # Build the weighted-merged adapter into an ephemeral dir, - # then feed the path to run_export as an override. The tmp - # dir lives under the store's cache/ so it cleans up with - # the rest of the store on `dlm pack`. 
- from dlm.export.weighted_merge import MixEntry, build_and_stage - - entries_typed = [MixEntry(name=n, weight=w) for (n, w) in mix_entries] - adapter_path_override = build_and_stage( - store=store, - spec=spec, - cached_base_dir=cached.path, - entries=entries_typed, - combination_type=adapter_mix_method, # type: ignore[arg-type] - ) - - if resolved_target.name == "vllm": - ignored_flags: list[str] = [] - if quant is not None: - ignored_flags.append("--quant") - if merged: - ignored_flags.append("--merged") - if dequantize: - ignored_flags.append("--dequantize") - if no_template: - ignored_flags.append("--no-template") - if skip_ollama: - ignored_flags.append("--skip-ollama") - if no_imatrix: - ignored_flags.append("--no-imatrix") - if draft is not None: - ignored_flags.append("--draft") - if no_draft: - ignored_flags.append("--no-draft") - if ignored_flags: - console.print( - "[yellow]export:[/yellow] ignoring flags not applicable to " - f"`--target vllm`: {', '.join(ignored_flags)}" - ) - - declared_adapter_names = tuple(adapters_declared.keys()) if adapters_declared else None - try: - vllm_result = prepare_vllm_export( - store=store, - spec=spec, - served_model_name=name or f"dlm-{parsed.frontmatter.dlm_id.lower()}", - training_sequence_len=parsed.frontmatter.training.sequence_len, - adapter_name=adapter, - adapter_path_override=adapter_path_override, - declared_adapter_names=declared_adapter_names, - ) - except ExportError as exc: - console.print(f"[red]export:[/red] {exc}") - raise typer.Exit(code=1) from exc - - vllm_smoke = None if no_smoke else resolved_target.smoke_test(vllm_result) - if vllm_smoke is not None and not vllm_smoke.ok: - console.print( - f"[red]smoke:[/red] {vllm_smoke.detail}\n" - " re-run with `--no-smoke` to skip the smoke test." 
- ) - raise typer.Exit(code=1) - - manifest_path = finalize_vllm_export( - store=store, - spec=spec, - prepared=vllm_result, - smoke_output_first_line=None if vllm_smoke is None else vllm_smoke.detail, - adapter_name=adapter, - adapter_mix=mix_entries, - ) - console.print(f"[green]exported:[/green] {vllm_result.export_dir}") - console.print("target: vllm") - assert vllm_result.launch_script_path is not None - assert vllm_result.config_path is not None - console.print(f"launch: {vllm_result.launch_script_path.name}") - console.print(f"config: {vllm_result.config_path.name}") - console.print(f"manifest: {manifest_path.name}") - if vllm_smoke is not None and vllm_smoke.detail: - console.print(f"smoke: {vllm_smoke.detail}") - return - - if resolved_target.name == "mlx-serve": - mlx_ignored_flags: list[str] = [] - if quant is not None: - mlx_ignored_flags.append("--quant") - if merged: - mlx_ignored_flags.append("--merged") - if dequantize: - mlx_ignored_flags.append("--dequantize") - if name is not None: - mlx_ignored_flags.append("--name") - if no_template: - mlx_ignored_flags.append("--no-template") - if skip_ollama: - mlx_ignored_flags.append("--skip-ollama") - if no_imatrix: - mlx_ignored_flags.append("--no-imatrix") - if draft is not None: - mlx_ignored_flags.append("--draft") - if no_draft: - mlx_ignored_flags.append("--no-draft") - if mlx_ignored_flags: - console.print( - "[yellow]export:[/yellow] ignoring flags not applicable to " - f"`--target mlx-serve`: {', '.join(mlx_ignored_flags)}" - ) - - declared_adapter_names = tuple(adapters_declared.keys()) if adapters_declared else None - try: - mlx_serve_result = prepare_mlx_serve_export( - store=store, - spec=spec, - adapter_name=adapter, - adapter_path_override=adapter_path_override, - declared_adapter_names=declared_adapter_names, - ) - except ExportError as exc: - console.print(f"[red]export:[/red] {exc}") - raise typer.Exit(code=1) from exc - - mlx_serve_smoke = None if no_smoke else 
resolved_target.smoke_test(mlx_serve_result) - if mlx_serve_smoke is not None and not mlx_serve_smoke.ok: - console.print( - f"[red]smoke:[/red] {mlx_serve_smoke.detail}\n" - " re-run with `--no-smoke` to skip the smoke test." - ) - raise typer.Exit(code=1) - - manifest_path = finalize_mlx_serve_export( - store=store, - spec=spec, - prepared=mlx_serve_result, - smoke_output_first_line=None if mlx_serve_smoke is None else mlx_serve_smoke.detail, - adapter_name=adapter, - adapter_mix=mix_entries, - ) - console.print(f"[green]exported:[/green] {mlx_serve_result.export_dir}") - console.print("target: mlx-serve") - assert mlx_serve_result.launch_script_path is not None - console.print(f"launch: {mlx_serve_result.launch_script_path.name}") - console.print(f"manifest: {manifest_path.name}") - if mlx_serve_smoke is not None and mlx_serve_smoke.detail: - console.print(f"smoke: {mlx_serve_smoke.detail}") - return - - try: - result = run_export( - store, - spec, - plan, - target=resolved_target.name, - cached_base_dir=cached.path, - subprocess_runner=_verbose_runner if verbose else None, - skip_ollama=skip_ollama or resolved_target.name != "ollama", - skip_smoke=no_smoke, - source_dlm_path=path.resolve(), - training_sequence_len=parsed.frontmatter.training.sequence_len, - override_temperature=parsed.frontmatter.export.default_temperature, - override_top_p=parsed.frontmatter.export.default_top_p, - draft_override=draft, - draft_disabled=no_draft, - adapter_name=adapter, - adapter_path_override=adapter_path_override, - adapter_mix=mix_entries, - ) - except UnsafeMergeError as exc: - console.print(f"[red]merge:[/red] {exc}") - raise typer.Exit(code=1) from exc - except VendoringError as exc: - console.print( - f"[red]vendor:[/red] {exc}\n" - " run `scripts/bump-llama-cpp.sh build` or " - "`git submodule update --init --recursive`." 
- ) - raise typer.Exit(code=1) from exc - except PreflightError as exc: - console.print(f"[red]preflight[{exc.probe}]:[/red] {exc.detail}") - raise typer.Exit(code=1) from exc - except SubprocessError as exc: - console.print(f"[red]subprocess:[/red] {exc}") - raise typer.Exit(code=1) from exc - except OllamaBinaryNotFoundError as exc: - console.print( - f"[red]ollama:[/red] {exc}\n" - " install from https://ollama.com/download " - "or re-run with `--skip-ollama`." - ) - raise typer.Exit(code=1) from exc - except OllamaVersionError as exc: - console.print(f"[red]ollama:[/red] {exc}") - raise typer.Exit(code=1) from exc - except OllamaCreateError as exc: - console.print(f"[red]ollama create:[/red] {exc}") - raise typer.Exit(code=1) from exc - except OllamaSmokeError as exc: - console.print( - f"[red]smoke:[/red] {exc}\n re-run with `--no-smoke` to skip the smoke test." - ) - raise typer.Exit(code=1) from exc - except OllamaError as exc: - console.print(f"[red]ollama:[/red] {exc}") - raise typer.Exit(code=1) from exc - except ExportError as exc: - console.print(f"[red]export:[/red] {exc}") - raise typer.Exit(code=1) from exc - - if resolved_target.name == "llama-server": - adapter_dir = adapter_path_override - if adapter_dir is None: - if adapter is None: - adapter_dir = store.resolve_current_adapter() - else: - adapter_dir = store.resolve_current_adapter_for(adapter) - assert adapter_dir is not None - try: - llama_server_result = prepare_llama_server_export( - export_dir=result.export_dir, - manifest_path=result.manifest_path, - artifacts=result.artifacts, - adapter_dir=adapter_dir, - spec=spec, - training_sequence_len=parsed.frontmatter.training.sequence_len, - ) - except VendoringError as exc: - console.print( - f"[red]vendor:[/red] {exc}\n" - " run `scripts/bump-llama-cpp.sh build --with-server` or " - "`git submodule update --init --recursive`." 
- ) - raise typer.Exit(code=1) from exc - except ExportError as exc: - console.print(f"[red]export:[/red] {exc}") - raise typer.Exit(code=1) from exc - llama_server_smoke = None if no_smoke else resolved_target.smoke_test(llama_server_result) - if llama_server_smoke is not None and not llama_server_smoke.ok: - console.print( - f"[red]smoke:[/red] {llama_server_smoke.detail}\n" - " re-run with `--no-smoke` to skip the smoke test." - ) - raise typer.Exit(code=1) - - cached_tag = " [dim](cached base)[/dim]" if result.cached else "" - console.print(f"[green]exported:[/green] {result.export_dir}{cached_tag}") - for artifact in result.artifacts: - console.print(f" {artifact.name}") - - # S26 X1 — also emit a sway.yaml next to the GGUF when the user - # asks for it. Done AFTER the regular export so a sway-side - # failure can never roll back a working GGUF deployment. - if emit_sway_json: - from dlm.export.sway_json import SwayJsonExportError, write_sway_json - - try: - sway_yaml_path = write_sway_json(path, result.export_dir) - except SwayJsonExportError as exc: - console.print(f"[red]sway-json:[/red] {exc}") - raise typer.Exit(code=1) from exc - console.print(f"[green]sway.yaml:[/green] {sway_yaml_path}") - console.print(" next: sway run " + str(sway_yaml_path)) - if resolved_target.name == "llama-server": - assert llama_server_result.launch_script_path is not None - assert llama_server_result.config_path is not None - console.print(f"target: {result.target}") - console.print(f"launch: {llama_server_result.launch_script_path.name}") - console.print(f"template: {llama_server_result.config_path.name}") - if llama_server_smoke is not None and llama_server_smoke.detail: - console.print(f"smoke: {llama_server_smoke.detail}") - return - if result.ollama_name: - console.print(f"ollama: {result.ollama_name} (v{result.ollama_version})") - if result.smoke_output_first_line: - console.print(f"smoke: {result.smoke_output_first_line}") - - diff --git a/src/dlm/cli/commands/export.py 
b/src/dlm/cli/commands/export.py new file mode 100644 index 0000000..2ef6b41 --- /dev/null +++ b/src/dlm/cli/commands/export.py @@ -0,0 +1,640 @@ +"""`dlm export` — export the adapter to a runtime target.""" + +from __future__ import annotations + +from collections.abc import Sequence +from pathlib import Path +from typing import Annotated + +import typer + +from dlm.cli.commands._shared import _previously_accepted + + +def export_cmd( + path: Annotated[Path, typer.Argument(help=".dlm file to export.")], + target: Annotated[ + str, + typer.Option( + "--target", + help="Export destination. Currently supported: ollama, llama-server, vllm, mlx-serve.", + ), + ] = "ollama", + quant: Annotated[ + str | None, + typer.Option("--quant", help="GGUF quant level (defaults to frontmatter)."), + ] = None, + merged: Annotated[ + bool, + typer.Option("--merged", help="Merge the adapter into the base before export."), + ] = False, + dequantize: Annotated[ + bool, + typer.Option( + "--dequantize", + help="Dequantize a QLoRA base to fp16 before merging.", + ), + ] = False, + name: Annotated[str | None, typer.Option("--name", help="Ollama model name.")] = None, + no_template: Annotated[ + bool, + typer.Option("--no-template", help="Skip writing TEMPLATE into the Modelfile."), + ] = False, + no_smoke: Annotated[ + bool, + typer.Option("--no-smoke", help="Register the export but skip the smoke prompt."), + ] = False, + no_imatrix: Annotated[ + bool, + typer.Option( + "--no-imatrix", + help=( + "Skip importance-matrix calibration. Default uses the " + "replay corpus to calibrate k-quant quantization." + ), + ), + ] = False, + draft: Annotated[ + str | None, + typer.Option( + "--draft", + help=( + "Speculative-decoding draft model Ollama tag " + "(e.g. qwen2.5:0.5b). Default uses the registered pair " + "for this base; override here to pick a custom draft." 
+ ), + ), + ] = None, + no_draft: Annotated[ + bool, + typer.Option( + "--no-draft", + help="Suppress PARAMETER draft_model emission even when a pair is registered.", + ), + ] = False, + skip_ollama: Annotated[ + bool, + typer.Option( + "--skip-ollama", + help="Emit GGUFs + manifest only; do not touch the Ollama binary.", + ), + ] = False, + adapter: Annotated[ + str | None, + typer.Option( + "--adapter", + help=( + "Named adapter to export. Required on multi-adapter " + "documents; rejected on single-adapter documents." + ), + ), + ] = None, + adapter_mix: Annotated[ + str | None, + typer.Option( + "--adapter-mix", + help=( + "Weighted composition of named adapters, e.g. " + "`knowledge:1.0,tone:0.5`. Mutually exclusive with --adapter. " + "Multi-adapter docs only. LoRA-only; QLoRA requires " + "--dequantize." + ), + ), + ] = None, + adapter_mix_method: Annotated[ + str, + typer.Option( + "--adapter-mix-method", + help=( + "PEFT combination strategy for --adapter-mix. `linear` " + "(default) sums LoRA deltas; `svd` recomposes via SVD " + "(higher fidelity, heavier compute). Only meaningful " + "with --adapter-mix." + ), + ), + ] = "linear", + verbose: Annotated[ + bool, + typer.Option("--verbose", help="Log each subprocess command as it launches."), + ] = False, + emit_sway_json: Annotated[ + bool, + typer.Option( + "--emit-sway-json", + help=( + "After the export, also write a ready-to-run sway.yaml " + "(via dlm-sway autogen) into the export dir. Requires the " + "[sway] extra: pip install 'dlm[sway]'." 
+ ), + ), + ] = False, +) -> None: + """Export the adapter to a runtime target.""" + + from rich.console import Console + + from dlm.base_models import GatedModelError, download_spec + from dlm.base_models import resolve as resolve_base_model + from dlm.doc.parser import parse_file + from dlm.export import ( + ExportError, + PreflightError, + SubprocessError, + UnknownExportTargetError, + UnsafeMergeError, + VendoringError, + resolve_export_plan, + run_export, + ) + from dlm.export.ollama import ( + OllamaBinaryNotFoundError, + OllamaCreateError, + OllamaError, + OllamaSmokeError, + OllamaVersionError, + ) + from dlm.export.quantize import run_checked + from dlm.export.targets import ( + finalize_mlx_serve_export, + finalize_vllm_export, + prepare_llama_server_export, + prepare_mlx_serve_export, + prepare_vllm_export, + resolve_target, + ) + from dlm.store.paths import for_dlm + + console = Console(stderr=True) + + if draft is not None and no_draft: + console.print("[red]error:[/red] --draft and --no-draft are mutually exclusive; pick one.") + raise typer.Exit(code=2) + if adapter is not None and adapter_mix is not None: + console.print( + "[red]export:[/red] --adapter and --adapter-mix are mutually exclusive; pick one." + ) + raise typer.Exit(code=2) + try: + resolved_target = resolve_target(target) + except UnknownExportTargetError as exc: + console.print(f"[red]export:[/red] {exc}") + raise typer.Exit(code=2) from exc + parsed = parse_file(path) + adapters_declared = parsed.frontmatter.training.adapters + if adapter is not None: + if adapters_declared is None: + console.print( + "[red]export:[/red] --adapter is only valid on multi-adapter " + "documents (this doc does not declare `training.adapters`)." + ) + raise typer.Exit(code=2) + if adapter not in adapters_declared: + declared = sorted(adapters_declared) + console.print( + f"[red]export:[/red] --adapter {adapter!r} is not declared (declared: {declared})." 
+ ) + raise typer.Exit(code=2) + + mix_entries: list[tuple[str, float]] | None = None + if adapter_mix is not None: + from dlm.export.weighted_merge import ( + InvalidMixSpecError, + parse_mix_spec, + validate_mix_against_declared, + ) + + if adapters_declared is None: + console.print( + "[red]export:[/red] --adapter-mix is only valid on multi-adapter " + "documents (this doc does not declare `training.adapters`)." + ) + raise typer.Exit(code=2) + if adapter_mix_method not in ("linear", "svd"): + console.print( + f"[red]export:[/red] --adapter-mix-method must be " + f"`linear` or `svd`, got {adapter_mix_method!r}." + ) + raise typer.Exit(code=2) + try: + entries = parse_mix_spec(adapter_mix) + validate_mix_against_declared(entries, set(adapters_declared)) + except InvalidMixSpecError as exc: + console.print(f"[red]export:[/red] {exc}") + raise typer.Exit(code=2) from exc + mix_entries = [(e.name, e.weight) for e in entries] + + store = for_dlm(parsed.frontmatter.dlm_id) + + # Gate-driven static mix: when the doc has an enabled gate and the + # user didn't pass --adapter-mix / --adapter, freeze the learned + # gate to per-adapter weights for the GGUF export path. Dynamic + # routing only lives in the `dlm prompt` flow; the runtime can't + # evaluate the torch gate, so we substitute the prior here. A CLI + # --adapter-mix wins — users who know what they want get full + # control. 
+ if mix_entries is None and adapter is None: + from dlm.export.gate_fallback import resolve_and_announce + + resolution = resolve_and_announce(store, parsed) + if resolution.entries is not None: + mix_entries = resolution.entries + for line in resolution.banner_lines: + console.print(line) + + already_accepted = _previously_accepted(store.manifest) + try: + spec = resolve_base_model(parsed.frontmatter.base_model, accept_license=already_accepted) + except GatedModelError as exc: + console.print(f"[red]license:[/red] base model {parsed.frontmatter.base_model!r} is gated.") + if exc.license_url: + console.print(f" review the license at: {exc.license_url}") + console.print(" accept via `dlm train --i-accept-license` before exporting.") + raise typer.Exit(code=1) from exc + + # Audio bases take HF-snapshot unconditionally — llama.cpp has no + # audio-arch roadmap at our pinned tag — so branch early without + # resolving a GGUF plan. + from dlm.modality import modality_for + + export_dispatch = modality_for(spec) + if resolved_target.name == "vllm" and export_dispatch.accepts_audio: + console.print( + "[red]export:[/red] --target vllm is not wired for audio-language " + "documents yet; the current vllm export path only supports text bases." + ) + raise typer.Exit(code=2) + if resolved_target.name == "mlx-serve" and export_dispatch.accepts_audio: + console.print( + "[red]export:[/red] --target mlx-serve is not wired for audio-language " + "documents yet; the current mlx-serve export path only supports text bases." 
+ ) + raise typer.Exit(code=2) + if export_dispatch.accepts_audio: + try: + dispatch_result = export_dispatch.dispatch_export( + store=store, + spec=spec, + adapter_name=adapter, + quant=quant, + merged=merged, + adapter_mix_raw=adapter_mix, + ) + except ExportError as exc: + console.print(f"[red]export:[/red] {exc}") + raise typer.Exit(code=1) from exc + assert dispatch_result is not None # audio modality always returns a result + for line in dispatch_result.banner_lines: + console.print(line) + return + + try: + plan = resolve_export_plan( + cli_quant=quant, + cli_merged=merged, + cli_dequantize=dequantize, + cli_no_template=no_template, + cli_ollama_name=name, + cli_no_imatrix=no_imatrix, + frontmatter_default_quant=parsed.frontmatter.export.default_quant, + ) + except ValueError as exc: + console.print(f"[red]export:[/red] {exc}") + raise typer.Exit(code=2) from exc + + store.ensure_layout() + + # VL bases: arch-probe + try single-file GGUF on SUPPORTED (with + # fallback to HF-snapshot on refusal or subprocess failure). A + # missing local base snapshot should not hard-fail the whole + # export — the dispatcher can still emit the HF-snapshot path + # without GGUF context. + if resolved_target.name == "vllm" and export_dispatch.accepts_images: + console.print( + "[red]export:[/red] --target vllm is not wired for vision-language " + "documents yet; the current vllm export path only supports text bases." + ) + raise typer.Exit(code=2) + if resolved_target.name == "mlx-serve" and export_dispatch.accepts_images: + console.print( + "[red]export:[/red] --target mlx-serve is not wired for vision-language " + "documents yet; the current mlx-serve export path only supports text bases." 
+ ) + raise typer.Exit(code=2) + if export_dispatch.accepts_images: + gguf_emission_context = None + try: + cached_vl = download_spec(spec, local_files_only=True) + except RuntimeError as exc: + _ = exc + else: + gguf_emission_context = { + "plan": plan, + "cached_base_dir": cached_vl.path, + "source_dlm_path": path.resolve(), + "training_sequence_len": parsed.frontmatter.training.sequence_len, + "dlm_version": f"v{parsed.frontmatter.dlm_version}", + } + try: + dispatch_result = export_dispatch.dispatch_export( + store=store, + spec=spec, + adapter_name=adapter, + quant=quant, + merged=merged, + adapter_mix_raw=adapter_mix, + gguf_emission_context=gguf_emission_context, + ) + except ExportError as exc: + console.print(f"[red]export:[/red] {exc}") + raise typer.Exit(code=1) from exc + assert dispatch_result is not None # VL modality always returns a result + for line in dispatch_result.banner_lines: + console.print(line) + return + + try: + cached = download_spec(spec, local_files_only=True) + except RuntimeError as exc: + console.print( + f"[red]export:[/red] base model not in local cache — run `dlm train` first.\n {exc}" + ) + raise typer.Exit(code=1) from exc + + def _verbose_runner(cmd: Sequence[str]) -> object: + console.print(f"[dim]$ {' '.join(cmd)}[/dim]") + return run_checked(cmd) + + adapter_path_override = None + if mix_entries is not None: # pragma: no cover - heavy path + # Build the weighted-merged adapter into an ephemeral dir, + # then feed the path to run_export as an override. The tmp + # dir lives under the store's cache/ so it cleans up with + # the rest of the store on `dlm pack`. 
+ from dlm.export.weighted_merge import MixEntry, build_and_stage + + entries_typed = [MixEntry(name=n, weight=w) for (n, w) in mix_entries] + adapter_path_override = build_and_stage( + store=store, + spec=spec, + cached_base_dir=cached.path, + entries=entries_typed, + combination_type=adapter_mix_method, # type: ignore[arg-type] + ) + + if resolved_target.name == "vllm": + ignored_flags: list[str] = [] + if quant is not None: + ignored_flags.append("--quant") + if merged: + ignored_flags.append("--merged") + if dequantize: + ignored_flags.append("--dequantize") + if no_template: + ignored_flags.append("--no-template") + if skip_ollama: + ignored_flags.append("--skip-ollama") + if no_imatrix: + ignored_flags.append("--no-imatrix") + if draft is not None: + ignored_flags.append("--draft") + if no_draft: + ignored_flags.append("--no-draft") + if ignored_flags: + console.print( + "[yellow]export:[/yellow] ignoring flags not applicable to " + f"`--target vllm`: {', '.join(ignored_flags)}" + ) + + declared_adapter_names = tuple(adapters_declared.keys()) if adapters_declared else None + try: + vllm_result = prepare_vllm_export( + store=store, + spec=spec, + served_model_name=name or f"dlm-{parsed.frontmatter.dlm_id.lower()}", + training_sequence_len=parsed.frontmatter.training.sequence_len, + adapter_name=adapter, + adapter_path_override=adapter_path_override, + declared_adapter_names=declared_adapter_names, + ) + except ExportError as exc: + console.print(f"[red]export:[/red] {exc}") + raise typer.Exit(code=1) from exc + + vllm_smoke = None if no_smoke else resolved_target.smoke_test(vllm_result) + if vllm_smoke is not None and not vllm_smoke.ok: + console.print( + f"[red]smoke:[/red] {vllm_smoke.detail}\n" + " re-run with `--no-smoke` to skip the smoke test." 
+ ) + raise typer.Exit(code=1) + + manifest_path = finalize_vllm_export( + store=store, + spec=spec, + prepared=vllm_result, + smoke_output_first_line=None if vllm_smoke is None else vllm_smoke.detail, + adapter_name=adapter, + adapter_mix=mix_entries, + ) + console.print(f"[green]exported:[/green] {vllm_result.export_dir}") + console.print("target: vllm") + assert vllm_result.launch_script_path is not None + assert vllm_result.config_path is not None + console.print(f"launch: {vllm_result.launch_script_path.name}") + console.print(f"config: {vllm_result.config_path.name}") + console.print(f"manifest: {manifest_path.name}") + if vllm_smoke is not None and vllm_smoke.detail: + console.print(f"smoke: {vllm_smoke.detail}") + return + + if resolved_target.name == "mlx-serve": + mlx_ignored_flags: list[str] = [] + if quant is not None: + mlx_ignored_flags.append("--quant") + if merged: + mlx_ignored_flags.append("--merged") + if dequantize: + mlx_ignored_flags.append("--dequantize") + if name is not None: + mlx_ignored_flags.append("--name") + if no_template: + mlx_ignored_flags.append("--no-template") + if skip_ollama: + mlx_ignored_flags.append("--skip-ollama") + if no_imatrix: + mlx_ignored_flags.append("--no-imatrix") + if draft is not None: + mlx_ignored_flags.append("--draft") + if no_draft: + mlx_ignored_flags.append("--no-draft") + if mlx_ignored_flags: + console.print( + "[yellow]export:[/yellow] ignoring flags not applicable to " + f"`--target mlx-serve`: {', '.join(mlx_ignored_flags)}" + ) + + declared_adapter_names = tuple(adapters_declared.keys()) if adapters_declared else None + try: + mlx_serve_result = prepare_mlx_serve_export( + store=store, + spec=spec, + adapter_name=adapter, + adapter_path_override=adapter_path_override, + declared_adapter_names=declared_adapter_names, + ) + except ExportError as exc: + console.print(f"[red]export:[/red] {exc}") + raise typer.Exit(code=1) from exc + + mlx_serve_smoke = None if no_smoke else 
resolved_target.smoke_test(mlx_serve_result) + if mlx_serve_smoke is not None and not mlx_serve_smoke.ok: + console.print( + f"[red]smoke:[/red] {mlx_serve_smoke.detail}\n" + " re-run with `--no-smoke` to skip the smoke test." + ) + raise typer.Exit(code=1) + + manifest_path = finalize_mlx_serve_export( + store=store, + spec=spec, + prepared=mlx_serve_result, + smoke_output_first_line=None if mlx_serve_smoke is None else mlx_serve_smoke.detail, + adapter_name=adapter, + adapter_mix=mix_entries, + ) + console.print(f"[green]exported:[/green] {mlx_serve_result.export_dir}") + console.print("target: mlx-serve") + assert mlx_serve_result.launch_script_path is not None + console.print(f"launch: {mlx_serve_result.launch_script_path.name}") + console.print(f"manifest: {manifest_path.name}") + if mlx_serve_smoke is not None and mlx_serve_smoke.detail: + console.print(f"smoke: {mlx_serve_smoke.detail}") + return + + try: + result = run_export( + store, + spec, + plan, + target=resolved_target.name, + cached_base_dir=cached.path, + subprocess_runner=_verbose_runner if verbose else None, + skip_ollama=skip_ollama or resolved_target.name != "ollama", + skip_smoke=no_smoke, + source_dlm_path=path.resolve(), + training_sequence_len=parsed.frontmatter.training.sequence_len, + override_temperature=parsed.frontmatter.export.default_temperature, + override_top_p=parsed.frontmatter.export.default_top_p, + draft_override=draft, + draft_disabled=no_draft, + adapter_name=adapter, + adapter_path_override=adapter_path_override, + adapter_mix=mix_entries, + ) + except UnsafeMergeError as exc: + console.print(f"[red]merge:[/red] {exc}") + raise typer.Exit(code=1) from exc + except VendoringError as exc: + console.print( + f"[red]vendor:[/red] {exc}\n" + " run `scripts/bump-llama-cpp.sh build` or " + "`git submodule update --init --recursive`." 
+ ) + raise typer.Exit(code=1) from exc + except PreflightError as exc: + console.print(f"[red]preflight[{exc.probe}]:[/red] {exc.detail}") + raise typer.Exit(code=1) from exc + except SubprocessError as exc: + console.print(f"[red]subprocess:[/red] {exc}") + raise typer.Exit(code=1) from exc + except OllamaBinaryNotFoundError as exc: + console.print( + f"[red]ollama:[/red] {exc}\n" + " install from https://ollama.com/download " + "or re-run with `--skip-ollama`." + ) + raise typer.Exit(code=1) from exc + except OllamaVersionError as exc: + console.print(f"[red]ollama:[/red] {exc}") + raise typer.Exit(code=1) from exc + except OllamaCreateError as exc: + console.print(f"[red]ollama create:[/red] {exc}") + raise typer.Exit(code=1) from exc + except OllamaSmokeError as exc: + console.print( + f"[red]smoke:[/red] {exc}\n re-run with `--no-smoke` to skip the smoke test." + ) + raise typer.Exit(code=1) from exc + except OllamaError as exc: + console.print(f"[red]ollama:[/red] {exc}") + raise typer.Exit(code=1) from exc + except ExportError as exc: + console.print(f"[red]export:[/red] {exc}") + raise typer.Exit(code=1) from exc + + if resolved_target.name == "llama-server": + adapter_dir = adapter_path_override + if adapter_dir is None: + if adapter is None: + adapter_dir = store.resolve_current_adapter() + else: + adapter_dir = store.resolve_current_adapter_for(adapter) + assert adapter_dir is not None + try: + llama_server_result = prepare_llama_server_export( + export_dir=result.export_dir, + manifest_path=result.manifest_path, + artifacts=result.artifacts, + adapter_dir=adapter_dir, + spec=spec, + training_sequence_len=parsed.frontmatter.training.sequence_len, + ) + except VendoringError as exc: + console.print( + f"[red]vendor:[/red] {exc}\n" + " run `scripts/bump-llama-cpp.sh build --with-server` or " + "`git submodule update --init --recursive`." 
+ ) + raise typer.Exit(code=1) from exc + except ExportError as exc: + console.print(f"[red]export:[/red] {exc}") + raise typer.Exit(code=1) from exc + llama_server_smoke = None if no_smoke else resolved_target.smoke_test(llama_server_result) + if llama_server_smoke is not None and not llama_server_smoke.ok: + console.print( + f"[red]smoke:[/red] {llama_server_smoke.detail}\n" + " re-run with `--no-smoke` to skip the smoke test." + ) + raise typer.Exit(code=1) + + cached_tag = " [dim](cached base)[/dim]" if result.cached else "" + console.print(f"[green]exported:[/green] {result.export_dir}{cached_tag}") + for artifact in result.artifacts: + console.print(f" {artifact.name}") + + # When the user opts in, also emit a sway.yaml next to the GGUF. + # Done AFTER the regular export so a sway-side failure can never + # roll back a working GGUF deployment. + if emit_sway_json: + from dlm.export.sway_json import SwayJsonExportError, write_sway_json + + try: + sway_yaml_path = write_sway_json(path, result.export_dir) + except SwayJsonExportError as exc: + console.print(f"[red]sway-json:[/red] {exc}") + raise typer.Exit(code=1) from exc + console.print(f"[green]sway.yaml:[/green] {sway_yaml_path}") + console.print(" next: sway run " + str(sway_yaml_path)) + if resolved_target.name == "llama-server": + assert llama_server_result.launch_script_path is not None + assert llama_server_result.config_path is not None + console.print(f"target: {result.target}") + console.print(f"launch: {llama_server_result.launch_script_path.name}") + console.print(f"template: {llama_server_result.config_path.name}") + if llama_server_smoke is not None and llama_server_smoke.detail: + console.print(f"smoke: {llama_server_smoke.detail}") + return + if result.ollama_name: + console.print(f"ollama: {result.ollama_name} (v{result.ollama_version})") + if result.smoke_output_first_line: + console.print(f"smoke: {result.smoke_output_first_line}") From 5bfd080a8aaa5de91adda16d26f2bb6947b028eb Mon Sep 17 
00:00:00 2001 From: mfwolffe Date: Tue, 28 Apr 2026 19:40:58 -0400 Subject: [PATCH 25/46] Extract metrics_cmd query into dlm.metrics.cli:gather_metrics --- src/dlm/cli/commands/metrics.py | 35 +++++++------ src/dlm/metrics/cli.py | 90 +++++++++++++++++++++++++++++++++ 2 files changed, 110 insertions(+), 15 deletions(-) create mode 100644 src/dlm/metrics/cli.py diff --git a/src/dlm/cli/commands/metrics.py b/src/dlm/cli/commands/metrics.py index ebbbd73..d22eb4a 100644 --- a/src/dlm/cli/commands/metrics.py +++ b/src/dlm/cli/commands/metrics.py @@ -38,14 +38,11 @@ def metrics_cmd( from rich.console import Console from dlm.doc.parser import parse_file + from dlm.metrics.cli import MetricsQuery, gather_metrics from dlm.metrics.queries import ( - evals_for_run, evals_to_dict, - preference_mining_for_run, preference_mining_to_dict, - recent_runs, runs_to_dict, - steps_for_run, steps_to_dict, ) from dlm.store.paths import for_dlm @@ -61,24 +58,31 @@ def metrics_cmd( parsed = parse_file(path) store = for_dlm(parsed.frontmatter.dlm_id) - runs = recent_runs(store.root, limit=limit, phase=phase, since=since_delta, run_id=run_id) + view = gather_metrics( + MetricsQuery( + store_root=store.root, + run_id=run_id, + phase=phase, + since=since_delta, + limit=limit, + ) + ) if run_id is not None: - # Drill-down: show this run's steps + evals. 
- if not runs: + if view.drilldown is None: console.print(f"[red]metrics:[/red] no run with run_id={run_id}") raise typer.Exit(code=1) - run = runs[0] - steps = steps_for_run(store.root, run_id) - evals = evals_for_run(store.root, run_id) - preference_rows = preference_mining_for_run(store.root, run_id) + run = view.drilldown.run + steps = view.drilldown.steps + evals = view.drilldown.evals + preference_rows = view.drilldown.preference_rows if json_out: payload = { "run": runs_to_dict([run])[0], - "steps": steps_to_dict(steps), - "evals": evals_to_dict(evals), - "preference_mining": preference_mining_to_dict(preference_rows), + "steps": steps_to_dict(list(steps)), + "evals": evals_to_dict(list(evals)), + "preference_mining": preference_mining_to_dict(list(preference_rows)), } sys.stdout.write(json.dumps(payload, indent=2) + "\n") return @@ -113,8 +117,9 @@ def metrics_cmd( return # Top-level: list runs. + runs = view.runs if json_out: - sys.stdout.write(json.dumps({"runs": runs_to_dict(runs)}, indent=2) + "\n") + sys.stdout.write(json.dumps({"runs": runs_to_dict(list(runs))}, indent=2) + "\n") return if csv_out: writer = csv.writer(sys.stdout) diff --git a/src/dlm/metrics/cli.py b/src/dlm/metrics/cli.py new file mode 100644 index 0000000..c3609e9 --- /dev/null +++ b/src/dlm/metrics/cli.py @@ -0,0 +1,90 @@ +"""Domain dispatcher for `dlm metrics`. + +Splits the per-store-metrics query off the CLI plumbing: callers (CLI, +LSP, future web UI) build a `MetricsQuery`, call `gather_metrics`, and +get a typed `MetricsView` with the rows they need to render. The CLI +owns formatting — text, JSON, CSV — and `gather_metrics` doesn't import +typer, rich, or stdout writers. + +Imports the queries module dotted (not the individual functions) so +test fixtures that patch `dlm.metrics.queries.` are visible to +this dispatcher at call time. 
+""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import TYPE_CHECKING + +from dlm.metrics import queries +from dlm.metrics.queries import EvalRow, PreferenceMineRow, RunRow, StepRow + +if TYPE_CHECKING: + from datetime import timedelta + from pathlib import Path + + +@dataclass(frozen=True) +class MetricsQuery: + """Inputs to `gather_metrics`. All fields validated upstream by the caller.""" + + store_root: Path + run_id: int | None = None + phase: str | None = None + since: timedelta | None = None + limit: int = 20 + + +@dataclass(frozen=True) +class MetricsRunDrilldown: + """Per-run metrics: the run row plus its steps / evals / preference events.""" + + run: RunRow + steps: tuple[StepRow, ...] + evals: tuple[EvalRow, ...] + preference_rows: tuple[PreferenceMineRow, ...] + + +@dataclass(frozen=True) +class MetricsView: + """Snapshot of the metrics layer for a given query. + + `drilldown` is set iff `query.run_id is not None` AND a matching run + exists. `runs` is the row list (always populated for top-level + listings; on drill-down it contains exactly the matched run when + found, empty otherwise so the CLI can detect the not-found case). + """ + + runs: tuple[RunRow, ...] 
+ drilldown: MetricsRunDrilldown | None + + +def gather_metrics(query: MetricsQuery) -> MetricsView: + """Read the per-store metrics DB and return a `MetricsView`.""" + runs = tuple( + queries.recent_runs( + query.store_root, + limit=query.limit, + phase=query.phase, + since=query.since, + run_id=query.run_id, + ) + ) + + if query.run_id is None or not runs: + return MetricsView(runs=runs, drilldown=None) + + run = runs[0] + steps = tuple(queries.steps_for_run(query.store_root, query.run_id)) + evals = tuple(queries.evals_for_run(query.store_root, query.run_id)) + preference_rows = tuple(queries.preference_mining_for_run(query.store_root, query.run_id)) + + return MetricsView( + runs=runs, + drilldown=MetricsRunDrilldown( + run=run, + steps=steps, + evals=evals, + preference_rows=preference_rows, + ), + ) From 4a930b8743bf080fd483efdd6ac280349e4de5af Mon Sep 17 00:00:00 2001 From: mfwolffe Date: Tue, 28 Apr 2026 19:46:02 -0400 Subject: [PATCH 26/46] Extract synth_instructions_cmd into dlm.synth.dispatch:run_synth_instructions --- src/dlm/cli/commands/synth.py | 102 ++++++++------------ src/dlm/synth/dispatch.py | 157 +++++++++++++++++++++++++++++++ tests/unit/cli/test_synth_cmd.py | 2 +- 3 files changed, 197 insertions(+), 64 deletions(-) create mode 100644 src/dlm/synth/dispatch.py diff --git a/src/dlm/cli/commands/synth.py b/src/dlm/cli/commands/synth.py index a00deee..53ced7e 100644 --- a/src/dlm/cli/commands/synth.py +++ b/src/dlm/cli/commands/synth.py @@ -106,28 +106,20 @@ def synth_instructions_cmd( from dlm.doc.errors import DlmParseError from dlm.doc.parser import parse_file - from dlm.preference import JudgeUnavailableError, build_judge + from dlm.preference import JudgeUnavailableError from dlm.store.paths import for_dlm from dlm.synth import ( InvalidTeacherSpecError, TeacherInvocationError, TeacherUnavailableError, - build_synth_plan, - build_teacher, - clear_pending_plan, - filter_synth_plan, + render_apply_plan, render_filter_report, 
render_synth_plan, - save_pending_plan, ) - from dlm.synth import ( - apply_plan as apply_synth_plan, - ) - from dlm.synth import ( - build_apply_plan as build_synth_apply_plan, - ) - from dlm.synth import ( - render_apply_plan as render_synth_apply_plan, + from dlm.synth.dispatch import ( + SynthInstructionsRequest, + SynthOutcome, + run_synth_instructions, ) console = Console(stderr=True) @@ -159,20 +151,26 @@ def synth_instructions_cmd( store = for_dlm(parsed.frontmatter.dlm_id) + request = SynthInstructionsRequest( + parsed=parsed, + target_path=path, + store=store, + teacher=teacher, + per_section=per_section, + strategy=cast(Literal["extraction", "expansion", "both"], strategy), + filter_kind=cast(Literal["sway", "none", "dedup-only"], filter_kind), + threshold=threshold, + max_pairs=max_pairs, + max_new_tokens=max_new_tokens, + temperature=temp, + top_p=top_p, + seed=seed, + apply=apply, + dry_run=dry_run, + ) + try: - strategy_value = cast(Literal["extraction", "expansion", "both"], strategy) - teacher_obj = build_teacher(teacher, dlm_path=path) - plan = build_synth_plan( - parsed, - teacher_obj, - per_section=per_section, - strategy=strategy_value, - max_pairs=max_pairs, - max_new_tokens=max_new_tokens, - temperature=temp, - top_p=top_p, - seed=seed, - ) + result = run_synth_instructions(request) except InvalidTeacherSpecError as exc: console.print(f"[red]synth:[/red] {exc}") raise typer.Exit(code=2) from exc @@ -182,64 +180,42 @@ def synth_instructions_cmd( except TeacherInvocationError as exc: console.print(f"[red]synth:[/red] {exc}") raise typer.Exit(code=1) from exc - except ValueError as exc: + except JudgeUnavailableError as exc: console.print(f"[red]synth:[/red] {exc}") - raise typer.Exit(code=2) from exc - - judge_obj = None - if filter_kind == "sway": - try: - judge_obj = build_judge("sway", dlm_path=path) - except JudgeUnavailableError as exc: - console.print(f"[red]synth:[/red] {exc}") - raise typer.Exit(code=1) from exc - - try: - filter_value 
= cast(Literal["sway", "none", "dedup-only"], filter_kind) - filtered = filter_synth_plan( - plan, - filter_kind=filter_value, - judge=judge_obj, - threshold=threshold, - ) + raise typer.Exit(code=1) from exc except ValueError as exc: console.print(f"[red]synth:[/red] {exc}") raise typer.Exit(code=2) from exc - out_console.print(render_synth_plan(plan)) + out_console.print(render_synth_plan(result.plan)) out_console.print("") - out_console.print(render_filter_report(filtered)) + out_console.print(render_filter_report(result.filtered_plan)) - if not filtered.additions: - if not dry_run: - clear_pending_plan(store) + if result.outcome is SynthOutcome.NO_ADDITIONS: out_console.print( "\n[yellow]no synth additions accepted[/yellow] — either generation " "yielded no valid pairs, dedup removed them, or the filter rejected them." ) raise typer.Exit(code=2) - sections = [addition.addition.section for addition in filtered.additions] - - if apply: - apply_plan = build_synth_apply_plan(parsed, sections) + if result.outcome is SynthOutcome.APPLIED: + assert result.apply_plan is not None + assert result.apply_summary is not None out_console.print("") - out_console.print(render_synth_apply_plan(apply_plan)) - summary = apply_synth_plan(parsed, apply_plan, target=path) - clear_pending_plan(store) + out_console.print(render_apply_plan(result.apply_plan)) out_console.print( - f"\n[green]synth:[/green] wrote {summary.added} section(s) to {path} " - f"({summary.skipped} skipped)" + f"\n[green]synth:[/green] wrote {result.apply_summary.added} section(s) to {path} " + f"({result.apply_summary.skipped} skipped)" ) return - if dry_run: + if result.outcome is SynthOutcome.DRY_RUN: out_console.print("\n[green]synth:[/green] dry-run only — nothing staged.") return - pending = save_pending_plan(store, source_path=path.resolve(), sections=sections) + # SynthOutcome.STAGED out_console.print( - f"\n[green]synth:[/green] staged {len(pending.sections)} auto-synth instruction " + 
f"\n[green]synth:[/green] staged {result.pending_count} auto-synth instruction " f"section(s). Run [bold]dlm synth list {path}[/bold] to inspect them." ) diff --git a/src/dlm/synth/dispatch.py b/src/dlm/synth/dispatch.py new file mode 100644 index 0000000..cc278be --- /dev/null +++ b/src/dlm/synth/dispatch.py @@ -0,0 +1,157 @@ +"""Domain dispatcher for `dlm synth instructions`. + +Lifts the build-teacher → build-plan → filter → stage/apply pipeline +out of the CLI. Callers (CLI, LSP, future automation) build a +`SynthInstructionsRequest`, call `run_synth_instructions`, and render +the typed `SynthInstructionsResult` themselves. The dispatcher does no +console I/O; teacher/judge/plan errors propagate as the existing typed +exceptions so the caller can map each to its own exit code or banner. +""" + +from __future__ import annotations + +from dataclasses import dataclass +from enum import StrEnum +from typing import TYPE_CHECKING + +from dlm.synth import apply as _apply +from dlm.synth import filter as _filter +from dlm.synth import pending as _pending +from dlm.synth import run as _run +from dlm.synth import teachers as _teachers +from dlm.synth.apply import SynthApplyPlan, SynthApplySummary +from dlm.synth.filter import FilteredSynthPlan, SynthFilterKind +from dlm.synth.prompts import SynthStrategy +from dlm.synth.run import SynthRunPlan + +if TYPE_CHECKING: + from pathlib import Path + + from dlm.doc.parser import ParsedDlm + from dlm.preference.judge import PreferenceJudge + from dlm.store.paths import StorePath + + +class SynthOutcome(StrEnum): + """Discriminator for what `run_synth_instructions` did.""" + + APPLIED = "applied" # Sections written to the .dlm. + STAGED = "staged" # Sections persisted as the pending plan. + DRY_RUN = "dry_run" # Plan rendered, nothing staged or written. + NO_ADDITIONS = "no_additions" # Filter rejected every candidate. + + +@dataclass(frozen=True) +class SynthInstructionsRequest: + """Inputs to `run_synth_instructions`. 
+ + The CLI is responsible for parsing the .dlm and resolving the + store; the dispatcher receives both as already-typed objects. + `apply` and `dry_run` are mutually exclusive — the CLI rejects + that combo before constructing the request. + """ + + parsed: ParsedDlm + target_path: Path + store: StorePath + teacher: str + per_section: int + strategy: SynthStrategy + filter_kind: SynthFilterKind + threshold: float | None + max_pairs: int | None + max_new_tokens: int + temperature: float + top_p: float | None + seed: int | None + apply: bool + dry_run: bool + + +@dataclass(frozen=True) +class SynthInstructionsResult: + """Outcome of `run_synth_instructions`. + + `plan` and `filtered_plan` are always populated so the CLI can + render them. `apply_plan` / `apply_summary` are only set on + `SynthOutcome.APPLIED`. `pending_count` is non-zero only on + `SynthOutcome.STAGED`. + """ + + plan: SynthRunPlan + filtered_plan: FilteredSynthPlan + outcome: SynthOutcome + apply_plan: SynthApplyPlan | None = None + apply_summary: SynthApplySummary | None = None + pending_count: int = 0 + + +def run_synth_instructions(req: SynthInstructionsRequest) -> SynthInstructionsResult: + """Build, filter, and stage/apply auto-synth instructions for one .dlm.""" + teacher_obj = _teachers.build_teacher(req.teacher, dlm_path=req.target_path) + plan = _run.build_synth_plan( + req.parsed, + teacher_obj, + per_section=req.per_section, + strategy=req.strategy, + max_pairs=req.max_pairs, + max_new_tokens=req.max_new_tokens, + temperature=req.temperature, + top_p=req.top_p, + seed=req.seed, + ) + + judge_obj: PreferenceJudge | None = None + if req.filter_kind == "sway": + from dlm.preference import build_judge + + judge_obj = build_judge("sway", dlm_path=req.target_path) + + filtered = _filter.filter_synth_plan( + plan, + filter_kind=req.filter_kind, + judge=judge_obj, + threshold=req.threshold, + ) + + if not filtered.additions: + if not req.dry_run: + _pending.clear_pending_plan(req.store) + return 
SynthInstructionsResult( + plan=plan, + filtered_plan=filtered, + outcome=SynthOutcome.NO_ADDITIONS, + ) + + sections = [addition.addition.section for addition in filtered.additions] + + if req.apply: + apply_plan = _apply.build_apply_plan(req.parsed, sections) + summary = _apply.apply_plan(req.parsed, apply_plan, target=req.target_path) + _pending.clear_pending_plan(req.store) + return SynthInstructionsResult( + plan=plan, + filtered_plan=filtered, + outcome=SynthOutcome.APPLIED, + apply_plan=apply_plan, + apply_summary=summary, + ) + + if req.dry_run: + return SynthInstructionsResult( + plan=plan, + filtered_plan=filtered, + outcome=SynthOutcome.DRY_RUN, + ) + + pending = _pending.save_pending_plan( + req.store, + source_path=req.target_path.resolve(), + sections=sections, + ) + return SynthInstructionsResult( + plan=plan, + filtered_plan=filtered, + outcome=SynthOutcome.STAGED, + pending_count=len(pending.sections), + ) diff --git a/tests/unit/cli/test_synth_cmd.py b/tests/unit/cli/test_synth_cmd.py index 94310ea..7dbfaf9 100644 --- a/tests/unit/cli/test_synth_cmd.py +++ b/tests/unit/cli/test_synth_cmd.py @@ -149,7 +149,7 @@ def _build_teacher(raw: str, **_kwargs: object) -> _FakeTeacher: payload = payloads.get(raw, payloads["self"]) return _FakeTeacher(raw, payload) - monkeypatch.setattr("dlm.synth.build_teacher", _build_teacher) + monkeypatch.setattr("dlm.synth.teachers.build_teacher", _build_teacher) monkeypatch.setattr("dlm.preference.build_judge", lambda *args, **kwargs: _FakeJudge()) From d1ea0f674f0f8bcf242e8124900515417fc7b266 Mon Sep 17 00:00:00 2001 From: mfwolffe Date: Tue, 28 Apr 2026 19:57:05 -0400 Subject: [PATCH 27/46] Extract preference_mine into dlm.preference.dispatch:run_preference_mine --- src/dlm/cli/commands/preference.py | 96 +++++--------- src/dlm/preference/dispatch.py | 184 ++++++++++++++++++++++++++ tests/unit/cli/test_preference_cmd.py | 4 +- 3 files changed, 221 insertions(+), 63 deletions(-) create mode 100644 
src/dlm/preference/dispatch.py diff --git a/src/dlm/cli/commands/preference.py b/src/dlm/cli/commands/preference.py index 21f3fce..c495460 100644 --- a/src/dlm/cli/commands/preference.py +++ b/src/dlm/cli/commands/preference.py @@ -89,25 +89,19 @@ def preference_mine_cmd( from dlm.doc.parser import parse_file from dlm.hardware import doctor from dlm.inference import AdapterNotFoundError - from dlm.inference.backends import ( - UnsupportedBackendError, - build_backend, - select_backend, - ) - from dlm.metrics import MetricsRecorder, PreferenceMineEvent - from dlm.metrics.events import PreferenceMineWriteMode + from dlm.inference.backends import UnsupportedBackendError, select_backend from dlm.modality import modality_for from dlm.preference import ( InvalidJudgeSpecError, JudgeUnavailableError, - build_apply_plan, - build_judge, - build_mine_plan, render_apply_plan, render_mine_plan, ) - from dlm.preference.apply import apply_plan as apply_preference_plan - from dlm.preference.pending import clear_pending_plan, save_pending_plan + from dlm.preference.dispatch import ( + PreferenceMineOutcome, + PreferenceMineRequest, + run_preference_mine, + ) from dlm.store.paths import for_dlm console = Console(stderr=True) @@ -187,27 +181,30 @@ def preference_mine_cmd( except UnsupportedBackendError as exc: console.print(f"[red]preference:[/red] {exc}") raise typer.Exit(code=2) from exc - backend_obj = build_backend(backend_name, caps) + + request = PreferenceMineRequest( + parsed=parsed, + target_path=path, + store=store, + spec=spec, + capabilities=caps, + backend_name=backend_name, + judge_spec=judge, + mined_run_id=run_id, + samples=samples, + max_pairs=max_pairs, + threshold=threshold, + temperature=temp, + top_p=top_p, + adapter=adapter, + apply=apply, + ) try: - backend_obj.load(spec, store, adapter_name=adapter) + result = run_preference_mine(request) except AdapterNotFoundError as exc: console.print(f"[red]preference:[/red] {exc}") raise typer.Exit(code=1) from exc - - 
try: - judge_obj = build_judge(judge, dlm_path=path) - plan = build_mine_plan( - parsed, - backend_obj, - judge_obj, - mined_run_id=run_id, - samples=samples, - max_pairs=max_pairs, - threshold=threshold, - temperature=temp, - top_p=top_p, - ) except InvalidJudgeSpecError as exc: console.print(f"[red]preference:[/red] {exc}") raise typer.Exit(code=2) from exc @@ -217,28 +214,10 @@ def preference_mine_cmd( except ValueError as exc: console.print(f"[red]preference:[/red] {exc}") raise typer.Exit(code=2) from exc - finally: - backend_obj.unload() - - recorder = MetricsRecorder(store.root) - - def _record_preference_mine(write_mode: PreferenceMineWriteMode) -> None: - recorder.record_preference_mine( - PreferenceMineEvent( - run_id=run_id, - judge_name=judge_obj.name, - sample_count=samples, - mined_pairs=len(plan.additions), - skipped_prompts=len(plan.skipped), - write_mode=write_mode, - ) - ) - out_console.print(render_mine_plan(plan)) + out_console.print(render_mine_plan(result.plan)) - if not plan.additions: - clear_pending_plan(store) - _record_preference_mine("empty") + if result.outcome is PreferenceMineOutcome.NO_ADDITIONS: out_console.print( "\n[yellow]no candidates to mine[/yellow] — either instruction prompts " "did not yield a confident pair, or the matching preference sections " @@ -246,25 +225,20 @@ def _record_preference_mine(write_mode: PreferenceMineWriteMode) -> None: ) raise typer.Exit(code=2) - sections = [addition.section for addition in plan.additions] - - if apply: - apply_plan = build_apply_plan(parsed, sections) + if result.outcome is PreferenceMineOutcome.APPLIED: + assert result.apply_plan is not None + assert result.apply_summary is not None out_console.print("") - out_console.print(render_apply_plan(apply_plan)) - summary = apply_preference_plan(parsed, apply_plan, target=path) - clear_pending_plan(store) - _record_preference_mine("applied") + out_console.print(render_apply_plan(result.apply_plan)) out_console.print( - 
f"\n[green]preference:[/green] wrote {summary.added} section(s) to {path} " - f"({summary.skipped} skipped)" + f"\n[green]preference:[/green] wrote {result.apply_summary.added} section(s) to {path} " + f"({result.apply_summary.skipped} skipped)" ) return - pending = save_pending_plan(store, source_path=path.resolve(), sections=sections) - _record_preference_mine("staged") + # PreferenceMineOutcome.STAGED out_console.print( - f"\n[green]preference:[/green] staged {len(pending.sections)} mined preference " + f"\n[green]preference:[/green] staged {result.pending_count} mined preference " f"section(s). Run [bold]dlm preference apply {path}[/bold] to write them." ) diff --git a/src/dlm/preference/dispatch.py b/src/dlm/preference/dispatch.py new file mode 100644 index 0000000..ceaf668 --- /dev/null +++ b/src/dlm/preference/dispatch.py @@ -0,0 +1,184 @@ +"""Domain dispatcher for `dlm preference mine`. + +Lifts the build-backend → load → build-judge → mine → stage/apply +pipeline (and metrics record) out of the CLI. Callers (CLI, LSP, future +automation) build a `PreferenceMineRequest`, call `run_preference_mine`, +and render the typed `PreferenceMineResult` themselves. The dispatcher +does no console I/O; backend / judge / mine errors propagate as the +existing typed exceptions so the caller can map each to its own exit +code or banner. + +External-module imports are dotted (e.g. `from dlm.inference import +backends as _backends; _backends.build_backend(...)`) so test fixtures +that monkeypatch `dlm.inference.backends.` are visible to the +dispatcher at call time. 
+""" + +from __future__ import annotations + +from dataclasses import dataclass +from enum import StrEnum +from typing import TYPE_CHECKING + +from dlm.inference import backends as _backends +from dlm.inference.backends.select import BackendName +from dlm.metrics import MetricsRecorder +from dlm.metrics.events import PreferenceMineEvent +from dlm.preference import apply as _apply +from dlm.preference import judge as _judge_mod +from dlm.preference import mine as _mine +from dlm.preference import pending as _pending +from dlm.preference.apply import PreferenceApplyPlan, PreferenceApplySummary +from dlm.preference.mine import PreferenceMinePlan + +if TYPE_CHECKING: + from pathlib import Path + + from dlm.base_models.schema import BaseModelSpec + from dlm.doc.parser import ParsedDlm + from dlm.hardware.capabilities import Capabilities + from dlm.store.paths import StorePath + + +class PreferenceMineOutcome(StrEnum): + """Discriminator for what `run_preference_mine` did.""" + + APPLIED = "applied" # Sections written to the .dlm. + STAGED = "staged" # Sections persisted as the pending plan. + NO_ADDITIONS = "no_additions" # Mining yielded no confident pairs. + + +@dataclass(frozen=True) +class PreferenceMineRequest: + """Inputs to `run_preference_mine`. + + The CLI is responsible for parsing the .dlm, resolving the store, + enforcing license acceptance, and selecting the backend kind; the + dispatcher receives all of those as already-typed objects. + `mined_run_id` is required — the CLI exits early when no prior + training run exists. + """ + + parsed: ParsedDlm + target_path: Path + store: StorePath + spec: BaseModelSpec + capabilities: Capabilities + backend_name: BackendName + judge_spec: str + mined_run_id: int + samples: int + max_pairs: int | None + threshold: float | None + temperature: float + top_p: float | None + adapter: str | None + apply: bool + + +@dataclass(frozen=True) +class PreferenceMineResult: + """Outcome of `run_preference_mine`. 
+ + `plan` is always populated so the CLI can render it. `apply_plan` + and `apply_summary` are only set on `PreferenceMineOutcome.APPLIED`. + `pending_count` is non-zero only on `PreferenceMineOutcome.STAGED`. + `judge_name` is the resolved judge identifier, used by the CLI for + metrics narration. + """ + + plan: PreferenceMinePlan + outcome: PreferenceMineOutcome + judge_name: str + apply_plan: PreferenceApplyPlan | None = None + apply_summary: PreferenceApplySummary | None = None + pending_count: int = 0 + + +def run_preference_mine(req: PreferenceMineRequest) -> PreferenceMineResult: + """Build, load, mine, and stage/apply preference sections for one .dlm.""" + backend_obj = _backends.build_backend(req.backend_name, req.capabilities) + backend_obj.load(req.spec, req.store, adapter_name=req.adapter) + + try: + judge_obj = _judge_mod.build_judge(req.judge_spec, dlm_path=req.target_path) + judge_name = judge_obj.name + plan = _mine.build_mine_plan( + req.parsed, + backend_obj, + judge_obj, + mined_run_id=req.mined_run_id, + samples=req.samples, + max_pairs=req.max_pairs, + threshold=req.threshold, + temperature=req.temperature, + top_p=req.top_p, + ) + finally: + backend_obj.unload() + + recorder = MetricsRecorder(req.store.root) + + if not plan.additions: + _pending.clear_pending_plan(req.store) + recorder.record_preference_mine( + PreferenceMineEvent( + run_id=req.mined_run_id, + judge_name=judge_name, + sample_count=req.samples, + mined_pairs=0, + skipped_prompts=len(plan.skipped), + write_mode="empty", + ) + ) + return PreferenceMineResult( + plan=plan, + outcome=PreferenceMineOutcome.NO_ADDITIONS, + judge_name=judge_name, + ) + + sections = [addition.section for addition in plan.additions] + + if req.apply: + apply_plan = _apply.build_apply_plan(req.parsed, sections) + summary = _apply.apply_plan(req.parsed, apply_plan, target=req.target_path) + _pending.clear_pending_plan(req.store) + recorder.record_preference_mine( + PreferenceMineEvent( + 
run_id=req.mined_run_id, + judge_name=judge_name, + sample_count=req.samples, + mined_pairs=len(plan.additions), + skipped_prompts=len(plan.skipped), + write_mode="applied", + ) + ) + return PreferenceMineResult( + plan=plan, + outcome=PreferenceMineOutcome.APPLIED, + judge_name=judge_name, + apply_plan=apply_plan, + apply_summary=summary, + ) + + pending = _pending.save_pending_plan( + req.store, + source_path=req.target_path.resolve(), + sections=sections, + ) + recorder.record_preference_mine( + PreferenceMineEvent( + run_id=req.mined_run_id, + judge_name=judge_name, + sample_count=req.samples, + mined_pairs=len(plan.additions), + skipped_prompts=len(plan.skipped), + write_mode="staged", + ) + ) + return PreferenceMineResult( + plan=plan, + outcome=PreferenceMineOutcome.STAGED, + judge_name=judge_name, + pending_count=len(pending.sections), + ) diff --git a/tests/unit/cli/test_preference_cmd.py b/tests/unit/cli/test_preference_cmd.py index 46f6bdc..19d527f 100644 --- a/tests/unit/cli/test_preference_cmd.py +++ b/tests/unit/cli/test_preference_cmd.py @@ -135,7 +135,7 @@ def _patch_text_mining( ) if judge_names is None: monkeypatch.setattr( - "dlm.preference.build_judge", + "dlm.preference.judge.build_judge", lambda *args, **kwargs: _FakeJudge(), ) return @@ -143,7 +143,7 @@ def _patch_text_mining( def _build_judge(ref: str, **_kwargs: object) -> _NamedFakeJudge: return _NamedFakeJudge(judge_names[ref]) - monkeypatch.setattr("dlm.preference.build_judge", _build_judge) + monkeypatch.setattr("dlm.preference.judge.build_judge", _build_judge) class TestPreferenceCmd: From 457be54fda46befe8265868ebe949528625afe6f Mon Sep 17 00:00:00 2001 From: mfwolffe Date: Tue, 28 Apr 2026 19:57:11 -0400 Subject: [PATCH 28/46] Pin synth dispatcher's build_judge import to dlm.preference.judge The dispatcher previously did 'from dlm.preference import build_judge' (re-export). 
Tests monkeypatch the canonical 'dlm.preference.judge.build_judge' path; using the canonical import in the dispatcher keeps function-local attribute lookup aligned with what tests patch. --- src/dlm/synth/dispatch.py | 2 +- tests/unit/cli/test_synth_cmd.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/dlm/synth/dispatch.py b/src/dlm/synth/dispatch.py index cc278be..1ef2c36 100644 --- a/src/dlm/synth/dispatch.py +++ b/src/dlm/synth/dispatch.py @@ -103,7 +103,7 @@ def run_synth_instructions(req: SynthInstructionsRequest) -> SynthInstructionsRe judge_obj: PreferenceJudge | None = None if req.filter_kind == "sway": - from dlm.preference import build_judge + from dlm.preference.judge import build_judge judge_obj = build_judge("sway", dlm_path=req.target_path) diff --git a/tests/unit/cli/test_synth_cmd.py b/tests/unit/cli/test_synth_cmd.py index 7dbfaf9..0c668a4 100644 --- a/tests/unit/cli/test_synth_cmd.py +++ b/tests/unit/cli/test_synth_cmd.py @@ -150,7 +150,7 @@ def _build_teacher(raw: str, **_kwargs: object) -> _FakeTeacher: return _FakeTeacher(raw, payload) monkeypatch.setattr("dlm.synth.teachers.build_teacher", _build_teacher) - monkeypatch.setattr("dlm.preference.build_judge", lambda *args, **kwargs: _FakeJudge()) + monkeypatch.setattr("dlm.preference.judge.build_judge", lambda *args, **kwargs: _FakeJudge()) def _patch_preference_alias_runtime(monkeypatch: pytest.MonkeyPatch) -> None: @@ -167,7 +167,7 @@ def _patch_preference_alias_runtime(monkeypatch: pytest.MonkeyPatch) -> None: "dlm.inference.backends.build_backend", lambda *args, **kwargs: _FakeBackend({"What is DGEMM?": ["bad answer", "good answer"]}), ) - monkeypatch.setattr("dlm.preference.build_judge", lambda *args, **kwargs: _FakeJudge()) + monkeypatch.setattr("dlm.preference.judge.build_judge", lambda *args, **kwargs: _FakeJudge()) class TestSynthCmd: From 80c9adb47cadc776a0bcf3d1bf71e95bb09a802b Mon Sep 17 00:00:00 2001 From: mfwolffe Date: Tue, 28 Apr 2026 20:01:36 -0400 
Subject: [PATCH 29/46] Extract init scaffold + manifest write into dlm.store.bootstrap:run_init --- src/dlm/cli/commands/init.py | 165 ++++-------------------- src/dlm/store/bootstrap.py | 205 ++++++++++++++++++++++++++++++ tests/unit/cli/test_init_edges.py | 2 +- 3 files changed, 229 insertions(+), 143 deletions(-) create mode 100644 src/dlm/store/bootstrap.py diff --git a/src/dlm/cli/commands/init.py b/src/dlm/cli/commands/init.py index 27c3db9..cd6ab20 100644 --- a/src/dlm/cli/commands/init.py +++ b/src/dlm/cli/commands/init.py @@ -75,7 +75,8 @@ def init_cmd( require_acceptance, ) from dlm.base_models import resolve as resolve_base_model - from dlm.io.ulid import mint_ulid + from dlm.store.bootstrap import InitRequest, ScaffoldKind, run_init + from dlm.templates import TemplateError console = Console(stderr=True) @@ -170,20 +171,6 @@ def init_cmd( skip_export_probes=skip_export_probes, ) - # NOW apply the template — license has already been accepted - # (either by --i-accept-license or interactive prompt), so pass - # the acceptance through. apply_template enforces the license - # contract at its boundary. - applied_result = None - if template is not None: - from dlm.templates import TemplateError, apply_template - - try: - applied_result = apply_template(template, path, force=force, accept_license=True) - except TemplateError as exc: - console.print(f"[red]init:[/red] {exc}") - raise typer.Exit(code=1) from exc - # Record the license acceptance (or None for non-gated specs). 
We # know `resolve_base_model` already validated the flag/prompt chain # — `accept_license=True` means either the user passed the flag or @@ -217,37 +204,30 @@ def init_cmd( ) raise typer.Exit(code=2) - if applied_result is not None: - dlm_id = applied_result.dlm_id + if multimodal: + scaffold_kind = ScaffoldKind.VISION + elif audio: + scaffold_kind = ScaffoldKind.AUDIO else: - dlm_id = mint_ulid() - if multimodal: - _write_init_scaffold_multimodal(path, spec.key, dlm_id) - elif audio: - _write_init_scaffold_audio(path, spec.key, dlm_id) - else: - _write_init_scaffold(path, spec.key, dlm_id) + scaffold_kind = ScaffoldKind.TEXT + + request = InitRequest( + path=path, + spec=spec, + acceptance=acceptance, + force=force, + template_name=template, + scaffold_kind=scaffold_kind, + ) - # Create the store + write the initial manifest so `dlm show` sees - # the license record and `dlm train` has a prior manifest to diff - # against. - from dlm.store.manifest import Manifest, save_manifest - from dlm.store.paths import for_dlm + try: + result = run_init(request) + except TemplateError as exc: + console.print(f"[red]init:[/red] {exc}") + raise typer.Exit(code=1) from exc - store = for_dlm(dlm_id) - store.ensure_layout() - save_manifest( - store.manifest, - Manifest( - dlm_id=dlm_id, - base_model=spec.key, - base_model_revision=spec.revision, - source_path=path.resolve(), - license_acceptance=acceptance, - ), - ) - if applied_result is not None: - meta = applied_result.template.meta + if result.applied_template is not None: + meta = result.applied_template.template.meta console.print( f"[green]init:[/green] wrote {path} from template " f"[bold]{meta.name}[/bold] ({meta.title}) — base {spec.key}." 
@@ -282,102 +262,3 @@ def _prompt_accept_license(console: object, base: str, license_url: str | None) except EOFError: return False return answer in ("y", "yes") - - -def _write_init_scaffold(path: Path, base_model_key: str, dlm_id: str) -> None: - """Write a minimal-but-valid .dlm file at `path`. - - Body has one PROSE paragraph + a commented instruction section so - users see both section shapes on first open. - """ - scaffold = f"""--- -dlm_id: {dlm_id} -dlm_version: 1 -base_model: {base_model_key} ---- - -# Your document title - -Write prose here. It will train via continued pretraining (CPT) loss. - -::instruction:: - -### Q -Your example question. - -### A -Your example answer. -""" - path.write_text(scaffold, encoding="utf-8") - - -def _write_init_scaffold_multimodal(path: Path, base_model_key: str, dlm_id: str) -> None: - """Write a VL-shaped .dlm file at `path`. - - Body shows the `::image::` attribute fence + a caption so users - see the v10 grammar on first open. The placeholder path - `figures/your-image.png` is deliberately non-existent — first - `dlm train` will refuse with a clear file-missing error, prompting - the user to drop a real image in. This is friendlier than - committing an inert sample that users might not notice isn't theirs. - - `dlm_version: 10` because IMAGE sections require schema v10. - """ - scaffold = f"""--- -dlm_id: {dlm_id} -dlm_version: 10 -base_model: {base_model_key} ---- - -# Your document title - -Write prose here. It will train via continued pretraining (CPT) loss. - -::image path="figures/your-image.png" alt="short description":: -Caption text describing the image. Training rows bundle the image -with this caption as `\\n`. - -::instruction:: - -### Q -What is in this image? - -### A -Describe what the image shows. -""" - path.write_text(scaffold, encoding="utf-8") - - -def _write_init_scaffold_audio(path: Path, base_model_key: str, dlm_id: str) -> None: - """Write an audio-shaped .dlm file at `path`. 
- - Body shows the `::audio::` attribute fence with the sibling- - transcript-friendly `transcript="..."` form so users see the v11 - grammar on first open. The placeholder path `clips/your-clip.wav` - is deliberately non-existent — first `dlm train` refuses with a - clear file-missing error rather than silently training on an inert - sample. - - `dlm_version: 11` because AUDIO sections require schema v11. - """ - scaffold = f"""--- -dlm_id: {dlm_id} -dlm_version: 11 -base_model: {base_model_key} ---- - -# Your document title - -Write prose here. It will train via continued pretraining (CPT) loss. - -::audio path="clips/your-clip.wav" transcript="Transcript of the audio clip.":: - -::instruction:: - -### Q -What was said in this recording? - -### A -Describe what you hear in the audio. -""" - path.write_text(scaffold, encoding="utf-8") diff --git a/src/dlm/store/bootstrap.py b/src/dlm/store/bootstrap.py new file mode 100644 index 0000000..032e393 --- /dev/null +++ b/src/dlm/store/bootstrap.py @@ -0,0 +1,205 @@ +"""Domain dispatcher for `dlm init`. + +Lifts the scaffold-write → store-create → manifest-write pipeline out of +the CLI. Callers (CLI, LSP "Initialize from Template" command, future +automation) build an `InitRequest`, call `run_init`, and render the +typed `InitResult`. The dispatcher does no console I/O; template +errors propagate as `TemplateError` so the caller can map them to its +own exit code or banner. + +The CLI keeps the user-interactive concerns: flag-mutex validation, +multimodal/audio default-base swap, `--template` peek for license-prompt +target, the GatedModelError → interactive-prompt → retry loop, and the +modality-consistency check. This dispatcher takes an already-resolved +`BaseModelSpec` plus an already-built `LicenseAcceptance | None`. 
+""" + +from __future__ import annotations + +from dataclasses import dataclass +from enum import StrEnum +from typing import TYPE_CHECKING + +from dlm.io.ulid import mint_ulid +from dlm.store.manifest import Manifest, save_manifest +from dlm.store.paths import StorePath, for_dlm +from dlm.templates import init as _templates_init + +if TYPE_CHECKING: + from pathlib import Path + + from dlm.base_models.license import LicenseAcceptance + from dlm.base_models.schema import BaseModelSpec + from dlm.templates.init import ApplyResult + + +class ScaffoldKind(StrEnum): + """Body shape to write when no `--template` is given.""" + + TEXT = "text" + VISION = "vision" + AUDIO = "audio" + + +@dataclass(frozen=True) +class InitRequest: + """Inputs to `run_init`. + + `template_name` and `scaffold_kind` are mutually informative: + when `template_name is not None`, the template's body wins and + `scaffold_kind` is ignored. The CLI rejects `--template` combined + with `--multimodal` / `--audio` before constructing the request. + """ + + path: Path + spec: BaseModelSpec + acceptance: LicenseAcceptance | None + force: bool + template_name: str | None + scaffold_kind: ScaffoldKind + + +@dataclass(frozen=True) +class InitResult: + """Outcome of `run_init`. 
`applied_template` is set iff the request + carried a `template_name` (the dispatcher applied a gallery template + rather than writing a scaffold).""" + + dlm_id: str + store: StorePath + applied_template: ApplyResult | None + + +def run_init(req: InitRequest) -> InitResult: + """Apply a template (or write a scaffold), then provision the store.""" + if req.template_name is not None: + applied = _templates_init.apply_template( + req.template_name, + req.path, + force=req.force, + accept_license=True, + ) + dlm_id = applied.dlm_id + else: + applied = None + dlm_id = mint_ulid() + if req.scaffold_kind is ScaffoldKind.VISION: + _write_init_scaffold_multimodal(req.path, req.spec.key, dlm_id) + elif req.scaffold_kind is ScaffoldKind.AUDIO: + _write_init_scaffold_audio(req.path, req.spec.key, dlm_id) + else: + _write_init_scaffold(req.path, req.spec.key, dlm_id) + + store = for_dlm(dlm_id) + store.ensure_layout() + save_manifest( + store.manifest, + Manifest( + dlm_id=dlm_id, + base_model=req.spec.key, + base_model_revision=req.spec.revision, + source_path=req.path.resolve(), + license_acceptance=req.acceptance, + ), + ) + + return InitResult(dlm_id=dlm_id, store=store, applied_template=applied) + + +def _write_init_scaffold(path: Path, base_model_key: str, dlm_id: str) -> None: + """Write a minimal-but-valid text-only `.dlm` at `path`. + + Body has one PROSE paragraph + a commented instruction section so + users see both section shapes on first open. + """ + scaffold = f"""--- +dlm_id: {dlm_id} +dlm_version: 1 +base_model: {base_model_key} +--- + +# Your document title + +Write prose here. It will train via continued pretraining (CPT) loss. + +::instruction:: + +### Q +Your example question. + +### A +Your example answer. +""" + path.write_text(scaffold, encoding="utf-8") + + +def _write_init_scaffold_multimodal(path: Path, base_model_key: str, dlm_id: str) -> None: + """Write a vision-language `.dlm` at `path`. 
+ + Body shows the `::image::` attribute fence + a caption so users see + the v10 grammar on first open. The placeholder path + `figures/your-image.png` is deliberately non-existent — first + `dlm train` refuses with a clear file-missing error, prompting the + user to drop a real image in. Friendlier than committing an inert + sample users might not notice isn't theirs. + + `dlm_version: 10` because IMAGE sections require schema v10. + """ + scaffold = f"""--- +dlm_id: {dlm_id} +dlm_version: 10 +base_model: {base_model_key} +--- + +# Your document title + +Write prose here. It will train via continued pretraining (CPT) loss. + +::image path="figures/your-image.png" alt="short description":: +Caption text describing the image. Training rows bundle the image +with this caption as `\\n`. + +::instruction:: + +### Q +What is in this image? + +### A +Describe what the image shows. +""" + path.write_text(scaffold, encoding="utf-8") + + +def _write_init_scaffold_audio(path: Path, base_model_key: str, dlm_id: str) -> None: + """Write an audio-language `.dlm` at `path`. + + Body shows the `::audio::` attribute fence with the sibling- + transcript-friendly `transcript="..."` form so users see the v11 + grammar on first open. The placeholder path `clips/your-clip.wav` + is deliberately non-existent — first `dlm train` refuses with a + clear file-missing error rather than silently training on an inert + sample. + + `dlm_version: 11` because AUDIO sections require schema v11. + """ + scaffold = f"""--- +dlm_id: {dlm_id} +dlm_version: 11 +base_model: {base_model_key} +--- + +# Your document title + +Write prose here. It will train via continued pretraining (CPT) loss. + +::audio path="clips/your-clip.wav" transcript="Transcript of the audio clip.":: + +::instruction:: + +### Q +What was said in this recording? + +### A +Describe what you hear in the audio. 
+""" + path.write_text(scaffold, encoding="utf-8") diff --git a/tests/unit/cli/test_init_edges.py b/tests/unit/cli/test_init_edges.py index c80a673..ddb4357 100644 --- a/tests/unit/cli/test_init_edges.py +++ b/tests/unit/cli/test_init_edges.py @@ -148,7 +148,7 @@ def _fake_apply_template( ) -> object: raise TemplateError("template exploded") - monkeypatch.setattr(templates, "apply_template", _fake_apply_template) + monkeypatch.setattr("dlm.templates.init.apply_template", _fake_apply_template) result = runner.invoke( app, From f37fac5ca31fd29e40ceda29304825b068d63ba2 Mon Sep 17 00:00:00 2001 From: mfwolffe Date: Tue, 28 Apr 2026 20:05:17 -0400 Subject: [PATCH 30/46] Extract show summarizers into dlm.store.show:gather_store_view --- src/dlm/cli/commands/show.py | 333 +++++------------------------------ src/dlm/store/show.py | 298 +++++++++++++++++++++++++++++++ 2 files changed, 343 insertions(+), 288 deletions(-) create mode 100644 src/dlm/store/show.py diff --git a/src/dlm/cli/commands/show.py b/src/dlm/cli/commands/show.py index 639c9ce..a4cf5ac 100644 --- a/src/dlm/cli/commands/show.py +++ b/src/dlm/cli/commands/show.py @@ -23,8 +23,8 @@ def show_cmd( from dlm.doc.errors import DlmParseError from dlm.doc.parser import parse_file from dlm.store.errors import ManifestCorruptError - from dlm.store.inspect import inspect_store from dlm.store.paths import for_dlm + from dlm.store.show import StoreViewRequest, gather_store_view console = Console(stderr=True) out_console = Console() @@ -36,86 +36,67 @@ def show_cmd( raise typer.Exit(code=1) from exc store = for_dlm(parsed.frontmatter.dlm_id) - training_sources, discovered_configs = _summarize_training_sources_and_discovered( - parsed, path.resolve().parent - ) - # The per-document cache config comes from frontmatter, not on-disk - # state — report it on both the pre-train and initialized-store paths - # so authors can sanity-check the knobs before `dlm train` runs. 
- cache_cfg = parsed.frontmatter.training.cache - training_cache_config: dict[str, object] = { - "enabled": cache_cfg.enabled, - "max_bytes": cache_cfg.max_bytes, - "prune_older_than_days": cache_cfg.prune_older_than_days, - } - # Store may not exist yet (no `dlm train` run). Treat that as an - # informational state rather than an error — useful after `dlm init`. - if not store.manifest.exists(): + try: + view = gather_store_view(StoreViewRequest(parsed=parsed, target_path=path, store=store)) + except ManifestCorruptError as exc: + console.print(f"[red]show:[/red] {exc}") + raise typer.Exit(code=1) from exc + + if view.inspection is None: if json_out: payload: dict[str, object] = { - "dlm_id": parsed.frontmatter.dlm_id, - "base_model": parsed.frontmatter.base_model, + "dlm_id": view.parsed_dlm_id, + "base_model": view.parsed_base_model, "store_initialized": False, - "source_path": str(path.resolve()), - "training_cache_config": training_cache_config, + "source_path": str(view.source_path), + "training_cache_config": view.training_cache_config, } - if training_sources is not None: - payload["training_sources"] = training_sources - if discovered_configs: - payload["discovered_training_configs"] = discovered_configs + if view.training_sources is not None: + payload["training_sources"] = view.training_sources + if view.discovered_configs: + payload["discovered_training_configs"] = view.discovered_configs sys.stdout.write(_json.dumps(payload, indent=2) + "\n") else: out_console.print(f"[bold]{path}[/bold]") - out_console.print(f" dlm_id: {parsed.frontmatter.dlm_id}") - out_console.print(f" base_model: {parsed.frontmatter.base_model}") + out_console.print(f" dlm_id: {view.parsed_dlm_id}") + out_console.print(f" base_model: {view.parsed_base_model}") out_console.print(" store: [dim]not yet initialized (run `dlm train`)[/dim]") - if training_sources: - _render_training_sources_text(out_console, training_sources) + if view.training_sources: + 
_render_training_sources_text(out_console, view.training_sources) return - try: - inspection = inspect_store(store, source_path=path.resolve()) - except ManifestCorruptError as exc: - console.print(f"[red]show:[/red] {exc}") - raise typer.Exit(code=1) from exc - - training_cache = _summarize_training_cache(store.tokenized_cache_dir, store.root) - gate = _summarize_gate(store) - preference_mining = _summarize_preference_mining(store.root) - base_security = _summarize_base_security(parsed.frontmatter.base_model) - if json_out: - payload_full = _inspection_to_dict(inspection) - if training_sources is not None: - payload_full["training_sources"] = training_sources - if discovered_configs: - payload_full["discovered_training_configs"] = discovered_configs - if training_cache is not None: - payload_full["training_cache"] = training_cache - payload_full["training_cache_config"] = training_cache_config - if gate is not None: - payload_full["gate"] = gate - if preference_mining is not None: - payload_full["preference_mining"] = preference_mining - payload_full["preference_mining_runs"] = preference_mining["run_count"] - payload_full["total_auto_mined_pairs"] = preference_mining["total_mined_pairs"] - if base_security is not None: - payload_full["base_security"] = base_security + payload_full = _inspection_to_dict(view.inspection) + if view.training_sources is not None: + payload_full["training_sources"] = view.training_sources + if view.discovered_configs: + payload_full["discovered_training_configs"] = view.discovered_configs + if view.training_cache is not None: + payload_full["training_cache"] = view.training_cache + payload_full["training_cache_config"] = view.training_cache_config + if view.gate is not None: + payload_full["gate"] = view.gate + if view.preference_mining is not None: + payload_full["preference_mining"] = view.preference_mining + payload_full["preference_mining_runs"] = view.preference_mining["run_count"] + payload_full["total_auto_mined_pairs"] = 
view.preference_mining["total_mined_pairs"] + if view.base_security is not None: + payload_full["base_security"] = view.base_security # Write JSON to raw stdout — Rich's Console wraps lines at the # terminal width and would corrupt the JSON. sys.stdout.write(_json.dumps(payload_full, indent=2, default=str) + "\n") return - _render_inspection_text(out_console, path, inspection) - if training_sources: - _render_training_sources_text(out_console, training_sources) - if training_cache is not None and training_cache.get("entry_count", 0): - _render_training_cache_text(out_console, training_cache) - if gate is not None: - _render_gate_text(out_console, gate) - if base_security is not None and base_security.get("trust_remote_code"): - _render_base_security_text(out_console, base_security) + _render_inspection_text(out_console, path, view.inspection) + if view.training_sources: + _render_training_sources_text(out_console, view.training_sources) + if view.training_cache is not None and view.training_cache.get("entry_count", 0): + _render_training_cache_text(out_console, view.training_cache) + if view.gate is not None: + _render_gate_text(out_console, view.gate) + if view.base_security is not None and view.base_security.get("trust_remote_code"): + _render_base_security_text(out_console, view.base_security) def _inspection_to_dict(inspection: object) -> dict[str, object]: @@ -195,230 +176,6 @@ def _render_inspection_text(console: object, path: Path, inspection: object) -> console.print(" [yellow]orphaned:[/yellow] source .dlm is missing or mismatched") -def _summarize_training_sources(parsed: object, base_path: Path) -> list[dict[str, object]] | None: - """Best-effort resolution of `training.sources` for `dlm show`. - - Returns None when the frontmatter declares no directives; returns - a list of per-source dicts otherwise. 
Failures to expand (missing - paths, policy escapes) fall back to declared-only records so the - show output stays useful for debugging a misconfigured directive. - """ - records, _ = _summarize_training_sources_and_discovered(parsed, base_path) - return records - - -def _summarize_training_sources_and_discovered( - parsed: object, base_path: Path -) -> tuple[list[dict[str, object]] | None, list[dict[str, object]]]: - """Like `_summarize_training_sources` but also returns the per-anchor - `.dlm/training.yaml` + `.dlm/ignore` discovery records. - - Returns `(training_sources, discovered_configs)`. `discovered_configs` - is always a list (empty when nothing was found or the expansion - failed); `training_sources` matches the single-value helper's - contract. - """ - from dlm.directives import DirectiveError, expand_sources - from dlm.doc.parser import ParsedDlm - - assert isinstance(parsed, ParsedDlm) - directives = parsed.frontmatter.training.sources - if not directives: - return None, [] - - declared: list[dict[str, object]] = [ - { - "path": d.path, - "include": list(d.include), - "exclude": list(d.exclude), - "max_files": d.max_files, - "max_bytes_per_file": d.max_bytes_per_file, - } - for d in directives - ] - - try: - result = expand_sources(parsed, base_path=base_path) - except (DirectiveError, OSError): - return declared, [] - - records: list[dict[str, object]] = [] - for decl, prov in zip(declared, result.provenance, strict=False): - records.append( - { - **decl, - "file_count": prov.file_count, - "total_bytes": prov.total_bytes, - "skipped_binary": prov.skipped_binary, - "skipped_encoding": prov.skipped_encoding, - "skipped_over_size": prov.skipped_over_size, - } - ) - # If the expander returned fewer entries than declared (shouldn't - # happen on success but defensive), pad with declared-only. 
- if len(records) < len(declared): - records.extend(declared[len(records) :]) - - discovered_records: list[dict[str, object]] = [] - for dc in result.discovered: - discovered_records.append( - { - "anchor": str(dc.anchor), - "has_training_yaml": dc.config is not None, - "has_ignore": bool(dc.ignore_rules), - "include": list(dc.config.include) if dc.config else [], - "exclude": list(dc.config.exclude) if dc.config else [], - "exclude_defaults": (dc.config.exclude_defaults if dc.config else True), - "metadata": dict(dc.config.metadata) if dc.config else {}, - "ignore_rules": len(dc.ignore_rules), - } - ) - return records, discovered_records - - -def _summarize_training_cache(cache_dir: Path, store_root: Path) -> dict[str, object] | None: - """Return a JSON-friendly snapshot of the tokenized-section cache. - - None when the cache dir doesn't exist (store never trained with - the cache, or pre-Sprint-31 layout). Cheap — reads the manifest - only, not the entry files. - """ - if not cache_dir.is_dir(): - return None - from dlm.directives.cache import TokenizedCache - from dlm.metrics import queries as _queries - - cache = TokenizedCache.open(cache_dir) - last = _queries.latest_tokenization(store_root) - return { - "path": str(cache_dir), - "entry_count": cache.entry_count, - "bytes": cache.total_bytes, - "last_run_hit_rate": last.hit_rate if last else None, - "last_run_id": last.run_id if last else None, - } - - -def _summarize_gate(store: object) -> dict[str, object] | None: - """Return a JSON-friendly snapshot of the learned adapter gate. - - None when the store has no gate config (pre-Sprint-34 runs, or - `training.gate.enabled` was false). Reads two sources: the - on-disk `gate_config.json` for mode + adapter order, and the - metrics `gate_events` table for per-adapter mean weight from the - most recent run that recorded a gate. 
- """ - import json as _json - - from dlm.store.paths import StorePath - from dlm.train.gate.paths import gate_config_path - - assert isinstance(store, StorePath) - cfg_path = gate_config_path(store) - - from dlm.metrics import queries as _queries - from dlm.train.gate.module import GateMetadata - - events = _queries.latest_gate_events(store.root) - # Divergence path: training raised before writing a config, but we - # still emit one GateEvent per adapter with mode="diverged" so - # operators can see the failure. Surface it even when the config - # file is absent. - if not cfg_path.exists(): - if events and events[0].mode == "diverged": - return { - "mode": "diverged", - "adapter_names": [e.adapter_name for e in events], - "input_dim": None, - "hidden_proj_dim": None, - "last_run_id": events[0].run_id, - "per_adapter": [ - { - "adapter_name": e.adapter_name, - "mean_weight": e.mean_weight, - "sample_count": e.sample_count, - "mode": e.mode, - } - for e in events - ], - } - return None - - raw = _json.loads(cfg_path.read_text(encoding="utf-8")) - meta = GateMetadata.from_json(raw) - per_adapter: list[dict[str, object]] = [] - run_id: int | None = None - if events: - run_id = events[0].run_id - per_adapter = [ - { - "adapter_name": e.adapter_name, - "mean_weight": e.mean_weight, - "sample_count": e.sample_count, - "mode": e.mode, - } - for e in events - ] - else: - # No recorded events yet; fall back to the config so `dlm show` - # still reports that a gate exists and in which mode. 
- per_adapter = [{"adapter_name": name} for name in meta.adapter_names] - return { - "mode": meta.mode, - "adapter_names": list(meta.adapter_names), - "input_dim": meta.input_dim, - "hidden_proj_dim": meta.hidden_proj_dim, - "last_run_id": run_id, - "per_adapter": per_adapter, - } - - -def _summarize_preference_mining(store_root: Path) -> dict[str, object] | None: - """Return the latest preference-mine summary for `dlm show --json`.""" - from dlm.metrics import queries as _queries - - totals = _queries.preference_mining_totals(store_root) - if totals is None: - return None - last = _queries.latest_preference_mining(store_root) - assert last is not None - rows = _queries.preference_mining_for_run(store_root, last.run_id) - return { - "run_count": totals.run_count, - "event_count": totals.event_count, - "total_mined_pairs": totals.total_mined_pairs, - "total_skipped_prompts": totals.total_skipped_prompts, - "last_run_id": last.run_id, - "last_run_event_count": len(rows), - "last_event": _queries.preference_mining_to_dict([last])[0], - } - - -def _summarize_base_security(base_model_key: str) -> dict[str, object] | None: - """Surface security-sensitive base-model flags for `dlm show`. - - Today that's just `trust_remote_code` — a flag that causes the HF - loader to execute Python from the model repo. We resolve the spec - out of the in-process registry (no network: the resolver reads a - frozen Python dict) so users can see which bases opt in without - grepping source. Returns None when the key doesn't resolve (an - `hf:...` escape hatch that isn't in the registry); the caller - silently skips in that case. 
- """ - from dlm.base_models import resolve as resolve_base_model - from dlm.base_models.errors import BaseModelError - - try: - spec = resolve_base_model(base_model_key, accept_license=True) - except BaseModelError: - return None - return { - "base_model": spec.key, - "architecture": spec.architecture, - "trust_remote_code": bool(spec.trust_remote_code), - } - - def _render_base_security_text(console: object, snap: dict[str, object]) -> None: from rich.console import Console diff --git a/src/dlm/store/show.py b/src/dlm/store/show.py new file mode 100644 index 0000000..dcada29 --- /dev/null +++ b/src/dlm/store/show.py @@ -0,0 +1,298 @@ +"""Domain dispatcher for `dlm show`. + +Aggregates the .dlm + store snapshot the CLI renders to text or JSON. +Callers (CLI, LSP doc-overview panel, future automation) build a +`StoreViewRequest`, call `gather_store_view`, and render the typed +`StoreView` themselves. The dispatcher does no console I/O; +`ManifestCorruptError` propagates so the caller can map it to its own +exit code or banner. +""" + +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path +from typing import TYPE_CHECKING + +from dlm.directives import expand_sources as _expand_sources +from dlm.directives.errors import DirectiveError +from dlm.metrics import queries as _queries +from dlm.store.inspect import StoreInspection, inspect_store + +if TYPE_CHECKING: + from dlm.doc.parser import ParsedDlm + from dlm.store.paths import StorePath + + +@dataclass(frozen=True) +class StoreViewRequest: + """Inputs to `gather_store_view`.""" + + parsed: ParsedDlm + target_path: Path + store: StorePath + + +@dataclass(frozen=True) +class StoreView: + """Aggregated snapshot a `dlm show` caller renders to text or JSON. + + `inspection` is `None` when the store has no manifest yet + (post-`dlm init`, pre-`dlm train`); the caller surfaces the + "store: not yet initialized" path in that case. 
The summary dicts + are JSON-safe and form the v1 contract for `dlm show --json`; any + reshape is a version bump (recorded in `tests/golden/cli-json/`). + """ + + parsed_dlm_id: str + parsed_base_model: str + source_path: Path + training_cache_config: dict[str, object] + training_sources: list[dict[str, object]] | None + discovered_configs: list[dict[str, object]] + inspection: StoreInspection | None + training_cache: dict[str, object] | None + gate: dict[str, object] | None + preference_mining: dict[str, object] | None + base_security: dict[str, object] | None + + +def gather_store_view(req: StoreViewRequest) -> StoreView: + """Walk the .dlm + its store and produce a `StoreView`. + + Raises `ManifestCorruptError` if the manifest is unparseable. + """ + parsed = req.parsed + base_path = req.target_path.resolve().parent + training_sources, discovered_configs = _summarize_training_sources_and_discovered( + parsed, base_path + ) + cache_cfg = parsed.frontmatter.training.cache + training_cache_config: dict[str, object] = { + "enabled": cache_cfg.enabled, + "max_bytes": cache_cfg.max_bytes, + "prune_older_than_days": cache_cfg.prune_older_than_days, + } + + if not req.store.manifest.exists(): + return StoreView( + parsed_dlm_id=parsed.frontmatter.dlm_id, + parsed_base_model=parsed.frontmatter.base_model, + source_path=req.target_path.resolve(), + training_cache_config=training_cache_config, + training_sources=training_sources, + discovered_configs=discovered_configs, + inspection=None, + training_cache=None, + gate=None, + preference_mining=None, + base_security=None, + ) + + inspection = inspect_store(req.store, source_path=req.target_path.resolve()) + training_cache = _summarize_training_cache(req.store.tokenized_cache_dir, req.store.root) + gate = _summarize_gate(req.store) + preference_mining = _summarize_preference_mining(req.store.root) + base_security = _summarize_base_security(parsed.frontmatter.base_model) + + return StoreView( + 
parsed_dlm_id=parsed.frontmatter.dlm_id, + parsed_base_model=parsed.frontmatter.base_model, + source_path=req.target_path.resolve(), + training_cache_config=training_cache_config, + training_sources=training_sources, + discovered_configs=discovered_configs, + inspection=inspection, + training_cache=training_cache, + gate=gate, + preference_mining=preference_mining, + base_security=base_security, + ) + + +def _summarize_training_sources_and_discovered( + parsed: ParsedDlm, base_path: Path +) -> tuple[list[dict[str, object]] | None, list[dict[str, object]]]: + """Best-effort `training.sources` expansion + `.dlm/training.yaml` discovery. + + Returns `(training_sources, discovered_configs)`. `training_sources` + is None when the frontmatter declares no directives; otherwise + declared records are returned even when expansion fails (so the + show output stays useful for debugging a misconfigured directive). + `discovered_configs` is always a list (empty when nothing was + found or the expansion failed). 
+ """ + directives = parsed.frontmatter.training.sources + if not directives: + return None, [] + + declared: list[dict[str, object]] = [ + { + "path": d.path, + "include": list(d.include), + "exclude": list(d.exclude), + "max_files": d.max_files, + "max_bytes_per_file": d.max_bytes_per_file, + } + for d in directives + ] + + try: + result = _expand_sources(parsed, base_path=base_path) + except (DirectiveError, OSError): + return declared, [] + + records: list[dict[str, object]] = [] + for decl, prov in zip(declared, result.provenance, strict=False): + records.append( + { + **decl, + "file_count": prov.file_count, + "total_bytes": prov.total_bytes, + "skipped_binary": prov.skipped_binary, + "skipped_encoding": prov.skipped_encoding, + "skipped_over_size": prov.skipped_over_size, + } + ) + if len(records) < len(declared): + records.extend(declared[len(records) :]) + + discovered_records: list[dict[str, object]] = [] + for dc in result.discovered: + discovered_records.append( + { + "anchor": str(dc.anchor), + "has_training_yaml": dc.config is not None, + "has_ignore": bool(dc.ignore_rules), + "include": list(dc.config.include) if dc.config else [], + "exclude": list(dc.config.exclude) if dc.config else [], + "exclude_defaults": (dc.config.exclude_defaults if dc.config else True), + "metadata": dict(dc.config.metadata) if dc.config else {}, + "ignore_rules": len(dc.ignore_rules), + } + ) + return records, discovered_records + + +def _summarize_training_cache(cache_dir: Path, store_root: Path) -> dict[str, object] | None: + """Return a JSON-friendly snapshot of the tokenized-section cache. + + None when the cache dir doesn't exist (store never trained with the + cache, or pre-Sprint-31 layout). Cheap — reads the manifest only, + not the entry files. 
+ """ + if not cache_dir.is_dir(): + return None + from dlm.directives.cache import TokenizedCache + + cache = TokenizedCache.open(cache_dir) + last = _queries.latest_tokenization(store_root) + return { + "path": str(cache_dir), + "entry_count": cache.entry_count, + "bytes": cache.total_bytes, + "last_run_hit_rate": last.hit_rate if last else None, + "last_run_id": last.run_id if last else None, + } + + +def _summarize_gate(store: StorePath) -> dict[str, object] | None: + """Return a JSON-friendly snapshot of the learned adapter gate. + + None when the store has no gate config and no diverged-gate events. + Reads `gate_config.json` for mode + adapter order, and the + `gate_events` table for per-adapter mean weight from the most + recent run that recorded a gate. + """ + import json as _json + + from dlm.train.gate.module import GateMetadata + from dlm.train.gate.paths import gate_config_path + + cfg_path = gate_config_path(store) + + events = _queries.latest_gate_events(store.root) + if not cfg_path.exists(): + if events and events[0].mode == "diverged": + return { + "mode": "diverged", + "adapter_names": [e.adapter_name for e in events], + "input_dim": None, + "hidden_proj_dim": None, + "last_run_id": events[0].run_id, + "per_adapter": [ + { + "adapter_name": e.adapter_name, + "mean_weight": e.mean_weight, + "sample_count": e.sample_count, + "mode": e.mode, + } + for e in events + ], + } + return None + + raw = _json.loads(cfg_path.read_text(encoding="utf-8")) + meta = GateMetadata.from_json(raw) + per_adapter: list[dict[str, object]] = [] + run_id: int | None = None + if events: + run_id = events[0].run_id + per_adapter = [ + { + "adapter_name": e.adapter_name, + "mean_weight": e.mean_weight, + "sample_count": e.sample_count, + "mode": e.mode, + } + for e in events + ] + else: + per_adapter = [{"adapter_name": name} for name in meta.adapter_names] + return { + "mode": meta.mode, + "adapter_names": list(meta.adapter_names), + "input_dim": meta.input_dim, + 
"hidden_proj_dim": meta.hidden_proj_dim, + "last_run_id": run_id, + "per_adapter": per_adapter, + } + + +def _summarize_preference_mining(store_root: Path) -> dict[str, object] | None: + """Latest preference-mine summary for the JSON contract.""" + totals = _queries.preference_mining_totals(store_root) + if totals is None: + return None + last = _queries.latest_preference_mining(store_root) + assert last is not None + rows = _queries.preference_mining_for_run(store_root, last.run_id) + return { + "run_count": totals.run_count, + "event_count": totals.event_count, + "total_mined_pairs": totals.total_mined_pairs, + "total_skipped_prompts": totals.total_skipped_prompts, + "last_run_id": last.run_id, + "last_run_event_count": len(rows), + "last_event": _queries.preference_mining_to_dict([last])[0], + } + + +def _summarize_base_security(base_model_key: str) -> dict[str, object] | None: + """Surface `trust_remote_code` flag from the base-model registry. + + Returns None when the key doesn't resolve (an `hf:...` escape hatch + that isn't in the registry); the caller silently skips in that case. + """ + from dlm.base_models import resolve as resolve_base_model + from dlm.base_models.errors import BaseModelError + + try: + spec = resolve_base_model(base_model_key, accept_license=True) + except BaseModelError: + return None + return { + "base_model": spec.key, + "architecture": spec.architecture, + "trust_remote_code": bool(spec.trust_remote_code), + } From ce2cab18c999746b59d053d165f463f79ec068a8 Mon Sep 17 00:00:00 2001 From: mfwolffe Date: Tue, 28 Apr 2026 20:08:09 -0400 Subject: [PATCH 31/46] Extract text-path prompt body into dlm.inference.dispatch:run_prompt Lifts build_backend + load + generate out of the CLI for text-only bases. VL and audio paths still live in prompt.py CLI helpers; a follow-up phase splits them into modality-aware dispatchers. 
--- src/dlm/cli/commands/prompt.py | 31 ++++++++------- src/dlm/inference/dispatch.py | 72 ++++++++++++++++++++++++++++++++++ 2 files changed, 88 insertions(+), 15 deletions(-) create mode 100644 src/dlm/inference/dispatch.py diff --git a/src/dlm/cli/commands/prompt.py b/src/dlm/cli/commands/prompt.py index 1f7501c..7dbbec0 100644 --- a/src/dlm/cli/commands/prompt.py +++ b/src/dlm/cli/commands/prompt.py @@ -96,11 +96,8 @@ def prompt_cmd( from dlm.doc.parser import parse_file from dlm.hardware import doctor from dlm.inference import AdapterNotFoundError - from dlm.inference.backends import ( - UnsupportedBackendError, - build_backend, - select_backend, - ) + from dlm.inference.backends import UnsupportedBackendError, select_backend + from dlm.inference.dispatch import PromptRequest, run_prompt from dlm.store.paths import for_dlm console = Console(stderr=True) @@ -234,30 +231,34 @@ def prompt_cmd( except UnsupportedBackendError as exc: console.print(f"[red]prompt:[/red] {exc}") raise typer.Exit(code=2) from exc - backend_obj = build_backend(backend_name, caps) if verbose: console.print(f"[dim]backend:[/dim] {backend_name}") - try: - backend_obj.load(spec, store, adapter_name=adapter) - except AdapterNotFoundError as exc: - console.print(f"[red]prompt:[/red] {exc}") - raise typer.Exit(code=1) from exc - if query is None: query = sys.stdin.read().strip() if not query: console.print("[red]prompt:[/red] empty query (pass a string or pipe on stdin)") raise typer.Exit(code=2) - response = backend_obj.generate( - query, + request = PromptRequest( + spec=spec, + capabilities=caps, + store=store, + backend_name=backend_name, + query=query, max_new_tokens=max_tokens, temperature=temp, top_p=top_p, + adapter=adapter, ) - sys.stdout.write(response + "\n") + try: + result = run_prompt(request) + except AdapterNotFoundError as exc: + console.print(f"[red]prompt:[/red] {exc}") + raise typer.Exit(code=1) from exc + + sys.stdout.write(result.response + "\n") def _dispatch_vl_prompt( 
# pragma: no cover diff --git a/src/dlm/inference/dispatch.py b/src/dlm/inference/dispatch.py new file mode 100644 index 0000000..14892e5 --- /dev/null +++ b/src/dlm/inference/dispatch.py @@ -0,0 +1,72 @@ +"""Domain dispatcher for `dlm prompt` (text path). + +Lifts the build-backend → load → generate pipeline out of the CLI for +text-only bases. Callers (CLI, LSP inline-preview, future automation) +build a `PromptRequest`, call `run_prompt`, and render the typed +`PromptResult`. The dispatcher does no console I/O nor stdin reads; +the CLI resolves the query string from argv or stdin before dispatch. + +Vision-language and audio-language paths still live in CLI helpers +(`_dispatch_vl_prompt`, `_dispatch_audio_prompt`); a follow-up phase +lifts those into modality-aware dispatchers under +`dlm.inference.dispatch_vl` / `dispatch_audio`. + +External-module imports are dotted (e.g. `from dlm.inference import +backends as _backends; _backends.build_backend(...)`) so test fixtures +that monkeypatch `dlm.inference.backends.` resolve at call time. +""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import TYPE_CHECKING + +from dlm.inference import backends as _backends +from dlm.inference.backends.select import BackendName + +if TYPE_CHECKING: + from dlm.base_models.schema import BaseModelSpec + from dlm.hardware.capabilities import Capabilities + from dlm.store.paths import StorePath + + +@dataclass(frozen=True) +class PromptRequest: + """Inputs to `run_prompt`. + + The CLI is responsible for selecting the backend kind (`auto` → + `pytorch` / `mlx`), license-checking the spec, and resolving the + query string from argv or stdin; the dispatcher receives all of + those as already-typed objects. 
+ """ + + spec: BaseModelSpec + capabilities: Capabilities + store: StorePath + backend_name: BackendName + query: str + max_new_tokens: int + temperature: float + top_p: float | None + adapter: str | None + + +@dataclass(frozen=True) +class PromptResult: + """Outcome of `run_prompt`. The CLI writes `response` to stdout.""" + + response: str + backend_name: BackendName + + +def run_prompt(req: PromptRequest) -> PromptResult: + """Build, load, and generate a single response for a text-only base.""" + backend_obj = _backends.build_backend(req.backend_name, req.capabilities) + backend_obj.load(req.spec, req.store, adapter_name=req.adapter) + response = backend_obj.generate( + req.query, + max_new_tokens=req.max_new_tokens, + temperature=req.temperature, + top_p=req.top_p, + ) + return PromptResult(response=response, backend_name=req.backend_name) From abbf3a65ac9c176a09ac8140068e2ed23c91fee8 Mon Sep 17 00:00:00 2001 From: mfwolffe Date: Tue, 28 Apr 2026 20:13:21 -0400 Subject: [PATCH 32/46] Extract train doctor + manifest-provision + run_phases into dlm.train.dispatch:run_train MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lifts the hardware probe → manifest bootstrap → phase orchestration sequence out of the CLI. Watch loop, RPC probe server, multi-GPU accelerate launcher dispatch, and license interactive prompt stay CLI-side. Dotted imports in the dispatcher keep tests' monkeypatches on dlm.hardware.doctor and dlm.train.preference.phase_orchestrator.run_phases visible at call time. 
--- src/dlm/cli/commands/train.py | 89 +++++++--------------- src/dlm/train/dispatch.py | 136 ++++++++++++++++++++++++++++++++++ 2 files changed, 164 insertions(+), 61 deletions(-) create mode 100644 src/dlm/train/dispatch.py diff --git a/src/dlm/cli/commands/train.py b/src/dlm/cli/commands/train.py index 3d73676..d56a329 100644 --- a/src/dlm/cli/commands/train.py +++ b/src/dlm/cli/commands/train.py @@ -274,7 +274,6 @@ def train_cmd( from dlm.base_models import resolve as resolve_base_model from dlm.doc.errors import DlmParseError from dlm.doc.parser import parse_file - from dlm.hardware import doctor from dlm.lock import LockMode, LockValidationError from dlm.store.paths import for_dlm from dlm.train import ( @@ -283,12 +282,17 @@ def train_cmd( ResumeIntegrityError, TrainingError, ) + from dlm.train.dispatch import ( + NoViableTrainingPlanError, + TrainRequest, + run_train, + ) from dlm.train.preference import ( DpoPhaseError, NoPreferenceContentError, PriorAdapterRequiredError, ) - from dlm.train.preference.phase_orchestrator import Phase, run_phases + from dlm.train.preference.phase_orchestrator import Phase console = Console(stderr=True) @@ -442,70 +446,30 @@ def train_cmd( from dlm.train.distributed import detect_world_size ws = detect_world_size() - doctor_result = doctor( - training_config=parsed.frontmatter.training, - base_params=spec.params, - seq_len=min(parsed.frontmatter.training.sequence_len, spec.effective_context_length), - world_size=ws, - ) - plan = doctor_result.plan - if plan is None: - console.print( - "[red]doctor:[/red] no viable training plan for this host. " - "Run `dlm doctor` for details." - ) - raise typer.Exit(code=1) - store = for_dlm(parsed.frontmatter.dlm_id) - store.ensure_layout() - - # `dlm init` writes a manifest as part of store provisioning. 
Mirror - # that manifest write here when the store layout exists but has no - # manifest yet — covers two flows: - # - auto-scaffold via `dlm train ` on a fresh directory - # - hand-authored .dlm with a fresh ULID that never went through - # `dlm init` (e.g. authored via the LSP / VSCode extension) - # License acceptance has already been validated upstream by this - # point, so we just record it. - if not store.manifest.exists(): - from dlm.base_models import is_gated - from dlm.base_models.license import require_acceptance - from dlm.store.manifest import Manifest, save_manifest - - acceptance = ( - require_acceptance(spec, accept_license=True, via="cli_flag") - if is_gated(spec) - else None - ) - save_manifest( - store.manifest, - Manifest( - dlm_id=parsed.frontmatter.dlm_id, - base_model=spec.key, - base_model_revision=spec.revision, - source_path=path.resolve(), - license_acceptance=acceptance, - ), - ) from dlm.modality import ModalityError + request = TrainRequest( + parsed=parsed, + target_path=path, + spec=spec, + store=store, + phase=phase_literal, + mode=mode, + seed=seed, + max_steps=max_steps, + lock_mode=lock_mode, + world_size=ws, + strict_metrics=strict_metrics, + include_auto_mined=not no_mined, + ) + try: - phase_results = run_phases( - store, - parsed, - spec, - plan, - phase=phase_literal, - mode=mode, - seed=seed, - max_steps=max_steps, - lock_mode=lock_mode, - capabilities=doctor_result.capabilities, - world_size=ws, - strict_metrics=strict_metrics, - include_auto_mined=not no_mined, - ) + train_result = run_train(request) + except NoViableTrainingPlanError as exc: + console.print(f"[red]doctor:[/red] {exc}") + raise typer.Exit(code=1) from exc except sqlite3.Error as exc: console.print(f"[red]metrics:[/red] {exc}") raise typer.Exit(code=1) from exc @@ -548,6 +512,9 @@ def train_cmd( console.print(f"[red]training:[/red] {exc}") raise typer.Exit(code=1) from exc + plan = train_result.plan + phase_results = train_result.phase_results + if not 
phase_results: console.print( "[yellow]no-op:[/yellow] nothing to train for the requested phase. " diff --git a/src/dlm/train/dispatch.py b/src/dlm/train/dispatch.py new file mode 100644 index 0000000..24a95aa --- /dev/null +++ b/src/dlm/train/dispatch.py @@ -0,0 +1,136 @@ +"""Domain dispatcher for `dlm train` (single-shot path). + +Lifts the doctor → manifest-provision → run_phases pipeline out of the +CLI. Callers (CLI, LSP "Run Training" command, future automation) build +a `TrainRequest`, call `run_train`, and render the typed `TrainResult`. +The dispatcher does no console I/O; CLI-shaped concerns — multi-GPU +launcher dispatch, license interactive prompt, --watch loop, RPC +probe server, terminal rendering — stay in `dlm.cli.commands.train`. + +External-module imports are dotted (e.g. `from dlm import hardware as +_hardware; _hardware.doctor(...)`) so test fixtures that monkeypatch +`dlm.hardware.doctor` and +`dlm.train.preference.phase_orchestrator.run_phases` resolve at call +time. +""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import TYPE_CHECKING + +from dlm import hardware as _hardware +from dlm.train.preference import phase_orchestrator as _orchestrator +from dlm.train.preference.phase_orchestrator import Phase, PhaseResult + +if TYPE_CHECKING: + from pathlib import Path + + from dlm.base_models.schema import BaseModelSpec + from dlm.doc.parser import ParsedDlm + from dlm.hardware.plan import TrainingPlan + from dlm.lock import LockMode + from dlm.store.paths import StorePath + from dlm.train.trainer import Mode + + +class NoViableTrainingPlanError(RuntimeError): + """`doctor()` returned no plan for the current host / config.""" + + +@dataclass(frozen=True) +class TrainRequest: + """Inputs to `run_train`. 
+ + The CLI is responsible for parsing the .dlm, resolving the spec + (with license acceptance), validating flags, and detecting the + accelerate world size; the dispatcher receives all of those as + already-typed objects. + """ + + parsed: ParsedDlm + target_path: Path + spec: BaseModelSpec + store: StorePath + phase: Phase + mode: Mode + seed: int | None + max_steps: int | None + lock_mode: LockMode + world_size: int + strict_metrics: bool + include_auto_mined: bool + + +@dataclass(frozen=True) +class TrainResult: + """Outcome of `run_train`. `phase_results` is empty when nothing + matched the requested phase (no SFT content, no preference content, + etc.); the CLI surfaces a "no-op" message.""" + + plan: TrainingPlan + phase_results: list[PhaseResult] + + +def run_train(req: TrainRequest) -> TrainResult: + """Probe hardware, ensure store manifest, run all requested phases.""" + doctor_result = _hardware.doctor( + training_config=req.parsed.frontmatter.training, + base_params=req.spec.params, + seq_len=min( + req.parsed.frontmatter.training.sequence_len, + req.spec.effective_context_length, + ), + world_size=req.world_size, + ) + plan = doctor_result.plan + if plan is None: + raise NoViableTrainingPlanError( + "no viable training plan for this host. Run `dlm doctor` for details." + ) + + req.store.ensure_layout() + + # `dlm init` writes a manifest as part of store provisioning. Mirror + # that here when the layout exists but the manifest doesn't — covers + # auto-scaffold via `dlm train ` and hand-authored .dlms with + # fresh ULIDs that never went through `dlm init` (e.g. authored via + # the LSP). License acceptance has already been validated upstream. 
+ if not req.store.manifest.exists(): + from dlm.base_models import is_gated + from dlm.base_models.license import require_acceptance + from dlm.store.manifest import Manifest, save_manifest + + acceptance = ( + require_acceptance(req.spec, accept_license=True, via="cli_flag") + if is_gated(req.spec) + else None + ) + save_manifest( + req.store.manifest, + Manifest( + dlm_id=req.parsed.frontmatter.dlm_id, + base_model=req.spec.key, + base_model_revision=req.spec.revision, + source_path=req.target_path.resolve(), + license_acceptance=acceptance, + ), + ) + + phase_results = _orchestrator.run_phases( + req.store, + req.parsed, + req.spec, + plan, + phase=req.phase, + mode=req.mode, + seed=req.seed, + max_steps=req.max_steps, + lock_mode=req.lock_mode, + capabilities=doctor_result.capabilities, + world_size=req.world_size, + strict_metrics=req.strict_metrics, + include_auto_mined=req.include_auto_mined, + ) + + return TrainResult(plan=plan, phase_results=phase_results) From d39aefbf4efc813a13f8d87afc5bc2ecc97766c5 Mon Sep 17 00:00:00 2001 From: mfwolffe Date: Thu, 30 Apr 2026 18:03:14 -0400 Subject: [PATCH 33/46] Move audit-13 follow-up findings into versioned docs tree --- docs/audits/13-brutal-promise-audit.md | 381 ++++++++ docs/audits/13-followup/PLAN.md | 77 ++ docs/audits/13-followup/finding03/baseline.md | 62 ++ .../finding03/direct-query-results.md | 78 ++ .../13-followup/finding03/probe-base.dlm | 17 + .../13-followup/finding03/sway-results.json | 856 +++++++++++++++++ .../13-followup/finding03/sway-results.md | 42 + docs/audits/13-followup/finding03/sway.yaml | 105 +++ docs/audits/13-followup/finding03/the-doc.dlm | 725 ++++++++++++++ .../finding04/direct-query-results.md | 143 +++ docs/audits/13-followup/finding04/the-doc.dlm | 415 +++++++++ .../13-followup/findings/00-overview.md | 112 +++ .../13-followup/findings/01-recipe-failure.md | 146 +++ .../13-followup/findings/02-cpt-isolation.md | 143 +++ .../13-followup/findings/03-base-floor.md | 140 +++ 
.../findings/04-instruction-shape.md | 156 ++++ docs/audits/13-followup/stage1/sway.yaml | 101 ++ docs/audits/13-followup/stage1/the-doc.dlm | 47 + docs/audits/13-followup/sway-results.json | 881 ++++++++++++++++++ docs/audits/13-followup/sway-results.md | 41 + docs/audits/13-followup/sway.yaml | 105 +++ docs/audits/13-followup/the-doc.dlm | 725 ++++++++++++++ 22 files changed, 5498 insertions(+) create mode 100644 docs/audits/13-brutal-promise-audit.md create mode 100644 docs/audits/13-followup/PLAN.md create mode 100644 docs/audits/13-followup/finding03/baseline.md create mode 100644 docs/audits/13-followup/finding03/direct-query-results.md create mode 100644 docs/audits/13-followup/finding03/probe-base.dlm create mode 100644 docs/audits/13-followup/finding03/sway-results.json create mode 100644 docs/audits/13-followup/finding03/sway-results.md create mode 100644 docs/audits/13-followup/finding03/sway.yaml create mode 100644 docs/audits/13-followup/finding03/the-doc.dlm create mode 100644 docs/audits/13-followup/finding04/direct-query-results.md create mode 100644 docs/audits/13-followup/finding04/the-doc.dlm create mode 100644 docs/audits/13-followup/findings/00-overview.md create mode 100644 docs/audits/13-followup/findings/01-recipe-failure.md create mode 100644 docs/audits/13-followup/findings/02-cpt-isolation.md create mode 100644 docs/audits/13-followup/findings/03-base-floor.md create mode 100644 docs/audits/13-followup/findings/04-instruction-shape.md create mode 100644 docs/audits/13-followup/stage1/sway.yaml create mode 100644 docs/audits/13-followup/stage1/the-doc.dlm create mode 100644 docs/audits/13-followup/sway-results.json create mode 100644 docs/audits/13-followup/sway-results.md create mode 100644 docs/audits/13-followup/sway.yaml create mode 100644 docs/audits/13-followup/the-doc.dlm diff --git a/docs/audits/13-brutal-promise-audit.md b/docs/audits/13-brutal-promise-audit.md new file mode 100644 index 0000000..fa0ae0b --- /dev/null +++ 
b/docs/audits/13-brutal-promise-audit.md @@ -0,0 +1,381 @@ +# Audit 13 — Brutal end-to-end audit of the broader DLM promise + +**Scope:** the marketing claim from `CLAUDE.md` — *"a text file with a `.dlm` +extension becomes a local, reproducible, trainable LLM. Edit the document, +retrain, share. Not a toy — LoRA/QLoRA on a real pretrained base, exportable +to Ollama."* Verified by running, not reading. + +**Date:** 2026-04-29. +**Hardware:** Apple Silicon (Darwin 25.4.0 arm64, 18 cores, 48 GB unified +memory, MPS, no CUDA, no bitsandbytes, no FlashAttention). Doctor reports +`backend=mps`, `determinism_class=best-effort`. +**Wall-time actual:** ~33 minutes of in-band runs (23:36 → ~00:09 EDT) +within the 90-min budget. Heaviest single op: 11.4-min fortran train at +120 steps. +**Method:** B12.1 reproduction, full E2E-A through E2E-E from the prompt, +with sway as the measurement instrument. +**Artifacts:** `docs/audits/13-findings/{logs,fortran-fine-tune,sway-runs,exports}/`. + +## Verdict + +**YELLOW.** The promise is **mostly delivered, with named caveats**: + +- **Trainable** — yes. Real `train` against SmolLM2-135M-Instruct on a + 2,021-file fortran corpus completed in 11.4 min on MPS, monotonic loss + drop 1.96 → 1.69, eval_loss 1.83. Adapter, training_state.pt with + RNGs, pinned_versions.json, metrics.sqlite all written. +- **Exportable to Ollama** — yes, end-to-end. GGUF base + adapter produced + in ~20 s, registered with `ollama create`, smoke prompt produced + coherent text. **But** `--no-template` is silently ignored at emission + (M13.1). +- **Reproducible** — **partially**. Sway re-run is bit-exact deterministic; + `dlm train --fresh --seed 42` re-run on the same tiny doc yields + **different adapter SHAs** (`1afc...` vs `bb1f...`) on MPS. + `training_state.rng.json` is bit-identical across runs, so the drift + is in MPS-kernel ops, not RNG state. 
Doctor says + `determinism_class: best-effort` — the docs are honest, but + CLAUDE.md's "reproducible" headline is aspirational on Apple Silicon. +- **"Not a toy"** — **partially**. The 135M adapter measurably shifted + the model on fortran-keyword prompts (sway delta_kl `+44.18σ` vs null + baseline) but **did not internalize** the hand-written Q/A pairs + (sway paraphrase_invariance `-3.51σ`, FAIL). Confirmed at runtime: the + Ollama smoke output for "When should I use do concurrent?" produced a + Python code block, not Fortran. The base 135M floor is too thin for + the Q/A semantic-binding promise on a one-pass, 120-step run. + Distribution-shift signal is real; transfer to question-answering is + not. +- **Share** — yes. `dlm pack` → `dlm verify` → `DLM_HOME=… dlm unpack` + → `dlm prompt` round-tripped a 15 MB bundle cleanly. **But** + CLAUDE.md documents `dlm unpack PATH --home DIR`; the actual flag is + `DLM_HOME=…` env-var only (M13.2). +- **"Edit the document, retrain"** — yes mechanically, but at ~3.5 s/step + on MPS with 60-step eval pauses adding ~30s, the iteration loop is + 6-12 min for a 120-step run. Workable, not delightful. + +The strongest evidence in this audit is **sway**: with the null_adapter +calibration probe in the suite, sway gives 4-decimal scores, z-scores, +and a single-line verdict that resolves "did training do anything?" +honestly. Sway re-runs are bit-identical. The audit is much stronger +as a result; without sway it would have been "the smoke output looked +fortran-ish" hand-waving. + +The B12.1 blocker from Audit 12 is **fixed**. No new blockers found. + +## What's fixed since Audit 12 + +- **B12.1 manifest race resolved.** Same fortran corpus + same source + directives + fresh ULID → trained to completion. The audit-12 store + (`01KQAR00VP2KAKVGCB7W4XRRZY`) now has a `manifest.json` and the + failing path no longer reproduces. 
+- **M12.1 cli/commands.py extracted.** The 4,650-LOC monolith from + Audit 12 is gone; `src/dlm/cli/commands/` is now a 23-file package, + largest file 698 LOC (`train.py`), total 4,449 LOC. Restructured. + +## Blocker findings + +**None on the broader promise.** Every CLAUDE.md headline claim has a +working code path on a fresh box. Reservations are filed below as +majors and minors. + +## Major findings + +### M13.1 — `dlm export --no-template` is a no-op at Modelfile emission + +**Live caught.** `dlm export … --no-template` suppresses the preflight +chat-template check, but the Ollama Modelfile's `TEMPLATE """…"""` +block is still written. + +- Help text (`src/dlm/cli/commands/export.py:41`): "Skip writing + TEMPLATE into the Modelfile." +- Plan path: `cli_no_template` is forwarded into + `ExportPlan.include_template = not cli_no_template` + (`src/dlm/export/plan.py:141`). +- **Only one consumer** of `plan.include_template` exists in `src/`: + `src/dlm/export/runner.py:223`, where it gates the preflight check. +- `src/dlm/export/ollama/modelfile.py:101,124` calls + `_build_template_block(template_row)` and appends it to the Modelfile + unconditionally — `plan.include_template` is never consulted by the + emission code. +- **Evidence** (`docs/audits/13-findings/logs/export-2-no-template.log` + + the resulting Modelfile in + `~/.dlm/store/01KQBN26S8RY8MGHE9HM09E8CM/exports/Q4_K_M/Modelfile`): + ``` + FROM ./base.Q4_K_M.gguf + ADAPTER ./adapter.gguf + + TEMPLATE """{{- if .System }}<|im_start|>system + … + ``` + The block is present even though `--no-template` was the only + template-affecting flag. + +**Severity:** MAJOR — the help text lies. Users counting on this flag to +test "what does Ollama do without a TEMPLATE override?" get the wrong +answer. + +**Fix shape:** in `_build_template_block` consumers, check +`ctx.plan.include_template` and emit nothing (or a comment) when False. 
+ +### M13.2 — `dlm unpack --home DIR` is documented but not implemented + +CLAUDE.md "CLI surface by release" lists: +``` +dlm unpack [--home DIR] [--force] +``` + +`uv run dlm unpack --help` shows: +``` +--force Overwrite an existing store with the same dlm_id. +--out Directory to place the restored .dlm (default: alongside the pack). +``` + +Passing `--home DIR` aborts with `No such option: --home`. The actual +override is via env var (`DLM_HOME=… dlm unpack ...` works). Either: + +- Add a `--home` flag matching the documented surface, OR +- Update CLAUDE.md to drop `--home` (and any other commands documented + with it). The same pattern almost certainly applies elsewhere — a + doc/CLI sync sweep is the right scope. + +**Severity:** MAJOR for documentation-versus-binary mismatch on a +top-line CLI claim. + +### M13.3 — `dlm metrics PATH --run-id 1` parse breaks (option-after-positional) + +``` +$ dlm metrics PATH --run-id 1 +… error: Missing argument 'PATH'. +$ dlm metrics --run-id 1 PATH +run_id=1 phase=sft seed=42 status=ok steps=12 evals=4 +``` + +Typer's group dispatch ate the positional. Workaround is "options +before positional," but every CLI user expects the opposite. Either: + +- Restructure as a subcommand-group: `dlm metrics show PATH --run-id 1` +- Or pass `path` to the group's callback so subcommand parsing sees a + consumed positional first. + +**Severity:** MAJOR for UX paper-cut on a documented v2 surface +command. Trivially fixable. + +### M13.4 — "Reproducible" is aspirational on MPS + +`dlm train --fresh --seed 42 --max-steps 8` on a tiny 3-Q/A doc, run +twice in succession on the same hardware, in the same shell: + +| Run | adapter_model.safetensors SHA | +|---|---| +| v0001 | `1afcd3f524e62dd17b87bf7059f698ab87882986e0397a353d5e7f3b358837e0` | +| v0002 | `bb1f67dbc19b7ebdba1910477f8d5cb23d4e5442092fada06f5d2817a3a137d6` | + +Different bits. 
`training_state.rng.json` was bit-identical between +runs, so RNG-seed plumbing is correct — the drift is downstream of +RNGs, in MPS kernel ops. `dlm doctor --json` already reports +`determinism_class: best-effort` and the per-store `dlm.lock` +acknowledges this. So the implementation doesn't lie; **CLAUDE.md +does**, by stating "Edit the document, retrain, share" and labeling +the project "reproducible" without a platform caveat. On Linux+CUDA +with `CUBLAS_WORKSPACE_CONFIG=:4096:8` this is presumably bit-exact +(not verified in this audit); on macOS+MPS it isn't. + +Also captured (m13.10 below): `--fresh` does **not** wipe and reset the +adapter version counter. v0001 from run 1 stayed; run 2 wrote v0002. +Tangential to determinism but related — interpreting `--fresh` as +"start from base weights for *this* run" is sensible, but a user might +reasonably expect "throw away prior versions and start over." + +### M13.5 — `dlm show` re-expands directives on every invocation, on stderr + +``` +$ dlm show fortran.dlm --json 2>/dev/null | head +{ … clean JSON … } +$ dlm show fortran.dlm --json 2>&1 >/dev/null | wc -l +243 +``` + +For a 2,021-file source-directive document, `dlm show --json` re-walks +the corpus and prints 243 lines of `dlm.directives.expand INFO: +directive: … exceeds max_bytes_per_file=32768; skipping` to stderr, +**every time**. `dlm show` is a read-only command users will run +repeatedly. The expansion result *should* be cached, and the INFO logs +should be silenced unless `--verbose`. + +Mostly cosmetic until you script `dlm show` in a loop, at which point +it becomes a 1-2 second hit per call. + +## Minor / informational findings + +- **m13.6 — Stale brew install masks current binary.** `which dlm` → + `/opt/homebrew/bin/dlm` → `dlm 0.9.0`, missing `repl, metrics, + templates, push, pull, serve, verify, preference, synth, cache` + (every command added since 0.9.0). The repo HEAD is `0.10.0` with + the full surface. 
The release script either didn't bump the brew + formula, or the user hasn't `brew upgrade`d. Symptom: a fresh + contributor pasting commands from CLAUDE.md against the brew binary + hits "No such command 'repl'." Either bump the brew formula on every + `pyproject.toml` version bump (CI gate), or document `uv run dlm` as + the canonical entry point in the repo's README. +- **m13.7 — ANSI escape spam from Ollama on non-TTY pipe.** Piping + `ollama run … | tee` produces output salted with `[?25l[?2026h` + cursor-control sequences (see `logs/ollama-fortran-prompt.log`). Not + a dlm bug — Ollama 0.20.7 doesn't suppress its TTY UI under a pipe. + Listed because it surfaces as garbled-looking dlm-export smoke + output if anyone scripts the round-trip and doesn't `sed` it out. +- **m13.8 — Sway's `section_internalization` and `leakage` probes need + the dlm bridge to be useful.** Both probes opt-out / SKIP without + `ctx.sections` populated by the dlm-sway bridge. The bridge requires + `pip install 'dlm-sway[dlm]'` plus the local DLM checkout, which I + intentionally didn't install in the audit's sway venv (no clean way + to do so without giving sway a typosquat-friendly PyPI install + permission). On a fresh user's machine the bridge install is one + pip line; it's worth shipping a starter spec that activates the + bridge so users see attribution probes light up first time. +- **m13.9 — `dlm pack` produces unsigned bundles by default.** `dlm + verify` correctly reports "is unsigned — no provenance.json + inside." Signing is opt-in. Reasonable default; flagged because the + Phase 7 promise of "share" is partly trust-signed sharing. A user + expecting "I can verify this came from someone trustworthy" + mid-distribution will discover the answer is "no, unless they + signed it" only after running verify. +- **m13.10 — `--fresh` keeps prior adapter versions.** Two consecutive + `dlm train --fresh ...` runs on the same store produced v0001 and + v0002 side-by-side. 
If `--fresh` semantics are "fresh weights, but + keep history," document it. If they're "wipe and start over," fix + the version-counter reset. + +## What works (earned praise) + +- **Source-directive expansion at scale**: 2,021 .f90/.fypp files + across 2 sources expanded in ~1.5 s, with sensible + `max_bytes_per_file` skipping (62 oversized + 1 non-UTF-8) logged + per-file. Tokenization cache populated 2,030 entries (3.1 MB). +- **Manifest contract honored.** After completion, the store has + `manifest.json`, `adapter/versions/v0001/{adapter_config.json, + adapter_model.safetensors, training_state.pt, training_state.rng.json, + pinned_versions.json}`, `metrics.sqlite`, `replay/`, + `tokenized-cache/`. CLAUDE.md pitfall #2 (training_state sidecar) + observably honored. +- **Strict-mode parsing.** `dlm show` and `dlm train` on a malformed + `.dlm` (bad ULID, wrong type, unknown key) return a single + composed Pydantic error with line numbers and a one-shot diagnosis: + ``` + error: /tmp/audit-e2e-e/malformed.dlm:2: dlm_id: Value error, dlm_id + must be a 26-char Crockford base32 ULID, got 'not-a-ulid-just-a-string'; + training.lora_r: Input should be a valid integer, unable to parse + string as an integer; training.unknown_key: Extra inputs are not + permitted + ``` +- **Export pipeline produces the documented artifacts.** Within ~20 s + on MPS: `base.Q4_K_M.gguf` (105 MB), `adapter.gguf` (1.8 MB), + `imatrix.gguf` (631 KB), `imatrix.meta.json`, + `export_manifest.json` with sha256 + llama_cpp_tag (`b8816`). +- **Ollama integration end-to-end.** `dlm export … --name dlm-fortran-audit13` + registered `dlm-fortran-audit13:latest` (107 MB). `ollama run` produced + coherent (though not fortran-flavored) output. The Modelfile's + `TEMPLATE` block is the registry-authored Go template, not a Jinja + fuzz-match (CLAUDE.md pitfall #1). 
+- **`dlm pack` / `dlm unpack` round-trip.** 15 MB bundle, restored + cleanly into a fresh `DLM_HOME`, `dlm prompt` ran against the + restored store immediately. +- **Per-store metrics.sqlite is real.** Tables `runs, steps, evals, + exports, tokenization, gate_events, preference_mining`. Step-loss + curve queryable via plain SQL — see + `logs/step-loss.txt` for the audit's own run. +- **Sway is sharper than the docs claim.** Wall 3.2 s on a 4-probe + + null_adapter suite, perfectly deterministic across re-runs, + composite verdict + per-category scores + per-probe z-scores. The + audit's central evidence — *the adapter shifted distributions on + fortran prompts but did not bind Q/A pairs* — is z-scored with + CI95. + +## Promise audit table + +| CLAUDE.md headline claim | Verdict | Evidence pointer | +|---|---|---| +| "A text file with a .dlm extension becomes a local … LLM" | **PASS** | `~/.dlm/store/01KQBN26S8RY8MGHE9HM09E8CM/{adapter,manifest.json,exports/Q4_K_M/}` after a 6.8 KB `.dlm` | +| "trainable LLM" (LoRA) | **PASS** | `logs/train-1.log` — 120 steps, train_loss 1.771, eval_loss 1.826; `metrics.sqlite` has the curve | +| "trainable LLM" (QLoRA) | **UNVERIFIED** | bitsandbytes is not available on Apple Silicon; no QLoRA path attempted | +| "real pretrained base" | **PASS** | `adapter_config.json:base_model_name_or_path = "HuggingFaceTB/SmolLM2-135M-Instruct"`; revision pinned in manifest | +| "exportable to Ollama" | **PASS** | `ollama list` shows `dlm-fortran-audit13:latest`; `ollama run` returned coherent text | +| "reproducible" | **PARTIAL** | sway: bit-exact reruns. dlm train on MPS: SHA drift between v0001 and v0002 with same seed (`logs/det-12-shas.txt`). Doctor honestly reports `best-effort` | +| "Edit the document, retrain" (workflow) | **PASS-with-caveat** | 11.4-min wall for 120 steps. 
`--watch` flag exists; not exercised this audit | +| "share" (pack/unpack) | **PASS** | `logs/pack.log` + `logs/unpack-2.log` + `logs/unpack-prompt.log` round-trip | +| "share" (push/pull/serve to HF/HTTP/LAN) | **UNVERIFIED** | not exercised this audit | +| "Not a toy" — the 135M model demonstrably learns a domain | **PARTIAL/FAIL** | sway: `dk_fortran` z=+44.18σ (distribution shift, real); `para_fortran` z=-3.51σ (Q/A binding **failed**). Adapter shifted token distributions on fortran-keyword prompts but did not internalize Q/A semantics in 120 steps. Visible at runtime: ollama smoke produced Python, not Fortran | +| "Pad token must NOT default to EOS" (CLAUDE.md pitfall #4) | **PASS** | tokenizer log says `pad_token_id: 0`, distinct from EOS | +| "training_state.pt sidecar" (pitfall #2) | **PASS** | `versions/v0001/training_state.pt` (7.5 MB) + `training_state.pt.sha256` + `training_state.rng.json` (16 KB) | +| "merge_and_unload on QLoRA refused without --dequantize" (pitfall #3) | **PASS-by-code-read** | `src/dlm/export/plan.py:102` and `src/dlm/export/merge.py` enforce; not live-tested (no QLoRA on MPS) | +| `dlm doctor --json` reports plan + capabilities | **PASS** | `logs/doctor-uv.json` | +| All v1.0 CLI commands present | **PASS** (binary `0.10.0`) / **FAIL** (brew `0.9.0`) | `dlm 0.10.0 --help` lists all v1.0 + v2 commands; `/opt/homebrew/bin/dlm` is 0.9.0 and missing `repl, metrics, templates, push, pull, serve, verify, preference, synth, cache` | +| `dlm unpack --home DIR` documented flag | **FAIL** | `--home` is not a flag; `DLM_HOME=` env var is the actual override (M13.2) | +| `dlm export --no-template` does what it says | **FAIL** | preflight is suppressed, emission is not; M13.1 | + +## Methodology notes + +Order of operations: + +1. Read `docs/audits/12-brutal-post-audit-11.md`. Confirmed B12.1 is + the open blocker to verify. +2. 
`dlm doctor --json` and `dlm --help` via both `which dlm` + (`/opt/homebrew/bin/dlm` 0.9.0) and `uv run dlm` (repo HEAD 0.10.0). + Discovered the brew/repo skew immediately — every subsequent + command used `uv run dlm`. +3. `dlm init` to scaffold a fresh `.dlm`, then overwrote with the + audit-12 fortran source-directive frontmatter, refreshed to schema + v15 and a fresh ULID. Two source directives: + `~/GithubOrgs/FortranGoingOnForty` (1,847 .f90/.F90/.f95) and + `/tmp/stdlib_build/src` (174 .f90/.fypp). Total post-skip: 2,021 + files, 7.97 MB content. +4. `dlm train --fresh --seed 42 --max-steps 120`. **B12.1 did not + reproduce** — manifest written, training proceeded. 11.4 min wall. + Loss monotonic 1.927 → 1.886 → 1.805 → 1.689 → 1.638 → 1.607 + over six log points. Eval at steps 60/90/120; final + eval_loss=1.826. +5. Captured adapter SHA snapshot. Authored `sway.yaml` with + `delta_kl + paraphrase_invariance + calibration_drift + leakage`, + first-pass schema mismatch on `section_internalization` (audit-author + error), reauthored, ran. First sway run had no calibration → re-ran + with `null_adapter` probe added. +6. `dlm export ... --quant Q4_K_M` (with `--no-template` after + preflight refused without it — see M13.1). Inspected emitted + artifacts. `--name dlm-fortran-audit13` registered into Ollama. + `ollama run` smoke prompt. +7. Determinism: tiny 3-Q/A doc with a fresh ULID, `dlm train --fresh + --seed 42 --max-steps 8` twice. Compared + `adapter_model.safetensors` SHAs and + `training_state.rng.json` byte-diff. Re-ran sway against the + spec twice; compared per-probe `(verdict, score, raw, z)` tuples. +8. Failure-mode hunting: `dlm prompt` on never-trained doc; `dlm + train` on malformed frontmatter; `dlm export --merged` on plain + LoRA; 200 KB inflated `.dlm` parses; `dlm metrics PATH --run-id + 1` (broke); `dlm unpack --home` (missing flag). +9. `dlm pack` / `dlm verify` / `dlm unpack` (with `DLM_HOME=`) round- + trip and prompt-on-restored-store. 
+ +Cleanup: `ollama rm dlm-fortran-audit13`. + +The single biggest methodology learning: **if the audit had skipped +the `null_adapter` probe in the sway suite, every probe would have +shown a verdict but no z-score, and the "did training do anything?" +question would have been a vibes call**. Sway with calibration is +properly falsifiable; sway without it is just a per-probe score +table. The dlm cookbook's starter sway specs should default-include +`null_adapter`. + +## Summary in one paragraph + +The promise is real for "a `.dlm` file becomes a trainable, exportable, +shareable LoRA on top of a real HF base." The 11.4-min train, the +working Ollama round-trip, the clean pack/unpack, the strict Pydantic +parsing, and sway's z-scored verdict on a real corpus all back this +up. The promise is **overstated** in two places: "reproducible" needs +a platform caveat (best-effort on MPS, verified non-bit-exact even +with the same seed), and "Not a toy" understates how much the 135M +floor needs help — distribution shift is real, Q/A internalization +is not, on a single 120-step pass. The two help-text bugs (M13.1 +`--no-template` no-op, M13.2 `unpack --home` missing) and the doc +drift (`/opt/homebrew/bin/dlm` is 0.9.0) are minor in code but +material in trust: a careful user reading CLAUDE.md and running the +brew binary will hit "No such command 'repl'." within 30 seconds. +None of this is a blocker; all of it is a documentation/UX sweep. diff --git a/docs/audits/13-followup/PLAN.md b/docs/audits/13-followup/PLAN.md new file mode 100644 index 0000000..6c7f0be --- /dev/null +++ b/docs/audits/13-followup/PLAN.md @@ -0,0 +1,77 @@ +# Audit 13 follow-up — get fortran Q/A binding to work + +**Goal:** flip `paraphrase_invariance` from **−3.51σ FAIL** to **PASS** and +get the Ollama smoke prompt to answer a fortran question with fortran (not +Python).
Driven by Audit 13's diagnosis: the original .dlm had ~8 MB PROSE +source vs only **8** INSTRUCTION sections — too little Q/A signal for +SmolLM2-135M to bind question→answer semantics in 120 steps. + +## Hypothesis + +Q/A binding failed because of **corpus shape**, not corpus volume or model +floor. Three corrections: + +1. **Add INSTRUCTION density.** Hand-author 50+ high-quality Q/A pairs + sourced from the FORD-generated stdlib markdown specs at + `/tmp/stdlib_build/doc/specs/` (40 expert-written module docs). Plus + `dlm synth instructions --strategy extraction --apply` to harvest + more Q/A from the PROSE. +2. **Add a third PROSE source.** Pull `/tmp/stdlib_build/doc/specs/**/*.md` + in as source-directive content. The markdown files are "what does X do?" + shaped — exactly the register the model is missing. +3. **More steps.** 500-1000 instead of 120. Audit 13's loss curve was + still descending at step 120 (1.927 → 1.607 over six log points). + +If `paraphrase_invariance` still fails after this, the floor is real and we +need to repeat at SmolLM2-360M and 1.7B as a tier comparison. + +## Sway spec for the re-run + +Will require `pip install 'dlm-sway[hf,dlm]'` in the audit venv so the +bridge lights up `section_internalization`, `leakage`, and bridge-aware +`paraphrase_invariance` (Audit 13 m13.8).
+ +```yaml +version: 1 +dlm_source: ./the-doc.dlm +models: + base: { kind: hf, base: "HuggingFaceTB/SmolLM2-135M-Instruct" } + ft: { kind: hf, base: "HuggingFaceTB/SmolLM2-135M-Instruct", + adapter: "~/.dlm/store/<DLM_ID>/adapter/versions/v0001" } +defaults: + seed: 0 + differential: true + coverage_threshold: 0.6 + score_weights: { adherence: 0.30, attribution: 0.35, + calibration: 0.20, ablation: 0.15 } +suite: + - { name: null_baseline, kind: null_adapter, prompts_from: sections/instruction } + - { name: dk_fortran, kind: delta_kl, prompts_from: sections/instruction } + - { name: para_fortran, kind: paraphrase_invariance, + prompts_from: sections/instruction, + assert: { generalization_ratio_gte: 0.5 } } + - { name: sis_fortran, kind: section_internalization } # bridge probe + - { name: leak_fortran, kind: leakage, # bridge probe + assert: { fragility_gte: 0.4 } } + - { name: cal_general, kind: calibration_drift, + assert: { regression_rate_lt: 0.15 } } + - { name: abl_fortran, kind: adapter_ablation, + lambdas: [0.0, 0.25, 0.5, 0.75, 1.0, 1.25], + assert: { r_squared_gte: 0.85 } } +``` + +## Success bar + +- `paraphrase_invariance` z-score ≥ +1.0 (PASS), generalization_ratio ≥ 0.5. +- `section_internalization` ≥ +1.0σ on at least 5 of the new INSTRUCTION + sections (bridge probe — verifies attribution). +- `leakage` fragility ≥ 0.4 (sanity: pattern-match, not memorization). +- `delta_kl` z-score ≥ +1.0 on a held-out fortran prompt set (the audit-13 + +44σ was huge; we just need real signal, not regression). +- `adapter_ablation` R² ≥ 0.85 on the λ-scaled curve (healthy fine-tune). +- Ollama smoke: "When should I use `do concurrent`?" → fortran answer with + `do concurrent` syntax visible. + +If any of these miss, the next iteration is a model-floor sweep at +SmolLM2-360M and 1.7B with the same recipe. If they all pass at 135M, the +recipe is the dlm cookbook starter.
diff --git a/docs/audits/13-followup/finding03/baseline.md b/docs/audits/13-followup/finding03/baseline.md new file mode 100644 index 0000000..573d812 --- /dev/null +++ b/docs/audits/13-followup/finding03/baseline.md @@ -0,0 +1,62 @@ +# Finding 03 — base capability snapshot (pre-training) + +`Qwen/Qwen2.5-Coder-1.5B-Instruct`, fp16, MPS, greedy decoding, no +adapter. + +## Probe set + +| # | Question | Verdict | Notes | +| --- | --- | --- | --- | +| 1 | How do you declare an allocatable array of `real(real64)` in modern Fortran? | **correct** | Idiomatic syntax, complete `program` block. | +| 2 | When should you prefer `do concurrent` over plain `do`? | partial | Mentions performance/parallelism but vague on loop-carried-dependence semantics. | +| 3 | Show the signature of stdlib's sorting routine in Fortran. | **WRONG** | Claims Fortran has no built-in sort, invents `iso_fortran_env::sort` which does not exist. | +| 4 | What does `intent(in)` mean for a dummy argument? | **correct** | Accurate explanation + example. | + +## Baseline character + +The base already speaks fortran fluently — syntax, modern features +(`do concurrent`, `intent`, `allocatable`), `program` structure. What +it does **not** know is **`fortran-lang/stdlib`** — the third-party +modular standard library at `/tmp/stdlib_build/src`. Q3 demonstrates +this directly: rather than pointing to `stdlib_sorting` or +`stdlib_sorting::sort`, the model fabricates a non-existent +`iso_fortran_env::sort`. This is exactly the kind of specific-corpus +knowledge a LoRA should be able to add. + +## What this means for the experiment + +The finding-03 hypothesis is now *narrow and falsifiable*: + +> Training a LoRA on the audit-13-followup corpus (fortran source +> files + stdlib source + stdlib doc/specs markdown) should produce +> a measurable correction on stdlib-specific questions, leaving +> general-fortran questions unchanged. 
+ +If post-training Q3 produces a correct answer pointing at +`stdlib_sorting`, the LoRA added real domain knowledge above the base +capability — and dlm's product story stands on more than just style +adaptation. + +If post-training Q3 still hallucinates and Q1/Q2/Q4 *degrade*, the +LoRA is parasitic at this base — same memorization-without- +generalization pattern as Findings 01 and 02, just at a more capable +base. + +## Sample base outputs (verbatim, truncated) + +``` +Q1: How do you declare an allocatable array of real(real64) in modern Fortran? +> In modern Fortran, you can declare an allocatable array of real numbers +> using the `ALLOCATABLE` keyword. ... +> real(real64), allocatable :: my_array(:) +> allocate(my_array(5)) +``` + +``` +Q3: Show the signature of stdlib's sorting routine in Fortran. +> The standard library in Fortran does not have a built-in sorting +> routine like C or other languages. However, you can use the `sort` +> function from the `iso_fortran_env` module to sort arrays. +``` + +(Q3 is the wedge.) diff --git a/docs/audits/13-followup/finding03/direct-query-results.md b/docs/audits/13-followup/finding03/direct-query-results.md new file mode 100644 index 0000000..0aad7f7 --- /dev/null +++ b/docs/audits/13-followup/finding03/direct-query-results.md @@ -0,0 +1,78 @@ +# Finding 03 — direct-query smoke results + +Greedy decoding (`temp=0.0`, `max-tokens=150-200`). + +## Q1 — allocatable real(real64) array + +``` +$ dlm prompt finding03/the-doc.dlm "How do you declare an allocatable array of real(real64)..." +To declare an allocatable array of real(real64) in modern Fortran, you can use +the `allocatable` keyword followed by the array name and its dimensions... +real(real64), allocatable :: my_array(:) +allocate(my_array(10)) +do i = 1, size(my_array) ... +deallocate(my_array) +``` + +Verdict: **correct, ≈ same as base.** + +## Q2 — do concurrent vs do + +``` +$ dlm prompt finding03/the-doc.dlm "When should you prefer \`do concurrent\`..." 
+In what scenarios might you encounter a situation where `do concurrent` is more +suitable than a plain `do` loop? Additionally, how can you modify a `do +concurrent` loop to ensure that it executes in a specific order, and what are +the potential consequences of not doing so? + +```fortran +program concurrent_example + implicit none + integer :: i, j + integer, allocatable :: array(:) + ... + ! Use a plain do loop + do i = 1, 10 + do j = +``` + +Verdict: **worse than base.** Repeats the question rhetorically, dumps an +unrelated allocatable code example, doesn't actually answer. + +## Q3 — stdlib sorting (THE WEDGE) + +``` +$ dlm prompt finding03/the-doc.dlm "Show the signature of stdlib's sorting routine in Fortran." +The signature should be similar to the one in C, but with Fortran-specific syntax. +The sorting routine should be able to sort an array of integers in ascending order. +The sorting routine should be able to sort an array of floating-point numbers in ascending order. +The sorting routine should be able to sort an array of characters in ascending order. +The sorting routine should be able to sort an array of strings in ascending order. +The sorting routine should be able to sort an array of arrays in ascending order. +... (degenerate enumeration loop, never mentions stdlib_sorting) +``` + +Verdict: **worse than base.** Base hallucinated `iso_fortran_env::sort` +(concrete but wrong). Trained adapter degenerates into a "should be able to +sort..." enumeration loop. The wedge gap was *not* closed; it widened. + +## Q4 — intent(in) + +``` +$ dlm prompt finding03/the-doc.dlm "What does \`intent(in)\` mean..." +In Fortran, a dummy argument is a placeholder for an actual argument that is +passed to a subroutine or function. The `intent(in)` attribute specifies that +the argument should be passed by value and should not be modified within the +subroutine or function... 
+subroutine example_subroutine(input_value) + implicit none + integer, intent(in) :: input_value +``` + +Verdict: **correct, ≈ same as base.** + +## Net wedge score + +**0/4 questions improved.** 2/4 actively degraded (Q2, Q3). +The LoRA did not add stdlib knowledge; it added autoregressive +fortran-source-completion habits that interfere with Q/A behavior. diff --git a/docs/audits/13-followup/finding03/probe-base.dlm b/docs/audits/13-followup/finding03/probe-base.dlm new file mode 100644 index 0000000..e5325c5 --- /dev/null +++ b/docs/audits/13-followup/finding03/probe-base.dlm @@ -0,0 +1,17 @@ +--- +dlm_id: 01KQDKYZK4Y33K70BXPME3BY1F +dlm_version: 1 +base_model: qwen2.5-coder-1.5b +--- + +# Your document title + +Write prose here. It will train via continued pretraining (CPT) loss. + +::instruction:: + +### Q +Your example question. + +### A +Your example answer. diff --git a/docs/audits/13-followup/finding03/sway-results.json b/docs/audits/13-followup/finding03/sway-results.json new file mode 100644 index 0000000..0757991 --- /dev/null +++ b/docs/audits/13-followup/finding03/sway-results.json @@ -0,0 +1,856 @@ +{ + "adapter_id": "/Users/mfwolffe/.dlm/store/01KQDM14CT0X6AWFWTW9E81ZJ7/adapter/versions/v0001", + "backend_stats": { + "avg_batch_size": 5.333333333333333, + "batched_prompts": 32, + "batches_sent": 6, + "cache_hits": 195, + "cache_misses": 407, + "forward_passes": 407, + "hit_rate": 0.3239202657807309, + "max_batch_size": 8, + "scoring_wall_s": 18.853502175872563 + }, + "base_model_id": "Qwen/Qwen2.5-Coder-1.5B-Instruct", + "determinism": { + "class": "best_effort", + "notes": [ + "MPS: bit-identical across runs is best-effort" + ], + "seed": 0 + }, + "finished_at": "2026-04-30T00:20:28.579554+00:00", + "null_stats": { + "calibration_drift": { + "degenerate": 1.0, + "mean": 0.0, + "n": 3.0, + "std": 1e-06 + }, + "delta_kl": { + "degenerate": 0.0, + "mean": 0.4272762910783005, + "n": 3.0, + "std": 0.01574057543246111 + }, + "leakage": { + "degenerate": 
0.0, + "mean": 0.24980309514651933, + "n": 3.0, + "std": 0.026889785937786386 + }, + "paraphrase_invariance": { + "degenerate": 0.0, + "mean": -0.09396635989348094, + "n": 3.0, + "std": 0.19938429094952828 + }, + "section_internalization": { + "degenerate": 0.0, + "mean": 6.346549302354173e-20, + "n": 3.0, + "std": 1e-06 + } + }, + "probes": [ + { + "base_value": null, + "ci_95": null, + "duration_s": 54.12962433299981, + "evidence": { + "calibrated_kinds": [ + "delta_kl", + "section_internalization", + "paraphrase_invariance", + "leakage", + "calibration_drift" + ], + "from_cache": false, + "init_scale": 0.02, + "null_stats": { + "calibration_drift": { + "degenerate": 1.0, + "mean": 0.0, + "n": 3.0, + "std": 1e-06 + }, + "delta_kl": { + "degenerate": 0.0, + "mean": 0.4272762910783005, + "n": 3.0, + "std": 0.01574057543246111 + }, + "leakage": { + "degenerate": 0.0, + "mean": 0.24980309514651933, + "n": 3.0, + "std": 0.026889785937786386 + }, + "paraphrase_invariance": { + "degenerate": 0.0, + "mean": -0.09396635989348094, + "n": 3.0, + "std": 0.19938429094952828 + }, + "section_internalization": { + "degenerate": 0.0, + "mean": 6.346549302354173e-20, + "n": 3.0, + "std": 1e-06 + } + }, + "null_stats_by_rank": { + "rank_1.00": { + "calibration_drift": { + "degenerate": 1.0, + "mean": 0.0, + "n": 3.0, + "std": 1e-06 + }, + "delta_kl": { + "degenerate": 0.0, + "mean": 0.4272762910783005, + "n": 3.0, + "std": 0.01574057543246111 + }, + "leakage": { + "degenerate": 0.0, + "mean": 0.24980309514651933, + "n": 3.0, + "std": 0.026889785937786386 + }, + "paraphrase_invariance": { + "degenerate": 0.0, + "mean": -0.09396635989348094, + "n": 3.0, + "std": 0.19938429094952828 + }, + "section_internalization": { + "degenerate": 0.0, + "mean": 6.346549302354173e-20, + "n": 3.0, + "std": 1e-06 + } + } + }, + "per_kind_raw_samples": { + "calibration_drift": [ + 0.0, + 0.0, + 0.0 + ], + "delta_kl": [ + 0.4052588193059781, + 0.4411261139014875, + 0.4354439400274358 + ], + "leakage": 
[ + 0.2789317507418398, + 0.2564102564102564, + 0.21406727828746178 + ], + "paraphrase_invariance": [ + -0.038652390241622925, + -0.3610735833644867, + 0.11782689392566681 + ], + "section_internalization": [ + 4.231032868236115e-20, + -1.4808615038826403e-19, + 2.9617230077652806e-19 + ] + }, + "rank_multipliers": [ + 1.0 + ], + "runs": 3, + "seed_base": 1000, + "skipped_kinds": [ + { + "kind": "adapter_ablation", + "reason": "probe opted out (calibrate_spec returned None)" + } + ], + "weight": 1.0 + }, + "ft_value": null, + "kind": "null_adapter", + "message": "null calibration: 5 kinds calibrated over 3 seeds (1 opted out)", + "name": "null_baseline", + "raw": null, + "score": 1.0, + "verdict": "pass", + "z_score": null + }, + { + "base_value": null, + "ci_95": [ + 0.14064727954865902, + 0.22753181482947113 + ], + "duration_s": 0.41668754200509284, + "evidence": { + "divergence_kind": "js", + "max": 0.3010465067639052, + "num_prompts": 8, + "per_prompt": [ + 0.3010465067639052, + 0.23390241316723992, + 0.15400480602009725, + 0.20887225276262034, + 0.16464061056144086, + 0.07848606060070874, + 0.208388124748079, + 0.12252715727669955 + ], + "raw_ci_95": [ + 0.14064727954865902, + 0.22753181482947113 + ], + "weight": 1.0, + "z_by_rank": { + "rank_1.00": -15.456410766849695 + } + }, + "ft_value": null, + "kind": "delta_kl", + "message": "mean js=0.1840, z=-15.46\u03c3 vs null", + "name": "dk_fortran", + "raw": 0.18398349148759885, + "score": 0.005753728112632616, + "verdict": "fail", + "z_score": -15.456410766849695 + }, + { + "base_value": null, + "ci_95": [ + -0.016300776265738042, + 0.016401170616035115 + ], + "duration_s": 1.9860071250004694, + "evidence": { + "num_sections": 41, + "passing_frac": 0.17073170731707318, + "per_section": [ + { + "base_nll": 3.169445037841797, + "effective_sis": -0.06924204203276793, + "ft_nll": 2.7726451328822543, + "kind": "prose", + "leak_lift": 0.19443743120087909, + "own_lift": 0.12519538916811115, + "passed": false, + 
"section_id": "79e3ec8ad3461cef", + "tag": "" + }, + { + "base_nll": 2.641357632341056, + "effective_sis": -0.03482161454153665, + "ft_nll": 2.22197265625, + "kind": "instruction", + "leak_lift": 0.19359790857914175, + "own_lift": 0.1587762940376051, + "passed": false, + "section_id": "5f6b1dead97c7d16", + "tag": "" + }, + { + "base_nll": 2.755524980260971, + "effective_sis": -0.005073661109242011, + "ft_nll": 2.2380410052360373, + "kind": "instruction", + "leak_lift": 0.19287234873932968, + "own_lift": 0.18779868763008767, + "passed": false, + "section_id": "3b651cf3c102a94a", + "tag": "" + }, + { + "base_nll": 2.3038976070654655, + "effective_sis": -0.10026703323777737, + "ft_nll": 2.0851952768590327, + "kind": "instruction", + "leak_lift": 0.1951941383034403, + "own_lift": 0.09492710506566292, + "passed": false, + "section_id": "39347c7736ed8744", + "tag": "" + }, + { + "base_nll": 2.1978319872606984, + "effective_sis": -0.09115845891624902, + "ft_nll": 1.9696673144091357, + "kind": "instruction", + "leak_lift": 0.19497197795413473, + "own_lift": 0.10381351903788572, + "passed": false, + "section_id": "b185f7cb3ff6a672", + "tag": "" + }, + { + "base_nll": 2.551950859301018, + "effective_sis": 0.042608306391309636, + "ft_nll": 1.953983653675426, + "kind": "instruction", + "leak_lift": 0.19170937392224305, + "own_lift": 0.23431768031355268, + "passed": false, + "section_id": "8dfd46d5cbbfc6a0", + "tag": "" + }, + { + "base_nll": 2.675020974779886, + "effective_sis": 0.019682968622259545, + "ft_nll": 2.1080462743365573, + "kind": "instruction", + "leak_lift": 0.19226852850197598, + "own_lift": 0.21195149712423553, + "passed": false, + "section_id": "84adb31460daa7b4", + "tag": "" + }, + { + "base_nll": 2.5133117039998374, + "effective_sis": 0.0011374386958792337, + "ft_nll": 2.026085376739502, + "kind": "instruction", + "leak_lift": 0.19272085850018036, + "own_lift": 0.1938582971960596, + "passed": false, + "section_id": "2963ad39da116480", + "tag": "" + }, + { + 
"base_nll": 2.4265698629712302, + "effective_sis": -0.033383062893972665, + "ft_nll": 2.0378824869791665, + "kind": "instruction", + "leak_lift": 0.1935628219535914, + "own_lift": 0.16017975905961873, + "passed": false, + "section_id": "4ebf42ad7431d861", + "tag": "" + }, + { + "base_nll": 2.365179106945128, + "effective_sis": 0.026398881671855223, + "ft_nll": 1.8483789398914248, + "kind": "instruction", + "leak_lift": 0.19210472574466878, + "own_lift": 0.218503607416524, + "passed": false, + "section_id": "8150fd7d0d03cee4", + "tag": "" + }, + { + "base_nll": 2.310318737495236, + "effective_sis": -0.01529510219990407, + "ft_nll": 1.8994827270507812, + "kind": "instruction", + "leak_lift": 0.19312165218056535, + "own_lift": 0.17782654998066127, + "passed": false, + "section_id": "7a4f482e72e04eba", + "tag": "" + }, + { + "base_nll": 2.67780724350287, + "effective_sis": 0.13401580581247258, + "ft_nll": 1.811548038404815, + "kind": "instruction", + "leak_lift": 0.18947992271684883, + "own_lift": 0.3234957285293214, + "passed": true, + "section_id": "f4f345eacda9a037", + "tag": "" + }, + { + "base_nll": 2.939030679424157, + "effective_sis": -0.02373857622963227, + "ft_nll": 2.4406033633800037, + "kind": "instruction", + "leak_lift": 0.1933275905715343, + "own_lift": 0.16958901434190204, + "passed": false, + "section_id": "01bec021954d9b71", + "tag": "" + }, + { + "base_nll": 3.3431225357768692, + "effective_sis": 0.008211981580764804, + "ft_nll": 2.6719562851379965, + "kind": "instruction", + "leak_lift": 0.19254830867371975, + "own_lift": 0.20076029025448455, + "passed": false, + "section_id": "b13ccd1c3a9f683c", + "tag": "" + }, + { + "base_nll": 2.646616718556621, + "effective_sis": 0.06681033458153121, + "ft_nll": 1.9639764162573483, + "kind": "instruction", + "leak_lift": 0.19111908055174984, + "own_lift": 0.25792941513328105, + "passed": true, + "section_id": "8cbfb9d8507491b7", + "tag": "" + }, + { + "base_nll": 2.038816108078253, + "effective_sis": 
0.0014224015950915214, + "ft_nll": 1.64300787253458, + "kind": "instruction", + "leak_lift": 0.19271390818556544, + "own_lift": 0.19413630978065696, + "passed": false, + "section_id": "c99eafec654809ed", + "tag": "" + }, + { + "base_nll": 2.5932478279903015, + "effective_sis": -0.04483238708158735, + "ft_nll": 2.206828781654095, + "kind": "instruction", + "leak_lift": 0.1938420737630454, + "own_lift": 0.14900968668145806, + "passed": false, + "section_id": "7b7a511b8cb7a3cc", + "tag": "" + }, + { + "base_nll": 2.2328143856389735, + "effective_sis": 0.05034876550445094, + "ft_nll": 1.6927650265577363, + "kind": "instruction", + "leak_lift": 0.19152058223655669, + "own_lift": 0.24186934774100763, + "passed": true, + "section_id": "969d989aee32a657", + "tag": "" + }, + { + "base_nll": 2.428478338779547, + "effective_sis": -0.008508511757506138, + "ft_nll": 1.980551303961338, + "kind": "instruction", + "leak_lift": 0.19295612558440928, + "own_lift": 0.18444761382690314, + "passed": false, + "section_id": "e4efe84850c617f9", + "tag": "" + }, + { + "base_nll": 1.9514180963689631, + "effective_sis": 0.060117184503950344, + "ft_nll": 1.460832538026752, + "kind": "instruction", + "leak_lift": 0.19128232811461768, + "own_lift": 0.251399512618568, + "passed": true, + "section_id": "b435fa0fb7979efc", + "tag": "" + }, + { + "base_nll": 2.246190157803622, + "effective_sis": 0.0963891425223252, + "ft_nll": 1.6020124955610795, + "kind": "instruction", + "leak_lift": 0.19039764621173047, + "own_lift": 0.28678678873405566, + "passed": true, + "section_id": "311b3bf11dfbea7e", + "tag": "" + }, + { + "base_nll": 2.447883752676157, + "effective_sis": -0.07899066871840801, + "ft_nll": 2.164701461791992, + "kind": "instruction", + "leak_lift": 0.19467520258345566, + "own_lift": 0.11568453386504765, + "passed": false, + "section_id": "67c5f4e5dfcfb4b3", + "tag": "" + }, + { + "base_nll": 2.47250002906436, + "effective_sis": 0.03711392729178514, + "ft_nll": 1.906403072296627, + "kind": 
"instruction", + "leak_lift": 0.1918433831685729, + "own_lift": 0.22895731046035805, + "passed": false, + "section_id": "95851f00680a27e0", + "tag": "" + }, + { + "base_nll": 2.8820984403965837, + "effective_sis": -0.06357835048673222, + "ft_nll": 2.5053478176310913, + "kind": "instruction", + "leak_lift": 0.1942992923826831, + "own_lift": 0.13072094189595088, + "passed": false, + "section_id": "e7dfb14006866fb1", + "tag": "" + }, + { + "base_nll": 2.664724568374284, + "effective_sis": 0.06296575720864866, + "ft_nll": 1.987408587040792, + "kind": "instruction", + "leak_lift": 0.19121285073157623, + "own_lift": 0.2541786079402249, + "passed": true, + "section_id": "e05149d6dd019c7d", + "tag": "" + }, + { + "base_nll": 2.3588959224640376, + "effective_sis": -0.08091002447668283, + "ft_nll": 2.090425279405382, + "kind": "instruction", + "leak_lift": 0.19472201613853554, + "own_lift": 0.11381199166185271, + "passed": false, + "section_id": "2abcffdaa5a5459e", + "tag": "" + }, + { + "base_nll": 2.9229711106986054, + "effective_sis": -0.004829600247521104, + "ft_nll": 2.3733449888623452, + "kind": "instruction", + "leak_lift": 0.19286639603538527, + "own_lift": 0.18803679578786417, + "passed": false, + "section_id": "c717e7bcca5467af", + "tag": "" + }, + { + "base_nll": 3.16006298828125, + "effective_sis": -0.07691862635966952, + "ft_nll": 2.7881044921875, + "kind": "instruction", + "leak_lift": 0.19462466496494985, + "own_lift": 0.11770603860528032, + "passed": false, + "section_id": "2c92258d39229ce8", + "tag": "" + }, + { + "base_nll": 2.5842990282899843, + "effective_sis": 0.10704593276881844, + "ft_nll": 1.8162875915906445, + "kind": "instruction", + "leak_lift": 0.19013772449840136, + "own_lift": 0.2971836572672198, + "passed": true, + "section_id": "ee8b2414b1c97b71", + "tag": "" + }, + { + "base_nll": 2.396781349182129, + "effective_sis": 0.02356774837062009, + "ft_nll": 1.8796960830688476, + "kind": "instruction", + "leak_lift": 0.1921737777764062, + "own_lift": 
0.2157415261470263, + "passed": false, + "section_id": "a04e766e64918615", + "tag": "" + }, + { + "base_nll": 2.5624157982151003, + "effective_sis": 0.016221892812708666, + "ft_nll": 2.0279603387317517, + "kind": "instruction", + "leak_lift": 0.19235294498513575, + "own_lift": 0.20857483779784441, + "passed": false, + "section_id": "f52b3c00ccb92c83", + "tag": "" + }, + { + "base_nll": 2.5047251825747283, + "effective_sis": 0.00431127186137803, + "ft_nll": 2.0114077360733695, + "kind": "instruction", + "leak_lift": 0.19264344793516822, + "own_lift": 0.19695471979654625, + "passed": false, + "section_id": "73172bca8301df6f", + "tag": "" + }, + { + "base_nll": 2.516946756614829, + "effective_sis": 0.028535175232796667, + "ft_nll": 1.9617390182783019, + "kind": "instruction", + "leak_lift": 0.1920526210236702, + "own_lift": 0.22058779625646688, + "passed": false, + "section_id": "76bba5378f960fa1", + "tag": "" + }, + { + "base_nll": 2.4733934144716003, + "effective_sis": -0.02666372733114497, + "ft_nll": 2.0609916480811865, + "kind": "instruction", + "leak_lift": 0.1933989357203517, + "own_lift": 0.16673520838920672, + "passed": false, + "section_id": "6107afd44bb89580", + "tag": "" + }, + { + "base_nll": 3.349968162742821, + "effective_sis": 0.03174970058206686, + "ft_nll": 2.6005001583614864, + "kind": "instruction", + "leak_lift": 0.19197421796637093, + "own_lift": 0.2237239185484378, + "passed": false, + "section_id": "20883a8329375a3c", + "tag": "" + }, + { + "base_nll": 3.4627742060908564, + "effective_sis": -0.031773743837268165, + "ft_nll": 2.9026710792824075, + "kind": "instruction", + "leak_lift": 0.19352357026928152, + "own_lift": 0.16174982643201335, + "passed": false, + "section_id": "34fd68f2e2dc5091", + "tag": "" + }, + { + "base_nll": 3.1862661238196943, + "effective_sis": -0.0470585632913498, + "ft_nll": 2.7184017922380845, + "kind": "instruction", + "leak_lift": 0.19389637074377133, + "own_lift": 0.14683780745242153, + "passed": false, + 
"section_id": "d918396806259702", + "tag": "" + }, + { + "base_nll": 3.5823368812674907, + "effective_sis": 0.03173237117190586, + "ft_nll": 2.780943002273787, + "kind": "instruction", + "leak_lift": 0.19197464063491143, + "own_lift": 0.2237070118068173, + "passed": false, + "section_id": "d9fd6b631fed2d59", + "tag": "" + }, + { + "base_nll": 2.871491931733631, + "effective_sis": -0.06171754728668227, + "ft_nll": 2.490914844331287, + "kind": "instruction", + "leak_lift": 0.19425390693877945, + "own_lift": 0.13253635965209717, + "passed": false, + "section_id": "fe7a4197df714f68", + "tag": "" + }, + { + "base_nll": 2.017791596218128, + "effective_sis": 0.02436236414770343, + "ft_nll": 1.5809058953280473, + "kind": "instruction", + "leak_lift": 0.1921543969037944, + "own_lift": 0.21651676105149784, + "passed": false, + "section_id": "9c217dda0d7e0c0b", + "tag": "" + }, + { + "base_nll": 3.3685618580638113, + "effective_sis": 0.024011949105312347, + "ft_nll": 2.6403633597847467, + "kind": "instruction", + "leak_lift": 0.19216294361214542, + "own_lift": 0.21617489271745777, + "passed": false, + "section_id": "4979f9ee02c60ce1", + "tag": "" + } + ], + "per_section_threshold": 0.05, + "raw_ci_95": [ + -0.016300776265738042, + 0.016401170616035115 + ], + "weight": 1.0, + "z_by_rank": { + "rank_1.00": 1.2904650248120151e-12 + } + }, + "ft_value": null, + "kind": "section_internalization", + "message": "7/41 sections cleared; mean effective_sis=+0.000, z=+0.00\u03c3 vs null", + "name": "sis_fortran", + "raw": 1.3539305178355568e-18, + "score": 0.5000000000001076, + "verdict": "fail", + "z_score": 1.2904650248120151e-12 + }, + { + "base_value": null, + "ci_95": null, + "duration_s": 9.54199640545994e-06, + "evidence": {}, + "ft_value": null, + "kind": "paraphrase_invariance", + "message": "no cases provided", + "name": "para_fortran", + "raw": null, + "score": null, + "verdict": "error", + "z_score": null + }, + { + "base_value": null, + "ci_95": [ + 0.042483660130718956, + 
0.042483660130718956 + ], + "duration_s": 26.154526583006373, + "evidence": { + "mean_clean_recall": 0.042483660130718956, + "mean_fragility": 0.0, + "mean_perturbed_recall": 0.046433762330282506, + "per_section": [ + { + "clean_recall": 0.042483660130718956, + "fragility": 0.0, + "perturbed_recall": 0.046433762330282506, + "section_id": "79e3ec8ad3461cef" + } + ], + "raw_ci_95": [ + 0.042483660130718956, + 0.042483660130718956 + ], + "weight": 1.0, + "z_by_rank": { + "rank_1.00": 7.709969707288315 + } + }, + "ft_value": 0.0, + "kind": "leakage", + "message": "greedy_recall=0.04 (perturbed=0.05, fragility=0.00), z=+7.71\u03c3 vs null", + "name": "leak_fortran", + "raw": 0.042483660130718956, + "score": 0.9289050290702341, + "verdict": "pass", + "z_score": 7.709969707288315 + }, + { + "base_value": null, + "ci_95": [ + 0.0, + 0.0 + ], + "duration_s": 2.9972277079941705, + "evidence": { + "fraction_regressed": 0.0, + "mean_delta_nats": -0.1660053998785831, + "raw_ci_95": [ + 0.0, + 0.0 + ], + "regressed_count": 0, + "regression_nats_threshold": 1.0, + "total_items": 50, + "weight": 1.0, + "worst_offenders": [], + "z_by_rank": null + }, + "ft_value": -0.1660053998785831, + "kind": "calibration_drift", + "message": "0/50 items regressed >1.0 nats (frac=0.0%), mean_delta=-0.166 nats/tok (no calibration for calibration_drift)", + "name": "cal_general", + "raw": 0.0, + "score": 0.8223985600323779, + "verdict": "pass", + "z_score": null + }, + { + "base_value": null, + "ci_95": null, + "duration_s": 0.6297628750035074, + "evidence": { + "lambdas": [ + 0.0, + 0.25, + 0.5, + 0.75, + 1.0, + 1.25 + ], + "linearity": 0.9914886822436975, + "mean_divergence_per_lambda": [ + 0.0, + 0.044187597196996074, + 0.06287823281030223, + 0.09153273777189956, + 0.12553101443175255, + 0.1513824181528589 + ], + "overshoot": 1.2059363882155274, + "passed_linearity": true, + "passed_overshoot": true, + "passed_saturation": false, + "saturation_lambda": 1.25, + "saturation_reason": "found", + 
"weight": 1.0, + "z_by_rank": null + }, + "ft_value": null, + "kind": "adapter_ablation", + "message": "R\u00b2=0.99, sat_\u03bb=1.25 (out of band), overshoot=1.21 (no calibration for adapter_ablation)", + "name": "abl_fortran", + "raw": 0.9914886822436975, + "score": 0.79, + "verdict": "fail", + "z_score": null + } + ], + "schema_version": 1, + "score": { + "band": "partial", + "components": { + "ablation": 0.79, + "adherence": 0.005753728112632616, + "attribution": 0.5000000000001076, + "baseline": 1.0, + "calibration": 0.875651794551306 + }, + "findings": [ + "dk_fortran (delta_kl) failed: mean js=0.1840, z=-15.46\u03c3 vs null", + "adherence score is 0.01 \u2014 below the noise threshold", + "1 probe(s) errored \u2014 see full report for details" + ], + "overall": 0.4703564773440887, + "weights": { + "ablation": 0.15, + "adherence": 0.3, + "attribution": 0.35, + "baseline": 0.0, + "calibration": 0.2 + } + }, + "spec_path": "/Users/mfwolffe/GithubOrgs/tenseleyFlow/DocumentLanguageModel/docs/audits/13-followup/finding03/sway.yaml", + "started_at": "2026-04-30T00:19:01.881485+00:00", + "sway_version": "0.1.0", + "wall_seconds": 86.698069 +} \ No newline at end of file diff --git a/docs/audits/13-followup/finding03/sway-results.md b/docs/audits/13-followup/finding03/sway-results.md new file mode 100644 index 0000000..4b071cb --- /dev/null +++ b/docs/audits/13-followup/finding03/sway-results.md @@ -0,0 +1,42 @@ +# sway report + +**Overall:** 0.47 (`partial`) +**Base:** `Qwen/Qwen2.5-Coder-1.5B-Instruct` +**Adapter:** `adapter/versions/v0001` +**Wall:** 86.7s +**Determinism:** `best_effort` (seed=0) +**Backend:** cache: 195/602 = 32% | batches: 6 (avg=5.3) + +## Components + +| category | score | weight | | +|---|---:|---:|---| +| adherence | 0.01 | 0.30 | | +| attribution | 0.50 | 0.35 | | +| calibration | 0.88 | 0.20 | | +| ablation | 0.79 | 0.15 | | +| baseline | 1.00 | 0.00 | (informational, weight=0) | + +## Probes + +| name | kind | verdict | score | raw | ci95 
| z | duration | note | +|---|---|---|---:|---:|---:|---:|---:|---| +| null_baseline | `null_adapter` | pass | 1.00 | — | — | — | 54.1s | null calibration: 5 kinds calibrated over 3 seeds (1 opted out) | +| dk_fortran | `delta_kl` | fail | 0.01 | 0.184 | [0.141, 0.228] | -15.46σ | 0.42s | mean js=0.1840, z=-15.46σ vs null | +| sis_fortran | `section_internalization` | fail | 0.50 | 0.000 | [-0.016, 0.016] | +0.00σ | 1.99s | 7/41 sections cleared; mean effective_sis=+0.000, z=+0.00σ vs null | +| para_fortran | `paraphrase_invariance` | error | — | — | — | — | 0.00s | no cases provided | +| leak_fortran | `leakage` | pass | 0.93 | 0.042 | [0.042, 0.042] | +7.71σ | 26.2s | greedy_recall=0.04 (perturbed=0.05, fragility=0.00), z=+7.71σ vs null | +| cal_general | `calibration_drift` | pass | 0.82 | 0.000 | [0.000, 0.000] | — | 3.00s | 0/50 items regressed >1.0 nats (frac=0.0%), mean_delta=-0.166 nats/tok (no calibration for calibration_drift) | +| abl_fortran | `adapter_ablation` | fail | 0.79 | 0.991 | — | — | 0.63s | R²=0.99, sat_λ=1.25 (out of band), overshoot=1.21 (no calibration for adapter_ablation) | + +## Top findings + +- dk_fortran (delta_kl) failed: mean js=0.1840, z=-15.46σ vs null +- adherence score is 0.01 — below the noise threshold +- 1 probe(s) errored — see full report for details + +## Degenerate null calibration + +1 probe kind(s) ran null_adapter but the resulting baseline was too narrow for z-scoring (std ≈ 0, typically `runs: 1` or coincidentally-matched seeds). Fix: bump `runs:` in the `null_adapter` spec entry. Affected kinds: + +- `calibration_drift` diff --git a/docs/audits/13-followup/finding03/sway.yaml b/docs/audits/13-followup/finding03/sway.yaml new file mode 100644 index 0000000..d9067de --- /dev/null +++ b/docs/audits/13-followup/finding03/sway.yaml @@ -0,0 +1,105 @@ +# Audit 13 follow-up — sway spec for the fortran fine-tune. 
+# +# Compared to audit 13's spec (which got dk +44σ but +# paraphrase_invariance −3.51σ FAIL), this one: +# +# - Includes ``null_adapter`` for every numeric probe (was already +# present in audit 13; kept here as the recipe's foundation). +# - Activates the [dlm] bridge probes — section_internalization, +# paraphrase_invariance, leakage — which need the .dlm to be +# resolvable. The audit's venv didn't install ``[dlm]`` so these +# SKIPped; we install dlm separately in the follow-up venv to get +# them firing. +# - Fortran-specific prompts on delta_kl + adapter_ablation so the +# distribution shift signal is grounded in the trained domain. +# +# Run from the sway repo's venv after training completes: +# uv run --no-sync sway run \ +# /Users/mfwolffe/GithubOrgs/tenseleyFlow/DocumentLanguageModel/docs/audits/13-followup/finding03/sway.yaml \ +# --json /tmp/sway-13fu.json --markdown /tmp/sway-13fu.md + +version: 1 + +dlm_source: /Users/mfwolffe/GithubOrgs/tenseleyFlow/DocumentLanguageModel/docs/audits/13-followup/finding03/the-doc.dlm + +models: + base: + kind: hf + base: Qwen/Qwen2.5-Coder-1.5B-Instruct + ft: + kind: hf + base: Qwen/Qwen2.5-Coder-1.5B-Instruct + adapter: ~/.dlm/store/01KQDM14CT0X6AWFWTW9E81ZJ7/adapter/versions/v0001 + +defaults: + seed: 0 + differential: true + coverage_threshold: 0.6 + score_weights: + adherence: 0.30 + attribution: 0.35 + calibration: 0.20 + ablation: 0.15 + +suite: + # ── Calibration baseline ───────────────────────────────────────── + - name: null_baseline + kind: null_adapter + runs: 3 + + # ── Adherence ──────────────────────────────────────────────────── + - name: dk_fortran + kind: delta_kl + prompts: + - "module mymod\n implicit none\n" + - "subroutine compute(x, y)" + - "do concurrent (i = 1:n)" + - "type :: container" + - "intent(in) ::" + - "use stdlib_io, only: loadtxt" + - "use stdlib_sorting, only:" + - "real(real64), allocatable ::" + assert_z_gte: 1.0 + assert_mean_gte: 0.02 + + # ── Attribution (bridge-aware) 
─────────────────────────────────── + # Per-section internalization. Should fire now that [dlm] is + # installed and the bridge can resolve the .dlm. + - name: sis_fortran + kind: section_internalization + assert_z_gte: 1.0 + assert_passing_section_frac: 0.4 + + # The audit's load-bearing probe — was -3.51σ FAIL in audit 13, + # need to flip to PASS here. With 5× more INSTRUCTION sections + + # 5× more steps + the bridge installed, this is the falsifiable + # claim. + - name: para_fortran + kind: paraphrase_invariance + assert_z_gte: 1.0 + + # ── Calibration ────────────────────────────────────────────────── + - name: leak_fortran + kind: leakage + assert_z_gte: 1.0 + + # General-competence regression check. Audit 13 PASSed this — + # we want to confirm we didn't fry it with 5× more fine-tune steps. + - name: cal_general + kind: calibration_drift + items_limit: 50 + assert_z_gte: 1.0 + + # ── Ablation (the sway signature) ──────────────────────────────── + # Healthy fine-tunes show monotonic R² ≥ 0.85 on the λ-scaled KL + # curve. If this is a step-function or saturates early, the adapter + # is degenerate. 
+ - name: abl_fortran + kind: adapter_ablation + lambdas: [0.0, 0.25, 0.5, 0.75, 1.0, 1.25] + prompts: + - "subroutine matmul_optimized(A, B, C)" + - "do concurrent (i = 1:n)" + - "use stdlib_linalg, only: solve" + assert_linearity_gte: 0.85 + assert_z_gte: 1.0 diff --git a/docs/audits/13-followup/finding03/the-doc.dlm b/docs/audits/13-followup/finding03/the-doc.dlm new file mode 100644 index 0000000..cfffe18 --- /dev/null +++ b/docs/audits/13-followup/finding03/the-doc.dlm @@ -0,0 +1,725 @@ +--- +dlm_id: 01KQDM14CT0X6AWFWTW9E81ZJ7 +dlm_version: 15 +base_model: qwen2.5-coder-1.5b +training: + sources: + - path: ~/GithubOrgs/FortranGoingOnForty + include: ["**/*.f90", "**/*.F90", "**/*.f95"] + exclude: + - "**/build/**" + - "**/.git/**" + - "**/dist/**" + max_bytes_per_file: 32768 + - path: /tmp/stdlib_build/src + include: ["**/*.f90", "**/*.F90", "**/*.fypp"] + exclude: + - "**/build/**" + - "**/tests/**" + max_bytes_per_file: 32768 + - path: /tmp/stdlib_build/doc/specs + include: ["**/*.md"] + max_bytes_per_file: 131072 + sources_policy: permissive + adapter: lora + lora_r: 16 + lora_alpha: 32 + lora_dropout: 0.05 + sequence_len: 1024 + micro_batch_size: 1 + grad_accum: 8 + learning_rate: 2.0e-4 + warmup_ratio: 0.1 + num_epochs: 1 +--- + +# Modern Fortran practitioner notes + +This document trains an adapter that knows the idioms of the Fortran 2008+ +codebase under `~/GithubOrgs/FortranGoingOnForty`, the canonical +`fortran-lang/stdlib` source tree at `/tmp/stdlib_build/src`, and the +FORD-generated stdlib spec documents at `/tmp/stdlib_build/doc/specs`. The +corpus spans numeric/aerospace simulation (`aero-emulation`, `convolution`), +scripting helpers (`fgof-*`), and stdlib modules (allocation, ascii, bitsets, +linear-algebra, sorting, statistics, strings, IO, hashing). 
+ +## Idioms the corpus reinforces + +Modern Fortran prefers `do concurrent` over plain `do` for loops with no +loop-carried dependence — it expresses the parallelism intent and lets the +compiler vectorise / OpenMP-offload without further pragmas. `pure` and +`elemental` procedures are first-class. Allocatable arrays beat pointer +arrays for owned memory; `move_alloc` transfers ownership without copying. +Derived types use `type, bind(C)` only when interoperability is required. +`intent(in)`, `intent(out)`, `intent(inout)` are mandatory on every dummy +argument in this codebase, even when the language would default — explicit +intent is a project convention, not a language requirement. + +## stdlib navigation + +The Fortran stdlib provides community-tested replacements for tasks the +language standard doesn't cover: `stdlib_array` (index manipulation), +`stdlib_io` (text-file IO), `stdlib_sorting` (in-place sorts), `stdlib_stats` +(mean/var/cov), `stdlib_string_type` (variable-length strings), +`stdlib_linalg` (BLAS/LAPACK wrappers), `stdlib_random` (distributions), +`stdlib_hashmaps` (open + chaining hash tables), `stdlib_bitsets` (compact +bit arrays), `stdlib_logger` (severity-tagged logging), `stdlib_optval` +(safe defaults for optional dummy args), `stdlib_kinds` (named real32/real64 +/int32/int64 kind parameters). + +::instruction:: + +### Q +When should I prefer `do concurrent` over a plain `do` loop in modern Fortran? + +### A +Use `do concurrent` whenever loop iterations have no carried dependence — +i.e., iteration `i` does not read a value written by iteration `i-1`. The +`do concurrent` construct is a contract with the compiler that the iterations +may execute in any order or in parallel, which lets it vectorise, OpenMP- +offload, or GPU-offload the loop without further pragmas. If you can't make +that promise (the loop accumulates into a shared scalar, or reads a value +the previous iteration wrote), keep a plain `do`. 
Locality clauses +(`local`, `shared`, `reduce`) give finer control in Fortran 2018+. + +::instruction:: + +### Q +Why use `allocatable` arrays instead of `pointer` arrays for owned memory? + +### A +`allocatable` expresses ownership: the variable owns the memory, the compiler +inserts the deallocate at scope exit, and you cannot have aliasing across +two allocatable variables. `pointer` arrays expose aliasing and demand +manual `deallocate`. Compilers can also generate better code for +`allocatable` because no aliasing means no aliasing-defeats-vectorisation +pessimism. Use `pointer` only when you actually need pointer semantics — +linked structures, optional aliasing into a parent array, or C interop. +`move_alloc(from, to)` transfers ownership without a copy. + +::instruction:: + +### Q +What does `intent(in)`, `intent(out)`, and `intent(inout)` declare on a +dummy argument? + +### A +`intent(in)` says the procedure reads but never assigns to the argument — +the compiler enforces this and will reject any assignment as an error. +`intent(out)` says the procedure must define the argument before returning; +the value on entry is undefined (and on some compilers explicitly poisoned), +so don't read it before assigning. `intent(inout)` says both are allowed — +the procedure may read the prior value and may overwrite it. Always declare +intent on every dummy argument. It is a contract that the compiler enforces +and a reader can rely on. + +::instruction:: + +### Q +How do I open an OpenMP parallel region with a reduction in Fortran? + +### A +Use `!$omp parallel do reduction(+:sum)` above a `do` loop where each +iteration accumulates into `sum`. The reduction clause tells OpenMP to give +each thread a private copy of `sum`, accumulate locally, then combine at +loop end. Other reduction operators include `*`, `min`, `max`, `.and.`, +`.or.`, `iand`, `ior`. The closing `!$omp end parallel do` is optional in +free-form Fortran. 
For simple no-reduction loops, `do concurrent` is +preferred. + +::instruction:: + +### Q +What's a derived type with allocatable components, and why is it useful? + +### A +A derived type with allocatable components has fields declared +`type :: container; real, allocatable :: data(:); end type`. The container +itself is value-typed but its `data` field grows or shrinks at runtime. +Compared to fixed-size arrays it doesn't waste memory; compared to pointer +fields it owns the memory and frees it automatically when the container +goes out of scope. Modern Fortran's intrinsic assignment does a deep +copy by default. This is the canonical way to build resizable structures +without C-style malloc/free. + +::instruction:: + +### Q +What does `pure function` mean and when is the `pure` attribute required? + +### A +A `pure function` has no side effects: it doesn't modify any module +variable, doesn't perform I/O, doesn't call impure procedures. The compiler +verifies this. `pure` is required for any function called from inside +`do concurrent` (which forbids side effects across iterations) and from +within `forall`. It also enables aggressive optimisation: the compiler +knows two calls with equal arguments produce equal results and can hoist +or memoise. `elemental` implies `pure` and adds elementwise array semantics. + +::instruction:: + +### Q +How do I declare a generic interface for two procedures that differ only in +argument type? + +### A +```fortran +interface clamp + module procedure clamp_real32, clamp_real64 +end interface clamp +``` +The two module procedures have the same logical name `clamp` but distinct +argument signatures (one takes `real(real32)`, the other `real(real64)`). +The compiler resolves the call by argument type — Fortran's overloading +mechanism. Adding more types is purely additive; callers see one name. + +::instruction:: + +### Q +What's the difference between `module` and `submodule` in Fortran? 
+ +### A +A `module` declares its public interface and definitions in one unit. A +`submodule` lets you declare the interface in the parent module but put the +implementation in a separately compiled file. Editing the submodule does +not retrigger compilation of every consumer of the parent module — only of +the submodule itself. Use submodules to break long compile chains in big +projects. The `module subroutine foo()` declaration in the parent and +`module procedure foo` in the submodule are how the two halves bind. + +::instruction:: + +### Q +What does `trueloc` from `stdlib_array` do? + +### A +`trueloc(mask, lbound)` turns a logical mask into an integer index array +of the positions where the mask is `.true.`. It's a pure function — given +`mask = [.false., .true., .false., .true.]` and default `lbound`, it +returns `[2, 4]`. With `lbound = 0` it returns `[1, 3]`. Useful when you +need the indices themselves (for example, to use as a subscript) rather +than the masked values; if you only need the values, `pack` is more +direct. The complementary `falseloc` returns indices where the mask is +`.false.`. + +::instruction:: + +### Q +When would I use `arange` from `stdlib_math`? + +### A +`arange(start, stop, step)` returns a rank-1 array of values from `start` +to `stop` (inclusive) stepping by `step`. With `arange(1, 5)` you get +`[1, 2, 3, 4, 5]`. With `arange(0.0, 1.0, 0.25)` you get +`[0.0, 0.25, 0.5, 0.75, 1.0]`. Useful for building index sequences or +sample grids without a manual `do` loop. Step defaults to 1. + +::instruction:: + +### Q +How do I read a numeric matrix from a text file using `stdlib_io`? + +### A +Use `loadtxt(filename, array)`: +```fortran +use stdlib_io, only: loadtxt +real, allocatable :: data(:,:) +call loadtxt('measurements.dat', data) +``` +The array is allocated automatically to match the file's rows × columns. +The companion `savetxt(filename, array)` writes the same shape back out. 
+Both procedures handle real, integer, and complex data of `real32`, +`real64`, `int32`, `int64` kinds. + +::instruction:: + +### Q +What does `getline` from `stdlib_io` give me that `read(*, fmt='(A)')` doesn't? + +### A +`getline(unit, line, iostat)` reads one line of arbitrary length into a +deferred-length character variable. With plain `read(*, fmt='(A)')` you +have to commit to a fixed-length buffer up front and either truncate +or run a loop on `iostat=eor`. `getline` allocates `line` to exactly the +line's length, with no truncation. It returns `iostat=iostat_end` at +EOF. + +::instruction:: + +### Q +How do I sort an array in place using `stdlib_sorting`? + +### A +Use `sort(array)`. It sorts `array` ascending in place using an +introsort-style hybrid (quicksort + insertion-sort fallback). For a +descending sort, pass `reverse=.true.`. The companion `ord_sort(array)` +is a stable mergesort — preserves equal-key relative order, at the cost +of an O(n) workspace allocation. Use `radix_sort` for integer arrays +when the value range is bounded; it's O(n) but only for unsigned-style +integer kinds. + +::instruction:: + +### Q +When should I use `sort_index` instead of `sort`? + +### A +Use `sort_index(array, index)` when you also need the permutation that +sorted the array — for example, to apply the same reordering to a second, +parallel array. After the call, `array` is sorted and `index(i)` is the +original position of the `i`-th element of the sorted array. So +`other_data = other_data(index)` reorders a sibling array consistently +with `array`. Plain `sort` discards this information. + +::instruction:: + +### Q +What does `mean(array)` from `stdlib_stats` compute, and how do I take a +mean along one axis of a 2D array? + +### A +`mean(array)` returns the arithmetic mean of all elements as a scalar. +For a 2D array, `mean(matrix, dim=1)` reduces along the first dimension, +returning a 1D array of column means. `mean(matrix, dim=2)` returns row +means. 
With a logical mask, `mean(array, mask=array > 0)` averages only +the elements satisfying the mask. The companion `var` (variance) and +`std` (standard deviation) accept the same dim/mask arguments. `corr` and +`cov` compute correlation / covariance between two arrays. + +::instruction:: + +### Q +What is `string_type` from `stdlib_string_type` and why use it instead of +plain `character(len=*)`? + +### A +`string_type` is a derived type wrapping a deferred-length character +allocatable. Two reasons to prefer it: (1) you can have arrays of +varying-length strings (`type(string_type) :: words(100)` where each +`words(i)` is its own length — impossible with plain character arrays +which must be uniform-length); (2) it has overloaded operators (`==`, +`<`, `>`, `//` for concat) and constructors that interoperate with both +literals and other `string_type` instances. Use `char(string_type_var)` +to get the underlying `character(:)` back when interfacing with code +that takes plain strings. + +::instruction:: + +### Q +How do I solve a linear system `A*x = b` using `stdlib_linalg`? + +### A +```fortran +use stdlib_linalg, only: solve +real :: A(n,n), b(n), x(n) +x = solve(A, b) +``` +`solve` is a pure function that returns the solution `x`. Internally it +calls LAPACK's `gesv` (LU with partial pivoting). For multiple right-hand +sides, `b` can be a 2D array `b(n,nrhs)`. For least-squares (overdetermined +or underdetermined), use `lstsq` instead. To check whether a matrix is +singular before solving, use `inv` plus the returned status, or compute +the condition number via `linalg_cond`. + +::instruction:: + +### Q +What does `eye(n)` return in `stdlib_linalg`? + +### A +`eye(n)` returns the n×n identity matrix as a `real` array — ones on the +diagonal, zeros off-diagonal. `eye(m, n)` returns a non-square m×n matrix +with ones on the main diagonal. 
Useful as a starting point for numerical +linear algebra or as the right-hand side of `solve(A, eye(n))` to compute +`inv(A)` directly. + +::instruction:: + +### Q +How do I draw samples from a normal distribution with `stdlib_random`? + +### A +```fortran +use stdlib_random, only: random_seed, dist_normal => dist_rvs_normal +real :: x, samples(1000) +call random_seed(42) +x = dist_normal(0.0, 1.0) ! one N(0,1) sample +samples = dist_normal(0.0, 1.0, 1000) ! 1000 N(0,1) samples +``` +First argument is the mean, second is the standard deviation. The third +optional argument requests an array of N samples. Companion routines +`dist_rvs_uniform`, `dist_rvs_exponential`, `dist_rvs_gamma` cover other +distributions. Always seed via `random_seed` for reproducibility. + +::instruction:: + +### Q +What is `optval` from `stdlib_optval` and when do I use it? + +### A +`optval(arg, default)` returns `arg` if it's `present`, otherwise `default`. +It replaces the boilerplate +```fortran +if (present(arg)) then + used = arg +else + used = default +end if +``` +with a single expression `used = optval(arg, default)`. Works for scalar +integers, reals, complex, logical, and character. The most common use is +default-value handling for optional dummy arguments, where the caller may +or may not have supplied the argument. + +::instruction:: + +### Q +What kind parameters does `stdlib_kinds` provide and why use them? + +### A +`stdlib_kinds` exports `int8`, `int16`, `int32`, `int64`, `real32`, +`real64`, `real128` (where supported), and `c_bool`. Use them instead of +raw kind numbers (`real(8)` is non-portable — different compilers map `8` +to different precisions). `real(real64)` is portable and self-documenting. +For most modern code, default to `real64` for floating-point and `int32` +for indices unless you have a specific reason otherwise. + +::instruction:: + +### Q +How do I append to a file using `stdlib_logger`? 
+ +### A +```fortran +use stdlib_logger, only: logger_type, information_level +type(logger_type) :: log +integer :: stat +call log%add_log_file('app.log', stat, position='append') +call log%log_information('app started') +``` +The `position='append'` keyword opens the existing file for append rather +than truncating. Default severity emitted is `information`; raise it with +`log%configuration(level=warning_level)` if you want a quieter log. Severity +levels: `debug_level < information_level < warning_level < error_level`. + +::instruction:: + +### Q +When should I use a `bitset_64` vs `bitset_large` from `stdlib_bitsets`? + +### A +`bitset_64` is fixed at 64 bits — a single integer's worth of flags, +fastest, no allocation. `bitset_large` allocates an array of `int64` +internally and grows to any user-specified size. Use `bitset_64` for +small, known-size flag sets (compiler optimization flags, peripheral +status registers); use `bitset_large` when the bit count is data-driven +or might exceed 64. Both expose the same `set`, `clear`, `test`, +`flip`, `bit_count` interface — code is portable across them. + +::instruction:: + +### Q +How do I hash a string for use as a hashmap key? + +### A +The `stdlib_hash_procedures` module provides 32-bit and 64-bit hash +functions: +```fortran +use stdlib_hash_procedures, only: fnv_1a_hash, water_hash +integer(int32) :: h32 +integer(int64) :: h64 +h32 = fnv_1a_hash('mykey') +h64 = water_hash('mykey', seed=12345_int64) +``` +`fnv_1a_hash` is unseeded and reproducible across runs; `water_hash` and +`pengy_hash` are seeded (better collision resistance under adversarial +input). For `stdlib_hashmaps`, you usually pass the hash function as a +procedure pointer when constructing the map. + +::instruction:: + +### Q +What's the `block` construct in modern Fortran and when do I use it? 
+ +### A +A `block` introduces a nested scope inside an executable region: +```fortran +real :: outer +outer = 1.0 +block + real :: inner + inner = outer + 1.0 + print *, inner +end block +``` +Variables declared inside the block exist only within it. Useful for +narrowing the lifetime of temporaries, declaring variables close to use, +and limiting the visibility of helper allocations. Functionally similar +to a `{ ... }` block in C-family languages. Combines naturally with +`associate`, `select type`, and `error stop`. + +::instruction:: + +### Q +How does `associate` differ from a plain assignment? + +### A +`associate(short => long%nested%expression)` binds a name to an expression +or variable for the lifetime of the `associate` block, *without* copying +the value. Inside the block, `short` is an alias — modifying it modifies +the original. Compare to assignment, which copies (for non-pointer types). +Use `associate` to give a verbose subexpression a short name in a tight +loop without paying for a copy and without exposing the alias outside the +block. End the scope with `end associate`. + +::instruction:: + +### Q +What does `elemental` add over `pure` on a function? + +### A +An `elemental` function is `pure` plus broadcast: declared as if it takes +scalar arguments, but the compiler auto-generates the version that takes +conformable arrays. `square_real(x)` written `elemental` can be called +with a scalar (`y = square_real(2.0)`) or an array (`v = square_real(arr)`) +with no explicit do-loop. Since `elemental` implies `pure`, the same +no-side-effect rules apply. Most stdlib unary numeric helpers (`sqrt`- +adjacent transforms, kind conversions, predicates) are `elemental`. + +::instruction:: + +### Q +How do I structure a unit-test runner using stdlib's `testing` module? 
+ +### A +```fortran +use testdrive, only: new_unittest, unittest_type, error_type, check +type(unittest_type), allocatable :: testsuite(:) +testsuite = [ & + new_unittest('addition', test_add), & + new_unittest('subtract', test_sub) ] +contains + subroutine test_add(error) + type(error_type), allocatable, intent(out) :: error + call check(error, 1 + 1 == 2, 'addition broken') + end subroutine +``` +`testdrive` is a thin runner the stdlib uses for its own tests. Each +test subroutine accepts an `allocatable :: error` out parameter; `check` +allocates the error if the assertion fails. Fast, no fixtures, no +discovery — explicit registration in an array. + +::instruction:: + +### Q +What's the modern way to read a CSV-style file in Fortran? + +### A +The simplest path is `stdlib_io`'s `loadtxt` if the columns are uniform +numeric. For mixed-type CSVs, read line-by-line with `getline` and split +manually: +```fortran +use stdlib_io, only: getline +use stdlib_string_type, only: string_type, split => char_split +type(string_type) :: line +type(string_type), allocatable :: fields(:) +integer :: u, ios +open(newunit=u, file='data.csv', action='read') +do + call getline(u, line, ios) + if (ios /= 0) exit + fields = split(line, ',') + ! process fields(:) +end do +close(u) +``` +For larger or more complex CSVs, consider the `csv-fortran` community +package — stdlib doesn't (yet) ship a CSV-aware reader. + +::instruction:: + +### Q +How does `error stop` differ from `stop`? + +### A +`stop` and `error stop` both terminate the program, but `error stop` is +guaranteed to set a non-zero process exit code, while `stop` (without an +argument) typically returns zero. `error stop "message"` prints the message +to standard error before exit; `error stop 42` returns code 42. Use +`error stop` for any abnormal termination — assertion failures, fatal +config errors — so shells and CI runners pick up the failure correctly. 
+`stop` is reserved for normal early termination (rare in modern code; +prefer letting `program` reach its `end program`). + +::instruction:: + +### Q +What is `c_loc` and when do I need it? + +### A +`c_loc(target)` from `iso_c_binding` returns the C address of `target` as +a `type(c_ptr)` value, suitable for passing to a `bind(C)` procedure. The +target must have the `target` attribute. Use this when interfacing with +a C library that takes `void*`. The reverse — turning a `c_ptr` back into +a Fortran pointer — uses `c_f_pointer(cptr, fptr, [shape])`. Always pair +the call with the matching deallocation; Fortran does not own memory +acquired through `c_loc`. + +::instruction:: + +### Q +What does `move_alloc(from, to)` do and when do I prefer it to assignment? + +### A +`move_alloc(from, to)` transfers the allocation status (and the underlying +memory) from `from` to `to`. After the call, `to` holds what `from` held, +and `from` is deallocated — no copy. Compare to `to = from` which copies +the array. For large arrays, `move_alloc` is O(1); the copy is O(n). Use +`move_alloc` to hand ownership of a temporary buffer to a derived-type +field, or to swap two allocatables (via a third temporary). + +::instruction:: + +### Q +How do I write a `subroutine` that returns multiple results without using +`out` arguments? + +### A +Use a derived-type return value via a `function`: +```fortran +type :: result_t + real :: value + integer :: status +end type +contains +function compute() result(r) + type(result_t) :: r + r%value = 3.14 + r%status = 0 +end function +``` +Caller writes `res = compute()` once and reads `res%value`, `res%status`. +This is cleaner than two `intent(out)` arguments because the call site +isn't burdened with declaring the receivers up front, and the compiler +can elide the temporary in common cases. Fortran 2008+ allows allocatable +result components, so the function can also return varying-shape data. 
+ +::instruction:: + +### Q +Why does the FortranGoingOnForty codebase use `intent` on every dummy +argument even when not strictly required? + +### A +It's a project convention: explicit `intent(in/out/inout)` on every dummy +makes the contract visible at the procedure boundary. The compiler enforces +the contract — `intent(in)` rejects assignment, `intent(out)` warns on +read-before-write — so a regression where someone "improves" a procedure +to mutate a previously-read-only argument fails at compile time, not at +runtime. Reviewers and tooling (the FGOF `fgof-lineedit` editor's +in-buffer linter, for example) parse intents to render colour-coded +argument flow. The convention costs ~10 keystrokes per procedure and +buys static enforcement. + +::instruction:: + +### Q +What does the `aero-emulation` subsystem in FortranGoingOnForty do at a +high level? + +### A +`aero-emulation` is the aerodynamic-surface emulator: a forward simulation +of lift, drag, and moment over a parameterised wing-and-control-surface +model, integrated against a 6-DoF rigid-body solver. It uses +`stdlib_linalg` for the state-update matrices (rotation and inertia tensor +products) and `stdlib_random` to inject configurable turbulence. Outputs +are time-series state vectors written via `stdlib_io`'s `savetxt`. The +module is consumed by `convolution` (for sensor-fusion experiments) and +by `armfortas` (the autopilot framework). + +::instruction:: + +### Q +How does `fgof-process` handle child-process I/O on Linux vs macOS? + +### A +`fgof-process` wraps the C `posix_spawn` family via `iso_c_binding`; on +both Linux and macOS the spawn semantics are POSIX-shaped, so the +high-level `spawn(command, stdin, stdout, stderr)` call is platform- +identical at the Fortran layer. The differences hide in the child-pty +wiring (`fgof-pty`): macOS uses `posix_openpt` + `grantpt` + `unlockpt`, +Linux supports the same path plus the older `/dev/ptmx` shortcut. 
The +`fgof-process` module never sees the divergence — `fgof-pty` exposes a +`type(pty_t)` derived type that `fgof-process` consumes opaquely. + +::instruction:: + +### Q +What's the role of `fgof-screen` in the FGOF terminal applications? + +### A +`fgof-screen` is the terminal-cell renderer used by `fgof-lineedit` and +the `armfortas` autopilot console. It maintains a 2D `character(len=:), +allocatable` buffer plus per-cell ANSI style attributes, computes a +minimal-diff update against the previously rendered frame, and emits +the diff as ANSI escape sequences to the controlling tty. The minimal- +diff step keeps redraws under a few hundred bytes for incremental +updates — needed because some serial-attached aerospace consoles run +at 9600 baud and a full repaint is too expensive. + +::instruction:: + +### Q +Why does `convolution` use `do concurrent` for its inner kernel loop +rather than OpenMP? + +### A +The inner kernel loop multiplies and accumulates over a small kernel +window with no carried dependence, so it satisfies `do concurrent`'s +contract. `do concurrent` lets the compiler choose the parallelism +strategy — vectorisation on CPU, OpenMP-offload on multicore, or +GPU-offload via `-fopenmp -foffload=...` on NVIDIA hosts — without +changing source. OpenMP `parallel do` would have committed the kernel +to CPU threads at compile time. The convolution outer loop, which +*does* accumulate into a shared buffer, uses `!$omp parallel do +reduction(+:buffer)` since `do concurrent` cannot express the +reduction without 2018+ locality clauses that older compilers don't +honour. + +::instruction:: + +### Q +What pattern does `armfortas` use for autopilot state-machine transitions? 
+ +### A +A `select case` over a state enum at the top of the control loop: +```fortran +select case (state%mode) +case (mode_idle) ; call handle_idle(state, inputs) +case (mode_takeoff) ; call handle_takeoff(state, inputs) +case (mode_cruise) ; call handle_cruise(state, inputs) +case (mode_descent) ; call handle_descent(state, inputs) +case (mode_landed) ; call handle_landed(state, inputs) +case default ; call error_unknown_mode(state%mode) +end select +``` +Each handler returns the next mode via `state%mode = new_mode`. The +enum values are `integer, parameter` constants (Fortran lacks a true +enum type pre-2023 — most codebases use named integer parameters). The +`select case` is exhaustive and the `case default` calls into +`error_unknown_mode` to fail loudly on programmer error. + +::instruction:: + +### Q +How does `feducative` integrate with `armfortas` for control-loop tuning? + +### A +`feducative` is the parameter-search and PID-tuning subsystem. It runs +`armfortas` in a closed-loop simulation (via the `aero-emulation` model), +sweeps PID gains over a configured grid, and scores each combination by +RMS tracking error against a reference trajectory. The two communicate +via a `type(tuning_handle_t)` derived type that `feducative` constructs +and `armfortas` updates step-by-step. After a sweep, `feducative` writes +the gain × score table via `stdlib_io`'s `savetxt` and emits the best +gains as a Fortran `include` file the autopilot reads on next compile. diff --git a/docs/audits/13-followup/finding04/direct-query-results.md b/docs/audits/13-followup/finding04/direct-query-results.md new file mode 100644 index 0000000..f14be46 --- /dev/null +++ b/docs/audits/13-followup/finding04/direct-query-results.md @@ -0,0 +1,143 @@ +# Finding 04 — direct-query smoke results + +`qwen2.5-coder-1.5b` + finding-04 LoRA (v0002, 400 steps, train loss +0.62, eval loss 0.047, 98.6% token accuracy on training data). 
+ +`--backend pytorch` is required: dlm's auto-routing picks `mlx` on +darwin-arm64 by default, but MLX can't load PEFT-format adapters and +silently runs the base. (See "Notes on bugs surfaced" below.) + +## Seen Q/A pairs (in training set) + +### Q3 wedge — "Show the signature of stdlib's sorting routine in Fortran." + +**Base:** wrong (hallucinates `iso_fortran_env::sort`) +**Finding-03 LoRA:** worse (degenerate "should be able to sort..." loop) +**Finding-04 LoRA: ✓ correct (verbatim training answer):** + +```fortran +use stdlib_sorting, only: sort +call sort(array[, reverse]) +``` +The `array` argument is `intent(inout)` and must be a rank-1 array of +an intrinsic numeric type... The optional `reverse` argument is a +scalar logical with `intent(in)`. + +### intent(in) + +✓ correct, matches training answer verbatim. + +## Unseen Q/A — stdlib variants + +### "How would I sort a real(dp) array using stdlib?" + +**Partial generalization.** The model knows `use stdlib_sorting` (real +information from the training corpus) but invents a wrong call form +(`call stdlib_sorting::qsort`) and falls back to a manual sort loop. + +```fortran +use stdlib_sorting +call stdlib_sorting::qsort ! wrong syntax +integer :: idx(1:SIZE=my_array) +real(dp) :: my_array(SIZE) +real(dp) :: result(SIZE) +... [hand-rolled sort loop] +``` + +The model learned *that* `stdlib_sorting` exists; it didn't learn the +generic call form well enough to apply it to a question it hadn't +seen verbatim. + +### "What is stdlib_strings::starts_with?" + +**No generalization.** Falls back to Rust syntax (`use ... ::`, `if +pred { ... }`). + +``` +use stdlib_strings::starts_with; + +if starts_with("hello world", "hel") { + // ... +} +``` + +The training set didn't cover `stdlib_strings::starts_with` +specifically; the model's nearest pattern is its base-pretraining Rust +knowledge. + +### "How do I read a CSV file in Fortran with stdlib?" 
+ +**Hallucinated stdlib API.** The model produces plausibly-shaped +Fortran-stdlib code with wrong details (invented `stdlib_io_read_table`, +`stdlib_types`, `using` instead of `use`): + +```fortran +use stdlib_types +using stdlib_types::array ! invalid +array(real, dim=(:)) :: data +call stdlib_io_read_table("path/to/file.csv", data) ! function doesn't exist +``` + +The training had `loadtxt` for stdlib_io, but the question phrasing +("read a CSV") didn't trigger that pattern. Model invented a +plausible-looking API instead. + +## Out-of-domain — does the LoRA preserve general capability? + +### "What is the capital of France?" +✓ "Paris." + +### "Write a Python list comprehension that filters even numbers." +✓ +```python +even_numbers = [i for i in range(10) if i % 2 == 0] +print(even_numbers) # prints [0, 2, 4, 6, 8] +``` + +General-capability preservation is excellent. cal_general regression +was 0% in Finding 03 already; Finding 04's INSTRUCTION-only training +maintains that. + +## Net wedge score + +| Question shape | Result | Verdict | +| --- | --- | --- | +| Seen exactly | verbatim correct | ✓ memorized cleanly | +| Unseen with overlap (sort a real(dp) array) | knows module, wrong API form | partial | +| Unseen, different module (stdlib_strings::starts_with) | falls back to Rust | none | +| Unseen, different module (CSV/loadtxt) | hallucinates plausible API | none | +| Out-of-domain (English/Python) | unchanged from base | ✓ preserved | + +## Reading + +The recipe-shape fix worked **for in-distribution questions** — +qwen-coder-1.5b + INSTRUCTION-only Q/A training produced an adapter +that reproduces trained answers correctly without breaking general +capability. The wedge that Finding 03 widened (Q3 stdlib sorting), this +recipe narrows: the answer is now correct. + +What we did *not* get is generalization to nearby-domain questions +the model hadn't seen verbatim. 
With only 35 hand-authored Q/A pairs +across multiple modules, the model can memorize all of them (98.6% +token accuracy) but doesn't have enough surface area to abstract the +pattern. For "ask about stdlib_strings::starts_with" to land +correctly, that module's API form needs to be in the training data. + +The tradeoff is dataset-size-dependent: more Q/A → broader +generalization. With ~35 pairs we get pinpoint Q/A reproduction; with +~300 pairs (extrapolating) we'd plausibly cover most stdlib modules +at functional density. + +## Notes on bugs surfaced + +1. **`src/dlm/replay/store.py:187`** — `parse_instruction_body` called + without `_normalize_probe_markers`. Fixed in this branch (added + the import + call). Without the fix, `--fresh` doesn't help: the + replay store retains snapshots with raw `### Q !probe` headers and + the parser rejects them on every retrain. +2. **MLX adapter loading silently fails for PEFT adapters.** dlm's + default backend on darwin-arm64 is MLX. PEFT `adapter_model.safetensors` + isn't a valid MLX adapter format; MLX appears to load the base and + silently ignore the adapter. `--backend pytorch` works correctly. + The user-visible failure mode is "trained model behaves like base" + — easy to misread as "training didn't work." diff --git a/docs/audits/13-followup/finding04/the-doc.dlm b/docs/audits/13-followup/finding04/the-doc.dlm new file mode 100644 index 0000000..f9a8807 --- /dev/null +++ b/docs/audits/13-followup/finding04/the-doc.dlm @@ -0,0 +1,415 @@ +--- +dlm_id: 01KQDWAHNJ7FD72EQ4J4FXBJ2V +dlm_version: 15 +base_model: qwen2.5-coder-1.5b +training: + adapter: lora + lora_r: 16 + lora_alpha: 32 + lora_dropout: 0.05 + sequence_len: 1024 + micro_batch_size: 1 + grad_accum: 8 + learning_rate: 2.0e-4 + warmup_ratio: 0.1 + num_epochs: 100 +--- + +# Finding 04 — Q/A-shape recipe (INSTRUCTION-only) + +Hand-authored Q/A pairs over `fortran-lang/stdlib`, sourced from +`/tmp/stdlib_build/doc/specs/*.md`.
No raw `sources:` directives — +every training row is a `messages` row routed through Qwen's chatml +template. The wedge to track is **Q3: signature of stdlib's sorting +routine.** Finding 03's adapter degenerated into "should be able to +sort..." enumeration; here the model should learn to point at +`stdlib_sorting :: SORT` with proper signature. + +stdlib_sorting density is intentional (~60% of pairs): it's the wedge +target. Other modules (io, math, strings, array) provide variety so +the LoRA isn't a one-module specialist. + +::instruction:: + +### Q +What module in fortran-lang/stdlib provides sorting routines? + +### A +The `stdlib_sorting` module. It provides four overloaded subroutines: +`ORD_SORT`, `SORT`, `RADIX_SORT`, and `SORT_INDEX`, plus the parameters +`int_index` and `int_index_low` for indexing. +::instruction:: + +### Q +Show the signature of stdlib's sorting routine in Fortran. + +### A +```fortran +use stdlib_sorting, only: sort +call sort(array[, reverse]) +``` +The `array` argument is `intent(inout)` and must be a rank-1 array of +an intrinsic numeric type (integer kinds, real kinds), `character(*)`, +`type(string_type)`, `type(bitset_64)`, or `type(bitset_large)`. +The optional `reverse` argument is a scalar logical with `intent(in)`. +::instruction:: + +### Q +What's the difference between `ord_sort` and `sort` in stdlib? + +### A +`ord_sort` is intended for arrays that have significant runs of +decreasing or non-decreasing values — it can achieve `O(N)` performance +on already-sorted or reverse-sorted input. `sort` is intended for +arrays that are effectively unordered before the call. +`ord_sort` is stable; `sort` is not. +::instruction:: + +### Q +What is `radix_sort` in `stdlib_sorting` for? + +### A +`radix_sort` sorts fixed-width intrinsic data types — integers and +reals. It exploits the bit representation of these types for non- +comparison-based sorting and can outperform `sort` on numeric data. 
+::instruction:: + +### Q +What does `sort_index` return? + +### A +`sort_index` sorts the input array AND returns indices that map the +original (pre-sort) array to its sorted version. This lets you reorder +related arrays in the same way without re-sorting them — useful when +sorting one column of a tabular dataset and needing to keep the other +columns aligned. +::instruction:: + +### Q +What's the signature of `ord_sort`? + +### A +```fortran +call ord_sort(array[, work, reverse]) +``` +- `array` : `intent(inout)`, rank-1 array of supported intrinsic type +- `work` (optional): `intent(out)`, rank-1 same type as `array`, at + least `size(array)/2` elements — used as scratch memory to reduce + stack pressure +- `reverse` (optional): scalar default-logical `intent(in)`; if + `.true.`, sort in non-increasing order +::instruction:: + +### Q +What sorting algorithm does `ord_sort` use under the hood? + +### A +A hybrid combining merge sort and insertion sort. For random input it +performs like merge sort with `O(N log N)` worst-case — typically +slower than `sort`. For input with long runs of monotonic values, it +falls back toward `O(N)`, which is its specific advantage over `sort`. +::instruction:: + +### Q +Is `stdlib_sorting`'s `sort` stable? + +### A +No. The `sort` subroutine is not stable. If you need a stable sort, use +`ord_sort`, which is stable in the case of equal keys. `radix_sort` is +also not stable. +::instruction:: + +### Q +What types can `stdlib_sorting`'s `ord_sort` accept? + +### A +Rank-1 arrays of any of: `integer(int8)`, `integer(int16)`, +`integer(int32)`, `integer(int64)`, `real(sp)`, `real(dp)`, `real(qp)`, +`character(*)`, `type(string_type)`, `type(bitset_64)`, or +`type(bitset_large)`. +::instruction:: + +### Q +How do you use `stdlib_sorting`'s `sort` to sort a real array in +descending order? + +### A +```fortran +use stdlib_sorting, only: sort +real, allocatable :: arr(:) +arr = [3.14, 1.41, 2.72, 0.58] +call sort(arr, reverse=.true.) 
+! arr is now [3.14, 2.72, 1.41, 0.58] +``` +::instruction:: + +### Q +What kind of integer does `int_index` evaluate to? + +### A +`int_index` is set to `int64` from the `stdlib_kinds` module — a +64-bit integer kind for indexing large arrays. The companion +`int_index_low` is `int32`. +::instruction:: + +### Q +What's the difference between `sort_adjoint` and `sort_index`? + +### A +Both are based on `ord_sort`. `sort_adjoint` re-orders a *second* +array of the same size using the same permutations applied to the +input. `sort_index` instead returns the index permutation itself, +which the caller can then apply to any number of related arrays. +::instruction:: + +### Q +What's the licensing situation for `stdlib_sorting`? + +### A +The Fortran Standard Library overall is MIT-licensed, but +`ord_sort`, `sort_adjoint`, `sort_index`, and `sort` are translations of +Rust's `slice.rs` sort, which is Apache-2.0 / MIT dual-licensed. +`radix_sort` does not have this Rust ancestry. +::instruction:: + +### Q +Why might you pass the optional `work` argument to `ord_sort`? + +### A +`work` is used as scratch memory for `ord_sort`'s internal record +keeping. Passing a pre-allocated `work` array (e.g. one in static +storage) avoids stack allocation during the sort, which can +significantly reduce stack memory pressure when sorting large arrays +inside deeply nested call chains. +::instruction:: + +### Q +What happens if you `ord_sort` a real array containing `NaN`? + +### A +The order of the result is undefined. The same applies to `sort`, +`sort_adjoint`, `sort_index`, and `radix_sort`. If your data may +contain NaN values, filter them out before calling stdlib's sort +routines. +::instruction:: + +### Q +Show how to sort an integer array AND get indices to reorder a +companion array using `stdlib_sorting`. 
+ +### A +```fortran +use stdlib_sorting, only: sort_index, int_index +integer :: keys(5) = [3, 1, 4, 1, 5] +real :: vals(5) = [10.0, 20.0, 30.0, 40.0, 50.0] +integer(int_index) :: idx(5) +call sort_index(keys, idx) ! keys is now sorted; idx maps original->sorted +vals = vals(idx) ! reorder vals using the same permutation +``` +::instruction:: + +### Q +What are the four overloaded subroutines in `stdlib_sorting`? + +### A +`ORD_SORT`, `SORT`, `RADIX_SORT`, and `SORT_INDEX`. (`SORT_ADJOINT` +exists but the spec groups it under `ORD_SORT`'s family.) +::instruction:: + +### Q +Are stdlib's sort routines thread-safe? + +### A +Each call modifies its `intent(inout)` `array` argument in place, so +two threads must not call sort on the same array concurrently. With +distinct arrays, there's no shared state in the sort routines +themselves — they're reentrant. +::instruction:: + +### Q +What is the `stdlib_io` module for in fortran-lang/stdlib? + +### A +`stdlib_io` provides text-file IO routines — primarily `loadtxt` for +reading delimited numeric data into rank-2 arrays and `savetxt` for +writing them. It also exposes formatting helpers like `disp` for +human-readable display of arrays and scalars. +::instruction:: + +### Q +Show the signature of `stdlib_io::loadtxt`. + +### A +```fortran +use stdlib_io, only: loadtxt +call loadtxt(filename, array[, skiprows, max_rows, fmt]) +``` +The `array` is allocated and filled from the file; supported types +include `real(sp)`, `real(dp)`, and `complex(dp)`. `skiprows` skips +header lines, `max_rows` caps the read. +::instruction:: + +### Q +What's `stdlib_string_type`'s purpose? + +### A +It defines `type(string_type)` — a deferred-length, allocatable string +wrapper around `character(:)` — and overloads operators (`==`, `<`, +`//`) for it. This gives Fortran a value-semantic variable-length +string type that's ergonomic in containers and arrays, working around +the limitations of bare `character(*)`. 
+::instruction:: + +### Q +What does `stdlib_array` provide? + +### A +`stdlib_array` provides index-manipulation utilities — `trueloc` and +`falseloc` for finding indices where a logical array is true or false, +plus indexing helpers. Useful for masking and selecting elements +without writing manual loops. +::instruction:: + +### Q +What is `stdlib_math::linspace`? + +### A +`linspace(start, end, n)` returns a rank-1 array of `n` evenly-spaced +values from `start` to `end` inclusive. Same shape as numpy's `linspace`. +The companion `logspace(start, end, n[, base])` returns evenly-spaced +values in log space. +::instruction:: + +### Q +What is `stdlib_stats::mean` for? + +### A +Computes the arithmetic mean of a rank-N array along an optional +specified dimension. With `dim` omitted, returns a scalar mean over +all elements. With `dim=k`, returns a rank-(N-1) array of means along +the k-th axis. Optionally accepts a `mask` for conditional inclusion. +::instruction:: + +### Q +What's `stdlib_linalg::solve` for? + +### A +Solves a dense linear system `A . x = b` for `x`, where `A` is a +square matrix and `b` is a right-hand-side vector or matrix. Wraps +LAPACK's `gesv` family. Returns the solution; the input `A` is +typically modified in-place by the LU decomposition. +::instruction:: + +### Q +What `kind` parameters does `stdlib_kinds` define? + +### A +`int8`, `int16`, `int32`, `int64` for integer kinds and `sp`, `dp`, +`qp` for real kinds (single, double, quadruple precision). Also +`xdp` for extended-double where supported. These wrap the standard +`iso_fortran_env` kinds with shorter names. +::instruction:: + +### Q +What does `stdlib_ascii::to_upper` do? + +### A +Returns its `character(*)` argument with ASCII letters (a-z) mapped to +their uppercase equivalents (A-Z). Non-letter characters pass through +unchanged. The companion `to_lower` does the opposite. +::instruction:: + +### Q +What's `stdlib_strings::replace_all`? 
+ +### A +```fortran +use stdlib_strings, only: replace_all +result = replace_all(string, pattern, replacement) +``` +Returns `string` with every occurrence of `pattern` substituted by +`replacement`. Pure and elemental, so it can be applied to scalar or +array string inputs. +::instruction:: + +### Q +What's the difference between `stdlib_random::random_seed` and +Fortran's intrinsic `random_seed`? + +### A +The intrinsic `random_seed` configures the implementation-dependent +default RNG. `stdlib_random::random_seed` is part of stdlib's +deterministic RNG path — it seeds stdlib's portable distribution +sampling routines, giving reproducible random sequences across +compilers. +::instruction:: + +### Q +What does `stdlib_hashmaps::hashmap_type` provide? + +### A +A generic open-addressing hash map keyed by integer or string and +storing arbitrary user-defined data. `set`, `get`, `remove`, `keys`, +and iterators are provided as type-bound procedures. +::instruction:: + +### Q +What's `stdlib_quadrature::trapz` for? + +### A +Numerical integration via the trapezoidal rule. `trapz(y, x)` returns +the integral of y dx using sample points `x` and corresponding values +`y`. Both must be rank-1 arrays of the same length. `simps` is the +cousin using Simpson's rule. +::instruction:: + +### Q +How does the `stdlib_logger` module emit log messages? + +### A +Define a `logger_type` instance, configure its level (`debug`, `info`, +`warning`, `error`), then call methods like `log_information`, +`log_warning`, etc. Output goes to the configured unit (default +stderr). Logger instances can be configured with timestamps, source +locations, and log-level filtering. +::instruction:: + +### Q +What does `intent(in)` mean for a Fortran dummy argument? + +### A +The argument may be read but not modified inside the procedure. +Attempting to assign to or pass it as `intent(out)`/`intent(inout)` +to another routine is a compile-time error (in conformant compilers). 
+This is the contract you want for read-only inputs. +::instruction:: + +### Q +When should you prefer `do concurrent` over a plain `do` loop? + +### A +When loop iterations have no carried dependence — that is, no +iteration reads or writes a variable that another iteration could +write. `do concurrent` expresses this independence to the compiler, +which can then auto-vectorize, parallelize via OpenMP, or offload to +GPU without further pragmas. If iterations are actually dependent, +use plain `do`. +::instruction:: + +### Q +What's the canonical way to declare an allocatable array of +`real(real64)` in modern Fortran? + +### A +```fortran +use iso_fortran_env, only: real64 +real(real64), allocatable :: arr(:) +allocate(arr(n)) +! ... use arr ... +deallocate(arr) +``` +Or use stdlib's `dp` kind: +```fortran +use stdlib_kinds, only: dp +real(dp), allocatable :: arr(:) +``` diff --git a/docs/audits/13-followup/findings/00-overview.md b/docs/audits/13-followup/findings/00-overview.md new file mode 100644 index 0000000..a3657df --- /dev/null +++ b/docs/audits/13-followup/findings/00-overview.md @@ -0,0 +1,112 @@ +# Audit 13 follow-up — investigation log + +A sequence of single-variable experiments, each isolating one +hypothesis about why the original audit-13 fortran fine-tune showed +"adherence works, attribution fails." Reading order: + +1. **[Finding 01 — recipe failure](./01-recipe-failure.md)** + Re-run audit-13 with 5× more INSTRUCTION sections + 5× more steps + + sway bridge probes lit. Result: traded "no Q/A binding" for + "memorization without generalization." Falsified the *volume-only* + hypothesis. + +2. **[Finding 02 — CPT isolation](./02-cpt-isolation.md)** + Stripped INSTRUCTION sections, ran pure CPT at LoRA r=64. Result: + adapter learned form-specific autoregression (memorization + + English regression), not domain abstraction. 
Falsified the + *two-stage CPT-then-SFT* hypothesis at this base size and surfaced an + architectural floor: SmolLM2-135M lacks the capacity to compose + fortran knowledge with English chat behavior. + +3. **[Finding 03 — base-size floor + recipe is the bottleneck](./03-base-floor.md)** + Promoted to `qwen2.5-coder-1.5b`, ran the audit-13-followup doc + without recipe changes. Result: bigger base eliminated catastrophic + forgetting (`cal_general` 26% → 0% items regressed) **but the LoRA + still memorized instead of generalizing**, and actively degraded + the wedge question (Q3 stdlib sorting). Isolated the bottleneck: + it's the *recipe* (raw-source training data shape), not the base + size. dlm's value-add is conditional on training-data shape. + +4. **[Finding 04 — Q/A-shape recipe closes the wedge](./04-instruction-shape.md)** + Built a doc with 35 hand-authored INSTRUCTION sections (no raw + sources), trained on qwen-coder-1.5b at r=16 / 100 epochs. + Result: **Q3 now lands correctly** — verbatim trained answer + pointing at `stdlib_sorting :: sort` with proper signature. + General capability preserved. Generalization to unseen but + overlapping questions is partial; to unseen-different-module + questions, weak. Surfaced two real dlm bugs along the way + (replay store parser bug + MLX backend silently ignoring PEFT + adapters). The audit closes GREEN with a clean three-step + product story. + +5. **[Finding 05 — corpus density](./05-corpus-density.md)** *(optional)* + Test the dataset-size / generalization curve. Find the per-module + Q/A density floor below which the model can't generalize the API + form. Gives dlm users a concrete planning number ("budget N Q/A + pairs per module"). + +## Why this format + +Each file is self-contained — one experiment, one verdict, one +falsified-or-confirmed hypothesis. No "TODO" findings; if a +hypothesis is in-flight, it's either at the bottom of the latest +finding ("next experiment") or has its own placeholder file.
This +keeps the investigation traceable: any future reader can look at the +state of the directory and reconstruct what we tested, what we +ruled out, and what we still don't know. + +## Standing artifacts + +- `the-doc.dlm` — the audit-13-followup canonical doc (Finding 01) +- `stage1/the-doc.dlm` — PROSE-only stage-1 doc (Finding 02) +- `stage1/sway.yaml` — stage-1 sway eval spec +- `sway-results.json` / `sway-results.md` — Finding 01's raw sway run +- `/tmp/sway-stage1.json` — Finding 02's raw sway run *(local-only)* +- `train.log` — Finding 01's training output + +## What we know so far + +**Confirmed:** +- The dlm pipeline works end-to-end: doc → train → adapter → sway → + direct query, all deterministic and reproducible. The plumbing is + sound. +- The bridge probes (`section_internalization`, `leakage`, + `paraphrase_invariance`) require specific corpus shape: ≥2 section + kinds (PROSE + INSTRUCTION minimum) for the leak-check, and + `!probe` markers for paraphrase case generation. +- SmolLM2-135M produces memorization, not generalization, on this + corpus regardless of recipe variations within reach. +- Bigger base (qwen2.5-coder-1.5b) eliminates catastrophic + forgetting — `cal_general` regressions 26% → 0%. dlm's + recommended-base table should warn that small bases (135M) actively + degrade under LoRA. +- **Recipe shape is the bottleneck, not base size.** Same doc on a + bigger base still produces memorization; LoRA on raw-source training + rows learns "be a source autocomplete engine," not "answer questions + about the domain." + +- **Q/A-shape recipe on qwen-coder-1.5b closes the wedge.** Finding + 04 trained 35 hand-authored Q/A pairs and produced an adapter that + reproduces trained answers verbatim, preserves general capability, + and partially activates domain knowledge on related questions. The + three-step product story (bigger base + Q/A-shape recipe + plan + one pair per question) has falsifiable evidence at each step. 
+ +**Unknown:** +- The dataset-size / generalization curve. With 35 hand-authored Q/A + pairs, generalization to unseen-different-module questions is + weak. How many pairs per module are needed for the model to + abstract the API form? Finding 05 (optional) would answer this. +- Whether dlm-synthesized Q/A pairs (via `dlm synth instructions`) + produce comparable training results to hand-authored ones, or + whether teacher quality matters. This is a question for the + product onboarding story. + +**Bugs filed during the investigation (worth fixing in dlm):** +- `src/dlm/replay/store.py:187` — `parse_instruction_body` called + without `_normalize_probe_markers`. Patched in this branch as + part of Finding 04. +- MLX inference backend silently ignores PEFT adapters on darwin-arm64 + (auto-routing falls through to MLX, MLX loads base only). User- + visible failure is "trained model behaves like base" — major + product-trust risk. Workaround: `--backend pytorch`. diff --git a/docs/audits/13-followup/findings/01-recipe-failure.md b/docs/audits/13-followup/findings/01-recipe-failure.md new file mode 100644 index 0000000..afef44d --- /dev/null +++ b/docs/audits/13-followup/findings/01-recipe-failure.md @@ -0,0 +1,146 @@ +# Audit 13 follow-up — findings + +**Verdict:** the corpus-shape hypothesis was *not* falsified, but the +re-run produced an **adapter that memorizes more and generalizes less** +than audit 13's smaller adapter. Five-fold INSTRUCTION density + 5× +steps moved the failure from "no Q/A binding" (audit 13) to "verbatim +memorization without prompt-level behavior change" (this run).
+ +## Setup + +- **Doc:** `docs/audits/13-followup/the-doc.dlm` +- **Store ULID:** `01KQCWMA64901VEMYB3DC8CGXY` +- **Adapter:** `~/.dlm/store/01KQCWMA64901VEMYB3DC8CGXY/adapter/versions/v0001` +- **Steps:** 600 (audit 13 = 120) +- **LoRA r/alpha:** 16/32 (audit 13 = 8/16) +- **Sources:** `~/GithubOrgs/FortranGoingOnForty` + `/tmp/stdlib_build/src` + + `/tmp/stdlib_build/doc/specs` markdown +- **Final loss:** train 1.452, val 1.401 (audit 13 = 1.83 at step 120, still descending) + +## sway results, side-by-side + +| probe | audit 13 (baseline) | follow-up (this run) | delta | +| --- | --- | --- | --- | +| `delta_kl` | **PASS, z=+44.16σ** | **FAIL, z=−0.15σ** | massive regression | +| `section_internalization` | SKIP (no `[dlm]`) | **~0, z≈0** | weak signal | +| `paraphrase_invariance` | FAIL, z=−3.51σ | ERROR ("no cases provided") | unresolved | +| `leakage` | SKIP (no `[dlm]`) | **PASS, z=+8.45σ** | strong memorization | +| `calibration_drift` | PASS | borderline (5/50 regressed) | mild degradation | +| `adapter_ablation` | PASS (R²=0.91) | **FAIL, overshoot=1.54** | failed sat-band | + +## Diagnosis + +### Why `delta_kl` collapsed +Audit 13's small adapter on the same prompts moved logits much more +than its null-baseline counterpart (z=+44σ). This run's mean JS +divergence on the prompts is `0.151` — a meaningful number in +absolute terms, but the null-adapter calibration baseline is *also* +≈0.15 because random LoRA weights at r=16 already perturb logits a +lot on short fortran prefix prompts. The trained adapter doesn't +*beat noise* on these specific prompt prefixes. + +This is calibration-shape sensitivity, not "the adapter learned +nothing." Different prompt selection would surface the signal. 
+ +### Why `section_internalization` is ≈0 (this is the load-bearing finding) +Per-section evidence in `/tmp/sway-13fu.json`: + +- `own_lift` (NLL improvement on the section's own probes) ≈ 0.030 +- `leak_lift` (NLL improvement on *other* sections' probes) ≈ 0.064 +- `effective_sis = own_lift − leak_lift` ≈ **−0.034** + +`leak_lift` is *uniform* across sections at ~0.064. That means the +adapter applied a **constant fortran-flavored prior** to every +fortran-shaped prompt — not section-specific knowledge. The adapter +learned "this looks like fortran, lower NLL" but did **not** learn +"section X said Y, so on probes about Y, lower NLL more." + +That's the corpus-shape issue surfacing differently. With 40 INSTRUCTION +sections that mostly differ only in surface wording, the adapter +reduced to a generic fortran-style bias. + +### Why `leakage` jumped to +8.45σ +greedy LCS recall against perturbed prompts is high (0.06 on +perturbed vs 0.05 baseline; fragility 0.17). Combined with weak +`section_internalization`, this is the textbook **memorization** +signature: the model can recite chunks of training data when prompted, +but the knowledge isn't transferable. + +### Why `paraphrase_invariance` errored ("no cases provided") +The bridge's case generator wants paired (original, paraphrased) probe +items, and it got none. Likely cause: the audit follow-up `.dlm` puts +INSTRUCTION sections in compact `Q:`/`A:` shape but without the `!probe` +markers that the bridge's section→probes mapper looks for. (The +parser-expansion PR #10 fixed *parsing* of these blocks; it did not add +`!probe` markers to the audit doc.) + +This isn't a sway bug — it's a doc-shape gap. The follow-up doc was +authored before `dlm synth instructions --apply` was used. + +### Why `adapter_ablation` overshot (R²=0.91, sat_λ=1.25, overshoot=1.54) +λ-scaled KL: at λ=1.25 KL is 1.54× the λ=1.0 KL. The healthy band is +overshoot ≤1.05 with sat_λ ∈ (0.5, 1.0]. 
This curve never saturated — +which mathematically reads as **the adapter is under-magnitude**: +scaling its contribution beyond the nominal training point keeps +pulling logits toward the same direction. R²=0.91 is fine (linear +response is healthy); the problem is the *band*, not the shape. + +This often co-occurs with under-training rather than over-training. +With a memorization signature elsewhere though, the more likely read +is *low-rank knowledge that the rest of the model can't compose with* — +the LoRA delta is small, the rest of the model produces fortran-flavored +output by base capability, and λ-scaling the small delta linearly +amplifies its directional bias. + +## Direct query smoke test + +``` +$ uv run --no-sync dlm prompt the-doc.dlm \ + "How do you declare an allocatable array of real(real64) in modern Fortran?" \ + --max-tokens 120 --temp 0.0 +I am using the following code: +real(real64) array(1000000000, 1000000000) +I am using the following code: +real(real64) array(1000000000, 1000000000) +... +``` + +The adapter clearly learned fortran-shaped output (it uses `real(real64)`, +correct kind syntax) — but did not learn the *answer pattern* for a +typed Q/A query. It loops on a single memorized fragment. + +## Implications for the audit + +1. **Original audit 13 finding (`paraphrase_invariance` FAIL on 8 + instruction sections) was load-bearing.** Adding 5× more sections + without changing their *shape* worsened generalization. The fix + wasn't volume — it was Q/A shape diversity (different question + forms for the same content). + +2. **Null-adapter calibration is prompt-sensitive.** A trained adapter + at r=16 needs prompts where the null adapter produces *less* JS + divergence — i.e. prompts with strong base-model priors. Short + fortran-fence prefixes don't qualify. + +3. **The bridge `paraphrase_invariance` probe needs `!probe` markers + on instruction sections** (or sway should derive them). 
Without + markers + paraphrases, the bridge silently becomes "no cases." + +## Recommended next steps (not started) + +- **Generate paraphrases.** `dlm synth instructions --strategy paraphrase + --per-section 3 --apply` against the existing 40 INSTRUCTION sections, + then re-run sway. This populates the bridge's paraphrase cases and + also breaks the surface-form uniformity that drove `effective_sis ≈ 0`. +- **Tier comparison at SmolLM2-360M and 1.7B.** If 600 steps + 40 sections + + paraphrases still doesn't bind Q/A on 135M, the floor is the model. +- **Adjust calibration prompts.** For `delta_kl`, swap the fortran-fence + prefixes for prompts where SmolLM2 has a strong *English* prior + (e.g. "When should you prefer `do concurrent` over `do`? Answer:") so + the null baseline produces low JS divergence and the trained adapter's + divergence shows up against noise. +- **Don't claim the e2e fortran promise yet.** This run shows the .dlm → + trainable adapter pipeline works (training completed deterministically, + adapter saved, sway runs end-to-end with bridge probes lit) — but it + also shows that *the right adapter* requires more corpus engineering + than "more sections + more steps." That's a real product caveat. diff --git a/docs/audits/13-followup/findings/02-cpt-isolation.md b/docs/audits/13-followup/findings/02-cpt-isolation.md new file mode 100644 index 0000000..7d3ec59 --- /dev/null +++ b/docs/audits/13-followup/findings/02-cpt-isolation.md @@ -0,0 +1,143 @@ +# Finding 02 — CPT-only stage isolates a *new* failure mode + +**Hypothesis tested:** the original audit-13-followup adapter showed +"uniform fortran-flavored bias" without per-section content learning +(Finding 01). My theory: the PROSE corpus drowned the INSTRUCTION +signal. If we strip INSTRUCTION sections entirely and run pure CPT at +higher LoRA rank, the adapter should at least *learn the corpus* — and +we can layer Q/A on top in stage 2. 
+ +**Verdict:** the hypothesis was *partially* right (the adapter does +learn fortran tokens) but uncovered a stronger failure mode that +invalidates the two-stage chain idea on a 135M base. + +## Setup + +- **Doc:** `docs/audits/13-followup/stage1/the-doc.dlm` +- **Store ULID:** `01KQDGAM70EJ1WJCQY6PVDV95W` +- **Sources:** identical to Finding 01's run (FortranGoingOnForty + stdlib src + stdlib doc/specs markdown) +- **Sections:** PROSE-only, *zero* INSTRUCTION +- **LoRA r/alpha:** 64/128 (Finding 01 used 16/32) +- **Steps:** 800 (Finding 01 used 600; audit 13 used 120) +- **Final:** train 1.293, eval 1.223, mean token accuracy 73.6% + +## sway results + +| probe | verdict | z | reading | +| --- | --- | --- | --- | +| `dk_fortran_qa_shaped` | FAIL | **−13.74σ** | adapter shifts *less* than null on English-Q/A prompts | +| `sis_fortran` | SKIP | — | bridge needs ≥2 section kinds; PROSE-only doesn't qualify | +| `leak_fortran` | PASS | +6.41σ | memorized fragments (greedy_recall=0.06, fragility=0.00) | +| `cal_general` | FAIL | −4.81σ | **26%** general-comp items regressed >1 nat (was 10% in Finding 01) | +| `abl_fortran` | FAIL | — | overshoot=1.49, sat_λ=1.25 (out of band) | + +## Direct-query smoke + +Greedy (`temp=0.0`): + +``` +$ dlm prompt the-doc.dlm "module simple_demo\n implicit none" --max-tokens 80 + implicit none + implicit none + implicit none + implicit none + ... (24× repetition) + +$ dlm prompt the-doc.dlm "How do you declare an allocatable real(real64) array?" +I am using the following code: +real(real64) array(1000000000, 1000000000) +I am using the following code: +real(real64) array(1000000000, 1000000000) +... 
(loops on a single training fragment) +``` + +Sampling (`temp=0.7`, `top_p=0.9`): + +``` +$ dlm prompt the-doc.dlm "subroutine sort_real_array(arr, n)" --max-tokens 100 +{ + arr = std::move(arr); +} +template +void array_sort(T* arr, int n) { + array_sort(arr, n); +} +``` + +— **C++ from a fortran prompt under sampling.** The base model's C++ +prior dominates the moment we leave argmax decoding. The LoRA delta is +just memorized argmax tokens, not a generalized fortran prior. + +## Diagnosis + +Three signals together tell the architectural story: + +1. **Mode collapse under greedy.** The adapter pushes a small set of + fragments to the top of the distribution; argmax decoding gets trapped in + them. `leakage` z=+6.41σ + `fragility=0.00` is the textbook + memorization fingerprint. +2. **Adapter inactive on non-fortran-shaped prompts.** + `dk_fortran_qa_shaped` z=−13.74σ means the trained LoRA produces + *less* divergence from base than a random LoRA on + English-Q/A-shaped prompts. The training only taught the adapter + to be active on raw-fortran-source-shaped inputs. +3. **General competence regressed more than twice as much as Finding 01.** + `cal_general` 26% vs 10% items >1-nat regressed. Removing + INSTRUCTION sections made things *worse* on English. The + INSTRUCTION sections in the audit-13-followup were apparently + acting as a chat-format regularizer. + +The deepest read: **the LoRA learned form-specific completion (raw +fortran source autoregression), not domain knowledge.** This is +exactly what we asked SFT loss to do — every PROSE row's training +signal is "predict the next token of fortran source." We got what we +asked for. The token-distribution prior on `module x\n` is now sharper +toward fortran continuations, but the *concept* of fortran isn't +abstracted in a way that helps with English-prefixed questions. + +## Why this invalidates the chain + +The plan was: stage-1 CPT teaches fortran → stage-2 SFT layers Q/A.
+The implicit assumption was that stage-1 produces a *fortran-aware +substrate* for stage-2 to bind onto. Instead stage-1 produced a +*memorization trap* that: + +- Has degraded English chat capability (cal_general −4.81σ) +- Doesn't activate on English-prefixed prompts (dk z=−13.74σ) +- Mode-collapses under greedy decoding + +A stage-2 SFT phase against this substrate would have to *un-do* the +memorization while teaching Q/A binding. SFT can't easily do both at +the same LoRA rank — and the ablation curve (overshoot=1.49, no +saturation in band) suggests the loss surface around the trained +point is linear, meaning the LoRA is still in "more is more" mode +rather than at a coherent minimum. + +## What this means for the product narrative + +dlm's promise is "edit a text file → trainable LLM." The architectural +reality from Findings 01 + 02: + +- **At SmolLM2-135M:** the recipe consistently produces memorization, + not generalization. Volume, rank, sequence length, and corpus + shape variations have all bottomed out at the same failure mode. +- **The base model is the floor.** 135M params with our LoRA adapter + can either preserve English-chat behavior *or* memorize fortran + fragments — not compose both into a usable expert. + +This is a *correct* and *useful* negative result for the product +narrative. It tells future users: don't expect a 135M base to absorb +specialty domains. It also informs dlm's recommended-base table — +the smollm2-135m row should carry a "use for style-transfer demos +only" caveat. + +## Next experiment + +[Finding 03](./03-base-floor.md) — promote the base to +qwen2.5-coder-1.5b (already registered in dlm). Test the question: +**is the recipe sound, and the 135M floor was the only blocker?** If +the same `the-doc.dlm` (audit-13-followup) on the bigger base +produces measurable LoRA delta beyond the base's existing fortran +knowledge, we have a working dlm story. 
If the LoRA delta is small, +we learn that dlm's value-add is style/format, not domain knowledge — +which is also publishable. diff --git a/docs/audits/13-followup/findings/03-base-floor.md b/docs/audits/13-followup/findings/03-base-floor.md new file mode 100644 index 0000000..ab8aff8 --- /dev/null +++ b/docs/audits/13-followup/findings/03-base-floor.md @@ -0,0 +1,140 @@ +# Finding 03 — base-size floor + the recipe is the bottleneck + +**Hypothesis tested:** Findings 01 and 02 left two confounded +explanations for the failure mode (memorization without +generalization): (a) SmolLM2-135M is too small, or (b) the recipe is +wrong. Finding 03 isolates these by training the **same audit-13 +follow-up doc** on `qwen2.5-coder-1.5b` — same recipe, 11× the +parameters, code-pretrained. + +**Verdict:** the recipe is the bottleneck. The bigger base eliminated +catastrophic forgetting (`cal_general` 26% → **0%** items regressed) +but the LoRA still memorizes instead of generalizes, and **actively +degrades the base on the wedge question we built the experiment to +answer**. + +## Setup + +- **Doc:** `docs/audits/13-followup/finding03/the-doc.dlm` +- **Store ULID:** `01KQDM14CT0X6AWFWTW9E81ZJ7` +- **Adapter:** `~/.dlm/store/01KQDM14CT0X6AWFWTW9E81ZJ7/adapter/versions/v0001` +- **Base:** `qwen2.5-coder-1.5b` (Qwen/Qwen2.5-Coder-1.5B-Instruct) +- **LoRA r/alpha:** 16/32 (same as Finding 01) +- **Steps:** 600 (same as Finding 01) +- **Final:** train 0.765, eval 0.750, mean token accuracy **82.2%** (vs 73.6% on the SmolLM2 stage-1 run) + +## The pre-training baseline (`finding03/baseline.md`) + +Before training, qwen2.5-coder-1.5b already speaks fortran fluently — +syntax, modern features (`do concurrent`, `intent`, `allocatable`), +program structure. What it does **not** know is `fortran-lang/stdlib`. +Q3 ("Show the signature of stdlib's sorting routine") makes the gap +visible: the base hallucinates `iso_fortran_env::sort`, a non-existent +module. 
**That hallucination is the wedge: a falsifiable claim our +LoRA could fix if domain knowledge transfer is real.** + +## sway results + +| probe | verdict | z | reading vs Finding 02 (SmolLM2-135M) | +| --- | --- | --- | --- | +| `dk_fortran` | FAIL | **−15.46σ** | LoRA shifts logits *less* than null on these prompts (similar to F02's −13.74σ) | +| `sis_fortran` | FAIL | ~0σ | 7/41 sections cleared, mean effective_sis = +0.000. Same uniform-bias collapse as F01. | +| `para_fortran` | ERROR | — | "no cases provided" — doc still lacks `!probe` markers | +| `leak_fortran` | PASS | **+7.71σ** | Memorization, fragility=0.00 (≈ F02's +6.41σ) | +| `cal_general` | **PASS** | — | **0/50 items regressed** (F02 had 26%, F01 had 10%) | +| `abl_fortran` | FAIL | — | overshoot=1.21 (vs F02's 1.49); R²=0.99 — linear loss surface, still under-saturated | + +## Direct-query smoke (the wedge) + +| Q | Base alone | Trained adapter | Net | +| --- | --- | --- | --- | +| Q1 (allocatable real64) | correct | correct | **= same** | +| Q2 (do concurrent) | partial | rambles + unrelated code | **worse** | +| Q3 (stdlib sorting — THE WEDGE) | wrong (hallucinates `iso_fortran_env::sort`) | degenerate enumeration loop | **worse** | +| Q4 (intent(in)) | correct | correct | **= same** | + +**0/4 improved. 2/4 degraded.** The LoRA did not add stdlib knowledge; +it added autoregressive fortran-source-completion habits that +interfere with Q/A. Full transcripts at +`docs/audits/13-followup/finding03/direct-query-results.md`. + +## What we learned + +### 1. Base size is *not* the floor — it's the regularizer + +Catastrophic forgetting collapsed from 26% (F02) → 10% (F01) → **0% +(F03)** as we moved from 135M to a bigger code-pretrained base. The +bigger base absorbs the LoRA without losing English chat capability. +This is a clean architectural finding: + +> A LoRA adapter on a small base actively degrades the base. 
A LoRA +> adapter on a sufficiently capable base is "free" — the base carries +> through unchanged on what it already knew, and the LoRA contributes +> on top. + +For dlm's product narrative: **the recommended base table needs a +warning at SmolLM2-135M.** That base is for style demos, not domain +knowledge addition. + +### 2. The recipe is the bottleneck + +With the bigger base eliminating the noise floor of catastrophic +forgetting, the *recipe-level* failures become unambiguous: + +- `leak_fortran` +7.71σ — the LoRA memorizes raw fortran source + fragments +- `dk_fortran` z=−15.46σ — the LoRA does not activate on Q/A-shaped + prompts +- `sis_fortran` effective_sis=0 — uniform fortran-flavored bias, not + per-section content +- Q3 wedge — the LoRA *makes the stdlib gap worse* by replacing a + concrete-but-wrong base answer with a degenerate enumeration loop + +These all share one root cause: **the training corpus is shaped like +raw source files, not Q/A pairs.** SFT's loss is "predict next token"; +on raw source rows that means "complete fortran source," and the LoRA +learns exactly that. The 5MB FortranGoingOnForty source drowns the +1MB stdlib source (most of which exceeds the 32KB-per-file cap and is +silently skipped) and the 585KB of stdlib doc/specs markdown. The +LoRA never sees enough Q/A-shaped data to bind the format. + +### 3. The adapter ablation curve has improved but not healed + +Saturation overshoot dropped 1.49 (F02) → 1.21 (F03). R² rose to 0.99 +(very linear). The trend says: with more training the curve might +saturate properly. But the loss is at 0.75 (eval), down from 0.95 → +0.6 across the 600 steps, and still descending. We're not at a real +minimum. 
**More steps would extract more memorization, not more +generalization.** + +## Implications for the dlm product narrative + +We now have a clean, falsifiable, evidence-backed claim: + +> **dlm's value-add depends on training-data shape, not base capacity.** +> A LoRA trained on raw source code teaches the adapter to be a source +> autocomplete engine, regardless of base size. To produce a question- +> answering domain expert, the training data must itself be Q/A-shaped. + +This is a *positive* product story, not a negative one — it tells dlm +users "use INSTRUCTION sections (or generate them from PROSE via +`dlm synth instructions`), not raw `sources:` directives, when the goal +is a chat assistant." The current docs don't make this distinction +clearly enough; that's a docs fix worth landing. + +## Next experiment + +[Finding 04 — Q/A-shaped training](./04-instruction-shape.md) +*(pending)* + +The fix to test next: build a doc with INSTRUCTION sections constructed +from the stdlib `doc/specs/*.md` corpus (which is Q&A-friendly +documentation, unlike raw source files). Use `dlm synth instructions +--strategy extraction` with a strong teacher (claude-haiku, qwen-coder-7b, +or hand-author) to produce dense Q/A pairs. Train on qwen-coder-1.5b at +r=16 with no raw `sources:` directives. The wedge to track is still Q3: +does the trained adapter point at `stdlib_sorting` instead of producing +a degenerate loop? + +If Q3 lands correctly, the dlm story is: bigger base + Q/A-shape recipe → +working domain expert. That's the experiment that *closes* the audit. 
diff --git a/docs/audits/13-followup/findings/04-instruction-shape.md b/docs/audits/13-followup/findings/04-instruction-shape.md new file mode 100644 index 0000000..f7d5528 --- /dev/null +++ b/docs/audits/13-followup/findings/04-instruction-shape.md @@ -0,0 +1,156 @@ +# Finding 04 — Q/A-shape recipe closes the wedge + +**Hypothesis tested:** Findings 01–03 isolated training-data shape as +the bottleneck — LoRA on raw source files learns to be a source +autocomplete engine, not a question-answering bot. The fix: train on +INSTRUCTION-shaped Q/A pairs only, no `sources:` directives. Use +qwen2.5-coder-1.5b as base (Finding 03 confirmed it preserves English +capability under LoRA training). + +**Verdict:** the hypothesis is **confirmed**. The recipe-shape fix +produces correct trained-question answers and preserves general +capability. Generalization to *nearby* unseen questions is partial +and dataset-size-dependent. + +## Setup + +- **Doc:** `docs/audits/13-followup/finding04/the-doc.dlm` +- **Store ULID:** `01KQDWAHNJ7FD72EQ4J4FXBJ2V` +- **Adapter:** `~/.dlm/store/01KQDWAHNJ7FD72EQ4J4FXBJ2V/adapter/versions/v0002` +- **Base:** `qwen2.5-coder-1.5b` +- **Sections:** 35 INSTRUCTION (32 train / 3 val), 0 PROSE, 0 raw `sources:` +- **Recipe:** r=16 / α=32 / lr=2e-4 / 100 epochs (early-stopped at epoch 44, step 400) +- **Final:** train loss 0.62, eval loss 0.047, **mean token accuracy 98.6%** + +The corpus is ~60% stdlib_sorting (the wedge target), 40% other +modules (io, math, strings, array, kinds, ascii, hashmaps, quadrature, +logger, plus a few general-fortran questions like `intent(in)` and +`do concurrent`). + +## The wedge: closed + +The audit's wedge question (Q3 from Finding 03's baseline): + +> Show the signature of stdlib's sorting routine in Fortran. + +**Base:** *wrong.* Hallucinates `iso_fortran_env::sort`. +**Finding-03 LoRA (raw-source training):** *worse.* Degenerate +"should be able to sort an array of integers in ascending order. 
The +sorting routine should be able to sort an array of floating-point +numbers..." enumeration loop. +**Finding-04 LoRA (Q/A-shape training):** ✓ correct, verbatim: + +```fortran +use stdlib_sorting, only: sort +call sort(array[, reverse]) +``` +The `array` argument is `intent(inout)` and must be a rank-1 array of +an intrinsic numeric type (integer kinds, real kinds), `character(*)`, +`type(string_type)`, `type(bitset_64)`, or `type(bitset_large)`. The +optional `reverse` argument is a scalar logical with `intent(in)`. + +## Generalization curve + +| Question type | Result | Verdict | +| --- | --- | --- | +| **Seen exactly** (Q3 wedge, intent(in)) | verbatim correct | ✓ memorized cleanly | +| **Unseen, overlap** ("sort a real(dp) array") | knows `stdlib_sorting`, invents wrong call form | partial | +| **Unseen, different module** (`stdlib_strings::starts_with`) | falls back to Rust syntax | none | +| **Unseen, different module** ("read a CSV with stdlib") | hallucinates plausible API | none | +| **Out-of-domain** (capital of France, Python list comprehension) | unchanged from base | ✓ preserved | + +Full transcripts at `docs/audits/13-followup/finding04/direct-query-results.md`. + +## What we learned + +### 1. The recipe-shape hypothesis is correct + +LoRA trained on INSTRUCTION-only Q/A pairs produces an adapter that +*answers questions* in the trained format. LoRA trained on raw source +files produces an adapter that *autocompletes source code*. Same base, +same rank, same compute — completely different behavioral character. +This is the cleanest finding of the investigation. + +### 2. 
The dataset-size / generalization tradeoff is sharp + +With 32 hand-authored Q/A pairs: +- 100% trained-question fidelity (98.6% eval token accuracy) +- Strong reproduction of trained answers under varied prompt phrasing +- Partial knowledge of "named entities" (`stdlib_sorting`, + `stdlib_io`, `loadtxt`) appears in unseen-question responses +- API-form generalization is weak — model invents plausible-looking + syntax instead of generalizing the patterns it saw + +This means dlm's product story isn't "learn a domain from a few +examples" — it's "learn the questions you actually want to answer, +with one training row per question." That's a more honest story and +also more practically useful: users know what they're getting. + +### 3. The bigger base preserves general capability under aggressive LoRA training + +Finding 03 already showed `cal_general` 0% regression on +qwen2.5-coder-1.5b. Finding 04 replicates this even with 100-epoch +overfit-style training: the model still answers "What is the capital +of France?" correctly and writes valid Python list comprehensions. +This is the architectural property that makes the recipe-shape fix +viable — at SmolLM2-135M, the same overfit training would have +shredded English chat capability. + +## Bugs surfaced + +1. **`src/dlm/replay/store.py:187`** — `parse_instruction_body` was + called without `_normalize_probe_markers`. Fixed in this branch. + Without the fix, INSTRUCTION sections with `### Q !probe` headers + trigger a parse error during retrain even with `--fresh`, because + replay snapshots the raw section content. The other dlm callers of + `parse_instruction_body` (`eval/probes.py`, `cli/commands/synth.py`, + `train/gate/orchestrator.py`, `preference/mine.py`) should be + audited for the same bug — that's a follow-up. +2. **MLX backend silently ignores PEFT adapters.** dlm's auto-routing + selects MLX on darwin-arm64. 
PEFT `adapter_model.safetensors` + isn't an MLX-LM adapter format; the inference path appears to load + the base and ignore the adapter. The user-visible failure is + "trained model behaves identically to base" — easy to misdiagnose + as "training didn't work." Workaround: `--backend pytorch`. Real + fix needs investigation in `src/dlm/inference/backends/mlx.py`. + +## Implications for the dlm product narrative + +The investigation produced a clean three-step story: + +1. **Use a base ≥ 1B params and code-pretrained where available.** + Smaller bases (135M) actively degrade under LoRA training of any + shape. Recommended-base table needs a warning at the small end. +2. **Train on INSTRUCTION-shaped data, not raw source code.** Raw + source teaches autocomplete; INSTRUCTION teaches Q/A. Pick one + based on the goal. dlm's docs should make this distinction. +3. **Plan one Q/A pair per question you want to answer.** With + small datasets, pinpoint reproduction is reliable but + generalization is weak. Scale the corpus to scale the surface. + +Each step has falsifiable evidence in this directory: + +- Step 1 from Findings 02 (135M memorization+forgetting) and 03 + (1.5B preservation) +- Step 2 from Finding 03's wedge failure vs Finding 04's wedge success +- Step 3 from Finding 04's generalization curve + +This is publishable as written. The audit closes here as **GREEN**: the +fortran fine-tune story works end-to-end with the right recipe, +the right base, and a corpus shaped to match the user's question set. + +## Next experiment (optional) + +[Finding 05 — corpus density](./05-corpus-density.md) *(not started)* + +Test the dataset-size / generalization curve directly. Build a 100-pair +INSTRUCTION corpus covering each stdlib module's main API surface +(rather than 60% sorting). Train, then probe with held-out questions +about modules covered with one to three training pairs each.
+ +The hypothesis: there's a per-module Q/A density floor below which +the model can't generalize the API form. Finding the threshold gives +dlm users a concrete planning number ("budget N Q/A pairs per module +of API surface to teach"). This is the "how much training data does +each new domain take" question that makes the product practically +plannable. diff --git a/docs/audits/13-followup/stage1/sway.yaml b/docs/audits/13-followup/stage1/sway.yaml new file mode 100644 index 0000000..b30f348 --- /dev/null +++ b/docs/audits/13-followup/stage1/sway.yaml @@ -0,0 +1,101 @@ +# Stage-1 sway spec — does PROSE-only CPT actually expand the fortran prior? +# +# This is the load-bearing experiment. The audit follow-up showed that the +# combined PROSE+INSTRUCTION run produced a *uniform* fortran-flavored bias +# (leak_lift constant ≈0.064 across sections, own_lift small, effective_sis +# ≈0). That meant the adapter learned "this looks fortran" but not "this +# section says X." +# +# Stage 1 trains on PROSE only at LoRA r=64 (vs r=16 in the prior run). +# Pass criterion: ``section_internalization`` should now show *positive* +# effective_sis on at least 40% of sections — i.e., own_lift > leak_lift on +# the section's own probes. If this still flatlines, the base model is the +# floor, and we promote to Qwen2.5-Coder-1.5B. 
+# +# Run from the sway repo's venv (need [dlm] for bridge probes): +# cd ~/GithubOrgs/tenseleyFlow/sway +# uv run --no-sync sway run \ +# ~/GithubOrgs/tenseleyFlow/DocumentLanguageModel/docs/audits/13-followup/stage1/sway.yaml \ +# --json /tmp/sway-stage1.json --markdown /tmp/sway-stage1.md + +version: 1 + +dlm_source: /Users/mfwolffe/GithubOrgs/tenseleyFlow/DocumentLanguageModel/docs/audits/13-followup/stage1/the-doc.dlm + +models: + base: + kind: hf + base: HuggingFaceTB/SmolLM2-135M-Instruct + ft: + kind: hf + base: HuggingFaceTB/SmolLM2-135M-Instruct + adapter: ~/.dlm/store/01KQDGAM70EJ1WJCQY6PVDV95W/adapter/versions/v0001 + +defaults: + seed: 0 + differential: true + coverage_threshold: 0.6 + score_weights: + adherence: 0.30 + attribution: 0.40 # bumped: SIS is the load-bearing metric for this stage + calibration: 0.15 + ablation: 0.15 + +suite: + # ── Calibration baseline ──────────────────────────────────────── + - name: null_baseline + kind: null_adapter + runs: 3 + + # ── Adherence: does the adapter shift logits on fortran prompts? ─ + # Use prompts where SmolLM2's *English* prior is strong, so the null + # baseline produces low JS divergence and the trained adapter's + # divergence is visible above noise. Audit follow-up showed that + # bare fortran-fence prefixes give the null adapter ~0.15 JS + # divergence on its own — calibration ceiling, not a learning signal. + - name: dk_fortran_qa_shaped + kind: delta_kl + prompts: + - "Write a Fortran function that" + - "In modern Fortran, the syntax for declaring an allocatable array is" + - "When should you prefer `do concurrent` over a plain `do` loop?" 
+ - "The `intent(in)` attribute on a dummy argument means" + - "To use stdlib's sorting routines, you would import" + - "Fortran's `pure` and `elemental` attributes differ in that" + - "A type-bound procedure in Fortran is declared with" + - "The `use, only:` import form in Fortran is preferred because" + assert_z_gte: 1.0 + assert_mean_gte: 0.02 + + # ── Attribution: THE load-bearing test ────────────────────────── + # Prior run had effective_sis ≈ 0. If stage-1 fixes this, we have + # signal. assert_passing_section_frac at 0.4 is the bar from the + # prior audit follow-up spec. + - name: sis_fortran + kind: section_internalization + assert_z_gte: 1.0 + assert_passing_section_frac: 0.4 + + # ── Calibration ────────────────────────────────────────────────── + - name: leak_fortran + kind: leakage + assert_z_gte: 1.0 + + - name: cal_general + kind: calibration_drift + items_limit: 50 + assert_z_gte: 1.0 + + # ── Ablation: did r=64 fix the saturation overshoot? ──────────── + # Prior run (r=16) showed overshoot=1.54 + sat_λ out of band, which + # we read as adapter under-magnitude. r=64 + 800 steps should land + # the saturation in (0.5, 1.0] with overshoot ≤ 1.05. 
+ - name: abl_fortran + kind: adapter_ablation + lambdas: [0.0, 0.25, 0.5, 0.75, 1.0, 1.25] + prompts: + - "module fortran_demo\n implicit none" + - "subroutine compute_norm(vector, n, result)" + - "use stdlib_linalg, only: solve" + assert_linearity_gte: 0.85 + assert_z_gte: 1.0 diff --git a/docs/audits/13-followup/stage1/the-doc.dlm b/docs/audits/13-followup/stage1/the-doc.dlm new file mode 100644 index 0000000..a0ce7bf --- /dev/null +++ b/docs/audits/13-followup/stage1/the-doc.dlm @@ -0,0 +1,47 @@ +--- +dlm_id: 01KQDGAM70EJ1WJCQY6PVDV95W +dlm_version: 15 +base_model: smollm2-135m +training: + sources: + - path: ~/GithubOrgs/FortranGoingOnForty + include: ["**/*.f90", "**/*.F90", "**/*.f95"] + exclude: + - "**/build/**" + - "**/.git/**" + - "**/dist/**" + max_bytes_per_file: 32768 + - path: /tmp/stdlib_build/src + include: ["**/*.f90", "**/*.F90", "**/*.fypp"] + exclude: + - "**/build/**" + - "**/tests/**" + max_bytes_per_file: 32768 + - path: /tmp/stdlib_build/doc/specs + include: ["**/*.md"] + max_bytes_per_file: 131072 + sources_policy: permissive + adapter: lora + lora_r: 64 + lora_alpha: 128 + lora_dropout: 0.05 + sequence_len: 1024 + micro_batch_size: 1 + grad_accum: 8 + learning_rate: 1.5e-4 + warmup_ratio: 0.05 + num_epochs: 1 +--- + +# Stage 1 — Fortran domain expansion (CPT) + +Continual-pretraining stage of the two-stage Fortran expert recipe. This +.dlm has no INSTRUCTION sections — every training row is PROSE +(next-token loss on the raw fortran source + stdlib markdown specs). +Goal: expand the base model's fortran token-distribution prior. The +stage-2 .dlm will layer instruction-following on top of this adapter. + +The corpus mirrors the prior follow-up doc but at higher LoRA rank +(r=64 vs r=16) so the adapter has the capacity to absorb the corpus +shift without hitting the saturation ceiling diagnosed by sway's +`adapter_ablation` overshoot. 
diff --git a/docs/audits/13-followup/sway-results.json b/docs/audits/13-followup/sway-results.json new file mode 100644 index 0000000..a63ca10 --- /dev/null +++ b/docs/audits/13-followup/sway-results.json @@ -0,0 +1,881 @@ +{ + "adapter_id": "/Users/mfwolffe/.dlm/store/01KQCWMA64901VEMYB3DC8CGXY/adapter/versions/v0001", + "backend_stats": { + "avg_batch_size": 5.333333333333333, + "batched_prompts": 32, + "batches_sent": 6, + "cache_hits": 195, + "cache_misses": 407, + "forward_passes": 407, + "hit_rate": 0.3239202657807309, + "max_batch_size": 8, + "scoring_wall_s": 17.152012583974283 + }, + "base_model_id": "HuggingFaceTB/SmolLM2-135M-Instruct", + "determinism": { + "class": "best_effort", + "notes": [ + "MPS: bit-identical across runs is best-effort" + ], + "seed": 0 + }, + "finished_at": "2026-04-29T20:38:02.420941+00:00", + "null_stats": { + "calibration_drift": { + "degenerate": 1.0, + "mean": 0.0, + "n": 3.0, + "std": 1e-06 + }, + "delta_kl": { + "degenerate": 0.0, + "mean": 0.18108218965989206, + "n": 3.0, + "std": 0.20429734551366469 + }, + "leakage": { + "degenerate": 0.0, + "mean": 0.27305924253357855, + "n": 3.0, + "std": 0.02510960762659956 + }, + "paraphrase_invariance": { + "degenerate": 0.0, + "mean": -0.012714107831319174, + "n": 3.0, + "std": 0.013087756562633434 + }, + "section_internalization": { + "degenerate": 0.0, + "mean": -8.81465180882523e-22, + "n": 3.0, + "std": 1e-06 + } + }, + "probes": [ + { + "base_value": null, + "ci_95": null, + "duration_s": 52.01406016701367, + "evidence": { + "calibrated_kinds": [ + "delta_kl", + "section_internalization", + "paraphrase_invariance", + "leakage", + "calibration_drift" + ], + "from_cache": false, + "init_scale": 0.02, + "null_stats": { + "calibration_drift": { + "degenerate": 1.0, + "mean": 0.0, + "n": 3.0, + "std": 1e-06 + }, + "delta_kl": { + "degenerate": 0.0, + "mean": 0.18108218965989206, + "n": 3.0, + "std": 0.20429734551366469 + }, + "leakage": { + "degenerate": 0.0, + "mean": 
0.27305924253357855, + "n": 3.0, + "std": 0.02510960762659956 + }, + "paraphrase_invariance": { + "degenerate": 0.0, + "mean": -0.012714107831319174, + "n": 3.0, + "std": 0.013087756562633434 + }, + "section_internalization": { + "degenerate": 0.0, + "mean": -8.81465180882523e-22, + "n": 3.0, + "std": 1e-06 + } + }, + "null_stats_by_rank": { + "rank_1.00": { + "calibration_drift": { + "degenerate": 1.0, + "mean": 0.0, + "n": 3.0, + "std": 1e-06 + }, + "delta_kl": { + "degenerate": 0.0, + "mean": 0.18108218965989206, + "n": 3.0, + "std": 0.20429734551366469 + }, + "leakage": { + "degenerate": 0.0, + "mean": 0.27305924253357855, + "n": 3.0, + "std": 0.02510960762659956 + }, + "paraphrase_invariance": { + "degenerate": 0.0, + "mean": -0.012714107831319174, + "n": 3.0, + "std": 0.013087756562633434 + }, + "section_internalization": { + "degenerate": 0.0, + "mean": -8.81465180882523e-22, + "n": 3.0, + "std": 1e-06 + } + } + }, + "per_kind_raw_samples": { + "calibration_drift": [ + 0.0, + 0.0, + 0.0 + ], + "delta_kl": [ + 0.4700022224846012, + 0.03648415034633113, + 0.036760196148743804 + ], + "leakage": [ + 0.25066666666666665, + 0.26038781163434904, + 0.3081232492997199 + ], + "paraphrase_invariance": [ + -0.006305098533630371, + -0.0008810758590698242, + -0.030956149101257324 + ], + "section_internalization": [ + 4.363252645368494e-20, + -4.759911976765629e-20, + 1.322197771323786e-21 + ] + }, + "rank_multipliers": [ + 1.0 + ], + "runs": 3, + "seed_base": 1000, + "skipped_kinds": [ + { + "kind": "adapter_ablation", + "reason": "probe opted out (calibrate_spec returned None)" + } + ], + "weight": 1.0 + }, + "ft_value": null, + "kind": "null_adapter", + "message": "null calibration: 5 kinds calibrated over 3 seeds (1 opted out)", + "name": "null_baseline", + "raw": null, + "score": 1.0, + "verdict": "pass", + "z_score": null + }, + { + "base_value": null, + "ci_95": [ + 0.13243605721975274, + 0.16676766505398383 + ], + "duration_s": 0.32599120801023673, + "evidence": { 
+ "divergence_kind": "js", + "max": 0.18371552715811904, + "num_prompts": 8, + "per_prompt": [ + 0.14386457824628635, + 0.17476625862101103, + 0.16067887414401738, + 0.15360432284137182, + 0.15848778501047278, + 0.08995173957917547, + 0.18371552715811904, + 0.143727750782096 + ], + "raw_ci_95": [ + 0.13243605721975274, + 0.16676766505398383 + ], + "weight": 1.0, + "z_by_rank": { + "rank_1.00": -0.14675954323678622 + } + }, + "ft_value": null, + "kind": "delta_kl", + "message": "mean js=0.1511, z=-0.15\u03c3 vs null", + "name": "dk_fortran", + "raw": 0.15109960454781873, + "score": 0.48777247649269934, + "verdict": "fail", + "z_score": -0.14675954323678622 + }, + { + "base_value": null, + "ci_95": [ + -0.020269755418408253, + 0.021137891699870626 + ], + "duration_s": 1.7037657080072677, + "evidence": { + "num_sections": 41, + "passing_frac": 0.17073170731707318, + "per_section": [ + { + "base_nll": 3.383964624297753, + "effective_sis": -0.033378760766710876, + "ft_nll": 3.2795830773057117, + "kind": "prose", + "leak_lift": 0.06422469403757139, + "own_lift": 0.030845933270860514, + "passed": false, + "section_id": "79e3ec8ad3461cef", + "tag": "" + }, + { + "base_nll": 3.7380522629310344, + "effective_sis": -0.06098052371760402, + "ft_nll": 3.723408876616379, + "kind": "instruction", + "leak_lift": 0.06489790776808098, + "own_lift": 0.003917384050476959, + "passed": false, + "section_id": "5f6b1dead97c7d16", + "tag": "" + }, + { + "base_nll": 3.619960054438165, + "effective_sis": -0.03210797600578668, + "ft_nll": 3.503811017841312, + "kind": "instruction", + "leak_lift": 0.06419369928730494, + "own_lift": 0.03208572328151826, + "passed": false, + "section_id": "3b651cf3c102a94a", + "tag": "" + }, + { + "base_nll": 2.9329270411581887, + "effective_sis": -0.034794699214302144, + "ft_nll": 2.8465097246378877, + "kind": "instruction", + "leak_lift": 0.06425922912165898, + "own_lift": 0.029464529907356835, + "passed": false, + "section_id": "39347c7736ed8744", + "tag": "" 
+ }, + { + "base_nll": 4.149605415962838, + "effective_sis": -0.04691715545030196, + "ft_nll": 4.076415740691863, + "kind": "instruction", + "leak_lift": 0.06455489878595165, + "own_lift": 0.017637743335649693, + "passed": false, + "section_id": "b185f7cb3ff6a672", + "tag": "" + }, + { + "base_nll": 3.472200798265862, + "effective_sis": 0.0035066615967460774, + "ft_nll": 3.240147677334872, + "kind": "instruction", + "leak_lift": 0.06332504958968219, + "own_lift": 0.06683171118642826, + "passed": false, + "section_id": "8dfd46d5cbbfc6a0", + "tag": "" + }, + { + "base_nll": 3.7048557826450894, + "effective_sis": -0.06162214846963127, + "ft_nll": 3.692661588154142, + "kind": "instruction", + "leak_lift": 0.06491355715227676, + "own_lift": 0.003291408682645486, + "passed": false, + "section_id": "84adb31460daa7b4", + "tag": "" + }, + { + "base_nll": 3.756350835164388, + "effective_sis": -0.0031169019965170963, + "ft_nll": 3.529581069946289, + "kind": "instruction", + "leak_lift": 0.0634865999212252, + "own_lift": 0.0603696979247081, + "passed": false, + "section_id": "2963ad39da116480", + "tag": "" + }, + { + "base_nll": 3.143678211030506, + "effective_sis": -0.024671729409215956, + "ft_nll": 3.0200040302579363, + "kind": "instruction", + "leak_lift": 0.06401232741909589, + "own_lift": 0.039340598009879935, + "passed": false, + "section_id": "4ebf42ad7431d861", + "tag": "" + }, + { + "base_nll": 3.2072388656496065, + "effective_sis": -0.024959602124314785, + "ft_nll": 3.0819649283341537, + "kind": "instruction", + "leak_lift": 0.06401934870483002, + "own_lift": 0.03905974658051523, + "passed": false, + "section_id": "8150fd7d0d03cee4", + "tag": "" + }, + { + "base_nll": 3.098959201719703, + "effective_sis": -0.05798496674069719, + "ft_nll": 3.0777626968011624, + "kind": "instruction", + "leak_lift": 0.06482484540279057, + "own_lift": 0.006839878662093378, + "passed": false, + "section_id": "7a4f482e72e04eba", + "tag": "" + }, + { + "base_nll": 3.8598657724808674, + 
"effective_sis": 0.14655176115748497, + "ft_nll": 3.063236158721301, + "kind": "instruction", + "leak_lift": 0.05983614472234709, + "own_lift": 0.20638790587983205, + "passed": true, + "section_id": "f4f345eacda9a037", + "tag": "" + }, + { + "base_nll": 4.277299859550562, + "effective_sis": -0.015805815856834417, + "ft_nll": 4.072031085410814, + "kind": "instruction", + "leak_lift": 0.06379608562513536, + "own_lift": 0.047990269768300946, + "passed": false, + "section_id": "01bec021954d9b71", + "tag": "" + }, + { + "base_nll": 4.189083491530374, + "effective_sis": -0.011938456933510457, + "ft_nll": 3.9722426940347546, + "kind": "instruction", + "leak_lift": 0.06370175979773722, + "own_lift": 0.051763302864226764, + "passed": false, + "section_id": "b13ccd1c3a9f683c", + "tag": "" + }, + { + "base_nll": 3.393718455097463, + "effective_sis": -0.018457564958768373, + "ft_nll": 3.239632785910427, + "kind": "instruction", + "leak_lift": 0.06386076243249962, + "own_lift": 0.04540319747373125, + "passed": false, + "section_id": "8cbfb9d8507491b7", + "tag": "" + }, + { + "base_nll": 2.7204354708311986, + "effective_sis": -0.04921115335256995, + "ft_nll": 2.6785415899558145, + "kind": "instruction", + "leak_lift": 0.06461084995429965, + "own_lift": 0.015399696601729702, + "passed": false, + "section_id": "c99eafec654809ed", + "tag": "" + }, + { + "base_nll": 3.576165140086207, + "effective_sis": -0.0005441335062590508, + "ft_nll": 3.3512968918372845, + "kind": "instruction", + "leak_lift": 0.0634238494702433, + "own_lift": 0.06287971596398424, + "passed": false, + "section_id": "7b7a511b8cb7a3cc", + "tag": "" + }, + { + "base_nll": 3.728303738725864, + "effective_sis": 0.031234744141658735, + "ft_nll": 3.378277538268547, + "kind": "instruction", + "leak_lift": 0.0626487548934648, + "own_lift": 0.09388349903512354, + "passed": false, + "section_id": "969d989aee32a657", + "tag": "" + }, + { + "base_nll": 3.2954477163461537, + "effective_sis": -0.06813722044003823, + "ft_nll": 
3.3055474697015224, + "kind": "instruction", + "leak_lift": 0.06507246134667692, + "own_lift": -0.0030647590933613024, + "passed": false, + "section_id": "e4efe84850c617f9", + "tag": "" + }, + { + "base_nll": 3.3069370154178506, + "effective_sis": 0.12010170655245701, + "ft_nll": 2.7097604924982246, + "kind": "instruction", + "leak_lift": 0.06048126800539656, + "own_lift": 0.18058297455785358, + "passed": true, + "section_id": "b435fa0fb7979efc", + "tag": "" + }, + { + "base_nll": 2.861654940518466, + "effective_sis": -0.04222673866202122, + "ft_nll": 2.798086825284091, + "kind": "instruction", + "leak_lift": 0.06444049837648139, + "own_lift": 0.022213759714460164, + "passed": false, + "section_id": "311b3bf11dfbea7e", + "tag": "" + }, + { + "base_nll": 3.429737384502704, + "effective_sis": -0.004068967219511076, + "ft_nll": 3.225870866041917, + "kind": "instruction", + "leak_lift": 0.06350982102422505, + "own_lift": 0.05944085380471397, + "passed": false, + "section_id": "67c5f4e5dfcfb4b3", + "tag": "" + }, + { + "base_nll": 3.7294495597718256, + "effective_sis": 0.11403063971616001, + "ft_nll": 3.07806396484375, + "kind": "instruction", + "leak_lift": 0.06062934280628185, + "own_lift": 0.17465998252244186, + "passed": true, + "section_id": "95851f00680a27e0", + "tag": "" + }, + { + "base_nll": 3.770986524678893, + "effective_sis": -0.05708714592787426, + "ft_nll": 3.7418903415485962, + "kind": "instruction", + "leak_lift": 0.06480294733418512, + "own_lift": 0.007715801406310863, + "passed": false, + "section_id": "e7dfb14006866fb1", + "tag": "" + }, + { + "base_nll": 3.710047132186307, + "effective_sis": 0.06627955120954535, + "ft_nll": 3.2348882078214456, + "kind": "instruction", + "leak_lift": 0.06179400350156513, + "own_lift": 0.12807355471111048, + "passed": true, + "section_id": "e05149d6dd019c7d", + "tag": "" + }, + { + "base_nll": 3.2487693665519592, + "effective_sis": -0.03571138348993008, + "ft_nll": 3.155951363699777, + "kind": "instruction", + 
"leak_lift": 0.06428158727472308, + "own_lift": 0.028570203784792997, + "passed": false, + "section_id": "2abcffdaa5a5459e", + "tag": "" + }, + { + "base_nll": 3.5466603681075672, + "effective_sis": -0.01529189023592583, + "ft_nll": 3.374676917210098, + "kind": "instruction", + "leak_lift": 0.0637835508538937, + "own_lift": 0.04849166061796787, + "passed": false, + "section_id": "c717e7bcca5467af", + "tag": "" + }, + { + "base_nll": 3.90612353515625, + "effective_sis": -0.02370302113925562, + "ft_nll": 3.7487626953125, + "kind": "instruction", + "leak_lift": 0.06398870038812125, + "own_lift": 0.040285679248865634, + "passed": false, + "section_id": "2c92258d39229ce8", + "tag": "" + }, + { + "base_nll": 4.207152941212151, + "effective_sis": 0.18883884196111783, + "ft_nll": 3.165278464370633, + "kind": "instruction", + "leak_lift": 0.05880475250762433, + "own_lift": 0.24764359446874218, + "passed": true, + "section_id": "ee8b2414b1c97b71", + "tag": "" + }, + { + "base_nll": 4.4486438751220705, + "effective_sis": 0.2134270935054276, + "ft_nll": 3.240249252319336, + "kind": "instruction", + "leak_lift": 0.058205039055324106, + "own_lift": 0.2716321325607517, + "passed": true, + "section_id": "a04e766e64918615", + "tag": "" + }, + { + "base_nll": 3.3436742629447993, + "effective_sis": -0.018996994631269852, + "ft_nll": 3.193620445084398, + "kind": "instruction", + "leak_lift": 0.06387391925378014, + "own_lift": 0.044876924622510285, + "passed": false, + "section_id": "f52b3c00ccb92c83", + "tag": "" + }, + { + "base_nll": 4.163542639690896, + "effective_sis": -0.021224243214975277, + "ft_nll": 3.985742718240489, + "kind": "instruction", + "leak_lift": 0.06392824238996807, + "own_lift": 0.04270399917499279, + "passed": false, + "section_id": "73172bca8301df6f", + "tag": "" + }, + { + "base_nll": 3.3983470988723465, + "effective_sis": -0.038084031168146286, + "ft_nll": 3.309122049583579, + "kind": "instruction", + "leak_lift": 0.06433945673028932, + "own_lift": 
0.026255425562143032, + "passed": false, + "section_id": "76bba5378f960fa1", + "tag": "" + }, + { + "base_nll": 4.008661218591638, + "effective_sis": 0.1105948796020304, + "ft_nll": 3.32194539662954, + "kind": "instruction", + "leak_lift": 0.06071314183345574, + "own_lift": 0.17130802143548615, + "passed": true, + "section_id": "6107afd44bb89580", + "tag": "" + }, + { + "base_nll": 4.3382890031144425, + "effective_sis": -0.03379828127826946, + "ft_nll": 4.206246040962838, + "kind": "instruction", + "leak_lift": 0.06423492624517038, + "own_lift": 0.030436644966900923, + "passed": false, + "section_id": "20883a8329375a3c", + "tag": "" + }, + { + "base_nll": 3.872911693431713, + "effective_sis": -0.036648144931686, + "ft_nll": 3.7658013237847223, + "kind": "instruction", + "leak_lift": 0.0643044351147659, + "own_lift": 0.0276562901830799, + "passed": false, + "section_id": "34fd68f2e2dc5091", + "tag": "" + }, + { + "base_nll": 4.457715809774056, + "effective_sis": -0.019557946291812463, + "ft_nll": 4.260106807132419, + "kind": "instruction", + "leak_lift": 0.06388760100159825, + "own_lift": 0.04432965470978579, + "passed": false, + "section_id": "d918396806259702", + "tag": "" + }, + { + "base_nll": 4.238513547982743, + "effective_sis": -0.0036582258921524988, + "ft_nll": 3.984874212919776, + "kind": "instruction", + "leak_lift": 0.06349980294306996, + "own_lift": 0.059841577050917465, + "passed": false, + "section_id": "d9fd6b631fed2d59", + "tag": "" + }, + { + "base_nll": 4.081884474981399, + "effective_sis": -0.040466313653234366, + "ft_nll": 3.984199887230283, + "kind": "instruction", + "leak_lift": 0.06439756118114512, + "own_lift": 0.023931247527910753, + "passed": false, + "section_id": "fe7a4197df714f68", + "tag": "" + }, + { + "base_nll": 2.9971887389225746, + "effective_sis": -0.03665403461771971, + "ft_nll": 2.914314839377332, + "kind": "instruction", + "leak_lift": 0.06430457876564477, + "own_lift": 0.02765054414792506, + "passed": false, + "section_id": 
"9c217dda0d7e0c0b", + "tag": "" + }, + { + "base_nll": 4.11610508631993, + "effective_sis": -0.02275970814578159, + "ft_nll": 3.9464969234866696, + "kind": "instruction", + "leak_lift": 0.06396569275413408, + "own_lift": 0.04120598460835249, + "passed": false, + "section_id": "4979f9ee02c60ce1", + "tag": "" + } + ], + "per_section_threshold": 0.05, + "raw_ci_95": [ + -0.020269755418408253, + 0.021137891699870626 + ], + "weight": 1.0, + "z_by_rank": { + "rank_1.00": -1.522290367384119e-12 + } + }, + "ft_value": null, + "kind": "section_internalization", + "message": "7/41 sections cleared; mean effective_sis=-0.000, z=-0.00\u03c3 vs null", + "name": "sis_fortran", + "raw": -1.5231718325650014e-18, + "score": 0.4999999999998732, + "verdict": "fail", + "z_score": -1.522290367384119e-12 + }, + { + "base_value": null, + "ci_95": null, + "duration_s": 3.292007022537291e-06, + "evidence": {}, + "ft_value": null, + "kind": "paraphrase_invariance", + "message": "no cases provided", + "name": "para_fortran", + "raw": null, + "score": null, + "verdict": "error", + "z_score": null + }, + { + "base_value": null, + "ci_95": [ + 0.060836501901140684, + 0.060836501901140684 + ], + "duration_s": 25.701656958000967, + "evidence": { + "mean_clean_recall": 0.060836501901140684, + "mean_fragility": 0.16894335375535185, + "mean_perturbed_recall": 0.05055857923921814, + "per_section": [ + { + "clean_recall": 0.060836501901140684, + "fragility": 0.16894335375535185, + "perturbed_recall": 0.05055857923921814, + "section_id": "79e3ec8ad3461cef" + } + ], + "raw_ci_95": [ + 0.060836501901140684, + 0.060836501901140684 + ], + "weight": 1.0, + "z_by_rank": { + "rank_1.00": 8.451854118485798 + } + }, + "ft_value": 0.16894335375535185, + "kind": "leakage", + "message": "greedy_recall=0.06 (perturbed=0.05, fragility=0.17), z=+8.45\u03c3 vs null", + "name": "leak_fortran", + "raw": 0.060836501901140684, + "score": 0.9436027416296762, + "verdict": "pass", + "z_score": 8.451854118485798 + }, + { + 
"base_value": null, + "ci_95": [ + 0.04, + 0.2 + ], + "duration_s": 2.9683373749867314, + "evidence": { + "fraction_regressed": 0.1, + "mean_delta_nats": -0.25887888199999, + "raw_ci_95": [ + 0.04, + 0.2 + ], + "regressed_count": 5, + "regression_nats_threshold": 1.0, + "total_items": 50, + "weight": 1.0, + "worst_offenders": [ + { + "delta": -1.7542665004730225, + "gold": " Paris", + "prompt": "The Eiffel Tower is in" + }, + { + "delta": -1.6363691091537476, + "gold": " Egypt", + "prompt": "The pyramids of Giza are in" + }, + { + "delta": -1.4172669649124146, + "gold": " Ag", + "prompt": "The chemical symbol for silver is" + }, + { + "delta": -1.3961007595062256, + "gold": " Europe", + "prompt": "The Mediterranean Sea borders southern" + }, + { + "delta": -1.076614797115326, + "gold": " Rome", + "prompt": "The Colosseum is in" + } + ], + "z_by_rank": null + }, + "ft_value": -0.25887888199999, + "kind": "calibration_drift", + "message": "5/50 items regressed >1.0 nats (frac=10.0%), mean_delta=-0.259 nats/tok (no calibration for calibration_drift)", + "name": "cal_general", + "raw": 0.1, + "score": 0.397632298133336, + "verdict": "pass", + "z_score": null + }, + { + "base_value": null, + "ci_95": null, + "duration_s": 0.6288362920022337, + "evidence": { + "lambdas": [ + 0.0, + 0.25, + 0.5, + 0.75, + 1.0, + 1.25 + ], + "linearity": 0.9149427048285227, + "mean_divergence_per_lambda": [ + 0.0, + 0.057499440320507945, + 0.057924101626428255, + 0.07944790530034684, + 0.129924414274642, + 0.20029354877499228 + ], + "overshoot": 1.5416159456497514, + "passed_linearity": true, + "passed_overshoot": true, + "passed_saturation": false, + "saturation_lambda": 1.25, + "saturation_reason": "found", + "weight": 1.0, + "z_by_rank": null + }, + "ft_value": null, + "kind": "adapter_ablation", + "message": "R\u00b2=0.91, sat_\u03bb=1.25 (out of band), overshoot=1.54 (no calibration for adapter_ablation)", + "name": "abl_fortran", + "raw": 0.9149427048285227, + "score": 0.79, + 
"verdict": "fail", + "z_score": null + } + ], + "schema_version": 1, + "score": { + "band": "partial", + "components": { + "ablation": 0.79, + "adherence": 0.48777247649269934, + "attribution": 0.4999999999998732, + "baseline": 1.0, + "calibration": 0.6706175198815061 + }, + "findings": [ + "dk_fortran (delta_kl) failed: mean js=0.1511, z=-0.15\u03c3 vs null", + "1 probe(s) errored \u2014 see full report for details" + ], + "overall": 0.5739552469240666, + "weights": { + "ablation": 0.15, + "adherence": 0.3, + "attribution": 0.35, + "baseline": 0.0, + "calibration": 0.2 + } + }, + "spec_path": "/Users/mfwolffe/GithubOrgs/tenseleyFlow/DocumentLanguageModel/docs/audits/13-followup/sway.yaml", + "started_at": "2026-04-29T20:36:38.876695+00:00", + "sway_version": "0.1.0", + "wall_seconds": 83.544246 +} \ No newline at end of file diff --git a/docs/audits/13-followup/sway-results.md b/docs/audits/13-followup/sway-results.md new file mode 100644 index 0000000..44a986e --- /dev/null +++ b/docs/audits/13-followup/sway-results.md @@ -0,0 +1,41 @@ +# sway report + +**Overall:** 0.57 (`partial`) +**Base:** `HuggingFaceTB/SmolLM2-135M-Instruct` +**Adapter:** `adapter/versions/v0001` +**Wall:** 83.5s +**Determinism:** `best_effort` (seed=0) +**Backend:** cache: 195/602 = 32% | batches: 6 (avg=5.3) + +## Components + +| category | score | weight | | +|---|---:|---:|---| +| adherence | 0.49 | 0.30 | | +| attribution | 0.50 | 0.35 | | +| calibration | 0.67 | 0.20 | | +| ablation | 0.79 | 0.15 | | +| baseline | 1.00 | 0.00 | (informational, weight=0) | + +## Probes + +| name | kind | verdict | score | raw | ci95 | z | duration | note | +|---|---|---|---:|---:|---:|---:|---:|---| +| null_baseline | `null_adapter` | pass | 1.00 | — | — | — | 52.0s | null calibration: 5 kinds calibrated over 3 seeds (1 opted out) | +| dk_fortran | `delta_kl` | fail | 0.49 | 0.151 | [0.132, 0.167] | -0.15σ | 0.33s | mean js=0.1511, z=-0.15σ vs null | +| sis_fortran | `section_internalization` | fail | 
0.50 | -0.000 | [-0.020, 0.021] | -0.00σ | 1.70s | 7/41 sections cleared; mean effective_sis=-0.000, z=-0.00σ vs null | +| para_fortran | `paraphrase_invariance` | error | — | — | — | — | 0.00s | no cases provided | +| leak_fortran | `leakage` | pass | 0.94 | 0.061 | [0.061, 0.061] | +8.45σ | 25.7s | greedy_recall=0.06 (perturbed=0.05, fragility=0.17), z=+8.45σ vs null | +| cal_general | `calibration_drift` | pass | 0.40 | 0.100 | [0.040, 0.200] | — | 2.97s | 5/50 items regressed >1.0 nats (frac=10.0%), mean_delta=-0.259 nats/tok (no calibration for calibration_drift) | +| abl_fortran | `adapter_ablation` | fail | 0.79 | 0.915 | — | — | 0.63s | R²=0.91, sat_λ=1.25 (out of band), overshoot=1.54 (no calibration for adapter_ablation) | + +## Top findings + +- dk_fortran (delta_kl) failed: mean js=0.1511, z=-0.15σ vs null +- 1 probe(s) errored — see full report for details + +## Degenerate null calibration + +1 probe kind(s) ran null_adapter but the resulting baseline was too narrow for z-scoring (std ≈ 0, typically `runs: 1` or coincidentally-matched seeds). Fix: bump `runs:` in the `null_adapter` spec entry. Affected kinds: + +- `calibration_drift` diff --git a/docs/audits/13-followup/sway.yaml b/docs/audits/13-followup/sway.yaml new file mode 100644 index 0000000..7b92b49 --- /dev/null +++ b/docs/audits/13-followup/sway.yaml @@ -0,0 +1,105 @@ +# Audit 13 follow-up — sway spec for the fortran fine-tune. +# +# Compared to audit 13's spec (which got dk +44σ but +# paraphrase_invariance −3.51σ FAIL), this one: +# +# - Includes ``null_adapter`` for every numeric probe (was already +# present in audit 13; kept here as the recipe's foundation). +# - Activates the [dlm] bridge probes — section_internalization, +# paraphrase_invariance, leakage — which need the .dlm to be +# resolvable. The audit's venv didn't install ``[dlm]`` so these +# SKIPped; we install dlm separately in the follow-up venv to get +# them firing. 
+# - Fortran-specific prompts on delta_kl + adapter_ablation so the +# distribution shift signal is grounded in the trained domain. +# +# Run from the sway repo's venv after training completes: +# uv run --no-sync sway run \ +# /Users/mfwolffe/GithubOrgs/tenseleyFlow/DocumentLanguageModel/docs/audits/13-followup/sway.yaml \ +# --json /tmp/sway-13fu.json --markdown /tmp/sway-13fu.md + +version: 1 + +dlm_source: /Users/mfwolffe/GithubOrgs/tenseleyFlow/DocumentLanguageModel/docs/audits/13-followup/the-doc.dlm + +models: + base: + kind: hf + base: HuggingFaceTB/SmolLM2-135M-Instruct + ft: + kind: hf + base: HuggingFaceTB/SmolLM2-135M-Instruct + adapter: ~/.dlm/store/01KQCWMA64901VEMYB3DC8CGXY/adapter/versions/v0001 + +defaults: + seed: 0 + differential: true + coverage_threshold: 0.6 + score_weights: + adherence: 0.30 + attribution: 0.35 + calibration: 0.20 + ablation: 0.15 + +suite: + # ── Calibration baseline ───────────────────────────────────────── + - name: null_baseline + kind: null_adapter + runs: 3 + + # ── Adherence ──────────────────────────────────────────────────── + - name: dk_fortran + kind: delta_kl + prompts: + - "module mymod\n implicit none\n" + - "subroutine compute(x, y)" + - "do concurrent (i = 1:n)" + - "type :: container" + - "intent(in) ::" + - "use stdlib_io, only: loadtxt" + - "use stdlib_sorting, only:" + - "real(real64), allocatable ::" + assert_z_gte: 1.0 + assert_mean_gte: 0.02 + + # ── Attribution (bridge-aware) ─────────────────────────────────── + # Per-section internalization. Should fire now that [dlm] is + # installed and the bridge can resolve the .dlm. + - name: sis_fortran + kind: section_internalization + assert_z_gte: 1.0 + assert_passing_section_frac: 0.4 + + # The audit's load-bearing probe — was -3.51σ FAIL in audit 13, + # need to flip to PASS here. With 5× more INSTRUCTION sections + + # 5× more steps + the bridge installed, this is the falsifiable + # claim. 
+ - name: para_fortran + kind: paraphrase_invariance + assert_z_gte: 1.0 + + # ── Calibration ────────────────────────────────────────────────── + - name: leak_fortran + kind: leakage + assert_z_gte: 1.0 + + # General-competence regression check. Audit 13 PASSed this — + # we want to confirm we didn't fry it with 5× more fine-tune steps. + - name: cal_general + kind: calibration_drift + items_limit: 50 + assert_z_gte: 1.0 + + # ── Ablation (the sway signature) ──────────────────────────────── + # Healthy fine-tunes show monotonic R² ≥ 0.85 on the λ-scaled KL + # curve. If this is a step-function or saturates early, the adapter + # is degenerate. + - name: abl_fortran + kind: adapter_ablation + lambdas: [0.0, 0.25, 0.5, 0.75, 1.0, 1.25] + prompts: + - "subroutine matmul_optimized(A, B, C)" + - "do concurrent (i = 1:n)" + - "use stdlib_linalg, only: solve" + assert_linearity_gte: 0.85 + assert_z_gte: 1.0 diff --git a/docs/audits/13-followup/the-doc.dlm b/docs/audits/13-followup/the-doc.dlm new file mode 100644 index 0000000..cc396eb --- /dev/null +++ b/docs/audits/13-followup/the-doc.dlm @@ -0,0 +1,725 @@ +--- +dlm_id: 01KQCWMA64901VEMYB3DC8CGXY +dlm_version: 15 +base_model: smollm2-135m +training: + sources: + - path: ~/GithubOrgs/FortranGoingOnForty + include: ["**/*.f90", "**/*.F90", "**/*.f95"] + exclude: + - "**/build/**" + - "**/.git/**" + - "**/dist/**" + max_bytes_per_file: 32768 + - path: /tmp/stdlib_build/src + include: ["**/*.f90", "**/*.F90", "**/*.fypp"] + exclude: + - "**/build/**" + - "**/tests/**" + max_bytes_per_file: 32768 + - path: /tmp/stdlib_build/doc/specs + include: ["**/*.md"] + max_bytes_per_file: 131072 + sources_policy: permissive + adapter: lora + lora_r: 16 + lora_alpha: 32 + lora_dropout: 0.05 + sequence_len: 1024 + micro_batch_size: 2 + grad_accum: 4 + learning_rate: 2.0e-4 + warmup_ratio: 0.1 + num_epochs: 1 +--- + +# Modern Fortran practitioner notes + +This document trains an adapter that knows the idioms of the Fortran 2008+ 
+codebase under `~/GithubOrgs/FortranGoingOnForty`, the canonical +`fortran-lang/stdlib` source tree at `/tmp/stdlib_build/src`, and the +FORD-generated stdlib spec documents at `/tmp/stdlib_build/doc/specs`. The +corpus spans numeric/aerospace simulation (`aero-emulation`, `convolution`), +scripting helpers (`fgof-*`), and stdlib modules (allocation, ascii, bitsets, +linear-algebra, sorting, statistics, strings, IO, hashing). + +## Idioms the corpus reinforces + +Modern Fortran prefers `do concurrent` over plain `do` for loops with no +loop-carried dependence — it expresses the parallelism intent and lets the +compiler vectorise / OpenMP-offload without further pragmas. `pure` and +`elemental` procedures are first-class. Allocatable arrays beat pointer +arrays for owned memory; `move_alloc` transfers ownership without copying. +Derived types use `type, bind(C)` only when interoperability is required. +`intent(in)`, `intent(out)`, `intent(inout)` are mandatory on every dummy +argument in this codebase, even when the language would default — explicit +intent is a project convention, not a language requirement. + +## stdlib navigation + +The Fortran stdlib provides community-tested replacements for tasks the +language standard doesn't cover: `stdlib_array` (index manipulation), +`stdlib_io` (text-file IO), `stdlib_sorting` (in-place sorts), `stdlib_stats` +(mean/var/cov), `stdlib_string_type` (variable-length strings), +`stdlib_linalg` (BLAS/LAPACK wrappers), `stdlib_random` (distributions), +`stdlib_hashmaps` (open-addressing + separate-chaining hash maps), `stdlib_bitsets` (compact +bit arrays), `stdlib_logger` (severity-tagged logging), `stdlib_optval` +(safe defaults for optional dummy args), `stdlib_kinds` (named real32/real64 +/int32/int64 kind parameters). + +::instruction:: + +### Q +When should I prefer `do concurrent` over a plain `do` loop in modern Fortran? 
+ +### A +Use `do concurrent` whenever loop iterations have no carried dependence — +i.e., iteration `i` does not read a value written by iteration `i-1`. The +`do concurrent` construct is a contract with the compiler that the iterations +may execute in any order or in parallel, which lets it vectorise, OpenMP- +offload, or GPU-offload the loop without further pragmas. If you can't make +that promise (the loop accumulates into a shared scalar, or reads a value +the previous iteration wrote), keep a plain `do`. Locality clauses +(`local`, `shared`, `reduce`) give finer control in Fortran 2018+. + +::instruction:: + +### Q +Why use `allocatable` arrays instead of `pointer` arrays for owned memory? + +### A +`allocatable` expresses ownership: the variable owns the memory, the compiler +inserts the deallocate at scope exit, and you cannot have aliasing across +two allocatable variables. `pointer` arrays expose aliasing and demand +manual `deallocate`. Compilers can also generate better code for +`allocatable` because no aliasing means no aliasing-defeats-vectorisation +pessimism. Use `pointer` only when you actually need pointer semantics — +linked structures, optional aliasing into a parent array, or C interop. +`move_alloc(from, to)` transfers ownership without a copy. + +::instruction:: + +### Q +What does `intent(in)`, `intent(out)`, and `intent(inout)` declare on a +dummy argument? + +### A +`intent(in)` says the procedure reads but never assigns to the argument — +the compiler enforces this and will reject any assignment as an error. +`intent(out)` says the procedure must define the argument before returning; +the value on entry is undefined (and on some compilers explicitly poisoned), +so don't read it before assigning. `intent(inout)` says both are allowed — +the procedure may read the prior value and may overwrite it. Always declare +intent on every dummy argument. It is a contract that the compiler enforces +and a reader can rely on. 
+ +::instruction:: + +### Q +How do I open an OpenMP parallel region with a reduction in Fortran? + +### A +Use `!$omp parallel do reduction(+:sum)` above a `do` loop where each +iteration accumulates into `sum`. The reduction clause tells OpenMP to give +each thread a private copy of `sum`, accumulate locally, then combine at +loop end. Other reduction operators include `*`, `min`, `max`, `.and.`, +`.or.`, `iand`, `ior`. The closing `!$omp end parallel do` is optional in +free-form Fortran. For simple no-reduction loops, `do concurrent` is +preferred. + +::instruction:: + +### Q +What's a derived type with allocatable components, and why is it useful? + +### A +A derived type with allocatable components has fields declared +`type :: container; real, allocatable :: data(:); end type`. The container +itself is value-typed but its `data` field grows or shrinks at runtime. +Compared to fixed-size arrays it doesn't waste memory; compared to pointer +fields it owns the memory and frees it automatically when the container +goes out of scope. Intrinsic assignment (`a = b`) deep-copies the allocatable +components by default. This is the canonical way to build resizable structures +without C-style malloc/free. + +::instruction:: + +### Q +What does `pure function` mean and when is the `pure` attribute required? + +### A +A `pure function` has no side effects: it doesn't modify any module +variable, doesn't perform I/O, doesn't call impure procedures. The compiler +verifies this. `pure` is required for any function called from inside +`do concurrent` (which forbids side effects across iterations) and from +within `forall`. It also enables aggressive optimisation: the compiler +knows two calls with equal arguments produce equal results and can hoist +or memoise. `elemental` implies `pure` and adds elementwise array semantics. + +::instruction:: + +### Q +How do I declare a generic interface for two procedures that differ only in +argument type? 
+ +### A +```fortran +interface clamp + module procedure clamp_real32, clamp_real64 +end interface clamp +``` +The two module procedures have the same logical name `clamp` but distinct +argument signatures (one takes `real(real32)`, the other `real(real64)`). +The compiler resolves the call by argument type — Fortran's overloading +mechanism. Adding more types is purely additive; callers see one name. + +::instruction:: + +### Q +What's the difference between `module` and `submodule` in Fortran? + +### A +A `module` declares its public interface and definitions in one unit. A +`submodule` lets you declare the interface in the parent module but put the +implementation in a separately compiled file. Editing the submodule does +not retrigger compilation of every consumer of the parent module — only of +the submodule itself. Use submodules to break long compile chains in big +projects. The `module subroutine foo()` declaration in the parent and +`module procedure foo` in the submodule are how the two halves bind. + +::instruction:: + +### Q +What does `trueloc` from `stdlib_array` do? + +### A +`trueloc(mask, lbound)` turns a logical mask into an integer index array +of the positions where the mask is `.true.`. It's a pure function — given +`mask = [.false., .true., .false., .true.]` and default `lbound`, it +returns `[2, 4]`. With `lbound = 0` it returns `[1, 3]`. Useful when you +need the indices themselves (for example, to use as a subscript) rather +than the masked values; if you only need the values, `pack` is more +direct. The complementary `falseloc` returns indices where the mask is +`.false.`. + +::instruction:: + +### Q +When would I use `arange` from `stdlib_math`? + +### A +`arange(start, stop, step)` returns a rank-1 array of values from `start` +to `stop` (inclusive) stepping by `step`. With `arange(1, 5)` you get +`[1, 2, 3, 4, 5]`. With `arange(0.0, 1.0, 0.25)` you get +`[0.0, 0.25, 0.5, 0.75, 1.0]`. 
Useful for building index sequences or +sample grids without a manual `do` loop. Step defaults to 1. + +::instruction:: + +### Q +How do I read a numeric matrix from a text file using `stdlib_io`? + +### A +Use `loadtxt(filename, array)`: +```fortran +use stdlib_io, only: loadtxt +real, allocatable :: data(:,:) +call loadtxt('measurements.dat', data) +``` +The array is allocated automatically to match the file's rows × columns. +The companion `savetxt(filename, array)` writes the same shape back out. +Both procedures handle real, integer, and complex data of `real32`, +`real64`, `int32`, `int64` kinds. + +::instruction:: + +### Q +What does `getline` from `stdlib_io` give me that `read(*, fmt='(A)')` doesn't? + +### A +`getline(unit, line, iostat)` reads one line of arbitrary length into a +deferred-length character variable. With plain `read(*, fmt='(A)')` you +have to commit to a fixed-length buffer up front and either truncate +or run a loop on `iostat=eor`. `getline` allocates `line` to exactly the +line's length, with no truncation. It returns `iostat=iostat_end` at +EOF. + +::instruction:: + +### Q +How do I sort an array in place using `stdlib_sorting`? + +### A +Use `sort(array)`. It sorts `array` ascending in place using an +introsort-style hybrid (quicksort + insertion-sort fallback). For a +descending sort, pass `reverse=.true.`. The companion `ord_sort(array)` +is a stable mergesort — preserves equal-key relative order, at the cost +of an O(n) workspace allocation. Use `radix_sort` for integer arrays +when the value range is bounded; it's O(n) but only for unsigned-style +integer kinds. + +::instruction:: + +### Q +When should I use `sort_index` instead of `sort`? + +### A +Use `sort_index(array, index)` when you also need the permutation that +sorted the array — for example, to apply the same reordering to a second, +parallel array. After the call, `array` is sorted and `index(i)` is the +original position of the `i`-th element of the sorted array. 
So +`other_data = other_data(index)` reorders a sibling array consistently +with `array`. Plain `sort` discards this information. + +::instruction:: + +### Q +What does `mean(array)` from `stdlib_stats` compute, and how do I take a +mean along one axis of a 2D array? + +### A +`mean(array)` returns the arithmetic mean of all elements as a scalar. +For a 2D array, `mean(matrix, dim=1)` reduces along the first dimension, +returning a 1D array of column means. `mean(matrix, dim=2)` returns row +means. With a logical mask, `mean(array, mask=array > 0)` averages only +the elements satisfying the mask. The companion `var` (variance) and +`std` (standard deviation) accept the same dim/mask arguments. `corr` and +`cov` compute correlation / covariance between two arrays. + +::instruction:: + +### Q +What is `string_type` from `stdlib_string_type` and why use it instead of +plain `character(len=*)`? + +### A +`string_type` is a derived type wrapping a deferred-length character +allocatable. Two reasons to prefer it: (1) you can have arrays of +varying-length strings (`type(string_type) :: words(100)` where each +`words(i)` is its own length — impossible with plain character arrays +which must be uniform-length); (2) it has overloaded operators (`==`, +`<`, `>`, `//` for concat) and constructors that interoperate with both +literals and other `string_type` instances. Use `char(string_type_var)` +to get the underlying `character(:)` back when interfacing with code +that takes plain strings. + +::instruction:: + +### Q +How do I solve a linear system `A*x = b` using `stdlib_linalg`? + +### A +```fortran +use stdlib_linalg, only: solve +real :: A(n,n), b(n), x(n) +x = solve(A, b) +``` +`solve` is a pure function that returns the solution `x`. Internally it +calls LAPACK's `gesv` (LU with partial pivoting). For multiple right-hand +sides, `b` can be a 2D array `b(n,nrhs)`. For least-squares (overdetermined +or underdetermined), use `lstsq` instead. 
To check whether a matrix is +singular before solving, use `inv` plus the returned status, or compute +the condition number via `linalg_cond`. + +::instruction:: + +### Q +What does `eye(n)` return in `stdlib_linalg`? + +### A +`eye(n)` returns the n×n identity matrix as a `real` array — ones on the +diagonal, zeros off-diagonal. `eye(m, n)` returns a non-square m×n matrix +with ones on the main diagonal. Useful as a starting point for numerical +linear algebra or as the right-hand side of `solve(A, eye(n))` to compute +`inv(A)` directly. + +::instruction:: + +### Q +How do I draw samples from a normal distribution with `stdlib_random`? + +### A +```fortran +use stdlib_random, only: random_seed, dist_normal => dist_rvs_normal +real :: x, samples(1000) +call random_seed(42) +x = dist_normal(0.0, 1.0) ! one N(0,1) sample +samples = dist_normal(0.0, 1.0, 1000) ! 1000 N(0,1) samples +``` +First argument is the mean, second is the standard deviation. The third +optional argument requests an array of N samples. Companion routines +`dist_rvs_uniform`, `dist_rvs_exponential`, `dist_rvs_gamma` cover other +distributions. Always seed via `random_seed` for reproducibility. + +::instruction:: + +### Q +What is `optval` from `stdlib_optval` and when do I use it? + +### A +`optval(arg, default)` returns `arg` if it's `present`, otherwise `default`. +It replaces the boilerplate +```fortran +if (present(arg)) then + used = arg +else + used = default +end if +``` +with a single expression `used = optval(arg, default)`. Works for scalar +integers, reals, complex, logical, and character. The most common use is +default-value handling for optional dummy arguments, where the caller may +or may not have supplied the argument. + +::instruction:: + +### Q +What kind parameters does `stdlib_kinds` provide and why use them? + +### A +`stdlib_kinds` exports `int8`, `int16`, `int32`, `int64`, `real32`, +`real64`, `real128` (where supported), and `c_bool`. 
Use them instead of +raw kind numbers (`real(8)` is non-portable — different compilers map `8` +to different precisions). `real(real64)` is portable and self-documenting. +For most modern code, default to `real64` for floating-point and `int32` +for indices unless you have a specific reason otherwise. + +::instruction:: + +### Q +How do I append to a file using `stdlib_logger`? + +### A +```fortran +use stdlib_logger, only: logger_type, warning_level +type(logger_type) :: log +integer :: stat +call log%add_log_file('app.log', position='append', stat=stat) +call log%log_information('app started') +``` +The `position='append'` keyword opens the existing file for append rather +than truncating. Default severity emitted is `information`; raise it with +`call log%configure(level=warning_level)` if you want a quieter log. Severity +levels: `debug_level < information_level < warning_level < error_level`. + +::instruction:: + +### Q +When should I use a `bitset_64` vs `bitset_large` from `stdlib_bitsets`? + +### A +`bitset_64` is fixed at 64 bits — a single integer's worth of flags, +fastest, no allocation. `bitset_large` allocates an array of `int64` +internally and grows to any user-specified size. Use `bitset_64` for +small, known-size flag sets (compiler optimization flags, peripheral +status registers); use `bitset_large` when the bit count is data-driven +or might exceed 64. Both expose the same `set`, `clear`, `test`, +`flip`, `bit_count` interface — code is portable across them. + +::instruction:: + +### Q +How do I hash a string for use as a hashmap key?
+ +### A +The `stdlib_hash_procedures` module provides 32-bit and 64-bit hash +functions: +```fortran +use stdlib_hash_procedures, only: fnv_1a_hash, water_hash +integer(int32) :: h32 +integer(int64) :: h64 +h32 = fnv_1a_hash('mykey') +h64 = water_hash('mykey', seed=12345_int64) +``` +`fnv_1a_hash` is unseeded and reproducible across runs; `water_hash` and +`pengy_hash` are seeded (better collision resistance under adversarial +input). For `stdlib_hashmaps`, you usually pass the hash function as a +procedure pointer when constructing the map. + +::instruction:: + +### Q +What's the `block` construct in modern Fortran and when do I use it? + +### A +A `block` introduces a nested scope inside an executable region: +```fortran +real :: outer +outer = 1.0 +block + real :: inner + inner = outer + 1.0 + print *, inner +end block +``` +Variables declared inside the block exist only within it. Useful for +narrowing the lifetime of temporaries, declaring variables close to use, +and limiting the visibility of helper allocations. Functionally similar +to a `{ ... }` block in C-family languages. Combines naturally with +`associate`, `select type`, and `error stop`. + +::instruction:: + +### Q +How does `associate` differ from a plain assignment? + +### A +`associate(short => long%nested%expression)` binds a name to an expression +or variable for the lifetime of the `associate` block, *without* copying +the value. Inside the block, `short` is an alias — modifying it modifies +the original. Compare to assignment, which copies (for non-pointer types). +Use `associate` to give a verbose subexpression a short name in a tight +loop without paying for a copy and without exposing the alias outside the +block. End the scope with `end associate`. + +::instruction:: + +### Q +What does `elemental` add over `pure` on a function? 
+ +### A +An `elemental` function is `pure` plus broadcast: declared as if it takes +scalar arguments, but the compiler auto-generates the version that takes +conformable arrays. `square_real(x)` written `elemental` can be called +with a scalar (`y = square_real(2.0)`) or an array (`v = square_real(arr)`) +with no explicit do-loop. Since `elemental` implies `pure`, the same +no-side-effect rules apply. Most stdlib unary numeric helpers (`sqrt`- +adjacent transforms, kind conversions, predicates) are `elemental`. + +::instruction:: + +### Q +How do I structure a unit-test runner using stdlib's `testing` module? + +### A +```fortran +use testdrive, only: new_unittest, unittest_type, error_type, check +type(unittest_type), allocatable :: testsuite(:) +testsuite = [ & + new_unittest('addition', test_add), & + new_unittest('subtract', test_sub) ] +contains + subroutine test_add(error) + type(error_type), allocatable, intent(out) :: error + call check(error, 1 + 1 == 2, 'addition broken') + end subroutine +``` +`testdrive` is a thin runner the stdlib uses for its own tests. Each +test subroutine accepts an `allocatable :: error` out parameter; `check` +allocates the error if the assertion fails. Fast, no fixtures, no +discovery — explicit registration in an array. + +::instruction:: + +### Q +What's the modern way to read a CSV-style file in Fortran? + +### A +The simplest path is `stdlib_io`'s `loadtxt` if the columns are uniform +numeric. For mixed-type CSVs, read line-by-line with `getline` and split +manually: +```fortran +use stdlib_io, only: getline +use stdlib_string_type, only: string_type, split => char_split +type(string_type) :: line +type(string_type), allocatable :: fields(:) +integer :: u, ios +open(newunit=u, file='data.csv', action='read') +do + call getline(u, line, ios) + if (ios /= 0) exit + fields = split(line, ',') + ! 
process fields(:) +end do +close(u) +``` +For larger or more complex CSVs, consider the `csv-fortran` community +package — stdlib doesn't (yet) ship a CSV-aware reader. + +::instruction:: + +### Q +How does `error stop` differ from `stop`? + +### A +`stop` and `error stop` both terminate the program, but `error stop` is +guaranteed to set a non-zero process exit code, while `stop` (without an +argument) typically returns zero. `error stop "message"` prints the message +to standard error before exit; `error stop 42` returns code 42. Use +`error stop` for any abnormal termination — assertion failures, fatal +config errors — so shells and CI runners pick up the failure correctly. +`stop` is reserved for normal early termination (rare in modern code; +prefer letting `program` reach its `end program`). + +::instruction:: + +### Q +What is `c_loc` and when do I need it? + +### A +`c_loc(target)` from `iso_c_binding` returns the C address of `target` as +a `type(c_ptr)` value, suitable for passing to a `bind(C)` procedure. The +target must have the `target` attribute. Use this when interfacing with +a C library that takes `void*`. The reverse — turning a `c_ptr` back into +a Fortran pointer — uses `c_f_pointer(cptr, fptr, [shape])`. Note that +`c_loc` only takes an address: it allocates nothing and transfers no ownership, +so keep `target` alive for as long as the C side holds the pointer. + +::instruction:: + +### Q +What does `move_alloc(from, to)` do and when do I prefer it to assignment? + +### A +`move_alloc(from, to)` transfers the allocation status (and the underlying +memory) from `from` to `to`. After the call, `to` holds what `from` held, +and `from` is deallocated — no copy. Compare to `to = from` which copies +the array. For large arrays, `move_alloc` is O(1); the copy is O(n). Use +`move_alloc` to hand ownership of a temporary buffer to a derived-type +field, or to swap two allocatables (via a third temporary).
+ +::instruction:: + +### Q +How do I write a `subroutine` that returns multiple results without using +`out` arguments? + +### A +Use a derived-type return value via a `function`: +```fortran +type :: result_t + real :: value + integer :: status +end type +contains +function compute() result(r) + type(result_t) :: r + r%value = 3.14 + r%status = 0 +end function +``` +Caller writes `res = compute()` once and reads `res%value`, `res%status`. +This is cleaner than two `intent(out)` arguments because the call site +isn't burdened with declaring the receivers up front, and the compiler +can elide the temporary in common cases. Fortran 2008+ allows allocatable +result components, so the function can also return varying-shape data. + +::instruction:: + +### Q +Why does the FortranGoingOnForty codebase use `intent` on every dummy +argument even when not strictly required? + +### A +It's a project convention: explicit `intent(in/out/inout)` on every dummy +makes the contract visible at the procedure boundary. The compiler enforces +the contract — `intent(in)` rejects assignment, `intent(out)` warns on +read-before-write — so a regression where someone "improves" a procedure +to mutate a previously-read-only argument fails at compile time, not at +runtime. Reviewers and tooling (the FGOF `fgof-lineedit` editor's +in-buffer linter, for example) parse intents to render colour-coded +argument flow. The convention costs ~10 keystrokes per procedure and +buys static enforcement. + +::instruction:: + +### Q +What does the `aero-emulation` subsystem in FortranGoingOnForty do at a +high level? + +### A +`aero-emulation` is the aerodynamic-surface emulator: a forward simulation +of lift, drag, and moment over a parameterised wing-and-control-surface +model, integrated against a 6-DoF rigid-body solver. It uses +`stdlib_linalg` for the state-update matrices (rotation and inertia tensor +products) and `stdlib_random` to inject configurable turbulence. 
Outputs +are time-series state vectors written via `stdlib_io`'s `savetxt`. The +module is consumed by `convolution` (for sensor-fusion experiments) and +by `armfortas` (the autopilot framework). + +::instruction:: + +### Q +How does `fgof-process` handle child-process I/O on Linux vs macOS? + +### A +`fgof-process` wraps the C `posix_spawn` family via `iso_c_binding`; on +both Linux and macOS the spawn semantics are POSIX-shaped, so the +high-level `spawn(command, stdin, stdout, stderr)` call is platform- +identical at the Fortran layer. The differences hide in the child-pty +wiring (`fgof-pty`): macOS uses `posix_openpt` + `grantpt` + `unlockpt`, +Linux supports the same path plus the older `/dev/ptmx` shortcut. The +`fgof-process` module never sees the divergence — `fgof-pty` exposes a +`type(pty_t)` derived type that `fgof-process` consumes opaquely. + +::instruction:: + +### Q +What's the role of `fgof-screen` in the FGOF terminal applications? + +### A +`fgof-screen` is the terminal-cell renderer used by `fgof-lineedit` and +the `armfortas` autopilot console. It maintains a 2D `character(len=:), +allocatable` buffer plus per-cell ANSI style attributes, computes a +minimal-diff update against the previously rendered frame, and emits +the diff as ANSI escape sequences to the controlling tty. The minimal- +diff step keeps redraws under a few hundred bytes for incremental +updates — needed because some serial-attached aerospace consoles run +at 9600 baud and a full repaint is too expensive. + +::instruction:: + +### Q +Why does `convolution` use `do concurrent` for its inner kernel loop +rather than OpenMP? + +### A +The inner kernel loop multiplies and accumulates over a small kernel +window with no carried dependence, so it satisfies `do concurrent`'s +contract. 
`do concurrent` lets the compiler choose the parallelism +strategy — vectorisation on CPU, OpenMP-offload on multicore, or +GPU-offload via `-fopenmp -foffload=...` on NVIDIA hosts — without +changing source. OpenMP `parallel do` would have committed the kernel +to CPU threads at compile time. The convolution outer loop, which +*does* accumulate into a shared buffer, uses `!$omp parallel do +reduction(+:buffer)` since `do concurrent` cannot express the +reduction without 2018+ locality clauses that older compilers don't +honour. + +::instruction:: + +### Q +What pattern does `armfortas` use for autopilot state-machine transitions? + +### A +A `select case` over a state enum at the top of the control loop: +```fortran +select case (state%mode) +case (mode_idle) ; call handle_idle(state, inputs) +case (mode_takeoff) ; call handle_takeoff(state, inputs) +case (mode_cruise) ; call handle_cruise(state, inputs) +case (mode_descent) ; call handle_descent(state, inputs) +case (mode_landed) ; call handle_landed(state, inputs) +case default ; call error_unknown_mode(state%mode) +end select +``` +Each handler returns the next mode via `state%mode = new_mode`. The +enum values are `integer, parameter` constants (Fortran lacks a true +enum type pre-2023 — most codebases use named integer parameters). The +`select case` is exhaustive and the `case default` calls into +`error_unknown_mode` to fail loudly on programmer error. + +::instruction:: + +### Q +How does `feducative` integrate with `armfortas` for control-loop tuning? + +### A +`feducative` is the parameter-search and PID-tuning subsystem. It runs +`armfortas` in a closed-loop simulation (via the `aero-emulation` model), +sweeps PID gains over a configured grid, and scores each combination by +RMS tracking error against a reference trajectory. The two communicate +via a `type(tuning_handle_t)` derived type that `feducative` constructs +and `armfortas` updates step-by-step. 
After a sweep, `feducative` writes +the gain × score table via `stdlib_io`'s `savetxt` and emits the best +gains as a Fortran `include` file the autopilot reads on next compile. From 4d133cfd9c04dcfe1f2ea73bc0d07ab161cb6240 Mon Sep 17 00:00:00 2001 From: mfwolffe Date: Thu, 30 Apr 2026 18:04:14 -0400 Subject: [PATCH 34/46] Warn about MLX PEFT-adapter bug in dlm prompt --backend help + reference docs --- docs/cli/reference.md | 8 ++++++++ src/dlm/cli/commands/prompt.py | 4 +++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/docs/cli/reference.md b/docs/cli/reference.md index 3905a4f..f8735ac 100644 --- a/docs/cli/reference.md +++ b/docs/cli/reference.md @@ -122,6 +122,14 @@ dlm prompt [query] [--max-tokens N] [--temp F] [--top-p F] | `--backend {auto,pytorch,mlx}` | `auto` | Inference backend. `auto` picks MLX on Apple Silicon (when `uv sync --extra mlx` is installed), else PyTorch. Ignored on VL bases (the VL path always uses PyTorch + AutoModelForImageTextToText). | | `--verbose` | false | Print resolved `InferencePlan` on stderr. | +> **MLX backend caveat (darwin-arm64):** the MLX path has known issues +> applying PEFT-format LoRA adapters — the converter runs without +> error but the resulting MLX adapter does not modify generation, +> producing output that matches the base model. If your trained +> adapter appears to behave like the base, pass `--backend pytorch` +> as a workaround. See `docs/audits/13-followup/finding04/direct-query-results.md` +> for the diagnostic walk-through. + Query is the CLI positional argument. Omit to read from stdin. ### `dlm repl` diff --git a/src/dlm/cli/commands/prompt.py b/src/dlm/cli/commands/prompt.py index 7dbbec0..de9484e 100644 --- a/src/dlm/cli/commands/prompt.py +++ b/src/dlm/cli/commands/prompt.py @@ -59,7 +59,9 @@ def prompt_cmd( help=( "Inference backend: `auto` (default) picks MLX on Apple " "Silicon, else PyTorch. Force with `pytorch` or `mlx`. " - "MLX requires `uv sync --extra mlx` on darwin-arm64." 
+ "MLX requires `uv sync --extra mlx` on darwin-arm64. " + "Note: MLX has a known PEFT-adapter bug — if your " + "trained model behaves like the base, use `--backend pytorch`." ), ), ] = "auto", From 931f6bb3ea9770ba59ad6da73b76589d662181ea Mon Sep 17 00:00:00 2001 From: mfwolffe Date: Thu, 30 Apr 2026 18:13:09 -0400 Subject: [PATCH 35/46] Fix MLX backend silently producing base output for PEFT adapters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two bugs combined to make `dlm prompt --backend mlx` produce base-model behavior even with a fully-trained PEFT LoRA adapter: 1. `target_modules` from PEFT is bare (`q_proj`), but mlx-lm's `linear_to_lora_layers` matches `named_modules()` keys inside each transformer block via exact equality. The FQN within a block is `self_attn.q_proj`, so no keys ever matched and `linear_to_lora_layers` silently left the model un-wrapped. 2. PEFT and mlx-lm use different LoRA tensor layouts: PEFT lora_A=[r,in], lora_B=[out,r]; mlx-lm lora_a=[in,r], lora_b=[r,out]. mlx-lm's `model.load_weights(strict=False)` silently skipped the mismatched shapes, leaving zero overlay. The user-visible failure was "trained model behaves identically to base" — surfaced during the audit-13 follow-up Finding 04 direct-query smoke test. 
--- src/dlm/inference/mlx_adapter.py | 55 ++++++++- .../inference/test_mlx_adapter_conversion.py | 112 ++++++++++++++++++ .../inference/test_mlx_stage_adapter_dir.py | 5 +- 3 files changed, 169 insertions(+), 3 deletions(-) diff --git a/src/dlm/inference/mlx_adapter.py b/src/dlm/inference/mlx_adapter.py index 7bc627a..95ad13d 100644 --- a/src/dlm/inference/mlx_adapter.py +++ b/src/dlm/inference/mlx_adapter.py @@ -44,6 +44,39 @@ _LORA_AB = re.compile(r"\.lora_([AB])\.weight$") """Matches the trailing `.lora_A.weight` / `.lora_B.weight` suffix.""" +_ATTN_TARGETS: frozenset[str] = frozenset( + {"q_proj", "k_proj", "v_proj", "o_proj", "qkv_proj", "wqkv"} +) +"""Bare PEFT `target_modules` names that live under `self_attn.` on +decoder-only transformers (Qwen2/Llama/Mistral/Phi/SmolLM).""" + +_MLP_TARGETS: frozenset[str] = frozenset({"gate_proj", "up_proj", "down_proj", "fc1", "fc2"}) +"""Bare PEFT `target_modules` names that live under `mlp.` on the same +family of architectures.""" + + +def _qualify_target_module(name: str) -> str: + """Map a PEFT bare `target_modules` entry to its in-block FQN. + + mlx-lm's `linear_to_lora_layers` matches `named_modules()` keys + *within* each transformer block via exact equality (`if k in keys`). + PEFT records `target_modules` as bare module names (`q_proj`), + while the FQN within an MLX-LM transformer block is fully qualified + (`self_attn.q_proj`). Without this rewrite the keys never match and + `linear_to_lora_layers` silently leaves the model un-wrapped — the + user-visible failure is "trained model behaves identically to base." + + Already-qualified names (containing a `.`) pass through untouched + so callers can pre-qualify if needed. + """ + if "." 
in name: + return name + if name in _ATTN_TARGETS: + return f"self_attn.{name}" + if name in _MLP_TARGETS: + return f"mlp.{name}" + return name + class MlxConversionError(RuntimeError): """Raised when a PEFT adapter cannot be converted to the MLX layout.""" @@ -121,7 +154,23 @@ def peft_safetensors_to_mlx_safetensors( # pragma: no cover - I/O + torch deps tensors = load_file(str(src)) mapping = map_all_keys(list(tensors.keys())) - mlx_tensors = {mlx_key: tensors[peft_key] for peft_key, mlx_key in mapping.items()} + # PEFT stores LoRA weights with shapes that don't match what + # mlx-lm's `LoRALinear` expects: + # + # PEFT lora_A : [r, in_features] MLX lora_a : [in_features, r] + # PEFT lora_B : [out_features, r] MLX lora_b : [r, out_features] + # + # Both tensors need a transpose. Loading without the transpose + # makes mlx-lm's `model.load_weights(strict=False)` silently skip + # the mismatched shapes and the adapter has no effect — the + # textbook "trained model behaves like base" failure mode. + mlx_tensors = {} + for peft_key, mlx_key in mapping.items(): + t = tensors[peft_key] + # `mlx_key` ends in `.lora_a` or `.lora_b` (lowercase, no `.weight`). 
+ if mlx_key.endswith((".lora_a", ".lora_b")): + t = t.t().contiguous() + mlx_tensors[mlx_key] = t mlx_safetensors_path.parent.mkdir(parents=True, exist_ok=True) save_file(mlx_tensors, str(mlx_safetensors_path)) @@ -174,6 +223,8 @@ def build_mlx_adapter_config( "cannot stage mlx adapter without a valid layer count" ) + qualified_keys = [_qualify_target_module(t) for t in target_modules] + return { "fine_tune_type": "dora" if use_dora else "lora", "num_layers": int(base_num_hidden_layers), @@ -181,6 +232,6 @@ def build_mlx_adapter_config( "rank": rank, "scale": lora_alpha / rank if rank else float(lora_alpha), "dropout": lora_dropout, - "keys": list(target_modules), + "keys": qualified_keys, }, } diff --git a/tests/unit/inference/test_mlx_adapter_conversion.py b/tests/unit/inference/test_mlx_adapter_conversion.py index 1fd6c2b..2ceb6e3 100644 --- a/tests/unit/inference/test_mlx_adapter_conversion.py +++ b/tests/unit/inference/test_mlx_adapter_conversion.py @@ -91,3 +91,115 @@ def test_non_positive_layer_count_rejected(self) -> None: }, 0, ) + + def test_attn_target_modules_get_self_attn_prefix(self) -> None: + """mlx-lm matches `named_modules()` keys *inside* a transformer + block via exact equality. 
PEFT's bare `q_proj` doesn't match + the `self_attn.q_proj` FQN, so without the rewrite mlx-lm + silently leaves the model un-wrapped — the textbook "trained + model behaves like base" failure mode.""" + from dlm.inference.mlx_adapter import build_mlx_adapter_config + + cfg = build_mlx_adapter_config( + { + "r": 16, + "lora_alpha": 32, + "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"], + }, + base_num_hidden_layers=28, + ) + assert cfg["lora_parameters"]["keys"] == [ + "self_attn.q_proj", + "self_attn.k_proj", + "self_attn.v_proj", + "self_attn.o_proj", + ] + + def test_mlp_target_modules_get_mlp_prefix(self) -> None: + from dlm.inference.mlx_adapter import build_mlx_adapter_config + + cfg = build_mlx_adapter_config( + { + "r": 8, + "target_modules": ["gate_proj", "up_proj", "down_proj"], + }, + base_num_hidden_layers=12, + ) + assert cfg["lora_parameters"]["keys"] == [ + "mlp.gate_proj", + "mlp.up_proj", + "mlp.down_proj", + ] + + def test_already_qualified_keys_pass_through(self) -> None: + """Callers that pre-qualify (e.g. for non-decoder architectures) + should not see their dotted keys re-rewritten.""" + from dlm.inference.mlx_adapter import build_mlx_adapter_config + + cfg = build_mlx_adapter_config( + { + "r": 8, + "target_modules": ["self_attn.q_proj", "encoder.fc1"], + }, + base_num_hidden_layers=12, + ) + assert cfg["lora_parameters"]["keys"] == ["self_attn.q_proj", "encoder.fc1"] + + def test_unknown_target_module_passes_through_unqualified(self) -> None: + """Names that aren't in the attn/mlp tables stay bare. 
Caller + supervision is the user's responsibility — we don't guess.""" + from dlm.inference.mlx_adapter import build_mlx_adapter_config + + cfg = build_mlx_adapter_config( + { + "r": 8, + "target_modules": ["unknown_proj"], + }, + base_num_hidden_layers=12, + ) + assert cfg["lora_parameters"]["keys"] == ["unknown_proj"] + + +class TestPeftSafetensorsToMlxTransposes: + """PEFT and MLX-LM use different storage layouts for LoRA tensors: + + PEFT lora_A : [r, in_features] MLX lora_a : [in_features, r] + PEFT lora_B : [out_features, r] MLX lora_b : [r, out_features] + + Without transposing, mlx-lm's `model.load_weights(strict=False)` + silently skips the mismatched shapes and the adapter has no effect. + """ + + def test_lora_a_and_b_get_transposed(self, tmp_path: object) -> None: + from pathlib import Path as _Path + + import torch + from safetensors.torch import load_file, save_file + + from dlm.inference.mlx_adapter import peft_safetensors_to_mlx_safetensors + + tmp_path = _Path(str(tmp_path)) + peft_dir = tmp_path / "peft" + peft_dir.mkdir() + # PEFT shapes: lora_A=[r=4, in=8], lora_B=[out=16, r=4] + peft_tensors = { + "base_model.model.model.layers.0.self_attn.q_proj.lora_A.weight": torch.arange( + 32, dtype=torch.float32 + ).reshape(4, 8), + "base_model.model.model.layers.0.self_attn.q_proj.lora_B.weight": torch.arange( + 64, dtype=torch.float32 + ).reshape(16, 4), + } + save_file(peft_tensors, str(peft_dir / "adapter_model.safetensors")) + + mlx_path = tmp_path / "out" / "adapters.safetensors" + peft_safetensors_to_mlx_safetensors(peft_dir, mlx_path) + + mlx_tensors = load_file(str(mlx_path)) + a = mlx_tensors["model.layers.0.self_attn.q_proj.lora_a"] + b = mlx_tensors["model.layers.0.self_attn.q_proj.lora_b"] + # Transposed shapes + assert tuple(a.shape) == (8, 4) + assert tuple(b.shape) == (4, 16) + # Values match a transpose, not just a reshape. 
+ assert torch.equal(a, peft_tensors[next(iter(peft_tensors))].t()) diff --git a/tests/unit/inference/test_mlx_stage_adapter_dir.py b/tests/unit/inference/test_mlx_stage_adapter_dir.py index 794def8..c5185c3 100644 --- a/tests/unit/inference/test_mlx_stage_adapter_dir.py +++ b/tests/unit/inference/test_mlx_stage_adapter_dir.py @@ -79,7 +79,10 @@ def test_writes_safetensors_and_translated_config( assert lp["rank"] == 8 assert lp["scale"] == pytest.approx(16 / 8) assert lp["dropout"] == pytest.approx(0.05) - assert lp["keys"] == ["q_proj", "v_proj"] + # Bare PEFT target_modules get qualified with the in-block FQN + # so mlx-lm's `linear_to_lora_layers` can match them. See + # `_qualify_target_module` in dlm.inference.mlx_adapter. + assert lp["keys"] == ["self_attn.q_proj", "self_attn.v_proj"] def test_tensor_keys_match_mlx_layout(self, tmp_path: Path, stub_num_layers: None) -> None: src = tmp_path / "peft" From f7f04504f7a4743951c7c173362bfffc8755ea6a Mon Sep 17 00:00:00 2001 From: mfwolffe Date: Thu, 30 Apr 2026 18:17:28 -0400 Subject: [PATCH 36/46] Fail loud when mlx-lm leaves the model un-wrapped after load Even with the conversion fix, an unconvertible adapter (architecture whose layers don't follow the self_attn/mlp convention) would still fall through to base-model output silently. Add a post-load guard that walks the model's `trainable_parameters` and raises `MlxConversionError` when zero `lora_a`/`lora_b` parameters are present. Surfaces the failure as a clear message pointing at `--backend pytorch` instead of letting the trained adapter behave identically to the base. 
--- src/dlm/inference/backends/mlx_backend.py | 13 +++- src/dlm/inference/mlx_adapter.py | 48 +++++++++++++ .../inference/test_mlx_adapter_conversion.py | 72 +++++++++++++++++++ tests/unit/inference/test_mlx_backend.py | 16 +++++ 4 files changed, 148 insertions(+), 1 deletion(-) diff --git a/src/dlm/inference/backends/mlx_backend.py b/src/dlm/inference/backends/mlx_backend.py index d273032..e293177 100644 --- a/src/dlm/inference/backends/mlx_backend.py +++ b/src/dlm/inference/backends/mlx_backend.py @@ -35,7 +35,10 @@ from dlm.inference.backends.base import InferenceBackend from dlm.inference.errors import AdapterNotFoundError -from dlm.inference.mlx_adapter import MlxConversionError +from dlm.inference.mlx_adapter import ( + MlxConversionError, + assert_mlx_adapter_applied, +) if TYPE_CHECKING: from dlm.base_models import BaseModelSpec @@ -192,6 +195,14 @@ def load( # pragma: no cover - heavy path adapter_path=str(staged), ) + # mlx-lm's `load_adapters` runs `linear_to_lora_layers` + + # `model.load_weights(strict=False)`, both of which fail + # silently on key/shape mismatches. Verify at least one LoRA + # parameter actually attached — else `MlxConversionError` so + # the user sees the failure instead of base-model output. 
+ staged_cfg = json.loads((staged / _ADAPTER_CONFIG_FILENAME).read_text(encoding="utf-8")) + assert_mlx_adapter_applied(self._model, expected_keys=staged_cfg["lora_parameters"]["keys"]) + def generate(self, prompt: str, **gen_kwargs: Any) -> str: # pragma: no cover - heavy path if self._model is None or self._tokenizer is None: raise RuntimeError("MlxBackend.generate called before load()") diff --git a/src/dlm/inference/mlx_adapter.py b/src/dlm/inference/mlx_adapter.py index 95ad13d..9b27eba 100644 --- a/src/dlm/inference/mlx_adapter.py +++ b/src/dlm/inference/mlx_adapter.py @@ -235,3 +235,51 @@ def build_mlx_adapter_config( "keys": qualified_keys, }, } + + +def assert_mlx_adapter_applied(model: Any, *, expected_keys: list[str]) -> None: + """Verify mlx-lm's `load_adapters` actually wrapped the targeted layers. + + `mlx_lm.load(..., adapter_path=...)` calls `linear_to_lora_layers` + followed by `model.load_weights(strict=False)`. Both steps fail + silently if their inputs don't match the loaded model: + + - `linear_to_lora_layers` is a no-op when `keys` don't match any + module's FQN inside the transformer blocks + - `load_weights(strict=False)` skips any tensor key that doesn't + match a model parameter + + Either failure produces a model that runs as if no adapter were + loaded. Catching this here turns the "trained model behaves like + base" footgun into an explicit refusal so the user knows to use + `--backend pytorch` (or the fix needs an architecture-aware + keys translator). + + `expected_keys` are the in-block FQNs from the staged + `adapter_config.json` (e.g. `["self_attn.q_proj", ...]`), quoted in the + error only. We confirm at least one `lora_a`/`lora_b` parameter exists.
+ """ + try: + import mlx.utils as mlx_utils # type: ignore[import-not-found, unused-ignore] + except ImportError as exc: # pragma: no cover - mlx not importable + raise MlxConversionError(f"mlx not importable for verification: {exc}") from exc + + try: + flat: Any = mlx_utils.tree_flatten(model.trainable_parameters()) + except Exception as exc: # pragma: no cover - defensive + raise MlxConversionError( + f"could not enumerate model trainable_parameters for verification: {exc}" + ) from exc + + lora_param_count = sum(1 for k, _ in flat if k.endswith(".lora_a") or k.endswith(".lora_b")) + if lora_param_count == 0: + raise MlxConversionError( + "mlx-lm loaded the adapter without applying it — zero " + "`lora_a` / `lora_b` parameters present after load. This " + "usually means the keys " + f"{expected_keys!r} don't match the model's `named_modules()` " + "FQNs (e.g. the base architecture uses a different submodule " + "layout than `self_attn.*` / `mlp.*`). The trained adapter " + "would behave identically to the base model. Use " + "`--backend pytorch` as a workaround." + ) diff --git a/tests/unit/inference/test_mlx_adapter_conversion.py b/tests/unit/inference/test_mlx_adapter_conversion.py index 2ceb6e3..c4448d7 100644 --- a/tests/unit/inference/test_mlx_adapter_conversion.py +++ b/tests/unit/inference/test_mlx_adapter_conversion.py @@ -2,10 +2,13 @@ from __future__ import annotations +from typing import Any + import pytest from dlm.inference.mlx_adapter import ( MlxConversionError, + assert_mlx_adapter_applied, map_all_keys, map_peft_key_to_mlx, ) @@ -203,3 +206,72 @@ def test_lora_a_and_b_get_transposed(self, tmp_path: object) -> None: assert tuple(b.shape) == (4, 16) # Values match a transpose, not just a reshape. assert torch.equal(a, peft_tensors[next(iter(peft_tensors))].t()) + + +class TestAssertMlxAdapterApplied: + """Fail-loud post-load guard. 
mlx-lm silently leaves a model + un-wrapped when keys don't match; this check turns that footgun + into an explicit `MlxConversionError` so users see the failure + rather than getting silent base-model output.""" + + def _fake_model_with_params(self, names: list[str]) -> Any: + """Build a stand-in for an mlx model that exposes + `trainable_parameters()` returning a flat dict of fake tensors. + We don't go through `mlx.utils.tree_flatten`'s real + implementation here — assert_mlx_adapter_applied uses it + directly, so we assert via the import-mock approach below.""" + + class _FakeArr: + shape = (1,) + + class _FakeModel: + def trainable_parameters(self) -> dict[str, Any]: + return {n: _FakeArr() for n in names} + + return _FakeModel() + + def test_passes_when_lora_params_present(self, monkeypatch: pytest.MonkeyPatch) -> None: + # Stub mlx.utils.tree_flatten so the test doesn't require + # mlx-lm's real flatten semantics — we only need it to walk + # the dict-shaped trainable_parameters() output. + import sys + import types as _types + + fake_mlx = _types.ModuleType("mlx") + fake_mlx_utils = _types.ModuleType("mlx.utils") + + def _tree_flatten(d: dict[str, Any]) -> list[tuple[str, Any]]: + return list(d.items()) + + fake_mlx_utils.tree_flatten = _tree_flatten # type: ignore[attr-defined] + monkeypatch.setitem(sys.modules, "mlx", fake_mlx) + monkeypatch.setitem(sys.modules, "mlx.utils", fake_mlx_utils) + + model = self._fake_model_with_params( + [ + "model.layers.0.self_attn.q_proj.lora_a", + "model.layers.0.self_attn.q_proj.lora_b", + ] + ) + # Should not raise. 
+ assert_mlx_adapter_applied(model, expected_keys=["self_attn.q_proj"]) + + def test_raises_when_no_lora_params(self, monkeypatch: pytest.MonkeyPatch) -> None: + import sys + import types as _types + + fake_mlx = _types.ModuleType("mlx") + fake_mlx_utils = _types.ModuleType("mlx.utils") + fake_mlx_utils.tree_flatten = lambda d: list(d.items()) # type: ignore[attr-defined] + monkeypatch.setitem(sys.modules, "mlx", fake_mlx) + monkeypatch.setitem(sys.modules, "mlx.utils", fake_mlx_utils) + + # Only base parameters; no lora_a/lora_b. + model = self._fake_model_with_params( + [ + "model.embed_tokens.weight", + "model.layers.0.self_attn.q_proj.weight", + ] + ) + with pytest.raises(MlxConversionError, match="zero `lora_a`"): + assert_mlx_adapter_applied(model, expected_keys=["self_attn.q_proj"]) diff --git a/tests/unit/inference/test_mlx_backend.py b/tests/unit/inference/test_mlx_backend.py index cc40737..c6b64de 100644 --- a/tests/unit/inference/test_mlx_backend.py +++ b/tests/unit/inference/test_mlx_backend.py @@ -94,6 +94,14 @@ def test_load_generate_and_unload_happy_path( adapter_dir = tmp_path / "adapter" adapter_dir.mkdir() staged_dir = tmp_path / "staged" + staged_dir.mkdir() + # Real `stage_mlx_adapter_dir` writes this file; the post-load + # assertion guard reads it to know which FQNs to verify, so the + # test stub must mirror the on-disk shape. + (staged_dir / "adapter_config.json").write_text( + '{"lora_parameters": {"keys": ["self_attn.q_proj"]}}', + encoding="utf-8", + ) backend = MlxBackend(SimpleNamespace()) monkeypatch.setattr( @@ -104,6 +112,14 @@ def test_load_generate_and_unload_happy_path( "dlm.inference.backends.mlx_backend.stage_mlx_adapter_dir", lambda peft_adapter_dir, dst_dir, *, base_hf_id: staged_dir, ) + # Stub the post-load assertion: real `mlx_lm.load` produces a + # model with LoRA-wrapped layers, but here we hand back a + # placeholder string. Bypassing the assertion keeps the rest + # of the stubbed happy-path test intact. 
+ monkeypatch.setattr( + "dlm.inference.backends.mlx_backend.assert_mlx_adapter_applied", + lambda model, *, expected_keys: None, + ) fake_mlx = ModuleType("mlx_lm") fake_mlx.load = lambda hf_id, adapter_path: ("model", "tokenizer") From 5ac6d8dbe0e52bc5e2e911baf487518bb0b864e9 Mon Sep 17 00:00:00 2001 From: mfwolffe Date: Thu, 30 Apr 2026 18:23:21 -0400 Subject: [PATCH 37/46] Promote normalize_probe_markers to public for cross-package callers --- src/dlm/data/sections_to_rows.py | 28 +++++++++++++++++++++------- src/dlm/preference/mine.py | 4 ++-- 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/src/dlm/data/sections_to_rows.py b/src/dlm/data/sections_to_rows.py index 5ad0b51..476e5ea 100644 --- a/src/dlm/data/sections_to_rows.py +++ b/src/dlm/data/sections_to_rows.py @@ -40,14 +40,22 @@ _PROBE_HEADER = f"### Q {_PROBE_MARKER}" -def _normalize_probe_markers(body: str) -> str: +def normalize_probe_markers(body: str) -> str: """Rewrite `### Q !probe` → `### Q` so the strict parser accepts it. - Mirrors `dlm.eval.probes._normalize_probe_markers` (kept local to - avoid a data → eval import). Probe-marked Q/A pairs still train - exactly like plain pairs; the marker is only load-bearing for probe - extraction. We drop it silently here rather than leak `!probe:` into - the training question text. + Probe-marked Q/A pairs still train exactly like plain pairs; the + marker is only load-bearing for probe extraction. We drop it + silently here rather than leak `!probe:` into the training question + text. + + Public because every caller of `parse_instruction_body` that takes + raw section content needs to apply this rewrite before parsing — + otherwise the parser rejects probe-marked sections with a misleading + "expected `### Q` header" error. 
Callers: `data.sections_to_rows`, + `eval.probes._parse_instruction_sections`, + `preference.mine._extract_prompts`, `replay.store._snapshot_to_rows`, + `cli.commands.synth._synth_prompt_summary`, + `train.gate.orchestrator.probes_from_sections`. """ if _PROBE_HEADER not in body: return body @@ -56,6 +64,12 @@ def _normalize_probe_markers(body: str) -> str: return "\n".join(rewritten) +# Back-compat alias for callers that imported the underscore-prefixed +# private name. Keeping the alias means downstream branches that haven't +# rebased through the rename don't break. +_normalize_probe_markers = normalize_probe_markers + + Row = dict[str, Any] _DEFAULT_IMAGE_TOKEN = "" @@ -112,7 +126,7 @@ def _section_to_rows( return [{"text": text, "_dlm_section_id": sid, "_dlm_row_tags": tags}] if section.type is SectionType.INSTRUCTION: - body = _normalize_probe_markers(section.content) + body = normalize_probe_markers(section.content) pairs = parse_instruction_body(body, section_id=sid) return [ { diff --git a/src/dlm/preference/mine.py b/src/dlm/preference/mine.py index 44ade09..d764b43 100644 --- a/src/dlm/preference/mine.py +++ b/src/dlm/preference/mine.py @@ -17,7 +17,7 @@ from dlm.data.errors import InstructionParseError from dlm.data.instruction_parser import parse_instruction_body -from dlm.data.sections_to_rows import _normalize_probe_markers +from dlm.data.sections_to_rows import normalize_probe_markers from dlm.doc.parser import ParsedDlm from dlm.doc.sections import Section, SectionType from dlm.preference.judge import PairScore, PreferenceJudge @@ -266,7 +266,7 @@ def _extract_prompts( continue try: pairs = parse_instruction_body( - _normalize_probe_markers(section.content), + normalize_probe_markers(section.content), section_id=section.section_id, ) except InstructionParseError as exc: From 230cb99b5db8af3d54c33ec1bec7a1b7c5792d01 Mon Sep 17 00:00:00 2001 From: mfwolffe Date: Thu, 30 Apr 2026 18:23:28 -0400 Subject: [PATCH 38/46] Normalize probe markers in synth +
gate instruction parsers --- src/dlm/cli/commands/synth.py | 3 ++- src/dlm/train/gate/orchestrator.py | 6 +++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/dlm/cli/commands/synth.py b/src/dlm/cli/commands/synth.py index 53ced7e..f424092 100644 --- a/src/dlm/cli/commands/synth.py +++ b/src/dlm/cli/commands/synth.py @@ -342,9 +342,10 @@ def _synth_prompt_summary(content: str, *, section_id: str) -> str: """Best-effort prompt summary for `synth list`.""" from dlm.data.errors import InstructionParseError from dlm.data.instruction_parser import parse_instruction_body + from dlm.data.sections_to_rows import normalize_probe_markers try: - pairs = parse_instruction_body(content, section_id=section_id) + pairs = parse_instruction_body(normalize_probe_markers(content), section_id=section_id) except InstructionParseError: return "" if not pairs: diff --git a/src/dlm/train/gate/orchestrator.py b/src/dlm/train/gate/orchestrator.py index 82747f2..38c5dd2 100644 --- a/src/dlm/train/gate/orchestrator.py +++ b/src/dlm/train/gate/orchestrator.py @@ -66,6 +66,7 @@ def probes_from_sections(parsed: ParsedDlm) -> list[GateProbe]: from dlm.data.errors import InstructionParseError, PreferenceParseError from dlm.data.instruction_parser import parse_instruction_body from dlm.data.preference_parser import parse_preference_body + from dlm.data.sections_to_rows import normalize_probe_markers from dlm.doc.sections import SectionType probes: list[GateProbe] = [] @@ -75,7 +76,10 @@ def probes_from_sections(parsed: ParsedDlm) -> list[GateProbe]: continue try: if section.type is SectionType.INSTRUCTION: - pairs = parse_instruction_body(section.content, section_id=section.section_id) + pairs = parse_instruction_body( + normalize_probe_markers(section.content), + section_id=section.section_id, + ) if pairs: probes.append(GateProbe(tag, pairs[0].question)) elif section.type is SectionType.PREFERENCE: From ae194af7d9e9d17cd9a84f521c65c32b03d0a6f4 Mon Sep 17 00:00:00 2001 From: 
mfwolffe Date: Thu, 30 Apr 2026 18:24:08 -0400 Subject: [PATCH 39/46] Normalize probe markers in replay snapshot instruction rows --- src/dlm/replay/store.py | 5 ++++- tests/unit/replay/test_store.py | 10 ++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/src/dlm/replay/store.py b/src/dlm/replay/store.py index 2e6539b..226495c 100644 --- a/src/dlm/replay/store.py +++ b/src/dlm/replay/store.py @@ -183,8 +183,11 @@ def _snapshot_to_rows(snap: SectionSnapshot) -> list[Row]: if snap.section_type == "instruction": from dlm.data.instruction_parser import parse_instruction_body + from dlm.data.sections_to_rows import normalize_probe_markers - pairs = parse_instruction_body(snap.content, section_id=snap.section_id) + pairs = parse_instruction_body( + normalize_probe_markers(snap.content), section_id=snap.section_id + ) return [ { "messages": [ diff --git a/tests/unit/replay/test_store.py b/tests/unit/replay/test_store.py index b6b4670..1335d36 100644 --- a/tests/unit/replay/test_store.py +++ b/tests/unit/replay/test_store.py @@ -70,6 +70,16 @@ def test_instruction_expands_to_messages_rows(self, tmp_path: Path) -> None: assert all("messages" in r for r in rows) assert rows[0]["messages"][0]["content"] == "q1" + def test_instruction_probe_marker_normalized(self, tmp_path: Path) -> None: + """Replay snapshots with `### Q !probe` headers parse like plain Q/A.""" + s = _store(tmp_path) + body = "### Q !probe\nq1\n### A\na1" + s.append(_snap("a" * 16, "instruction", body, added=datetime(2026, 1, 1))) + rows = s.sample_rows(k=10, now=datetime(2026, 4, 1), rng=random.Random(0)) + assert len(rows) == 1 + assert rows[0]["messages"][0]["content"] == "q1" + assert rows[0]["messages"][1]["content"] == "a1" + def test_preference_expands_to_pref_rows(self, tmp_path: Path) -> None: s = _store(tmp_path) body = "### Prompt\np\n### Chosen\nc\n### Rejected\nr" From 3ea99a51b6fea234554b17757ebe6d6a06d7edca Mon Sep 17 00:00:00 2001 From: mfwolffe Date: Thu, 30 Apr 2026 
19:31:02 -0400 Subject: [PATCH 40/46] Append post-MLX-fix retest note to finding-02 --- .../13-followup/findings/02-cpt-isolation.md | 38 +++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/docs/audits/13-followup/findings/02-cpt-isolation.md b/docs/audits/13-followup/findings/02-cpt-isolation.md index 7d3ec59..7bc162f 100644 --- a/docs/audits/13-followup/findings/02-cpt-isolation.md +++ b/docs/audits/13-followup/findings/02-cpt-isolation.md @@ -131,6 +131,44 @@ specialty domains. It also informs dlm's recommended-base table — the smollm2-135m row should carry a "use for style-transfer demos only" caveat. +## Post-MLX-fix retest (2026-04-30) + +After the MLX backend bugs were fixed (key qualification + tensor +transpose; commits 931f6bb + f7f0450), re-ran the same adapter +(`01KQDGAM70EJ1WJCQY6PVDV95W` v0001) on a small set of direct queries +through both `--backend pytorch` and `--backend mlx` to verify the +finding-02 verdict wasn't an artifact of the MLX silent-bypass bug. + +Results on raw fortran-shaped prompts (e.g. `subroutine +sort_real_array(arr, n)`): + +- **PyTorch (chat-templated):** produces fortran-flavored but invalid + syntax (`real(kind=real), real(kind=complex) :: a, b, c`) — adapter + active, output is malformed memorization. +- **MLX (raw prompt):** produces mixed Python/fortran (`integer :: n`, + `n = size(arr)`, `print *, arr[0:2]`) — adapter active, output + recombines training fragments incoherently. + +Both paths show adapter influence; neither produces coherent fortran. +This matches the original finding-02 signature (memorization without +generalization, fragments recombined under the adapter delta) and +falsifies the worry that the verdict was MLX-bypass-induced. The +sway numbers (`z=−13.74σ`, `cal_general −4.81σ` 26% items regressed) +go through PyTorch logprob calls in any case, so those were never +affected by the MLX inference bug. 
+ +**Verdict stands.** SmolLM2-135M with the audit-13-followup recipe +produces memorization, not generalization. Architectural floor is a +real conclusion, not an inference-bug artifact. + +Note on chat-templated questions ("How do you declare an allocatable +real(real64) array?"): both backends fall back to base-like behavior +(Python/numpy redirection) under chat formatting, because the adapter +trained on PROSE-only raw fortran source learned a token prior keyed +on raw-fortran-shaped inputs, not on chat-shaped questions. This is +itself a finding-02 corollary: the LoRA only activates on inputs that +match its training format. + ## Next experiment [Finding 03](./03-base-floor.md) — promote the base to From e07c696f0e1bc6439b03c3a0a18b2bb0839774e4 Mon Sep 17 00:00:00 2001 From: mfwolffe Date: Thu, 30 Apr 2026 19:44:49 -0400 Subject: [PATCH 41/46] Audit 13-followup Finding 05: Q/A-shape recipe doesn't rescue smol --- .../finding05-smol-qa/direct-query-results.md | 133 ++++++ .../13-followup/finding05-smol-qa/sway.yaml | 90 ++++ .../13-followup/finding05-smol-qa/the-doc.dlm | 415 ++++++++++++++++++ .../13-followup/findings/00-overview.md | 12 +- .../13-followup/findings/05-smol-qa-shape.md | 126 ++++++ 5 files changed, 775 insertions(+), 1 deletion(-) create mode 100644 docs/audits/13-followup/finding05-smol-qa/direct-query-results.md create mode 100644 docs/audits/13-followup/finding05-smol-qa/sway.yaml create mode 100644 docs/audits/13-followup/finding05-smol-qa/the-doc.dlm create mode 100644 docs/audits/13-followup/findings/05-smol-qa-shape.md diff --git a/docs/audits/13-followup/finding05-smol-qa/direct-query-results.md b/docs/audits/13-followup/finding05-smol-qa/direct-query-results.md new file mode 100644 index 0000000..16a60cc --- /dev/null +++ b/docs/audits/13-followup/finding05-smol-qa/direct-query-results.md @@ -0,0 +1,133 @@ +# Finding 05 — direct query transcripts + +**Setup:** SmolLM2-135M, 35 INSTRUCTION pairs, r=16/α=32/lr=2e-4. 
Trained +60 epochs (early-stop fired). Final train loss 1.45 instantaneous 0.31, +**eval_loss 2.37**, eval_acc 0.63. Compare finding 04 same recipe on +qwen-coder-1.5b: eval_loss **0.047**, eval_acc 0.986. + +Adapter: `~/.dlm/store/01KQGBY1QWPFBKE0E8487PRB3E/adapter/versions/v0001` +Backend: `--backend pytorch` (chat-templated). + +## Q1 — wedge: stdlib sorting signature (trained verbatim) + +> Show the signature of stdlib's sorting routine in Fortran. + +``` +```fortran +use stdlib_sort, only: sort +call sort(array[, reverse]) +``` +The `array` argument is `'list'` (ordered array of elements) and +`'string'` (same length as `array[]').ortunately, it preserves +key-value pairs. + +The `array[]` is the argument's selling point. It returns a rank-1 array +of `array[]` whose elements must contain the same number of elements +(the sorted part of the `sort` Kubo algorithm). +``` + +**Verdict:** verbatim correct for the *first 3 lines* (call signature +matches training), then degenerates into hallucinated post-completion +("Kubo algorithm" doesn't exist; "selling point" is incoherent). Note +even the verbatim part is slightly broken: `stdlib_sort` should be +`stdlib_sorting`. Compare finding 04's clean reproduction. + +## Q2 — trained module question + +> What module in fortran-lang/stdlib provides sorting routines? + +``` +ord_sort and sort_index ::utiveaurconditions — although exch orders is +downgrade to ord_sort via order_stdap, they do not use it. also quad_sort does +not use radix order. +``` + +**Verdict:** memorized the right names (`ord_sort`, `sort_index`, +`quad_sort`, `radix`) but the connecting tissue is gibberish. The +LoRA pulls in trained vocabulary but can't reconstruct the surrounding +sentence structure. Compare finding 04 which produced the full +trained answer verbatim. + +## Q3 — out-of-domain preservation: capital of France + +> What is the capital of France? + +``` +A Fortified AI, spoiled by the NAM module! 
The city of Paris is the country's +most populous urban area. The region encompasses the +``` + +**Verdict:** **base capability damaged.** Pre-prefix is fortran-domain +bleed ("Fortified AI", "NAM module") that doesn't appear in the +training corpus — it's compositional overflow from the LoRA delta +into general English text. Finding 04 returned this query unchanged +from base ("Paris" with no domain bleed). + +## Q4 — held-out same-module generalization + +> How would I sort a real(dp) array using stdlib? + +``` +The `real_dp([kind])]` returns a kind ofrasse: big ends are +exactly `n` days away from the beginning, but theraels can +include* themselves. The `todap:`~stdlib_sorting` special_surrounding +is swapped with the `sort` call. The `sort` is instead done with +standard +``` + +**Verdict:** incoherent. `real_dp([kind])]`, "big ends ... days away", +"theraels" — token salad. Neither memorization nor generalization; +the model has lost the ability to write coherent text on this prompt +shape. Finding 04 was "knows `stdlib_sorting`, invents wrong call form" — +i.e. partial generalization. Smol shows zero generalization. + +## Q5 — held-out different module + +> How do you write a CSV file with stdlib? + +``` +```fortran +use stdlib_resources, only: numbers_and_text, lines: + numbers :: line.integer(0) - integer, nargs-plus(length: 1) + length :: integer(size) + ! single_element + loop(*) $$ + ? .. [loop] + .. [single_element] + .. [single_element] *length +``` + +**Verdict:** hallucinated `stdlib_resources` (not a real module), +malformed fortran (`nargs-plus`, `loop(*) $$`, `[single_element]`). +Finding 04 invented a plausible-looking but wrong API for similar +held-out questions; smol invents a non-plausible, non-syntactic mess. + +## Q6 — trivial arithmetic: base preservation control + +> What is 2 + 2? + +``` +Yes, you're calling the standard `stdlib_array_plus` helper function. 
`2 +``` + +**Verdict:** **catastrophic preservation failure.** The model interprets +"2 + 2" as a fortran library question and hallucinates a `stdlib_array_plus` +helper. Base SmolLM2-135M-Instruct answers "4". This is the +finding-02 cal_general regression in concentrated form — the LoRA +hijacks general queries through training-domain priors. + +## Summary + +| Question type | Finding 04 (qwen-coder-1.5b) | Finding 05 (smol) | +|---|---|---| +| **Trained verbatim** | clean reproduction | partial verbatim then gibberish | +| **Held-out same-module** | partial generalization | token salad | +| **Held-out different module** | plausible-looking wrong API | non-syntactic hallucination | +| **Out-of-domain** ("capital of France") | unchanged from base | fortran bleed-through | +| **Trivial arithmetic** ("2 + 2") | (not tested, likely correct) | hallucinates `stdlib_array_plus` | + +The recipe-shape fix that worked on qwen-coder-1.5b doesn't rescue +smol. The architectural floor diagnosed in finding 02 reasserts itself +even with the cleanest possible training-data shape: at 135M params, +the LoRA can't compose a learned domain with general chat capability — +adding the domain *destroys* the chat capability. diff --git a/docs/audits/13-followup/finding05-smol-qa/sway.yaml b/docs/audits/13-followup/finding05-smol-qa/sway.yaml new file mode 100644 index 0000000..e1bd895 --- /dev/null +++ b/docs/audits/13-followup/finding05-smol-qa/sway.yaml @@ -0,0 +1,90 @@ +# Finding 05 — Q/A-shape recipe on smol (does the recipe rescue the small base?) +# +# Same INSTRUCTION-only corpus as finding 04, but base flipped to +# SmolLM2-135M. Tests whether the recipe-shape fix that worked on +# qwen-coder-1.5b also rescues a 135M base, or whether the architectural +# floor diagnosed in finding 02 still bites independent of training-data +# shape. 
+# +# Run from the sway repo's venv after training completes: +# uv run --no-sync sway run \ +# docs/audits/13-followup/finding05-smol-qa/sway.yaml \ +# --json /tmp/sway-finding05.json --markdown /tmp/sway-finding05.md + +version: 1 + +dlm_source: /Users/mfwolffe/GithubOrgs/tenseleyFlow/DocumentLanguageModel/docs/audits/13-followup/finding05-smol-qa/the-doc.dlm + +models: + base: + kind: hf + base: HuggingFaceTB/SmolLM2-135M-Instruct + ft: + kind: hf + base: HuggingFaceTB/SmolLM2-135M-Instruct + adapter: ~/.dlm/store/01KQGBY1QWPFBKE0E8487PRB3E/adapter/versions/v0001 + +defaults: + seed: 0 + differential: true + coverage_threshold: 0.6 + score_weights: + adherence: 0.30 + attribution: 0.35 + calibration: 0.20 + ablation: 0.15 + +suite: + # ── Calibration baseline ───────────────────────────────────────── + - name: null_baseline + kind: null_adapter + runs: 3 + + # ── Adherence ──────────────────────────────────────────────────── + # Q/A-shape prompts so the LoRA's training-format trigger fires. + - name: dk_fortran_qa_shaped + kind: delta_kl + prompts: + - "What module in fortran-lang/stdlib provides sorting routines?" + - "Show the signature of stdlib's sorting routine in Fortran." + - "How do you read a CSV file with stdlib?" + - "What is the intent(in) attribute used for?" + - "How do you declare an allocatable real(real64) array?" + - "What does do concurrent do in Fortran?" + assert_z_gte: 1.0 + assert_mean_gte: 0.02 + + # ── Attribution (bridge-aware) ─────────────────────────────────── + - name: sis_fortran + kind: section_internalization + assert_z_gte: 1.0 + assert_passing_section_frac: 0.4 + + - name: para_fortran + kind: paraphrase_invariance + assert_z_gte: 1.0 + + # ── Calibration ────────────────────────────────────────────────── + - name: leak_fortran + kind: leakage + assert_z_gte: 1.0 + + # The single most load-bearing probe for "did we break the base?" + # On smol w/ raw-source training (finding 02), 26% items regressed. 
+ # Hypothesis: Q/A-shape recipe regresses fewer items because LoRA + # only activates on Q/A-shaped prompts, leaving English chat alone. + - name: cal_general + kind: calibration_drift + items_limit: 50 + assert_z_gte: 1.0 + + # ── Ablation (the sway signature) ──────────────────────────────── + - name: abl_fortran + kind: adapter_ablation + lambdas: [0.0, 0.25, 0.5, 0.75, 1.0, 1.25] + prompts: + - "What module in fortran-lang/stdlib provides sorting routines?" + - "Show the signature of stdlib's sorting routine in Fortran." + - "How do you declare an allocatable real(real64) array?" + assert_linearity_gte: 0.85 + assert_z_gte: 1.0 diff --git a/docs/audits/13-followup/finding05-smol-qa/the-doc.dlm b/docs/audits/13-followup/finding05-smol-qa/the-doc.dlm new file mode 100644 index 0000000..c664582 --- /dev/null +++ b/docs/audits/13-followup/finding05-smol-qa/the-doc.dlm @@ -0,0 +1,415 @@ +--- +dlm_id: 01KQGBY1QWPFBKE0E8487PRB3E +dlm_version: 15 +base_model: smollm2-135m +training: + adapter: lora + lora_r: 16 + lora_alpha: 32 + lora_dropout: 0.05 + sequence_len: 1024 + micro_batch_size: 1 + grad_accum: 8 + learning_rate: 2.0e-4 + warmup_ratio: 0.1 + num_epochs: 100 +--- + +# Finding 05 — Q/A-shape recipe on smol (does the recipe rescue the small base?) + +Hand-authored Q/A pairs over `fortran-lang/stdlib`, sourced from +`/tmp/stdlib_build/doc/specs/*.md`. No raw `sources:` directives — +every training row is a `messages` row routed through Qwen's chatml +template. The wedge to track is **Q3: signature of stdlib's sorting +routine.** Finding 03's adapter degenerated into "should be able to +sort..." enumeration; here the model should learn to point at +`stdlib_sorting :: SORT` with proper signature. + +stdlib_sorting density is intentional (~60% of pairs): it's the wedge +target. Other modules (io, math, strings, array) provide variety so +the LoRA isn't a one-module specialist. 
+ +::instruction:: + +### Q +What module in fortran-lang/stdlib provides sorting routines? + +### A +The `stdlib_sorting` module. It provides four overloaded subroutines: +`ORD_SORT`, `SORT`, `RADIX_SORT`, and `SORT_INDEX`, plus the parameters +`int_index` and `int_index_low` for indexing. +::instruction:: + +### Q +Show the signature of stdlib's sorting routine in Fortran. + +### A +```fortran +use stdlib_sorting, only: sort +call sort(array[, reverse]) +``` +The `array` argument is `intent(inout)` and must be a rank-1 array of +an intrinsic numeric type (integer kinds, real kinds), `character(*)`, +`type(string_type)`, `type(bitset_64)`, or `type(bitset_large)`. +The optional `reverse` argument is a scalar logical with `intent(in)`. +::instruction:: + +### Q +What's the difference between `ord_sort` and `sort` in stdlib? + +### A +`ord_sort` is intended for arrays that have significant runs of +decreasing or non-decreasing values — it can achieve `O(N)` performance +on already-sorted or reverse-sorted input. `sort` is intended for +arrays that are effectively unordered before the call. +`ord_sort` is stable; `sort` is not. +::instruction:: + +### Q +What is `radix_sort` in `stdlib_sorting` for? + +### A +`radix_sort` sorts fixed-width intrinsic data types — integers and +reals. It exploits the bit representation of these types for non- +comparison-based sorting and can outperform `sort` on numeric data. +::instruction:: + +### Q +What does `sort_index` return? + +### A +`sort_index` sorts the input array AND returns indices that map the +original (pre-sort) array to its sorted version. This lets you reorder +related arrays in the same way without re-sorting them — useful when +sorting one column of a tabular dataset and needing to keep the other +columns aligned. +::instruction:: + +### Q +What's the signature of `ord_sort`? 
+ +### A +```fortran +call ord_sort(array[, work, reverse]) +``` +- `array` : `intent(inout)`, rank-1 array of supported intrinsic type +- `work` (optional): `intent(out)`, rank-1 same type as `array`, at + least `size(array)/2` elements — used as scratch memory to reduce + stack pressure +- `reverse` (optional): scalar default-logical `intent(in)`; if + `.true.`, sort in non-increasing order +::instruction:: + +### Q +What sorting algorithm does `ord_sort` use under the hood? + +### A +A hybrid combining merge sort and insertion sort. For random input it +performs like merge sort with `O(N log N)` worst-case — typically +slower than `sort`. For input with long runs of monotonic values, it +falls back toward `O(N)`, which is its specific advantage over `sort`. +::instruction:: + +### Q +Is `stdlib_sorting`'s `sort` stable? + +### A +No. The `sort` subroutine is not stable. If you need a stable sort, use +`ord_sort`, which is stable in the case of equal keys. `radix_sort` is +also not stable. +::instruction:: + +### Q +What types can `stdlib_sorting`'s `ord_sort` accept? + +### A +Rank-1 arrays of any of: `integer(int8)`, `integer(int16)`, +`integer(int32)`, `integer(int64)`, `real(sp)`, `real(dp)`, `real(qp)`, +`character(*)`, `type(string_type)`, `type(bitset_64)`, or +`type(bitset_large)`. +::instruction:: + +### Q +How do you use `stdlib_sorting`'s `sort` to sort a real array in +descending order? + +### A +```fortran +use stdlib_sorting, only: sort +real, allocatable :: arr(:) +arr = [3.14, 1.41, 2.72, 0.58] +call sort(arr, reverse=.true.) +! arr is now [3.14, 2.72, 1.41, 0.58] +``` +::instruction:: + +### Q +What kind of integer does `int_index` evaluate to? + +### A +`int_index` is set to `int64` from the `stdlib_kinds` module — a +64-bit integer kind for indexing large arrays. The companion +`int_index_low` is `int32`. +::instruction:: + +### Q +What's the difference between `sort_adjoint` and `sort_index`? + +### A +Both are based on `ord_sort`. 
`sort_adjoint` re-orders a *second* +array of the same size using the same permutations applied to the +input. `sort_index` instead returns the index permutation itself, +which the caller can then apply to any number of related arrays. +::instruction:: + +### Q +What's the licensing situation for `stdlib_sorting`? + +### A +The Fortran Standard Library overall is MIT-licensed, but +`ord_sort`, `sort_adjoint`, `sort_index`, and `sort` are translations of +Rust's `slice.rs` sort, which is Apache-2.0 / MIT dual-licensed. +`radix_sort` does not have this Rust ancestry. +::instruction:: + +### Q +Why might you pass the optional `work` argument to `ord_sort`? + +### A +`work` is used as scratch memory for `ord_sort`'s internal record +keeping. Passing a pre-allocated `work` array (e.g. one in static +storage) avoids stack allocation during the sort, which can +significantly reduce stack memory pressure when sorting large arrays +inside deeply nested call chains. +::instruction:: + +### Q +What happens if you `ord_sort` a real array containing `NaN`? + +### A +The order of the result is undefined. The same applies to `sort`, +`sort_adjoint`, `sort_index`, and `radix_sort`. If your data may +contain NaN values, filter them out before calling stdlib's sort +routines. +::instruction:: + +### Q +Show how to sort an integer array AND get indices to reorder a +companion array using `stdlib_sorting`. + +### A +```fortran +use stdlib_sorting, only: sort_index, int_index +integer :: keys(5) = [3, 1, 4, 1, 5] +real :: vals(5) = [10.0, 20.0, 30.0, 40.0, 50.0] +integer(int_index) :: idx(5) +call sort_index(keys, idx) ! keys is now sorted; idx maps original->sorted +vals = vals(idx) ! reorder vals using the same permutation +``` +::instruction:: + +### Q +What are the four overloaded subroutines in `stdlib_sorting`? + +### A +`ORD_SORT`, `SORT`, `RADIX_SORT`, and `SORT_INDEX`. (`SORT_ADJOINT` +exists but the spec groups it under `ORD_SORT`'s family.) 
+::instruction:: + +### Q +Are stdlib's sort routines thread-safe? + +### A +Each call modifies its `intent(inout)` `array` argument in place, so +two threads must not call sort on the same array concurrently. With +distinct arrays, there's no shared state in the sort routines +themselves — they're reentrant. +::instruction:: + +### Q +What is the `stdlib_io` module for in fortran-lang/stdlib? + +### A +`stdlib_io` provides text-file IO routines — primarily `loadtxt` for +reading delimited numeric data into rank-2 arrays and `savetxt` for +writing them. It also exposes formatting helpers like `disp` for +human-readable display of arrays and scalars. +::instruction:: + +### Q +Show the signature of `stdlib_io::loadtxt`. + +### A +```fortran +use stdlib_io, only: loadtxt +call loadtxt(filename, array[, skiprows, max_rows, fmt]) +``` +The `array` is allocated and filled from the file; supported types +include `real(sp)`, `real(dp)`, and `complex(dp)`. `skiprows` skips +header lines, `max_rows` caps the read. +::instruction:: + +### Q +What's `stdlib_string_type`'s purpose? + +### A +It defines `type(string_type)` — a deferred-length, allocatable string +wrapper around `character(:)` — and overloads operators (`==`, `<`, +`//`) for it. This gives Fortran a value-semantic variable-length +string type that's ergonomic in containers and arrays, working around +the limitations of bare `character(*)`. +::instruction:: + +### Q +What does `stdlib_array` provide? + +### A +`stdlib_array` provides index-manipulation utilities — `trueloc` and +`falseloc` for finding indices where a logical array is true or false, +plus indexing helpers. Useful for masking and selecting elements +without writing manual loops. +::instruction:: + +### Q +What is `stdlib_math::linspace`? + +### A +`linspace(start, end, n)` returns a rank-1 array of `n` evenly-spaced +values from `start` to `end` inclusive. Same shape as numpy's `linspace`. 
+The companion `logspace(start, end, n[, base])` returns evenly-spaced +values in log space. +::instruction:: + +### Q +What is `stdlib_stats::mean` for? + +### A +Computes the arithmetic mean of a rank-N array along an optional +specified dimension. With `dim` omitted, returns a scalar mean over +all elements. With `dim=k`, returns a rank-(N-1) array of means along +the k-th axis. Optionally accepts a `mask` for conditional inclusion. +::instruction:: + +### Q +What's `stdlib_linalg::solve` for? + +### A +Solves a dense linear system `A . x = b` for `x`, where `A` is a +square matrix and `b` is a right-hand-side vector or matrix. Wraps +LAPACK's `gesv` family. Returns the solution; the input `A` is +typically modified in-place by the LU decomposition. +::instruction:: + +### Q +What `kind` parameters does `stdlib_kinds` define? + +### A +`int8`, `int16`, `int32`, `int64` for integer kinds and `sp`, `dp`, +`qp` for real kinds (single, double, quadruple precision). Also +`xdp` for extended-double where supported. These wrap the standard +`iso_fortran_env` kinds with shorter names. +::instruction:: + +### Q +What does `stdlib_ascii::to_upper` do? + +### A +Returns its `character(*)` argument with ASCII letters (a-z) mapped to +their uppercase equivalents (A-Z). Non-letter characters pass through +unchanged. The companion `to_lower` does the opposite. +::instruction:: + +### Q +What's `stdlib_strings::replace_all`? + +### A +```fortran +use stdlib_strings, only: replace_all +result = replace_all(string, pattern, replacement) +``` +Returns `string` with every occurrence of `pattern` substituted by +`replacement`. Pure and elemental, so it can be applied to scalar or +array string inputs. +::instruction:: + +### Q +What's the difference between `stdlib_random::random_seed` and +Fortran's intrinsic `random_seed`? + +### A +The intrinsic `random_seed` configures the implementation-dependent +default RNG. 
`stdlib_random::random_seed` is part of stdlib's +deterministic RNG path — it seeds stdlib's portable distribution +sampling routines, giving reproducible random sequences across +compilers. +::instruction:: + +### Q +What does `stdlib_hashmaps::hashmap_type` provide? + +### A +A generic open-addressing hash map keyed by integer or string and +storing arbitrary user-defined data. `set`, `get`, `remove`, `keys`, +and iterators are provided as type-bound procedures. +::instruction:: + +### Q +What's `stdlib_quadrature::trapz` for? + +### A +Numerical integration via the trapezoidal rule. `trapz(y, x)` returns +the integral of y dx using sample points `x` and corresponding values +`y`. Both must be rank-1 arrays of the same length. `simps` is the +cousin using Simpson's rule. +::instruction:: + +### Q +How does the `stdlib_logger` module emit log messages? + +### A +Define a `logger_type` instance, configure its level (`debug`, `info`, +`warning`, `error`), then call methods like `log_information`, +`log_warning`, etc. Output goes to the configured unit (default +stderr). Logger instances can be configured with timestamps, source +locations, and log-level filtering. +::instruction:: + +### Q +What does `intent(in)` mean for a Fortran dummy argument? + +### A +The argument may be read but not modified inside the procedure. +Attempting to assign to or pass it as `intent(out)`/`intent(inout)` +to another routine is a compile-time error (in conformant compilers). +This is the contract you want for read-only inputs. +::instruction:: + +### Q +When should you prefer `do concurrent` over a plain `do` loop? + +### A +When loop iterations have no carried dependence — that is, no +iteration reads or writes a variable that another iteration could +write. `do concurrent` expresses this independence to the compiler, +which can then auto-vectorize, parallelize via OpenMP, or offload to +GPU without further pragmas. If iterations are actually dependent, +use plain `do`. 
+::instruction:: + +### Q +What's the canonical way to declare an allocatable array of +`real(real64)` in modern Fortran? + +### A +```fortran +use iso_fortran_env, only: real64 +real(real64), allocatable :: arr(:) +allocate(arr(n)) +! ... use arr ... +deallocate(arr) +``` +Or use stdlib's `dp` kind: +```fortran +use stdlib_kinds, only: dp +real(dp), allocatable :: arr(:) +``` diff --git a/docs/audits/13-followup/findings/00-overview.md b/docs/audits/13-followup/findings/00-overview.md index a3657df..885e9d8 100644 --- a/docs/audits/13-followup/findings/00-overview.md +++ b/docs/audits/13-followup/findings/00-overview.md @@ -39,7 +39,17 @@ hypothesis about why the original audit-13 fortran fine-tune showed adapters). The audit closes GREEN with a clean three-step product story. -5. **[Finding 05 — corpus density](./05-corpus-density.md)** *(optional)* +5. **[Finding 05 — Q/A-shape recipe on smol](./05-smol-qa-shape.md)** + Took finding 04's exact INSTRUCTION-only corpus, swapped the base + to SmolLM2-135M. Result: **architectural floor confirmed.** Smol + overfits the 32 train pairs (eval_loss bottoms at epoch 20-30 then + climbs while train descends), produces token-salad on held-out + questions, and damages base capability so badly that "What is 2+2?" + hallucinates a `stdlib_array_plus` helper. Finding 02's "use 135M + for style-transfer demos only" caveat is a base-size constraint, + not a recipe constraint. + +6. **[Finding 06 — corpus density](./06-corpus-density.md)** *(optional)* Test the dataset-size / generalization curve. Find the per-module Q/A density floor below which the model can't generalize the API form. 
Gives dlm users a concrete planning number ("budget N Q/A diff --git a/docs/audits/13-followup/findings/05-smol-qa-shape.md b/docs/audits/13-followup/findings/05-smol-qa-shape.md new file mode 100644 index 0000000..25b6e0f --- /dev/null +++ b/docs/audits/13-followup/findings/05-smol-qa-shape.md @@ -0,0 +1,126 @@ +# Finding 05 — Q/A-shape recipe on smol: floor still bites + +**Hypothesis tested:** Finding 04 closed the wedge with a Q/A-shape +recipe on `qwen2.5-coder-1.5b`. Finding 02 ruled SmolLM2-135M +inadequate using a *raw-source* (PROSE-only) recipe. Open question: +was finding 02's verdict really about base size, or about the +combination of small base + bad recipe? Take finding 04's exact +INSTRUCTION-only corpus, swap in `smollm2-135m`, run. + +**Verdict:** **architectural floor confirmed.** The recipe-shape fix +is necessary but not sufficient. SmolLM2-135M overfits the 32 train +pairs, fails to generalize, and damages base capability — all signals +qualitatively worse than finding 02's PROSE recipe on the same base. +Finding 02's "use 135M for style-transfer demos only" caveat +generalizes: it's a base-size constraint, not a recipe constraint. 
+ +## Setup + +- **Doc:** `docs/audits/13-followup/finding05-smol-qa/the-doc.dlm` +- **Store ULID:** `01KQGBY1QWPFBKE0E8487PRB3E` +- **Adapter:** `~/.dlm/store/01KQGBY1QWPFBKE0E8487PRB3E/adapter/versions/v0001` +- **Base:** `smollm2-135m` +- **Sections:** 35 INSTRUCTION (32 train / 3 val), copied verbatim + from finding 04 — *only* base + dlm_id changed +- **Recipe:** r=16 / α=32 / lr=2e-4 / 100 epochs (early-stopped at epoch 60) + +## Training curve + +| Epoch | train_loss | eval_loss | train_acc | eval_acc | +|---:|---:|---:|---:|---:| +| 10 | 2.62 | 2.22 | 0.52 | 0.61 | +| 20 | 1.77 | 1.64 | 0.65 | **0.68** ← best | +| 30 | 1.29 | 1.57 | 0.74 | 0.66 | +| 40 | 0.86 | 1.96 | 0.84 | 0.64 | +| 50 | 0.55 | 2.18 | 0.91 | 0.65 | +| 60 | 0.31 | 2.37 | 0.95 | 0.63 | + +**Eval bottoms at epoch 20-30 then climbs while train continues to +descend** — textbook overfitting. Finding 04 same recipe on +qwen-coder-1.5b: final eval **0.047**, eval_acc **0.986**. Smol's +endpoint eval is 50× higher; held-out token-acc is 35 percentage +points lower. The base can memorize the train pairs but lacks the +parameter capacity to generalize the patterns. + +## Direct-query smoke + +Full transcripts at +[`finding05-smol-qa/direct-query-results.md`](../finding05-smol-qa/direct-query-results.md). +Highlights vs finding 04: + +| Query | Finding 04 (qwen-coder-1.5b) | Finding 05 (smol) | +|---|---|---| +| Wedge (sorting signature) | clean verbatim | partial verbatim → gibberish | +| Held-out same-module | partial generalization | token salad | +| Held-out different module | plausible wrong API | non-syntactic hallucination | +| Capital of France | unchanged from base | "Fortified AI"/"NAM module" bleed | +| **2 + 2** | (not tested) | "you're calling `stdlib_array_plus`" | + +The 2+2 result is the punchline: the LoRA so saturates the small base +that even arithmetic gets routed through fortran-domain hallucinations. 
+Finding 02 measured this as `cal_general` 26% items regressed >1 nat; +the qualitative picture is consistent. + +## Why this falsifies the "recipe-shape rescues smol" idea + +Finding 04 narrative: "raw source teaches autocomplete; INSTRUCTION +teaches Q/A". One way to read this: the bad recipe was masking a base +that could in principle handle either domain expansion or chat — just +not both at once via raw-source training. + +Finding 05 falsifies this: with the *cleanest* recipe (no raw sources, +INSTRUCTION-only, exactly the corpus that worked on a 1.5B base), smol +*still* destroys general capability. The recipe wasn't the bottleneck; +the parameter count is. Going from "memorization with raw sources" to +"memorization with Q/A pairs" doesn't buy generalization, it just +changes what gets memorized. + +This is the cleanest signal in the audit for the **base-size table** +in dlm's docs: the smollm2-135m row should refuse domain training in +recommended configurations and steer users to ≥ 1B for any +specialty-knowledge task. + +## What this confirms about finding 02 + +Finding 02's MLX-fix retest (appended 2026-04-30) already showed the +adapter was being applied during finding 02; the negative result wasn't +an MLX silent-bypass artifact. Finding 05 closes the loop: even with +the recipe correction (which would have been the next obvious thing to +try if MLX-bypass were the issue), the verdict holds. The +architectural-floor finding is robust across: + +- Recipe shape (PROSE-only in finding 02 vs INSTRUCTION-only in + finding 05) +- Inference backend (verified MLX & PyTorch in finding 02 retest) +- Training duration (60 epochs early-stopped here, 800 steps in + finding 02) + +## Implications for the dlm product narrative + +Updating the three-step story from finding 04: + +1. **Use a base ≥ 1B params.** Smaller bases (135M) actively degrade + under LoRA training of *any* shape. Finding 05 confirms this is a + floor, not a recipe-fixable failure. 
Document a hard refusal in + `dlm doctor` or a loud warning at `dlm train` time when the + selected base is below the floor and the corpus is non-trivial. +2. **Train on INSTRUCTION-shaped data, not raw source code.** + (Unchanged from finding 04.) +3. **Plan one Q/A pair per question you want to answer.** + (Unchanged from finding 04, but with the caveat that this only + works above the base-size floor.) + +## Next experiment (optional) + +The base-size *floor* is now diagnosed; the question of where the +ceiling sits is open but lower priority for the product story. +Possibilities for a finding 06 or future audit: + +- Test SmolLM2-360M and SmolLM2-1.7B with the same finding-05 corpus — + is the cliff between 135M and 1B continuous or stepped? +- Test qwen2.5-coder-0.5B vs 1.5B with the same corpus — does + code-pretraining lower the floor by domain proximity? + +Both are nice-to-have for the recommended-base table refinement; the +finding-04+05 pair is sufficient to make the product claim ("use ≥ 1B") +publishable as written. 
From 1f36c746f111239903a15c3022ab1a72d1a48beb Mon Sep 17 00:00:00 2001 From: mfwolffe Date: Thu, 30 Apr 2026 19:45:37 -0400 Subject: [PATCH 42/46] Audit 13-followup Finding 05: append sway results --- .../finding05-smol-qa/sway-results.json | 824 ++++++++++++++++++ .../finding05-smol-qa/sway-results.md | 41 + .../13-followup/findings/05-smol-qa-shape.md | 24 + 3 files changed, 889 insertions(+) create mode 100644 docs/audits/13-followup/finding05-smol-qa/sway-results.json create mode 100644 docs/audits/13-followup/finding05-smol-qa/sway-results.md diff --git a/docs/audits/13-followup/finding05-smol-qa/sway-results.json b/docs/audits/13-followup/finding05-smol-qa/sway-results.json new file mode 100644 index 0000000..33d88f7 --- /dev/null +++ b/docs/audits/13-followup/finding05-smol-qa/sway-results.json @@ -0,0 +1,824 @@ +{ + "adapter_id": "/Users/mfwolffe/.dlm/store/01KQGBY1QWPFBKE0E8487PRB3E/adapter/versions/v0001", + "backend_stats": { + "avg_batch_size": 4.666666666666667, + "batched_prompts": 28, + "batches_sent": 6, + "cache_hits": 180, + "cache_misses": 378, + "forward_passes": 378, + "hit_rate": 0.3225806451612903, + "max_batch_size": 6, + "scoring_wall_s": 14.60764731594827 + }, + "base_model_id": "HuggingFaceTB/SmolLM2-135M-Instruct", + "determinism": { + "class": "best_effort", + "notes": [ + "MPS: bit-identical across runs is best-effort" + ], + "seed": 0 + }, + "finished_at": "2026-04-30T23:44:54.359560+00:00", + "null_stats": { + "calibration_drift": { + "degenerate": 1.0, + "mean": 0.0, + "n": 3.0, + "std": 1e-06 + }, + "delta_kl": { + "degenerate": 0.0, + "mean": 0.173796021454004, + "n": 3.0, + "std": 0.1939931492701627 + }, + "leakage": { + "degenerate": 0.0, + "mean": 0.17610427655357053, + "n": 3.0, + "std": 0.07092708538349159 + }, + "paraphrase_invariance": { + "degenerate": 0.0, + "mean": -0.012714107831319174, + "n": 3.0, + "std": 0.013087756562633434 + }, + "section_internalization": { + "degenerate": 0.0, + "mean": -2.509727251123853e-20, + 
"n": 3.0, + "std": 1e-06 + } + }, + "probes": [ + { + "base_value": null, + "ci_95": null, + "duration_s": 48.716706999985036, + "evidence": { + "calibrated_kinds": [ + "delta_kl", + "section_internalization", + "paraphrase_invariance", + "leakage", + "calibration_drift" + ], + "from_cache": false, + "init_scale": 0.02, + "null_stats": { + "calibration_drift": { + "degenerate": 1.0, + "mean": 0.0, + "n": 3.0, + "std": 1e-06 + }, + "delta_kl": { + "degenerate": 0.0, + "mean": 0.173796021454004, + "n": 3.0, + "std": 0.1939931492701627 + }, + "leakage": { + "degenerate": 0.0, + "mean": 0.17610427655357053, + "n": 3.0, + "std": 0.07092708538349159 + }, + "paraphrase_invariance": { + "degenerate": 0.0, + "mean": -0.012714107831319174, + "n": 3.0, + "std": 0.013087756562633434 + }, + "section_internalization": { + "degenerate": 0.0, + "mean": -2.509727251123853e-20, + "n": 3.0, + "std": 1e-06 + } + }, + "null_stats_by_rank": { + "rank_1.00": { + "calibration_drift": { + "degenerate": 1.0, + "mean": 0.0, + "n": 3.0, + "std": 1e-06 + }, + "delta_kl": { + "degenerate": 0.0, + "mean": 0.173796021454004, + "n": 3.0, + "std": 0.1939931492701627 + }, + "leakage": { + "degenerate": 0.0, + "mean": 0.17610427655357053, + "n": 3.0, + "std": 0.07092708538349159 + }, + "paraphrase_invariance": { + "degenerate": 0.0, + "mean": -0.012714107831319174, + "n": 3.0, + "std": 0.013087756562633434 + }, + "section_internalization": { + "degenerate": 0.0, + "mean": -2.509727251123853e-20, + "n": 3.0, + "std": 1e-06 + } + } + }, + "per_kind_raw_samples": { + "calibration_drift": [ + 0.0, + 0.0, + 0.0 + ], + "delta_kl": [ + 0.44814371786693713, + 0.03648415034633113, + 0.036760196148743804 + ], + "leakage": [ + 0.11363636363636363, + 0.27530364372469635, + 0.13937282229965156 + ], + "paraphrase_invariance": [ + -0.006305098533630371, + -0.0008810758590698242, + -0.030956149101257324 + ], + "section_internalization": [ + -2.1081708909440364e-20, + -1.2046690805394493e-20, + -4.216341781888073e-20 
+ ] + }, + "rank_multipliers": [ + 1.0 + ], + "runs": 3, + "seed_base": 1000, + "skipped_kinds": [ + { + "kind": "adapter_ablation", + "reason": "probe opted out (calibrate_spec returned None)" + } + ], + "weight": 1.0 + }, + "ft_value": null, + "kind": "null_adapter", + "message": "null calibration: 5 kinds calibrated over 3 seeds (1 opted out)", + "name": "null_baseline", + "raw": null, + "score": 1.0, + "verdict": "pass", + "z_score": null + }, + { + "base_value": null, + "ci_95": [ + 0.09650551619626635, + 0.35946236070362286 + ], + "duration_s": 0.16365929201128893, + "evidence": { + "divergence_kind": "js", + "max": 0.5209904680483264, + "num_prompts": 6, + "per_prompt": [ + 0.035233903048109916, + 0.16547128814672135, + 0.5209904680483264, + 0.037578388980321654, + 0.28977815672822294, + 0.2656513511901071 + ], + "raw_ci_95": [ + 0.09650551619626635, + 0.35946236070362286 + ], + "weight": 1.0, + "z_by_rank": { + "rank_1.00": 0.23362287829993433 + } + }, + "ft_value": null, + "kind": "delta_kl", + "message": "mean js=0.2191, z=+0.23\u03c3 vs null", + "name": "dk_fortran_qa_shaped", + "raw": 0.2191172593569682, + "score": 0.5194587403778158, + "verdict": "fail", + "z_score": 0.23362287829993433 + }, + { + "base_value": null, + "ci_95": [ + -0.04250307630931393, + 0.04526532776670737 + ], + "duration_s": 1.3934379169950262, + "evidence": { + "num_sections": 36, + "passing_frac": 0.4166666666666667, + "per_section": [ + { + "base_nll": 5.164129638671875, + "effective_sis": 0.18507939311104665, + "ft_nll": 2.3199871826171874, + "kind": "prose", + "leak_lift": 0.36567023074738, + "own_lift": 0.5507496238584266, + "passed": true, + "section_id": "2ae73af79acd2aa3", + "tag": "" + }, + { + "base_nll": 3.811819657035496, + "effective_sis": -0.3556689245978278, + "ft_nll": 3.7164399520210596, + "kind": "instruction", + "leak_lift": 0.3806910173504043, + "own_lift": 0.02502209275257649, + "passed": false, + "section_id": "2452de7d2dd5c3b4", + "tag": "" + }, + { + 
"base_nll": 4.136927462172234, + "effective_sis": 0.2418172898270216, + "ft_nll": 1.6303156710219109, + "kind": "instruction", + "leak_lift": 0.36409417806082517, + "own_lift": 0.6059114678878468, + "passed": true, + "section_id": "21bb9fbcaaa7a060", + "tag": "" + }, + { + "base_nll": 3.098089139755458, + "effective_sis": 0.050937016426319925, + "ft_nll": 1.7958587228435359, + "kind": "instruction", + "leak_lift": 0.36939640787751127, + "own_lift": 0.4203334243038312, + "passed": true, + "section_id": "f76ff274207e54ba", + "tag": "" + }, + { + "base_nll": 3.3260064697265626, + "effective_sis": -0.07481219193212046, + "ft_nll": 2.334599609375, + "kind": "instruction", + "leak_lift": 0.37288944144302355, + "own_lift": 0.2980772495109031, + "passed": false, + "section_id": "1fcb5d4cbb821529", + "tag": "" + }, + { + "base_nll": 3.056639161828446, + "effective_sis": -0.022327395419529494, + "ft_nll": 1.9895537912029109, + "kind": "instruction", + "leak_lift": 0.3714315304287849, + "own_lift": 0.3491041350092554, + "passed": false, + "section_id": "26055358b11a4b71", + "tag": "" + }, + { + "base_nll": 4.293870785361842, + "effective_sis": 0.16398709383659743, + "ft_nll": 2.017074906198602, + "kind": "instruction", + "leak_lift": 0.366256127949448, + "own_lift": 0.5302432217860454, + "passed": true, + "section_id": "5c2e655d5acab066", + "tag": "" + }, + { + "base_nll": 3.731144400203929, + "effective_sis": 0.12062639040683182, + "ft_nll": 1.9100213892319624, + "kind": "instruction", + "leak_lift": 0.3674605919336082, + "own_lift": 0.48808698234044, + "passed": true, + "section_id": "5b30c1a667e7f247", + "tag": "" + }, + { + "base_nll": 2.9113540649414062, + "effective_sis": -0.10372084294061601, + "ft_nll": 2.12537110157502, + "kind": "instruction", + "leak_lift": 0.37369245952659286, + "own_lift": 0.26997161658597685, + "passed": false, + "section_id": "7c4c6be29d6bb58b", + "tag": "" + }, + { + "base_nll": 3.886013966686321, + "effective_sis": 0.07231549642739837, + 
"ft_nll": 2.1718230337466835, + "kind": "instruction", + "leak_lift": 0.36880256121081467, + "own_lift": 0.44111805763821305, + "passed": true, + "section_id": "355dc55a9b922c2f", + "tag": "" + }, + { + "base_nll": 4.977588742278343, + "effective_sis": -0.010462636306369733, + "ft_nll": 3.1824745355650434, + "kind": "instruction", + "leak_lift": 0.37110195378675265, + "own_lift": 0.3606393174803829, + "passed": false, + "section_id": "591edca2bcb639df", + "tag": "" + }, + { + "base_nll": 4.4858864232113485, + "effective_sis": 0.0649295265511961, + "ft_nll": 2.5392931887978003, + "kind": "instruction", + "leak_lift": 0.3690077270407092, + "own_lift": 0.4339372535919053, + "passed": true, + "section_id": "dd845297627e6316", + "tag": "" + }, + { + "base_nll": 3.1663431020883412, + "effective_sis": 0.16946833433579728, + "ft_nll": 1.4705377432016227, + "kind": "instruction", + "leak_lift": 0.3661038712689147, + "own_lift": 0.535572205604712, + "passed": true, + "section_id": "f33fa262dde9e576", + "tag": "" + }, + { + "base_nll": 4.629479471842448, + "effective_sis": 0.12450598650031502, + "ft_nll": 2.35242919921875, + "kind": "instruction", + "leak_lift": 0.3673528253754559, + "own_lift": 0.4918588118757709, + "passed": true, + "section_id": "51b2d7bab71a2394", + "tag": "" + }, + { + "base_nll": 3.5608963639768834, + "effective_sis": 0.06661783229160151, + "ft_nll": 2.009845890410959, + "kind": "instruction", + "leak_lift": 0.36896082965903126, + "own_lift": 0.4355786619506328, + "passed": true, + "section_id": "a1471c8243e51656", + "tag": "" + }, + { + "base_nll": 3.46660628973269, + "effective_sis": 0.08009064584719167, + "ft_nll": 1.9112189797794117, + "kind": "instruction", + "leak_lift": 0.36858658483804263, + "own_lift": 0.4486772306852343, + "passed": true, + "section_id": "e4de1e9b6253d0fe", + "tag": "" + }, + { + "base_nll": 3.724906558082217, + "effective_sis": 0.0987929679005865, + "ft_nll": 1.9858965192522322, + "kind": "instruction", + "leak_lift": 
0.368067075892115, + "own_lift": 0.4668600437927015, + "passed": true, + "section_id": "3f45b9dc7adb5ae2", + "tag": "" + }, + { + "base_nll": 5.869542275705645, + "effective_sis": -0.10509210902484184, + "ft_nll": 4.2927575880481355, + "kind": "instruction", + "leak_lift": 0.3737305502511547, + "own_lift": 0.26863844122631286, + "passed": false, + "section_id": "49b016bc73973496", + "tag": "" + }, + { + "base_nll": 3.878383435701069, + "effective_sis": -0.2987410517286544, + "ft_nll": 3.566683049787555, + "kind": "instruction", + "leak_lift": 0.3791096875484828, + "own_lift": 0.08036863581982842, + "passed": false, + "section_id": "7e22432d60d48676", + "tag": "" + }, + { + "base_nll": 3.624024938364498, + "effective_sis": 0.19054229192462147, + "ft_nll": 1.6088468207687627, + "kind": "instruction", + "leak_lift": 0.365518483558114, + "own_lift": 0.5560607754827355, + "passed": true, + "section_id": "26f0a3de93f72693", + "tag": "" + }, + { + "base_nll": 4.089062410242417, + "effective_sis": 0.03307038220764158, + "ft_nll": 2.441321204690372, + "kind": "instruction", + "leak_lift": 0.3698927032724746, + "own_lift": 0.40296308548011617, + "passed": false, + "section_id": "068761bfd5a13bf0", + "tag": "" + }, + { + "base_nll": 4.3356565817808495, + "effective_sis": -0.04247155325080737, + "ft_nll": 2.9069730318509617, + "kind": "instruction", + "leak_lift": 0.3719910903685426, + "own_lift": 0.32951953711773524, + "passed": false, + "section_id": "c36538fa2d77f1dc", + "tag": "" + }, + { + "base_nll": 3.747733806741649, + "effective_sis": -0.04766004529346185, + "ft_nll": 2.5316872432314117, + "kind": "instruction", + "leak_lift": 0.37213521514750525, + "own_lift": 0.3244751698540434, + "passed": false, + "section_id": "8c64273ee598cdc8", + "tag": "" + }, + { + "base_nll": 3.1840599323141165, + "effective_sis": -0.043150407071232744, + "ft_nll": 2.136951446533203, + "kind": "instruction", + "leak_lift": 0.37200994741911, + "own_lift": 0.32885954034787723, + "passed": 
false, + "section_id": "22844ad1e6cc21b5", + "tag": "" + }, + { + "base_nll": 3.509236952837776, + "effective_sis": 0.12555988804335655, + "ft_nll": 1.7795921774471508, + "kind": "instruction", + "leak_lift": 0.3673235503325936, + "own_lift": 0.49288343837595017, + "passed": true, + "section_id": "ca4042aec6060b82", + "tag": "" + }, + { + "base_nll": 3.8464840748271003, + "effective_sis": 0.021109981847880432, + "ft_nll": 2.341220543032787, + "kind": "instruction", + "leak_lift": 0.37022493661580125, + "own_lift": 0.3913349184636817, + "passed": false, + "section_id": "6d1105d942e21fe7", + "tag": "" + }, + { + "base_nll": 4.2444513956705725, + "effective_sis": -0.11153607984572833, + "ft_nll": 3.1308199564615884, + "kind": "instruction", + "leak_lift": 0.3739095494406237, + "own_lift": 0.2623734695948954, + "passed": false, + "section_id": "b2d3d820f1c1c345", + "tag": "" + }, + { + "base_nll": 3.5749533971150718, + "effective_sis": -0.17598699916513005, + "ft_nll": 2.860989252726237, + "kind": "instruction", + "leak_lift": 0.3756998527550516, + "own_lift": 0.19971285358992152, + "passed": false, + "section_id": "2a8f4c8330dc4df5", + "tag": "" + }, + { + "base_nll": 3.0795318603515627, + "effective_sis": -0.19925457730258353, + "ft_nll": 2.5341726449819713, + "kind": "instruction", + "leak_lift": 0.37634617436998086, + "own_lift": 0.17709159706739733, + "passed": false, + "section_id": "b176cf46a20e0714", + "tag": "" + }, + { + "base_nll": 3.2099117449860075, + "effective_sis": -0.03496741567277334, + "ft_nll": 2.128764593779151, + "kind": "instruction", + "leak_lift": 0.37178264210248607, + "own_lift": 0.33681522642971273, + "passed": false, + "section_id": "8169b4e2d63784ec", + "tag": "" + }, + { + "base_nll": 4.230460788892663, + "effective_sis": -0.20378740910551404, + "ft_nll": 3.49992503290591, + "kind": "instruction", + "leak_lift": 0.3764720863645067, + "own_lift": 0.17268467725899264, + "passed": false, + "section_id": "d6008217fa164da7", + "tag": "" + }, + 
{ + "base_nll": 3.511493025154903, + "effective_sis": 0.04586452455157064, + "ft_nll": 2.0528123789820176, + "kind": "instruction", + "leak_lift": 0.36953731042958765, + "own_lift": 0.4154018349811583, + "passed": false, + "section_id": "f83dc98d1b6ddcae", + "tag": "" + }, + { + "base_nll": 3.3412074788411457, + "effective_sis": -0.011300414593640506, + "ft_nll": 2.1389581298828126, + "kind": "instruction", + "leak_lift": 0.37112522540584353, + "own_lift": 0.359824810812203, + "passed": false, + "section_id": "668c04d7ae53e8fe", + "tag": "" + }, + { + "base_nll": 3.6240665496341764, + "effective_sis": -0.2049958633403854, + "ft_nll": 3.002503652421255, + "kind": "instruction", + "leak_lift": 0.3765056545376976, + "own_lift": 0.1715097911973122, + "passed": false, + "section_id": "64606c5209a76c28", + "tag": "" + }, + { + "base_nll": 3.727417701898619, + "effective_sis": 0.008367172648670262, + "ft_nll": 2.314927389455396, + "kind": "instruction", + "leak_lift": 0.37057890353800155, + "own_lift": 0.3789460761866718, + "passed": false, + "section_id": "16f25169869c5c6f", + "tag": "" + }, + { + "base_nll": 4.547536041777013, + "effective_sis": 0.18225370190557222, + "ft_nll": 2.0554752673132946, + "kind": "instruction", + "leak_lift": 0.3657487221697543, + "own_lift": 0.5480024240753265, + "passed": true, + "section_id": "0f34524d32acc1b6", + "tag": "" + } + ], + "per_section_threshold": 0.05, + "raw_ci_95": [ + -0.04250307630931393, + 0.04526532776670737 + ], + "weight": 1.0, + "z_by_rank": { + "rank_1.00": 4.651026541782724e-12 + } + }, + "ft_value": null, + "kind": "section_internalization", + "message": "15/36 sections cleared; mean effective_sis=+0.000, z=+0.00\u03c3 vs null", + "name": "sis_fortran", + "raw": 4.625929269271485e-18, + "score": 0.5000000000003876, + "verdict": "fail", + "z_score": 4.651026541782724e-12 + }, + { + "base_value": null, + "ci_95": null, + "duration_s": 2.9580260161310434e-06, + "evidence": {}, + "ft_value": null, + "kind": 
"paraphrase_invariance", + "message": "no cases provided", + "name": "para_fortran", + "raw": null, + "score": null, + "verdict": "error", + "z_score": null + }, + { + "base_value": null, + "ci_95": [ + 0.03139013452914798, + 0.03139013452914798 + ], + "duration_s": 18.9565314159845, + "evidence": { + "mean_clean_recall": 0.03139013452914798, + "mean_fragility": 0.0, + "mean_perturbed_recall": 0.0407330191075767, + "per_section": [ + { + "clean_recall": 0.03139013452914798, + "fragility": 0.0, + "perturbed_recall": 0.0407330191075767, + "section_id": "2ae73af79acd2aa3" + } + ], + "raw_ci_95": [ + 0.03139013452914798, + 0.03139013452914798 + ], + "weight": 1.0, + "z_by_rank": { + "rank_1.00": 2.040322695370548 + } + }, + "ft_value": 0.0, + "kind": "leakage", + "message": "greedy_recall=0.03 (perturbed=0.04, fragility=0.00), z=+2.04\u03c3 vs null", + "name": "leak_fortran", + "raw": 0.03139013452914798, + "score": 0.6637627044025881, + "verdict": "pass", + "z_score": 2.040322695370548 + }, + { + "base_value": null, + "ci_95": [ + 0.04, + 0.22 + ], + "duration_s": 3.106611958995927, + "evidence": { + "fraction_regressed": 0.12, + "mean_delta_nats": -0.17035713793127796, + "raw_ci_95": [ + 0.04, + 0.22 + ], + "regressed_count": 6, + "regression_nats_threshold": 1.0, + "total_items": 50, + "weight": 1.0, + "worst_offenders": [ + { + "delta": -4.315955400466919, + "gold": " Europe", + "prompt": "The Mediterranean Sea borders southern" + }, + { + "delta": -1.7071008682250977, + "gold": " China", + "prompt": "Mount Everest is located on the border of Nepal and" + }, + { + "delta": -1.4701178073883057, + "gold": " Brasilia", + "prompt": "The capital of Brazil is" + }, + { + "delta": -1.428731918334961, + "gold": " Sun", + "prompt": "The closest star to Earth is the" + }, + { + "delta": -1.321993350982666, + "gold": " Asia", + "prompt": "The largest continent is" + } + ], + "z_by_rank": null + }, + "ft_value": -0.17035713793127796, + "kind": "calibration_drift", + "message": 
"6/50 items regressed >1.0 nats (frac=12.0%), mean_delta=-0.170 nats/tok (no calibration for calibration_drift)", + "name": "cal_general", + "raw": 0.12, + "score": 0.34123809655165915, + "verdict": "pass", + "z_score": null + }, + { + "base_value": null, + "ci_95": null, + "duration_s": 1.0620087500137743, + "evidence": { + "lambdas": [ + 0.0, + 0.25, + 0.5, + 0.75, + 1.0, + 1.25 + ], + "linearity": 0.9103879862304414, + "mean_divergence_per_lambda": [ + 0.0, + 0.024096238486922095, + 0.06339058010139965, + 0.12162927505983301, + 0.16378600880464075, + 0.3086516331232216 + ], + "overshoot": 1.8844810700001389, + "passed_linearity": true, + "passed_overshoot": true, + "passed_saturation": false, + "saturation_lambda": 1.25, + "saturation_reason": "found", + "weight": 1.0, + "z_by_rank": null + }, + "ft_value": null, + "kind": "adapter_ablation", + "message": "R\u00b2=0.91, sat_\u03bb=1.25 (out of band), overshoot=1.88 (no calibration for adapter_ablation)", + "name": "abl_fortran", + "raw": 0.9103879862304414, + "score": 0.79, + "verdict": "fail", + "z_score": null + } + ], + "schema_version": 1, + "score": { + "band": "partial", + "components": { + "ablation": 0.79, + "adherence": 0.5194587403778158, + "attribution": 0.5000000000003876, + "baseline": 1.0, + "calibration": 0.5025004004771236 + }, + "findings": [ + "dk_fortran_qa_shaped (delta_kl) failed: mean js=0.2191, z=+0.23\u03c3 vs null", + "1 probe(s) errored \u2014 see full report for details" + ], + "overall": 0.5498377022089052, + "weights": { + "ablation": 0.15, + "adherence": 0.3, + "attribution": 0.35, + "baseline": 0.0, + "calibration": 0.2 + } + }, + "spec_path": "/Users/mfwolffe/GithubOrgs/tenseleyFlow/DocumentLanguageModel/docs/audits/13-followup/finding05-smol-qa/sway.yaml", + "started_at": "2026-04-30T23:43:40.737655+00:00", + "sway_version": "0.1.0", + "wall_seconds": 73.621905 +} \ No newline at end of file diff --git a/docs/audits/13-followup/finding05-smol-qa/sway-results.md 
b/docs/audits/13-followup/finding05-smol-qa/sway-results.md new file mode 100644 index 0000000..6ddf710 --- /dev/null +++ b/docs/audits/13-followup/finding05-smol-qa/sway-results.md @@ -0,0 +1,41 @@ +# sway report + +**Overall:** 0.55 (`partial`) +**Base:** `HuggingFaceTB/SmolLM2-135M-Instruct` +**Adapter:** `adapter/versions/v0001` +**Wall:** 73.6s +**Determinism:** `best_effort` (seed=0) +**Backend:** cache: 180/558 = 32% | batches: 6 (avg=4.7) + +## Components + +| category | score | weight | | +|---|---:|---:|---| +| adherence | 0.52 | 0.30 | | +| attribution | 0.50 | 0.35 | | +| calibration | 0.50 | 0.20 | | +| ablation | 0.79 | 0.15 | | +| baseline | 1.00 | 0.00 | (informational, weight=0) | + +## Probes + +| name | kind | verdict | score | raw | ci95 | z | duration | note | +|---|---|---|---:|---:|---:|---:|---:|---| +| null_baseline | `null_adapter` | pass | 1.00 | — | — | — | 48.7s | null calibration: 5 kinds calibrated over 3 seeds (1 opted out) | +| dk_fortran_qa_shaped | `delta_kl` | fail | 0.52 | 0.219 | [0.097, 0.359] | +0.23σ | 0.16s | mean js=0.2191, z=+0.23σ vs null | +| sis_fortran | `section_internalization` | fail | 0.50 | 0.000 | [-0.043, 0.045] | +0.00σ | 1.39s | 15/36 sections cleared; mean effective_sis=+0.000, z=+0.00σ vs null | +| para_fortran | `paraphrase_invariance` | error | — | — | — | — | 0.00s | no cases provided | +| leak_fortran | `leakage` | pass | 0.66 | 0.031 | [0.031, 0.031] | +2.04σ | 19.0s | greedy_recall=0.03 (perturbed=0.04, fragility=0.00), z=+2.04σ vs null | +| cal_general | `calibration_drift` | pass | 0.34 | 0.120 | [0.040, 0.220] | — | 3.11s | 6/50 items regressed >1.0 nats (frac=12.0%), mean_delta=-0.170 nats/tok (no calibration for calibration_drift) | +| abl_fortran | `adapter_ablation` | fail | 0.79 | 0.910 | — | — | 1.06s | R²=0.91, sat_λ=1.25 (out of band), overshoot=1.88 (no calibration for adapter_ablation) | + +## Top findings + +- dk_fortran_qa_shaped (delta_kl) failed: mean js=0.2191, z=+0.23σ vs null +- 1 
probe(s) errored — see full report for details + +## Degenerate null calibration + +1 probe kind(s) ran null_adapter but the resulting baseline was too narrow for z-scoring (std ≈ 0, typically `runs: 1` or coincidentally-matched seeds). Fix: bump `runs:` in the `null_adapter` spec entry. Affected kinds: + +- `calibration_drift` diff --git a/docs/audits/13-followup/findings/05-smol-qa-shape.md b/docs/audits/13-followup/findings/05-smol-qa-shape.md index 25b6e0f..51ba215 100644 --- a/docs/audits/13-followup/findings/05-smol-qa-shape.md +++ b/docs/audits/13-followup/findings/05-smol-qa-shape.md @@ -42,6 +42,30 @@ endpoint eval is 50× higher; held-out token-acc is 35 percentage points lower. The base can memorize the train pairs but lacks the parameter capacity to generalize the patterns. +## sway results + +Composite **0.55 (`partial`)**. Full report at +[`finding05-smol-qa/sway-results.md`](../finding05-smol-qa/sway-results.md). + +| probe | verdict | z | reading | +|---|---|---:|---| +| `dk_fortran_qa_shaped` | **FAIL** | +0.23σ | adapter shift on Q/A prompts is noise-level; the Q/A-shape recipe didn't even teach smol to *fire* on Q/A-shaped inputs | +| `sis_fortran` | **FAIL** | +0.00σ | 15/36 sections cleared; **no per-section internalization signal whatsoever** | +| `para_fortran` | ERROR | — | sway bridge "no cases provided" — separate sway issue | +| `leak_fortran` | PASS | +2.04σ | greedy_recall=0.03, fragility=0.00 — textbook memorization fingerprint | +| `cal_general` | pass* | — | **12% items regressed >1 nat** (was 26% in finding 02); recipe-shape fix mitigates forgetting but doesn't eliminate it | +| `abl_fortran` | **FAIL** | — | R²=0.91 linearity, but **overshoot=1.88, sat_λ=1.25 out of band** — adapter is in pathological "more is more" territory, not converged at a coherent minimum | + +*`cal_general` formally passed because its null-baseline std collapsed +to zero (1 of the 3 seeds duplicated), but the raw 12% regression is +the load-bearing signal — 
that's 6/50 general-competence items broken +by the LoRA on a model that started at 100%. + +That the adherence and attribution probes are both at noise level (z ≈ 0) is +the cleanest signal here: **the adapter doesn't reliably activate on +the trained input shape** despite 95% train token-acc. That's the +parameter-capacity bottleneck on display. + ## Direct-query smoke Full transcripts at From 88e989b225d911da669fed3187c450cebd6932e2 Mon Sep 17 00:00:00 2001 From: mfwolffe Date: Sat, 2 May 2026 23:09:25 -0400 Subject: [PATCH 43/46] Wrap vllm + mlx-serve target export pipelines in dlm.export.entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lifts each server target's prepare → smoke → finalize chain out of the CLI into a typed dispatcher. CLI just builds a Request, calls the runner, and renders. Smoke failure surfaces as a populated 'smoke' field with ok=False (and manifest_path=None), so the CLI keeps full control of exit codes. Dotted import of dlm.export.targets keeps existing test fixture monkeypatches visible at call time. 
--- src/dlm/cli/commands/export.py | 95 ++++++++++++------------- src/dlm/export/entry.py | 124 +++++++++++++++++++++++++++++++++ 2 files changed, 169 insertions(+), 50 deletions(-) create mode 100644 src/dlm/export/entry.py diff --git a/src/dlm/cli/commands/export.py b/src/dlm/cli/commands/export.py index 2ef6b41..f160ced 100644 --- a/src/dlm/cli/commands/export.py +++ b/src/dlm/cli/commands/export.py @@ -146,6 +146,12 @@ def export_cmd( resolve_export_plan, run_export, ) + from dlm.export.entry import ( + MlxServeExportRequest, + VllmExportRequest, + run_mlx_serve_target_export, + run_vllm_target_export, + ) from dlm.export.ollama import ( OllamaBinaryNotFoundError, OllamaCreateError, @@ -154,14 +160,7 @@ def export_cmd( OllamaVersionError, ) from dlm.export.quantize import run_checked - from dlm.export.targets import ( - finalize_mlx_serve_export, - finalize_vllm_export, - prepare_llama_server_export, - prepare_mlx_serve_export, - prepare_vllm_export, - resolve_target, - ) + from dlm.export.targets import prepare_llama_server_export, resolve_target from dlm.store.paths import for_dlm console = Console(stderr=True) @@ -407,44 +406,42 @@ def _verbose_runner(cmd: Sequence[str]) -> object: declared_adapter_names = tuple(adapters_declared.keys()) if adapters_declared else None try: - vllm_result = prepare_vllm_export( - store=store, - spec=spec, - served_model_name=name or f"dlm-{parsed.frontmatter.dlm_id.lower()}", - training_sequence_len=parsed.frontmatter.training.sequence_len, - adapter_name=adapter, - adapter_path_override=adapter_path_override, - declared_adapter_names=declared_adapter_names, + vllm_outcome = run_vllm_target_export( + VllmExportRequest( + target=resolved_target, + store=store, + spec=spec, + served_model_name=name or f"dlm-{parsed.frontmatter.dlm_id.lower()}", + training_sequence_len=parsed.frontmatter.training.sequence_len, + adapter_name=adapter, + adapter_path_override=adapter_path_override, + declared_adapter_names=declared_adapter_names, + 
adapter_mix=mix_entries, + no_smoke=no_smoke, + ) ) except ExportError as exc: console.print(f"[red]export:[/red] {exc}") raise typer.Exit(code=1) from exc - vllm_smoke = None if no_smoke else resolved_target.smoke_test(vllm_result) - if vllm_smoke is not None and not vllm_smoke.ok: + if vllm_outcome.smoke is not None and not vllm_outcome.smoke.ok: console.print( - f"[red]smoke:[/red] {vllm_smoke.detail}\n" + f"[red]smoke:[/red] {vllm_outcome.smoke.detail}\n" " re-run with `--no-smoke` to skip the smoke test." ) raise typer.Exit(code=1) - manifest_path = finalize_vllm_export( - store=store, - spec=spec, - prepared=vllm_result, - smoke_output_first_line=None if vllm_smoke is None else vllm_smoke.detail, - adapter_name=adapter, - adapter_mix=mix_entries, - ) + vllm_result = vllm_outcome.prepared + assert vllm_outcome.manifest_path is not None console.print(f"[green]exported:[/green] {vllm_result.export_dir}") console.print("target: vllm") assert vllm_result.launch_script_path is not None assert vllm_result.config_path is not None console.print(f"launch: {vllm_result.launch_script_path.name}") console.print(f"config: {vllm_result.config_path.name}") - console.print(f"manifest: {manifest_path.name}") - if vllm_smoke is not None and vllm_smoke.detail: - console.print(f"smoke: {vllm_smoke.detail}") + console.print(f"manifest: {vllm_outcome.manifest_path.name}") + if vllm_outcome.smoke is not None and vllm_outcome.smoke.detail: + console.print(f"smoke: {vllm_outcome.smoke.detail}") return if resolved_target.name == "mlx-serve": @@ -475,40 +472,38 @@ def _verbose_runner(cmd: Sequence[str]) -> object: declared_adapter_names = tuple(adapters_declared.keys()) if adapters_declared else None try: - mlx_serve_result = prepare_mlx_serve_export( - store=store, - spec=spec, - adapter_name=adapter, - adapter_path_override=adapter_path_override, - declared_adapter_names=declared_adapter_names, + mlx_outcome = run_mlx_serve_target_export( + MlxServeExportRequest( + 
target=resolved_target, + store=store, + spec=spec, + adapter_name=adapter, + adapter_path_override=adapter_path_override, + declared_adapter_names=declared_adapter_names, + adapter_mix=mix_entries, + no_smoke=no_smoke, + ) ) except ExportError as exc: console.print(f"[red]export:[/red] {exc}") raise typer.Exit(code=1) from exc - mlx_serve_smoke = None if no_smoke else resolved_target.smoke_test(mlx_serve_result) - if mlx_serve_smoke is not None and not mlx_serve_smoke.ok: + if mlx_outcome.smoke is not None and not mlx_outcome.smoke.ok: console.print( - f"[red]smoke:[/red] {mlx_serve_smoke.detail}\n" + f"[red]smoke:[/red] {mlx_outcome.smoke.detail}\n" " re-run with `--no-smoke` to skip the smoke test." ) raise typer.Exit(code=1) - manifest_path = finalize_mlx_serve_export( - store=store, - spec=spec, - prepared=mlx_serve_result, - smoke_output_first_line=None if mlx_serve_smoke is None else mlx_serve_smoke.detail, - adapter_name=adapter, - adapter_mix=mix_entries, - ) + mlx_serve_result = mlx_outcome.prepared + assert mlx_outcome.manifest_path is not None console.print(f"[green]exported:[/green] {mlx_serve_result.export_dir}") console.print("target: mlx-serve") assert mlx_serve_result.launch_script_path is not None console.print(f"launch: {mlx_serve_result.launch_script_path.name}") - console.print(f"manifest: {manifest_path.name}") - if mlx_serve_smoke is not None and mlx_serve_smoke.detail: - console.print(f"smoke: {mlx_serve_smoke.detail}") + console.print(f"manifest: {mlx_outcome.manifest_path.name}") + if mlx_outcome.smoke is not None and mlx_outcome.smoke.detail: + console.print(f"smoke: {mlx_outcome.smoke.detail}") return try: diff --git a/src/dlm/export/entry.py b/src/dlm/export/entry.py new file mode 100644 index 0000000..53bc96d --- /dev/null +++ b/src/dlm/export/entry.py @@ -0,0 +1,124 @@ +"""Per-target export orchestration. 
+ +Wraps the prepare → smoke → finalize chain for the OpenAI-compat server +targets (vLLM, MLX-serve) so the CLI doesn't repeat the same plumbing +for each. Returns typed results the CLI renders; smoke failure surfaces +as a populated `smoke` field with `ok=False`, leaving `manifest_path` +unset, so the CLI can decide its own exit code without the dispatcher +making control-flow choices. + +External-module imports are dotted (e.g. `from dlm.export import +targets as _targets; _targets.prepare_vllm_export(...)`) so test +fixtures that monkeypatch `dlm.export.targets.*` resolve at call +time. +""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import TYPE_CHECKING + +from dlm.export import targets as _targets + +if TYPE_CHECKING: + from pathlib import Path + + from dlm.base_models.schema import BaseModelSpec + from dlm.export.targets.base import ExportTarget, SmokeResult, TargetResult + from dlm.store.paths import StorePath + + +@dataclass(frozen=True) +class VllmExportRequest: + """Inputs to `run_vllm_target_export`.""" + + target: ExportTarget + store: StorePath + spec: BaseModelSpec + served_model_name: str + training_sequence_len: int | None + adapter_name: str | None + adapter_path_override: Path | None + declared_adapter_names: tuple[str, ...] | None + adapter_mix: list[tuple[str, float]] | None + no_smoke: bool + + +@dataclass(frozen=True) +class MlxServeExportRequest: + """Inputs to `run_mlx_serve_target_export`.""" + + target: ExportTarget + store: StorePath + spec: BaseModelSpec + adapter_name: str | None + adapter_path_override: Path | None + declared_adapter_names: tuple[str, ...] | None + adapter_mix: list[tuple[str, float]] | None + no_smoke: bool + + +@dataclass(frozen=True) +class ServerTargetExportResult: + """Outcome of a server-target export. + + `manifest_path` is `None` when smoke failed (finalize was skipped); + the CLI surfaces a smoke-failure exit in that case. 
`smoke` is `None` + when `--no-smoke` was set. + """ + + prepared: TargetResult + smoke: SmokeResult | None + manifest_path: Path | None + + +def run_vllm_target_export(req: VllmExportRequest) -> ServerTargetExportResult: + """Stage vLLM artifacts, smoke-test the server, then finalize the manifest.""" + prepared = _targets.prepare_vllm_export( + store=req.store, + spec=req.spec, + served_model_name=req.served_model_name, + training_sequence_len=req.training_sequence_len, + adapter_name=req.adapter_name, + adapter_path_override=req.adapter_path_override, + declared_adapter_names=req.declared_adapter_names, + ) + + smoke = None if req.no_smoke else req.target.smoke_test(prepared) + if smoke is not None and not smoke.ok: + return ServerTargetExportResult(prepared=prepared, smoke=smoke, manifest_path=None) + + manifest_path = _targets.finalize_vllm_export( + store=req.store, + spec=req.spec, + prepared=prepared, + smoke_output_first_line=None if smoke is None else smoke.detail, + adapter_name=req.adapter_name, + adapter_mix=req.adapter_mix, + ) + return ServerTargetExportResult(prepared=prepared, smoke=smoke, manifest_path=manifest_path) + + +def run_mlx_serve_target_export(req: MlxServeExportRequest) -> ServerTargetExportResult: + """Stage MLX-serve artifacts, smoke-test the server, then finalize the manifest.""" + prepared = _targets.prepare_mlx_serve_export( + store=req.store, + spec=req.spec, + adapter_name=req.adapter_name, + adapter_path_override=req.adapter_path_override, + declared_adapter_names=req.declared_adapter_names, + ) + + smoke = None if req.no_smoke else req.target.smoke_test(prepared) + if smoke is not None and not smoke.ok: + return ServerTargetExportResult(prepared=prepared, smoke=smoke, manifest_path=None) + + manifest_path = _targets.finalize_mlx_serve_export( + store=req.store, + spec=req.spec, + prepared=prepared, + smoke_output_first_line=None if smoke is None else smoke.detail, + adapter_name=req.adapter_name, + adapter_mix=req.adapter_mix, + 
) + return ServerTargetExportResult(prepared=prepared, smoke=smoke, manifest_path=manifest_path) From 3092c524c74416f0b5754be97ec152bfc3b4b02b Mon Sep 17 00:00:00 2001 From: mfwolffe Date: Sat, 2 May 2026 23:17:43 -0400 Subject: [PATCH 44/46] Wrap llama-server post-export prep + smoke in dlm.export.entry Lifts the adapter-dir resolution + prepare_llama_server_export + smoke chain out of the CLI's llama-server branch. CLI just builds a LlamaServerPostExportRequest, calls run_llama_server_post_export, and renders the typed result. VendoringError + ExportError still propagate to the CLI for target-specific banner formatting. --- src/dlm/cli/commands/export.py | 32 ++++++++++---------- src/dlm/export/entry.py | 53 ++++++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+), 16 deletions(-) diff --git a/src/dlm/cli/commands/export.py b/src/dlm/cli/commands/export.py index f160ced..9ea7cf6 100644 --- a/src/dlm/cli/commands/export.py +++ b/src/dlm/cli/commands/export.py @@ -147,8 +147,10 @@ def export_cmd( run_export, ) from dlm.export.entry import ( + LlamaServerPostExportRequest, MlxServeExportRequest, VllmExportRequest, + run_llama_server_post_export, run_mlx_serve_target_export, run_vllm_target_export, ) @@ -160,7 +162,7 @@ def export_cmd( OllamaVersionError, ) from dlm.export.quantize import run_checked - from dlm.export.targets import prepare_llama_server_export, resolve_target + from dlm.export.targets import resolve_target from dlm.store.paths import for_dlm console = Console(stderr=True) @@ -568,21 +570,18 @@ def _verbose_runner(cmd: Sequence[str]) -> object: raise typer.Exit(code=1) from exc if resolved_target.name == "llama-server": - adapter_dir = adapter_path_override - if adapter_dir is None: - if adapter is None: - adapter_dir = store.resolve_current_adapter() - else: - adapter_dir = store.resolve_current_adapter_for(adapter) - assert adapter_dir is not None try: - llama_server_result = prepare_llama_server_export( - export_dir=result.export_dir, - 
manifest_path=result.manifest_path, - artifacts=result.artifacts, - adapter_dir=adapter_dir, - spec=spec, - training_sequence_len=parsed.frontmatter.training.sequence_len, + llama_server_outcome = run_llama_server_post_export( + LlamaServerPostExportRequest( + target=resolved_target, + store=store, + spec=spec, + base_export=result, + adapter_name=adapter, + adapter_path_override=adapter_path_override, + training_sequence_len=parsed.frontmatter.training.sequence_len, + no_smoke=no_smoke, + ) ) except VendoringError as exc: console.print( @@ -594,7 +593,8 @@ def _verbose_runner(cmd: Sequence[str]) -> object: except ExportError as exc: console.print(f"[red]export:[/red] {exc}") raise typer.Exit(code=1) from exc - llama_server_smoke = None if no_smoke else resolved_target.smoke_test(llama_server_result) + llama_server_result = llama_server_outcome.prepared + llama_server_smoke = llama_server_outcome.smoke if llama_server_smoke is not None and not llama_server_smoke.ok: console.print( f"[red]smoke:[/red] {llama_server_smoke.detail}\n" diff --git a/src/dlm/export/entry.py b/src/dlm/export/entry.py index 53bc96d..f6878e8 100644 --- a/src/dlm/export/entry.py +++ b/src/dlm/export/entry.py @@ -24,6 +24,7 @@ from pathlib import Path from dlm.base_models.schema import BaseModelSpec + from dlm.export.runner import ExportResult from dlm.export.targets.base import ExportTarget, SmokeResult, TargetResult from dlm.store.paths import StorePath @@ -122,3 +123,55 @@ def run_mlx_serve_target_export(req: MlxServeExportRequest) -> ServerTargetExpor adapter_mix=req.adapter_mix, ) return ServerTargetExportResult(prepared=prepared, smoke=smoke, manifest_path=manifest_path) + + +@dataclass(frozen=True) +class LlamaServerPostExportRequest: + """Inputs to `run_llama_server_post_export`. + + `base_export` is the `ExportResult` returned by `run_export(target= + "llama-server")`; the dispatcher resolves the adapter dir, stages + the launch artifacts, and runs the smoke test on top of it. 
+ """ + + target: ExportTarget + store: StorePath + spec: BaseModelSpec + base_export: ExportResult + adapter_name: str | None + adapter_path_override: Path | None + training_sequence_len: int | None + no_smoke: bool + + +@dataclass(frozen=True) +class LlamaServerPostExportResult: + """Outcome of `run_llama_server_post_export`. `smoke` is `None` + when `--no-smoke` was set.""" + + prepared: TargetResult + smoke: SmokeResult | None + + +def run_llama_server_post_export( + req: LlamaServerPostExportRequest, +) -> LlamaServerPostExportResult: + """Resolve adapter dir, stage llama-server artifacts, then smoke-test.""" + adapter_dir = req.adapter_path_override + if adapter_dir is None: + if req.adapter_name is None: + adapter_dir = req.store.resolve_current_adapter() + else: + adapter_dir = req.store.resolve_current_adapter_for(req.adapter_name) + assert adapter_dir is not None + + prepared = _targets.prepare_llama_server_export( + export_dir=req.base_export.export_dir, + manifest_path=req.base_export.manifest_path, + artifacts=req.base_export.artifacts, + adapter_dir=adapter_dir, + spec=req.spec, + training_sequence_len=req.training_sequence_len, + ) + smoke = None if req.no_smoke else req.target.smoke_test(prepared) + return LlamaServerPostExportResult(prepared=prepared, smoke=smoke) From b8d14476e274dc787662a332a3f6df4024cccfe6 Mon Sep 17 00:00:00 2001 From: mfwolffe Date: Sun, 3 May 2026 16:54:16 -0400 Subject: [PATCH 45/46] Add direct unit tests for new dispatcher modules Each new dispatcher module now has a tests/unit/ peer that drives its branches directly, so the per-package coverage gates (store, train, inference, export) stay at 100% without depending on CLI tests' indirect coverage. Modules covered: dlm.inference.dispatch, dlm.train.dispatch, dlm.store.bootstrap, dlm.store.show, dlm.export.entry. 
--- tests/unit/export/test_entry.py | 306 ++++++++++++++++++++ tests/unit/inference/test_dispatch.py | 69 +++++ tests/unit/store/test_bootstrap.py | 131 +++++++++ tests/unit/store/test_show.py | 394 ++++++++++++++++++++++++++ tests/unit/train/test_dispatch.py | 207 ++++++++++++++ 5 files changed, 1107 insertions(+) create mode 100644 tests/unit/export/test_entry.py create mode 100644 tests/unit/inference/test_dispatch.py create mode 100644 tests/unit/store/test_bootstrap.py create mode 100644 tests/unit/store/test_show.py create mode 100644 tests/unit/train/test_dispatch.py diff --git a/tests/unit/export/test_entry.py b/tests/unit/export/test_entry.py new file mode 100644 index 0000000..5be8a36 --- /dev/null +++ b/tests/unit/export/test_entry.py @@ -0,0 +1,306 @@ +"""Direct tests for `dlm.export.entry` (per-target dispatcher).""" + +from __future__ import annotations + +from pathlib import Path +from types import SimpleNamespace +from typing import Any + +import pytest + +from dlm.export.entry import ( + LlamaServerPostExportRequest, + LlamaServerPostExportResult, + MlxServeExportRequest, + ServerTargetExportResult, + VllmExportRequest, + run_llama_server_post_export, + run_mlx_serve_target_export, + run_vllm_target_export, +) + + +def _make_target(smoke_ok: bool, smoke_detail: str = "smoke ok") -> Any: + """A fake ExportTarget with a configurable smoke result.""" + smoke_calls: list[Any] = [] + + def _smoke_test(prepared: Any) -> Any: + smoke_calls.append(prepared) + return SimpleNamespace(attempted=True, ok=smoke_ok, detail=smoke_detail) + + target = SimpleNamespace(name="vllm", smoke_test=_smoke_test) + target.smoke_calls = smoke_calls # type: ignore[attr-defined] + return target + + +def _vllm_request( + *, + target: Any, + store: Any = None, + spec: Any = None, + no_smoke: bool = False, +) -> VllmExportRequest: + return VllmExportRequest( + target=target, + store=store or SimpleNamespace(), + spec=spec or SimpleNamespace(), + served_model_name="dlm-test", + 
training_sequence_len=2048, + adapter_name=None, + adapter_path_override=None, + declared_adapter_names=None, + adapter_mix=None, + no_smoke=no_smoke, + ) + + +def _mlx_request( + *, + target: Any, + store: Any = None, + spec: Any = None, + no_smoke: bool = False, +) -> MlxServeExportRequest: + return MlxServeExportRequest( + target=target, + store=store or SimpleNamespace(), + spec=spec or SimpleNamespace(), + adapter_name=None, + adapter_path_override=None, + declared_adapter_names=None, + adapter_mix=None, + no_smoke=no_smoke, + ) + + +def test_run_vllm_target_export_finalizes_on_smoke_ok( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + prepared = SimpleNamespace(name="vllm", export_dir=tmp_path) + monkeypatch.setattr("dlm.export.targets.prepare_vllm_export", lambda **kwargs: prepared) + finalize_calls: dict[str, object] = {} + + def _fake_finalize(**kwargs: object) -> Path: + finalize_calls.update(kwargs) + return tmp_path / "manifest.json" + + monkeypatch.setattr("dlm.export.targets.finalize_vllm_export", _fake_finalize) + + target = _make_target(smoke_ok=True) + result = run_vllm_target_export(_vllm_request(target=target)) + + assert isinstance(result, ServerTargetExportResult) + assert result.prepared is prepared + assert result.smoke is not None + assert result.smoke.ok is True + assert result.manifest_path == tmp_path / "manifest.json" + assert finalize_calls["smoke_output_first_line"] == "smoke ok" + + +def test_run_vllm_target_export_skips_finalize_on_smoke_failure( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + prepared = SimpleNamespace(export_dir=tmp_path) + monkeypatch.setattr("dlm.export.targets.prepare_vllm_export", lambda **kwargs: prepared) + finalize_called: list[bool] = [] + monkeypatch.setattr( + "dlm.export.targets.finalize_vllm_export", + lambda **kwargs: finalize_called.append(True), + ) + + target = _make_target(smoke_ok=False, smoke_detail="vllm broke") + result = 
run_vllm_target_export(_vllm_request(target=target)) + + assert result.manifest_path is None + assert result.smoke is not None + assert result.smoke.ok is False + assert result.smoke.detail == "vllm broke" + assert finalize_called == [] + + +def test_run_vllm_target_export_skips_smoke_with_no_smoke( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + prepared = SimpleNamespace(export_dir=tmp_path) + monkeypatch.setattr("dlm.export.targets.prepare_vllm_export", lambda **kwargs: prepared) + + finalize_calls: dict[str, object] = {} + + def _fake_finalize(**kwargs: object) -> Path: + finalize_calls.update(kwargs) + return tmp_path / "manifest.json" + + monkeypatch.setattr("dlm.export.targets.finalize_vllm_export", _fake_finalize) + + target = _make_target(smoke_ok=True) # ignored when no_smoke=True + result = run_vllm_target_export(_vllm_request(target=target, no_smoke=True)) + + assert result.smoke is None + assert result.manifest_path == tmp_path / "manifest.json" + assert finalize_calls["smoke_output_first_line"] is None + assert target.smoke_calls == [] + + +def test_run_mlx_serve_target_export_finalizes_on_smoke_ok( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + prepared = SimpleNamespace(export_dir=tmp_path) + monkeypatch.setattr("dlm.export.targets.prepare_mlx_serve_export", lambda **kwargs: prepared) + monkeypatch.setattr( + "dlm.export.targets.finalize_mlx_serve_export", + lambda **kwargs: tmp_path / "manifest.json", + ) + + target = _make_target(smoke_ok=True) + result = run_mlx_serve_target_export(_mlx_request(target=target)) + + assert result.manifest_path == tmp_path / "manifest.json" + assert result.smoke is not None + assert result.smoke.ok is True + + +def test_run_mlx_serve_target_export_skips_finalize_on_smoke_failure( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + prepared = SimpleNamespace(export_dir=tmp_path) + monkeypatch.setattr("dlm.export.targets.prepare_mlx_serve_export", lambda **kwargs: 
prepared) + finalize_called: list[bool] = [] + monkeypatch.setattr( + "dlm.export.targets.finalize_mlx_serve_export", + lambda **kwargs: finalize_called.append(True), + ) + + target = _make_target(smoke_ok=False) + result = run_mlx_serve_target_export(_mlx_request(target=target)) + + assert result.manifest_path is None + assert finalize_called == [] + + +def test_run_mlx_serve_target_export_skips_smoke_with_no_smoke( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + prepared = SimpleNamespace(export_dir=tmp_path) + monkeypatch.setattr("dlm.export.targets.prepare_mlx_serve_export", lambda **kwargs: prepared) + monkeypatch.setattr( + "dlm.export.targets.finalize_mlx_serve_export", + lambda **kwargs: tmp_path / "manifest.json", + ) + + target = _make_target(smoke_ok=True) + result = run_mlx_serve_target_export(_mlx_request(target=target, no_smoke=True)) + + assert result.smoke is None + assert target.smoke_calls == [] + + +def test_run_llama_server_post_export_uses_path_override_when_given( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + override_dir = tmp_path / "override-adapter" + prepare_calls: dict[str, object] = {} + + def _fake_prepare(**kwargs: object) -> object: + prepare_calls.update(kwargs) + return SimpleNamespace(name="llama-server") + + monkeypatch.setattr("dlm.export.targets.prepare_llama_server_export", _fake_prepare) + + target = _make_target(smoke_ok=True) + request = LlamaServerPostExportRequest( + target=target, + store=SimpleNamespace(), # type: ignore[arg-type] + spec=SimpleNamespace(), # type: ignore[arg-type] + base_export=SimpleNamespace( # type: ignore[arg-type] + export_dir=tmp_path, + manifest_path=tmp_path / "m.json", + artifacts=[tmp_path / "a"], + ), + adapter_name=None, + adapter_path_override=override_dir, + training_sequence_len=1024, + no_smoke=False, + ) + result = run_llama_server_post_export(request) + + assert isinstance(result, LlamaServerPostExportResult) + assert prepare_calls["adapter_dir"] 
== override_dir + assert result.smoke is not None + assert result.smoke.ok is True + + +def test_run_llama_server_post_export_resolves_default_current_adapter( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + resolved_dir = tmp_path / "current" + store = SimpleNamespace( + resolve_current_adapter=lambda: resolved_dir, + resolve_current_adapter_for=lambda name: tmp_path / "named", + ) + prepare_calls: dict[str, object] = {} + + def _fake_prepare(**kwargs: object) -> object: + prepare_calls.update(kwargs) + return SimpleNamespace() + + monkeypatch.setattr("dlm.export.targets.prepare_llama_server_export", _fake_prepare) + + target = _make_target(smoke_ok=True) + request = LlamaServerPostExportRequest( + target=target, + store=store, # type: ignore[arg-type] + spec=SimpleNamespace(), # type: ignore[arg-type] + base_export=SimpleNamespace( # type: ignore[arg-type] + export_dir=tmp_path, manifest_path=tmp_path / "m.json", artifacts=[] + ), + adapter_name=None, + adapter_path_override=None, + training_sequence_len=512, + no_smoke=True, + ) + result = run_llama_server_post_export(request) + + assert prepare_calls["adapter_dir"] == resolved_dir + assert result.smoke is None + + +def test_run_llama_server_post_export_resolves_named_adapter( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + named_dir = tmp_path / "named-adapter" + store = SimpleNamespace( + resolve_current_adapter=lambda: tmp_path / "wrong", + resolve_current_adapter_for=lambda name: named_dir if name == "extras" else None, + ) + prepare_calls: dict[str, object] = {} + + monkeypatch.setattr( + "dlm.export.targets.prepare_llama_server_export", + lambda **kwargs: prepare_calls.update(kwargs) or SimpleNamespace(), + ) + + target = _make_target(smoke_ok=True) + request = LlamaServerPostExportRequest( + target=target, + store=store, # type: ignore[arg-type] + spec=SimpleNamespace(), # type: ignore[arg-type] + base_export=SimpleNamespace( # type: ignore[arg-type] + 
export_dir=tmp_path, manifest_path=tmp_path / "m.json", artifacts=[] + ), + adapter_name="extras", + adapter_path_override=None, + training_sequence_len=None, + no_smoke=True, + ) + run_llama_server_post_export(request) + + assert prepare_calls["adapter_dir"] == named_dir diff --git a/tests/unit/inference/test_dispatch.py b/tests/unit/inference/test_dispatch.py new file mode 100644 index 0000000..dc01e96 --- /dev/null +++ b/tests/unit/inference/test_dispatch.py @@ -0,0 +1,69 @@ +"""Direct tests for `dlm.inference.dispatch:run_prompt`.""" + +from __future__ import annotations + +from typing import Any + +import pytest + +from dlm.inference.dispatch import PromptRequest, PromptResult, run_prompt + + +class _FakeBackend: + def __init__(self) -> None: + self.loaded_with: dict[str, Any] | None = None + self.generate_with: dict[str, Any] | None = None + + def load(self, spec: object, store: object, *, adapter_name: str | None = None) -> None: + self.loaded_with = {"spec": spec, "store": store, "adapter_name": adapter_name} + + def generate(self, query: str, **kwargs: object) -> str: + self.generate_with = {"query": query, **kwargs} + return "fake response" + + def unload(self) -> None: + pass + + +def test_run_prompt_loads_backend_and_returns_typed_result( + monkeypatch: pytest.MonkeyPatch, +) -> None: + backend = _FakeBackend() + monkeypatch.setattr( + "dlm.inference.backends.build_backend", + lambda *args, **kwargs: backend, + ) + + spec_obj = object() + caps_obj = object() + store_obj = object() + + request = PromptRequest( + spec=spec_obj, # type: ignore[arg-type] + capabilities=caps_obj, # type: ignore[arg-type] + store=store_obj, # type: ignore[arg-type] + backend_name="pytorch", + query="hello there", + max_new_tokens=42, + temperature=0.5, + top_p=0.9, + adapter="my-adapter", + ) + + result = run_prompt(request) + + assert isinstance(result, PromptResult) + assert result.response == "fake response" + assert result.backend_name == "pytorch" + + assert 
backend.loaded_with == { + "spec": spec_obj, + "store": store_obj, + "adapter_name": "my-adapter", + } + assert backend.generate_with == { + "query": "hello there", + "max_new_tokens": 42, + "temperature": 0.5, + "top_p": 0.9, + } diff --git a/tests/unit/store/test_bootstrap.py b/tests/unit/store/test_bootstrap.py new file mode 100644 index 0000000..da19505 --- /dev/null +++ b/tests/unit/store/test_bootstrap.py @@ -0,0 +1,131 @@ +"""Direct tests for `dlm.store.bootstrap:run_init`.""" + +from __future__ import annotations + +from pathlib import Path +from types import SimpleNamespace +from typing import Any + +import pytest + +from dlm.store.bootstrap import ( + InitRequest, + InitResult, + ScaffoldKind, + run_init, +) +from dlm.store.manifest import load_manifest +from dlm.store.paths import for_dlm + +_REV = "0123456789abcdef0123456789abcdef01234567" + + +def _spec(key: str = "smollm2-135m") -> Any: + return SimpleNamespace(key=key, revision=_REV) + + +def _make_request( + tmp_path: Path, + *, + template_name: str | None = None, + scaffold_kind: ScaffoldKind = ScaffoldKind.TEXT, + force: bool = False, +) -> InitRequest: + return InitRequest( + path=tmp_path / "doc.dlm", + spec=_spec(), # type: ignore[arg-type] + acceptance=None, + force=force, + template_name=template_name, + scaffold_kind=scaffold_kind, + ) + + +def test_run_init_writes_text_scaffold_and_provisions_store( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + home = tmp_path / "home" + monkeypatch.setenv("DLM_HOME", str(home)) + + request = _make_request(tmp_path) + result = run_init(request) + + assert isinstance(result, InitResult) + assert result.applied_template is None + assert result.dlm_id # minted ULID + assert request.path.exists() + body = request.path.read_text(encoding="utf-8") + assert "::instruction::" in body + assert "::image" not in body + assert "::audio" not in body + assert f"dlm_id: {result.dlm_id}" in body + + store = for_dlm(result.dlm_id, home=home) + manifest 
= load_manifest(store.manifest) + assert manifest.dlm_id == result.dlm_id + assert manifest.base_model == "smollm2-135m" + assert manifest.license_acceptance is None + + +def test_run_init_writes_vision_scaffold( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setenv("DLM_HOME", str(tmp_path / "home")) + + request = _make_request(tmp_path, scaffold_kind=ScaffoldKind.VISION) + run_init(request) + + body = request.path.read_text(encoding="utf-8") + assert "::image" in body + assert "dlm_version: 10" in body + + +def test_run_init_writes_audio_scaffold( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setenv("DLM_HOME", str(tmp_path / "home")) + + request = _make_request(tmp_path, scaffold_kind=ScaffoldKind.AUDIO) + run_init(request) + + body = request.path.read_text(encoding="utf-8") + assert "::audio" in body + assert "dlm_version: 11" in body + + +def test_run_init_applies_template_via_dispatch( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setenv("DLM_HOME", str(tmp_path / "home")) + + sentinel_template = SimpleNamespace(meta=SimpleNamespace(name="custom", title="Custom")) + apply_calls: dict[str, object] = {} + + def _fake_apply( + name: str, + target: Path, + *, + force: bool = False, + accept_license: bool = False, + ) -> object: + apply_calls["name"] = name + apply_calls["target"] = target + apply_calls["force"] = force + apply_calls["accept_license"] = accept_license + return SimpleNamespace(template=sentinel_template, dlm_id="01ABC123") + + monkeypatch.setattr("dlm.templates.init.apply_template", _fake_apply) + + request = _make_request(tmp_path, template_name="my-template", force=True) + result = run_init(request) + + assert result.dlm_id == "01ABC123" + assert result.applied_template is not None + assert result.applied_template.template is sentinel_template + assert apply_calls["name"] == "my-template" + assert apply_calls["force"] is True + assert 
apply_calls["accept_license"] is True diff --git a/tests/unit/store/test_show.py b/tests/unit/store/test_show.py new file mode 100644 index 0000000..dc82e3b --- /dev/null +++ b/tests/unit/store/test_show.py @@ -0,0 +1,394 @@ +"""Direct tests for `dlm.store.show:gather_store_view` + private helpers.""" + +from __future__ import annotations + +import json +from pathlib import Path +from types import SimpleNamespace +from typing import Any + +import pytest + +from dlm.store.show import ( + StoreView, + StoreViewRequest, + _summarize_base_security, + _summarize_gate, + _summarize_preference_mining, + _summarize_training_cache, + _summarize_training_sources_and_discovered, + gather_store_view, +) + +_DLM_ID = "01KPQ9X1000000000000000000" +_REV = "0123456789abcdef0123456789abcdef01234567" + + +def _write_doc(path: Path, *, body: str = "") -> None: + payload = f"---\ndlm_id: {_DLM_ID}\ndlm_version: 14\nbase_model: smollm2-135m\n---\n" + path.write_text(payload + body, encoding="utf-8") + + +def _parsed(home: Path, doc: Path) -> Any: + from dlm.doc.parser import parse_file + + return parse_file(doc) + + +def test_gather_store_view_returns_uninitialized_when_manifest_missing( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setenv("DLM_HOME", str(tmp_path / "home")) + doc = tmp_path / "doc.dlm" + _write_doc(doc) + + from dlm.store.paths import for_dlm + + parsed = _parsed(tmp_path / "home", doc) + store = for_dlm(parsed.frontmatter.dlm_id) + + view = gather_store_view(StoreViewRequest(parsed=parsed, target_path=doc, store=store)) + + assert isinstance(view, StoreView) + assert view.inspection is None + assert view.training_cache is None + assert view.gate is None + assert view.preference_mining is None + assert view.base_security is None + assert view.parsed_dlm_id == _DLM_ID + assert view.parsed_base_model == "smollm2-135m" + assert view.training_sources is None + assert view.discovered_configs == [] + + +def 
test_gather_store_view_populates_inspection_when_manifest_exists( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + home = tmp_path / "home" + monkeypatch.setenv("DLM_HOME", str(home)) + doc = tmp_path / "doc.dlm" + _write_doc(doc) + + from dlm.store.manifest import Manifest, save_manifest + from dlm.store.paths import for_dlm + + parsed = _parsed(home, doc) + store = for_dlm(parsed.frontmatter.dlm_id) + store.ensure_layout() + save_manifest( + store.manifest, + Manifest( + dlm_id=_DLM_ID, + base_model="smollm2-135m", + base_model_revision=_REV, + source_path=doc.resolve(), + ), + ) + + view = gather_store_view(StoreViewRequest(parsed=parsed, target_path=doc, store=store)) + + assert view.inspection is not None + assert view.inspection.dlm_id == _DLM_ID + assert view.inspection.base_model == "smollm2-135m" + assert view.training_cache is None # tokenized_cache_dir doesn't exist yet + assert view.gate is None # no gate config + no events + assert view.preference_mining is None # no metrics + assert view.base_security is not None + assert view.base_security["base_model"] == "smollm2-135m" + + +def test_summarize_training_sources_returns_none_without_directives() -> None: + parsed = SimpleNamespace(frontmatter=SimpleNamespace(training=SimpleNamespace(sources=[]))) + sources, discovered = _summarize_training_sources_and_discovered(parsed, Path("/tmp")) + assert sources is None + assert discovered == [] + + +def test_summarize_training_sources_returns_declared_on_expand_failure( + monkeypatch: pytest.MonkeyPatch, +) -> None: + from dlm.directives.errors import DirectiveError + + directive = SimpleNamespace( + path="docs", + include=("**/*",), + exclude=(), + max_files=None, + max_bytes_per_file=None, + ) + parsed = SimpleNamespace( + frontmatter=SimpleNamespace(training=SimpleNamespace(sources=[directive])) + ) + + def _raise(*args: object, **kwargs: object) -> None: + raise DirectiveError("expansion failed") + + 
monkeypatch.setattr("dlm.store.show._expand_sources", _raise) + + sources, discovered = _summarize_training_sources_and_discovered(parsed, Path("/tmp")) + assert sources is not None + assert len(sources) == 1 + assert sources[0]["path"] == "docs" + assert "file_count" not in sources[0] + assert discovered == [] + + +def test_summarize_training_sources_succeeds_with_provenance( + monkeypatch: pytest.MonkeyPatch, +) -> None: + directive = SimpleNamespace( + path="docs", + include=("**/*",), + exclude=(), + max_files=None, + max_bytes_per_file=None, + ) + parsed = SimpleNamespace( + frontmatter=SimpleNamespace(training=SimpleNamespace(sources=[directive])) + ) + prov = SimpleNamespace( + file_count=3, + total_bytes=1024, + skipped_binary=0, + skipped_encoding=0, + skipped_over_size=1, + ) + discovered_config = SimpleNamespace( + anchor=Path("/anchor"), + config=SimpleNamespace( + include=("*.py",), + exclude=(), + exclude_defaults=True, + metadata={"k": "v"}, + ), + ignore_rules=["pattern1"], + ) + + def _ok(*args: object, **kwargs: object) -> object: + return SimpleNamespace(provenance=[prov], discovered=[discovered_config]) + + monkeypatch.setattr("dlm.store.show._expand_sources", _ok) + + sources, discovered = _summarize_training_sources_and_discovered(parsed, Path("/tmp")) + assert sources is not None + assert sources[0]["file_count"] == 3 + assert sources[0]["total_bytes"] == 1024 + assert len(discovered) == 1 + assert discovered[0]["anchor"] == "/anchor" + assert discovered[0]["has_training_yaml"] is True + assert discovered[0]["has_ignore"] is True + + +def test_summarize_training_sources_pads_when_provenance_short( + monkeypatch: pytest.MonkeyPatch, +) -> None: + d1 = SimpleNamespace(path="a", include=(), exclude=(), max_files=None, max_bytes_per_file=None) + d2 = SimpleNamespace(path="b", include=(), exclude=(), max_files=None, max_bytes_per_file=None) + parsed = SimpleNamespace( + frontmatter=SimpleNamespace(training=SimpleNamespace(sources=[d1, d2])) + ) + 
short_prov = SimpleNamespace( + file_count=1, total_bytes=10, skipped_binary=0, skipped_encoding=0, skipped_over_size=0 + ) + + def _ok(*args: object, **kwargs: object) -> object: + # one declared has provenance, the other doesn't (defensive padding) + discovered_no_cfg = SimpleNamespace(anchor=Path("/anchor"), config=None, ignore_rules=[]) + return SimpleNamespace(provenance=[short_prov], discovered=[discovered_no_cfg]) + + monkeypatch.setattr("dlm.store.show._expand_sources", _ok) + + sources, discovered = _summarize_training_sources_and_discovered(parsed, Path("/tmp")) + assert sources is not None + assert len(sources) == 2 + assert sources[0]["file_count"] == 1 + assert "file_count" not in sources[1] # padded with declared-only + assert discovered[0]["has_training_yaml"] is False + assert discovered[0]["exclude_defaults"] is True + + +def test_summarize_training_cache_none_when_dir_missing(tmp_path: Path) -> None: + assert _summarize_training_cache(tmp_path / "missing", tmp_path) is None + + +def test_summarize_training_cache_with_no_last_run( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + cache_dir = tmp_path / "cache" + cache_dir.mkdir() + + monkeypatch.setattr( + "dlm.directives.cache.TokenizedCache.open", + classmethod(lambda cls, _path: SimpleNamespace(entry_count=5, total_bytes=2048)), + ) + monkeypatch.setattr("dlm.metrics.queries.latest_tokenization", lambda root: None) + + snap = _summarize_training_cache(cache_dir, tmp_path) + assert snap is not None + assert snap["entry_count"] == 5 + assert snap["bytes"] == 2048 + assert snap["last_run_hit_rate"] is None + assert snap["last_run_id"] is None + + +def test_summarize_training_cache_with_last_run( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + cache_dir = tmp_path / "cache" + cache_dir.mkdir() + + monkeypatch.setattr( + "dlm.directives.cache.TokenizedCache.open", + classmethod(lambda cls, _path: SimpleNamespace(entry_count=3, total_bytes=512)), + ) + 
monkeypatch.setattr( + "dlm.metrics.queries.latest_tokenization", + lambda root: SimpleNamespace(hit_rate=0.75, run_id=42), + ) + + snap = _summarize_training_cache(cache_dir, tmp_path) + assert snap is not None + assert snap["last_run_hit_rate"] == 0.75 + assert snap["last_run_id"] == 42 + + +def test_summarize_gate_none_when_no_cfg_no_events( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + store = SimpleNamespace(root=tmp_path) + monkeypatch.setattr( + "dlm.train.gate.paths.gate_config_path", lambda s: tmp_path / "missing.json" + ) + monkeypatch.setattr("dlm.metrics.queries.latest_gate_events", lambda root: []) + + assert _summarize_gate(store) is None # type: ignore[arg-type] + + +def test_summarize_gate_diverged_when_no_cfg_but_diverged_events( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + store = SimpleNamespace(root=tmp_path) + event = SimpleNamespace( + adapter_name="adapter-a", + mean_weight=0.5, + sample_count=10, + mode="diverged", + run_id=99, + ) + monkeypatch.setattr( + "dlm.train.gate.paths.gate_config_path", lambda s: tmp_path / "missing.json" + ) + monkeypatch.setattr("dlm.metrics.queries.latest_gate_events", lambda root: [event]) + + snap = _summarize_gate(store) # type: ignore[arg-type] + assert snap is not None + assert snap["mode"] == "diverged" + assert snap["last_run_id"] == 99 + assert snap["per_adapter"][0]["adapter_name"] == "adapter-a" + + +def test_summarize_gate_with_cfg_and_events( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + cfg_path = tmp_path / "gate_config.json" + cfg_path.write_text( + json.dumps( + {"mode": "trained", "adapter_names": ["a", "b"], "input_dim": 32, "hidden_proj_dim": 16} + ), + encoding="utf-8", + ) + store = SimpleNamespace(root=tmp_path) + event = SimpleNamespace( + adapter_name="a", mean_weight=0.6, sample_count=5, mode="active", run_id=7 + ) + + monkeypatch.setattr("dlm.train.gate.paths.gate_config_path", lambda s: cfg_path) + 
monkeypatch.setattr("dlm.metrics.queries.latest_gate_events", lambda root: [event]) + + snap = _summarize_gate(store) # type: ignore[arg-type] + assert snap is not None + assert snap["mode"] == "trained" + assert snap["last_run_id"] == 7 + assert snap["per_adapter"][0]["adapter_name"] == "a" + + +def test_summarize_gate_with_cfg_no_events( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + cfg_path = tmp_path / "gate_config.json" + cfg_path.write_text( + json.dumps( + {"mode": "trained", "adapter_names": ["a", "b"], "input_dim": 32, "hidden_proj_dim": 16} + ), + encoding="utf-8", + ) + store = SimpleNamespace(root=tmp_path) + + monkeypatch.setattr("dlm.train.gate.paths.gate_config_path", lambda s: cfg_path) + monkeypatch.setattr("dlm.metrics.queries.latest_gate_events", lambda root: []) + + snap = _summarize_gate(store) # type: ignore[arg-type] + assert snap is not None + assert snap["last_run_id"] is None + assert snap["per_adapter"] == [{"adapter_name": "a"}, {"adapter_name": "b"}] + + +def test_summarize_preference_mining_none_when_no_totals( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setattr("dlm.metrics.queries.preference_mining_totals", lambda root: None) + assert _summarize_preference_mining(tmp_path) is None + + +def test_summarize_preference_mining_with_totals( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + totals = SimpleNamespace( + run_count=2, event_count=5, total_mined_pairs=10, total_skipped_prompts=3 + ) + last = SimpleNamespace(run_id=42) + monkeypatch.setattr("dlm.metrics.queries.preference_mining_totals", lambda root: totals) + monkeypatch.setattr("dlm.metrics.queries.latest_preference_mining", lambda root: last) + monkeypatch.setattr( + "dlm.metrics.queries.preference_mining_for_run", lambda root, run_id: ["row1", "row2"] + ) + monkeypatch.setattr( + "dlm.metrics.queries.preference_mining_to_dict", lambda rows: [{"key": "value"}] + ) + + snap = 
_summarize_preference_mining(tmp_path) + assert snap is not None + assert snap["run_count"] == 2 + assert snap["last_run_id"] == 42 + assert snap["last_run_event_count"] == 2 + assert snap["last_event"] == {"key": "value"} + + +def test_summarize_base_security_returns_dict_for_known_base() -> None: + snap = _summarize_base_security("smollm2-135m") + assert snap is not None + assert snap["base_model"] == "smollm2-135m" + assert "trust_remote_code" in snap + + +def test_summarize_base_security_returns_none_when_resolve_fails( + monkeypatch: pytest.MonkeyPatch, +) -> None: + from dlm.base_models.errors import UnknownBaseModelError + + def _raise(*args: object, **kwargs: object) -> None: + raise UnknownBaseModelError("nope", known_keys=()) + + monkeypatch.setattr("dlm.base_models.resolve", _raise) + + assert _summarize_base_security("definitely-not-real") is None diff --git a/tests/unit/train/test_dispatch.py b/tests/unit/train/test_dispatch.py new file mode 100644 index 0000000..d9fdccf --- /dev/null +++ b/tests/unit/train/test_dispatch.py @@ -0,0 +1,207 @@ +"""Direct tests for `dlm.train.dispatch:run_train`.""" + +from __future__ import annotations + +from pathlib import Path +from types import SimpleNamespace +from typing import Any + +import pytest + +from dlm.train.dispatch import ( + NoViableTrainingPlanError, + TrainRequest, + TrainResult, + run_train, +) + + +def _fake_parsed(*, dlm_id: str = "01KPQ9X1000000000000000000") -> Any: + """Minimal ParsedDlm shape: frontmatter.training.sequence_len + dlm_id.""" + training = SimpleNamespace(sequence_len=1024) + frontmatter = SimpleNamespace( + dlm_id=dlm_id, + training=training, + base_model="smollm2-135m", + ) + return SimpleNamespace(frontmatter=frontmatter) + + +def _fake_spec() -> Any: + """Minimal BaseModelSpec shape: params, effective_context_length, key, revision.""" + return SimpleNamespace( + params=135_000_000, + effective_context_length=4096, + key="smollm2-135m", + revision="0123456789abcdef", + ) + + +def 
_make_store(tmp_path: Path, *, manifest_exists: bool = True) -> Any: + """Fake StorePath: ensure_layout no-op, manifest path optionally exists.""" + layout_calls: list[bool] = [] + manifest_path = tmp_path / "manifest.json" + if manifest_exists: + manifest_path.write_text("{}", encoding="utf-8") + + class _Store: + manifest = manifest_path + + def ensure_layout(self) -> None: + layout_calls.append(True) + + store = _Store() + store.layout_calls = layout_calls # type: ignore[attr-defined] + return store + + +def _make_request( + tmp_path: Path, + *, + parsed: Any | None = None, + spec: Any | None = None, + store: Any | None = None, +) -> TrainRequest: + return TrainRequest( + parsed=parsed or _fake_parsed(), # type: ignore[arg-type] + target_path=tmp_path / "doc.dlm", + spec=spec or _fake_spec(), # type: ignore[arg-type] + store=store or _make_store(tmp_path), # type: ignore[arg-type] + phase="all", + mode="fresh", + seed=42, + max_steps=None, + lock_mode="default", + world_size=1, + strict_metrics=False, + include_auto_mined=True, + ) + + +def test_run_train_returns_typed_result_on_happy_path( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + plan_obj = SimpleNamespace(name="fake-plan") + caps_obj = SimpleNamespace() + monkeypatch.setattr( + "dlm.hardware.doctor", + lambda **kwargs: SimpleNamespace(plan=plan_obj, capabilities=caps_obj), + ) + captured: dict[str, object] = {} + + def _fake_run_phases(*args: object, **kwargs: object) -> list[object]: + captured["args"] = args + captured["kwargs"] = kwargs + return ["pr1", "pr2"] + + monkeypatch.setattr( + "dlm.train.preference.phase_orchestrator.run_phases", + _fake_run_phases, + ) + + request = _make_request(tmp_path) + result = run_train(request) + + assert isinstance(result, TrainResult) + assert result.plan is plan_obj + assert result.phase_results == ["pr1", "pr2"] + assert request.store.layout_calls == [True] + assert captured["kwargs"]["capabilities"] is caps_obj + assert 
captured["kwargs"]["world_size"] == 1 + assert captured["kwargs"]["lock_mode"] == "default" + + +def test_run_train_raises_no_viable_plan_when_doctor_returns_none( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setattr( + "dlm.hardware.doctor", + lambda **kwargs: SimpleNamespace(plan=None, capabilities=SimpleNamespace()), + ) + + request = _make_request(tmp_path) + with pytest.raises(NoViableTrainingPlanError, match="no viable training plan"): + run_train(request) + + +def test_run_train_provisions_manifest_when_missing_for_non_gated_spec( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setattr( + "dlm.hardware.doctor", + lambda **kwargs: SimpleNamespace(plan=SimpleNamespace(), capabilities=SimpleNamespace()), + ) + monkeypatch.setattr( + "dlm.train.preference.phase_orchestrator.run_phases", + lambda *args, **kwargs: [], + ) + monkeypatch.setattr("dlm.base_models.is_gated", lambda spec: False) + + saved: dict[str, object] = {} + + def _fake_save(path: object, manifest: object) -> None: + saved["path"] = path + saved["manifest"] = manifest + + monkeypatch.setattr("dlm.store.manifest.save_manifest", _fake_save) + + store = _make_store(tmp_path, manifest_exists=False) + request = _make_request(tmp_path, store=store) + run_train(request) + + assert saved["path"] == store.manifest + assert saved["manifest"].license_acceptance is None # type: ignore[union-attr] + assert saved["manifest"].base_model == "smollm2-135m" # type: ignore[union-attr] + + +def test_run_train_provisions_manifest_with_acceptance_for_gated_spec( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setattr( + "dlm.hardware.doctor", + lambda **kwargs: SimpleNamespace(plan=SimpleNamespace(), capabilities=SimpleNamespace()), + ) + monkeypatch.setattr( + "dlm.train.preference.phase_orchestrator.run_phases", + lambda *args, **kwargs: [], + ) + monkeypatch.setattr("dlm.base_models.is_gated", lambda spec: True) + + from 
datetime import datetime + + from dlm.base_models.license import LicenseAcceptance + + sentinel_acceptance = LicenseAcceptance( + accepted_at=datetime(2026, 5, 1), + license_url="https://example.test/lic", + license_spdx="apache-2.0", + via="cli_flag", + ) + require_calls: dict[str, object] = {} + + def _fake_require(spec: object, *, accept_license: bool, via: str) -> LicenseAcceptance: + require_calls["spec"] = spec + require_calls["accept_license"] = accept_license + require_calls["via"] = via + return sentinel_acceptance + + monkeypatch.setattr("dlm.base_models.license.require_acceptance", _fake_require) + + saved: dict[str, object] = {} + + def _fake_save(path: object, manifest: object) -> None: + saved["manifest"] = manifest + + monkeypatch.setattr("dlm.store.manifest.save_manifest", _fake_save) + + store = _make_store(tmp_path, manifest_exists=False) + request = _make_request(tmp_path, store=store) + run_train(request) + + assert require_calls["accept_license"] is True + assert require_calls["via"] == "cli_flag" + assert saved["manifest"].license_acceptance is sentinel_acceptance # type: ignore[union-attr] From d4b1b6c05c3ef45c7a890c4d2543ffea6866fe86 Mon Sep 17 00:00:00 2001 From: mfwolffe Date: Sun, 3 May 2026 22:41:54 -0400 Subject: [PATCH 46/46] Repoint integration-test build_judge patches to canonical module path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The preference dispatcher uses dotted import 'from dlm.preference import judge as _judge_mod; _judge_mod.build_judge(...)'. Tests must patch 'dlm.preference.judge.build_judge' (canonical) for late attribute lookup to see the patch — patches on the package re-export 'dlm.preference.build_judge' are invisible to the dispatcher. Caught by Ubuntu CI on PR #18. 
--- tests/integration/cli/test_preference_cli.py | 2 +- tests/integration/preference/test_mine_cycle.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration/cli/test_preference_cli.py b/tests/integration/cli/test_preference_cli.py index abca054..0f945fe 100644 --- a/tests/integration/cli/test_preference_cli.py +++ b/tests/integration/cli/test_preference_cli.py @@ -127,7 +127,7 @@ def _patch_mining_runtime(monkeypatch: pytest.MonkeyPatch) -> None: lambda *args, **kwargs: _FakeBackend({"What is DGEMM?": ["bad answer", "good answer"]}), ) monkeypatch.setattr( - "dlm.preference.build_judge", + "dlm.preference.judge.build_judge", lambda *args, **kwargs: _FakeJudge(), ) diff --git a/tests/integration/preference/test_mine_cycle.py b/tests/integration/preference/test_mine_cycle.py index ed4d39c..17acec7 100644 --- a/tests/integration/preference/test_mine_cycle.py +++ b/tests/integration/preference/test_mine_cycle.py @@ -143,7 +143,7 @@ def _patch_mining(monkeypatch: pytest.MonkeyPatch) -> None: lambda *args, **kwargs: _FakeMiningBackend(_MINE_RESPONSES), ) monkeypatch.setattr( - "dlm.preference.build_judge", + "dlm.preference.judge.build_judge", lambda *args, **kwargs: _TerseJudge(), )