diff --git a/.kilo-prompt/review-mission.txt b/.kilo-prompt/review-mission.txt deleted file mode 100644 index c1099461df..0000000000 --- a/.kilo-prompt/review-mission.txt +++ /dev/null @@ -1,20 +0,0 @@ -You are a coding agent working on PR #32 in this repository (branch: kilo/deterministic-replay-harness). - -Your mission is to fully address all open review comments and ensure the PR is complete and correct. - -Steps: -1. Run: gh pr view 32 --comments -2. Run: gh pr diff 32 -3. Read all new/changed files in the PR. -4. Address EVERY review comment by implementing the requested change or clearly documenting why it's not applicable. -5. For each fix: make the code change, ensure tests cover it. -6. Update docs if any comment asks for it. -7. Commit all fixes on the PR branch (do NOT force-push, add new commits). -8. Push. -9. Reply to the review: gh pr review 32 --comment -b "All comments addressed: ..." - -Constraints: -- Any change to replay/recording code must include test coverage. -- No silent failure paths. - -Stopping condition: All review comments resolved, committed, pushed. 
\ No newline at end of file diff --git a/.omc/project-memory.json b/.omc/project-memory.json new file mode 100644 index 0000000000..e2fe8b323c --- /dev/null +++ b/.omc/project-memory.json @@ -0,0 +1,221 @@ +{ + "version": "1.0.0", + "lastScanned": 1772689120356, + "projectRoot": "D:\\_projects\\ALAS", + "techStack": { + "languages": [], + "frameworks": [], + "packageManager": null, + "runtime": null + }, + "build": { + "buildCommand": null, + "testCommand": null, + "lintCommand": null, + "devCommand": null, + "scripts": {} + }, + "conventions": { + "namingStyle": null, + "importStyle": null, + "testPattern": null, + "fileOrganization": null + }, + "structure": { + "isMonorepo": false, + "workspaces": [], + "mainDirectories": [ + "docs", + "scripts", + "tests" + ], + "gitBranches": { + "defaultBranch": "master", + "branchingStrategy": null + } + }, + "customNotes": [], + "directoryMap": { + "adb_vision": { + "path": "adb_vision", + "purpose": null, + "fileCount": 21, + "lastAccessed": 1772689120342, + "keyFiles": [ + "conftest.py", + "debug_droidcast.py", + "drive.bat", + "GEMINI_SYSTEM_PROMPT.md", + "mcp_actions.jsonl" + ] + }, + "agent_orchestrator": { + "path": "agent_orchestrator", + "purpose": null, + "fileCount": 31, + "lastAccessed": 1772689120342, + "keyFiles": [ + "alas_mcp_server.py", + "conftest.py", + "entrypoint_symlink_setup.py", + "heartbeat.jsonl", + "live_interact.py" + ] + }, + "alas_wrapped": { + "path": "alas_wrapped", + "purpose": null, + "fileCount": 14, + "lastAccessed": 1772689120343, + "keyFiles": [ + "alas.bat", + "alas.py", + "docker-compose.yml", + "gui.py", + "LICENSE" + ] + }, + "docs": { + "path": "docs", + "purpose": "Documentation", + "fileCount": 11, + "lastAccessed": 1772689120343, + "keyFiles": [ + "AGENT_MEMORY_2026-03-04.md", + "ALAS_CONFIG_REFERENCE.md", + "ARCHITECTURE.md", + "DOCUMENTATION_GOVERNANCE.md", + "GLOSSARY.md" + ] + }, + "memories": { + "path": "memories", + "purpose": null, + "fileCount": 1, + "lastAccessed": 
1772689120343, + "keyFiles": [ + "git_workflow.md" + ] + }, + "scripts": { + "path": "scripts", + "purpose": "Build/utility scripts", + "fileCount": 2, + "lastAccessed": 1772689120343, + "keyFiles": [ + "install_hooks.sh", + "memu_task_bridge.ps1" + ] + }, + "tests": { + "path": "tests", + "purpose": "Test files", + "fileCount": 0, + "lastAccessed": 1772689120344, + "keyFiles": [] + }, + "alas_wrapped\\assets": { + "path": "alas_wrapped\\assets", + "purpose": "Static assets", + "fileCount": 0, + "lastAccessed": 1772689120344, + "keyFiles": [] + }, + "alas_wrapped\\bin": { + "path": "alas_wrapped\\bin", + "purpose": "Executable scripts", + "fileCount": 0, + "lastAccessed": 1772689120344, + "keyFiles": [] + }, + "alas_wrapped\\config": { + "path": "alas_wrapped\\config", + "purpose": "Configuration files", + "fileCount": 14, + "lastAccessed": 1772689120345, + "keyFiles": [ + "alas.json", + "deploy.template-AidLux-cn.yaml", + "deploy.template-AidLux.yaml" + ] + }, + "tests\\fixtures": { + "path": "tests\\fixtures", + "purpose": "Test fixtures", + "fileCount": 1, + "lastAccessed": 1772689120345, + "keyFiles": [ + "README.md" + ] + } + }, + "hotPaths": [ + { + "path": "", + "accessCount": 3, + "lastAccessed": 1772689492589, + "type": "directory" + }, + { + "path": "docs/plans/phase_2_master_plan.md", + "accessCount": 1, + "lastAccessed": 1772689169110, + "type": "file" + }, + { + "path": "docs/plans/program_execution_plan.md", + "accessCount": 1, + "lastAccessed": 1772689169498, + "type": "file" + }, + { + "path": "docs/plans/phase_2_gemini_cli_plan.md", + "accessCount": 1, + "lastAccessed": 1772689169607, + "type": "file" + }, + { + "path": "docs\\NORTH_STAR.md", + "accessCount": 1, + "lastAccessed": 1772689285903, + "type": "file" + }, + { + "path": "docs\\ARCHITECTURE.md", + "accessCount": 1, + "lastAccessed": 1772689286281, + "type": "file" + }, + { + "path": "docs\\ROADMAP.md", + "accessCount": 1, + "lastAccessed": 1772689383901, + "type": "file" + }, + { + "path": 
"TDD_IMPLEMENTATION_PLAN.md", + "accessCount": 1, + "lastAccessed": 1772689384336, + "type": "file" + }, + { + "path": "agent_orchestrator\\live_nav.py", + "accessCount": 1, + "lastAccessed": 1772689619979, + "type": "file" + }, + { + "path": "alas_wrapped\\tools\\navigation.py", + "accessCount": 1, + "lastAccessed": 1772689639746, + "type": "file" + }, + { + "path": "alas_wrapped\\tools\\login.py", + "accessCount": 1, + "lastAccessed": 1772689640177, + "type": "file" + } + ], + "userDirectives": [] +} \ No newline at end of file diff --git a/.omc/sessions/9f68cafc-1a6c-47cc-8c96-0394c2a40828.json b/.omc/sessions/9f68cafc-1a6c-47cc-8c96-0394c2a40828.json new file mode 100644 index 0000000000..859829fee9 --- /dev/null +++ b/.omc/sessions/9f68cafc-1a6c-47cc-8c96-0394c2a40828.json @@ -0,0 +1,8 @@ +{ + "session_id": "9f68cafc-1a6c-47cc-8c96-0394c2a40828", + "ended_at": "2026-03-05T07:15:43.925Z", + "reason": "other", + "agents_spawned": 0, + "agents_completed": 0, + "modes_used": [] +} \ No newline at end of file diff --git a/.omc/state/hud-state.json b/.omc/state/hud-state.json new file mode 100644 index 0000000000..74dfc9fe1f --- /dev/null +++ b/.omc/state/hud-state.json @@ -0,0 +1,6 @@ +{ + "timestamp": "2026-03-05T06:32:16.608Z", + "backgroundTasks": [], + "sessionStartTimestamp": "2026-03-05T05:38:40.025Z", + "sessionId": "2a799d2f-5602-4806-8bbd-8f6947137697" +} \ No newline at end of file diff --git a/.omc/state/hud-stdin-cache.json b/.omc/state/hud-stdin-cache.json new file mode 100644 index 0000000000..d6fc24b182 --- /dev/null +++ b/.omc/state/hud-stdin-cache.json @@ -0,0 +1 @@ +{"session_id":"2a799d2f-5602-4806-8bbd-8f6947137697","transcript_path":"C:\\Users\\pmacl\\.claude\\projects\\D---projects-ALAS\\2a799d2f-5602-4806-8bbd-8f6947137697.jsonl","cwd":"D:\\_projects\\ALAS","model":{"id":"claude-sonnet-4-6","display_name":"Sonnet 
4.6"},"workspace":{"current_dir":"D:\\_projects\\ALAS","project_dir":"D:\\_projects\\ALAS","added_dirs":[]},"version":"2.1.69","output_style":{"name":"default"},"cost":{"total_cost_usd":1.4690392499999998,"total_duration_ms":3217190,"total_api_duration_ms":302861,"total_lines_added":0,"total_lines_removed":0},"context_window":{"total_input_tokens":65,"total_output_tokens":11703,"context_window_size":200000,"current_usage":{"input_tokens":3,"output_tokens":377,"cache_creation_input_tokens":672,"cache_read_input_tokens":64960},"used_percentage":33,"remaining_percentage":67},"exceeds_200k_tokens":false} \ No newline at end of file diff --git a/.omc/state/idle-notif-cooldown.json b/.omc/state/idle-notif-cooldown.json new file mode 100644 index 0000000000..15a2a55b80 --- /dev/null +++ b/.omc/state/idle-notif-cooldown.json @@ -0,0 +1,3 @@ +{ + "lastSentAt": "2026-03-05T06:31:23.445Z" +} \ No newline at end of file diff --git a/AGENTS.md b/AGENTS.md deleted file mode 100644 index 89e53e7c7e..0000000000 --- a/AGENTS.md +++ /dev/null @@ -1,204 +0,0 @@ -# AGENTS.md - -`AGENTS.md` is the canonical instruction source for agent behavior in this repository. -If any other entrypoint file conflicts with `AGENTS.md`, `AGENTS.md` wins. - -Derived entrypoint files: -- `CLAUDE.md` is generated from this file. -- `GEMINI.md` is generated from this file. -- `.github/copilot-instructions.md` may summarize this file for Copilot-specific discovery. - -## Project Purpose - -This repo transitions ALAS from a legacy script application into an LLM-driven automation system. -The permanent goal is an LLM Master Scheduler loop — deterministic tools on the hot path, LLM + vision as a permanent fallback that never gets phased out and drives blueprint creation. - -See: -- `docs/NORTH_STAR.md` -- `docs/ARCHITECTURE.md` -- `docs/ROADMAP.md` - -## Non-Negotiables - -- Never modify `upstream_alas/` directly for feature work. -- Never create additional git repos or submodules inside this repo. 
-- Treat `alas_wrapped/` as the runnable source of truth for customized behavior. -- Use deterministic tools first; LLM/vision is a permanent fallback for recovery, unknown states, and building new tools via manual piloting — it is never phased out. -- Do not commit runtime artifacts or secrets (for example: screenshots, ad-hoc runtime logs, local tokens). -- Keep `alas_wrapped/config/PatrickCustom.json` under version control and let hooks keep it in commit flow. -- If required docs for the current task have not been read, stop and read them before editing. - -## Required Reading - -Read these at session start: -1. `docs/NORTH_STAR.md` -2. `docs/ARCHITECTURE.md` -3. `docs/ROADMAP.md` - -Task-triggered required reads: -- MCP/tooling work: - - `docs/agent_tooling/README.md` - - `adb_vision/server.py` -- Implementation work (any phase): - - `TDD_IMPLEMENTATION_PLAN.md` -- Environment/bootstrap changes: - - `docs/dev/environment_setup.md` - - `docs/dev/memu_playbook.md` - -## Repository Model - -``` -ALAS/ [primary git repository] -├── upstream_alas/ [upstream sync submodule] -├── alas_wrapped/ [runnable ALAS + local customizations] -│ └── tools/ [tools that import ALAS internals] -├── agent_orchestrator/ [MCP server + standalone orchestration tools] -├── docs/ [project documentation] -└── .githooks/ [repo-tracked git hooks] -``` - -Directory ownership: -- `upstream_alas/`: sync source only. -- `alas_wrapped/`: runtime source of truth. -- `alas_wrapped/tools/`: ALAS-internal imports only. -- `agent_orchestrator/`: standalone modern tooling and MCP. - -Placement rule: -- If code imports `module.*` or ALAS internals, place it under `alas_wrapped/tools/`. -- Otherwise place it under `agent_orchestrator/`. - -## Upstream Sync Workflow - -One-way model: -1. Update submodule: - - `git submodule update --remote -- upstream_alas` -2. Compare upstream changes. -3. Apply needed changes manually into `alas_wrapped/`. -4. Preserve local customizations and MCP hooks. -5. 
Validate wrapped behavior before commit. - -## Runtime, Logs, and Launch - -Core configs: -- `alas_wrapped/config/PatrickCustom.json` -- `alas_wrapped/config/deploy.yaml` -- `alas_wrapped/config/alas.json` - -Known local environment: -- `Alas.Emulator.Serial`: `127.0.0.1:21513` -- `Alas.EmulatorInfo.Emulator`: `MEmuPlayer` - -## Emulator Environment (MEmu) - -**Prerequisite:** The MEmu Multiple Instance Manager (`MEmuConsole.exe`) is always running with admin permissions. This is the control plane for all emulator instances. - -### Key Commands (memuc.exe) - -The `memuc` CLI at `C:\Program Files\Microvirt\MEmu\memuc.exe` controls emulator instances: - -```bash -# Start/stop VMs -memuc start -n MEmu # Start by name -memuc start -i 0 # Start by index -memuc stop -n MEmu # Stop by name -memuc stopall # Stop all VMs - -# Check status -memuc listvms # List all VMs with index, status, PID -memuc listvms --running # List only running VMs -memuc isvmrunning -n MEmu # Check specific VM - -# VM lifecycle -memuc reboot -i 0 # Reboot VM -memuc clone -i 0 # Clone a VM -``` - -### Common Workflow - -1. **MEmuConsole.exe is already running** (admin) - the user ensures this -2. Use `memuc` commands to start/stop emulator instances as needed -3. Once started, ADB connects via `127.0.0.1:21513` -4. MCP server ADB tools (`adb_screenshot`, `adb_tap`, etc.) can then interact with the emulator - -### Python Integration - -Direct subprocess calls are sufficient - no special libraries required: - -```python -import subprocess - -# Start emulator -subprocess.run(["memuc", "start", "-n", "MEmu"], check=True) - -# Check status -result = subprocess.run(["memuc", "isvmrunning", "-n", "MEmu"], - capture_output=True, text=True) -``` - -Launch wrapped bot: -```bash -cd alas_wrapped -PYTHONIOENCODING=utf-8 .venv/Scripts/python.exe gui.py --run PatrickCustom -``` - -MCP server purpose: -- Exposes ADB tools via MCP for the LLM harness (screenshot, tap, swipe, keyevent, launch, focus). 
-- Lightweight FastMCP server — no ALAS imports required. - -Run MCP server: -```bash -cd adb_vision -uv run server.py --serial 127.0.0.1:21513 -``` - -Log parser purpose: -- Fast forensic analysis of task timelines, warnings/errors, and tracebacks. -- Preferred over manual full-log scanning for debugging regressions. - -Recommended parser usage: -```bash -python3 agent_orchestrator/log_parser.py alas_wrapped/log/YYYY-MM-DD_PatrickCustom.txt --timeline --errors --trace -python3 agent_orchestrator/log_parser.py alas_wrapped/log/YYYY-MM-DD_PatrickCustom.txt --summary -``` - -## MCP Tool Surface - -Canonical callable names in `adb_vision/server.py`: -- `adb_screenshot` -- `adb_tap` -- `adb_swipe` -- `adb_keyevent` -- `adb_launch_game` -- `adb_get_focus` - -## Tool Contract For New Tools - -Return state using: -```python -{ - "success": bool, - "data": object | None, - "error": str | None, - "observed_state": str | None, - "expected_state": str -} -``` - -## Change Discipline - -- Keep changes focused and local to the request. -- Update docs when behavior changes: - - `CHANGELOG.md` for user-visible behavior changes. - - `docs/agent_tooling/README.md` for tool changes. - - `docs/ARCHITECTURE.md` for architecture changes. - - `docs/monorepo/MONOREPO_SYNC_NOTES.md` for process changes. - - `docs/ROADMAP.md` when milestone status changes. - -## Required Git Workflow - -- Any non-trivial code or behavior change must end with a commit and PR. -- Use a feature branch for non-trivial work. -- Keep commits scoped and descriptive. -- If runtime cannot execute git operations, provide exact commands for operator execution. -- The local hook flow stages `alas_wrapped/config/PatrickCustom.json` on pre-commit so it is included in the same commit. -- Pre-push validates that `PatrickCustom.json` is clean to prevent accidental drift. 
diff --git a/CLAUDE.md b/CLAUDE.md index f16c7b6871..d3853717e8 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -1,231 +1,41 @@ -# CLAUDE.md - -This file is the single canonical instruction source for all agents working in this repository. -If any instruction file conflicts with this one, this file wins. - -## Entrypoint Files - -- `AGENTS.md` exists for Codex discovery and must mirror critical rules from this file. -- `GEMINI.md` exists for Gemini entry and must mirror critical rules from this file. -- `.github/copilot-instructions.md` exists for Copilot entry and must mirror critical rules from this file. - -## Non-Negotiables - -- Never modify `upstream_alas/` directly. -- Never create additional git repos or submodules inside this repo. -- Never run `git init` or `git clone` inside this repository's subdirectories. The only submodule is `upstream_alas/`. -- Treat `alas_wrapped/` as the runnable source of truth for customized ALAS behavior. -- Use deterministic tools first; LLM/vision is a permanent fallback — never phased out. -- Do not commit runtime artifacts or secrets (for example: `alas_wrapped/alas_admin_token`, screenshots, ad-hoc runtime logs). -- If a required doc for the current task has not been read, stop and ask before editing. - -## North Star - -`docs/NORTH_STAR.md` is immutable policy. All changes must align with: -- LLM IS the harness — the Master Scheduler loop, not a Python daemon. -- Deterministic tools for the fast path. -- LLM + vision as permanent fallback — for recovery, unknown states, and building new deterministic tools via manual piloting. -- Shared tool interfaces across development and production orchestration. - -## Required Reading - -Read these in order at session start: -1. `docs/NORTH_STAR.md` -2. `docs/ARCHITECTURE.md` -3. 
`docs/ROADMAP.md` - -Task-triggered required reads: -- MCP/tool implementation work: - - `docs/agent_tooling/README.md` - - `adb_vision/server.py` -- Implementation work (any phase): - - `TDD_IMPLEMENTATION_PLAN.md` -- Environment/bootstrap changes: - - `docs/dev/environment_setup.md` - - `docs/dev/memu_playbook.md` - -Fail-closed rule: -- If the task implies one of the sections above and the required file has not been read yet, stop and request confirmation before continuing. - -## Repository Model - -``` -ALAS/ [primary git repository] -├── upstream_alas/ [read-only upstream submodule] -├── alas_wrapped/ [runnable ALAS + local customizations] -│ └── tools/ [tools that import ALAS internals] -├── agent_orchestrator/ [MCP server + standalone orchestration tools] -├── docs/ [project documentation] -└── scripts/ [developer scripts] -``` - -Directory ownership: -- `upstream_alas/`: sync source only. -- `alas_wrapped/`: runtime source of truth. -- `alas_wrapped/tools/`: ALAS-internal imports only. -- `agent_orchestrator/`: standalone modern tooling and MCP. - -Placement rule: -- If code imports `module.*` or ALAS internals, place it under `alas_wrapped/tools/`. -- Otherwise place it under `agent_orchestrator/`. - -## Upstream Sync Workflow - -One-way model: -1. Update submodule: - - `git submodule update --remote -- upstream_alas` -2. Compare upstream changes. -3. Apply necessary changes into `alas_wrapped/` manually. -4. Preserve local customizations and MCP hooks. -5. Validate behavior in wrapped runtime before commit. 
- -## Runtime, Config, and Launch - -Core config files: -- `alas_wrapped/config/PatrickCustom.json` -- `alas_wrapped/config/deploy.yaml` -- `alas_wrapped/config/alas.json` - -Known environment values for this setup: -- `Alas.Emulator.Serial`: `127.0.0.1:21513` -- `Alas.EmulatorInfo.Emulator`: `MEmuPlayer` - -## Emulator Environment (MEmu) - -**Prerequisite:** The MEmu Multiple Instance Manager (`MEmuConsole.exe`) is always running with admin permissions. This is the control plane for all emulator instances. - -### Key Commands (memuc.exe) - -The `memuc` CLI at `C:\Program Files\Microvirt\MEmu\memuc.exe` controls emulator instances: - -```bash -# Start/stop VMs -memuc start -n MEmu # Start by name -memuc start -i 0 # Start by index -memuc stop -n MEmu # Stop by name -memuc stopall # Stop all VMs - -# Check status -memuc listvms # List all VMs with index, status, PID -memuc listvms --running # List only running VMs -memuc isvmrunning -n MEmu # Check specific VM - -# VM lifecycle -memuc reboot -i 0 # Reboot VM -memuc clone -i 0 # Clone a VM -``` - -### Common Workflow - -1. **MEmuConsole.exe is already running** (admin) - the user ensures this -2. Use `memuc` commands to start/stop emulator instances as needed -3. Once started, ADB connects via `127.0.0.1:21513` -4. MCP server ADB tools (`adb_screenshot`, `adb_tap`, etc.) 
can then interact with the emulator - -### Python Integration - -Direct subprocess calls are sufficient - no special libraries required: - -```python -import subprocess - -# Start emulator -subprocess.run(["memuc", "start", "-n", "MEmu"], check=True) - -# Check status -result = subprocess.run(["memuc", "isvmrunning", "-n", "MEmu"], - capture_output=True, text=True) -``` - -Wrapped runtime setup: -```bash -cd alas_wrapped -uv venv --python=3.9 .venv -uv pip install --python .venv/Scripts/python.exe -r requirements.txt --overrides overrides.txt -``` - -Launch wrapped bot (preferred explicit path): -```bash -cd alas_wrapped -PYTHONIOENCODING=utf-8 .venv/Scripts/python.exe gui.py --run PatrickCustom -``` - -Convenience launch: -```bash -cd alas_wrapped -alas.bat -``` - -MCP server: -```bash -cd adb_vision -uv run server.py --serial 127.0.0.1:21513 -``` - -Log parser: -```bash -python agent_orchestrator/log_parser.py ../alas_wrapped/log/*.txt -``` - -## MCP Tool Surface - -Canonical callable names in `adb_vision/server.py`: -- `adb_screenshot` -- `adb_tap` -- `adb_swipe` -- `adb_keyevent` -- `adb_launch_game` -- `adb_get_focus` - -Notes: -- Some docs may use dotted labels (for example `adb.screenshot`), but callable names here are underscore-based. -- `adb_screenshot` returns image content with `mimeType: image/png` and base64 payload. - -## Tool Contract for New Tools - -Return state in this envelope: -```python -{ - "success": bool, - "data": object | None, - "error": str | None, - "observed_state": str | None, - "expected_state": str -} -``` - -## Change Discipline - -- Keep changes focused and local to the request. -- Update docs when behavior changes: - - `CHANGELOG.md` for user-visible behavior changes. - - `docs/agent_tooling/README.md` for new or changed tools. - - `docs/ARCHITECTURE.md` for architecture changes. - - `docs/monorepo/MONOREPO_SYNC_NOTES.md` for process changes. - - `docs/ROADMAP.md` when milestone status changes. 
- -Workflow modes: -- Upstream sync mode: `upstream_alas/` -> `alas_wrapped/`. -- Experimental mode: validate in scratch, then port minimal working changes back to `alas_wrapped/`. - -## Required Git Workflow - -- Any non-trivial code or behavior change must end with a commit and PR. -- Use a feature branch for non-trivial work. -- Keep commits scoped and descriptive. -- Open or update a PR with summary, rationale, and affected areas. -- If changelog/docs are impacted, include them in the same PR. -- If the current runtime cannot execute git operations, provide exact commands for the human operator to run. - -## Cross References - -- `docs/NORTH_STAR.md` -- `docs/ARCHITECTURE.md` -- `docs/ROADMAP.md` -- `docs/agent_tooling/README.md` -- `docs/monorepo/MONOREPO_SYNC_NOTES.md` -- `docs/state_machine/README.md` -- `docs/dev/environment_setup.md` -- `docs/DOCUMENTATION_GOVERNANCE.md` -- `agent_orchestrator/alas_mcp_server.py` -- `agent_orchestrator/log_parser.py` +# ALAS Agent Source of Truth + +This file (`CLAUDE.md`) is the single source of truth for all LLM agents (Claude, Gemini, etc.) operating in this workspace. + +## 1. System Intent & Permanent Loop +- **One permanent loop** for both build-time and autonomous runtime. +- **MasterLoop Architecture**: The Python scheduler drives the execution loop, picking the highest priority task and executing. +- **Deterministic tools** are the preferred hot path. +- **Vision/manual mode** is the permanent fallback and tool-growth engine. +- **Error recovery** always includes a restart fallback to a known-good state. +- **LangGraph is NOT USED** for the main execution pipeline. + +## 2. The Two-Tier Model (Single Harness) +All execution happens under a single MCP tool surface. + +**Tier 1 (Hot Path)**: +- Deterministic, element-powered tools. +- Must use modern element detection (MaaFramework/MaaMCP or uiautomator2 + OCR). +- **NO raw coordinates** and **NO ALAS templates**. 
+- Tool Output Contract strictly enforced: `{success, data, error, observed_state, expected_state}`. + +**Tier 2 (Recovery/Manual Pilot)**: +- VLM screenshot reasoning + manual raw ADB actions. +- Used when unknown screens are hit or deterministic tools fail. +- Captures blueprints of manual actions to be converted to new Tier 1 deterministic tools. + +## 3. Persistent State & The Scheduler +- The scheduler queue (`PatrickCustom.json`) determines the next task based on urgency scoring, ad-hoc inserts, and deadlines. +- **State representation**: We track persistent state. For example, hot OCR of resources (oil/gem/coin/timers) are exposed as queryable MCP resources. +- Loop executes: Get Next Task -> Deterministic Tool exists? -> Execute -> Update State -> Repeat. + +## 4. Extract, Don't Wrap (The ALAS Legacy Code Rule) +- The legacy ALAS state machine and page graph are **gold** — but as reference only! +- Extract the "what" (every screen, task flow, decision, edge case) from the ALAS codebase into clean, standalone, implementation-agnostic requirements. +- **Do not wrap** old ALAS logic in new tools. Rebuild the tools from the semantic ground up using modern element detection. +- We will eventually have a complete task list covering everything from ALAS. + +## 5. Development & Testing Axioms +- **Element Detection Layer First**: Build the element detection and OCR capabilities before building complex flows. +- **Zero Hallucination**: TDD + live emulator testing. Every feature must be tested directly against the LDPlayer (or MEmu) emulator as the ground truth. +- **Red-Green-Refactor**: Always write a failing unit/live test first before implementing new deterministic tool capabilities. diff --git a/DOC_GUIDE.md b/DOC_GUIDE.md new file mode 100644 index 0000000000..e772973c05 --- /dev/null +++ b/DOC_GUIDE.md @@ -0,0 +1,28 @@ +# Documentation Rules & Guide + +## Source of Truth Hierarchy + +1. 
`CLAUDE.md` + - The primary file for agent context, rules, and fundamental architectural axioms. It supersedes older files (`AGENTS.md`, `GEMINI.md`). +2. `docs/NORTH_STAR.md` + - The non-negotiable end-state definition for the project. Surgical edits only as we approach the final destination. +3. `docs/plans/program_execution_plan.md` + - The master index for the development phases. Outlines what phase we are in, and what remains. +4. `docs/plans/phase_X_*.md` + - Detailed execution plans for a specific project phase (e.g., Phase 2). Used as the day-to-day checklist. + +## Naming & Organization Rules + +### Tool Definitions +- Tools are placed in `adb_vision/tools/`. +- Must export a `name`, `description`, `preconditions`, `expected_state`, `timeout_s`, and `retry_budget`. +- Adhere strictly to the `alas.*` or descriptive namespace for MCP exposition. + +### Deprecated Docs +- All retired documentation must be completely removed from the repository or moved cleanly to `docs/archive/`. Do not leave them in `docs/plans/` as they pollute LLM context. +- We do NOT use `llm_driven_gameplay` or pure `LangGraph` planning documents anymore. + +### Documentation Style +- **Status Badges**: Use `Status: Active / Draft / Archived` at the top of planning documents. +- **Clarity**: Write clearly, declaring the specific tools and paradigms to be used (e.g., "MaaFramework", "uiautomator2"). Avoid broad vague language. +- **Diagrams**: Use Mermaid `.md` diagrams to visualize state machines or logic when it becomes sufficiently complex. diff --git a/FORK_CHANGES.md b/FORK_CHANGES.md deleted file mode 100644 index 0e47380f4d..0000000000 --- a/FORK_CHANGES.md +++ /dev/null @@ -1,194 +0,0 @@ -# Alas-with-Dashboard Fork Changes - -This document tracks Dashboard-specific modifications that differ from the upstream [LmeSzinc/AzurLaneAutoScript](https://github.com/LmeSzinc/AzurLaneAutoScript) repository. This helps with future upstream syncs and merge conflict resolution. 
- -## Overview - -**Fork**: Zuosizhu/Alas-with-Dashboard -**Upstream**: LmeSzinc/AzurLaneAutoScript -**Last Sync**: 2026-01-25 (upstream commit: 252e086db) - -## Major Dashboard-Specific Features - -### 1. Resource Logging System (LogRes) - -**Location**: `module/log_res/` - -**Purpose**: Tracks game resources (coins, oil, PT, etc.) with timestamps for Dashboard UI display. - -**Files Added**: -- `module/log_res/log_res.py` - Core logging implementation -- Related config in Dashboard settings - -**Integration Points** (9 files): -- `module/coalition/coalition.py` - PT logging -- `module/campaign/campaign_status.py` - Campaign resource tracking -- `module/gacha/gacha_reward.py` - Gacha rewards -- `module/os_handler/action_point.py` - Operation Siren AP -- `module/os_handler/os_status.py` - OS status tracking -- `module/raid/raid.py` - Raid resources -- `module/shop/shop_status.py` - Shop inventory -- `module/webui/app.py` - Dashboard UI backend - -**Merge Considerations**: -- When upstream changes resource detection code, ensure LogRes calls are preserved -- LogRes typically appears as: `LogRes(self.config). = value` -- Always followed by `self.config.update()` to persist changes - -### 2. 
Dashboard Web UI - -**Location**: `module/webui/` (enhanced from upstream) - -**Changes**: -- Additional endpoints for resource history -- LogRes integration for real-time tracking -- Enhanced status displays - -**Merge Considerations**: -- Upstream may modify `module/webui/app.py` - ensure Dashboard-specific routes preserved -- Check `module/webui/patch.py` for Dashboard customizations - -## Known Merge Conflict Patterns - -### Pattern 1: Coalition PT Reading - -**Typical Conflict**: `module/coalition/coalition.py` in `get_event_pt()` - -**Dashboard Version**: -```python -pt = ocr.ocr(self.device.image) -LogRes(self.config).Pt = pt -self.config.update() -return pt -``` - -**Resolution Strategy**: -- Preserve LogRes functionality within upstream's improved logic -- Example (2026-01-25 merge): - ```python - for _ in self.loop(timeout=1.5): - pt = ocr.ocr(self.device.image) - if pt not in [999999]: - LogRes(self.config).Pt = pt # Dashboard addition - self.config.update() # Dashboard addition - break - ``` - -### Pattern 2: Import Statements - -**Typical Conflict**: `from module.log_res.log_res import LogRes` - -**Resolution Strategy**: -- Dashboard needs: `from module.log_res.log_res import LogRes` -- Upstream may add: other imports -- Keep both, clean up formatting (remove double spaces) - -### Pattern 3: Config Updates - -**Typical Conflict**: Additional `self.config.update()` calls - -**Resolution Strategy**: -- Dashboard calls `config.update()` after LogRes to persist -- Upstream may batch updates differently -- Preserve Dashboard's update pattern for LogRes-modified values - -## Dashboard-Specific Branches - -- `master_Dashboard` - Main Dashboard development branch -- `master_lme` - Tracking branch for upstream changes -- `master` - Integration branch (merges from both) - -## Upstream Sync Process - -1. **Fetch upstream**: `git fetch upstream` -2. **Check commits**: `git log master..upstream/master` -3. 
**Create sync branch**: `git checkout -b sync-upstream-YYYY-MM` -4. **Merge**: `git merge upstream/master` -5. **Resolve conflicts**: Focus on LogRes preservation -6. **Test**: Run Dashboard with new changes -7. **Create PR**: To Zuosizhu/Alas-with-Dashboard - -## Testing Checklist for Upstream Syncs - -### LogRes Integration -- [ ] Coalition PT tracking works -- [ ] Campaign resource logging works -- [ ] OS AP tracking works -- [ ] Raid PT tracking works -- [ ] Dashboard UI displays resource history - -### Functional Areas -- [ ] Coalition events (test current event) -- [ ] Campaign runs -- [ ] Operation Siren -- [ ] Raid -- [ ] Shop purchases -- [ ] Gacha pulls - -### Dashboard UI -- [ ] Web interface loads -- [ ] Resource graphs display -- [ ] Real-time updates work -- [ ] Historical data preserved - -## Common Pitfalls - -1. **Don't remove LogRes imports**: Even if they look "unused" to IDE -2. **Don't remove config.update() calls**: Dashboard needs immediate persistence -3. **Check double spaces**: Dashboard originally had `from module.log_res` (two spaces) -4. 
**Verify Dashboard config files**: Upstream doesn't have Dashboard-specific config sections - -## File Change Summary - -### Files Only in Dashboard Fork -- `module/log_res/` (entire directory) -- Dashboard-specific config sections -- Additional web UI templates/routes - -### Files Modified from Upstream -- `module/coalition/coalition.py` (+LogRes) -- `module/campaign/campaign_status.py` (+LogRes) -- `module/gacha/gacha_reward.py` (+LogRes) -- `module/os_handler/action_point.py` (+LogRes) -- `module/os_handler/os_status.py` (+LogRes) -- `module/raid/raid.py` (+LogRes) -- `module/shop/shop_status.py` (+LogRes) -- `module/webui/app.py` (+LogRes integration) - -### Files Identical to Upstream -- Most other files follow upstream exactly - -## Future Considerations - -### Upstreaming Dashboard Features -If LogRes proves valuable, consider proposing it to upstream: -- Generic resource tracking framework -- Plugin architecture for optional logging -- Dashboard as optional component - -### Reducing Merge Conflicts -- Keep Dashboard changes minimal and focused -- Use inheritance/composition over modification where possible -- Document all deviations from upstream - -## References - -- **Upstream Repository**: https://github.com/LmeSzinc/AzurLaneAutoScript -- **Fork Repository**: https://github.com/Zuosizhu/Alas-with-Dashboard -- **Latest Sync PR**: https://github.com/Zuosizhu/Alas-with-Dashboard/pull/22 - -## Changelog - -### 2026-01-25: Sync with upstream (19 commits) -- Merged upstream commits up to 252e086db -- Added Fashion event (coalition_20260122) and DAL event (coalition_20251120) -- Integrated LogRes with upstream's improved PT reading timeout loop -- Resolved conflict in `module/coalition/coalition.py` -- Operation Siren refactored into 12 task modules -- QUIT_RECONFIRM assets reorganized - ---- - -*Last Updated*: 2026-01-25 -*Maintainer*: Zuosizhu -*Sync Status*: 19 commits behind upstream as of 2026-01-25 diff --git a/GEMINI.md b/GEMINI.md deleted file mode 
100644 index d82e68ee23..0000000000 --- a/GEMINI.md +++ /dev/null @@ -1,168 +0,0 @@ - - -# AGENTS.md - -`AGENTS.md` is the canonical instruction source for agent behavior in this repository. -If any other entrypoint file conflicts with `AGENTS.md`, `AGENTS.md` wins. - -Derived entrypoint files: -- `CLAUDE.md` is generated from this file. -- `GEMINI.md` is generated from this file. -- `.github/copilot-instructions.md` may summarize this file for Copilot-specific discovery. - -## Project Purpose - -This repo transitions ALAS from a legacy script application into an LLM-augmented automation system. -The long-term goal is deterministic tool-first automation with LLM/vision used only for recovery. - -See: -- `docs/NORTH_STAR.md` -- `docs/ARCHITECTURE.md` -- `docs/ROADMAP.md` - -## Non-Negotiables - -- Never modify `upstream_alas/` directly for feature work. -- Never create additional git repos or submodules inside this repo. -- Treat `alas_wrapped/` as the runnable source of truth for customized behavior. -- Use deterministic tools first; use LLM/vision only for recovery and unexpected states. -- Do not commit runtime artifacts or secrets (for example: screenshots, ad-hoc runtime logs, local tokens). -- Keep `alas_wrapped/config/PatrickCustom.json` under version control and let hooks keep it in commit flow. -- If required docs for the current task have not been read, stop and read them before editing. - -## Required Reading - -Read these at session start: -1. `docs/NORTH_STAR.md` -2. `docs/ARCHITECTURE.md` -3. 
`docs/ROADMAP.md` - -Task-triggered required reads: -- MCP/tooling work: - - `docs/agent_tooling/README.md` - - `agent_orchestrator/alas_mcp_server.py` -- State machine or navigation behavior: - - `docs/state_machine/README.md` -- Environment/bootstrap changes: - - `docs/dev/environment_setup.md` -- Recovery agent or durable execution work: - - `docs/plans/recovery_agent_architecture.md` - - `docs/plans/durable_agent_architecture_design.md` -- Local VLM or vision integration work: - - `docs/plans/local_vlm_setup.md` -- State machine visualization work: - - `docs/plans/interactive_state_viz_plan.md` - - `docs/state_machine/STATE_MACHINE_VISUALIZATION.md` - -## Repository Model - -``` -ALAS/ [primary git repository] -├── upstream_alas/ [upstream sync submodule] -├── alas_wrapped/ [runnable ALAS + local customizations] -│ └── tools/ [tools that import ALAS internals] -├── agent_orchestrator/ [MCP server + standalone orchestration tools] -├── docs/ [project documentation] -└── .githooks/ [repo-tracked git hooks] -``` - -Directory ownership: -- `upstream_alas/`: sync source only. -- `alas_wrapped/`: runtime source of truth. -- `alas_wrapped/tools/`: ALAS-internal imports only. -- `agent_orchestrator/`: standalone modern tooling and MCP. - -Placement rule: -- If code imports `module.*` or ALAS internals, place it under `alas_wrapped/tools/`. -- Otherwise place it under `agent_orchestrator/`. - -## Upstream Sync Workflow - -One-way model: -1. Update submodule: - - `git submodule update --remote -- upstream_alas` -2. Compare upstream changes. -3. Apply needed changes manually into `alas_wrapped/`. -4. Preserve local customizations and MCP hooks. -5. Validate wrapped behavior before commit. 
- -## Runtime, Logs, and Launch - -Core configs: -- `alas_wrapped/config/PatrickCustom.json` -- `alas_wrapped/config/deploy.yaml` -- `alas_wrapped/config/alas.json` - -Known local environment: -- `Alas.Emulator.Serial`: `127.0.0.1:21503` -- `Alas.EmulatorInfo.Emulator`: `MEmuPlayer` - -Launch wrapped bot: -```bash -cd alas_wrapped -PYTHONIOENCODING=utf-8 .venv/Scripts/python.exe gui.py --run PatrickCustom -``` - -MCP server purpose: -- Exposes deterministic ALAS/ADB tools via MCP for supervisor agents. -- Avoids runtime startup penalties by keeping ALAS loaded in a persistent process. - -Run MCP server: -```bash -cd agent_orchestrator -uv run alas_mcp_server.py --config alas -``` - -Log parser purpose: -- Fast forensic analysis of task timelines, warnings/errors, and tracebacks. -- Preferred over manual full-log scanning for debugging regressions. - -Recommended parser usage: -```bash -python3 agent_orchestrator/log_parser.py alas_wrapped/log/YYYY-MM-DD_PatrickCustom.txt --timeline --errors --trace -python3 agent_orchestrator/log_parser.py alas_wrapped/log/YYYY-MM-DD_PatrickCustom.txt --summary -``` - -## MCP Tool Surface - -Canonical callable names in `agent_orchestrator/alas_mcp_server.py`: -- `adb_screenshot` -- `adb_tap` -- `adb_swipe` -- `alas_get_current_state` -- `alas_goto` -- `alas_list_tools` -- `alas_call_tool` -- `alas_login_ensure_main` - -## Tool Contract For New Tools - -Return state using: -```python -{ - "success": bool, - "data": object | None, - "error": str | None, - "observed_state": str | None, - "expected_state": str -} -``` - -## Change Discipline - -- Keep changes focused and local to the request. -- Update docs when behavior changes: - - `CHANGELOG.md` for user-visible behavior changes. - - `docs/agent_tooling/README.md` for tool changes. - - `docs/ARCHITECTURE.md` for architecture changes. - - `docs/monorepo/MONOREPO_SYNC_NOTES.md` for process changes. - - `docs/ROADMAP.md` when milestone status changes. 
- -## Required Git Workflow - -- Any non-trivial code or behavior change must end with a commit and PR. -- Use a feature branch for non-trivial work. -- Keep commits scoped and descriptive. -- If runtime cannot execute git operations, provide exact commands for operator execution. -- The local hook flow stages `alas_wrapped/config/PatrickCustom.json` on pre-commit so it is included in the same commit. -- Pre-push validates that `PatrickCustom.json` is clean to prevent accidental drift. diff --git a/PROGRESS_ASSESSMENT.md b/PROGRESS_ASSESSMENT.md deleted file mode 100644 index b2d32ebe08..0000000000 --- a/PROGRESS_ASSESSMENT.md +++ /dev/null @@ -1,320 +0,0 @@ -# Progress Assessment: Current Codebase vs TDD Plan - -**Assessment Date:** Current session -**Goal:** Map existing `agent_orchestrator/` and `alas_wrapped/tools/` against the 6-phase TDD plan - ---- - -## Summary: Where We Are - -| Phase | Plan Status | Actual Status | Completion | -|-------|-------------|---------------|------------| -| **0** | 20+ live tests, standalone MCP | Hybrid MCP (ALAS imports), 1 smoke test | ~30% | -| **1** | Master scheduler + loop | `llm_pilot.py` exists but ALAS-dependent | ~40% | -| **2** | 3 deterministic tools | Navigation tools exist but ALAS-imported | ~20% | -| **3** | Blueprint system | Not started | 0% | -| **4** | Error handling + restart | Partial (recovery in llm_pilot) | ~30% | -| **5** | Autonomous mode | `llm_pilot.py` runs but not standalone | ~25% | - -**Overall:** ~25-30% toward TDD plan goals. Major gap: **Standalone MCP server** (no ALAS imports). 
- ---- - -## Phase 0: Foundation - Detailed Assessment - -### ✅ What's Working (Keep) - -| Component | Location | Status | Notes | -|-----------|----------|--------|-------| -| ADB screenshot | `alas_mcp_server.py:adb_screenshot()` | ✅ Working | Returns base64 PNG | -| ADB tap/swipe | `alas_mcp_server.py:adb_tap/swipe()` | ✅ Working | Uses ADB CLI | -| ADB launch game | `alas_mcp_server.py:adb_launch_game()` | ✅ Working | Activity manager | -| ADB get focus | `alas_mcp_server.py:adb_get_focus()` | ✅ Working | Window/package | -| MCP action log | `alas_mcp_server.py:_action_log()` | ✅ Working | JSONL logging | -| Screenshot save | `alas_mcp_server.py:_save_screenshot_png()` | ✅ Working | PNG files | -| ADB path detection | `alas_mcp_server.py:_find_adb()` | ✅ Working | Multiple candidates | -| Live smoke test | `smoke_test_live.py` | ✅ Working | Manual run, not pytest | - -### ❌ What's Missing (Build) - -| Required | Status | Blocker | -|----------|--------|---------| -| **Standalone MCP** (no ALAS imports) | ❌ Missing | Current has `sys.path.insert(0, alas_wrapped)` | -| **MEmu launch via memuc** | ❌ Missing | No memuc CLI wrapper | -| **MaaTouch socket input** | ❌ Missing | Uses slow ADB tap (200-300ms) | -| **DroidCast screenshot** | ❌ Missing | Uses uiautomator2 (may have black screen issues) | -| **VLM integration** | ❌ Missing | No Gemini Flash caller | -| **State cache** | ❌ Missing | No persistent resource cache | -| **Strict contract enforcement** | ⚠️ Partial | Some tools return strings, not `{success, data, error, observed_state, expected_state}` | -| **pytest live test suite** | ❌ Missing | `smoke_test_live.py` is standalone script, not pytest | - -### 🔧 Refactor Required - -```python -# Current (Phase 0 gap): -sys.path.insert(0, alas_wrapped) # ❌ ALAS import dependency -from alas import AzurLaneAutoScript - -# Target (TDD Phase 0): -# No sys.path hacks -# Direct ADB/MaaTouch/DroidCast -# Standalone MCP server -``` - ---- - -## Phase 1: Master Scheduler 
+ Loop - Detailed Assessment - -### ✅ What's Working (Keep) - -| Component | Location | Status | Notes | -|-----------|----------|--------|-------| -| LLM pilot loop | `llm_pilot.py` | ✅ Working | Replaces ALAS scheduler | -| PID management | `llm_pilot.py` | ✅ Working | Kill/restart/guard | -| Logging | `llm_pilot.py` | ✅ Working | Timestamped logs | -| Process lifecycle | `llm_pilot.py` | ✅ Working | `--kill`, `--restart` | -| Tool cooldown | `llm_pilot.py` | ✅ Working | Runs tools on schedule | -| Recovery to main | `llm_pilot.py` | ✅ Working | Error → page_main | - -### ❌ What's Missing (Build) - -| Required | Status | Blocker | -|----------|--------|---------| -| **PatrickCustom.json loader** | ⚠️ Partial | Can read but not as formal scheduler | -| **get_next_action() decision** | ⚠️ Partial | Hardcoded in llm_pilot, not pluggable | -| **Task queue** | ❌ Missing | No formal priority queue | -| **Due-time calculation** | ❌ Partial | Uses ALAS `NextRun` parsing | -| **Loop without ALAS** | ❌ Missing | `llm_pilot.py` imports from `alas_wrapped` | -| **State cache updates** | ❌ Missing | No persistent cache queried | - -### Current Loop Flow (vs Target) - -```python -# Current (llm_pilot.py): -while running: - for tool in available_tools: # Hardcoded list - if cooldown_expired(tool): - try: - result = run_tool(tool) # Calls ALAS internals - except: - recover_to_main() # Uses ALAS state machine - -# Target (TDD Phase 1): -while running: - task = scheduler.get_next_action() # From PatrickCustom + dynamic - if task: - success = try_deterministic(task) # No ALAS - if not success: - enter_manual_piloting(task) # VLM fallback - update_state_cache() # Persistent -``` - ---- - -## Phase 2: Deterministic Tool Framework - Detailed Assessment - -### ✅ What's Working (Keep) - -| Component | Location | Status | Notes | -|-----------|----------|--------|-------| -| Navigation tools | `alas_wrapped/tools/navigation.py` | ✅ Working | `goto()`, `get_current_state()` | -| Login 
tools | `alas_wrapped/tools/login.py` | ✅ Working | `app_login()`, `ensure_main()` | -| Vision tools | `alas_wrapped/tools/vision.py` | ✅ Working | `find_button()`, `wait_for()` | -| State graph audit | `alas_wrapped/tools/state_graph_audit.py` | ✅ Working | CLI tool for validation | -| Tool registry | `alas_mcp_server.py:alas_list_tools/call_tool()` | ✅ Working | Dynamic tool discovery | -| Deterministic tools | `module/state_machine.py` | ✅ Working | `workflow.daily_base_sweep` | - -### ❌ What's Missing (Build) - -| Required | Status | Blocker | -|----------|--------|---------| -| **Standalone tool framework** | ❌ Missing | Tools import `from module.*` | -| **Tool contract enforcement** | ⚠️ Partial | Returns vary (str, dict, None) | -| **Commission tool** | ⚠️ Partial | Exists in ALAS, not standalone | -| **Resource check tool** | ⚠️ Partial | OCR exists but ALAS-dependent | -| **Live TDD tests per tool** | ❌ Missing | Tests mock ALAS | -| **Auto-discovery** | ⚠️ Partial | Manual registration in MCP | - -### Tool Dependency Analysis - -```python -# Current tool (ALAS-dependent): -# alas_wrapped/tools/navigation.py -from module.ui.page import Page # ❌ ALAS import -from alas_mcp_server import ALASContext # ❌ ALAS context - -def goto(page_name): - page = Page.all_pages[page_name] # ❌ ALAS state machine - ctx._state_machine.transition(page) # ❌ ALAS internal - -# Target tool (standalone): -# azurlane_agent/tools/goto_main.py -# No ALAS imports -# Uses element detection (OCR/template) or VLM fallback -``` - ---- - -## Phase 3: Manual Piloting + Blueprint - Detailed Assessment - -### ✅ What's Working (Keep) - -| Component | Location | Status | Notes | -|-----------|----------|--------|-------| -| Live navigation test | `live_nav.py` | ✅ Working | Manual script for testing | -| Screenshot capture | `smoke_test_live.py` | ✅ Working | Can get images | -| State transition | `live_nav.py` | ✅ Working | Uses ALAS state machine | - -### ❌ What's Missing (Build) - -| Required 
| Status | Blocker | -|----------|--------|---------| -| **VLM piloting loop** | ❌ Missing | No screenshot → VLM → action loop | -| **Blueprint generator** | ❌ Missing | No session recording → blueprint | -| **Blueprint format** | ❌ Missing | No JSON schema for blueprints | -| **Blueprint → tool converter** | ❌ Missing | No automated stub generation | -| **Manual approval UI** | ❌ Missing | No per-action approval in loop | - -### Blueprint System Gap - -```python -# Target (TDD Phase 3): -blueprint = { - "goal": "collect_daily_rewards", - "steps": [ - {"screenshot_hash": "abc123", "action": "tap(100,200)", - "observed_state": "rewards_button_visible"}, - # ... - ] -} -# Convert to deterministic tool stub -``` - ---- - -## Phase 4: Error Handling + Restart - Detailed Assessment - -### ✅ What's Working (Keep) - -| Component | Location | Status | Notes | -|-----------|----------|--------|-------| -| Error recovery | `llm_pilot.py` | ✅ Working | Exception → recover_to_main() | -| Process restart | `llm_pilot.py` | ✅ Working | `--restart` flag | -| MEmu serial | `alas_mcp_server.py` | ✅ Working | `127.0.0.1:21513` configured | - -### ❌ What's Missing (Build) - -| Required | Status | Blocker | -|----------|--------|---------| -| **VLM diagnosis** | ❌ Missing | No screenshot → VLM → diagnosis | -| **Recovery attempt cascade** | ⚠️ Partial | Only recovers to main, no retry | -| **MEmu restart via memuc** | ❌ Missing | No memuc stop/start wrapper | -| **Clean main verification** | ❌ Missing | No "at main menu" check post-restart | -| **Error test suite** | ❌ Missing | No deliberate failure tests | - ---- - -## Phase 5: Autonomous Mode - Detailed Assessment - -### ✅ What's Working (Keep) - -| Component | Location | Status | Notes | -|-----------|----------|--------|-------| -| Pilot runs overnight | `llm_pilot.py` | ✅ Working | Can run for hours | -| Tool execution | `llm_pilot.py` | ✅ Working | Calls tools on schedule | -| PID file | `llm_pilot.py` | ✅ Working | Prevents 
multiple instances | - -### ❌ What's Missing (Build) - -| Required | Status | Blocker | -|----------|--------|---------| -| **Same MCP for dev/prod** | ❌ Missing | Dev uses ALAS-imported MCP | -| **Dashboard queries** | ❌ Missing | No resource/page/task query tools | -| **24-hour stability** | ⚠️ Unknown | Not tested | -| **Autonomous start command** | ⚠️ Partial | `llm_pilot.py` exists but not standalone | -| **Structured logging** | ⚠️ Partial | Logs exist but not machine-parseable | - ---- - -## Critical Path to TDD Plan - -### Blocker #1: Standalone MCP Server (Phase 0) -**Impact:** Blocks all phases -**Work:** Create `azurlane_agent/mcp/server.py` with no ALAS imports - -### Blocker #2: MEmu memuc Wrapper (Phase 0) -**Impact:** Blocks Phase 4 restart, Phase 0 launch tests -**Work:** Create `azurlane_agent/emulator/memuc_cli.py` - -### Blocker #3: Tool Refactor (Phase 2) -**Impact:** Blocks deterministic tool framework -**Work:** Rewrite 3 tools without `from module.*` imports - -### Blocker #4: VLM Integration (Phase 3) -**Impact:** Blocks manual piloting, blueprint system -**Work:** Add Gemini Flash caller with screenshot input - ---- - -## Recommended Immediate Actions - -### Option A: Big Bang (High Risk) -1. Create new `azurlane_agent/` repo -2. Port working pieces from `agent_orchestrator/` -3. Rewrite everything standalone -4. **Time:** 3-4 weeks full-time - -### Option B: Incremental Refactor (Lower Risk) -1. Keep `agent_orchestrator/` -2. Create `agent_orchestrator/standalone/` subpackage -3. Migrate tool by tool -4. Gradually replace ALAS imports -5. **Time:** 6-8 weeks part-time - -### Option C: Hybrid (Recommended) -1. **Week 1:** Standalone MCP server scaffold (Phase 0) -2. **Week 2:** MEmu memuc wrapper + live tests (Phase 0) -3. **Week 3:** Scheduler refactor (Phase 1) -4. **Week 4-5:** First 3 deterministic tools (Phase 2) -5. **Week 6:** VLM + manual piloting (Phase 3) -6. **Week 7:** Error handling + restart (Phase 4) -7. 
**Week 8:** Autonomous polish (Phase 5) - ---- - -## Test Inventory: What Exists vs Needed - -### Existing Tests (Non-TDD) -| Test | Location | Type | Mocks? | -|------|----------|------|--------| -| `test_integration_mcp.py` | `agent_orchestrator/` | Integration | ✅ Yes (mock ALAS) | -| `test_login_replay.py` | `agent_orchestrator/` | Replay | ✅ Yes (fixtures) | -| `test_replay_extended.py` | `agent_orchestrator/` | Replay | ✅ Yes (fixtures) | -| `test_alas_mcp.py` | `agent_orchestrator/` | Unit | ✅ Yes (mock) | -| `test_navigation.py` | `alas_wrapped/tools/` | Unit | ✅ Yes (mock ALAS) | -| `test_login.py` | `alas_wrapped/tools/` | Unit | ✅ Yes (mock ALAS) | -| `smoke_test_live.py` | `agent_orchestrator/` | Live | ❌ No (real MEmu) | - -### TDD Plan Tests (To Write) -| Phase | Tests | Type | Count | -|-------|-------|------|-------| -| 0 | `test_memu_launch`, `test_screenshot`, `test_tap_moves_ui`, `test_vlm_call`, `test_state_cache` | Live | 20+ | -| 1 | `test_scheduler_load`, `test_get_next_action`, `test_loop_three_cycles` | Live | 5+ | -| 2 | `test_tool_discovery`, `test_commissions`, `test_resources` | Live | 10+ | -| 3 | `test_manual_pilot`, `test_blueprint_gen`, `test_blueprint_convert` | Live | 5+ | -| 4 | `test_error_recovery`, `test_memu_restart` | Live | 5+ | -| 5 | `test_autonomous_start`, `test_24h_stability` | Live | 3+ | - -**Gap:** Need ~48 new live tests, 0 mocks for emulator behavior. - ---- - -## Conclusion - -**Current State:** ~25-30% of TDD plan complete. Working foundation exists but tightly coupled to ALAS. - -**Biggest Risk:** Standalone MCP server requires breaking ALAS dependency, which touches everything. - -**Biggest Opportunity:** `llm_pilot.py` proves the loop concept works. Just needs refactor to standalone. - -**Next Decision:** Choose Option A (big bang), B (incremental), or C (hybrid) and start Phase 0-T1. 
diff --git a/adb_vision/loop/master_loop.py b/adb_vision/loop/master_loop.py index 814780fe4f..82566e94a3 100644 --- a/adb_vision/loop/master_loop.py +++ b/adb_vision/loop/master_loop.py @@ -7,7 +7,9 @@ from dataclasses import dataclass from datetime import datetime +import sys from typing import Any, Callable +from typing import Optional from contracts.tool_contract import ContractViolationError, validate_contract from scheduler.decider import TaskDecider @@ -28,6 +30,11 @@ class MasterLoop: fallback_executor: Executor | None = None state_updater: StateUpdater = cache.update + # Toggle if desired so existing callers that only pass a direct executor + # still work unchanged. The loop will always attempt direct command execution + # if no matching tool exists in the registry. + use_tool_registry: bool = True + def _persist_cycle(self, cycle_result: dict[str, Any]) -> None: self.state_updater("scheduler.last_cycle", cycle_result) @@ -54,9 +61,7 @@ def run_cycles(self, count: int = 3) -> list[dict[str, Any]]: deterministic_error: str | None = None deterministic_result: dict[str, Any] | None = None try: - deterministic_result = validate_contract( - self.executor(decision.task.command) - ) + deterministic_result = validate_contract(self._run_deterministic(decision.task.command)) except ContractViolationError: # Contract bugs are developer errors and should fail fast. raise @@ -137,3 +142,41 @@ def run_cycles(self, count: int = 3) -> list[dict[str, Any]]: self._persist_cycle(cycle_result) return results + + def _resolve_tool(self, command: str) -> Optional[Callable[[], dict[str, Any]]]: + """Resolve command to a registered deterministic tool if available.""" + if not self.use_tool_registry: + return None + + # Keep strict command names first-class; future command formats should be + # normalized before registration if needed. 
+ candidate = command.strip() + + for module_name in ("adb_vision.tools.registry", "tools.registry"): + module = sys.modules.get(module_name) + if module is None: + continue + get_tool = getattr(module, "get_tool", None) + if callable(get_tool): + tool = get_tool(candidate) + if tool is not None: + return tool + + for module_name in ("adb_vision.tools.registry", "tools.registry"): + try: + module = __import__(module_name, fromlist=["get_tool"]) + except Exception: + continue + get_tool = getattr(module, "get_tool", None) + if callable(get_tool): + tool = get_tool(candidate) + if tool is not None: + return tool + return None + + def _run_deterministic(self, command: str) -> dict[str, Any]: + """Execute deterministic path (registry-first, then direct executor).""" + tool = self._resolve_tool(command) + if tool is not None: + return tool() + return self.executor(command) diff --git a/adb_vision/test_live_phase2_tools.py b/adb_vision/test_live_phase2_tools.py new file mode 100644 index 0000000000..ea379bc047 --- /dev/null +++ b/adb_vision/test_live_phase2_tools.py @@ -0,0 +1,76 @@ +""" +Live TDD Harness for Phase 2 Deterministic Tools. + +To run: `uv run pytest test_live_phase2_tools.py -v -s` +Requires: An active LDPlayer/MEmu running Azur Lane. +""" + +import asyncio +import os + +import pytest + +# We import the dynamically compiled registry rather than the tools directly +# to ensure the framework's strict decorators are applied correctly. 
+from adb_vision.tools import registry + +# Try to use the same MEmu fixture we built in Phase 0 +try: + from adb_vision.test_live import _check_emulator_reachable +except Exception: + _check_emulator_reachable = None + + +def _has_live_emulator() -> bool: + if os.environ.get("SKIP_LIVE_TESTS", "0") == "1": + return False + if _check_emulator_reachable is None: + return False + try: + return asyncio.run(_check_emulator_reachable()) + except Exception: + return False + + +_run_memu = _has_live_emulator() + +@pytest.fixture(autouse=True) +def ensure_tools_loaded(): + """Guarantee the framework discovered our 3 tools before testing begins.""" + assert len(registry.list_registered_tools()) >= 3 + if not _run_memu: + pytest.skip("Live emulator missing. Must be connected to ADB.") + +@pytest.mark.skipif(not _run_memu, reason="Live emulator missing. Must be connected to ADB.") +def test_live_goto_main(): + """ + Ensure the MasterLoop's first tool successfully returns to the lobby + from a random location. + """ + tool_fn = registry.get_tool("goto_main_menu") + assert tool_fn is not None + + result = tool_fn() + + # TDD Expected Failure: Will fail because we have not implemented MaaFramework hooks yet! + assert result["success"] is True, f"Failed: {result['error']}" + assert result["observed_state"] == "page_main" + +@pytest.mark.skipif(not _run_memu, reason="Requires emulator with resources visible.") +def test_live_check_resources(): + """Ensure OCR bounds properly capture and post state updates.""" + tool_fn = registry.get_tool("check_resource_balances") + result = tool_fn() + + assert result["success"] is True + assert result["observed_state"] == "resources_logged" + # Future assertion: assert cache.query("oil") > 0 + +@pytest.mark.skipif(not _run_memu, reason="Requires emulator to navigate safely.") +def test_live_collect_commissions(): + """The capstone tool of Phase 2. 
Can it collect without VLM recovery?""" + tool_fn = registry.get_tool("collect_commissions") + result = tool_fn() + + assert result["success"] is True + assert result["expected_state"] == "page_commissions_empty" diff --git a/adb_vision/test_master_loop.py b/adb_vision/test_master_loop.py index 4cba5b0a49..cf1493f87c 100644 --- a/adb_vision/test_master_loop.py +++ b/adb_vision/test_master_loop.py @@ -7,19 +7,26 @@ from typing import Any import pytest +from adb_vision.tools.registry import clear_registry_for_test, deterministic_tool from contracts.tool_contract import make_result from loop.master_loop import MasterLoop from scheduler.loader import TaskLoader -def _write_due_config(fp: Path) -> None: +@pytest.fixture(autouse=True) +def reset_tool_registry(): + clear_registry_for_test() + yield + + +def _write_due_config(fp: Path, command: str = "Restart") -> None: config: dict[str, Any] = { "Restart": { "Scheduler": { "Enable": True, "NextRun": "2020-01-01 00:00:00", - "Command": "Restart", + "Command": command, "SuccessInterval": 0, "FailureInterval": 0, "ServerUpdate": "00:00", @@ -93,6 +100,62 @@ def update_state(key: str, value: Any) -> None: assert state_updates[0][0] == "scheduler.last_cycle" +def test_core_loop_prefers_registered_tool_when_command_matches(tmp_path: Path): + fp = tmp_path / "PatrickCustom.json" + _write_due_config(fp, command="registry_task") + + direct_calls: list[str] = [] + fallback_calls: list[str] = [] + + def executor(command: str) -> dict: + direct_calls.append(command) + return make_result( + success=False, + data=None, + error="direct executor should not run", + observed_state="direct_path", + expected_state="task_completed", + ) + + def fallback_executor(command: str) -> dict: + fallback_calls.append(command) + return make_result( + success=False, + data=None, + error="fallback should not run", + observed_state="fallback", + expected_state="task_completed", + ) + + @deterministic_tool( + name="registry_task", + description="registry 
tool for loop integration", + expected_state="task_completed", + ) + def registry_task(): + return make_result( + success=True, + data={"source": "registry"}, + error=None, + observed_state="registry_task", + expected_state="task_completed", + ) + + loop = MasterLoop( + loader=TaskLoader(fp), + executor=executor, + fallback_executor=fallback_executor, + ) + results = loop.run_cycles(count=1) + + assert len(results) == 1 + assert results[0]["execution_path"] == "deterministic" + assert results[0]["status"] == "executed" + assert results[0]["result"]["data"]["source"] == "registry" + assert direct_calls == [] + assert fallback_calls == [] + + def test_core_loop_deterministic_failure_calls_fallback_once(tmp_path: Path): fp = tmp_path / "PatrickCustom.json" _write_due_config(fp) diff --git a/adb_vision/test_tool_registry.py b/adb_vision/test_tool_registry.py new file mode 100644 index 0000000000..7cbd4e52c5 --- /dev/null +++ b/adb_vision/test_tool_registry.py @@ -0,0 +1,56 @@ +"""TDD Unit Tests for the Deterministic Tool Registry.""" + +import pytest +from adb_vision.tools.registry import ( + deterministic_tool, + list_registered_tools, + get_tool, + clear_registry_for_test +) +from adb_vision.contracts.tool_contract import make_result, ContractViolationError + +@pytest.fixture(autouse=True) +def wipe_registry(): + clear_registry_for_test() + yield + +def test_tool_registry_discovers_modules(): + """Test that the decorator automatically places the tool into the global registry.""" + @deterministic_tool( + name="test_discovery", + description="A test tool", + expected_state="test_ok" + ) + def dummy_tool(): + return make_result(success=True, expected_state="test_ok") + + tools = list_registered_tools() + assert len(tools) == 1 + assert tools[0].name == "test_discovery" + assert tools[0].description == "A test tool" + +def test_tool_registry_rejects_duplicate_names(): + """Test that two tools cannot share the same registry name.""" + @deterministic_tool(name="duplicate", 
description="", expected_state="state") + def first_tool(): pass + + with pytest.raises(ValueError, match="already registered"): + @deterministic_tool(name="duplicate", description="", expected_state="state") + def second_tool(): pass + +def test_tool_invocation_enforces_contract(): + """Test that the decorator correctly applies the strict contract checker.""" + @deterministic_tool( + name="bad_contract_tool", + description="I return a bad dictionary", + expected_state="any" + ) + def bad_tool(): + # Fails because success=True requires error=None + return {"success": True, "error": "This shouldn't be here", "data": None, "observed_state": None, "expected_state": "any"} + + executable = get_tool("bad_contract_tool") + assert executable is not None + + with pytest.raises(ContractViolationError, match="On success, 'error' must be None or empty"): + executable() diff --git a/adb_vision/tools/__init__.py b/adb_vision/tools/__init__.py new file mode 100644 index 0000000000..83db8feed7 --- /dev/null +++ b/adb_vision/tools/__init__.py @@ -0,0 +1,19 @@ +"""Deterministic Tool Discovery and Registration. + +Auto-imports all python modules in this directory so the registry can find them. 
+""" +import importlib +import pkgutil +import sys + +# Auto-discover tools when `adb_vision.tools` is imported +def _discover_tools(): + package_name = __name__ + package_path = sys.modules[package_name].__path__ + + for _, module_name, is_pkg in pkgutil.iter_modules(package_path): + if not is_pkg and module_name != "registry": + full_module_name = f"{package_name}.{module_name}" + importlib.import_module(full_module_name) + +_discover_tools() diff --git a/adb_vision/tools/check_resources.py b/adb_vision/tools/check_resources.py new file mode 100644 index 0000000000..d0f771e9b5 --- /dev/null +++ b/adb_vision/tools/check_resources.py @@ -0,0 +1,29 @@ +"""Tool 2: Hot-Read OCR persistence of resource coordinates.""" + +from adb_vision.tools.registry import deterministic_tool +from adb_vision.contracts.tool_contract import make_result + +@deterministic_tool( + name="check_resource_balances", + description="Captures oil, gems, coins, and action points via OCR and persists to state cache.", + expected_state="resources_logged", + timeout_s=30, + retry_budget=2 +) +def check_resource_balances() -> dict: + """ + Perform an OCR sweep of resource anchors and update the MCP queryable state. + + Implementation Requirement: + - Identify semantic boundaries for Oil, Gem, and Coin. + - Parse numeric values. + - Persist to `adb_vision.state.cache`. 
+ """ + + # [TDD Placeholder] + return make_result( + success=False, + error="check_resource_balances OCR element finding is not implemented.", + observed_state="unknown", + expected_state="resources_logged" + ) diff --git a/adb_vision/tools/collect_commissions.py b/adb_vision/tools/collect_commissions.py new file mode 100644 index 0000000000..69d4285553 --- /dev/null +++ b/adb_vision/tools/collect_commissions.py @@ -0,0 +1,31 @@ +"""Tool 3: Element-powered generic commission collection logic.""" + +from adb_vision.tools.registry import deterministic_tool +from adb_vision.contracts.tool_contract import make_result + +@deterministic_tool( + name="collect_commissions", + description="Navigate to commissions UI, collect all finished commissions, and dispatch new ones.", + expected_state="page_commissions_empty", + timeout_s=180, + retry_budget=3 +) +def collect_commissions(priority_policy: str = "default") -> dict: + """ + Deterministically completes a commission dispatch loop. + + Implementation Requirement: + - Discover "Commissions" entry point on Main Menu semantically. + - Enter Commission window. + - Find "Collect All" via Element/Text. + - Tap, wait for rewards overlay (dismiss via background click). + - Dispatch new commissions using `priority_policy` arg. 
+ """ + + # [TDD Placeholder] + return make_result( + success=False, + error="collect_commissions has not been wired to modern UI tooling.", + observed_state="unknown", + expected_state="page_commissions_empty" + ) diff --git a/adb_vision/tools/goto_main_menu.py b/adb_vision/tools/goto_main_menu.py new file mode 100644 index 0000000000..1d71f6e228 --- /dev/null +++ b/adb_vision/tools/goto_main_menu.py @@ -0,0 +1,31 @@ +"""Tool 1: Deterministic Navigation to Main Menu.""" + +from adb_vision.tools.registry import deterministic_tool +from adb_vision.contracts.tool_contract import make_result + +@deterministic_tool( + name="goto_main_menu", + description="Navigate safely to the main menu from any typical submenu or popup using element anchors.", + expected_state="page_main", + timeout_s=90, + retry_budget=5 +) +def goto_main_menu() -> dict: + """ + Executes an Element-Powered escape sequence to return to the main lobby. + + Implementation Requirement: + - Use MaaFramework or uiautomator to identify the 'Home' button or 'Back' element. + - Loop logic: Find anchor -> Tap -> Wait for page_main semantic anchors. + """ + + # [TDD Placeholder]: The implementation will eventually perform OCR/Element reading + # and tap elements until the observed state equals the expected Main Menu anchors. + + # Simulating failure to satisfy TDD "write a failing test first" rules. 
+ return make_result( + success=False, + error="goto_main_menu has not been implemented to use Element Detection.", + observed_state="unknown", + expected_state="page_main" + ) diff --git a/adb_vision/tools/registry.py b/adb_vision/tools/registry.py new file mode 100644 index 0000000000..62e0027407 --- /dev/null +++ b/adb_vision/tools/registry.py @@ -0,0 +1,69 @@ +"""Registry and decorator for Element-Powered deterministic tools.""" + +from dataclasses import dataclass +from typing import Any, Callable, Dict, Optional +import functools + +from adb_vision.contracts.tool_contract import strict_contract, make_result + +@dataclass +class ToolMetadata: + name: str + description: str + expected_state: str + timeout_s: int + retry_budget: int + +# Global registry dict +_TOOL_REGISTRY: Dict[str, tuple[Callable, ToolMetadata]] = {} + +def deterministic_tool( + name: str, + description: str, + expected_state: str, + timeout_s: int = 60, + retry_budget: int = 3 +): + """ + Registers a deterministic Tool in the MasterLoop registry. + + Enforces the `{success, data, error, observed_state, expected_state}` strict contract. 
+ """ + def decorator(fn: Callable[..., dict]) -> Callable[..., dict]: + # Wrap the function with the strict contract checker + contract_enforced_fn = strict_contract(fn) + + metadata = ToolMetadata( + name=name, + description=description, + expected_state=expected_state, + timeout_s=timeout_s, + retry_budget=retry_budget + ) + + # Register it globally immediately upon import + if name in _TOOL_REGISTRY: + raise ValueError(f"Tool with name '{name}' is already registered.") + + _TOOL_REGISTRY[name] = (contract_enforced_fn, metadata) + + @functools.wraps(fn) + def wrapper(*args, **kwargs): + return contract_enforced_fn(*args, **kwargs) + + return wrapper + + return decorator + +def list_registered_tools() -> list[ToolMetadata]: + """Returns metadata for all registered deterministic tools.""" + return [meta for _, meta in _TOOL_REGISTRY.values()] + +def get_tool(name: str) -> Optional[Callable[..., dict]]: + """Returns the executable tool function if registered.""" + record = _TOOL_REGISTRY.get(name) + return record[0] if record else None + +def clear_registry_for_test(): + """Testing helper to clear global state.""" + _TOOL_REGISTRY.clear() diff --git a/docs/IMPLEMENTATION_PLAN_NEXT_PASS.md b/docs/IMPLEMENTATION_PLAN_NEXT_PASS.md deleted file mode 100644 index 6336c7b44a..0000000000 --- a/docs/IMPLEMENTATION_PLAN_NEXT_PASS.md +++ /dev/null @@ -1,36 +0,0 @@ -# adb_vision Backend Implementation Pass 2 - -## Objective - -- Finish the backend pass with deterministic dispatch behavior for screenshot methods. -- Remove ambiguity in the `scrcpy` route so behavior is explicit and reviewable. -- Add/adjust tests for routing and size-based fallback checks. - -## Current state at pass start - -- Branch: `implement/adb-vision-backends-round2` -- Working branch switched to: `implement/adb-vision-backends-pass2` -- Implemented in prior pass: - - `adb_vision/screenshot.py`: DROIDCAST, U2, and SCRCPY-compat backends were wired. 
- - `adb_vision/test_live.py`: DroidCast skip handling updated. -- Remaining gaps: - - `scrcpy` backend still documents fallback behavior but is not a true scrcpy stream capture. - - `auto` backend order currently includes duplicated screenshot work paths. - - Unit tests still describe old stub assumptions. - -## Execution loop for this pass - -1. Patch screenshot dispatch so auto flow is explicit: - - `droidcast` first, then `u2`, then `screencap`. - - keep `scrcpy` as a compatibility path that is explicit about behavior. -2. Update `screenshot.py` comments/messages to match implementation. -3. Add/adjust unit tests: - - backend order in `auto` - - `scrcpy` compatibility behavior - - rejection of tiny/invalid images -4. Run targeted tests (unit-level) and collect a pass/fail checklist. - -## Risk notes - -- Full native scrcpy stream decoding is still not implemented in this lightweight package. -- `droidcast` and `uiautomator2` remain the practical low-latency capture paths. diff --git a/docs/MASTER_PLAN.md b/docs/MASTER_PLAN.md deleted file mode 100644 index f5587b9f7d..0000000000 --- a/docs/MASTER_PLAN.md +++ /dev/null @@ -1,110 +0,0 @@ -# MASTER PLAN — LLM-Driven ALAS Automation - -> Canonical execution plan for wrapping ALAS logic into an LLM Master Scheduler loop. -> Derived from NORTH_STAR.md (4 core pillars). -> Lock date: March 4, 2026. - -## Vision (Locked) - -The LLM agent is the Master Scheduler. It operates a permanent 24/7 loop that: -1. Observes game state via deterministic tools -2. Decides task priority (from ALAS scheduler logic ported directly) -3. Executes tasks using deterministic tools -4. Falls back to vision + manual piloting if a tool fails -5. 
Saves successful manual piloting as blueprints for future deterministic replay - -## Key Decisions (Immutable) - -| Decision | Choice | Why | -|----------|--------|-----| -| **Harness** | LLM agent (Copilot/Claude/Gemini) | No daemon; agent IS the Master Scheduler | -| **Orchestrator** | Conversational (CLI or chat integration) | Human/agent-supervised via standard interfaces | -| **MCP Server** | adb_vision | Clean ADB tools, no ALAS internals, pluggable backends | -| **Emulator** | MEmu only | Admin restart via memuc.exe solves recovery | -| **Scheduler Logic** | Port from ALAS directly | 39 tasks, intervals, priorities → LLM decision code | -| **State Machine** | 43-page graph (JSON data) | Reference for navigation, not blocking first milestone | -| **Vision Backend** | Local VLM (llama.cpp) or Gemini Flash | Operator choice; both feed same tools | -| **Error Recovery** | MEmu restart via memuc.exe | Deterministic, fast, avoids state corruption | -| **Screenshot Method** | DroidCast (MediaProjection API) | Works on MEmu OpenGL; bypasses framebuffer issue | -| **Blueprint Format** | Action sequence (tap, swipe, wait, checks) | Deterministic spec compiled from successful pilots | -| **ALAS Coexistence** | Deprecated; agent is the scheduler | No parallel scheduler running | - -## Master Scheduler Loop (The Harness) - -``` -WHILE True (24/7 when active): - - 1. OBSERVE STATE - - Take screenshot via adb_vision - - Get current game page / focus via deterministic query - - Check for error dialogs (visual) - - Verify game is running (launcher detection) - - 2. ORIENT (Am I in a good state?) - - If game not running → launch_game + wait + screenshot - - If error dialog detected → dismiss via tap - - If unknown state → take more screenshots, visual reasoning - - If OK → proceed - - 3. DECIDE NEXT TASK (Port ALAS scheduler logic) - - Load task catalog (39 tasks, intervals, priorities) - - Check timers: commissions due? Daily reset? Oil capped? 
- - Check event schedule: farming events? Limited tasks? - - Rank by priority: urgent > normal > deferred - - Pick highest-priority incomplete task - - 4. EXECUTE TASK - TRY: - - Call deterministic tool (e.g., "navigate_commission", "dispatch") - - Verify expected state post-execution - SUCCESS → log, update task timer, loop - CATCH tool_failure: - → FALLBACK: PILOTING MODE (see below) - - 5. PILOTING MODE (Vision + Manual) - - Take screenshot - - Describe current state to vision model - - Vision determines next action (tap, swipe, wait) - - Execute action via adb_tap / adb_swipe - - Take screenshot - - Repeat until task succeeds OR operator intervenes - - SAVE session as blueprint (action sequence) - - 6. ERROR: Unrecoverable State - - MEmu restart: memuc.exe stop + wait + memuc.exe start - - Wait for boot, re-login if needed - - Resume work - - 7. LOG & UPDATE - - Record task completion, timers, state changes - - Persist blueprint if created - - Continue loop (next iteration) -``` - -## Assets We Keep From ALAS - -| Asset | Status | Usage | -|-------|--------|-------| -| **43-page state machine** | Port as data | JSON file + thin navigator tool | -| **39-task catalog** | Reference | LLM reads to populate scheduler decisions | -| **Template matching** (PNG buttons, masks) | Lift & shift | Feed into vision pipeline + blueprints | -| **DroidCast APK** | Use now | Screenshot backend for MEmu OpenGL | -| **OCR / Device modules** | Keep available | Link into MCP server if needed; available for blueprint compilation | -| **Login flow** | Re-pilot | Agent learns popup handling | - -## Revision: What's NOT Happening - -- **No Phase II orchestrator**: The LLM agent IS the permanent orchestrator. -- **No multi-phase transition**: Deterministic + fallback is the permanent design. -- **No standalone UI**: The agent is the UI (text + screens). -- **No nested recovery layers**: Vision fallback is simple: screenshot → reason → tap → repeat. 
- -## First Milestone - -Agent successfully: -- Logs into game -- Navigates to commission page -- Dispatches available commissions -- Collects completed commissions -- Returns to main menu -- Repeats on schedule (simulated via loop, not real timers) diff --git a/docs/NORTH_STAR.md b/docs/NORTH_STAR.md index 9b31c327c2..f37fc5107a 100644 --- a/docs/NORTH_STAR.md +++ b/docs/NORTH_STAR.md @@ -1,53 +1,55 @@ # NORTH STAR +Status: LOCKED +Locked on: March 4, 2026 +Canonicality: This is the single source of truth for architecture intent. +Change control: Any behavior-level change must be explicitly approved by the user and reflected in the same PR across `docs/ARCHITECTURE.md`, `docs/ROADMAP.md`, and `TDD_IMPLEMENTATION_PLAN.md`. + +If any plan conflicts with this file, this file wins. + ## Vision -**ALAS Logic Meets LLM Orchestration**: Wrap ALAS's 9 years of embedded expertise (OCR, pixel matching, state machine, scheduler) into an LLM-driven Master Scheduler loop. The LLM is the harness—no separate orchestrator daemon. Vision models support manual piloting and recovery, blueprints capture successful manual sessions for deterministic replay. +**ALAS Logic Meets Modern Tooling**: Extract ALAS's 9 years of embedded expertise (state machine, scheduler priorities, page graphs) into a clean, new Master Scheduler loop. The loop uses a strong Two-Tier Model: Element-powered deterministic tools form the priority hot-path, and Vision/Manual piloting provides the permanent fallback and tool-growth engine. ## 4 Core Pillars -### 1. Master Scheduler + Permanent Core Loop (LLM-Driven) +### 1. 
System Intent & End-to-End Loop -- The LLM agent **is the Master Scheduler** (initially Copilot/Claude/Gemini via CLI or integration) -- No separate Python daemon; the agent calls MCP tools directly in a continuous loop -- Loop operates 24/7 when active, managing task scheduling, state checks, and recovery -- Port ALAS's legacy scheduler logic (task intervals, priorities, state transitions) **directly** into the LLM's decision-making -- The loop is permanent infrastructure, not a prototype or phase +- One permanent scheduled loop for both build-time and autonomous runtime. +- The Python scheduler queue drives the priority logic (scoring, deadlines, inserts). +- Deterministic tools are the preferred hot path. +- Loop follows a strict path: `Scheduler -> Deterministic Tool exists? -> Execute -> Update State -> Repeat`. +- If a deterministic tool fails or is missing, fall back to Manual Piloting Mode. -### 2. Deterministic Tools First (Always), Vision + Piloting Fallback (Permanent) +### 2. Tiered Interaction & Tool Contracts -- **Normal operation**: Call deterministic tools (navigate, dispatch, collect, check state) -- **Tool failure**: Fall back to vision + manual keyboard/tap piloting to resolve the issue -- **Vision fallback is permanent**, not a phase to be phased out -- Examples: - - Screen unresponsive → take screenshot + tap to recover - - Unknown game state → visual reasoning + manual navigation - - Dialog appeared → detect visually + dismiss -- Vision models: Local VLM (llama.cpp/Ollama on GeForce 5090) or Gemini Flash, operator choice +- **Tier 1 (Hot Path)**: Modern element detection layer (MaaFramework/MaaMCP or uiautomator2 + OCR) to find buttons/texts by semantics, bounds, and content—NOT raw coordinates or legacy ALAS templates. Tool return contract: `{success, data, error, observed_state, expected_state}`. +- **Tier 2 (Recovery/Fallback)**: VLM reasoning on screenshot + history -> raw adb_tap/adb_swipe actions. 
Used when unknown screens are hit or tools fail. +- **Single Harness Rule**: The same MCP tool surface is used by the autonomous scheduler AND by the vision recovery layer. +- Persistent state ensures we maintain hot OCR of resources (oil/gem/coin/timers) exposed as queryable endpoints. -### 3. Blueprint Creation (Lift & Shift) +### 3. Blueprint Creation (Extract, Don't Wrap) -- Every **successful manual piloting session** creates a blueprint (deterministic tool spec) for future runs -- Blueprints are reusable: next time the same problem arises, execute the blueprint instead of pilot again -- Blueprints use **ALAS patterns directly**: - - State machine graph (43-page transitions) - - OCR/pixel-match assets (template PNGs, masks) - - Task catalog (task names, intervals, preconditions) -- Blueprint spec: action sequence (tap coordinates, swipes, waits, state checks) -- Over time, successful blueprints become deterministic tools +- ALAS is a reference ONLY. We do not wrap ALAS code; we extract the "what" (every screen, task flow, decision, edge case) into implementation-agnostic requirements. +- Every successful manual piloting session captures a blueprint. +- Blueprints are translated into new deterministic Tier 1 tools covering semantic elements. -### 4. Error Handling: MEmu Restart via memuc.exe +### 4. Recovery Ladder & Error Handling -- Unrecoverable failure → restart the MEmu instance cleanly using `memuc.exe stop` then `memuc.exe start` -- No complex inner-recovery layers (no OCR fallback chains, no pixel-matching heuristics) -- MEmu Multi-Manager runs with admin permissions; `memuc.exe` CLI is the control plane -- After restart: screenshot to verify boot, re-login if needed, resume work -- This is reliable, fast, and avoids state corruption +- **Level 1**: VLM Diagnosis + raw recovery actions. +- **Level 2 (Ultimate Fallback)**: Restart emulator via admin plugin (`memuc.exe stop/start`). 
+- Error recovery *always* includes restart fallback to a known-good state if actions fail. +- After restart: screenshot root state, re-login, resume. ## Non-Negotiables -- Never replace ALAS with a standalone UI or orchestrator -- Never create a separate Phase II system -- Leverage ALAS patterns and assets directly (lift and shift) -- The LLM harness is the permanent design, not transitional -- Vision fallback is permanent, not a "recovery layer to be eliminated" +- **Zero Hallucination**: TDD + live emulator testing for everything. +- **Element-powered interaction**: "Click commission button", not "Click x:100, y:200". +- **Single Harness**: The same tool abstraction applies across the board. +- **CLAUDE.md** is the definitive reference for agent behavior and rules. + +## Document Governance + +- Canonical doc: `docs/NORTH_STAR.md` (and `CLAUDE.md` for agent rules) +- Derived/implementation docs: `docs/ARCHITECTURE.md`, `docs/ROADMAP.md`, `TDD_IMPLEMENTATION_PLAN.md`, `docs/plans/program_execution_plan.md` +- Legacy docs must be marked explicitly as superseded and may not override this file. diff --git a/docs/UPSTREAM_SYNC.md b/docs/UPSTREAM_SYNC.md deleted file mode 100644 index c16a173b26..0000000000 --- a/docs/UPSTREAM_SYNC.md +++ /dev/null @@ -1,76 +0,0 @@ -# Upstream Synchronization Workflow - -This document outlines the procedure for pulling updates from the upstream repository (`upstream_alas`) and merging them into the active `alas_wrapped` codebase. - -## Prerequisites - -- Git environment with access to the upstream remote. -- Familiarity with `diff` tools (e.g., Meld, KDiff3, or command-line diff). - -## Workflow Steps - -### 1. Refresh the Upstream Submodule - -The first step is to ensure the local `upstream_alas` submodule is up-to-date with the latest changes from the source repository (`Zuosizhu/Alas-with-Dashboard`). 
- -```bash -# From the root of the repository -git submodule update --remote upstream_alas -``` - -Verify the update by checking the commit log in the submodule: - -```bash -cd upstream_alas -git log -1 -cd .. -``` - -### 2. Compare Changes - -Since `alas_wrapped` is a customized copy of the upstream code, you must manually identify what has changed in the upstream version since the last sync. - -Use `diff` to list files that differ between the two directories: - -```bash -# List all differing files -diff -r -q alas_wrapped/module upstream_alas/module -``` - -### 3. Merge and Port Changes - -**Warning:** Do not simply overwrite `alas_wrapped` with `upstream_alas`. You must preserve local customizations (e.g., in `alas.py`, `login.py`, `config/`). - -For each differing file: -1. **Analyze the difference:** Determine if the change is an upstream improvement (e.g., new game logic) or a local customization (e.g., telemetry removal). -2. **Apply Upstream Fixes:** If the upstream change fixes a bug or adds a feature (like new event maps), copy those specific lines or functions into `alas_wrapped`. -3. **Preserve Local Logic:** Ensure that `alas_wrapped` specific logic (like the simplified error handling or custom launchers) remains intact. - -**Critical Files to Watch:** -- `alas.py`: The main loop. Ensure local restart/recovery logic is preserved. -- `module/handler/login.py`: Login logic. -- `module/config/`: Configuration handling. - -### 4. Verification - -After porting changes, verify the integrity of the application. - -1. **Run the Legacy Launcher (Baseline):** - If in doubt, run the upstream code directly to see how it *should* behave. - ```cmd - start_alas.bat --upstream - ``` - -2. **Run the Wrapped Launcher (Target):** - Run the standard launcher to verify your merged code. - ```cmd - start_alas.bat - ``` - -3. **Diagnose Regressions:** - If the wrapped version fails but the upstream version works, use the diff to isolate the missing logic. 
- Check logs in `log/` for specific errors. - -## Automation Note - -Currently, there is no automated script to perform this merge due to the complexity of the customizations. All syncs must be performed manually with care. diff --git a/docs/archive/deprecated/MIGRATE_POETRY_TO_UV.md b/docs/archive/deprecated/MIGRATE_POETRY_TO_UV.md deleted file mode 100644 index d4498204eb..0000000000 --- a/docs/archive/deprecated/MIGRATE_POETRY_TO_UV.md +++ /dev/null @@ -1,90 +0,0 @@ -### Step-by-Step Migration Plan - -If you decide to proceed, here is a detailed plan to migrate from Poetry to UV. - -**Phase 1: Preparation and Analysis** - -1. **Create a New Git Branch:** Before making any changes, create a backup of your current state. - ```bash - git checkout -b feature/migrate-to-uv - ``` -2. **Install UV:** Ensure you have UV available on your system. - ```bash - pip install uv - ``` -3. **Analyze Existing Dependencies:** Examine the `pyproject.toml` file to understand the project's dependencies, including development dependencies and any dependency groups. This will help you ensure they are all migrated correctly. - -**Phase 2: Core Dependency Migration** - -4. **Generate `requirements.txt` from `pyproject.toml`:** Use UV to compile the dependencies from your `pyproject.toml` into `requirements.txt` files. - * For main dependencies: - ```bash - uv pip compile pyproject.toml -o requirements.txt - ``` - * For development or other dependency groups (assuming a group named 'dev'): - ```bash - uv pip compile pyproject.toml --extra dev -o requirements-dev.txt - ``` -5. **Update `.gitignore`:** Add the new virtual environment directory to your `.gitignore` file to avoid committing it. - ``` - # Virtual Environment - .venv/ - ``` -6. **De-Poetry-fy `pyproject.toml`:** This is a critical step. You need to remove Poetry as the project manager. - * **Remove** the `[tool.poetry]` section entirely. 
- * **Keep** the `[project]` section if you plan to use other PEP 621-compliant tools, or create one if it doesn't exist, defining your project name, version, etc. UV and `pip` can read dependencies from here. - * Your `[project]` section in `pyproject.toml` would look something like this, listing dependencies directly: - ```toml - [project] - name = "alas" - version = "0.1.0" - requires-python = ">=3.7" - dependencies = [ - "numpy==1.21.0", - "opencv-python==4.5.3.56", - # ... list all dependencies from requirements.txt here - ] - - [project.optional-dependencies] - dev = [ - "pytest", - # ... list all dev dependencies here - ] - ``` - -**Phase 3: Update Scripts and Workflow** - -7. **Create and Activate a Virtual Environment:** The new workflow will require manually creating and activating a venv. - ```bash - # Create the virtual environment - python -m venv .venv - - # Activate it (Windows) - .venv\Scripts\activate - ``` -8. **Install Dependencies with UV:** Use UV to install the packages from your new requirements file. - ```bash - uv pip install -r requirements.txt - ``` -9. **Find and Replace `poetry` Commands:** Search the entire project for instances of `poetry` and replace them with the new UV/venv workflow. - * `poetry install` -> `uv pip install -r requirements.txt` - * `poetry run ` -> `` (This assumes the virtual environment is already active). Your scripts will now need to include the venv activation step. - * `poetry shell` -> The concept is replaced by manually running your activation script (e.g., `.venv\Scripts\activate`). -10. **Update Scripts:** Go through every `.bat` and `.ps1` file and update them. - * For example, `run_alas.bat` might change from `poetry run python alas.py` to: - ```bat - @echo off - CALL .\.venv\Scripts\activate.bat - python alas.py - ``` - -**Phase 4: Documentation and Cleanup** - -11. **Update Documentation:** - * Delete `POETRY_SETUP.md`. 
- * Update `README.md`, `SETUP_GUIDE.md`, and any other developer guides to reflect the new setup process (creating a venv, activating it, running `uv pip install`). -12. **Remove Poetry Artifacts:** - * Delete the `poetry.lock` file. It is no longer the source of truth for locked dependencies. -13. **Test Everything:** - * Run every single setup script (`setup_simple.bat`, `setup_environment.bat`, etc.) to ensure they work correctly. - * Run the application itself (`run_alas.bat`, `run_gui.bat`) to confirm it launches and functions as expected with the new environment. diff --git a/docs/archive/deprecated/PARSER_ARCHITECTURE_V2.md b/docs/archive/deprecated/PARSER_ARCHITECTURE_V2.md deleted file mode 100644 index 041bb69aa8..0000000000 --- a/docs/archive/deprecated/PARSER_ARCHITECTURE_V2.md +++ /dev/null @@ -1,78 +0,0 @@ -# Log Parser Architecture V2: Adaptive & Granular - -> **Location:** `agent_orchestrator/log_parser.py` -> -> This is a standalone tool with no ALAS dependencies, so it lives in `agent_orchestrator/` (Python 3.10+). - -## Core Philosophy -**"Familiar Default, Powerful Optional."** -The tool's default behavior must remain a **Single-Log Health Check** that looks and feels like the original tool. All advanced features (multi-day aggregation, forensic traces, loot tracking) must be opt-in via flags. - -## 1. Input Adaptation (The "Multi-Day" Solution) - -### CLI Interface & Defaults -- **Default (No Args):** Auto-finds the *latest* single log file. Runs `Summary` mode. -- **Explicit File:** `python log_parser.py log.txt` -> Runs `Summary` mode on that file. -- **Multi-File:** `python log_parser.py log1.txt log2.txt` -> Aggregates stats across all files into one Summary. -- **Globbing:** `python log_parser.py ./log/2026-01-*.txt` -> Aggregates all matching files. - -### Continuous Flow Logic -- The parser treats multiple files as a **continuous stream** of events. 
-- **No Artificial Session Breaks:** We do *not* reset stats on `[START]` banners (since ALAS restarts frequently). An error in File 1 is just an error in the past; it doesn't invalidate stats in File 2. - -## 2. Granular Filtering (The "Drill Down") - -### Time & Task Slicing -- **Smart Task Prefixing:** `--task Opsi` matches `OpsiDaily`, `OpsiAbyssal`, etc. -- **Time Ranges:** `--since` and `--until` for specific time windows. -- **Error Focus:** `--error-only` hides all successful operations. - -### Trend Analysis (The "Multi-Day" View) -- **Flag: `--daily`** - - *Purpose:* When parsing multiple files, this breaks down stats **by day/file** instead of a total aggregate. - - *Output:* A table showing Winrate/Task Success per day, helping identify *when* a problem started. - -## 3. Semantic Analyzers (The "Intelligence") - -### Shared Context Bus (The Refactor) -All analyzers share a `SessionState` to link disparate events (fixing the "Unknown Fight" issue). -```python -class SessionState: - active_task: str - in_combat: bool - last_critical_error: ErrorEvent -``` - -### Specialized "Behind Flags" Parsers -These features run silently by default. Output is hidden unless the specific flag is used. - -1. **Trace Analyzer (`--trace`)** - * **Logic:** Detects Python tracebacks and `Function calls:` blocks buried in `INFO` logs. - * **Default:** Hidden. - * **With Flag:** Prints full stack dumps for deep debugging. - -2. **Loot Analyzer (`--loot`)** - * **Logic:** Scans for `Get`, `Acquire`, `Obtain` patterns. - * **Default:** Hidden. - * **With Flag:** Shows "Session Loot: 2x Gold BP, 1x SR Ship". - -3. **Skip Analyzer (`--reasons`)** - * **Logic:** Extracts the `(Reason: ...)` from `Skip task` logs. - * **Default:** Hidden (counts only). - * **With Flag:** Breaks down *why* tasks were skipped. - -## 4. Output Modes - -| Mode | Flag | Purpose | Content | -| :--- | :--- | :--- | :--- | -| **Health Check** | *(default)* | "Did it run?" 
| Winrates, Task Success Counts, Error Counts (No stacks). **Same as current tool.** | -| **Forensic** | `--trace -e` | "Why did it fail?" | Stack traces, specific error timestamps, previous task context. | -| **Trends** | `--daily` | "When did it break?" | Per-day/Per-file breakdown of stats. | -| **Loot** | `--loot` | "What did I get?" | Loot table only. | - -## 5. Implementation Strategy - -1. **Refactor Input Layer:** Support `argparse` with `nargs='*'` for flexible file lists. -2. **Implement Context Bus:** Rewrite `AnalyzerPipeline` to pass `SessionState`. -3. **Add Flags & Detectors:** Implement the logic for `--trace`, `--loot`, `--reasons`. - diff --git a/docs/archive/deprecated/PARSER_IMPROVEMENT_PLAN.md b/docs/archive/deprecated/PARSER_IMPROVEMENT_PLAN.md deleted file mode 100644 index f4d90f5fef..0000000000 --- a/docs/archive/deprecated/PARSER_IMPROVEMENT_PLAN.md +++ /dev/null @@ -1,86 +0,0 @@ -# Log Parser Improvement Plan - -> **Location:** `agent_orchestrator/log_parser.py` - -This document outlines the proposed architecture and feature upgrades for `log_parser.py`. The goal is to transform it from a passive log reader into a semantic analysis tool that understands ALAS execution flow, data drops, and complex failure states. - -## 1. Core Architecture Overhaul: "The Context Bus" - -**Current Problem:** Analyzers (Combat, Task, Error) function in isolation. A crash detected by the `ErrorAnalyzer` is invisible to the `CombatAnalyzer`, leading to "Unknown" fight results instead of "Crashed". - -**Proposal:** Implement a `ContextBus` or shared `SessionState` that all analyzers write to and read from. 
- -### Design Pattern -```python -@dataclass -class GlobalContext: - current_task: Optional[str] = None - current_campaign_mode: Optional[str] = None - last_error: Optional[str] = None - is_in_combat: bool = False - -class Analyzer: - def process(self, log_line: LogLine, context: GlobalContext): - # Read context to understand state - # Write to context to inform others - pass -``` - -### Benefits -- **Combat & Error Linking:** If `ErrorAnalyzer` sees a traceback while `context.is_in_combat` is True, it explicitly marks the current fight as `CRASHED`. -- **Task Dependencies:** Can link a `Restart` task to the failure of the *previous* task. - -## 2. Enhanced "Soft" Failure Detection - -**Current Problem:** The parser relies on explicit `ERROR` or `CRITICAL` log levels. ALAS often logs stack traces or "stuck" states as multi-line `INFO` messages to avoid cluttering the console, which the current parser ignores. - -**Proposal:** -- **Stack Trace Heuristics:** Detect patterns like `Traceback (most recent call last):` or `Function calls:` within `INFO` blocks. -- **Stuck State Detection:** Track "Game Stuck" events that don't crash the bot but trigger timeouts (e.g., waiting 3 minutes for a button). -- **Recovery Tracking:** specific tracker for how often the bot triggers `recover()` or `handle_error()`. - -## 3. Game Data Semantics (Loot & Logic) - -**Current Problem:** The parser ignores the "business logic" of the game—drops, skip reasons, and resource caps. - -**Proposal:** -- **Loot Tracker (Flag: `--loot`):** Parse lines starting with `Get` or `Acquire` to track drops (Ships, Blueprints, Gear). - - *Default:* Hidden to reduce noise. - - *With `--loot`:* Output "Session Loot: 2x Gold BP, 1x SR Ship" -- **Skip Logic (Flag: `--reasons`):** Parse "Skip task [Reason]" messages. - - *Default:* Show only task counts. 
- - *With `--reasons`:* Output "Commission skipped 3x (Reason: Cooldown), Hard Mode skipped 1x (Reason: Out of Oil)" -- **Resource Delta:** Track Oil/Coin changes over time, not just static OCR readings. - -## 4. Control & Filtering - -**Current Problem:** Output can be "all or nothing". - -**Proposal:** -- **Granular Flags:** - - `--trace`: Show full stack traces for errors (default: show only exception type and line number). - - `--stuck`: Highlight periods where the bot was stuck/waiting for >1 minute. - - `--perf`: Show performance metrics (OCR time, screen transition time). -- **Smart Defaults:** The default `--summary` should remain a single-screen dashboard, with deep-dives available via flags. - -## 5. Visualization & Reporting - -**Current Problem:** Text output is good for CLI, but hard to digest for trends. - -**Proposal:** -- **Markdown Report Generator:** Output a `SUMMARY.md` file that can be rendered in GitHub or Obsidian. -- **ASCII Timeline V2:** Improve the timeline view to visually show "Stuck" periods (e.g., `====[STUCK 3m]====`). - -## 5. Implementation Roadmap - -### Phase 1: The Fix (High Priority) -- [ ] Implement `Stack Trace` detection in `INFO` logs. -- [ ] Link `ErrorAnalyzer` events to `CombatAnalyzer` to resolve "Unknown" fights. - -### Phase 2: The Context (Medium Priority) -- [ ] Refactor `AnalyzerPipeline` to use a shared `SessionState`. -- [ ] Implement `LootAnalyzer`. - -### Phase 3: The Polish (Low Priority) -- [ ] JSON export schema improvements. -- [ ] Markdown reporting. 
diff --git a/docs/archive/futurePlans/multi_agent_conversion_plan.md b/docs/archive/futurePlans/multi_agent_conversion_plan.md deleted file mode 100644 index b4e7ff342e..0000000000 --- a/docs/archive/futurePlans/multi_agent_conversion_plan.md +++ /dev/null @@ -1,215 +0,0 @@ -# Azur Lane Multi-Agent Conversion Plan (Final) - -## Architecture Overview - -Convert existing autonomous modules into LangGraph tools orchestrated by a single master agent, with comprehensive logging and screenshot-based vision verification. - -## Corrected Agent Structure - -### 1. Master Orchestrator Agent (LangGraph State Machine) -- **Role**: Central coordinator calling existing logic as tools -- **Responsibilities**: - - Call existing module logic as LangGraph tools - - Monitor results and logs for consistency checking - - Use cached screenshots with timestamps for verification - - Trigger vision verification when tool results don't match expected outcomes - - Maintain full compatibility with existing GUI monitoring - - Preserve all current logging and error handling capabilities - -### 2. Vision Verification Agent -- **Purpose**: Error detection and consistency verification -- **Method**: Analyze cached screenshots with timestamps -- **Triggers**: - - Tool returns unexpected response - - OCR confidence drops below threshold - - Log shows errors or anomalies - - Periodic consistency checks during long operations - - Manual verification requests - -## Development Priority (Corrected Order) - -### Phase 1: Foundation (Weeks 1-2) -1. **UI Module Understanding**: Full analysis of sophisticated UI navigation (630 lines) -2. **Exercise Module**: Convert exercise system to LangGraph tools first -3. **Parallel Development**: Build agent tools separately without breaking existing functionality - -### Phase 2: Core Systems (Weeks 3-6) -1. **Commissions System**: Convert commission handling -2. **Research System**: Convert research scheduling and execution -3. 
**Tactical System**: Convert tactical operations - -### Phase 3: Combat & Guild (Weeks 7-8) -1. **Guild System**: Convert guild operations -2. **Combat System**: Convert the sophisticated combat logic (600+ lines) - -### Phase 4: Map Operations (Weeks 9-10) -1. **Map Operations**: Convert pathfinding, fleet management (convert last) - -## Vision Verification Integration - -### Screenshot-Based Logging -```python -@tool -def vision_verification_tool(state: GameState, tool_name: str, context: str) -> VisionResult: - """Verify tool execution using cached screenshots""" - - # Get recent screenshots with timestamps - screenshots = state.get_cached_screenshots( - start_time=state.tool_start_time, - end_time=time.time() - ) - - # Analyze for consistency - for screenshot, timestamp in screenshots: - vision_result = vision_agent.analyze( - image=screenshot, - expected_outcome=state.expected_result, - context=f"{tool_name}: {context}", - timestamp=timestamp - ) - - if vision_result.confidence < 0.8: - logger.warning(f"Vision verification failed for {tool_name}") - return VisionResult( - success=False, - confidence=vision_result.confidence, - issues=vision_result.issues, - needs_investigation=True - ) - - return VisionResult(success=True, confidence=1.0) -``` - -### Tool Execution with Verification -```python -@tool -def exercise_execute_tool(state: GameState, **kwargs) -> ToolResult: - """Exercise system converted to LangGraph tool""" - try: - # Record start state - state.tool_start_time = time.time() - state.expected_result = "exercise_completed" - - # Execute using existing logic - exercise_instance = Exercise(device=state.device, config=state.config) - result = exercise_instance.run_exercise(**kwargs) - - # Verify consistency with vision - vision_result = vision_verification_tool(state, "exercise_execute", str(result)) - - if vision_result.needs_investigation: - return ToolResult( - success=False, - error="Vision verification failed", - vision_result=vision_result, - 
logs=exercise_instance.get_logs() - ) - - return ToolResult( - success=True, - data=result, - logs=exercise_instance.get_logs(), - screenshots=state.get_recent_screenshots() - ) - - except Exception as e: - # Even errors get logged with screenshots - vision_result = vision_verification_tool(state, "exercise_execute", f"error: {e}") - return ToolResult(success=False, error=str(e), vision_result=vision_result) -``` - -## GUI Integration Requirements - -### Preserve Existing Monitoring -- **Critical**: Maintain all current GUI functionality -- **Live Logging**: Existing GUI live monitoring must remain functional -- **No Erosion**: Cannot break any existing tooling capabilities -- **Screenshot Integration**: Agent screenshots should integrate with existing logging - -### GUI Enhancement Strategy -```python -# Agent system should enhance, not replace, existing GUI -class AgentGUIIntegration: - def __init__(self): - self.original_gui = ExistingGUI() # Preserve all current functionality - self.agent_monitor = AgentMonitor() # Add agent-specific monitoring - - def get_combined_logs(self): - # Merge existing logs with agent logs - original_logs = self.original_gui.get_logs() - agent_logs = self.agent_monitor.get_logs() - return self.merge_logs(original_logs, agent_logs) - - def display_screenshots(self): - # Show both original and agent verification screenshots - original_screenshots = self.original_gui.get_screenshots() - agent_screenshots = self.agent_monitor.get_verification_screenshots() - return self.display_combined(original_screenshots, agent_screenshots) -``` - -## Master Orchestrator Logic - -```python -@agent -def master_orchestrator(state: GameState) -> GameState: - """Main orchestration with GUI compatibility""" - - # Maintain compatibility with existing GUI - state.gui_compatibility_mode = True - - for action in state.task_queue: - try: - # Execute tool - tool_result = _call_tool(action, state) - - # Log to both agent system and preserve existing logging - 
_log_to_gui_compatible_format(action, tool_result) - - # Verify with vision if needed - if tool_result.needs_verification: - vision_result = vision_verification_tool(state, action, tool_result.context) - tool_result.vision_result = vision_result - - # Update GUI with verification results - _update_gui_with_vision_results(vision_result) - - # Continue or escalate based on results - if tool_result.success: - state = _update_state(state, tool_result) - else: - state.requires_human_review = True - _escalate_to_human(action, tool_result) - - except Exception as e: - logger.error(f"Orchestration error: {e}") - state.requires_human_intervention = True - - return state -``` - -## Key Implementation Principles - -1. **Preserve Excellence**: All existing 600+ line combat logic and 630 line UI system remain intact -2. **GUI Compatibility**: Agent system enhances, never replaces existing GUI functionality -3. **Screenshot Integration**: Cached screenshots with timestamps build on existing logging -4. **Separate Development**: Agent tools developed independently to avoid breaking existing system -5. **Error Detection Focus**: Vision used for consistency verification, not critical operations -6. **Progressive Conversion**: Follow exact priority order: Exercises → Commissions/Research/Tactical → Guild → Combat → Map - -## Success Metrics - -1. **GUI Preservation**: All existing GUI functionality works identically -2. **Logging Enhancement**: Agent screenshots integrate seamlessly with existing logs -3. **Tool Accuracy**: Vision verification catches tool execution inconsistencies -4. **System Reliability**: No degradation in existing system reliability -5. **Development Safety**: Agent tools can be developed without breaking existing functionality - -## Risk Mitigation - -1. **Parallel Development**: Agent tools built separately, integrated later -2. **GUI Testing**: Extensive testing of GUI compatibility at each phase -3. 
**Screenshot Validation**: Verify screenshot integration works with existing logging -4. **Incremental Rollout**: Convert modules one at a time with full testing -5. **Rollback Capability**: Ability to disable agent system and return to pure autonomous mode - -This approach respects your sophisticated existing architecture while adding the agent orchestration and vision verification capabilities you need, with the GUI monitoring as a top priority. \ No newline at end of file diff --git a/docs/archive/legacy/FASTMCP_3_MIGRATION.md b/docs/archive/legacy/FASTMCP_3_MIGRATION.md deleted file mode 100644 index 4ed7012d69..0000000000 --- a/docs/archive/legacy/FASTMCP_3_MIGRATION.md +++ /dev/null @@ -1,48 +0,0 @@ -# FastMCP 3.0 Migration Report (2026-01-27) - -## Overview -The ALAS MCP server has been migrated from a legacy hand-rolled JSON-RPC implementation to the **FastMCP 3.0 (Beta)** framework. This migration eliminates technical debt, provides native type safety, and aligns the project with modern 2026 MCP standards. - -## Architectural Shift: Provider-Component-Transform -FastMCP 3.0 rebuilds the server around three core primitives: - -### 1. Components (The "What") -All exposed capabilities (Tools, Resources, Prompts) are now **Components**. -- **Impact on ALAS:** The 7 core automation tools are registered as first-class components via the `@mcp.tool()` decorator. -- **Benefit:** Automatic Pydantic-based schema generation and versioning support. - -### 2. Providers (The "Where") -Providers are the dynamic sources of components. -- **ALAS Implementation:** Currently uses a local decorator provider. -- **Future Ready:** The architecture supports adding `OpenAPIProvider` or `FileSystemProvider` to expose configuration files or external APIs without changing the core server logic. - -### 3. Transforms (The "How") -Transforms act as middleware between providers and clients. 
-- **Applied Patterns:** Native threadpool dispatch for synchronous ALAS blocking IO (device interaction, screenshots). -- **Future Capabilities:** Can be used for namespacing tools or session-based access control. - -## Technical Implementation Details - -### Type Safety & Validation -By using Python type hints, FastMCP 3.0 automatically enforces parameter types. -- Example: `adb_swipe(x1: int, y1: int, duration_ms: int = 100)` -- **Benefit:** Invalid client requests are rejected at the framework level, preventing crashes in the ALAS core. - -### Legacy Core Compatibility (Patches) -To support the forward-compatible Python 3.12 environment required by FastMCP 3.0, several patches were applied to the Python 3.7 ALAS core (`alas_wrapped/`): -1. **`uiautomator2` Modernization:** Patched `module/device/method/utils.py` to handle the removal of the `.init` attribute in newer versions. -2. **`minitouch` Resilience:** Mocked the missing `_Service` class in `minitouch.py` to maintain compatibility with modern `uiautomator2`. -3. **`adbutils` Integration:** Monkey-patched `AdbClient._connect` to map to `make_connection`, ensuring device detection works with recent `adbutils` versions. -4. **Indentation & Syntax:** Fixed multiple syntax errors in `alas.py` that were previously masked by older interpreters. - -## Environment & Tooling -- **Manager:** `uv` -- **Python Version:** 3.12+ (forward compatible with ALAS 3.7 core) -- **Tracing:** Native OpenTelemetry support integrated into the `uv` environment. - -## Usage -The server is project-scoped and configured in `.gemini/settings.json`. 
-```bash -cd agent_orchestrator -uv run alas_mcp_server.py --config alas -``` diff --git a/docs/archive/local_vlm_setup.md b/docs/archive/local_vlm_setup.md deleted file mode 100644 index f9ac57638b..0000000000 --- a/docs/archive/local_vlm_setup.md +++ /dev/null @@ -1,1043 +0,0 @@ -# Local VLM Setup: Technical Primer & Phase Plan - -> **Status**: Plan (not started) -> **Hardware**: NVIDIA GeForce RTX 5090 (32 GB GDDR7) -> **Target**: Real-time game screenshot analysis for ALAS bot -> **Last updated**: 2026-02-17 - ---- - -## 1. Executive Summary - -ALAS currently relies on two vision approaches: -1. **Legacy CV** — deterministic template matching, OCR, and pixel/mask detection (fast, reliable, brittle) -2. **Cloud VLM** — Gemini Flash for recovery and annotation tasks (capable, but adds latency and API cost) - -A **local VLM** served on our GeForce 5090 would: -- Eliminate API costs for vision tasks during development and autonomous operation -- Enable always-on vision without network dependency -- Reduce latency below cloud round-trip times (target: <3 seconds per screenshot analysis) -- Support the Stage B "Annotate" pipeline (NORTH_STAR.md) where the agent plays screen-by-screen -- Serve as the primary vision layer for the Phase II Gemini orchestrator's vision router - -The 5090's 32 GB VRAM and 1,792 GB/s memory bandwidth make it uniquely suited for this role — it can run high-quality 7B–32B vision models at interactive speeds. - ---- - -## 2. 
Hardware: GeForce RTX 5090 - -### Specifications - -| Spec | Value | -|------|-------| -| Architecture | Blackwell (GB202) | -| CUDA Cores | 21,760 | -| Tensor Cores | 680 (5th gen) | -| VRAM | 32 GB GDDR7 | -| Memory Bandwidth | 1,792 GB/s | -| Memory Bus Width | 512-bit | -| VRAM Speed | 28 Gbps | -| PCIe | Gen 5 x16 | -| TDP | 575W | -| FP16 TFLOPS | 209.5 | -| AI TOPS (INT8) | 838 | -| Launch | January 30, 2025 | -| MSRP | $1,999 | - -### LLM Inference Characteristics - -The RTX 5090 is **bandwidth-bound** for LLM inference (token generation is limited by how fast weights can be read from VRAM). Key performance expectations based on published benchmarks: - -| Model Size | Quant | Approx. tok/s (5090) | Approx. tok/s (4090) | Improvement | -|------------|-------|----------------------|----------------------|-------------| -| 3B | Q4_K_M | ~340 | ~180 | ~90% | -| 7–8B | Q4_K_M | ~210 | ~120 | ~75% | -| 14B | Q4_K_M | ~130 | ~78 | ~67% | -| 32B | Q4_K_M | ~61 | ~38 | ~60% | - -*Sources: Puget Systems, NeevCloud, nikolasent benchmarks (Ollama, Q4_K_M, 8k context).* - -### VRAM Budget - -With 32 GB total, we must budget for: -- **Model weights**: Varies by model and quantization -- **KV cache**: Grows with context length (~0.5–2 GB at 4k–8k context) -- **Vision encoder + mmproj**: Typically 0.5–1.5 GB additional -- **Overhead**: ~1 GB for CUDA runtime and framework - -Rule of thumb: target **≤28 GB total** to leave headroom. - ---- - -## 3. Model Selection - -### Evaluation Criteria for ALAS - -Our use case has specific requirements: -1. **OCR accuracy** — reading game text (English, some CJK), numbers, button labels -2. **UI element recognition** — identifying buttons, menus, dialogs, game states -3. **Spatial understanding** — where on screen are elements located -4. **Speed** — response within 2–3 seconds for real-time, 5–10 seconds for recovery -5. 
**Structured output** — ability to return JSON describing game state - -### Top Candidate Models - -#### Tier 1: Primary Recommendations - -##### 1. Qwen3-VL-8B (Released October 2025) - -The latest in the Qwen VL family. Dense 8B parameter model with state-of-the-art vision capabilities. - -| Property | Value | -|----------|-------| -| Parameters | 8.77B | -| Architecture | Dense transformer + ViT | -| Context Window | 256K tokens | -| Ollama Size (Q4_K_M) | 6.1 GB | -| VRAM (Q4_K_M, 8k ctx) | ~8–10 GB | -| VRAM (Q8_0) | ~12–14 GB | -| VRAM (FP16) | ~18–20 GB | -| Expected tok/s (5090, Q4_K_M) | ~180–210 | -| OCR | Excellent (multilingual, structured) | -| Spatial Reasoning | Yes (2D/3D grounding) | -| License | Apache 2.0 | - -**Why**: Best balance of speed and capability. Qwen VL models have exceptional OCR and spatial understanding. The 8B size fits comfortably in 32 GB VRAM even at FP16, leaving room for large contexts. At Q4_K_M, it should generate ~180+ tok/s on the 5090 — meaning a 100-token game state description takes ~0.5 seconds of generation time. - -```bash -# Ollama -ollama run qwen3-vl:8b - -# llama.cpp -llama-server -hf ggml-org/Qwen3-VL-8B-Instruct-GGUF -``` - -##### 2. MiniCPM-V 4.5 / MiniCPM-o 4.5 (Released 2025) - -Built on Qwen3-8B + SigLIP2 vision encoder. 8–9B parameters but achieves scores rivaling 72B models on vision benchmarks. State-of-the-art OCR. - -| Property | Value | -|----------|-------| -| Parameters | 8–9B | -| Architecture | Qwen3-8B + SigLIP2-400M | -| Context Window | High-res images up to 1.8M pixels | -| Ollama Size (default) | ~6 GB | -| VRAM (Q4_K_M, 8k ctx) | ~8–10 GB | -| Expected tok/s (5090, Q4_K_M) | ~180–210 | -| OCR | State-of-the-art (OCRBench leader) | -| OpenCompass Score | 77.0–77.6 (surpasses GPT-4o) | -| License | Apache 2.0 | - -**Why**: If OCR accuracy is the primary concern (reading game text, numbers, button labels), MiniCPM-V 4.5 is the best sub-10B model available. 
It beats GPT-4o-latest and Qwen2.5-VL-72B on OpenCompass despite being only 8B. Excellent for document and UI text extraction. - -```bash -# Ollama -ollama run openbmb/minicpm-v4.5 - -# llama.cpp (requires mmproj) -llama-server -m MiniCPM-V-4_5-Q4_K_M.gguf --mmproj mmproj-model-f16.gguf -``` - -##### 3. Qwen3-VL-32B (Released October 2025) - -The dense 32B model pushes capability significantly at the cost of speed — but still fits within the 5090's 32 GB VRAM at Q4_K_M. - -| Property | Value | -|----------|-------| -| Parameters | 33B | -| Architecture | Dense transformer + ViT | -| Context Window | 256K tokens | -| Ollama Size (Q4_K_M) | 21 GB | -| VRAM (Q4_K_M, 8k ctx) | ~23–25 GB | -| VRAM (Q8_0) | Would not fit (>32 GB) | -| Expected tok/s (5090, Q4_K_M) | ~55–65 | -| OCR | Excellent+ | -| Spatial/Reasoning | Strong reasoning and spatial understanding | -| License | Apache 2.0 | - -**Why**: When accuracy matters more than speed (recovery mode, annotation pipeline). At ~60 tok/s, a 100-token response takes ~1.7 seconds — still within our 5-second recovery budget. This is the maximum quality model that fits in a single 5090. - -```bash -# Ollama -ollama run qwen3-vl:32b - -# llama.cpp -llama-server -hf ggml-org/Qwen3-VL-32B-Instruct-GGUF -``` - -#### Tier 2: Strong Alternatives - -##### 4. Gemma 3 27B (Released March 2025) - -Google's multimodal model. Well-supported in both llama.cpp and Ollama. Known for strong general reasoning. - -| Property | Value | -|----------|-------| -| Parameters | 27B | -| Context Window | 128K tokens | -| VRAM (Q4_0) | ~21 GB (model only) | -| VRAM (Q4_0, 8k ctx, vision) | ~24–26 GB | -| VRAM (Q4_0, 82k ctx, vision) | ~30 GB | -| Expected tok/s (5090, Q4_K_M) | ~65–75 | -| License | Gemma Terms of Use (permissive) | - -**Why**: Google's QAT (Quantization-Aware Training) means Q4 quantized Gemma models lose less quality than standard post-training quantization. Well-tested ecosystem. Good fallback if Qwen models have issues. 
- -```bash -# Ollama -ollama run gemma3:27b - -# llama.cpp -llama-server -hf ggml-org/gemma-3-27b-it-GGUF -``` - -##### 5. Qwen3-VL-30B-A3B (MoE, Released October 2025) - -A Mixture-of-Experts model: 30B total parameters but only 3B active per token. Extremely fast inference while maintaining strong capability. - -| Property | Value | -|----------|-------| -| Parameters | 30B total / 3B active | -| Architecture | MoE (128 experts, 8 active) | -| Context Window | 256K tokens | -| Ollama Size | 20 GB | -| VRAM (Q4_K_M, 8k ctx) | ~22–24 GB | -| Expected tok/s (5090) | ~150–200+ (only 3B active) | -| License | Apache 2.0 | - -**Why**: MoE gives high capability with near-3B inference speed. If it fits in VRAM (it does at Q4), it could be the fastest high-quality option. Trade-off is higher VRAM for model weights despite fast generation. - -```bash -# Ollama -ollama run qwen3-vl:30b - -# llama.cpp -llama-server -hf unsloth/Qwen3-VL-30B-A3B-Instruct-GGUF -``` - -### Model Selection Matrix - -| Model | VRAM (Q4) | Speed (tok/s) | OCR Quality | UI Understanding | Recommended Use | -|-------|-----------|---------------|-------------|------------------|-----------------| -| **Qwen3-VL-8B** | ~8–10 GB | ~200 | ★★★★☆ | ★★★★☆ | **Primary: real-time bot** | -| **MiniCPM-V 4.5** | ~8–10 GB | ~200 | ★★★★★ | ★★★★☆ | **Primary: OCR-heavy tasks** | -| **Qwen3-VL-32B** | ~23–25 GB | ~60 | ★★★★★ | ★★★★★ | **Recovery / annotation** | -| Gemma 3 27B | ~24–26 GB | ~70 | ★★★★☆ | ★★★★☆ | Alternative large model | -| Qwen3-VL-30B-A3B | ~22–24 GB | ~170 | ★★★★☆ | ★★★★☆ | Speed+quality hybrid | - -### Recommended Strategy - -**Dual-model approach**: -1. **Fast model** (Qwen3-VL-8B or MiniCPM-V 4.5, Q4_K_M): For real-time 1 FPS screenshot analysis. ~200 tok/s, <1 second for a typical game state response. -2. **Quality model** (Qwen3-VL-32B, Q4_K_M): For recovery analysis, annotation pipeline, and complex reasoning. ~60 tok/s, 2–5 seconds per response. 
- -Both cannot be loaded simultaneously (combined VRAM exceeds 32 GB). The server would load/unload models on demand, or default to the fast model and swap to quality when needed. Ollama handles this automatically with model unloading. - -### Quantization Recommendations - -For **vision tasks** (OCR, UI recognition), quantization impact differs from pure text generation: - -| Quantization | Quality Impact | VRAM Ratio | Speed | Recommendation | -|-------------|----------------|------------|-------|----------------| -| **FP16** | Baseline | 1.0x | Slowest | Not needed — VRAM limited | -| **Q8_0** | ~<1% loss | 0.5x | Moderate | Best quality if VRAM permits | -| **Q6_K** | ~1–2% loss | 0.4x | Good | Code/precision-sensitive tasks | -| **Q5_K_M** | ~2–3% loss | 0.35x | Good | Excellent balance | -| **Q4_K_M** | ~5% loss | 0.3x | Fast | **Recommended default** — sweet spot | -| **Q3_K_M** | ~10–15% loss | 0.25x | Fastest | Not recommended for vision | - -**Key insight**: Vision quality degrades more gracefully with quantization than coding/math tasks. OCR and UI recognition remain strong at Q4_K_M because the visual encoder (typically kept at FP16) does the heavy lifting, while the language model just needs to describe what was seen. - -**Important**: The `mmproj` (multimodal projector) file should always be kept at **FP16** regardless of language model quantization. This preserves vision encoding quality. - ---- - -## 4. Serving Options - -### Option A: Ollama - -Ollama provides the simplest path to serving VLMs locally with an OpenAI-compatible API. 
- -#### Setup - -```powershell -# Install Ollama (Windows) -winget install Ollama.Ollama - -# Or download from https://ollama.com/download - -# Pull a vision model -ollama pull qwen3-vl:8b - -# Verify -ollama list -``` - -#### Running the Server - -```powershell -# Ollama runs as a service automatically after install -# Default API endpoint: http://localhost:11434 - -# Test with a vision model -ollama run qwen3-vl:8b "Describe this image" --images screenshot.png -``` - -#### API Configuration - -Ollama exposes an OpenAI-compatible `/v1/chat/completions` endpoint: - -```python -import base64 -from openai import OpenAI - -client = OpenAI( - base_url="http://localhost:11434/v1", - api_key="ollama", # Required but ignored -) - -# Encode screenshot as base64 -with open("screenshot.png", "rb") as f: - image_b64 = base64.b64encode(f.read()).decode() - -response = client.chat.completions.create( - model="qwen3-vl:8b", - messages=[ - { - "role": "user", - "content": [ - { - "type": "text", - "text": "Analyze this Azur Lane screenshot. What screen/page is shown? List all visible UI elements, buttons, and any text. Return JSON." 
- }, - { - "type": "image_url", - "image_url": { - "url": f"data:image/png;base64,{image_b64}" - } - } - ] - } - ], - max_tokens=500, - temperature=0.1, -) - -print(response.choices[0].message.content) -``` - -#### Ollama Configuration - -```powershell -# Environment variables for tuning -$env:OLLAMA_NUM_PARALLEL = "1" # Single request at a time (bot is sequential) -$env:OLLAMA_MAX_LOADED_MODELS = "1" # Keep one model loaded -$env:OLLAMA_KEEP_ALIVE = "24h" # Keep model in VRAM (bot runs continuously) -$env:OLLAMA_FLASH_ATTENTION = "1" # Enable flash attention -``` - -Model-specific settings via Modelfile or API: -``` -# Modelfile for game analysis -FROM qwen3-vl:8b -PARAMETER temperature 0.1 -PARAMETER top_p 0.8 -PARAMETER top_k 20 -PARAMETER num_ctx 4096 -PARAMETER num_predict 512 -``` - -#### Pros & Cons - -| Pros | Cons | -|------|------| -| Easiest setup (single binary) | Less control over quantization | -| Automatic model management | Model naming inconsistencies | -| Built-in OpenAI compatibility | May lag behind llama.cpp features | -| Auto GPU detection | Default context/predict settings can cause issues | -| Windows native support | New multimodal engine (May 2025) still maturing | - -### Option B: llama.cpp (llama-server) - -llama.cpp provides the most control and often the best raw performance. As of May 2025, it has full multimodal support via the `libmtmd` library. 
- -#### Setup - -```powershell -# Option 1: Install via package manager -# (If available — check winget or scoop) - -# Option 2: Download pre-built binaries -# Visit: https://github.com/ggml-org/llama.cpp/releases -# Download the CUDA build for Windows (e.g., llama-bxxxx-bin-win-cuda-cu12.x-x64.zip) -# Extract to a directory, e.g., C:\tools\llama-cpp\ - -# Option 3: Build from source with CUDA -git clone https://github.com/ggml-org/llama.cpp -cd llama.cpp -cmake -B build -DGGML_CUDA=ON -cmake --build build --config Release -``` - -#### Download Models - -```powershell -# Using Hugging Face CLI -pip install huggingface-hub - -# Qwen3-VL-8B (Q4_K_M) -huggingface-cli download ggml-org/Qwen3-VL-8B-Instruct-GGUF ` - --local-dir models/qwen3-vl-8b ` - --include "*.gguf" - -# Qwen2.5-VL-32B (Q4_K_M) — alternative -huggingface-cli download Mungert/Qwen2.5-VL-32B-Instruct-GGUF ` - --local-dir models/qwen2.5-vl-32b ` - --include "*Q4_K_M*" "*mmproj*" -``` - -#### Running the Server - -```powershell -# Simple: auto-download from HuggingFace -llama-server.exe -hf ggml-org/Qwen3-VL-8B-Instruct-GGUF ` - --host 127.0.0.1 --port 8080 ` - -ngl 999 ` # Offload all layers to GPU - --flash-attn ` # Enable flash attention - -c 4096 # Context size - -# Manual: specify model and mmproj files -llama-server.exe ` - -m models/qwen3-vl-8b/Qwen3-VL-8B-Instruct-Q4_K_M.gguf ` - --mmproj models/qwen3-vl-8b/mmproj-Qwen3-VL-8B-Instruct-F16.gguf ` - --host 127.0.0.1 --port 8080 ` - -ngl 999 ` - --flash-attn ` - -c 4096 - -# For the larger 32B model -llama-server.exe -hf ggml-org/Qwen3-VL-32B-Instruct-GGUF ` - --host 127.0.0.1 --port 8080 ` - -ngl 999 ` - --flash-attn ` - -c 4096 -``` - -#### API Usage - -llama-server provides an OpenAI-compatible endpoint at `http://localhost:8080/v1/chat/completions`: - -```python -import base64 -from openai import OpenAI - -client = OpenAI( - base_url="http://localhost:8080/v1", - api_key="not-needed", -) - -with open("screenshot.png", "rb") as f: - image_b64 = 
base64.b64encode(f.read()).decode() - -response = client.chat.completions.create( - model="qwen3-vl-8b", # Model name (informational for llama-server) - messages=[ - { - "role": "user", - "content": [ - {"type": "text", "text": "What game screen is this? Return JSON with: page_name, visible_buttons, visible_text, notable_elements"}, - {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}} - ] - } - ], - max_tokens=500, - temperature=0.1, -) -``` - -#### Pros & Cons - -| Pros | Cons | -|------|------| -| Best raw performance | More setup required | -| Full quantization control | Must manage model files yourself | -| Latest model support (fastest to add) | No automatic model management | -| Flash attention, KV cache quantization | Windows build can be finicky | -| Detailed performance metrics | Manual mmproj file management | - -### Comparison: Ollama vs llama.cpp - -| Feature | Ollama | llama.cpp | -|---------|--------|-----------| -| **Setup Difficulty** | Easy (1 command) | Moderate (build or download binaries) | -| **Vision Model Support** | Yes (since May 2025 engine) | Yes (since May 2025 libmtmd) | -| **OpenAI API Compat** | `/v1/chat/completions` | `/v1/chat/completions` | -| **Model Management** | Automatic (pull/push) | Manual (download GGUF files) | -| **Performance** | Good (uses llama.cpp internally) | Slightly better (direct, less overhead) | -| **Quantization Control** | Limited (pre-packaged quants) | Full (choose any GGUF variant) | -| **Multi-Model** | Yes (auto load/unload) | One model per server instance | -| **KV Cache Quant** | Limited | Full (`--cache-type-k q4_0`) | -| **Flash Attention** | Yes | Yes | -| **Windows** | Native installer | Binary download or build | - -**Recommendation**: Start with **Ollama** for Phase L1 (simplicity). Switch to **llama.cpp** for Phase L3/L4 if we need finer control over performance tuning, KV cache quantization, or multi-model scheduling. - ---- - -## 5. 
Integration Plan: Wiring into ALAS MCP Server - -### Architecture: Vision Router - -The ARCHITECTURE.md already defines a **Vision Router** in the orchestrator layer. Here's the concrete design: - -``` -┌─────────────────────────────────────────────────────────┐ -│ Vision Router │ -│ │ -│ ┌─────────────┐ ┌──────────────┐ ┌─────────────┐ │ -│ │ Screenshot │──▶│ Route │──▶│ Parse │ │ -│ │ (base64) │ │ Decision │ │ Response │ │ -│ └─────────────┘ └──────┬───────┘ └─────────────┘ │ -│ ┌─────┴──────┐ │ -│ ▼ ▼ │ -│ ┌──────────┐ ┌──────────┐ │ -│ │ Local │ │ Cloud │ │ -│ │ VLM │ │ Gemini │ │ -│ │ (Ollama) │ │ Flash │ │ -│ └──────────┘ └──────────┘ │ -└─────────────────────────────────────────────────────────┘ -``` - -### New MCP Tool: `vision_analyze` - -```python -@mcp.tool() -async def vision_analyze( - screenshot_b64: str, - prompt: str = "Describe the current game screen state as JSON", - provider: str = "auto", # "local", "cloud", "auto" - timeout_seconds: float = 5.0, -) -> dict: - """Analyze a game screenshot using the vision model. 
- - Provider routing: - - "local": Use local VLM (Ollama/llama.cpp on 5090) - - "cloud": Use Gemini Flash API - - "auto": Try local first, fall back to cloud on timeout/error - - Returns tool contract envelope: - { - "success": bool, - "data": {"description": str, "parsed": dict | None}, - "error": str | None, - "observed_state": str | None, - "expected_state": "vision_analysis_complete", - "provider_used": "local" | "cloud", - "latency_ms": float - } - """ -``` - -### Configuration - -Add to `alas_wrapped/config/` or `agent_orchestrator/` config: - -```yaml -# vision_config.yaml -vision: - # Provider settings - local: - enabled: true - backend: "ollama" # "ollama" or "llamacpp" - base_url: "http://localhost:11434/v1" - model: "qwen3-vl:8b" - timeout_seconds: 3.0 - max_tokens: 500 - temperature: 0.1 - - cloud: - enabled: true - provider: "gemini" - model: "gemini-2.0-flash" - timeout_seconds: 10.0 - max_tokens: 500 - - # Routing - routing: - default_provider: "auto" # "local", "cloud", "auto" - fallback_on_timeout: true - fallback_on_error: true - - # Performance - image: - max_width: 1280 - max_height: 720 - format: "png" # "png" or "jpeg" - jpeg_quality: 85 -``` - -### Fallback Strategy - -```python -async def _route_vision_request(screenshot_b64: str, prompt: str, provider: str) -> dict: - """Route vision request with fallback logic.""" - - if provider == "auto": - # Try local first - try: - result = await _call_local_vlm(screenshot_b64, prompt, timeout=3.0) - return {**result, "provider_used": "local"} - except (TimeoutError, ConnectionError) as e: - logger.warning(f"Local VLM failed ({e}), falling back to cloud") - result = await _call_cloud_vlm(screenshot_b64, prompt, timeout=10.0) - return {**result, "provider_used": "cloud"} - - elif provider == "local": - result = await _call_local_vlm(screenshot_b64, prompt, timeout=5.0) - return {**result, "provider_used": "local"} - - elif provider == "cloud": - result = await _call_cloud_vlm(screenshot_b64, prompt, 
timeout=10.0) - return {**result, "provider_used": "cloud"} -``` - -### Integration with Existing MCP Server - -The `alas_mcp_server.py` already has `adb_screenshot` which returns base64 PNG. The vision tool chains naturally: - -```python -# Example orchestrator workflow: -screenshot = await mcp.call_tool("adb_screenshot") -analysis = await mcp.call_tool("vision_analyze", { - "screenshot_b64": screenshot["data"]["base64"], - "prompt": "What page is this? What buttons are visible? Return JSON.", - "provider": "local" -}) -``` - ---- - -## 6. Benchmarking Plan - -### Latency Targets - -| Scenario | Max Latency | Budget Breakdown | -|----------|-------------|------------------| -| **Real-time (1 FPS)** | 2–3 sec | Screenshot: 0.1s + Encode: 0.05s + VLM: 1.5–2.5s + Parse: 0.1s | -| **Recovery mode** | 5–10 sec | Screenshot: 0.1s + Encode: 0.05s + VLM: 4–9s + Parse: 0.1s | -| **Annotation pipeline** | 15–30 sec | Can tolerate larger models, longer prompts, multi-turn | - -### Throughput Requirements - -- Bot screenshot rate: ~1 FPS (1280×720 PNG) -- VLM must process: ≥1 screenshot every 2–3 seconds (real-time) or ≥1 every 5–10 seconds (recovery) -- Token output per analysis: ~50–200 tokens (structured JSON game state) -- Token input per image: ~1000–2000 tokens (depends on model's vision tokenizer + prompt) - -### Test Scenarios - -#### Scenario 1: Page Identification -- **Input**: Game screenshot -- **Prompt**: "What Azur Lane page/screen is this? Return one of: page_main, page_campaign, page_dock, page_academy, page_shop, page_unknown" -- **Expected output**: Single page name -- **Metrics**: Accuracy (vs ground truth), latency - -#### Scenario 2: UI Element Extraction -- **Input**: Game screenshot -- **Prompt**: "List all visible buttons and interactive elements with their approximate screen positions (x, y). Return JSON array." 
-- **Expected output**: JSON array of {name, x, y} -- **Metrics**: Precision, recall, position accuracy, latency - -#### Scenario 3: Text/Number Reading (OCR) -- **Input**: Game screenshot showing resources -- **Prompt**: "Read all visible numbers and resource counts (oil, gold, gems, etc). Return JSON." -- **Expected output**: JSON with resource values -- **Metrics**: Exact match accuracy, latency - -#### Scenario 4: Anomaly Detection -- **Input**: Game screenshot with error dialog or unexpected state -- **Prompt**: "Is there any error, popup, or unexpected dialog? If so, describe it and suggest an action." -- **Expected output**: Description + action suggestion -- **Metrics**: Detection rate, action quality, latency - -### Benchmarking Script - -```python -#!/usr/bin/env python3 -"""Benchmark local VLM against game screenshots.""" - -import asyncio -import base64 -import json -import time -from pathlib import Path -from openai import AsyncOpenAI - -SCREENSHOTS_DIR = Path("alas_wrapped/screenshots") -RESULTS_FILE = Path("docs/plans/vlm_benchmark_results.json") - -# Configure for Ollama -client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="ollama") -MODEL = "qwen3-vl:8b" - -PROMPTS = { - "page_id": "What Azur Lane page/screen is this? Reply with only the page name.", - "ui_elements": "List all visible buttons and UI elements. Return JSON array.", - "ocr": "Read all visible text and numbers. Return JSON.", - "anomaly": "Is there an error popup or unexpected dialog? 
Describe if yes.", -} - -async def benchmark_single(image_path: Path, prompt_key: str) -> dict: - with open(image_path, "rb") as f: - img_b64 = base64.b64encode(f.read()).decode() - - start = time.perf_counter() - response = await client.chat.completions.create( - model=MODEL, - messages=[{ - "role": "user", - "content": [ - {"type": "text", "text": PROMPTS[prompt_key]}, - {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img_b64}"}} - ] - }], - max_tokens=500, - temperature=0.1, - ) - elapsed = time.perf_counter() - start - - return { - "image": image_path.name, - "prompt": prompt_key, - "latency_s": round(elapsed, 3), - "tokens": response.usage.completion_tokens if response.usage else None, - "response": response.choices[0].message.content, - } - -async def main(): - screenshots = list(SCREENSHOTS_DIR.glob("*.png"))[:20] # Limit for benchmarking - results = [] - - for img in screenshots: - for prompt_key in PROMPTS: - result = await benchmark_single(img, prompt_key) - results.append(result) - print(f" {img.name} / {prompt_key}: {result['latency_s']}s") - - # Summary - latencies = [r["latency_s"] for r in results] - summary = { - "model": MODEL, - "total_tests": len(results), - "avg_latency_s": round(sum(latencies) / len(latencies), 3), - "p50_latency_s": round(sorted(latencies)[len(latencies) // 2], 3), - "p95_latency_s": round(sorted(latencies)[int(len(latencies) * 0.95)], 3), - "max_latency_s": round(max(latencies), 3), - } - - output = {"summary": summary, "results": results} - RESULTS_FILE.write_text(json.dumps(output, indent=2)) - print(f"\nSummary: {json.dumps(summary, indent=2)}") - -if __name__ == "__main__": - asyncio.run(main()) -``` - -### Performance Expectations - -Based on research data, expected benchmarks on RTX 5090: - -| Model | Quant | Prompt Processing | Token Generation | Est. 
Total (1 image + 150 tok response) | -|-------|-------|-------------------|------------------|-----------------------------------------| -| Qwen3-VL-8B | Q4_K_M | ~500–1000 tok/s | ~200 tok/s | **~1.5–2.5 sec** | -| MiniCPM-V 4.5 | Q4_K_M | ~500–1000 tok/s | ~200 tok/s | **~1.5–2.5 sec** | -| Qwen3-VL-32B | Q4_K_M | ~200–400 tok/s | ~60 tok/s | **~4–7 sec** | -| Gemma 3 27B | Q4_K_M | ~250–500 tok/s | ~70 tok/s | **~3.5–6 sec** | -| Qwen3-VL-30B-A3B | Q4_K_M | ~400–800 tok/s | ~170 tok/s | **~2–3.5 sec** | - -*Note: Vision prompt processing includes both text and image token processing. The image portion adds ~1000–2000 tokens depending on model and resolution. These are estimates extrapolated from published text-only benchmarks — actual vision latency may differ and must be validated empirically.* - ---- - -## 7. Phase Plan - -### Phase L1: Install & Basic Serving (1–2 days) - -**Goal**: Get a local VLM running and responding to image queries. - -**Steps**: -1. Install Ollama on the 5090 machine - ```powershell - winget install Ollama.Ollama - ``` -2. Pull the primary model - ```powershell - ollama pull qwen3-vl:8b - ``` -3. Verify vision works with a test image - ```powershell - ollama run qwen3-vl:8b "What do you see in this image?" --images test_screenshot.png - ``` -4. Verify the OpenAI-compatible API is accessible - ```python - # Quick test script - from openai import OpenAI - client = OpenAI(base_url="http://localhost:11434/v1", api_key="ollama") - response = client.models.list() - print([m.id for m in response.data]) - ``` -5. Take a game screenshot and test analysis - ```powershell - # From ALAS, capture a screenshot - # Then test: - ollama run qwen3-vl:8b "What Azur Lane screen is this? List visible UI elements." 
--images screenshot.png - ``` - -**Success Criteria**: -- [ ] Ollama running and serving qwen3-vl:8b -- [ ] Can send an image via OpenAI-compatible API and get a response -- [ ] Response time < 5 seconds for a game screenshot -- [ ] Model fits in VRAM with room for context - -**Deliverables**: -- Working Ollama installation -- Test script confirming API accessibility - ---- - -### Phase L2: Benchmark Against Game Screenshots (2–3 days) - -**Goal**: Determine whether local VLM meets ALAS's latency and accuracy requirements. - -**Steps**: -1. Collect 20–50 game screenshots covering key screens: - - Main menu, campaign, dock, academy, shop - - Dialog boxes, popups, error states - - Resource counters (oil, gold, gems) - - Battle screens, reward screens -2. Create ground-truth labels (page name, visible elements, text/numbers) -3. Run the benchmarking script (from Section 6) against multiple models: - - `qwen3-vl:8b` (Q4_K_M) - - `openbmb/minicpm-v4.5` (default quant) - - `qwen3-vl:32b` (Q4_K_M) — for comparison -4. Measure latency, accuracy, and token usage -5. Test different quantization levels if needed: - ```powershell - # Try Q8 for quality comparison - ollama pull qwen3-vl:8b-q8_0 - ``` -6. Document findings in `docs/plans/vlm_benchmark_results.json` - -**Success Criteria**: -- [ ] At least one model delivers page identification at >90% accuracy, <3 sec latency -- [ ] OCR accuracy on resource numbers at >95% -- [ ] P95 latency under 5 seconds for the "fast" model -- [ ] Documented comparison of at least 2 models - -**Deliverables**: -- Benchmark results file -- Model selection decision with rationale -- Performance characteristics documented - ---- - -### Phase L3: Integration with MCP Server (3–5 days) - -**Goal**: Wire the local VLM into the ALAS MCP server as a callable tool. - -**Steps**: -1. 
Create `agent_orchestrator/vision_router.py`: - - OpenAI client for local VLM (Ollama) - - OpenAI client for cloud VLM (Gemini) - - Routing logic (auto/local/cloud) - - Timeout and fallback handling -2. Add `vision_analyze` tool to `alas_mcp_server.py` -3. Create `vision_config.yaml` configuration -4. Add integration tests: - ```python - async def test_vision_analyze_local(): - result = await mcp.call_tool("vision_analyze", { - "screenshot_b64": test_screenshot_b64, - "prompt": "What page is this?", - "provider": "local" - }) - assert result["success"] - assert result["provider_used"] == "local" - assert result["latency_ms"] < 5000 - ``` -5. Test the full chain: `adb_screenshot` → `vision_analyze` → parse state -6. Validate fallback: stop Ollama → verify cloud fallback works - -**Success Criteria**: -- [ ] `vision_analyze` tool registered and callable via MCP -- [ ] "auto" routing tries local first, falls back to cloud -- [ ] Integration test passes with both providers -- [ ] Latency within targets (see Section 6) - -**Deliverables**: -- `agent_orchestrator/vision_router.py` -- Updated `alas_mcp_server.py` with `vision_analyze` tool -- `vision_config.yaml` -- Integration tests -- Updated `docs/agent_tooling/README.md` - ---- - -### Phase L4: Production Deployment (2–3 days) - -**Goal**: Reliable, always-on local VLM serving for autonomous bot operation. - -**Steps**: -1. Configure Ollama as a Windows service (auto-start on boot): - ```powershell - # Ollama typically auto-starts. Verify: - Get-Service Ollama - - # Set environment variables persistently - [System.Environment]::SetEnvironmentVariable("OLLAMA_KEEP_ALIVE", "24h", "Machine") - [System.Environment]::SetEnvironmentVariable("OLLAMA_NUM_PARALLEL", "1", "Machine") - [System.Environment]::SetEnvironmentVariable("OLLAMA_FLASH_ATTENTION", "1", "Machine") - ``` -2. 
Add health check to MCP server startup: - ```python - async def _check_vlm_health(): - """Verify local VLM is responding before starting bot.""" - try: - response = await local_client.chat.completions.create( - model="qwen3-vl:8b", - messages=[{"role": "user", "content": "ping"}], - max_tokens=5, - ) - return True - except Exception as e: - logger.warning(f"Local VLM health check failed: {e}") - return False - ``` -3. Add monitoring/metrics: - - Log latency per vision call - - Track local vs cloud fallback ratio - - Alert if local VLM latency degrades -4. Optimize for production: - - Pre-warm model on startup (send a dummy request) - - Tune `num_ctx` based on actual usage patterns - - Consider switching to llama.cpp if Ollama overhead is significant -5. Update ARCHITECTURE.md vision integration status to "Working" -6. Update ROADMAP.md - -**OR, if llama.cpp proves better in benchmarks**: -```powershell -# Create a PowerShell script to auto-start llama-server -# start_vlm_server.ps1 -$env:CUDA_VISIBLE_DEVICES = "0" -& "C:\tools\llama-cpp\llama-server.exe" ` - -hf ggml-org/Qwen3-VL-8B-Instruct-GGUF ` - --host 127.0.0.1 --port 8080 ` - -ngl 999 --flash-attn ` - -c 4096 ` - --metrics -``` - -**Success Criteria**: -- [ ] VLM auto-starts with the machine -- [ ] Bot can run autonomously using local VLM for vision -- [ ] Cloud fallback activates automatically on local failure -- [ ] Latency metrics logged and within targets -- [ ] Docs updated - -**Deliverables**: -- Production configuration (service/startup scripts) -- Health check in MCP server -- Monitoring/logging -- Updated ARCHITECTURE.md (Vision Integration status → Working) -- Updated ROADMAP.md -- CHANGELOG.md entry - ---- - -## Appendix A: Model Availability by Date - -> This section tracks which models were available as of this document's creation (Feb 2026) and their ecosystem readiness. 
- -| Model | Release | Ollama | llama.cpp GGUF | Notes | -|-------|---------|--------|----------------|-------| -| Qwen2.5-VL-7B | Jan 2025 | ✅ | ✅ | Mature, well-tested | -| Qwen2.5-VL-32B | Mar 2025 | ✅ | ✅ | First large open VLM with great quality | -| Gemma 3 27B | Mar 2025 | ✅ | ✅ | Google QAT, strong ecosystem | -| MiniCPM-V 4.5 | ~Mid 2025 | ✅ | ✅ | OCR champion, 8B | -| Qwen3-VL-8B | Oct 2025 | ✅ | ✅ | Latest gen, recommended | -| Qwen3-VL-32B | Oct 2025 | ✅ | ✅ | Dense, high quality | -| Qwen3-VL-30B-A3B | Oct 2025 | ✅ | ✅ | MoE, fast inference | -| Mistral Small 3.1 | Mar 2025 | ✅ | ✅ | 24B, tool-calling capable | - -## Appendix B: Supported llama.cpp Vision Models - -Canonical list from [llama.cpp multimodal docs](https://github.com/ggml-org/llama.cpp/blob/master/docs/multimodal.md): - -```bash -# Qwen family -llama-server -hf ggml-org/Qwen2.5-VL-7B-Instruct-GGUF -llama-server -hf ggml-org/Qwen2.5-VL-32B-Instruct-GGUF -# Qwen3-VL requires latest builds (post Oct 2025) - -# Gemma 3 -llama-server -hf ggml-org/gemma-3-4b-it-GGUF -llama-server -hf ggml-org/gemma-3-12b-it-GGUF -llama-server -hf ggml-org/gemma-3-27b-it-GGUF - -# MiniCPM-V -llama-server -m MiniCPM-V-4_5-Q4_K_M.gguf --mmproj mmproj-model-f16.gguf - -# Others -llama-server -hf ggml-org/SmolVLM2-2.2B-Instruct-GGUF -llama-server -hf ggml-org/InternVL2.5-4B-GGUF -``` - -## Appendix C: Quick Reference Commands - -```powershell -# === OLLAMA === -# Install -winget install Ollama.Ollama - -# Pull models -ollama pull qwen3-vl:8b -ollama pull qwen3-vl:32b -ollama pull openbmb/minicpm-v4.5 - -# Interactive test with image -ollama run qwen3-vl:8b "Describe this image" --images screenshot.png - -# Check loaded models -ollama ps - -# Check VRAM usage -nvidia-smi - -# === LLAMA.CPP === -# Serve model -llama-server -hf ggml-org/Qwen3-VL-8B-Instruct-GGUF --host 127.0.0.1 --port 8080 -ngl 999 --flash-attn -c 4096 - -# Test endpoint -curl http://localhost:8080/v1/models - -# === MONITORING === -# Watch GPU usage 
-nvidia-smi -l 1 - -# Watch Ollama logs (Windows) -Get-Content "$env:LOCALAPPDATA\Ollama\server.log" -Tail 50 -Wait -``` - -## Appendix D: Extrapolations & Uncertainty - -Where definitive February 2026 data was unavailable, the following extrapolations were made: - -1. **RTX 5090 VLM-specific benchmarks**: Published benchmarks are primarily for text-only LLMs. Vision models add overhead for image encoding (prompt processing). We estimate a 20–40% increase in total latency vs text-only for the same model, primarily in the prompt processing phase. - -2. **Qwen3-VL generation speed**: Extrapolated from Qwen3 text model benchmarks on 5090 (e.g., qwen2.5:32b at ~62 tok/s). Vision versions use similar architectures with added vision encoder overhead. - -3. **MoE model speeds**: The Qwen3-VL-30B-A3B speed estimate (~170 tok/s) is based on the fact that only 3B parameters are active per token, placing it between 3B and 8B dense model speeds, with some MoE routing overhead. - -4. **Quantization quality for vision**: The claim that vision tasks degrade more gracefully with quantization is based on research showing that the vision encoder (typically kept at FP16) preserves perceptual quality while the language model's Q4 quantization primarily affects text generation nuance — which matters less for structured output extraction. - -5. **Total latency estimates**: Combine prompt processing (image tokens + text tokens) and generation phases. Image token count varies dramatically by model and resolution — Qwen VL models are relatively efficient but exact token counts for 1280×720 images should be measured empirically. 
diff --git a/docs/archive/recovery_agent_architecture.md b/docs/archive/recovery_agent_architecture.md deleted file mode 100644 index 5ffa661d78..0000000000 --- a/docs/archive/recovery_agent_architecture.md +++ /dev/null @@ -1,1062 +0,0 @@ -# Durable Agent System Architecture with Autonomous Recovery - -## Executive Summary - -This document defines a comprehensive architecture for a self-healing agent system that can autonomously detect failures, recover from deadlocks, and maintain operational continuity for the ALAS (Azur Lane Auto Script) automation platform. The recovery agent operates as a supervisor layer above the existing MCP server and deterministic tools, enabling the system to monitor itself and take corrective action when the primary workflow encounters problems. - -## System Architecture - -### High-Level Component Diagram - -```mermaid -flowchart TB - subgraph "Recovery Layer" - RA[Recovery Agent
Supervisor Monitor] - RM[Recovery Manager
State Machine] - EH[Error Handler
Classifier & Logger] - end - - subgraph "Orchestration Layer" - MA[Main Agent
Task Orchestrator] - TM[Task Manager
Queue & State] - HS[Health Service
Heartbeat Monitor] - end - - subgraph "Tool Execution Layer" - MCP[ALAS MCP Server
JSON-RPC over stdio] - DT[Deterministic Tools
ADB + ALAS State] - end - - subgraph "External Systems" - EMU[Android Emulator
MEmuPlayer] - ALAS[ALAS Core
OCR + Game Logic] - LOG[Log Files
Persistence] - end - - RA -->|monitors| MA - RA -->|triggers| RM - RA -->|logs to| LOG - RM -->|executes via| MCP - MA -->|calls| MCP - MCP -->|controls| DT - DT -->|interacts with| EMU - DT -->|uses| ALAS - HS -->|heartbeat| RA - MA -->|reports status| HS -``` - -### Recovery Agent Position in System - -```mermaid -flowchart LR - subgraph "Phase II: Autonomous Operation" - direction TB - GA[Gemini Orchestrator
Phase II - Planned] - RC[Recovery Controller
This Architecture] - VM[Vision Model
Gemini Flash] - end - - subgraph "Phase I: MCP Server" - MCP[ALAS MCP Server
Operational] - end - - subgraph "Phase 0: Tools" - TOOLS[Deterministic Tools
ADB + State] - end - - GA -->|delegates to| RC - RC -->|monitors & recovers| GA - RC -->|calls on failure| VM - GA -->|invokes via MCP| MCP - RC -->|direct control| MCP - MCP -->|executes| TOOLS -``` - -## Core Design Principles - -### 1. Deterministic Tools First - -The system maintains the NORTH_STAR principle of using deterministic tools for normal operation. The recovery agent only activates when: -- Tools report failure (success=false) -- State transitions don't match expectations -- Deadlock is detected (no progress for N seconds) -- Exceptions propagate to the orchestrator level - -### 2. Layered Recovery Strategy - -Recovery operates at multiple levels with increasing intervention: - -| Level | Action | Trigger | Example | -|-------|--------|---------|---------| -| 0 | Tool Retry | Transient failure | Network timeout on ADB tap | -| 1 | State Reset | Wrong page detected | Navigate to page_main | -| 2 | Workflow Restart | Checkpoint available | Restart from last stable task | -| 3 | Vision Diagnosis | Unknown state | LLM analyzes screenshot | -| 4 | Human Escalation | Unresolvable | Log context, notify operator | - -### 3. Full Context on Failure - -Every recovery action has access to: -- Recent action history (last N tool calls) -- Current and expected state -- Screenshots (timestamped) -- Log excerpts (errors, warnings) -- System metrics (uptime, task counts) - -## Recovery Agent Components - -### 1. 
Health Monitor - -The Health Monitor continuously tracks system vitality through multiple channels: - -```python -class HealthMonitor: - """Tracks system health through multiple indicators.""" - - def __init__(self): - self.heartbeat_timeout = 30.0 # seconds - self.progress_timeout = 120.0 # seconds without state change - self.error_threshold = 5 # errors in window - self.error_window = 300.0 # 5 minute window - - def check_heartbeat(self, last_heartbeat: float) -> HealthStatus: - """Check if main agent is still responding.""" - elapsed = time.monotonic() - last_heartbeat - if elapsed > self.heartbeat_timeout: - return HealthStatus.STALLED - return HealthStatus.HEALTHY - - def check_progress(self, state_history: List[StateSnapshot]) -> HealthStatus: - """Detect deadlock - no state change despite active execution.""" - if len(state_history) < 2: - return HealthStatus.HEALTHY - - recent_states = state_history[-10:] # Last 10 states - unique_states = set(s.state_name for s in recent_states) - - if len(unique_states) == 1: - # Stuck in same state - time_in_state = recent_states[-1].timestamp - recent_states[0].timestamp - if time_in_state > self.progress_timeout: - return HealthStatus.DEADLOCKED - - return HealthStatus.HEALTHY - - def check_error_rate(self, errors: List[ErrorEvent]) -> HealthStatus: - """Detect error storms - too many errors in short window.""" - cutoff = time.monotonic() - self.error_window - recent_errors = [e for e in errors if e.timestamp > cutoff] - - if len(recent_errors) > self.error_threshold: - return HealthStatus.DEGRADED - - return HealthStatus.HEALTHY -``` - -### 2. 
Error Classifier - -Errors are classified to determine appropriate recovery strategy: - -```python -class ErrorCategory(Enum): - TRANSIENT = auto() # Retry likely to succeed - STATE_MISMATCH = auto() # Wrong page/UI state - RESOURCE_UNAVAILABLE = auto() # Emulator/ADB issue - LOGIC_ERROR = auto() # Tool implementation bug - UNKNOWN = auto() # Unclassified - needs diagnosis - -class ErrorClassifier: - """Classify errors to determine recovery strategy.""" - - TRANSIENT_PATTERNS = [ - r"Connection reset by peer", - r"ADB server didn't ACK", - r"Screenshot timeout", - r"MaaTouch daemon not responding", - ] - - STATE_MISMATCH_PATTERNS = [ - r"Expected.*but observed", - r"page_.*not found", - r"Button.*not appear", - ] - - RESOURCE_PATTERNS = [ - r"Device not found", - r"Emulator.*not running", - r"Failed to connect to ADB", - ] - - def classify(self, error: str, context: ExecutionContext) -> ErrorCategory: - """Classify error string into category.""" - error_lower = error.lower() - - for pattern in self.TRANSIENT_PATTERNS: - if re.search(pattern, error, re.IGNORECASE): - return ErrorCategory.TRANSIENT - - for pattern in self.STATE_MISMATCH_PATTERNS: - if re.search(pattern, error, re.IGNORECASE): - return ErrorCategory.STATE_MISMATCH - - for pattern in self.RESOURCE_PATTERNS: - if re.search(pattern, error, re.IGNORECASE): - return ErrorCategory.RESOURCE_UNAVAILABLE - - # Check if this is a recurring error - if context.error_history: - similar = [e for e in context.error_history - if self._similarity(e.message, error) > 0.8] - if len(similar) > 2: - return ErrorCategory.LOGIC_ERROR - - return ErrorCategory.UNKNOWN - - def _similarity(self, a: str, b: str) -> float: - """Simple string similarity for deduplication.""" - return SequenceMatcher(None, a, b).ratio() -``` - -### 3. 
Recovery Manager - -The Recovery Manager executes recovery strategies based on error classification: - -```python -class RecoveryManager: - """Executes recovery strategies based on error classification.""" - - def __init__(self, mcp_client: MCPClient, vision_agent: Optional[VisionAgent] = None): - self.mcp = mcp_client - self.vision = vision_agent - self.recovery_stats = RecoveryStatistics() - self.max_recovery_attempts = 3 - - async def recover(self, failure: FailureEvent, context: ExecutionContext) -> RecoveryResult: - """Execute appropriate recovery strategy.""" - - # Check if we've tried too many times - recent_attempts = self._count_recent_attempts(failure, context) - if recent_attempts >= self.max_recovery_attempts: - return RecoveryResult( - success=False, - action="ESCALATE", - reason=f"Max recovery attempts ({self.max_recovery_attempts}) exceeded" - ) - - # Route to appropriate handler - category = failure.error_category - - handlers = { - ErrorCategory.TRANSIENT: self._handle_transient, - ErrorCategory.STATE_MISMATCH: self._handle_state_mismatch, - ErrorCategory.RESOURCE_UNAVAILABLE: self._handle_resource_issue, - ErrorCategory.LOGIC_ERROR: self._handle_logic_error, - ErrorCategory.UNKNOWN: self._handle_unknown, - } - - handler = handlers.get(category, self._handle_unknown) - return await handler(failure, context) - - async def _handle_transient(self, failure: FailureEvent, context: ExecutionContext) -> RecoveryResult: - """Retry the failed operation.""" - # Simple retry with exponential backoff - delay = min(2 ** failure.attempt_count, 30) # Cap at 30 seconds - await asyncio.sleep(delay) - - # Retry the exact same operation - result = await self.mcp.call_tool(failure.tool_name, failure.tool_args) - - return RecoveryResult( - success=result.get("success", False), - action="RETRY", - data=result - ) - - async def _handle_state_mismatch(self, failure: FailureEvent, context: ExecutionContext) -> RecoveryResult: - """Reset to known good state.""" - # Get 
current state - current = await self.mcp.get_current_state() - - # Navigate to expected state or safe state (page_main) - expected = failure.expected_state or "page_main" - - try: - await self.mcp.goto(expected) - return RecoveryResult( - success=True, - action="STATE_RESET", - data={"from": current, "to": expected} - ) - except Exception as e: - # If specific state fails, try safe state - if expected != "page_main": - try: - await self.mcp.goto("page_main") - return RecoveryResult( - success=True, - action="STATE_RESET_SAFE", - data={"from": current, "to": "page_main"} - ) - except Exception as e2: - pass - - return RecoveryResult( - success=False, - action="STATE_RESET_FAILED", - error=str(e) - ) - - async def _handle_resource_issue(self, failure: FailureEvent, context: ExecutionContext) -> RecoveryResult: - """Handle emulator/ADB connectivity issues.""" - # Try to reconnect to ADB - for attempt in range(3): - try: - # Reconnect ADB - await self.mcp.reconnect_adb() - - # Verify connection with screenshot - screenshot = await self.mcp.screenshot() - if screenshot: - return RecoveryResult( - success=True, - action="ADB_RECONNECT", - data={"attempt": attempt + 1} - ) - except Exception as e: - await asyncio.sleep(5) - - return RecoveryResult( - success=False, - action="RESOURCE_RECOVERY_FAILED", - error="Could not reconnect to ADB/emulator" - ) - - async def _handle_unknown(self, failure: FailureEvent, context: ExecutionContext) -> RecoveryResult: - """Use vision to diagnose unknown failures.""" - if not self.vision: - return RecoveryResult( - success=False, - action="ESCALATE", - reason="Unknown error and no vision agent available" - ) - - # Capture current state - screenshot = await self.mcp.screenshot() - logs = await self._get_recent_logs(context) - - # Ask vision agent to diagnose - diagnosis = await self.vision.diagnose( - screenshot=screenshot, - error_message=failure.error_message, - action_history=context.recent_actions, - logs=logs - ) - - if 
diagnosis.can_recover: - # Execute recovery action suggested by vision - result = await self._execute_vision_recovery(diagnosis, context) - return RecoveryResult( - success=result.success, - action=f"VISION_RECOVERY:{diagnosis.recommended_action}", - data=result.data - ) - else: - return RecoveryResult( - success=False, - action="ESCALATE", - reason=diagnosis.reason, - context={ - "screenshot": screenshot, - "diagnosis": diagnosis.explanation - } - ) -``` - -### 4. Vision Diagnosis Agent - -The Vision Agent uses LLM with screenshot analysis to diagnose unknown failures: - -```python -class VisionDiagnosisAgent: - """Uses vision-capable LLM to diagnose unknown failures.""" - - SYSTEM_PROMPT = """You are a recovery diagnosis agent for a mobile game automation system. - -Your task is to analyze the current game state from a screenshot and determine: -1. What is the current game state/page? -2. What went wrong based on the error and action history? -3. Can this be recovered automatically? -4. What specific recovery action should be taken? 
- -Available recovery actions: -- TAP: Tap a specific coordinate -- GOTO: Navigate to a specific page -- WAIT: Wait for a condition -- RESTART_APP: Restart the game app -- ESCALATE: Cannot recover, needs human - -Respond in JSON format: -{ - "current_state": "description of what you see", - "issue_analysis": "what went wrong", - "can_recover": true/false, - "recommended_action": "TAP|GOTO|WAIT|RESTART_APP|ESCALATE", - "action_params": {"coordinates": [x, y]} or {"page": "page_name"} or {}, - "explanation": "why you chose this action" -}""" - - def __init__(self, model: str = "gemini-2.0-flash"): - self.model = model - - async def diagnose( - self, - screenshot: bytes, - error_message: str, - action_history: List[ActionRecord], - logs: List[str] - ) -> DiagnosisResult: - """Diagnose failure using vision model.""" - - # Build context - history_str = "\n".join([ - f"- {a.timestamp}: {a.tool_name} -> {a.result}" - for a in action_history[-5:] # Last 5 actions - ]) - - log_excerpt = "\n".join(logs[-10:]) # Last 10 log lines - - user_prompt = f"""Error: {error_message} - -Recent actions: -{history_str} - -Recent logs: -{log_excerpt} - -Analyze the attached screenshot and provide your diagnosis.""" - - # Call vision model - response = await self._call_vision_model( - system=self.SYSTEM_PROMPT, - user=user_prompt, - image=screenshot - ) - - # Parse JSON response - try: - result = json.loads(response) - return DiagnosisResult(**result) - except json.JSONDecodeError: - # Fallback if model doesn't return valid JSON - return DiagnosisResult( - current_state="unknown", - issue_analysis="Failed to parse vision response", - can_recover=False, - recommended_action="ESCALATE", - explanation=f"Vision model response: {response[:200]}" - ) -``` - -## State Persistence and Checkpoints - -### Checkpoint System - -The system maintains checkpoints for recovery at key boundaries: - -```python -@dataclass -class Checkpoint: - """A snapshot of system state at a specific point in time.""" - 
id: str # Unique checkpoint ID - timestamp: datetime - thread_id: str # For parallel workflows - - # State - current_page: str - task_queue: List[Task] - completed_tasks: List[Task] - current_task: Optional[Task] - - # Context - game_state: Dict[str, Any] # ALAS-specific state - inventory_cache: Dict[str, int] # Cached inventory counts - - # Metadata - checkpoint_type: CheckpointType # MANUAL, AUTO, RECOVERY - parent_checkpoint: Optional[str] # For branching - -class CheckpointManager: - """Manages checkpoint persistence for recovery.""" - - def __init__(self, storage: CheckpointStorage): - self.storage = storage - self.auto_checkpoint_interval = 300 # 5 minutes - self.checkpoint_on_task_complete = True - - async def create_checkpoint( - self, - context: ExecutionContext, - checkpoint_type: CheckpointType = CheckpointType.AUTO - ) -> Checkpoint: - """Create a new checkpoint.""" - checkpoint = Checkpoint( - id=str(uuid.uuid4()), - timestamp=datetime.utcnow(), - thread_id=context.thread_id, - current_page=context.current_page, - task_queue=list(context.task_queue), - completed_tasks=list(context.completed_tasks), - current_task=context.current_task, - game_state=await self._capture_game_state(context), - checkpoint_type=checkpoint_type - ) - - await self.storage.save(checkpoint) - return checkpoint - - async def restore_checkpoint(self, checkpoint_id: str) -> ExecutionContext: - """Restore system state from checkpoint.""" - checkpoint = await self.storage.load(checkpoint_id) - - context = ExecutionContext( - thread_id=checkpoint.thread_id, - current_page=checkpoint.current_page, - task_queue=deque(checkpoint.task_queue), - completed_tasks=checkpoint.completed_tasks, - current_task=checkpoint.current_task, - game_state=checkpoint.game_state - ) - - # Restore game state via MCP - if checkpoint.current_page: - await self.mcp.goto(checkpoint.current_page) - - return context - - async def get_latest_stable_checkpoint(self, thread_id: str) -> Optional[Checkpoint]: - """Get 
the most recent checkpoint that completed a task.""" - checkpoints = await self.storage.list_checkpoints( - thread_id=thread_id, - types=[CheckpointType.AUTO, CheckpointType.MANUAL], - limit=10 - ) - - # Find checkpoint after last completed task - for cp in reversed(checkpoints): - if cp.completed_tasks: - return cp - - return checkpoints[0] if checkpoints else None -``` - -### Storage Backends - -```python -class CheckpointStorage(ABC): - """Abstract base for checkpoint storage.""" - - @abstractmethod - async def save(self, checkpoint: Checkpoint): ... - - @abstractmethod - async def load(self, checkpoint_id: str) -> Checkpoint: ... - - @abstractmethod - async def list_checkpoints(self, thread_id: str, **filters) -> List[Checkpoint]: ... - -class SQLiteCheckpointStorage(CheckpointStorage): - """Local SQLite storage for development/single-node.""" - # Implementation for local SQLite - -class RedisCheckpointStorage(CheckpointStorage): - """Redis storage for distributed/multi-node.""" - # Implementation for Redis -``` - -## Escalation Procedures - -### Escalation Levels - -| Level | Condition | Action | Notification | -|-------|-----------|--------|--------------| -| 1 | Recovery succeeded | Log, continue | None | -| 2 | Recovery failed, can retry | Queue retry | Log warning | -| 3 | Max retries exceeded | Escalate to human | Log error, notify | -| 4 | Critical system failure | Stop all, preserve state | Immediate alert | - -### Escalation Handler - -```python -class EscalationHandler: - """Handles human escalation for unresolvable failures.""" - - def __init__(self): - self.escalation_log: List[EscalationRecord] = [] - self.notification_hooks: List[Callable] = [] - - async def escalate(self, failure: FailureEvent, context: ExecutionContext) -> EscalationResult: - """Escalate to human operator.""" - - # Capture full context - escalation_context = EscalationContext( - timestamp=datetime.utcnow(), - failure=failure, - action_history=context.recent_actions, - 
current_state=context.current_page, - screenshot=await self._capture_screenshot(), - logs=await self._get_log_excerpt(lines=100), - checkpoint_id=context.last_checkpoint_id - ) - - # Persist escalation record - record = EscalationRecord( - id=str(uuid.uuid4()), - context=escalation_context, - status=EscalationStatus.PENDING - ) - self.escalation_log.append(record) - - # Notify via hooks - for hook in self.notification_hooks: - try: - await hook(record) - except Exception as e: - logger.error(f"Notification hook failed: {e}") - - # Halt execution - return EscalationResult( - record_id=record.id, - action="HALT", - message="Execution halted pending human intervention" - ) - - def on_resolution(self, record_id: str, resolution: ResolutionAction): - """Called when human resolves escalation.""" - record = self._find_record(record_id) - record.status = EscalationStatus.RESOLVED - record.resolution = resolution - - if resolution.action == "RESUME": - # Resume from checkpoint - return ResumeCommand( - checkpoint_id=record.context.checkpoint_id, - skip_current_task=resolution.skip_current_task - ) - elif resolution.action == "ABORT": - # Abort current workflow - return AbortCommand(reason=resolution.reason) -``` - -## MCP Tool Extensions for Recovery - -### Recovery-Related MCP Tools - -The recovery agent requires additional MCP tools for full system control: - -```python -@mcp.tool() -def recovery_get_system_status() -> Dict[str, Any]: - """Get comprehensive system status for health monitoring. - - Returns: - { - "healthy": bool, - "current_page": str, - "current_task": str | None, - "adb_connected": bool, - "emulator_responding": bool, - "last_activity": timestamp, - "errors_in_last_5min": int - } - """ - -@mcp.tool() -def recovery_restart_adb() -> Dict[str, Any]: - """Restart ADB connection to recover from connectivity issues. 
- - Returns: - { - "success": bool, - "error": str | None, - "time_to_reconnect_ms": int - } - """ - -@mcp.tool() -def recovery_capture_checkpoint(checkpoint_type: str = "manual") -> Dict[str, Any]: - """Capture current state as a recovery checkpoint. - - Args: - checkpoint_type: "manual", "auto", or "recovery" - - Returns: - { - "success": bool, - "checkpoint_id": str | None, - "error": str | None - } - """ - -@mcp.tool() -def recovery_restore_checkpoint(checkpoint_id: str) -> Dict[str, Any]: - """Restore system state from a checkpoint. - - Args: - checkpoint_id: ID of checkpoint to restore - - Returns: - { - "success": bool, - "restored_page": str | None, - "error": str | None - } - """ - -@mcp.tool() -def recovery_get_recent_logs(lines: int = 50, level: str = "WARNING") -> List[Dict]: - """Get recent log entries for diagnosis. - - Args: - lines: Number of log lines to retrieve - level: Minimum log level (DEBUG, INFO, WARNING, ERROR) - - Returns: - List of log entries with timestamp, level, message - """ - -@mcp.tool() -def recovery_force_stop_app(package: str = "com.YoStarEN.AzurLane") -> Dict[str, Any]: - """Force stop the game app for clean restart. - - Args: - package: Package name of the app - - Returns: - { - "success": bool, - "error": str | None - } - """ - -@mcp.tool() -def recovery_restart_app(package: str = "com.YoStarEN.AzurLane") -> Dict[str, Any]: - """Restart the game app from scratch. - - Args: - package: Package name of the app - - Returns: - { - "success": bool, - "error": str | None, - "time_to_main_ms": int - } - """ -``` - -## Integration with Existing System - -### Deployment Architecture - -```mermaid -flowchart TB - subgraph "Host Machine" - subgraph "agent_orchestrator/" - RA[recovery_agent.py
Supervisor Process] - MCP[alas_mcp_server.py
Enhanced with recovery tools] - end - - subgraph "alas_wrapped/" - ALAS[ALAS Core] - TOOLS[tools/
login.py, vision.py, etc.] - end - - subgraph "Persistence" - DB[(checkpoints.db
SQLite/Redis)] - LOG[log files] - end - end - - subgraph "External" - EMU[MEmuPlayer
Android Emulator] - GEM[Gemini API
Vision Model] - end - - RA -->|monitors via| MCP - RA -->|reads| LOG - RA -->|writes| DB - MCP -->|controls| ALAS - ALAS -->|interacts with| EMU - RA -->|vision diagnosis| GEM -``` - -### Process Model - -1. **MCP Server**: Runs as persistent process (existing) -2. **Recovery Agent**: Can run as: - - Thread within MCP server (simpler) - - Separate process with IPC (more robust) - - External supervisor via MCP client (most flexible) - -### Configuration - -```yaml -# recovery_config.yaml -recovery: - enabled: true - - health_monitor: - heartbeat_timeout_seconds: 30 - progress_timeout_seconds: 120 - error_threshold: 5 - error_window_seconds: 300 - - recovery_manager: - max_recovery_attempts: 3 - enable_vision_diagnosis: true - vision_model: "gemini-2.0-flash" - - checkpointing: - enabled: true - storage_backend: "sqlite" # or "redis" - sqlite_path: "./checkpoints.db" - auto_checkpoint_interval_seconds: 300 - checkpoint_on_task_complete: true - max_checkpoints: 100 - - escalation: - enabled: true - notification_webhooks: - - "https://hooks.slack.com/services/..." 
- log_escalation_context: true - halt_on_escalation: true - - retry_policy: - transient: - max_retries: 3 - backoff_base_seconds: 2 - backoff_max_seconds: 30 - state_mismatch: - max_retries: 2 - reset_to_safe_state: true -``` - -## Recovery Workflows - -### Workflow 1: Transient Failure Recovery - -```mermaid -sequenceDiagram - participant MA as Main Agent - participant RM as Recovery Manager - participant MCP as MCP Server - participant Tool as Deterministic Tool - - MA->>MCP: Call tool (adb_tap) - MCP->>Tool: Execute - Tool-->>MCP: Error: ADB timeout - MCP-->>MA: {success: false, error: "ADB timeout"} - MA->>RM: Report failure - RM->>RM: Classify: TRANSIENT - RM->>RM: Check attempt count < max - RM->>MCP: Retry with delay - MCP->>Tool: Execute (retry) - Tool-->>MCP: Success - MCP-->>RM: {success: true} - RM-->>MA: Recovery successful, resume -``` - -### Workflow 2: State Mismatch Recovery - -```mermaid -sequenceDiagram - participant MA as Main Agent - participant RM as Recovery Manager - participant MCP as MCP Server - participant ALAS as ALAS Core - - MA->>MCP: goto(page_commission) - MCP->>ALAS: Navigate - ALAS-->>MCP: Page timeout - MCP-->>MA: {success: false, observed: "page_main", expected: "page_commission"} - MA->>RM: Report failure - RM->>RM: Classify: STATE_MISMATCH - RM->>MCP: get_current_state() - MCP-->>RM: page_main - RM->>MCP: goto(page_main) [safe state] - MCP-->>RM: {success: true} - RM->>MA: Reset to safe state, retry task -``` - -### Workflow 3: Deadlock Detection and Recovery - -```mermaid -sequenceDiagram - participant HM as Health Monitor - participant MA as Main Agent - participant RM as Recovery Manager - participant VA as Vision Agent - participant MCP as MCP Server - - loop Every 30 seconds - HM->>HM: Check heartbeat - HM->>HM: Check progress (last state change) - end - - Note over HM: No state change for 120s - HM->>RM: Trigger: DEADLOCKED - RM->>MCP: get_current_state() - MCP-->>RM: page_combat - RM->>RM: Classify: UNKNOWN (stuck in 
combat) - - alt Vision Diagnosis Enabled - RM->>MCP: screenshot() - MCP-->>RM: screenshot_bytes - RM->>VA: diagnose(screenshot, context) - VA-->>RM: {can_recover: true, action: "RESTART_APP"} - RM->>MCP: recovery_restart_app() - MCP-->>RM: {success: true} - RM->>MCP: recovery_restore_checkpoint() - MCP-->>RM: {success: true} - else Vision Diagnosis Disabled - RM->>RM: Escalate to human - end -``` - -### Workflow 4: Escalation to Human - -```mermaid -sequenceDiagram - participant RM as Recovery Manager - participant EH as Escalation Handler - participant DB as Checkpoint DB - participant Human as Human Operator - - RM->>RM: Max recovery attempts exceeded - RM->>EH: escalate(failure, context) - - EH->>MCP: screenshot() - EH->>MCP: recovery_get_recent_logs(100) - EH->>DB: Save escalation context - - EH->>EH: Notify via webhooks - EH-->>RM: EscalationResult(record_id, HALT) - - RM->>MA: Halt execution - - Note over Human: Human reviews escalation - Human->>DB: Review context (screenshot, logs) - Human->>EH: resolve(record_id, RESUME) - - EH->>MCP: recovery_restore_checkpoint() - EH-->>RM: ResumeCommand - RM->>MA: Resume execution -``` - -## Implementation Roadmap - -### Phase 1: Foundation (Core Recovery) - -- [ ] Implement Health Monitor with heartbeat and deadlock detection -- [ ] Implement Error Classifier with pattern matching -- [ ] Implement Recovery Manager with transient and state mismatch handlers -- [ ] Add recovery tools to MCP server (restart_adb, get_status) -- [ ] Implement basic checkpointing (SQLite storage) -- [ ] Create recovery configuration schema -- [ ] Write unit tests for recovery components - -### Phase 2: Vision Integration - -- [ ] Implement Vision Diagnosis Agent -- [ ] Integrate with Gemini Flash API -- [ ] Add vision-based recovery handlers -- [ ] Create screenshot analysis prompts -- [ ] Test vision diagnosis on common failure scenarios -- [ ] Implement confidence scoring for vision decisions - -### Phase 3: Persistence and Durability - -- 
[ ] Implement full checkpoint/restore system -- [ ] Add Redis backend option for distributed deployments -- [ ] Implement checkpoint pruning and management -- [ ] Add state validation after restore -- [ ] Test recovery from various checkpoint states -- [ ] Implement automatic checkpointing at task boundaries - -### Phase 4: Escalation and Observability - -- [ ] Implement Escalation Handler with webhook support -- [ ] Create escalation review UI/dashboard -- [ ] Add comprehensive metrics and logging -- [ ] Create recovery dashboards and alerts -- [ ] Write runbooks for common recovery scenarios -- [ ] Document escalation procedures - -### Phase 5: Integration and Hardening - -- [ ] Integrate recovery agent with main orchestrator -- [ ] Add graceful degradation modes -- [ ] Implement circuit breaker patterns -- [ ] Load test recovery under various failure modes -- [ ] Chaos engineering: inject failures and verify recovery -- [ ] Production readiness review - -## Success Metrics - -| Metric | Target | Measurement | -|--------|--------|-------------| -| Recovery success rate | >90% | % of failures recovered without human | -| Mean time to recovery | <60s | From failure detection to resolution | -| False positive rate | <5% | Unnecessary recovery attempts | -| Escalation rate | <2% | % of failures requiring human | -| Deadlock detection time | <10s | Time to detect stuck state | -| Checkpoint overhead | <100ms | Time to create checkpoint | - -## Risk Assessment - -| Risk | Likelihood | Impact | Mitigation | -|------|------------|--------|------------| -| Recovery makes things worse | Medium | High | Conservative recovery, extensive testing, human escalation | -| Vision diagnosis hallucinates | Low | Medium | Confidence thresholds, multiple samples | -| Checkpoint corruption | Low | High | Validation on restore, multiple checkpoint versions | -| Recovery agent itself fails | Low | High | Separate process, watchdog, fallback to existing behavior | -| Performance 
degradation | Medium | Low | Async monitoring, configurable intervals | - -## Appendix: External Research Summary - -### Key Patterns from Industry Research - -1. **Supervisor Pattern**: The recovery agent follows the established supervisor pattern from Erlang/OTP and modern AI agent frameworks, where a supervisor monitors workers and restarts them on failure. - -2. **LangGraph Checkpointing**: The checkpoint system is modeled after LangGraph's persistence layer, using the same concepts of threads and checkpoints for durable execution. - -3. **Self-Healing Systems**: Research from 2025 on self-healing AI systems emphasizes: - - Detection before diagnosis - - Conservative recovery actions - - Human escalation for uncertainty - - Full context preservation - -4. **Multi-Agent Failure Recovery**: Industry best practices for multi-agent systems include: - - Health monitoring at multiple levels - - Error classification for targeted recovery - - State persistence for rollback - - Graceful degradation rather than hard failure - -5. 
**Vision for Recovery**: Using vision models for error diagnosis is an emerging pattern that provides: - - Better understanding of UI state - - Ability to recover from unknown states - - Reduced need for explicit error handling - -## References - -- [NORTH_STAR.md](../NORTH_STAR.md) - Project vision and principles -- [ARCHITECTURE.md](../ARCHITECTURE.md) - System architecture overview -- [ROADMAP.md](../ROADMAP.md) - Implementation phasing -- [AGENTS.md](../../AGENTS.md) - Agent behavior standards -- [docs/agent_orchestration/README.md](../agent_orchestration/README.md) - Agent orchestration design -- [docs/agent_tooling/README.md](../agent_tooling/README.md) - MCP tool design - ---- - -*Document Status: Draft - Ready for Review* -*Last Updated: 2026-02-17* diff --git a/docs/archive/recovery_agent_implementation_plan.md b/docs/archive/recovery_agent_implementation_plan.md deleted file mode 100644 index af95ac9e42..0000000000 --- a/docs/archive/recovery_agent_implementation_plan.md +++ /dev/null @@ -1,761 +0,0 @@ -# Recovery Agent Implementation Plan - -## Overview - -This document provides a detailed, actionable implementation plan for building the autonomous recovery agent system described in [`recovery_agent_architecture.md`](recovery_agent_architecture.md). The plan is organized into phases with specific, independent tasks that can be executed by development agents. - -## Implementation Phases - -### Phase 1: Core Recovery Foundation - -**Goal**: Implement basic recovery infrastructure with health monitoring, error classification, and state-based recovery. 
- -#### Task 1.1: Create Recovery Agent Package Structure - -**Files to Create:** -- `agent_orchestrator/recovery/__init__.py` -- `agent_orchestrator/recovery/models.py` - Data models for failures, recoveries, checkpoints -- `agent_orchestrator/recovery/config.py` - Configuration schema and loading - -**Requirements:** -```python -# models.py core classes to implement: -- FailureEvent: Captures failure details (tool_name, error, timestamp, context) -- RecoveryResult: Result of recovery attempt (success, action, data) -- HealthStatus: Enum (HEALTHY, STALLED, DEADLOCKED, DEGRADED) -- ErrorCategory: Enum (TRANSIENT, STATE_MISMATCH, RESOURCE_UNAVAILABLE, LOGIC_ERROR, UNKNOWN) -- Checkpoint: State snapshot for recovery -- ExecutionContext: Running state of the system -``` - -**Acceptance Criteria:** -- All data classes are immutable (frozen dataclasses) -- JSON serialization/deserialization works for all models -- Type hints are complete - ---- - -#### Task 1.2: Implement Health Monitor - -**Files to Create:** -- `agent_orchestrator/recovery/health_monitor.py` - -**Requirements:** -Implement the HealthMonitor class with these methods: - -```python -class HealthMonitor: - def __init__(self, config: HealthMonitorConfig) - def check_heartbeat(self, last_heartbeat: float) -> HealthStatus - def check_progress(self, state_history: List[StateSnapshot]) -> HealthStatus - def check_error_rate(self, errors: List[ErrorEvent]) -> HealthStatus - def get_composite_status(self, context: ExecutionContext) -> HealthStatus -``` - -**Configuration Options:** -- `heartbeat_timeout_seconds`: 30.0 -- `progress_timeout_seconds`: 120.0 -- `error_threshold`: 5 -- `error_window_seconds`: 300.0 -- `state_history_size`: 10 - -**Acceptance Criteria:** -- Detects stalled agent (no heartbeat for timeout period) -- Detects deadlock (same state for > progress_timeout) -- Detects error storms (> error_threshold in window) -- Unit tests cover all detection scenarios - ---- - -#### Task 1.3: Implement 
Error Classifier - -**Files to Create:** -- `agent_orchestrator/recovery/error_classifier.py` - -**Requirements:** -Implement the ErrorClassifier class: - -```python -class ErrorClassifier: - TRANSIENT_PATTERNS: List[str] = [...] - STATE_MISMATCH_PATTERNS: List[str] = [...] - RESOURCE_PATTERNS: List[str] = [...] - - def classify(self, error: str, context: ExecutionContext) -> ErrorCategory - def classify_tool_result(self, result: Dict[str, Any]) -> ErrorCategory -``` - -**Pattern Lists:** -- Transient: Connection reset, ADB timeout, screenshot timeout, daemon issues -- State mismatch: "Expected X but observed Y", page not found, button not appear -- Resource: Device not found, emulator not running, ADB connection failed - -**Acceptance Criteria:** -- Correctly classifies known error patterns -- Returns UNKNOWN for unclassified errors -- Handles empty/None errors gracefully -- Unit tests for each pattern category - ---- - -#### Task 1.4: Implement Recovery Manager Core - -**Files to Create:** -- `agent_orchestrator/recovery/recovery_manager.py` - -**Requirements:** -Implement RecoveryManager with strategy pattern: - -```python -class RecoveryManager: - def __init__(self, mcp_client: MCPClient, config: RecoveryConfig) - async def recover(self, failure: FailureEvent, context: ExecutionContext) -> RecoveryResult - def _count_recent_attempts(self, failure: FailureEvent, context: ExecutionContext) -> int -``` - -**Handler Methods (stubs for now):** -- `_handle_transient`: Returns RETRY action -- `_handle_state_mismatch`: Returns STATE_RESET action -- `_handle_resource_issue`: Returns ADB_RECONNECT action -- `_handle_logic_error`: Returns ESCALATE action -- `_handle_unknown`: Returns ESCALATE action - -**Acceptance Criteria:** -- Routes errors to correct handler based on classification -- Tracks recovery attempt counts -- Returns RecoveryResult with appropriate action -- Prevents infinite recovery loops - ---- - -#### Task 1.5: Implement Basic Recovery Handlers - -**Files 
to Modify:** -- `agent_orchestrator/recovery/recovery_manager.py` - -**Requirements:** -Implement concrete recovery handlers: - -```python -async def _handle_transient(self, failure, context): - # Retry with exponential backoff - # Max 3 retries, delay = min(2^attempt, 30) - # Call same tool with same args - # Return result - -async def _handle_state_mismatch(self, failure, context): - # Get current state via MCP - # Try to navigate to expected state - # Fallback to page_main if that fails - # Return success/failure - -async def _handle_resource_issue(self, failure, context): - # Attempt ADB reconnection (3 attempts, 5s delay) - # Verify with screenshot - # Return result -``` - -**Acceptance Criteria:** -- Transient handler retries with backoff -- State mismatch handler resets to safe state -- Resource handler reconnects ADB -- All handlers return proper RecoveryResult - ---- - -#### Task 1.6: Add Recovery Tools to MCP Server - -**Files to Modify:** -- `agent_orchestrator/alas_mcp_server.py` - -**Requirements:** -Add these new MCP tools: - -```python -@mcp.tool() -def recovery_get_system_status() -> Dict[str, Any] - # Returns: healthy, current_page, adb_connected, last_activity - -@mcp.tool() -def recovery_restart_adb() -> Dict[str, Any] - # Restart ADB connection - # Returns: success, error, time_to_reconnect_ms - -@mcp.tool() -def recovery_get_recent_logs(lines: int = 50, level: str = "WARNING") -> List[Dict] - # Read from ALAS log files - # Return structured log entries -``` - -**Acceptance Criteria:** -- Tools follow existing tool contract (success, data, error, etc.) -- Tools are tested and working -- Documentation added to agent_tooling README - ---- - -### Phase 2: Checkpoint and Persistence - -**Goal**: Implement state checkpointing for workflow restart capability. 
- -#### Task 2.1: Implement Checkpoint Models and Storage Interface - -**Files to Create:** -- `agent_orchestrator/recovery/checkpoint.py` -- `agent_orchestrator/recovery/storage/base.py` - -**Requirements:** -```python -# checkpoint.py -@dataclass(frozen=True) -class Checkpoint: - id: str - timestamp: datetime - thread_id: str - current_page: str - task_queue: List[Task] - completed_tasks: List[Task] - current_task: Optional[Task] - game_state: Dict[str, Any] - checkpoint_type: CheckpointType - -# storage/base.py -class CheckpointStorage(ABC): - async def save(self, checkpoint: Checkpoint): ... - async def load(self, checkpoint_id: str) -> Checkpoint: ... - async def list_checkpoints(self, thread_id: str, **filters) -> List[Checkpoint]: ... - async def delete(self, checkpoint_id: str): ... -``` - -**Acceptance Criteria:** -- Checkpoint is serializable to JSON -- Storage interface is async -- Type hints are complete - ---- - -#### Task 2.2: Implement SQLite Checkpoint Storage - -**Files to Create:** -- `agent_orchestrator/recovery/storage/sqlite.py` - -**Requirements:** -```python -class SQLiteCheckpointStorage(CheckpointStorage): - def __init__(self, db_path: str) - # Create table on init if not exists - # Store checkpoint as JSON blob - # Support pagination in list_checkpoints -``` - -**Schema:** -```sql -CREATE TABLE checkpoints ( - id TEXT PRIMARY KEY, - thread_id TEXT NOT NULL, - timestamp TEXT NOT NULL, - data JSON NOT NULL, - type TEXT NOT NULL -); -CREATE INDEX idx_thread_time ON checkpoints(thread_id, timestamp); -``` - -**Acceptance Criteria:** -- Stores and retrieves checkpoints correctly -- Lists checkpoints with filters -- Handles database errors gracefully -- Unit tests with in-memory SQLite - ---- - -#### Task 2.3: Implement Checkpoint Manager - -**Files to Create:** -- `agent_orchestrator/recovery/checkpoint_manager.py` - -**Requirements:** -```python -class CheckpointManager: - def __init__(self, storage: CheckpointStorage, config: 
CheckpointConfig) - async def create_checkpoint(self, context: ExecutionContext, type: CheckpointType) -> Checkpoint - async def restore_checkpoint(self, checkpoint_id: str) -> ExecutionContext - async def get_latest_stable_checkpoint(self, thread_id: str) -> Optional[Checkpoint] - async def prune_old_checkpoints(self, thread_id: str, keep_count: int = 10) -``` - -**Acceptance Criteria:** -- Captures full execution context -- Restores context and navigates to saved page -- Returns latest checkpoint that completed a task -- Prunes old checkpoints to manage storage - ---- - -#### Task 2.4: Add Checkpoint Tools to MCP Server - -**Files to Modify:** -- `agent_orchestrator/alas_mcp_server.py` - -**Requirements:** -Add MCP tools: - -```python -@mcp.tool() -def recovery_capture_checkpoint(checkpoint_type: str = "manual") -> Dict[str, Any] - # Capture current state as checkpoint - # Returns: success, checkpoint_id, error - -@mcp.tool() -def recovery_restore_checkpoint(checkpoint_id: str) -> Dict[str, Any] - # Restore from checkpoint - # Returns: success, restored_page, error - -@mcp.tool() -def recovery_list_checkpoints(thread_id: str, limit: int = 10) -> List[Dict] - # List available checkpoints -``` - -**Acceptance Criteria:** -- Tools integrate with CheckpointManager -- Restore validates state after restoration -- Error handling for invalid checkpoint IDs - ---- - -### Phase 3: Vision-Based Recovery - -**Goal**: Integrate vision capabilities for diagnosing unknown failures. 
- -#### Task 3.1: Implement Vision Diagnosis Agent - -**Files to Create:** -- `agent_orchestrator/recovery/vision_agent.py` - -**Requirements:** -```python -class VisionDiagnosisAgent: - SYSTEM_PROMPT: str = """...""" - - def __init__(self, model: str = "gemini-2.0-flash") - async def diagnose( - self, - screenshot: bytes, - error_message: str, - action_history: List[ActionRecord], - logs: List[str] - ) -> DiagnosisResult -``` - -**DiagnosisResult:** -```python -@dataclass -class DiagnosisResult: - current_state: str - issue_analysis: str - can_recover: bool - recommended_action: str # TAP, GOTO, WAIT, RESTART_APP, ESCALATE - action_params: Dict[str, Any] - explanation: str - confidence: float -``` - -**Acceptance Criteria:** -- Integrates with Gemini Flash API -- Parses JSON response from model -- Handles malformed responses gracefully -- Includes screenshot in API call - ---- - -#### Task 3.2: Integrate Vision into Recovery Manager - -**Files to Modify:** -- `agent_orchestrator/recovery/recovery_manager.py` - -**Requirements:** -Update `_handle_unknown` to use vision: - -```python -async def _handle_unknown(self, failure, context): - if not self.vision_agent: - return RecoveryResult(action="ESCALATE", ...) 
- - # Capture screenshot - # Get logs - # Call vision.diagnose() - # If can_recover: execute recommended action - # Else: escalate -``` - -**Acceptance Criteria:** -- Uses vision only when configured -- Falls back to escalation if vision unavailable -- Executes vision-recommended actions -- Tracks vision-based recoveries - ---- - -#### Task 3.3: Add App Control Tools to MCP Server - -**Files to Modify:** -- `agent_orchestrator/alas_mcp_server.py` - -**Requirements:** -Add tools for app control: - -```python -@mcp.tool() -def recovery_force_stop_app(package: str = "com.YoStarEN.AzurLane") -> Dict[str, Any] - # Force stop the game app - # Returns: success, error - -@mcp.tool() -def recovery_restart_app(package: str = "com.YoStarEN.AzurLane") -> Dict[str, Any] - # Restart game app and wait for main screen - # Returns: success, error, time_to_main_ms -``` - -**Acceptance Criteria:** -- Uses ALAS's existing app control methods -- restart_app waits for page_main -- Error handling for app not installed - ---- - -### Phase 4: Escalation and Human Handoff - -**Goal**: Implement human escalation for unresolvable failures. 
- -#### Task 4.1: Implement Escalation Handler - -**Files to Create:** -- `agent_orchestrator/recovery/escalation.py` - -**Requirements:** -```python -class EscalationHandler: - def __init__(self, config: EscalationConfig) - async def escalate(self, failure: FailureEvent, context: ExecutionContext) -> EscalationResult - def register_notification_hook(self, hook: Callable[[EscalationRecord], Awaitable[None]]) - async def on_resolution(self, record_id: str, resolution: ResolutionAction) -> Command -``` - -**EscalationRecord:** -```python -@dataclass -class EscalationRecord: - id: str - timestamp: datetime - context: EscalationContext # screenshot, logs, state - status: EscalationStatus # PENDING, RESOLVED - resolution: Optional[ResolutionAction] -``` - -**Acceptance Criteria:** -- Captures full context (screenshot, logs, history) -- Calls notification hooks -- Persists escalation record -- Handles resolution callbacks - ---- - -#### Task 4.2: Implement Webhook Notifications - -**Files to Create:** -- `agent_orchestrator/recovery/notifications.py` - -**Requirements:** -```python -class WebhookNotifier: - def __init__(self, webhook_urls: List[str]) - async def notify(self, record: EscalationRecord) - # POST JSON payload to webhooks - # Include screenshot base64, logs excerpt, timestamp -``` - -**Webhook Payload:** -```json -{ - "escalation_id": "...", - "timestamp": "...", - "error": "...", - "current_state": "...", - "screenshot_url": "...", // or base64 - "logs_excerpt": "...", - "checkpoint_id": "..." -} -``` - -**Acceptance Criteria:** -- Sends to all configured webhooks -- Handles webhook failures gracefully -- Retries with backoff -- Logs notification attempts - ---- - -### Phase 5: Integration and Orchestration - -**Goal**: Wire everything together into a cohesive recovery system. 
- -#### Task 5.1: Implement Recovery Agent Main Class - -**Files to Create:** -- `agent_orchestrator/recovery/agent.py` - -**Requirements:** -```python -class RecoveryAgent: - def __init__( - self, - mcp_client: MCPClient, - config: RecoveryConfig, - health_monitor: Optional[HealthMonitor] = None, - recovery_manager: Optional[RecoveryManager] = None, - escalation_handler: Optional[EscalationHandler] = None - ) - async def start_monitoring(self) - async def stop_monitoring(self) - async def on_tool_failure(self, failure: FailureEvent, context: ExecutionContext) -> RecoveryResult - async def on_health_check(self, status: HealthStatus, context: ExecutionContext) - async def get_status(self) -> RecoveryAgentStatus -``` - -**Acceptance Criteria:** -- Starts health monitoring loop -- Handles tool failures -- Manages recovery lifecycle -- Clean shutdown - ---- - -#### Task 5.2: Create Configuration Schema - -**Files to Create:** -- `agent_orchestrator/recovery/config_schema.yaml` -- `agent_orchestrator/recovery/config_loader.py` - -**Requirements:** -Full configuration schema covering all components: - -```yaml -recovery: - enabled: bool - health_monitor: {...} - recovery_manager: {...} - checkpointing: {...} - escalation: {...} - vision: {...} -``` - -**Acceptance Criteria:** -- YAML and JSON support -- Environment variable substitution -- Validation with helpful errors -- Default values for all fields - ---- - -#### Task 5.3: Create Recovery Agent CLI - -**Files to Create:** -- `agent_orchestrator/recovery_cli.py` - -**Requirements:** -```bash -# Start recovery agent as standalone process -python recovery_cli.py --config recovery.yaml --mcp-server "python alas_mcp_server.py" - -# Commands: -# --status: Show current status -# --checkpoint: Trigger manual checkpoint -# --restore ID: Restore from checkpoint -# --escalations: List pending escalations -# --resolve ID: Resolve escalation with action -``` - -**Acceptance Criteria:** -- Starts MCP server as subprocess -- Loads 
configuration -- Runs monitoring loop -- Handles signals (SIGTERM graceful shutdown) - ---- - -### Phase 6: Testing and Hardening - -#### Task 6.1: Unit Tests for Recovery Components - -**Files to Create:** -- `agent_orchestrator/tests/recovery/test_health_monitor.py` -- `agent_orchestrator/tests/recovery/test_error_classifier.py` -- `agent_orchestrator/tests/recovery/test_recovery_manager.py` -- `agent_orchestrator/tests/recovery/test_checkpoint.py` - -**Requirements:** -- 80%+ code coverage -- Mock MCP client for isolation -- Test all error categories -- Test recovery strategies - ---- - -#### Task 6.2: Integration Tests - -**Files to Create:** -- `agent_orchestrator/tests/recovery/test_integration.py` - -**Requirements:** -- Test with real MCP server (if emulator available) -- Test checkpoint/restore cycle -- Test recovery from common failures -- Test escalation flow - ---- - -#### Task 6.3: Create Test Fixtures and Mocks - -**Files to Create:** -- `agent_orchestrator/tests/recovery/conftest.py` -- `agent_orchestrator/tests/recovery/mocks.py` - -**Requirements:** -- MockMCPClient that simulates failures -- FakeCheckpointStorage (in-memory) -- Sample error messages for testing -- MockVisionAgent - ---- - -## File Structure - -``` -agent_orchestrator/ -├── recovery/ -│ ├── __init__.py -│ ├── models.py # Data models -│ ├── config.py # Configuration -│ ├── health_monitor.py # Health monitoring -│ ├── error_classifier.py # Error classification -│ ├── recovery_manager.py # Recovery execution -│ ├── checkpoint_manager.py # Checkpoint management -│ ├── checkpoint.py # Checkpoint models -│ ├── vision_agent.py # Vision diagnosis -│ ├── escalation.py # Human escalation -│ ├── notifications.py # Webhook notifications -│ ├── agent.py # Main recovery agent -│ ├── storage/ -│ │ ├── __init__.py -│ │ ├── base.py # Storage interface -│ │ └── sqlite.py # SQLite implementation -│ └── utils.py # Shared utilities -├── recovery_cli.py # CLI entry point -└── tests/recovery/ # Test 
suite - -# Modified files: -agent_orchestrator/ -└── alas_mcp_server.py # Add recovery tools -``` - -## Dependencies - -### New Python Dependencies - -```toml -# Add to agent_orchestrator/pyproject.toml -[project.optional-dependencies] -recovery = [ - "aiosqlite>=0.20.0", # Async SQLite - "aiohttp>=3.9.0", # HTTP client for webhooks - "pillow>=10.0.0", # Image handling for vision -] -``` - -### External Services - -- Gemini API (for vision diagnosis) -- Webhook endpoints (for notifications) - -## Configuration Example - -```yaml -# recovery_config.yaml -recovery: - enabled: true - - health_monitor: - heartbeat_timeout_seconds: 30 - progress_timeout_seconds: 120 - error_threshold: 5 - error_window_seconds: 300 - - recovery_manager: - max_recovery_attempts: 3 - enable_vision_diagnosis: true - retry_backoff_base: 2 - retry_backoff_max: 30 - - checkpointing: - enabled: true - storage_backend: "sqlite" - sqlite_path: "./data/checkpoints.db" - auto_checkpoint_interval_seconds: 300 - checkpoint_on_task_complete: true - max_checkpoints_per_thread: 50 - - vision: - model: "gemini-2.0-flash" - api_key: "${GEMINI_API_KEY}" - confidence_threshold: 0.7 - - escalation: - enabled: true - max_escalation_context_age_hours: 24 - notification_webhooks: - - "${SLACK_WEBHOOK_URL}" - - "${DISCORD_WEBHOOK_URL}" - halt_on_escalation: true -``` - -## Success Criteria by Phase - -| Phase | Key Deliverables | Success Metric | -|-------|------------------|----------------| -| 1 | Health monitor, error classifier, basic recovery | 90% of transient failures auto-recovered | -| 2 | Checkpoint system, persistence | Workflow can restart from any checkpoint | -| 3 | Vision diagnosis | 70% of unknown failures diagnosed correctly | -| 4 | Escalation system | <5% of failures require human intervention | -| 5 | Full integration | End-to-end recovery in <60s | -| 6 | Test coverage | 80%+ unit test coverage, integration tests pass | - -## Risks and Mitigations - -| Risk | Phase | Mitigation | 
-|------|-------|------------| -| Recovery makes situation worse | 1 | Conservative approach, extensive testing | -| Checkpoint storage grows unbounded | 2 | Automatic pruning, size limits | -| Vision API costs too high | 3 | Confidence threshold, caching | -| Webhook spam | 4 | Rate limiting, batching | -| MCP server compatibility | 5 | Feature flags, graceful degradation | - -## Documentation Requirements - -- [ ] Update ARCHITECTURE.md with recovery agent component -- [ ] Add recovery section to AGENTS.md -- [ ] Create recovery agent README -- [ ] Document all MCP recovery tools -- [ ] Create runbook for common recovery scenarios -- [ ] Document configuration options - -## Next Steps - -1. Review this plan with stakeholders -2. Prioritize tasks based on current pain points -3. Create feature branch: `feature/recovery-agent` -4. Begin Phase 1 implementation -5. Set up CI/CD for new test suite - ---- - -*Document Status: Ready for Implementation* -*Depends on: [recovery_agent_architecture.md](recovery_agent_architecture.md)* diff --git a/docs/dev/watchdog_investigation_2026_02_20.md b/docs/dev/watchdog_investigation_2026_02_20.md deleted file mode 100644 index 755bb9ab8c..0000000000 --- a/docs/dev/watchdog_investigation_2026_02_20.md +++ /dev/null @@ -1,113 +0,0 @@ -# Watchdog Investigation — 2026-02-20 - -## Problem Statement - -The user requested a recurring watchdog that detects when the bot has **paused** (scheduler loop idle, user clicked Stop, or no forward progress) and sends a start command to resume it — without requiring a full restart. The previous implementation mis-interpreted this intent. - ---- - -## Findings - -### Finding 1: The Watchdog Checks the Wrong Process - -**Current logic** (`has_running_config`): - -```python -if "gui.py" in joined and "--run" in joined and config_name in joined: - return True -``` - -This checks whether `gui.py --run PatrickCustom` appears in the system process list. 
That is the **web UI server process** (`uvicorn`/`pywebio`), **not** the bot task loop. - -The bot task loop is a separate `multiprocessing.Process` child of `gui.py`, launched by `ProcessManager.start()`. When the user clicks **Stop** in the UI, `ProcessManager.stop()` kills that child process with `process.kill()` — but **`gui.py` remains running**. The watchdog would see `gui.py` alive, log `OK: PatrickCustom is running`, and do absolutely nothing. This is exactly the failure mode described. - -**Process hierarchy (observed live today):** - -``` -gui.py (PID 64556) ← uvicorn web server — ALWAYS alive - ├── python spawn_main (PID 93864) ← bot task worker - ├── python spawn_main (PID 71768) ← bot task worker - └── python spawn_main (PID 122984) ← bot task worker - -gui.py (PID 123832) ← a second gui.py with NO children → bot is STOPPED here -``` - -`gui.py` with no children = bot stopped. The watchdog cannot see this distinction. - ---- - -### Finding 2: There Is No External HTTP API to Trigger "Start" - -Investigation confirmed that the ALAS web UI has **no JSON/REST API**. The Starlette app only exposes pywebio WebSocket routes and static files. `ProcessManager._processes` is a class-level dict that lives entirely inside the `uvicorn` server process — nothing exposes it externally. - -The only mechanism available from outside to start the bot is: - -> Kill `gui.py` and relaunch it with `--run PatrickCustom`. - -When `gui.py` starts with `--run PatrickCustom`, it automatically fires: - -```python -ProcessManager.restart_processes(instances=["PatrickCustom"], ev=updater.event) -``` - -This is the equivalent of clicking Start immediately on boot. This is the correct mechanism. 
**Relaunching `gui.py` IS pressing Start.** - ---- - -### Finding 3: The Correct "Paused" Signal Is Log Staleness - -Because `ProcessManager.alive` and `ProcessManager.state` are internal to `gui.py` and not exposable, the only reliable external signal that the bot scheduler loop has **paused or stopped** is the ALAS log file. - -- The bot writes a line **at every screenshot** (every ~0.3–1.0 seconds while active). -- During scheduler sleep (waiting for next task) it writes "Scheduler: Next run in X seconds" entries periodically. -- If the log file has not been written to in more than **N minutes**, the bot has either stopped, crashed, or is stuck. - -**N = 10 minutes** is a safe threshold — even during `time.sleep(60)` pauses in the recovery path, log activity resumes within a minute. - ---- - -### Finding 4: The Bot Has Been Stuck in a GameStuckError Loop All Day - -Separately, the bot has been stuck in a `GameStuckError` loop since ~03:51 today, cycling every ~3.5 minutes: - -``` -APP RESTART → login → click LOGIN_CHECK → Login success → click GET_SHIP - → ~3 minute spin → GameStuckError: Wait too long - → End task Restart → Start task Restart immediately - → (every 3rd failure): delay 10 min → resume -``` - -This is the **Feb 17 login regression** (`continue` instead of `return True` in `_handle_app_login`), already patched today. The bot process needs to be restarted to load the fix. - -The watchdog played **no role** in this loop. Every watchdog check returned `OK` because `gui.py` was alive throughout. - ---- - -## Correct Design for the Watchdog - -The watchdog should: - -1. **Check log staleness** — read the most recent ALAS log file for `PatrickCustom` and check the timestamp of the last line. If older than `stale_threshold_minutes`, the bot is paused. -2. **Kill and relaunch `gui.py`** — this is the equivalent of pressing Start. On relaunch, `gui.py --run PatrickCustom` automatically starts the bot worker. -3. 
**Never check `gui.py` process presence as the sole criterion** — `gui.py` being alive says nothing about whether the bot loop is running. - -See `agent_orchestrator/watchdog_keep_patrick_running.py` for canonical implementation. -Compatibility wrapper remains at `scripts/watchdog_keep_patrick_running.py`. - ---- - -## Process Table for Future Reference - -| What is running | `gui.py` alive? | Bot worker children? | Log being written? | Bot state | -|---|---|---|---|---| -| Normal bot running | ✅ | ✅ | ✅ | Running | -| User clicked Stop | ✅ | ❌ | ❌ | **Paused — watchdog should restart** | -| Bot stuck/GameStuckError loop | ✅ | ✅ | ✅ (same lines) | Stuck — log staleness won't catch this | -| Machine suspended | ✅ (frozen) | ✅ (frozen) | ❌ (stale) | **Paused — watchdog should restart on wake** | -| `gui.py` crashed | ❌ | ❌ | ❌ | **Down — watchdog relaunches gui.py** | - -### Limitation - -Log staleness cannot distinguish "scheduler is sleeping between tasks" (normal) from "bot is paused". The scheduler emits periodic log lines even when sleeping, so a 10-minute stale threshold correctly ignores normal inter-task waits (which are at most a few minutes for any enabled task). - -A deeper stuck-loop (e.g., GameStuckError cycling every 3.5 minutes) DOES write log lines, so staleness won't catch it. Catching that requires a separate failure-pattern detector (parsing for repeated errors) — this is tracked in `docs/plans/recovery_agent_architecture.md` as a future Phase 0 capability. diff --git a/docs/investigations/login_popup_loop_stuck_investigation.md b/docs/investigations/login_popup_loop_stuck_investigation.md deleted file mode 100644 index 3fefe569c1..0000000000 --- a/docs/investigations/login_popup_loop_stuck_investigation.md +++ /dev/null @@ -1,349 +0,0 @@ -# Login Popup Loop Stuck Investigation (Feb 14–20, 2026) - -## Thesis: Law Type and State - -**Investigation state:** Root-cause confirmed; fix applied (documentation-only risk change). 
- -**Law ID:** `L_LOGIN_POPUP_CONVERGENCE` - -**Law type:** Deterministic state-convergence law (transition correctness + bounded progress). - -**Thesis:** The `_handle_app_login()` loop must converge to post-login state within bounded time. The predicate `P` states: when `ui_page_main_popups()` reports a handled main-page popup during login recovery, the loop must exit successfully. The fail mode `F` occurs when the loop re-enters popup/transition handling without progress until `stuck_record_check()` raises `GameStuckError` after 60 seconds. - -The regression introduced in commit `37151310c` on Feb 17, 2026 changed the branch control from `return True` to `continue`, broadening the loop surface and preventing convergence when popups were handled successfully. - ---- - -## Exact Logic That Causes the Hit - -### Code Flow Architecture - -```mermaid -flowchart TD - A[handle_app_login] -->|sets screenshot interval| B[_handle_app_login loop] - B --> C{is_in_main?} - C -->|Yes| D{confirm_timer.reached?} - D -->|Yes| E[break - Login to main confirm] - D -->|No| F[screenshot] - C -->|No| G[reset confirm_timer] - G --> F - F --> H{LOGIN_CHECK match?} - H -->|Yes| I[click LOGIN_CHECK] - I --> J[login_success = True] - J --> K{ui_page_main_popups?} - K -->|Returns True - BRANCH A| L[return True - EXIT] - K -->|Returns True - BRANCH B| M[continue - LOOP AGAIN] - M --> F - L --> N[Login complete] - F --> O{Other handlers} - O -->|Any handled| F - O -->|None handled| P{stuck_timer.reached?} - P -->|Yes| Q[GameStuckError] - P -->|No| F -``` - -### The Critical Branch - -The critical code location is in [`alas_wrapped/module/handler/login.py:94-99`](alas_wrapped/module/handler/login.py:94): - -**Upstream (correct):** -```python -# Popups appear at page_main -if self.ui_page_main_popups(get_ship=login_success): - return True # EXIT the loop -``` - -**Regression (commit 37151310c):** -```python -# Popups appear at page_main. 
-# Keep looping until main-page confirmation succeeds, -# otherwise we may exit login while UI is still transitioning. -if self.ui_page_main_popups(get_ship=login_success): - continue # LOOP AGAIN - never exits! -``` - -### Why This Is the Fact That Was Hit - -1. **[`ui_page_main_popups()`](alas_wrapped/module/ui/ui.py:362)** handles many post-login popup states (guild popups, announcements, GET_ITEMS, GET_SHIP, event lists, battle pass notices, etc.) -2. When any popup is handled, the function returns `True` -3. With `continue`, the loop iterates again instead of exiting -4. The `confirm_timer` only reaches when `is_in_main()` returns True -5. If popups keep appearing, `is_in_main()` may never return True -6. The `stuck_timer` (60 seconds) eventually fires, causing `GameStuckError` - -### Stuck Detection Mechanism - -From [`alas_wrapped/module/device/device.py:68-70`](alas_wrapped/module/device/device.py:68): -```python -stuck_timer = Timer(60, count=60).start() -stuck_timer_long = Timer(180, count=180).start() -stuck_long_wait_list = ['BATTLE_STATUS_S', 'PAUSE', 'LOGIN_CHECK'] -``` - -From [`alas_wrapped/module/device/device.py:242-264`](alas_wrapped/module/device/device.py:242): -```python -def stuck_record_check(self): - reached = self.stuck_timer.reached() - reached_long = self.stuck_timer_long.reached() - - if not reached: - return False - if not reached_long: - for button in self.stuck_long_wait_list: - if button in self.detect_record: - return False - - logger.warning('Wait too long') - logger.warning(f'Waiting for {self.detect_record}') - - if self.app_is_running(): - raise GameStuckError(f'Wait too long') -``` - ---- - -## Pass History Across the Law (Error + Non-Error) - -### Failure Law Passes (Violation) - 3 Citations - -#### Citation 1: 2026-02-20 00:03:46 (Log: [`2026-02-19_PatrickCustom.txt:395-434`](alas_wrapped/log/2026-02-19_PatrickCustom.txt:395)) -``` -2026-02-19 23:59:58.334 | INFO | handle_app_login -2026-02-19 23:59:58.339 | INFO | <<< APP 
LOGIN >>> -2026-02-20 00:00:25.808 | INFO | Click (1137, 615) @ LOGIN_CHECK -2026-02-20 00:00:25.905 | INFO | Login success -2026-02-20 00:00:25.914 | INFO | Click (1018, 661) @ GET_SHIP -2026-02-20 00:00:44.997 | INFO | Click (1138, 600) @ LOGIN_CHECK -2026-02-20 00:00:45.091 | INFO | Click (1018, 669) @ GET_SHIP -2026-02-20 00:03:46.546 | WARNING | Wait too long -2026-02-20 00:03:46.623 | ERROR | GameStuckError: Wait too long -``` -**Analysis:** Login succeeded at 00:00:25, GET_SHIP popup was handled, but the loop continued. Another LOGIN_CHECK click at 00:00:44 shows the loop re-entered login handling. After ~3.5 minutes, stuck timer fired. - -**Outcome: Law failed** (no convergence before stuck timeout). - -#### Citation 2: 2026-02-17 04:45:08 (Log: [`2026-02-17_PatrickCustom.txt:2549-2551`](alas_wrapped/log/2026-02-17_PatrickCustom.txt:2549)) -``` -2026-02-17 04:45:08.798 | INFO | [Package_name] com.YoStarEN.AzurLane -2026-02-17 04:45:08.802 | ERROR | GameStuckError: Wait too long -``` -**Analysis:** Same pattern - login entered, stuck timer fired during popup handling loop. - -**Outcome: Law failed.** - -#### Citation 3: 2026-02-16 00:59:01 (Log: [`2026-02-16_PatrickCustom.txt:1094-1096`](alas_wrapped/log/2026-02-16_PatrickCustom.txt:1094)) -``` -2026-02-16 00:59:01.517 | INFO | [Package_name] com.YoStarEN.AzurLane -2026-02-16 00:59:01.518 | ERROR | GameStuckError: Wait too long -``` -**Analysis:** Same failure pattern during login recovery. - -**Outcome: Law failed.** - -### Non-Error Law Passes - 3 Citations - -#### Citation 4: 2026-02-15 21:43:00 (Log: [`2026-02-15_PatrickCustom.txt:806-832`](alas_wrapped/log/2026-02-15_PatrickCustom.txt:806)) -``` -2026-02-15 21:43:00.581 | INFO | App start: com.YoStarEN.AzurLane -2026-02-15 21:43:00.684 | INFO | handle_app_login -2026-02-15 21:43:00.686 | INFO | Screenshot interval set to 1.0s -``` -**Analysis:** This was BEFORE the Feb 17 regression commit. Login completed successfully without GameStuckError. 
The task scheduler progressed to the next task. - -**Outcome: Law passed** (pre-regression behavior). - -#### Citation 5: 2026-02-15 22:00:11 (Log: [`2026-02-15_PatrickCustom.txt:1551-1570`](alas_wrapped/log/2026-02-15_PatrickCustom.txt:1551)) -``` -2026-02-15 22:00:11.337 | INFO | App start: com.YoStarEN.AzurLane -2026-02-15 22:00:11.390 | INFO | handle_app_login -2026-02-15 22:00:11.391 | INFO | Screenshot interval set to 1.0s -``` -**Analysis:** Also before the regression. Login succeeded, Restart task completed, forward scheduling continued. - -**Outcome: Law passed** (pre-regression behavior). - -#### Citation 6: 2026-02-14 01:04:47 (Log: [`2026-02-14_PatrickCustom.txt:101-257`](alas_wrapped/log/2026-02-14_PatrickCustom.txt:101)) -``` -2026-02-14 01:04:47.908 | INFO | App start: com.YoStarEN.AzurLane -2026-02-14 01:04:47.968 | INFO | handle_app_login -2026-02-14 01:04:47.970 | INFO | Screenshot interval set to 1.0s -``` -**Analysis:** Pre-regression. Login completed without GameStuckError. An unrelated OSError appeared later in the run, but the login law path was not violated. - -**Outcome: Law passed for this condition.** - ---- - -## Code Citations (6 Locations) - -### Citation A: Upstream Login Handler (Correct) -**File:** [`upstream_alas/module/handler/login.py:94-96`](upstream_alas/module/handler/login.py:94) -```python -# Popups appear at page_main -if self.ui_page_main_popups(get_ship=login_success): - return True -``` -**Significance:** This is the correct upstream behavior - `return True` exits the login loop when popups are handled. - -### Citation B: Wrapped Login Handler (Fixed) -**File:** [`alas_wrapped/module/handler/login.py:94-99`](alas_wrapped/module/handler/login.py:94) -```python -# Popups appear at page_main. -# Keep looping until main-page confirmation succeeds, -# otherwise we may exit login while UI is still transitioning. 
-if self.ui_page_main_popups(get_ship=login_success): - return True -``` -**Significance:** The fix has been applied - `return True` is restored. The comment from the regression remains as documentation of the intent. - -### Citation C: Popup Handler Implementation -**File:** [`alas_wrapped/module/ui/ui.py:362-426`](alas_wrapped/module/ui/ui.py:362) -```python -def ui_page_main_popups(self, get_ship=True): - """ - Handle popups appear at page_main, page_reward - """ - # Guild popup - if self.handle_guild_popup_cancel(): - return True - # Daily reset - if self.appear_then_click(LOGIN_ANNOUNCE, offset=(30, 30), interval=3): - return True - # ... many more handlers ... - return False -``` -**Significance:** Shows the many popup types handled - each returns `True` when handled, which should allow login to complete. - -### Citation D: Stuck Timer Configuration -**File:** [`alas_wrapped/module/device/device.py:68-70`](alas_wrapped/module/device/device.py:68) -```python -stuck_timer = Timer(60, count=60).start() -stuck_timer_long = Timer(180, count=180).start() -stuck_long_wait_list = ['BATTLE_STATUS_S', 'PAUSE', 'LOGIN_CHECK'] -``` -**Significance:** The 60-second stuck timer is the bound on loop iterations. `LOGIN_CHECK` is in the long-wait list, giving 180 seconds during login. - -### Citation E: Stuck Record Check -**File:** [`alas_wrapped/module/device/device.py:242-264`](alas_wrapped/module/device/device.py:242) -```python -def stuck_record_check(self): - """ - Raises: - GameStuckError: - """ - reached = self.stuck_timer.reached() - # ... - if self.app_is_running(): - raise GameStuckError(f'Wait too long') - else: - raise GameNotRunningError('Game died') -``` -**Significance:** This is called on every `screenshot()` call, enforcing the bounded progress law. - -### Citation F: Login Loop Structure -**File:** [`alas_wrapped/module/handler/login.py:40-103`](alas_wrapped/module/handler/login.py:40) -```python -while 1: - # ... orientation handling ... 
- self.device.screenshot() - - # End - if self.is_in_main(): - if confirm_timer.reached(): - logger.info('Login to main confirm') - break - else: - confirm_timer.reset() - - # ... many handlers with continue ... - - # Popups appear at page_main - if self.ui_page_main_popups(get_ship=login_success): - return True - - # Always goto page_main - if self.appear_then_click(GOTO_MAIN, offset=(30, 30), interval=5): - continue - -return True -``` -**Significance:** Shows the loop structure - only two exit paths: `break` after `is_in_main()` confirmation, or `return True` from popup handling. - ---- - -## Risk and Explanation - -### Root Cause -The high-confidence hit mechanism is **branch semantics + repeated handled-popups**, not `ui_page_main_popups` detection quality itself. - -### Intent vs Impact -The recovery intent in commit message (`37151310c`) was to "preserve recovery" by not exiting login while UI was still transitioning. However, the specific branch control change from `return True` to `continue` broadened the loop surface too much. The comment stated: - -> "Keep looping until main-page confirmation succeeds, otherwise we may exit login while UI is still transitioning." - -This intent was reasonable, but the implementation prevented ANY exit via the popup branch, even when popups were successfully handled and the UI was stable. - -### Correct Fix -Returning from handled popup is the conservative convergence-safe form. If a popup was successfully handled (function returned `True`), login should complete. The `is_in_main()` + `confirm_timer` gate already provides protection against half-painted screens. - ---- - -## Current Evidence-Backed Documentation Status - -- **Current code state:** `return True` at the popup branch ([`login.py:97-99`](alas_wrapped/module/handler/login.py:97)) - fix applied. -- **Regression commit:** `37151310c` on Feb 17, 2026 changed `return True` to `continue`. 
-- **Shared path:** The same state transition is consumed by deterministic MCP tooling ([`agent_orchestrator/alas_mcp_server.py`](agent_orchestrator/alas_mcp_server.py)), so this hotspot is directly shared by native + tool paths. - ---- - -## Timeline Summary - -| Date | Status | Description | -|------|--------|-------------| -| Feb 14-15 | ✅ Pass | Pre-regression: login completes successfully | -| Feb 17 | ❌ Fail | Regression commit `37151310c` introduces `continue` bug | -| Feb 17-20 | ❌ Fail | Continuous GameStuckError during login attempts | -| Feb 20 | ✅ Fixed | `return True` restored in wrapped code | - ---- - -## Lessons Learned - -1. **Loop exit paths must be explicit:** When a handler returns `True` indicating success, the loop should exit, not continue. -2. **Comments can mask bugs:** The comment "Keep looping until main-page confirmation succeeds" sounded reasonable but masked the fact that successful popup handling should also exit. -3. **Test popup-heavy scenarios:** The bug only manifested when popups appeared during login; simple login flows without popups would succeed. -4. **Stuck timer is a safety net, not a feature:** The 60-second stuck timer caught this bug, but relying on it means the user experiences a 60-second delay before recovery. - -## 2026-02-20 Implementation Update (Sidecar Observability) - -No further regression logic was introduced in this fix; the behavior change was limited to instrumentation and recovery policy around transport retries. - -Files changed for parsing support: - -- `alas_wrapped/module/exception.py` (new `GameTransportError`) -- `alas_wrapped/module/handler/login.py` (login trace sidecar writes) -- `alas_wrapped/alas.py` (schedule status sidecar writes, restart dedupe, transport recover-before-restart) - -What remains unchanged: - -- Existing standard logger text output and logger setup. -- The core popup fix itself remains `return True` at the `ui_page_main_popups` branch. 
- -For quick parser usage: - -- `Get-Content alas_wrapped/log/login_trace.jsonl | ConvertFrom-Json` -- `Get-Content alas_wrapped/log/schedule_status.jsonl | ConvertFrom-Json` - -## Review Annotations - -### Files Reviewed -- `docs/investigations/login_popup_loop_stuck_investigation.md` - -### Assessment -- [ERROR FOUND] There is one timing precision issue in the writeup: repeated claims reference a fixed 60-second wait while the actual failure behavior includes a 180-second allowance when `LOGIN_CHECK` is in `stuck_long_wait_list`. -- [NO LOGIC ERROR] The root-cause chain is internally consistent and aligns with the provided code snippets and citations. - -### Specific Observations -- [FIXED-STATE NOTE] The doc says the fix is applied and currently shows `return True` in wrapped code, which appears consistent with the cited file excerpt. -- [EVIDENCE QUALITY] The pass examples only show entry logs plus partial context; they are plausible but not full proofs of successful task completion without auxiliary log lines. -- [NOMINAL WARNING] A few lines include non-ASCII glyph substitutions (`–`, `✅`, etc.); this is formatting-only but reduces readability. diff --git a/docs/monorepo/MONOREPO_SYNC_NOTES.md b/docs/monorepo/MONOREPO_SYNC_NOTES.md deleted file mode 100644 index 770056fce9..0000000000 --- a/docs/monorepo/MONOREPO_SYNC_NOTES.md +++ /dev/null @@ -1,83 +0,0 @@ -# Monorepo Organization & Sync Workflow - -> **Status**: Complete - structure in place and operational - -This subdomain documents the vendor branch pattern and the operational procedures for staying current with upstream ALAS updates. - -## High-Level Summary - -We use a **Monorepo** structure implementing the **Vendor Branch Pattern** to safely develop AI capabilities alongside a legacy bot. - -> **⚠️ Git Rule: One repo, one submodule. 
Everything else is folders.** -> - `ALAS/` is the only git repo -> - `upstream_alas/` is the only git submodule -> - Never run `git init` in subfolders - -## Git Structure & Purpose - -| Path | Type | Purpose | Python | -|------|------|---------|--------| -| `upstream_alas/` | Submodule | Read-only mirror of `Zuosizhu/Alas-with-Dashboard` | - | -| `alas_wrapped/` | Folder | **Single source of truth** — ALAS with MCP hooks, tools, our customizations | 3.9 | -| `agent_orchestrator/` | Folder | AI agent code and persistent MCP server | 3.10+ | - -## Tool Placement Rule - -| Tool Type | Location | Python Version | -| :--- | :--- | :--- | -| Tools importing ALAS internals (`module.*`) | `alas_wrapped/tools/` | 3.9 | -| Standalone tools (zero ALAS dependencies) | `agent_orchestrator/` | 3.10+ | - ---- - -## The Sync Workflow - -Changes flow **downstream only**: `upstream → wrapped` - -### 1. The Update Loop (Monthly) -*Goal: Fetch game updates from the community.* - -1. **Update Submodule**: - ```bash - git submodule update --remote -- upstream_alas - ``` -2. **Compare & Merge**: Diff `upstream_alas` against `alas_wrapped`, apply relevant changes while preserving our MCP hooks and customizations. Use a merge tool for logic files. -3. **Test**: Run `agent_orchestrator` tests to ensure the Agent can still drive the wrapped code. - -### 3. The Development Loop (Daily) -1. Modify `agent_orchestrator` code (Python 3.10+). -2. If new actions are needed, add a function to `alas_wrapped/module/state_machine.py`. -3. Restart the persistent `alas_mcp_server`. - ---- - -## Submodule Strategy - -### `upstream_alas` -- **Source**: `https://github.com/Zuosizhu/Alas-with-Dashboard.git` -- **Pinning**: Pinned by commit; updated via `git submodule update --remote -- upstream_alas`. -- **Rationale**: Pull upstream changes regularly for game updates. -- **Rule**: Never modify this folder directly. 
- ---- - -## Guardrails - -### Pre-push hook: nested `.git` check - -`.githooks/pre-push` scans for any `.git` directory inside the repo (up to 4 levels deep) that is neither `./.git` (the repo root) nor `./upstream_alas/.git` (the submodule). If any are found the push is blocked with a clear error listing the offending paths. - -To fix: remove the nested `.git` directory (`rm -rf /.git`) and push again. - -### Why nested `.git` directories are dangerous - -1. **VS Code confusion**: VS Code's git extension picks up any `.git` it finds. A nested `.git` causes it to treat that subdirectory as a separate repository, showing phantom untracked/modified files from the wrong worktree. -2. **Commits to the wrong repo**: `git` commands run inside the subdirectory resolve against the nested `.git`, so commits, pushes, and branch operations silently target the wrong remote. -3. **Merge noise**: `git status` in the parent repo sees the entire subdirectory as a single untracked path, hiding real changes underneath it. - -### Incident record - -**Date**: 2026-03-03 - -`alas_wrapped/.git` was found to be a rogue `.git` directory left over from the initial upstream copy. It pointed at `LmeSzinc/AzurLaneAutoScript` (the original upstream remote), not this repository. This caused VS Code to report 203 phantom untracked changes inside `alas_wrapped/` and would have silently routed any `git` command run from that directory to the wrong remote. The directory was deleted manually and the pre-push guard was added to prevent recurrence. 
- diff --git a/docs/monorepo/UPSTREAM_ALAS_LOCAL_EDITS_2026-02-16_043433.md b/docs/monorepo/UPSTREAM_ALAS_LOCAL_EDITS_2026-02-16_043433.md deleted file mode 100644 index 3d7b547e7f..0000000000 --- a/docs/monorepo/UPSTREAM_ALAS_LOCAL_EDITS_2026-02-16_043433.md +++ /dev/null @@ -1,283 +0,0 @@ -# upstream_alas Local Edits Snapshot - -- Captured: 2026-02-16T04:34:33-06:00 -- Submodule: `upstream_alas` -- HEAD before refresh: `cda8619c0` - -## Status -``` -## HEAD (no branch) - M module/config/utils.py - M requirements.txt - M webapp/package-lock.json - M webapp/packages/main/src/config.ts - M webapp/packages/main/vite.config.js - M webapp/packages/preload/vite.config.js - M webapp/packages/renderer/vite.config.js - M webapp/scripts/watch.js - M webapp/yarn.lock -``` - -## Diffstat -``` - module/config/utils.py | 5 +- - requirements.txt | 45 +- - webapp/package-lock.json | 108 +- - webapp/packages/main/src/config.ts | 8 +- - webapp/packages/main/vite.config.js | 8 +- - webapp/packages/preload/vite.config.js | 8 +- - webapp/packages/renderer/vite.config.js | 12 +- - webapp/yarn.lock | 2623 +++++++++++++++---------------- - 8 files changed, 1397 insertions(+), 1420 deletions(-) -``` - -## Key Patch Excerpts (non-lockfiles) -```diff -diff --git a/module/config/utils.py b/module/config/utils.py -index 23f2085ed..2cde34af8 100644 ---- a/module/config/utils.py -+++ b/module/config/utils.py -@@ -77,7 +77,6 @@ def read_file(file): - Returns: - dict, list: - """ -- print(f'read: {file}') - if file.endswith('.json'): - content = atomic_read_bytes(file) - if not content: -@@ -92,7 +91,6 @@ def read_file(file): - data = {} - return data - else: -- print(f'Unsupported config file extension: {file}') - return {} - - -@@ -104,7 +102,6 @@ def write_file(file, data): - file (str): - data (dict, list): - """ -- print(f'write: {file}') - if file.endswith('.json'): - content = json.dumps(data, indent=2, ensure_ascii=False, sort_keys=False, default=str) - atomic_write(file, content) -@@ 
-117,7 +114,7 @@ def write_file(file, data): - data, default_flow_style=False, encoding='utf-8', allow_unicode=True, sort_keys=False) - atomic_write(file, content) - else: -- print(f'Unsupported config file extension: {file}') -+ pass - - - def iter_folder(folder, is_dir=False, ext=None): -diff --git a/requirements.txt b/requirements.txt -index 72ed5c457..4c4f6a3d3 100644 ---- a/requirements.txt -+++ b/requirements.txt -@@ -1,18 +1,13 @@ --# --# This file is autogenerated by pip-compile with Python 3.7 --# by the following command: --# --# pip-compile --annotation-style=line --output-file=requirements.txt requirements-in.txt --# -- --adbutils==0.11.0 # via -r requirements-in.txt, uiautomator2 -+# This file was autogenerated by uv via the following command: -+# uv pip compile requirements-in.txt --python-version=3.9 --override=overrides.txt --output-file=requirements.txt --annotation-style=line --only-binary av -+adbutils==0.11.0 # via uiautomator2, -r requirements-in.txt - aiofiles==0.7.0 # via -r requirements-in.txt - alas-webapp==0.3.7 # via -r requirements-in.txt - anyio==1.3.1 # via -r requirements-in.txt - apkutils2==1.0.0 # via adbutils - asgiref==3.4.1 # via uvicorn - async-generator==1.10 # via anyio --av==10.0.0 # via -r requirements-in.txt -+av==12.0.0 # via -r requirements-in.txt - cached-property==1.5.2 # via uiautomator2 - certifi==2021.5.30 # via requests - cffi==1.15.0 # via gevent -@@ -36,7 +31,6 @@ h11==0.12.0 # via uvicorn - httptools==0.4.0 # via uvicorn - idna==2.6 # via requests - imageio==2.27.0 # via -r requirements-in.txt --importlib-metadata==4.8.1 # via click, prettytable - inflection==0.5.1 # via -r requirements-in.txt - jellyfish==0.11.2 # via -r requirements-in.txt - kiwisolver==1.3.2 # via matplotlib -@@ -45,19 +39,19 @@ lxml==4.6.3 # via uiautomator2 - lz4==3.1.3 # via -r requirements-in.txt - matplotlib==3.4.3 # via gluoncv - msgpack==1.0.3 # via zerorpc --mxnet==1.6.0 # via -r requirements-in.txt, cnocr --numpy==1.16.6 # via -r 
requirements-in.txt, cnocr, gluoncv, imageio, matplotlib, mxnet, opencv-python, scipy -+mxnet==1.6.0 # via cnocr, -r requirements-in.txt -+numpy==1.19.5 # via cnocr, gluoncv, imageio, matplotlib, mxnet, opencv-python, scipy, --override overrides.txt, -r requirements-in.txt - onepush==1.4.0 # via -r requirements-in.txt - opencv-python==4.5.3.56 # via -r requirements-in.txt - packaging==20.9 # via deprecation, uiautomator2 --pillow==8.3.2 # via -r requirements-in.txt, cnocr, gluoncv, imageio, matplotlib, uiautomator2 -+pillow==8.3.2 # via cnocr, gluoncv, imageio, matplotlib, uiautomator2, -r requirements-in.txt - portalocker==2.3.2 # via gluoncv - prettytable==2.2.1 # via -r requirements-in.txt - progress==1.6 # via uiautomator2 - psutil==5.9.3 # via -r requirements-in.txt - py==1.10.0 # via retry - pycparser==2.21 # via cffi --pycryptodome==3.9.9 # via onepush -+pycryptodome==3.9.9 # via onepush, -r requirements-in.txt - pydantic==1.10.2 # via -r requirements-in.txt - pyelftools==0.27 # via apkutils2 - pygments==2.12.0 # via rich -@@ -67,35 +61,32 @@ python-dateutil==2.8.2 # via matplotlib - python-dotenv==0.19.0 # via uvicorn - pywebio==1.6.2 # via -r requirements-in.txt - pywin32==301 # via portalocker --pyyaml==5.4.1 # via -r requirements-in.txt, uvicorn --pyzmq==22.3.0 # via -r requirements-in.txt, zerorpc -+pyyaml==5.4.1 # via uvicorn, -r requirements-in.txt -+pyzmq==22.3.0 # via zerorpc, -r requirements-in.txt - requests==2.18.4 # via adbutils, gluoncv, mxnet, onepush, uiautomator2 - retry==0.9.2 # via adbutils, uiautomator2 - retrying==1.3.3 # via -r requirements-in.txt - rich==11.2.0 # via -r requirements-in.txt --scipy==1.4.1 # via -r requirements-in.txt, gluoncv -+scipy==1.7.3 # via gluoncv, -r requirements-in.txt -+setuptools==81.0.0 # via gevent, zope-event, zope-interface - six==1.16.0 # via adbutils, cycler, python-dateutil, retrying, uiautomator2 - sniffio==1.2.0 # via anyio - starlette==0.14.2 # via -r requirements-in.txt - tornado==6.1 # via pywebio 
--tqdm==4.62.3 # via -r requirements-in.txt, gluoncv --typing-extensions==4.3.0 # via asgiref, importlib-metadata, pydantic, rich, uvicorn -+tqdm==4.62.3 # via gluoncv, -r requirements-in.txt -+typing-extensions==4.3.0 # via pydantic - ua-parser==0.10.0 # via user-agents - uiautomator2==2.16.17 # via -r requirements-in.txt - uiautomator2cache==0.3.0.1 # via -r requirements-in.txt - urllib3==1.22 # via requests - user-agents==2.2.0 # via pywebio --uvicorn[standard]==0.17.6 # via -r requirements-in.txt -+uvicorn==0.17.6 # via -r requirements-in.txt - watchgod==0.7 # via uvicorn - wcwidth==0.2.5 # via prettytable - websockets==10.0 # via uvicorn - whichcraft==0.6.1 # via adbutils, uiautomator2 --wrapt==1.13.1 # via -r requirements-in.txt, deprecated -+wrapt==1.13.1 # via deprecated, -r requirements-in.txt - xmltodict==0.12.0 # via apkutils2 - zerorpc==0.6.3 # via -r requirements-in.txt --zipp==3.6.0 # via importlib-metadata --zope.event==4.5.0 # via gevent --zope.interface==5.4.0 # via gevent -- --# The following packages are considered to be unsafe in a requirements file: --# setuptools -+zope-event==4.5.0 # via gevent -+zope-interface==5.4.0 # via gevent -diff --git a/webapp/packages/main/src/config.ts b/webapp/packages/main/src/config.ts -index b295c5eae..266ad04d1 100644 ---- a/webapp/packages/main/src/config.ts -+++ b/webapp/packages/main/src/config.ts -@@ -3,7 +3,13 @@ const fs = require('fs'); - const path = require('path'); - - // export const alasPath = 'D:/AzurLaneAutoScript'; --export const alasPath = process.cwd(); -+// When running from webapp/, ALAS root is the parent directory. -+// If parent is upstream_alas, use alas_wrapped (our customized version with MCP tools). 
-+let root = path.resolve(process.cwd(), '..'); -+if (path.basename(root) === 'upstream_alas') { -+ root = path.resolve(root, '..', 'alas_wrapped'); -+} -+export const alasPath = root; - - const file = fs.readFileSync(path.join(alasPath, './config/deploy.yaml'), 'utf8'); - const config = yaml.parse(file); -diff --git a/webapp/packages/main/vite.config.js b/webapp/packages/main/vite.config.js -index 5a755eae3..a7e8b8526 100644 ---- a/webapp/packages/main/vite.config.js -+++ b/webapp/packages/main/vite.config.js -@@ -1,6 +1,6 @@ --import {node} from '../../electron-vendors.config.json'; --import {join} from 'path'; --import {builtinModules} from 'module'; -+const {node} = require('../../electron-vendors.config.json'); -+const {join} = require('path'); -+const {builtinModules} = require('module'); - - const PACKAGE_ROOT = __dirname; - -@@ -50,4 +50,4 @@ const config = { - }, - }; - --export default config; -+module.exports = config; -diff --git a/webapp/packages/preload/vite.config.js b/webapp/packages/preload/vite.config.js -index ead1f6107..e9157ebf8 100644 ---- a/webapp/packages/preload/vite.config.js -+++ b/webapp/packages/preload/vite.config.js -@@ -1,6 +1,6 @@ --import {chrome} from '../../electron-vendors.config.json'; --import {join} from 'path'; --import {builtinModules} from 'module'; -+const {chrome} = require('../../electron-vendors.config.json'); -+const {join} = require('path'); -+const {builtinModules} = require('module'); - - const PACKAGE_ROOT = __dirname; - -@@ -48,4 +48,4 @@ const config = { - }, - }; - --export default config; -+module.exports = config; -diff --git a/webapp/packages/renderer/vite.config.js b/webapp/packages/renderer/vite.config.js -index b9bcc3f6f..093e099c9 100644 ---- a/webapp/packages/renderer/vite.config.js -+++ b/webapp/packages/renderer/vite.config.js -@@ -1,9 +1,9 @@ - /* eslint-env node */ - --import {chrome} from '../../electron-vendors.config.json'; --import {join} from 'path'; --import {builtinModules} from 'module'; 
--import vue from '@vitejs/plugin-vue'; -+const {chrome} = require('../../electron-vendors.config.json'); -+const {join} = require('path'); -+const {builtinModules} = require('module'); -+const vue = require('@vitejs/plugin-vue'); - - const PACKAGE_ROOT = __dirname; - -@@ -19,7 +19,7 @@ const config = { - '/@/': join(PACKAGE_ROOT, 'src') + '/', - }, - }, -- plugins: [vue()], -+ plugins: [vue.default()], - base: '', - server: { - fs: { -@@ -48,4 +48,4 @@ const config = { - }, - }; - --export default config; -+module.exports = config; -``` - -## Lockfile Note -- Full diffs omitted intentionally for size/noise: - - `upstream_alas/webapp/package-lock.json` - - `upstream_alas/webapp/yarn.lock` - -## Refresh Result -- HEAD after refresh: `cda8619c0` - -### Clean Status -``` -## HEAD (no branch) -``` diff --git a/docs/plans/llm_driven_gameplay_architecture.md b/docs/plans/llm_driven_gameplay_architecture.md deleted file mode 100644 index 05f0fe267c..0000000000 --- a/docs/plans/llm_driven_gameplay_architecture.md +++ /dev/null @@ -1,422 +0,0 @@ -# LLM-Driven Gameplay Architecture (Revised) - -## Executive Summary - -This document revises the architecture to clarify the core intent: **The LLM actively plays the game**, making decisions and calling tools. Vision is used as a fallback when tools fail or state is unexpected - not as the primary interaction mode. - -## Key Clarifications - -### What "Vision Recovery" Actually Means - -| Misinterpretation | Correct Interpretation | -|-------------------|------------------------| -| LLM only activates on failure | LLM is always active, driving gameplay | -| Vision is the primary mode | Deterministic tools are primary, vision is fallback | -| Recovery agent is separate | Recovery is built into the main orchestrator loop | - -### The Actual Flow - -``` -LLM Orchestrator Loop: -1. Observe state (via tools or screenshot) -2. Decide action (which tool to call) -3. Execute tool -4. Check result - - Success? 
Continue loop - - Failure/Unexpected? Use vision to understand, then recover -5. Repeat -``` - -## Architecture: LLM-Driven Gameplay - -### High-Level Architecture - -```mermaid -flowchart TB - subgraph "Orchestrator Process - Python 3.10+" - LG[LangGraph 1.0
StateGraph Runtime] - GM[Gemini LLM
Decision Engine] - CP[CheckpointSaver
Durable Execution] - end - - subgraph "MCP Tool Layer" - MCP1[ALAS MCP Server
Game-specific tools] - MCP2[MaaMCP Server
Generic ADB/OCR] - end - - subgraph "Emulator" - EMU[MEmuPlayer
Android Emulator] - GAME[Azur Lane
Game Client] - end - - LG -->|calls| GM - GM -->|tool selection| LG - LG -->|checkpoint| CP - LG -->|MCP calls| MCP1 - LG -->|MCP calls| MCP2 - MCP1 -->|ADB| EMU - MCP2 -->|ADB| EMU - EMU --> GAME -``` - -### LangGraph 1.0 Integration - -```python -from langgraph.graph import StateGraph, MessagesState, START, END -from langgraph.checkpoint.memory import InMemorySaver -from langchain_core.messages import HumanMessage, AIMessage, ToolMessage - -class GameState(MessagesState): - """State for the game-playing agent.""" - current_page: str = "unknown" - last_action: str = "" - last_result: dict = {} - retry_count: int = 0 - task_queue: list = [] - -def observe_node(state: GameState) -> GameState: - """Observe current game state via tools.""" - # Call alas_get_current_state or screenshot - result = mcp_call("alas_get_current_state", {}) - return {"current_page": result.get("data", "unknown")} - -def decide_node(state: GameState) -> GameState: - """LLM decides what action to take.""" - # Gemini analyzes state and decides action - response = llm.invoke([ - SystemMessage(content=GAMEPLAY_SYSTEM_PROMPT), - HumanMessage(content=f"Current page: {state.current_page}. 
What should we do?") - ]) - return {"messages": [response]} - -def act_node(state: GameState) -> GameState: - """Execute the decided action.""" - tool_call = state.messages[-1].tool_calls[0] - result = mcp_call(tool_call["name"], tool_call["args"]) - return { - "last_action": tool_call["name"], - "last_result": result, - "messages": [ToolMessage(content=str(result), tool_call_id=tool_call["id"])] - } - -def check_result_node(state: GameState) -> str: - """Check if action succeeded, route accordingly.""" - if state.last_result.get("success"): - return "success" - if state.retry_count >= 3: - return "escalate" - return "recover" - -def recover_node(state: GameState) -> GameState: - """Use vision to understand failure and recover.""" - # Take screenshot - screenshot = mcp_call("adb_screenshot", {}) - - # Ask vision model to diagnose - diagnosis = vision_llm.invoke([ - SystemMessage(content=VISION_DIAGNOSIS_PROMPT), - HumanMessage(content=[ - {"type": "text", "text": f"Action {state.last_action} failed. Result: {state.last_result}"}, - {"type": "image", "data": screenshot["data"]} - ]) - ]) - - # Execute recovery action - # ... 
- return {"retry_count": state.retry_count + 1} - -# Build the graph -builder = StateGraph(GameState) -builder.add_node("observe", observe_node) -builder.add_node("decide", decide_node) -builder.add_node("act", act_node) -builder.add_node("recover", recover_node) - -builder.add_edge(START, "observe") -builder.add_edge("observe", "decide") -builder.add_edge("decide", "act") -builder.add_conditional_edges("act", check_result_node, { - "success": "observe", # Continue gameplay loop - "recover": "recover", - "escalate": END -}) -builder.add_edge("recover", "observe") - -# Compile with checkpointing -checkpointer = InMemorySaver() # Or PostgresSaver, SqliteSaver -graph = builder.compile(checkpointer=checkpointer) -``` - -## MCP Server Strategy: ALAS vs MaaMCP - -### Comparison - -| Aspect | ALAS MCP Server | MaaMCP | -|--------|-----------------|--------| -| **Purpose** | Azur Lane specific | General Android/Windows automation | -| **State Machine** | Built-in (9 years refined) | None (LLM-driven) | -| **OCR** | Game-specific models | General ONNX models | -| **Python** | 3.9 (via MCP bridge) | 3.10+ native | -| **Dependencies** | Heavy (ALAS stack) | Light (MaaFramework) | -| **License** | MIT | AGPL-3.0 | - -### Recommended Approach: Dual MCP Servers - -```json -// .mcp.json -{ - "mcpServers": { - "alas": { - "command": "uv", - "args": ["run", "--directory", "agent_orchestrator", "alas_mcp_server.py", "--config", "alas"], - "env": { - "PYTHONIOENCODING": "utf-8" - } - }, - "maamcp": { - "command": "maa-mcp", - "env": { - "MAA_RESOURCE_PATH": "./maa_resources" - } - } - } -} -``` - -### When to Use Each - -| Scenario | Primary Server | Fallback | -|----------|---------------|----------| -| Navigate to page | ALAS (has state machine) | MaaMCP (OCR + click) | -| Commission handling | ALAS (domain logic) | MaaMCP (generic) | -| Unknown popup | MaaMCP (screenshot + OCR) | Vision LLM | -| Combat operations | ALAS (sophisticated logic) | MaaMCP (basic) | -| Recovery from 
stuck | MaaMCP (independent ADB) | Vision LLM | - -## LangGraph 1.0 Best Practices - -### 1. Use StateGraph, Not Chains - -```python -# DON'T: Linear chains -chain = prompt | llm | parser - -# DO: StateGraph with cycles -builder = StateGraph(GameState) -builder.add_node("observe", observe_node) -builder.add_node("decide", decide_node) -builder.add_node("act", act_node) -builder.add_conditional_edges("act", check_result) -``` - -### 2. Checkpoint Everything - -```python -from langgraph.checkpoint.postgres import PostgresSaver - -# Production checkpointing -DB_URI = "postgresql://user:pass@localhost/langgraph" -checkpointer = PostgresSaver.from_conn_string(DB_URI) - -graph = builder.compile(checkpointer=checkpointer) - -# Resume from checkpoint -config = {"configurable": {"thread_id": "game_session_123"}} -result = graph.invoke(initial_state, config) -``` - -### 3. Human-in-the-Loop for Escalation - -```python -from langgraph.prebuilt import ToolNode - -# Add interrupt before critical actions -builder.add_node("human_review", human_review_node) -builder.add_edge("decide", "human_review") -builder.add_edge("human_review", "act") - -# Interrupt for approval -def human_review_node(state: GameState) -> GameState: - if state.current_page == "combat" and state.last_action == "start_combat": - # This will pause execution until human approves - return {"requires_approval": True} - return state -``` - -### 4. 
Use ToolNode for MCP Integration - -```python -from langgraph.prebuilt import ToolNode - -# Wrap MCP calls as LangChain tools -@tool -def alas_goto(page: str) -> dict: - """Navigate to a game page.""" - return mcp_call("alas_goto", {"page": page}) - -@tool -def alas_get_state() -> str: - """Get current game page.""" - return mcp_call("alas_get_current_state", {}) - -tools = [alas_goto, alas_get_state, adb_screenshot, adb_tap] -tool_node = ToolNode(tools) - -builder.add_node("tools", tool_node) -``` - -## Implementation Plan (Revised) - -### Phase 1: LangGraph Orchestrator Foundation - -**Goal**: Build the core gameplay loop with LangGraph 1.0 - -**Tasks:** -1. Create `agent_orchestrator/game_agent/` package -2. Define `GameState` state schema -3. Implement observe/decide/act/recover nodes -4. Wire up MCP client for tool calls -5. Add InMemorySaver checkpointing -6. Test basic gameplay loop - -**Files:** -``` -agent_orchestrator/game_agent/ -├── __init__.py -├── state.py # GameState definition -├── nodes.py # Node implementations -├── graph.py # StateGraph builder -├── tools.py # MCP tool wrappers -└── prompts.py # System prompts for gameplay -``` - -### Phase 2: Dual MCP Integration - -**Goal**: Support both ALAS MCP and MaaMCP - -**Tasks:** -1. Add MaaMCP as pip dependency -2. Create unified MCP client that routes to correct server -3. Implement fallback logic (ALAS → MaaMCP → Vision) -4. Add configuration for server selection -5. Test with both servers running - -### Phase 3: Vision Recovery Integration - -**Goal**: Use vision when tools fail - -**Tasks:** -1. Add Gemini Flash integration for vision -2. Implement screenshot analysis for failure diagnosis -3. Create recovery action generator -4. Add confidence thresholds for escalation -5. Test recovery scenarios - -### Phase 4: Durable Execution - -**Goal**: Production-ready checkpointing - -**Tasks:** -1. Switch from InMemorySaver to PostgresSaver -2. Add session management (thread_id per game session) -3. 
Implement resume from checkpoint -4. Add checkpoint pruning -5. Test crash recovery - -### Phase 5: Human-in-the-Loop - -**Goal**: Escalation and oversight - -**Tasks:** -1. Add interrupt nodes for critical actions -2. Implement approval workflow -3. Add notification hooks -4. Create resume-after-approval flow -5. Test human intervention scenarios - -## Key Design Decisions - -### 1. LLM is Always Active - -The LLM (Gemini) is the decision-maker in the gameplay loop. It: -- Observes state (via tools or vision) -- Decides what action to take -- Executes tools -- Handles failures with vision - -### 2. Deterministic Tools are Still Primary - -ALAS's tools are still the primary way to interact with the game because: -- They encode 9 years of domain knowledge -- They're faster than vision-based decisions -- They're more reliable for known scenarios - -### 3. Vision is for Unknowns - -Vision (screenshot analysis) is used when: -- Tools return unexpected results -- State doesn't match expectations -- Unknown popups appear -- Recovery is needed - -### 4. MaaMCP is the Backup - -MaaMCP provides: -- Independent ADB access (doesn't depend on ALAS) -- Generic OCR for unknown screens -- Fallback when ALAS tools fail - -### 5. LangGraph 1.0 for Durability - -LangGraph 1.0 provides: -- Durable execution with checkpointing -- Human-in-the-loop patterns -- State management -- Recovery from crashes - -## Updated File Structure - -``` -agent_orchestrator/ -├── game_agent/ # NEW: LangGraph-based gameplay agent -│ ├── __init__.py -│ ├── state.py # GameState TypedDict -│ ├── nodes.py # observe, decide, act, recover nodes -│ ├── graph.py # StateGraph builder -│ ├── tools.py # MCP tool wrappers for LangChain -│ ├── prompts.py # System prompts -│ └── recovery.py # Vision-based recovery logic -├── recovery/ # Recovery utilities (from previous plan) -│ ├── health_monitor.py -│ ├── error_classifier.py -│ └── ... 
-├── alas_mcp_server.py # Existing ALAS MCP server -└── pyproject.toml # Add: langgraph, langchain-core -``` - -## Dependencies to Add - -```toml -# Add to agent_orchestrator/pyproject.toml -dependencies = [ - # ... existing deps ... - "langgraph>=1.0.0", - "langchain-core>=1.0.0", - "langchain-google-genai>=2.0.0", # For Gemini -] -``` - -## Success Metrics - -| Metric | Target | How to Measure | -|--------|--------|----------------| -| Gameplay loop latency | <2s per action | Tool call timing | -| Recovery success rate | >80% | Vision recovery attempts | -| Checkpoint overhead | <100ms | Checkpoint timing | -| Human escalations | <5% | Escalation count | - ---- - -*Document Status: Revised Architecture* -*Previous: [recovery_agent_architecture.md](recovery_agent_architecture.md)* -*Last Updated: 2026-02-17* diff --git a/docs/plans/phase_0_login_tool_spec.md b/docs/plans/phase_0_login_tool_spec.md index 413599c5e2..f0485ff199 100644 --- a/docs/plans/phase_0_login_tool_spec.md +++ b/docs/plans/phase_0_login_tool_spec.md @@ -2,9 +2,7 @@ ## Why this tool exists -Login is the first deterministic “workflow tool” that a supervisor can delegate to. It turns the documented login sequence (see [docs/archive/deprecated/LoginFlow.md](../archive/deprecated/LoginFlow.md)) into a callable tool with a clear contract. - -This tool is Phase 0 work: **pure Python tool logic** (no MCP concerns). +Login is the first deterministic "workflow tool" that the MasterLoop can execute. It turns the conceptual login sequence into a callable tool with a clear, strict contract. This tool forms the foundation of the Tier 1 Hot Path. ## Tool name @@ -12,14 +10,16 @@ This tool is Phase 0 work: **pure Python tool logic** (no MCP concerns). ## Behavior -Guarantee the game is at the **Main Lobby** (`page_main`) with common popups dismissed. +Guarantee the game is at the **Main Lobby** (`page_main`) with all common popups dismissed. 
-Implementation should **reuse ALAS’s existing login handling** (template/asset driven), not rely primarily on hard-coded coordinate taps. +**Implementation Constraint**: This tool must use the **Element Detection Layer**. It must use modern UI tools (MaaFramework/MaaMCP or uiautomator2 + OCR) to semantically find login buttons, confirmation dialogs, and main menu anchors by content and boundaries. +- **DO NOT** use raw coordinate taps (`[x, y]`). +- **DO NOT** wrap ALAS's internal legacy template/asset matching classes. Extract the concepts, but build the tool cleanly. ## Preconditions - Emulator/device is reachable via ADB. -- ALAS core is initialized (OCR/models loaded) in the current process. +- Element detection backend (OCR models/UIAutomator) is initialized. ## Postconditions @@ -31,9 +31,7 @@ Implementation should **reuse ALAS’s existing login handling** (template/asset - `poll_interval_s: float = 1.0` - `dismiss_popups: bool = True` -(Deliberately minimal. If this grows, we should add separate tools rather than option explosion.) - -## Return Envelope (Required) +## Return Envelope (Strict Contract) The tool returns: @@ -43,9 +41,9 @@ The tool returns: - `observed_state: str | null` - `expected_state: str` (always `"page_main"`) -### `data` (suggested) +### `data` (suggested tracking) -- `actions_taken: list[object]` (clicks/taps performed) +- `actions_taken: list[object]` (clicked elements) - `markers_seen: list[str]` (popups/buttons observed) - `elapsed_s: float` @@ -54,17 +52,17 @@ The tool returns: If the tool cannot reach `page_main` within `max_wait_s`: - `success = false` -- `error` describes the last blocking condition if known -- `observed_state` is set to the last known state +- `error` describes the last blocking condition (e.g., "Failed to find 'Start' element"). +- `observed_state` is set to the last known semantic state. - `expected_state` is `"page_main"` -- `data` should contain diagnostic information (`actions_taken`, `markers_seen`, etc.) 
-- Include a screenshot in diagnostics when available (either via tool data or via a separate `adb.screenshot` call by the caller) +- `data` contains diagnostic information (`actions_taken`, `markers_seen`, etc.) +- A screenshot must be taken for the supervisor to transition into Tier 2 (VLM Piloting) for manual recovery and blueprinting. -This tool should be **best-effort deterministic** and must not attempt open-ended recovery. Recovery belongs to the supervisor (Phase II). +This tool is **best-effort deterministic**. Open-ended recovery belongs entirely to the Tier 2 Vision/Manual pipeline, never inside the deterministic tool itself. ## Test plan -- Script-style test similar to `alas_wrapped/tools/test_navigation.py` +- **Zero Hallucination**: Must be tested directly against live LDPlayer/MEmu. - Manual validation steps: - Start from app closed → run tool → end at `page_main` - Start from “Press to Start” → run tool → end at `page_main` diff --git a/docs/plans/phase_2_gemini_cli_plan.md b/docs/plans/phase_2_gemini_cli_plan.md deleted file mode 100644 index b348ebf40f..0000000000 --- a/docs/plans/phase_2_gemini_cli_plan.md +++ /dev/null @@ -1,61 +0,0 @@ -# Plan: Phase II - Supervisor Client (Transport) - -## Overview -Phase II introduces an autonomous supervisor that drives deterministic tools via MCP. - -This document is intentionally **transport-agnostic**: the supervisor could run as a CLI during development, a long-running service, or later as a LangGraph graph. The core requirement is that it speaks MCP and obeys tool contracts. - -## Inputs and Outputs - -### Inputs (each step) -- Current tool-visible state (e.g., `alas.get_current_state`) -- Optional screenshot (`adb.screenshot`) for recovery only -- Available tools (`alas.list_tools`) -- Short action history (last ~5-10 actions) - -### Outputs (each step) -- One tool call with arguments OR an explicit stop/escalation decision - -## Core Loop (Iterative Tool Loop) - -The supervisor follows: -1. 
Observe (state) -2. Decide (next tool) -3. Execute (tool call) -4. Verify (compare `expected_state` vs `observed_state`) -5. Continue / retry / escalate - -The supervisor must prefer deterministic `alas.*` tools over raw `adb.*` operations. - -## Tool Contract Requirements - -The supervisor assumes all gameplay tools return: -`{success, data, error, observed_state, expected_state}`. - -If a tool does not provide `observed_state`, the supervisor treats the result as incomplete and should stop early rather than guess. - -## Safety and Guardrails - -- **Loop caps**: maximum actions per goal to prevent infinite loops -- **Retry policy**: bounded retries for transient failures (timeouts, ADB hiccups) -- **Escalation policy**: fail fast if the state does not converge after retries -- **Minimal history**: keep only enough context to avoid repeating actions - -## Recovery Policy (Vision is Not the Hot Path) - -Vision is called only when: -- a deterministic tool fails unexpectedly, or -- `observed_state` is unknown/contradictory to `expected_state` - -When invoking vision, the supervisor must include: -- screenshot -- last actions -- expected outcome - -## Verification Plan - -Start with a single end-to-end deterministic workflow (login): -- Call `alas.login.ensure_main` -- Verify `observed_state == expected_state == page_main` - -Then expand to daily/commission workflows once tools exist. diff --git a/docs/plans/phase_2_master_plan.md b/docs/plans/phase_2_master_plan.md new file mode 100644 index 0000000000..313cf076df --- /dev/null +++ b/docs/plans/phase_2_master_plan.md @@ -0,0 +1,79 @@ +# Phase 2 Master Plan: Deterministic Tool Framework and First Production Tool Set + +Status: Draft (execution-ready) +Owner: ALAS automation team +Scope: `adb_vision/` in `d:\_projects\ALAS` +Aligned docs: `CLAUDE.md`, `docs/NORTH_STAR.md`, `TDD_IMPLEMENTATION_PLAN.md` + +--- + +## 1. 
Objective + +Phase 2 builds the **Tier 1 (Hot Path)** deterministic toolset that the MasterLoop executes automatically on every cycle: + +1. Register and discover deterministic tools. +2. Execute deterministic tools under strict contract checks. +3. Validate observed state after each tool action. +4. Fall back to Manual/VLM mode when executing tools fail. + +**Core Paradigm Shift**: All tools built in this phase must be **Element-Powered**. We are extracting logic from ALAS, not wrapping ALAS. We use modern ADB tools (MaaFramework/MaaMCP or uiautomator2 + OCR) to find buttons and icons semantically or by bounds. No raw coordinates, no ALAS asset templates. + +--- + +## 2. Phase Boundary + +### In Scope + +- Deterministic tool framework in `adb_vision/tools/`. +- First set of element-powered production tools: + - `goto_main_menu()` + - `check_resource_balances()` (OCR hot-read to state) + - `collect_commissions(...)` +- Tool registry and metadata model. +- Live test coverage for end-to-end deterministic execution against LDPlayer/MEmu. +- Logging and observability for deterministic tool calls. + +### Out of Scope + +- Blueprint auto-generation from Manual Piloting (Phase 3). +- Escalation to manual/VLM control (Phase 3). +- Ultimate emulator restart (Phase 4). + +--- + +## 3. Locked Constraints (Zero Hallucination) + +- **Element-driven**: Use content, bounds, and semantics to interact (`tap_element("commissions")`), never static `[x,y]`. +- **TDD + Live Emulator**: Every feature is built and tested directly against an active emulator. +- **Strict output contract**: + - `{success, data, error, observed_state, expected_state}` +- No ALAS-internal imports inside `adb_vision` production code. + +--- + +## 4. Success Criteria (Phase 2 Exit) + +1. Deterministic tool registry auto-discovers tool modules. +2. Three core tools exist, use element detection natively, and pass live tests. +3. Loop executes the deterministic path first and surfaces failures cleanly. +4. 
`collect_commissions` flow completely runs from main menu to end in life environment. +5. `check_resource_balances` writes hot OCR readings (oil/gem/coin) to queryable MCP resource state. + +--- + +## 5. Workstreams + +### 5.1 Workstream A: Element Tool Framework Core +Setup semantic discovery utilities mapping `MaaFramework` or `uiautomator2` bindings to the exact `{success, expected_state, observed_state}` contract. + +### 5.2 Workstream B: Tool 1 `goto_main_menu` +Identify core main menu anchors semantically, allowing navigation back to the primary state from common popups. + +### 5.3 Workstream C: Tool 2 `check_resource_balances` +Perform OCR reads on resource coordinates/icons, persisting output as standard telemetry JSON payload to the persistent state cache. + +### 5.4 Workstream D: Tool 3 `collect_commissions` +Perform exact commission tasks (find commission button text/bounds, click, select fleet, confirm) entirely via Element interactions. Ensure expected transitions map to UI element appearances. + +### 5.5 Workstream E: Live Test Wiring +Provide explicit pytest harnesses running against `adb get-state` attached emulators. Red-Green-refactor is mandatory for every tool created. diff --git a/docs/plans/program_execution_plan.md b/docs/plans/program_execution_plan.md new file mode 100644 index 0000000000..a15326eca1 --- /dev/null +++ b/docs/plans/program_execution_plan.md @@ -0,0 +1,83 @@ +# Program Execution Plan (All Phases) + +Status: Active +Purpose: Single planning index across Phase 0–5 with clear ownership of source-of-truth docs and current execution state. + +--- + +## 1. Canonical Planning Stack + +Use this order for decisions: + +1. `CLAUDE.md` + Agent interaction rules and core system truths. +2. `docs/NORTH_STAR.md` + Product truth and permanent architecture constraints. +3. `docs/plans/program_execution_plan.md` (this file) + End-to-end loop intent and operating model. +4. `TDD_IMPLEMENTATION_PLAN.md` + Build/test execution contract by phase. 
+5. `docs/ROADMAP.md` + Current status checkpoints. + +Phase-specific deep plans: +- Phase 2: `docs/plans/phase_2_master_plan.md` (primary) + +--- + +## 2. System Intent & Principles +Review `CLAUDE.md` and `NORTH_STAR.md` for full architecture. Core highlights: +- **Two-Tier Model**: Tier 1 is element-powered deterministic tools; Tier 2 is VLM/Manual recovery. +- **Single Harness**: The autonomous loop and recovery pipeline both use the identical MCP tool surface. +- **Extract, Don't Wrap**: Legacy ALAS is a reference only. Build clean, standalone tools. +- **Element-Aware**: Modern tools (MaaFramework/uiautomator2+OCR) over coordinate taps or templates. + +--- + +## 3. Phase Ledger (The Build Order) + +### Phase 0 (Foundation) +Goal: Emulator, ADB, screenshot, tool contract, and state-cache primitives working. +State: Complete. +Evidence: +- Strict `{success, data, error, observed_state, expected_state}` tool contract +- State cache operations +- Live test harness scaffolding + +### Phase 1 (Scheduler + Persistent State) +Goal: The MasterLoop handles scheduler decisions + loop execution + state persistence. Hook up hot OCR of resources (oil/gem/coin/timers) into persistent MCP queryable state. +State: Complete. +Evidence: +- `MasterLoop` deterministic-first check framework +- Per-cycle cache persistence +- Live test: `adb_vision/test_live.py::TestPhase1LiveLoop` + +### Phase 2 (Deterministic Tool Framework + First Tool Set/Wrappers) +Goal: Implement deterministic tools for the most frequent tasks, using modern element-aware detection (no hardcoded coordinates). +State: Active. +Primary plan: `docs/plans/phase_2_master_plan.md` + +### Phase 3 (Manual Piloting + Blueprint Recording) +Goal: Implement the VLM-guided or manual fallback path to interact with unknown elements, capturing successful raw actions as blueprint artifacts to be codified into deterministic tools later. +State: Not started. 
+ +### Phase 4 (Recovery Ladder + Restart Path) +Goal: Build the robust recovery path cascading down to an ultimate fallback of cleanly restarting MEmu via `.exe` admin plugin when VLM reasoning fails to recover a broken game state. +State: Not started. + +### Phase 5 (Autonomous Continuous Loop) +Goal: Hardened, long-running 24/7 autonomous loop executing within the same harness, governed by telemetry, stability gates, and the operationalized scheduler. +State: Not started. + +--- + +## 4. Operational Rule Going Forward + +For every phase: + +1. Keep one dedicated phase master plan under `docs/plans/phase_X_master_plan.md` +2. Update `TDD_IMPLEMENTATION_PLAN.md` checkboxes as source of test truth +3. Update `docs/ROADMAP.md` for status visibility +4. Keep this program plan synchronized as portfolio index + +**No separate architectural experiments** (e.g., pure LangGraph loops). The system follows the MasterLoop scheduler paradigm definitively. diff --git a/docs/plans/tool_node_design_review.md b/docs/plans/tool_node_design_review.md index beec92af2f..b9312c4308 100644 --- a/docs/plans/tool_node_design_review.md +++ b/docs/plans/tool_node_design_review.md @@ -1,250 +1,60 @@ -# Tool Node Design Review: Response to Issue #21 +# Deterministic Tool Contract Review: Response to Issue #21 > **Status**: Architecture Review > **Created**: 2026-02-17 -> **Related**: GitHub Issue #21, `durable_agent_system_plan.md` +> **Revised**: March 2026 (Aligned to Two-Tier MasterLoop) +> **Related**: GitHub Issue #21 ## Executive Summary -Issue #21 correctly identifies that my durable agent system plan oversimplified the Tool Node component. The issue raises critical concerns about verification, retry logic, and multi-modal feedback correlation that need to be addressed. - -**Verdict**: The issue is correct. My plan treated the Tool Node as a simple pass-through that executes MCP calls and returns results. This is insufficient for production use. 
+Issue #21 correctly identifies that the initial plan for deterministic tools oversimplified the execution pattern. The tools cannot simply execute an action and assume success; they require state verification, belief tracking, and properly structured error enrichment so the Tier 2 (VLM) fallback can effectively recover. --- -## What the Issue Gets Right +## What the Issue Gets Right (Applied to Tier 1 Tools) ### 1. Execute + Verify Pattern -**My plan's gap**: I assumed `success=True` from a tool means the game state actually changed. - -**Issue's insight**: Tools can report success while the game shows popups, loading screens, or didn't actually transition. The Tool Node must verify state changes post-execution. - -**What I would add**: -```python -async def tool_node_with_verify(state: GameState) -> GameState: - tool_call = state["messages"][-1].tool_calls[0] - - # Execute - result = await call_mcp_tool(tool_call) - - # VERIFY: Tool said success, but did game actually change? - if result.get("success"): - observed = await verify_state_change(tool_call, result) - if observed != result.get("expected_state"): - # Tool lied or popup/loading interfered - result["success"] = False - result["error"] = f"Expected {result['expected_state']}, observed {observed}" - result["needs_vision"] = True - - return {"messages": [ToolMessage(content=format_result(result))]} -``` +**Previous Gap**: Assuming `success=True` from an ADB click meant the game state actually changed. +**Correction**: Deterministic tools must verify state changes post-execution before returning their contract. If a tool taps a semantic "Commission" element, but a popup appears instantly, it did not reach the semantic `page_commission` state. The tool must return `success=False` and report the `observed_state == "event_popup"`. ### 2. State Belief Tracking -**My plan's gap**: I didn't track the difference between "where we think we are" vs "where we actually are."
- -**Issue's insight**: The bot maintains belief state that may diverge from reality. This is critical for recovery decisions. - -**What I would add to GameState**: -```python -class GameState(TypedDict): - # Existing fields... - believed_page: str # What we think we're on - believed_task: str # What we think we're doing - last_action_success: bool - screenshot_verified: bool # Did we actually look? - belief_confidence: float # How confident are we? -``` - -### 3. Multi-Modal Feedback Correlation - -**My plan's gap**: I treated tool returns as the only feedback channel. - -**Issue's insight**: There are multiple feedback channels with different latencies: - -| Channel | Example | Latency | -|---------|---------|---------| -| Tool return | `{success: true, observed_state: "page_commission"}` | Immediate | -| State query | `get_current_state()` → `"page_main"` | ~100ms | -| Screenshot OCR | OCR text on screen | ~500ms | -| Vision LLM | "I see a popup blocking the screen" | ~2s | - -**What I would add**: A feedback correlation layer that: -1. Checks tool return first (immediate) -2. Queries state if tool reports success (100ms) -3. Uses OCR if state mismatch (500ms) -4. Falls back to vision if OCR ambiguous (2s) +**Correction**: The MasterLoop tracks "where we think we are" (`expected_state`). The deterministic tools track "where we actually are" (`observed_state`). Any deviation between these immediately triggers the Tier 2 Fallback. -### 4. Structured Error Enrichment +### 3. Structured Error Enrichment -**My plan's gap**: I returned basic error information. +**Correction**: When a deterministic tool fails, it must provide structured context so the Tier 2 VLM can reason effectively.
- -**What I would change**: ```python -# Before (my plan) +# The strict `{success, data, error, observed_state, expected_state}` contract { "success": False, - "error": "State mismatch", + "expected_state": "page_commission", "observed_state": "page_main", - "expected_state": "page_commission" -} - -# After (issue's approach) -{ - "tool": "alas_goto", - "target": "page_commission", - "result": { - "tool_success": True, # ALAS thought it worked + "error": "State mismatch after tap_element('commission_button')", + "data": { + "tool_success": True, # ADB click worked "state_verification": "FAILED", - "expected": "page_commission", - "observed": "page_main", - "screenshot_analysis": { - "blocking_element": "event_popup", - "ocr_detected": ["New Event!", "Claim Reward"], - "suggested_action": "dismiss_popup_then_retry" - } + "last_ocr_detected": ["New Event!", "Claim Reward"], } } ``` --- -## What the Issue Gets Wrong (or Needs Clarification) - -### 1. Retry Logic Clarification - -The issue's correction comment is important: - -> "The retry logic doesn't handle all failures without LLM - it specifically handles transient failures (ADB timeouts, loading delays, temporary connection issues) transparently." - -> "The LLM is actively involved in the retry process - it plans the retry strategy, reasons about failure types, and decides when to escalate." - -**My concern**: This creates a tension. If the LLM is "actively involved" in retry, then: -- Every retry requires an LLM round-trip (adds latency) -- The LLM must understand ADB timeout semantics (leaks implementation details) - -**My recommendation**: Split retry into two layers: - -1. **Tool-level retry** (no LLM involvement): Transient failures only (ADB timeout, loading delay) - - Pre-configured max retries, backoff - - No LLM decision needed - - Fast recovery - -2. 
**LLM-guided retry** (with LLM involvement): Structural failures - - Tool returns enriched error context - - LLM decides: retry, alternative path, or escalate - - Slower but smarter - -### 2. Verification Cost +## What the Issue Gets Wrong: Retry Logic -The issue proposes screenshot verification for every tool call. This is expensive: - -- Screenshot: ~100-500ms -- OCR: ~500ms -- Vision: ~2s - -**My recommendation**: Tiered verification based on tool type: - -| Tool Type | Verification | Cost | -|-----------|--------------|------| -| Navigation (`alas_goto`) | State query + OCR if mismatch | ~100-600ms | -| Action (`alas_call_tool`) | Tool return only | Immediate | -| Critical (combat, rewards) | Screenshot + OCR | ~600ms | -| Recovery | Vision | ~2s | - ---- - -## Proposed Changes to `durable_agent_system_plan.md` - -### Add Phase 2.5: Tool Node Verification Layer - -Between Phase 2 (Core Gameplay Loop) and Phase 3 (Vision Fallback): - -**Phase 2.5: Tool Node Verification** -- [ ] Implement `verify_state_change()` for navigation tools -- [ ] Add `GameState` belief tracking fields -- [ ] Create tiered verification strategy (state query → OCR → vision) -- [ ] Implement structured error enrichment -- [ ] Add feedback correlation logic - -### Update Phase 2.3: Act Node - -The Act Node should not just execute and return. 
It should: - -```python -async def act_node(state: GameState) -> GameState: - tool_call = state["messages"][-1].tool_calls[0] - - # Execute with retry for transient failures - result = await execute_with_retry(tool_call, max_retries=2) - - # Verify state change for navigation tools - if is_navigation_tool(tool_call): - result = await verify_and_enrich(result, tool_call) - - # Update belief state - return { - "last_action": tool_call["name"], - "last_result": result, - "believed_page": result.get("observed_state", state["believed_page"]), - "screenshot_verified": result.get("screenshot_verified", False), - "messages": [ToolMessage(content=str(result), tool_call_id=tool_call["id"])] - } -``` - -### Add New File: `agent_orchestrator/nodes/verify.py` - -```python -async def verify_state_change(tool_call: dict, result: dict) -> str: - """Verify that the expected state change actually occurred.""" - expected = result.get("expected_state") - if not expected: - return result.get("observed_state", "unknown") - - # Tier 1: State query (fast) - current = await mcp_call("alas_get_current_state", {}) - observed = current.get("data", {}).get("page", "unknown") - - if observed == expected: - return observed - - # Tier 2: OCR verification (medium) - screenshot = await mcp_call("adb_screenshot", {}) - ocr_result = await ocr_analyze(screenshot["data"]) - - if ocr_match(ocr_result, expected): - return expected - - # Tier 3: Vision verification (slow) - vision_result = await vision_analyze(screenshot["data"], f"Are we on {expected}?") - return vision_result.get("state", observed) -``` - ---- - -## Acceptance Criteria Additions - -Based on the issue, I would add these acceptance criteria to the plan: - -- [ ] Tool Node verifies state changes post-execution for navigation tools -- [ ] Retry logic distinguishes transient vs structural failures -- [ ] LLM receives structured error context for recovery decisions -- [ ] Belief state tracks expected vs observed state -- [ ] Tiered 
verification strategy minimizes latency while ensuring correctness -- [ ] Feedback correlation combines tool returns, state queries, OCR, and vision - ---- +The issue originally proposed that the LLM is "actively involved in the retry process." This conflicts directly with the established Two-Tier architecture. -## Summary +**Resolution:** +The MasterLoop splits retry handling entirely: -Issue #21 is correct that my plan oversimplified the Tool Node. The key additions needed are: +1. **Tier 1 (Tool-level) Retry**: Handled natively within the deterministic Python tool. Transient failures ONLY (e.g., ADB lag, element loading delay). It has pre-configured max retries and backoffs. **Zero LLM involvement.** +2. **Tier 2 (VLM Fallback) Escalation**: Handled when Tier 1 exhausts its retry budget or detects structural failures (e.g., unexpected UI states). The VLM takes the screenshot + error snapshot and determines the raw operations needed to manually break out of the bad state. -1. **Execute + Verify Pattern**: Don't trust tool success blindly -2. **State Belief Tracking**: Track where we think we are vs reality -3. **Multi-Modal Feedback Correlation**: Combine multiple verification channels -4. **Structured Error Enrichment**: Give LLM the context it needs for recovery +## Implementation Requirements -The issue's correction about LLM involvement in retry is important but needs careful design to avoid excessive latency. I recommend a two-layer retry approach: tool-level for transient failures (fast, no LLM), LLM-guided for structural failures (slower, smarter). +- [ ] Implement `verify_state_change()` natively inside base deterministic tool classes. +- [ ] Tier 1 tools must use fast state queries (OCR/Element-existence checks) to verify success before returning. +- [ ] Tier 2 execution handles slow visual-verification and logical error resolution. 
diff --git a/docs/reviews/PR29_DEVILS_ADVOCATE_2026-03-04.md b/docs/reviews/PR29_DEVILS_ADVOCATE_2026-03-04.md deleted file mode 100644 index e84df42188..0000000000 --- a/docs/reviews/PR29_DEVILS_ADVOCATE_2026-03-04.md +++ /dev/null @@ -1,33 +0,0 @@ -# PR #29 Devil's Advocate Review - -PR: https://github.com/Zuosizhu/Alas-with-Dashboard/pull/29 - -## Case For Merging - -- Contains substantial real implementation value already present in branch history: - - `adb_vision` service/tools. - - state/agent documentation and operational guidance. - - screenshot backend stack now implemented and tested. -- Aligns with long-term architecture direction: deterministic tool surface + fallback paths. -- Unblocks immediate development on permanent loop components. - -## Case Against Merging - -- Scope is too broad for safe review (`+37663 / -8573` currently reported). -- Title/body are no longer representative of true branch scope. -- Mixed concerns increase regression risk and make rollback difficult. -- Reviewability and accountability are weak at this size. - -## Arbitration - -- Keep the branch as active integration trunk for now. -- Do not treat current PR metadata as production-merge ready. -- Recommended near-term action: - 1. Create a focused backend PR from current head (screenshot/runtime + tests only). - 2. Keep large historical changes in branch, but land forward in small reviewed slices. - -## Recommendation - -- `PR #29` as currently framed: **Do not merge directly in current form**. -- Continue implementation on branch, then open focused incremental PR(s) for actual merge gates. 
- diff --git a/plans/phase2_scaffolding_review.md b/plans/phase2_scaffolding_review.md new file mode 100644 index 0000000000..db79015277 --- /dev/null +++ b/plans/phase2_scaffolding_review.md @@ -0,0 +1,230 @@ +# Phase 2 Code Scaffolding Review + +## Executive Summary + +The Phase 2 scaffolding establishes a robust **Deterministic Tool Framework** with strict contract enforcement, auto-discovery, and TDD-ready test harnesses. The architecture follows the Two-Tier model specified in CLAUDE.md, with Tier 1 (deterministic tools) as the hot path and Tier 2 (VLM fallback) for recovery. + +--- + +## Architecture Overview + +```mermaid +graph TB + subgraph "Phase 2: Deterministic Tools" + A["@deterministic_tool
Decorator"] --> B[ToolRegistry] + B --> C["Tool Metadata
Store"] + A --> D[Strict Contract
Enforcement] + D --> E["{success, data, error,
observed_state,
expected_state}"] + end + + subgraph "Tool Implementations" + F["goto_main_menu.py"] --> B + G["check_resources.py"] --> B + H["collect_commissions.py"] --> B + end + + subgraph "Integration Layer" + I[MasterLoop] --> J[Executor] + J --> B + B --> K[State Cache] + K --> L["JSONL
Persistence"] + end + + subgraph "Tier 2 Fallback" + M[Gemini Flash
VLM] --> J + end +``` + +--- + +## Component Analysis + +### 1. Core Registry Framework ([`registry.py`](adb_vision/tools/registry.py)) + +**Strengths:** +- ✅ Clean `@deterministic_tool` decorator pattern +- ✅ Automatic registration on import via global `_TOOL_REGISTRY` dict +- ✅ Metadata embedding (timeout, expected_state, retry_budget) +- ✅ Contract enforcement via delegation to `strict_contract` +- ✅ Duplicate name prevention with clear error + +**Contract Structure:** +```python +{ + "success": bool, # Required + "data": object | None, # Optional payload + "error": str | None, # Required on failure + "observed_state": str | None, # What we actually saw + "expected_state": str # What we expected +} +``` + +**Gap Identified:** +- ⚠️ No `retry_budget` enforcement in the decorator (metadata stored but not used for retries) +- ⚠️ Missing `get_tool_metadata(name)` accessor for tooling/debugging + +### 2. Contract Enforcement ([`tool_contract.py`](adb_vision/contracts/tool_contract.py)) + +**Strengths:** +- ✅ Comprehensive validation via `validate_contract()` +- ✅ Runtime contract checking with `strict_contract` decorator +- ✅ Clear `ContractViolationError` for debugging +- ✅ Factory helper `make_result()` ensures valid structure + +**Validation Rules:** +- All 5 keys must be present +- `success` must be boolean +- `expected_state` must be string +- On `success=True`: `error` must be `None` or `""` +- On `success=False`: `error` must be non-empty string + +**Gap Identified:** +- ⚠️ No validation of `observed_state` being populated (should be required when success=False) + +### 3. 
Tool Implementations + +#### [`goto_main_menu.py`](adb_vision/tools/goto_main_menu.py) +- **Expected State:** `page_main` +- **Timeout:** 90s +- **Retry Budget:** 5 +- **Status:** TDD placeholder (fails by design) + +#### [`check_resources.py`](adb_vision/tools/check_resources.py) +- **Expected State:** `resources_logged` +- **Timeout:** 30s +- **Retry Budget:** 2 +- **Status:** TDD placeholder (fails by design) + +#### [`collect_commissions.py`](adb_vision/tools/collect_commissions.py) +- **Expected State:** `page_commissions_empty` +- **Timeout:** 180s +- **Retry Budget:** 3 +- **Status:** TDD placeholder (fails by design) + +### 4. Test Harnesses + +#### [`test_tool_registry.py`](adb_vision/test_tool_registry.py) +**Coverage:** +- ✅ Tool discovery via decorator +- ✅ Duplicate name rejection +- ✅ Contract enforcement validation +- ✅ Registry clearing for test isolation + +#### [`test_live_phase2_tools.py`](adb_vision/test_live_phase2_tools.py) +**Coverage:** +- ✅ Auto-discovery verification (≥3 tools) +- ✅ Live emulator integration tests (skipped without device) +- ✅ Each tool tested individually + +**Gap Identified:** +- ⚠️ No test for retry logic behavior +- ⚠️ No test for timeout behavior + +### 5. Integration with MasterLoop ([`master_loop.py`](adb_vision/loop/master_loop.py)) + +**Current Integration Pattern:** +```python +# In test_live.py - live executor +def executor(command: str) -> dict: + # Legacy pattern - doesn't use registry directly + return make_result(...) +``` + +**Gap Identified:** +- ⚠️ **CRITICAL:** No integration between MasterLoop and ToolRegistry +- The registry exists but MasterLoop doesn't use `registry.get_tool(command)` to find deterministic tools +- Need to add a registry-aware executor that maps commands to registered tools + +--- + +## Recommendations + +### Priority 1: Bridge MasterLoop to Registry + +Create an executor wrapper that: +1. Takes `command` string from scheduler +2. Looks up tool in registry via `get_tool(command)` +3. 
If found → execute deterministic tool +4. If not found → fall back to legacy/VLM executor + +```python +def registry_executor(command: str) -> dict: + tool_fn = registry.get_tool(command) + if tool_fn: + return tool_fn() + return {"success": False, "error": "No deterministic tool found", ...} +``` + +### Priority 2: Implement Retry Logic + +The `retry_budget` in metadata is stored but not used. Add retry loop in registry: + +```python +def execute_with_retry(name: str) -> dict: + tool_fn, meta = _TOOL_REGISTRY[name] + for attempt in range(meta.retry_budget): + result = tool_fn() + if result["success"]: + return result + return result # Return final attempt +``` + +### Priority 3: Add State Persistence to Tools + +Each tool should update state cache on success: + +```python +from adb_vision.state import cache + +def check_resource_balances() -> dict: + # ... OCR logic ... + cache.update("resources.last_check", datetime.now()) + cache.update("resources.oil", oil_value) + return make_result(success=True, observed_state="resources_logged", ...) 
+``` + +### Priority 4: Element Detection Integration Points + +For the TDD placeholder implementations, these hooks should be added: + +| Tool | Element Detection Hook | +|------|----------------------| +| `goto_main_menu` | Find "Home" button / "Back" via MaaFramework/uiautomator | +| `check_resources` | OCR bounds for Oil/Gem/Coin via screenshot parsing | +| `collect_commissions` | Find "Collect All" button via text/element detection | + +--- + +## File Inventory + +| File | Purpose | Status | +|------|---------|--------| +| [`adb_vision/tools/registry.py`](adb_vision/tools/registry.py) | Core framework | ✅ Complete | +| [`adb_vision/contracts/tool_contract.py`](adb_vision/contracts/tool_contract.py) | Contract validation | ✅ Complete | +| [`adb_vision/tools/goto_main_menu.py`](adb_vision/tools/goto_main_menu.py) | Tool #1 scaffold | ✅ TDD placeholder | +| [`adb_vision/tools/check_resources.py`](adb_vision/tools/check_resources.py) | Tool #2 scaffold | ✅ TDD placeholder | +| [`adb_vision/tools/collect_commissions.py`](adb_vision/tools/collect_commissions.py) | Tool #3 scaffold | ✅ TDD placeholder | +| [`adb_vision/tools/__init__.py`](adb_vision/tools/__init__.py) | Auto-discovery | ✅ Complete | +| [`adb_vision/test_tool_registry.py`](adb_vision/test_tool_registry.py) | Unit tests | ✅ Complete | +| [`adb_vision/test_live_phase2_tools.py`](adb_vision/test_live_phase2_tools.py) | Live harness | ✅ Scaffolded | + +--- + +## Next Steps for Implementation + +1. **Bridge Registry → MasterLoop** - Create registry-aware executor +2. **Wire up Element Detection** - Replace TDD placeholders with real MaaFramework/uiautomator calls +3. **Add State Integration** - Tools update cache on success +4. **Implement Retry Logic** - Enforce `retry_budget` +5. 
**Run Live Tests** - Execute `test_live_phase2_tools.py` against emulator + +--- + +## Risk Assessment + +| Risk | Impact | Mitigation | +|------|--------|------------| +| Element detection backend unavailable | High | Tier 2 VLM fallback always available | +| Contract violations crash loop | Medium | ContractViolations fail-fast (correct behavior) | +| Registry not populated at runtime | High | Auto-discovery runs on `import adb_vision.tools` | +| Duplicate tool names | Medium | Raised at import time with clear error |