-
Notifications
You must be signed in to change notification settings - Fork 218
Expand file tree
/
Copy pathkernel.py
More file actions
101 lines (73 loc) · 2.71 KB
/
kernel.py
File metadata and controls
101 lines (73 loc) · 2.71 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
"""
Android Action Kernel - Main Agent Loop
An AI agent that controls Android devices through the accessibility API.
Uses LLMs to make decisions based on screen context.
Usage:
python kernel.py
"""
import json
import os
import time
from typing import Any, Dict, List, Optional

from actions import execute_action, run_adb_command
from config import Config
from llm_providers import get_llm_provider
import sanitizer
def get_screen_state() -> str:
    """Dump the device's current UI hierarchy and return it as a sanitized JSON string.

    Returns:
        A JSON array (pretty-printed) of interactive elements, or an error
        message string if the dump could not be pulled from the device.
    """
    # Ask uiautomator on the device to write the UI XML, then copy it locally.
    run_adb_command(["shell", "uiautomator", "dump", Config.SCREEN_DUMP_PATH])
    run_adb_command(["pull", Config.SCREEN_DUMP_PATH, Config.LOCAL_DUMP_PATH])

    # The pull can fail silently; report rather than raise so the loop continues.
    if not os.path.exists(Config.LOCAL_DUMP_PATH):
        return "Error: Could not capture screen."

    with open(Config.LOCAL_DUMP_PATH, "r", encoding="utf-8") as dump_file:
        raw_xml = dump_file.read()

    # Reduce the raw XML to just the interactive elements for the LLM prompt.
    return json.dumps(sanitizer.get_interactive_elements(raw_xml), indent=2)
def run_agent(goal: str, max_steps: Optional[int] = None) -> None:
    """
    Main agent loop: Perceive -> Reason -> Act.

    Args:
        goal: The task to accomplish.
        max_steps: Maximum steps before stopping. When None, falls back to
            Config.MAX_STEPS. (Annotation fixed: `int = None` was an invalid
            implicit Optional.)
    """
    if max_steps is None:
        max_steps = Config.MAX_STEPS

    # Plain string: the original was an f-string with no placeholders.
    print("🚀 Android Action Kernel Started")
    print(f"📋 Goal: {goal}")
    print(f"🤖 Provider: {Config.LLM_PROVIDER} ({Config.get_model()})")

    # Initialize the configured LLM provider once, outside the loop.
    llm = get_llm_provider()
    action_history: List[Dict[str, Any]] = []

    for step in range(max_steps):
        print(f"\n--- Step {step + 1}/{max_steps} ---")

        # 1. Perception: capture and sanitize the current screen state.
        print("👀 Scanning Screen...")
        screen_context = get_screen_state()

        # 2. Reasoning: ask the LLM for the next action given goal + history.
        print("🧠 Thinking...")
        decision = llm.get_decision(goal, screen_context, action_history)
        print(f"💡 Decision: {decision.get('reason', 'No reason provided')}")

        # 3. Action: execute the decision on the device.
        execute_action(decision)

        # Keep the history so the LLM can see what it has already tried.
        action_history.append(decision)

        # Give the device UI time to settle before the next capture.
        time.sleep(Config.STEP_DELAY)

    print("\n⚠️ Max steps reached. Task may be incomplete.")
def main():
    """Entry point: validate configuration, read the user's goal, run the agent."""
    try:
        Config.validate()
    except ValueError as e:
        print(f"❌ Configuration Error: {e}")
    else:
        # Configuration is valid — ask the user what to do.
        goal = input("Enter your goal: ")
        if goal.strip():
            run_agent(goal)
        else:
            print("❌ No goal provided. Exiting.")
# Run interactively only when executed as a script, not on import.
if __name__ == "__main__":
    main()