Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 16 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -178,8 +178,20 @@ brew install android-platform-tools # macOS
# 4. Connect device & verify
adb devices

# 5. Set API key
export OPENAI_API_KEY="sk-..."
# 5. Configure LLM Provider (choose one)
export LLM_PROVIDER="openai" # or anthropic, gemini, bedrock

# Set appropriate API key
export OPENAI_API_KEY="sk-..." # for OpenAI
# export ANTHROPIC_API_KEY="sk-..." # for Anthropic
# export GOOGLE_API_KEY="..." # for Gemini
# export AWS_PROFILE="default" # for Bedrock

# Optional: Override default model
# export OPENAI_MODEL="gpt-4o"
# export ANTHROPIC_MODEL="claude-sonnet-4"
# export GEMINI_MODEL="gemini-2.0-flash-exp"
# export BEDROCK_MODEL="anthropic.claude-sonnet-4-20250514-v1:0"

# 6. Run your first agent
python kernel.py
Expand Down Expand Up @@ -390,7 +402,8 @@ screen_json = get_screen_state()

### Next 2 Weeks
- [ ] **PyPI package:** `pip install android-use`
- [ ] **Multi-LLM support:** Claude, Gemini, Llama
- [x] **Multi-LLM support:** OpenAI, Claude, Gemini, Bedrock
- [ ] **Llama support:** Local model integration
- [ ] **WhatsApp integration:** Pre-built actions for messaging
- [ ] **Error recovery:** Retry logic, fallback strategies

Expand Down
32 changes: 32 additions & 0 deletions action_models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from typing import Literal, Union, List
from pydantic import BaseModel, Field, field_validator

class TapAction(BaseModel):
    """Tap the screen at an absolute [x, y] pixel position."""

    action: Literal["tap"] = "tap"
    coordinates: List[int] = Field(..., description="[x, y] coordinates to tap")
    reason: str = Field(..., description="Why this tap is needed")

    @field_validator("coordinates")
    @classmethod
    def validate_coordinates(cls, v):
        """Ensure coordinates are exactly two non-negative integers.

        Raises:
            ValueError: if the list is not length 2 or contains a negative
                or non-integer value.
        """
        if len(v) != 2:
            raise ValueError("coordinates must be [x, y]")
        if not all(isinstance(coord, int) and coord >= 0 for coord in v):
            # The check accepts 0 (>= 0), so say "non-negative", not "positive".
            raise ValueError("coordinates must be non-negative integers")
        return v

class TypeAction(BaseModel):
    """Type free-form text into the currently focused input field."""

    action: Literal["type"] = "type"
    text: str = Field(..., description="Text to type")
    reason: str = Field(..., description="Why this text is needed")

class NavigationAction(BaseModel):
    """Press a global navigation key: "home" or "back"."""

    action: Literal["home", "back"] = Field(..., description="Navigation action")
    reason: str = Field(..., description="Why this navigation is needed")

class ControlAction(BaseModel):
    """Flow-control action: "wait" pauses, "done" ends the agent run."""

    action: Literal["wait", "done"] = Field(..., description="Control action")
    reason: str = Field(..., description="Why this action is needed")

# Union type for all possible actions
AndroidAction = Union[TapAction, TypeAction, NavigationAction, ControlAction]
8 changes: 8 additions & 0 deletions examples/anthropic_example.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!/bin/bash
# Example: Using Anthropic Claude
#
# Selects the "anthropic" provider in llm_manager.LLMManager; the model
# override is optional (default: claude-sonnet-4).

export LLM_PROVIDER="anthropic"
export ANTHROPIC_API_KEY="sk-..." # Replace with your key
export ANTHROPIC_MODEL="claude-sonnet-4" # Optional

python kernel.py
8 changes: 8 additions & 0 deletions examples/bedrock_example.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!/bin/bash
# Example: Using AWS Bedrock with Claude
#
# Bedrock authenticates through AWS credentials (profile or static keys);
# no provider API key variable is required.

export LLM_PROVIDER="bedrock"
export AWS_PROFILE="default" # Or use AWS_ACCESS_KEY_ID/AWS_SECRET_ACCESS_KEY
export BEDROCK_MODEL="anthropic.claude-sonnet-4-20250514-v1:0" # Optional

python kernel.py
8 changes: 8 additions & 0 deletions examples/gemini_example.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!/bin/bash
# Example: Using Google Gemini (cheapest option)
#
# Authenticates with GOOGLE_API_KEY; the model override is optional
# (default: gemini-2.0-flash-exp).

export LLM_PROVIDER="gemini"
export GOOGLE_API_KEY="..." # Replace with your key
export GEMINI_MODEL="gemini-2.0-flash-exp" # Optional

python kernel.py
8 changes: 8 additions & 0 deletions examples/openai_example.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!/bin/bash
# Example: Using OpenAI GPT-4o
#
# Authenticates with OPENAI_API_KEY; the model override is optional
# (default: gpt-4o).

export LLM_PROVIDER="openai"
export OPENAI_API_KEY="sk-..." # Replace with your key
export OPENAI_MODEL="gpt-4o" # Optional: override default

python kernel.py
131 changes: 57 additions & 74 deletions kernel.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,25 @@
import time
import subprocess
import json
from typing import Dict, Any
from openai import OpenAI
import asyncio
from typing import Dict, Any, List
from llm_manager import LLMManager
from action_models import TapAction, TypeAction, NavigationAction, ControlAction
import sanitizer

# --- CONFIGURATION ---
ADB_PATH = "adb"  # adb must be discoverable on PATH
SCREEN_DUMP_PATH = "/sdcard/window_dump.xml"  # UI dump location on the device
LOCAL_DUMP_PATH = "window_dump.xml"  # pulled copy on the host

# Module-level LLM manager; created lazily by initialize_llm() so importing
# this module does not require provider credentials to be set.
llm_manager = None

def initialize_llm():
    """Create the module-level LLMManager on first use (idempotent)."""
    global llm_manager
    if llm_manager is not None:
        return
    llm_manager = LLMManager()

def run_adb_command(command: List[str]):
"""Executes a shell command via ADB."""
Expand All @@ -39,93 +47,68 @@ def get_screen_state() -> str:
elements = sanitizer.get_interactive_elements(xml_content)
return json.dumps(elements, indent=2)

def execute_action(action):
    """Executes the action decided by the LLM.

    Dispatches on the concrete pydantic action type and drives the device
    via adb. A ControlAction("done") terminates the process.

    Args:
        action: One of TapAction, TypeAction, NavigationAction, ControlAction.
    """
    if isinstance(action, TapAction):
        x, y = action.coordinates
        print(f"👉 Tapping: ({x}, {y})")
        run_adb_command(["shell", "input", "tap", str(x), str(y)])

    elif isinstance(action, TypeAction):
        text = action.text.replace(" ", "%s")  # ADB `input text` requires %s for spaces
        print(f"⌨️ Typing: {action.text}")
        run_adb_command(["shell", "input", "text", text])

    elif isinstance(action, NavigationAction):
        if action.action == "home":
            print("🏠 Going Home")
            run_adb_command(["shell", "input", "keyevent", "KEYCODE_HOME"])
        elif action.action == "back":
            print("🔙 Going Back")
            run_adb_command(["shell", "input", "keyevent", "KEYCODE_BACK"])

    elif isinstance(action, ControlAction):
        if action.action == "wait":
            print("⏳ Waiting...")
            time.sleep(2)
        elif action.action == "done":
            print("✅ Goal Achieved.")
            # exit() is a site-module convenience for the REPL; raising
            # SystemExit is the equivalent that needs no extra import.
            raise SystemExit(0)

async def get_llm_decision(goal: str, screen_context: str):
    """Send the screen context to the configured LLM and return its next action.

    Args:
        goal: The user's natural-language goal.
        screen_context: JSON description of the interactive UI elements.

    Returns:
        An AndroidAction chosen by the LLM (see action_models).
    """
    # Reading the module-level manager needs no `global` statement;
    # initialize_llm() performs the assignment.
    if llm_manager is None:
        initialize_llm()
    return await llm_manager.get_decision(goal, screen_context)

async def run_agent(goal: str, max_steps=10):
    """Main agent loop: perceive -> reason -> act, up to max_steps iterations.

    Args:
        goal: The user's natural-language goal.
        max_steps: Upper bound on perception/action cycles before giving up.
    """
    initialize_llm()
    print("🚀 Android Use Agent Started")
    print(f"📡 Provider: {llm_manager.provider} | Model: {llm_manager.model}")
    print(f"🎯 Goal: {goal}\n")

    for step in range(max_steps):
        print(f"\n--- Step {step + 1} ---")

        # 1. Perception
        print("👀 Scanning Screen...")
        screen_context = get_screen_state()

        # 2. Reasoning
        print("🧠 Thinking...")
        decision = await get_llm_decision(goal, screen_context)
        print(f"💡 Decision: {decision.reason}")

        # 3. Action
        execute_action(decision)

        # Give the UI time to settle. Use asyncio.sleep, not time.sleep:
        # blocking sleep would stall the event loop this coroutine runs on.
        await asyncio.sleep(2)

if __name__ == "__main__":
    # Example Goal: "Open settings and turn on Wi-Fi"
    # Or your demo goal: "Find the 'Connect' button and tap it"
    GOAL = input("Enter your goal: ")
    asyncio.run(run_agent(GOAL))
115 changes: 115 additions & 0 deletions llm_manager.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
import os
from typing import Optional
from pydantic_ai import Agent
from pydantic_ai.models.openai import OpenAIChatModel
from pydantic_ai.models.anthropic import AnthropicModel
from pydantic_ai.models.google import GoogleModel
from pydantic_ai.models.bedrock import BedrockConverseModel
from action_models import AndroidAction

class LLMManager:
    """Manages LLM provider initialization and agent creation.

    The provider is selected via the LLM_PROVIDER environment variable
    (openai | anthropic | gemini | bedrock); the model id may be overridden
    with <PROVIDER>_MODEL (e.g. OPENAI_MODEL).
    """

    # Fallback model per provider when no <PROVIDER>_MODEL override is set.
    # NOTE(review): confirm "claude-sonnet-4" is a valid Anthropic model
    # alias — dated API ids look like "claude-sonnet-4-20250514".
    DEFAULT_MODELS = {
        "openai": "gpt-4o",
        "anthropic": "claude-sonnet-4",
        "gemini": "gemini-2.0-flash-exp",
        "bedrock": "anthropic.claude-sonnet-4-20250514-v1:0",
    }

    def __init__(self):
        self.provider = self._get_provider()
        self.model = self._get_model()
        self.agent = self._create_agent()

    def _get_provider(self) -> str:
        """Read and validate LLM_PROVIDER from the environment.

        Raises:
            ValueError: if the variable is unset or names an unknown provider.
        """
        # Normalize so "OpenAI" / " openai " behave the same as "openai".
        provider = os.environ.get("LLM_PROVIDER", "").strip().lower()
        if not provider:
            raise ValueError(
                "LLM_PROVIDER environment variable must be set. "
                "Valid values: openai, anthropic, gemini, bedrock"
            )
        if provider not in self.DEFAULT_MODELS:
            raise ValueError(
                f"Invalid LLM_PROVIDER '{provider}'. "
                f"Valid values: {', '.join(self.DEFAULT_MODELS.keys())}"
            )
        return provider

    def _get_model(self) -> str:
        """Return <PROVIDER>_MODEL from the environment, or the provider default."""
        override = os.environ.get(f"{self.provider.upper()}_MODEL")
        return override or self.DEFAULT_MODELS[self.provider]

    def _validate_credentials(self):
        """Fail fast if the selected provider's API key variable is missing.

        Raises:
            ValueError: when the provider-specific key variable is unset.
        """
        required_keys = {
            "openai": "OPENAI_API_KEY",
            "anthropic": "ANTHROPIC_API_KEY",
            "gemini": "GOOGLE_API_KEY",
        }
        key_var = required_keys.get(self.provider)
        if key_var and not os.environ.get(key_var):
            raise ValueError(f"{key_var} environment variable must be set")
        # Bedrock uses AWS credentials - boto3 will handle validation.

    def _create_agent(self) -> Agent:
        """Create a Pydantic AI agent wired to the selected provider and model."""
        self._validate_credentials()

        # Dispatch table instead of an if/elif chain; _get_provider() has
        # already guaranteed the key exists.
        model_classes = {
            "openai": OpenAIChatModel,
            "anthropic": AnthropicModel,
            "gemini": GoogleModel,
            "bedrock": BedrockConverseModel,
        }
        model = model_classes[self.provider](self.model)

        # Structured output: the agent must return one of the AndroidAction types.
        return Agent(
            model=model,
            output_type=AndroidAction,
            system_prompt=self._get_system_prompt(),
        )

    def _get_system_prompt(self) -> str:
        """Get the system prompt for the Android agent."""
        return """You are an Android Driver Agent. Your job is to achieve the user's goal by navigating the UI.

You will receive:
1. The User's Goal.
2. A list of interactive UI elements (JSON) with their (x,y) center coordinates.

You must decide the next action to take.

Available Actions:
- tap: Tap at specific coordinates
- type: Type text into a field
- home: Go to home screen
- back: Go back to previous screen
- wait: Wait for loading or animation
- done: Task is complete

Always provide a clear reason for your action."""

    async def get_decision(self, goal: str, screen_context: str) -> AndroidAction:
        """Get LLM decision for next action.

        Args:
            goal: The user's natural-language goal.
            screen_context: JSON description of the interactive UI elements.

        Returns:
            The structured AndroidAction produced by the agent.
        """
        prompt = f"""GOAL: {goal}

SCREEN_CONTEXT:
{screen_context}

What action should I take next?"""

        result = await self.agent.run(prompt)
        # `.data` is deprecated/removed in the pydantic-ai versions that
        # provide OpenAIChatModel/output_type; the structured result is
        # exposed as `.output`.
        return result.output
6 changes: 5 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1,5 @@
openai>=1.12.0
pydantic-ai-slim[openai,anthropic,google,bedrock]

# Dev dependencies
pytest>=8.0.0
pytest-asyncio>=0.24.0
Loading