Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 16 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -178,8 +178,20 @@ brew install android-platform-tools # macOS
# 4. Connect device & verify
adb devices

# 5. Set API key
export OPENAI_API_KEY="sk-..."
# 5. Configure LLM Provider (choose one)
export LLM_PROVIDER="openai" # or anthropic, gemini, bedrock

# Set appropriate API key
export OPENAI_API_KEY="sk-..." # for OpenAI
# export ANTHROPIC_API_KEY="sk-..." # for Anthropic
# export GOOGLE_API_KEY="..." # for Gemini
# export AWS_PROFILE="default" # for Bedrock

# Optional: Override default model
# export OPENAI_MODEL="gpt-4o"
# export ANTHROPIC_MODEL="claude-sonnet-4"
# export GEMINI_MODEL="gemini-2.0-flash-exp"
# export BEDROCK_MODEL="anthropic.claude-sonnet-4-20250514-v1:0"

# 6. Run your first agent
python kernel.py
Expand Down Expand Up @@ -390,7 +402,8 @@ screen_json = get_screen_state()

### Next 2 Weeks
- [ ] **PyPI package:** `pip install android-use`
- [ ] **Multi-LLM support:** Claude, Gemini, Llama
- [x] **Multi-LLM support:** OpenAI, Claude, Gemini, Bedrock
- [ ] **Llama support:** Local model integration
- [ ] **WhatsApp integration:** Pre-built actions for messaging
- [ ] **Error recovery:** Retry logic, fallback strategies

Expand Down
32 changes: 32 additions & 0 deletions action_models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from typing import Literal, Union, List
from pydantic import BaseModel, Field, field_validator

class TapAction(BaseModel):
    """Tap the screen at an absolute [x, y] pixel position."""

    action: Literal["tap"] = "tap"
    coordinates: List[int] = Field(..., description="[x, y] coordinates to tap")
    reason: str = Field(..., description="Why this tap is needed")

    @field_validator("coordinates")
    @classmethod
    def validate_coordinates(cls, v):
        """Ensure coordinates are exactly two non-negative integers.

        Raises:
            ValueError: if the list is not length 2 or contains a negative
                or non-integer value.
        """
        if len(v) != 2:
            raise ValueError("coordinates must be [x, y]")
        if not all(isinstance(coord, int) and coord >= 0 for coord in v):
            # The check accepts 0 (>= 0), so say "non-negative", not "positive".
            raise ValueError("coordinates must be non-negative integers")
        return v

class TypeAction(BaseModel):
    """Type free-form text into the currently focused input field."""

    action: Literal["type"] = "type"
    text: str = Field(..., description="Text to type")
    reason: str = Field(..., description="Why this text is needed")

class NavigationAction(BaseModel):
    """Press a global navigation key: "home" or "back"."""

    action: Literal["home", "back"] = Field(..., description="Navigation action")
    reason: str = Field(..., description="Why this navigation is needed")

class ControlAction(BaseModel):
    """Flow-control action: "wait" pauses, "done" ends the agent run."""

    action: Literal["wait", "done"] = Field(..., description="Control action")
    reason: str = Field(..., description="Why this action is needed")

# Union type for all possible actions
AndroidAction = Union[TapAction, TypeAction, NavigationAction, ControlAction]
8 changes: 8 additions & 0 deletions examples/anthropic_example.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!/bin/bash
# Example: Using Anthropic Claude
#
# Selects the "anthropic" provider in llm_manager.LLMManager; the model
# override is optional (default: claude-sonnet-4).

export LLM_PROVIDER="anthropic"
export ANTHROPIC_API_KEY="sk-..." # Replace with your key
export ANTHROPIC_MODEL="claude-sonnet-4" # Optional

python kernel.py
8 changes: 8 additions & 0 deletions examples/bedrock_example.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!/bin/bash
# Example: Using AWS Bedrock with Claude
#
# Bedrock authenticates through AWS credentials (profile or static keys);
# no provider API key variable is required.

export LLM_PROVIDER="bedrock"
export AWS_PROFILE="default" # Or use AWS_ACCESS_KEY_ID/AWS_SECRET_ACCESS_KEY
export BEDROCK_MODEL="anthropic.claude-sonnet-4-20250514-v1:0" # Optional

python kernel.py
8 changes: 8 additions & 0 deletions examples/gemini_example.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!/bin/bash
# Example: Using Google Gemini (cheapest option)
#
# Authenticates with GOOGLE_API_KEY; the model override is optional
# (default: gemini-2.0-flash-exp).

export LLM_PROVIDER="gemini"
export GOOGLE_API_KEY="..." # Replace with your key
export GEMINI_MODEL="gemini-2.0-flash-exp" # Optional

python kernel.py
8 changes: 8 additions & 0 deletions examples/openai_example.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!/bin/bash
# Example: Using OpenAI GPT-4o
#
# Authenticates with OPENAI_API_KEY; the model override is optional
# (default: gpt-4o).

export LLM_PROVIDER="openai"
export OPENAI_API_KEY="sk-..." # Replace with your key
export OPENAI_MODEL="gpt-4o" # Optional: override default

python kernel.py
131 changes: 57 additions & 74 deletions kernel.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,25 @@
import time
import subprocess
import json
from typing import Dict, Any
from openai import OpenAI
import asyncio
from typing import Dict, Any, List
from llm_manager import LLMManager
from action_models import TapAction, TypeAction, NavigationAction, ControlAction
import sanitizer

# --- CONFIGURATION ---
ADB_PATH = "adb"  # adb must be discoverable on PATH
SCREEN_DUMP_PATH = "/sdcard/window_dump.xml"  # UI dump location on the device
LOCAL_DUMP_PATH = "window_dump.xml"  # pulled copy on the host

# Module-level LLM manager; created lazily by initialize_llm() so importing
# this module does not require provider credentials to be set.
llm_manager = None

def initialize_llm():
    """Create the module-level LLMManager on first use (idempotent)."""
    global llm_manager
    if llm_manager is not None:
        return
    llm_manager = LLMManager()

def run_adb_command(command: List[str]):
"""Executes a shell command via ADB."""
Expand All @@ -39,93 +47,68 @@ def get_screen_state() -> str:
elements = sanitizer.get_interactive_elements(xml_content)
return json.dumps(elements, indent=2)

def execute_action(action):
    """Executes the action decided by the LLM.

    Dispatches on the concrete pydantic action type and drives the device
    via adb. A ControlAction("done") terminates the process.

    Args:
        action: One of TapAction, TypeAction, NavigationAction, ControlAction.
    """
    if isinstance(action, TapAction):
        x, y = action.coordinates
        print(f"👉 Tapping: ({x}, {y})")
        run_adb_command(["shell", "input", "tap", str(x), str(y)])

    elif isinstance(action, TypeAction):
        text = action.text.replace(" ", "%s")  # ADB `input text` requires %s for spaces
        print(f"⌨️ Typing: {action.text}")
        run_adb_command(["shell", "input", "text", text])

    elif isinstance(action, NavigationAction):
        if action.action == "home":
            print("🏠 Going Home")
            run_adb_command(["shell", "input", "keyevent", "KEYCODE_HOME"])
        elif action.action == "back":
            print("🔙 Going Back")
            run_adb_command(["shell", "input", "keyevent", "KEYCODE_BACK"])

    elif isinstance(action, ControlAction):
        if action.action == "wait":
            print("⏳ Waiting...")
            time.sleep(2)
        elif action.action == "done":
            print("✅ Goal Achieved.")
            # exit() is a site-module convenience for the REPL; raising
            # SystemExit is the equivalent that needs no extra import.
            raise SystemExit(0)

async def get_llm_decision(goal: str, screen_context: str):
    """Send the screen context to the configured LLM and return its next action.

    Args:
        goal: The user's natural-language goal.
        screen_context: JSON description of the interactive UI elements.

    Returns:
        An AndroidAction chosen by the LLM (see action_models).
    """
    # Reading the module-level manager needs no `global` statement;
    # initialize_llm() performs the assignment.
    if llm_manager is None:
        initialize_llm()
    return await llm_manager.get_decision(goal, screen_context)

async def run_agent(goal: str, max_steps=10):
    """Main agent loop: perceive -> reason -> act, up to max_steps iterations.

    Args:
        goal: The user's natural-language goal.
        max_steps: Upper bound on perception/action cycles before giving up.
    """
    initialize_llm()
    print("🚀 Android Use Agent Started")
    print(f"📡 Provider: {llm_manager.provider} | Model: {llm_manager.model}")
    print(f"🎯 Goal: {goal}\n")

    for step in range(max_steps):
        print(f"\n--- Step {step + 1} ---")

        # 1. Perception
        print("👀 Scanning Screen...")
        screen_context = get_screen_state()

        # 2. Reasoning
        print("🧠 Thinking...")
        decision = await get_llm_decision(goal, screen_context)
        print(f"💡 Decision: {decision.reason}")

        # 3. Action
        execute_action(decision)

        # Give the UI time to settle. Use asyncio.sleep, not time.sleep:
        # blocking sleep would stall the event loop this coroutine runs on.
        await asyncio.sleep(2)

if __name__ == "__main__":
    # Example Goal: "Open settings and turn on Wi-Fi"
    # Or your demo goal: "Find the 'Connect' button and tap it"
    GOAL = input("Enter your goal: ")
    asyncio.run(run_agent(GOAL))
115 changes: 115 additions & 0 deletions llm_manager.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
import os
from typing import Optional
from pydantic_ai import Agent
from pydantic_ai.models.openai import OpenAIChatModel
from pydantic_ai.models.anthropic import AnthropicModel
from pydantic_ai.models.google import GoogleModel
from pydantic_ai.models.bedrock import BedrockConverseModel
from action_models import AndroidAction

class LLMManager:
    """Manages LLM provider initialization and agent creation.

    The provider is selected via the LLM_PROVIDER environment variable
    (openai | anthropic | gemini | bedrock); the model id may be overridden
    with <PROVIDER>_MODEL (e.g. OPENAI_MODEL).
    """

    # Fallback model per provider when no <PROVIDER>_MODEL override is set.
    # NOTE(review): confirm "claude-sonnet-4" is a valid Anthropic model
    # alias — dated API ids look like "claude-sonnet-4-20250514".
    DEFAULT_MODELS = {
        "openai": "gpt-4o",
        "anthropic": "claude-sonnet-4",
        "gemini": "gemini-2.0-flash-exp",
        "bedrock": "anthropic.claude-sonnet-4-20250514-v1:0",
    }

    def __init__(self):
        self.provider = self._get_provider()
        self.model = self._get_model()
        self.agent = self._create_agent()

    def _get_provider(self) -> str:
        """Read and validate LLM_PROVIDER from the environment.

        Raises:
            ValueError: if the variable is unset or names an unknown provider.
        """
        # Normalize so "OpenAI" / " openai " behave the same as "openai".
        provider = os.environ.get("LLM_PROVIDER", "").strip().lower()
        if not provider:
            raise ValueError(
                "LLM_PROVIDER environment variable must be set. "
                "Valid values: openai, anthropic, gemini, bedrock"
            )
        if provider not in self.DEFAULT_MODELS:
            raise ValueError(
                f"Invalid LLM_PROVIDER '{provider}'. "
                f"Valid values: {', '.join(self.DEFAULT_MODELS.keys())}"
            )
        return provider

    def _get_model(self) -> str:
        """Return <PROVIDER>_MODEL from the environment, or the provider default."""
        override = os.environ.get(f"{self.provider.upper()}_MODEL")
        return override or self.DEFAULT_MODELS[self.provider]

    def _validate_credentials(self):
        """Fail fast if the selected provider's API key variable is missing.

        Raises:
            ValueError: when the provider-specific key variable is unset.
        """
        required_keys = {
            "openai": "OPENAI_API_KEY",
            "anthropic": "ANTHROPIC_API_KEY",
            "gemini": "GOOGLE_API_KEY",
        }
        key_var = required_keys.get(self.provider)
        if key_var and not os.environ.get(key_var):
            raise ValueError(f"{key_var} environment variable must be set")
        # Bedrock uses AWS credentials - boto3 will handle validation.

    def _create_agent(self) -> Agent:
        """Create a Pydantic AI agent wired to the selected provider and model."""
        self._validate_credentials()

        # Dispatch table instead of an if/elif chain; _get_provider() has
        # already guaranteed the key exists.
        model_classes = {
            "openai": OpenAIChatModel,
            "anthropic": AnthropicModel,
            "gemini": GoogleModel,
            "bedrock": BedrockConverseModel,
        }
        model = model_classes[self.provider](self.model)

        # Structured output: the agent must return one of the AndroidAction types.
        return Agent(
            model=model,
            output_type=AndroidAction,
            system_prompt=self._get_system_prompt(),
        )

    def _get_system_prompt(self) -> str:
        """Get the system prompt for the Android agent."""
        return """You are an Android Driver Agent. Your job is to achieve the user's goal by navigating the UI.

You will receive:
1. The User's Goal.
2. A list of interactive UI elements (JSON) with their (x,y) center coordinates.

You must decide the next action to take.

Available Actions:
- tap: Tap at specific coordinates
- type: Type text into a field
- home: Go to home screen
- back: Go back to previous screen
- wait: Wait for loading or animation
- done: Task is complete

Always provide a clear reason for your action."""

    async def get_decision(self, goal: str, screen_context: str) -> AndroidAction:
        """Get LLM decision for next action.

        Args:
            goal: The user's natural-language goal.
            screen_context: JSON description of the interactive UI elements.

        Returns:
            The structured AndroidAction produced by the agent.
        """
        prompt = f"""GOAL: {goal}

SCREEN_CONTEXT:
{screen_context}

What action should I take next?"""

        result = await self.agent.run(prompt)
        # `.data` is deprecated/removed in the pydantic-ai versions that
        # provide OpenAIChatModel/output_type; the structured result is
        # exposed as `.output`.
        return result.output
6 changes: 5 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1,5 @@
openai>=1.12.0
pydantic-ai-slim[openai,anthropic,google,bedrock]

# Dev dependencies
pytest>=8.0.0
pytest-asyncio>=0.24.0
Loading