# Source: MuSEAgent/run_react.py (DeepExperience/MuSEAgent repository, main branch)
"""
Example: ReAct mode with tool use
Enables the agent to use tools in a ReAct (Reasoning + Acting) loop.
"""

import asyncio
from runner import Runner


async def main():
    """Evaluate the agent in ReAct mode: tools are enabled, retrieval is not.

    Assembles the agent configuration, passes it to a ``Runner`` together
    with the benchmark dataset paths, awaits ``run_evaluation()``, and prints
    the accuracy summary taken from the returned statistics mapping.
    """
    # Tools made available to the agent through the "tool_bank" setting.
    enabled_tools = [
        "ocr",
        "solve_math_equation",
        "web_search",
        "localize_objects",
        "zoom_in",
        "calculator",
        "crop",
        "visualize_regions",
        "estimate_region_depth",
        "estimate_object_depth",
        "get_image2images_similarity",
        "get_image2texts_similarity",
        "get_text2images_similarity",
    ]

    agent_config = {
        "tool_bank": enabled_tools,
        "model_name": "qwen3-vl-32b-instruct",
        "max_tokens": 40000,
        "temperature": 0.7,
        "memory_dir": "memory/react",
        "max_iterations": 20,
        # Replace with your API endpoint
        "base_url": "http://localhost:8000/v1",
        # Replace with your actual API keys
        "api_keys": ["your-api-key-here"],
    }

    evaluator = Runner(
        jsonl_path="data/benchmark/example_dataset.jsonl",
        image_dir="data/benchmark/images",
        agent_config=agent_config,
        output_dir="results/react",
        batch_size=100,
        max_concurrent=10,
        verbose=True,
    )
    stats = await evaluator.run_evaluation()

    # One 80-column rule reused for the top, middle and bottom of the report.
    rule = "=" * 80
    print("\n" + rule)
    print("📊 Evaluation Statistics")
    print(rule)
    print(f"Overall Accuracy: {stats['accuracy']*100:.2f}%")
    print(f"Total: {stats['total']}, Correct: {stats['correct']}")
    print(rule)


if __name__ == "__main__":
    # Only start the evaluation when this file is executed as a script;
    # asyncio.run() drives the main() coroutine to completion.
    asyncio.run(main())