Atlas/gemini_client.py at main · coded-devs/Atlas · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
"""
gemini_client.py — Shared Gemini call wrapper with model fallback.

Both atlas.py (CLI) and app.py (Streamlit) route their generate_content
calls through smart_generate(). When a model returns a 429 /
RESOURCE_EXHAUSTED rate-limit error, a 503 / UNAVAILABLE overload error,
or a 404 / NOT_FOUND missing-model error, we transparently fall back to
the next model in MODEL_CHAIN instead of failing the whole request.

Any other error (bad request, auth, other server errors, etc.) is
re-raised unchanged so callers can handle it normally.
"""

# Models to try, ordered by preference; smart_generate() walks this list
# front to back and returns the first successful response.
# Invalid/unavailable models are skipped automatically, because a
# 404 / NOT_FOUND error is treated as a fallback trigger rather than a
# failure. The chain ends with the high-quota models (1.5-flash: 1500 RPD)
# as a safety net.
# NOTE(review): the quota figures in the per-model notes below ("RPD",
# presumably requests per day on the free tier) are the author's notes —
# confirm against current Gemini quota documentation before relying on them.
MODEL_CHAIN = [
    "gemini-3.5-flash",      # newer, 20 RPD free
    "gemini-3.1-pro",        # may not be available, will skip via 404
    "gemini-3.0-flash",      # may not be available, will skip via 404
    "gemini-2.5-flash",      # confirmed working, 20 RPD free
    "gemini-2.5-pro",        # exists but may have 0 quota
    "gemini-2.5-flash-lite", # lightweight variant
    "gemini-2.0-flash",      # confirmed working, 20 RPD free
    "gemini-1.5-flash",      # older but 1500 RPD free, reliable
    "gemini-1.5-flash-8b",   # lightweight, 1500 RPD free
]


def _should_fallback(err: Exception) -> bool:
    """
    True for 429 / RESOURCE_EXHAUSTED rate-limit errors, 503 / UNAVAILABLE
    overload errors, and 404 / NOT_FOUND missing model errors.

    The google-genai SDK surfaces these as an APIError with code == 429 and
    status "RESOURCE_EXHAUSTED". We also fall back to string matching so the
    detection survives SDK changes — but we never treat an arbitrary error
    as a rate limit.
    """
    code = getattr(err, "code", None)
    if code in (429, 503, 404):
        return True

    status = getattr(err, "status", None)
    if status and any(x in str(status) for x in ["RESOURCE_EXHAUSTED", "UNAVAILABLE", "NOT_FOUND"]):
        return True

    message = str(err)
    return any(x in message for x in ["429", "RESOURCE_EXHAUSTED", "503", "UNAVAILABLE", "404", "NOT_FOUND"])


def _fallback_reason(err):
    """
    Return the status-message prefix describing why *err* caused a fallback.

    Inspects the error's ``code`` / ``status`` attributes and its text.
    Anything not clearly identified as a missing model (404 / NOT_FOUND) or
    an overload (503 / UNAVAILABLE) keeps the "Rate limited on" wording.
    """
    code = getattr(err, "code", None)
    if code == 429:
        return "Rate limited on"

    details = f"{getattr(err, 'status', None)} {err}"
    if code == 404 or "NOT_FOUND" in details:
        return "Model not found:"
    if code == 503 or "UNAVAILABLE" in details:
        return "Service unavailable on"
    return "Rate limited on"


def smart_generate(client, contents, config, on_status=None):
    """
    Call client.models.generate_content(), trying each model in MODEL_CHAIN
    until one succeeds or all are exhausted.

    Args:
        client:   a google.genai Client.
        contents: the conversation contents to send.
        config:   a GenerateContentConfig (system prompt + tools).
        on_status: optional callback(str). When provided (e.g. the Streamlit
                   status container's .write), it receives which model is in
                   use and any fallback messages, so the user can see the
                   fallback happen. Fallback messages are always printed too;
                   the "Using model" notice goes to on_status only.

    Returns:
        The generate_content response from the first model that works.

    Raises:
        The last fallback-eligible error (as judged by _should_fallback:
        rate limit, overload, or missing model) if every model in the chain
        fails that way, or any other error immediately (unchanged).
        RuntimeError if MODEL_CHAIN is empty.
    """

    def notify(text):
        # Fallback notices always reach stdout; on_status is an extra sink.
        print(text)
        if on_status:
            on_status(text)

    last_error = None
    final_index = len(MODEL_CHAIN) - 1

    for index, model in enumerate(MODEL_CHAIN):
        if on_status:
            on_status(f"Using model: `{model}`")

        try:
            return client.models.generate_content(
                model=model,
                contents=contents,
                config=config,
            )
        except Exception as e:
            # Only certain errors trigger fallback. Anything else
            # is a real failure — re-raise it untouched.
            if not _should_fallback(e):
                raise

            last_error = e
            # Report the actual cause instead of always claiming a rate limit.
            reason = _fallback_reason(e)
            if index < final_index:
                notify(f"{reason} {model}, switching to {MODEL_CHAIN[index + 1]}...")
            else:
                notify(f"{reason} {model}, no more models to try.")

    # Exhausted every model in the chain.
    if last_error is not None:
        raise last_error
    raise RuntimeError("smart_generate: MODEL_CHAIN is empty.")