From b83b339514524c2305c137fc55a1b26ce15f0cca Mon Sep 17 00:00:00 2001
From: Vigno04 <davivigna2004@gmail.com>
Date: Fri, 27 Mar 2026 17:27:28 +0100
Subject: [PATCH 1/6] Add Google temporary chat mode with metadata fallback

---
 README.md           |  17 ++++++
 app/server/chat.py  | 124 ++++++++++++++++++++++++++++++++++++++++----
 app/utils/config.py |   8 +++
 config/config.yaml  |   2 +
 4 files changed, 141 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index 6b6f485..c75571f 100644
--- a/README.md
+++ b/README.md
@@ -211,6 +211,23 @@ To use Gemini-FastAPI, you need to extract your Gemini session cookies:
 
 Each client entry can be configured with a different proxy to work around rate limits. Omit the `proxy` field or set it to `null` or an empty string to keep a direct connection.
 
+### Chat Session Mode
+
+You can control whether the server reuses Google chat metadata or always starts fresh chats:
+
+```yaml
+gemini:
+  chat_mode: "normal" # "normal" (reuse metadata) or "temporary" (Google temporary chat, not saved to account)
+  fallback_to_internal_on_missing_chat: true # Retry with local history replay when reuse fails
+```
+
+Environment variable equivalents:
+
+```bash
+export CONFIG_GEMINI__CHAT_MODE="temporary"
+export CONFIG_GEMINI__FALLBACK_TO_INTERNAL_ON_MISSING_CHAT=true
+```
+
 ### Custom Models
 
 You can define custom models in `config/config.yaml` or via environment variables.
diff --git a/app/server/chat.py b/app/server/chat.py
index 8d8be91..47a539b 100644
--- a/app/server/chat.py
+++ b/app/server/chat.py
@@ -71,6 +71,15 @@
 MAX_CHARS_PER_REQUEST = int(g_config.gemini.max_chars_per_request * 0.9)
 METADATA_TTL_MINUTES = 15
 
+_MISSING_CHAT_ERROR_MARKERS = (
+    "not found",
+    "404",
+    "invalid",
+    "metadata",
+    "conversation",
+    "chat",
+)
+
 router = APIRouter()
 
 
@@ -745,6 +754,10 @@ async def _find_reusable_session(
     messages: list[Message],
 ) -> tuple[ChatSession | None, GeminiClientWrapper | None, list[Message]]:
     """Find an existing chat session matching the longest suitable history prefix."""
+    if g_config.gemini.chat_mode == "temporary":
+        logger.debug("Temporary chat mode enabled; skipping metadata-based session reuse.")
+        return None, None, messages
+
     if len(messages) < 2:
         return None, None, messages
 
@@ -759,7 +772,14 @@ async def _find_reusable_session(
                     age_minutes = (now - updated_at).total_seconds() / 60
                     if age_minutes <= METADATA_TTL_MINUTES:
                         client = await pool.acquire(conv.client_id)
-                        session = client.start_chat(metadata=conv.metadata, model=model)
+                        try:
+                            session = client.start_chat(metadata=conv.metadata, model=model)
+                        except Exception as exc:
+                            logger.warning(
+                                f"Failed to reuse metadata chat at prefix length {search_end}: {exc}"
+                            )
+                            search_end -= 1
+                            continue
                         remain = messages[search_end:]
                         logger.debug(
                             f"Match found at prefix length {search_end}/{len(messages)}. Client: {conv.client_id}"
@@ -776,7 +796,6 @@ async def _find_reusable_session(
                 logger.warning(
                     f"Error checking LMDB for reusable session at length {search_end}: {e}"
                 )
-                break
         search_end -= 1
 
     logger.debug(f"No reusable session found for {len(messages)} messages.")
@@ -788,13 +807,14 @@ async def _send_with_split(
     text: str,
     files: list[Path | str | io.BytesIO] | None = None,
     stream: bool = False,
+    temporary: bool = False,
 ) -> AsyncGenerator[ModelOutput] | ModelOutput:
     """Send text to Gemini, splitting or converting to attachment if too long."""
     if len(text) <= MAX_CHARS_PER_REQUEST:
         try:
             if stream:
-                return session.send_message_stream(text, files=files)
-            return await session.send_message(text, files=files)
+                return session.send_message_stream(text, files=files, temporary=temporary)
+            return await session.send_message(text, files=files, temporary=temporary)
         except Exception as e:
             logger.exception(f"Error sending message to Gemini: {e}")
             raise
@@ -815,13 +835,73 @@ async def _send_with_split(
             "3. Execute the instructions or answer the questions found *inside* that file immediately.\n"
         )
         if stream:
-            return session.send_message_stream(instruction, files=final_files)
-        return await session.send_message(instruction, files=final_files)
+            return session.send_message_stream(instruction, files=final_files, temporary=temporary)
+        return await session.send_message(instruction, files=final_files, temporary=temporary)
     except Exception as e:
         logger.exception(f"Error sending large text as file to Gemini: {e}")
         raise
 
 
+def _is_missing_chat_error(exc: Exception) -> bool:
+    lowered = str(exc).lower()
+    if not lowered:
+        return False
+    return all(marker in lowered for marker in ("chat", "not found")) or any(
+        marker in lowered for marker in _MISSING_CHAT_ERROR_MARKERS
+    )
+
+
+async def _send_with_internal_fallback(
+    *,
+    pool: GeminiClientPool,
+    model: Model,
+    session: ChatSession,
+    client: GeminiClientWrapper,
+    current_input: str,
+    files: list[Path | str | io.BytesIO],
+    full_prepared_messages: list[Message],
+    tmp_dir: Path,
+    stream: bool,
+    reused_session: bool,
+    temporary: bool,
+) -> tuple[AsyncGenerator[ModelOutput] | ModelOutput, ChatSession, GeminiClientWrapper]:
+    try:
+        output = await _send_with_split(
+            session,
+            current_input,
+            files=files,
+            stream=stream,
+            temporary=temporary,
+        )
+        return output, session, client
+    except Exception as exc:
+        should_fallback = (
+            g_config.gemini.fallback_to_internal_on_missing_chat
+            and reused_session
+            and not stream
+            and _is_missing_chat_error(exc)
+        )
+        if not should_fallback:
+            raise
+
+        logger.warning(
+            "Metadata-backed chat reuse failed; retrying with internal history replay in a fresh chat."
+        )
+        fallback_client = await pool.acquire()
+        fallback_session = fallback_client.start_chat(model=model)
+        fallback_input, fallback_files = await GeminiClientWrapper.process_conversation(
+            full_prepared_messages, tmp_dir
+        )
+        output = await _send_with_split(
+            fallback_session,
+            fallback_input,
+            files=fallback_files,
+            stream=False,
+            temporary=temporary,
+        )
+        return output, fallback_session, fallback_client
+
+
 class StreamingOutputFilter:
     """
     Filter to suppress technical protocol markers, tool calls, and system hints from the stream.
@@ -1603,6 +1683,7 @@ async def create_chat_completion(
     )
 
     session, client, remain = await _find_reusable_session(db, pool, model, msgs)
+    reused_session = session is not None
 
     if session:
         if not remain:
@@ -1636,14 +1717,25 @@ async def create_chat_completion(
 
     completion_id = f"chatcmpl-{uuid.uuid4()}"
     created_time = int(datetime.now(tz=UTC).timestamp())
+    use_google_temporary_mode = g_config.gemini.chat_mode == "temporary"
 
     try:
         assert session and client
         logger.debug(
             f"Client ID: {client.id}, Input length: {len(m_input)}, files count: {len(files)}"
         )
-        resp_or_stream = await _send_with_split(
-            session, m_input, files=files, stream=request.stream
+        resp_or_stream, session, client = await _send_with_internal_fallback(
+            pool=pool,
+            model=model,
+            session=session,
+            client=client,
+            current_input=m_input,
+            files=files,
+            full_prepared_messages=msgs,
+            tmp_dir=tmp_dir,
+            stream=bool(request.stream),
+            reused_session=reused_session,
+            temporary=use_google_temporary_mode,
         )
     except Exception as e:
         logger.exception("Gemini API error")
@@ -1785,6 +1877,7 @@ async def create_response(
         raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(exc)) from exc
 
     session, client, remain = await _find_reusable_session(db, pool, model, messages)
+    reused_session = session is not None
     if session:
         msgs = _prepare_messages_for_model(
             remain,
@@ -1812,14 +1905,25 @@ async def create_response(
 
     response_id = f"resp_{uuid.uuid4().hex}"
     created_time = int(datetime.now(tz=UTC).timestamp())
+    use_google_temporary_mode = g_config.gemini.chat_mode == "temporary"
 
     try:
         assert session and client
         logger.debug(
             f"Client ID: {client.id}, Input length: {len(m_input)}, files count: {len(files)}"
         )
-        resp_or_stream = await _send_with_split(
-            session, m_input, files=files, stream=request.stream
+        resp_or_stream, session, client = await _send_with_internal_fallback(
+            pool=pool,
+            model=model,
+            session=session,
+            client=client,
+            current_input=m_input,
+            files=files,
+            full_prepared_messages=messages,
+            tmp_dir=tmp_dir,
+            stream=bool(request.stream),
+            reused_session=reused_session,
+            temporary=use_google_temporary_mode,
         )
     except Exception as e:
         logger.exception("Gemini API error")
diff --git a/app/utils/config.py b/app/utils/config.py
index 69af2e1..ba168e0 100644
--- a/app/utils/config.py
+++ b/app/utils/config.py
@@ -96,6 +96,14 @@ class GeminiConfig(BaseModel):
         ge=1,
         description="Maximum characters Gemini Web can accept per request",
     )
+    chat_mode: Literal["normal", "temporary"] = Field(
+        default="normal",
+        description="Chat mode: 'normal' reuses Google chat metadata, 'temporary' always starts fresh chats",
+    )
+    fallback_to_internal_on_missing_chat: bool = Field(
+        default=True,
+        description="Retry by replaying local history when metadata-based Google chat reuse fails",
+    )
 
     @field_validator("models", mode="before")
     @classmethod
diff --git a/config/config.yaml b/config/config.yaml
index bd9fbc0..b7581b1 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -28,6 +28,8 @@ gemini:
   refresh_interval: 600    # Refresh interval in seconds (Not less than 60s)
   verbose: false           # Enable verbose logging for Gemini requests
   max_chars_per_request: 1000000     # Maximum characters Gemini Web accepts per request. Non-pro users might have a lower limit
+  chat_mode: "normal"     # "normal" reuses Google chat metadata; "temporary" sends with Google's temporary mode (not saved to account)
+  fallback_to_internal_on_missing_chat: true  # Retry with internal history replay when metadata-based chat is missing
   model_strategy: "append" # Strategy: 'append' (default + custom) or 'overwrite' (custom only)
   models: []
 

From 9144833835e374fec003acdffbe8134ea3eb6607 Mon Sep 17 00:00:00 2001
From: Vigno04 <davivigna2004@gmail.com>
Date: Sat, 28 Mar 2026 09:55:08 +0100
Subject: [PATCH 2/6] Refine chat mode config and fallback detection

---
 README.md           |  2 --
 app/server/chat.py  | 31 ++++++++++++++-----------------
 app/utils/config.py | 16 ++++++++++------
 config/config.yaml  |  1 -
 4 files changed, 24 insertions(+), 26 deletions(-)

diff --git a/README.md b/README.md
index c75571f..41a9030 100644
--- a/README.md
+++ b/README.md
@@ -218,14 +218,12 @@ You can control whether the server reuses Google chat metadata or always starts
 ```yaml
 gemini:
   chat_mode: "normal" # "normal" (reuse metadata) or "temporary" (Google temporary chat, not saved to account)
-  fallback_to_internal_on_missing_chat: true # Retry with local history replay when reuse fails
 ```
 
 Environment variable equivalents:
 
 ```bash
 export CONFIG_GEMINI__CHAT_MODE="temporary"
-export CONFIG_GEMINI__FALLBACK_TO_INTERNAL_ON_MISSING_CHAT=true
 ```
 
 ### Custom Models
diff --git a/app/server/chat.py b/app/server/chat.py
index 47a539b..415c8a9 100644
--- a/app/server/chat.py
+++ b/app/server/chat.py
@@ -1,6 +1,7 @@
 import base64
 import hashlib
 import io
+import re
 import reprlib
 import uuid
 from collections.abc import AsyncGenerator
@@ -54,6 +55,7 @@
 )
 from app.services import GeminiClientPool, GeminiClientWrapper, LMDBConversationStore
 from app.utils import g_config
+from app.utils.config import ChatMode
 from app.utils.helper import (
     STREAM_MASTER_RE,
     STREAM_TAIL_RE,
@@ -71,13 +73,11 @@
 MAX_CHARS_PER_REQUEST = int(g_config.gemini.max_chars_per_request * 0.9)
 METADATA_TTL_MINUTES = 15
 
-_MISSING_CHAT_ERROR_MARKERS = (
-    "not found",
-    "404",
-    "invalid",
-    "metadata",
-    "conversation",
-    "chat",
+_MISSING_CHAT_ERROR_PATTERNS = (
+    # gemini_webapi maps ErrorCode.MODEL_INCONSISTENT (1050) to this message.
+    re.compile(r"\bmodel\s+is\s+inconsistent\s+with\s+the\s+conversation\s+history\b"),
+    # Defensive pattern for equivalent wording in wrappers/alternate versions.
+    re.compile(r"\bconversation\s+history\b[^\n]{0,120}\b(?:inconsistent|mismatch|does\s+not\s+match)\b"),
 )
 
 router = APIRouter()
@@ -754,7 +754,7 @@ async def _find_reusable_session(
     messages: list[Message],
 ) -> tuple[ChatSession | None, GeminiClientWrapper | None, list[Message]]:
     """Find an existing chat session matching the longest suitable history prefix."""
-    if g_config.gemini.chat_mode == "temporary":
+    if g_config.gemini.chat_mode == ChatMode.TEMPORARY:
         logger.debug("Temporary chat mode enabled; skipping metadata-based session reuse.")
         return None, None, messages
 
@@ -843,12 +843,10 @@ async def _send_with_split(
 
 
 def _is_missing_chat_error(exc: Exception) -> bool:
-    lowered = str(exc).lower()
-    if not lowered:
+    normalized = " ".join(part for part in (str(exc), repr(exc)) if part).lower()
+    if not normalized:
         return False
-    return all(marker in lowered for marker in ("chat", "not found")) or any(
-        marker in lowered for marker in _MISSING_CHAT_ERROR_MARKERS
-    )
+    return any(pattern.search(normalized) for pattern in _MISSING_CHAT_ERROR_PATTERNS)
 
 
 async def _send_with_internal_fallback(
@@ -876,8 +874,7 @@ async def _send_with_internal_fallback(
         return output, session, client
     except Exception as exc:
         should_fallback = (
-            g_config.gemini.fallback_to_internal_on_missing_chat
-            and reused_session
+            reused_session
             and not stream
             and _is_missing_chat_error(exc)
         )
@@ -1717,7 +1714,7 @@ async def create_chat_completion(
 
     completion_id = f"chatcmpl-{uuid.uuid4()}"
     created_time = int(datetime.now(tz=UTC).timestamp())
-    use_google_temporary_mode = g_config.gemini.chat_mode == "temporary"
+    use_google_temporary_mode = g_config.gemini.chat_mode == ChatMode.TEMPORARY
 
     try:
         assert session and client
@@ -1905,7 +1902,7 @@ async def create_response(
 
     response_id = f"resp_{uuid.uuid4().hex}"
     created_time = int(datetime.now(tz=UTC).timestamp())
-    use_google_temporary_mode = g_config.gemini.chat_mode == "temporary"
+    use_google_temporary_mode = g_config.gemini.chat_mode == ChatMode.TEMPORARY
 
     try:
         assert session and client
diff --git a/app/utils/config.py b/app/utils/config.py
index ba168e0..7a7c19f 100644
--- a/app/utils/config.py
+++ b/app/utils/config.py
@@ -1,6 +1,7 @@
 import ast
 import os
 import sys
+from enum import Enum
 from typing import Any, Literal
 
 import orjson
@@ -71,6 +72,13 @@ def _parse_json_string(cls, v: Any) -> Any:
         return v
 
 
+class ChatMode(str, Enum):
+    """Chat mode options for Gemini conversation handling."""
+
+    NORMAL = "normal"
+    TEMPORARY = "temporary"
+
+
 class GeminiConfig(BaseModel):
     """Gemini API configuration"""
 
@@ -96,14 +104,10 @@ class GeminiConfig(BaseModel):
         ge=1,
         description="Maximum characters Gemini Web can accept per request",
     )
-    chat_mode: Literal["normal", "temporary"] = Field(
-        default="normal",
+    chat_mode: ChatMode = Field(
+        default=ChatMode.NORMAL,
         description="Chat mode: 'normal' reuses Google chat metadata, 'temporary' always starts fresh chats",
     )
-    fallback_to_internal_on_missing_chat: bool = Field(
-        default=True,
-        description="Retry by replaying local history when metadata-based Google chat reuse fails",
-    )
 
     @field_validator("models", mode="before")
     @classmethod
diff --git a/config/config.yaml b/config/config.yaml
index b7581b1..927dad5 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -29,7 +29,6 @@ gemini:
   verbose: false           # Enable verbose logging for Gemini requests
   max_chars_per_request: 1000000     # Maximum characters Gemini Web accepts per request. Non-pro users might have a lower limit
   chat_mode: "normal"     # "normal" reuses Google chat metadata; "temporary" sends with Google's temporary mode (not saved to account)
-  fallback_to_internal_on_missing_chat: true  # Retry with internal history replay when metadata-based chat is missing
   model_strategy: "append" # Strategy: 'append' (default + custom) or 'overwrite' (custom only)
   models: []
 

From 8eb991d30a0015f421739ace9c3e1994e6e6f462 Mon Sep 17 00:00:00 2001
From: Vigno04 <davivigna2004@gmail.com>
Date: Sat, 28 Mar 2026 10:04:27 +0100
Subject: [PATCH 3/6] Restore break after LMDB lookup error, as suggested by Copilot

---
 app/server/chat.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/app/server/chat.py b/app/server/chat.py
index 415c8a9..8bf6019 100644
--- a/app/server/chat.py
+++ b/app/server/chat.py
@@ -796,6 +796,7 @@ async def _find_reusable_session(
                 logger.warning(
                     f"Error checking LMDB for reusable session at length {search_end}: {e}"
                 )
+                break
         search_end -= 1
 
     logger.debug(f"No reusable session found for {len(messages)} messages.")

From 574fe7d0ffa471afd4d4e57f4b785b0928f240be Mon Sep 17 00:00:00 2001
From: Vigno04 <davivigna2004@gmail.com>
Date: Sat, 28 Mar 2026 11:37:59 +0100
Subject: [PATCH 4/6] Add summary compaction for temporary and fallback flows

---
 README.md           |   7 +-
 app/server/chat.py  | 154 +++++++++++++++++++++++++++++++++++++++-----
 app/utils/config.py |   2 +-
 config/config.yaml  |   2 +-
 4 files changed, 145 insertions(+), 20 deletions(-)

diff --git a/README.md b/README.md
index 41a9030..a891808 100644
--- a/README.md
+++ b/README.md
@@ -213,17 +213,22 @@ Each client entry can be configured with a different proxy to work around rate l
 
 ### Chat Session Mode
 
-You can control whether the server reuses Google chat metadata or always starts fresh chats:
+You can control whether requests use normal Google chats or Google's temporary chat mode:
 
 ```yaml
 gemini:
   chat_mode: "normal" # "normal" (reuse metadata) or "temporary" (Google temporary chat, not saved to account)
+  max_chars_per_request: 1000000
 ```
 
+When `chat_mode` is set to `temporary`, the server applies an internal effective input limit of 90% of `max_chars_per_request`.
+If a temporary-mode request (or fallback full-history replay) exceeds this budget, older turns are compacted into a summary block while recent turns stay verbatim.
+
 Environment variable equivalents:
 
 ```bash
 export CONFIG_GEMINI__CHAT_MODE="temporary"
+export CONFIG_GEMINI__MAX_CHARS_PER_REQUEST=1000000
 ```
 
 ### Custom Models
diff --git a/app/server/chat.py b/app/server/chat.py
index 8bf6019..5cd72c4 100644
--- a/app/server/chat.py
+++ b/app/server/chat.py
@@ -70,8 +70,11 @@
     text_from_message,
 )
 
-MAX_CHARS_PER_REQUEST = int(g_config.gemini.max_chars_per_request * 0.9)
 METADATA_TTL_MINUTES = 15
+SUMMARY_KEEP_LAST_MESSAGES = 8
+SUMMARY_MAX_LINES = 24
+SUMMARY_MAX_LINE_CHARS = 320
+SUMMARY_MAX_TOTAL_CHARS = 6000
 
 _MISSING_CHAT_ERROR_PATTERNS = (
     # gemini_webapi maps ErrorCode.MODEL_INCONSISTENT (1050) to this message.
@@ -83,6 +86,105 @@
 router = APIRouter()
 
 
+def _effective_max_chars_per_request() -> int:
+    """Compute effective request size guardrail from config values."""
+    limit = g_config.gemini.max_chars_per_request
+    if g_config.gemini.chat_mode == ChatMode.TEMPORARY:
+        limit = int(limit * 0.9)
+    return max(limit, 1)
+
+
+def _build_history_summary_message(messages: list[Message]) -> Message | None:
+    """Create a compact summary message for older turns to reduce oversized replay payloads."""
+    if not messages:
+        return None
+
+    summary_lines: list[str] = []
+    used_chars = 0
+    for msg in messages:
+        if len(summary_lines) >= SUMMARY_MAX_LINES or used_chars >= SUMMARY_MAX_TOTAL_CHARS:
+            break
+
+        raw = text_from_message(msg).replace("\n", " ").strip()
+        if not raw and not msg.tool_calls:
+            continue
+
+        if msg.tool_calls:
+            raw = f"{raw} [tool_calls={len(msg.tool_calls)}]".strip()
+
+        if len(raw) > SUMMARY_MAX_LINE_CHARS:
+            raw = f"{raw[: SUMMARY_MAX_LINE_CHARS - 3]}..."
+
+        line = f"- {msg.role}: {raw}"
+        used_chars += len(line)
+        summary_lines.append(line)
+
+    if not summary_lines:
+        return None
+
+    summary_text = (
+        "Conversation summary for older turns (compacted to stay within provider limits):\n"
+        + "\n".join(summary_lines)
+        + "\nUse this as context continuity for earlier turns."
+    )
+    return Message(role="system", content=summary_text)
+
+
+def _compact_messages_with_summary(messages: list[Message]) -> list[Message]:
+    """Keep recent turns verbatim and compact older turns into one summary message."""
+    if len(messages) <= SUMMARY_KEEP_LAST_MESSAGES:
+        return messages
+
+    older = messages[:-SUMMARY_KEEP_LAST_MESSAGES]
+    recent = messages[-SUMMARY_KEEP_LAST_MESSAGES:]
+    summary_msg = _build_history_summary_message(older)
+    if not summary_msg:
+        return messages
+
+    compacted: list[Message] = []
+    if messages and messages[0].role == "system":
+        first = messages[0].model_copy(deep=True)
+        if isinstance(first.content, str):
+            first.content = (
+                f"{first.content}\n\n{summary_msg.content}"
+                if first.content
+                else str(summary_msg.content)
+            )
+            compacted.append(first)
+        else:
+            compacted.append(summary_msg)
+    else:
+        compacted.append(summary_msg)
+
+    compacted.extend(recent)
+    return compacted
+
+
+async def _process_conversation_with_compaction(
+    messages: list[Message],
+    tmp_dir: Path,
+    allow_summary_compaction: bool,
+    reason: str,
+) -> tuple[str, list[Path | str]]:
+    """Build conversation payload and optionally compact oversized histories."""
+    model_input, files = await GeminiClientWrapper.process_conversation(messages, tmp_dir)
+    effective_limit = _effective_max_chars_per_request()
+    if len(model_input) <= effective_limit or not allow_summary_compaction:
+        return model_input, files
+
+    compacted = _compact_messages_with_summary(messages)
+    if compacted == messages:
+        return model_input, files
+
+    compacted_input, compacted_files = await GeminiClientWrapper.process_conversation(
+        compacted, tmp_dir
+    )
+    logger.warning(
+        f"Input too large for {reason} ({len(model_input)}>{effective_limit}); compacted history to {len(compacted_input)} chars before send."
+    )
+    return compacted_input, compacted_files
+
+
 @dataclass
 class StructuredOutputRequirement:
     """Represents a structured response request from the client."""
@@ -754,10 +856,6 @@ async def _find_reusable_session(
     messages: list[Message],
 ) -> tuple[ChatSession | None, GeminiClientWrapper | None, list[Message]]:
     """Find an existing chat session matching the longest suitable history prefix."""
-    if g_config.gemini.chat_mode == ChatMode.TEMPORARY:
-        logger.debug("Temporary chat mode enabled; skipping metadata-based session reuse.")
-        return None, None, messages
-
     if len(messages) < 2:
         return None, None, messages
 
@@ -811,7 +909,8 @@ async def _send_with_split(
     temporary: bool = False,
 ) -> AsyncGenerator[ModelOutput] | ModelOutput:
     """Send text to Gemini, splitting or converting to attachment if too long."""
-    if len(text) <= MAX_CHARS_PER_REQUEST:
+    effective_limit = _effective_max_chars_per_request()
+    if len(text) <= effective_limit:
         try:
             if stream:
                 return session.send_message_stream(text, files=files, temporary=temporary)
@@ -821,7 +920,7 @@ async def _send_with_split(
             raise
 
     logger.info(
-        f"Message length ({len(text)}) exceeds limit ({MAX_CHARS_PER_REQUEST}). Converting text to file attachment."
+        f"Message length ({len(text)}) exceeds effective limit ({effective_limit}). Converting text to file attachment."
     )
     file_obj = io.BytesIO(text.encode("utf-8"))
     file_obj.name = "message.txt"
@@ -887,8 +986,11 @@ async def _send_with_internal_fallback(
         )
         fallback_client = await pool.acquire()
         fallback_session = fallback_client.start_chat(model=model)
-        fallback_input, fallback_files = await GeminiClientWrapper.process_conversation(
-            full_prepared_messages, tmp_dir
+        fallback_input, fallback_files = await _process_conversation_with_compaction(
+            full_prepared_messages,
+            tmp_dir,
+            allow_summary_compaction=True,
+            reason="fallback replay",
         )
         output = await _send_with_split(
             fallback_session,
@@ -1682,6 +1784,7 @@ async def create_chat_completion(
 
     session, client, remain = await _find_reusable_session(db, pool, model, msgs)
     reused_session = session is not None
+    use_google_temporary_mode = g_config.gemini.chat_mode == ChatMode.TEMPORARY
 
     if session:
         if not remain:
@@ -1696,7 +1799,12 @@ async def create_chat_completion(
             extra_instr,
             False,
         )
-        m_input, files = await GeminiClientWrapper.process_conversation(input_msgs, tmp_dir)
+        m_input, files = await _process_conversation_with_compaction(
+            input_msgs,
+            tmp_dir,
+            allow_summary_compaction=use_google_temporary_mode,
+            reason="temporary session replay",
+        )
 
         logger.debug(
             f"Reused session {reprlib.repr(session.metadata)} - sending {len(input_msgs)} prepared messages."
@@ -1706,7 +1814,12 @@ async def create_chat_completion(
             client = await pool.acquire()
             session = client.start_chat(model=model)
             # Use the already prepared 'msgs' for a fresh session
-            m_input, files = await GeminiClientWrapper.process_conversation(msgs, tmp_dir)
+            m_input, files = await _process_conversation_with_compaction(
+                msgs,
+                tmp_dir,
+                allow_summary_compaction=use_google_temporary_mode,
+                reason="temporary fresh replay",
+            )
         except Exception as e:
             logger.exception("Error in preparing conversation")
             raise HTTPException(
@@ -1715,8 +1828,6 @@ async def create_chat_completion(
 
     completion_id = f"chatcmpl-{uuid.uuid4()}"
     created_time = int(datetime.now(tz=UTC).timestamp())
-    use_google_temporary_mode = g_config.gemini.chat_mode == ChatMode.TEMPORARY
-
     try:
         assert session and client
         logger.debug(
@@ -1876,6 +1987,7 @@ async def create_response(
 
     session, client, remain = await _find_reusable_session(db, pool, model, messages)
     reused_session = session is not None
+    use_google_temporary_mode = g_config.gemini.chat_mode == ChatMode.TEMPORARY
     if session:
         msgs = _prepare_messages_for_model(
             remain,
@@ -1886,7 +1998,12 @@ async def create_response(
         )
         if not msgs:
             raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="No new messages.")
-        m_input, files = await GeminiClientWrapper.process_conversation(msgs, tmp_dir)
+        m_input, files = await _process_conversation_with_compaction(
+            msgs,
+            tmp_dir,
+            allow_summary_compaction=use_google_temporary_mode,
+            reason="temporary session replay",
+        )
         logger.debug(
             f"Reused session {reprlib.repr(session.metadata)} - sending {len(msgs)} prepared messages."
         )
@@ -1894,7 +2011,12 @@ async def create_response(
         try:
             client = await pool.acquire()
             session = client.start_chat(model=model)
-            m_input, files = await GeminiClientWrapper.process_conversation(messages, tmp_dir)
+            m_input, files = await _process_conversation_with_compaction(
+                messages,
+                tmp_dir,
+                allow_summary_compaction=use_google_temporary_mode,
+                reason="temporary fresh replay",
+            )
         except Exception as e:
             logger.exception("Error in preparing conversation")
             raise HTTPException(
@@ -1903,8 +2025,6 @@ async def create_response(
 
     response_id = f"resp_{uuid.uuid4().hex}"
     created_time = int(datetime.now(tz=UTC).timestamp())
-    use_google_temporary_mode = g_config.gemini.chat_mode == ChatMode.TEMPORARY
-
     try:
         assert session and client
         logger.debug(
diff --git a/app/utils/config.py b/app/utils/config.py
index 7a7c19f..51838d0 100644
--- a/app/utils/config.py
+++ b/app/utils/config.py
@@ -106,7 +106,7 @@ class GeminiConfig(BaseModel):
     )
     chat_mode: ChatMode = Field(
         default=ChatMode.NORMAL,
-        description="Chat mode: 'normal' reuses Google chat metadata, 'temporary' always starts fresh chats",
+        description="Chat mode: 'normal' uses standard chats, 'temporary' uses Google's temporary mode (not saved to account), enforces an effective input limit of 90% of max_chars_per_request, and compacts older turns into a summary when oversized",
     )
 
     @field_validator("models", mode="before")
diff --git a/config/config.yaml b/config/config.yaml
index 927dad5..0081b77 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -28,7 +28,7 @@ gemini:
   refresh_interval: 600    # Refresh interval in seconds (Not less than 60s)
   verbose: false           # Enable verbose logging for Gemini requests
   max_chars_per_request: 1000000     # Maximum characters Gemini Web accepts per request. Non-pro users might have a lower limit
-  chat_mode: "normal"     # "normal" reuses Google chat metadata; "temporary" sends with Google's temporary mode (not saved to account)
+  chat_mode: "normal"     # "normal" reuses Google chat metadata; "temporary" sends with Google's temporary mode (not saved to account), uses a 90% effective input limit, and compacts older turns into a summary when oversized
   model_strategy: "append" # Strategy: 'append' (default + custom) or 'overwrite' (custom only)
   models: []
 

From 8cc1937af3b7bbd83dcaebf7aa87dac2259aea24 Mon Sep 17 00:00:00 2001
From: Vigno04 <davivigna2004@gmail.com>
Date: Sat, 28 Mar 2026 13:16:19 +0100
Subject: [PATCH 5/6] Add file attachment option for oversized context

---
 README.md           |  6 +++++-
 app/server/chat.py  | 25 ++++++++++++++-----------
 app/utils/config.py |  6 +++++-
 config/config.yaml  |  3 ++-
 4 files changed, 26 insertions(+), 14 deletions(-)

diff --git a/README.md b/README.md
index a891808..f5ee964 100644
--- a/README.md
+++ b/README.md
@@ -219,16 +219,20 @@ You can control whether requests use normal Google chats or Google's temporary c
 gemini:
   chat_mode: "normal" # "normal" (reuse metadata) or "temporary" (Google temporary chat, not saved to account)
   max_chars_per_request: 1000000
+  oversized_context_strategy: "compaction" # "compaction" or "file"
 ```
 
 When `chat_mode` is set to `temporary`, the server applies an internal effective input limit of 90% of `max_chars_per_request`.
-If a temporary-mode request (or fallback full-history replay) exceeds this budget, older turns are compacted into a summary block while recent turns stay verbatim.
+When context exceeds the effective budget, handling is controlled by `oversized_context_strategy`:
+- `compaction`: summarize older turns and keep recent turns verbatim.
+- `file`: attach the oversized context as `message.txt` and instruct the model to read it from that file.
 
 Environment variable equivalents:
 
 ```bash
 export CONFIG_GEMINI__CHAT_MODE="temporary"
 export CONFIG_GEMINI__MAX_CHARS_PER_REQUEST=1000000
+export CONFIG_GEMINI__OVERSIZED_CONTEXT_STRATEGY="compaction"
 ```
 
 ### Custom Models
diff --git a/app/server/chat.py b/app/server/chat.py
index 5cd72c4..337061d 100644
--- a/app/server/chat.py
+++ b/app/server/chat.py
@@ -94,6 +94,11 @@ def _effective_max_chars_per_request() -> int:
     return max(limit, 1)
 
 
+def _should_use_summary_compaction() -> bool:
+    """Return whether oversized context should be compacted instead of sent as file."""
+    return g_config.gemini.oversized_context_strategy == "compaction"
+
+
 def _build_history_summary_message(messages: list[Message]) -> Message | None:
     """Create a compact summary message for older turns to reduce oversized replay payloads."""
     if not messages:
@@ -920,19 +925,17 @@ async def _send_with_split(
             raise
 
     logger.info(
-        f"Message length ({len(text)}) exceeds effective limit ({effective_limit}). Converting text to file attachment."
+        f"Message length ({len(text)}) exceeds effective limit ({effective_limit})."
     )
+    logger.info("Converting oversized message to file attachment.")
     file_obj = io.BytesIO(text.encode("utf-8"))
     file_obj.name = "message.txt"
     try:
         final_files = list(files) if files else []
         final_files.append(file_obj)
         instruction = (
-            "The user's input exceeds the character limit and is provided in the attached file `message.txt`.\n\n"
-            "**System Instruction:**\n"
-            "1. Read the content of `message.txt`.\n"
-            "2. Treat that content as the **primary** user prompt for this turn.\n"
-            "3. Execute the instructions or answer the questions found *inside* that file immediately.\n"
+            "Context is attached in `message.txt`. "
+            "Acknowledge it briefly, then treat it as the primary user input for this turn and answer based on it."
         )
         if stream:
             return session.send_message_stream(instruction, files=final_files, temporary=temporary)
@@ -989,7 +992,7 @@ async def _send_with_internal_fallback(
         fallback_input, fallback_files = await _process_conversation_with_compaction(
             full_prepared_messages,
             tmp_dir,
-            allow_summary_compaction=True,
+            allow_summary_compaction=_should_use_summary_compaction(),
             reason="fallback replay",
         )
         output = await _send_with_split(
@@ -1802,7 +1805,7 @@ async def create_chat_completion(
         m_input, files = await _process_conversation_with_compaction(
             input_msgs,
             tmp_dir,
-            allow_summary_compaction=use_google_temporary_mode,
+            allow_summary_compaction=use_google_temporary_mode and _should_use_summary_compaction(),
             reason="temporary session replay",
         )
 
@@ -1817,7 +1820,7 @@ async def create_chat_completion(
             m_input, files = await _process_conversation_with_compaction(
                 msgs,
                 tmp_dir,
-                allow_summary_compaction=use_google_temporary_mode,
+                allow_summary_compaction=use_google_temporary_mode and _should_use_summary_compaction(),
                 reason="temporary fresh replay",
             )
         except Exception as e:
@@ -2001,7 +2004,7 @@ async def create_response(
         m_input, files = await _process_conversation_with_compaction(
             msgs,
             tmp_dir,
-            allow_summary_compaction=use_google_temporary_mode,
+            allow_summary_compaction=use_google_temporary_mode and _should_use_summary_compaction(),
             reason="temporary session replay",
         )
         logger.debug(
@@ -2014,7 +2017,7 @@ async def create_response(
             m_input, files = await _process_conversation_with_compaction(
                 messages,
                 tmp_dir,
-                allow_summary_compaction=use_google_temporary_mode,
+                allow_summary_compaction=use_google_temporary_mode and _should_use_summary_compaction(),
                 reason="temporary fresh replay",
             )
         except Exception as e:
diff --git a/app/utils/config.py b/app/utils/config.py
index 51838d0..fc5d140 100644
--- a/app/utils/config.py
+++ b/app/utils/config.py
@@ -104,9 +104,13 @@ class GeminiConfig(BaseModel):
         ge=1,
         description="Maximum characters Gemini Web can accept per request",
     )
+    oversized_context_strategy: Literal["compaction", "file"] = Field(
+        default="compaction",
+        description="Strategy for oversized context: 'compaction' summarizes older turns, 'file' sends oversized context as attachment",
+    )
     chat_mode: ChatMode = Field(
         default=ChatMode.NORMAL,
-        description="Chat mode: 'normal' uses standard chats, 'temporary' uses Google's temporary mode (not saved to account), enforces an effective input limit of 90% of max_chars_per_request, and compacts older turns into a summary when oversized",
+        description="Chat mode: 'normal' uses standard chats, 'temporary' uses Google's temporary mode (not saved to account) and enforces an effective input limit of 90% of max_chars_per_request",
     )
 
     @field_validator("models", mode="before")
diff --git a/config/config.yaml b/config/config.yaml
index 0081b77..462167f 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -28,7 +28,8 @@ gemini:
   refresh_interval: 600    # Refresh interval in seconds (Not less than 60s)
   verbose: false           # Enable verbose logging for Gemini requests
   max_chars_per_request: 1000000     # Maximum characters Gemini Web accepts per request. Non-pro users might have a lower limit
-  chat_mode: "normal"     # "normal" reuses Google chat metadata; "temporary" sends with Google's temporary mode (not saved to account), uses a 90% effective input limit, and compacts older turns into a summary when oversized
+  oversized_context_strategy: "compaction" # "compaction" summarizes older turns when oversized; "file" sends oversized context as an attached file
+  chat_mode: "normal"     # "normal" reuses Google chat metadata; "temporary" sends with Google's temporary mode (not saved to account) and uses a 90% effective input limit
   model_strategy: "append" # Strategy: 'append' (default + custom) or 'overwrite' (custom only)
   models: []
 

From 88fff07ae5625baa8929e871460bf198622a0168 Mon Sep 17 00:00:00 2001
From: Vigno04 <davivigna2004@gmail.com>
Date: Tue, 31 Mar 2026 12:37:57 +0200
Subject: [PATCH 6/6] Use OversizedContextStrategy enum, inline compaction check, extract summary prompt

---
 app/server/chat.py  | 28 ++++++++++++----------------
 app/utils/config.py | 11 +++++++++--
 2 files changed, 21 insertions(+), 18 deletions(-)

diff --git a/app/server/chat.py b/app/server/chat.py
index 337061d..dad65f5 100644
--- a/app/server/chat.py
+++ b/app/server/chat.py
@@ -55,7 +55,7 @@
 )
 from app.services import GeminiClientPool, GeminiClientWrapper, LMDBConversationStore
 from app.utils import g_config
-from app.utils.config import ChatMode
+from app.utils.config import ChatMode, OversizedContextStrategy
 from app.utils.helper import (
     STREAM_MASTER_RE,
     STREAM_TAIL_RE,
@@ -75,6 +75,11 @@
 SUMMARY_MAX_LINES = 24
 SUMMARY_MAX_LINE_CHARS = 320
 SUMMARY_MAX_TOTAL_CHARS = 6000
+COMPACTED_SUMMARY_PROMPT = (
+    "Conversation summary for older turns (compacted to stay within provider limits):\n"
+    "{summary}\n"
+    "Use this as context continuity for earlier turns."
+)
 
 _MISSING_CHAT_ERROR_PATTERNS = (
     # gemini_webapi maps ErrorCode.MODEL_INCONSISTENT (1050) to this message.
@@ -94,11 +99,6 @@ def _effective_max_chars_per_request() -> int:
     return max(limit, 1)
 
 
-def _should_use_summary_compaction() -> bool:
-    """Return whether oversized context should be compacted instead of sent as file."""
-    return g_config.gemini.oversized_context_strategy == "compaction"
-
-
 def _build_history_summary_message(messages: list[Message]) -> Message | None:
     """Create a compact summary message for older turns to reduce oversized replay payloads."""
     if not messages:
@@ -127,11 +127,7 @@ def _build_history_summary_message(messages: list[Message]) -> Message | None:
     if not summary_lines:
         return None
 
-    summary_text = (
-        "Conversation summary for older turns (compacted to stay within provider limits):\n"
-        + "\n".join(summary_lines)
-        + "\nUse this as context continuity for earlier turns."
-    )
+    summary_text = COMPACTED_SUMMARY_PROMPT.format(summary="\n".join(summary_lines))
     return Message(role="system", content=summary_text)
 
 
@@ -992,7 +988,7 @@ async def _send_with_internal_fallback(
         fallback_input, fallback_files = await _process_conversation_with_compaction(
             full_prepared_messages,
             tmp_dir,
-            allow_summary_compaction=_should_use_summary_compaction(),
+            allow_summary_compaction=(g_config.gemini.oversized_context_strategy == OversizedContextStrategy.COMPACTION),
             reason="fallback replay",
         )
         output = await _send_with_split(
@@ -1805,7 +1801,7 @@ async def create_chat_completion(
         m_input, files = await _process_conversation_with_compaction(
             input_msgs,
             tmp_dir,
-            allow_summary_compaction=use_google_temporary_mode and _should_use_summary_compaction(),
+            allow_summary_compaction=use_google_temporary_mode and (g_config.gemini.oversized_context_strategy == OversizedContextStrategy.COMPACTION),
             reason="temporary session replay",
         )
 
@@ -1820,7 +1816,7 @@ async def create_chat_completion(
             m_input, files = await _process_conversation_with_compaction(
                 msgs,
                 tmp_dir,
-                allow_summary_compaction=use_google_temporary_mode and _should_use_summary_compaction(),
+                allow_summary_compaction=use_google_temporary_mode and (g_config.gemini.oversized_context_strategy == OversizedContextStrategy.COMPACTION),
                 reason="temporary fresh replay",
             )
         except Exception as e:
@@ -2004,7 +2000,7 @@ async def create_response(
         m_input, files = await _process_conversation_with_compaction(
             msgs,
             tmp_dir,
-            allow_summary_compaction=use_google_temporary_mode and _should_use_summary_compaction(),
+            allow_summary_compaction=use_google_temporary_mode and (g_config.gemini.oversized_context_strategy == OversizedContextStrategy.COMPACTION),
             reason="temporary session replay",
         )
         logger.debug(
@@ -2017,7 +2013,7 @@ async def create_response(
             m_input, files = await _process_conversation_with_compaction(
                 messages,
                 tmp_dir,
-                allow_summary_compaction=use_google_temporary_mode and _should_use_summary_compaction(),
+                allow_summary_compaction=use_google_temporary_mode and (g_config.gemini.oversized_context_strategy == OversizedContextStrategy.COMPACTION),
                 reason="temporary fresh replay",
             )
         except Exception as e:
diff --git a/app/utils/config.py b/app/utils/config.py
index fc5d140..fe806c9 100644
--- a/app/utils/config.py
+++ b/app/utils/config.py
@@ -72,6 +72,13 @@ def _parse_json_string(cls, v: Any) -> Any:
         return v
 
 
+class OversizedContextStrategy(str, Enum):
+    """Strategy for handling oversized context."""
+
+    COMPACTION = "compaction"
+    FILE = "file"
+
+
 class ChatMode(str, Enum):
     """Chat mode options for Gemini conversation handling."""
 
@@ -104,8 +111,8 @@ class GeminiConfig(BaseModel):
         ge=1,
         description="Maximum characters Gemini Web can accept per request",
     )
-    oversized_context_strategy: Literal["compaction", "file"] = Field(
-        default="compaction",
+    oversized_context_strategy: OversizedContextStrategy = Field(
+        default=OversizedContextStrategy.COMPACTION,
         description="Strategy for oversized context: 'compaction' summarizes older turns, 'file' sends oversized context as attachment",
     )
     chat_mode: ChatMode = Field(