From b83b339514524c2305c137fc55a1b26ce15f0cca Mon Sep 17 00:00:00 2001 From: Vigno04 Date: Fri, 27 Mar 2026 17:27:28 +0100 Subject: [PATCH 1/6] Add Google temporary chat mode with metadata fallback --- README.md | 17 ++++++ app/server/chat.py | 124 ++++++++++++++++++++++++++++++++++++++++---- app/utils/config.py | 8 +++ config/config.yaml | 2 + 4 files changed, 141 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 6b6f485..c75571f 100644 --- a/README.md +++ b/README.md @@ -211,6 +211,23 @@ To use Gemini-FastAPI, you need to extract your Gemini session cookies: Each client entry can be configured with a different proxy to work around rate limits. Omit the `proxy` field or set it to `null` or an empty string to keep a direct connection. +### Chat Session Mode + +You can control whether the server reuses Google chat metadata or always starts fresh chats: + +```yaml +gemini: + chat_mode: "normal" # "normal" (reuse metadata) or "temporary" (Google temporary chat, not saved to account) + fallback_to_internal_on_missing_chat: true # Retry with local history replay when reuse fails +``` + +Environment variable equivalents: + +```bash +export CONFIG_GEMINI__CHAT_MODE="temporary" +export CONFIG_GEMINI__FALLBACK_TO_INTERNAL_ON_MISSING_CHAT=true +``` + ### Custom Models You can define custom models in `config/config.yaml` or via environment variables. diff --git a/app/server/chat.py b/app/server/chat.py index 8d8be91..47a539b 100644 --- a/app/server/chat.py +++ b/app/server/chat.py @@ -71,6 +71,15 @@ MAX_CHARS_PER_REQUEST = int(g_config.gemini.max_chars_per_request * 0.9) METADATA_TTL_MINUTES = 15 +_MISSING_CHAT_ERROR_MARKERS = ( + "not found", + "404", + "invalid", + "metadata", + "conversation", + "chat", +) + router = APIRouter() @@ -745,6 +754,10 @@ async def _find_reusable_session( messages: list[Message], ) -> tuple[ChatSession | None, GeminiClientWrapper | None, list[Message]]: """Find an existing chat session matching the longest suitable history prefix.""" + if g_config.gemini.chat_mode == "temporary": + logger.debug("Temporary chat mode enabled; skipping metadata-based session reuse.") + return None, None, messages + if len(messages) < 2: return None, None, messages @@ -759,7 +772,14 @@ async def _find_reusable_session( age_minutes = (now - updated_at).total_seconds() / 60 if age_minutes <= METADATA_TTL_MINUTES: client = await pool.acquire(conv.client_id) - session = client.start_chat(metadata=conv.metadata, model=model) + try: + session = client.start_chat(metadata=conv.metadata, model=model) + except Exception as exc: + logger.warning( + f"Failed to reuse metadata chat at prefix length {search_end}: {exc}" + ) + search_end -= 1 + continue remain = messages[search_end:] logger.debug( f"Match found at prefix length {search_end}/{len(messages)}. Client: {conv.client_id}" @@ -776,7 +796,6 @@ async def _find_reusable_session( logger.warning( f"Error checking LMDB for reusable session at length {search_end}: {e}" ) - break search_end -= 1 logger.debug(f"No reusable session found for {len(messages)} messages.") @@ -788,13 +807,14 @@ async def _send_with_split( text: str, files: list[Path | str | io.BytesIO] | None = None, stream: bool = False, + temporary: bool = False, ) -> AsyncGenerator[ModelOutput] | ModelOutput: """Send text to Gemini, splitting or converting to attachment if too long.""" if len(text) <= MAX_CHARS_PER_REQUEST: try: if stream: - return session.send_message_stream(text, files=files) - return await session.send_message(text, files=files) + return session.send_message_stream(text, files=files, temporary=temporary) + return await session.send_message(text, files=files, temporary=temporary) except Exception as e: logger.exception(f"Error sending message to Gemini: {e}") raise @@ -815,13 +835,73 @@ async def _send_with_split( "3. Execute the instructions or answer the questions found *inside* that file immediately.\n" ) if stream: - return session.send_message_stream(instruction, files=final_files) - return await session.send_message(instruction, files=final_files) + return session.send_message_stream(instruction, files=final_files, temporary=temporary) + return await session.send_message(instruction, files=final_files, temporary=temporary) except Exception as e: logger.exception(f"Error sending large text as file to Gemini: {e}") raise +def _is_missing_chat_error(exc: Exception) -> bool: + lowered = str(exc).lower() + if not lowered: + return False + return all(marker in lowered for marker in ("chat", "not found")) or any( + marker in lowered for marker in _MISSING_CHAT_ERROR_MARKERS + ) + + +async def _send_with_internal_fallback( + *, + pool: GeminiClientPool, + model: Model, + session: ChatSession, + client: GeminiClientWrapper, + current_input: str, + files: list[Path | str | io.BytesIO], + full_prepared_messages: list[Message], + tmp_dir: Path, + stream: bool, + reused_session: bool, + temporary: bool, +) -> tuple[AsyncGenerator[ModelOutput] | ModelOutput, ChatSession, GeminiClientWrapper]: + try: + output = await _send_with_split( + session, + current_input, + files=files, + stream=stream, + temporary=temporary, + ) + return output, session, client + except Exception as exc: + should_fallback = ( + g_config.gemini.fallback_to_internal_on_missing_chat + and reused_session + and not stream + and _is_missing_chat_error(exc) + ) + if not should_fallback: + raise + + logger.warning( + "Metadata-backed chat reuse failed; retrying with internal history replay in a fresh chat." + ) + fallback_client = await pool.acquire() + fallback_session = fallback_client.start_chat(model=model) + fallback_input, fallback_files = await GeminiClientWrapper.process_conversation( + full_prepared_messages, tmp_dir + ) + output = await _send_with_split( + fallback_session, + fallback_input, + files=fallback_files, + stream=False, + temporary=temporary, + ) + return output, fallback_session, fallback_client + + class StreamingOutputFilter: """ Filter to suppress technical protocol markers, tool calls, and system hints from the stream. @@ -1603,6 +1683,7 @@ async def create_chat_completion( ) session, client, remain = await _find_reusable_session(db, pool, model, msgs) + reused_session = session is not None if session: if not remain: @@ -1636,14 +1717,25 @@ async def create_chat_completion( completion_id = f"chatcmpl-{uuid.uuid4()}" created_time = int(datetime.now(tz=UTC).timestamp()) + use_google_temporary_mode = g_config.gemini.chat_mode == "temporary" try: assert session and client logger.debug( f"Client ID: {client.id}, Input length: {len(m_input)}, files count: {len(files)}" ) - resp_or_stream = await _send_with_split( - session, m_input, files=files, stream=request.stream + resp_or_stream, session, client = await _send_with_internal_fallback( + pool=pool, + model=model, + session=session, + client=client, + current_input=m_input, + files=files, + full_prepared_messages=msgs, + tmp_dir=tmp_dir, + stream=bool(request.stream), + reused_session=reused_session, + temporary=use_google_temporary_mode, ) except Exception as e: logger.exception("Gemini API error") @@ -1785,6 +1877,7 @@ async def create_response( raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(exc)) from exc session, client, remain = await _find_reusable_session(db, pool, model, messages) + reused_session = session is not None if session: msgs = _prepare_messages_for_model( remain, @@ -1812,14 +1905,25 @@ async def create_response( response_id = f"resp_{uuid.uuid4().hex}" created_time = int(datetime.now(tz=UTC).timestamp()) + use_google_temporary_mode = g_config.gemini.chat_mode == "temporary" try: assert session and client logger.debug( f"Client ID: {client.id}, Input length: {len(m_input)}, files count: {len(files)}" ) - resp_or_stream = await _send_with_split( - session, m_input, files=files, stream=request.stream + resp_or_stream, session, client = await _send_with_internal_fallback( + pool=pool, + model=model, + session=session, + client=client, + current_input=m_input, + files=files, + full_prepared_messages=messages, + tmp_dir=tmp_dir, + stream=bool(request.stream), + reused_session=reused_session, + temporary=use_google_temporary_mode, ) except Exception as e: logger.exception("Gemini API error") diff --git a/app/utils/config.py b/app/utils/config.py index 69af2e1..ba168e0 100644 --- a/app/utils/config.py +++ b/app/utils/config.py @@ -96,6 +96,14 @@ class GeminiConfig(BaseModel): ge=1, description="Maximum characters Gemini Web can accept per request", ) + chat_mode: Literal["normal", "temporary"] = Field( + default="normal", + description="Chat mode: 'normal' reuses Google chat metadata, 'temporary' always starts fresh chats", + ) + fallback_to_internal_on_missing_chat: bool = Field( + default=True, + description="Retry by replaying local history when metadata-based Google chat reuse fails", + ) @field_validator("models", mode="before") @classmethod diff --git a/config/config.yaml b/config/config.yaml index bd9fbc0..b7581b1 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -28,6 +28,8 @@ gemini: refresh_interval: 600 # Refresh interval in seconds (Not less than 60s) verbose: false # Enable verbose logging for Gemini requests max_chars_per_request: 1000000 # Maximum characters Gemini Web accepts per request. Non-pro users might have a lower limit + chat_mode: "normal" # "normal" reuses Google chat metadata; "temporary" sends with Google's temporary mode (not saved to account) + fallback_to_internal_on_missing_chat: true # Retry with internal history replay when metadata-based chat is missing model_strategy: "append" # Strategy: 'append' (default + custom) or 'overwrite' (custom only) models: [] From 9144833835e374fec003acdffbe8134ea3eb6607 Mon Sep 17 00:00:00 2001 From: Vigno04 Date: Sat, 28 Mar 2026 09:55:08 +0100 Subject: [PATCH 2/6] Refine chat mode config and fallback detection --- README.md | 2 -- app/server/chat.py | 31 ++++++++++++++----------------- app/utils/config.py | 16 ++++++++++------ config/config.yaml | 1 - 4 files changed, 24 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index c75571f..41a9030 100644 --- a/README.md +++ b/README.md @@ -218,14 +218,12 @@ You can control whether the server reuses Google chat metadata or always starts ```yaml gemini: chat_mode: "normal" # "normal" (reuse metadata) or "temporary" (Google temporary chat, not saved to account) - fallback_to_internal_on_missing_chat: true # Retry with local history replay when reuse fails ``` Environment variable equivalents: ```bash export CONFIG_GEMINI__CHAT_MODE="temporary" -export CONFIG_GEMINI__FALLBACK_TO_INTERNAL_ON_MISSING_CHAT=true ``` ### Custom Models diff --git a/app/server/chat.py b/app/server/chat.py index 47a539b..415c8a9 100644 --- a/app/server/chat.py +++ b/app/server/chat.py @@ -1,6 +1,7 @@ import base64 import hashlib import io +import re import reprlib import uuid from collections.abc import AsyncGenerator @@ -54,6 +55,7 @@ ) from app.services import GeminiClientPool, GeminiClientWrapper, LMDBConversationStore from app.utils import g_config +from app.utils.config import ChatMode from app.utils.helper import ( STREAM_MASTER_RE, STREAM_TAIL_RE, @@ -71,13 +73,11 @@ MAX_CHARS_PER_REQUEST = int(g_config.gemini.max_chars_per_request * 0.9) METADATA_TTL_MINUTES = 15 -_MISSING_CHAT_ERROR_MARKERS = ( - "not found", - "404", - "invalid", - "metadata", - "conversation", - "chat", +_MISSING_CHAT_ERROR_PATTERNS = ( + # gemini_webapi maps ErrorCode.MODEL_INCONSISTENT (1050) to this message. + re.compile(r"\bmodel\s+is\s+inconsistent\s+with\s+the\s+conversation\s+history\b"), + # Defensive pattern for equivalent wording in wrappers/alternate versions. + re.compile(r"\bconversation\s+history\b[^\n]{0,120}\b(?:inconsistent|mismatch|does\s+not\s+match)\b"), ) router = APIRouter() @@ -754,7 +754,7 @@ async def _find_reusable_session( messages: list[Message], ) -> tuple[ChatSession | None, GeminiClientWrapper | None, list[Message]]: """Find an existing chat session matching the longest suitable history prefix.""" - if g_config.gemini.chat_mode == "temporary": + if g_config.gemini.chat_mode == ChatMode.TEMPORARY: logger.debug("Temporary chat mode enabled; skipping metadata-based session reuse.") return None, None, messages @@ -843,12 +843,10 @@ async def _send_with_split( def _is_missing_chat_error(exc: Exception) -> bool: - lowered = str(exc).lower() - if not lowered: + normalized = " ".join(part for part in (str(exc), repr(exc)) if part).lower() + if not normalized: return False - return all(marker in lowered for marker in ("chat", "not found")) or any( - marker in lowered for marker in _MISSING_CHAT_ERROR_MARKERS - ) + return any(pattern.search(normalized) for pattern in _MISSING_CHAT_ERROR_PATTERNS) async def _send_with_internal_fallback( @@ -876,8 +874,7 @@ async def _send_with_internal_fallback( return output, session, client except Exception as exc: should_fallback = ( - g_config.gemini.fallback_to_internal_on_missing_chat - and reused_session + reused_session and not stream and _is_missing_chat_error(exc) ) @@ -1717,7 +1714,7 @@ async def create_chat_completion( completion_id = f"chatcmpl-{uuid.uuid4()}" created_time = int(datetime.now(tz=UTC).timestamp()) - use_google_temporary_mode = g_config.gemini.chat_mode == "temporary" + use_google_temporary_mode = g_config.gemini.chat_mode == ChatMode.TEMPORARY try: assert session and client @@ -1905,7 +1902,7 @@ async def create_response( response_id = f"resp_{uuid.uuid4().hex}" created_time = int(datetime.now(tz=UTC).timestamp()) - use_google_temporary_mode = g_config.gemini.chat_mode == "temporary" + use_google_temporary_mode = g_config.gemini.chat_mode == ChatMode.TEMPORARY try: assert session and client diff --git a/app/utils/config.py b/app/utils/config.py index ba168e0..7a7c19f 100644 --- a/app/utils/config.py +++ b/app/utils/config.py @@ -1,6 +1,7 @@ import ast import os import sys +from enum import Enum from typing import Any, Literal import orjson @@ -71,6 +72,13 @@ def _parse_json_string(cls, v: Any) -> Any: return v +class ChatMode(str, Enum): + """Chat mode options for Gemini conversation handling.""" + + NORMAL = "normal" + TEMPORARY = "temporary" + + class GeminiConfig(BaseModel): """Gemini API configuration""" @@ -96,14 +104,10 @@ class GeminiConfig(BaseModel): ge=1, description="Maximum characters Gemini Web can accept per request", ) - chat_mode: Literal["normal", "temporary"] = Field( - default="normal", + chat_mode: ChatMode = Field( + default=ChatMode.NORMAL, description="Chat mode: 'normal' reuses Google chat metadata, 'temporary' always starts fresh chats", ) - fallback_to_internal_on_missing_chat: bool = Field( - default=True, - description="Retry by replaying local history when metadata-based Google chat reuse fails", - ) @field_validator("models", mode="before") @classmethod diff --git a/config/config.yaml b/config/config.yaml index b7581b1..927dad5 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -29,7 +29,6 @@ gemini: verbose: false # Enable verbose logging for Gemini requests max_chars_per_request: 1000000 # Maximum characters Gemini Web accepts per request. Non-pro users might have a lower limit chat_mode: "normal" # "normal" reuses Google chat metadata; "temporary" sends with Google's temporary mode (not saved to account) - fallback_to_internal_on_missing_chat: true # Retry with internal history replay when metadata-based chat is missing model_strategy: "append" # Strategy: 'append' (default + custom) or 'overwrite' (custom only) models: [] From 8eb991d30a0015f421739ace9c3e1994e6e6f462 Mon Sep 17 00:00:00 2001 From: Vigno04 Date: Sat, 28 Mar 2026 10:04:27 +0100 Subject: [PATCH 3/6] added break like suggested by copilot --- app/server/chat.py | 1 + 1 file changed, 1 insertion(+) diff --git a/app/server/chat.py b/app/server/chat.py index 415c8a9..8bf6019 100644 --- a/app/server/chat.py +++ b/app/server/chat.py @@ -796,6 +796,7 @@ async def _find_reusable_session( logger.warning( f"Error checking LMDB for reusable session at length {search_end}: {e}" ) + break search_end -= 1 logger.debug(f"No reusable session found for {len(messages)} messages.") From 574fe7d0ffa471afd4d4e57f4b785b0928f240be Mon Sep 17 00:00:00 2001 From: Vigno04 Date: Sat, 28 Mar 2026 11:37:59 +0100 Subject: [PATCH 4/6] Add summary compaction for temporary and fallback flows --- README.md | 7 +- app/server/chat.py | 154 +++++++++++++++++++++++++++++++++++++++----- app/utils/config.py | 2 +- config/config.yaml | 2 +- 4 files changed, 145 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index 41a9030..a891808 100644 --- a/README.md +++ b/README.md @@ -213,17 +213,22 @@ Each client entry can be configured with a different proxy to work around rate l ### Chat Session Mode -You can control whether the server reuses Google chat metadata or always starts fresh chats: +You can control whether requests use normal Google chats or Google's temporary chat mode: ```yaml gemini: chat_mode: "normal" # "normal" (reuse metadata) or "temporary" (Google temporary chat, not saved to account) + max_chars_per_request: 1000000 ``` +When `chat_mode` is set to `temporary`, the server applies an internal effective input limit of 90% of `max_chars_per_request`. +If a temporary-mode request (or fallback full-history replay) exceeds this budget, older turns are compacted into a summary block while recent turns stay verbatim. + Environment variable equivalents: ```bash export CONFIG_GEMINI__CHAT_MODE="temporary" +export CONFIG_GEMINI__MAX_CHARS_PER_REQUEST=1000000 ``` ### Custom Models diff --git a/app/server/chat.py b/app/server/chat.py index 8bf6019..5cd72c4 100644 --- a/app/server/chat.py +++ b/app/server/chat.py @@ -70,8 +70,11 @@ text_from_message, ) -MAX_CHARS_PER_REQUEST = int(g_config.gemini.max_chars_per_request * 0.9) METADATA_TTL_MINUTES = 15 +SUMMARY_KEEP_LAST_MESSAGES = 8 +SUMMARY_MAX_LINES = 24 +SUMMARY_MAX_LINE_CHARS = 320 +SUMMARY_MAX_TOTAL_CHARS = 6000 _MISSING_CHAT_ERROR_PATTERNS = ( # gemini_webapi maps ErrorCode.MODEL_INCONSISTENT (1050) to this message. @@ -83,6 +86,105 @@ router = APIRouter() +def _effective_max_chars_per_request() -> int: + """Compute effective request size guardrail from config values.""" + limit = g_config.gemini.max_chars_per_request + if g_config.gemini.chat_mode == ChatMode.TEMPORARY: + limit = int(limit * 0.9) + return max(limit, 1) + + +def _build_history_summary_message(messages: list[Message]) -> Message | None: + """Create a compact summary message for older turns to reduce oversized replay payloads.""" + if not messages: + return None + + summary_lines: list[str] = [] + used_chars = 0 + for msg in messages: + if len(summary_lines) >= SUMMARY_MAX_LINES or used_chars >= SUMMARY_MAX_TOTAL_CHARS: + break + + raw = text_from_message(msg).replace("\n", " ").strip() + if not raw and not msg.tool_calls: + continue + + if msg.tool_calls: + raw = f"{raw} [tool_calls={len(msg.tool_calls)}]".strip() + + if len(raw) > SUMMARY_MAX_LINE_CHARS: + raw = f"{raw[: SUMMARY_MAX_LINE_CHARS - 3]}..." + + line = f"- {msg.role}: {raw}" + used_chars += len(line) + summary_lines.append(line) + + if not summary_lines: + return None + + summary_text = ( + "Conversation summary for older turns (compacted to stay within provider limits):\n" + + "\n".join(summary_lines) + + "\nUse this as context continuity for earlier turns." + ) + return Message(role="system", content=summary_text) + + +def _compact_messages_with_summary(messages: list[Message]) -> list[Message]: + """Keep recent turns verbatim and compact older turns into one summary message.""" + if len(messages) <= SUMMARY_KEEP_LAST_MESSAGES: + return messages + + older = messages[:-SUMMARY_KEEP_LAST_MESSAGES] + recent = messages[-SUMMARY_KEEP_LAST_MESSAGES:] + summary_msg = _build_history_summary_message(older) + if not summary_msg: + return messages + + compacted: list[Message] = [] + if messages and messages[0].role == "system": + first = messages[0].model_copy(deep=True) + if isinstance(first.content, str): + first.content = ( + f"{first.content}\n\n{summary_msg.content}" + if first.content + else str(summary_msg.content) + ) + compacted.append(first) + else: + compacted.append(summary_msg) + else: + compacted.append(summary_msg) + + compacted.extend(recent) + return compacted + + +async def _process_conversation_with_compaction( + messages: list[Message], + tmp_dir: Path, + allow_summary_compaction: bool, + reason: str, +) -> tuple[str, list[Path | str]]: + """Build conversation payload and optionally compact oversized histories.""" + model_input, files = await GeminiClientWrapper.process_conversation(messages, tmp_dir) + effective_limit = _effective_max_chars_per_request() + if len(model_input) <= effective_limit or not allow_summary_compaction: + return model_input, files + + compacted = _compact_messages_with_summary(messages) + if compacted == messages: + return model_input, files + + compacted_input, compacted_files = await GeminiClientWrapper.process_conversation( + compacted, tmp_dir + ) + logger.warning( + f"Input too large for {reason} ({len(model_input)}>{effective_limit}); compacted history to {len(compacted_input)} chars before send." + ) + return compacted_input, compacted_files + + @dataclass class StructuredOutputRequirement: """Represents a structured response request from the client.""" @@ -754,10 +856,6 @@ async def _find_reusable_session( messages: list[Message], ) -> tuple[ChatSession | None, GeminiClientWrapper | None, list[Message]]: """Find an existing chat session matching the longest suitable history prefix.""" - if g_config.gemini.chat_mode == ChatMode.TEMPORARY: - logger.debug("Temporary chat mode enabled; skipping metadata-based session reuse.") - return None, None, messages - if len(messages) < 2: return None, None, messages @@ -811,7 +909,8 @@ async def _send_with_split( temporary: bool = False, ) -> AsyncGenerator[ModelOutput] | ModelOutput: """Send text to Gemini, splitting or converting to attachment if too long.""" - if len(text) <= MAX_CHARS_PER_REQUEST: + effective_limit = _effective_max_chars_per_request() + if len(text) <= effective_limit: try: if stream: return session.send_message_stream(text, files=files, temporary=temporary) @@ -821,7 +920,7 @@ async def _send_with_split( raise logger.info( - f"Message length ({len(text)}) exceeds limit ({MAX_CHARS_PER_REQUEST}). Converting text to file attachment." + f"Message length ({len(text)}) exceeds effective limit ({effective_limit}). Converting text to file attachment." ) file_obj = io.BytesIO(text.encode("utf-8")) file_obj.name = "message.txt" @@ -887,8 +986,11 @@ async def _send_with_internal_fallback( ) fallback_client = await pool.acquire() fallback_session = fallback_client.start_chat(model=model) - fallback_input, fallback_files = await GeminiClientWrapper.process_conversation( - full_prepared_messages, tmp_dir + fallback_input, fallback_files = await _process_conversation_with_compaction( + full_prepared_messages, + tmp_dir, + allow_summary_compaction=True, + reason="fallback replay", ) output = await _send_with_split( fallback_session, @@ -1682,6 +1784,7 @@ async def create_chat_completion( session, client, remain = await _find_reusable_session(db, pool, model, msgs) reused_session = session is not None + use_google_temporary_mode = g_config.gemini.chat_mode == ChatMode.TEMPORARY if session: if not remain: @@ -1696,7 +1799,12 @@ async def create_chat_completion( extra_instr, False, ) - m_input, files = await GeminiClientWrapper.process_conversation(input_msgs, tmp_dir) + m_input, files = await _process_conversation_with_compaction( + input_msgs, + tmp_dir, + allow_summary_compaction=use_google_temporary_mode, + reason="temporary session replay", + ) logger.debug( f"Reused session {reprlib.repr(session.metadata)} - sending {len(input_msgs)} prepared messages." @@ -1706,7 +1814,12 @@ async def create_chat_completion( client = await pool.acquire() session = client.start_chat(model=model) # Use the already prepared 'msgs' for a fresh session - m_input, files = await GeminiClientWrapper.process_conversation(msgs, tmp_dir) + m_input, files = await _process_conversation_with_compaction( + msgs, + tmp_dir, + allow_summary_compaction=use_google_temporary_mode, + reason="temporary fresh replay", + ) except Exception as e: logger.exception("Error in preparing conversation") raise HTTPException( @@ -1715,8 +1828,6 @@ async def create_chat_completion( completion_id = f"chatcmpl-{uuid.uuid4()}" created_time = int(datetime.now(tz=UTC).timestamp()) - use_google_temporary_mode = g_config.gemini.chat_mode == ChatMode.TEMPORARY - try: assert session and client logger.debug( @@ -1876,6 +1987,7 @@ async def create_response( session, client, remain = await _find_reusable_session(db, pool, model, messages) reused_session = session is not None + use_google_temporary_mode = g_config.gemini.chat_mode == ChatMode.TEMPORARY if session: msgs = _prepare_messages_for_model( remain, @@ -1886,7 +1998,12 @@ async def create_response( ) if not msgs: raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="No new messages.") - m_input, files = await GeminiClientWrapper.process_conversation(msgs, tmp_dir) + m_input, files = await _process_conversation_with_compaction( + msgs, + tmp_dir, + allow_summary_compaction=use_google_temporary_mode, + reason="temporary session replay", + ) logger.debug( f"Reused session {reprlib.repr(session.metadata)} - sending {len(msgs)} prepared messages." ) @@ -1894,7 +2011,12 @@ async def create_response( try: client = await pool.acquire() session = client.start_chat(model=model) - m_input, files = await GeminiClientWrapper.process_conversation(messages, tmp_dir) + m_input, files = await _process_conversation_with_compaction( + messages, + tmp_dir, + allow_summary_compaction=use_google_temporary_mode, + reason="temporary fresh replay", + ) except Exception as e: logger.exception("Error in preparing conversation") raise HTTPException( @@ -1903,8 +2025,6 @@ async def create_response( response_id = f"resp_{uuid.uuid4().hex}" created_time = int(datetime.now(tz=UTC).timestamp()) - use_google_temporary_mode = g_config.gemini.chat_mode == ChatMode.TEMPORARY - try: assert session and client logger.debug( diff --git a/app/utils/config.py b/app/utils/config.py index 7a7c19f..51838d0 100644 --- a/app/utils/config.py +++ b/app/utils/config.py @@ -106,7 +106,7 @@ class GeminiConfig(BaseModel): ) chat_mode: ChatMode = Field( default=ChatMode.NORMAL, - description="Chat mode: 'normal' reuses Google chat metadata, 'temporary' always starts fresh chats", + description="Chat mode: 'normal' uses standard chats, 'temporary' uses Google's temporary mode (not saved to account), enforces an effective input limit of 90% of max_chars_per_request, and compacts older turns into a summary when oversized", ) @field_validator("models", mode="before") diff --git a/config/config.yaml b/config/config.yaml index 927dad5..0081b77 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -28,7 +28,7 @@ gemini: refresh_interval: 600 # Refresh interval in seconds (Not less than 60s) verbose: false # Enable verbose logging for Gemini requests max_chars_per_request: 1000000 # Maximum characters Gemini Web accepts per request. Non-pro users might have a lower limit - chat_mode: "normal" # "normal" reuses Google chat metadata; "temporary" sends with Google's temporary mode (not saved to account) + chat_mode: "normal" # "normal" reuses Google chat metadata; "temporary" sends with Google's temporary mode (not saved to account), uses a 90% effective input limit, and compacts older turns into a summary when oversized model_strategy: "append" # Strategy: 'append' (default + custom) or 'overwrite' (custom only) models: [] From 8cc1937af3b7bbd83dcaebf7aa87dac2259aea24 Mon Sep 17 00:00:00 2001 From: Vigno04 Date: Sat, 28 Mar 2026 13:16:19 +0100 Subject: [PATCH 5/6] added file option for context --- README.md | 6 +++++- app/server/chat.py | 25 ++++++++++++++----------- app/utils/config.py | 6 +++++- config/config.yaml | 3 ++- 4 files changed, 26 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index a891808..f5ee964 100644 --- a/README.md +++ b/README.md @@ -219,16 +219,20 @@ You can control whether requests use normal Google chats or Google's temporary c gemini: chat_mode: "normal" # "normal" (reuse metadata) or "temporary" (Google temporary chat, not saved to account) max_chars_per_request: 1000000 + oversized_context_strategy: "compaction" # "compaction" or "file" ``` When `chat_mode` is set to `temporary`, the server applies an internal effective input limit of 90% of `max_chars_per_request`. -If a temporary-mode request (or fallback full-history replay) exceeds this budget, older turns are compacted into a summary block while recent turns stay verbatim. +When context exceeds the effective budget, handling is controlled by `oversized_context_strategy`: +- `compaction`: summarize older turns and keep recent turns verbatim. +- `file`: attach oversized context as `message.txt` and process it from file. Environment variable equivalents: ```bash export CONFIG_GEMINI__CHAT_MODE="temporary" export CONFIG_GEMINI__MAX_CHARS_PER_REQUEST=1000000 +export CONFIG_GEMINI__OVERSIZED_CONTEXT_STRATEGY="compaction" ``` ### Custom Models diff --git a/app/server/chat.py b/app/server/chat.py index 5cd72c4..337061d 100644 --- a/app/server/chat.py +++ b/app/server/chat.py @@ -94,6 +94,11 @@ def _effective_max_chars_per_request() -> int: return max(limit, 1) +def _should_use_summary_compaction() -> bool: + """Return whether oversized context should be compacted instead of sent as file.""" + return g_config.gemini.oversized_context_strategy == "compaction" + + def _build_history_summary_message(messages: list[Message]) -> Message | None: """Create a compact summary message for older turns to reduce oversized replay payloads.""" if not messages: @@ -920,19 +925,17 @@ async def _send_with_split( raise logger.info( - f"Message length ({len(text)}) exceeds effective limit ({effective_limit}). Converting text to file attachment." + f"Message length ({len(text)}) exceeds effective limit ({effective_limit})." ) + logger.info("Converting oversized message to file attachment.") file_obj = io.BytesIO(text.encode("utf-8")) file_obj.name = "message.txt" try: final_files = list(files) if files else [] final_files.append(file_obj) instruction = ( - "The user's input exceeds the character limit and is provided in the attached file `message.txt`.\n\n" - "**System Instruction:**\n" - "1. Read the content of `message.txt`.\n" - "2. Treat that content as the **primary** user prompt for this turn.\n" - "3. Execute the instructions or answer the questions found *inside* that file immediately.\n" + "Context is attached in `message.txt`. " + "Acknowledge it briefly, then treat it as the primary user input for this turn and answer based on it." ) if stream: return session.send_message_stream(instruction, files=final_files, temporary=temporary) @@ -989,7 +992,7 @@ async def _send_with_internal_fallback( fallback_input, fallback_files = await _process_conversation_with_compaction( full_prepared_messages, tmp_dir, - allow_summary_compaction=True, + allow_summary_compaction=_should_use_summary_compaction(), reason="fallback replay", ) output = await _send_with_split( @@ -1802,7 +1805,7 @@ async def create_chat_completion( m_input, files = await _process_conversation_with_compaction( input_msgs, tmp_dir, - allow_summary_compaction=use_google_temporary_mode, + allow_summary_compaction=use_google_temporary_mode and _should_use_summary_compaction(), reason="temporary session replay", ) @@ -1817,7 +1820,7 @@ async def create_chat_completion( m_input, files = await _process_conversation_with_compaction( msgs, tmp_dir, - allow_summary_compaction=use_google_temporary_mode, + allow_summary_compaction=use_google_temporary_mode and _should_use_summary_compaction(), reason="temporary fresh replay", ) except Exception as e: @@ -2001,7 +2004,7 @@ async def create_response( m_input, files = await _process_conversation_with_compaction( msgs, tmp_dir, - allow_summary_compaction=use_google_temporary_mode, + allow_summary_compaction=use_google_temporary_mode and _should_use_summary_compaction(), reason="temporary session replay", ) logger.debug( @@ -2014,7 +2017,7 @@ async def create_response( m_input, files = await _process_conversation_with_compaction( messages, tmp_dir, - allow_summary_compaction=use_google_temporary_mode, + allow_summary_compaction=use_google_temporary_mode and _should_use_summary_compaction(), reason="temporary fresh replay", ) except Exception as e: diff --git a/app/utils/config.py b/app/utils/config.py index 51838d0..fc5d140 100644 --- a/app/utils/config.py +++ b/app/utils/config.py @@ -104,9 +104,13 @@ class GeminiConfig(BaseModel): ge=1, description="Maximum characters Gemini Web can accept per request", ) + oversized_context_strategy: Literal["compaction", "file"] = Field( + default="compaction", + description="Strategy for oversized context: 'compaction' summarizes older turns, 'file' sends oversized context as attachment", + ) chat_mode: ChatMode = Field( default=ChatMode.NORMAL, - description="Chat mode: 'normal' uses standard chats, 'temporary' uses Google's temporary mode (not saved to account), enforces an effective input limit of 90% of max_chars_per_request, and compacts older turns into a summary when oversized", + description="Chat mode: 'normal' uses standard chats, 'temporary' uses Google's temporary mode (not saved to account) and enforces an effective input limit of 90% of max_chars_per_request", ) @field_validator("models", mode="before") diff --git a/config/config.yaml b/config/config.yaml index 0081b77..462167f 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -28,7 +28,8 @@ gemini: refresh_interval: 600 # Refresh interval in seconds (Not less than 60s) verbose: false # Enable verbose logging for Gemini requests max_chars_per_request: 1000000 # Maximum characters Gemini Web accepts per request. Non-pro users might have a lower limit - chat_mode: "normal" # "normal" reuses Google chat metadata; "temporary" sends with Google's temporary mode (not saved to account), uses a 90% effective input limit, and compacts older turns into a summary when oversized + oversized_context_strategy: "compaction" # "compaction" summarizes older turns when oversized; "file" sends oversized context as attached file + chat_mode: "normal" # "normal" reuses Google chat metadata; "temporary" sends with Google's temporary mode (not saved to account) and uses a 90% effective input limit model_strategy: "append" # Strategy: 'append' (default + custom) or 'overwrite' (custom only) models: [] From 88fff07ae5625baa8929e871460bf198622a0168 Mon Sep 17 00:00:00 2001 From: Vigno04 Date: Tue, 31 Mar 2026 12:37:57 +0200 Subject: [PATCH 6/6] simplified some logic --- app/server/chat.py | 28 ++++++++++++---------------- app/utils/config.py | 11 +++++++++-- 2 files changed, 21 insertions(+), 18 deletions(-) diff --git a/app/server/chat.py b/app/server/chat.py index 337061d..dad65f5 100644 --- a/app/server/chat.py +++ b/app/server/chat.py @@ -55,7 +55,7 @@ ) from app.services import GeminiClientPool, GeminiClientWrapper, LMDBConversationStore from app.utils import g_config -from app.utils.config import ChatMode +from app.utils.config import ChatMode, OversizedContextStrategy from app.utils.helper import ( STREAM_MASTER_RE, STREAM_TAIL_RE, @@ -75,6 +75,11 @@ SUMMARY_MAX_LINES = 24 SUMMARY_MAX_LINE_CHARS = 320 SUMMARY_MAX_TOTAL_CHARS = 6000 +COMPACTED_SUMMARY_PROMPT = ( + "Conversation summary for older turns (compacted to stay within provider limits):\n" + "{summary}\n" + "Use this as context continuity for earlier turns." +) _MISSING_CHAT_ERROR_PATTERNS = ( # gemini_webapi maps ErrorCode.MODEL_INCONSISTENT (1050) to this message. @@ -94,11 +99,6 @@ def _effective_max_chars_per_request() -> int: return max(limit, 1) -def _should_use_summary_compaction() -> bool: - """Return whether oversized context should be compacted instead of sent as file.""" - return g_config.gemini.oversized_context_strategy == "compaction" - - def _build_history_summary_message(messages: list[Message]) -> Message | None: """Create a compact summary message for older turns to reduce oversized replay payloads.""" if not messages: @@ -127,11 +127,7 @@ def _build_history_summary_message(messages: list[Message]) -> Message | None: if not summary_lines: return None - summary_text = ( - "Conversation summary for older turns (compacted to stay within provider limits):\n" - + "\n".join(summary_lines) - + "\nUse this as context continuity for earlier turns." - ) + summary_text = COMPACTED_SUMMARY_PROMPT.format(summary="\n".join(summary_lines)) return Message(role="system", content=summary_text) @@ -992,7 +988,7 @@ async def _send_with_internal_fallback( fallback_input, fallback_files = await _process_conversation_with_compaction( full_prepared_messages, tmp_dir, - allow_summary_compaction=_should_use_summary_compaction(), + allow_summary_compaction=(g_config.gemini.oversized_context_strategy == OversizedContextStrategy.COMPACTION), reason="fallback replay", ) output = await _send_with_split( @@ -1805,7 +1801,7 @@ async def create_chat_completion( m_input, files = await _process_conversation_with_compaction( input_msgs, tmp_dir, - allow_summary_compaction=use_google_temporary_mode and _should_use_summary_compaction(), + allow_summary_compaction=use_google_temporary_mode and (g_config.gemini.oversized_context_strategy == OversizedContextStrategy.COMPACTION), reason="temporary session replay", ) @@ -1820,7 +1816,7 @@ async def create_chat_completion( m_input, files = await _process_conversation_with_compaction( msgs, tmp_dir, - allow_summary_compaction=use_google_temporary_mode and _should_use_summary_compaction(), + allow_summary_compaction=use_google_temporary_mode and (g_config.gemini.oversized_context_strategy == OversizedContextStrategy.COMPACTION), reason="temporary fresh replay", ) except Exception as e: @@ -2004,7 +2000,7 @@ async def create_response( m_input, files = await _process_conversation_with_compaction( msgs, tmp_dir, - allow_summary_compaction=use_google_temporary_mode and _should_use_summary_compaction(), + allow_summary_compaction=use_google_temporary_mode and (g_config.gemini.oversized_context_strategy == OversizedContextStrategy.COMPACTION), reason="temporary session replay", ) logger.debug( @@ -2017,7 +2013,7 @@ async def create_response( m_input, files = await _process_conversation_with_compaction( messages, tmp_dir, - allow_summary_compaction=use_google_temporary_mode and _should_use_summary_compaction(), + allow_summary_compaction=use_google_temporary_mode and (g_config.gemini.oversized_context_strategy == OversizedContextStrategy.COMPACTION), reason="temporary fresh replay", ) except Exception as e: diff --git a/app/utils/config.py b/app/utils/config.py index fc5d140..fe806c9 100644 --- a/app/utils/config.py +++ b/app/utils/config.py @@ -72,6 +72,13 @@ def _parse_json_string(cls, v: Any) -> Any: return v +class OversizedContextStrategy(str, Enum): + """Strategy for handling oversized context.""" + + COMPACTION = "compaction" + FILE = "file" + + class ChatMode(str, Enum): """Chat mode options for Gemini conversation handling.""" @@ -104,8 +111,8 @@ class GeminiConfig(BaseModel): ge=1, description="Maximum characters Gemini Web can accept per request", ) - oversized_context_strategy: Literal["compaction", "file"] = Field( - default="compaction", + oversized_context_strategy: OversizedContextStrategy = Field( + default=OversizedContextStrategy.COMPACTION, description="Strategy for oversized context: 'compaction' summarizes older turns, 'file' sends oversized context as attachment", ) chat_mode: ChatMode = Field(