diff --git a/README.md b/README.md
index 6b6f485..f5ee964 100644
--- a/README.md
+++ b/README.md
@@ -211,6 +211,38 @@ To use Gemini-FastAPI, you need to extract your Gemini session cookies:
 
 Each client entry can be configured with a different proxy to work around rate limits. Omit the `proxy` field or set it to `null` or an empty string to keep a direct connection.
 
+### Chat Session Mode
+
+You can control whether requests run in standard Google chats or in Google's temporary chat mode:
+
+```yaml
+gemini:
+  chat_mode: "normal" # "normal" (reuse metadata) or "temporary" (Google temporary chat, not saved to the account)
+  max_chars_per_request: 1000000
+  oversized_context_strategy: "compaction" # "compaction" or "file"
+```
+
+When `chat_mode` is set to `temporary`, the server applies an internal effective input limit of 90% of `max_chars_per_request`.
+When the prepared context exceeds this effective budget during a history replay, handling is controlled by `oversized_context_strategy`:
+- `compaction`: summarizes older turns and keeps the most recent turns verbatim.
+- `file`: attaches the oversized context as `message.txt` and instructs the model to read it from the file.
+
+Environment variable equivalents:
+
+```bash
+export CONFIG_GEMINI__CHAT_MODE="temporary"
+export CONFIG_GEMINI__MAX_CHARS_PER_REQUEST=1000000
+export CONFIG_GEMINI__OVERSIZED_CONTEXT_STRATEGY="compaction"
+```
+
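+For example, with `chat_mode: "temporary"` set, any OpenAI-compatible request is served through Google's temporary chat mode with no client-side changes. The endpoint path, host, port, model name, and headers below are illustrative; adjust them to your deployment:
+
+```bash
+curl http://localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{"model": "gemini-2.5-flash", "messages": [{"role": "user", "content": "Hello"}]}'
+```
+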
 ### Custom Models
 
 You can define custom models in `config/config.yaml` or via environment variables.
diff --git a/app/server/chat.py b/app/server/chat.py
index 8d8be91..dad65f5 100644
--- a/app/server/chat.py
+++ b/app/server/chat.py
@@ -1,6 +1,7 @@
 import base64
 import hashlib
 import io
+import re
 import reprlib
 import uuid
 from collections.abc import AsyncGenerator
@@ -54,6 +55,7 @@
 )
 from app.services import GeminiClientPool, GeminiClientWrapper, LMDBConversationStore
 from app.utils import g_config
+from app.utils.config import ChatMode, OversizedContextStrategy
 from app.utils.helper import (
     STREAM_MASTER_RE,
     STREAM_TAIL_RE,
@@ -68,12 +70,127 @@
     text_from_message,
 )
 
-MAX_CHARS_PER_REQUEST = int(g_config.gemini.max_chars_per_request * 0.9)
 METADATA_TTL_MINUTES = 15
+SUMMARY_KEEP_LAST_MESSAGES = 8
+SUMMARY_MAX_LINES = 24
+SUMMARY_MAX_LINE_CHARS = 320
+SUMMARY_MAX_TOTAL_CHARS = 6000
+COMPACTED_SUMMARY_PROMPT = (
+    "Conversation summary for older turns (compacted to stay within provider limits):\n"
+    "{summary}\n"
+    "Use this as context continuity for earlier turns."
+)
+
+_MISSING_CHAT_ERROR_PATTERNS = (
+    # gemini_webapi maps ErrorCode.MODEL_INCONSISTENT (1050) to this message.
+    re.compile(r"\bmodel\s+is\s+inconsistent\s+with\s+the\s+conversation\s+history\b"),
+    # Defensive pattern for equivalent wording in wrappers/alternate versions.
+    re.compile(r"\bconversation\s+history\b[^\n]{0,120}\b(?:inconsistent|mismatch|does\s+not\s+match)\b"),
+)
 
 router = APIRouter()
 
 
+def _effective_max_chars_per_request() -> int:
+    """Compute effective request size guardrail from config values."""
+    limit = g_config.gemini.max_chars_per_request
+    if g_config.gemini.chat_mode == ChatMode.TEMPORARY:
+        limit = int(limit * 0.9)
+    return max(limit, 1)
+
+
+def _build_history_summary_message(messages: list[Message]) -> Message | None:
+    """Create a compact summary message for older turns to reduce oversized replay payloads."""
+    if not messages:
+        return None
+
+    summary_lines: list[str] = []
+    used_chars = 0
+    for msg in messages:
+        if len(summary_lines) >= SUMMARY_MAX_LINES or used_chars >= SUMMARY_MAX_TOTAL_CHARS:
+            break
+
+        raw = text_from_message(msg).replace("\n", " ").strip()
+        if not raw and not msg.tool_calls:
+            continue
+
+        if msg.tool_calls:
+            raw = f"{raw} [tool_calls={len(msg.tool_calls)}]".strip()
+
+        if len(raw) > SUMMARY_MAX_LINE_CHARS:
+            raw = f"{raw[: SUMMARY_MAX_LINE_CHARS - 3]}..."
+
+        line = f"- {msg.role}: {raw}"
+        used_chars += len(line)
+        summary_lines.append(line)
+
+    if not summary_lines:
+        return None
+
+    summary_text = COMPACTED_SUMMARY_PROMPT.format(summary="\n".join(summary_lines))
+    return Message(role="system", content=summary_text)
+
+
+def _compact_messages_with_summary(messages: list[Message]) -> list[Message]:
+    """Keep recent turns verbatim and compact older turns into one summary message."""
+    if len(messages) <= SUMMARY_KEEP_LAST_MESSAGES:
+        return messages
+
+    older = messages[:-SUMMARY_KEEP_LAST_MESSAGES]
+    recent = messages[-SUMMARY_KEEP_LAST_MESSAGES:]
+    summary_msg = _build_history_summary_message(older)
+    if not summary_msg:
+        return messages
+
+    compacted: list[Message] = []
+    if messages and messages[0].role == "system":
+        first = messages[0].model_copy(deep=True)
+        if isinstance(first.content, str):
+            first.content = (
+                f"{first.content}\n\n{summary_msg.content}"
+                if first.content
+                else str(summary_msg.content)
+            )
+            compacted.append(first)
+        else:
+            compacted.append(summary_msg)
+    else:
+        compacted.append(summary_msg)
+
+    compacted.extend(recent)
+    return compacted
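+
+# Illustrative shape of a compacted replay: one leading system message (the original
+# system prompt, when present, merged with COMPACTED_SUMMARY_PROMPT and its
+# "- role: text" summary lines), followed by the last SUMMARY_KEEP_LAST_MESSAGES
+# turns kept verbatim.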
+
+
+async def _process_conversation_with_compaction(
+    messages: list[Message],
+    tmp_dir: Path,
+    allow_summary_compaction: bool,
+    reason: str,
+) -> tuple[str, list[Path | str]]:
+    """Build conversation payload and optionally compact oversized histories."""
+    model_input, files = await GeminiClientWrapper.process_conversation(messages, tmp_dir)
+    effective_limit = _effective_max_chars_per_request()
+    if len(model_input) <= effective_limit or not allow_summary_compaction:
+        return model_input, files
+
+    compacted = _compact_messages_with_summary(messages)
+    if compacted == messages:
+        return model_input, files
+
+    compacted_input, compacted_files = await GeminiClientWrapper.process_conversation(
+        compacted, tmp_dir
+    )
+    logger.warning(
+        f"Input too large for {reason} ({len(model_input)}>{effective_limit}); compacted history to {len(compacted_input)} chars before send."
+    )
+    return compacted_input, compacted_files
+
+
 @dataclass
 class StructuredOutputRequirement:
     """Represents a structured response request from the client."""
@@ -759,7 +871,14 @@ async def _find_reusable_session(
             age_minutes = (now - updated_at).total_seconds() / 60
             if age_minutes <= METADATA_TTL_MINUTES:
                 client = await pool.acquire(conv.client_id)
-                session = client.start_chat(metadata=conv.metadata, model=model)
+                try:
+                    session = client.start_chat(metadata=conv.metadata, model=model)
+                except Exception as exc:
+                    logger.warning(
+                        f"Failed to reuse metadata chat at prefix length {search_end}: {exc}"
+                    )
+                    search_end -= 1
+                    continue
                 remain = messages[search_end:]
                 logger.debug(
                     f"Match found at prefix length {search_end}/{len(messages)}. Client: {conv.client_id}"
                 )
@@ -788,40 +907,107 @@ async def _send_with_split(
     text: str,
     files: list[Path | str | io.BytesIO] | None = None,
     stream: bool = False,
+    temporary: bool = False,
 ) -> AsyncGenerator[ModelOutput] | ModelOutput:
     """Send text to Gemini, splitting or converting to attachment if too long."""
-    if len(text) <= MAX_CHARS_PER_REQUEST:
+    effective_limit = _effective_max_chars_per_request()
+    if len(text) <= effective_limit:
         try:
             if stream:
-                return session.send_message_stream(text, files=files)
-            return await session.send_message(text, files=files)
+                return session.send_message_stream(text, files=files, temporary=temporary)
+            return await session.send_message(text, files=files, temporary=temporary)
         except Exception as e:
             logger.exception(f"Error sending message to Gemini: {e}")
             raise
 
     logger.info(
-        f"Message length ({len(text)}) exceeds limit ({MAX_CHARS_PER_REQUEST}). Converting text to file attachment."
+        f"Message length ({len(text)}) exceeds effective limit ({effective_limit})."
    )
+    logger.info("Converting oversized message to file attachment.")
     file_obj = io.BytesIO(text.encode("utf-8"))
     file_obj.name = "message.txt"
     try:
         final_files = list(files) if files else []
         final_files.append(file_obj)
         instruction = (
-            "The user's input exceeds the character limit and is provided in the attached file `message.txt`.\n\n"
-            "**System Instruction:**\n"
-            "1. Read the content of `message.txt`.\n"
-            "2. Treat that content as the **primary** user prompt for this turn.\n"
-            "3. Execute the instructions or answer the questions found *inside* that file immediately.\n"
+            "Context is attached in `message.txt`. "
+            "Acknowledge it briefly, then treat it as the primary user input for this turn and answer based on it."
         )
         if stream:
-            return session.send_message_stream(instruction, files=final_files)
-        return await session.send_message(instruction, files=final_files)
+            return session.send_message_stream(instruction, files=final_files, temporary=temporary)
+        return await session.send_message(instruction, files=final_files, temporary=temporary)
     except Exception as e:
         logger.exception(f"Error sending large text as file to Gemini: {e}")
         raise
 
 
+def _is_missing_chat_error(exc: Exception) -> bool:
+    """Heuristically detect gemini_webapi's 'model inconsistent with history' error."""
+    normalized = " ".join(part for part in (str(exc), repr(exc)) if part).lower()
+    if not normalized:
+        return False
+    return any(pattern.search(normalized) for pattern in _MISSING_CHAT_ERROR_PATTERNS)
+
+
+async def _send_with_internal_fallback(
+    *,
+    pool: GeminiClientPool,
+    model: Model,
+    session: ChatSession,
+    client: GeminiClientWrapper,
+    current_input: str,
+    files: list[Path | str | io.BytesIO],
+    full_prepared_messages: list[Message],
+    tmp_dir: Path,
+    stream: bool,
+    reused_session: bool,
+    temporary: bool,
+) -> tuple[AsyncGenerator[ModelOutput] | ModelOutput, ChatSession, GeminiClientWrapper]:
+    """Send once; on a failed metadata reuse (non-streaming), replay history in a fresh chat."""
+    try:
+        output = await _send_with_split(
+            session,
+            current_input,
+            files=files,
+            stream=stream,
+            temporary=temporary,
+        )
+        return output, session, client
+    except Exception as exc:
+        should_fallback = (
+            reused_session
+            and not stream
+            and _is_missing_chat_error(exc)
+        )
+        if not should_fallback:
+            raise
+
+        logger.warning(
+            "Metadata-backed chat reuse failed; retrying with internal history replay in a fresh chat."
+        )
+        fallback_client = await pool.acquire()
+        fallback_session = fallback_client.start_chat(model=model)
+        fallback_input, fallback_files = await _process_conversation_with_compaction(
+            full_prepared_messages,
+            tmp_dir,
+            allow_summary_compaction=(g_config.gemini.oversized_context_strategy == OversizedContextStrategy.COMPACTION),
+            reason="fallback replay",
+        )
+        output = await _send_with_split(
+            fallback_session,
+            fallback_input,
+            files=fallback_files,
+            stream=False,
+            temporary=temporary,
+        )
+        return output, fallback_session, fallback_client
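+
+# Call-site sketch (see create_chat_completion / create_response below):
+#   resp_or_stream, session, client = await _send_with_internal_fallback(...)
+# Callers must adopt the returned session/client pair, because a fallback abandons
+# the metadata-backed chat in favor of a fresh one.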
+
+
 class StreamingOutputFilter:
     """
     Filter to suppress technical protocol markers, tool calls, and system hints from the stream.
@@ -1603,6 +1782,8 @@ async def create_chat_completion(
             )
 
     session, client, remain = await _find_reusable_session(db, pool, model, msgs)
+    reused_session = session is not None
+    use_google_temporary_mode = g_config.gemini.chat_mode == ChatMode.TEMPORARY
 
     if session:
         if not remain:
@@ -1617,7 +1798,12 @@
                 extra_instr,
                 False,
             )
-            m_input, files = await GeminiClientWrapper.process_conversation(input_msgs, tmp_dir)
+            m_input, files = await _process_conversation_with_compaction(
+                input_msgs,
+                tmp_dir,
+                allow_summary_compaction=use_google_temporary_mode and (g_config.gemini.oversized_context_strategy == OversizedContextStrategy.COMPACTION),
+                reason="temporary session replay",
+            )
             logger.debug(
                 f"Reused session {reprlib.repr(session.metadata)} - sending {len(input_msgs)} prepared messages."
             )
@@ -1627,7 +1813,12 @@
     else:
         try:
             client = await pool.acquire()
             session = client.start_chat(model=model)
             # Use the already prepared 'msgs' for a fresh session
-            m_input, files = await GeminiClientWrapper.process_conversation(msgs, tmp_dir)
+            m_input, files = await _process_conversation_with_compaction(
+                msgs,
+                tmp_dir,
+                allow_summary_compaction=use_google_temporary_mode and (g_config.gemini.oversized_context_strategy == OversizedContextStrategy.COMPACTION),
+                reason="temporary fresh replay",
+            )
         except Exception as e:
             logger.exception("Error in preparing conversation")
             raise HTTPException(
@@ -1636,14 +1827,23 @@
     completion_id = f"chatcmpl-{uuid.uuid4()}"
     created_time = int(datetime.now(tz=UTC).timestamp())
-
     try:
         assert session and client
         logger.debug(
            f"Client ID: {client.id}, Input length: {len(m_input)}, files count: {len(files)}"
        )
-        resp_or_stream = await _send_with_split(
-            session, m_input, files=files, stream=request.stream
+        resp_or_stream, session, client = await _send_with_internal_fallback(
+            pool=pool,
+            model=model,
+            session=session,
+            client=client,
+            current_input=m_input,
+            files=files,
+            full_prepared_messages=msgs,
+            tmp_dir=tmp_dir,
+            stream=bool(request.stream),
+            reused_session=reused_session,
+            temporary=use_google_temporary_mode,
         )
     except Exception as e:
         logger.exception("Gemini API error")
@@ -1785,6 +1985,8 @@ async def create_response(
         raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(exc)) from exc
 
     session, client, remain = await _find_reusable_session(db, pool, model, messages)
+    reused_session = session is not None
+    use_google_temporary_mode = g_config.gemini.chat_mode == ChatMode.TEMPORARY
 
     if session:
         msgs = _prepare_messages_for_model(
             remain,
@@ -1795,7 +1997,12 @@
         )
         if not msgs:
             raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="No new messages.")
-        m_input, files = await GeminiClientWrapper.process_conversation(msgs, tmp_dir)
+        m_input, files = await _process_conversation_with_compaction(
+            msgs,
+            tmp_dir,
+            allow_summary_compaction=use_google_temporary_mode and (g_config.gemini.oversized_context_strategy == OversizedContextStrategy.COMPACTION),
+            reason="temporary session replay",
+        )
         logger.debug(
             f"Reused session {reprlib.repr(session.metadata)} - sending {len(msgs)} prepared messages."
         )
@@ -1803,7 +2010,12 @@
     else:
         try:
             client = await pool.acquire()
             session = client.start_chat(model=model)
-            m_input, files = await GeminiClientWrapper.process_conversation(messages, tmp_dir)
+            m_input, files = await _process_conversation_with_compaction(
+                messages,
+                tmp_dir,
+                allow_summary_compaction=use_google_temporary_mode and (g_config.gemini.oversized_context_strategy == OversizedContextStrategy.COMPACTION),
+                reason="temporary fresh replay",
+            )
         except Exception as e:
             logger.exception("Error in preparing conversation")
             raise HTTPException(
@@ -1812,14 +2024,23 @@
     response_id = f"resp_{uuid.uuid4().hex}"
     created_time = int(datetime.now(tz=UTC).timestamp())
-
     try:
         assert session and client
         logger.debug(
            f"Client ID: {client.id}, Input length: {len(m_input)}, files count: {len(files)}"
        )
-        resp_or_stream = await _send_with_split(
-            session, m_input, files=files, stream=request.stream
+        resp_or_stream, session, client = await _send_with_internal_fallback(
+            pool=pool,
+            model=model,
+            session=session,
+            client=client,
+            current_input=m_input,
+            files=files,
+            full_prepared_messages=messages,
+            tmp_dir=tmp_dir,
+            stream=bool(request.stream),
+            reused_session=reused_session,
+            temporary=use_google_temporary_mode,
         )
     except Exception as e:
         logger.exception("Gemini API error")
diff --git a/app/utils/config.py b/app/utils/config.py
index 69af2e1..fe806c9 100644
--- a/app/utils/config.py
+++ b/app/utils/config.py
@@ -1,6 +1,7 @@
 import ast
 import os
 import sys
+from enum import Enum
 from typing import Any, Literal
 
 import orjson
@@ -71,6 +72,20 @@ def _parse_json_string(cls, v: Any) -> Any:
         return v
 
 
+class OversizedContextStrategy(str, Enum):
+    """Strategy for handling oversized context."""
+
+    COMPACTION = "compaction"
+    FILE = "file"
+
+
+class ChatMode(str, Enum):
+    """Chat mode options for Gemini conversation handling."""
+
+    NORMAL = "normal"
+    TEMPORARY = "temporary"
+
+
 class GeminiConfig(BaseModel):
     """Gemini API configuration"""
 
@@ -96,6 +111,14 @@ class GeminiConfig(BaseModel):
         ge=1,
         description="Maximum characters Gemini Web can accept per request",
     )
+    oversized_context_strategy: OversizedContextStrategy = Field(
+        default=OversizedContextStrategy.COMPACTION,
+        description="Strategy for oversized context: 'compaction' summarizes older turns, 'file' sends oversized context as an attachment",
+    )
+    chat_mode: ChatMode = Field(
+        default=ChatMode.NORMAL,
+        description="Chat mode: 'normal' uses standard chats, 'temporary' uses Google's temporary mode (not saved to the account) and enforces an effective input limit of 90% of max_chars_per_request",
+    )
 
     @field_validator("models", mode="before")
     @classmethod
diff --git a/config/config.yaml b/config/config.yaml
index bd9fbc0..462167f 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -28,6 +28,8 @@ gemini:
   refresh_interval: 600 # Refresh interval in seconds (Not less than 60s)
   verbose: false # Enable verbose logging for Gemini requests
   max_chars_per_request: 1000000 # Maximum characters Gemini Web accepts per request. Non-pro users might have a lower limit
+  oversized_context_strategy: "compaction" # "compaction" summarizes older turns when oversized; "file" sends oversized context as an attached file
+  chat_mode: "normal" # "normal" reuses Google chat metadata; "temporary" sends with Google's temporary mode (not saved to the account) and uses a 90% effective input limit
   model_strategy: "append" # Strategy: 'append' (default + custom) or 'overwrite' (custom only)
   models: []