From 384097dec3e5be612a0b57a8978a726a788e6386 Mon Sep 17 00:00:00 2001
From: henrrypg <henrry.pulgarin@edunext.co>
Date: Thu, 18 Jun 2026 14:46:57 -0500
Subject: [PATCH 1/3] chore: add integration tests for threading

---
 backend/tests/integration/test_threading.py | 230 ++++++++++++++++++++
 1 file changed, 230 insertions(+)
 create mode 100644 backend/tests/integration/test_threading.py

diff --git a/backend/tests/integration/test_threading.py b/backend/tests/integration/test_threading.py
new file mode 100644
index 00000000..03ab246d
--- /dev/null
+++ b/backend/tests/integration/test_threading.py
@@ -0,0 +1,230 @@
+"""
+Validates that stale / expired remote thread IDs are recovered from
+without crashing, that the recovered conversation starts cleanly, that
+multi-turn context persists across three turns, and that Anthropic
+prompt caching fires (or at least does not crash) at various token sizes.
+
+Threading tests (N, O, P) use real AIWorkflowSession DB rows so that
+session.save() exercises the actual persistence layer rather than a mock.
+"""
+
+import os
+from unittest.mock import MagicMock
+
+import pytest
+
+from .conftest import create_live_session
+
+DUMMY_CONTENT = (
+    "Python is a high-level interpreted programming language. "
+    "It uses indentation for code blocks and supports multiple paradigms."
+)
+
+_OPENAI_CONFIG = {
+    "LLMProcessor": {
+        "provider": "test_openai",
+        "stream": False,
+        "function": "chat_with_context",
+    }
+}
+
+
+@pytest.mark.live_llm
+@pytest.mark.django_db
+@pytest.mark.skipif(not os.environ.get("OPENAI_API_KEY"), reason="OPENAI_API_KEY not set")
+def test_stale_thread_id_triggers_recovery(live_user, course_key):
+    """
+    When session.remote_response_id points to a non-existent / expired
+    OpenAI thread, the processor must catch previous_response_not_found,
+    clear the stale ID, start a fresh thread, and return a valid response.
+    """
+    from openedx_ai_extensions.processors.llm.llm_processor import LLMProcessor  # pylint: disable=C0415
+
+    session = create_live_session(
+        live_user, course_key,
+        remote_response_id="resp_fake_expired_thread_id_xyz_000000",
+    )
+
+    processor = LLMProcessor(config=_OPENAI_CONFIG, user_session=session)
+    result = processor.process(
+        context=DUMMY_CONTENT,
+        input_data="Hello, please introduce yourself briefly.",
+    )
+
+    assert result.get("status") == "success", f"Expected success after recovery, got: {result}"
+    assert result.get("response"), "Expected non-empty response after thread recovery"
+
+    session.refresh_from_db()
+    assert session.remote_response_id != "resp_fake_expired_thread_id_xyz_000000", (
+        "remote_response_id was not updated after stale-thread recovery"
+    )
+
+
+@pytest.mark.live_llm
+@pytest.mark.django_db
+@pytest.mark.skipif(not os.environ.get("OPENAI_API_KEY"), reason="OPENAI_API_KEY not set")
+def test_conversation_clean_after_stale_thread_recovery(live_user, course_key):
+    """
+    After stale-thread recovery, a second call on the same session must
+    succeed and produce a coherent response grounded in the provided content.
+    """
+    from openedx_ai_extensions.processors.llm.llm_processor import LLMProcessor  # pylint: disable=C0415
+
+    session = create_live_session(
+        live_user, course_key,
+        remote_response_id="resp_fake_expired_thread_id_xyz_000000",
+    )
+
+    # Turn 1 — recovery happens here
+    proc1 = LLMProcessor(config=_OPENAI_CONFIG, user_session=session)
+    result1 = proc1.process(
+        context=DUMMY_CONTENT,
+        input_data="Hello, please introduce yourself briefly.",
+    )
+    assert result1.get("response"), "Turn 1 must produce a response for test O to be meaningful"
+
+    # Turn 2 — same session, recovered thread
+    session.refresh_from_db()
+    proc2 = LLMProcessor(config=_OPENAI_CONFIG, user_session=session)
+    result2 = proc2.process(
+        context=DUMMY_CONTENT,
+        input_data="What programming language are we discussing?",
+    )
+
+    assert result2.get("status") == "success", f"Turn 2 failed: {result2}"
+    response_text = (result2.get("response") or "").lower()
+    assert len(response_text) > 5, "Turn 2 produced an empty response"
+    assert "python" in response_text, (
+        f"Expected 'python' in turn-2 response (grounded in content), got: {result2.get('response')}"
+    )
+
+
+@pytest.mark.live_llm
+@pytest.mark.django_db
+@pytest.mark.skipif(not os.environ.get("OPENAI_API_KEY"), reason="OPENAI_API_KEY not set")
+def test_three_turn_context_chain(live_user, course_key):
+    """
+    A fact planted in turn 1 must still be recalled in turn 3, even after
+    a neutral turn 2 that does not reference it.  Verifies that the server-
+    side thread correctly chains three consecutive turns.
+    """
+    from openedx_ai_extensions.processors.llm.llm_processor import LLMProcessor  # pylint: disable=C0415
+
+    session = create_live_session(live_user, course_key)
+
+    # Turn 0 — initialise the remote thread (system messages only; no user input
+    # reaches OpenAI on the first call with the current logic)
+    LLMProcessor(config=_OPENAI_CONFIG, user_session=session).process(
+        context=DUMMY_CONTENT, input_data="Start."
+    )
+    session.refresh_from_db()
+
+    # Turn 1 — plant memorable fact (sent via previous_response_id)
+    proc1 = LLMProcessor(config=_OPENAI_CONFIG, user_session=session)
+    r1 = proc1.process(context=DUMMY_CONTENT, input_data="My favourite colour is TURQUOISE. Just say 'Got it'.")
+    assert r1.get("response"), "Turn 1 must return a response"
+
+    # Turn 2 — neutral noise turn
+    session.refresh_from_db()
+    proc2 = LLMProcessor(config=_OPENAI_CONFIG, user_session=session)
+    r2 = proc2.process(context=DUMMY_CONTENT, input_data="Tell me one thing about Python in one sentence.")
+    assert r2.get("response"), "Turn 2 must return a response"
+
+    # Turn 3 — recall the fact from turn 1
+    session.refresh_from_db()
+    proc3 = LLMProcessor(config=_OPENAI_CONFIG, user_session=session)
+    r3 = proc3.process(context=DUMMY_CONTENT, input_data="What is my favourite colour?")
+
+    assert r3.get("status") == "success", f"Turn 3 failed: {r3}"
+    response_text = (r3.get("response") or "").lower()
+    assert "turquoise" in response_text, (
+        f"Expected 'turquoise' in turn-3 response, got: {r3.get('response')}"
+    )
+
+
+_LONG_SYSTEM_CONTEXT = (
+    "The history of computing spans several decades. "
+    "From vacuum tubes to transistors to integrated circuits, each era "
+    "brought dramatic improvements in speed, size, and cost. "
+    "ENIAC (1945) was the first general-purpose electronic computer, "
+    "weighing 30 tons and occupying an entire room. "
+    "The invention of the transistor in 1947 at Bell Labs was a watershed moment, "
+    "enabling miniaturisation that made personal computers possible. "
+    "Intel released the first commercial microprocessor, the 4004, in 1971. "
+    "The IBM PC in 1981 standardised the personal computer market. "
+    "Tim Berners-Lee invented the World Wide Web in 1989, transforming computing. "
+    "The rise of smartphones in the 2000s put computing in every pocket. "
+    "Cloud computing emerged in the 2010s, shifting workloads to remote data centres. "
+    "Today artificial intelligence, driven by GPUs and large language models, "
+    "represents the next major inflection point in the history of computing technology. "
+) * 24  # Repeat to exceed Anthropic's cache minimum (4096 tokens for claude-haiku-4-5)
+
+
+@pytest.mark.live_llm
+@pytest.mark.skipif(not os.environ.get("ANTHROPIC_API_KEY"), reason="ANTHROPIC_API_KEY not set")
+def test_anthropic_cache_hit_on_second_call():
+    """
+    When the same large system context is sent twice to Anthropic, the
+    second call's usage should report cache_read_input_tokens > 0,
+    confirming the cache_control prefix written by the first call was
+    reused. claude-haiku-4-5's cache minimum is 4096 tokens;
+    _LONG_SYSTEM_CONTEXT comfortably exceeds it.
+    """
+    from openedx_ai_extensions.processors.llm.llm_processor import LLMProcessor  # pylint: disable=C0415
+
+    config = {
+        "LLMProcessor": {
+            "provider": "test_anthropic",
+            "stream": False,
+            "function": "summarize_content",
+        }
+    }
+
+    # First call — warms the cache
+    proc1 = LLMProcessor(config=config, user_session=MagicMock(remote_response_id=None))
+    r1 = proc1.process(context=_LONG_SYSTEM_CONTEXT, input_data="Summarize this in one sentence.")
+    assert r1.get("status") == "success", f"First call failed: {r1}"
+
+    # Second call — should hit the cache
+    proc2 = LLMProcessor(config=config, user_session=MagicMock(remote_response_id=None))
+    r2 = proc2.process(context=_LONG_SYSTEM_CONTEXT, input_data="Summarize this in one sentence.")
+    assert r2.get("status") == "success", f"Second call failed: {r2}"
+
+    usage = proc2.get_usage()
+    assert usage is not None, "Expected usage to be populated on second call"
+    cache_tokens = getattr(usage, "cache_read_input_tokens", 0) or 0
+    assert cache_tokens > 0, (
+        f"Expected cache_read_input_tokens > 0 on second call. usage={usage}"
+    )
+
+
+_SHORT_CONTENT = "Python uses indentation to define code blocks."
+
+
+@pytest.mark.live_llm
+@pytest.mark.skipif(not os.environ.get("ANTHROPIC_API_KEY"), reason="ANTHROPIC_API_KEY not set")
+def test_anthropic_cache_short_prompt_no_crash():
+    """
+    Anthropic silently ignores cache_control for prompts below the model's
+    minimum (4096 tokens for claude-haiku-4-5). Enabling cache on a short
+    prompt must not crash — a valid response is returned with no error,
+    even if no cache tokens are reported.
+    """
+    from openedx_ai_extensions.processors.llm.llm_processor import LLMProcessor  # pylint: disable=C0415
+
+    config = {
+        "LLMProcessor": {
+            "provider": "test_anthropic",
+            "stream": False,
+            "function": "summarize_content",
+            "cache": True,
+        }
+    }
+
+    processor = LLMProcessor(config=config, user_session=MagicMock(remote_response_id=None))
+    result = processor.process(context=_SHORT_CONTENT, input_data="Summarize this.")
+
+    assert result.get("status") == "success", (
+        f"Short-prompt cache call failed: {result}"
+    )
+    assert result.get("response"), "Expected non-empty response"

From 933b7c6d3f62081538dc2718d325b4f06fd49cd1 Mon Sep 17 00:00:00 2001
From: henrrypg <henrry.pulgarin@edunext.co>
Date: Tue, 23 Jun 2026 11:21:08 -0500
Subject: [PATCH 2/3] fix: address review comments (send first-turn user input, provider-aware tests)

---
 .../processors/llm/llm_processor.py           |   4 +-
 backend/tests/integration/conftest.py         |  26 +++
 backend/tests/integration/test_threading.py   | 150 ++++++++++++------
 3 files changed, 128 insertions(+), 52 deletions(-)

diff --git a/backend/openedx_ai_extensions/processors/llm/llm_processor.py b/backend/openedx_ai_extensions/processors/llm/llm_processor.py
index 628a901e..47362480 100644
--- a/backend/openedx_ai_extensions/processors/llm/llm_processor.py
+++ b/backend/openedx_ai_extensions/processors/llm/llm_processor.py
@@ -124,8 +124,8 @@ def _build_response_api_params(self, system_role=None):
         params = {}
         params["stream"] = self.stream
 
+        user_text = normalize_input_to_text(self.input_data)
         if self.chat_history:
-            user_text = normalize_input_to_text(self.input_data)
             if user_text:
                 self.chat_history.append({"role": "user", "content": user_text})
             params["input"] = self.chat_history
@@ -135,6 +135,8 @@ def _build_response_api_params(self, system_role=None):
                 {"role": "system", "content": self.custom_prompt or system_role},
                 {"role": "system", "content": self.context},
             ]
+            if user_text:
+                params["input"].append({"role": "user", "content": user_text})
 
         # Add optional parameters only if configured
         params.update(self.extra_params)
diff --git a/backend/tests/integration/conftest.py b/backend/tests/integration/conftest.py
index 7c75e8a4..a5b80c11 100644
--- a/backend/tests/integration/conftest.py
+++ b/backend/tests/integration/conftest.py
@@ -26,6 +26,8 @@
 _settings.SERVICE_VARIANT = "lms"
 
 
+from openedx_ai_extensions.processors.llm.providers import \
+    provider_supports  # noqa: E402  pylint: disable=wrong-import-position
 from openedx_ai_extensions.workflows.models import (  # noqa: E402  pylint: disable=wrong-import-position
     AIWorkflowProfile,
     AIWorkflowScope,
@@ -63,6 +65,30 @@ def skip_if_no_key(env_var: str) -> None:
         pytest.skip(f"{env_var} not set — skipping live LLM test")
 
 
+PROVIDER_ENV_VARS = {
+    "openai": "OPENAI_API_KEY",
+    "anthropic": "ANTHROPIC_API_KEY",
+}
+
+
+def skip_unless_capability(capability: str):
+    """
+    Skip the test unless a provider supporting *capability* (per
+    _PROVIDER_CAPABILITIES) has its API key set, so capability/test
+    coverage stays driven by providers/__init__.py rather than duplicated
+    provider names in test code.
+    """
+    env_vars = [
+        env_var for provider, env_var in PROVIDER_ENV_VARS.items()
+        if provider_supports(provider, capability)
+    ]
+    has_key = any(os.environ.get(env_var) for env_var in env_vars)
+    return pytest.mark.skipif(
+        not has_key,
+        reason=f"No API key set for a provider supporting {capability!r} ({', '.join(env_vars)})",
+    )
+
+
 def create_profile_and_scope(  # pylint: disable=redefined-outer-name
     provider_slug: str,
     course_key,
diff --git a/backend/tests/integration/test_threading.py b/backend/tests/integration/test_threading.py
index 03ab246d..2bb9ce5e 100644
--- a/backend/tests/integration/test_threading.py
+++ b/backend/tests/integration/test_threading.py
@@ -8,18 +8,22 @@
 session.save() exercises the actual persistence layer rather than a mock.
 """
 
-import os
-from unittest.mock import MagicMock
-
 import pytest
 
-from .conftest import create_live_session
+from openedx_ai_extensions.processors.llm.llm_processor import LLMProcessor
+
+from .conftest import PROVIDERS, create_live_session, skip_if_no_key, skip_unless_capability
 
 DUMMY_CONTENT = (
     "Python is a high-level interpreted programming language. "
     "It uses indentation for code blocks and supports multiple paradigms."
 )
 
+ALREADY_EXPIRED_THREAD_ID = (
+    "resp_bGl0ZWxsbTpjdXN0b21fbGxtX3Byb3ZpZGVyOm9wZW5haTttb2RlbF9pZDpOb25lO3Jlc3BvbnNlX2lkOnJlc3BfMDI5MTVhYjk4Mjc4"
+    "ODVhMTAwNmEwZTNhMWQ1NjY0ODE5NWJmOTUyYWIxYTExYjE3ZmQ="
+)
+
 _OPENAI_CONFIG = {
     "LLMProcessor": {
         "provider": "test_openai",
@@ -31,18 +35,16 @@
 
 @pytest.mark.live_llm
 @pytest.mark.django_db
-@pytest.mark.skipif(not os.environ.get("OPENAI_API_KEY"), reason="OPENAI_API_KEY not set")
+@skip_unless_capability("server_side_thread_id")
 def test_stale_thread_id_triggers_recovery(live_user, course_key):
     """
     When session.remote_response_id points to a non-existent / expired
     OpenAI thread, the processor must catch previous_response_not_found,
     clear the stale ID, start a fresh thread, and return a valid response.
     """
-    from openedx_ai_extensions.processors.llm.llm_processor import LLMProcessor  # pylint: disable=C0415
-
     session = create_live_session(
         live_user, course_key,
-        remote_response_id="resp_fake_expired_thread_id_xyz_000000",
+        remote_response_id=ALREADY_EXPIRED_THREAD_ID,
     )
 
     processor = LLMProcessor(config=_OPENAI_CONFIG, user_session=session)
@@ -55,31 +57,34 @@ def test_stale_thread_id_triggers_recovery(live_user, course_key):
     assert result.get("response"), "Expected non-empty response after thread recovery"
 
     session.refresh_from_db()
-    assert session.remote_response_id != "resp_fake_expired_thread_id_xyz_000000", (
+    assert session.remote_response_id != ALREADY_EXPIRED_THREAD_ID, (
         "remote_response_id was not updated after stale-thread recovery"
     )
 
 
 @pytest.mark.live_llm
 @pytest.mark.django_db
-@pytest.mark.skipif(not os.environ.get("OPENAI_API_KEY"), reason="OPENAI_API_KEY not set")
+@skip_unless_capability("server_side_thread_id")
 def test_conversation_clean_after_stale_thread_recovery(live_user, course_key):
     """
     After stale-thread recovery, a second call on the same session must
-    succeed and produce a coherent response grounded in the provided content.
+    succeed and recall a fact planted in turn 1 — proving the recovered
+    thread actually carries turn-1 context forward, not just that turn 2
+    independently produces a plausible answer. The planted number is not in
+    DUMMY_CONTENT or inferable from general knowledge, so the model can only
+    recall it if turn 2 has real access to turn 1. Framed as a "lucky number"
+    rather than an ID/identifier to avoid PII-refusal false negatives.
     """
-    from openedx_ai_extensions.processors.llm.llm_processor import LLMProcessor  # pylint: disable=C0415
-
     session = create_live_session(
         live_user, course_key,
-        remote_response_id="resp_fake_expired_thread_id_xyz_000000",
+        remote_response_id=ALREADY_EXPIRED_THREAD_ID,
     )
 
     # Turn 1 — recovery happens here
     proc1 = LLMProcessor(config=_OPENAI_CONFIG, user_session=session)
     result1 = proc1.process(
         context=DUMMY_CONTENT,
-        input_data="Hello, please introduce yourself briefly.",
+        input_data="My lucky number is 9142. Just say 'Got it'.",
     )
     assert result1.get("response"), "Turn 1 must produce a response for test O to be meaningful"
 
@@ -88,52 +93,83 @@ def test_conversation_clean_after_stale_thread_recovery(live_user, course_key):
     proc2 = LLMProcessor(config=_OPENAI_CONFIG, user_session=session)
     result2 = proc2.process(
         context=DUMMY_CONTENT,
-        input_data="What programming language are we discussing?",
+        input_data="What is my lucky number?",
     )
 
     assert result2.get("status") == "success", f"Turn 2 failed: {result2}"
-    response_text = (result2.get("response") or "").lower()
-    assert len(response_text) > 5, "Turn 2 produced an empty response"
-    assert "python" in response_text, (
-        f"Expected 'python' in turn-2 response (grounded in content), got: {result2.get('response')}"
+    response_text = result2.get("response") or ""
+    assert "9142" in response_text, (
+        f"Expected '9142' in turn-2 response (recalled from turn 1), got: {response_text}"
     )
 
 
 @pytest.mark.live_llm
 @pytest.mark.django_db
-@pytest.mark.skipif(not os.environ.get("OPENAI_API_KEY"), reason="OPENAI_API_KEY not set")
-def test_three_turn_context_chain(live_user, course_key):
+@pytest.mark.parametrize("provider_slug,env_var", PROVIDERS)
+def test_three_turn_context_chain(provider_slug, env_var, live_user, course_key):
     """
     A fact planted in turn 1 must still be recalled in turn 3, even after
-    a neutral turn 2 that does not reference it.  Verifies that the server-
-    side thread correctly chains three consecutive turns.
+    a neutral turn 2 that does not reference it. Multi-turn context retention
+    is a general guarantee of the processor — providers with server-side
+    threading (e.g. OpenAI) chain via previous_response_id, while others
+    (e.g. Anthropic) need the prior turns resent as chat_history — so this
+    runs against every configured provider rather than just OpenAI.
+
+    LLMProcessor itself never auto-reconstructs chat_history (that's the
+    caller's job — see ThreadedLLMResponse.run); since this test calls
+    LLMProcessor directly, it threads chat_history between calls itself so
+    non-OpenAI providers actually receive turn 1/2 on later calls instead of
+    relying solely on previous_response_id (OpenAI-only).
     """
-    from openedx_ai_extensions.processors.llm.llm_processor import LLMProcessor  # pylint: disable=C0415
+    skip_if_no_key(env_var)
+    config = {
+        "LLMProcessor": {
+            "provider": provider_slug,
+            "stream": False,
+            "function": "chat_with_context",
+        }
+    }
 
     session = create_live_session(live_user, course_key)
+    chat_history = []
 
-    # Turn 0 — initialise the remote thread (system messages only; no user input
-    # reaches OpenAI on the first call with the current logic)
-    LLMProcessor(config=_OPENAI_CONFIG, user_session=session).process(
-        context=DUMMY_CONTENT, input_data="Start."
+    # Turn 0 — initialise the thread
+    r0 = LLMProcessor(config=config, user_session=session).process(
+        context=DUMMY_CONTENT, input_data="Start.", chat_history=chat_history
     )
+    chat_history.append({"role": "user", "content": "Start."})
+    chat_history.append({"role": "assistant", "content": r0.get("response") or ""})
     session.refresh_from_db()
 
-    # Turn 1 — plant memorable fact (sent via previous_response_id)
-    proc1 = LLMProcessor(config=_OPENAI_CONFIG, user_session=session)
-    r1 = proc1.process(context=DUMMY_CONTENT, input_data="My favourite colour is TURQUOISE. Just say 'Got it'.")
+    # Turn 1 — plant memorable fact
+    proc1 = LLMProcessor(config=config, user_session=session)
+    r1 = proc1.process(
+        context=DUMMY_CONTENT,
+        input_data="My favourite colour is TURQUOISE. Just say 'Got it'.",
+        chat_history=chat_history,
+    )
     assert r1.get("response"), "Turn 1 must return a response"
+    chat_history.append({"role": "assistant", "content": r1.get("response") or ""})
 
     # Turn 2 — neutral noise turn
     session.refresh_from_db()
-    proc2 = LLMProcessor(config=_OPENAI_CONFIG, user_session=session)
-    r2 = proc2.process(context=DUMMY_CONTENT, input_data="Tell me one thing about Python in one sentence.")
+    proc2 = LLMProcessor(config=config, user_session=session)
+    r2 = proc2.process(
+        context=DUMMY_CONTENT,
+        input_data="Tell me one thing about Python in one sentence.",
+        chat_history=chat_history,
+    )
     assert r2.get("response"), "Turn 2 must return a response"
+    chat_history.append({"role": "assistant", "content": r2.get("response") or ""})
 
     # Turn 3 — recall the fact from turn 1
     session.refresh_from_db()
-    proc3 = LLMProcessor(config=_OPENAI_CONFIG, user_session=session)
-    r3 = proc3.process(context=DUMMY_CONTENT, input_data="What is my favourite colour?")
+    proc3 = LLMProcessor(config=config, user_session=session)
+    r3 = proc3.process(
+        context=DUMMY_CONTENT,
+        input_data="What is my favourite colour?",
+        chat_history=chat_history,
+    )
 
     assert r3.get("status") == "success", f"Turn 3 failed: {r3}"
     response_text = (r3.get("response") or "").lower()
@@ -161,17 +197,20 @@ def test_three_turn_context_chain(live_user, course_key):
 
 
 @pytest.mark.live_llm
-@pytest.mark.skipif(not os.environ.get("ANTHROPIC_API_KEY"), reason="ANTHROPIC_API_KEY not set")
-def test_anthropic_cache_hit_on_second_call():
+@pytest.mark.django_db
+@skip_unless_capability("multi_turn_cache")
+def test_anthropic_cache_hit_on_second_call(live_user, course_key):
     """
     When the same large system context is sent twice to Anthropic, the
     second call's usage should report cache_read_input_tokens > 0,
     confirming the cache_control prefix written by the first call was
     reused. claude-haiku-4-5's cache minimum is 4096 tokens;
     _LONG_SYSTEM_CONTEXT comfortably exceeds it.
-    """
-    from openedx_ai_extensions.processors.llm.llm_processor import LLMProcessor  # pylint: disable=C0415
 
+    Uses a real AIWorkflowSession (like the other threading tests) rather
+    than a MagicMock, so any session.save() call this code path makes is
+    actually exercised instead of silently swallowed.
+    """
     config = {
         "LLMProcessor": {
             "provider": "test_anthropic",
@@ -180,13 +219,16 @@ def test_anthropic_cache_hit_on_second_call():
         }
     }
 
+    session = create_live_session(live_user, course_key)
+
     # First call — warms the cache
-    proc1 = LLMProcessor(config=config, user_session=MagicMock(remote_response_id=None))
+    proc1 = LLMProcessor(config=config, user_session=session)
     r1 = proc1.process(context=_LONG_SYSTEM_CONTEXT, input_data="Summarize this in one sentence.")
     assert r1.get("status") == "success", f"First call failed: {r1}"
 
     # Second call — should hit the cache
-    proc2 = LLMProcessor(config=config, user_session=MagicMock(remote_response_id=None))
+    session.refresh_from_db()
+    proc2 = LLMProcessor(config=config, user_session=session)
     r2 = proc2.process(context=_LONG_SYSTEM_CONTEXT, input_data="Summarize this in one sentence.")
     assert r2.get("status") == "success", f"Second call failed: {r2}"
 
@@ -202,26 +244,32 @@ def test_anthropic_cache_hit_on_second_call():
 
 
 @pytest.mark.live_llm
-@pytest.mark.skipif(not os.environ.get("ANTHROPIC_API_KEY"), reason="ANTHROPIC_API_KEY not set")
-def test_anthropic_cache_short_prompt_no_crash():
+@pytest.mark.django_db
+@pytest.mark.parametrize("provider_slug,env_var", PROVIDERS)
+def test_cache_short_prompt_no_crash(provider_slug, env_var, live_user, course_key):
     """
+    Setting cache=True must never crash, regardless of whether the provider
+    actually supports a caching feature. Providers without "multi_turn_cache"
+    in _PROVIDER_CAPABILITIES (e.g. OpenAI) should just ignore the flag;
     Anthropic silently ignores cache_control for prompts below the model's
-    minimum (4096 tokens for claude-haiku-4-5). Enabling cache on a short
-    prompt must not crash — a valid response is returned with no error,
-    even if no cache tokens are reported.
-    """
-    from openedx_ai_extensions.processors.llm.llm_processor import LLMProcessor  # pylint: disable=C0415
+    minimum (4096 tokens for claude-haiku-4-5) too. Either way, a valid
+    response is returned with no error, even if no cache tokens are reported.
 
+    Uses a real AIWorkflowSession so this exercises the same persistence
+    path as every other test in this file, regardless of provider.
+    """
+    skip_if_no_key(env_var)
     config = {
         "LLMProcessor": {
-            "provider": "test_anthropic",
+            "provider": provider_slug,
             "stream": False,
             "function": "summarize_content",
             "cache": True,
         }
     }
 
-    processor = LLMProcessor(config=config, user_session=MagicMock(remote_response_id=None))
+    session = create_live_session(live_user, course_key)
+    processor = LLMProcessor(config=config, user_session=session)
     result = processor.process(context=_SHORT_CONTENT, input_data="Summarize this.")
 
     assert result.get("status") == "success", (

From 8a4b4fcc9b79c2bf0f9feda5d837bebd320a69b2 Mon Sep 17 00:00:00 2001
From: henrrypg <henrry.pulgarin@edunext.co>
Date: Tue, 23 Jun 2026 16:43:20 -0500
Subject: [PATCH 3/3] fix: address review comments (reuse PROVIDERS, move test content to sample_content)

---
 backend/tests/integration/conftest.py       | 10 +----
 backend/tests/integration/sample_content.py | 19 ++++++++++
 backend/tests/integration/test_threading.py | 42 +++++----------------
 3 files changed, 30 insertions(+), 41 deletions(-)

diff --git a/backend/tests/integration/conftest.py b/backend/tests/integration/conftest.py
index a5b80c11..18138390 100644
--- a/backend/tests/integration/conftest.py
+++ b/backend/tests/integration/conftest.py
@@ -65,12 +65,6 @@ def skip_if_no_key(env_var: str) -> None:
         pytest.skip(f"{env_var} not set — skipping live LLM test")
 
 
-PROVIDER_ENV_VARS = {
-    "openai": "OPENAI_API_KEY",
-    "anthropic": "ANTHROPIC_API_KEY",
-}
-
-
 def skip_unless_capability(capability: str):
     """
     Skip the test unless a provider supporting *capability* (per
@@ -79,8 +73,8 @@ def skip_unless_capability(capability: str):
     provider names in test code.
     """
     env_vars = [
-        env_var for provider, env_var in PROVIDER_ENV_VARS.items()
-        if provider_supports(provider, capability)
+        env_var for provider_slug, env_var in (p.values for p in PROVIDERS)
+        if provider_supports(provider_slug.removeprefix("test_"), capability)
     ]
     has_key = any(os.environ.get(env_var) for env_var in env_vars)
     return pytest.mark.skipif(
diff --git a/backend/tests/integration/sample_content.py b/backend/tests/integration/sample_content.py
index b56d11f4..104e65d2 100644
--- a/backend/tests/integration/sample_content.py
+++ b/backend/tests/integration/sample_content.py
@@ -34,3 +34,22 @@
     "It emphasises code readability using significant indentation. "
     "Python supports multiple programming paradigms and has a large standard library."
 )
+
+SHORT_CONTENT = "Python uses indentation to define code blocks."
+
+LONG_SYSTEM_CONTEXT = (
+    "The history of computing spans several decades. "
+    "From vacuum tubes to transistors to integrated circuits, each era "
+    "brought dramatic improvements in speed, size, and cost. "
+    "ENIAC (1945) was the first general-purpose electronic computer, "
+    "weighing 30 tons and occupying an entire room. "
+    "The invention of the transistor in 1947 at Bell Labs was a watershed moment, "
+    "enabling miniaturisation that made personal computers possible. "
+    "Intel released the first commercial microprocessor, the 4004, in 1971. "
+    "The IBM PC in 1981 standardised the personal computer market. "
+    "Tim Berners-Lee invented the World Wide Web in 1989, transforming computing. "
+    "The rise of smartphones in the 2000s put computing in every pocket. "
+    "Cloud computing emerged in the 2010s, shifting workloads to remote data centres. "
+    "Today artificial intelligence, driven by GPUs and large language models, "
+    "represents the next major inflection point in the history of computing technology. "
+) * 24  # Repeat to exceed Anthropic's cache minimum (4096 tokens for claude-haiku-4-5)
diff --git a/backend/tests/integration/test_threading.py b/backend/tests/integration/test_threading.py
index 2bb9ce5e..e6e673b8 100644
--- a/backend/tests/integration/test_threading.py
+++ b/backend/tests/integration/test_threading.py
@@ -4,8 +4,9 @@
 multi-turn context persists across three turns, and that Anthropic
 prompt caching fires (or at least does not crash) at various token sizes.
 
-Threading tests (N, O, P) use real AIWorkflowSession DB rows so that
-session.save() exercises the actual persistence layer rather than a mock.
+Every test in this file uses a real AIWorkflowSession DB row (via
+create_live_session) rather than a mock, so session.save() exercises the
+actual persistence layer.
 """
 
 import pytest
@@ -13,11 +14,7 @@
 from openedx_ai_extensions.processors.llm.llm_processor import LLMProcessor
 
 from .conftest import PROVIDERS, create_live_session, skip_if_no_key, skip_unless_capability
-
-DUMMY_CONTENT = (
-    "Python is a high-level interpreted programming language. "
-    "It uses indentation for code blocks and supports multiple paradigms."
-)
+from .sample_content import DUMMY_CONTENT, LONG_SYSTEM_CONTEXT, SHORT_CONTENT
 
 ALREADY_EXPIRED_THREAD_ID = (
     "resp_bGl0ZWxsbTpjdXN0b21fbGxtX3Byb3ZpZGVyOm9wZW5haTttb2RlbF9pZDpOb25lO3Jlc3BvbnNlX2lkOnJlc3BfMDI5MTVhYjk4Mjc4"
@@ -86,7 +83,7 @@ def test_conversation_clean_after_stale_thread_recovery(live_user, course_key):
         context=DUMMY_CONTENT,
         input_data="My lucky number is 9142. Just say 'Got it'.",
     )
-    assert result1.get("response"), "Turn 1 must produce a response for test O to be meaningful"
+    assert result1.get("response"), "Turn 1 must produce a response for this test to be meaningful"
 
     # Turn 2 — same session, recovered thread
     session.refresh_from_db()
@@ -178,24 +175,6 @@ def test_three_turn_context_chain(provider_slug, env_var, live_user, course_key)
     )
 
 
-_LONG_SYSTEM_CONTEXT = (
-    "The history of computing spans several decades. "
-    "From vacuum tubes to transistors to integrated circuits, each era "
-    "brought dramatic improvements in speed, size, and cost. "
-    "ENIAC (1945) was the first general-purpose electronic computer, "
-    "weighing 30 tons and occupying an entire room. "
-    "The invention of the transistor in 1947 at Bell Labs was a watershed moment, "
-    "enabling miniaturisation that made personal computers possible. "
-    "Intel released the first commercial microprocessor, the 4004, in 1971. "
-    "The IBM PC in 1981 standardised the personal computer market. "
-    "Tim Berners-Lee invented the World Wide Web in 1989, transforming computing. "
-    "The rise of smartphones in the 2000s put computing in every pocket. "
-    "Cloud computing emerged in the 2010s, shifting workloads to remote data centres. "
-    "Today artificial intelligence, driven by GPUs and large language models, "
-    "represents the next major inflection point in the history of computing technology. "
-) * 24  # Repeat to exceed Anthropic's cache minimum (4096 tokens for claude-haiku-4-5)
-
-
 @pytest.mark.live_llm
 @pytest.mark.django_db
 @skip_unless_capability("multi_turn_cache")
@@ -205,7 +184,7 @@ def test_anthropic_cache_hit_on_second_call(live_user, course_key):
     second call's usage should report cache_read_input_tokens > 0,
     confirming the cache_control prefix written by the first call was
     reused. claude-haiku-4-5's cache minimum is 4096 tokens;
-    _LONG_SYSTEM_CONTEXT comfortably exceeds it.
+    LONG_SYSTEM_CONTEXT comfortably exceeds it.
 
     Uses a real AIWorkflowSession (like the other threading tests) rather
     than a MagicMock, so any session.save() call this code path makes is
@@ -223,13 +202,13 @@ def test_anthropic_cache_hit_on_second_call(live_user, course_key):
 
     # First call — warms the cache
     proc1 = LLMProcessor(config=config, user_session=session)
-    r1 = proc1.process(context=_LONG_SYSTEM_CONTEXT, input_data="Summarize this in one sentence.")
+    r1 = proc1.process(context=LONG_SYSTEM_CONTEXT, input_data="Summarize this in one sentence.")
     assert r1.get("status") == "success", f"First call failed: {r1}"
 
     # Second call — should hit the cache
     session.refresh_from_db()
     proc2 = LLMProcessor(config=config, user_session=session)
-    r2 = proc2.process(context=_LONG_SYSTEM_CONTEXT, input_data="Summarize this in one sentence.")
+    r2 = proc2.process(context=LONG_SYSTEM_CONTEXT, input_data="Summarize this in one sentence.")
     assert r2.get("status") == "success", f"Second call failed: {r2}"
 
     usage = proc2.get_usage()
@@ -240,9 +219,6 @@ def test_anthropic_cache_hit_on_second_call(live_user, course_key):
     )
 
 
-_SHORT_CONTENT = "Python uses indentation to define code blocks."
-
-
 @pytest.mark.live_llm
 @pytest.mark.django_db
 @pytest.mark.parametrize("provider_slug,env_var", PROVIDERS)
@@ -270,7 +246,7 @@ def test_cache_short_prompt_no_crash(provider_slug, env_var, live_user, course_k
 
     session = create_live_session(live_user, course_key)
     processor = LLMProcessor(config=config, user_session=session)
-    result = processor.process(context=_SHORT_CONTENT, input_data="Summarize this.")
+    result = processor.process(context=SHORT_CONTENT, input_data="Summarize this.")
 
     assert result.get("status") == "success", (
         f"Short-prompt cache call failed: {result}"