From 3cd559fd0d548b2aab52ea1626b1dfe44e65e026 Mon Sep 17 00:00:00 2001
From: Developer <developer@example.com>
Date: Sun, 24 May 2026 17:18:11 +0800
Subject: [PATCH 1/5] Fix signed-out Ollama prompt and chat

---
 src/openhuman/inference/local/ops.rs          |  4 +-
 .../inference/local/service/public_infer.rs   | 47 +++++++++-
 .../local/service/public_infer_tests.rs       | 87 +++++++++++++++++++
 src/openhuman/inference/model_ids.rs          | 13 ++-
 4 files changed, 144 insertions(+), 7 deletions(-)
diff --git a/src/openhuman/inference/local/ops.rs b/src/openhuman/inference/local/ops.rs
index f3b64ecb57..10ad801f2f 100644
--- a/src/openhuman/inference/local/ops.rs
+++ b/src/openhuman/inference/local/ops.rs
@@ -197,7 +197,7 @@ pub async fn local_ai_prompt(
         service.bootstrap(config).await;
     }
     let output = service
-        .prompt(config, prompt.trim(), max_tokens, no_think.unwrap_or(true))
+        .prompt_interactive(config, prompt.trim(), max_tokens, no_think.unwrap_or(true))
         .await
         .map_err(|e| e.to_string())?;
     Ok(RpcOutcome::single_log(output, "local ai prompt completed"))
@@ -412,7 +412,7 @@ pub async fn local_ai_chat(
 
     let service = local_ai::global(config);
     let reply = service
-        .chat_with_history(config, ollama_messages, max_tokens)
+        .chat_with_history_interactive(config, ollama_messages, max_tokens)
         .await?;
 
     tracing::debug!(
diff --git a/src/openhuman/inference/local/service/public_infer.rs b/src/openhuman/inference/local/service/public_infer.rs
index 3aea0c46b1..796aa7dc1c 100644
--- a/src/openhuman/inference/local/service/public_infer.rs
+++ b/src/openhuman/inference/local/service/public_infer.rs
@@ -63,6 +63,25 @@ impl LocalAiService {
             .await
     }
 
+    pub async fn prompt_interactive(
+        &self,
+        config: &Config,
+        prompt: &str,
+        max_tokens: Option<u32>,
+        no_think: bool,
+    ) -> Result<String, String> {
+        if !config.local_ai.runtime_enabled {
+            return Err("local ai is disabled".to_string());
+        }
+        let system = if no_think {
+            "You are a concise assistant. Return only the final answer. Do not include reasoning or chain-of-thought."
+        } else {
+            "You are a helpful assistant."
+        };
+        self.inference_interactive(config, system, prompt, max_tokens.or(Some(160)), no_think)
+            .await
+    }
+
     pub async fn inline_complete(
         &self,
         config: &Config,
@@ -193,6 +212,27 @@ impl LocalAiService {
         config: &Config,
         messages: Vec<crate::openhuman::inference::local::ollama::OllamaChatMessage>,
         max_tokens: Option<u32>,
+    ) -> Result<String, String> {
+        self.chat_with_history_internal(config, messages, max_tokens, true)
+            .await
+    }
+
+    pub(crate) async fn chat_with_history_interactive(
+        &self,
+        config: &Config,
+        messages: Vec<crate::openhuman::inference::local::ollama::OllamaChatMessage>,
+        max_tokens: Option<u32>,
+    ) -> Result<String, String> {
+        self.chat_with_history_internal(config, messages, max_tokens, false)
+            .await
+    }
+
+    async fn chat_with_history_internal(
+        &self,
+        config: &Config,
+        messages: Vec<crate::openhuman::inference::local::ollama::OllamaChatMessage>,
+        max_tokens: Option<u32>,
+        gated: bool,
     ) -> Result<String, String> {
         if !config.local_ai.runtime_enabled {
             return Err("local ai is disabled".to_string());
@@ -206,8 +246,11 @@ impl LocalAiService {
             return Err("messages must not be empty".to_string());
         }
 
-        // Multi-turn local chat is background LLM-bound work — gate it.
-        let _gate_permit = crate::openhuman::scheduler_gate::wait_for_capacity().await;
+        let _gate_permit = if gated {
+            crate::openhuman::scheduler_gate::wait_for_capacity().await
+        } else {
+            None
+        };
 
         if provider_from_config(config) == LocalAiProvider::LmStudio {
             let started = std::time::Instant::now();
diff --git a/src/openhuman/inference/local/service/public_infer_tests.rs b/src/openhuman/inference/local/service/public_infer_tests.rs
index 8931438e2c..3ed591d42e 100644
--- a/src/openhuman/inference/local/service/public_infer_tests.rs
+++ b/src/openhuman/inference/local/service/public_infer_tests.rs
@@ -351,6 +351,93 @@ async fn inline_complete_interactive_does_not_block_on_held_permit() {
     );
 }
 
+#[tokio::test]
+async fn prompt_interactive_does_not_block_when_signed_out() {
+    let _guard = crate::openhuman::inference::inference_test_guard();
+
+    let app = Router::new().route(
+        "/api/generate",
+        post(|Json(_body): Json<serde_json::Value>| async move {
+            Json(json!({
+                "model": "test",
+                "response": "hello from mock",
+                "done": true
+            }))
+        }),
+    );
+    let base = spawn_mock(app).await;
+    unsafe {
+        std::env::set_var("OPENHUMAN_OLLAMA_BASE_URL", &base);
+    }
+
+    let config = enabled_config();
+    crate::openhuman::scheduler_gate::init_global(&config);
+    let _signed_out = crate::openhuman::scheduler_gate::SignedOutTestGuard::set(true);
+    let service = ready_service(&config);
+
+    let result = tokio::time::timeout(
+        std::time::Duration::from_secs(2),
+        service.prompt_interactive(&config, "hi", Some(16), true),
+    )
+    .await;
+
+    unsafe {
+        std::env::remove_var("OPENHUMAN_OLLAMA_BASE_URL");
+    }
+
+    let reply = result
+        .expect("interactive prompt must not block when scheduler gate is signed out")
+        .expect("interactive prompt response");
+    assert_eq!(reply, "hello from mock");
+}
+
+#[tokio::test]
+async fn chat_with_history_interactive_does_not_block_when_signed_out() {
+    let _guard = crate::openhuman::inference::inference_test_guard();
+
+    let app = Router::new().route(
+        "/api/chat",
+        post(|Json(_body): Json<serde_json::Value>| async move {
+            Json(json!({
+                "model": "test",
+                "message": { "role": "assistant", "content": "history reply" },
+                "done": true
+            }))
+        }),
+    );
+    let base = spawn_mock(app).await;
+    unsafe {
+        std::env::set_var("OPENHUMAN_OLLAMA_BASE_URL", &base);
+    }
+
+    let config = enabled_config();
+    crate::openhuman::scheduler_gate::init_global(&config);
+    let _signed_out = crate::openhuman::scheduler_gate::SignedOutTestGuard::set(true);
+    let service = ready_service(&config);
+
+    let result = tokio::time::timeout(
+        std::time::Duration::from_secs(2),
+        service.chat_with_history_interactive(
+            &config,
+            vec![crate::openhuman::inference::local::ollama::OllamaChatMessage {
+                role: "user".to_string(),
+                content: "hi".to_string(),
+            }],
+            Some(16),
+        ),
+    )
+    .await;
+
+    unsafe {
+        std::env::remove_var("OPENHUMAN_OLLAMA_BASE_URL");
+    }
+
+    let reply = result
+        .expect("interactive chat must not block when scheduler gate is signed out")
+        .expect("interactive chat response");
+    assert_eq!(reply, "history reply");
+}
+
 /// Counterpart: the gated `inline_complete` (and `prompt`/`summarize`)
 /// MUST queue behind a held permit. We assert this with a try-style
 /// race: spawn the gated call, give it time to enter the wait, then
diff --git a/src/openhuman/inference/model_ids.rs b/src/openhuman/inference/model_ids.rs
index a407df65d2..d325faab28 100644
--- a/src/openhuman/inference/model_ids.rs
+++ b/src/openhuman/inference/model_ids.rs
@@ -15,9 +15,9 @@ pub(crate) const DEFAULT_OLLAMA_VISION_MODEL: &str = "";
 pub(crate) const DEFAULT_LOW_VISION_MODEL: &str = "moondream:1.8b-v2-q4_K_S";
 pub(crate) const DEFAULT_OLLAMA_EMBED_MODEL: &str = "bge-m3";
 
-/// Chat models allowed in the current MVP build (2–4 GB tier only).
+/// Chat models allowed in the current local Ollama build.
 /// Any resolved chat model ID not listed here is redirected to `MVP_DEFAULT_CHAT_MODEL`.
-const MVP_ALLOWED_CHAT_MODELS: &[&str] = &["gemma3:1b-it-qat"];
+const MVP_ALLOWED_CHAT_MODELS: &[&str] = &["gemma3:1b-it-qat", "gemma4:e4b-it-q8_0"];
 const MVP_DEFAULT_CHAT_MODEL: &str = "gemma3:1b-it-qat";
 
 /// Vision models allowed in MVP — only disabled (empty string) since the
@@ -204,6 +204,13 @@ mod tests {
         assert_eq!(effective_chat_model_id(&config), "gemma3:1b-it-qat");
     }
 
+    #[test]
+    fn chat_model_allows_requested_ollama_gemma4_q8() {
+        let mut config = test_config();
+        config.local_ai.chat_model_id = "gemma4:e4b-it-q8_0".to_string();
+        assert_eq!(effective_chat_model_id(&config), "gemma4:e4b-it-q8_0");
+    }
+
     #[test]
     fn chat_model_allows_custom_ids_for_lm_studio() {
         let mut config = test_config();
@@ -230,7 +237,7 @@ mod tests {
     #[test]
     fn chat_model_rejects_non_mvp_models() {
         let mut config = test_config();
-        // All models outside the single MVP-allowed model are rejected.
+        // All models outside the local allowlist are rejected.
         config.local_ai.chat_model_id = "gemma3:4b-it-qat".to_string();
         assert_eq!(effective_chat_model_id(&config), MVP_DEFAULT_CHAT_MODEL);
 

From 1ed33aa54d3bf77677e015afafec7528d3a9d155 Mon Sep 17 00:00:00 2001
From: Developer <developer@example.com>
Date: Sun, 24 May 2026 17:19:09 +0800
Subject: [PATCH 2/5] Format Ollama inference regression test

---
 .../inference/local/service/public_infer_tests.rs      | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/openhuman/inference/local/service/public_infer_tests.rs b/src/openhuman/inference/local/service/public_infer_tests.rs
index 3ed591d42e..d73a33b7fc 100644
--- a/src/openhuman/inference/local/service/public_infer_tests.rs
+++ b/src/openhuman/inference/local/service/public_infer_tests.rs
@@ -419,10 +419,12 @@ async fn chat_with_history_interactive_does_not_block_when_signed_out() {
         std::time::Duration::from_secs(2),
         service.chat_with_history_interactive(
             &config,
-            vec![crate::openhuman::inference::local::ollama::OllamaChatMessage {
-                role: "user".to_string(),
-                content: "hi".to_string(),
-            }],
+            vec![
+                crate::openhuman::inference::local::ollama::OllamaChatMessage {
+                    role: "user".to_string(),
+                    content: "hi".to_string(),
+                },
+            ],
             Some(16),
         ),
     )

From 946d4d5f525bb9682f2311632f35c97b2836b0fa Mon Sep 17 00:00:00 2001
From: Developer <developer@example.com>
Date: Sun, 24 May 2026 17:43:39 +0800
Subject: [PATCH 3/5] Isolate interactive inference permit tests

---
 .../local/service/public_infer_tests.rs        | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/src/openhuman/inference/local/service/public_infer_tests.rs b/src/openhuman/inference/local/service/public_infer_tests.rs
index d73a33b7fc..226477d937 100644
--- a/src/openhuman/inference/local/service/public_infer_tests.rs
+++ b/src/openhuman/inference/local/service/public_infer_tests.rs
@@ -352,9 +352,12 @@ async fn inline_complete_interactive_does_not_block_on_held_permit() {
 }
 
 #[tokio::test]
-async fn prompt_interactive_does_not_block_when_signed_out() {
+async fn prompt_interactive_does_not_block_on_held_permit() {
     let _guard = crate::openhuman::inference::inference_test_guard();
 
+    let _held = crate::openhuman::scheduler_gate::gate::try_acquire_llm_permit()
+        .expect("test must start with a free permit");
+
     let app = Router::new().route(
         "/api/generate",
         post(|Json(_body): Json<serde_json::Value>| async move {
@@ -371,8 +374,6 @@ async fn prompt_interactive_does_not_block_when_signed_out() {
     }
 
     let config = enabled_config();
-    crate::openhuman::scheduler_gate::init_global(&config);
-    let _signed_out = crate::openhuman::scheduler_gate::SignedOutTestGuard::set(true);
     let service = ready_service(&config);
 
     let result = tokio::time::timeout(
@@ -386,15 +387,18 @@ async fn prompt_interactive_does_not_block_when_signed_out() {
     }
 
     let reply = result
-        .expect("interactive prompt must not block when scheduler gate is signed out")
+        .expect("interactive prompt must not block on a held permit")
         .expect("interactive prompt response");
     assert_eq!(reply, "hello from mock");
 }
 
 #[tokio::test]
-async fn chat_with_history_interactive_does_not_block_when_signed_out() {
+async fn chat_with_history_interactive_does_not_block_on_held_permit() {
     let _guard = crate::openhuman::inference::inference_test_guard();
 
+    let _held = crate::openhuman::scheduler_gate::gate::try_acquire_llm_permit()
+        .expect("test must start with a free permit");
+
     let app = Router::new().route(
         "/api/chat",
         post(|Json(_body): Json<serde_json::Value>| async move {
@@ -411,8 +415,6 @@ async fn chat_with_history_interactive_does_not_block_when_signed_out() {
     }
 
     let config = enabled_config();
-    crate::openhuman::scheduler_gate::init_global(&config);
-    let _signed_out = crate::openhuman::scheduler_gate::SignedOutTestGuard::set(true);
     let service = ready_service(&config);
 
     let result = tokio::time::timeout(
@@ -435,7 +437,7 @@ async fn chat_with_history_interactive_does_not_block_when_signed_out() {
     }
 
     let reply = result
-        .expect("interactive chat must not block when scheduler gate is signed out")
+        .expect("interactive chat must not block on a held permit")
         .expect("interactive chat response");
     assert_eq!(reply, "history reply");
 }

From d592e92df5acba44ce1a69041c981df5c979717e Mon Sep 17 00:00:00 2001
From: Developer <developer@example.com>
Date: Sun, 24 May 2026 22:46:51 +0800
Subject: [PATCH 4/5] Probe Ollama readiness with GET /api/tags

---
 .env.example                                  |  3 ++
 .../inference/local/service/ollama_admin.rs   |  2 +-
 .../local/service/ollama_admin_tests.rs       | 32 +++++++++++++++++++
 3 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/.env.example b/.env.example
index 698664938e..2fd33e5baf 100644
--- a/.env.example
+++ b/.env.example
@@ -148,6 +148,9 @@ OPENHUMAN_PROXY_SERVICES=
 # [optional] Override selected model tier: low, medium, high
 # Applies the corresponding preset at config load time (overrides config.toml).
 OPENHUMAN_LOCAL_AI_TIER=
+# [optional] Override Ollama's HTTP server base URL (default: http://localhost:11434).
+# Useful when Ollama runs in another container, on another host, or on a non-default port.
+# OPENHUMAN_OLLAMA_BASE_URL=http://127.0.0.1:11434
 # [optional] Override LM Studio's OpenAI-compatible local server base URL.
 # Defaults to http://localhost:1234/v1 when local_ai.provider = "lm_studio".
 OPENHUMAN_LM_STUDIO_BASE_URL=
diff --git a/src/openhuman/inference/local/service/ollama_admin.rs b/src/openhuman/inference/local/service/ollama_admin.rs
index a7639d84c6..5fd83e84cd 100644
--- a/src/openhuman/inference/local/service/ollama_admin.rs
+++ b/src/openhuman/inference/local/service/ollama_admin.rs
@@ -1302,7 +1302,7 @@ impl LocalAiService {
     async fn ollama_runner_ok_at(&self, base_url: &str) -> bool {
         let resp = self
             .http
-            .post(format!("{base_url}/api/tags"))
+            .get(format!("{base_url}/api/tags"))
             .timeout(std::time::Duration::from_secs(3))
             .send()
             .await;
diff --git a/src/openhuman/inference/local/service/ollama_admin_tests.rs b/src/openhuman/inference/local/service/ollama_admin_tests.rs
index 86c38ab2b2..b1eb3ab744 100644
--- a/src/openhuman/inference/local/service/ollama_admin_tests.rs
+++ b/src/openhuman/inference/local/service/ollama_admin_tests.rs
@@ -247,6 +247,38 @@ async fn ensure_ollama_server_reports_broken_external_runner_without_restart_att
     );
 }
 
+#[tokio::test]
+async fn ensure_ollama_server_accepts_healthy_external_runner() {
+    let _guard = crate::openhuman::inference::inference_test_guard();
+
+    let app = Router::new()
+        .route("/api/tags", get(|| async { Json(json!({ "models": [] })) }))
+        .route(
+            "/api/show",
+            axum::routing::post(|| async {
+                (
+                    axum::http::StatusCode::NOT_FOUND,
+                    Json(json!({ "error": "model '___nonexistent_probe___' not found" })),
+                )
+            }),
+        );
+    let base = spawn_mock(app).await;
+    unsafe {
+        std::env::set_var("OPENHUMAN_OLLAMA_BASE_URL", &base);
+    }
+
+    let config = Config::default();
+    let service = LocalAiService::new(&config);
+    let outcome = service.ensure_ollama_server(&config).await;
+
+    // Clear the override before asserting so a failure cannot leak it into later tests.
+    unsafe {
+        std::env::remove_var("OPENHUMAN_OLLAMA_BASE_URL");
+    }
+
+    outcome.expect("healthy external runner should pass");
+}
+
 #[tokio::test]
 async fn assets_status_marks_ollama_unavailable_when_runtime_is_down_even_if_binary_exists() {
     let _guard = crate::openhuman::inference::inference_test_guard();

From dde7bc92d4da49d058ff5b7a8818a39ecd600c1d Mon Sep 17 00:00:00 2001
From: Steven Enamakel <enamakel@tinyhumans.ai>
Date: Sun, 24 May 2026 22:11:46 -0700
Subject: [PATCH 5/5] fix(inference): add diagnosis logging and update stale
 docs on interactive paths

- Add trace logs to prompt_interactive and chat_with_history_interactive
  indicating scheduler gate bypass (addresses @Copilot and @coderabbitai
  on public_infer.rs:80 and public_infer.rs:220-228)
- Update inline_complete_interactive doc comment to reflect that
  prompt_interactive and chat_with_history_interactive are also ungated
---
 src/openhuman/inference/local/service/public_infer.rs | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/openhuman/inference/local/service/public_infer.rs b/src/openhuman/inference/local/service/public_infer.rs
index 796aa7dc1c..c2cf6243f4 100644
--- a/src/openhuman/inference/local/service/public_infer.rs
+++ b/src/openhuman/inference/local/service/public_infer.rs
@@ -70,6 +70,7 @@ impl LocalAiService {
         max_tokens: Option<u32>,
         no_think: bool,
     ) -> Result<String, String> {
+        log::trace!("[local_ai] prompt_interactive bypasses scheduler_gate permit");
         if !config.local_ai.runtime_enabled {
             return Err("local ai is disabled".to_string());
         }
@@ -113,9 +114,11 @@ impl LocalAiService {
     /// turn against it than show stale or empty completions for the
     /// duration of the backfill.
     ///
-    /// This is the only path inside [`LocalAiService`] that opts out of
-    /// the gate. Every other entry point (`inference`, `prompt`,
-    /// `summarize`, `inline_complete`, `vision_prompt`, `embed`)
+    /// Along with [`Self::prompt_interactive`] and
+    /// [`Self::chat_with_history_interactive`], this is one of the paths
+    /// inside [`LocalAiService`] that opt out of the gate. Every other
+    /// entry point (`inference`, `prompt`, `summarize`,
+    /// `inline_complete`, `vision_prompt`, `embed`, `chat_with_history`)
     /// acquires before talking to Ollama.
     pub async fn inline_complete_interactive(
         &self,
@@ -223,6 +226,7 @@ impl LocalAiService {
         messages: Vec<crate::openhuman::inference::local::ollama::OllamaChatMessage>,
         max_tokens: Option<u32>,
     ) -> Result<String, String> {
+        log::trace!("[local_ai] chat_with_history_interactive bypasses scheduler_gate permit");
         self.chat_with_history_internal(config, messages, max_tokens, false)
             .await
     }