diff --git a/.env.example b/.env.example
index a3c92312c9..f131b416d4 100644
--- a/.env.example
+++ b/.env.example
@@ -162,6 +162,9 @@ OPENHUMAN_PROXY_SERVICES=
 # [optional] Override selected model tier: low, medium, high
 # Applies the corresponding preset at config load time (overrides config.toml).
 OPENHUMAN_LOCAL_AI_TIER=
+# [optional] Override Ollama's HTTP server base URL (default: http://localhost:11434).
+# Useful when Ollama runs in another container, on another host, or on a non-default port.
+# OPENHUMAN_OLLAMA_BASE_URL=http://127.0.0.1:11434
 # [optional] Override LM Studio's OpenAI-compatible local server base URL.
 # Defaults to http://localhost:1234/v1 when local_ai.provider = "lm_studio".
 OPENHUMAN_LM_STUDIO_BASE_URL=
diff --git a/src/openhuman/inference/local/ops.rs b/src/openhuman/inference/local/ops.rs
index b977bda39f..22fec3bec9 100644
--- a/src/openhuman/inference/local/ops.rs
+++ b/src/openhuman/inference/local/ops.rs
@@ -197,7 +197,7 @@ pub async fn local_ai_prompt(
         service.bootstrap(config).await;
     }
     let output = service
-        .prompt(config, prompt.trim(), max_tokens, no_think.unwrap_or(true))
+        .prompt_interactive(config, prompt.trim(), max_tokens, no_think.unwrap_or(true))
         .await
         .map_err(|e| e.to_string())?;
     Ok(RpcOutcome::single_log(output, "local ai prompt completed"))
@@ -360,6 +360,67 @@ pub async fn local_ai_download_asset(
     ))
 }
 
+/// A single message in a local AI chat conversation.
+#[derive(Debug, serde::Deserialize)]
+pub struct LocalAiChatMessage {
+    /// The role of the message sender (e.g., "user", "assistant").
+    pub role: String,
+    /// The text content of the message.
+    pub content: String,
+}
+
+/// Executes a multi-turn chat conversation using the local model.
+pub async fn local_ai_chat(
+    config: &Config,
+    messages: Vec<LocalAiChatMessage>,
+    max_tokens: Option<u32>,
+) -> Result<RpcOutcome<String>, String> {
+    tracing::debug!(
+        message_count = messages.len(),
+        "[local_ai:chat] local_ai_chat op: validating"
+    );
+
+    if messages.is_empty() {
+        return Err("messages must not be empty".to_string());
+    }
+
+    let mut ollama_messages: Vec<crate::openhuman::inference::local::ollama::OllamaChatMessage> =
+        Vec::with_capacity(messages.len());
+
+    for msg in messages.into_iter() {
+        let normalized_role = msg.role.trim().to_ascii_lowercase();
+        match normalized_role.as_str() {
+            "user" => {
+                enforce_user_prompt_or_reject(msg.content.as_str(), "local_ai.ops.local_ai_chat")?;
+            }
+            "system" | "assistant" => {}
+            _ => {
+                return Err(format!(
+                    "unsupported message role: '{}'; expected one of: user, system, assistant",
+                    msg.role.trim()
+                ));
+            }
+        }
+
+        ollama_messages.push(
+            crate::openhuman::inference::local::ollama::OllamaChatMessage {
+                role: normalized_role,
+                content: msg.content,
+            },
+        );
+    }
+
+    let service = local_ai::global(config);
+    let reply = service
+        .chat_with_history_interactive(config, ollama_messages, max_tokens)
+        .await?;
+
+    tracing::debug!(
+        reply_len = reply.len(),
+        "[local_ai:chat] local_ai_chat op: done"
+    );
+    Ok(RpcOutcome::single_log(reply, "local ai chat completed"))
+}
 /// Result of the reaction-decision prompt.
 #[derive(Debug, serde::Serialize)]
 pub struct ReactionDecision {
diff --git a/src/openhuman/inference/local/service/ollama_admin.rs b/src/openhuman/inference/local/service/ollama_admin.rs
index a7639d84c6..5fd83e84cd 100644
--- a/src/openhuman/inference/local/service/ollama_admin.rs
+++ b/src/openhuman/inference/local/service/ollama_admin.rs
@@ -1302,7 +1302,7 @@ impl LocalAiService {
     async fn ollama_runner_ok_at(&self, base_url: &str) -> bool {
         let resp = self
             .http
-            .post(format!("{base_url}/api/tags"))
+            .get(format!("{base_url}/api/tags"))
             .timeout(std::time::Duration::from_secs(3))
             .send()
             .await;
diff --git a/src/openhuman/inference/local/service/ollama_admin_tests.rs b/src/openhuman/inference/local/service/ollama_admin_tests.rs
index 86c38ab2b2..b1eb3ab744 100644
--- a/src/openhuman/inference/local/service/ollama_admin_tests.rs
+++ b/src/openhuman/inference/local/service/ollama_admin_tests.rs
@@ -247,6 +247,38 @@ async fn ensure_ollama_server_reports_broken_external_runner_without_restart_att
     );
 }
 
+#[tokio::test]
+async fn ensure_ollama_server_accepts_healthy_external_runner() {
+    let _guard = crate::openhuman::inference::inference_test_guard();
+
+    let app = Router::new()
+        .route("/api/tags", get(|| async { Json(json!({ "models": [] })) }))
+        .route(
+            "/api/show",
+            axum::routing::post(|| async {
+                (
+                    axum::http::StatusCode::NOT_FOUND,
+                    Json(json!({ "error": "model '___nonexistent_probe___' not found" })),
+                )
+            }),
+        );
+    let base = spawn_mock(app).await;
+    unsafe {
+        std::env::set_var("OPENHUMAN_OLLAMA_BASE_URL", &base);
+    }
+
+    let config = Config::default();
+    let service = LocalAiService::new(&config);
+    service
+        .ensure_ollama_server(&config)
+        .await
+        .expect("healthy external runner should pass");
+
+    unsafe {
+        std::env::remove_var("OPENHUMAN_OLLAMA_BASE_URL");
+    }
+}
+
 #[tokio::test]
 async fn assets_status_marks_ollama_unavailable_when_runtime_is_down_even_if_binary_exists() {
     let _guard = crate::openhuman::inference::inference_test_guard();
diff --git a/src/openhuman/inference/local/service/public_infer.rs b/src/openhuman/inference/local/service/public_infer.rs
index 07306a7d01..c2cf6243f4 100644
--- a/src/openhuman/inference/local/service/public_infer.rs
+++ b/src/openhuman/inference/local/service/public_infer.rs
@@ -63,6 +63,26 @@ impl LocalAiService {
             .await
     }
 
+    pub async fn prompt_interactive(
+        &self,
+        config: &Config,
+        prompt: &str,
+        max_tokens: Option<u32>,
+        no_think: bool,
+    ) -> Result<String, String> {
+        log::trace!("[local_ai] prompt_interactive bypasses scheduler_gate permit");
+        if !config.local_ai.runtime_enabled {
+            return Err("local ai is disabled".to_string());
+        }
+        let system = if no_think {
+            "You are a concise assistant. Return only the final answer. Do not include reasoning or chain-of-thought."
+        } else {
+            "You are a helpful assistant."
+        };
+        self.inference_interactive(config, system, prompt, max_tokens.or(Some(160)), no_think)
+            .await
+    }
+
     pub async fn inline_complete(
         &self,
         config: &Config,
@@ -94,9 +114,11 @@ impl LocalAiService {
     /// turn against it than show stale or empty completions for the
     /// duration of the backfill.
     ///
-    /// This is the only path inside [`LocalAiService`] that opts out of
-    /// the gate. Every other entry point (`inference`, `prompt`,
-    /// `summarize`, `inline_complete`, `vision_prompt`, `embed`)
+    /// Along with [`Self::prompt_interactive`] and
+    /// [`Self::chat_with_history_interactive`], this is one of the paths
+    /// inside [`LocalAiService`] that opts out of the gate. Every other
+    /// entry point (`inference`, `prompt`, `summarize`,
+    /// `inline_complete`, `vision_prompt`, `embed`, `chat_with_history`)
     /// acquires before talking to Ollama.
     pub async fn inline_complete_interactive(
         &self,
@@ -185,6 +207,180 @@ impl LocalAiService {
         Ok(sanitize_inline_completion(&raw, context))
     }
 
+    /// Multi-turn chat completion via Ollama /api/chat.
+    /// Messages are `[{role: "user"|"assistant"|"system", content: "..."}]`.
+    /// Returns the assistant reply string.
+    pub(crate) async fn chat_with_history(
+        &self,
+        config: &Config,
+        messages: Vec<crate::openhuman::inference::local::ollama::OllamaChatMessage>,
+        max_tokens: Option<u32>,
+    ) -> Result<String, String> {
+        self.chat_with_history_internal(config, messages, max_tokens, true)
+            .await
+    }
+
+    pub(crate) async fn chat_with_history_interactive(
+        &self,
+        config: &Config,
+        messages: Vec<crate::openhuman::inference::local::ollama::OllamaChatMessage>,
+        max_tokens: Option<u32>,
+    ) -> Result<String, String> {
+        log::trace!("[local_ai] chat_with_history_interactive bypasses scheduler_gate permit");
+        self.chat_with_history_internal(config, messages, max_tokens, false)
+            .await
+    }
+
+    async fn chat_with_history_internal(
+        &self,
+        config: &Config,
+        messages: Vec<crate::openhuman::inference::local::ollama::OllamaChatMessage>,
+        max_tokens: Option<u32>,
+        gated: bool,
+    ) -> Result<String, String> {
+        if !config.local_ai.runtime_enabled {
+            return Err("local ai is disabled".to_string());
+        }
+
+        if !matches!(self.status.lock().state.as_str(), "ready") {
+            self.bootstrap(config).await;
+        }
+
+        if messages.is_empty() {
+            return Err("messages must not be empty".to_string());
+        }
+
+        let _gate_permit = if gated {
+            crate::openhuman::scheduler_gate::wait_for_capacity().await
+        } else {
+            None
+        };
+
+        if provider_from_config(config) == LocalAiProvider::LmStudio {
+            let started = std::time::Instant::now();
+            let lm_messages = messages
+                .into_iter()
+                .map(
+                    |message| crate::openhuman::inference::local::lm_studio::LmStudioChatMessage {
+                        role: message.role,
+                        content: message.content,
+                    },
+                )
+                .collect();
+            let outcome = self
+                .lm_studio_chat_completion(
+                    config,
+                    lm_messages,
+                    max_tokens,
+                    config.default_temperature as f32,
+                    false,
+                )
+                .await?;
+            let elapsed_ms = started.elapsed().as_millis() as u64;
+            {
+                let mut status = self.status.lock();
+                status.state = "ready".to_string();
+                status.last_latency_ms = Some(elapsed_ms);
+                status.prompt_toks_per_sec = None;
+                status.gen_toks_per_sec = None;
+                status.warning = None;
+            }
+            tracing::debug!(
+                elapsed_ms,
+                prompt_tokens = ?outcome.prompt_tokens,
+                completion_tokens = ?outcome.completion_tokens,
+                reply_len = outcome.reply.len(),
+                "[local_ai:chat] lm studio /v1/chat/completions done"
+            );
+            return Ok(outcome.reply);
+        }
+
+        tracing::debug!(
+            message_count = messages.len(),
+            model = %crate::openhuman::inference::model_ids::effective_chat_model_id(config),
+            "[local_ai:chat] sending to ollama /api/chat"
+        );
+
+        let started = std::time::Instant::now();
+
+        let body = crate::openhuman::inference::local::ollama::OllamaChatRequest {
+            model: crate::openhuman::inference::model_ids::effective_chat_model_id(config),
+            messages,
+            stream: false,
+            options: Some(
+                crate::openhuman::inference::local::ollama::OllamaGenerateOptions {
+                    temperature: Some(config.default_temperature as f32),
+                    top_k: Some(40),
+                    top_p: Some(0.9),
+                    num_predict: max_tokens.map(|v| v as i32),
+                },
+            ),
+        };
+
+        let base_url = ollama_base_url_from_config(config);
+        let response = self
+            .http
+            .post(format!("{base_url}/api/chat"))
+            .json(&body)
+            .send()
+            .await
+            .map_err(|e| {
+                external_ollama_request_error_with_url("ollama chat request failed", &e, &base_url)
+            })?;
+
+        if !response.status().is_success() {
+            let status = response.status();
+            let body = response.text().await.unwrap_or_default();
+            let detail = body.trim();
+            return Err(format!(
+                "ollama chat failed with status {}{}",
+                status,
+                if detail.is_empty() {
+                    String::new()
+                } else {
+                    format!(": {detail}")
+                }
+            ));
+        }
+
+        let payload: crate::openhuman::inference::local::ollama::OllamaChatResponse = response
+            .json()
+            .await
+            .map_err(|e| format!("ollama chat response parse failed: {e}"))?;
+
+        let elapsed_ms = started.elapsed().as_millis() as u64;
+        let prompt_tps = payload
+            .prompt_eval_count
+            .zip(payload.prompt_eval_duration)
+            .and_then(|(count, dur_ns)| ns_to_tps(count as f32, dur_ns));
+        let gen_tps = payload
+            .eval_count
+            .zip(payload.eval_duration)
+            .and_then(|(count, dur_ns)| ns_to_tps(count as f32, dur_ns));
+
+        {
+            let mut status = self.status.lock();
+            status.state = "ready".to_string();
+            status.last_latency_ms = Some(elapsed_ms);
+            status.prompt_toks_per_sec = prompt_tps;
+            status.gen_toks_per_sec = gen_tps;
+            status.warning = None;
+        }
+
+        tracing::debug!(
+            elapsed_ms,
+            reply_len = payload.message.content.len(),
+            "[local_ai:chat] ollama /api/chat done"
+        );
+
+        let reply = payload.message.content.trim().to_string();
+        if reply.is_empty() {
+            Err("ollama returned empty reply".to_string())
+        } else {
+            Ok(reply)
+        }
+    }
+
     pub(crate) async fn inference(
         &self,
         config: &Config,
diff --git a/src/openhuman/inference/local/service/public_infer_tests.rs b/src/openhuman/inference/local/service/public_infer_tests.rs
index 27d92d3dc3..f9e9fa6b63 100644
--- a/src/openhuman/inference/local/service/public_infer_tests.rs
+++ b/src/openhuman/inference/local/service/public_infer_tests.rs
@@ -310,6 +310,97 @@ async fn inline_complete_interactive_does_not_block_on_held_permit() {
     );
 }
 
+#[tokio::test]
+async fn prompt_interactive_does_not_block_on_held_permit() {
+    let _guard = crate::openhuman::inference::inference_test_guard();
+
+    let _held = crate::openhuman::scheduler_gate::gate::try_acquire_llm_permit()
+        .expect("test must start with a free permit");
+
+    let app = Router::new().route(
+        "/api/generate",
+        post(|Json(_body): Json<serde_json::Value>| async move {
+            Json(json!({
+                "model": "test",
+                "response": "hello from mock",
+                "done": true
+            }))
+        }),
+    );
+    let base = spawn_mock(app).await;
+    unsafe {
+        std::env::set_var("OPENHUMAN_OLLAMA_BASE_URL", &base);
+    }
+
+    let config = enabled_config();
+    let service = ready_service(&config);
+
+    let result = tokio::time::timeout(
+        std::time::Duration::from_secs(2),
+        service.prompt_interactive(&config, "hi", Some(16), true),
+    )
+    .await;
+
+    unsafe {
+        std::env::remove_var("OPENHUMAN_OLLAMA_BASE_URL");
+    }
+
+    let reply = result
+        .expect("interactive prompt must not block on a held permit")
+        .expect("interactive prompt response");
+    assert_eq!(reply, "hello from mock");
+}
+
+#[tokio::test]
+async fn chat_with_history_interactive_does_not_block_on_held_permit() {
+    let _guard = crate::openhuman::inference::inference_test_guard();
+
+    let _held = crate::openhuman::scheduler_gate::gate::try_acquire_llm_permit()
+        .expect("test must start with a free permit");
+
+    let app = Router::new().route(
+        "/api/chat",
+        post(|Json(_body): Json<serde_json::Value>| async move {
+            Json(json!({
+                "model": "test",
+                "message": { "role": "assistant", "content": "history reply" },
+                "done": true
+            }))
+        }),
+    );
+    let base = spawn_mock(app).await;
+    unsafe {
+        std::env::set_var("OPENHUMAN_OLLAMA_BASE_URL", &base);
+    }
+
+    let config = enabled_config();
+    let service = ready_service(&config);
+
+    let result = tokio::time::timeout(
+        std::time::Duration::from_secs(2),
+        service.chat_with_history_interactive(
+            &config,
+            vec![
+                crate::openhuman::inference::local::ollama::OllamaChatMessage {
+                    role: "user".to_string(),
+                    content: "hi".to_string(),
+                },
+            ],
+            Some(16),
+        ),
+    )
+    .await;
+
+    unsafe {
+        std::env::remove_var("OPENHUMAN_OLLAMA_BASE_URL");
+    }
+
+    let reply = result
+        .expect("interactive chat must not block on a held permit")
+        .expect("interactive chat response");
+    assert_eq!(reply, "history reply");
+}
+
 /// Counterpart: the gated `inline_complete` (and `prompt`/`summarize`)
 /// MUST queue behind a held permit. We assert this with a try-style
 /// race: spawn the gated call, give it time to enter the wait, then
diff --git a/src/openhuman/inference/model_ids.rs b/src/openhuman/inference/model_ids.rs
index a407df65d2..d325faab28 100644
--- a/src/openhuman/inference/model_ids.rs
+++ b/src/openhuman/inference/model_ids.rs
@@ -15,9 +15,9 @@ pub(crate) const DEFAULT_OLLAMA_VISION_MODEL: &str = "";
 pub(crate) const DEFAULT_LOW_VISION_MODEL: &str = "moondream:1.8b-v2-q4_K_S";
 pub(crate) const DEFAULT_OLLAMA_EMBED_MODEL: &str = "bge-m3";
 
-/// Chat models allowed in the current MVP build (2–4 GB tier only).
+/// Chat models allowed in the current local Ollama build.
 /// Any resolved chat model ID not listed here is redirected to `MVP_DEFAULT_CHAT_MODEL`.
-const MVP_ALLOWED_CHAT_MODELS: &[&str] = &["gemma3:1b-it-qat"];
+const MVP_ALLOWED_CHAT_MODELS: &[&str] = &["gemma3:1b-it-qat", "gemma4:e4b-it-q8_0"];
 const MVP_DEFAULT_CHAT_MODEL: &str = "gemma3:1b-it-qat";
 
 /// Vision models allowed in MVP — only disabled (empty string) since the
@@ -204,6 +204,13 @@ mod tests {
         assert_eq!(effective_chat_model_id(&config), "gemma3:1b-it-qat");
     }
 
+    #[test]
+    fn chat_model_allows_requested_ollama_gemma4_q8() {
+        let mut config = test_config();
+        config.local_ai.chat_model_id = "gemma4:e4b-it-q8_0".to_string();
+        assert_eq!(effective_chat_model_id(&config), "gemma4:e4b-it-q8_0");
+    }
+
     #[test]
     fn chat_model_allows_custom_ids_for_lm_studio() {
         let mut config = test_config();
@@ -230,7 +237,7 @@ mod tests {
     #[test]
     fn chat_model_rejects_non_mvp_models() {
         let mut config = test_config();
-        // All models outside the single MVP-allowed model are rejected.
+        // All models outside the local allowlist are rejected.
         config.local_ai.chat_model_id = "gemma3:4b-it-qat".to_string();
         assert_eq!(effective_chat_model_id(&config), MVP_DEFAULT_CHAT_MODEL);