tinyhumansai · senamakel · May 25, 2026 · May 24, 2026 · May 24, 2026 · May 24, 2026
@@ -162,6 +162,9 @@ OPENHUMAN_PROXY_SERVICES=
 # [optional] Override selected model tier: low, medium, high
 # Applies the corresponding preset at config load time (overrides config.toml).
 OPENHUMAN_LOCAL_AI_TIER=
+# [optional] Override Ollama's HTTP server base URL (default: http://localhost:11434).
+# Useful when Ollama runs in another container, on another host, or on a non-default port.
+# OPENHUMAN_OLLAMA_BASE_URL=http://127.0.0.1:11434
 # [optional] Override LM Studio's OpenAI-compatible local server base URL.
 # Defaults to http://localhost:1234/v1 when local_ai.provider = "lm_studio".
 OPENHUMAN_LM_STUDIO_BASE_URL=

@@ -197,7 +197,7 @@ pub async fn local_ai_prompt(
         service.bootstrap(config).await;
     }
     let output = service
-        .prompt(config, prompt.trim(), max_tokens, no_think.unwrap_or(true))
+        .prompt_interactive(config, prompt.trim(), max_tokens, no_think.unwrap_or(true))
         .await
         .map_err(|e| e.to_string())?;
     Ok(RpcOutcome::single_log(output, "local ai prompt completed"))
@@ -360,6 +360,67 @@ pub async fn local_ai_download_asset(
     ))
 }
 
+/// A single message in a local AI chat conversation.
+#[derive(Debug, serde::Deserialize)]
+pub struct LocalAiChatMessage {
+    /// The role of the message sender (e.g., "user", "assistant").
+    pub role: String,
+    /// The text content of the message.
+    pub content: String,
+}
+
+/// Executes a multi-turn chat conversation using the local model.
+pub async fn local_ai_chat(
+    config: &Config,
+    messages: Vec<LocalAiChatMessage>,
+    max_tokens: Option<u32>,
+) -> Result<RpcOutcome<String>, String> {
+    tracing::debug!(
+        message_count = messages.len(),
+        "[local_ai:chat] local_ai_chat op: validating"
+    );
+
+    if messages.is_empty() {
+        return Err("messages must not be empty".to_string());
+    }
+
+    let mut ollama_messages: Vec<crate::openhuman::inference::local::ollama::OllamaChatMessage> =
+        Vec::with_capacity(messages.len());
+
+    for msg in messages.into_iter() {
+        let normalized_role = msg.role.trim().to_ascii_lowercase();
+        match normalized_role.as_str() {
+            "user" => {
+                enforce_user_prompt_or_reject(msg.content.as_str(), "local_ai.ops.local_ai_chat")?;
+            }
+            "system" | "assistant" => {}
+            _ => {
+                return Err(format!(
+                    "unsupported message role: '{}'; expected one of: user, system, assistant",
+                    msg.role.trim()
+                ));
+            }
+        }
+
+        ollama_messages.push(
+            crate::openhuman::inference::local::ollama::OllamaChatMessage {
+                role: normalized_role,
+                content: msg.content,
+            },
+        );
+    }
+
+    let service = local_ai::global(config);
+    let reply = service
+        .chat_with_history_interactive(config, ollama_messages, max_tokens)
+        .await?;
+
+    tracing::debug!(
+        reply_len = reply.len(),
+        "[local_ai:chat] local_ai_chat op: done"
+    );
+    Ok(RpcOutcome::single_log(reply, "local ai chat completed"))
+}
 /// Result of the reaction-decision prompt.
 #[derive(Debug, serde::Serialize)]
 pub struct ReactionDecision {

@@ -1302,7 +1302,7 @@ impl LocalAiService {
     async fn ollama_runner_ok_at(&self, base_url: &str) -> bool {
         let resp = self
             .http
-            .post(format!("{base_url}/api/tags"))
+            .get(format!("{base_url}/api/tags"))
             .timeout(std::time::Duration::from_secs(3))
             .send()
             .await;

@@ -247,6 +247,38 @@ async fn ensure_ollama_server_reports_broken_external_runner_without_restart_att
     );
 }
 
+#[tokio::test]
+async fn ensure_ollama_server_accepts_healthy_external_runner() {
+    let _guard = crate::openhuman::inference::inference_test_guard();
+
+    let app = Router::new()
+        .route("/api/tags", get(|| async { Json(json!({ "models": [] })) }))
+        .route(
+            "/api/show",
+            axum::routing::post(|| async {
+                (
+                    axum::http::StatusCode::NOT_FOUND,
+                    Json(json!({ "error": "model '___nonexistent_probe___' not found" })),
+                )
+            }),
+        );
+    let base = spawn_mock(app).await;
+    unsafe {
+        std::env::set_var("OPENHUMAN_OLLAMA_BASE_URL", &base);
+    }
+
+    let config = Config::default();
+    let service = LocalAiService::new(&config);
+    service
+        .ensure_ollama_server(&config)
+        .await
+        .expect("healthy external runner should pass");
+
+    unsafe {
+        std::env::remove_var("OPENHUMAN_OLLAMA_BASE_URL");
+    }
+}
+
 #[tokio::test]
 async fn assets_status_marks_ollama_unavailable_when_runtime_is_down_even_if_binary_exists() {
     let _guard = crate::openhuman::inference::inference_test_guard();

@@ -63,6 +63,26 @@ impl LocalAiService {
             .await
     }
 
+    pub async fn prompt_interactive(
+        &self,
+        config: &Config,
+        prompt: &str,
+        max_tokens: Option<u32>,
+        no_think: bool,
+    ) -> Result<String, String> {
+        log::trace!("[local_ai] prompt_interactive bypasses scheduler_gate permit");
+        if !config.local_ai.runtime_enabled {
+            return Err("local ai is disabled".to_string());
+        }
+        let system = if no_think {
+            "You are a concise assistant. Return only the final answer. Do not include reasoning or chain-of-thought."
+        } else {
+            "You are a helpful assistant."
+        };
+        self.inference_interactive(config, system, prompt, max_tokens.or(Some(160)), no_think)
+            .await
+    }
+
     pub async fn inline_complete(
         &self,
         config: &Config,
@@ -94,9 +114,11 @@ impl LocalAiService {
     /// turn against it than show stale or empty completions for the
     /// duration of the backfill.
     ///
-    /// This is the only path inside [`LocalAiService`] that opts out of
-    /// the gate. Every other entry point (`inference`, `prompt`,
-    /// `summarize`, `inline_complete`, `vision_prompt`, `embed`)
+    /// Along with [`Self::prompt_interactive`] and
+    /// [`Self::chat_with_history_interactive`], this is one of the paths
+    /// inside [`LocalAiService`] that opts out of the gate. Every other
+    /// entry point (`inference`, `prompt`, `summarize`,
+    /// `inline_complete`, `vision_prompt`, `embed`, `chat_with_history`)
     /// acquires before talking to Ollama.
     pub async fn inline_complete_interactive(
         &self,
@@ -185,6 +207,180 @@ impl LocalAiService {
         Ok(sanitize_inline_completion(&raw, context))
     }
 
+    /// Multi-turn chat completion via Ollama /api/chat.
+    /// Messages are `[{role: "user"|"assistant"|"system", content: "..."}]`.
+    /// Returns the assistant reply string.
+    pub(crate) async fn chat_with_history(
+        &self,
+        config: &Config,
+        messages: Vec<crate::openhuman::inference::local::ollama::OllamaChatMessage>,
+        max_tokens: Option<u32>,
+    ) -> Result<String, String> {
+        self.chat_with_history_internal(config, messages, max_tokens, true)
+            .await
+    }
+
+    pub(crate) async fn chat_with_history_interactive(
+        &self,
+        config: &Config,
+        messages: Vec<crate::openhuman::inference::local::ollama::OllamaChatMessage>,
+        max_tokens: Option<u32>,
+    ) -> Result<String, String> {
+        log::trace!("[local_ai] chat_with_history_interactive bypasses scheduler_gate permit");
+        self.chat_with_history_internal(config, messages, max_tokens, false)
+            .await
+    }
+
+    async fn chat_with_history_internal(
+        &self,
+        config: &Config,
+        messages: Vec<crate::openhuman::inference::local::ollama::OllamaChatMessage>,
+        max_tokens: Option<u32>,
+        gated: bool,
+    ) -> Result<String, String> {
+        if !config.local_ai.runtime_enabled {
+            return Err("local ai is disabled".to_string());
+        }
+
+        if !matches!(self.status.lock().state.as_str(), "ready") {
+            self.bootstrap(config).await;
+        }
+
+        if messages.is_empty() {
+            return Err("messages must not be empty".to_string());
+        }
+
+        let _gate_permit = if gated {
+            crate::openhuman::scheduler_gate::wait_for_capacity().await
+        } else {
+            None
+        };
+
+        if provider_from_config(config) == LocalAiProvider::LmStudio {
+            let started = std::time::Instant::now();
+            let lm_messages = messages
+                .into_iter()
+                .map(
+                    |message| crate::openhuman::inference::local::lm_studio::LmStudioChatMessage {
+                        role: message.role,
+                        content: message.content,
+                    },
+                )
+                .collect();
+            let outcome = self
+                .lm_studio_chat_completion(
+                    config,
+                    lm_messages,
+                    max_tokens,
+                    config.default_temperature as f32,
+                    false,
+                )
+                .await?;
+            let elapsed_ms = started.elapsed().as_millis() as u64;
+            {
+                let mut status = self.status.lock();
+                status.state = "ready".to_string();
+                status.last_latency_ms = Some(elapsed_ms);
+                status.prompt_toks_per_sec = None;
+                status.gen_toks_per_sec = None;
+                status.warning = None;
+            }
+            tracing::debug!(
+                elapsed_ms,
+                prompt_tokens = ?outcome.prompt_tokens,
+                completion_tokens = ?outcome.completion_tokens,
+                reply_len = outcome.reply.len(),
+                "[local_ai:chat] lm studio /v1/chat/completions done"
+            );
+            return Ok(outcome.reply);
+        }
+
+        tracing::debug!(
+            message_count = messages.len(),
+            model = %crate::openhuman::inference::model_ids::effective_chat_model_id(config),
+            "[local_ai:chat] sending to ollama /api/chat"
+        );
+
+        let started = std::time::Instant::now();
+
+        let body = crate::openhuman::inference::local::ollama::OllamaChatRequest {
+            model: crate::openhuman::inference::model_ids::effective_chat_model_id(config),
+            messages,
+            stream: false,
+            options: Some(
+                crate::openhuman::inference::local::ollama::OllamaGenerateOptions {
+                    temperature: Some(config.default_temperature as f32),
+                    top_k: Some(40),
+                    top_p: Some(0.9),
+                    num_predict: max_tokens.map(|v| v as i32),
+                },
+            ),
+        };
+
+        let base_url = ollama_base_url_from_config(config);
+        let response = self
+            .http
+            .post(format!("{base_url}/api/chat"))
+            .json(&body)
+            .send()
+            .await
+            .map_err(|e| {
+                external_ollama_request_error_with_url("ollama chat request failed", &e, &base_url)
+            })?;
+
+        if !response.status().is_success() {
+            let status = response.status();
+            let body = response.text().await.unwrap_or_default();
+            let detail = body.trim();
+            return Err(format!(
+                "ollama chat failed with status {}{}",
+                status,
+                if detail.is_empty() {
+                    String::new()
+                } else {
+                    format!(": {detail}")
+                }
+            ));
+        }
+
+        let payload: crate::openhuman::inference::local::ollama::OllamaChatResponse = response
+            .json()
+            .await
+            .map_err(|e| format!("ollama chat response parse failed: {e}"))?;
+
+        let elapsed_ms = started.elapsed().as_millis() as u64;
+        let prompt_tps = payload
+            .prompt_eval_count
+            .zip(payload.prompt_eval_duration)
+            .and_then(|(count, dur_ns)| ns_to_tps(count as f32, dur_ns));
+        let gen_tps = payload
+            .eval_count
+            .zip(payload.eval_duration)
+            .and_then(|(count, dur_ns)| ns_to_tps(count as f32, dur_ns));
+
+        {
+            let mut status = self.status.lock();
+            status.state = "ready".to_string();
+            status.last_latency_ms = Some(elapsed_ms);
+            status.prompt_toks_per_sec = prompt_tps;
+            status.gen_toks_per_sec = gen_tps;
+            status.warning = None;
+        }
+
+        tracing::debug!(
+            elapsed_ms,
+            reply_len = payload.message.content.len(),
+            "[local_ai:chat] ollama /api/chat done"
+        );
+
+        let reply = payload.message.content.trim().to_string();
+        if reply.is_empty() {
+            Err("ollama returned empty reply".to_string())
+        } else {
+            Ok(reply)
+        }
+    }
+
     pub(crate) async fn inference(
         &self,
         config: &Config,