diff --git a/.env.example b/.env.example index a3c92312c9..f131b416d4 100644 --- a/.env.example +++ b/.env.example @@ -162,6 +162,9 @@ OPENHUMAN_PROXY_SERVICES= # [optional] Override selected model tier: low, medium, high # Applies the corresponding preset at config load time (overrides config.toml). OPENHUMAN_LOCAL_AI_TIER= +# [optional] Override Ollama's HTTP server base URL (default: http://localhost:11434). +# Useful when Ollama runs in another container, on another host, or on a non-default port. +# OPENHUMAN_OLLAMA_BASE_URL=http://127.0.0.1:11434 # [optional] Override LM Studio's OpenAI-compatible local server base URL. # Defaults to http://localhost:1234/v1 when local_ai.provider = "lm_studio". OPENHUMAN_LM_STUDIO_BASE_URL= diff --git a/src/openhuman/inference/local/ops.rs b/src/openhuman/inference/local/ops.rs index b977bda39f..22fec3bec9 100644 --- a/src/openhuman/inference/local/ops.rs +++ b/src/openhuman/inference/local/ops.rs @@ -197,7 +197,7 @@ pub async fn local_ai_prompt( service.bootstrap(config).await; } let output = service - .prompt(config, prompt.trim(), max_tokens, no_think.unwrap_or(true)) + .prompt_interactive(config, prompt.trim(), max_tokens, no_think.unwrap_or(true)) .await .map_err(|e| e.to_string())?; Ok(RpcOutcome::single_log(output, "local ai prompt completed")) @@ -360,6 +360,67 @@ pub async fn local_ai_download_asset( )) } +/// A single message in a local AI chat conversation. +#[derive(Debug, serde::Deserialize)] +pub struct LocalAiChatMessage { + /// The role of the message sender (e.g., "user", "assistant"). + pub role: String, + /// The text content of the message. + pub content: String, +} + +/// Executes a multi-turn chat conversation using the local model. +pub async fn local_ai_chat( + config: &Config, + messages: Vec, + max_tokens: Option, +) -> Result, String> { + tracing::debug!( + message_count = messages.len(), + "[local_ai:chat] local_ai_chat op: validating" + ); + + if messages.is_empty() { + return Err("messages must not be empty".to_string()); + } + + let mut ollama_messages: Vec = + Vec::with_capacity(messages.len()); + + for msg in messages.into_iter() { + let normalized_role = msg.role.trim().to_ascii_lowercase(); + match normalized_role.as_str() { + "user" => { + enforce_user_prompt_or_reject(msg.content.as_str(), "local_ai.ops.local_ai_chat")?; + } + "system" | "assistant" => {} + _ => { + return Err(format!( + "unsupported message role: '{}'; expected one of: user, system, assistant", + msg.role.trim() + )); + } + } + + ollama_messages.push( + crate::openhuman::inference::local::ollama::OllamaChatMessage { + role: normalized_role, + content: msg.content, + }, + ); + } + + let service = local_ai::global(config); + let reply = service + .chat_with_history_interactive(config, ollama_messages, max_tokens) + .await?; + + tracing::debug!( + reply_len = reply.len(), + "[local_ai:chat] local_ai_chat op: done" + ); + Ok(RpcOutcome::single_log(reply, "local ai chat completed")) +} /// Result of the reaction-decision prompt. #[derive(Debug, serde::Serialize)] pub struct ReactionDecision { diff --git a/src/openhuman/inference/local/service/ollama_admin.rs b/src/openhuman/inference/local/service/ollama_admin.rs index a7639d84c6..5fd83e84cd 100644 --- a/src/openhuman/inference/local/service/ollama_admin.rs +++ b/src/openhuman/inference/local/service/ollama_admin.rs @@ -1302,7 +1302,7 @@ impl LocalAiService { async fn ollama_runner_ok_at(&self, base_url: &str) -> bool { let resp = self .http - .post(format!("{base_url}/api/tags")) + .get(format!("{base_url}/api/tags")) .timeout(std::time::Duration::from_secs(3)) .send() .await; diff --git a/src/openhuman/inference/local/service/ollama_admin_tests.rs b/src/openhuman/inference/local/service/ollama_admin_tests.rs index 86c38ab2b2..b1eb3ab744 100644 --- a/src/openhuman/inference/local/service/ollama_admin_tests.rs +++ b/src/openhuman/inference/local/service/ollama_admin_tests.rs @@ -247,6 +247,38 @@ async fn ensure_ollama_server_reports_broken_external_runner_without_restart_att ); } +#[tokio::test] +async fn ensure_ollama_server_accepts_healthy_external_runner() { + let _guard = crate::openhuman::inference::inference_test_guard(); + + let app = Router::new() + .route("/api/tags", get(|| async { Json(json!({ "models": [] })) })) + .route( + "/api/show", + axum::routing::post(|| async { + ( + axum::http::StatusCode::NOT_FOUND, + Json(json!({ "error": "model '___nonexistent_probe___' not found" })), + ) + }), + ); + let base = spawn_mock(app).await; + unsafe { + std::env::set_var("OPENHUMAN_OLLAMA_BASE_URL", &base); + } + + let config = Config::default(); + let service = LocalAiService::new(&config); + service + .ensure_ollama_server(&config) + .await + .expect("healthy external runner should pass"); + + unsafe { + std::env::remove_var("OPENHUMAN_OLLAMA_BASE_URL"); + } +} + #[tokio::test] async fn assets_status_marks_ollama_unavailable_when_runtime_is_down_even_if_binary_exists() { let _guard = crate::openhuman::inference::inference_test_guard(); diff --git a/src/openhuman/inference/local/service/public_infer.rs b/src/openhuman/inference/local/service/public_infer.rs index 07306a7d01..c2cf6243f4 100644 --- a/src/openhuman/inference/local/service/public_infer.rs +++ b/src/openhuman/inference/local/service/public_infer.rs @@ -63,6 +63,26 @@ impl LocalAiService { .await } + pub async fn prompt_interactive( + &self, + config: &Config, + prompt: &str, + max_tokens: Option, + no_think: bool, + ) -> Result { + log::trace!("[local_ai] prompt_interactive bypasses scheduler_gate permit"); + if !config.local_ai.runtime_enabled { + return Err("local ai is disabled".to_string()); + } + let system = if no_think { + "You are a concise assistant. Return only the final answer. Do not include reasoning or chain-of-thought." + } else { + "You are a helpful assistant." + }; + self.inference_interactive(config, system, prompt, max_tokens.or(Some(160)), no_think) + .await + } + pub async fn inline_complete( &self, config: &Config, @@ -94,9 +114,11 @@ impl LocalAiService { /// turn against it than show stale or empty completions for the /// duration of the backfill. /// - /// This is the only path inside [`LocalAiService`] that opts out of - /// the gate. Every other entry point (`inference`, `prompt`, - /// `summarize`, `inline_complete`, `vision_prompt`, `embed`) + /// Along with [`Self::prompt_interactive`] and + /// [`Self::chat_with_history_interactive`], this is one of the paths + /// inside [`LocalAiService`] that opts out of the gate. Every other + /// entry point (`inference`, `prompt`, `summarize`, + /// `inline_complete`, `vision_prompt`, `embed`, `chat_with_history`) /// acquires before talking to Ollama. pub async fn inline_complete_interactive( &self, @@ -185,6 +207,180 @@ impl LocalAiService { Ok(sanitize_inline_completion(&raw, context)) } + /// Multi-turn chat completion via Ollama /api/chat. + /// Messages are `[{role: "user"|"assistant"|"system", content: "..."}]`. + /// Returns the assistant reply string. + pub(crate) async fn chat_with_history( + &self, + config: &Config, + messages: Vec, + max_tokens: Option, + ) -> Result { + self.chat_with_history_internal(config, messages, max_tokens, true) + .await + } + + pub(crate) async fn chat_with_history_interactive( + &self, + config: &Config, + messages: Vec, + max_tokens: Option, + ) -> Result { + log::trace!("[local_ai] chat_with_history_interactive bypasses scheduler_gate permit"); + self.chat_with_history_internal(config, messages, max_tokens, false) + .await + } + + async fn chat_with_history_internal( + &self, + config: &Config, + messages: Vec, + max_tokens: Option, + gated: bool, + ) -> Result { + if !config.local_ai.runtime_enabled { + return Err("local ai is disabled".to_string()); + } + + if !matches!(self.status.lock().state.as_str(), "ready") { + self.bootstrap(config).await; + } + + if messages.is_empty() { + return Err("messages must not be empty".to_string()); + } + + let _gate_permit = if gated { + crate::openhuman::scheduler_gate::wait_for_capacity().await + } else { + None + }; + + if provider_from_config(config) == LocalAiProvider::LmStudio { + let started = std::time::Instant::now(); + let lm_messages = messages + .into_iter() + .map( + |message| crate::openhuman::inference::local::lm_studio::LmStudioChatMessage { + role: message.role, + content: message.content, + }, + ) + .collect(); + let outcome = self + .lm_studio_chat_completion( + config, + lm_messages, + max_tokens, + config.default_temperature as f32, + false, + ) + .await?; + let elapsed_ms = started.elapsed().as_millis() as u64; + { + let mut status = self.status.lock(); + status.state = "ready".to_string(); + status.last_latency_ms = Some(elapsed_ms); + status.prompt_toks_per_sec = None; + status.gen_toks_per_sec = None; + status.warning = None; + } + tracing::debug!( + elapsed_ms, + prompt_tokens = ?outcome.prompt_tokens, + completion_tokens = ?outcome.completion_tokens, + reply_len = outcome.reply.len(), + "[local_ai:chat] lm studio /v1/chat/completions done" + ); + return Ok(outcome.reply); + } + + tracing::debug!( + message_count = messages.len(), + model = %crate::openhuman::inference::model_ids::effective_chat_model_id(config), + "[local_ai:chat] sending to ollama /api/chat" + ); + + let started = std::time::Instant::now(); + + let body = crate::openhuman::inference::local::ollama::OllamaChatRequest { + model: crate::openhuman::inference::model_ids::effective_chat_model_id(config), + messages, + stream: false, + options: Some( + crate::openhuman::inference::local::ollama::OllamaGenerateOptions { + temperature: Some(config.default_temperature as f32), + top_k: Some(40), + top_p: Some(0.9), + num_predict: max_tokens.map(|v| v as i32), + }, + ), + }; + + let base_url = ollama_base_url_from_config(config); + let response = self + .http + .post(format!("{base_url}/api/chat")) + .json(&body) + .send() + .await + .map_err(|e| { + external_ollama_request_error_with_url("ollama chat request failed", &e, &base_url) + })?; + + if !response.status().is_success() { + let status = response.status(); + let body = response.text().await.unwrap_or_default(); + let detail = body.trim(); + return Err(format!( + "ollama chat failed with status {}{}", + status, + if detail.is_empty() { + String::new() + } else { + format!(": {detail}") + } + )); + } + + let payload: crate::openhuman::inference::local::ollama::OllamaChatResponse = response + .json() + .await + .map_err(|e| format!("ollama chat response parse failed: {e}"))?; + + let elapsed_ms = started.elapsed().as_millis() as u64; + let prompt_tps = payload + .prompt_eval_count + .zip(payload.prompt_eval_duration) + .and_then(|(count, dur_ns)| ns_to_tps(count as f32, dur_ns)); + let gen_tps = payload + .eval_count + .zip(payload.eval_duration) + .and_then(|(count, dur_ns)| ns_to_tps(count as f32, dur_ns)); + + { + let mut status = self.status.lock(); + status.state = "ready".to_string(); + status.last_latency_ms = Some(elapsed_ms); + status.prompt_toks_per_sec = prompt_tps; + status.gen_toks_per_sec = gen_tps; + status.warning = None; + } + + tracing::debug!( + elapsed_ms, + reply_len = payload.message.content.len(), + "[local_ai:chat] ollama /api/chat done" + ); + + let reply = payload.message.content.trim().to_string(); + if reply.is_empty() { + Err("ollama returned empty reply".to_string()) + } else { + Ok(reply) + } + } + pub(crate) async fn inference( &self, config: &Config, diff --git a/src/openhuman/inference/local/service/public_infer_tests.rs b/src/openhuman/inference/local/service/public_infer_tests.rs index 27d92d3dc3..f9e9fa6b63 100644 --- a/src/openhuman/inference/local/service/public_infer_tests.rs +++ b/src/openhuman/inference/local/service/public_infer_tests.rs @@ -310,6 +310,97 @@ async fn inline_complete_interactive_does_not_block_on_held_permit() { ); } +#[tokio::test] +async fn prompt_interactive_does_not_block_on_held_permit() { + let _guard = crate::openhuman::inference::inference_test_guard(); + + let _held = crate::openhuman::scheduler_gate::gate::try_acquire_llm_permit() + .expect("test must start with a free permit"); + + let app = Router::new().route( + "/api/generate", + post(|Json(_body): Json| async move { + Json(json!({ + "model": "test", + "response": "hello from mock", + "done": true + })) + }), + ); + let base = spawn_mock(app).await; + unsafe { + std::env::set_var("OPENHUMAN_OLLAMA_BASE_URL", &base); + } + + let config = enabled_config(); + let service = ready_service(&config); + + let result = tokio::time::timeout( + std::time::Duration::from_secs(2), + service.prompt_interactive(&config, "hi", Some(16), true), + ) + .await; + + unsafe { + std::env::remove_var("OPENHUMAN_OLLAMA_BASE_URL"); + } + + let reply = result + .expect("interactive prompt must not block on a held permit") + .expect("interactive prompt response"); + assert_eq!(reply, "hello from mock"); +} + +#[tokio::test] +async fn chat_with_history_interactive_does_not_block_on_held_permit() { + let _guard = crate::openhuman::inference::inference_test_guard(); + + let _held = crate::openhuman::scheduler_gate::gate::try_acquire_llm_permit() + .expect("test must start with a free permit"); + + let app = Router::new().route( + "/api/chat", + post(|Json(_body): Json| async move { + Json(json!({ + "model": "test", + "message": { "role": "assistant", "content": "history reply" }, + "done": true + })) + }), + ); + let base = spawn_mock(app).await; + unsafe { + std::env::set_var("OPENHUMAN_OLLAMA_BASE_URL", &base); + } + + let config = enabled_config(); + let service = ready_service(&config); + + let result = tokio::time::timeout( + std::time::Duration::from_secs(2), + service.chat_with_history_interactive( + &config, + vec![ + crate::openhuman::inference::local::ollama::OllamaChatMessage { + role: "user".to_string(), + content: "hi".to_string(), + }, + ], + Some(16), + ), + ) + .await; + + unsafe { + std::env::remove_var("OPENHUMAN_OLLAMA_BASE_URL"); + } + + let reply = result + .expect("interactive chat must not block on a held permit") + .expect("interactive chat response"); + assert_eq!(reply, "history reply"); +} + /// Counterpart: the gated `inline_complete` (and `prompt`/`summarize`) /// MUST queue behind a held permit. We assert this with a try-style /// race: spawn the gated call, give it time to enter the wait, then diff --git a/src/openhuman/inference/model_ids.rs b/src/openhuman/inference/model_ids.rs index a407df65d2..d325faab28 100644 --- a/src/openhuman/inference/model_ids.rs +++ b/src/openhuman/inference/model_ids.rs @@ -15,9 +15,9 @@ pub(crate) const DEFAULT_OLLAMA_VISION_MODEL: &str = ""; pub(crate) const DEFAULT_LOW_VISION_MODEL: &str = "moondream:1.8b-v2-q4_K_S"; pub(crate) const DEFAULT_OLLAMA_EMBED_MODEL: &str = "bge-m3"; -/// Chat models allowed in the current MVP build (2–4 GB tier only). +/// Chat models allowed in the current local Ollama build. /// Any resolved chat model ID not listed here is redirected to `MVP_DEFAULT_CHAT_MODEL`. -const MVP_ALLOWED_CHAT_MODELS: &[&str] = &["gemma3:1b-it-qat"]; +const MVP_ALLOWED_CHAT_MODELS: &[&str] = &["gemma3:1b-it-qat", "gemma4:e4b-it-q8_0"]; const MVP_DEFAULT_CHAT_MODEL: &str = "gemma3:1b-it-qat"; /// Vision models allowed in MVP — only disabled (empty string) since the @@ -204,6 +204,13 @@ mod tests { assert_eq!(effective_chat_model_id(&config), "gemma3:1b-it-qat"); } + #[test] + fn chat_model_allows_requested_ollama_gemma4_q8() { + let mut config = test_config(); + config.local_ai.chat_model_id = "gemma4:e4b-it-q8_0".to_string(); + assert_eq!(effective_chat_model_id(&config), "gemma4:e4b-it-q8_0"); + } + #[test] fn chat_model_allows_custom_ids_for_lm_studio() { let mut config = test_config(); @@ -230,7 +237,7 @@ mod tests { #[test] fn chat_model_rejects_non_mvp_models() { let mut config = test_config(); - // All models outside the single MVP-allowed model are rejected. + // All models outside the local allowlist are rejected. config.local_ai.chat_model_id = "gemma3:4b-it-qat".to_string(); assert_eq!(effective_chat_model_id(&config), MVP_DEFAULT_CHAT_MODEL);