From 3cd559fd0d548b2aab52ea1626b1dfe44e65e026 Mon Sep 17 00:00:00 2001 From: Developer Date: Sun, 24 May 2026 17:18:11 +0800 Subject: [PATCH 1/5] Fix signed-out Ollama prompt and chat --- src/openhuman/inference/local/ops.rs | 4 +- .../inference/local/service/public_infer.rs | 47 +++++++++- .../local/service/public_infer_tests.rs | 87 +++++++++++++++++++ src/openhuman/inference/model_ids.rs | 13 ++- 4 files changed, 144 insertions(+), 7 deletions(-) diff --git a/src/openhuman/inference/local/ops.rs b/src/openhuman/inference/local/ops.rs index f3b64ecb57..10ad801f2f 100644 --- a/src/openhuman/inference/local/ops.rs +++ b/src/openhuman/inference/local/ops.rs @@ -197,7 +197,7 @@ pub async fn local_ai_prompt( service.bootstrap(config).await; } let output = service - .prompt(config, prompt.trim(), max_tokens, no_think.unwrap_or(true)) + .prompt_interactive(config, prompt.trim(), max_tokens, no_think.unwrap_or(true)) .await .map_err(|e| e.to_string())?; Ok(RpcOutcome::single_log(output, "local ai prompt completed")) @@ -412,7 +412,7 @@ pub async fn local_ai_chat( let service = local_ai::global(config); let reply = service - .chat_with_history(config, ollama_messages, max_tokens) + .chat_with_history_interactive(config, ollama_messages, max_tokens) .await?; tracing::debug!( diff --git a/src/openhuman/inference/local/service/public_infer.rs b/src/openhuman/inference/local/service/public_infer.rs index 3aea0c46b1..796aa7dc1c 100644 --- a/src/openhuman/inference/local/service/public_infer.rs +++ b/src/openhuman/inference/local/service/public_infer.rs @@ -63,6 +63,25 @@ impl LocalAiService { .await } + pub async fn prompt_interactive( + &self, + config: &Config, + prompt: &str, + max_tokens: Option, + no_think: bool, + ) -> Result { + if !config.local_ai.runtime_enabled { + return Err("local ai is disabled".to_string()); + } + let system = if no_think { + "You are a concise assistant. Return only the final answer. Do not include reasoning or chain-of-thought." + } else { + "You are a helpful assistant." + }; + self.inference_interactive(config, system, prompt, max_tokens.or(Some(160)), no_think) + .await + } + pub async fn inline_complete( &self, config: &Config, @@ -193,6 +212,27 @@ impl LocalAiService { config: &Config, messages: Vec, max_tokens: Option, + ) -> Result { + self.chat_with_history_internal(config, messages, max_tokens, true) + .await + } + + pub(crate) async fn chat_with_history_interactive( + &self, + config: &Config, + messages: Vec, + max_tokens: Option, + ) -> Result { + self.chat_with_history_internal(config, messages, max_tokens, false) + .await + } + + async fn chat_with_history_internal( + &self, + config: &Config, + messages: Vec, + max_tokens: Option, + gated: bool, ) -> Result { if !config.local_ai.runtime_enabled { return Err("local ai is disabled".to_string()); @@ -206,8 +246,11 @@ impl LocalAiService { return Err("messages must not be empty".to_string()); } - // Multi-turn local chat is background LLM-bound work — gate it. - let _gate_permit = crate::openhuman::scheduler_gate::wait_for_capacity().await; + let _gate_permit = if gated { + crate::openhuman::scheduler_gate::wait_for_capacity().await + } else { + None + }; if provider_from_config(config) == LocalAiProvider::LmStudio { let started = std::time::Instant::now(); diff --git a/src/openhuman/inference/local/service/public_infer_tests.rs b/src/openhuman/inference/local/service/public_infer_tests.rs index 8931438e2c..3ed591d42e 100644 --- a/src/openhuman/inference/local/service/public_infer_tests.rs +++ b/src/openhuman/inference/local/service/public_infer_tests.rs @@ -351,6 +351,93 @@ async fn inline_complete_interactive_does_not_block_on_held_permit() { ); } +#[tokio::test] +async fn prompt_interactive_does_not_block_when_signed_out() { + let _guard = crate::openhuman::inference::inference_test_guard(); + + let app = Router::new().route( + "/api/generate", + post(|Json(_body): Json| async move { + Json(json!({ + "model": "test", + "response": "hello from mock", + "done": true + })) + }), + ); + let base = spawn_mock(app).await; + unsafe { + std::env::set_var("OPENHUMAN_OLLAMA_BASE_URL", &base); + } + + let config = enabled_config(); + crate::openhuman::scheduler_gate::init_global(&config); + let _signed_out = crate::openhuman::scheduler_gate::SignedOutTestGuard::set(true); + let service = ready_service(&config); + + let result = tokio::time::timeout( + std::time::Duration::from_secs(2), + service.prompt_interactive(&config, "hi", Some(16), true), + ) + .await; + + unsafe { + std::env::remove_var("OPENHUMAN_OLLAMA_BASE_URL"); + } + + let reply = result + .expect("interactive prompt must not block when scheduler gate is signed out") + .expect("interactive prompt response"); + assert_eq!(reply, "hello from mock"); +} + +#[tokio::test] +async fn chat_with_history_interactive_does_not_block_when_signed_out() { + let _guard = crate::openhuman::inference::inference_test_guard(); + + let app = Router::new().route( + "/api/chat", + post(|Json(_body): Json| async move { + Json(json!({ + "model": "test", + "message": { "role": "assistant", "content": "history reply" }, + "done": true + })) + }), + ); + let base = spawn_mock(app).await; + unsafe { + std::env::set_var("OPENHUMAN_OLLAMA_BASE_URL", &base); + } + + let config = enabled_config(); + crate::openhuman::scheduler_gate::init_global(&config); + let _signed_out = crate::openhuman::scheduler_gate::SignedOutTestGuard::set(true); + let service = ready_service(&config); + + let result = tokio::time::timeout( + std::time::Duration::from_secs(2), + service.chat_with_history_interactive( + &config, + vec![crate::openhuman::inference::local::ollama::OllamaChatMessage { + role: "user".to_string(), + content: "hi".to_string(), + }], + Some(16), + ), + ) + .await; + + unsafe { + std::env::remove_var("OPENHUMAN_OLLAMA_BASE_URL"); + } + + let reply = result + .expect("interactive chat must not block when scheduler gate is signed out") + .expect("interactive chat response"); + assert_eq!(reply, "history reply"); +} + /// Counterpart: the gated `inline_complete` (and `prompt`/`summarize`) /// MUST queue behind a held permit. We assert this with a try-style /// race: spawn the gated call, give it time to enter the wait, then diff --git a/src/openhuman/inference/model_ids.rs b/src/openhuman/inference/model_ids.rs index a407df65d2..d325faab28 100644 --- a/src/openhuman/inference/model_ids.rs +++ b/src/openhuman/inference/model_ids.rs @@ -15,9 +15,9 @@ pub(crate) const DEFAULT_OLLAMA_VISION_MODEL: &str = ""; pub(crate) const DEFAULT_LOW_VISION_MODEL: &str = "moondream:1.8b-v2-q4_K_S"; pub(crate) const DEFAULT_OLLAMA_EMBED_MODEL: &str = "bge-m3"; -/// Chat models allowed in the current MVP build (2–4 GB tier only). +/// Chat models allowed in the current local Ollama build. /// Any resolved chat model ID not listed here is redirected to `MVP_DEFAULT_CHAT_MODEL`. -const MVP_ALLOWED_CHAT_MODELS: &[&str] = &["gemma3:1b-it-qat"]; +const MVP_ALLOWED_CHAT_MODELS: &[&str] = &["gemma3:1b-it-qat", "gemma4:e4b-it-q8_0"]; const MVP_DEFAULT_CHAT_MODEL: &str = "gemma3:1b-it-qat"; /// Vision models allowed in MVP — only disabled (empty string) since the @@ -204,6 +204,13 @@ mod tests { assert_eq!(effective_chat_model_id(&config), "gemma3:1b-it-qat"); } + #[test] + fn chat_model_allows_requested_ollama_gemma4_q8() { + let mut config = test_config(); + config.local_ai.chat_model_id = "gemma4:e4b-it-q8_0".to_string(); + assert_eq!(effective_chat_model_id(&config), "gemma4:e4b-it-q8_0"); + } + #[test] fn chat_model_allows_custom_ids_for_lm_studio() { let mut config = test_config(); @@ -230,7 +237,7 @@ mod tests { #[test] fn chat_model_rejects_non_mvp_models() { let mut config = test_config(); - // All models outside the single MVP-allowed model are rejected. + // All models outside the local allowlist are rejected. config.local_ai.chat_model_id = "gemma3:4b-it-qat".to_string(); assert_eq!(effective_chat_model_id(&config), MVP_DEFAULT_CHAT_MODEL); From 1ed33aa54d3bf77677e015afafec7528d3a9d155 Mon Sep 17 00:00:00 2001 From: Developer Date: Sun, 24 May 2026 17:19:09 +0800 Subject: [PATCH 2/5] Format Ollama inference regression test --- .../inference/local/service/public_infer_tests.rs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/openhuman/inference/local/service/public_infer_tests.rs b/src/openhuman/inference/local/service/public_infer_tests.rs index 3ed591d42e..d73a33b7fc 100644 --- a/src/openhuman/inference/local/service/public_infer_tests.rs +++ b/src/openhuman/inference/local/service/public_infer_tests.rs @@ -419,10 +419,12 @@ async fn chat_with_history_interactive_does_not_block_when_signed_out() { std::time::Duration::from_secs(2), service.chat_with_history_interactive( &config, - vec![crate::openhuman::inference::local::ollama::OllamaChatMessage { - role: "user".to_string(), - content: "hi".to_string(), - }], + vec![ + crate::openhuman::inference::local::ollama::OllamaChatMessage { + role: "user".to_string(), + content: "hi".to_string(), + }, + ], Some(16), ), ) From 946d4d5f525bb9682f2311632f35c97b2836b0fa Mon Sep 17 00:00:00 2001 From: Developer Date: Sun, 24 May 2026 17:43:39 +0800 Subject: [PATCH 3/5] Isolate interactive inference permit tests --- .../local/service/public_infer_tests.rs | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/openhuman/inference/local/service/public_infer_tests.rs b/src/openhuman/inference/local/service/public_infer_tests.rs index d73a33b7fc..226477d937 100644 --- a/src/openhuman/inference/local/service/public_infer_tests.rs +++ b/src/openhuman/inference/local/service/public_infer_tests.rs @@ -352,9 +352,12 @@ async fn inline_complete_interactive_does_not_block_on_held_permit() { } #[tokio::test] -async fn prompt_interactive_does_not_block_when_signed_out() { +async fn prompt_interactive_does_not_block_on_held_permit() { let _guard = crate::openhuman::inference::inference_test_guard(); + let _held = crate::openhuman::scheduler_gate::gate::try_acquire_llm_permit() + .expect("test must start with a free permit"); + let app = Router::new().route( "/api/generate", post(|Json(_body): Json| async move { @@ -371,8 +374,6 @@ async fn prompt_interactive_does_not_block_when_signed_out() { } let config = enabled_config(); - crate::openhuman::scheduler_gate::init_global(&config); - let _signed_out = crate::openhuman::scheduler_gate::SignedOutTestGuard::set(true); let service = ready_service(&config); let result = tokio::time::timeout( @@ -386,15 +387,18 @@ async fn prompt_interactive_does_not_block_when_signed_out() { } let reply = result - .expect("interactive prompt must not block when scheduler gate is signed out") + .expect("interactive prompt must not block on a held permit") .expect("interactive prompt response"); assert_eq!(reply, "hello from mock"); } #[tokio::test] -async fn chat_with_history_interactive_does_not_block_when_signed_out() { +async fn chat_with_history_interactive_does_not_block_on_held_permit() { let _guard = crate::openhuman::inference::inference_test_guard(); + let _held = crate::openhuman::scheduler_gate::gate::try_acquire_llm_permit() + .expect("test must start with a free permit"); + let app = Router::new().route( "/api/chat", post(|Json(_body): Json| async move { @@ -411,8 +415,6 @@ async fn chat_with_history_interactive_does_not_block_when_signed_out() { } let config = enabled_config(); - crate::openhuman::scheduler_gate::init_global(&config); - let _signed_out = crate::openhuman::scheduler_gate::SignedOutTestGuard::set(true); let service = ready_service(&config); let result = tokio::time::timeout( @@ -435,7 +437,7 @@ async fn chat_with_history_interactive_does_not_block_when_signed_out() { } let reply = result - .expect("interactive chat must not block when scheduler gate is signed out") + .expect("interactive chat must not block on a held permit") .expect("interactive chat response"); assert_eq!(reply, "history reply"); } From d592e92df5acba44ce1a69041c981df5c979717e Mon Sep 17 00:00:00 2001 From: Developer Date: Sun, 24 May 2026 22:46:51 +0800 Subject: [PATCH 4/5] Probe Ollama readiness with GET /api/tags --- .env.example | 3 ++ .../inference/local/service/ollama_admin.rs | 2 +- .../local/service/ollama_admin_tests.rs | 32 +++++++++++++++++++ 3 files changed, 36 insertions(+), 1 deletion(-) diff --git a/.env.example b/.env.example index 698664938e..2fd33e5baf 100644 --- a/.env.example +++ b/.env.example @@ -148,6 +148,9 @@ OPENHUMAN_PROXY_SERVICES= # [optional] Override selected model tier: low, medium, high # Applies the corresponding preset at config load time (overrides config.toml). OPENHUMAN_LOCAL_AI_TIER= +# [optional] Override Ollama's HTTP server base URL (default: http://localhost:11434). +# Useful when Ollama runs in another container, on another host, or on a non-default port. +# OPENHUMAN_OLLAMA_BASE_URL=http://127.0.0.1:11434 # [optional] Override LM Studio's OpenAI-compatible local server base URL. # Defaults to http://localhost:1234/v1 when local_ai.provider = "lm_studio". OPENHUMAN_LM_STUDIO_BASE_URL= diff --git a/src/openhuman/inference/local/service/ollama_admin.rs b/src/openhuman/inference/local/service/ollama_admin.rs index a7639d84c6..5fd83e84cd 100644 --- a/src/openhuman/inference/local/service/ollama_admin.rs +++ b/src/openhuman/inference/local/service/ollama_admin.rs @@ -1302,7 +1302,7 @@ impl LocalAiService { async fn ollama_runner_ok_at(&self, base_url: &str) -> bool { let resp = self .http - .post(format!("{base_url}/api/tags")) + .get(format!("{base_url}/api/tags")) .timeout(std::time::Duration::from_secs(3)) .send() .await; diff --git a/src/openhuman/inference/local/service/ollama_admin_tests.rs b/src/openhuman/inference/local/service/ollama_admin_tests.rs index 86c38ab2b2..b1eb3ab744 100644 --- a/src/openhuman/inference/local/service/ollama_admin_tests.rs +++ b/src/openhuman/inference/local/service/ollama_admin_tests.rs @@ -247,6 +247,38 @@ async fn ensure_ollama_server_reports_broken_external_runner_without_restart_att ); } +#[tokio::test] +async fn ensure_ollama_server_accepts_healthy_external_runner() { + let _guard = crate::openhuman::inference::inference_test_guard(); + + let app = Router::new() + .route("/api/tags", get(|| async { Json(json!({ "models": [] })) })) + .route( + "/api/show", + axum::routing::post(|| async { + ( + axum::http::StatusCode::NOT_FOUND, + Json(json!({ "error": "model '___nonexistent_probe___' not found" })), + ) + }), + ); + let base = spawn_mock(app).await; + unsafe { + std::env::set_var("OPENHUMAN_OLLAMA_BASE_URL", &base); + } + + let config = Config::default(); + let service = LocalAiService::new(&config); + service + .ensure_ollama_server(&config) + .await + .expect("healthy external runner should pass"); + + unsafe { + std::env::remove_var("OPENHUMAN_OLLAMA_BASE_URL"); + } +} + #[tokio::test] async fn assets_status_marks_ollama_unavailable_when_runtime_is_down_even_if_binary_exists() { let _guard = crate::openhuman::inference::inference_test_guard(); From dde7bc92d4da49d058ff5b7a8818a39ecd600c1d Mon Sep 17 00:00:00 2001 From: Steven Enamakel Date: Sun, 24 May 2026 22:11:46 -0700 Subject: [PATCH 5/5] fix(inference): add diagnosis logging and update stale docs on interactive paths - Add trace logs to prompt_interactive and chat_with_history_interactive indicating scheduler gate bypass (addresses @Copilot and @coderabbitai on public_infer.rs:80 and public_infer.rs:220-228) - Update inline_complete_interactive doc comment to reflect that prompt_interactive and chat_with_history_interactive are also ungated --- src/openhuman/inference/local/service/public_infer.rs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/openhuman/inference/local/service/public_infer.rs b/src/openhuman/inference/local/service/public_infer.rs index 796aa7dc1c..c2cf6243f4 100644 --- a/src/openhuman/inference/local/service/public_infer.rs +++ b/src/openhuman/inference/local/service/public_infer.rs @@ -70,6 +70,7 @@ impl LocalAiService { max_tokens: Option, no_think: bool, ) -> Result { + log::trace!("[local_ai] prompt_interactive bypasses scheduler_gate permit"); if !config.local_ai.runtime_enabled { return Err("local ai is disabled".to_string()); } @@ -113,9 +114,11 @@ impl LocalAiService { /// turn against it than show stale or empty completions for the /// duration of the backfill. /// - /// This is the only path inside [`LocalAiService`] that opts out of - /// the gate. Every other entry point (`inference`, `prompt`, - /// `summarize`, `inline_complete`, `vision_prompt`, `embed`) + /// Along with [`Self::prompt_interactive`] and + /// [`Self::chat_with_history_interactive`], this is one of the paths + /// inside [`LocalAiService`] that opts out of the gate. Every other + /// entry point (`inference`, `prompt`, `summarize`, + /// `inline_complete`, `vision_prompt`, `embed`, `chat_with_history`) /// acquires before talking to Ollama. pub async fn inline_complete_interactive( &self, @@ -223,6 +226,7 @@ impl LocalAiService { messages: Vec, max_tokens: Option, ) -> Result { + log::trace!("[local_ai] chat_with_history_interactive bypasses scheduler_gate permit"); self.chat_with_history_internal(config, messages, max_tokens, false) .await }