From 292dcc6a5ddecd0c02767d485c052ba5c2639fed Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=9D=8E=E5=86=A0=E8=BE=B0?= <liguanchen@xiaomi.com>
Date: Fri, 29 May 2026 20:05:41 +0800
Subject: [PATCH 1/2] fix(local-ai): unblock summary diagnostics

---
 app/src/utils/tauriCommands/localAi.ts        |  1 +
 src/openhuman/inference/local/ops.rs          |  2 +-
 .../inference/local/service/ollama_admin.rs   | 30 ++++++--
 .../local/service/ollama_admin_tests.rs       | 76 +++++++++++++++++++
 .../inference/local/service/public_infer.rs   | 28 ++++++-
 .../local/service/public_infer_tests.rs       | 41 ++++++++++
 6 files changed, 167 insertions(+), 11 deletions(-)

diff --git a/app/src/utils/tauriCommands/localAi.ts b/app/src/utils/tauriCommands/localAi.ts
index 7c2ecdca05..b4711adf44 100644
--- a/app/src/utils/tauriCommands/localAi.ts
+++ b/app/src/utils/tauriCommands/localAi.ts
@@ -194,6 +194,7 @@ export interface InstalledModelInfo {
 
 export interface LocalAiDiagnostics {
   ollama_running: boolean;
+  ollama_runner_ok?: boolean;
   ollama_base_url: string;
   ollama_binary_path: string | null;
   vision_mode?: string;
diff --git a/src/openhuman/inference/local/ops.rs b/src/openhuman/inference/local/ops.rs
index efecec0601..7320bc4c51 100644
--- a/src/openhuman/inference/local/ops.rs
+++ b/src/openhuman/inference/local/ops.rs
@@ -200,7 +200,7 @@ pub async fn local_ai_summarize(
         service.bootstrap(config).await;
     }
     let summary = service
-        .summarize(config, text, max_tokens)
+        .summarize_interactive(config, text, max_tokens)
         .await
         .map_err(|e| e.to_string())?;
     Ok(RpcOutcome::single_log(
diff --git a/src/openhuman/inference/local/service/ollama_admin.rs b/src/openhuman/inference/local/service/ollama_admin.rs
index 5fd83e84cd..7d0089c953 100644
--- a/src/openhuman/inference/local/service/ollama_admin.rs
+++ b/src/openhuman/inference/local/service/ollama_admin.rs
@@ -848,6 +848,11 @@ impl LocalAiService {
 
         let base_url = ollama_base_url_from_config(config);
         let healthy = self.ollama_healthy_at(&base_url).await;
+        let runner_ok = if healthy {
+            self.ollama_runner_ok_at(&base_url).await
+        } else {
+            false
+        };
 
         log::debug!(
             "[local_ai] diagnostics: entry base_url={} healthy={}",
@@ -884,11 +889,15 @@ impl LocalAiService {
         // `/api/show` is one bounded round-trip per installed model,
         // fetched concurrently and only on this diagnostics path.
         let model_eligibilities: Vec<ContextEligibility> = if healthy {
-            futures_util::future::join_all(models.iter().map(|m| self.fetch_model_context(&m.name)))
-                .await
-                .into_iter()
-                .map(evaluate_context)
-                .collect()
+            futures_util::future::join_all(
+                models
+                    .iter()
+                    .map(|m| self.fetch_model_context_at(&base_url, &m.name)),
+            )
+            .await
+            .into_iter()
+            .map(evaluate_context)
+            .collect()
         } else {
             Vec::new()
         };
@@ -941,6 +950,12 @@ impl LocalAiService {
                 base_url
             ));
         }
+        if healthy && !runner_ok {
+            issues.push(
+                "Configured Ollama runtime is reachable but cannot execute models. Restart the external runtime and retry."
+                    .to_string(),
+            );
+        }
         if healthy && !chat_found {
             issues.push(format!("Chat model `{}` is not installed", expected_chat));
         }
@@ -1002,6 +1017,7 @@ impl LocalAiService {
 
         Ok(serde_json::json!({
             "ollama_running": healthy,
+            "ollama_runner_ok": runner_ok,
             "ollama_base_url": base_url,
             "ollama_binary_path": binary_path,
             "installed_models": installed_models,
@@ -1112,8 +1128,8 @@ impl LocalAiService {
     /// the metadata key is absent) — the caller maps that to an `Unknown`
     /// eligibility verdict rather than a hard rejection. One bounded HTTP
     /// round-trip per model; only ever invoked from the diagnostics path.
-    async fn fetch_model_context(&self, model: &str) -> Option<u64> {
-        let url = format!("{}/api/show", ollama_base_url());
+    async fn fetch_model_context_at(&self, base_url: &str, model: &str) -> Option<u64> {
+        let url = format!("{}/api/show", base_url.trim_end_matches('/'));
         let resp = self
             .http
             .post(&url)
diff --git a/src/openhuman/inference/local/service/ollama_admin_tests.rs b/src/openhuman/inference/local/service/ollama_admin_tests.rs
index b1eb3ab744..1e11b500c2 100644
--- a/src/openhuman/inference/local/service/ollama_admin_tests.rs
+++ b/src/openhuman/inference/local/service/ollama_admin_tests.rs
@@ -437,6 +437,82 @@ async fn diagnostics_ok_when_expected_models_are_present() {
     }
 }
 
+#[tokio::test]
+async fn diagnostics_reports_broken_runner_even_when_models_are_present() {
+    let _guard = crate::openhuman::inference::inference_test_guard();
+    // Mock Ollama: both expected models are listed, but `/api/show` 500s for the probe model.
+    let config = Config::default();
+    let chat = crate::openhuman::inference::model_ids::effective_chat_model_id(&config);
+    let embedding = crate::openhuman::inference::model_ids::effective_embedding_model_id(&config);
+    let chat_tag = format!("{}:latest", chat);
+    let embed_tag = format!("{}:latest", embedding);
+    let app = Router::new()
+        .route(
+            "/api/tags",
+            get(move || {
+                let chat_tag = chat_tag.clone();
+                let embed_tag = embed_tag.clone();
+                async move {
+                    Json(json!({
+                        "models": [
+                            { "name": chat_tag, "modified_at": "", "size": 1u64, "digest": "d" },
+                            { "name": embed_tag, "modified_at": "", "size": 2u64, "digest": "e" },
+                        ]
+                    }))
+                }
+            }),
+        )
+        .route(
+            "/api/show",
+            axum::routing::post(|Json(body): Json<serde_json::Value>| async move {
+                let model = body["name"]
+                    .as_str()
+                    .or_else(|| body["model"].as_str())
+                    .unwrap_or_default();
+                if model == "___nonexistent_probe___" {
+                    return (
+                        axum::http::StatusCode::INTERNAL_SERVER_ERROR,
+                        "fork/exec /broken/ollama: no such file or directory".to_string(),
+                    );
+                }
+                (
+                    axum::http::StatusCode::OK,
+                    json!({
+                        "model_info": {
+                            "general.architecture": "bert",
+                            "bert.context_length": 8192,
+                        },
+                        "capabilities": ["embedding"],
+                    })
+                    .to_string(),
+                )
+            }),
+        );
+    let base = spawn_mock(app).await;
+    unsafe {
+        std::env::set_var("OPENHUMAN_OLLAMA_BASE_URL", &base);
+    }
+
+    let service = LocalAiService::new(&config);
+    let outcome = service.diagnostics(&config).await;
+    // Clear the override before unwrapping so a failed diagnostics call cannot leak it.
+    unsafe {
+        std::env::remove_var("OPENHUMAN_OLLAMA_BASE_URL");
+    }
+    let diag = outcome.expect("diagnostics");
+    assert_eq!(diag["ollama_running"], true);
+    assert_eq!(diag["ok"], false);
+    let issues = diag["issues"].as_array().cloned().unwrap_or_default();
+    assert!(
+        issues.iter().any(|issue| issue
+            .as_str()
+            .unwrap_or_default()
+            .contains("cannot execute models")),
+        "diagnostics should report the broken Ollama runner, got: {:?}",
+        issues
+    );
+}
+
 #[tokio::test]
 async fn resolve_binary_path_finds_binary_via_ollama_bin_env() {
     let _guard = crate::openhuman::inference::inference_test_guard();
diff --git a/src/openhuman/inference/local/service/public_infer.rs b/src/openhuman/inference/local/service/public_infer.rs
index c2cf6243f4..5b3305a352 100644
--- a/src/openhuman/inference/local/service/public_infer.rs
+++ b/src/openhuman/inference/local/service/public_infer.rs
@@ -44,6 +44,25 @@ impl LocalAiService {
             .await
     }
 
+    pub async fn summarize_interactive(
+        &self,
+        config: &Config,
+        text: &str,
+        max_tokens: Option<u32>,
+    ) -> Result<String, String> {
+        log::trace!("[local_ai] summarize_interactive bypasses scheduler_gate permit");
+        if !config.local_ai.runtime_enabled {
+            return Err("local ai is disabled".to_string());
+        }
+        let system = "You summarize internal assistant context. Keep concise bullet points.";
+        let prompt = format!(
+            "Summarize this text in concise bullet points. Preserve decisions and commitments.\\n\\n{}",
+            text
+        );
+        self.inference_interactive(config, system, &prompt, max_tokens.or(Some(128)), true)
+            .await
+    }
+
     pub async fn prompt(
         &self,
         config: &Config,
@@ -114,7 +133,8 @@ impl LocalAiService {
     /// turn against it than show stale or empty completions for the
     /// duration of the backfill.
     ///
-    /// Along with [`Self::prompt_interactive`] and
+    /// Along with [`Self::prompt_interactive`],
+    /// [`Self::summarize_interactive`], and
     /// [`Self::chat_with_history_interactive`], this is one of the paths
     /// inside [`LocalAiService`] that opts out of the gate. Every other
     /// entry point (`inference`, `prompt`, `summarize`,
@@ -397,13 +417,15 @@ impl LocalAiService {
     /// the scheduler gate's LLM permit**.
     ///
     /// Used by user-arrival paths where the user is staring at the
-    /// output (push-to-talk dictation cleanup, in particular). If we
+    /// output (push-to-talk dictation cleanup and debug summary tests, in
+    /// particular). If we
     /// queue these behind a long-running memory backfill, the user
     /// experiences a frozen UI; better to race the call against
     /// background work and accept the contention than to silently
     /// degrade interactivity.
     ///
-    /// Sibling to [`Self::inline_complete_interactive`] for autocomplete.
+    /// Sibling to [`Self::inline_complete_interactive`] for autocomplete and
+    /// [`Self::summarize_interactive`] for explicit debug summary requests.
     /// Every other entry point (`inference`, `prompt`, `summarize`,
     /// `inline_complete`, `vision_prompt`, `embed`, `chat_with_history`)
     /// remains gated.
diff --git a/src/openhuman/inference/local/service/public_infer_tests.rs b/src/openhuman/inference/local/service/public_infer_tests.rs
index f9e9fa6b63..0fa8539387 100644
--- a/src/openhuman/inference/local/service/public_infer_tests.rs
+++ b/src/openhuman/inference/local/service/public_infer_tests.rs
@@ -351,6 +351,47 @@ async fn prompt_interactive_does_not_block_on_held_permit() {
     assert_eq!(reply, "hello from mock");
 }
 
+#[tokio::test]
+async fn summarize_interactive_does_not_block_on_held_permit() {
+    let _guard = crate::openhuman::inference::inference_test_guard();
+    // Take the scheduler gate's LLM permit and keep it (`_held`) until the test ends.
+    let _held = crate::openhuman::scheduler_gate::gate::try_acquire_llm_permit()
+        .expect("test must start with a free permit");
+
+    let app = Router::new().route(
+        "/api/generate",
+        post(|Json(_body): Json<serde_json::Value>| async move {
+            Json(json!({
+                "model": "test",
+                "response": "summary from mock",
+                "done": true
+            }))
+        }),
+    );
+    let base = spawn_mock(app).await;
+    unsafe {
+        std::env::set_var("OPENHUMAN_OLLAMA_BASE_URL", &base);
+    }
+
+    let config = enabled_config();
+    let service = ready_service(&config);
+    // Cap the call at two seconds; the outer `expect` below fails if this timeout elapses.
+    let result = tokio::time::timeout(
+        std::time::Duration::from_secs(2),
+        service.summarize_interactive(&config, "text to summarize", Some(16)),
+    )
+    .await;
+    // Unset the override before unwrapping `result`, so it is removed even on failure.
+    unsafe {
+        std::env::remove_var("OPENHUMAN_OLLAMA_BASE_URL");
+    }
+
+    let reply = result
+        .expect("interactive summary must not block on a held permit")
+        .expect("interactive summary response");
+    assert_eq!(reply, "summary from mock");
+}
+
 #[tokio::test]
 async fn chat_with_history_interactive_does_not_block_on_held_permit() {
     let _guard = crate::openhuman::inference::inference_test_guard();

From 1c3d50216785f902bbf3fff228ad4c41daece69d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=9D=8E=E5=86=A0=E8=BE=B0?= <liguanchen@xiaomi.com>
Date: Fri, 29 May 2026 20:35:53 +0800
Subject: [PATCH 2/2] fix(local-ai): use real newlines in summary prompt

---
 src/openhuman/inference/local/service/public_infer.rs |  4 ++--
 .../inference/local/service/public_infer_tests.rs     | 11 ++++++++++-
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/src/openhuman/inference/local/service/public_infer.rs b/src/openhuman/inference/local/service/public_infer.rs
index 5b3305a352..4b036d086f 100644
--- a/src/openhuman/inference/local/service/public_infer.rs
+++ b/src/openhuman/inference/local/service/public_infer.rs
@@ -37,7 +37,7 @@ impl LocalAiService {
         }
         let system = "You summarize internal assistant context. Keep concise bullet points.";
         let prompt = format!(
-            "Summarize this text in concise bullet points. Preserve decisions and commitments.\\n\\n{}",
+            "Summarize this text in concise bullet points. Preserve decisions and commitments.\n\n{}",
             text
         );
         self.inference(config, system, &prompt, max_tokens.or(Some(128)), true)
@@ -56,7 +56,7 @@ impl LocalAiService {
         }
         let system = "You summarize internal assistant context. Keep concise bullet points.";
         let prompt = format!(
-            "Summarize this text in concise bullet points. Preserve decisions and commitments.\\n\\n{}",
+            "Summarize this text in concise bullet points. Preserve decisions and commitments.\n\n{}",
             text
         );
         self.inference_interactive(config, system, &prompt, max_tokens.or(Some(128)), true)
diff --git a/src/openhuman/inference/local/service/public_infer_tests.rs b/src/openhuman/inference/local/service/public_infer_tests.rs
index 0fa8539387..54c3c97d1e 100644
--- a/src/openhuman/inference/local/service/public_infer_tests.rs
+++ b/src/openhuman/inference/local/service/public_infer_tests.rs
@@ -360,7 +360,16 @@ async fn summarize_interactive_does_not_block_on_held_permit() {
 
     let app = Router::new().route(
         "/api/generate",
-        post(|Json(_body): Json<serde_json::Value>| async move {
+        post(|Json(body): Json<serde_json::Value>| async move {
+            let prompt = body["prompt"].as_str().unwrap_or_default();
+            assert!(
+                prompt.contains("commitments.\n\ntext to summarize"),
+                "summary prompt should use real newlines, got: {prompt:?}"
+            );
+            assert!(
+                !prompt.contains(r"commitments.\n\ntext to summarize"),
+                "summary prompt must not contain literal backslash-n separators"
+            );
             Json(json!({
                 "model": "test",
                 "response": "summary from mock",