From d507a416702151ecc1e735068cccd1af0c6b0802 Mon Sep 17 00:00:00 2001
From: Steven Enamakel <enamakel@tinyhumans.ai>
Date: Sat, 16 May 2026 15:11:51 -0700
Subject: [PATCH 01/18] Refactor inference around external Ollama routing

---
 .../local-model/DeviceCapabilitySection.tsx   |  51 +--
 .../local-model/ModelDownloadSection.tsx      |  18 +-
 .../local-model/ModelStatusSection.test.tsx   | 227 +++++++++---
 .../panels/local-model/ModelStatusSection.tsx | 200 ++---------
 app/src/utils/tauriCommands/localAi.ts        |  13 +-
 .../e2e/specs/local-model-runtime.spec.ts     |  26 +-
 src/core/all.rs                               |   4 +
 src/core/cli_tests.rs                         |   1 +
 src/openhuman/app_state/ops.rs                |   2 +-
 .../channels/providers/presentation.rs        |   3 +-
 src/openhuman/inference/mod.rs                |  15 +
 src/openhuman/inference/ops.rs                |  89 +++++
 src/openhuman/inference/ops_tests.rs          |  96 ++++++
 src/openhuman/inference/schemas.rs            | 325 ++++++++++++++++++
 src/openhuman/inference/schemas_tests.rs      |  78 +++++
 src/openhuman/local_ai/mod.rs                 |   1 +
 src/openhuman/local_ai/ops.rs                 |  80 ++---
 src/openhuman/local_ai/schemas.rs             |  41 +--
 src/openhuman/local_ai/schemas_tests.rs       |  10 +-
 src/openhuman/local_ai/service/assets.rs      |  16 +-
 src/openhuman/local_ai/service/bootstrap.rs   |  30 +-
 .../local_ai/service/ollama_admin.rs          |  81 +----
 .../local_ai/service/ollama_admin_tests.rs    | 113 +++++-
 src/openhuman/local_ai/types.rs               |   8 +-
 src/openhuman/mod.rs                          |   1 +
 src/openhuman/subconscious/executor.rs        |   2 +-
 tests/json_rpc_e2e.rs                         | 180 ++++++++++
 27 files changed, 1207 insertions(+), 504 deletions(-)
 create mode 100644 src/openhuman/inference/mod.rs
 create mode 100644 src/openhuman/inference/ops.rs
 create mode 100644 src/openhuman/inference/ops_tests.rs
 create mode 100644 src/openhuman/inference/schemas.rs
 create mode 100644 src/openhuman/inference/schemas_tests.rs

diff --git a/app/src/components/settings/panels/local-model/DeviceCapabilitySection.tsx b/app/src/components/settings/panels/local-model/DeviceCapabilitySection.tsx
index b910d2e2f9..8fdf648319 100644
--- a/app/src/components/settings/panels/local-model/DeviceCapabilitySection.tsx
+++ b/app/src/components/settings/panels/local-model/DeviceCapabilitySection.tsx
@@ -14,30 +14,15 @@ interface DeviceCapabilitySectionProps {
   formatRamGb: (bytes: number) => string;
   onPresetApplied?: (result: ApplyPresetResult) => void;
   /**
-   * When `false`, the Ollama runtime isn't installed yet. Local tiers
-   * require Ollama, so they're rendered disabled with a notice that
-   * lets the user install Ollama in place. The "Disabled (cloud
-   * fallback)" option stays enabled since it doesn't need Ollama.
+   * When `false`, the external Ollama runtime isn't reachable yet. Local tiers
+   * stay disabled until the user runs Ollama themselves. The "Disabled (cloud
+   * fallback)" option stays enabled since it doesn't depend on Ollama.
    */
   ollamaAvailable?: boolean;
-  /**
-   * Triggers the same install pipeline the Runtime Status section uses.
-   * Wired only when `ollamaAvailable === false` to surface an inline
-   * Install Ollama button next to the locked tiers.
-   */
   onTriggerOllamaInstall?: () => void;
-  /** True while an install pipeline is already running. */
   isTriggeringInstall?: boolean;
-  /**
-   * Live state from `local_ai_status` so the notice can show real install
-   * progress: `installing`, `downloading`, `degraded`, etc. The button's
-   * own `isTriggeringInstall` only covers the RPC round-trip (~ms);
-   * `installState` covers the entire backend pipeline (~60s).
-   */
   installState?: string;
-  /** Latest `status.warning` text — shown under the progress label. */
   installWarning?: string | null;
-  /** Latest `status.error_detail` — shown when state is `degraded`. */
   installError?: string | null;
 }
 
@@ -57,9 +42,13 @@ const DeviceCapabilitySection = ({
   installWarning,
   installError,
 }: DeviceCapabilitySectionProps) => {
-  const installInProgress =
-    installState === 'installing' || installState === 'downloading' || installState === 'loading';
-  const installFailed = installState === 'degraded';
+  void onTriggerOllamaInstall;
+  void isTriggeringInstall;
+  void installState;
+  void installWarning;
+  void installError;
+  const installInProgress = false;
+  const installFailed = false;
   const [applying, setApplying] = useState<string | null>(null);
   const [applyError, setApplyError] = useState<string>('');
   const [applySuccess, setApplySuccess] = useState<ApplyPresetResult | null>(null);
@@ -187,26 +176,18 @@ const DeviceCapabilitySection = ({
           ) : (
             <>
               <div className="text-xs text-amber-800">
-                <span className="font-semibold text-amber-900">Install Ollama first.</span> Local
-                tiers run on the Ollama runtime, which isn&apos;t installed yet. The &ldquo;Disabled
-                (cloud fallback)&rdquo; option stays available either way.
+                <span className="font-semibold text-amber-900">Run Ollama first.</span> Local
+                tiers depend on an externally managed Ollama endpoint. Start it yourself, pull the
+                models you want, and keep using &ldquo;Disabled (cloud fallback)&rdquo; until the
+                runtime is reachable.
               </div>
               <div className="flex items-center gap-2">
-                {onTriggerOllamaInstall && (
-                  <button
-                    type="button"
-                    onClick={onTriggerOllamaInstall}
-                    disabled={isTriggeringInstall}
-                    className="px-3 py-1.5 text-xs rounded-md bg-amber-600 hover:bg-amber-700 disabled:opacity-60 text-white font-medium">
-                    {isTriggeringInstall ? 'Starting…' : 'Install Ollama'}
-                  </button>
-                )}
                 <a
                   href="https://ollama.com"
                   target="_blank"
                   rel="noopener noreferrer"
                   className="px-3 py-1.5 text-xs rounded-md border border-amber-300 hover:border-amber-400 text-amber-800">
-                  Install manually
+                  Ollama docs
                 </a>
               </div>
             </>
@@ -257,7 +238,7 @@ const DeviceCapabilitySection = ({
                 key={preset.tier}
                 onClick={() => void handleApply(preset.tier)}
                 disabled={applying !== null || locked}
-                title={locked ? 'Install Ollama first to use this tier' : undefined}
+                title={locked ? 'Run Ollama first to use this tier' : undefined}
                 className={`w-full text-left rounded-lg border p-3 transition-colors ${
                   isCurrent
                     ? 'border-primary-400 bg-primary-50'
diff --git a/app/src/components/settings/panels/local-model/ModelDownloadSection.tsx b/app/src/components/settings/panels/local-model/ModelDownloadSection.tsx
index 994f2fe42d..5ac567c75f 100644
--- a/app/src/components/settings/panels/local-model/ModelDownloadSection.tsx
+++ b/app/src/components/settings/panels/local-model/ModelDownloadSection.tsx
@@ -126,12 +126,18 @@ const ModelDownloadSection = ({
                 {item?.path && (
                   <div className="text-[10px] text-stone-500 mt-1 break-all">{item.path}</div>
                 )}
-                <button
-                  onClick={() => onTriggerAssetDownload(key)}
-                  disabled={!runtimeEnabled || assetDownloadBusy[key]}
-                  className="mt-2 px-2 py-1 text-[10px] rounded border border-stone-200 hover:border-stone-300 disabled:opacity-60 text-stone-600">
-                  {assetDownloadBusy[key] ? 'Downloading...' : 'Download'}
-                </button>
+                {item?.provider === 'ollama' || item?.provider === 'lm_studio' ? (
+                  <div className="mt-2 text-[10px] text-stone-500">
+                    Manage this model in your external runtime.
+                  </div>
+                ) : (
+                  <button
+                    onClick={() => onTriggerAssetDownload(key)}
+                    disabled={!runtimeEnabled || assetDownloadBusy[key]}
+                    className="mt-2 px-2 py-1 text-[10px] rounded border border-stone-200 hover:border-stone-300 disabled:opacity-60 text-stone-600">
+                    {assetDownloadBusy[key] ? 'Downloading...' : 'Download'}
+                  </button>
+                )}
               </div>
             ))}
           </div>
diff --git a/app/src/components/settings/panels/local-model/ModelStatusSection.test.tsx b/app/src/components/settings/panels/local-model/ModelStatusSection.test.tsx
index 1b34c17d54..c511b8a77a 100644
--- a/app/src/components/settings/panels/local-model/ModelStatusSection.test.tsx
+++ b/app/src/components/settings/panels/local-model/ModelStatusSection.test.tsx
@@ -1,7 +1,7 @@
-import { fireEvent, render, screen } from '@testing-library/react';
+import { render, screen } from '@testing-library/react';
 import { describe, expect, it, vi } from 'vitest';
 
-import type { LocalAiDiagnostics, RepairAction } from '../../../../utils/tauriCommands';
+import type { LocalAiDiagnostics } from '../../../../utils/tauriCommands';
 import ModelStatusSection from './ModelStatusSection';
 
 const defaultProps = {
@@ -55,11 +55,11 @@ const makeDiagnostics = (overrides: Partial<LocalAiDiagnostics> = {}): LocalAiDi
 });
 
 describe('ModelStatusSection diagnostics', () => {
-  it('disables bootstrap controls when runtime is disabled', () => {
+  it('still renders runtime status when runtime is disabled', () => {
     render(<ModelStatusSection {...defaultProps} runtimeEnabled={false} />);
 
-    expect(screen.getByRole('button', { name: 'Bootstrap / Resume' })).toBeDisabled();
-    expect(screen.getByRole('button', { name: 'Force Re-bootstrap' })).toBeDisabled();
+    expect(screen.getByText('Runtime Status')).toBeTruthy();
+    expect(screen.getByText('Refresh')).toBeTruthy();
   });
 
   it('shows the base URL being checked', () => {
@@ -122,64 +122,19 @@ describe('ModelStatusSection diagnostics', () => {
     expect(screen.getByText('/opt/homebrew/bin/ollama')).toBeTruthy();
   });
 
-  it('renders repair action buttons', () => {
-    const repairActions: RepairAction[] = [
-      { action: 'install_ollama' },
-      { action: 'start_server', binary_path: '/usr/local/bin/ollama' },
-      { action: 'pull_model', model: 'gemma3:1b-it-qat' },
-    ];
+  it('renders manual-management guidance when diagnostics fail', () => {
     render(
       <ModelStatusSection
         {...defaultProps}
         diagnostics={makeDiagnostics({
           ok: false,
           issues: ['Ollama server is not running'],
-          repair_actions: repairActions,
         })}
       />
     );
-    expect(screen.getByText('Install Ollama')).toBeTruthy();
-    expect(screen.getByText('Start Server')).toBeTruthy();
-    expect(screen.getByText('Pull gemma3:1b-it-qat')).toBeTruthy();
-  });
-
-  it('calls onRepairAction with the correct action when button is clicked', () => {
-    const onRepairAction = vi.fn();
-    const repairActions: RepairAction[] = [{ action: 'install_ollama' }];
-    render(
-      <ModelStatusSection
-        {...defaultProps}
-        onRepairAction={onRepairAction}
-        diagnostics={makeDiagnostics({
-          ok: false,
-          issues: ['Ollama server is not running'],
-          repair_actions: repairActions,
-        })}
-      />
-    );
-    fireEvent.click(screen.getByText('Install Ollama'));
-    expect(onRepairAction).toHaveBeenCalledWith({ action: 'install_ollama' });
-  });
-
-  it('calls onRepairAction with pull_model action', () => {
-    const onRepairAction = vi.fn();
-    const repairActions: RepairAction[] = [{ action: 'pull_model', model: 'gemma3:1b-it-qat' }];
-    render(
-      <ModelStatusSection
-        {...defaultProps}
-        onRepairAction={onRepairAction}
-        diagnostics={makeDiagnostics({
-          ok: false,
-          issues: ['Chat model is not installed'],
-          repair_actions: repairActions,
-        })}
-      />
-    );
-    fireEvent.click(screen.getByText('Pull gemma3:1b-it-qat'));
-    expect(onRepairAction).toHaveBeenCalledWith({
-      action: 'pull_model',
-      model: 'gemma3:1b-it-qat',
-    });
+    expect(
+      screen.getByText(/Manage the Ollama process and model pulls outside OpenHuman/)
+    ).toBeTruthy();
   });
 
   it('does not render repair actions section when repair_actions is empty', () => {
@@ -212,4 +167,168 @@ describe('ModelStatusSection diagnostics', () => {
     render(<ModelStatusSection {...defaultProps} diagnostics={null} />);
     expect(screen.getByText(/Click.*Run Diagnostics/)).toBeTruthy();
   });
+
+  it('shows external-runtime guidance when ollama is unavailable', () => {
+    render(
+      <ModelStatusSection
+        {...defaultProps}
+        downloads={{
+          state: 'idle',
+          warning: null,
+          progress: 0,
+          downloaded_bytes: null,
+          total_bytes: null,
+          speed_bps: null,
+          eta_seconds: null,
+          ollama_available: false,
+          chat: {
+            id: 'gemma3:1b-it-qat',
+            provider: 'ollama',
+            state: 'missing',
+            progress: null,
+            downloaded_bytes: null,
+            total_bytes: null,
+            speed_bps: null,
+            eta_seconds: null,
+            warning: null,
+            path: null,
+          },
+          vision: {
+            id: '',
+            provider: 'ollama',
+            state: 'missing',
+            progress: null,
+            downloaded_bytes: null,
+            total_bytes: null,
+            speed_bps: null,
+            eta_seconds: null,
+            warning: null,
+            path: null,
+          },
+          embedding: {
+            id: 'bge-m3',
+            provider: 'ollama',
+            state: 'missing',
+            progress: null,
+            downloaded_bytes: null,
+            total_bytes: null,
+            speed_bps: null,
+            eta_seconds: null,
+            warning: null,
+            path: null,
+          },
+          stt: {
+            id: 'whisper',
+            provider: 'whisper',
+            state: 'missing',
+            progress: null,
+            downloaded_bytes: null,
+            total_bytes: null,
+            speed_bps: null,
+            eta_seconds: null,
+            warning: null,
+            path: null,
+          },
+          tts: {
+            id: 'piper',
+            provider: 'piper',
+            state: 'missing',
+            progress: null,
+            downloaded_bytes: null,
+            total_bytes: null,
+            speed_bps: null,
+            eta_seconds: null,
+            warning: null,
+            path: null,
+          },
+        }}
+      />
+    );
+
+    expect(screen.getByText('Ollama runtime unavailable')).toBeTruthy();
+    expect(screen.getByText(/external inference runtime/)).toBeTruthy();
+    expect(screen.getByText('Ollama docs')).toBeTruthy();
+  });
+
+  it('renders docs link instead of install controls when ollama is unavailable', () => {
+    render(
+      <ModelStatusSection
+        {...defaultProps}
+        downloads={{
+          state: 'idle',
+          warning: null,
+          progress: 0,
+          downloaded_bytes: null,
+          total_bytes: null,
+          speed_bps: null,
+          eta_seconds: null,
+          ollama_available: false,
+          chat: {
+            id: 'gemma3:1b-it-qat',
+            provider: 'ollama',
+            state: 'missing',
+            progress: null,
+            downloaded_bytes: null,
+            total_bytes: null,
+            speed_bps: null,
+            eta_seconds: null,
+            warning: null,
+            path: null,
+          },
+          vision: {
+            id: '',
+            provider: 'ollama',
+            state: 'missing',
+            progress: null,
+            downloaded_bytes: null,
+            total_bytes: null,
+            speed_bps: null,
+            eta_seconds: null,
+            warning: null,
+            path: null,
+          },
+          embedding: {
+            id: 'bge-m3',
+            provider: 'ollama',
+            state: 'missing',
+            progress: null,
+            downloaded_bytes: null,
+            total_bytes: null,
+            speed_bps: null,
+            eta_seconds: null,
+            warning: null,
+            path: null,
+          },
+          stt: {
+            id: 'whisper',
+            provider: 'whisper',
+            state: 'missing',
+            progress: null,
+            downloaded_bytes: null,
+            total_bytes: null,
+            speed_bps: null,
+            eta_seconds: null,
+            warning: null,
+            path: null,
+          },
+          tts: {
+            id: 'piper',
+            provider: 'piper',
+            state: 'missing',
+            progress: null,
+            downloaded_bytes: null,
+            total_bytes: null,
+            speed_bps: null,
+            eta_seconds: null,
+            warning: null,
+            path: null,
+          },
+        }}
+      />
+    );
+
+    expect(screen.queryByRole('button', { name: 'Install Ollama' })).toBeNull();
+    expect(screen.queryByRole('button', { name: 'Set Path' })).toBeNull();
+    expect(screen.getByRole('link', { name: 'Ollama docs' })).toBeTruthy();
+  });
 });
diff --git a/app/src/components/settings/panels/local-model/ModelStatusSection.tsx b/app/src/components/settings/panels/local-model/ModelStatusSection.tsx
index a89089d735..9854635a3d 100644
--- a/app/src/components/settings/panels/local-model/ModelStatusSection.tsx
+++ b/app/src/components/settings/panels/local-model/ModelStatusSection.tsx
@@ -37,17 +37,6 @@ interface ModelStatusSectionProps {
   onRepairAction?: (action: RepairAction) => void;
 }
 
-const repairActionLabel = (action: RepairAction): string => {
-  switch (action.action) {
-    case 'install_ollama':
-      return 'Install Ollama';
-    case 'start_server':
-      return 'Start Server';
-    case 'pull_model':
-      return `Pull ${action.model}`;
-  }
-};
-
 const ModelStatusSection = ({
   status,
   downloads,
@@ -78,19 +67,26 @@ const ModelStatusSection = ({
   onRunDiagnostics,
   onRepairAction,
 }: ModelStatusSectionProps) => {
-  // Core reports `ollama_available: false` when no Ollama binary is
-  // discoverable on disk. The backend short-circuits all `has_model` HTTP
-  // probes in that state, so model rows below will all read "missing". Surface
-  // a clear install CTA up front so users don't have to interpret the empty
-  // model state on their own.
+  // OpenHuman no longer installs or launches Ollama itself. When the runtime
+  // is unavailable, surface manual guidance instead of management controls.
   const showInstallOllamaCta = downloads?.ollama_available === false;
 
+  void isTriggeringDownload;
+  void bootstrapMessage;
+  void isInstalling;
+  void isInstallError;
+  void showErrorDetail;
+  void ollamaPathInput;
+  void isSettingPath;
+  void runtimeEnabled;
+  void onTriggerDownload;
+  void onSetOllamaPath;
+  void onClearOllamaPath;
+  void onSetOllamaPathInput;
+  void onToggleErrorDetail;
+  void onRepairAction;
+
   if (showInstallOllamaCta) {
-    // No Ollama on disk — the runtime-status card and diagnostics panels
-    // below would just read "n/a" / "missing" everywhere, which is more
-    // confusing than helpful. Render only the install CTA, with the binary
-    // path setter inline for users who installed Ollama in a non-standard
-    // location that auto-discovery can't find.
     return (
       <section className="rounded-lg border border-amber-300 bg-amber-50 p-4 space-y-3">
         <div className="flex items-start gap-3">
@@ -107,80 +103,22 @@ const ModelStatusSection = ({
             />
           </svg>
           <div className="flex-1 space-y-1">
-            <div className="text-sm font-semibold text-amber-900">Ollama is not installed</div>
+            <div className="text-sm font-semibold text-amber-900">Ollama runtime unavailable</div>
             <div className="text-xs text-amber-800">
-              Local AI features (chat, vision, embedding) need the Ollama runtime. Install it below
-              — the installer runs silently and lands in your workspace; no console window will
-              appear.
+              OpenHuman now treats Ollama as an external inference runtime. Start your own Ollama
+              server, pull the models you want, and point workload routing at it.
             </div>
           </div>
         </div>
         <div className="flex items-center gap-2 pt-1">
-          <button
-            type="button"
-            onClick={() => onTriggerDownload(true)}
-            disabled={isTriggeringDownload}
-            className="px-3 py-1.5 text-xs rounded-md bg-amber-600 hover:bg-amber-700 disabled:opacity-60 text-white font-medium">
-            {isTriggeringDownload ? 'Installing...' : 'Install Ollama'}
-          </button>
           <a
             href="https://ollama.com"
             target="_blank"
             rel="noopener noreferrer"
             className="px-3 py-1.5 text-xs rounded-md border border-amber-300 hover:border-amber-400 text-amber-800">
-            Install manually
+            Ollama docs
           </a>
         </div>
-
-        {isInstallError && status?.error_detail && (
-          <div className="space-y-1 pt-2 border-t border-amber-200">
-            <button
-              type="button"
-              onClick={onToggleErrorDetail}
-              className="text-xs text-red-700 hover:text-red-600 underline">
-              {showErrorDetail ? 'Hide error details' : 'Show install error details'}
-            </button>
-            {showErrorDetail && (
-              <pre className="max-h-40 overflow-auto rounded bg-red-50 border border-red-200 p-2 text-[10px] text-red-700 leading-tight whitespace-pre-wrap break-words">
-                {status.error_detail}
-              </pre>
-            )}
-          </div>
-        )}
-
-        <div className="pt-2 border-t border-amber-200 space-y-1">
-          <div className="text-amber-900 text-xs font-medium">
-            Already installed in a custom location?
-          </div>
-          <div className="text-[11px] text-amber-800">
-            Point us at the binary and we&apos;ll use it instead of running the installer.
-          </div>
-          <div className="flex items-center gap-2 pt-1">
-            <input
-              type="text"
-              value={ollamaPathInput}
-              onChange={e => onSetOllamaPathInput(e.target.value)}
-              placeholder="C:\Users\you\AppData\Local\Programs\Ollama\ollama.exe"
-              className="flex-1 rounded-md border border-amber-300 bg-white px-2 py-1.5 text-xs text-stone-900 placeholder:text-stone-400 focus:border-amber-500 focus:outline-none"
-            />
-            <button
-              type="button"
-              onClick={onSetOllamaPath}
-              disabled={isSettingPath || !ollamaPathInput.trim()}
-              className="px-2 py-1.5 text-xs rounded-md bg-amber-600 hover:bg-amber-700 disabled:opacity-60 text-white whitespace-nowrap">
-              {isSettingPath ? 'Setting...' : 'Set Path'}
-            </button>
-            {ollamaPathInput && (
-              <button
-                type="button"
-                onClick={onClearOllamaPath}
-                disabled={isSettingPath}
-                className="px-2 py-1.5 text-xs rounded-md border border-amber-300 hover:border-amber-400 disabled:opacity-60 text-amber-800 whitespace-nowrap">
-                Clear
-              </button>
-            )}
-          </div>
-        </div>
       </section>
     );
   }
@@ -272,7 +210,7 @@ const ModelStatusSection = ({
           {status?.warning && <div className="text-xs text-amber-700">{status.warning}</div>}
           {statusError && <div className="text-xs text-red-600">{statusError}</div>}
 
-          {isInstallError && status?.error_detail && (
+          {status?.error_detail && (
             <div className="space-y-1">
               <button
                 onClick={onToggleErrorDetail}
@@ -285,7 +223,7 @@ const ModelStatusSection = ({
                 </pre>
               )}
               <p className="text-xs text-stone-500">
-                Install Ollama manually from{' '}
+                OpenHuman only connects to an already-running Ollama-compatible endpoint. See{' '}
                 <a
                   href="https://ollama.com"
                   target="_blank"
@@ -293,73 +231,10 @@ const ModelStatusSection = ({
                   className="text-primary-500 hover:text-primary-600 underline">
                   ollama.com
                 </a>{' '}
-                then set its path below.
+                for setup instructions, then retry after your runtime is reachable.
               </p>
             </div>
           )}
-
-          <div className="space-y-1">
-            <div className="text-stone-500 text-xs uppercase tracking-wide">
-              Ollama Binary Path (optional)
-            </div>
-            <div className="flex items-center gap-2">
-              <input
-                type="text"
-                value={ollamaPathInput}
-                onChange={e => onSetOllamaPathInput(e.target.value)}
-                placeholder="/usr/local/bin/ollama"
-                className="flex-1 rounded-md border border-stone-200 bg-white px-2 py-1.5 text-xs text-stone-900 placeholder:text-stone-400 focus:border-primary-500 focus:outline-none"
-              />
-              <button
-                onClick={onSetOllamaPath}
-                disabled={isSettingPath || !ollamaPathInput.trim()}
-                className="px-2 py-1.5 text-xs rounded-md bg-primary-600 hover:bg-primary-700 disabled:opacity-60 text-white whitespace-nowrap">
-                {isSettingPath ? 'Setting...' : 'Set Path'}
-              </button>
-              {ollamaPathInput && (
-                <button
-                  onClick={onClearOllamaPath}
-                  disabled={isSettingPath}
-                  className="px-2 py-1.5 text-xs rounded-md border border-stone-200 hover:border-stone-300 disabled:opacity-60 text-stone-600 whitespace-nowrap">
-                  Clear
-                </button>
-              )}
-            </div>
-          </div>
-
-          <div className="flex items-center gap-2 pt-1">
-            {status?.state === 'ready' ? (
-              <span className="inline-flex items-center gap-1 px-3 py-1.5 text-xs rounded-md bg-green-50 text-green-700 border border-green-200 font-medium">
-                <svg className="h-3 w-3" fill="none" stroke="currentColor" viewBox="0 0 24 24">
-                  <path
-                    strokeLinecap="round"
-                    strokeLinejoin="round"
-                    strokeWidth={2}
-                    d="M5 13l4 4L19 7"
-                  />
-                </svg>
-                Running
-              </span>
-            ) : (
-              <button
-                onClick={() => onTriggerDownload(false)}
-                disabled={!runtimeEnabled || isTriggeringDownload}
-                className="px-3 py-1.5 text-xs rounded-md bg-primary-600 hover:bg-primary-700 disabled:opacity-60 text-white">
-                {isTriggeringDownload
-                  ? 'Triggering...'
-                  : status?.state === 'degraded'
-                    ? 'Retry Bootstrap'
-                    : 'Bootstrap / Resume'}
-              </button>
-            )}
-            <button
-              onClick={() => onTriggerDownload(true)}
-              disabled={!runtimeEnabled || isTriggeringDownload}
-              className="px-3 py-1.5 text-xs rounded-md border border-stone-200 hover:border-stone-300 disabled:opacity-60 text-stone-600">
-              {isTriggeringDownload ? 'Working...' : 'Force Re-bootstrap'}
-            </button>
-            {bootstrapMessage && <span className="text-xs text-green-600">{bootstrapMessage}</span>}
-          </div>
         </div>
       </section>
 
@@ -376,14 +251,14 @@ const ModelStatusSection = ({
         <div className="bg-stone-50 rounded-lg border border-stone-200 p-4 space-y-3">
           {!diagnostics && !diagnosticsError && (
             <p className="text-xs text-stone-500">
-              Click &ldquo;Run Diagnostics&rdquo; to verify Ollama is running and models are
-              installed.
+              Click &ldquo;Run Diagnostics&rdquo; to verify your external Ollama endpoint is
+              reachable and has the expected models.
             </p>
           )}
           {isDiagnosticsLoading && (
             <div className="flex items-center gap-2 text-xs text-primary-600">
               <div className="h-3 w-3 rounded-full border-2 border-blue-400 border-t-transparent animate-spin" />
-              Checking Ollama server and models...
+              Checking Ollama endpoint and models...
             </div>
           )}
           {diagnosticsError && (
@@ -511,23 +386,10 @@ const ModelStatusSection = ({
                 </div>
               )}
 
-              {diagnostics.repair_actions && diagnostics.repair_actions.length > 0 && (
-                <div>
-                  <div className="text-amber-700 uppercase tracking-wide text-[10px] mb-1">
-                    Suggested Fixes
-                  </div>
-                  <div className="flex flex-wrap gap-2">
-                    {diagnostics.repair_actions.map((action, i) => (
-                      <button
-                        key={i}
-                        onClick={() => onRepairAction?.(action)}
-                        className="px-2.5 py-1 text-xs rounded-md bg-amber-50 border border-amber-300 text-amber-800 hover:bg-amber-100 transition-colors">
-                        {repairActionLabel(action)}
-                      </button>
-                    ))}
-                  </div>
-                </div>
-              )}
+              <div className="text-xs text-stone-500">
+                Manage the Ollama process and model pulls outside OpenHuman, then rerun
+                diagnostics.
+              </div>
             </>
           )}
         </div>
diff --git a/app/src/utils/tauriCommands/localAi.ts b/app/src/utils/tauriCommands/localAi.ts
index c1b487d1cf..432de5f6f0 100644
--- a/app/src/utils/tauriCommands/localAi.ts
+++ b/app/src/utils/tauriCommands/localAi.ts
@@ -1,5 +1,9 @@
 /**
- * Local AI / Ollama commands.
+ * Local AI / Ollama-facing commands routed through the core.
+ *
+ * The renderer never talks to Ollama directly. It always calls the core, and
+ * the core decides whether to route a request to the configured inference
+ * backend (for example an external Ollama endpoint).
  */
 import { callCoreRpc } from '../../services/coreRpcClient';
 import { CommandResponse, isTauri, tauriErrorMessage } from './common';
@@ -51,10 +55,9 @@ export interface LocalAiAssetsStatus {
   tts: LocalAiAssetStatus;
   quantization: string;
   /**
-   * True when the core can find an Ollama binary on disk. When false the UI
-   * should render an "Install Ollama" CTA instead of model state — every
-   * Ollama-backed asset will be reported as `missing` and `/api/tags`
-   * probes are skipped entirely (no 30s timeout).
+   * True when the configured Ollama endpoint is reachable enough for model
+   * checks. When false the UI should render external-runtime guidance instead
+   * of pretending the app can install or launch Ollama itself.
    */
   ollama_available: boolean;
 }
diff --git a/app/test/e2e/specs/local-model-runtime.spec.ts b/app/test/e2e/specs/local-model-runtime.spec.ts
index 71ec471749..aaa2aec0eb 100644
--- a/app/test/e2e/specs/local-model-runtime.spec.ts
+++ b/app/test/e2e/specs/local-model-runtime.spec.ts
@@ -43,9 +43,9 @@ async function waitForAnyText(candidates, timeout = 20_000) {
   return null;
 }
 
-// Local model runtime requires Ollama binary which is not available in the
-// Linux CI Docker container. The "Local model runtime" card and "Manage"
-// button only appear on the home page when Ollama is detected. Skip on Linux.
+// Local model runtime now talks to an external Ollama endpoint through core.
+// CI does not provision a live Ollama server, so keep this spec skipped until
+// a deterministic mockable local-runtime harness exists for WDIO.
 describe.skip('Local model runtime flow', () => {
   before(async () => {
     await startMockServer();
@@ -57,7 +57,7 @@ describe.skip('Local model runtime flow', () => {
     await stopMockServer();
   });
 
-  it('can trigger local model bootstrap from UI and enter active runtime state', async () => {
+  it('shows direct-runtime guidance instead of app-managed bootstrap controls', async () => {
     await triggerAuthDeepLink('e2e-local-model-token');
     await waitForWindowVisible(25_000);
     await waitForWebView(15_000);
@@ -84,14 +84,18 @@ describe.skip('Local model runtime flow', () => {
       'Local model runtime is unavailable in this core build. Restart app after updating to the latest build.';
     expect(await textExists(incompatibleError)).toBe(false);
 
-    await clickText('Bootstrap / Resume', 12_000);
-    await waitForAnyText(['Triggering...'], 8_000);
-
-    const activeState = await waitForAnyText(['Downloading', 'Loading', 'Ready'], 25_000);
-    if (!activeState) {
+    const guidance = await waitForAnyText(
+      [
+        'Ollama runtime unavailable',
+        'Manage the Ollama process and model pulls outside OpenHuman.',
+        'Ollama docs',
+      ],
+      25_000
+    );
+    if (!guidance) {
       const tree = await dumpAccessibilityTree();
-      console.log('[LocalModelE2E] No active runtime state seen. Tree:\n', tree.slice(0, 5000));
+      console.log('[LocalModelE2E] No direct-runtime guidance seen. Tree:\n', tree.slice(0, 5000));
     }
-    expect(activeState).not.toBeNull();
+    expect(guidance).not.toBeNull();
   });
 });
diff --git a/src/core/all.rs b/src/core/all.rs
index 2b67247dc9..3bf468d0b8 100644
--- a/src/core/all.rs
+++ b/src/core/all.rs
@@ -149,6 +149,8 @@ fn build_registered_controllers() -> Vec<RegisteredController> {
     controllers.extend(crate::openhuman::service::all_service_registered_controllers());
     // Data migration utilities
     controllers.extend(crate::openhuman::migration::all_migration_registered_controllers());
+    // External inference runtime access
+    controllers.extend(crate::openhuman::inference::all_inference_registered_controllers());
     // Local AI model management and inference
     controllers.extend(crate::openhuman::local_ai::all_local_ai_registered_controllers());
     // People resolution and interaction scoring
@@ -271,6 +273,7 @@ fn build_declared_controller_schemas() -> Vec<ControllerSchema> {
     schemas.extend(crate::openhuman::credentials::all_credentials_controller_schemas());
     schemas.extend(crate::openhuman::service::all_service_controller_schemas());
     schemas.extend(crate::openhuman::migration::all_migration_controller_schemas());
+    schemas.extend(crate::openhuman::inference::all_inference_controller_schemas());
     schemas.extend(crate::openhuman::local_ai::all_local_ai_controller_schemas());
     schemas.extend(crate::openhuman::people::all_people_controller_schemas());
     schemas.extend(
@@ -357,6 +360,7 @@ pub fn namespace_description(namespace: &str) -> Option<&'static str> {
         "doctor" => Some("Run diagnostics for workspace and runtime health."),
         "encrypt" => Some("Encrypt secure values managed by secret storage."),
         "health" => Some("Process and component health snapshots."),
+        "inference" => Some("Connect to configured text, vision, and embedding inference runtimes."),
         "local_ai" => Some("Local AI chat, inference, downloads, and media operations."),
         "migrate" => Some("Data migration utilities."),
         "screen_intelligence" => Some("Screen capture, permissions, and accessibility automation."),
diff --git a/src/core/cli_tests.rs b/src/core/cli_tests.rs
index 0e5e83752a..111586e358 100644
--- a/src/core/cli_tests.rs
+++ b/src/core/cli_tests.rs
@@ -24,6 +24,7 @@ fn grouped_schemas_contains_migrated_namespaces() {
     assert!(grouped.contains_key("auth"));
     assert!(grouped.contains_key("service"));
     assert!(grouped.contains_key("migrate"));
+    assert!(grouped.contains_key("inference"));
     assert!(grouped.contains_key("local_ai"));
 }
 
diff --git a/src/openhuman/app_state/ops.rs b/src/openhuman/app_state/ops.rs
index a37abccb53..309f9684ed 100644
--- a/src/openhuman/app_state/ops.rs
+++ b/src/openhuman/app_state/ops.rs
@@ -410,7 +410,7 @@ async fn build_runtime_snapshot(config: &Config) -> RuntimeSnapshot {
             .await
     };
 
-    let local_ai = match crate::openhuman::local_ai::rpc::local_ai_status(config).await {
+    let local_ai = match crate::openhuman::inference::rpc::inference_status(config).await {
         Ok(outcome) => outcome.value,
         Err(error) => {
             warn!("{LOG_PREFIX} local_ai status failed during snapshot: {error}");
diff --git a/src/openhuman/channels/providers/presentation.rs b/src/openhuman/channels/providers/presentation.rs
index 6f9049c7d4..16f57287bc 100644
--- a/src/openhuman/channels/providers/presentation.rs
+++ b/src/openhuman/channels/providers/presentation.rs
@@ -396,7 +396,8 @@ async fn try_reaction(user_message: &str) -> Option<String> {
         return None;
     }
 
-    match crate::openhuman::local_ai::ops::local_ai_should_react(&config, user_message, "web").await
+    match crate::openhuman::inference::ops::inference_should_react(&config, user_message, "web")
+        .await
     {
         Ok(outcome) => {
             let decision = outcome.value;
diff --git a/src/openhuman/inference/mod.rs b/src/openhuman/inference/mod.rs
new file mode 100644
index 0000000000..79ec294b50
--- /dev/null
+++ b/src/openhuman/inference/mod.rs
@@ -0,0 +1,15 @@
+//! External inference domain.
+//!
+//! This module is the canonical controller surface for text / vision /
+//! embedding inference. The underlying implementation still reuses the
+//! existing local-runtime service during the migration away from the
+//! `local_ai` catch-all namespace.
+
+pub mod ops;
+mod schemas;
+
+pub use ops as rpc;
+pub use schemas::{
+    all_controller_schemas as all_inference_controller_schemas,
+    all_registered_controllers as all_inference_registered_controllers,
+};
diff --git a/src/openhuman/inference/ops.rs b/src/openhuman/inference/ops.rs
new file mode 100644
index 0000000000..a0f2d76688
--- /dev/null
+++ b/src/openhuman/inference/ops.rs
@@ -0,0 +1,89 @@
+//! JSON-RPC controller surface for inference operations.
+
+use crate::openhuman::config::Config;
+use crate::openhuman::local_ai;
+use crate::openhuman::local_ai::gif_decision::GifDecision;
+use crate::openhuman::local_ai::ops::{LocalAiChatMessage, ReactionDecision};
+use crate::openhuman::local_ai::sentiment::SentimentResult;
+use crate::openhuman::local_ai::{LocalAiEmbeddingResult, LocalAiStatus, TenorSearchResult};
+use crate::rpc::RpcOutcome;
+
+pub async fn inference_status(config: &Config) -> Result<RpcOutcome<LocalAiStatus>, String> {
+    local_ai::rpc::local_ai_status(config).await
+}
+
+pub async fn inference_summarize(
+    config: &Config,
+    text: &str,
+    max_tokens: Option<u32>,
+) -> Result<RpcOutcome<String>, String> {
+    local_ai::rpc::local_ai_summarize(config, text, max_tokens).await
+}
+
+pub async fn inference_prompt(
+    config: &Config,
+    prompt: &str,
+    max_tokens: Option<u32>,
+    no_think: Option<bool>,
+) -> Result<RpcOutcome<String>, String> {
+    local_ai::rpc::local_ai_prompt(config, prompt, max_tokens, no_think).await
+}
+
+pub async fn inference_vision_prompt(
+    config: &Config,
+    prompt: &str,
+    image_refs: &[String],
+    max_tokens: Option<u32>,
+) -> Result<RpcOutcome<String>, String> {
+    local_ai::rpc::local_ai_vision_prompt(config, prompt, image_refs, max_tokens).await
+}
+
+pub async fn inference_embed(
+    config: &Config,
+    inputs: &[String],
+) -> Result<RpcOutcome<LocalAiEmbeddingResult>, String> {
+    local_ai::rpc::local_ai_embed(config, inputs).await
+}
+
+pub async fn inference_chat(
+    config: &Config,
+    messages: Vec<LocalAiChatMessage>,
+    max_tokens: Option<u32>,
+) -> Result<RpcOutcome<String>, String> {
+    local_ai::rpc::local_ai_chat(config, messages, max_tokens).await
+}
+
+pub async fn inference_should_react(
+    config: &Config,
+    message: &str,
+    channel_type: &str,
+) -> Result<RpcOutcome<ReactionDecision>, String> {
+    local_ai::rpc::local_ai_should_react(config, message, channel_type).await
+}
+
+pub async fn inference_analyze_sentiment(
+    config: &Config,
+    message: &str,
+) -> Result<RpcOutcome<SentimentResult>, String> {
+    local_ai::sentiment::local_ai_analyze_sentiment(config, message).await
+}
+
+pub async fn inference_should_send_gif(
+    config: &Config,
+    message: &str,
+    channel_type: &str,
+) -> Result<RpcOutcome<GifDecision>, String> {
+    local_ai::gif_decision::local_ai_should_send_gif(config, message, channel_type).await
+}
+
+pub async fn inference_tenor_search(
+    config: &Config,
+    query: &str,
+    limit: Option<u32>,
+) -> Result<RpcOutcome<TenorSearchResult>, String> {
+    local_ai::gif_decision::tenor_search(config, query, limit).await
+}
+
+#[cfg(test)]
+#[path = "ops_tests.rs"]
+mod tests;
diff --git a/src/openhuman/inference/ops_tests.rs b/src/openhuman/inference/ops_tests.rs
new file mode 100644
index 0000000000..40870db77b
--- /dev/null
+++ b/src/openhuman/inference/ops_tests.rs
@@ -0,0 +1,96 @@
+use super::*;
+use tempfile::tempdir;
+
+fn disabled_config() -> Config {
+    let tmp = tempdir().expect("tempdir");
+    let mut config = Config::default();
+    config.workspace_dir = tmp.path().join("workspace");
+    config.config_path = tmp.path().join("config.toml");
+    config.local_ai.runtime_enabled = false;
+    config.local_ai.opt_in_confirmed = false;
+    config
+}
+
+#[tokio::test]
+async fn inference_status_reports_disabled_state_when_runtime_disabled() {
+    let config = disabled_config();
+    let outcome = inference_status(&config).await.expect("status");
+    assert!(
+        matches!(outcome.value.state.as_str(), "idle" | "disabled"),
+        "unexpected state: {}",
+        outcome.value.state
+    );
+}
+
+#[tokio::test]
+async fn inference_prompt_reuses_local_ai_disabled_error() {
+    let config = disabled_config();
+    let err = inference_prompt(&config, "hello", None, Some(true))
+        .await
+        .expect_err("prompt should fail");
+    assert!(err.contains("local ai is disabled"));
+}
+
+#[tokio::test]
+async fn inference_summarize_reuses_local_ai_disabled_error() {
+    let config = disabled_config();
+    let err = inference_summarize(&config, "hello", None)
+        .await
+        .expect_err("summarize should fail");
+    assert!(err.contains("local ai is disabled"));
+}
+
+#[tokio::test]
+async fn inference_embed_reuses_local_ai_disabled_error() {
+    let config = disabled_config();
+    let err = inference_embed(&config, &["hello".to_string()])
+        .await
+        .expect_err("embed should fail");
+    assert!(err.contains("local ai is disabled"));
+}
+
+#[tokio::test]
+async fn inference_chat_rejects_empty_messages() {
+    let config = disabled_config();
+    let err = inference_chat(&config, vec![], None)
+        .await
+        .expect_err("chat should fail");
+    assert!(err.contains("must not be empty"));
+}
+
+#[tokio::test]
+async fn inference_should_react_short_circuits_for_empty_message() {
+    let config = disabled_config();
+    let outcome = inference_should_react(&config, "   ", "web")
+        .await
+        .expect("reaction decision");
+    assert!(!outcome.value.should_react);
+    assert!(outcome.value.emoji.is_none());
+}
+
+#[tokio::test]
+async fn inference_analyze_sentiment_handles_empty_message() {
+    let config = disabled_config();
+    let outcome = inference_analyze_sentiment(&config, "   ")
+        .await
+        .expect("sentiment");
+    assert_eq!(outcome.value.valence, "neutral");
+}
+
+#[tokio::test]
+async fn inference_should_send_gif_short_circuits_for_empty_message() {
+    let config = disabled_config();
+    let outcome = inference_should_send_gif(&config, "   ", "web")
+        .await
+        .expect("gif decision");
+    assert!(!outcome.value.should_send_gif);
+}
+
+#[tokio::test]
+async fn inference_tenor_search_requires_query() {
+    let config = disabled_config();
+    let err = inference_tenor_search(&config, "   ", Some(3))
+        .await
+        .expect_err("query validation should fail");
+    assert!(err.contains("query is required"));
+}
diff --git a/src/openhuman/inference/schemas.rs b/src/openhuman/inference/schemas.rs
new file mode 100644
index 0000000000..d6233ecf29
--- /dev/null
+++ b/src/openhuman/inference/schemas.rs
@@ -0,0 +1,325 @@
+use serde::de::DeserializeOwned;
+use serde::Deserialize;
+use serde_json::{Map, Value};
+
+use crate::core::all::{ControllerFuture, RegisteredController};
+use crate::core::ControllerSchema;
+use crate::openhuman::config::rpc as config_rpc;
+use crate::rpc::RpcOutcome;
+
+#[derive(Debug, Deserialize)]
+struct InferenceSummarizeParams {
+    text: String,
+    max_tokens: Option<u32>,
+}
+
+#[derive(Debug, Deserialize)]
+struct InferencePromptParams {
+    prompt: String,
+    max_tokens: Option<u32>,
+    no_think: Option<bool>,
+}
+
+#[derive(Debug, Deserialize)]
+struct InferenceVisionPromptParams {
+    prompt: String,
+    image_refs: Vec<String>,
+    max_tokens: Option<u32>,
+}
+
+#[derive(Debug, Deserialize)]
+struct InferenceEmbedParams {
+    inputs: Vec<String>,
+}
+
+#[derive(Debug, Deserialize)]
+struct InferenceChatMessageParam {
+    role: String,
+    content: String,
+}
+
+#[derive(Debug, Deserialize)]
+struct InferenceChatParams {
+    messages: Vec<InferenceChatMessageParam>,
+    max_tokens: Option<u32>,
+}
+
+#[derive(Debug, Deserialize)]
+struct InferenceShouldReactParams {
+    message: String,
+    channel_type: String,
+}
+
+#[derive(Debug, Deserialize)]
+struct InferenceAnalyzeSentimentParams {
+    message: String,
+}
+
+#[derive(Debug, Deserialize)]
+struct InferenceShouldSendGifParams {
+    message: String,
+    channel_type: String,
+}
+
+#[derive(Debug, Deserialize)]
+struct InferenceTenorSearchParams {
+    query: String,
+    limit: Option<u32>,
+}
+
+pub fn all_controller_schemas() -> Vec<ControllerSchema> {
+    vec![
+        schemas("status"),
+        schemas("summarize"),
+        schemas("prompt"),
+        schemas("vision_prompt"),
+        schemas("embed"),
+        schemas("chat"),
+        schemas("should_react"),
+        schemas("analyze_sentiment"),
+        schemas("should_send_gif"),
+        schemas("tenor_search"),
+    ]
+}
+
+pub fn all_registered_controllers() -> Vec<RegisteredController> {
+    vec![
+        RegisteredController {
+            schema: schemas("status"),
+            handler: handle_inference_status,
+        },
+        RegisteredController {
+            schema: schemas("summarize"),
+            handler: handle_inference_summarize,
+        },
+        RegisteredController {
+            schema: schemas("prompt"),
+            handler: handle_inference_prompt,
+        },
+        RegisteredController {
+            schema: schemas("vision_prompt"),
+            handler: handle_inference_vision_prompt,
+        },
+        RegisteredController {
+            schema: schemas("embed"),
+            handler: handle_inference_embed,
+        },
+        RegisteredController {
+            schema: schemas("chat"),
+            handler: handle_inference_chat,
+        },
+        RegisteredController {
+            schema: schemas("should_react"),
+            handler: handle_inference_should_react,
+        },
+        RegisteredController {
+            schema: schemas("analyze_sentiment"),
+            handler: handle_inference_analyze_sentiment,
+        },
+        RegisteredController {
+            schema: schemas("should_send_gif"),
+            handler: handle_inference_should_send_gif,
+        },
+        RegisteredController {
+            schema: schemas("tenor_search"),
+            handler: handle_inference_tenor_search,
+        },
+    ]
+}
+
+pub fn schemas(function: &str) -> ControllerSchema {
+    let (source, target_function) = match function {
+        "status" => (
+            crate::openhuman::local_ai::local_ai_controller_schema("local_ai_status"),
+            "status",
+        ),
+        "summarize" => (
+            crate::openhuman::local_ai::local_ai_controller_schema("local_ai_summarize"),
+            "summarize",
+        ),
+        "prompt" => (
+            crate::openhuman::local_ai::local_ai_controller_schema("local_ai_prompt"),
+            "prompt",
+        ),
+        "vision_prompt" => (
+            crate::openhuman::local_ai::local_ai_controller_schema("local_ai_vision_prompt"),
+            "vision_prompt",
+        ),
+        "embed" => (
+            crate::openhuman::local_ai::local_ai_controller_schema("local_ai_embed"),
+            "embed",
+        ),
+        "chat" => (
+            crate::openhuman::local_ai::local_ai_controller_schema("local_ai_chat"),
+            "chat",
+        ),
+        "should_react" => (
+            crate::openhuman::local_ai::local_ai_controller_schema("local_ai_should_react"),
+            "should_react",
+        ),
+        "analyze_sentiment" => (
+            crate::openhuman::local_ai::local_ai_controller_schema("local_ai_analyze_sentiment"),
+            "analyze_sentiment",
+        ),
+        "should_send_gif" => (
+            crate::openhuman::local_ai::local_ai_controller_schema("local_ai_should_send_gif"),
+            "should_send_gif",
+        ),
+        "tenor_search" => (
+            crate::openhuman::local_ai::local_ai_controller_schema("local_ai_tenor_search"),
+            "tenor_search",
+        ),
+        other => panic!("unknown inference schema: {other}"),
+    };
+
+    ControllerSchema {
+        namespace: "inference",
+        function: target_function,
+        description: source.description,
+        inputs: source.inputs,
+        outputs: source.outputs,
+    }
+}
+
+fn handle_inference_status(_params: Map<String, Value>) -> ControllerFuture {
+    Box::pin(async move {
+        let config = config_rpc::load_config_with_timeout().await?;
+        to_json(crate::openhuman::inference::rpc::inference_status(&config).await?)
+    })
+}
+
+fn handle_inference_summarize(params: Map<String, Value>) -> ControllerFuture {
+    Box::pin(async move {
+        let p = deserialize_params::<InferenceSummarizeParams>(params)?;
+        let config = config_rpc::load_config_with_timeout().await?;
+        to_json(
+            crate::openhuman::inference::rpc::inference_summarize(&config, &p.text, p.max_tokens)
+                .await?,
+        )
+    })
+}
+
+fn handle_inference_prompt(params: Map<String, Value>) -> ControllerFuture {
+    Box::pin(async move {
+        let p = deserialize_params::<InferencePromptParams>(params)?;
+        let config = config_rpc::load_config_with_timeout().await?;
+        to_json(
+            crate::openhuman::inference::rpc::inference_prompt(
+                &config,
+                &p.prompt,
+                p.max_tokens,
+                p.no_think,
+            )
+            .await?,
+        )
+    })
+}
+
+fn handle_inference_vision_prompt(params: Map<String, Value>) -> ControllerFuture {
+    Box::pin(async move {
+        let p = deserialize_params::<InferenceVisionPromptParams>(params)?;
+        let config = config_rpc::load_config_with_timeout().await?;
+        to_json(
+            crate::openhuman::inference::rpc::inference_vision_prompt(
+                &config,
+                &p.prompt,
+                &p.image_refs,
+                p.max_tokens,
+            )
+            .await?,
+        )
+    })
+}
+
+fn handle_inference_embed(params: Map<String, Value>) -> ControllerFuture {
+    Box::pin(async move {
+        let p = deserialize_params::<InferenceEmbedParams>(params)?;
+        let config = config_rpc::load_config_with_timeout().await?;
+        to_json(crate::openhuman::inference::rpc::inference_embed(&config, &p.inputs).await?)
+    })
+}
+
+fn handle_inference_chat(params: Map<String, Value>) -> ControllerFuture {
+    Box::pin(async move {
+        let p = deserialize_params::<InferenceChatParams>(params)?;
+        let config = config_rpc::load_config_with_timeout().await?;
+        let messages = p
+            .messages
+            .into_iter()
+            .map(
+                |message| crate::openhuman::local_ai::ops::LocalAiChatMessage {
+                    role: message.role,
+                    content: message.content,
+                },
+            )
+            .collect();
+        to_json(
+            crate::openhuman::inference::rpc::inference_chat(&config, messages, p.max_tokens)
+                .await?,
+        )
+    })
+}
+
+fn handle_inference_should_react(params: Map<String, Value>) -> ControllerFuture {
+    Box::pin(async move {
+        let p = deserialize_params::<InferenceShouldReactParams>(params)?;
+        let config = config_rpc::load_config_with_timeout().await?;
+        to_json(
+            crate::openhuman::inference::rpc::inference_should_react(
+                &config,
+                &p.message,
+                &p.channel_type,
+            )
+            .await?,
+        )
+    })
+}
+
+fn handle_inference_analyze_sentiment(params: Map<String, Value>) -> ControllerFuture {
+    Box::pin(async move {
+        let p = deserialize_params::<InferenceAnalyzeSentimentParams>(params)?;
+        let config = config_rpc::load_config_with_timeout().await?;
+        to_json(
+            crate::openhuman::inference::rpc::inference_analyze_sentiment(&config, &p.message)
+                .await?,
+        )
+    })
+}
+
+fn handle_inference_should_send_gif(params: Map<String, Value>) -> ControllerFuture {
+    Box::pin(async move {
+        let p = deserialize_params::<InferenceShouldSendGifParams>(params)?;
+        let config = config_rpc::load_config_with_timeout().await?;
+        to_json(
+            crate::openhuman::inference::rpc::inference_should_send_gif(
+                &config,
+                &p.message,
+                &p.channel_type,
+            )
+            .await?,
+        )
+    })
+}
+
+fn handle_inference_tenor_search(params: Map<String, Value>) -> ControllerFuture {
+    Box::pin(async move {
+        let p = deserialize_params::<InferenceTenorSearchParams>(params)?;
+        let config = config_rpc::load_config_with_timeout().await?;
+        to_json(
+            crate::openhuman::inference::rpc::inference_tenor_search(&config, &p.query, p.limit)
+                .await?,
+        )
+    })
+}
+
+fn deserialize_params<T: DeserializeOwned>(params: Map<String, Value>) -> Result<T, String> {
+    serde_json::from_value(Value::Object(params)).map_err(|e| format!("invalid params: {e}"))
+}
+
+fn to_json<T: serde::Serialize>(outcome: RpcOutcome<T>) -> Result<Value, String> {
+    outcome.into_cli_compatible_json()
+}
+
+#[cfg(test)]
+#[path = "schemas_tests.rs"]
+mod tests;
diff --git a/src/openhuman/inference/schemas_tests.rs b/src/openhuman/inference/schemas_tests.rs
new file mode 100644
index 0000000000..86a682c1d0
--- /dev/null
+++ b/src/openhuman/inference/schemas_tests.rs
@@ -0,0 +1,78 @@
+use super::*;
+
+#[test]
+fn inference_catalog_counts_match_and_nonempty() {
+    let declared = all_controller_schemas();
+    let registered = all_registered_controllers();
+    assert_eq!(declared.len(), registered.len());
+    assert!(declared.len() >= 10);
+}
+
+#[test]
+fn inference_schemas_use_inference_namespace() {
+    for schema in all_controller_schemas() {
+        assert_eq!(
+            schema.namespace, "inference",
+            "function {}",
+            schema.function
+        );
+        assert!(!schema.description.is_empty());
+        assert!(!schema.outputs.is_empty());
+    }
+}
+
+#[test]
+fn inference_schema_function_names_are_stable() {
+    let functions: Vec<&str> = all_controller_schemas()
+        .into_iter()
+        .map(|schema| schema.function)
+        .collect();
+    assert!(functions.contains(&"status"));
+    assert!(functions.contains(&"prompt"));
+    assert!(functions.contains(&"vision_prompt"));
+    assert!(functions.contains(&"embed"));
+    assert!(functions.contains(&"chat"));
+}
+
+#[test]
+fn inference_prompt_schema_reuses_local_ai_shape_with_new_namespace() {
+    let schema = schemas("prompt");
+    assert_eq!(schema.namespace, "inference");
+    assert_eq!(schema.function, "prompt");
+    assert!(schema.inputs.iter().any(|field| field.name == "prompt"));
+    assert!(schema.inputs.iter().any(|field| field.name == "max_tokens"));
+}
+
+#[test]
+fn inference_chat_schema_requires_messages() {
+    let schema = schemas("chat");
+    assert_eq!(schema.namespace, "inference");
+    assert_eq!(schema.function, "chat");
+    assert!(schema
+        .inputs
+        .iter()
+        .any(|field| field.name == "messages" && field.required));
+}
+
+#[test]
+fn inference_unknown_schema_panics() {
+    let panic = std::panic::catch_unwind(|| schemas("no_such_function"));
+    assert!(panic.is_err());
+}
+
+#[tokio::test]
+async fn inference_status_handler_returns_cli_json() {
+    let value = handle_inference_status(Map::new())
+        .await
+        .expect("handler value");
+    assert!(value.get("result").is_some() || value.get("logs").is_some());
+}
+
+#[tokio::test]
+async fn inference_prompt_handler_rejects_invalid_shape() {
+    let params = Map::from_iter([("prompt".to_string(), Value::Bool(true))]);
+    let err = handle_inference_prompt(params)
+        .await
+        .expect_err("invalid params");
+    assert!(err.contains("invalid params"));
+}
diff --git a/src/openhuman/local_ai/mod.rs b/src/openhuman/local_ai/mod.rs
index 596e983973..d3876b7be5 100644
--- a/src/openhuman/local_ai/mod.rs
+++ b/src/openhuman/local_ai/mod.rs
@@ -43,6 +43,7 @@ pub use presets::{ModelPreset, ModelTier, VisionMode};
 pub use schemas::{
     all_controller_schemas as all_local_ai_controller_schemas,
     all_registered_controllers as all_local_ai_registered_controllers,
+    schemas as local_ai_controller_schema,
 };
 pub use sentiment::SentimentResult;
 pub(crate) use service::whisper_engine;
diff --git a/src/openhuman/local_ai/ops.rs b/src/openhuman/local_ai/ops.rs
index 1602f8bd57..35c2e500cf 100644
--- a/src/openhuman/local_ai/ops.rs
+++ b/src/openhuman/local_ai/ops.rs
@@ -176,23 +176,8 @@ pub async fn local_ai_status(
 pub async fn local_ai_shutdown_owned(
     config: &mut Config,
 ) -> Result<RpcOutcome<local_ai::LocalAiStatus>, String> {
-    let service = local_ai::global(config);
-    service.shutdown_owned_ollama(config).await;
-
-    // Shift any ollama-routed workload back to "cloud" (= primary).
-    let cleared = clear_ollama_workload_routes(config);
-    if cleared > 0 {
-        log::info!(
-            "[local_ai] shutdown_owned: shifted {cleared} ollama-routed workload(s) back to cloud"
-        );
-        config.save().await.map_err(|e| e.to_string())?;
-    }
-
-    service.mark_disabled(config);
-    Ok(RpcOutcome::single_log(
-        service.status(),
-        "local ai runtime gated off (owned daemon killed if any)",
-    ))
+    let _ = config;
+    Err("OpenHuman does not manage the Ollama process anymore. Stop or restart your external Ollama runtime directly.".to_string())
 }
 
 /// Clear every per-workload `*_provider` field whose stored value starts
@@ -234,21 +219,8 @@ pub async fn local_ai_download(
     config: &Config,
     force: bool,
 ) -> Result<RpcOutcome<local_ai::LocalAiStatus>, String> {
-    let service = local_ai::global(config);
-    if force {
-        service.reset_to_idle(config);
-    }
-    let service_clone = service.clone();
-    let config_clone = config.clone();
-    tokio::spawn(async move {
-        if let Err(err) = service_clone.download_all_models(&config_clone).await {
-            service_clone.mark_degraded(err);
-        }
-    });
-    Ok(RpcOutcome::single_log(
-        service.status(),
-        "local ai full model download triggered",
-    ))
+    let _ = (config, force);
+    Err("OpenHuman no longer downloads or starts Ollama for you. Start your external Ollama runtime and pull models yourself.".to_string())
 }
 
 /// Triggers a download of all local AI assets and returns progress information.
@@ -256,25 +228,8 @@ pub async fn local_ai_download_all_assets(
     config: &Config,
     force: bool,
 ) -> Result<RpcOutcome<LocalAiDownloadsProgress>, String> {
-    let service = local_ai::global(config);
-    if force {
-        service.reset_to_idle(config);
-    }
-    let service_clone = service.clone();
-    let config_clone = config.clone();
-    tokio::spawn(async move {
-        if let Err(err) = service_clone.download_all_models(&config_clone).await {
-            service_clone.mark_degraded(err);
-        }
-    });
-    let progress = service
-        .downloads_progress(config)
-        .await
-        .map_err(|e| e.to_string())?;
-    Ok(RpcOutcome::single_log(
-        progress,
-        "local ai full asset download triggered",
-    ))
+    let _ = (config, force);
+    Err("OpenHuman no longer downloads Ollama assets. Start your external Ollama runtime and manage model pulls yourself.".to_string())
 }
 
 /// Generates a summary of the provided text using local AI models.
@@ -467,14 +422,21 @@ pub async fn local_ai_download_asset(
     config: &Config,
     capability: &str,
 ) -> Result<RpcOutcome<LocalAiAssetsStatus>, String> {
-    let service = local_ai::global(config);
-    let output = service
-        .download_asset(config, capability.trim())
-        .await
-        .map_err(|e| e.to_string())?;
-    Ok(RpcOutcome::single_log(
-        output,
-        "local ai asset download triggered",
+    let capability = capability.trim().to_ascii_lowercase();
+    if matches!(capability.as_str(), "stt" | "tts") {
+        let service = local_ai::global(config);
+        let output = service
+            .download_asset(config, capability.as_str())
+            .await
+            .map_err(|e| e.to_string())?;
+        return Ok(RpcOutcome::single_log(
+            output,
+            "local ai voice asset download triggered",
+        ));
+    }
+
+    Err(format!(
+        "OpenHuman no longer downloads `{capability}` via Ollama. Start your external Ollama runtime and pull that model yourself."
     ))
 }
 
diff --git a/src/openhuman/local_ai/schemas.rs b/src/openhuman/local_ai/schemas.rs
index cf8c56f45e..d2b9bb5b45 100644
--- a/src/openhuman/local_ai/schemas.rs
+++ b/src/openhuman/local_ai/schemas.rs
@@ -925,45 +925,8 @@ fn handle_local_ai_diagnostics(_params: Map<String, Value>) -> ControllerFuture
 
 fn handle_local_ai_set_ollama_path(params: Map<String, Value>) -> ControllerFuture {
     Box::pin(async move {
-        let p = deserialize_params::<LocalAiSetOllamaPathParams>(params)?;
-        let path_str = p.path.trim().to_string();
-        tracing::debug!(path = %path_str, "[local_ai] set_ollama_path: validating");
-
-        let new_value = if path_str.is_empty() {
-            None
-        } else {
-            let path = std::path::Path::new(&path_str);
-            if !path.is_file() {
-                return Err(format!(
-                    "Ollama binary not found at '{}'. Provide a valid path to the ollama executable.",
-                    path_str
-                ));
-            }
-            Some(path_str.clone())
-        };
-
-        let mut config = config_rpc::load_config_with_timeout().await?;
-        config.local_ai.ollama_binary_path = new_value.clone();
-        config
-            .save()
-            .await
-            .map_err(|e| format!("save config: {e}"))?;
-        tracing::debug!(path = ?new_value, "[local_ai] set_ollama_path: config saved, triggering re-bootstrap");
-
-        let service = crate::openhuman::local_ai::global(&config);
-        service.reset_to_idle(&config);
-        let service_clone = service.clone();
-        let config_clone = config.clone();
-        tokio::spawn(async move {
-            service_clone.bootstrap(&config_clone).await;
-        });
-
-        let current_status =
-            serde_json::to_value(service.status()).map_err(|e| format!("serialize: {e}"))?;
-        Ok(serde_json::json!({
-            "ollama_binary_path": new_value,
-            "status": current_status,
-        }))
+        let _ = deserialize_params::<LocalAiSetOllamaPathParams>(params)?;
+        Err("OpenHuman no longer manages an Ollama binary path. Point your inference setup at an already-running Ollama-compatible endpoint instead.".to_string())
     })
 }
 
diff --git a/src/openhuman/local_ai/schemas_tests.rs b/src/openhuman/local_ai/schemas_tests.rs
index d898f1f562..b07e60568e 100644
--- a/src/openhuman/local_ai/schemas_tests.rs
+++ b/src/openhuman/local_ai/schemas_tests.rs
@@ -238,7 +238,7 @@ async fn handle_apply_preset_accepts_valid_tier_and_persists() {
 }
 
 #[tokio::test]
-async fn handle_set_ollama_path_rejects_nonexistent_path() {
+async fn handle_set_ollama_path_reports_external_runtime_contract() {
     let _g = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
     let tmp = TempDir::new().unwrap();
     unsafe {
@@ -252,22 +252,22 @@ async fn handle_set_ollama_path_rejects_nonexistent_path() {
     unsafe {
         std::env::remove_var("OPENHUMAN_WORKSPACE");
     }
-    assert!(err.contains("Ollama binary not found"));
+    assert!(err.contains("no longer manages an Ollama binary path"));
 }
 
 #[tokio::test]
-async fn handle_set_ollama_path_accepts_empty_string_to_clear() {
+async fn handle_set_ollama_path_rejects_empty_string_too() {
     let _g = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
     let tmp = TempDir::new().unwrap();
     unsafe {
         std::env::set_var("OPENHUMAN_WORKSPACE", tmp.path());
     }
     let params = Map::from_iter([("path".to_string(), serde_json::json!(""))]);
-    // Empty path clears the setting — must not error.
-    let _ = handle_local_ai_set_ollama_path(params).await.expect("ok");
+    let err = handle_local_ai_set_ollama_path(params).await.unwrap_err();
     unsafe {
         std::env::remove_var("OPENHUMAN_WORKSPACE");
     }
+    assert!(err.contains("no longer manages an Ollama binary path"));
 }
 
 /// Regression test for the CodeRabbit #7 race on PR #1755: when two
diff --git a/src/openhuman/local_ai/service/assets.rs b/src/openhuman/local_ai/service/assets.rs
index 8b281cbc7c..8802843406 100644
--- a/src/openhuman/local_ai/service/assets.rs
+++ b/src/openhuman/local_ai/service/assets.rs
@@ -37,23 +37,21 @@ impl LocalAiService {
             "[local_ai:assets:provider_routing] entry"
         );
 
-        // Pre-flight precondition: if no Ollama binary exists anywhere
-        // discoverable, every Ollama-backed `has_model` call will fail (or
-        // time out). LM Studio still delegates embeddings to Ollama in this
-        // first provider slice, so it needs the same pre-flight for the
-        // embedding branch.
+        // External-runtime precondition: OpenHuman no longer installs or
+        // starts Ollama itself, so the interesting question is whether the
+        // user-managed runtime is reachable right now.
         let uses_ollama_assets = matches!(
             provider,
             LocalAiProvider::Ollama | LocalAiProvider::LmStudio
         );
         let ollama_available = if uses_ollama_assets {
-            let present = self.ollama_binary_present(config);
+            let present = self.ollama_healthy().await;
             debug!(
                 target: "local_ai::assets",
                 %correlation_id,
                 provider = %provider.as_str(),
                 ollama_available = present,
-                "[local_ai:assets:provider_routing] ollama binary check"
+                "[local_ai:assets:provider_routing] ollama runtime check"
             );
             present
         } else {
@@ -121,7 +119,7 @@ impl LocalAiService {
                     %correlation_id,
                     provider = "ollama",
                     model = %embedding_model,
-                    "[local_ai:assets:provider_routing] lm studio embedding check skipped; ollama binary missing"
+                    "[local_ai:assets:provider_routing] lm studio embedding check skipped; ollama runtime unavailable"
                 );
                 false
             };
@@ -216,7 +214,7 @@ impl LocalAiService {
             trace!(
                 target: "local_ai::assets",
                 %correlation_id,
-                branch = "ollama_missing_binary",
+                branch = "ollama_runtime_unavailable",
                 "[local_ai:assets:provider_routing] selected provider branch"
             );
             (false, false, false)
diff --git a/src/openhuman/local_ai/service/bootstrap.rs b/src/openhuman/local_ai/service/bootstrap.rs
index 843f8d9477..30bff62f0a 100644
--- a/src/openhuman/local_ai/service/bootstrap.rs
+++ b/src/openhuman/local_ai/service/bootstrap.rs
@@ -301,30 +301,12 @@ impl LocalAiService {
             return;
         }
 
-        if let Err(first_err) = self.ensure_ollama_server(&effective_config).await {
-            log::warn!(
-                "[local_ai] ensure_ollama_server failed, retrying with fresh install: {first_err}"
-            );
-            // Force a fresh install attempt before giving up.
-            {
-                let mut status = self.status.lock();
-                status.state = "installing".to_string();
-                status.warning = Some("Retrying Ollama installation...".to_string());
-                status.error_detail = None;
-                status.error_category = None;
-            }
-            if let Err(err) = self.ensure_ollama_server_fresh(&effective_config).await {
-                let mut status = self.status.lock();
-                status.state = "degraded".to_string();
-                let is_install_error = status.error_category.as_deref() == Some("install");
-                if is_install_error {
-                    status.warning = Some(err);
-                } else {
-                    status.error_category = Some("server".to_string());
-                    status.warning = Some(format_degraded_warning(&err, &effective_config));
-                }
-                return;
-            }
+        if let Err(err) = self.ensure_ollama_server(&effective_config).await {
+            let mut status = self.status.lock();
+            status.state = "degraded".to_string();
+            status.error_category = Some("server".to_string());
+            status.warning = Some(format_degraded_warning(&err, &effective_config));
+            return;
         }
 
         if let Err(err) = self.ensure_models_available(&effective_config).await {
diff --git a/src/openhuman/local_ai/service/ollama_admin.rs b/src/openhuman/local_ai/service/ollama_admin.rs
index 7479ceb1fe..417134ba36 100644
--- a/src/openhuman/local_ai/service/ollama_admin.rs
+++ b/src/openhuman/local_ai/service/ollama_admin.rs
@@ -25,35 +25,22 @@ fn lm_studio_models_error_means_unreachable(error: &str) -> bool {
 impl LocalAiService {
     pub(in crate::openhuman::local_ai::service) async fn ensure_ollama_server(
         &self,
-        config: &Config,
+        _config: &Config,
     ) -> Result<(), String> {
-        // If openhuman crashed last session and left a daemon running, the
-        // spawn marker lets us recognise it and reclaim it (kill + respawn
-        // under owned-child tracking) instead of either leaking it forever
-        // or hitting an external daemon that just happens to be on :11434.
-        self.reclaim_orphan_if_ours(config).await;
-
         if self.ollama_healthy().await {
-            // Server is running — verify it can actually execute models by checking
-            // if the runner works. A stale server with a missing binary will 500.
             if self.ollama_runner_ok().await {
                 return Ok(());
             }
-            // Runner is broken (e.g. binary moved).
             log::warn!("[local_ai] Ollama server responds but runner is broken");
-            // Only restart if we own it. Killing an external daemon's
-            // broken runner is the user's job, not ours — friendly-fire.
-            self.kill_ollama_server().await;
-            if self.ollama_healthy().await {
-                // Our kill was a no-op (or didn't take effect) — daemon is external.
-                return Err("An external Ollama daemon on :11434 has a broken runner. \
-                     Restart it manually (or stop it so openhuman can take over)."
-                    .to_string());
-            }
+            return Err(
+                "Configured Ollama runtime is reachable but cannot execute models. Restart the external runtime and retry."
+                    .to_string(),
+            );
         }
-
-        let ollama_cmd = self.resolve_or_install_ollama_binary(config).await?;
-        self.start_and_wait_for_server(config, &ollama_cmd).await
+        let base_url = ollama_base_url();
+        Err(format!(
+            "OpenHuman no longer starts or installs Ollama automatically. Start your inference runtime yourself and make sure it is reachable at {base_url}."
+        ))
     }
 
     /// Like `ensure_ollama_server`, but forces a fresh install of the Ollama binary
@@ -62,18 +49,7 @@ impl LocalAiService {
         &self,
         config: &Config,
     ) -> Result<(), String> {
-        // Force a fresh download regardless of existing binaries.
-        self.download_and_install_ollama(config).await?;
-
-        let Some(ollama_cmd) = find_workspace_ollama_binary(config) else {
-            // Also check system path after install.
-            let system_bin = find_system_ollama_binary()
-                .ok_or_else(|| "Ollama installed but binary not found on system".to_string())?;
-            // Try to use the system binary directly.
-            return self.start_and_wait_for_server(config, &system_bin).await;
-        };
-
-        self.start_and_wait_for_server(config, &ollama_cmd).await
+        self.ensure_ollama_server(config).await
     }
 
     /// Check if a healthy daemon on `:11434` is actually openhuman's own
@@ -496,7 +472,7 @@ impl LocalAiService {
         Ok(())
     }
 
-    async fn ollama_healthy(&self) -> bool {
+    pub(in crate::openhuman::local_ai::service) async fn ollama_healthy(&self) -> bool {
         self.http
             .get(format!("{}/api/tags", ollama_base_url()))
             .timeout(std::time::Duration::from_secs(2))
@@ -883,38 +859,22 @@ impl LocalAiService {
         let binary_path = self.resolve_binary_path(config);
 
         let mut issues: Vec<String> = Vec::new();
-        let mut repair_actions: Vec<serde_json::Value> = Vec::new();
+        let repair_actions: Vec<serde_json::Value> = Vec::new();
 
         if !healthy {
             issues.push(format!(
                 "Ollama server is not running or not reachable at {}",
                 base_url
             ));
-            if binary_path.is_none() {
-                repair_actions.push(serde_json::json!({"action": "install_ollama"}));
-            } else {
-                repair_actions.push(serde_json::json!({
-                    "action": "start_server",
-                    "binary_path": binary_path,
-                }));
-            }
         }
         if healthy && !chat_found {
             issues.push(format!("Chat model `{}` is not installed", expected_chat));
-            repair_actions.push(serde_json::json!({
-                "action": "pull_model",
-                "model": expected_chat,
-            }));
         }
         if healthy && config.local_ai.preload_embedding_model && !embedding_found {
             issues.push(format!(
                 "Embedding model `{}` is not installed",
                 expected_embedding
             ));
-            repair_actions.push(serde_json::json!({
-                "action": "pull_model",
-                "model": expected_embedding,
-            }));
         }
         if healthy
             && matches!(
@@ -927,10 +887,6 @@ impl LocalAiService {
                 "Vision model `{}` is not installed",
                 expected_vision
             ));
-            repair_actions.push(serde_json::json!({
-                "action": "pull_model",
-                "model": expected_vision,
-            }));
         }
         if let Some(ref e) = tags_error {
             issues.push(format!("Failed to list models: {e}"));
@@ -1064,7 +1020,7 @@ impl LocalAiService {
             .any(|name| name == &expected_chat.to_ascii_lowercase());
 
         let mut issues: Vec<String> = Vec::new();
-        let mut repair_actions: Vec<serde_json::Value> = Vec::new();
+        let repair_actions: Vec<serde_json::Value> = Vec::new();
 
         if !healthy {
             let detail = models_error
@@ -1075,25 +1031,14 @@ impl LocalAiService {
                 "LM Studio server is not running or not reachable at {}{}",
                 base_url, detail
             ));
-            repair_actions.push(serde_json::json!({
-                "action": "start_lm_studio_server",
-                "base_url": base_url,
-            }));
         }
         if healthy && models_error.is_none() && models.is_empty() {
             issues.push("LM Studio is reachable but no models are loaded".to_string());
-            repair_actions.push(serde_json::json!({
-                "action": "load_lm_studio_model",
-            }));
         } else if healthy && models_error.is_none() && !chat_found {
             issues.push(format!(
                 "Chat model `{}` is not loaded in LM Studio",
                 expected_chat
             ));
-            repair_actions.push(serde_json::json!({
-                "action": "load_lm_studio_model",
-                "model": expected_chat,
-            }));
         }
         if healthy {
             if let Some(ref err) = models_error {
diff --git a/src/openhuman/local_ai/service/ollama_admin_tests.rs b/src/openhuman/local_ai/service/ollama_admin_tests.rs
index e2a0511fa2..b821301d90 100644
--- a/src/openhuman/local_ai/service/ollama_admin_tests.rs
+++ b/src/openhuman/local_ai/service/ollama_admin_tests.rs
@@ -123,6 +123,100 @@ async fn ollama_healthy_returns_false_on_unreachable_url() {
     }
 }
 
+#[tokio::test]
+async fn ensure_ollama_server_requires_external_runtime_when_unreachable() {
+    let _guard = crate::openhuman::local_ai::local_ai_test_guard();
+
+    unsafe {
+        std::env::set_var("OPENHUMAN_OLLAMA_BASE_URL", "http://127.0.0.1:1");
+    }
+
+    let config = Config::default();
+    let service = LocalAiService::new(&config);
+    let err = service
+        .ensure_ollama_server(&config)
+        .await
+        .expect_err("unreachable runtime should fail");
+
+    unsafe {
+        std::env::remove_var("OPENHUMAN_OLLAMA_BASE_URL");
+    }
+
+    assert!(
+        err.contains("no longer starts or installs Ollama automatically"),
+        "unexpected error: {err}"
+    );
+}
+
+#[tokio::test]
+async fn ensure_ollama_server_reports_broken_external_runner_without_restart_attempt() {
+    let _guard = crate::openhuman::local_ai::local_ai_test_guard();
+
+    let app = Router::new()
+        .route("/api/tags", get(|| async { Json(json!({ "models": [] })) }))
+        .route(
+            "/api/show",
+            axum::routing::post(|| async {
+                (
+                    axum::http::StatusCode::INTERNAL_SERVER_ERROR,
+                    "fork/exec /broken/ollama: no such file or directory",
+                )
+            }),
+        );
+    let base = spawn_mock(app).await;
+    unsafe {
+        std::env::set_var("OPENHUMAN_OLLAMA_BASE_URL", &base);
+    }
+
+    let config = Config::default();
+    let service = LocalAiService::new(&config);
+    let err = service
+        .ensure_ollama_server(&config)
+        .await
+        .expect_err("broken runner should fail");
+
+    unsafe {
+        std::env::remove_var("OPENHUMAN_OLLAMA_BASE_URL");
+    }
+
+    assert!(
+        err.contains("cannot execute models") || err.contains("Restart the external runtime"),
+        "unexpected error: {err}"
+    );
+}
+
+#[tokio::test]
+async fn assets_status_marks_ollama_unavailable_when_runtime_is_down_even_if_binary_exists() {
+    let _guard = crate::openhuman::local_ai::local_ai_test_guard();
+
+    unsafe {
+        std::env::set_var("OPENHUMAN_OLLAMA_BASE_URL", "http://127.0.0.1:1");
+    }
+    let fake_ollama = std::env::current_exe().expect("current exe");
+    let prev_ollama_bin = std::env::var_os("OLLAMA_BIN");
+    unsafe {
+        std::env::set_var("OLLAMA_BIN", &fake_ollama);
+    }
+
+    let config = Config::default();
+    let service = LocalAiService::new(&config);
+    let status = service.assets_status(&config).await.expect("assets status");
+
+    unsafe {
+        std::env::remove_var("OPENHUMAN_OLLAMA_BASE_URL");
+        match prev_ollama_bin {
+            Some(value) => std::env::set_var("OLLAMA_BIN", value),
+            None => std::env::remove_var("OLLAMA_BIN"),
+        }
+    }
+
+    assert!(
+        !status.ollama_available,
+        "runtime-down status must not be treated as available"
+    );
+    assert_ne!(status.chat.state, "ready");
+}
+
 #[tokio::test]
 async fn diagnostics_reports_server_unreachable_when_url_unbound() {
     let _guard = crate::openhuman::local_ai::local_ai_test_guard();
@@ -151,8 +245,8 @@ async fn diagnostics_reports_server_unreachable_when_url_unbound() {
         .cloned()
         .unwrap_or_default();
     assert!(
-        !repair_actions.is_empty(),
-        "unreachable server must produce at least one repair action"
+        repair_actions.is_empty(),
+        "OpenHuman should not suggest app-managed repair actions anymore"
     );
     unsafe {
         std::env::remove_var("OPENHUMAN_OLLAMA_BASE_URL");
@@ -181,16 +275,13 @@ async fn diagnostics_with_running_server_but_missing_models_flags_issues() {
     // No models are installed → expected chat model issue surfaces.
     let issues = diag["issues"].as_array().cloned().unwrap_or_default();
     assert!(!issues.is_empty());
-    // Missing chat model should produce a pull_model repair action.
     let repair_actions = diag["repair_actions"]
         .as_array()
         .cloned()
         .unwrap_or_default();
     assert!(
-        repair_actions
-            .iter()
-            .any(|a| a["action"].as_str() == Some("pull_model")),
-        "missing models must produce pull_model repair action"
+        repair_actions.is_empty(),
+        "missing models should no longer surface app-managed pull actions"
     );
     unsafe {
         std::env::remove_var("OPENHUMAN_OLLAMA_BASE_URL");
@@ -286,7 +377,7 @@ async fn resolve_binary_path_finds_binary_via_ollama_bin_env() {
 }
 
 #[tokio::test]
-async fn diagnostics_repair_actions_include_start_server_when_binary_known() {
+async fn diagnostics_repair_actions_are_empty_when_binary_is_known_but_server_is_down() {
     let _guard = crate::openhuman::local_ai::local_ai_test_guard();
 
     let tmp = tempfile::tempdir().unwrap();
@@ -312,10 +403,8 @@ async fn diagnostics_repair_actions_include_start_server_when_binary_known() {
         .cloned()
         .unwrap_or_default();
     assert!(
-        repair_actions
-            .iter()
-            .any(|a| a["action"].as_str() == Some("start_server")),
-        "when binary is known but server is down, repair action should be start_server"
+        repair_actions.is_empty(),
+        "when server is down, diagnostics should not advertise app-managed start actions"
     );
 
     unsafe {
diff --git a/src/openhuman/local_ai/types.rs b/src/openhuman/local_ai/types.rs
index 5a814fbba9..ca4b9425bf 100644
--- a/src/openhuman/local_ai/types.rs
+++ b/src/openhuman/local_ai/types.rs
@@ -95,11 +95,9 @@ pub struct LocalAiAssetsStatus {
     pub stt: LocalAiAssetStatus,
     pub tts: LocalAiAssetStatus,
     pub quantization: String,
-    /// True when an Ollama binary is discoverable on disk (workspace install,
-    /// system install, or via `OLLAMA_BIN`/configured path). When false, the
-    /// frontend should render an "Install Ollama" CTA instead of model state —
-    /// querying `/api/tags` against a missing server otherwise lets a 30s
-    /// connect timeout cascade through `has_model`.
+    /// True when the configured Ollama endpoint is reachable enough for model
+    /// checks. When false, the frontend should render external-runtime
+    /// guidance rather than app-managed install/start affordances.
     pub ollama_available: bool,
 }
 
diff --git a/src/openhuman/mod.rs b/src/openhuman/mod.rs
index 266a3ffb72..9b26111b72 100644
--- a/src/openhuman/mod.rs
+++ b/src/openhuman/mod.rs
@@ -35,6 +35,7 @@ pub mod embeddings;
 pub mod encryption;
 pub mod health;
 pub mod heartbeat;
+pub mod inference;
 pub mod integrations;
 pub mod learning;
 pub mod local_ai;
diff --git a/src/openhuman/subconscious/executor.rs b/src/openhuman/subconscious/executor.rs
index a9e8305207..db2e99a09a 100644
--- a/src/openhuman/subconscious/executor.rs
+++ b/src/openhuman/subconscious/executor.rs
@@ -209,7 +209,7 @@ async fn execute_with_local_model(
         },
     ];
 
-    let outcome = crate::openhuman::local_ai::ops::local_ai_chat(&config, messages, None)
+    let outcome = crate::openhuman::inference::ops::inference_chat(&config, messages, None)
         .await
         .map_err(|e| format!("local model: {e}"))?;
 
diff --git a/tests/json_rpc_e2e.rs b/tests/json_rpc_e2e.rs
index a556f9e30f..812ed0b728 100644
--- a/tests/json_rpc_e2e.rs
+++ b/tests/json_rpc_e2e.rs
@@ -3340,6 +3340,186 @@ async fn json_rpc_local_ai_lm_studio_config_diagnostics_and_prompt() {
     rpc_join.abort();
 }
 
+#[tokio::test]
+async fn json_rpc_inference_namespace_lm_studio_prompt_and_status() {
+    let _env_lock = json_rpc_e2e_env_lock();
+    let tmp = tempdir().expect("tempdir");
+    let home = tmp.path();
+    let openhuman_home = home.join(".openhuman");
+
+    let _home_guard = EnvVarGuard::set_to_path("HOME", home);
+    let _workspace_guard = EnvVarGuard::unset("OPENHUMAN_WORKSPACE");
+    let _backend_url_guard = EnvVarGuard::unset("BACKEND_URL");
+    let _vite_backend_guard = EnvVarGuard::unset("VITE_BACKEND_URL");
+    let _tier_guard = EnvVarGuard::unset("OPENHUMAN_LOCAL_AI_TIER");
+    let _lm_env_guard = EnvVarGuard::unset("OPENHUMAN_LM_STUDIO_BASE_URL");
+    let _lm_alias_env_guard = EnvVarGuard::unset("LM_STUDIO_BASE_URL");
+
+    let (mock_addr, mock_join) = serve_on_ephemeral(mock_upstream_router()).await;
+    let mock_origin = format!("http://{}", mock_addr);
+    write_min_config(&openhuman_home, &mock_origin);
+
+    let lm_app = Router::new()
+        .route(
+            "/v1/models",
+            get(|| async {
+                Json(json!({
+                    "object": "list",
+                    "data": [
+                        { "id": "local-model", "object": "model", "owned_by": "lm-studio" }
+                    ]
+                }))
+            }),
+        )
+        .route(
+            "/v1/chat/completions",
+            post(|Json(_body): Json<Value>| async move {
+                Json(json!({
+                    "id": "chatcmpl-inference-e2e",
+                    "object": "chat.completion",
+                    "choices": [{
+                        "index": 0,
+                        "message": {
+                            "role": "assistant",
+                            "content": "hello from inference namespace"
+                        },
+                        "finish_reason": "stop"
+                    }],
+                    "usage": {
+                        "prompt_tokens": 7,
+                        "completion_tokens": 5,
+                        "total_tokens": 12
+                    }
+                }))
+            }),
+        );
+    let (lm_addr, lm_join) = serve_on_ephemeral(lm_app).await;
+    let lm_base = format!("http://{lm_addr}/v1");
+
+    let (rpc_addr, rpc_join) = serve_on_ephemeral(build_core_http_router(false)).await;
+    let rpc_base = format!("http://{}", rpc_addr);
+    tokio::time::sleep(Duration::from_millis(100)).await;
+
+    let update = post_json_rpc(
+        &rpc_base,
+        360,
+        "openhuman.config_update_local_ai_settings",
+        json!({
+            "runtime_enabled": true,
+            "opt_in_confirmed": true,
+            "provider": "lm_studio",
+            "base_url": lm_base,
+            "model_id": "local-model",
+            "chat_model_id": "local-model"
+        }),
+    )
+    .await;
+    assert_no_jsonrpc_error(&update, "update_local_ai_settings for inference namespace");
+
+    let status = post_json_rpc(&rpc_base, 361, "openhuman.inference_status", json!({})).await;
+    let status_result = assert_no_jsonrpc_error(&status, "inference_status");
+    let status_payload = status_result.get("result").unwrap_or(status_result);
+    assert_eq!(
+        status_payload.get("provider").and_then(Value::as_str),
+        Some("lm_studio")
+    );
+
+    let prompt = post_json_rpc(
+        &rpc_base,
+        362,
+        "openhuman.inference_prompt",
+        json!({
+            "prompt": "hello",
+            "max_tokens": 16,
+            "no_think": true
+        }),
+    )
+    .await;
+    let prompt_result = assert_no_jsonrpc_error(&prompt, "inference_prompt");
+    assert_eq!(
+        extract_string_outcome(prompt_result),
+        "hello from inference namespace"
+    );
+
+    let summarize = post_json_rpc(
+        &rpc_base,
+        363,
+        "openhuman.inference_summarize",
+        json!({
+            "text": "summarize me",
+            "max_tokens": 16
+        }),
+    )
+    .await;
+    let summarize_result = assert_no_jsonrpc_error(&summarize, "inference_summarize");
+    assert_eq!(
+        extract_string_outcome(summarize_result),
+        "hello from inference namespace"
+    );
+
+    lm_join.abort();
+    mock_join.abort();
+    rpc_join.abort();
+}
+
+#[tokio::test]
+async fn json_rpc_inference_prompt_requires_external_ollama_runtime_when_unreachable() {
+    let _env_lock = json_rpc_e2e_env_lock();
+    let tmp = tempdir().expect("tempdir");
+    let home = tmp.path();
+    let openhuman_home = home.join(".openhuman");
+
+    let _home_guard = EnvVarGuard::set_to_path("HOME", home);
+    let _workspace_guard = EnvVarGuard::unset("OPENHUMAN_WORKSPACE");
+    let _backend_url_guard = EnvVarGuard::unset("BACKEND_URL");
+    let _vite_backend_guard = EnvVarGuard::unset("VITE_BACKEND_URL");
+    let _tier_guard = EnvVarGuard::unset("OPENHUMAN_LOCAL_AI_TIER");
+    let _ollama_url_guard = EnvVarGuard::set("OPENHUMAN_OLLAMA_BASE_URL", "http://127.0.0.1:1");
+
+    let (mock_addr, mock_join) = serve_on_ephemeral(mock_upstream_router()).await;
+    let mock_origin = format!("http://{}", mock_addr);
+    write_min_config(&openhuman_home, &mock_origin);
+
+    let (rpc_addr, rpc_join) = serve_on_ephemeral(build_core_http_router(false)).await;
+    let rpc_base = format!("http://{}", rpc_addr);
+    tokio::time::sleep(Duration::from_millis(100)).await;
+
+    let update = post_json_rpc(
+        &rpc_base,
+        364,
+        "openhuman.config_update_local_ai_settings",
+        json!({
+            "runtime_enabled": true,
+            "opt_in_confirmed": true,
+            "provider": "ollama",
+            "model_id": "gemma3:1b-it-qat",
+            "chat_model_id": "gemma3:1b-it-qat"
+        }),
+    )
+    .await;
+    assert_no_jsonrpc_error(&update, "update_local_ai_settings for unreachable ollama");
+
+    let prompt = post_json_rpc(
+        &rpc_base,
+        365,
+        "openhuman.inference_prompt",
+        json!({
+            "prompt": "hello",
+            "max_tokens": 16,
+            "no_think": true
+        }),
+    )
+    .await;
+    let prompt_err = assert_jsonrpc_error(&prompt, "inference_prompt unreachable ollama");
+    assert!(
+        prompt_err.contains("no longer starts or installs Ollama automatically"),
+        "unexpected error: {prompt_err}"
+    );
+
+    mock_join.abort();
+    rpc_join.abort();
+}
+
 // ── Billing & Team E2E tests ──────────────────────────────────────────────────
 
 /// End-to-end test for billing RPC methods.

From 481482e1738df930966d31463ab466ad8d62b08b Mon Sep 17 00:00:00 2001
From: Steven Enamakel <enamakel@tinyhumans.ai>
Date: Sat, 16 May 2026 15:31:24 -0700
Subject: [PATCH 02/18] Clarify external Ollama routing errors

---
 src/openhuman/local_ai/service/public_infer.rs | 12 ++++++++++--
 tests/json_rpc_e2e.rs                          |  6 +++++-
 2 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/src/openhuman/local_ai/service/public_infer.rs b/src/openhuman/local_ai/service/public_infer.rs
index 41a3420f94..ef49c75049 100644
--- a/src/openhuman/local_ai/service/public_infer.rs
+++ b/src/openhuman/local_ai/service/public_infer.rs
@@ -8,6 +8,14 @@ use crate::openhuman::local_ai::provider::{provider_from_config, LocalAiProvider
 
 use super::LocalAiService;
 
+fn external_ollama_request_error(prefix: &str, error: &reqwest::Error) -> String {
+    let base_url = ollama_base_url();
+    format!(
+        "{prefix}: OpenHuman routes inference through an external Ollama endpoint. \
+         Make sure Ollama is already running and reachable at {base_url} ({error})"
+    )
+}
+
 impl LocalAiService {
     pub async fn summarize(
         &self,
@@ -259,7 +267,7 @@ impl LocalAiService {
             .json(&body)
             .send()
             .await
-            .map_err(|e| format!("ollama chat request failed: {e}"))?;
+            .map_err(|e| external_ollama_request_error("ollama chat request failed", &e))?;
 
         if !response.status().is_success() {
             let status = response.status();
@@ -509,7 +517,7 @@ impl LocalAiService {
             .json(&body)
             .send()
             .await
-            .map_err(|e| format!("ollama request failed: {e}"))?;
+            .map_err(|e| external_ollama_request_error("ollama request failed", &e))?;
         if !response.status().is_success() {
             let status = response.status();
             let body = response.text().await.unwrap_or_default();
diff --git a/tests/json_rpc_e2e.rs b/tests/json_rpc_e2e.rs
index 812ed0b728..06e95411ee 100644
--- a/tests/json_rpc_e2e.rs
+++ b/tests/json_rpc_e2e.rs
@@ -3511,8 +3511,12 @@ async fn json_rpc_inference_prompt_requires_external_ollama_runtime_when_unreach
     )
     .await;
     let prompt_err = assert_jsonrpc_error(&prompt, "inference_prompt unreachable ollama");
+    let prompt_err_message = prompt_err
+        .get("message")
+        .and_then(Value::as_str)
+        .unwrap_or_default();
     assert!(
-        prompt_err.contains("no longer starts or installs Ollama automatically"),
+        prompt_err_message.contains("routes inference through an external Ollama endpoint"),
         "unexpected error: {prompt_err}"
     );
 

From 56c89f6fc2b0d4e9987336de4e4d0dcc902efd8b Mon Sep 17 00:00:00 2001
From: Steven Enamakel <enamakel@tinyhumans.ai>
Date: Sat, 16 May 2026 15:41:17 -0700
Subject: [PATCH 03/18] Remove legacy Ollama management RPCs

---
 .../settings/panels/LocalModelDebugPanel.tsx  |  72 ++-----------
 .../pages/onboarding/steps/LocalAIStep.tsx    |   2 +-
 app/src/services/api/aiSettingsApi.ts         |  21 ----
 .../utils/__tests__/localAiBootstrap.test.ts  |  14 +--
 app/src/utils/localAiBootstrap.ts             |  17 +--
 app/src/utils/tauriCommands/localAi.ts        |  47 --------
 src/core/jsonrpc_tests.rs                     |  12 ---
 src/openhuman/local_ai/README.md              |   2 +-
 src/openhuman/local_ai/ops.rs                 | 102 ++----------------
 src/openhuman/local_ai/schemas.rs             | 100 -----------------
 src/openhuman/local_ai/schemas_tests.rs       |  36 -------
 11 files changed, 21 insertions(+), 404 deletions(-)

diff --git a/app/src/components/settings/panels/LocalModelDebugPanel.tsx b/app/src/components/settings/panels/LocalModelDebugPanel.tsx
index 638e247040..25a8efcc8d 100644
--- a/app/src/components/settings/panels/LocalModelDebugPanel.tsx
+++ b/app/src/components/settings/panels/LocalModelDebugPanel.tsx
@@ -17,13 +17,10 @@ import {
   type LocalAiTtsResult,
   openhumanLocalAiAssetsStatus,
   openhumanLocalAiDiagnostics,
-  openhumanLocalAiDownload,
-  openhumanLocalAiDownloadAllAssets,
   openhumanLocalAiDownloadAsset,
   openhumanLocalAiDownloadsProgress,
   openhumanLocalAiEmbed,
   openhumanLocalAiPrompt,
-  openhumanLocalAiSetOllamaPath,
   openhumanLocalAiStatus,
   openhumanLocalAiSummarize,
   openhumanLocalAiTranscribe,
@@ -60,8 +57,6 @@ const LocalModelDebugPanel = () => {
   const [assets, setAssets] = useState<LocalAiAssetsStatus | null>(null);
   const [downloads, setDownloads] = useState<LocalAiDownloadsProgress | null>(null);
   const [statusError, setStatusError] = useState<string>('');
-  const [isTriggeringDownload, setIsTriggeringDownload] = useState(false);
-  const [bootstrapMessage, setBootstrapMessage] = useState<string>('');
   const [assetDownloadBusy, setAssetDownloadBusy] = useState<Record<string, boolean>>({});
 
   const [summaryInput, setSummaryInput] = useState('');
@@ -97,8 +92,6 @@ const LocalModelDebugPanel = () => {
   const [diagnosticsError, setDiagnosticsError] = useState('');
 
   const [showErrorDetail, setShowErrorDetail] = useState(false);
-  const [ollamaPathInput, setOllamaPathInput] = useState('');
-  const [isSettingPath, setIsSettingPath] = useState(false);
 
   const progress = useMemo(() => {
     const downloadProgress = progressFromDownloads(downloads);
@@ -158,29 +151,6 @@ const LocalModelDebugPanel = () => {
     };
   }, []);
 
-  const triggerDownload = async (force: boolean) => {
-    if (!runtimeEnabled) return;
-    setIsTriggeringDownload(true);
-    setStatusError('');
-    setBootstrapMessage('');
-    try {
-      await openhumanLocalAiDownload(force);
-      await openhumanLocalAiDownloadAllAssets(force);
-      const freshStatus = await openhumanLocalAiStatus();
-      setStatus(freshStatus.result);
-      if (freshStatus.result?.state === 'ready') {
-        setBootstrapMessage(force ? 'Re-bootstrap complete' : 'Models verified');
-      }
-      setTimeout(() => setBootstrapMessage(''), 3000);
-    } catch (err) {
-      const message =
-        err instanceof Error ? err.message : 'Failed to trigger local model bootstrap';
-      setStatusError(message);
-    } finally {
-      setIsTriggeringDownload(false);
-    }
-  };
-
   const runSummaryTest = async () => {
     if (!runtimeEnabled || !summaryInput.trim()) return;
     setIsSummaryLoading(true);
@@ -305,32 +275,6 @@ const LocalModelDebugPanel = () => {
     }
   };
 
-  const handleSetOllamaPath = async () => {
-    setIsSettingPath(true);
-    setStatusError('');
-    try {
-      await openhumanLocalAiSetOllamaPath(ollamaPathInput);
-      await loadStatus();
-    } catch (err) {
-      setStatusError(err instanceof Error ? err.message : 'Failed to set Ollama path');
-    } finally {
-      setIsSettingPath(false);
-    }
-  };
-
-  const handleClearOllamaPath = async () => {
-    setOllamaPathInput('');
-    setIsSettingPath(true);
-    try {
-      await openhumanLocalAiSetOllamaPath('');
-      await loadStatus();
-    } catch (err) {
-      setStatusError(err instanceof Error ? err.message : 'Failed to clear Ollama path');
-    } finally {
-      setIsSettingPath(false);
-    }
-  };
-
   const handleRunDiagnostics = async () => {
     setIsDiagnosticsLoading(true);
     setDiagnosticsError('');
@@ -361,25 +305,25 @@ const LocalModelDebugPanel = () => {
           isDiagnosticsLoading={isDiagnosticsLoading}
           diagnosticsError={diagnosticsError}
           statusError={statusError}
-          isTriggeringDownload={isTriggeringDownload}
-          bootstrapMessage={bootstrapMessage}
+          isTriggeringDownload={false}
+          bootstrapMessage=""
           progress={progress}
           isIndeterminateDownload={isIndeterminateDownload}
           isInstalling={isInstalling}
           isInstallError={isInstallError}
           showErrorDetail={showErrorDetail}
-          ollamaPathInput={ollamaPathInput}
-          isSettingPath={isSettingPath}
+          ollamaPathInput=""
+          isSettingPath={false}
           downloadedText={downloadedText}
           speedText={speedText}
           etaText={etaText}
           statusTone={statusTone}
           runtimeEnabled={runtimeEnabled}
           onRefreshStatus={() => void loadStatus()}
-          onTriggerDownload={force => void triggerDownload(force)}
-          onSetOllamaPath={() => void handleSetOllamaPath()}
-          onClearOllamaPath={() => void handleClearOllamaPath()}
-          onSetOllamaPathInput={setOllamaPathInput}
+          onTriggerDownload={() => {}}
+          onSetOllamaPath={() => {}}
+          onClearOllamaPath={() => {}}
+          onSetOllamaPathInput={() => {}}
           onToggleErrorDetail={() => setShowErrorDetail(v => !v)}
           onRunDiagnostics={() => void handleRunDiagnostics()}
         />
diff --git a/app/src/pages/onboarding/steps/LocalAIStep.tsx b/app/src/pages/onboarding/steps/LocalAIStep.tsx
index 3972ede2e0..8f324ad9cd 100644
--- a/app/src/pages/onboarding/steps/LocalAIStep.tsx
+++ b/app/src/pages/onboarding/steps/LocalAIStep.tsx
@@ -167,7 +167,7 @@ const LocalAIStep = ({ onNext, onBack: _onBack, onDownloadError }: LocalAIStepPr
         type="button"
         onClick={handleConsent}
         className="mt-3 w-full text-center text-xs text-stone-400 hover:text-stone-600 transition-colors">
-        Use local AI instead (install Ollama now)
+        Use local AI instead (connect Ollama now)
       </button>
     </div>
   );
diff --git a/app/src/services/api/aiSettingsApi.ts b/app/src/services/api/aiSettingsApi.ts
index 5a065b5244..7a5403c2ca 100644
--- a/app/src/services/api/aiSettingsApi.ts
+++ b/app/src/services/api/aiSettingsApi.ts
@@ -38,10 +38,7 @@ import {
   type ModelPresetResult,
   openhumanLocalAiApplyPreset,
   openhumanLocalAiDiagnostics,
-  openhumanLocalAiDownload,
   openhumanLocalAiPresets,
-  openhumanLocalAiSetOllamaPath,
-  openhumanLocalAiShutdownOwned,
   openhumanLocalAiStatus,
   type PresetsResponse,
 } from '../../utils/tauriCommands/localAi';
@@ -319,28 +316,10 @@ export async function setLocalRuntimeEnabled(enabled: boolean): Promise<void> {
   await openhumanUpdateLocalAiSettings({ runtime_enabled: enabled, opt_in_confirmed: enabled });
 }
 
-/**
- * Set / clear the user-configured Ollama binary path.
- */
-export async function setLocalOllamaPath(path: string): Promise<void> {
-  await openhumanLocalAiSetOllamaPath(path);
-}
-
-/**
- * Gate off the local-AI runtime.
- */
-export async function shutdownLocalProvider(): Promise<void> {
-  await setLocalRuntimeEnabled(false);
-  await openhumanLocalAiShutdownOwned();
-}
-
 /** Convenience helpers re-exported so the panel imports from one place. */
 export const localProvider = {
   applyPreset: (tier: string) => openhumanLocalAiApplyPreset(tier),
-  download: (retry: boolean) => openhumanLocalAiDownload(retry),
   setEnabled: (enabled: boolean) => setLocalRuntimeEnabled(enabled),
-  setBinaryPath: (path: string) => setLocalOllamaPath(path),
-  shutdown: () => shutdownLocalProvider(),
 };
 
 export type { ModelPresetResult };
diff --git a/app/src/utils/__tests__/localAiBootstrap.test.ts b/app/src/utils/__tests__/localAiBootstrap.test.ts
index 061534ed9b..a89e774b4a 100644
--- a/app/src/utils/__tests__/localAiBootstrap.test.ts
+++ b/app/src/utils/__tests__/localAiBootstrap.test.ts
@@ -7,7 +7,6 @@ import {
 
 vi.mock('../tauriCommands', () => ({
   openhumanLocalAiApplyPreset: vi.fn(),
-  openhumanLocalAiDownloadAllAssets: vi.fn(),
   openhumanLocalAiPresets: vi.fn(),
 }));
 
@@ -16,7 +15,7 @@ describe('localAiBootstrap', () => {
     vi.clearAllMocks();
   });
 
-  it('applies the recommended preset before starting background downloads when no tier is selected', async () => {
+  it('applies the recommended preset when no tier is selected', async () => {
     const tauriCommands = await import('../tauriCommands');
     vi.mocked(tauriCommands.openhumanLocalAiPresets).mockResolvedValue({
       presets: [],
@@ -40,21 +39,10 @@ describe('localAiBootstrap', () => {
       embedding_model_id: 'all-minilm:latest',
       quantization: 'qat',
     });
-    vi.mocked(tauriCommands.openhumanLocalAiDownloadAllAssets).mockResolvedValue({
-      result: { state: 'downloading', progress: 0 } as never,
-      logs: [],
-    });
-
     const result = await bootstrapLocalAiWithRecommendedPreset(false, '[test]');
 
     expect(tauriCommands.openhumanLocalAiPresets).toHaveBeenCalledOnce();
     expect(tauriCommands.openhumanLocalAiApplyPreset).toHaveBeenCalledWith('ram_2_4gb');
-    expect(tauriCommands.openhumanLocalAiDownloadAllAssets).toHaveBeenCalledWith(false);
-    expect(
-      vi.mocked(tauriCommands.openhumanLocalAiApplyPreset).mock.invocationCallOrder[0]
-    ).toBeLessThan(
-      vi.mocked(tauriCommands.openhumanLocalAiDownloadAllAssets).mock.invocationCallOrder[0]
-    );
     expect(result.preset.hadSelectedTier).toBe(false);
     expect(result.preset.appliedTier).toBe('ram_2_4gb');
   });
diff --git a/app/src/utils/localAiBootstrap.ts b/app/src/utils/localAiBootstrap.ts
index fccb7f3423..ba6d4f9b2e 100644
--- a/app/src/utils/localAiBootstrap.ts
+++ b/app/src/utils/localAiBootstrap.ts
@@ -1,6 +1,5 @@
 import {
   openhumanLocalAiApplyPreset,
-  openhumanLocalAiDownloadAllAssets,
   openhumanLocalAiPresets,
   type PresetsResponse,
 } from './tauriCommands';
@@ -95,23 +94,11 @@ export const ensureRecommendedLocalAiPresetIfNeeded = async (
   };
 };
 
-export const triggerLocalAiAssetBootstrap = async (
-  force = false,
-  logPrefix = '[local-ai-bootstrap]'
-) => {
-  console.debug(`${logPrefix} triggering local AI background bootstrap`, JSON.stringify({ force }));
-  return await retryLocalAiCommand(
-    force ? 're-bootstrap local AI assets' : 'bootstrap local AI assets',
-    () => openhumanLocalAiDownloadAllAssets(force),
-    logPrefix
-  );
-};
-
 export const bootstrapLocalAiWithRecommendedPreset = async (
   force = false,
   logPrefix = '[local-ai-bootstrap]'
 ) => {
+  void force;
   const preset = await ensureRecommendedLocalAiPresetIfNeeded(logPrefix);
-  const download = await triggerLocalAiAssetBootstrap(force, logPrefix);
-  return { preset, download };
+  return { preset };
 };
diff --git a/app/src/utils/tauriCommands/localAi.ts b/app/src/utils/tauriCommands/localAi.ts
index 432de5f6f0..a49730eec6 100644
--- a/app/src/utils/tauriCommands/localAi.ts
+++ b/app/src/utils/tauriCommands/localAi.ts
@@ -261,32 +261,6 @@ export async function openhumanLocalAiStatus(): Promise<CommandResponse<LocalAiS
   }
 }
 
-export async function openhumanLocalAiDownload(
-  force?: boolean
-): Promise<CommandResponse<LocalAiStatus>> {
-  try {
-    return await callCoreRpc<CommandResponse<LocalAiStatus>>({
-      method: 'openhuman.local_ai_download',
-      params: { force: force ?? false },
-    });
-  } catch (err) {
-    const message = tauriErrorMessage(err);
-    if (message.includes('unknown method: openhuman.local_ai_download')) {
-      return await openhumanLocalAiStatus();
-    }
-    throw new Error(message);
-  }
-}
-
-export async function openhumanLocalAiDownloadAllAssets(
-  force?: boolean
-): Promise<CommandResponse<LocalAiDownloadsProgress>> {
-  return await callCoreRpc<CommandResponse<LocalAiDownloadsProgress>>({
-    method: 'openhuman.local_ai_download_all_assets',
-    params: { force: force ?? false },
-  });
-}
-
 export async function openhumanLocalAiSummarize(
   text: string,
   maxTokens?: number
@@ -468,24 +442,3 @@ export async function openhumanLocalAiDiagnostics(): Promise<LocalAiDiagnostics>
     params: {},
   });
 }
-
-export async function openhumanLocalAiSetOllamaPath(
-  path: string
-): Promise<{ ollama_binary_path: string | null; status: LocalAiStatus }> {
-  return await callCoreRpc<{ ollama_binary_path: string | null; status: LocalAiStatus }>({
-    method: 'openhuman.local_ai_set_ollama_path',
-    params: { path },
-  });
-}
-
-/**
- * Gate off the local-AI runtime: kills the Ollama daemon only if OpenHuman
- * spawned it (external daemons are left running), and forces status to
- * `"disabled"` so the UI flips immediately.
- */
-export async function openhumanLocalAiShutdownOwned(): Promise<CommandResponse<LocalAiStatus>> {
-  return await callCoreRpc<CommandResponse<LocalAiStatus>>({
-    method: 'openhuman.local_ai_shutdown_owned',
-    params: {},
-  });
-}
diff --git a/src/core/jsonrpc_tests.rs b/src/core/jsonrpc_tests.rs
index ad2433eb8e..fdd7856deb 100644
--- a/src/core/jsonrpc_tests.rs
+++ b/src/core/jsonrpc_tests.rs
@@ -281,18 +281,6 @@ async fn invoke_migrate_openclaw_rejects_unknown_param() {
     assert!(err.contains("unknown param 'x'"));
 }
 
-#[tokio::test]
-async fn invoke_local_ai_download_asset_missing_required_param_fails_validation() {
-    let err = invoke_method(
-        default_state(),
-        "openhuman.local_ai_download_asset",
-        json!({}),
-    )
-    .await
-    .expect_err("missing capability should fail");
-    assert!(err.contains("missing required param 'capability'"));
-}
-
 #[test]
 fn http_schema_dump_includes_openhuman_and_core_methods() {
     let dump = build_http_schema_dump();
diff --git a/src/openhuman/local_ai/README.md b/src/openhuman/local_ai/README.md
index 2490c5463a..8a60bfce90 100644
--- a/src/openhuman/local_ai/README.md
+++ b/src/openhuman/local_ai/README.md
@@ -13,7 +13,7 @@ On-device inference stack. Owns the bundled Ollama runtime, LM Studio local-serv
 - `pub struct GifDecision` / `pub struct TenorGifResult` / `pub struct TenorSearchResult` — `gif_decision.rs`.
 - Status / progress / result types: `pub struct LocalAiStatus`, `LocalAiAssetStatus`, `LocalAiAssetsStatus`, `LocalAiDownloadProgressItem`, `LocalAiDownloadsProgress`, `LocalAiEmbeddingResult`, `LocalAiSpeechResult`, `LocalAiTtsResult` — `types.rs`.
 - `pub mod ops` (re-exported as `rpc`) — `ops.rs` — typed Rust wrappers around each capability (`agent_chat`, `agent_chat_simple`, `summarize`, `prompt`, `vision_prompt`, `embed`, `transcribe`, `tts`, `should_react`, `analyze_sentiment`, `should_send_gif`, `tenor_search`).
-- RPC `local_ai.{agent_chat, agent_chat_simple, local_ai_status, local_ai_download, local_ai_download_all_assets, local_ai_summarize, local_ai_prompt, local_ai_vision_prompt, local_ai_embed, local_ai_transcribe, local_ai_transcribe_bytes, local_ai_tts, local_ai_assets_status, local_ai_downloads_progress, local_ai_download_asset, local_ai_device_profile, local_ai_presets, local_ai_apply_preset, local_ai_diagnostics, local_ai_set_ollama_path, local_ai_chat, local_ai_should_react, local_ai_analyze_sentiment, local_ai_should_send_gif, local_ai_tenor_search}` — `schemas.rs`.
+- RPC `local_ai.{agent_chat, agent_chat_simple, local_ai_status, local_ai_summarize, local_ai_prompt, local_ai_vision_prompt, local_ai_embed, local_ai_transcribe, local_ai_transcribe_bytes, local_ai_tts, local_ai_assets_status, local_ai_downloads_progress, local_ai_download_asset, local_ai_device_profile, local_ai_presets, local_ai_apply_preset, local_ai_diagnostics, local_ai_chat, local_ai_should_react, local_ai_analyze_sentiment, local_ai_should_send_gif, local_ai_tenor_search}` — `schemas.rs`.
 
 ## Calls into
 
diff --git a/src/openhuman/local_ai/ops.rs b/src/openhuman/local_ai/ops.rs
index 35c2e500cf..24de41a222 100644
--- a/src/openhuman/local_ai/ops.rs
+++ b/src/openhuman/local_ai/ops.rs
@@ -153,85 +153,6 @@ pub async fn local_ai_status(
     ))
 }
 
-/// Stop the local-AI runtime, killing the Ollama daemon ONLY if OpenHuman
-/// spawned it, and shift any workload routed to `ollama:<model>` back to
-/// `"cloud"` (= primary).
-///
-/// Three coordinated effects:
-///
-/// 1. **Daemon shutdown** — `shutdown_owned_ollama` kills the child process
-///    only when the spawn marker matches. External daemons (system service,
-///    user-launched `ollama serve`, daemons from another OpenHuman workspace)
-///    are left untouched, per the same friendly-fire-avoidance rule
-///    `ensure_ollama_server` follows at startup.
-///
-/// 2. **Routing shift** — every `*_provider` field starting with `ollama:`
-///    is cleared (set to `None`, which resolves to `"cloud"` at the factory).
-///    Without this, the next chat call routed to `reasoning` (or any other
-///    workload the user had set to `ollama:<m>`) would fail at factory
-///    build time. The shift is one-way: re-enabling local AI does NOT
-///    restore the previous Ollama routes — the user re-picks.
-///
-/// 3. **Status forced to disabled** so the UI reflects the gate immediately.
-pub async fn local_ai_shutdown_owned(
-    config: &mut Config,
-) -> Result<RpcOutcome<local_ai::LocalAiStatus>, String> {
-    let _ = config;
-    Err("OpenHuman does not manage the Ollama process anymore. Stop or restart your external Ollama runtime directly.".to_string())
-}
-
-/// Clear every per-workload `*_provider` field whose stored value starts
-/// with `"ollama:"`. Returns the count of fields actually changed so the
-/// caller can decide whether to persist.
-fn clear_ollama_workload_routes(config: &mut Config) -> usize {
-    fn clear_if_ollama(field: &mut Option<String>) -> bool {
-        let is_ollama = field
-            .as_deref()
-            .map(|s| s.trim().starts_with("ollama:"))
-            .unwrap_or(false);
-        if is_ollama {
-            *field = None;
-            true
-        } else {
-            false
-        }
-    }
-    let mut changed = 0;
-    for field in [
-        &mut config.reasoning_provider,
-        &mut config.agentic_provider,
-        &mut config.coding_provider,
-        &mut config.memory_provider,
-        &mut config.embeddings_provider,
-        &mut config.heartbeat_provider,
-        &mut config.learning_provider,
-        &mut config.subconscious_provider,
-    ] {
-        if clear_if_ollama(field) {
-            changed += 1;
-        }
-    }
-    changed
-}
-
-/// Triggers a full download of all required local AI models.
-pub async fn local_ai_download(
-    config: &Config,
-    force: bool,
-) -> Result<RpcOutcome<local_ai::LocalAiStatus>, String> {
-    let _ = (config, force);
-    Err("OpenHuman no longer downloads or starts Ollama for you. Start your external Ollama runtime and pull models yourself.".to_string())
-}
-
-/// Triggers a download of all local AI assets and returns progress information.
-pub async fn local_ai_download_all_assets(
-    config: &Config,
-    force: bool,
-) -> Result<RpcOutcome<LocalAiDownloadsProgress>, String> {
-    let _ = (config, force);
-    Err("OpenHuman no longer downloads Ollama assets. Start your external Ollama runtime and manage model pulls yourself.".to_string())
-}
-
 /// Generates a summary of the provided text using local AI models.
 pub async fn local_ai_summarize(
     config: &Config,
@@ -422,21 +343,14 @@ pub async fn local_ai_download_asset(
     config: &Config,
     capability: &str,
 ) -> Result<RpcOutcome<LocalAiAssetsStatus>, String> {
-    let capability = capability.trim().to_ascii_lowercase();
-    if matches!(capability.as_str(), "stt" | "tts") {
-        let service = local_ai::global(config);
-        let output = service
-            .download_asset(config, capability.as_str())
-            .await
-            .map_err(|e| e.to_string())?;
-        return Ok(RpcOutcome::single_log(
-            output,
-            "local ai voice asset download triggered",
-        ));
-    }
-
-    Err(format!(
-        "OpenHuman no longer downloads `{capability}` via Ollama. Start your external Ollama runtime and pull that model yourself."
+    let service = local_ai::global(config);
+    let output = service
+        .download_asset(config, capability.trim())
+        .await
+        .map_err(|e| e.to_string())?;
+    Ok(RpcOutcome::single_log(
+        output,
+        "local ai voice asset download triggered",
     ))
 }
 
diff --git a/src/openhuman/local_ai/schemas.rs b/src/openhuman/local_ai/schemas.rs
index d2b9bb5b45..473d01ccd8 100644
--- a/src/openhuman/local_ai/schemas.rs
+++ b/src/openhuman/local_ai/schemas.rs
@@ -14,11 +14,6 @@ struct AgentChatParams {
     temperature: Option<f64>,
 }
 
-#[derive(Debug, Deserialize)]
-struct LocalAiDownloadParams {
-    force: Option<bool>,
-}
-
 #[derive(Debug, Deserialize)]
 struct LocalAiSummarizeParams {
     text: String,
@@ -71,11 +66,6 @@ struct LocalAiApplyPresetParams {
     tier: String,
 }
 
-#[derive(Debug, Deserialize)]
-struct LocalAiSetOllamaPathParams {
-    path: String,
-}
-
 #[derive(Debug, Deserialize)]
 struct LocalAiChatMessageParam {
     role: String,
@@ -138,9 +128,6 @@ pub fn all_controller_schemas() -> Vec<ControllerSchema> {
         schemas("agent_chat"),
         schemas("agent_chat_simple"),
         schemas("local_ai_status"),
-        schemas("local_ai_shutdown_owned"),
-        schemas("local_ai_download"),
-        schemas("local_ai_download_all_assets"),
         schemas("local_ai_summarize"),
         schemas("local_ai_prompt"),
         schemas("local_ai_vision_prompt"),
@@ -154,7 +141,6 @@ pub fn all_controller_schemas() -> Vec<ControllerSchema> {
         schemas("local_ai_device_profile"),
         schemas("local_ai_presets"),
         schemas("local_ai_apply_preset"),
-        schemas("local_ai_set_ollama_path"),
         schemas("local_ai_diagnostics"),
         schemas("local_ai_chat"),
         schemas("local_ai_should_react"),
@@ -182,18 +168,6 @@ pub fn all_registered_controllers() -> Vec<RegisteredController> {
             schema: schemas("local_ai_status"),
             handler: handle_local_ai_status,
         },
-        RegisteredController {
-            schema: schemas("local_ai_shutdown_owned"),
-            handler: handle_local_ai_shutdown_owned,
-        },
-        RegisteredController {
-            schema: schemas("local_ai_download"),
-            handler: handle_local_ai_download,
-        },
-        RegisteredController {
-            schema: schemas("local_ai_download_all_assets"),
-            handler: handle_local_ai_download_all_assets,
-        },
         RegisteredController {
             schema: schemas("local_ai_summarize"),
             handler: handle_local_ai_summarize,
@@ -246,10 +220,6 @@ pub fn all_registered_controllers() -> Vec<RegisteredController> {
             schema: schemas("local_ai_apply_preset"),
             handler: handle_local_ai_apply_preset,
         },
-        RegisteredController {
-            schema: schemas("local_ai_set_ollama_path"),
-            handler: handle_local_ai_set_ollama_path,
-        },
         RegisteredController {
             schema: schemas("local_ai_diagnostics"),
             handler: handle_local_ai_diagnostics,
@@ -324,30 +294,6 @@ pub fn schemas(function: &str) -> ControllerSchema {
             inputs: vec![],
             outputs: vec![json_output("status", "Local AI status payload.")],
         },
-        "local_ai_shutdown_owned" => ControllerSchema {
-            namespace: "local_ai",
-            function: "shutdown_owned",
-            description:
-                "Gate off the local AI runtime. Kills the Ollama daemon only \
-                 if OpenHuman spawned it (external daemons are left running). \
-                 Forces status to \"disabled\" so the UI flips immediately.",
-            inputs: vec![],
-            outputs: vec![json_output("status", "Local AI status after shutdown.")],
-        },
-        "local_ai_download" => ControllerSchema {
-            namespace: "local_ai",
-            function: "download",
-            description: "Trigger local AI model download bootstrap.",
-            inputs: vec![optional_bool("force", "Reset state before download.")],
-            outputs: vec![json_output("status", "Local AI status payload.")],
-        },
-        "local_ai_download_all_assets" => ControllerSchema {
-            namespace: "local_ai",
-            function: "download_all_assets",
-            description: "Trigger full local AI asset download.",
-            inputs: vec![optional_bool("force", "Reset state before download.")],
-            outputs: vec![json_output("progress", "Download progress payload.")],
-        },
         "local_ai_summarize" => ControllerSchema {
             namespace: "local_ai",
             function: "summarize",
@@ -488,13 +434,6 @@ pub fn schemas(function: &str) -> ControllerSchema {
             inputs: vec![],
             outputs: vec![json_output("diagnostics", "Diagnostic report.")],
         },
-        "local_ai_set_ollama_path" => ControllerSchema {
-            namespace: "local_ai",
-            function: "set_ollama_path",
-            description: "Set a custom Ollama binary path, persist to config, and trigger re-bootstrap.",
-            inputs: vec![required_string("path", "Absolute path to Ollama binary. Empty string to clear.")],
-            outputs: vec![json_output("result", "Updated status.")],
-        },
         "local_ai_chat" => ControllerSchema {
             namespace: "local_ai",
             function: "chat",
@@ -649,38 +588,6 @@ fn handle_local_ai_status(_params: Map<String, Value>) -> ControllerFuture {
     })
 }
 
-fn handle_local_ai_shutdown_owned(_params: Map<String, Value>) -> ControllerFuture {
-    Box::pin(async move {
-        let mut config = config_rpc::load_config_with_timeout().await?;
-        to_json(crate::openhuman::local_ai::rpc::local_ai_shutdown_owned(&mut config).await?)
-    })
-}
-
-fn handle_local_ai_download(params: Map<String, Value>) -> ControllerFuture {
-    Box::pin(async move {
-        let p = deserialize_params::<LocalAiDownloadParams>(params)?;
-        let config = config_rpc::load_config_with_timeout().await?;
-        to_json(
-            crate::openhuman::local_ai::rpc::local_ai_download(&config, p.force.unwrap_or(false))
-                .await?,
-        )
-    })
-}
-
-fn handle_local_ai_download_all_assets(params: Map<String, Value>) -> ControllerFuture {
-    Box::pin(async move {
-        let p = deserialize_params::<LocalAiDownloadParams>(params)?;
-        let config = config_rpc::load_config_with_timeout().await?;
-        to_json(
-            crate::openhuman::local_ai::rpc::local_ai_download_all_assets(
-                &config,
-                p.force.unwrap_or(false),
-            )
-            .await?,
-        )
-    })
-}
-
 fn handle_local_ai_summarize(params: Map<String, Value>) -> ControllerFuture {
     Box::pin(async move {
         let p = deserialize_params::<LocalAiSummarizeParams>(params)?;
@@ -923,13 +830,6 @@ fn handle_local_ai_diagnostics(_params: Map<String, Value>) -> ControllerFuture
     })
 }
 
-fn handle_local_ai_set_ollama_path(params: Map<String, Value>) -> ControllerFuture {
-    Box::pin(async move {
-        let _ = deserialize_params::<LocalAiSetOllamaPathParams>(params)?;
-        Err("OpenHuman no longer manages an Ollama binary path. Point your inference setup at an already-running Ollama-compatible endpoint instead.".to_string())
-    })
-}
-
 fn handle_local_ai_should_react(params: Map<String, Value>) -> ControllerFuture {
     Box::pin(async move {
         let p = deserialize_params::<LocalAiShouldReactParams>(params)?;
diff --git a/src/openhuman/local_ai/schemas_tests.rs b/src/openhuman/local_ai/schemas_tests.rs
index b07e60568e..07ccc76481 100644
--- a/src/openhuman/local_ai/schemas_tests.rs
+++ b/src/openhuman/local_ai/schemas_tests.rs
@@ -30,8 +30,6 @@ fn every_registered_key_resolves_to_non_unknown_schema() {
         "agent_chat",
         "agent_chat_simple",
         "local_ai_status",
-        "local_ai_download",
-        "local_ai_download_all_assets",
         "local_ai_summarize",
         "local_ai_prompt",
         "local_ai_vision_prompt",
@@ -45,7 +43,6 @@ fn every_registered_key_resolves_to_non_unknown_schema() {
         "local_ai_device_profile",
         "local_ai_presets",
         "local_ai_apply_preset",
-        "local_ai_set_ollama_path",
         "local_ai_diagnostics",
         "local_ai_chat",
         "local_ai_should_react",
@@ -237,39 +234,6 @@ async fn handle_apply_preset_accepts_valid_tier_and_persists() {
     assert!(result.get("chat_model_id").is_some());
 }
 
-#[tokio::test]
-async fn handle_set_ollama_path_reports_external_runtime_contract() {
-    let _g = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
-    let tmp = TempDir::new().unwrap();
-    unsafe {
-        std::env::set_var("OPENHUMAN_WORKSPACE", tmp.path());
-    }
-    let params = Map::from_iter([(
-        "path".to_string(),
-        serde_json::json!("/this/path/should/not/exist/ollama"),
-    )]);
-    let err = handle_local_ai_set_ollama_path(params).await.unwrap_err();
-    unsafe {
-        std::env::remove_var("OPENHUMAN_WORKSPACE");
-    }
-    assert!(err.contains("no longer manages an Ollama binary path"));
-}
-
-#[tokio::test]
-async fn handle_set_ollama_path_rejects_empty_string_too() {
-    let _g = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
-    let tmp = TempDir::new().unwrap();
-    unsafe {
-        std::env::set_var("OPENHUMAN_WORKSPACE", tmp.path());
-    }
-    let params = Map::from_iter([("path".to_string(), serde_json::json!(""))]);
-    let err = handle_local_ai_set_ollama_path(params).await.unwrap_err();
-    unsafe {
-        std::env::remove_var("OPENHUMAN_WORKSPACE");
-    }
-    assert!(err.contains("no longer manages an Ollama binary path"));
-}
-
 /// Regression test for the CodeRabbit #7 race on PR #1755: when two
 /// concurrent RPC calls (e.g. a double-click, or the auto-install firing
 /// alongside a manual click) hit `handle_local_ai_install_whisper` at

From 8223b5c591b63c314fb90f2a71e8a1bb2b960760 Mon Sep 17 00:00:00 2001
From: Steven Enamakel <enamakel@tinyhumans.ai>
Date: Sat, 16 May 2026 15:58:09 -0700
Subject: [PATCH 04/18] Add direct runtime inference coverage

---
 .../local-model/ModelDownloadSection.test.tsx | 53 +++++++++++++++++++
 .../local_ai/service/public_infer_tests.rs    | 20 +++++++
 2 files changed, 73 insertions(+)

diff --git a/app/src/components/settings/panels/local-model/ModelDownloadSection.test.tsx b/app/src/components/settings/panels/local-model/ModelDownloadSection.test.tsx
index 1e377919bf..47e89f7607 100644
--- a/app/src/components/settings/panels/local-model/ModelDownloadSection.test.tsx
+++ b/app/src/components/settings/panels/local-model/ModelDownloadSection.test.tsx
@@ -64,4 +64,57 @@ describe('ModelDownloadSection runtime gate', () => {
     expect(props.onRunSummaryTest).not.toHaveBeenCalled();
     expect(props.onRunPromptTest).not.toHaveBeenCalled();
   });
+
+  it('shows external-runtime guidance for ollama-backed assets', () => {
+    render(
+      <ModelDownloadSection
+        {...makeProps()}
+        runtimeEnabled={true}
+        assets={{
+          quantization: 'q4',
+          chat: {
+            id: 'gemma3:1b-it-qat',
+            provider: 'ollama',
+            state: 'missing',
+            path: 'ollama://gemma3:1b-it-qat',
+            warning: null,
+          },
+          vision: {
+            id: '',
+            provider: 'ollama',
+            state: 'disabled',
+            path: null,
+            warning: null,
+          },
+          embedding: {
+            id: 'bge-m3',
+            provider: 'ollama',
+            state: 'missing',
+            path: 'ollama://bge-m3',
+            warning: null,
+          },
+          stt: {
+            id: 'whisper',
+            provider: 'whisper',
+            state: 'ondemand',
+            path: null,
+            warning: null,
+          },
+          tts: {
+            id: 'piper',
+            provider: 'piper',
+            state: 'ondemand',
+            path: null,
+            warning: null,
+          },
+          ollama_available: true,
+        }}
+      />
+    );
+
+    expect(
+      screen.getAllByText('Manage this model in your external runtime.').length
+    ).toBeGreaterThan(0);
+    expect(screen.getAllByRole('button', { name: 'Download' }).length).toBeGreaterThan(0);
+  });
 });
diff --git a/src/openhuman/local_ai/service/public_infer_tests.rs b/src/openhuman/local_ai/service/public_infer_tests.rs
index 20c95ae683..da6f77f188 100644
--- a/src/openhuman/local_ai/service/public_infer_tests.rs
+++ b/src/openhuman/local_ai/service/public_infer_tests.rs
@@ -96,6 +96,26 @@ async fn inference_errors_on_non_success_status() {
     }
 }
 
+#[tokio::test]
+async fn inference_connection_failure_mentions_external_ollama_runtime() {
+    let _guard = crate::openhuman::local_ai::local_ai_test_guard();
+
+    unsafe {
+        std::env::set_var("OPENHUMAN_OLLAMA_BASE_URL", "http://127.0.0.1:1");
+    }
+
+    let config = enabled_config();
+    let service = ready_service(&config);
+    let err = service.prompt(&config, "hi", None, true).await.unwrap_err();
+
+    unsafe {
+        std::env::remove_var("OPENHUMAN_OLLAMA_BASE_URL");
+    }
+
+    assert!(err.contains("external Ollama endpoint"), "unexpected error: {err}");
+    assert!(err.contains("already running"), "unexpected error: {err}");
+}
+
 #[tokio::test]
 async fn inference_errors_on_empty_response_when_allow_empty_false() {
     let _guard = crate::openhuman::local_ai::local_ai_test_guard();

From b1abf16b9232f5aa1df112cfd0a6baac82278695 Mon Sep 17 00:00:00 2001
From: Steven Enamakel <enamakel@tinyhumans.ai>
Date: Sat, 16 May 2026 15:59:48 -0700
Subject: [PATCH 05/18] Expand local model UI coverage

---
 .../DeviceCapabilitySection.test.tsx          | 87 ++++++++++++++++++
 .../api/__tests__/aiSettingsApi.test.ts       | 89 +++++++++++++++++--
 2 files changed, 168 insertions(+), 8 deletions(-)
 create mode 100644 app/src/components/settings/panels/local-model/DeviceCapabilitySection.test.tsx

diff --git a/app/src/components/settings/panels/local-model/DeviceCapabilitySection.test.tsx b/app/src/components/settings/panels/local-model/DeviceCapabilitySection.test.tsx
new file mode 100644
index 0000000000..bd80d55267
--- /dev/null
+++ b/app/src/components/settings/panels/local-model/DeviceCapabilitySection.test.tsx
@@ -0,0 +1,87 @@
+import { fireEvent, render, screen, waitFor } from '@testing-library/react';
+import { describe, expect, it, vi, beforeEach } from 'vitest';
+
+import DeviceCapabilitySection from './DeviceCapabilitySection';
+
+const mockApplyPreset = vi.fn();
+
+vi.mock('../../../../utils/tauriCommands', () => ({
+  openhumanLocalAiApplyPreset: (...args: unknown[]) => mockApplyPreset(...args),
+}));
+
+const makePresetsData = (overrides: Record<string, unknown> = {}) => ({
+  presets: [
+    {
+      tier: 'ram_2_4gb',
+      label: '2-4 GB',
+      description: 'Small local tier',
+      chat_model_id: 'gemma3:1b-it-qat',
+      vision_model_id: '',
+      embedding_model_id: 'bge-m3',
+      quantization: 'q4',
+      vision_mode: 'disabled',
+      supports_screen_summary: false,
+      target_ram_gb: 4,
+      min_ram_gb: 2,
+      approx_download_gb: 1.2,
+    },
+  ],
+  recommended_tier: 'ram_2_4gb',
+  current_tier: 'ram_2_4gb',
+  selected_tier: 'ram_2_4gb',
+  recommend_disabled: false,
+  local_ai_enabled: true,
+  device: {
+    total_ram_bytes: 16 * 1024 * 1024 * 1024,
+    cpu_count: 8,
+    cpu_brand: 'Test CPU',
+    os_name: 'macOS',
+    os_version: '15',
+    has_gpu: true,
+    gpu_description: 'Test GPU',
+  },
+  ...overrides,
+});
+
+describe('DeviceCapabilitySection', () => {
+  beforeEach(() => {
+    mockApplyPreset.mockReset();
+  });
+
+  it('renders external runtime guidance when ollama is unavailable', () => {
+    render(
+      <DeviceCapabilitySection
+        presetsData={makePresetsData()}
+        presetsLoading={false}
+        presetError=""
+        presetSuccess={null}
+        formatRamGb={() => '16 GB'}
+        ollamaAvailable={false}
+      />
+    );
+
+    expect(screen.getByText(/Run Ollama first/i)).toBeTruthy();
+    expect(screen.getByRole('link', { name: 'Ollama docs' })).toBeTruthy();
+    expect(screen.getByTitle('Run Ollama first to use this tier')).toBeTruthy();
+  });
+
+  it('allows selecting the disabled cloud fallback tier', async () => {
+    mockApplyPreset.mockResolvedValueOnce({ applied_tier: 'disabled' });
+
+    render(
+      <DeviceCapabilitySection
+        presetsData={makePresetsData({ local_ai_enabled: false })}
+        presetsLoading={false}
+        presetError=""
+        presetSuccess={null}
+        formatRamGb={() => '16 GB'}
+      />
+    );
+
+    fireEvent.click(screen.getByRole('button', { name: /Disabled.*0 GB/i }));
+
+    await waitFor(() => {
+      expect(mockApplyPreset).toHaveBeenCalledWith('disabled');
+    });
+  });
+});
diff --git a/app/src/services/api/__tests__/aiSettingsApi.test.ts b/app/src/services/api/__tests__/aiSettingsApi.test.ts
index a9d9ed5ca9..be7f565e69 100644
--- a/app/src/services/api/__tests__/aiSettingsApi.test.ts
+++ b/app/src/services/api/__tests__/aiSettingsApi.test.ts
@@ -12,11 +12,14 @@ import {
   type AISettings,
   clearCloudProviderKey,
   listProviderModels,
+  loadLocalProviderSnapshot,
+  localProvider,
   loadAISettings,
   parseProviderString,
   type ProviderRef,
   saveAISettings,
   serializeProviderRef,
+  setLocalRuntimeEnabled,
   setCloudProviderKey,
 } from '../aiSettingsApi';
 
@@ -25,10 +28,15 @@ import {
 const mockOpenhumanGetClientConfig = vi.fn();
 const mockAuthListProviderCredentials = vi.fn();
 const mockOpenhumanUpdateModelSettings = vi.fn();
+const mockOpenhumanUpdateLocalAiSettings = vi.fn();
 const mockAuthStoreProviderCredentials = vi.fn();
 const mockAuthRemoveProviderCredentials = vi.fn();
 const mockCallCoreRpc = vi.fn();
 const mockIsTauri = vi.fn(() => true);
+const mockOpenhumanLocalAiStatus = vi.fn();
+const mockOpenhumanLocalAiDiagnostics = vi.fn();
+const mockOpenhumanLocalAiPresets = vi.fn();
+const mockOpenhumanLocalAiApplyPreset = vi.fn();
 
 vi.mock('../../coreRpcClient', () => ({ callCoreRpc: (a: unknown) => mockCallCoreRpc(a) }));
 
@@ -46,17 +54,14 @@ vi.mock('../../../utils/tauriCommands/auth', () => ({
 vi.mock('../../../utils/tauriCommands/config', () => ({
   openhumanGetClientConfig: () => mockOpenhumanGetClientConfig(),
   openhumanUpdateModelSettings: (a: unknown) => mockOpenhumanUpdateModelSettings(a),
-  openhumanUpdateLocalAiSettings: vi.fn().mockResolvedValue({ result: {} }),
+  openhumanUpdateLocalAiSettings: (a: unknown) => mockOpenhumanUpdateLocalAiSettings(a),
 }));
 
 vi.mock('../../../utils/tauriCommands/localAi', () => ({
-  openhumanLocalAiStatus: vi.fn().mockResolvedValue({ result: null }),
-  openhumanLocalAiDiagnostics: vi.fn().mockResolvedValue(null),
-  openhumanLocalAiPresets: vi.fn().mockResolvedValue(null),
-  openhumanLocalAiApplyPreset: vi.fn().mockResolvedValue({}),
-  openhumanLocalAiDownload: vi.fn().mockResolvedValue({}),
-  openhumanLocalAiSetOllamaPath: vi.fn().mockResolvedValue({}),
-  openhumanLocalAiShutdownOwned: vi.fn().mockResolvedValue({}),
+  openhumanLocalAiStatus: (...args: unknown[]) => mockOpenhumanLocalAiStatus(...args),
+  openhumanLocalAiDiagnostics: (...args: unknown[]) => mockOpenhumanLocalAiDiagnostics(...args),
+  openhumanLocalAiPresets: (...args: unknown[]) => mockOpenhumanLocalAiPresets(...args),
+  openhumanLocalAiApplyPreset: (...args: unknown[]) => mockOpenhumanLocalAiApplyPreset(...args),
 }));
 
 // ─── Helpers ─────────────────────────────────────────────────────────────────
@@ -174,6 +179,11 @@ describe('loadAISettings', () => {
   beforeEach(() => {
     mockOpenhumanGetClientConfig.mockReset();
     mockAuthListProviderCredentials.mockReset();
+    mockOpenhumanUpdateLocalAiSettings.mockReset();
+    mockOpenhumanLocalAiStatus.mockReset();
+    mockOpenhumanLocalAiDiagnostics.mockReset();
+    mockOpenhumanLocalAiPresets.mockReset();
+    mockOpenhumanLocalAiApplyPreset.mockReset();
   });
 
   it('returns cloudProviders with has_api_key=false when no profiles stored', async () => {
@@ -362,6 +372,69 @@ describe('loadAISettings', () => {
   });
 });
 
+describe('local provider facade', () => {
+  beforeEach(() => {
+    mockOpenhumanUpdateLocalAiSettings.mockReset();
+    mockOpenhumanLocalAiStatus.mockReset();
+    mockOpenhumanLocalAiDiagnostics.mockReset();
+    mockOpenhumanLocalAiPresets.mockReset();
+    mockOpenhumanLocalAiApplyPreset.mockReset();
+  });
+
+  it('loadLocalProviderSnapshot joins status diagnostics and presets', async () => {
+    mockOpenhumanLocalAiStatus.mockResolvedValue({ result: { state: 'ready' } });
+    mockOpenhumanLocalAiDiagnostics.mockResolvedValue({
+      installed_models: [{ name: 'gemma3:1b-it-qat', size: 123 }],
+    });
+    mockOpenhumanLocalAiPresets.mockResolvedValue({
+      recommended_tier: 'ram_2_4gb',
+      current_tier: 'ram_2_4gb',
+      selected_tier: 'ram_2_4gb',
+      presets: [],
+      device: {
+        total_ram_bytes: 1,
+        cpu_count: 1,
+        cpu_brand: 'cpu',
+        os_name: 'os',
+        os_version: '1',
+        has_gpu: false,
+        gpu_description: null,
+      },
+    });
+
+    const snapshot = await loadLocalProviderSnapshot();
+
+    expect(snapshot.status).toEqual({ state: 'ready' });
+    expect(snapshot.installedModels).toEqual([{ name: 'gemma3:1b-it-qat', size: 123 }]);
+    expect(snapshot.presets?.recommended_tier).toBe('ram_2_4gb');
+  });
+
+  it('setLocalRuntimeEnabled updates runtime_enabled and opt_in_confirmed together', async () => {
+    mockOpenhumanUpdateLocalAiSettings.mockResolvedValue({ result: {} });
+
+    await setLocalRuntimeEnabled(true);
+
+    expect(mockOpenhumanUpdateLocalAiSettings).toHaveBeenCalledWith({
+      runtime_enabled: true,
+      opt_in_confirmed: true,
+    });
+  });
+
+  it('localProvider facade delegates applyPreset and setEnabled', async () => {
+    mockOpenhumanLocalAiApplyPreset.mockResolvedValue({ applied_tier: 'ram_2_4gb' });
+    mockOpenhumanUpdateLocalAiSettings.mockResolvedValue({ result: {} });
+
+    await localProvider.applyPreset('ram_2_4gb');
+    await localProvider.setEnabled(false);
+
+    expect(mockOpenhumanLocalAiApplyPreset).toHaveBeenCalledWith('ram_2_4gb');
+    expect(mockOpenhumanUpdateLocalAiSettings).toHaveBeenCalledWith({
+      runtime_enabled: false,
+      opt_in_confirmed: false,
+    });
+  });
+});
+
 // ─── saveAISettings ──────────────────────────────────────────────────────────
 
 describe('saveAISettings', () => {

From 12bed1a522dec88b292f509796eee6e76d75add6 Mon Sep 17 00:00:00 2001
From: Steven Enamakel <enamakel@tinyhumans.ai>
Date: Sat, 16 May 2026 17:24:58 -0700
Subject: [PATCH 06/18] test: polish direct runtime coverage

---
 .../DeviceCapabilitySection.test.tsx          |  2 +-
 .../local-model/DeviceCapabilitySection.tsx   |  8 +++----
 .../local-model/ModelDownloadSection.test.tsx | 24 +++----------------
 .../local-model/ModelStatusSection.test.tsx   |  5 +---
 .../panels/local-model/ModelStatusSection.tsx |  3 +--
 .../api/__tests__/aiSettingsApi.test.ts       |  4 ++--
 .../local_ai/service/public_infer_tests.rs    |  5 +++-
 7 files changed, 16 insertions(+), 35 deletions(-)

diff --git a/app/src/components/settings/panels/local-model/DeviceCapabilitySection.test.tsx b/app/src/components/settings/panels/local-model/DeviceCapabilitySection.test.tsx
index bd80d55267..f9434c5309 100644
--- a/app/src/components/settings/panels/local-model/DeviceCapabilitySection.test.tsx
+++ b/app/src/components/settings/panels/local-model/DeviceCapabilitySection.test.tsx
@@ -1,5 +1,5 @@
 import { fireEvent, render, screen, waitFor } from '@testing-library/react';
-import { describe, expect, it, vi, beforeEach } from 'vitest';
+import { beforeEach, describe, expect, it, vi } from 'vitest';
 
 import DeviceCapabilitySection from './DeviceCapabilitySection';
 
diff --git a/app/src/components/settings/panels/local-model/DeviceCapabilitySection.tsx b/app/src/components/settings/panels/local-model/DeviceCapabilitySection.tsx
index 8fdf648319..de95ccb127 100644
--- a/app/src/components/settings/panels/local-model/DeviceCapabilitySection.tsx
+++ b/app/src/components/settings/panels/local-model/DeviceCapabilitySection.tsx
@@ -176,10 +176,10 @@ const DeviceCapabilitySection = ({
           ) : (
             <>
               <div className="text-xs text-amber-800">
-                <span className="font-semibold text-amber-900">Run Ollama first.</span> Local
-                tiers depend on an externally managed Ollama endpoint. Start it yourself, pull the
-                models you want, and keep using &ldquo;Disabled (cloud fallback)&rdquo; until the
-                runtime is reachable.
+                <span className="font-semibold text-amber-900">Run Ollama first.</span> Local tiers
+                depend on an externally managed Ollama endpoint. Start it yourself, pull the models
+                you want, and keep using &ldquo;Disabled (cloud fallback)&rdquo; until the runtime
+                is reachable.
               </div>
               <div className="flex items-center gap-2">
                 <a
diff --git a/app/src/components/settings/panels/local-model/ModelDownloadSection.test.tsx b/app/src/components/settings/panels/local-model/ModelDownloadSection.test.tsx
index 47e89f7607..a5156241b8 100644
--- a/app/src/components/settings/panels/local-model/ModelDownloadSection.test.tsx
+++ b/app/src/components/settings/panels/local-model/ModelDownloadSection.test.tsx
@@ -79,13 +79,7 @@ describe('ModelDownloadSection runtime gate', () => {
             path: 'ollama://gemma3:1b-it-qat',
             warning: null,
           },
-          vision: {
-            id: '',
-            provider: 'ollama',
-            state: 'disabled',
-            path: null,
-            warning: null,
-          },
+          vision: { id: '', provider: 'ollama', state: 'disabled', path: null, warning: null },
           embedding: {
             id: 'bge-m3',
             provider: 'ollama',
@@ -93,20 +87,8 @@ describe('ModelDownloadSection runtime gate', () => {
             path: 'ollama://bge-m3',
             warning: null,
           },
-          stt: {
-            id: 'whisper',
-            provider: 'whisper',
-            state: 'ondemand',
-            path: null,
-            warning: null,
-          },
-          tts: {
-            id: 'piper',
-            provider: 'piper',
-            state: 'ondemand',
-            path: null,
-            warning: null,
-          },
+          stt: { id: 'whisper', provider: 'whisper', state: 'ondemand', path: null, warning: null },
+          tts: { id: 'piper', provider: 'piper', state: 'ondemand', path: null, warning: null },
           ollama_available: true,
         }}
       />
diff --git a/app/src/components/settings/panels/local-model/ModelStatusSection.test.tsx b/app/src/components/settings/panels/local-model/ModelStatusSection.test.tsx
index c511b8a77a..03bfcffbda 100644
--- a/app/src/components/settings/panels/local-model/ModelStatusSection.test.tsx
+++ b/app/src/components/settings/panels/local-model/ModelStatusSection.test.tsx
@@ -126,10 +126,7 @@ describe('ModelStatusSection diagnostics', () => {
     render(
       <ModelStatusSection
         {...defaultProps}
-        diagnostics={makeDiagnostics({
-          ok: false,
-          issues: ['Ollama server is not running'],
-        })}
+        diagnostics={makeDiagnostics({ ok: false, issues: ['Ollama server is not running'] })}
       />
     );
     expect(
diff --git a/app/src/components/settings/panels/local-model/ModelStatusSection.tsx b/app/src/components/settings/panels/local-model/ModelStatusSection.tsx
index 9854635a3d..8c2c93b002 100644
--- a/app/src/components/settings/panels/local-model/ModelStatusSection.tsx
+++ b/app/src/components/settings/panels/local-model/ModelStatusSection.tsx
@@ -387,8 +387,7 @@ const ModelStatusSection = ({
               )}
 
               <div className="text-xs text-stone-500">
-                Manage the Ollama process and model pulls outside OpenHuman, then rerun
-                diagnostics.
+                Manage the Ollama process and model pulls outside OpenHuman, then rerun diagnostics.
               </div>
             </>
           )}
diff --git a/app/src/services/api/__tests__/aiSettingsApi.test.ts b/app/src/services/api/__tests__/aiSettingsApi.test.ts
index be7f565e69..65850f447e 100644
--- a/app/src/services/api/__tests__/aiSettingsApi.test.ts
+++ b/app/src/services/api/__tests__/aiSettingsApi.test.ts
@@ -12,15 +12,15 @@ import {
   type AISettings,
   clearCloudProviderKey,
   listProviderModels,
+  loadAISettings,
   loadLocalProviderSnapshot,
   localProvider,
-  loadAISettings,
   parseProviderString,
   type ProviderRef,
   saveAISettings,
   serializeProviderRef,
-  setLocalRuntimeEnabled,
   setCloudProviderKey,
+  setLocalRuntimeEnabled,
 } from '../aiSettingsApi';
 
 // ─── Mock declarations (must be hoisted before imports) ───────────────────────
diff --git a/src/openhuman/local_ai/service/public_infer_tests.rs b/src/openhuman/local_ai/service/public_infer_tests.rs
index da6f77f188..44b62cdc90 100644
--- a/src/openhuman/local_ai/service/public_infer_tests.rs
+++ b/src/openhuman/local_ai/service/public_infer_tests.rs
@@ -112,7 +112,10 @@ async fn inference_connection_failure_mentions_external_ollama_runtime() {
         std::env::remove_var("OPENHUMAN_OLLAMA_BASE_URL");
     }
 
-    assert!(err.contains("external Ollama endpoint"), "unexpected error: {err}");
+    assert!(
+        err.contains("external Ollama endpoint"),
+        "unexpected error: {err}"
+    );
     assert!(err.contains("already running"), "unexpected error: {err}");
 }
 

From fe06af54faa5383857a5d538bd53f9e5358c7fcc Mon Sep 17 00:00:00 2001
From: Steven Enamakel <enamakel@tinyhumans.ai>
Date: Sat, 16 May 2026 17:34:04 -0700
Subject: [PATCH 07/18] fix: address inference review follow-ups

---
 src/openhuman/inference/ops.rs       | 111 ++++++++++++++++++++++++---
 src/openhuman/inference/ops_tests.rs |  22 +++---
 src/openhuman/local_ai/ops.rs        |   5 +-
 3 files changed, 113 insertions(+), 25 deletions(-)

diff --git a/src/openhuman/inference/ops.rs b/src/openhuman/inference/ops.rs
index a0f2d76688..d65c04f95e 100644
--- a/src/openhuman/inference/ops.rs
+++ b/src/openhuman/inference/ops.rs
@@ -7,9 +7,18 @@ use crate::openhuman::local_ai::ops::{LocalAiChatMessage, ReactionDecision};
 use crate::openhuman::local_ai::sentiment::SentimentResult;
 use crate::openhuman::local_ai::{LocalAiEmbeddingResult, LocalAiStatus, TenorSearchResult};
 use crate::rpc::RpcOutcome;
+use tracing::{debug, error};
+
+const LOG_PREFIX: &str = "[inference::ops]";
 
 pub async fn inference_status(config: &Config) -> Result<RpcOutcome<LocalAiStatus>, String> {
-    local_ai::rpc::local_ai_status(config).await
+    debug!("{LOG_PREFIX} status:start");
+    let result = local_ai::rpc::local_ai_status(config).await;
+    match &result {
+        Ok(outcome) => debug!(state = %outcome.value.state, "{LOG_PREFIX} status:ok"),
+        Err(err) => error!(error = %err, "{LOG_PREFIX} status:error"),
+    }
+    result
 }
 
 pub async fn inference_summarize(
@@ -17,7 +26,13 @@ pub async fn inference_summarize(
     text: &str,
     max_tokens: Option<u32>,
 ) -> Result<RpcOutcome<String>, String> {
-    local_ai::rpc::local_ai_summarize(config, text, max_tokens).await
+    debug!(text_len = text.len(), ?max_tokens, "{LOG_PREFIX} summarize:start");
+    let result = local_ai::rpc::local_ai_summarize(config, text, max_tokens).await;
+    match &result {
+        Ok(outcome) => debug!(output_len = outcome.value.len(), "{LOG_PREFIX} summarize:ok"),
+        Err(err) => error!(error = %err, "{LOG_PREFIX} summarize:error"),
+    }
+    result
 }
 
 pub async fn inference_prompt(
@@ -26,7 +41,18 @@ pub async fn inference_prompt(
     max_tokens: Option<u32>,
     no_think: Option<bool>,
 ) -> Result<RpcOutcome<String>, String> {
-    local_ai::rpc::local_ai_prompt(config, prompt, max_tokens, no_think).await
+    debug!(
+        prompt_len = prompt.len(),
+        ?max_tokens,
+        ?no_think,
+        "{LOG_PREFIX} prompt:start"
+    );
+    let result = local_ai::rpc::local_ai_prompt(config, prompt, max_tokens, no_think).await;
+    match &result {
+        Ok(outcome) => debug!(output_len = outcome.value.len(), "{LOG_PREFIX} prompt:ok"),
+        Err(err) => error!(error = %err, "{LOG_PREFIX} prompt:error"),
+    }
+    result
 }
 
 pub async fn inference_vision_prompt(
@@ -35,14 +61,35 @@ pub async fn inference_vision_prompt(
     image_refs: &[String],
     max_tokens: Option<u32>,
 ) -> Result<RpcOutcome<String>, String> {
-    local_ai::rpc::local_ai_vision_prompt(config, prompt, image_refs, max_tokens).await
+    debug!(
+        prompt_len = prompt.len(),
+        image_count = image_refs.len(),
+        ?max_tokens,
+        "{LOG_PREFIX} vision_prompt:start"
+    );
+    let result = local_ai::rpc::local_ai_vision_prompt(config, prompt, image_refs, max_tokens).await;
+    match &result {
+        Ok(outcome) => debug!(output_len = outcome.value.len(), "{LOG_PREFIX} vision_prompt:ok"),
+        Err(err) => error!(error = %err, "{LOG_PREFIX} vision_prompt:error"),
+    }
+    result
 }
 
 pub async fn inference_embed(
     config: &Config,
     inputs: &[String],
 ) -> Result<RpcOutcome<LocalAiEmbeddingResult>, String> {
-    local_ai::rpc::local_ai_embed(config, inputs).await
+    debug!(input_count = inputs.len(), "{LOG_PREFIX} embed:start");
+    let result = local_ai::rpc::local_ai_embed(config, inputs).await;
+    match &result {
+        Ok(outcome) => debug!(
+            vector_count = outcome.value.vectors.len(),
+            dimensions = outcome.value.dimensions,
+            "{LOG_PREFIX} embed:ok"
+        ),
+        Err(err) => error!(error = %err, "{LOG_PREFIX} embed:error"),
+    }
+    result
 }
 
 pub async fn inference_chat(
@@ -50,7 +97,13 @@ pub async fn inference_chat(
     messages: Vec<LocalAiChatMessage>,
     max_tokens: Option<u32>,
 ) -> Result<RpcOutcome<String>, String> {
-    local_ai::rpc::local_ai_chat(config, messages, max_tokens).await
+    debug!(message_count = messages.len(), ?max_tokens, "{LOG_PREFIX} chat:start");
+    let result = local_ai::rpc::local_ai_chat(config, messages, max_tokens).await;
+    match &result {
+        Ok(outcome) => debug!(output_len = outcome.value.len(), "{LOG_PREFIX} chat:ok"),
+        Err(err) => error!(error = %err, "{LOG_PREFIX} chat:error"),
+    }
+    result
 }
 
 pub async fn inference_should_react(
@@ -58,14 +111,33 @@ pub async fn inference_should_react(
     message: &str,
     channel_type: &str,
 ) -> Result<RpcOutcome<ReactionDecision>, String> {
-    local_ai::rpc::local_ai_should_react(config, message, channel_type).await
+    debug!(
+        message_len = message.len(),
+        channel_type,
+        "{LOG_PREFIX} should_react:start"
+    );
+    let result = local_ai::rpc::local_ai_should_react(config, message, channel_type).await;
+    match &result {
+        Ok(outcome) => debug!(
+            should_react = outcome.value.should_react,
+            "{LOG_PREFIX} should_react:ok"
+        ),
+        Err(err) => error!(error = %err, "{LOG_PREFIX} should_react:error"),
+    }
+    result
 }
 
 pub async fn inference_analyze_sentiment(
     config: &Config,
     message: &str,
 ) -> Result<RpcOutcome<SentimentResult>, String> {
-    local_ai::sentiment::local_ai_analyze_sentiment(config, message).await
+    debug!(message_len = message.len(), "{LOG_PREFIX} analyze_sentiment:start");
+    let result = local_ai::sentiment::local_ai_analyze_sentiment(config, message).await;
+    match &result {
+        Ok(outcome) => debug!(valence = %outcome.value.valence, "{LOG_PREFIX} analyze_sentiment:ok"),
+        Err(err) => error!(error = %err, "{LOG_PREFIX} analyze_sentiment:error"),
+    }
+    result
 }
 
 pub async fn inference_should_send_gif(
@@ -73,7 +145,20 @@ pub async fn inference_should_send_gif(
     message: &str,
     channel_type: &str,
 ) -> Result<RpcOutcome<GifDecision>, String> {
-    local_ai::gif_decision::local_ai_should_send_gif(config, message, channel_type).await
+    debug!(
+        message_len = message.len(),
+        channel_type,
+        "{LOG_PREFIX} should_send_gif:start"
+    );
+    let result = local_ai::gif_decision::local_ai_should_send_gif(config, message, channel_type).await;
+    match &result {
+        Ok(outcome) => debug!(
+            should_send_gif = outcome.value.should_send_gif,
+            "{LOG_PREFIX} should_send_gif:ok"
+        ),
+        Err(err) => error!(error = %err, "{LOG_PREFIX} should_send_gif:error"),
+    }
+    result
 }
 
 pub async fn inference_tenor_search(
@@ -81,7 +166,13 @@ pub async fn inference_tenor_search(
     query: &str,
     limit: Option<u32>,
 ) -> Result<RpcOutcome<TenorSearchResult>, String> {
-    local_ai::gif_decision::tenor_search(config, query, limit).await
+    debug!(query_len = query.len(), ?limit, "{LOG_PREFIX} tenor_search:start");
+    let result = local_ai::gif_decision::tenor_search(config, query, limit).await;
+    match &result {
+        Ok(outcome) => debug!(result_count = outcome.value.results.len(), "{LOG_PREFIX} tenor_search:ok"),
+        Err(err) => error!(error = %err, "{LOG_PREFIX} tenor_search:error"),
+    }
+    result
 }
 
 #[cfg(test)]
diff --git a/src/openhuman/inference/ops_tests.rs b/src/openhuman/inference/ops_tests.rs
index 40870db77b..e95fcfbed0 100644
--- a/src/openhuman/inference/ops_tests.rs
+++ b/src/openhuman/inference/ops_tests.rs
@@ -1,19 +1,19 @@
 use super::*;
 use tempfile::tempdir;
 
-fn disabled_config() -> Config {
+fn disabled_config() -> (Config, tempfile::TempDir) {
     let tmp = tempdir().expect("tempdir");
     let mut config = Config::default();
     config.workspace_dir = tmp.path().join("workspace");
     config.config_path = tmp.path().join("config.toml");
     config.local_ai.runtime_enabled = false;
     config.local_ai.opt_in_confirmed = false;
-    config
+    (config, tmp)
 }
 
 #[tokio::test]
 async fn inference_status_reports_disabled_state_when_runtime_disabled() {
-    let config = disabled_config();
+    let (config, _tmp) = disabled_config();
     let outcome = inference_status(&config).await.expect("status");
     assert!(
         matches!(outcome.value.state.as_str(), "idle" | "disabled"),
@@ -24,7 +24,7 @@ async fn inference_status_reports_disabled_state_when_runtime_disabled() {
 
 #[tokio::test]
 async fn inference_prompt_reuses_local_ai_disabled_error() {
-    let config = disabled_config();
+    let (config, _tmp) = disabled_config();
     let err = inference_prompt(&config, "hello", None, Some(true))
         .await
         .expect_err("prompt should fail");
@@ -33,7 +33,7 @@ async fn inference_prompt_reuses_local_ai_disabled_error() {
 
 #[tokio::test]
 async fn inference_summarize_reuses_local_ai_disabled_error() {
-    let config = disabled_config();
+    let (config, _tmp) = disabled_config();
     let err = inference_summarize(&config, "hello", None)
         .await
         .expect_err("summarize should fail");
@@ -42,7 +42,7 @@ async fn inference_summarize_reuses_local_ai_disabled_error() {
 
 #[tokio::test]
 async fn inference_embed_reuses_local_ai_disabled_error() {
-    let config = disabled_config();
+    let (config, _tmp) = disabled_config();
     let err = inference_embed(&config, &["hello".to_string()])
         .await
         .expect_err("embed should fail");
@@ -51,7 +51,7 @@ async fn inference_embed_reuses_local_ai_disabled_error() {
 
 #[tokio::test]
 async fn inference_chat_rejects_empty_messages() {
-    let config = disabled_config();
+    let (config, _tmp) = disabled_config();
     let err = inference_chat(&config, vec![], None)
         .await
         .expect_err("chat should fail");
@@ -60,7 +60,7 @@ async fn inference_chat_rejects_empty_messages() {
 
 #[tokio::test]
 async fn inference_should_react_short_circuits_for_empty_message() {
-    let config = disabled_config();
+    let (config, _tmp) = disabled_config();
     let outcome = inference_should_react(&config, "   ", "web")
         .await
         .expect("reaction decision");
@@ -70,7 +70,7 @@ async fn inference_should_react_short_circuits_for_empty_message() {
 
 #[tokio::test]
 async fn inference_analyze_sentiment_handles_empty_message() {
-    let config = disabled_config();
+    let (config, _tmp) = disabled_config();
     let outcome = inference_analyze_sentiment(&config, "   ")
         .await
         .expect("sentiment");
@@ -79,7 +79,7 @@ async fn inference_analyze_sentiment_handles_empty_message() {
 
 #[tokio::test]
 async fn inference_should_send_gif_short_circuits_for_empty_message() {
-    let config = disabled_config();
+    let (config, _tmp) = disabled_config();
     let outcome = inference_should_send_gif(&config, "   ", "web")
         .await
         .expect("gif decision");
@@ -88,7 +88,7 @@ async fn inference_should_send_gif_short_circuits_for_empty_message() {
 
 #[tokio::test]
 async fn inference_tenor_search_requires_query() {
-    let config = disabled_config();
+    let (config, _tmp) = disabled_config();
     let err = inference_tenor_search(&config, "   ", Some(3))
         .await
         .expect_err("query validation should fail");
diff --git a/src/openhuman/local_ai/ops.rs b/src/openhuman/local_ai/ops.rs
index 24de41a222..d1da46db44 100644
--- a/src/openhuman/local_ai/ops.rs
+++ b/src/openhuman/local_ai/ops.rs
@@ -348,10 +348,7 @@ pub async fn local_ai_download_asset(
         .download_asset(config, capability.trim())
         .await
         .map_err(|e| e.to_string())?;
-    Ok(RpcOutcome::single_log(
-        output,
-        "local ai voice asset download triggered",
-    ))
+    Ok(RpcOutcome::single_log(output, "local ai asset download triggered"))
 }
 
 /// A single message in a local AI chat conversation.

From febf2fa35336a69a277764b99efbf62accadeea4 Mon Sep 17 00:00:00 2001
From: Steven Enamakel <enamakel@tinyhumans.ai>
Date: Sat, 16 May 2026 17:35:08 -0700
Subject: [PATCH 08/18] chore: apply rustfmt review follow-ups

---
 src/openhuman/inference/ops.rs | 54 +++++++++++++++++++++++++---------
 src/openhuman/local_ai/ops.rs  |  5 +++-
 2 files changed, 44 insertions(+), 15 deletions(-)

diff --git a/src/openhuman/inference/ops.rs b/src/openhuman/inference/ops.rs
index d65c04f95e..a04052a21f 100644
--- a/src/openhuman/inference/ops.rs
+++ b/src/openhuman/inference/ops.rs
@@ -26,10 +26,17 @@ pub async fn inference_summarize(
     text: &str,
     max_tokens: Option<u32>,
 ) -> Result<RpcOutcome<String>, String> {
-    debug!(text_len = text.len(), ?max_tokens, "{LOG_PREFIX} summarize:start");
+    debug!(
+        text_len = text.len(),
+        ?max_tokens,
+        "{LOG_PREFIX} summarize:start"
+    );
     let result = local_ai::rpc::local_ai_summarize(config, text, max_tokens).await;
     match &result {
-        Ok(outcome) => debug!(output_len = outcome.value.len(), "{LOG_PREFIX} summarize:ok"),
+        Ok(outcome) => debug!(
+            output_len = outcome.value.len(),
+            "{LOG_PREFIX} summarize:ok"
+        ),
         Err(err) => error!(error = %err, "{LOG_PREFIX} summarize:error"),
     }
     result
@@ -67,9 +74,13 @@ pub async fn inference_vision_prompt(
         ?max_tokens,
         "{LOG_PREFIX} vision_prompt:start"
     );
-    let result = local_ai::rpc::local_ai_vision_prompt(config, prompt, image_refs, max_tokens).await;
+    let result =
+        local_ai::rpc::local_ai_vision_prompt(config, prompt, image_refs, max_tokens).await;
     match &result {
-        Ok(outcome) => debug!(output_len = outcome.value.len(), "{LOG_PREFIX} vision_prompt:ok"),
+        Ok(outcome) => debug!(
+            output_len = outcome.value.len(),
+            "{LOG_PREFIX} vision_prompt:ok"
+        ),
         Err(err) => error!(error = %err, "{LOG_PREFIX} vision_prompt:error"),
     }
     result
@@ -97,7 +108,11 @@ pub async fn inference_chat(
     messages: Vec<LocalAiChatMessage>,
     max_tokens: Option<u32>,
 ) -> Result<RpcOutcome<String>, String> {
-    debug!(message_count = messages.len(), ?max_tokens, "{LOG_PREFIX} chat:start");
+    debug!(
+        message_count = messages.len(),
+        ?max_tokens,
+        "{LOG_PREFIX} chat:start"
+    );
     let result = local_ai::rpc::local_ai_chat(config, messages, max_tokens).await;
     match &result {
         Ok(outcome) => debug!(output_len = outcome.value.len(), "{LOG_PREFIX} chat:ok"),
@@ -113,8 +128,7 @@ pub async fn inference_should_react(
 ) -> Result<RpcOutcome<ReactionDecision>, String> {
     debug!(
         message_len = message.len(),
-        channel_type,
-        "{LOG_PREFIX} should_react:start"
+        channel_type, "{LOG_PREFIX} should_react:start"
     );
     let result = local_ai::rpc::local_ai_should_react(config, message, channel_type).await;
     match &result {
@@ -131,10 +145,15 @@ pub async fn inference_analyze_sentiment(
     config: &Config,
     message: &str,
 ) -> Result<RpcOutcome<SentimentResult>, String> {
-    debug!(message_len = message.len(), "{LOG_PREFIX} analyze_sentiment:start");
+    debug!(
+        message_len = message.len(),
+        "{LOG_PREFIX} analyze_sentiment:start"
+    );
     let result = local_ai::sentiment::local_ai_analyze_sentiment(config, message).await;
     match &result {
-        Ok(outcome) => debug!(valence = %outcome.value.valence, "{LOG_PREFIX} analyze_sentiment:ok"),
+        Ok(outcome) => {
+            debug!(valence = %outcome.value.valence, "{LOG_PREFIX} analyze_sentiment:ok")
+        }
         Err(err) => error!(error = %err, "{LOG_PREFIX} analyze_sentiment:error"),
     }
     result
@@ -147,10 +166,10 @@ pub async fn inference_should_send_gif(
 ) -> Result<RpcOutcome<GifDecision>, String> {
     debug!(
         message_len = message.len(),
-        channel_type,
-        "{LOG_PREFIX} should_send_gif:start"
+        channel_type, "{LOG_PREFIX} should_send_gif:start"
     );
-    let result = local_ai::gif_decision::local_ai_should_send_gif(config, message, channel_type).await;
+    let result =
+        local_ai::gif_decision::local_ai_should_send_gif(config, message, channel_type).await;
     match &result {
         Ok(outcome) => debug!(
             should_send_gif = outcome.value.should_send_gif,
@@ -166,10 +185,17 @@ pub async fn inference_tenor_search(
     query: &str,
     limit: Option<u32>,
 ) -> Result<RpcOutcome<TenorSearchResult>, String> {
-    debug!(query_len = query.len(), ?limit, "{LOG_PREFIX} tenor_search:start");
+    debug!(
+        query_len = query.len(),
+        ?limit,
+        "{LOG_PREFIX} tenor_search:start"
+    );
     let result = local_ai::gif_decision::tenor_search(config, query, limit).await;
     match &result {
-        Ok(outcome) => debug!(result_count = outcome.value.results.len(), "{LOG_PREFIX} tenor_search:ok"),
+        Ok(outcome) => debug!(
+            result_count = outcome.value.results.len(),
+            "{LOG_PREFIX} tenor_search:ok"
+        ),
         Err(err) => error!(error = %err, "{LOG_PREFIX} tenor_search:error"),
     }
     result
diff --git a/src/openhuman/local_ai/ops.rs b/src/openhuman/local_ai/ops.rs
index d1da46db44..5a453b8031 100644
--- a/src/openhuman/local_ai/ops.rs
+++ b/src/openhuman/local_ai/ops.rs
@@ -348,7 +348,10 @@ pub async fn local_ai_download_asset(
         .download_asset(config, capability.trim())
         .await
         .map_err(|e| e.to_string())?;
-    Ok(RpcOutcome::single_log(output, "local ai asset download triggered"))
+    Ok(RpcOutcome::single_log(
+        output,
+        "local ai asset download triggered",
+    ))
 }
 
 /// A single message in a local AI chat conversation.

From 97facae5927524ddb29d819ab3f39597a3d5e378 Mon Sep 17 00:00:00 2001
From: Steven Enamakel <enamakel@tinyhumans.ai>
Date: Sat, 16 May 2026 17:58:22 -0700
Subject: [PATCH 09/18] Remove unused Tenor backend search helper

---
 src/api/rest.rs | 22 ----------------------
 1 file changed, 22 deletions(-)

diff --git a/src/api/rest.rs b/src/api/rest.rs
index d068041540..81bfbb3a92 100644
--- a/src/api/rest.rs
+++ b/src/api/rest.rs
@@ -819,28 +819,6 @@ impl BackendOAuthClient {
         .await
     }
 
-    /// Searches for GIFs using the Tenor integration.
-    pub async fn search_tenor_gifs(
-        &self,
-        bearer_jwt: &str,
-        query: &str,
-        limit: Option<u32>,
-    ) -> Result<Value> {
-        anyhow::ensure!(!query.trim().is_empty(), "query is required");
-        let body = serde_json::json!({
-            "query": query.trim(),
-            "limit": limit.unwrap_or(5),
-            "contentFilter": "medium",
-        });
-        self.authed_json(
-            bearer_jwt,
-            Method::POST,
-            "agent-integrations/tenor/search",
-            Some(body),
-        )
-        .await
-    }
-
     /// Creates a new thread in a communication channel.
     pub async fn create_channel_thread(
         &self,

From 9c89f067a3c4c0d80df396cdf2ca2f4bda8be910 Mon Sep 17 00:00:00 2001
From: Steven Enamakel <enamakel@tinyhumans.ai>
Date: Sat, 16 May 2026 18:07:22 -0700
Subject: [PATCH 10/18] chore: apply module ordering format

---
 src/openhuman/mod.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/openhuman/mod.rs b/src/openhuman/mod.rs
index b6d6737a0f..6194c6ae7e 100644
--- a/src/openhuman/mod.rs
+++ b/src/openhuman/mod.rs
@@ -36,8 +36,8 @@ pub mod embeddings;
 pub mod encryption;
 pub mod health;
 pub mod heartbeat;
-pub mod inference;
 pub mod http_host;
+pub mod inference;
 pub mod integrations;
 pub mod javascript;
 pub mod learning;

From 8464d55ab984607cddaf2491ef2d697abd0caa12 Mon Sep 17 00:00:00 2001
From: Steven Enamakel <enamakel@tinyhumans.ai>
Date: Sat, 16 May 2026 18:17:30 -0700
Subject: [PATCH 11/18] Move inference RPCs out of local_ai namespace

---
 .../local-model/ModelDownloadSection.tsx      |   2 +-
 app/src/utils/__tests__/tauriCommands.test.ts |   2 +-
 src/core/observability.rs                     |   2 +-
 src/openhuman/inference/ops.rs                |  45 +--
 src/openhuman/inference/ops_tests.rs          |  18 -
 src/openhuman/inference/schemas.rs            | 222 ++++++-----
 src/openhuman/inference/schemas_tests.rs      |   4 +-
 src/openhuman/local_ai/schemas.rs             | 348 ------------------
 tests/json_rpc_e2e.rs                         |   2 +-
 9 files changed, 133 insertions(+), 512 deletions(-)

diff --git a/app/src/components/settings/panels/local-model/ModelDownloadSection.tsx b/app/src/components/settings/panels/local-model/ModelDownloadSection.tsx
index 5ac567c75f..ae45f1e067 100644
--- a/app/src/components/settings/panels/local-model/ModelDownloadSection.tsx
+++ b/app/src/components/settings/panels/local-model/ModelDownloadSection.tsx
@@ -155,7 +155,7 @@ const ModelDownloadSection = ({
           />
           <div className="flex items-center justify-between">
             <div className="text-xs text-stone-500">
-              Calls `openhuman.local_ai_summarize` via Rust core
+              Calls `openhuman.inference_summarize` via Rust core
             </div>
             <button
               onClick={onRunSummaryTest}
diff --git a/app/src/utils/__tests__/tauriCommands.test.ts b/app/src/utils/__tests__/tauriCommands.test.ts
index 31be1c4282..af394047a1 100644
--- a/app/src/utils/__tests__/tauriCommands.test.ts
+++ b/app/src/utils/__tests__/tauriCommands.test.ts
@@ -98,7 +98,7 @@ describe('tauriCommands', () => {
   });
 
   test('openhumanLocalAiStatus returns upgrade hint on unknown method', async () => {
-    mockCallCoreRpc.mockRejectedValueOnce(new Error('unknown method: openhuman.local_ai_status'));
+    mockCallCoreRpc.mockRejectedValueOnce(new Error('unknown method: openhuman.inference_status'));
 
     await expect(openhumanLocalAiStatus()).rejects.toThrow(
       'Local model runtime is unavailable in this core build. Restart app after updating to the latest build.'
diff --git a/src/core/observability.rs b/src/core/observability.rs
index 68ad0054b4..a1d70deca3 100644
--- a/src/core/observability.rs
+++ b/src/core/observability.rs
@@ -2130,7 +2130,7 @@ mod tests {
             "local ai is disabled",
             "rpc",
             "invoke_method",
-            &[("method", "openhuman.local_ai_prompt")],
+            &[("method", "openhuman.inference_prompt")],
         );
         report_error_or_expected(
             "ollama API key not set",
diff --git a/src/openhuman/inference/ops.rs b/src/openhuman/inference/ops.rs
index a04052a21f..f0950966c8 100644
--- a/src/openhuman/inference/ops.rs
+++ b/src/openhuman/inference/ops.rs
@@ -2,10 +2,9 @@
 
 use crate::openhuman::config::Config;
 use crate::openhuman::local_ai;
-use crate::openhuman::local_ai::gif_decision::GifDecision;
 use crate::openhuman::local_ai::ops::{LocalAiChatMessage, ReactionDecision};
 use crate::openhuman::local_ai::sentiment::SentimentResult;
-use crate::openhuman::local_ai::{LocalAiEmbeddingResult, LocalAiStatus, TenorSearchResult};
+use crate::openhuman::local_ai::{LocalAiEmbeddingResult, LocalAiStatus};
 use crate::rpc::RpcOutcome;
 use tracing::{debug, error};
 
@@ -159,48 +158,6 @@ pub async fn inference_analyze_sentiment(
     result
 }
 
-pub async fn inference_should_send_gif(
-    config: &Config,
-    message: &str,
-    channel_type: &str,
-) -> Result<RpcOutcome<GifDecision>, String> {
-    debug!(
-        message_len = message.len(),
-        channel_type, "{LOG_PREFIX} should_send_gif:start"
-    );
-    let result =
-        local_ai::gif_decision::local_ai_should_send_gif(config, message, channel_type).await;
-    match &result {
-        Ok(outcome) => debug!(
-            should_send_gif = outcome.value.should_send_gif,
-            "{LOG_PREFIX} should_send_gif:ok"
-        ),
-        Err(err) => error!(error = %err, "{LOG_PREFIX} should_send_gif:error"),
-    }
-    result
-}
-
-pub async fn inference_tenor_search(
-    config: &Config,
-    query: &str,
-    limit: Option<u32>,
-) -> Result<RpcOutcome<TenorSearchResult>, String> {
-    debug!(
-        query_len = query.len(),
-        ?limit,
-        "{LOG_PREFIX} tenor_search:start"
-    );
-    let result = local_ai::gif_decision::tenor_search(config, query, limit).await;
-    match &result {
-        Ok(outcome) => debug!(
-            result_count = outcome.value.results.len(),
-            "{LOG_PREFIX} tenor_search:ok"
-        ),
-        Err(err) => error!(error = %err, "{LOG_PREFIX} tenor_search:error"),
-    }
-    result
-}
-
 #[cfg(test)]
 #[path = "ops_tests.rs"]
 mod tests;
diff --git a/src/openhuman/inference/ops_tests.rs b/src/openhuman/inference/ops_tests.rs
index e95fcfbed0..655bc029a9 100644
--- a/src/openhuman/inference/ops_tests.rs
+++ b/src/openhuman/inference/ops_tests.rs
@@ -76,21 +76,3 @@ async fn inference_analyze_sentiment_handles_empty_message() {
         .expect("sentiment");
     assert_eq!(outcome.value.valence, "neutral");
 }
-
-#[tokio::test]
-async fn inference_should_send_gif_short_circuits_for_empty_message() {
-    let (config, _tmp) = disabled_config();
-    let outcome = inference_should_send_gif(&config, "   ", "web")
-        .await
-        .expect("gif decision");
-    assert!(!outcome.value.should_send_gif);
-}
-
-#[tokio::test]
-async fn inference_tenor_search_requires_query() {
-    let (config, _tmp) = disabled_config();
-    let err = inference_tenor_search(&config, "   ", Some(3))
-        .await
-        .expect_err("query validation should fail");
-    assert!(err.contains("query is required"));
-}
diff --git a/src/openhuman/inference/schemas.rs b/src/openhuman/inference/schemas.rs
index d6233ecf29..9392a19c78 100644
--- a/src/openhuman/inference/schemas.rs
+++ b/src/openhuman/inference/schemas.rs
@@ -3,7 +3,7 @@ use serde::Deserialize;
 use serde_json::{Map, Value};
 
 use crate::core::all::{ControllerFuture, RegisteredController};
-use crate::core::ControllerSchema;
+use crate::core::{ControllerSchema, FieldSchema, TypeSchema};
 use crate::openhuman::config::rpc as config_rpc;
 use crate::rpc::RpcOutcome;
 
@@ -55,18 +55,6 @@ struct InferenceAnalyzeSentimentParams {
     message: String,
 }
 
-#[derive(Debug, Deserialize)]
-struct InferenceShouldSendGifParams {
-    message: String,
-    channel_type: String,
-}
-
-#[derive(Debug, Deserialize)]
-struct InferenceTenorSearchParams {
-    query: String,
-    limit: Option<u32>,
-}
-
 pub fn all_controller_schemas() -> Vec<ControllerSchema> {
     vec![
         schemas("status"),
@@ -77,8 +65,6 @@ pub fn all_controller_schemas() -> Vec<ControllerSchema> {
         schemas("chat"),
         schemas("should_react"),
         schemas("analyze_sentiment"),
-        schemas("should_send_gif"),
-        schemas("tenor_search"),
     ]
 }
 
@@ -116,68 +102,136 @@ pub fn all_registered_controllers() -> Vec<RegisteredController> {
             schema: schemas("analyze_sentiment"),
             handler: handle_inference_analyze_sentiment,
         },
-        RegisteredController {
-            schema: schemas("should_send_gif"),
-            handler: handle_inference_should_send_gif,
-        },
-        RegisteredController {
-            schema: schemas("tenor_search"),
-            handler: handle_inference_tenor_search,
-        },
     ]
 }
 
 pub fn schemas(function: &str) -> ControllerSchema {
-    let (source, target_function) = match function {
-        "status" => (
-            crate::openhuman::local_ai::local_ai_controller_schema("local_ai_status"),
-            "status",
-        ),
-        "summarize" => (
-            crate::openhuman::local_ai::local_ai_controller_schema("local_ai_summarize"),
-            "summarize",
-        ),
-        "prompt" => (
-            crate::openhuman::local_ai::local_ai_controller_schema("local_ai_prompt"),
-            "prompt",
-        ),
-        "vision_prompt" => (
-            crate::openhuman::local_ai::local_ai_controller_schema("local_ai_vision_prompt"),
-            "vision_prompt",
-        ),
-        "embed" => (
-            crate::openhuman::local_ai::local_ai_controller_schema("local_ai_embed"),
-            "embed",
-        ),
-        "chat" => (
-            crate::openhuman::local_ai::local_ai_controller_schema("local_ai_chat"),
-            "chat",
-        ),
-        "should_react" => (
-            crate::openhuman::local_ai::local_ai_controller_schema("local_ai_should_react"),
-            "should_react",
-        ),
-        "analyze_sentiment" => (
-            crate::openhuman::local_ai::local_ai_controller_schema("local_ai_analyze_sentiment"),
-            "analyze_sentiment",
-        ),
-        "should_send_gif" => (
-            crate::openhuman::local_ai::local_ai_controller_schema("local_ai_should_send_gif"),
-            "should_send_gif",
-        ),
-        "tenor_search" => (
-            crate::openhuman::local_ai::local_ai_controller_schema("local_ai_tenor_search"),
-            "tenor_search",
-        ),
+    match function {
+        "status" => ControllerSchema {
+            namespace: "inference",
+            function: "status",
+            description: "Read inference service status.",
+            inputs: vec![],
+            outputs: vec![json_output("status", "Inference status payload.")],
+        },
+        "summarize" => ControllerSchema {
+            namespace: "inference",
+            function: "summarize",
+            description: "Summarize text with the configured inference provider.",
+            inputs: vec![
+                required_string("text", "Input text."),
+                optional_u64("max_tokens", "Optional max output tokens."),
+            ],
+            outputs: vec![json_output("summary", "Summary text.")],
+        },
+        "prompt" => ControllerSchema {
+            namespace: "inference",
+            function: "prompt",
+            description: "Run a direct inference prompt.",
+            inputs: vec![
+                required_string("prompt", "Prompt text."),
+                optional_u64("max_tokens", "Optional max output tokens."),
+                optional_bool("no_think", "Disable thinking mode."),
+            ],
+            outputs: vec![json_output("output", "Prompt output text.")],
+        },
+        "vision_prompt" => ControllerSchema {
+            namespace: "inference",
+            function: "vision_prompt",
+            description: "Run a multimodal inference prompt with image refs.",
+            inputs: vec![
+                required_string("prompt", "Prompt text."),
+                FieldSchema {
+                    name: "image_refs",
+                    ty: TypeSchema::Array(Box::new(TypeSchema::String)),
+                    comment: "Image references to include.",
+                    required: true,
+                },
+                optional_u64("max_tokens", "Optional max output tokens."),
+            ],
+            outputs: vec![json_output("output", "Prompt output text.")],
+        },
+        "embed" => ControllerSchema {
+            namespace: "inference",
+            function: "embed",
+            description: "Generate embeddings for text inputs.",
+            inputs: vec![FieldSchema {
+                name: "inputs",
+                ty: TypeSchema::Array(Box::new(TypeSchema::String)),
+                comment: "Texts to embed.",
+                required: true,
+            }],
+            outputs: vec![json_output("embedding", "Embedding result payload.")],
+        },
+        "chat" => ControllerSchema {
+            namespace: "inference",
+            function: "chat",
+            description: "Multi-turn chat completion via the configured inference provider.",
+            inputs: vec![
+                FieldSchema {
+                    name: "messages",
+                    ty: TypeSchema::Array(Box::new(TypeSchema::Json)),
+                    comment: "Chat message history [{role, content}]. Last entry is the user turn.",
+                    required: true,
+                },
+                optional_u64("max_tokens", "Optional max output tokens."),
+            ],
+            outputs: vec![json_output("reply", "Assistant reply text.")],
+        },
+        "should_react" => ControllerSchema {
+            namespace: "inference",
+            function: "should_react",
+            description: "Ask the inference provider whether the assistant should add an emoji reaction to a user message, based on channel type.",
+            inputs: vec![
+                required_string("message", "User message content to evaluate."),
+                required_string("channel_type", "Channel type: web, telegram, discord, slack, etc."),
+            ],
+            outputs: vec![json_output("decision", "Reaction decision: {should_react, emoji}.")],
+        },
+        "analyze_sentiment" => ControllerSchema {
+            namespace: "inference",
+            function: "analyze_sentiment",
+            description: "Classify the emotion and valence of a user message with the inference provider.",
+            inputs: vec![required_string("message", "User message content to classify.")],
+            outputs: vec![json_output("sentiment", "Sentiment analysis payload.")],
+        },
         other => panic!("unknown inference schema: {other}"),
-    };
+    }
+}
+
+fn required_string(name: &'static str, comment: &'static str) -> FieldSchema {
+    FieldSchema {
+        name,
+        ty: TypeSchema::String,
+        comment,
+        required: true,
+    }
+}
 
-    ControllerSchema {
-        namespace: "inference",
-        function: target_function,
-        description: source.description,
-        inputs: source.inputs,
-        outputs: source.outputs,
+fn optional_bool(name: &'static str, comment: &'static str) -> FieldSchema {
+    FieldSchema {
+        name,
+        ty: TypeSchema::Option(Box::new(TypeSchema::Bool)),
+        comment,
+        required: false,
+    }
+}
+
+fn optional_u64(name: &'static str, comment: &'static str) -> FieldSchema {
+    FieldSchema {
+        name,
+        ty: TypeSchema::Option(Box::new(TypeSchema::U64)),
+        comment,
+        required: false,
+    }
+}
+
+fn json_output(name: &'static str, comment: &'static str) -> FieldSchema {
+    FieldSchema {
+        name,
+        ty: TypeSchema::Json,
+        comment,
+        required: true,
     }
 }
 
@@ -286,32 +340,6 @@ fn handle_inference_analyze_sentiment(params: Map<String, Value>) -> ControllerF
     })
 }
 
-fn handle_inference_should_send_gif(params: Map<String, Value>) -> ControllerFuture {
-    Box::pin(async move {
-        let p = deserialize_params::<InferenceShouldSendGifParams>(params)?;
-        let config = config_rpc::load_config_with_timeout().await?;
-        to_json(
-            crate::openhuman::inference::rpc::inference_should_send_gif(
-                &config,
-                &p.message,
-                &p.channel_type,
-            )
-            .await?,
-        )
-    })
-}
-
-fn handle_inference_tenor_search(params: Map<String, Value>) -> ControllerFuture {
-    Box::pin(async move {
-        let p = deserialize_params::<InferenceTenorSearchParams>(params)?;
-        let config = config_rpc::load_config_with_timeout().await?;
-        to_json(
-            crate::openhuman::inference::rpc::inference_tenor_search(&config, &p.query, p.limit)
-                .await?,
-        )
-    })
-}
-
 fn deserialize_params<T: DeserializeOwned>(params: Map<String, Value>) -> Result<T, String> {
     serde_json::from_value(Value::Object(params)).map_err(|e| format!("invalid params: {e}"))
 }
diff --git a/src/openhuman/inference/schemas_tests.rs b/src/openhuman/inference/schemas_tests.rs
index 86a682c1d0..50b4c9bfa1 100644
--- a/src/openhuman/inference/schemas_tests.rs
+++ b/src/openhuman/inference/schemas_tests.rs
@@ -5,7 +5,7 @@ fn inference_catalog_counts_match_and_nonempty() {
     let declared = all_controller_schemas();
     let registered = all_registered_controllers();
     assert_eq!(declared.len(), registered.len());
-    assert!(declared.len() >= 10);
+    assert!(declared.len() >= 8);
 }
 
 #[test]
@@ -32,6 +32,8 @@ fn inference_schema_function_names_are_stable() {
     assert!(functions.contains(&"vision_prompt"));
     assert!(functions.contains(&"embed"));
     assert!(functions.contains(&"chat"));
+    assert!(!functions.contains(&"should_send_gif"));
+    assert!(!functions.contains(&"tenor_search"));
 }
 
 #[test]
diff --git a/src/openhuman/local_ai/schemas.rs b/src/openhuman/local_ai/schemas.rs
index 473d01ccd8..58ccfa8e84 100644
--- a/src/openhuman/local_ai/schemas.rs
+++ b/src/openhuman/local_ai/schemas.rs
@@ -14,31 +14,6 @@ struct AgentChatParams {
     temperature: Option<f64>,
 }
 
-#[derive(Debug, Deserialize)]
-struct LocalAiSummarizeParams {
-    text: String,
-    max_tokens: Option<u32>,
-}
-
-#[derive(Debug, Deserialize)]
-struct LocalAiPromptParams {
-    prompt: String,
-    max_tokens: Option<u32>,
-    no_think: Option<bool>,
-}
-
-#[derive(Debug, Deserialize)]
-struct LocalAiVisionPromptParams {
-    prompt: String,
-    image_refs: Vec<String>,
-    max_tokens: Option<u32>,
-}
-
-#[derive(Debug, Deserialize)]
-struct LocalAiEmbedParams {
-    inputs: Vec<String>,
-}
-
 #[derive(Debug, Deserialize)]
 struct LocalAiTranscribeParams {
     audio_path: String,
@@ -66,41 +41,6 @@ struct LocalAiApplyPresetParams {
     tier: String,
 }
 
-#[derive(Debug, Deserialize)]
-struct LocalAiChatMessageParam {
-    role: String,
-    content: String,
-}
-
-#[derive(Debug, Deserialize)]
-struct LocalAiChatParams {
-    messages: Vec<LocalAiChatMessageParam>,
-    max_tokens: Option<u32>,
-}
-
-#[derive(Debug, Deserialize)]
-struct LocalAiShouldReactParams {
-    message: String,
-    channel_type: String,
-}
-
-#[derive(Debug, Deserialize)]
-struct LocalAiAnalyzeSentimentParams {
-    message: String,
-}
-
-#[derive(Debug, Deserialize)]
-struct LocalAiShouldSendGifParams {
-    message: String,
-    channel_type: String,
-}
-
-#[derive(Debug, Deserialize)]
-struct LocalAiTenorSearchParams {
-    query: String,
-    limit: Option<u32>,
-}
-
 #[derive(Debug, Deserialize)]
 struct LocalAiInstallWhisperParams {
     /// Optional model size (`tiny`, `base`, `small`, `medium`,
@@ -127,11 +67,6 @@ pub fn all_controller_schemas() -> Vec<ControllerSchema> {
     vec![
         schemas("agent_chat"),
         schemas("agent_chat_simple"),
-        schemas("local_ai_status"),
-        schemas("local_ai_summarize"),
-        schemas("local_ai_prompt"),
-        schemas("local_ai_vision_prompt"),
-        schemas("local_ai_embed"),
         schemas("local_ai_transcribe"),
         schemas("local_ai_transcribe_bytes"),
         schemas("local_ai_tts"),
@@ -142,11 +77,6 @@ pub fn all_controller_schemas() -> Vec<ControllerSchema> {
         schemas("local_ai_presets"),
         schemas("local_ai_apply_preset"),
         schemas("local_ai_diagnostics"),
-        schemas("local_ai_chat"),
-        schemas("local_ai_should_react"),
-        schemas("local_ai_analyze_sentiment"),
-        schemas("local_ai_should_send_gif"),
-        schemas("local_ai_tenor_search"),
         schemas("local_ai_install_whisper"),
         schemas("local_ai_install_piper"),
         schemas("local_ai_whisper_install_status"),
@@ -164,26 +94,6 @@ pub fn all_registered_controllers() -> Vec<RegisteredController> {
             schema: schemas("agent_chat_simple"),
             handler: handle_agent_chat_simple,
         },
-        RegisteredController {
-            schema: schemas("local_ai_status"),
-            handler: handle_local_ai_status,
-        },
-        RegisteredController {
-            schema: schemas("local_ai_summarize"),
-            handler: handle_local_ai_summarize,
-        },
-        RegisteredController {
-            schema: schemas("local_ai_prompt"),
-            handler: handle_local_ai_prompt,
-        },
-        RegisteredController {
-            schema: schemas("local_ai_vision_prompt"),
-            handler: handle_local_ai_vision_prompt,
-        },
-        RegisteredController {
-            schema: schemas("local_ai_embed"),
-            handler: handle_local_ai_embed,
-        },
         RegisteredController {
             schema: schemas("local_ai_transcribe"),
             handler: handle_local_ai_transcribe,
@@ -224,26 +134,6 @@ pub fn all_registered_controllers() -> Vec<RegisteredController> {
             schema: schemas("local_ai_diagnostics"),
             handler: handle_local_ai_diagnostics,
         },
-        RegisteredController {
-            schema: schemas("local_ai_chat"),
-            handler: handle_local_ai_chat,
-        },
-        RegisteredController {
-            schema: schemas("local_ai_should_react"),
-            handler: handle_local_ai_should_react,
-        },
-        RegisteredController {
-            schema: schemas("local_ai_analyze_sentiment"),
-            handler: handle_local_ai_analyze_sentiment,
-        },
-        RegisteredController {
-            schema: schemas("local_ai_should_send_gif"),
-            handler: handle_local_ai_should_send_gif,
-        },
-        RegisteredController {
-            schema: schemas("local_ai_tenor_search"),
-            handler: handle_local_ai_tenor_search,
-        },
         RegisteredController {
             schema: schemas("local_ai_install_whisper"),
             handler: handle_local_ai_install_whisper,
@@ -287,62 +177,6 @@ pub fn schemas(function: &str) -> ControllerSchema {
             ],
             outputs: vec![json_output("response", "Agent response payload.")],
         },
-        "local_ai_status" => ControllerSchema {
-            namespace: "local_ai",
-            function: "status",
-            description: "Read local AI service status.",
-            inputs: vec![],
-            outputs: vec![json_output("status", "Local AI status payload.")],
-        },
-        "local_ai_summarize" => ControllerSchema {
-            namespace: "local_ai",
-            function: "summarize",
-            description: "Summarize text with local AI model.",
-            inputs: vec![
-                required_string("text", "Input text."),
-                optional_u64("max_tokens", "Optional max output tokens."),
-            ],
-            outputs: vec![json_output("summary", "Summary text.")],
-        },
-        "local_ai_prompt" => ControllerSchema {
-            namespace: "local_ai",
-            function: "prompt",
-            description: "Run direct local AI prompt.",
-            inputs: vec![
-                required_string("prompt", "Prompt text."),
-                optional_u64("max_tokens", "Optional max output tokens."),
-                optional_bool("no_think", "Disable thinking mode."),
-            ],
-            outputs: vec![json_output("output", "Prompt output text.")],
-        },
-        "local_ai_vision_prompt" => ControllerSchema {
-            namespace: "local_ai",
-            function: "vision_prompt",
-            description: "Run multimodal local AI prompt with image refs.",
-            inputs: vec![
-                required_string("prompt", "Prompt text."),
-                FieldSchema {
-                    name: "image_refs",
-                    ty: TypeSchema::Array(Box::new(TypeSchema::String)),
-                    comment: "Image references to include.",
-                    required: true,
-                },
-                optional_u64("max_tokens", "Optional max output tokens."),
-            ],
-            outputs: vec![json_output("output", "Prompt output text.")],
-        },
-        "local_ai_embed" => ControllerSchema {
-            namespace: "local_ai",
-            function: "embed",
-            description: "Generate embeddings for text inputs.",
-            inputs: vec![FieldSchema {
-                name: "inputs",
-                ty: TypeSchema::Array(Box::new(TypeSchema::String)),
-                comment: "Texts to embed.",
-                required: true,
-            }],
-            outputs: vec![json_output("embedding", "Embedding result payload.")],
-        },
         "local_ai_transcribe" => ControllerSchema {
             namespace: "local_ai",
             function: "transcribe",
@@ -434,60 +268,6 @@ pub fn schemas(function: &str) -> ControllerSchema {
             inputs: vec![],
             outputs: vec![json_output("diagnostics", "Diagnostic report.")],
         },
-        "local_ai_chat" => ControllerSchema {
-            namespace: "local_ai",
-            function: "chat",
-            description: "Multi-turn chat completion via local Ollama model. Does not call the cloud API.",
-            inputs: vec![
-                FieldSchema {
-                    name: "messages",
-                    ty: TypeSchema::Array(Box::new(TypeSchema::Json)),
-                    comment: "Chat message history [{role, content}]. Last entry is the user turn.",
-                    required: true,
-                },
-                optional_u64("max_tokens", "Optional max output tokens."),
-            ],
-            outputs: vec![json_output("reply", "Assistant reply text.")],
-        },
-        "local_ai_should_react" => ControllerSchema {
-            namespace: "local_ai",
-            function: "should_react",
-            description: "Ask the local model whether the assistant should add an emoji reaction to a user message, based on channel type.",
-            inputs: vec![
-                required_string("message", "User message content to evaluate."),
-                required_string("channel_type", "Channel type: web, telegram, discord, slack, etc."),
-            ],
-            outputs: vec![json_output("decision", "Reaction decision: {should_react, emoji}.")],
-        },
-        "local_ai_analyze_sentiment" => ControllerSchema {
-            namespace: "local_ai",
-            function: "analyze_sentiment",
-            description: "Classify the emotion and sentiment of a user message. Returns emotion label, valence, and confidence.",
-            inputs: vec![
-                required_string("message", "User message content to analyze."),
-            ],
-            outputs: vec![json_output("sentiment", "Sentiment result: {emotion, valence, confidence}.")],
-        },
-        "local_ai_should_send_gif" => ControllerSchema {
-            namespace: "local_ai",
-            function: "should_send_gif",
-            description: "Ask the local model whether a GIF response is appropriate, and if so return a Tenor search query.",
-            inputs: vec![
-                required_string("message", "User message content to evaluate."),
-                required_string("channel_type", "Channel type: web, telegram, discord, slack, etc."),
-            ],
-            outputs: vec![json_output("decision", "GIF decision: {should_send_gif, search_query}.")],
-        },
-        "local_ai_tenor_search" => ControllerSchema {
-            namespace: "local_ai",
-            function: "tenor_search",
-            description: "Search for GIFs via the backend Tenor proxy. Requires a valid session.",
-            inputs: vec![
-                required_string("query", "Tenor search query."),
-                optional_u64("limit", "Max results to return (default 5, max 50)."),
-            ],
-            outputs: vec![json_output("result", "Tenor search result: {results, next}.")],
-        },
         "local_ai_install_whisper" => ControllerSchema {
             namespace: "local_ai",
             function: "install_whisper",
@@ -581,64 +361,6 @@ fn handle_agent_chat_simple(params: Map<String, Value>) -> ControllerFuture {
     })
 }
 
-fn handle_local_ai_status(_params: Map<String, Value>) -> ControllerFuture {
-    Box::pin(async move {
-        let config = config_rpc::load_config_with_timeout().await?;
-        to_json(crate::openhuman::local_ai::rpc::local_ai_status(&config).await?)
-    })
-}
-
-fn handle_local_ai_summarize(params: Map<String, Value>) -> ControllerFuture {
-    Box::pin(async move {
-        let p = deserialize_params::<LocalAiSummarizeParams>(params)?;
-        let config = config_rpc::load_config_with_timeout().await?;
-        to_json(
-            crate::openhuman::local_ai::rpc::local_ai_summarize(&config, &p.text, p.max_tokens)
-                .await?,
-        )
-    })
-}
-
-fn handle_local_ai_prompt(params: Map<String, Value>) -> ControllerFuture {
-    Box::pin(async move {
-        let p = deserialize_params::<LocalAiPromptParams>(params)?;
-        let config = config_rpc::load_config_with_timeout().await?;
-        to_json(
-            crate::openhuman::local_ai::rpc::local_ai_prompt(
-                &config,
-                &p.prompt,
-                p.max_tokens,
-                p.no_think,
-            )
-            .await?,
-        )
-    })
-}
-
-fn handle_local_ai_vision_prompt(params: Map<String, Value>) -> ControllerFuture {
-    Box::pin(async move {
-        let p = deserialize_params::<LocalAiVisionPromptParams>(params)?;
-        let config = config_rpc::load_config_with_timeout().await?;
-        to_json(
-            crate::openhuman::local_ai::rpc::local_ai_vision_prompt(
-                &config,
-                &p.prompt,
-                &p.image_refs,
-                p.max_tokens,
-            )
-            .await?,
-        )
-    })
-}
-
-fn handle_local_ai_embed(params: Map<String, Value>) -> ControllerFuture {
-    Box::pin(async move {
-        let p = deserialize_params::<LocalAiEmbedParams>(params)?;
-        let config = config_rpc::load_config_with_timeout().await?;
-        to_json(crate::openhuman::local_ai::rpc::local_ai_embed(&config, &p.inputs).await?)
-    })
-}
-
 fn handle_local_ai_transcribe(params: Map<String, Value>) -> ControllerFuture {
     Box::pin(async move {
         let p = deserialize_params::<LocalAiTranscribeParams>(params)?;
@@ -830,76 +552,6 @@ fn handle_local_ai_diagnostics(_params: Map<String, Value>) -> ControllerFuture
     })
 }
 
-fn handle_local_ai_should_react(params: Map<String, Value>) -> ControllerFuture {
-    Box::pin(async move {
-        let p = deserialize_params::<LocalAiShouldReactParams>(params)?;
-        let config = config_rpc::load_config_with_timeout().await?;
-        to_json(
-            crate::openhuman::local_ai::rpc::local_ai_should_react(
-                &config,
-                &p.message,
-                &p.channel_type,
-            )
-            .await?,
-        )
-    })
-}
-
-fn handle_local_ai_analyze_sentiment(params: Map<String, Value>) -> ControllerFuture {
-    Box::pin(async move {
-        let p = deserialize_params::<LocalAiAnalyzeSentimentParams>(params)?;
-        let config = config_rpc::load_config_with_timeout().await?;
-        to_json(
-            crate::openhuman::local_ai::sentiment::local_ai_analyze_sentiment(&config, &p.message)
-                .await?,
-        )
-    })
-}
-
-fn handle_local_ai_should_send_gif(params: Map<String, Value>) -> ControllerFuture {
-    Box::pin(async move {
-        let p = deserialize_params::<LocalAiShouldSendGifParams>(params)?;
-        let config = config_rpc::load_config_with_timeout().await?;
-        to_json(
-            crate::openhuman::local_ai::gif_decision::local_ai_should_send_gif(
-                &config,
-                &p.message,
-                &p.channel_type,
-            )
-            .await?,
-        )
-    })
-}
-
-fn handle_local_ai_tenor_search(params: Map<String, Value>) -> ControllerFuture {
-    Box::pin(async move {
-        let p = deserialize_params::<LocalAiTenorSearchParams>(params)?;
-        let config = config_rpc::load_config_with_timeout().await?;
-        to_json(
-            crate::openhuman::local_ai::gif_decision::tenor_search(&config, &p.query, p.limit)
-                .await?,
-        )
-    })
-}
-
-fn handle_local_ai_chat(params: Map<String, Value>) -> ControllerFuture {
-    Box::pin(async move {
-        let p = deserialize_params::<LocalAiChatParams>(params)?;
-        let config = config_rpc::load_config_with_timeout().await?;
-        let messages: Vec<crate::openhuman::local_ai::rpc::LocalAiChatMessage> = p
-            .messages
-            .into_iter()
-            .map(|m| crate::openhuman::local_ai::rpc::LocalAiChatMessage {
-                role: m.role,
-                content: m.content,
-            })
-            .collect();
-        to_json(
-            crate::openhuman::local_ai::rpc::local_ai_chat(&config, messages, p.max_tokens).await?,
-        )
-    })
-}
-
 // The install RPCs are intentionally fire-and-forget: a binary+model
 // download can take minutes (1.6 GB GGML model, ~5 MB Piper binary
 // archive) but the core JSON-RPC client times out at
diff --git a/tests/json_rpc_e2e.rs b/tests/json_rpc_e2e.rs
index cc35c6327b..f919f3b1f2 100644
--- a/tests/json_rpc_e2e.rs
+++ b/tests/json_rpc_e2e.rs
@@ -3595,7 +3595,7 @@ async fn json_rpc_local_ai_lm_studio_config_diagnostics_and_prompt() {
     let prompt = post_json_rpc(
         &rpc_base,
         38,
-        "openhuman.local_ai_prompt",
+        "openhuman.inference_prompt",
         json!({
             "prompt": "hello",
             "max_tokens": 16,

From 1bfa13e6eb455c84d889070b91d186574032fcab Mon Sep 17 00:00:00 2001
From: Steven Enamakel <enamakel@tinyhumans.ai>
Date: Sat, 16 May 2026 18:39:54 -0700
Subject: [PATCH 12/18] Move inference management RPCs into inference

---
 .../services/__tests__/coreRpcClient.test.ts  |   2 +-
 app/src/services/__tests__/rpcMethods.test.ts |   8 +-
 .../api/__tests__/aiSettingsApi.test.ts       |   4 +-
 app/src/services/api/aiSettingsApi.ts         |   5 +-
 app/src/services/rpcMethods.ts                |  19 +-
 .../tauriCommands/__tests__/config.test.ts    |   4 +-
 app/src/utils/tauriCommands/config.test.ts    |   4 +-
 app/src/utils/tauriCommands/config.ts         |   6 +-
 app/src/utils/tauriCommands/localAi.ts        |   8 +-
 src/core/all.rs                               |   3 -
 src/core/legacy_aliases.rs                    |  21 +-
 src/openhuman/config/ops.rs                   |  59 +++
 src/openhuman/config/schemas.rs               |  66 +---
 src/openhuman/inference/ops.rs                | 166 ++++++++
 src/openhuman/inference/ops_tests.rs          |  33 ++
 src/openhuman/inference/schemas.rs            | 367 ++++++++++++++++++
 src/openhuman/inference/schemas_tests.rs      |  10 +-
 src/openhuman/local_ai/README.md              |   4 +-
 src/openhuman/local_ai/schemas.rs             | 188 ---------
 src/openhuman/local_ai/schemas_tests.rs       | 111 +-----
 src/openhuman/providers/mod.rs                |   4 -
 src/openhuman/providers/ops.rs                | 129 ++++++
 src/openhuman/providers/schemas.rs            | 139 +------
 tests/json_rpc_e2e.rs                         |  43 +-
 24 files changed, 856 insertions(+), 547 deletions(-)

diff --git a/app/src/services/__tests__/coreRpcClient.test.ts b/app/src/services/__tests__/coreRpcClient.test.ts
index 84667bddd8..33faf8aee2 100644
--- a/app/src/services/__tests__/coreRpcClient.test.ts
+++ b/app/src/services/__tests__/coreRpcClient.test.ts
@@ -176,7 +176,7 @@ describe('coreRpcClient', () => {
     ['openhuman.set_browser_allow_all', 'openhuman.config_set_browser_allow_all'],
     ['openhuman.update_browser_settings', 'openhuman.config_update_browser_settings'],
     ['openhuman.update_memory_settings', 'openhuman.config_update_memory_settings'],
-    ['openhuman.update_model_settings', 'openhuman.config_update_model_settings'],
+    ['openhuman.update_model_settings', 'openhuman.inference_update_model_settings'],
     ['openhuman.update_runtime_settings', 'openhuman.config_update_runtime_settings'],
     [
       'openhuman.update_screen_intelligence_settings',
diff --git a/app/src/services/__tests__/rpcMethods.test.ts b/app/src/services/__tests__/rpcMethods.test.ts
index 502e886832..dae81085ff 100644
--- a/app/src/services/__tests__/rpcMethods.test.ts
+++ b/app/src/services/__tests__/rpcMethods.test.ts
@@ -46,7 +46,7 @@ describe('rpcMethods catalog', () => {
 
   test('legacy aliases point at canonical method values', () => {
     expect(LEGACY_METHOD_ALIASES['openhuman.update_model_settings']).toBe(
-      CORE_RPC_METHODS.configUpdateModelSettings
+      CORE_RPC_METHODS.inferenceUpdateModelSettings
     );
     expect(LEGACY_METHOD_ALIASES['openhuman.workspace_onboarding_flag_set']).toBe(
       CORE_RPC_METHODS.configWorkspaceOnboardingFlagSet
@@ -67,6 +67,10 @@ describe('rpcMethods catalog', () => {
         path.resolve(__dirname, '../../../../src/openhuman/providers/schemas.rs'),
         'utf8'
       ),
+      fs.readFileSync(
+        path.resolve(__dirname, '../../../../src/openhuman/inference/schemas.rs'),
+        'utf8'
+      ),
     ].join('\n');
 
     for (const method of Object.values(CORE_RPC_METHODS)) {
@@ -75,6 +79,8 @@ describe('rpcMethods catalog', () => {
       const methodRoot = method.slice('openhuman.'.length);
       const namespace = methodRoot.startsWith('screen_intelligence_')
         ? 'screen_intelligence'
+        : methodRoot.startsWith('inference_')
+          ? 'inference'
         : methodRoot.startsWith('providers_')
           ? 'providers'
           : 'config';
diff --git a/app/src/services/api/__tests__/aiSettingsApi.test.ts b/app/src/services/api/__tests__/aiSettingsApi.test.ts
index 65850f447e..673e27a2b3 100644
--- a/app/src/services/api/__tests__/aiSettingsApi.test.ts
+++ b/app/src/services/api/__tests__/aiSettingsApi.test.ts
@@ -612,7 +612,7 @@ describe('listProviderModels', () => {
     mockIsTauri.mockReturnValue(true);
   });
 
-  it('dispatches openhuman.providers_list_models with provider_id and returns models', async () => {
+  it('dispatches openhuman.inference_list_models with provider_id and returns models', async () => {
     mockCallCoreRpc.mockResolvedValue({
       result: {
         models: [
@@ -625,7 +625,7 @@ describe('listProviderModels', () => {
     const models = await listProviderModels('p_openai_1');
 
     expect(mockCallCoreRpc).toHaveBeenCalledWith({
-      method: 'openhuman.providers_list_models',
+      method: 'openhuman.inference_list_models',
       params: { provider_id: 'p_openai_1' },
     });
     expect(models).toHaveLength(2);
diff --git a/app/src/services/api/aiSettingsApi.ts b/app/src/services/api/aiSettingsApi.ts
index 7a5403c2ca..bf5b7f544e 100644
--- a/app/src/services/api/aiSettingsApi.ts
+++ b/app/src/services/api/aiSettingsApi.ts
@@ -4,7 +4,7 @@
  * Sits between the panel's React state and the Rust JSON-RPC core. Three
  * orthogonal surfaces in one place:
  *
- *  1. Cloud providers + per-workload routing → `openhuman.update_model_settings`
+ *  1. Cloud providers + per-workload routing → `openhuman.inference_update_model_settings`
  *  2. API keys for cloud providers           → `openhuman.auth_*_provider_credentials`
  *                                              (encrypted at rest in
  *                                              `auth-profiles.json`)
@@ -16,7 +16,6 @@
  * presentation.
  */
 import { callCoreRpc } from '../../services/coreRpcClient';
-import { CORE_RPC_METHODS } from '../../services/rpcMethods';
 import {
   authListProviderCredentials,
   type AuthProfileSummary,
@@ -271,7 +270,7 @@ export async function listProviderModels(providerId: string): Promise<ModelInfo[
   }
   try {
     const res = await callCoreRpc<{ result: { models: ModelInfo[] } }>({
-      method: CORE_RPC_METHODS.providersListModels,
+      method: 'openhuman.inference_list_models',
       params: { provider_id: providerId },
     });
     return res?.result?.models ?? [];
diff --git a/app/src/services/rpcMethods.ts b/app/src/services/rpcMethods.ts
index bc9320522f..008e539fb7 100644
--- a/app/src/services/rpcMethods.ts
+++ b/app/src/services/rpcMethods.ts
@@ -15,7 +15,15 @@ export const CORE_RPC_METHODS = {
   configWorkspaceOnboardingFlagExists: 'openhuman.config_workspace_onboarding_flag_exists',
   configWorkspaceOnboardingFlagSet: 'openhuman.config_workspace_onboarding_flag_set',
   corePing: 'core.ping',
-  providersListModels: 'openhuman.providers_list_models',
+  inferenceApplyPreset: 'openhuman.inference_apply_preset',
+  inferenceDiagnostics: 'openhuman.inference_diagnostics',
+  inferenceDeviceProfile: 'openhuman.inference_device_profile',
+  inferenceGetClientConfig: 'openhuman.inference_get_client_config',
+  inferenceListModels: 'openhuman.inference_list_models',
+  inferencePresets: 'openhuman.inference_presets',
+  inferenceUpdateLocalSettings: 'openhuman.inference_update_local_settings',
+  inferenceUpdateModelSettings: 'openhuman.inference_update_model_settings',
+  providersListModels: 'openhuman.inference_list_models',
   screenIntelligenceStatus: 'openhuman.screen_intelligence_status',
 } as const;
 
@@ -32,15 +40,20 @@ export const LEGACY_METHOD_ALIASES: Record<string, CoreRpcMethod> = {
   'openhuman.update_browser_settings': CORE_RPC_METHODS.configUpdateBrowserSettings,
   'openhuman.update_composio_trigger_settings':
     CORE_RPC_METHODS.configUpdateComposioTriggerSettings,
-  'openhuman.update_local_ai_settings': CORE_RPC_METHODS.configUpdateLocalAiSettings,
+  'openhuman.update_local_ai_settings': CORE_RPC_METHODS.inferenceUpdateLocalSettings,
   'openhuman.update_memory_settings': CORE_RPC_METHODS.configUpdateMemorySettings,
-  'openhuman.update_model_settings': CORE_RPC_METHODS.configUpdateModelSettings,
+  'openhuman.update_model_settings': CORE_RPC_METHODS.inferenceUpdateModelSettings,
   'openhuman.update_runtime_settings': CORE_RPC_METHODS.configUpdateRuntimeSettings,
   'openhuman.update_screen_intelligence_settings':
     CORE_RPC_METHODS.configUpdateScreenIntelligenceSettings,
   'openhuman.workspace_onboarding_flag_exists':
     CORE_RPC_METHODS.configWorkspaceOnboardingFlagExists,
   'openhuman.workspace_onboarding_flag_set': CORE_RPC_METHODS.configWorkspaceOnboardingFlagSet,
+  'openhuman.local_ai_apply_preset': CORE_RPC_METHODS.inferenceApplyPreset,
+  'openhuman.local_ai_device_profile': CORE_RPC_METHODS.inferenceDeviceProfile,
+  'openhuman.local_ai_diagnostics': CORE_RPC_METHODS.inferenceDiagnostics,
+  'openhuman.local_ai_presets': CORE_RPC_METHODS.inferencePresets,
+  'openhuman.providers_list_models': CORE_RPC_METHODS.inferenceListModels,
 };
 
 export function normalizeRpcMethod(method: string): string {
diff --git a/app/src/utils/tauriCommands/__tests__/config.test.ts b/app/src/utils/tauriCommands/__tests__/config.test.ts
index 1733263649..4f091d2467 100644
--- a/app/src/utils/tauriCommands/__tests__/config.test.ts
+++ b/app/src/utils/tauriCommands/__tests__/config.test.ts
@@ -22,7 +22,7 @@ describe('openhumanGetClientConfig', () => {
     await expect(openhumanGetClientConfig()).rejects.toThrow(/Not running in Tauri/i);
   });
 
-  it('dispatches openhuman.config_get_client_config and returns the response', async () => {
+  it('dispatches openhuman.inference_get_client_config and returns the response', async () => {
     const expected = {
       result: {
         api_url: 'https://api.openai.com/v1/chat/completions',
@@ -36,7 +36,7 @@ describe('openhumanGetClientConfig', () => {
 
     const got = await openhumanGetClientConfig();
 
-    expect(callCoreRpc).toHaveBeenCalledWith({ method: 'openhuman.config_get_client_config' });
+    expect(callCoreRpc).toHaveBeenCalledWith({ method: 'openhuman.inference_get_client_config' });
     expect(got).toEqual(expected);
   });
 });
diff --git a/app/src/utils/tauriCommands/config.test.ts b/app/src/utils/tauriCommands/config.test.ts
index 5ea1c1d131..c643aeafaf 100644
--- a/app/src/utils/tauriCommands/config.test.ts
+++ b/app/src/utils/tauriCommands/config.test.ts
@@ -35,7 +35,7 @@ describe('tauriCommands/config', () => {
       expect(mockCallCoreRpc).not.toHaveBeenCalled();
     });
 
-    test('forwards the patch to openhuman.config_update_local_ai_settings', async () => {
+    test('forwards the patch to openhuman.inference_update_local_settings', async () => {
       mockCallCoreRpc.mockResolvedValue({
         result: { config: {}, workspace_dir: '/tmp', config_path: '/tmp/cfg.toml' },
         logs: [],
@@ -52,7 +52,7 @@ describe('tauriCommands/config', () => {
       };
       await openhumanUpdateLocalAiSettings(patch);
       expect(mockCallCoreRpc).toHaveBeenCalledWith({
-        method: 'openhuman.config_update_local_ai_settings',
+        method: 'openhuman.inference_update_local_settings',
         params: patch,
       });
     });
diff --git a/app/src/utils/tauriCommands/config.ts b/app/src/utils/tauriCommands/config.ts
index 7094fba1bc..625d7bb8f8 100644
--- a/app/src/utils/tauriCommands/config.ts
+++ b/app/src/utils/tauriCommands/config.ts
@@ -225,7 +225,7 @@ export async function openhumanGetClientConfig(): Promise<CommandResponse<Client
     throw new Error('Not running in Tauri');
   }
   return await callCoreRpc<CommandResponse<ClientConfig>>({
-    method: 'openhuman.config_get_client_config',
+    method: 'openhuman.inference_get_client_config',
   });
 }
 
@@ -236,7 +236,7 @@ export async function openhumanUpdateModelSettings(
     throw new Error('Not running in Tauri');
   }
   return await callCoreRpc<CommandResponse<ConfigSnapshot>>({
-    method: CORE_RPC_METHODS.configUpdateModelSettings,
+    method: 'openhuman.inference_update_model_settings',
     params: update,
   });
 }
@@ -296,7 +296,7 @@ export async function openhumanUpdateLocalAiSettings(
     throw new Error('Not running in Tauri');
   }
   return await callCoreRpc<CommandResponse<ConfigSnapshot>>({
-    method: 'openhuman.config_update_local_ai_settings',
+    method: 'openhuman.inference_update_local_settings',
     params: update,
   });
 }
diff --git a/app/src/utils/tauriCommands/localAi.ts b/app/src/utils/tauriCommands/localAi.ts
index 486cf854bc..bcae39a194 100644
--- a/app/src/utils/tauriCommands/localAi.ts
+++ b/app/src/utils/tauriCommands/localAi.ts
@@ -365,23 +365,23 @@ export async function openhumanLocalAiDownloadAsset(
 }
 
 export async function openhumanLocalAiDeviceProfile(): Promise<DeviceProfileResult> {
-  return await callCoreRpc<DeviceProfileResult>({ method: 'openhuman.local_ai_device_profile' });
+  return await callCoreRpc<DeviceProfileResult>({ method: 'openhuman.inference_device_profile' });
 }
 
 export async function openhumanLocalAiPresets(): Promise<PresetsResponse> {
-  return await callCoreRpc<PresetsResponse>({ method: 'openhuman.local_ai_presets' });
+  return await callCoreRpc<PresetsResponse>({ method: 'openhuman.inference_presets' });
 }
 
 export async function openhumanLocalAiApplyPreset(tier: string): Promise<ApplyPresetResult> {
   return await callCoreRpc<ApplyPresetResult>({
-    method: 'openhuman.local_ai_apply_preset',
+    method: 'openhuman.inference_apply_preset',
     params: { tier },
   });
 }
 
 export async function openhumanLocalAiDiagnostics(): Promise<LocalAiDiagnostics> {
   return await callCoreRpc<LocalAiDiagnostics>({
-    method: 'openhuman.local_ai_diagnostics',
+    method: 'openhuman.inference_diagnostics',
     params: {},
   });
 }
diff --git a/src/core/all.rs b/src/core/all.rs
index 174e1c6057..be8e367efd 100644
--- a/src/core/all.rs
+++ b/src/core/all.rs
@@ -143,8 +143,6 @@ fn build_registered_controllers() -> Vec<RegisteredController> {
         .extend(crate::openhuman::channels::controllers::all_channels_registered_controllers());
     // Persistent configuration management
     controllers.extend(crate::openhuman::config::all_config_registered_controllers());
-    // Cloud provider model catalog queries
-    controllers.extend(crate::openhuman::providers::all_providers_registered_controllers());
     // Local sidecar reachability + backend Socket.IO state diagnostics (#1527)
     controllers.extend(crate::openhuman::connectivity::all_connectivity_registered_controllers());
     // User credentials and session management
@@ -276,7 +274,6 @@ fn build_declared_controller_schemas() -> Vec<ControllerSchema> {
         .extend(crate::openhuman::channels::providers::web::all_web_channel_controller_schemas());
     schemas.extend(crate::openhuman::channels::controllers::all_channels_controller_schemas());
     schemas.extend(crate::openhuman::config::all_config_controller_schemas());
-    schemas.extend(crate::openhuman::providers::all_providers_controller_schemas());
     schemas.extend(crate::openhuman::connectivity::all_connectivity_controller_schemas());
     schemas.extend(crate::openhuman::credentials::all_credentials_controller_schemas());
     schemas.extend(crate::openhuman::service::all_service_controller_schemas());
diff --git a/src/core/legacy_aliases.rs b/src/core/legacy_aliases.rs
index e1c67b94f5..32fbe5e400 100644
--- a/src/core/legacy_aliases.rs
+++ b/src/core/legacy_aliases.rs
@@ -53,7 +53,7 @@ const LEGACY_ALIASES: &[(&str, &str)] = &[
     ),
     (
         "openhuman.update_local_ai_settings",
-        "openhuman.config_update_local_ai_settings",
+        "openhuman.inference_update_local_settings",
     ),
     (
         "openhuman.update_memory_settings",
@@ -61,7 +61,7 @@ const LEGACY_ALIASES: &[(&str, &str)] = &[
     ),
     (
         "openhuman.update_model_settings",
-        "openhuman.config_update_model_settings",
+        "openhuman.inference_update_model_settings",
     ),
     (
         "openhuman.update_runtime_settings",
@@ -79,6 +79,23 @@ const LEGACY_ALIASES: &[(&str, &str)] = &[
         "openhuman.workspace_onboarding_flag_set",
         "openhuman.config_workspace_onboarding_flag_set",
     ),
+    (
+        "openhuman.local_ai_apply_preset",
+        "openhuman.inference_apply_preset",
+    ),
+    (
+        "openhuman.local_ai_device_profile",
+        "openhuman.inference_device_profile",
+    ),
+    (
+        "openhuman.local_ai_diagnostics",
+        "openhuman.inference_diagnostics",
+    ),
+    ("openhuman.local_ai_presets", "openhuman.inference_presets"),
+    (
+        "openhuman.providers_list_models",
+        "openhuman.inference_list_models",
+    ),
 ];
 
 /// Returns the server-side legacy → canonical RPC alias table.
diff --git a/src/openhuman/config/ops.rs b/src/openhuman/config/ops.rs
index 2e4cbe39ce..a681d1b5be 100644
--- a/src/openhuman/config/ops.rs
+++ b/src/openhuman/config/ops.rs
@@ -205,6 +205,65 @@ pub fn snapshot_config_json(config: &Config) -> Result<serde_json::Value, String
     }))
 }
 
+/// Serializes the client-facing AI config slice consumed by the settings UI.
+pub fn client_config_json(config: &Config) -> serde_json::Value {
+    let app_version =
+        std::env::var("OPENHUMAN_APP_VERSION").unwrap_or_else(|_| "unknown".to_string());
+    let api_key_set = config
+        .api_key
+        .as_deref()
+        .map(|k| !k.trim().is_empty())
+        .unwrap_or(false);
+    let model_routes: Vec<serde_json::Value> = config
+        .model_routes
+        .iter()
+        .map(|r| serde_json::json!({ "hint": r.hint, "model": r.model }))
+        .collect();
+    let cloud_providers: Vec<serde_json::Value> = config
+        .cloud_providers
+        .iter()
+        .map(|c| {
+            serde_json::json!({
+                "id": c.id,
+                "slug": c.slug,
+                "label": c.label,
+                "endpoint": c.endpoint,
+                "auth_style": c.auth_style.as_str(),
+            })
+        })
+        .collect();
+
+    serde_json::json!({
+        "api_url": config.api_url,
+        "inference_url": config.inference_url,
+        "default_model": config.default_model,
+        "app_version": app_version,
+        "api_key_set": api_key_set,
+        "model_routes": model_routes,
+        "cloud_providers": cloud_providers,
+        "primary_cloud": config.primary_cloud,
+        "reasoning_provider": config.reasoning_provider,
+        "agentic_provider": config.agentic_provider,
+        "coding_provider": config.coding_provider,
+        "memory_provider": config.memory_provider,
+        "embeddings_provider": config.embeddings_provider,
+        "heartbeat_provider": config.heartbeat_provider,
+        "learning_provider": config.learning_provider,
+        "subconscious_provider": config.subconscious_provider,
+    })
+}
+
+/// Loads config and returns the client-facing AI config slice.
+pub async fn load_and_get_client_config_snapshot() -> Result<RpcOutcome<serde_json::Value>, String>
+{
+    let config = load_config_with_timeout().await?;
+    let snapshot = client_config_json(&config);
+    Ok(RpcOutcome::new(
+        snapshot,
+        vec!["client config read".to_string()],
+    ))
+}
+
 #[derive(Debug, Clone, Default)]
 pub struct ModelSettingsPatch {
     pub api_url: Option<String>,
diff --git a/src/openhuman/config/schemas.rs b/src/openhuman/config/schemas.rs
index 677452d291..7713d8e279 100644
--- a/src/openhuman/config/schemas.rs
+++ b/src/openhuman/config/schemas.rs
@@ -873,71 +873,13 @@ fn handle_get_config(_params: Map<String, Value>) -> ControllerFuture {
 fn handle_get_client_config(_params: Map<String, Value>) -> ControllerFuture {
     Box::pin(async move {
         log::debug!("[config][rpc] get_client_config enter");
-        let config = match config_rpc::load_config_with_timeout().await {
-            Ok(c) => c,
+        match config_rpc::load_and_get_client_config_snapshot().await {
+            Ok(snapshot) => to_json(snapshot),
             Err(err) => {
                 log::warn!("[config][rpc] get_client_config load failed: {err}");
-                return Err(err);
+                Err(err)
             }
-        };
-        let app_version =
-            std::env::var("OPENHUMAN_APP_VERSION").unwrap_or_else(|_| "unknown".to_string());
-        let api_key_set = config
-            .api_key
-            .as_deref()
-            .map(|k| !k.trim().is_empty())
-            .unwrap_or(false);
-        let model_routes: Vec<serde_json::Value> = config
-            .model_routes
-            .iter()
-            .map(|r| serde_json::json!({ "hint": r.hint, "model": r.model }))
-            .collect();
-
-        // Surface the new unified AI routing surface (cloud_providers + the
-        // 8 per-workload provider strings + primary_cloud) so the AI
-        // settings panel doesn't have to round-trip the full Config blob.
-        let cloud_providers: Vec<serde_json::Value> = config
-            .cloud_providers
-            .iter()
-            .map(|c| {
-                serde_json::json!({
-                    "id": c.id,
-                    "slug": c.slug,
-                    "label": c.label,
-                    "endpoint": c.endpoint,
-                    "auth_style": c.auth_style.as_str(),
-                })
-            })
-            .collect();
-
-        log::debug!(
-            "[config][rpc] get_client_config ok api_key_set={} model_routes_count={} \
-             cloud_providers_count={}",
-            api_key_set,
-            model_routes.len(),
-            cloud_providers.len(),
-        );
-        to_json(RpcOutcome::new(
-            serde_json::json!({
-                "api_url": config.api_url,
-                "inference_url": config.inference_url,
-                "default_model": config.default_model,
-                "app_version": app_version,
-                "api_key_set": api_key_set,
-                "model_routes": model_routes,
-                "cloud_providers": cloud_providers,
-                "primary_cloud": config.primary_cloud,
-                "reasoning_provider": config.reasoning_provider,
-                "agentic_provider": config.agentic_provider,
-                "coding_provider": config.coding_provider,
-                "memory_provider": config.memory_provider,
-                "embeddings_provider": config.embeddings_provider,
-                "heartbeat_provider": config.heartbeat_provider,
-                "learning_provider": config.learning_provider,
-                "subconscious_provider": config.subconscious_provider,
-            }),
-            vec!["client config read".to_string()],
-        ))
+        }
     })
 }
 
diff --git a/src/openhuman/inference/ops.rs b/src/openhuman/inference/ops.rs
index f0950966c8..56d6258a78 100644
--- a/src/openhuman/inference/ops.rs
+++ b/src/openhuman/inference/ops.rs
@@ -1,11 +1,14 @@
 //! JSON-RPC controller surface for inference operations.
 
+use crate::openhuman::config::rpc as config_rpc;
 use crate::openhuman::config::Config;
 use crate::openhuman::local_ai;
 use crate::openhuman::local_ai::ops::{LocalAiChatMessage, ReactionDecision};
 use crate::openhuman::local_ai::sentiment::SentimentResult;
 use crate::openhuman::local_ai::{LocalAiEmbeddingResult, LocalAiStatus};
+use crate::openhuman::providers;
 use crate::rpc::RpcOutcome;
+use serde_json::{json, Value};
 use tracing::{debug, error};
 
 const LOG_PREFIX: &str = "[inference::ops]";
@@ -158,6 +161,169 @@ pub async fn inference_analyze_sentiment(
     result
 }
 
+pub async fn inference_get_client_config() -> Result<RpcOutcome<Value>, String> {
+    debug!("{LOG_PREFIX} get_client_config:start");
+    let result = config_rpc::load_and_get_client_config_snapshot().await;
+    match &result {
+        Ok(_) => debug!("{LOG_PREFIX} get_client_config:ok"),
+        Err(err) => error!(error = %err, "{LOG_PREFIX} get_client_config:error"),
+    }
+    result
+}
+
+pub async fn inference_update_model_settings(
+    update: config_rpc::ModelSettingsPatch,
+) -> Result<RpcOutcome<Value>, String> {
+    debug!("{LOG_PREFIX} update_model_settings:start");
+    let result = config_rpc::load_and_apply_model_settings(update).await;
+    match &result {
+        Ok(_) => debug!("{LOG_PREFIX} update_model_settings:ok"),
+        Err(err) => error!(error = %err, "{LOG_PREFIX} update_model_settings:error"),
+    }
+    result
+}
+
+pub async fn inference_update_local_settings(
+    update: config_rpc::LocalAiSettingsPatch,
+) -> Result<RpcOutcome<Value>, String> {
+    debug!("{LOG_PREFIX} update_local_settings:start");
+    let result = config_rpc::load_and_apply_local_ai_settings(update).await;
+    match &result {
+        Ok(_) => debug!("{LOG_PREFIX} update_local_settings:ok"),
+        Err(err) => error!(error = %err, "{LOG_PREFIX} update_local_settings:error"),
+    }
+    result
+}
+
+pub async fn inference_list_models(provider_id: &str) -> Result<RpcOutcome<Value>, String> {
+    debug!(provider_id, "{LOG_PREFIX} list_models:start");
+    let result = providers::ops::list_configured_models(provider_id).await;
+    match &result {
+        Ok(_) => debug!("{LOG_PREFIX} list_models:ok"),
+        Err(err) => error!(error = %err, "{LOG_PREFIX} list_models:error"),
+    }
+    result
+}
+
+pub async fn inference_device_profile() -> Result<RpcOutcome<Value>, String> {
+    debug!("{LOG_PREFIX} device_profile:start");
+    let profile = local_ai::device::detect_device_profile();
+    let result = Ok(RpcOutcome::single_log(
+        serde_json::to_value(profile).map_err(|e| format!("serialize: {e}"))?,
+        "inference device profile fetched",
+    ));
+    debug!("{LOG_PREFIX} device_profile:ok");
+    result
+}
+
+pub async fn inference_presets() -> Result<RpcOutcome<Value>, String> {
+    debug!("{LOG_PREFIX} presets:start");
+    let config = config_rpc::load_config_with_timeout().await?;
+    let device = local_ai::device::detect_device_profile();
+    let recommended = local_ai::presets::recommend_tier(&device);
+    let current = local_ai::presets::current_tier_from_config(&config.local_ai);
+    let selected_tier = config.local_ai.selected_tier.as_ref().and_then(|value| {
+        let normalized = value.trim().to_ascii_lowercase();
+        local_ai::presets::ModelTier::from_str_opt(&normalized)
+            .map(|tier| tier.as_str().to_string())
+            .or_else(|| (!normalized.is_empty()).then_some(normalized))
+    });
+    let presets = local_ai::presets::mvp_presets();
+    let recommend_disabled = local_ai::presets::should_default_to_cloud_fallback(&device);
+    let result = Ok(RpcOutcome::single_log(
+        json!({
+            "presets": presets,
+            "recommended_tier": recommended,
+            "current_tier": current,
+            "selected_tier": selected_tier,
+            "device": device,
+            "recommend_disabled": recommend_disabled,
+            "local_ai_enabled": config.local_ai.runtime_enabled,
+        }),
+        "inference presets fetched",
+    ));
+    debug!("{LOG_PREFIX} presets:ok");
+    result
+}
+
+pub async fn inference_apply_preset(tier: &str) -> Result<RpcOutcome<Value>, String> {
+    let tier_str = tier.trim().to_ascii_lowercase();
+    debug!(tier = %tier_str, "{LOG_PREFIX} apply_preset:start");
+
+    if tier_str == "disabled" {
+        let mut config = config_rpc::load_config_with_timeout().await?;
+        config.local_ai.runtime_enabled = false;
+        config.local_ai.selected_tier = Some("disabled".to_string());
+        config.local_ai.opt_in_confirmed = false;
+        config
+            .save()
+            .await
+            .map_err(|e| format!("save config: {e}"))?;
+        debug!("{LOG_PREFIX} apply_preset:disabled");
+        return Ok(RpcOutcome::single_log(
+            json!({
+                "applied_tier": "disabled",
+                "local_ai_enabled": false,
+            }),
+            "inference preset applied",
+        ));
+    }
+
+    let tier = local_ai::presets::ModelTier::from_str_opt(&tier_str).ok_or_else(|| {
+        format!(
+            "invalid tier '{}': expected one of disabled or ram_2_4gb",
+            tier_str
+        )
+    })?;
+
+    if tier == local_ai::presets::ModelTier::Custom {
+        return Err("cannot apply 'custom' tier; set model IDs directly".to_string());
+    }
+    if !tier.is_mvp_allowed() {
+        return Err(format!(
+            "tier '{}' is not available in this build; only the 1B local model preset is supported",
+            tier_str
+        ));
+    }
+
+    let mut config = config_rpc::load_config_with_timeout().await?;
+    config.local_ai.runtime_enabled = true;
+    config.local_ai.opt_in_confirmed = true;
+    local_ai::presets::apply_preset_to_config(&mut config.local_ai, tier);
+    config
+        .save()
+        .await
+        .map_err(|e| format!("save config: {e}"))?;
+
+    debug!(tier = %tier_str, "{LOG_PREFIX} apply_preset:ok");
+    Ok(RpcOutcome::single_log(
+        json!({
+            "applied_tier": tier,
+            "chat_model_id": config.local_ai.chat_model_id,
+            "vision_model_id": config.local_ai.vision_model_id,
+            "embedding_model_id": config.local_ai.embedding_model_id,
+            "quantization": config.local_ai.quantization,
+            "vision_mode": local_ai::presets::vision_mode_for_config(&config.local_ai),
+            "local_ai_enabled": true,
+        }),
+        "inference preset applied",
+    ))
+}
+
+pub async fn inference_diagnostics(config: &Config) -> Result<RpcOutcome<Value>, String> {
+    debug!("{LOG_PREFIX} diagnostics:start");
+    let service = local_ai::global(config);
+    let result = service
+        .diagnostics(config)
+        .await
+        .map(|value| RpcOutcome::single_log(value, "inference diagnostics fetched"));
+    match &result {
+        Ok(_) => debug!("{LOG_PREFIX} diagnostics:ok"),
+        Err(err) => error!(error = %err, "{LOG_PREFIX} diagnostics:error"),
+    }
+    result
+}
+
 #[cfg(test)]
 #[path = "ops_tests.rs"]
 mod tests;
diff --git a/src/openhuman/inference/ops_tests.rs b/src/openhuman/inference/ops_tests.rs
index 655bc029a9..7d76274b11 100644
--- a/src/openhuman/inference/ops_tests.rs
+++ b/src/openhuman/inference/ops_tests.rs
@@ -76,3 +76,36 @@ async fn inference_analyze_sentiment_handles_empty_message() {
         .expect("sentiment");
     assert_eq!(outcome.value.valence, "neutral");
 }
+
+#[tokio::test]
+async fn inference_get_client_config_returns_safe_snapshot() {
+    let (config, _tmp) = disabled_config();
+    config.save().await.expect("save config");
+
+    let outcome = inference_get_client_config()
+        .await
+        .expect("client config snapshot");
+    assert!(outcome.value.get("cloud_providers").is_some());
+    assert!(outcome.value.get("api_key_set").is_some());
+}
+
+#[tokio::test]
+async fn inference_apply_preset_rejects_invalid_tier() {
+    let (config, _tmp) = disabled_config();
+    config.save().await.expect("save config");
+
+    let err = inference_apply_preset("ram_bogus")
+        .await
+        .expect_err("invalid tier should fail");
+    assert!(err.contains("invalid tier"));
+}
+
+#[tokio::test]
+async fn inference_presets_returns_recommended_tier() {
+    let (config, _tmp) = disabled_config();
+    config.save().await.expect("save config");
+
+    let outcome = inference_presets().await.expect("presets");
+    assert!(outcome.value.get("recommended_tier").is_some());
+    assert!(outcome.value.get("presets").is_some());
+}
diff --git a/src/openhuman/inference/schemas.rs b/src/openhuman/inference/schemas.rs
index 9392a19c78..bf4308998a 100644
--- a/src/openhuman/inference/schemas.rs
+++ b/src/openhuman/inference/schemas.rs
@@ -55,9 +55,82 @@ struct InferenceAnalyzeSentimentParams {
     message: String,
 }
 
+#[derive(Debug, Deserialize)]
+struct InferenceModelRouteUpdate {
+    hint: String,
+    model: String,
+}
+
+#[derive(Debug, Deserialize)]
+struct InferenceCloudProviderUpdate {
+    id: Option<String>,
+    slug: String,
+    #[serde(default)]
+    label: Option<String>,
+    endpoint: String,
+    #[serde(default)]
+    auth_style: Option<String>,
+    #[serde(rename = "type", default)]
+    legacy_type: Option<String>,
+    #[serde(default)]
+    default_model: Option<String>,
+}
+
+#[derive(Debug, Deserialize)]
+struct InferenceUpdateModelSettingsParams {
+    api_url: Option<String>,
+    inference_url: Option<String>,
+    api_key: Option<String>,
+    default_model: Option<String>,
+    default_temperature: Option<f64>,
+    model_routes: Option<Vec<InferenceModelRouteUpdate>>,
+    cloud_providers: Option<Vec<InferenceCloudProviderUpdate>>,
+    primary_cloud: Option<String>,
+    reasoning_provider: Option<String>,
+    agentic_provider: Option<String>,
+    coding_provider: Option<String>,
+    memory_provider: Option<String>,
+    embeddings_provider: Option<String>,
+    heartbeat_provider: Option<String>,
+    learning_provider: Option<String>,
+    subconscious_provider: Option<String>,
+}
+
+#[derive(Debug, Deserialize)]
+struct InferenceUpdateLocalSettingsParams {
+    runtime_enabled: Option<bool>,
+    opt_in_confirmed: Option<bool>,
+    provider: Option<String>,
+    base_url: Option<String>,
+    model_id: Option<String>,
+    chat_model_id: Option<String>,
+    usage_embeddings: Option<bool>,
+    usage_heartbeat: Option<bool>,
+    usage_learning_reflection: Option<bool>,
+    usage_subconscious: Option<bool>,
+}
+
+#[derive(Debug, Deserialize)]
+struct InferenceListModelsParams {
+    provider_id: String,
+}
+
+#[derive(Debug, Deserialize)]
+struct InferenceApplyPresetParams {
+    tier: String,
+}
+
 pub fn all_controller_schemas() -> Vec<ControllerSchema> {
     vec![
         schemas("status"),
+        schemas("get_client_config"),
+        schemas("update_model_settings"),
+        schemas("update_local_settings"),
+        schemas("list_models"),
+        schemas("device_profile"),
+        schemas("presets"),
+        schemas("apply_preset"),
+        schemas("diagnostics"),
         schemas("summarize"),
         schemas("prompt"),
         schemas("vision_prompt"),
@@ -74,6 +147,38 @@ pub fn all_registered_controllers() -> Vec<RegisteredController> {
             schema: schemas("status"),
             handler: handle_inference_status,
         },
+        RegisteredController {
+            schema: schemas("get_client_config"),
+            handler: handle_inference_get_client_config,
+        },
+        RegisteredController {
+            schema: schemas("update_model_settings"),
+            handler: handle_inference_update_model_settings,
+        },
+        RegisteredController {
+            schema: schemas("update_local_settings"),
+            handler: handle_inference_update_local_settings,
+        },
+        RegisteredController {
+            schema: schemas("list_models"),
+            handler: handle_inference_list_models,
+        },
+        RegisteredController {
+            schema: schemas("device_profile"),
+            handler: handle_inference_device_profile,
+        },
+        RegisteredController {
+            schema: schemas("presets"),
+            handler: handle_inference_presets,
+        },
+        RegisteredController {
+            schema: schemas("apply_preset"),
+            handler: handle_inference_apply_preset,
+        },
+        RegisteredController {
+            schema: schemas("diagnostics"),
+            handler: handle_inference_diagnostics,
+        },
         RegisteredController {
             schema: schemas("summarize"),
             handler: handle_inference_summarize,
@@ -114,6 +219,90 @@ pub fn schemas(function: &str) -> ControllerSchema {
             inputs: vec![],
             outputs: vec![json_output("status", "Inference status payload.")],
         },
+        "get_client_config" => ControllerSchema {
+            namespace: "inference",
+            function: "get_client_config",
+            description: "Read the client-facing inference/provider config used by the AI settings UI.",
+            inputs: vec![],
+            outputs: vec![json_output("config", "Client-facing inference config payload.")],
+        },
+        "update_model_settings" => ControllerSchema {
+            namespace: "inference",
+            function: "update_model_settings",
+            description: "Persist cloud-provider routing, custom inference endpoint, and per-workload provider settings.",
+            inputs: vec![
+                optional_string("api_url", "Optional OpenHuman product backend URL."),
+                optional_string("inference_url", "Optional custom inference base URL."),
+                optional_string("api_key", "Optional API key for a custom inference endpoint."),
+                optional_string("default_model", "Optional default model override."),
+                optional_f64("default_temperature", "Optional default temperature override."),
+                optional_json("model_routes", "Optional full replacement for legacy model routes."),
+                optional_json("cloud_providers", "Optional full replacement for configured cloud providers."),
+                optional_string("primary_cloud", "Optional primary cloud provider id."),
+                optional_string("reasoning_provider", "Optional reasoning workload provider string."),
+                optional_string("agentic_provider", "Optional agentic workload provider string."),
+                optional_string("coding_provider", "Optional coding workload provider string."),
+                optional_string("memory_provider", "Optional memory workload provider string."),
+                optional_string("embeddings_provider", "Optional embeddings workload provider string."),
+                optional_string("heartbeat_provider", "Optional heartbeat workload provider string."),
+                optional_string("learning_provider", "Optional learning workload provider string."),
+                optional_string("subconscious_provider", "Optional subconscious workload provider string."),
+            ],
+            outputs: vec![json_output("snapshot", "Updated config snapshot.")],
+        },
+        "update_local_settings" => ControllerSchema {
+            namespace: "inference",
+            function: "update_local_settings",
+            description: "Persist local inference provider selection, endpoint URL, and local-runtime routing flags.",
+            inputs: vec![
+                optional_bool("runtime_enabled", "Enable or disable local inference runtime routing."),
+                optional_bool("opt_in_confirmed", "Persist the local inference opt-in flag."),
+                optional_string("provider", "Optional local provider slug, e.g. ollama or lm_studio."),
+                optional_string("base_url", "Optional local provider base URL."),
+                optional_string("model_id", "Optional generic model id override."),
+                optional_string("chat_model_id", "Optional chat model id override."),
+                optional_bool("usage_embeddings", "Whether embeddings workload may use the local provider."),
+                optional_bool("usage_heartbeat", "Whether heartbeat workload may use the local provider."),
+                optional_bool("usage_learning_reflection", "Whether learning reflection workload may use the local provider."),
+                optional_bool("usage_subconscious", "Whether subconscious workload may use the local provider."),
+            ],
+            outputs: vec![json_output("snapshot", "Updated config snapshot.")],
+        },
+        "list_models" => ControllerSchema {
+            namespace: "inference",
+            function: "list_models",
+            description: "Fetch the available model list from a configured inference provider's /models API.",
+            inputs: vec![required_string("provider_id", "Opaque id of the cloud provider entry to query.")],
+            outputs: vec![json_output("models", "Provider model list payload.")],
+        },
+        "device_profile" => ControllerSchema {
+            namespace: "inference",
+            function: "device_profile",
+            description: "Detect the local hardware profile used for local inference recommendations.",
+            inputs: vec![],
+            outputs: vec![json_output("profile", "Device hardware profile.")],
+        },
+        "presets" => ControllerSchema {
+            namespace: "inference",
+            function: "presets",
+            description: "List local inference model presets with recommendation and current selection.",
+            inputs: vec![],
+            outputs: vec![json_output("presets", "Inference preset payload.")],
+        },
+        "apply_preset" => ControllerSchema {
+            namespace: "inference",
+            function: "apply_preset",
+            description: "Apply a local inference preset to the persisted config.",
+            inputs: vec![required_string("tier", "Tier to apply: ram_2_4gb or disabled.")],
+            outputs: vec![json_output("result", "Applied preset payload.")],
+        },
+        "diagnostics" => ControllerSchema {
+            namespace: "inference",
+            function: "diagnostics",
+            description: "Run diagnostics for the configured local inference provider endpoint and expected models.",
+            inputs: vec![],
+            outputs: vec![json_output("diagnostics", "Inference diagnostics payload.")],
+        },
         "summarize" => ControllerSchema {
             namespace: "inference",
             function: "summarize",
@@ -226,6 +415,33 @@ fn optional_u64(name: &'static str, comment: &'static str) -> FieldSchema {
     }
 }
 
+fn optional_f64(name: &'static str, comment: &'static str) -> FieldSchema {
+    FieldSchema {
+        name,
+        ty: TypeSchema::Option(Box::new(TypeSchema::F64)),
+        comment,
+        required: false,
+    }
+}
+
+fn optional_string(name: &'static str, comment: &'static str) -> FieldSchema {
+    FieldSchema {
+        name,
+        ty: TypeSchema::Option(Box::new(TypeSchema::String)),
+        comment,
+        required: false,
+    }
+}
+
+fn optional_json(name: &'static str, comment: &'static str) -> FieldSchema {
+    FieldSchema {
+        name,
+        ty: TypeSchema::Option(Box::new(TypeSchema::Json)),
+        comment,
+        required: false,
+    }
+}
+
 fn json_output(name: &'static str, comment: &'static str) -> FieldSchema {
     FieldSchema {
         name,
@@ -242,6 +458,157 @@ fn handle_inference_status(_params: Map<String, Value>) -> ControllerFuture {
     })
 }
 
+fn handle_inference_get_client_config(_params: Map<String, Value>) -> ControllerFuture {
+    Box::pin(async move {
+        to_json(crate::openhuman::inference::rpc::inference_get_client_config().await?)
+    })
+}
+
+fn handle_inference_update_model_settings(params: Map<String, Value>) -> ControllerFuture {
+    Box::pin(async move {
+        let update = deserialize_params::<InferenceUpdateModelSettingsParams>(params)?;
+        let patch = config_rpc::ModelSettingsPatch {
+            api_url: update.api_url,
+            inference_url: update.inference_url,
+            api_key: update.api_key,
+            default_model: update.default_model,
+            default_temperature: update.default_temperature,
+            model_routes: update.model_routes.map(|routes| {
+                routes
+                    .into_iter()
+                    .map(|route| crate::openhuman::config::ModelRouteConfig {
+                        hint: route.hint,
+                        model: route.model,
+                    })
+                    .collect()
+            }),
+            cloud_providers: update
+                .cloud_providers
+                .map(|entries| {
+                    use crate::openhuman::config::schema::cloud_providers::{
+                        generate_provider_id, is_slug_reserved, migrate_legacy_fields, AuthStyle,
+                        CloudProviderCreds,
+                    };
+                    entries
+                        .into_iter()
+                        .map(|entry| {
+                            let slug = entry.slug.trim().to_string();
+                            if slug.is_empty() {
+                                return Err("cloud provider slug must not be empty".to_string());
+                            }
+                            if is_slug_reserved(&slug) {
+                                return Err(format!(
+                                    "slug '{}' is reserved and cannot be used for a custom provider",
+                                    slug
+                                ));
+                            }
+                            let auth_style = match entry
+                                .auth_style
+                                .as_deref()
+                                .unwrap_or("bearer")
+                                .to_ascii_lowercase()
+                                .as_str()
+                            {
+                                "bearer" => AuthStyle::Bearer,
+                                "anthropic" => AuthStyle::Anthropic,
+                                "openhuman_jwt" | "openhumanjwt" => AuthStyle::OpenhumanJwt,
+                                "none" => AuthStyle::None,
+                                other => {
+                                    return Err(format!(
+                                        "unknown auth_style '{}'; valid: bearer, anthropic, openhuman_jwt, none",
+                                        other
+                                    ))
+                                }
+                            };
+                            let id = entry
+                                .id
+                                .filter(|s| !s.trim().is_empty())
+                                .unwrap_or_else(|| generate_provider_id(&slug));
+                            let label = entry
+                                .label
+                                .filter(|s| !s.trim().is_empty())
+                                .unwrap_or_else(|| slug.clone());
+                            let mut provider = CloudProviderCreds {
+                                id,
+                                slug,
+                                label,
+                                endpoint: entry.endpoint,
+                                auth_style,
+                                legacy_type: entry.legacy_type,
+                                default_model: entry.default_model,
+                            };
+                            migrate_legacy_fields(&mut provider);
+                            Ok(provider)
+                        })
+                        .collect::<Result<Vec<_>, String>>()
+                })
+                .transpose()?,
+            primary_cloud: update.primary_cloud,
+            reasoning_provider: update.reasoning_provider,
+            agentic_provider: update.agentic_provider,
+            coding_provider: update.coding_provider,
+            memory_provider: update.memory_provider,
+            embeddings_provider: update.embeddings_provider,
+            heartbeat_provider: update.heartbeat_provider,
+            learning_provider: update.learning_provider,
+            subconscious_provider: update.subconscious_provider,
+        };
+        to_json(crate::openhuman::inference::rpc::inference_update_model_settings(patch).await?)
+    })
+}
+
+fn handle_inference_update_local_settings(params: Map<String, Value>) -> ControllerFuture {
+    Box::pin(async move {
+        let update = deserialize_params::<InferenceUpdateLocalSettingsParams>(params)?;
+        let patch = config_rpc::LocalAiSettingsPatch {
+            runtime_enabled: update.runtime_enabled,
+            opt_in_confirmed: update.opt_in_confirmed,
+            provider: update.provider,
+            base_url: update.base_url,
+            model_id: update.model_id,
+            chat_model_id: update.chat_model_id,
+            usage_embeddings: update.usage_embeddings,
+            usage_heartbeat: update.usage_heartbeat,
+            usage_learning_reflection: update.usage_learning_reflection,
+            usage_subconscious: update.usage_subconscious,
+        };
+        to_json(crate::openhuman::inference::rpc::inference_update_local_settings(patch).await?)
+    })
+}
+
+fn handle_inference_list_models(params: Map<String, Value>) -> ControllerFuture {
+    Box::pin(async move {
+        let request = deserialize_params::<InferenceListModelsParams>(params)?;
+        to_json(
+            crate::openhuman::inference::rpc::inference_list_models(&request.provider_id).await?,
+        )
+    })
+}
+
+fn handle_inference_device_profile(_params: Map<String, Value>) -> ControllerFuture {
+    Box::pin(
+        async move { to_json(crate::openhuman::inference::rpc::inference_device_profile().await?) },
+    )
+}
+
+fn handle_inference_presets(_params: Map<String, Value>) -> ControllerFuture {
+    Box::pin(async move { to_json(crate::openhuman::inference::rpc::inference_presets().await?) })
+}
+
+fn handle_inference_apply_preset(params: Map<String, Value>) -> ControllerFuture {
+    Box::pin(async move {
+        let request = deserialize_params::<InferenceApplyPresetParams>(params)?;
+        to_json(crate::openhuman::inference::rpc::inference_apply_preset(&request.tier).await?)
+    })
+}
+
+fn handle_inference_diagnostics(_params: Map<String, Value>) -> ControllerFuture {
+    Box::pin(async move {
+        let config = config_rpc::load_config_with_timeout().await?;
+        to_json(crate::openhuman::inference::rpc::inference_diagnostics(&config).await?)
+    })
+}
+
 fn handle_inference_summarize(params: Map<String, Value>) -> ControllerFuture {
     Box::pin(async move {
         let p = deserialize_params::<InferenceSummarizeParams>(params)?;
diff --git a/src/openhuman/inference/schemas_tests.rs b/src/openhuman/inference/schemas_tests.rs
index 50b4c9bfa1..576504701f 100644
--- a/src/openhuman/inference/schemas_tests.rs
+++ b/src/openhuman/inference/schemas_tests.rs
@@ -5,7 +5,7 @@ fn inference_catalog_counts_match_and_nonempty() {
     let declared = all_controller_schemas();
     let registered = all_registered_controllers();
     assert_eq!(declared.len(), registered.len());
-    assert!(declared.len() >= 8);
+    assert!(declared.len() >= 16);
 }
 
 #[test]
@@ -28,6 +28,14 @@ fn inference_schema_function_names_are_stable() {
         .map(|schema| schema.function)
         .collect();
     assert!(functions.contains(&"status"));
+    assert!(functions.contains(&"get_client_config"));
+    assert!(functions.contains(&"update_model_settings"));
+    assert!(functions.contains(&"update_local_settings"));
+    assert!(functions.contains(&"list_models"));
+    assert!(functions.contains(&"device_profile"));
+    assert!(functions.contains(&"presets"));
+    assert!(functions.contains(&"apply_preset"));
+    assert!(functions.contains(&"diagnostics"));
     assert!(functions.contains(&"prompt"));
     assert!(functions.contains(&"vision_prompt"));
     assert!(functions.contains(&"embed"));
diff --git a/src/openhuman/local_ai/README.md b/src/openhuman/local_ai/README.md
index 2c4e6104b4..58da878577 100644
--- a/src/openhuman/local_ai/README.md
+++ b/src/openhuman/local_ai/README.md
@@ -11,8 +11,8 @@ Local asset/runtime support for speech models and localhost-style integrations.
 - `pub struct ModelPreset` / `pub enum ModelTier` / `pub enum VisionMode` — `presets.rs` — bundled preset matrix.
 - `pub struct SentimentResult` — `sentiment.rs` — internal sentiment result type used by inference delegates.
 - Status / progress / result types: `pub struct LocalAiStatus`, `LocalAiAssetStatus`, `LocalAiAssetsStatus`, `LocalAiDownloadProgressItem`, `LocalAiDownloadsProgress`, `LocalAiEmbeddingResult`, `LocalAiSpeechResult`, `LocalAiTtsResult` — `types.rs`.
-- `pub mod ops` (re-exported as `rpc`) — `ops.rs` — typed Rust wrappers. Public `local_ai.*` RPCs are limited to local speech/assets/device-profile flows; prompt/chat/embed/status helpers remain available for internal delegation from `inference`.
-- RPC `local_ai.{agent_chat, agent_chat_simple, local_ai_transcribe, local_ai_transcribe_bytes, local_ai_tts, local_ai_assets_status, local_ai_downloads_progress, local_ai_download_asset, local_ai_device_profile, local_ai_presets, local_ai_apply_preset, local_ai_diagnostics, local_ai_install_whisper, local_ai_install_piper, local_ai_whisper_install_status, local_ai_piper_install_status}` — `schemas.rs`.
+- `pub mod ops` (re-exported as `rpc`) — `ops.rs` — typed Rust wrappers. Public `local_ai.*` RPCs are limited to local speech/assets flows; prompt/chat/embed/status helpers remain available for internal delegation from `inference`.
+- RPC `local_ai.{agent_chat, agent_chat_simple, local_ai_transcribe, local_ai_transcribe_bytes, local_ai_tts, local_ai_assets_status, local_ai_downloads_progress, local_ai_download_asset, local_ai_install_whisper, local_ai_install_piper, local_ai_whisper_install_status, local_ai_piper_install_status}` — `schemas.rs`.
 
 ## Calls into
 
diff --git a/src/openhuman/local_ai/schemas.rs b/src/openhuman/local_ai/schemas.rs
index 58ccfa8e84..e83efbf367 100644
--- a/src/openhuman/local_ai/schemas.rs
+++ b/src/openhuman/local_ai/schemas.rs
@@ -36,11 +36,6 @@ struct LocalAiDownloadAssetParams {
     capability: String,
 }
 
-#[derive(Debug, Deserialize)]
-struct LocalAiApplyPresetParams {
-    tier: String,
-}
-
 #[derive(Debug, Deserialize)]
 struct LocalAiInstallWhisperParams {
     /// Optional model size (`tiny`, `base`, `small`, `medium`,
@@ -73,10 +68,6 @@ pub fn all_controller_schemas() -> Vec<ControllerSchema> {
         schemas("local_ai_assets_status"),
         schemas("local_ai_downloads_progress"),
         schemas("local_ai_download_asset"),
-        schemas("local_ai_device_profile"),
-        schemas("local_ai_presets"),
-        schemas("local_ai_apply_preset"),
-        schemas("local_ai_diagnostics"),
         schemas("local_ai_install_whisper"),
         schemas("local_ai_install_piper"),
         schemas("local_ai_whisper_install_status"),
@@ -118,22 +109,6 @@ pub fn all_registered_controllers() -> Vec<RegisteredController> {
             schema: schemas("local_ai_download_asset"),
             handler: handle_local_ai_download_asset,
         },
-        RegisteredController {
-            schema: schemas("local_ai_device_profile"),
-            handler: handle_local_ai_device_profile,
-        },
-        RegisteredController {
-            schema: schemas("local_ai_presets"),
-            handler: handle_local_ai_presets,
-        },
-        RegisteredController {
-            schema: schemas("local_ai_apply_preset"),
-            handler: handle_local_ai_apply_preset,
-        },
-        RegisteredController {
-            schema: schemas("local_ai_diagnostics"),
-            handler: handle_local_ai_diagnostics,
-        },
         RegisteredController {
             schema: schemas("local_ai_install_whisper"),
             handler: handle_local_ai_install_whisper,
@@ -230,44 +205,6 @@ pub fn schemas(function: &str) -> ControllerSchema {
             inputs: vec![required_string("capability", "Asset capability id.")],
             outputs: vec![json_output("status", "Assets status payload.")],
         },
-        "local_ai_device_profile" => ControllerSchema {
-            namespace: "local_ai",
-            function: "device_profile",
-            description: "Detect local device hardware profile (RAM, CPU, GPU).",
-            inputs: vec![],
-            outputs: vec![json_output("profile", "Device hardware profile.")],
-        },
-        "local_ai_presets" => ControllerSchema {
-            namespace: "local_ai",
-            function: "presets",
-            description: "List model tier presets with recommendation and current selection.",
-            inputs: vec![],
-            outputs: vec![json_output(
-                "presets",
-                "Object containing: presets (array of ModelPreset), recommended_tier (string), \
-                 current_tier (string), selected_tier (string | null), device (DeviceProfile), \
-                 recommend_disabled (boolean — true when the device is below the RAM floor and \
-                 cloud fallback is the recommended default), local_ai_enabled (boolean — mirrors \
-                 config.local_ai.runtime_enabled so the UI can render the active state when disabled).",
-            )],
-        },
-        "local_ai_apply_preset" => ControllerSchema {
-            namespace: "local_ai",
-            function: "apply_preset",
-            description: "Apply a model tier preset to local AI config and persist.",
-            inputs: vec![required_string(
-                "tier",
-                "Tier to apply: ram_2_4gb, or disabled to use cloud fallback.",
-            )],
-            outputs: vec![json_output("result", "Applied tier status.")],
-        },
-        "local_ai_diagnostics" => ControllerSchema {
-            namespace: "local_ai",
-            function: "diagnostics",
-            description: "Run Ollama diagnostics: check server health, list installed models, verify expected models.",
-            inputs: vec![],
-            outputs: vec![json_output("diagnostics", "Diagnostic report.")],
-        },
         "local_ai_install_whisper" => ControllerSchema {
             namespace: "local_ai",
             function: "install_whisper",
@@ -427,131 +364,6 @@ fn handle_local_ai_download_asset(params: Map<String, Value>) -> ControllerFutur
     })
 }
 
-fn handle_local_ai_device_profile(_params: Map<String, Value>) -> ControllerFuture {
-    Box::pin(async move {
-        tracing::debug!("[local_ai] device_profile: detecting hardware");
-        let profile = crate::openhuman::local_ai::device::detect_device_profile();
-        tracing::debug!("[local_ai] device_profile: done");
-        let value = serde_json::to_value(&profile).map_err(|e| format!("serialize: {e}"))?;
-        Ok(value)
-    })
-}
-
-fn handle_local_ai_presets(_params: Map<String, Value>) -> ControllerFuture {
-    Box::pin(async move {
-        tracing::debug!("[local_ai] presets: loading config and computing tiers");
-        let config = config_rpc::load_config_with_timeout().await?;
-        let device = crate::openhuman::local_ai::device::detect_device_profile();
-        let recommended = crate::openhuman::local_ai::presets::recommend_tier(&device);
-        let current =
-            crate::openhuman::local_ai::presets::current_tier_from_config(&config.local_ai);
-        let selected_tier = config.local_ai.selected_tier.as_ref().and_then(|value| {
-            let normalized = value.trim().to_ascii_lowercase();
-            crate::openhuman::local_ai::presets::ModelTier::from_str_opt(&normalized)
-                .map(|tier| tier.as_str().to_string())
-                .or_else(|| (!normalized.is_empty()).then_some(normalized))
-        });
-        let presets = crate::openhuman::local_ai::presets::mvp_presets();
-        tracing::debug!(
-            ?recommended,
-            ?current,
-            selected_tier = ?selected_tier,
-            preset_count = presets.len(),
-            "[local_ai] presets: returning"
-        );
-        let recommend_disabled =
-            crate::openhuman::local_ai::presets::should_default_to_cloud_fallback(&device);
-        let value = serde_json::json!({
-            "presets": presets,
-            "recommended_tier": recommended,
-            "current_tier": current,
-            "selected_tier": selected_tier,
-            "device": device,
-            "recommend_disabled": recommend_disabled,
-            "local_ai_enabled": config.local_ai.runtime_enabled,
-        });
-        Ok(value)
-    })
-}
-
-fn handle_local_ai_apply_preset(params: Map<String, Value>) -> ControllerFuture {
-    Box::pin(async move {
-        let p = deserialize_params::<LocalAiApplyPresetParams>(params)?;
-        let tier_str = p.tier.trim().to_ascii_lowercase();
-        tracing::debug!(tier = %tier_str, "[local_ai] apply_preset: parsing tier");
-
-        // Special "disabled" tier: turn local_ai off and route AI to cloud.
-        if tier_str == "disabled" {
-            let mut config = config_rpc::load_config_with_timeout().await?;
-            config.local_ai.runtime_enabled = false;
-            config.local_ai.selected_tier = Some("disabled".to_string());
-            // Explicit opt-out also clears the MVP opt-in marker so bootstrap
-            // keeps local AI off across restarts.
-            config.local_ai.opt_in_confirmed = false;
-            config
-                .save()
-                .await
-                .map_err(|e| format!("save config: {e}"))?;
-            tracing::debug!("[local_ai] apply_preset: local_ai disabled (cloud fallback)");
-            return Ok(serde_json::json!({
-                "applied_tier": "disabled",
-                "local_ai_enabled": false,
-            }));
-        }
-
-        let tier = crate::openhuman::local_ai::presets::ModelTier::from_str_opt(&tier_str)
-            .ok_or_else(|| {
-                format!(
-                    "invalid tier '{}': expected one of disabled or ram_2_4gb",
-                    tier_str
-                )
-            })?;
-
-        if tier == crate::openhuman::local_ai::presets::ModelTier::Custom {
-            return Err("cannot apply 'custom' tier; set model IDs directly".to_string());
-        }
-        if !tier.is_mvp_allowed() {
-            return Err(format!(
-                "tier '{}' is not available in this build; only the 1B local model preset is supported",
-                tier_str
-            ));
-        }
-
-        let mut config = config_rpc::load_config_with_timeout().await?;
-        // Re-enable local AI in case it was previously disabled via the
-        // "disabled" tier, so the user can switch back to local inference.
-        config.local_ai.runtime_enabled = true;
-        // Explicit tier selection is the MVP opt-in — flip the marker so
-        // `config_with_recommended_tier_if_unselected` stops hard-overriding
-        // to disabled on subsequent boots.
-        config.local_ai.opt_in_confirmed = true;
-        crate::openhuman::local_ai::presets::apply_preset_to_config(&mut config.local_ai, tier);
-        config
-            .save()
-            .await
-            .map_err(|e| format!("save config: {e}"))?;
-        tracing::debug!(tier = %tier_str, "[local_ai] apply_preset: config saved");
-
-        Ok(serde_json::json!({
-            "applied_tier": tier,
-            "chat_model_id": config.local_ai.chat_model_id,
-            "vision_model_id": config.local_ai.vision_model_id,
-            "embedding_model_id": config.local_ai.embedding_model_id,
-            "quantization": config.local_ai.quantization,
-            "vision_mode": crate::openhuman::local_ai::presets::vision_mode_for_config(&config.local_ai),
-            "local_ai_enabled": true,
-        }))
-    })
-}
-
-fn handle_local_ai_diagnostics(_params: Map<String, Value>) -> ControllerFuture {
-    Box::pin(async move {
-        let config = config_rpc::load_config_with_timeout().await?;
-        let service = crate::openhuman::local_ai::global(&config);
-        service.diagnostics(&config).await
-    })
-}
-
 // The install RPCs are intentionally fire-and-forget: a binary+model
 // download can take minutes (1.6 GB GGML model, ~5 MB Piper binary
 // archive) but the core JSON-RPC client times out at
diff --git a/src/openhuman/local_ai/schemas_tests.rs b/src/openhuman/local_ai/schemas_tests.rs
index 2054ce748c..4f92528604 100644
--- a/src/openhuman/local_ai/schemas_tests.rs
+++ b/src/openhuman/local_ai/schemas_tests.rs
@@ -5,7 +5,7 @@ fn catalog_counts_match_and_nonempty() {
     let s = all_controller_schemas();
     let h = all_registered_controllers();
     assert_eq!(s.len(), h.len());
-    assert!(s.len() >= 16, "local_ai should expose >=16 controller fns");
+    assert!(s.len() >= 12, "local_ai should expose >=12 controller fns");
 }
 
 #[test]
@@ -35,10 +35,6 @@ fn every_registered_key_resolves_to_non_unknown_schema() {
         "local_ai_assets_status",
         "local_ai_downloads_progress",
         "local_ai_download_asset",
-        "local_ai_device_profile",
-        "local_ai_presets",
-        "local_ai_apply_preset",
-        "local_ai_diagnostics",
         "local_ai_install_whisper",
         "local_ai_install_piper",
         "local_ai_whisper_install_status",
@@ -99,116 +95,11 @@ fn deserialize_params_errors_on_invalid_shape() {
     assert!(err.contains("invalid params"));
 }
 
-#[test]
-fn apply_preset_schema_has_inputs() {
-    let s = schemas("local_ai_apply_preset");
-    assert!(!s.inputs.is_empty());
-}
-
 // ── Handler-level tests that don't need Ollama ────────────────
 
 use crate::openhuman::config::TEST_ENV_LOCK as ENV_LOCK;
 use tempfile::TempDir;
 
-#[tokio::test]
-async fn handle_device_profile_returns_device_shape() {
-    let v = handle_local_ai_device_profile(Map::new())
-        .await
-        .expect("ok");
-    // device profile exposes at least a few expected fields.
-    assert!(v.is_object());
-}
-
-#[tokio::test]
-async fn handle_presets_returns_presets_list_and_recommended_tier() {
-    let _g = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
-    let tmp = TempDir::new().unwrap();
-    unsafe {
-        std::env::set_var("OPENHUMAN_WORKSPACE", tmp.path());
-    }
-    let v = handle_local_ai_presets(Map::new()).await.expect("ok");
-    unsafe {
-        std::env::remove_var("OPENHUMAN_WORKSPACE");
-    }
-    assert!(v.get("presets").is_some());
-    assert!(v.get("recommended_tier").is_some());
-    assert!(v.get("device").is_some());
-    let presets = v
-        .get("presets")
-        .and_then(|value| value.as_array())
-        .expect("presets array");
-    assert_eq!(presets.len(), 1, "only the 1B preset should be exposed");
-    assert_eq!(
-        presets[0]
-            .get("chat_model_id")
-            .and_then(|value| value.as_str()),
-        Some("gemma3:1b-it-qat")
-    );
-}
-
-#[tokio::test]
-async fn handle_apply_preset_rejects_invalid_tier() {
-    let _g = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
-    let tmp = TempDir::new().unwrap();
-    unsafe {
-        std::env::set_var("OPENHUMAN_WORKSPACE", tmp.path());
-    }
-    let params = Map::from_iter([("tier".to_string(), serde_json::json!("ram_bogus"))]);
-    let err = handle_local_ai_apply_preset(params).await.unwrap_err();
-    unsafe {
-        std::env::remove_var("OPENHUMAN_WORKSPACE");
-    }
-    assert!(err.contains("invalid tier"));
-}
-
-#[tokio::test]
-async fn handle_apply_preset_rejects_custom_tier() {
-    let _g = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
-    let tmp = TempDir::new().unwrap();
-    unsafe {
-        std::env::set_var("OPENHUMAN_WORKSPACE", tmp.path());
-    }
-    let params = Map::from_iter([("tier".to_string(), serde_json::json!("custom"))]);
-    let err = handle_local_ai_apply_preset(params).await.unwrap_err();
-    unsafe {
-        std::env::remove_var("OPENHUMAN_WORKSPACE");
-    }
-    assert!(err.contains("cannot apply 'custom'"));
-}
-
-#[tokio::test]
-async fn handle_apply_preset_rejects_unsupported_large_tier() {
-    let _g = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
-    let tmp = TempDir::new().unwrap();
-    unsafe {
-        std::env::set_var("OPENHUMAN_WORKSPACE", tmp.path());
-    }
-    let params = Map::from_iter([("tier".to_string(), serde_json::json!("ram_8_16gb"))]);
-    let err = handle_local_ai_apply_preset(params).await.unwrap_err();
-    unsafe {
-        std::env::remove_var("OPENHUMAN_WORKSPACE");
-    }
-    assert!(err.contains("only the 1B local model preset is supported"));
-}
-
-#[tokio::test]
-async fn handle_apply_preset_accepts_valid_tier_and_persists() {
-    let _g = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
-    let tmp = TempDir::new().unwrap();
-    unsafe {
-        std::env::set_var("OPENHUMAN_WORKSPACE", tmp.path());
-    }
-    let params = Map::from_iter([("tier".to_string(), serde_json::json!("ram_2_4gb"))]);
-    let result = handle_local_ai_apply_preset(params)
-        .await
-        .expect("apply ok");
-    unsafe {
-        std::env::remove_var("OPENHUMAN_WORKSPACE");
-    }
-    assert!(result.get("applied_tier").is_some());
-    assert!(result.get("chat_model_id").is_some());
-}
-
 /// Regression test for the CodeRabbit #7 race on PR #1755: when two
 /// concurrent RPC calls (e.g. a double-click, or the auto-install firing
 /// alongside a manual click) hit `handle_local_ai_install_whisper` at
diff --git a/src/openhuman/providers/mod.rs b/src/openhuman/providers/mod.rs
index bd18f3dad5..65324c2a2d 100644
--- a/src/openhuman/providers/mod.rs
+++ b/src/openhuman/providers/mod.rs
@@ -18,7 +18,3 @@ pub use traits::{
 pub use billing_error::is_budget_exhausted_message;
 pub use factory::{create_chat_provider, provider_for_role};
 pub use ops::*;
-pub use schemas::{
-    all_controller_schemas as all_providers_controller_schemas,
-    all_registered_controllers as all_providers_registered_controllers,
-};
diff --git a/src/openhuman/providers/ops.rs b/src/openhuman/providers/ops.rs
index 4a06d50c87..8e64226052 100644
--- a/src/openhuman/providers/ops.rs
+++ b/src/openhuman/providers/ops.rs
@@ -1,5 +1,6 @@
 use super::*;
 
+use serde::Serialize;
 use std::path::PathBuf;
 
 const MAX_API_ERROR_CHARS: usize = 200;
@@ -15,6 +16,134 @@ pub struct ProviderRuntimeOptions {
     pub reasoning_enabled: Option<bool>,
 }
 
+#[derive(Debug, Serialize)]
+pub struct ModelInfo {
+    pub id: String,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub owned_by: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub context_window: Option<u64>,
+}
+
+pub async fn list_configured_models(
+    provider_id: &str,
+) -> Result<crate::rpc::RpcOutcome<serde_json::Value>, String> {
+    let provider_id = provider_id.trim().to_string();
+    if provider_id.is_empty() {
+        return Err("provider_id must not be empty".to_string());
+    }
+
+    log::debug!("[providers][list_models] provider_id={}", provider_id);
+
+    let config = crate::openhuman::config::Config::load_or_init()
+        .await
+        .map_err(|e| e.to_string())?;
+
+    let entry = config
+        .cloud_providers
+        .iter()
+        .find(|e| e.id == provider_id)
+        .cloned()
+        .ok_or_else(|| format!("no cloud provider with id '{}' found", provider_id))?;
+
+    let base = entry.endpoint.trim_end_matches('/');
+    let models_url = format!("{}/models", base);
+
+    log::debug!(
+        "[providers][list_models] fetching url={} slug={}",
+        models_url,
+        entry.slug
+    );
+
+    let api_key = crate::openhuman::providers::factory::lookup_key_for_slug(&entry.slug, &config)
+        .unwrap_or_default();
+
+    let client = crate::openhuman::config::build_runtime_proxy_client_with_timeouts(
+        "providers.list_models",
+        30,
+        10,
+    );
+
+    let mut request = client.get(&models_url);
+
+    use crate::openhuman::config::schema::cloud_providers::AuthStyle;
+    request = match entry.auth_style {
+        AuthStyle::Bearer => {
+            if !api_key.is_empty() {
+                request.header("Authorization", format!("Bearer {}", api_key))
+            } else {
+                request
+            }
+        }
+        AuthStyle::Anthropic => {
+            let mut r = request.header("anthropic-version", "2023-06-01");
+            if !api_key.is_empty() {
+                r = r.header("x-api-key", &api_key);
+            }
+            r
+        }
+        AuthStyle::OpenhumanJwt | AuthStyle::None => request,
+    };
+
+    let response = request
+        .send()
+        .await
+        .map_err(|e| format!("[providers][list_models] HTTP request failed: {}", e))?;
+
+    let status = response.status();
+    if !status.is_success() {
+        let body = response.text().await.unwrap_or_default();
+        let truncated = crate::openhuman::util::truncate_with_ellipsis(&body, 300);
+        return Err(format!(
+            "provider returned {}: {}",
+            status.as_u16(),
+            truncated
+        ));
+    }
+
+    let body: serde_json::Value = response
+        .json()
+        .await
+        .map_err(|e| format!("[providers][list_models] failed to parse JSON: {}", e))?;
+
+    let data = body
+        .get("data")
+        .and_then(|d| d.as_array())
+        .cloned()
+        .unwrap_or_default();
+
+    let models: Vec<ModelInfo> = data
+        .iter()
+        .filter_map(|item| {
+            let id = item.get("id")?.as_str()?.to_string();
+            let owned_by = item
+                .get("owned_by")
+                .and_then(|v| v.as_str())
+                .map(|s| s.to_string());
+            let context_window = item
+                .get("context_length")
+                .or_else(|| item.get("context_window"))
+                .and_then(|v| v.as_u64());
+            Some(ModelInfo {
+                id,
+                owned_by,
+                context_window,
+            })
+        })
+        .collect();
+
+    log::info!(
+        "[providers][list_models] slug={} fetched {} models",
+        entry.slug,
+        models.len()
+    );
+
+    Ok(crate::rpc::RpcOutcome::new(
+        serde_json::json!({ "models": models }),
+        vec![format!("fetched {} models", models.len())],
+    ))
+}
+
 impl Default for ProviderRuntimeOptions {
     fn default() -> Self {
         Self {
diff --git a/src/openhuman/providers/schemas.rs b/src/openhuman/providers/schemas.rs
index 45feb283fd..a9f5f12f3a 100644
--- a/src/openhuman/providers/schemas.rs
+++ b/src/openhuman/providers/schemas.rs
@@ -5,13 +5,12 @@
 
 use crate::core::all::{ControllerFuture, RegisteredController};
 use crate::core::{ControllerSchema, FieldSchema, TypeSchema};
-use crate::rpc::RpcOutcome;
-use serde::{Deserialize, Serialize};
+use serde::Deserialize;
 use serde_json::{Map, Value};
 
 // ── Helpers ──────────────────────────────────────────────────────────────────
 
-fn to_json<T: Serialize>(outcome: RpcOutcome<T>) -> Result<Value, String> {
+fn to_json<T: serde::Serialize>(outcome: crate::rpc::RpcOutcome<T>) -> Result<Value, String> {
     outcome.into_cli_compatible_json()
 }
 
@@ -65,143 +64,11 @@ struct ListModelsRequest {
     provider_id: String,
 }
 
-#[derive(Debug, Serialize)]
-struct ModelInfo {
-    id: String,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    owned_by: Option<String>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    context_window: Option<u64>,
-}
-
 // ── Handler ───────────────────────────────────────────────────────────────────
 
 fn handle_list_models(params: Map<String, Value>) -> ControllerFuture {
     Box::pin(async move {
         let req: ListModelsRequest = deserialize_params(params)?;
-        let provider_id = req.provider_id.trim().to_string();
-
-        if provider_id.is_empty() {
-            return Err("provider_id must not be empty".to_string());
-        }
-
-        log::debug!("[providers][list_models] provider_id={}", provider_id);
-
-        let config = crate::openhuman::config::Config::load_or_init()
-            .await
-            .map_err(|e| e.to_string())?;
-
-        let entry = config
-            .cloud_providers
-            .iter()
-            .find(|e| e.id == provider_id)
-            .cloned()
-            .ok_or_else(|| format!("no cloud provider with id '{}' found", provider_id))?;
-
-        // Build the /models URL from the provider's endpoint.
-        let base = entry.endpoint.trim_end_matches('/');
-        let models_url = format!("{}/models", base);
-
-        log::debug!(
-            "[providers][list_models] fetching url={} slug={}",
-            models_url,
-            entry.slug
-        );
-
-        // Fetch the API key for this provider.
-        let api_key =
-            crate::openhuman::providers::factory::lookup_key_for_slug(&entry.slug, &config)
-                .unwrap_or_default();
-
-        // Build the HTTP client (reuse the runtime proxy config). Explicit
-        // timeouts mirror the other external integrations (composio,
-        // multimodal) so a slow/unresponsive provider can't hang the panel.
-        let client = crate::openhuman::config::build_runtime_proxy_client_with_timeouts(
-            "providers.list_models",
-            30,
-            10,
-        );
-
-        let mut request = client.get(&models_url);
-
-        // Attach auth header per auth_style.
-        use crate::openhuman::config::schema::cloud_providers::AuthStyle;
-        request = match entry.auth_style {
-            AuthStyle::Bearer => {
-                if !api_key.is_empty() {
-                    request.header("Authorization", format!("Bearer {}", api_key))
-                } else {
-                    request
-                }
-            }
-            AuthStyle::Anthropic => {
-                let mut r = request.header("anthropic-version", "2023-06-01");
-                if !api_key.is_empty() {
-                    r = r.header("x-api-key", &api_key);
-                }
-                r
-            }
-            AuthStyle::OpenhumanJwt | AuthStyle::None => request,
-        };
-
-        let response = request
-            .send()
-            .await
-            .map_err(|e| format!("[providers][list_models] HTTP request failed: {}", e))?;
-
-        let status = response.status();
-        if !status.is_success() {
-            let body = response.text().await.unwrap_or_default();
-            let truncated = crate::openhuman::util::truncate_with_ellipsis(&body, 300);
-            return Err(format!(
-                "provider returned {}: {}",
-                status.as_u16(),
-                truncated
-            ));
-        }
-
-        let body: Value = response
-            .json()
-            .await
-            .map_err(|e| format!("[providers][list_models] failed to parse JSON: {}", e))?;
-
-        // Parse OpenAI-compatible `{ data: [{ id, owned_by? }] }` or
-        // Anthropic `{ data: [{ id, display_name }] }`.
-        let data = body
-            .get("data")
-            .and_then(|d| d.as_array())
-            .cloned()
-            .unwrap_or_default();
-
-        let models: Vec<ModelInfo> = data
-            .iter()
-            .filter_map(|item| {
-                let id = item.get("id")?.as_str()?.to_string();
-                let owned_by = item
-                    .get("owned_by")
-                    .and_then(|v| v.as_str())
-                    .map(|s| s.to_string());
-                let context_window = item
-                    .get("context_length")
-                    .or_else(|| item.get("context_window"))
-                    .and_then(|v| v.as_u64());
-                Some(ModelInfo {
-                    id,
-                    owned_by,
-                    context_window,
-                })
-            })
-            .collect();
-
-        log::info!(
-            "[providers][list_models] slug={} fetched {} models",
-            entry.slug,
-            models.len()
-        );
-
-        to_json(RpcOutcome::new(
-            serde_json::json!({ "models": models }),
-            vec![format!("fetched {} models", models.len())],
-        ))
+        to_json(crate::openhuman::providers::ops::list_configured_models(&req.provider_id).await?)
     })
 }
diff --git a/tests/json_rpc_e2e.rs b/tests/json_rpc_e2e.rs
index f919f3b1f2..55f9aaeafa 100644
--- a/tests/json_rpc_e2e.rs
+++ b/tests/json_rpc_e2e.rs
@@ -3353,13 +3353,14 @@ async fn json_rpc_local_ai_device_profile_and_presets() {
     let profile = post_json_rpc(
         &rpc_base,
         30,
-        "openhuman.local_ai_device_profile",
+        "openhuman.inference_device_profile",
         json!({}),
     )
     .await;
     let profile_result = assert_no_jsonrpc_error(&profile, "device_profile");
+    let profile_payload = profile_result.get("result").unwrap_or(profile_result);
     assert!(
-        profile_result
+        profile_payload
             .get("total_ram_bytes")
             .and_then(Value::as_u64)
             .unwrap_or(0)
@@ -3367,7 +3368,7 @@ async fn json_rpc_local_ai_device_profile_and_presets() {
         "expected positive RAM: {profile_result}"
     );
     assert!(
-        profile_result
+        profile_payload
             .get("cpu_count")
             .and_then(Value::as_u64)
             .unwrap_or(0)
@@ -3376,9 +3377,10 @@ async fn json_rpc_local_ai_device_profile_and_presets() {
     );
 
     // --- presets ---
-    let presets = post_json_rpc(&rpc_base, 31, "openhuman.local_ai_presets", json!({})).await;
+    let presets = post_json_rpc(&rpc_base, 31, "openhuman.inference_presets", json!({})).await;
     let presets_result = assert_no_jsonrpc_error(&presets, "presets");
-    let presets_arr = presets_result
+    let presets_payload = presets_result.get("result").unwrap_or(presets_result);
+    let presets_arr = presets_payload
         .get("presets")
         .and_then(Value::as_array)
         .expect("presets should be an array");
@@ -3393,7 +3395,7 @@ async fn json_rpc_local_ai_device_profile_and_presets() {
         "only the ram_2_4gb (1B) preset should be exposed: {presets_result}"
     );
 
-    let recommended = presets_result
+    let recommended = presets_payload
         .get("recommended_tier")
         .and_then(Value::as_str)
         .expect("should have recommended_tier");
@@ -3402,7 +3404,7 @@ async fn json_rpc_local_ai_device_profile_and_presets() {
         "MVP recommends the only allowed tier: {recommended}"
     );
 
-    let current = presets_result
+    let current = presets_payload
         .get("current_tier")
         .and_then(Value::as_str)
         .expect("should have current_tier");
@@ -3416,29 +3418,34 @@ async fn json_rpc_local_ai_device_profile_and_presets() {
     let apply = post_json_rpc(
         &rpc_base,
         32,
-        "openhuman.local_ai_apply_preset",
+        "openhuman.inference_apply_preset",
         json!({"tier": "ram_2_4gb"}),
     )
     .await;
     let apply_result = assert_no_jsonrpc_error(&apply, "apply_preset");
+    let apply_payload = apply_result.get("result").unwrap_or(apply_result);
     assert_eq!(
-        apply_result.get("applied_tier").and_then(Value::as_str),
+        apply_payload.get("applied_tier").and_then(Value::as_str),
         Some("ram_2_4gb")
     );
     assert_eq!(
-        apply_result.get("chat_model_id").and_then(Value::as_str),
+        apply_payload.get("chat_model_id").and_then(Value::as_str),
         Some("gemma3:1b-it-qat")
     );
     assert_eq!(
-        apply_result.get("vision_mode").and_then(Value::as_str),
+        apply_payload.get("vision_mode").and_then(Value::as_str),
         Some("disabled")
     );
 
     // --- verify presets reflects the change ---
-    let presets_after = post_json_rpc(&rpc_base, 33, "openhuman.local_ai_presets", json!({})).await;
+    let presets_after =
+        post_json_rpc(&rpc_base, 33, "openhuman.inference_presets", json!({})).await;
     let presets_after_result = assert_no_jsonrpc_error(&presets_after, "presets_after");
+    let presets_after_payload = presets_after_result
+        .get("result")
+        .unwrap_or(presets_after_result);
     assert_eq!(
-        presets_after_result
+        presets_after_payload
             .get("current_tier")
             .and_then(Value::as_str),
         Some("ram_2_4gb"),
@@ -3449,7 +3456,7 @@ async fn json_rpc_local_ai_device_profile_and_presets() {
     let bad_apply = post_json_rpc(
         &rpc_base,
         34,
-        "openhuman.local_ai_apply_preset",
+        "openhuman.inference_apply_preset",
         json!({"tier": "ultra"}),
     )
     .await;
@@ -3540,7 +3547,7 @@ async fn json_rpc_local_ai_lm_studio_config_diagnostics_and_prompt() {
     let update = post_json_rpc(
         &rpc_base,
         36,
-        "openhuman.config_update_local_ai_settings",
+        "openhuman.inference_update_local_settings",
         json!({
             "runtime_enabled": true,
             "opt_in_confirmed": true,
@@ -3572,7 +3579,7 @@ async fn json_rpc_local_ai_lm_studio_config_diagnostics_and_prompt() {
     );
 
     let diagnostics =
-        post_json_rpc(&rpc_base, 37, "openhuman.local_ai_diagnostics", json!({})).await;
+        post_json_rpc(&rpc_base, 37, "openhuman.inference_diagnostics", json!({})).await;
     let diagnostics_result = assert_no_jsonrpc_error(&diagnostics, "lm_studio_diagnostics");
     assert_eq!(
         diagnostics_result.get("provider").and_then(Value::as_str),
@@ -3677,7 +3684,7 @@ async fn json_rpc_inference_namespace_lm_studio_prompt_and_status() {
     let update = post_json_rpc(
         &rpc_base,
         360,
-        "openhuman.config_update_local_ai_settings",
+        "openhuman.inference_update_local_settings",
         json!({
             "runtime_enabled": true,
             "opt_in_confirmed": true,
@@ -3761,7 +3768,7 @@ async fn json_rpc_inference_prompt_requires_external_ollama_runtime_when_unreach
     let update = post_json_rpc(
         &rpc_base,
         364,
-        "openhuman.config_update_local_ai_settings",
+        "openhuman.inference_update_local_settings",
         json!({
             "runtime_enabled": true,
             "opt_in_confirmed": true,

From bfc3885e30c4277aaf591ffb0552548a7963ebb3 Mon Sep 17 00:00:00 2001
From: Steven Enamakel <enamakel@tinyhumans.ai>
Date: Sat, 16 May 2026 19:32:42 -0700
Subject: [PATCH 13/18] refactor(inference): unify all inference concerns under
 src/openhuman/inference/
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Move local_ai/, providers/, and voice inference files under inference/ so that
the entire inference surface lives in one place:

  inference/local/   ← Ollama / LM Studio / Whisper / Piper runtime
  inference/provider/ ← cloud + local provider trait, routing, reliability
  inference/voice/   ← STT and TTS inference implementations
  inference/http/    ← OpenAI-compatible /v1/chat/completions + /v1/models

src/openhuman/local_ai/ and src/openhuman/providers/ become thin re-export shims
for backward compatibility; src/openhuman/voice/ re-exports the moved modules.

Add OpenAI-compatible HTTP endpoint mounted at /v1 with three integration tests
(no-bearer → 401, bearer present → not 401/403, GET /v1/models → 401 without
bearer). Both cargo check manifests (core + Tauri shell) pass clean.
---
 src/api/rest.rs                               |   2 +-
 src/core/all.rs                               |   8 +-
 src/core/jsonrpc.rs                           |  11 +-
 src/core/observability.rs                     |  10 +-
 src/main.rs                                   |   4 +-
 src/openhuman/agent/bus.rs                    |   2 +-
 src/openhuman/agent/cost.rs                   |   2 +-
 src/openhuman/agent/dispatcher.rs             |   2 +-
 src/openhuman/agent/dispatcher_tests.rs       |   2 +-
 src/openhuman/agent/harness/bughunt_tests.rs  |   2 +-
 src/openhuman/agent/harness/fork_context.rs   |   2 +-
 .../agent/harness/harness_gap_tests.rs        |   6 +-
 src/openhuman/agent/harness/parse.rs          |   2 +-
 .../agent/harness/session/builder.rs          |  41 +--
 .../agent/harness/session/runtime.rs          |  16 +-
 .../agent/harness/session/runtime_tests.rs    |   8 +-
 src/openhuman/agent/harness/session/tests.rs  |  44 +--
 .../agent/harness/session/transcript.rs       |   2 +-
 src/openhuman/agent/harness/session/turn.rs   |   6 +-
 .../agent/harness/session/turn_tests.rs       |   2 +-
 src/openhuman/agent/harness/session/types.rs  |   2 +-
 .../harness/subagent_runner/extract_tool.rs   |   4 +-
 .../agent/harness/subagent_runner/mod.rs      |   2 +-
 .../agent/harness/subagent_runner/ops.rs      |  24 +-
 .../harness/subagent_runner/ops_tests.rs      |   6 +-
 src/openhuman/agent/harness/test_support.rs   |   6 +-
 .../agent/harness/test_support_test.rs        |   2 +-
 src/openhuman/agent/harness/tests.rs          |   4 +-
 src/openhuman/agent/harness/tool_loop.rs      |  10 +-
 .../agent/harness/tool_loop_tests.rs          |   6 +-
 src/openhuman/agent/multimodal.rs             |   2 +-
 src/openhuman/agent/schemas.rs                |   4 +-
 src/openhuman/agent/stop_hooks.rs             |   2 +-
 src/openhuman/agent/tests.rs                  |   4 +-
 src/openhuman/agent/triage/evaluator.rs       |   4 +-
 src/openhuman/agent/triage/evaluator_tests.rs |   2 +-
 src/openhuman/agent/triage/routing.rs         |  12 +-
 src/openhuman/app_state/ops.rs                |   4 +-
 src/openhuman/autocomplete/core/engine.rs     |   2 +-
 src/openhuman/channels/context.rs             |  23 +-
 src/openhuman/channels/providers/web.rs       |   9 +-
 src/openhuman/channels/routes.rs              |  10 +-
 src/openhuman/channels/routes_tests.rs        |   7 +-
 src/openhuman/channels/runtime/dispatch.rs    |   6 +-
 src/openhuman/channels/runtime/startup.rs     |   8 +-
 src/openhuman/channels/tests/common.rs        |   2 +-
 src/openhuman/channels/tests/context.rs       |   4 +-
 .../channels/tests/discord_integration.rs     |   5 +-
 src/openhuman/channels/tests/memory.rs        |   6 +-
 .../channels/tests/runtime_dispatch.rs        |   8 +-
 .../channels/tests/runtime_tool_calls.rs      |  14 +-
 .../channels/tests/telegram_integration.rs    |   5 +-
 src/openhuman/config/ops.rs                   |   2 +-
 .../config/schema/cloud_providers.rs          |   4 +-
 src/openhuman/config/schema/load.rs           |   6 +-
 src/openhuman/context/guard.rs                |   2 +-
 src/openhuman/context/manager.rs              |   2 +-
 src/openhuman/context/manager_tests.rs        |   2 +-
 src/openhuman/context/microcompact.rs         |   4 +-
 src/openhuman/context/pipeline.rs             |   4 +-
 src/openhuman/context/summarizer.rs           |   2 +-
 src/openhuman/context/summarizer_tests.rs     |   4 +-
 src/openhuman/credentials/ops.rs              |   4 +-
 src/openhuman/cron/scheduler.rs               |   2 +-
 src/openhuman/doctor/core.rs                  |   2 +-
 src/openhuman/embeddings/cloud.rs             |   2 +-
 src/openhuman/embeddings/factory.rs           |   2 +-
 .../{local_ai => inference}/device.rs         |   0
 src/openhuman/inference/http/mod.rs           |  14 +
 src/openhuman/inference/http/server.rs        | 281 ++++++++++++++++++
 src/openhuman/inference/http/tests.rs         | 103 +++++++
 src/openhuman/inference/http/types.rs         |  93 ++++++
 .../{local_ai => inference/local}/core.rs     |   0
 .../{local_ai => inference/local}/install.rs  |   4 +-
 .../local}/install_piper.rs                   |   6 +-
 .../local}/install_whisper.rs                 |   8 +-
 .../local/lm_studio.rs}                       |   0
 src/openhuman/inference/local/mod.rs          |  51 ++++
 .../local/ollama.rs}                          |   2 +-
 .../{local_ai => inference/local}/ops.rs      |  26 +-
 .../local}/ops_tests.rs                       |   0
 .../local}/process_util.rs                    |   0
 .../{local_ai => inference/local}/provider.rs |   0
 .../{local_ai => inference/local}/schemas.rs  | 128 ++++----
 .../local}/schemas_tests.rs                   |  32 +-
 .../local}/service/assets.rs                  |  14 +-
 .../local}/service/bootstrap.rs               |  34 +--
 .../local}/service/lm_studio.rs               |  16 +-
 .../local}/service/mod.rs                     |   2 +-
 .../local}/service/ollama_admin.rs            |  40 +--
 .../local}/service/ollama_admin_tests.rs      |  74 +++--
 .../local}/service/public_infer.rs            |  28 +-
 .../local}/service/public_infer_tests.rs      |  22 +-
 .../local}/service/spawn_marker.rs            |   2 +-
 .../local}/service/speech.rs                  |   8 +-
 .../local}/service/vision_embed.rs            |  14 +-
 .../local}/service/whisper_engine.rs          |   0
 .../local}/voice_install_common.rs            |   0
 src/openhuman/inference/mod.rs                |  45 ++-
 .../{local_ai => inference}/model_ids.rs      |   2 +-
 src/openhuman/inference/ops.rs                |  50 ++--
 .../{local_ai => inference}/parse.rs          |   0
 .../{local_ai => inference}/paths.rs          |  10 +-
 .../{local_ai => inference}/presets.rs        |   0
 .../{local_ai => inference}/presets_tests.rs  |   0
 .../provider}/billing_error.rs                |   0
 .../provider}/compatible.rs                   |  21 +-
 .../provider}/compatible_dump.rs              |   0
 .../provider}/compatible_parse.rs             |   4 +-
 .../provider}/compatible_stream.rs            |   2 +-
 .../provider}/compatible_tests.rs             |   2 +-
 .../provider}/compatible_types.rs             |   4 +-
 .../provider}/factory.rs                      |   8 +-
 .../provider}/factory_test.rs                 |   0
 .../{providers => inference/provider}/mod.rs  |  10 +
 .../provider}/openhuman_backend.rs            |   0
 .../{providers => inference/provider}/ops.rs  |   9 +-
 .../provider}/reliable.rs                     |   0
 .../provider}/reliable_tests.rs               |   0
 .../provider}/router.rs                       |   0
 .../provider}/router_test.rs                  |   0
 .../provider}/schemas.rs                      |   5 +-
 .../provider}/thread_context.rs               |   2 +-
 .../provider}/traits.rs                       |   0
 .../provider}/traits_tests.rs                 |   0
 src/openhuman/inference/schemas.rs            |   2 +-
 .../{local_ai => inference}/sentiment.rs      |   2 +-
 .../{local_ai => inference}/types.rs          |   4 +-
 .../{ => inference}/voice/cloud_transcribe.rs |   0
 .../{ => inference}/voice/hallucination.rs    |   0
 .../{ => inference}/voice/local_speech.rs     |  10 +-
 .../{ => inference}/voice/local_transcribe.rs |   8 +-
 src/openhuman/inference/voice/mod.rs          |  12 +
 .../{ => inference}/voice/postprocess.rs      |  16 +-
 .../{ => inference}/voice/streaming.rs        |   4 +-
 src/openhuman/learning/linkedin_enrichment.rs |   2 +-
 src/openhuman/learning/reflection.rs          |   6 +-
 src/openhuman/learning/reflection_tests.rs    |   4 +-
 .../learning/transcript_ingest/extract.rs     |   2 +-
 .../learning/transcript_ingest/tests.rs       |   2 +-
 src/openhuman/local_ai/README.md              |  42 ---
 src/openhuman/local_ai/mod.rs                 |  51 ----
 src/openhuman/mcp_server/tools.rs             |   2 +-
 src/openhuman/memory/store/factories.rs       |   8 +-
 src/openhuman/memory/tree/chat/cloud.rs       |   8 +-
 src/openhuman/memory/tree/chat/mod.rs         |   2 +-
 .../memory/tree/score/embed/factory.rs        |   2 +-
 src/openhuman/migrations/mod_tests.rs         |   2 +-
 .../migrations/phase_out_profile_md_tests.rs  |   2 +-
 .../migrations/unify_ai_provider_settings.rs  |   2 +-
 src/openhuman/mod.rs                          |   2 -
 src/openhuman/routing/factory.rs              |  14 +-
 src/openhuman/routing/mod.rs                  |   4 +-
 src/openhuman/routing/provider.rs             |   2 +-
 src/openhuman/routing/provider_tests.rs       |  12 +-
 .../screen_intelligence/processing_worker.rs  |   2 +-
 src/openhuman/subconscious/executor.rs        |   7 +-
 src/openhuman/threads/ops.rs                  |   4 +-
 src/openhuman/tools/impl/agent/delegate.rs    |  28 +-
 .../impl/agent/spawn_parallel_agents_test.rs  |   6 +-
 .../tools/impl/agent/spawn_worker_thread.rs   |  99 +++---
 src/openhuman/tools/impl/agent/todo_write.rs  |   6 +-
 src/openhuman/tools/ops.rs                    |   2 +-
 src/openhuman/tree_summarizer/engine.rs       |   2 +-
 src/openhuman/tree_summarizer/ops.rs          |  12 +-
 src/openhuman/voice/mod.rs                    |  20 +-
 src/openhuman/voice/ops.rs                    |   8 +-
 src/openhuman/voice/types.rs                  |   2 +-
 tests/agent_builder_public.rs                 |   2 +-
 tests/agent_harness_public.rs                 |   4 +-
 tests/agent_multimodal_public.rs              |   2 +-
 tests/calendar_grounding_e2e.rs               |   2 +-
 tests/json_rpc_e2e.rs                         |   8 +-
 173 files changed, 1305 insertions(+), 743 deletions(-)
 rename src/openhuman/{local_ai => inference}/device.rs (100%)
 create mode 100644 src/openhuman/inference/http/mod.rs
 create mode 100644 src/openhuman/inference/http/server.rs
 create mode 100644 src/openhuman/inference/http/tests.rs
 create mode 100644 src/openhuman/inference/http/types.rs
 rename src/openhuman/{local_ai => inference/local}/core.rs (100%)
 rename src/openhuman/{local_ai => inference/local}/install.rs (99%)
 rename src/openhuman/{local_ai => inference/local}/install_piper.rs (99%)
 rename src/openhuman/{local_ai => inference/local}/install_whisper.rs (98%)
 rename src/openhuman/{local_ai/lm_studio_api.rs => inference/local/lm_studio.rs} (100%)
 create mode 100644 src/openhuman/inference/local/mod.rs
 rename src/openhuman/{local_ai/ollama_api.rs => inference/local/ollama.rs} (99%)
 rename src/openhuman/{local_ai => inference/local}/ops.rs (96%)
 rename src/openhuman/{local_ai => inference/local}/ops_tests.rs (100%)
 rename src/openhuman/{local_ai => inference/local}/process_util.rs (100%)
 rename src/openhuman/{local_ai => inference/local}/provider.rs (100%)
 rename src/openhuman/{local_ai => inference/local}/schemas.rs (82%)
 rename src/openhuman/{local_ai => inference/local}/schemas_tests.rs (82%)
 rename src/openhuman/{local_ai => inference/local}/service/assets.rs (98%)
 rename src/openhuman/{local_ai => inference/local}/service/bootstrap.rs (95%)
 rename src/openhuman/{local_ai => inference/local}/service/lm_studio.rs (92%)
 rename src/openhuman/{local_ai => inference/local}/service/mod.rs (94%)
 rename src/openhuman/{local_ai => inference/local}/service/ollama_admin.rs (97%)
 rename src/openhuman/{local_ai => inference/local}/service/ollama_admin_tests.rs (92%)
 rename src/openhuman/{local_ai => inference/local}/service/public_infer.rs (94%)
 rename src/openhuman/{local_ai => inference/local}/service/public_infer_tests.rs (94%)
 rename src/openhuman/{local_ai => inference/local}/service/spawn_marker.rs (99%)
 rename src/openhuman/{local_ai => inference/local}/service/speech.rs (97%)
 rename src/openhuman/{local_ai => inference/local}/service/vision_embed.rs (95%)
 rename src/openhuman/{local_ai => inference/local}/service/whisper_engine.rs (100%)
 rename src/openhuman/{local_ai => inference/local}/voice_install_common.rs (100%)
 rename src/openhuman/{local_ai => inference}/model_ids.rs (99%)
 rename src/openhuman/{local_ai => inference}/parse.rs (100%)
 rename src/openhuman/{local_ai => inference}/paths.rs (98%)
 rename src/openhuman/{local_ai => inference}/presets.rs (100%)
 rename src/openhuman/{local_ai => inference}/presets_tests.rs (100%)
 rename src/openhuman/{providers => inference/provider}/billing_error.rs (100%)
 rename src/openhuman/{providers => inference/provider}/compatible.rs (98%)
 rename src/openhuman/{providers => inference/provider}/compatible_dump.rs (100%)
 rename src/openhuman/{providers => inference/provider}/compatible_parse.rs (98%)
 rename src/openhuman/{providers => inference/provider}/compatible_stream.rs (97%)
 rename src/openhuman/{providers => inference/provider}/compatible_tests.rs (99%)
 rename src/openhuman/{providers => inference/provider}/compatible_types.rs (98%)
 rename src/openhuman/{providers => inference/provider}/factory.rs (97%)
 rename src/openhuman/{providers => inference/provider}/factory_test.rs (100%)
 rename src/openhuman/{providers => inference/provider}/mod.rs (56%)
 rename src/openhuman/{providers => inference/provider}/openhuman_backend.rs (100%)
 rename src/openhuman/{providers => inference/provider}/ops.rs (98%)
 rename src/openhuman/{providers => inference/provider}/reliable.rs (100%)
 rename src/openhuman/{providers => inference/provider}/reliable_tests.rs (100%)
 rename src/openhuman/{providers => inference/provider}/router.rs (100%)
 rename src/openhuman/{providers => inference/provider}/router_test.rs (100%)
 rename src/openhuman/{providers => inference/provider}/schemas.rs (95%)
 rename src/openhuman/{providers => inference/provider}/thread_context.rs (97%)
 rename src/openhuman/{providers => inference/provider}/traits.rs (100%)
 rename src/openhuman/{providers => inference/provider}/traits_tests.rs (100%)
 rename src/openhuman/{local_ai => inference}/sentiment.rs (99%)
 rename src/openhuman/{local_ai => inference}/types.rs (98%)
 rename src/openhuman/{ => inference}/voice/cloud_transcribe.rs (100%)
 rename src/openhuman/{ => inference}/voice/hallucination.rs (100%)
 rename src/openhuman/{ => inference}/voice/local_speech.rs (97%)
 rename src/openhuman/{ => inference}/voice/local_transcribe.rs (97%)
 create mode 100644 src/openhuman/inference/voice/mod.rs
 rename src/openhuman/{ => inference}/voice/postprocess.rs (96%)
 rename src/openhuman/{ => inference}/voice/streaming.rs (98%)
 delete mode 100644 src/openhuman/local_ai/README.md
 delete mode 100644 src/openhuman/local_ai/mod.rs

diff --git a/src/api/rest.rs b/src/api/rest.rs
index 81bfbb3a92..d85e3b0ad9 100644
--- a/src/api/rest.rs
+++ b/src/api/rest.rs
@@ -533,7 +533,7 @@ impl BackendOAuthClient {
             let is_transient_infra =
                 crate::core::observability::is_transient_http_status_code(status_code);
             let is_budget_exhausted = status_code == 400
-                && crate::openhuman::providers::is_budget_exhausted_message(&text);
+                && crate::openhuman::inference::provider::is_budget_exhausted_message(&text);
             if is_budget_exhausted {
                 tracing::info!(
                     method = method.as_str(),
diff --git a/src/core/all.rs b/src/core/all.rs
index be8e367efd..09ede181bc 100644
--- a/src/core/all.rs
+++ b/src/core/all.rs
@@ -151,10 +151,10 @@ fn build_registered_controllers() -> Vec<RegisteredController> {
     controllers.extend(crate::openhuman::service::all_service_registered_controllers());
     // Data migration utilities
     controllers.extend(crate::openhuman::migration::all_migration_registered_controllers());
-    // External inference runtime access
+    // Unified inference domain: text / vision / embedding / local runtime / cloud providers.
+    // (Formerly split across inference, local_ai, and providers namespaces.)
     controllers.extend(crate::openhuman::inference::all_inference_registered_controllers());
-    // Local AI model management and inference
-    controllers.extend(crate::openhuman::local_ai::all_local_ai_registered_controllers());
+    controllers.extend(crate::openhuman::inference::all_local_ai_registered_controllers());
     // People resolution and interaction scoring
     controllers.extend(crate::openhuman::people::all_people_registered_controllers());
     // Screen capture and UI analysis
@@ -279,7 +279,7 @@ fn build_declared_controller_schemas() -> Vec<ControllerSchema> {
     schemas.extend(crate::openhuman::service::all_service_controller_schemas());
     schemas.extend(crate::openhuman::migration::all_migration_controller_schemas());
     schemas.extend(crate::openhuman::inference::all_inference_controller_schemas());
-    schemas.extend(crate::openhuman::local_ai::all_local_ai_controller_schemas());
+    schemas.extend(crate::openhuman::inference::all_local_ai_controller_schemas());
     schemas.extend(crate::openhuman::people::all_people_controller_schemas());
     schemas.extend(
         crate::openhuman::screen_intelligence::all_screen_intelligence_controller_schemas(),
diff --git a/src/core/jsonrpc.rs b/src/core/jsonrpc.rs
index 32bbb0c58d..da8376965e 100644
--- a/src/core/jsonrpc.rs
+++ b/src/core/jsonrpc.rs
@@ -123,8 +123,9 @@ pub async fn rpc_handler(State(state): State<AppState>, Json(req): Json<RpcReque
                 // query params, or pasted-through provider error text that
                 // includes tokens. `sanitize_api_error` runs the same scrub
                 // used in the SessionExpired publish path below.
-                let redacted =
-                    crate::openhuman::providers::ops::sanitize_api_error(&display_message);
+                let redacted = crate::openhuman::inference::provider::ops::sanitize_api_error(
+                    &display_message,
+                );
                 tracing::warn!(
                     method = %method,
                     elapsed_ms = ms as u64,
@@ -189,7 +190,7 @@ pub async fn invoke_method(state: AppState, method: &str, params: Value) -> Resu
             crate::core::event_bus::publish_global(
                 crate::core::event_bus::DomainEvent::SessionExpired {
                     source: format!("jsonrpc.invoke_method:{method}"),
-                    reason: crate::openhuman::providers::ops::sanitize_api_error(msg),
+                    reason: crate::openhuman::inference::provider::ops::sanitize_api_error(msg),
                 },
             );
         }
@@ -554,6 +555,8 @@ pub fn build_core_http_router(socketio_enabled: bool) -> Router {
         .route("/rpc", post(rpc_handler))
         .route("/ws/dictation", get(dictation_ws_handler))
         .route("/auth/telegram", get(telegram_auth_handler))
+        // OpenAI-compatible inference endpoint (/v1/chat/completions, /v1/models)
+        .nest("/v1", crate::openhuman::inference::http::router())
         .fallback(not_found_handler)
         .layer(middleware::from_fn(http_request_log_middleware))
         .layer(middleware::from_fn(crate::core::auth::rpc_auth_middleware))
@@ -1033,7 +1036,7 @@ async fn run_server_inner(
     // daemon was externally managed) and clear the spawn marker so the
     // next launch doesn't try to reclaim a daemon that's already dead.
     // Bounded so a wedged Ollama can't hold up app shutdown.
-    if let Some(svc) = crate::openhuman::local_ai::try_global() {
+    if let Some(svc) = crate::openhuman::inference::local::try_global() {
         let cfg = crate::openhuman::config::Config::load_or_init()
             .await
             .unwrap_or_default();
diff --git a/src/core/observability.rs b/src/core/observability.rs
index a1d70deca3..8f685f17d8 100644
--- a/src/core/observability.rs
+++ b/src/core/observability.rs
@@ -31,7 +31,7 @@ pub type Tag<'a> = (&'a str, &'a str);
 /// - **504** Gateway Timeout
 ///
 /// Single source of truth for both the call-site classifier
-/// (`openhuman::providers::ops::should_report_provider_http_failure`) and the
+/// (`openhuman::inference::provider::ops::should_report_provider_http_failure`) and the
 /// `before_send` filter (`is_transient_provider_http_failure`). Update here
 /// and both sites pick it up — keeps the two layers from drifting.
 pub const TRANSIENT_PROVIDER_HTTP_STATUSES: &[u16] = &[408, 429, 502, 503, 504, 520];
@@ -123,7 +123,7 @@ pub fn expected_error_kind(message: &str) -> Option<ExpectedErrorKind> {
     if is_local_ai_capability_unavailable_message(&lower) {
         return Some(ExpectedErrorKind::LocalAiCapabilityUnavailable);
     }
-    if crate::openhuman::providers::is_budget_exhausted_message(message) {
+    if crate::openhuman::inference::provider::is_budget_exhausted_message(message) {
         return Some(ExpectedErrorKind::BudgetExhausted);
     }
     if is_session_expired_message(message) {
@@ -617,7 +617,7 @@ pub(crate) fn report_error_message(
 /// that the reliable-provider layer already handles via retry + fallback.
 ///
 /// The primary suppression lives at the call site
-/// (`openhuman::providers::ops::should_report_provider_http_failure`),
+/// (`openhuman::inference::provider::ops::should_report_provider_http_failure`),
 /// which short-circuits transient codes before `report_error` ever fires.
 /// This helper is intended for use inside the `sentry::ClientOptions`
 /// `before_send` hook as defense-in-depth — it catches any future call
@@ -920,7 +920,7 @@ fn event_contains_budget_exhausted_message(event: &sentry::protocol::Event<'_>)
     if event
         .message
         .as_deref()
-        .is_some_and(crate::openhuman::providers::is_budget_exhausted_message)
+        .is_some_and(crate::openhuman::inference::provider::is_budget_exhausted_message)
     {
         return true;
     }
@@ -929,7 +929,7 @@ fn event_contains_budget_exhausted_message(event: &sentry::protocol::Event<'_>)
         exception
             .value
             .as_deref()
-            .is_some_and(crate::openhuman::providers::is_budget_exhausted_message)
+            .is_some_and(crate::openhuman::inference::provider::is_budget_exhausted_message)
     })
 }
 
diff --git a/src/main.rs b/src/main.rs
index 51d346fa5c..e2c456db55 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -53,7 +53,7 @@ fn main() {
             // still fires for genuine outages. Per-attempt reports flood
             // Sentry — see OPENHUMAN-TAURI-2E (~1393 events), -84 (~1050),
             // -T (~871). The primary fix lives in
-            // `openhuman::providers::ops::should_report_provider_http_failure`
+            // `openhuman::inference::provider::ops::should_report_provider_http_failure`
             // (transient codes excluded). This filter catches any future call
             // site that bypasses it.
             if openhuman_core::core::observability::is_transient_provider_http_failure(&event) {
@@ -86,7 +86,7 @@ fn main() {
             // Drop 401 "Session expired. Please log in again." bodies surfaced
             // by llm_provider / backend_api, plus pre-flight "no session token
             // stored" guards from the rpc dispatcher. Primary suppression
-            // lives at the call sites (`openhuman::providers::ops::api_error`
+            // lives at the call sites (`openhuman::inference::provider::ops::api_error`
             // publishes a SessionExpired event_bus signal and short-circuits;
             // the rpc dispatcher's `is_session_expired_error` skip-path in
             // `src/core/jsonrpc.rs` redirects to a tracing::info). This
diff --git a/src/openhuman/agent/bus.rs b/src/openhuman/agent/bus.rs
index 922fcb8f0d..55e32685dc 100644
--- a/src/openhuman/agent/bus.rs
+++ b/src/openhuman/agent/bus.rs
@@ -21,10 +21,10 @@ use tokio::sync::mpsc;
 use crate::core::event_bus::register_native_global;
 use crate::openhuman::agent::progress::AgentProgress;
 use crate::openhuman::config::MultimodalConfig;
+use crate::openhuman::inference::provider::{ChatMessage, Provider};
 use crate::openhuman::prompt_injection::{
     enforce_prompt_input, PromptEnforcementAction, PromptEnforcementContext,
 };
-use crate::openhuman::providers::{ChatMessage, Provider};
 use crate::openhuman::tools::Tool;
 
 use super::harness::definition::{AgentDefinitionRegistry, SandboxMode};
diff --git a/src/openhuman/agent/cost.rs b/src/openhuman/agent/cost.rs
index 188e8bee32..af9bfd37c7 100644
--- a/src/openhuman/agent/cost.rs
+++ b/src/openhuman/agent/cost.rs
@@ -23,7 +23,7 @@
 //! cents-per-Mtok at the tier level is good enough for client-side
 //! telemetry and budget gating. PRs adding new tiers should add a row.
 
-use crate::openhuman::providers::UsageInfo;
+use crate::openhuman::inference::provider::UsageInfo;
 
 /// Per-million-token rates for a single model tier.
 ///
diff --git a/src/openhuman/agent/dispatcher.rs b/src/openhuman/agent/dispatcher.rs
index d0da21af0f..36d2ec64cd 100644
--- a/src/openhuman/agent/dispatcher.rs
+++ b/src/openhuman/agent/dispatcher.rs
@@ -1,7 +1,7 @@
 use crate::openhuman::agent::harness::parse_tool_calls;
 use crate::openhuman::agent::pformat::{self, PFormatRegistry};
 use crate::openhuman::context::prompt::ToolCallFormat;
-use crate::openhuman::providers::{
+use crate::openhuman::inference::provider::{
     ChatMessage, ChatResponse, ConversationMessage, ToolResultMessage,
 };
 use crate::openhuman::tools::{Tool, ToolSpec};
diff --git a/src/openhuman/agent/dispatcher_tests.rs b/src/openhuman/agent/dispatcher_tests.rs
index 6a0c9af871..34ada29d16 100644
--- a/src/openhuman/agent/dispatcher_tests.rs
+++ b/src/openhuman/agent/dispatcher_tests.rs
@@ -21,7 +21,7 @@ fn xml_dispatcher_parses_tool_calls() {
 fn native_dispatcher_roundtrip() {
     let response = ChatResponse {
         text: Some("ok".into()),
-        tool_calls: vec![crate::openhuman::providers::ToolCall {
+        tool_calls: vec![crate::openhuman::inference::provider::ToolCall {
             id: "tc1".into(),
             name: "file_read".into(),
             arguments: "{\"path\":\"a.txt\"}".into(),
diff --git a/src/openhuman/agent/harness/bughunt_tests.rs b/src/openhuman/agent/harness/bughunt_tests.rs
index dae7656ce5..481b0350db 100644
--- a/src/openhuman/agent/harness/bughunt_tests.rs
+++ b/src/openhuman/agent/harness/bughunt_tests.rs
@@ -9,7 +9,7 @@
 
 use super::test_support::{KeywordRule, KeywordScriptedProvider, ScriptedToolCall};
 use super::tool_loop::run_tool_call_loop;
-use crate::openhuman::providers::{ChatMessage, ChatResponse, ToolCall};
+use crate::openhuman::inference::provider::{ChatMessage, ChatResponse, ToolCall};
 use crate::openhuman::tools::traits::{Tool, ToolResult};
 use async_trait::async_trait;
 use parking_lot::Mutex;
diff --git a/src/openhuman/agent/harness/fork_context.rs b/src/openhuman/agent/harness/fork_context.rs
index 477998bbbe..03c51b0b92 100644
--- a/src/openhuman/agent/harness/fork_context.rs
+++ b/src/openhuman/agent/harness/fork_context.rs
@@ -12,8 +12,8 @@
 
 use crate::openhuman::agent::progress::AgentProgress;
 use crate::openhuman::config::AgentConfig;
+use crate::openhuman::inference::provider::Provider;
 use crate::openhuman::memory::Memory;
-use crate::openhuman::providers::Provider;
 use crate::openhuman::skills::Skill;
 use crate::openhuman::tools::{Tool, ToolSpec};
 use std::path::PathBuf;
diff --git a/src/openhuman/agent/harness/harness_gap_tests.rs b/src/openhuman/agent/harness/harness_gap_tests.rs
index 032bba505e..d3348b4a7b 100644
--- a/src/openhuman/agent/harness/harness_gap_tests.rs
+++ b/src/openhuman/agent/harness/harness_gap_tests.rs
@@ -22,9 +22,9 @@
 use crate::openhuman::agent::error::AgentError;
 use crate::openhuman::agent::harness::tool_loop::run_tool_call_loop;
 use crate::openhuman::context::guard::{ContextCheckResult, ContextGuard};
-use crate::openhuman::providers::traits::ProviderCapabilities;
-use crate::openhuman::providers::Provider;
-use crate::openhuman::providers::{ChatMessage, ChatRequest, ChatResponse, UsageInfo};
+use crate::openhuman::inference::provider::traits::ProviderCapabilities;
+use crate::openhuman::inference::provider::Provider;
+use crate::openhuman::inference::provider::{ChatMessage, ChatRequest, ChatResponse, UsageInfo};
 use crate::openhuman::tool_timeout::parse_tool_timeout_secs;
 use crate::openhuman::tools::{Tool, ToolResult};
 use async_trait::async_trait;
diff --git a/src/openhuman/agent/harness/parse.rs b/src/openhuman/agent/harness/parse.rs
index 5c38a8e5ab..52b34afbc1 100644
--- a/src/openhuman/agent/harness/parse.rs
+++ b/src/openhuman/agent/harness/parse.rs
@@ -1,4 +1,4 @@
-use crate::openhuman::providers::ToolCall;
+use crate::openhuman::inference::provider::ToolCall;
 use crate::openhuman::tools::Tool;
 use regex::Regex;
 use std::sync::LazyLock;
diff --git a/src/openhuman/agent/harness/session/builder.rs b/src/openhuman/agent/harness/session/builder.rs
index 6a958cb5db..4fd0f0f041 100644
--- a/src/openhuman/agent/harness/session/builder.rs
+++ b/src/openhuman/agent/harness/session/builder.rs
@@ -18,8 +18,8 @@ use crate::openhuman::agent::memory_loader::{DefaultMemoryLoader, MemoryLoader};
 use crate::openhuman::config::{Config, ContextConfig};
 use crate::openhuman::context::prompt::SystemPromptBuilder;
 use crate::openhuman::context::{ContextManager, ProviderSummarizer};
+use crate::openhuman::inference::provider::{self, Provider};
 use crate::openhuman::memory::{self, Memory};
-use crate::openhuman::providers::{self, Provider};
 use crate::openhuman::security::SecurityPolicy;
 use crate::openhuman::tools::{self, Tool, ToolSpec};
 use anyhow::Result;
@@ -514,7 +514,7 @@ impl Agent {
     ///   legacy behaviour).
     ///
     /// The welcome agent uses this entry point when routed from the
-    /// Tauri web channel (see `channels::providers::web::build_session_agent`).
+    /// Tauri web channel (see `channels::provider::web::build_session_agent`).
     pub fn from_config_for_agent(config: &Config, agent_id: &str) -> Result<Self> {
         // Look up the target definition up front so we can fail fast
         // with a clear error instead of building half an agent and then
@@ -594,7 +594,7 @@ impl Agent {
     /// [`SystemPromptBuilder`], seeded with the `source_chunks` snapshot
     /// from the spawning subconscious reflection (#623).
     ///
-    /// Used by `channels::providers::web::build_session_agent` when a
+    /// Used by `channels::provider::web::build_session_agent` when a
     /// chat thread's seed message metadata flags
     /// `origin == "subconscious_reflection"` — the orchestrator then
     /// has the same memory context the reflection-LLM had, so the user's
@@ -762,7 +762,7 @@ impl Agent {
         // backend. Those are valuable but orthogonal — they can be layered
         // back on top of the factory's output in a follow-up without
         // re-introducing the routing bypass.
-        let _ = providers::ProviderRuntimeOptions {
+        let _ = provider::ProviderRuntimeOptions {
             auth_profile_override: None,
             openhuman_dir: config.config_path.parent().map(std::path::PathBuf::from),
             secrets_encrypt: config.secrets.encrypt,
@@ -775,7 +775,7 @@ impl Agent {
             _ => "reasoning",
         };
         let (provider, model_name): (Box<dyn Provider>, String) =
-            crate::openhuman::providers::create_chat_provider(provider_role, config)?;
+            crate::openhuman::inference::provider::create_chat_provider(provider_role, config)?;
 
         // Dispatcher selection is deferred until after the tool list is
         // finalised (orchestrator tools are appended below). We capture
@@ -924,21 +924,22 @@ impl Agent {
                 let full_config = Arc::new(config.clone());
                 // For cloud reflection, wrap the provider in an Arc.
                 // For local, no provider needed.
-                let reflection_provider: Option<Arc<dyn crate::openhuman::providers::Provider>> =
-                    if config.learning.reflection_source
-                        == crate::openhuman::config::ReflectionSource::Cloud
-                    {
-                        Some(Arc::from(providers::create_routed_provider(
-                            config.inference_url.as_deref(),
-                            config.api_url.as_deref(),
-                            config.api_key.as_deref(),
-                            &config.reliability,
-                            &config.model_routes,
-                            &model_name,
-                        )?))
-                    } else {
-                        None
-                    };
+                let reflection_provider: Option<
+                    Arc<dyn crate::openhuman::inference::provider::Provider>,
+                > = if config.learning.reflection_source
+                    == crate::openhuman::config::ReflectionSource::Cloud
+                {
+                    Some(Arc::from(provider::create_routed_provider(
+                        config.inference_url.as_deref(),
+                        config.api_url.as_deref(),
+                        config.api_key.as_deref(),
+                        &config.reliability,
+                        &config.model_routes,
+                        &model_name,
+                    )?))
+                } else {
+                    None
+                };
                 post_turn_hooks.push(Arc::new(crate::openhuman::learning::ReflectionHook::new(
                     config.learning.clone(),
                     full_config.clone(),
diff --git a/src/openhuman/agent/harness/session/runtime.rs b/src/openhuman/agent/harness/session/runtime.rs
index b79bdf7ea3..1b44ad021f 100644
--- a/src/openhuman/agent/harness/session/runtime.rs
+++ b/src/openhuman/agent/harness/session/runtime.rs
@@ -11,11 +11,11 @@ use super::types::{Agent, AgentBuilder};
 use crate::core::event_bus::{publish_global, DomainEvent};
 use crate::openhuman::agent::dispatcher::ParsedToolCall;
 use crate::openhuman::agent::error::AgentError;
+use crate::openhuman::inference::provider::{self, ConversationMessage, Provider, ToolCall};
 use crate::openhuman::memory::Memory;
 use crate::openhuman::prompt_injection::{
     enforce_prompt_input, PromptEnforcementAction, PromptEnforcementContext,
 };
-use crate::openhuman::providers::{self, ConversationMessage, Provider, ToolCall};
 use crate::openhuman::tools::{Tool, ToolSpec};
 use crate::openhuman::util::truncate_with_ellipsis;
 use anyhow::Result;
@@ -276,21 +276,21 @@ impl Agent {
         let learned = crate::openhuman::agent::prompts::LearnedContextData::default();
         let system_prompt = self.build_system_prompt(learned)?;
 
-        let mut cached: Vec<crate::openhuman::providers::ChatMessage> =
+        let mut cached: Vec<crate::openhuman::inference::provider::ChatMessage> =
             Vec::with_capacity(prior.len() + 1);
-        cached.push(crate::openhuman::providers::ChatMessage::system(
+        cached.push(crate::openhuman::inference::provider::ChatMessage::system(
             system_prompt,
         ));
         for (role, content) in prior {
             let chat = match role.as_str() {
-                "user" => crate::openhuman::providers::ChatMessage::user(content),
+                "user" => crate::openhuman::inference::provider::ChatMessage::user(content),
                 "agent" | "assistant" => {
-                    crate::openhuman::providers::ChatMessage::assistant(content)
+                    crate::openhuman::inference::provider::ChatMessage::assistant(content)
                 }
                 // Fall back to user role for unknown senders rather than
                 // dropping the message — losing context is worse than
                 // mislabelling a system/tool message.
-                _ => crate::openhuman::providers::ChatMessage::user(content),
+                _ => crate::openhuman::inference::provider::ChatMessage::user(content),
             };
             cached.push(chat);
         }
@@ -376,7 +376,7 @@ impl Agent {
             return kind.to_string();
         }
 
-        let scrubbed = providers::sanitize_api_error(&err.to_string())
+        let scrubbed = provider::sanitize_api_error(&err.to_string())
             .replace(['\n', '\r', '\t'], " ")
             .split_whitespace()
             .collect::<Vec<_>>()
@@ -405,7 +405,7 @@ impl Agent {
     /// If the provider response already contains native tool calls, they are
     /// returned as-is.
     pub(super) fn persisted_tool_calls_for_history(
-        response: &crate::openhuman::providers::ChatResponse,
+        response: &crate::openhuman::inference::provider::ChatResponse,
         parsed_calls: &[ParsedToolCall],
         iteration: usize,
     ) -> Vec<ToolCall> {
diff --git a/src/openhuman/agent/harness/session/runtime_tests.rs b/src/openhuman/agent/harness/session/runtime_tests.rs
index acf5ec422b..adc34aaa6f 100644
--- a/src/openhuman/agent/harness/session/runtime_tests.rs
+++ b/src/openhuman/agent/harness/session/runtime_tests.rs
@@ -2,8 +2,8 @@ use super::*;
 use crate::core::event_bus::{global, init_global, DomainEvent};
 use crate::openhuman::agent::dispatcher::XmlToolDispatcher;
 use crate::openhuman::agent::error::AgentError;
+use crate::openhuman::inference::provider::{ChatMessage, ChatRequest, ChatResponse, UsageInfo};
 use crate::openhuman::memory::Memory;
-use crate::openhuman::providers::{ChatMessage, ChatRequest, ChatResponse, UsageInfo};
 use anyhow::anyhow;
 use async_trait::async_trait;
 use parking_lot::Mutex;
@@ -121,7 +121,7 @@ fn sanitizers_and_tool_call_helpers_cover_fallback_paths() {
     assert_eq!(calls[0].tool_call_id.as_deref(), Some("parsed-3-1"));
     assert_eq!(calls[1].tool_call_id.as_deref(), Some("keep"));
 
-    let response = crate::openhuman::providers::ChatResponse {
+    let response = crate::openhuman::inference::provider::ChatResponse {
         text: Some(String::new()),
         tool_calls: vec![],
         usage: None,
@@ -302,12 +302,12 @@ fn helper_paths_cover_no_overlap_native_calls_and_truncation() {
     assert_eq!(appended.len(), 1);
     assert!(matches!(&appended[0], ConversationMessage::Chat(msg) if msg.content == "b"));
 
-    let native_calls = vec![crate::openhuman::providers::ToolCall {
+    let native_calls = vec![crate::openhuman::inference::provider::ToolCall {
         id: "native-1".into(),
         name: "echo".into(),
         arguments: "{}".into(),
     }];
-    let response = crate::openhuman::providers::ChatResponse {
+    let response = crate::openhuman::inference::provider::ChatResponse {
         text: Some(String::new()),
         tool_calls: native_calls.clone(),
         usage: None,
diff --git a/src/openhuman/agent/harness/session/tests.rs b/src/openhuman/agent/harness/session/tests.rs
index 43bcff9de0..efa50917dd 100644
--- a/src/openhuman/agent/harness/session/tests.rs
+++ b/src/openhuman/agent/harness/session/tests.rs
@@ -7,8 +7,8 @@
 
 use super::types::{Agent, AgentBuilder};
 use crate::openhuman::agent::dispatcher::{NativeToolDispatcher, XmlToolDispatcher};
+use crate::openhuman::inference::provider::{ChatRequest, ConversationMessage, Provider};
 use crate::openhuman::memory::Memory;
-use crate::openhuman::providers::{ChatRequest, ConversationMessage, Provider};
 use crate::openhuman::tools::Tool;
 use anyhow::Result;
 use async_trait::async_trait;
@@ -16,7 +16,7 @@ use parking_lot::Mutex;
 use std::sync::Arc;
 
 struct MockProvider {
-    responses: Mutex<Vec<crate::openhuman::providers::ChatResponse>>,
+    responses: Mutex<Vec<crate::openhuman::inference::provider::ChatResponse>>,
 }
 
 #[async_trait]
@@ -36,10 +36,10 @@ impl Provider for MockProvider {
         _request: ChatRequest<'_>,
         _model: &str,
         _temperature: f64,
-    ) -> Result<crate::openhuman::providers::ChatResponse> {
+    ) -> Result<crate::openhuman::inference::provider::ChatResponse> {
         let mut guard = self.responses.lock();
         if guard.is_empty() {
-            return Ok(crate::openhuman::providers::ChatResponse {
+            return Ok(crate::openhuman::inference::provider::ChatResponse {
                 text: Some("done".into()),
                 tool_calls: vec![],
                 usage: None,
@@ -56,7 +56,7 @@ impl Provider for MockProvider {
 #[derive(Default)]
 struct RecordingProvider {
     captures: Mutex<Vec<CapturedCall>>,
-    responses: Mutex<Vec<crate::openhuman::providers::ChatResponse>>,
+    responses: Mutex<Vec<crate::openhuman::inference::provider::ChatResponse>>,
 }
 
 #[derive(Clone)]
@@ -82,7 +82,7 @@ impl Provider for RecordingProvider {
         request: ChatRequest<'_>,
         model: &str,
         _temperature: f64,
-    ) -> Result<crate::openhuman::providers::ChatResponse> {
+    ) -> Result<crate::openhuman::inference::provider::ChatResponse> {
         let system_prompt = request
             .messages
             .iter()
@@ -95,7 +95,7 @@ impl Provider for RecordingProvider {
 
         let mut guard = self.responses.lock();
         if guard.is_empty() {
-            return Ok(crate::openhuman::providers::ChatResponse {
+            return Ok(crate::openhuman::inference::provider::ChatResponse {
                 text: Some("done".into()),
                 tool_calls: vec![],
                 usage: None,
@@ -248,7 +248,7 @@ async fn turn_without_tools_returns_text() {
     let workspace_path = workspace.path().to_path_buf();
 
     let provider = Box::new(MockProvider {
-        responses: Mutex::new(vec![crate::openhuman::providers::ChatResponse {
+        responses: Mutex::new(vec![crate::openhuman::inference::provider::ChatResponse {
             text: Some("hello".into()),
             tool_calls: vec![],
             usage: None,
@@ -282,16 +282,16 @@ async fn turn_with_native_dispatcher_handles_tool_results_variant() {
 
     let provider = Box::new(MockProvider {
         responses: Mutex::new(vec![
-            crate::openhuman::providers::ChatResponse {
+            crate::openhuman::inference::provider::ChatResponse {
                 text: Some(String::new()),
-                tool_calls: vec![crate::openhuman::providers::ToolCall {
+                tool_calls: vec![crate::openhuman::inference::provider::ToolCall {
                     id: "tc1".into(),
                     name: "echo".into(),
                     arguments: "{}".into(),
                 }],
                 usage: None,
             },
-            crate::openhuman::providers::ChatResponse {
+            crate::openhuman::inference::provider::ChatResponse {
                 text: Some("done".into()),
                 tool_calls: vec![],
                 usage: None,
@@ -330,7 +330,7 @@ async fn turn_with_native_dispatcher_persists_fallback_tool_calls() {
 
     let provider = Box::new(MockProvider {
         responses: Mutex::new(vec![
-            crate::openhuman::providers::ChatResponse {
+            crate::openhuman::inference::provider::ChatResponse {
                 text: Some(
                     "Checking...\n<tool_call>{\"name\":\"echo\",\"arguments\":{}}</tool_call>"
                         .into(),
@@ -338,7 +338,7 @@ async fn turn_with_native_dispatcher_persists_fallback_tool_calls() {
                 tool_calls: vec![],
                 usage: None,
             },
-            crate::openhuman::providers::ChatResponse {
+            crate::openhuman::inference::provider::ChatResponse {
                 text: Some("done".into()),
                 tool_calls: vec![],
                 usage: None,
@@ -415,9 +415,9 @@ async fn turn_dispatches_spawn_subagent_through_full_path() {
     //   3. Parent turn iter 1 — fold sub-agent result into "Based on the research, X is Y."
     let provider = Box::new(MockProvider {
         responses: Mutex::new(vec![
-            crate::openhuman::providers::ChatResponse {
+            crate::openhuman::inference::provider::ChatResponse {
                 text: Some(String::new()),
-                tool_calls: vec![crate::openhuman::providers::ToolCall {
+                tool_calls: vec![crate::openhuman::inference::provider::ToolCall {
                     id: "call-spawn".into(),
                     name: "spawn_subagent".into(),
                     arguments: serde_json::json!({
@@ -428,12 +428,12 @@ async fn turn_dispatches_spawn_subagent_through_full_path() {
                 }],
                 usage: None,
             },
-            crate::openhuman::providers::ChatResponse {
+            crate::openhuman::inference::provider::ChatResponse {
                 text: Some("X is Y".into()),
                 tool_calls: vec![],
                 usage: None,
             },
-            crate::openhuman::providers::ChatResponse {
+            crate::openhuman::inference::provider::ChatResponse {
                 text: Some("Based on the research, X is Y.".into()),
                 tool_calls: vec![],
                 usage: None,
@@ -510,17 +510,17 @@ async fn system_prompt_and_model_are_byte_stable_across_turns() {
 
     let provider = Arc::new(RecordingProvider {
         responses: Mutex::new(vec![
-            crate::openhuman::providers::ChatResponse {
+            crate::openhuman::inference::provider::ChatResponse {
                 text: Some("first".into()),
                 tool_calls: vec![],
                 usage: None,
             },
-            crate::openhuman::providers::ChatResponse {
+            crate::openhuman::inference::provider::ChatResponse {
                 text: Some("second".into()),
                 tool_calls: vec![],
                 usage: None,
             },
-            crate::openhuman::providers::ChatResponse {
+            crate::openhuman::inference::provider::ChatResponse {
                 text: Some("third".into()),
                 tool_calls: vec![],
                 usage: None,
@@ -690,8 +690,8 @@ fn seed_resume_from_messages_primes_cached_transcript() {
 fn seed_resume_from_messages_is_noop_on_warm_agent() {
     let mut agent = build_minimal_agent_with_definition_name(Some("orchestrator"));
     agent.cached_transcript_messages = Some(vec![
-        crate::openhuman::providers::ChatMessage::system("warm prefix"),
-        crate::openhuman::providers::ChatMessage::user("hi"),
+        crate::openhuman::inference::provider::ChatMessage::system("warm prefix"),
+        crate::openhuman::inference::provider::ChatMessage::user("hi"),
     ]);
     agent
         .seed_resume_from_messages(vec![("user".into(), "different".into())], "different")
diff --git a/src/openhuman/agent/harness/session/transcript.rs b/src/openhuman/agent/harness/session/transcript.rs
index 4108757823..3ee740fc51 100644
--- a/src/openhuman/agent/harness/session/transcript.rs
+++ b/src/openhuman/agent/harness/session/transcript.rs
@@ -50,7 +50,7 @@
 //! the session transcript can eventually replace the separate thread
 //! message log without losing message-level addressing.
 
-use crate::openhuman::providers::ChatMessage;
+use crate::openhuman::inference::provider::ChatMessage;
 use anyhow::{Context, Result};
 use serde::{Deserialize, Serialize};
 use std::collections::HashMap;
diff --git a/src/openhuman/agent/harness/session/turn.rs b/src/openhuman/agent/harness/session/turn.rs
index 05f0a8b2e1..ad1fe5d63f 100644
--- a/src/openhuman/agent/harness/session/turn.rs
+++ b/src/openhuman/agent/harness/session/turn.rs
@@ -27,8 +27,10 @@ use crate::openhuman::agent::memory_loader::collect_recall_citations;
 use crate::openhuman::agent::progress::AgentProgress;
 use crate::openhuman::context::prompt::{LearnedContextData, PromptContext, PromptTool};
 use crate::openhuman::context::{ReductionOutcome, ARCHIVIST_EXTRACTION_PROMPT};
+use crate::openhuman::inference::provider::{
+    ChatMessage, ChatRequest, ConversationMessage, ProviderDelta,
+};
 use crate::openhuman::memory::MemoryCategory;
-use crate::openhuman::providers::{ChatMessage, ChatRequest, ConversationMessage, ProviderDelta};
 use crate::openhuman::tools::traits::ToolCallOptions;
 use crate::openhuman::tools::Tool;
 use crate::openhuman::util::truncate_with_ellipsis;
@@ -1664,7 +1666,7 @@ impl Agent {
             output_tokens,
             cached_input_tokens,
             charged_amount_usd,
-            thread_id: crate::openhuman::providers::thread_context::current_thread_id(),
+            thread_id: crate::openhuman::inference::provider::thread_context::current_thread_id(),
         };
 
         if let Err(err) = transcript::write_transcript(path, messages, &meta, turn_usage) {
diff --git a/src/openhuman/agent/harness/session/turn_tests.rs b/src/openhuman/agent/harness/session/turn_tests.rs
index f5618e2dbc..e39f94639e 100644
--- a/src/openhuman/agent/harness/session/turn_tests.rs
+++ b/src/openhuman/agent/harness/session/turn_tests.rs
@@ -3,8 +3,8 @@ use crate::core::event_bus::{global, init_global, DomainEvent};
 use crate::openhuman::agent::dispatcher::XmlToolDispatcher;
 use crate::openhuman::agent::hooks::{PostTurnHook, TurnContext};
 use crate::openhuman::agent::memory_loader::MemoryLoader;
+use crate::openhuman::inference::provider::{ChatRequest, ChatResponse, Provider};
 use crate::openhuman::memory::Memory;
-use crate::openhuman::providers::{ChatRequest, ChatResponse, Provider};
 use crate::openhuman::tools::Tool;
 use crate::openhuman::tools::ToolResult;
 use async_trait::async_trait;
diff --git a/src/openhuman/agent/harness/session/types.rs b/src/openhuman/agent/harness/session/types.rs
index 487be16357..e44ad4a35f 100644
--- a/src/openhuman/agent/harness/session/types.rs
+++ b/src/openhuman/agent/harness/session/types.rs
@@ -12,8 +12,8 @@ use crate::openhuman::agent::memory_loader::MemoryLoader;
 use crate::openhuman::agent::progress::AgentProgress;
 use crate::openhuman::context::prompt::SystemPromptBuilder;
 use crate::openhuman::context::ContextManager;
+use crate::openhuman::inference::provider::{ChatMessage, ConversationMessage, Provider};
 use crate::openhuman::memory::Memory;
-use crate::openhuman::providers::{ChatMessage, ConversationMessage, Provider};
 use crate::openhuman::tools::{Tool, ToolSpec};
 use std::path::PathBuf;
 use std::sync::Arc;
diff --git a/src/openhuman/agent/harness/subagent_runner/extract_tool.rs b/src/openhuman/agent/harness/subagent_runner/extract_tool.rs
index fa2c327cd3..1d07f216d6 100644
--- a/src/openhuman/agent/harness/subagent_runner/extract_tool.rs
+++ b/src/openhuman/agent/harness/subagent_runner/extract_tool.rs
@@ -28,7 +28,7 @@ use super::handoff::{chunk_content, ResultHandoffCache, HANDOFF_MAX_ENTRIES};
 use crate::openhuman::agent::harness::session::transcript::{
     resolve_keyed_transcript_path, write_transcript, MessageUsage, TranscriptMeta, TurnUsage,
 };
-use crate::openhuman::providers::{ChatMessage, Provider};
+use crate::openhuman::inference::provider::{ChatMessage, Provider};
 use crate::openhuman::tools::{Tool, ToolCategory, ToolResult};
 
 // ── Tunables ──────────────────────────────────────────────────────────
@@ -485,7 +485,7 @@ fn write_extract_transcript(
         output_tokens: 0,
         cached_input_tokens: 0,
         charged_amount_usd: 0.0,
-        thread_id: crate::openhuman::providers::thread_context::current_thread_id(),
+        thread_id: crate::openhuman::inference::provider::thread_context::current_thread_id(),
     };
 
     if let Err(e) = write_transcript(&path, &messages, &meta, Some(&turn_usage)) {
diff --git a/src/openhuman/agent/harness/subagent_runner/mod.rs b/src/openhuman/agent/harness/subagent_runner/mod.rs
index e74fd708ad..6ffb1abe24 100644
--- a/src/openhuman/agent/harness/subagent_runner/mod.rs
+++ b/src/openhuman/agent/harness/subagent_runner/mod.rs
@@ -13,7 +13,7 @@
 //!    definition asks to omit (`omit_identity`, `omit_memory_context`,
 //!    `omit_safety_preamble`, `omit_skills_catalog`).
 //! 5. Runs a slim inner tool-call loop using the parent's
-//!    [`crate::openhuman::providers::Provider`] and returns a single
+//!    [`crate::openhuman::inference::provider::Provider`] and returns a single
 //!    text result. The intra-sub-agent history never leaks back to the
 //!    parent — the parent only sees one compact tool result.
 //!
diff --git a/src/openhuman/agent/harness/subagent_runner/ops.rs b/src/openhuman/agent/harness/subagent_runner/ops.rs
index a33a26d418..247e95763a 100644
--- a/src/openhuman/agent/harness/subagent_runner/ops.rs
+++ b/src/openhuman/agent/harness/subagent_runner/ops.rs
@@ -29,8 +29,8 @@ use crate::openhuman::agent::progress::AgentProgress;
 use crate::openhuman::context::prompt::{
     render_subagent_system_prompt, PromptContext, PromptTool, SubagentRenderOptions,
 };
+use crate::openhuman::inference::provider::{ChatMessage, ChatRequest, Provider, ToolCall};
 use crate::openhuman::memory::conversations::ConversationMessage;
-use crate::openhuman::providers::{ChatMessage, ChatRequest, Provider, ToolCall};
 use crate::openhuman::tools::{Tool, ToolCategory, ToolSpec};
 
 /// Prompt suffix injected into every typed sub-agent run.
@@ -109,23 +109,25 @@ pub(super) fn resolve_subagent_provider(
     use crate::openhuman::agent::harness::definition::ModelSpec;
     match spec {
         ModelSpec::Hint(workload) => match config {
-            Some(cfg) => match crate::openhuman::providers::create_chat_provider(workload, cfg) {
-                Ok((p, m)) => {
-                    log::info!(
+            Some(cfg) => {
+                match crate::openhuman::inference::provider::create_chat_provider(workload, cfg) {
+                    Ok((p, m)) => {
+                        log::info!(
                         "[subagent_runner] role={} agent_id={} resolved via workload factory model={}",
                         workload, agent_id, m
                     );
-                    (std::sync::Arc::from(p), m)
-                }
-                Err(e) => {
-                    log::warn!(
+                        (std::sync::Arc::from(p), m)
+                    }
+                    Err(e) => {
+                        log::warn!(
                         "[subagent_runner] workload '{}' provider build failed ({}) for agent_id={} — \
                          falling back to parent provider + parent model '{}'",
                         workload, e, agent_id, parent_model
                     );
-                    (parent_provider, parent_model)
+                        (parent_provider, parent_model)
+                    }
                 }
-            },
+            }
             None => {
                 log::warn!(
                     "[subagent_runner] config load failed for workload '{}' (agent_id={}) — \
@@ -1156,7 +1158,7 @@ async fn run_inner_loop(
             output_tokens: usage.output_tokens,
             cached_input_tokens: usage.cached_input_tokens,
             charged_amount_usd: usage.charged_amount_usd,
-            thread_id: crate::openhuman::providers::thread_context::current_thread_id(),
+            thread_id: crate::openhuman::inference::provider::thread_context::current_thread_id(),
         };
         if let Err(err) = transcript::write_transcript(&path, history, &meta, None) {
             tracing::debug!(
diff --git a/src/openhuman/agent/harness/subagent_runner/ops_tests.rs b/src/openhuman/agent/harness/subagent_runner/ops_tests.rs
index 31ce7b02be..9d2c2d23b3 100644
--- a/src/openhuman/agent/harness/subagent_runner/ops_tests.rs
+++ b/src/openhuman/agent/harness/subagent_runner/ops_tests.rs
@@ -134,7 +134,9 @@ fn append_subagent_role_contract_is_idempotent() {
 // ── End-to-end runner tests with mock provider ────────────────────────
 
 use crate::openhuman::agent::harness::fork_context::with_parent_context;
-use crate::openhuman::providers::{ChatRequest as PChatRequest, ChatResponse, Provider, ToolCall};
+use crate::openhuman::inference::provider::{
+    ChatRequest as PChatRequest, ChatResponse, Provider, ToolCall,
+};
 use parking_lot::Mutex;
 use std::sync::Arc;
 
@@ -142,7 +144,7 @@ use std::sync::Arc;
 /// to verify the bytes that arrive at the model.
 #[derive(Clone)]
 struct CapturedRequest {
-    messages: Vec<crate::openhuman::providers::ChatMessage>,
+    messages: Vec<crate::openhuman::inference::provider::ChatMessage>,
     tool_count: usize,
 }
 
diff --git a/src/openhuman/agent/harness/test_support.rs b/src/openhuman/agent/harness/test_support.rs
index ec44869dc8..e97d453d5b 100644
--- a/src/openhuman/agent/harness/test_support.rs
+++ b/src/openhuman/agent/harness/test_support.rs
@@ -39,8 +39,10 @@ use async_trait::async_trait;
 use parking_lot::Mutex;
 use serde_json::json;
 
-use crate::openhuman::providers::traits::ProviderCapabilities;
-use crate::openhuman::providers::{ChatMessage, ChatRequest, ChatResponse, Provider, ToolCall};
+use crate::openhuman::inference::provider::traits::ProviderCapabilities;
+use crate::openhuman::inference::provider::{
+    ChatMessage, ChatRequest, ChatResponse, Provider, ToolCall,
+};
 
 /// One scripted reaction the [`KeywordScriptedProvider`] can emit when
 /// it sees its keyword in the latest user/tool turn.
diff --git a/src/openhuman/agent/harness/test_support_test.rs b/src/openhuman/agent/harness/test_support_test.rs
index 2436ac522e..74732644b1 100644
--- a/src/openhuman/agent/harness/test_support_test.rs
+++ b/src/openhuman/agent/harness/test_support_test.rs
@@ -9,7 +9,7 @@ use super::test_support::{
     KeywordScriptedProvider, ScriptedToolCall,
 };
 use super::tool_loop::run_tool_call_loop;
-use crate::openhuman::providers::{ChatMessage, ChatRequest, ChatResponse, Provider};
+use crate::openhuman::inference::provider::{ChatMessage, ChatRequest, ChatResponse, Provider};
 use crate::openhuman::tools::traits::{PermissionLevel, Tool, ToolCategory, ToolResult, ToolScope};
 use async_trait::async_trait;
 use serde_json::json;
diff --git a/src/openhuman/agent/harness/tests.rs b/src/openhuman/agent/harness/tests.rs
index c144d24c25..550800589b 100644
--- a/src/openhuman/agent/harness/tests.rs
+++ b/src/openhuman/agent/harness/tests.rs
@@ -5,8 +5,8 @@ use super::parse::{
     parse_tool_calls, parse_tool_calls_from_json_value, tools_to_openai_format,
 };
 use super::tool_loop::{run_tool_call_loop, DEFAULT_MAX_TOOL_ITERATIONS};
-use crate::openhuman::providers::traits::ProviderCapabilities;
-use crate::openhuman::providers::{ChatMessage, ChatRequest, ChatResponse, Provider};
+use crate::openhuman::inference::provider::traits::ProviderCapabilities;
+use crate::openhuman::inference::provider::{ChatMessage, ChatRequest, ChatResponse, Provider};
 use crate::openhuman::tools::{self, Tool};
 use async_trait::async_trait;
 use base64::{engine::general_purpose::STANDARD, Engine as _};
diff --git a/src/openhuman/agent/harness/tool_loop.rs b/src/openhuman/agent/harness/tool_loop.rs
index 35ce566481..140d4e9971 100644
--- a/src/openhuman/agent/harness/tool_loop.rs
+++ b/src/openhuman/agent/harness/tool_loop.rs
@@ -3,7 +3,7 @@ use crate::openhuman::agent::multimodal;
 use crate::openhuman::agent::progress::AgentProgress;
 use crate::openhuman::agent::stop_hooks::{current_stop_hooks, StopDecision, TurnState};
 use crate::openhuman::approval::{ApprovalManager, ApprovalRequest, ApprovalResponse};
-use crate::openhuman::providers::{
+use crate::openhuman::inference::provider::{
     ChatMessage, ChatRequest, Provider, ProviderCapabilityError, ProviderDelta,
 };
 use crate::openhuman::tools::traits::ToolScope;
@@ -417,8 +417,12 @@ pub(crate) async fn run_tool_call_loop(
                     // signal and floods Sentry — see OPENHUMAN-TAURI-3Y/3Z
                     // (~46 events combined) and the underlying TAURI-2E/84/T
                     // (~3300 events from raw per-attempt 429/503/504 reports).
-                    let transient = crate::openhuman::providers::reliable::is_rate_limited(&e)
-                        || crate::openhuman::providers::reliable::is_upstream_unhealthy(&e);
+                    let transient = crate::openhuman::inference::provider::reliable::is_rate_limited(
+                        &e,
+                    )
+                        || crate::openhuman::inference::provider::reliable::is_upstream_unhealthy(
+                            &e,
+                        );
                     if transient {
                         tracing::warn!(
                             domain = "agent",
diff --git a/src/openhuman/agent/harness/tool_loop_tests.rs b/src/openhuman/agent/harness/tool_loop_tests.rs
index 6ca463471c..684d4e69da 100644
--- a/src/openhuman/agent/harness/tool_loop_tests.rs
+++ b/src/openhuman/agent/harness/tool_loop_tests.rs
@@ -1,8 +1,8 @@
 use super::*;
 use crate::openhuman::approval::ApprovalManager;
 use crate::openhuman::config::AutonomyConfig;
-use crate::openhuman::providers::traits::ProviderCapabilities;
-use crate::openhuman::providers::ChatResponse;
+use crate::openhuman::inference::provider::traits::ProviderCapabilities;
+use crate::openhuman::inference::provider::ChatResponse;
 use crate::openhuman::security::AutonomyLevel;
 use crate::openhuman::tools::{ToolResult, ToolScope};
 use async_trait::async_trait;
@@ -388,7 +388,7 @@ async fn run_tool_call_loop_persists_native_tool_results_as_tool_messages() {
         responses: Mutex::new(vec![
             Ok(ChatResponse {
                 text: Some(String::new()),
-                tool_calls: vec![crate::openhuman::providers::ToolCall {
+                tool_calls: vec![crate::openhuman::inference::provider::ToolCall {
                     id: "call-1".into(),
                     name: "echo".into(),
                     arguments: "{}".into(),
diff --git a/src/openhuman/agent/multimodal.rs b/src/openhuman/agent/multimodal.rs
index c9195efbf7..921d334d2f 100644
--- a/src/openhuman/agent/multimodal.rs
+++ b/src/openhuman/agent/multimodal.rs
@@ -1,5 +1,5 @@
 use crate::openhuman::config::{build_runtime_proxy_client_with_timeouts, MultimodalConfig};
-use crate::openhuman::providers::ChatMessage;
+use crate::openhuman::inference::provider::ChatMessage;
 use base64::{engine::general_purpose::STANDARD, Engine as _};
 use reqwest::Client;
 use std::path::Path;
diff --git a/src/openhuman/agent/schemas.rs b/src/openhuman/agent/schemas.rs
index 12bfc853d6..3854076b4f 100644
--- a/src/openhuman/agent/schemas.rs
+++ b/src/openhuman/agent/schemas.rs
@@ -240,7 +240,7 @@ fn handle_chat(params: Map<String, Value>) -> ControllerFuture {
         let p = deserialize_params::<AgentChatParams>(params)?;
         let mut config = config_rpc::load_config_with_timeout().await?;
         to_json(
-            crate::openhuman::local_ai::rpc::agent_chat(
+            crate::openhuman::inference::local::rpc::agent_chat(
                 &mut config,
                 &p.message,
                 p.model_override,
@@ -256,7 +256,7 @@ fn handle_chat_simple(params: Map<String, Value>) -> ControllerFuture {
         let p = deserialize_params::<AgentChatParams>(params)?;
         let config = config_rpc::load_config_with_timeout().await?;
         to_json(
-            crate::openhuman::local_ai::rpc::agent_chat_simple(
+            crate::openhuman::inference::local::rpc::agent_chat_simple(
                 &config,
                 &p.message,
                 p.model_override,
diff --git a/src/openhuman/agent/stop_hooks.rs b/src/openhuman/agent/stop_hooks.rs
index f3ad0d759d..b89df26c23 100644
--- a/src/openhuman/agent/stop_hooks.rs
+++ b/src/openhuman/agent/stop_hooks.rs
@@ -180,7 +180,7 @@ impl StopHook for MaxIterationsStopHook {
 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::openhuman::providers::UsageInfo;
+    use crate::openhuman::inference::provider::UsageInfo;
 
     fn cost_with_usd(usd: f64) -> TurnCost {
         let mut tc = TurnCost::new();
diff --git a/src/openhuman/agent/tests.rs b/src/openhuman/agent/tests.rs
index 4cccb90154..125107f43f 100644
--- a/src/openhuman/agent/tests.rs
+++ b/src/openhuman/agent/tests.rs
@@ -29,11 +29,11 @@ use crate::openhuman::agent::dispatcher::{
 };
 use crate::openhuman::agent::harness::session::Agent;
 use crate::openhuman::config::{AgentConfig, MemoryConfig};
-use crate::openhuman::memory::{self, Memory};
-use crate::openhuman::providers::{
+use crate::openhuman::inference::provider::{
     ChatMessage, ChatRequest, ChatResponse, ConversationMessage, Provider, ToolCall,
     ToolResultMessage,
 };
+use crate::openhuman::memory::{self, Memory};
 use crate::openhuman::tools::{Tool, ToolResult};
 use anyhow::Result;
 use async_trait::async_trait;
diff --git a/src/openhuman/agent/triage/evaluator.rs b/src/openhuman/agent/triage/evaluator.rs
index 375c58ce7c..c69625f81f 100644
--- a/src/openhuman/agent/triage/evaluator.rs
+++ b/src/openhuman/agent/triage/evaluator.rs
@@ -38,10 +38,10 @@ use crate::openhuman::agent::harness::definition::{AgentDefinition, PromptSource
 use crate::openhuman::agent::harness::AgentDefinitionRegistry;
 use crate::openhuman::config::Config;
 use crate::openhuman::config::MultimodalConfig;
-use crate::openhuman::providers::reliable::{
+use crate::openhuman::inference::provider::reliable::{
     is_rate_limited, is_upstream_unhealthy, parse_retry_after_ms,
 };
-use crate::openhuman::providers::ChatMessage;
+use crate::openhuman::inference::provider::ChatMessage;
 use crate::openhuman::scheduler_gate::LlmPermit;
 
 use super::decision::{parse_triage_decision, ParseError, TriageDecision};
diff --git a/src/openhuman/agent/triage/evaluator_tests.rs b/src/openhuman/agent/triage/evaluator_tests.rs
index b9d9b74539..4a8a17dace 100644
--- a/src/openhuman/agent/triage/evaluator_tests.rs
+++ b/src/openhuman/agent/triage/evaluator_tests.rs
@@ -2,7 +2,7 @@ use super::*;
 use crate::openhuman::agent::agents::BUILTINS;
 use crate::openhuman::agent::bus::{mock_agent_run_turn, AgentTurnResponse};
 use crate::openhuman::agent::harness::AgentDefinitionRegistry;
-use crate::openhuman::providers::Provider;
+use crate::openhuman::inference::provider::Provider;
 use async_trait::async_trait;
 use serde_json::json;
 use std::sync::atomic::{AtomicUsize, Ordering};
diff --git a/src/openhuman/agent/triage/routing.rs b/src/openhuman/agent/triage/routing.rs
index d2b40aaef4..838863b8cf 100644
--- a/src/openhuman/agent/triage/routing.rs
+++ b/src/openhuman/agent/triage/routing.rs
@@ -14,7 +14,9 @@ use std::sync::Arc;
 use anyhow::Context;
 
 use crate::openhuman::config::Config;
-use crate::openhuman::providers::{self, Provider, ProviderRuntimeOptions, INFERENCE_BACKEND_ID};
+use crate::openhuman::inference::provider::{
+    self, Provider, ProviderRuntimeOptions, INFERENCE_BACKEND_ID,
+};
 
 /// The concrete provider + metadata that [`crate::openhuman::agent::triage::evaluator::run_triage`]
 /// should use for this particular triage turn.
@@ -66,7 +68,7 @@ pub async fn resolve_provider_with_config(config: &Config) -> anyhow::Result<Res
 /// `IntelligentRoutingProvider` so the same model that serves
 /// lightweight chat also serves the triage fallback.
 pub fn build_local_provider_with_config(config: &Config) -> Option<ResolvedProvider> {
-    use crate::openhuman::providers::compatible::{AuthStyle, OpenAiCompatibleProvider};
+    use crate::openhuman::inference::provider::compatible::{AuthStyle, OpenAiCompatibleProvider};
 
     let local_cfg = &config.local_ai;
     if !local_cfg.runtime_enabled {
@@ -100,7 +102,7 @@ pub fn build_local_provider_with_config(config: &Config) -> Option<ResolvedProvi
         };
         (label, base)
     } else {
-        let ollama_base = crate::openhuman::local_ai::ollama_base_url();
+        let ollama_base = crate::openhuman::inference::local::ollama_base_url();
         ("ollama", format!("{ollama_base}/v1"))
     };
 
@@ -136,7 +138,7 @@ pub fn build_local_provider_with_config(config: &Config) -> Option<ResolvedProvi
 // ── Provider builder ────────────────────────────────────────────────────
 
 /// Build the default remote routed backend provider. Same wiring as
-/// `local_ai::ops::agent_chat_simple` uses so we stay consistent with
+/// `inference::local::ops::agent_chat_simple` uses so we stay consistent with
 /// the existing direct-chat path.
 fn build_remote_provider(config: &Config) -> anyhow::Result<ResolvedProvider> {
     let default_model = config
@@ -149,7 +151,7 @@ fn build_remote_provider(config: &Config) -> anyhow::Result<ResolvedProvider> {
         secrets_encrypt: config.secrets.encrypt,
         reasoning_enabled: config.runtime.reasoning_enabled,
     };
-    let provider_box = providers::create_routed_provider_with_options(
+    let provider_box = provider::create_routed_provider_with_options(
         config.inference_url.as_deref(),
         config.api_url.as_deref(),
         config.api_key.as_deref(),
diff --git a/src/openhuman/app_state/ops.rs b/src/openhuman/app_state/ops.rs
index 309f9684ed..d94a49b86a 100644
--- a/src/openhuman/app_state/ops.rs
+++ b/src/openhuman/app_state/ops.rs
@@ -19,7 +19,7 @@ use crate::openhuman::autocomplete::AutocompleteStatus;
 use crate::openhuman::config::rpc as config_rpc;
 use crate::openhuman::config::Config;
 use crate::openhuman::credentials::session_support::build_session_state;
-use crate::openhuman::local_ai::LocalAiStatus;
+use crate::openhuman::inference::LocalAiStatus;
 use crate::openhuman::screen_intelligence::AccessibilityStatus;
 use crate::openhuman::service::{ServiceState, ServiceStatus};
 use crate::rpc::RpcOutcome;
@@ -414,7 +414,7 @@ async fn build_runtime_snapshot(config: &Config) -> RuntimeSnapshot {
         Ok(outcome) => outcome.value,
         Err(error) => {
             warn!("{LOG_PREFIX} local_ai status failed during snapshot: {error}");
-            crate::openhuman::local_ai::LocalAiStatus::disabled(config)
+            crate::openhuman::inference::LocalAiStatus::disabled(config)
         }
     };
 
diff --git a/src/openhuman/autocomplete/core/engine.rs b/src/openhuman/autocomplete/core/engine.rs
index 2051def25f..b471a26022 100644
--- a/src/openhuman/autocomplete/core/engine.rs
+++ b/src/openhuman/autocomplete/core/engine.rs
@@ -1,5 +1,5 @@
 use crate::openhuman::config::Config;
-use crate::openhuman::local_ai;
+use crate::openhuman::inference::local as local_ai;
 use chrono::Utc;
 use once_cell::sync::Lazy;
 use std::sync::{Arc, Once};
diff --git a/src/openhuman/channels/context.rs b/src/openhuman/channels/context.rs
index aee676fba5..6068de1ca5 100644
--- a/src/openhuman/channels/context.rs
+++ b/src/openhuman/channels/context.rs
@@ -1,7 +1,7 @@
 //! Shared channel runtime state and memory helpers.
 
+use crate::openhuman::inference::provider::{ChatMessage, Provider};
 use crate::openhuman::memory::Memory;
-use crate::openhuman::providers::{ChatMessage, Provider};
 use crate::openhuman::tools::Tool;
 use crate::openhuman::util::truncate_with_ellipsis;
 use std::collections::HashMap;
@@ -61,7 +61,8 @@ pub(crate) struct ChannelRuntimeContext {
     pub(crate) api_url: Option<String>,
     pub(crate) inference_url: Option<String>,
     pub(crate) reliability: Arc<crate::openhuman::config::ReliabilityConfig>,
-    pub(crate) provider_runtime_options: crate::openhuman::providers::ProviderRuntimeOptions,
+    pub(crate) provider_runtime_options:
+        crate::openhuman::inference::provider::ProviderRuntimeOptions,
     pub(crate) workspace_dir: Arc<PathBuf>,
     pub(crate) message_timeout_secs: u64,
     pub(crate) multimodal: crate::openhuman::config::MultimodalConfig,
@@ -207,8 +208,8 @@ pub(crate) async fn build_memory_context(
 mod tests {
     use super::*;
     use crate::openhuman::channels::traits;
+    use crate::openhuman::inference::provider::Provider;
     use crate::openhuman::memory::{Memory, MemoryCategory, MemoryEntry};
-    use crate::openhuman::providers::Provider;
     use crate::openhuman::tools::{Tool, ToolResult};
     use async_trait::async_trait;
 
@@ -344,8 +345,8 @@ mod tests {
             api_url: None,
             inference_url: None,
             reliability: Arc::new(crate::openhuman::config::ReliabilityConfig::default()),
-            provider_runtime_options: crate::openhuman::providers::ProviderRuntimeOptions::default(
-            ),
+            provider_runtime_options:
+                crate::openhuman::inference::provider::ProviderRuntimeOptions::default(),
             workspace_dir: Arc::new(PathBuf::from("/tmp")),
             message_timeout_secs: CHANNEL_MESSAGE_TIMEOUT_SECS,
             multimodal: crate::openhuman::config::MultimodalConfig::default(),
@@ -387,12 +388,12 @@ mod tests {
         let ctx = runtime_context();
         let sender = "discord_alice_reply_thread:thread-1";
         let mut history = Vec::new();
-        history.push(crate::openhuman::providers::ChatMessage::user("short"));
-        history.extend(
-            (0..20).map(|idx| {
-                crate::openhuman::providers::ChatMessage::assistant("x".repeat(700 + idx))
-            }),
-        );
+        history.push(crate::openhuman::inference::provider::ChatMessage::user(
+            "short",
+        ));
+        history.extend((0..20).map(|idx| {
+            crate::openhuman::inference::provider::ChatMessage::assistant("x".repeat(700 + idx))
+        }));
         ctx.conversation_histories
             .lock()
             .unwrap()
diff --git a/src/openhuman/channels/providers/web.rs b/src/openhuman/channels/providers/web.rs
index 39fcc6d82f..fc0d2c5781 100644
--- a/src/openhuman/channels/providers/web.rs
+++ b/src/openhuman/channels/providers/web.rs
@@ -194,7 +194,7 @@ fn extract_provider_error_detail(err: &str) -> Option<String> {
                 if trimmed.is_empty() {
                     return None;
                 }
-                let sanitized = crate::openhuman::providers::sanitize_api_error(trimmed);
+                let sanitized = crate::openhuman::inference::provider::sanitize_api_error(trimmed);
                 return Some(crate::openhuman::util::truncate_with_ellipsis(
                     &sanitized,
                     MAX_DETAIL_CHARS,
@@ -681,7 +681,10 @@ async fn run_chat_task(
         model_override: model_override.clone(),
         temperature,
         target_agent_id: target_agent_id.clone(),
-        provider_binding: crate::openhuman::providers::provider_for_role(provider_role, &config),
+        provider_binding: crate::openhuman::inference::provider::provider_for_role(
+            provider_role,
+            &config,
+        ),
     };
 
     let prior = {
@@ -803,7 +806,7 @@ async fn run_chat_task(
     // `thread_context::current_thread_id()` and forwards it on
     // `/openai/v1/chat/completions` so the backend can group
     // InferenceLog entries and reuse the KV cache for this thread.
-    let result = match crate::openhuman::providers::thread_context::with_thread_id(
+    let result = match crate::openhuman::inference::provider::thread_context::with_thread_id(
         thread_id.to_string(),
         agent.run_single(message),
     )
diff --git a/src/openhuman/channels/routes.rs b/src/openhuman/channels/routes.rs
index d9dfca444b..3daa0650b6 100644
--- a/src/openhuman/channels/routes.rs
+++ b/src/openhuman/channels/routes.rs
@@ -5,7 +5,7 @@ use super::context::{
 };
 use super::traits;
 use super::{Channel, SendMessage};
-use crate::openhuman::providers::{self, Provider};
+use crate::openhuman::inference::provider::{self, Provider};
 use serde::Deserialize;
 use std::fmt::Write;
 use std::path::Path;
@@ -83,7 +83,7 @@ fn resolve_provider_alias(name: &str) -> Option<String> {
         return None;
     }
 
-    let providers_list = providers::list_providers();
+    let providers_list = provider::list_providers();
     for provider in providers_list {
         if provider.name.eq_ignore_ascii_case(candidate)
             || provider
@@ -177,7 +177,7 @@ pub(crate) async fn get_or_create_provider(
         (None, None)
     };
 
-    let provider = providers::create_resilient_provider_with_options(
+    let provider = provider::create_resilient_provider_with_options(
         inference_url,
         backend_url,
         None,
@@ -237,7 +237,7 @@ fn build_providers_help_response(current: &ChannelRouteSelection) -> String {
     response.push_str("\nSwitch provider with `/models <provider>`.\n");
     response.push_str("Switch model with `/model <model-id>`.\n\n");
     response.push_str("Available providers:\n");
-    for provider in providers::list_providers() {
+    for provider in provider::list_providers() {
         if provider.aliases.is_empty() {
             let _ = writeln!(response, "- {}", provider.name);
         } else {
@@ -286,7 +286,7 @@ pub(crate) async fn handle_runtime_command_if_needed(
                         )
                     }
                     Err(err) => {
-                        let safe_err = providers::sanitize_api_error(&err.to_string());
+                        let safe_err = provider::sanitize_api_error(&err.to_string());
                         format!(
                             "Failed to initialize provider `{provider_name}`. Route unchanged.\nDetails: {safe_err}"
                         )
diff --git a/src/openhuman/channels/routes_tests.rs b/src/openhuman/channels/routes_tests.rs
index d5527767ff..06124b51ba 100644
--- a/src/openhuman/channels/routes_tests.rs
+++ b/src/openhuman/channels/routes_tests.rs
@@ -3,8 +3,8 @@ use crate::openhuman::channels::context::{
     ChannelRuntimeContext, ProviderCacheMap, RouteSelectionMap,
 };
 use crate::openhuman::channels::traits::ChannelMessage;
+use crate::openhuman::inference::provider::{ChatMessage, Provider};
 use crate::openhuman::memory::{Memory, MemoryCategory, MemoryEntry};
-use crate::openhuman::providers::{ChatMessage, Provider};
 use crate::openhuman::tools::{Tool, ToolResult};
 use async_trait::async_trait;
 use std::collections::HashMap;
@@ -147,7 +147,8 @@ fn runtime_context(workspace_dir: PathBuf) -> ChannelRuntimeContext {
         api_url: None,
         inference_url: None,
         reliability: Arc::new(crate::openhuman::config::ReliabilityConfig::default()),
-        provider_runtime_options: crate::openhuman::providers::ProviderRuntimeOptions::default(),
+        provider_runtime_options:
+            crate::openhuman::inference::provider::ProviderRuntimeOptions::default(),
         workspace_dir: Arc::new(workspace_dir),
         message_timeout_secs: 60,
         multimodal: crate::openhuman::config::MultimodalConfig::default(),
@@ -182,7 +183,7 @@ fn runtime_command_parsing_and_provider_support_are_channel_scoped() {
 
 #[test]
 fn provider_alias_and_route_selection_round_trip() {
-    let first_provider = providers::list_providers()
+    let first_provider = provider::list_providers()
         .into_iter()
         .next()
         .expect("provider registry should not be empty");
diff --git a/src/openhuman/channels/runtime/dispatch.rs b/src/openhuman/channels/runtime/dispatch.rs
index 08346515b8..72763c9348 100644
--- a/src/openhuman/channels/runtime/dispatch.rs
+++ b/src/openhuman/channels/runtime/dispatch.rs
@@ -20,7 +20,7 @@ use crate::openhuman::channels::traits;
 use crate::openhuman::channels::{Channel, SendMessage};
 use crate::openhuman::composio::fetch_connected_integrations;
 use crate::openhuman::config::Config;
-use crate::openhuman::providers::{self, ChatMessage};
+use crate::openhuman::inference::provider::{self, ChatMessage};
 use crate::openhuman::tools::{orchestrator_tools, Tool};
 use crate::openhuman::util::truncate_with_ellipsis;
 use std::collections::HashSet;
@@ -389,7 +389,7 @@ async fn resolve_target_agent(channel: &str) -> AgentScoping {
     };
 
     // Welcome is **desktop-app only**. The web channel has its own
-    // bespoke chat path (`channels::providers::web::run_chat_task` →
+    // bespoke chat path (`channels::provider::web::run_chat_task` →
     // `pick_target_agent_id`) that routes to the welcome agent while
     // `chat_onboarding_completed` is false. Every other channel
     // (telegram, slack, discord, mattermost, signal, …) flows through
@@ -798,7 +798,7 @@ pub(crate) async fn process_channel_message(
                     ("provider", route.provider.as_str()),
                 ],
             );
-            let safe_err = providers::sanitize_api_error(&err.to_string());
+            let safe_err = provider::sanitize_api_error(&err.to_string());
             let message = format!(
                 "⚠️ Failed to initialize provider `{}`. Please run `/models` to choose another provider.\nDetails: {safe_err}",
                 route.provider
diff --git a/src/openhuman/channels/runtime/startup.rs b/src/openhuman/channels/runtime/startup.rs
index 3fa680a905..d61b8c7009 100644
--- a/src/openhuman/channels/runtime/startup.rs
+++ b/src/openhuman/channels/runtime/startup.rs
@@ -31,8 +31,8 @@ use crate::openhuman::channels::whatsapp_web::WhatsAppWebChannel;
 use crate::openhuman::channels::Channel;
 use crate::openhuman::config::Config;
 use crate::openhuman::context::channels_prompt::build_system_prompt;
+use crate::openhuman::inference::provider::{self, Provider};
 use crate::openhuman::memory::{self, Memory};
-use crate::openhuman::providers::{self, Provider};
 use crate::openhuman::security::SecurityPolicy;
 use crate::openhuman::tools;
 use anyhow::Result;
@@ -154,13 +154,13 @@ pub async fn start_channels(config: Config) -> Result<()> {
     // in bootstrap_core_runtime() (src/core/jsonrpc.rs) to avoid double-registration
     // when both startup paths run in the same process.
 
-    let provider_runtime_options = providers::ProviderRuntimeOptions {
+    let provider_runtime_options = provider::ProviderRuntimeOptions {
         auth_profile_override: None,
         openhuman_dir: config.config_path.parent().map(std::path::PathBuf::from),
         secrets_encrypt: config.secrets.encrypt,
         reasoning_enabled: config.runtime.reasoning_enabled,
     };
-    let provider: Arc<dyn Provider> = Arc::from(providers::create_intelligent_routing_provider(
+    let provider: Arc<dyn Provider> = Arc::from(provider::create_intelligent_routing_provider(
         config.inference_url.as_deref(),
         config.api_url.as_deref(),
         config.api_key.as_deref(),
@@ -572,7 +572,7 @@ pub async fn start_channels(config: Config) -> Result<()> {
 
     println!("  🚦 In-flight message limit: {max_in_flight_messages}");
 
-    let provider_name = providers::INFERENCE_BACKEND_ID.to_string();
+    let provider_name = provider::INFERENCE_BACKEND_ID.to_string();
     let mut provider_cache_seed: HashMap<String, Arc<dyn Provider>> = HashMap::new();
     provider_cache_seed.insert(provider_name.clone(), Arc::clone(&provider));
     let message_timeout_secs =
diff --git a/src/openhuman/channels/tests/common.rs b/src/openhuman/channels/tests/common.rs
index 2adb0b71dc..7d389265e5 100644
--- a/src/openhuman/channels/tests/common.rs
+++ b/src/openhuman/channels/tests/common.rs
@@ -1,6 +1,6 @@
 use crate::openhuman::channels::{traits, Channel, SendMessage};
+use crate::openhuman::inference::provider::{ChatMessage, Provider};
 use crate::openhuman::memory::{Memory, MemoryCategory, MemoryEntry};
-use crate::openhuman::providers::{ChatMessage, Provider};
 use crate::openhuman::tools::{Tool, ToolResult};
 use std::sync::atomic::{AtomicUsize, Ordering};
 use std::sync::{Arc, Mutex};
diff --git a/src/openhuman/channels/tests/context.rs b/src/openhuman/channels/tests/context.rs
index 624e30345b..b36f4b5340 100644
--- a/src/openhuman/channels/tests/context.rs
+++ b/src/openhuman/channels/tests/context.rs
@@ -6,7 +6,7 @@ use super::super::context::{
     CHANNEL_MESSAGE_TIMEOUT_SECS, MIN_CHANNEL_MESSAGE_TIMEOUT_SECS,
 };
 use super::super::traits;
-use crate::openhuman::providers::ChatMessage;
+use crate::openhuman::inference::provider::ChatMessage;
 use std::collections::HashMap;
 use std::sync::{Arc, Mutex};
 
@@ -81,7 +81,7 @@ fn compact_sender_history_keeps_recent_truncated_messages() {
         inference_url: None,
         reliability: Arc::new(crate::openhuman::config::ReliabilityConfig::default()),
         multimodal: crate::openhuman::config::MultimodalConfig::default(),
-        provider_runtime_options: crate::openhuman::providers::ProviderRuntimeOptions::default(),
+        provider_runtime_options: crate::openhuman::inference::provider::ProviderRuntimeOptions::default(),
         workspace_dir: Arc::new(std::env::temp_dir()),
         message_timeout_secs: CHANNEL_MESSAGE_TIMEOUT_SECS,
     };
diff --git a/src/openhuman/channels/tests/discord_integration.rs b/src/openhuman/channels/tests/discord_integration.rs
index 5f8ee756b9..1fcc9a7e7f 100644
--- a/src/openhuman/channels/tests/discord_integration.rs
+++ b/src/openhuman/channels/tests/discord_integration.rs
@@ -31,7 +31,7 @@ use super::super::traits;
 use super::super::{Channel, SendMessage};
 use super::common::{HistoryCaptureProvider, NoopMemory};
 use crate::openhuman::agent::bus::{mock_agent_run_turn, AgentTurnResponse};
-use crate::openhuman::providers::{ChatMessage, Provider};
+use crate::openhuman::inference::provider::{ChatMessage, Provider};
 use std::collections::HashMap;
 use std::sync::atomic::{AtomicUsize, Ordering};
 use std::sync::{Arc, Mutex};
@@ -134,7 +134,8 @@ fn make_discord_ctx(
         api_url: None,
         inference_url: None,
         reliability: Arc::new(crate::openhuman::config::ReliabilityConfig::default()),
-        provider_runtime_options: crate::openhuman::providers::ProviderRuntimeOptions::default(),
+        provider_runtime_options:
+            crate::openhuman::inference::provider::ProviderRuntimeOptions::default(),
         workspace_dir: Arc::new(std::env::temp_dir()),
         message_timeout_secs: CHANNEL_MESSAGE_TIMEOUT_SECS,
         multimodal: crate::openhuman::config::MultimodalConfig::default(),
diff --git a/src/openhuman/channels/tests/memory.rs b/src/openhuman/channels/tests/memory.rs
index 64b55c3c05..5f96434a42 100644
--- a/src/openhuman/channels/tests/memory.rs
+++ b/src/openhuman/channels/tests/memory.rs
@@ -6,8 +6,8 @@ use super::super::runtime::process_channel_message;
 use super::super::{traits, Channel};
 use super::common::{HistoryCaptureProvider, NoopMemory, RecordingChannel};
 use crate::openhuman::embeddings::NoopEmbedding;
+use crate::openhuman::inference::provider;
 use crate::openhuman::memory::{Memory, MemoryCategory, UnifiedMemory};
-use crate::openhuman::providers;
 use std::collections::HashMap;
 use std::sync::{Arc, Mutex};
 use tempfile::TempDir;
@@ -153,7 +153,7 @@ async fn process_channel_message_restores_per_sender_history_on_follow_ups() {
         api_url: None,
         inference_url: None,
         reliability: Arc::new(crate::openhuman::config::ReliabilityConfig::default()),
-        provider_runtime_options: providers::ProviderRuntimeOptions::default(),
+        provider_runtime_options: provider::ProviderRuntimeOptions::default(),
         workspace_dir: Arc::new(std::env::temp_dir()),
         message_timeout_secs: CHANNEL_MESSAGE_TIMEOUT_SECS,
         multimodal: crate::openhuman::config::MultimodalConfig::default(),
@@ -236,7 +236,7 @@ async fn process_channel_message_uses_autosaved_memory_after_history_is_cleared(
         api_url: None,
         inference_url: None,
         reliability: Arc::new(crate::openhuman::config::ReliabilityConfig::default()),
-        provider_runtime_options: providers::ProviderRuntimeOptions::default(),
+        provider_runtime_options: provider::ProviderRuntimeOptions::default(),
         workspace_dir: Arc::new(std::env::temp_dir()),
         message_timeout_secs: CHANNEL_MESSAGE_TIMEOUT_SECS,
         multimodal: crate::openhuman::config::MultimodalConfig::default(),
diff --git a/src/openhuman/channels/tests/runtime_dispatch.rs b/src/openhuman/channels/tests/runtime_dispatch.rs
index 6557823a4e..c9a919d7fb 100644
--- a/src/openhuman/channels/tests/runtime_dispatch.rs
+++ b/src/openhuman/channels/tests/runtime_dispatch.rs
@@ -3,7 +3,7 @@ use super::super::runtime::{process_channel_message, run_message_dispatch_loop};
 use super::super::{traits, Channel};
 use super::common::{use_real_agent_handler, NoopMemory, RecordingChannel, SlowProvider};
 use crate::openhuman::agent::bus::{mock_agent_run_turn, AgentTurnRequest, AgentTurnResponse};
-use crate::openhuman::providers;
+use crate::openhuman::inference::provider;
 use std::collections::HashMap;
 use std::sync::atomic::Ordering;
 use std::sync::{Arc, Mutex};
@@ -49,7 +49,7 @@ async fn message_dispatch_processes_messages_in_parallel() {
         api_url: None,
         inference_url: None,
         reliability: Arc::new(crate::openhuman::config::ReliabilityConfig::default()),
-        provider_runtime_options: providers::ProviderRuntimeOptions::default(),
+        provider_runtime_options: provider::ProviderRuntimeOptions::default(),
         workspace_dir: Arc::new(std::env::temp_dir()),
         message_timeout_secs: CHANNEL_MESSAGE_TIMEOUT_SECS,
         multimodal: crate::openhuman::config::MultimodalConfig::default(),
@@ -123,7 +123,7 @@ async fn process_channel_message_cancels_scoped_typing_task() {
         api_url: None,
         inference_url: None,
         reliability: Arc::new(crate::openhuman::config::ReliabilityConfig::default()),
-        provider_runtime_options: providers::ProviderRuntimeOptions::default(),
+        provider_runtime_options: provider::ProviderRuntimeOptions::default(),
         workspace_dir: Arc::new(std::env::temp_dir()),
         message_timeout_secs: CHANNEL_MESSAGE_TIMEOUT_SECS,
         multimodal: crate::openhuman::config::MultimodalConfig::default(),
@@ -211,7 +211,7 @@ async fn dispatch_routes_through_agent_run_turn_bus_handler() {
         api_url: None,
         inference_url: None,
         reliability: Arc::new(crate::openhuman::config::ReliabilityConfig::default()),
-        provider_runtime_options: providers::ProviderRuntimeOptions::default(),
+        provider_runtime_options: provider::ProviderRuntimeOptions::default(),
         workspace_dir: Arc::new(std::env::temp_dir()),
         message_timeout_secs: CHANNEL_MESSAGE_TIMEOUT_SECS,
         multimodal: crate::openhuman::config::MultimodalConfig::default(),
diff --git a/src/openhuman/channels/tests/runtime_tool_calls.rs b/src/openhuman/channels/tests/runtime_tool_calls.rs
index 3f3cdaee9c..eda9153914 100644
--- a/src/openhuman/channels/tests/runtime_tool_calls.rs
+++ b/src/openhuman/channels/tests/runtime_tool_calls.rs
@@ -8,7 +8,7 @@ use super::common::{
     IterativeToolProvider, MockPriceTool, ModelCaptureProvider, NoopMemory, RecordingChannel,
     TelegramRecordingChannel, ToolCallingAliasProvider, ToolCallingProvider,
 };
-use crate::openhuman::providers::{self, Provider};
+use crate::openhuman::inference::provider::{self, Provider};
 use std::collections::HashMap;
 use std::sync::atomic::Ordering;
 use std::sync::{Arc, Mutex};
@@ -39,7 +39,7 @@ async fn process_channel_message_executes_tool_calls_instead_of_sending_raw_json
         api_url: None,
         inference_url: None,
         reliability: Arc::new(crate::openhuman::config::ReliabilityConfig::default()),
-        provider_runtime_options: providers::ProviderRuntimeOptions::default(),
+        provider_runtime_options: provider::ProviderRuntimeOptions::default(),
         workspace_dir: Arc::new(std::env::temp_dir()),
         message_timeout_secs: CHANNEL_MESSAGE_TIMEOUT_SECS,
         multimodal: crate::openhuman::config::MultimodalConfig::default(),
@@ -94,7 +94,7 @@ async fn process_channel_message_executes_tool_calls_with_alias_tags() {
         api_url: None,
         inference_url: None,
         reliability: Arc::new(crate::openhuman::config::ReliabilityConfig::default()),
-        provider_runtime_options: providers::ProviderRuntimeOptions::default(),
+        provider_runtime_options: provider::ProviderRuntimeOptions::default(),
         workspace_dir: Arc::new(std::env::temp_dir()),
         message_timeout_secs: CHANNEL_MESSAGE_TIMEOUT_SECS,
         multimodal: crate::openhuman::config::MultimodalConfig::default(),
@@ -158,7 +158,7 @@ async fn process_channel_message_handles_models_command_without_llm_call() {
         api_url: None,
         inference_url: None,
         reliability: Arc::new(crate::openhuman::config::ReliabilityConfig::default()),
-        provider_runtime_options: providers::ProviderRuntimeOptions::default(),
+        provider_runtime_options: provider::ProviderRuntimeOptions::default(),
         workspace_dir: Arc::new(std::env::temp_dir()),
         message_timeout_secs: CHANNEL_MESSAGE_TIMEOUT_SECS,
         multimodal: crate::openhuman::config::MultimodalConfig::default(),
@@ -249,7 +249,7 @@ async fn process_channel_message_uses_route_override_provider_and_model() {
         api_url: None,
         inference_url: None,
         reliability: Arc::new(crate::openhuman::config::ReliabilityConfig::default()),
-        provider_runtime_options: providers::ProviderRuntimeOptions::default(),
+        provider_runtime_options: provider::ProviderRuntimeOptions::default(),
         workspace_dir: Arc::new(std::env::temp_dir()),
         message_timeout_secs: CHANNEL_MESSAGE_TIMEOUT_SECS,
         multimodal: crate::openhuman::config::MultimodalConfig::default(),
@@ -298,7 +298,7 @@ async fn process_channel_message_respects_configured_max_tool_iterations_above_d
         api_url: None,
         inference_url: None,
         reliability: Arc::new(crate::openhuman::config::ReliabilityConfig::default()),
-        provider_runtime_options: providers::ProviderRuntimeOptions::default(),
+        provider_runtime_options: provider::ProviderRuntimeOptions::default(),
         workspace_dir: Arc::new(std::env::temp_dir()),
         message_timeout_secs: CHANNEL_MESSAGE_TIMEOUT_SECS,
         multimodal: crate::openhuman::config::MultimodalConfig::default(),
@@ -354,7 +354,7 @@ async fn process_channel_message_reports_configured_max_tool_iterations_limit()
         api_url: None,
         inference_url: None,
         reliability: Arc::new(crate::openhuman::config::ReliabilityConfig::default()),
-        provider_runtime_options: providers::ProviderRuntimeOptions::default(),
+        provider_runtime_options: provider::ProviderRuntimeOptions::default(),
         workspace_dir: Arc::new(std::env::temp_dir()),
         message_timeout_secs: CHANNEL_MESSAGE_TIMEOUT_SECS,
         multimodal: crate::openhuman::config::MultimodalConfig::default(),
diff --git a/src/openhuman/channels/tests/telegram_integration.rs b/src/openhuman/channels/tests/telegram_integration.rs
index 16bb3e3f67..9199786a75 100644
--- a/src/openhuman/channels/tests/telegram_integration.rs
+++ b/src/openhuman/channels/tests/telegram_integration.rs
@@ -13,7 +13,7 @@ use super::super::traits;
 use super::super::{Channel, SendMessage};
 use super::common::{NoopMemory, SlowProvider};
 use crate::openhuman::agent::bus::{mock_agent_run_turn, AgentTurnResponse};
-use crate::openhuman::providers::{ChatMessage, Provider};
+use crate::openhuman::inference::provider::{ChatMessage, Provider};
 use std::collections::HashMap;
 use std::sync::atomic::{AtomicUsize, Ordering};
 use std::sync::{Arc, Mutex};
@@ -110,7 +110,8 @@ fn make_test_context(
         api_url: None,
         inference_url: None,
         reliability: Arc::new(crate::openhuman::config::ReliabilityConfig::default()),
-        provider_runtime_options: crate::openhuman::providers::ProviderRuntimeOptions::default(),
+        provider_runtime_options:
+            crate::openhuman::inference::provider::ProviderRuntimeOptions::default(),
         workspace_dir: Arc::new(std::env::temp_dir()),
         message_timeout_secs: CHANNEL_MESSAGE_TIMEOUT_SECS,
         multimodal: crate::openhuman::config::MultimodalConfig::default(),
diff --git a/src/openhuman/config/ops.rs b/src/openhuman/config/ops.rs
index a681d1b5be..1138d9ee81 100644
--- a/src/openhuman/config/ops.rs
+++ b/src/openhuman/config/ops.rs
@@ -740,7 +740,7 @@ pub async fn apply_local_ai_settings(
     }
     if let Some(provider) = update.provider {
         config.local_ai.provider =
-            crate::openhuman::local_ai::provider::normalize_provider(&provider);
+            crate::openhuman::inference::local::provider::normalize_provider(&provider);
     }
     if let Some(base_url) = update.base_url {
         config.local_ai.base_url = if base_url.trim().is_empty() {
diff --git a/src/openhuman/config/schema/cloud_providers.rs b/src/openhuman/config/schema/cloud_providers.rs
index a18774c358..9afefbad02 100644
--- a/src/openhuman/config/schema/cloud_providers.rs
+++ b/src/openhuman/config/schema/cloud_providers.rs
@@ -2,7 +2,7 @@
 //!
 //! Each entry in `Config::cloud_providers` represents one configured LLM
 //! backend. Providers are keyed by a user-chosen `slug` (e.g. `"openai"`,
-//! `"my-deepseek"`). The factory in `crate::openhuman::providers::factory`
+//! `"my-deepseek"`). The factory in `crate::openhuman::inference::provider::factory`
 //! resolves workload-to-provider strings against this list at runtime using
 //! the grammar `"<slug>:<model>"`.
 //!
@@ -47,7 +47,7 @@ impl AuthStyle {
 /// `auth-profiles.json` via [`crate::openhuman::credentials::AuthService`],
 /// keyed by `provider:<slug>` (falling back to bare `<slug>` for legacy
 /// entries). The factory looks up the token at call time via
-/// [`crate::openhuman::providers::factory::auth_key_for_slug`].
+/// [`crate::openhuman::inference::provider::factory::auth_key_for_slug`].
 ///
 /// ## Back-compat
 ///
diff --git a/src/openhuman/config/schema/load.rs b/src/openhuman/config/schema/load.rs
index fd52ee0609..158b3bc686 100644
--- a/src/openhuman/config/schema/load.rs
+++ b/src/openhuman/config/schema/load.rs
@@ -1096,9 +1096,9 @@ impl Config {
             let tier_str = tier_str.trim().to_ascii_lowercase();
             if !tier_str.is_empty() {
                 if let Some(tier) =
-                    crate::openhuman::local_ai::presets::ModelTier::from_str_opt(&tier_str)
+                    crate::openhuman::inference::presets::ModelTier::from_str_opt(&tier_str)
                 {
-                    if tier == crate::openhuman::local_ai::presets::ModelTier::Custom {
+                    if tier == crate::openhuman::inference::presets::ModelTier::Custom {
                         tracing::warn!(
                             tier = %tier_str,
                             "ignoring custom OPENHUMAN_LOCAL_AI_TIER; only built-in presets are supported"
@@ -1109,7 +1109,7 @@ impl Config {
                             "ignoring OPENHUMAN_LOCAL_AI_TIER outside the 1B local-model allowlist"
                         );
                     } else {
-                        crate::openhuman::local_ai::presets::apply_preset_to_config(
+                        crate::openhuman::inference::presets::apply_preset_to_config(
                             &mut self.local_ai,
                             tier,
                         );
diff --git a/src/openhuman/context/guard.rs b/src/openhuman/context/guard.rs
index d23a8d82ea..0f35797749 100644
--- a/src/openhuman/context/guard.rs
+++ b/src/openhuman/context/guard.rs
@@ -4,7 +4,7 @@
 //! when usage exceeds a threshold. A circuit breaker disables compaction after
 //! consecutive failures to prevent infinite retry loops.
 
-use crate::openhuman::providers::UsageInfo;
+use crate::openhuman::inference::provider::UsageInfo;
 
 /// Threshold (0.0–1.0) at which auto-compaction is triggered.
 pub(crate) const COMPACTION_TRIGGER_THRESHOLD: f64 = 0.90;
diff --git a/src/openhuman/context/manager.rs b/src/openhuman/context/manager.rs
index f86c5d60b6..6cc1fbfce1 100644
--- a/src/openhuman/context/manager.rs
+++ b/src/openhuman/context/manager.rs
@@ -37,7 +37,7 @@ use super::prompt::{PromptContext, SystemPromptBuilder};
 use super::session_memory::SessionMemoryConfig;
 use super::summarizer::{Summarizer, SummaryStats};
 use crate::openhuman::config::ContextConfig;
-use crate::openhuman::providers::{ConversationMessage, UsageInfo};
+use crate::openhuman::inference::provider::{ConversationMessage, UsageInfo};
 use anyhow::Result;
 
 /// Outcome of a reduction pass driven by [`ContextManager::reduce_before_call`].
diff --git a/src/openhuman/context/manager_tests.rs b/src/openhuman/context/manager_tests.rs
index c12278a687..2622cacef5 100644
--- a/src/openhuman/context/manager_tests.rs
+++ b/src/openhuman/context/manager_tests.rs
@@ -1,5 +1,5 @@
 use super::*;
-use crate::openhuman::providers::{ChatMessage, ToolCall, ToolResultMessage};
+use crate::openhuman::inference::provider::{ChatMessage, ToolCall, ToolResultMessage};
 use async_trait::async_trait;
 use std::sync::Mutex;
 
diff --git a/src/openhuman/context/microcompact.rs b/src/openhuman/context/microcompact.rs
index 587bb70261..084e93c8af 100644
--- a/src/openhuman/context/microcompact.rs
+++ b/src/openhuman/context/microcompact.rs
@@ -23,7 +23,7 @@
 //! otherwise be too large to fit — the pipeline orchestrator handles
 //! gating.
 
-use crate::openhuman::providers::ConversationMessage;
+use crate::openhuman::inference::provider::ConversationMessage;
 
 /// Placeholder used in place of cleared tool-result bodies. Must be
 /// stable across versions so callers can pattern-match on it for
@@ -102,7 +102,7 @@ pub fn microcompact(history: &mut [ConversationMessage], keep_recent: usize) ->
 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::openhuman::providers::{ChatMessage, ToolCall, ToolResultMessage};
+    use crate::openhuman::inference::provider::{ChatMessage, ToolCall, ToolResultMessage};
 
     fn user(text: &str) -> ConversationMessage {
         ConversationMessage::Chat(ChatMessage::user(text))
diff --git a/src/openhuman/context/pipeline.rs b/src/openhuman/context/pipeline.rs
index 6a3eb7d208..e5a96a5080 100644
--- a/src/openhuman/context/pipeline.rs
+++ b/src/openhuman/context/pipeline.rs
@@ -36,7 +36,7 @@
 use super::guard::{ContextCheckResult, ContextGuard};
 use super::microcompact::{microcompact, MicrocompactStats, DEFAULT_KEEP_RECENT_TOOL_RESULTS};
 use super::session_memory::{SessionMemoryConfig, SessionMemoryState};
-use crate::openhuman::providers::{ConversationMessage, UsageInfo};
+use crate::openhuman::inference::provider::{ConversationMessage, UsageInfo};
 use std::sync::{Arc, Mutex};
 
 /// Shared handle to a [`SessionMemoryState`] so both the synchronous
@@ -257,7 +257,7 @@ impl ContextPipeline {
 mod tests {
     use super::super::microcompact::CLEARED_PLACEHOLDER;
     use super::*;
-    use crate::openhuman::providers::{
+    use crate::openhuman::inference::provider::{
         ChatMessage, ConversationMessage, ToolCall, ToolResultMessage, UsageInfo,
     };
 
diff --git a/src/openhuman/context/summarizer.rs b/src/openhuman/context/summarizer.rs
index 608938e638..145040bf28 100644
--- a/src/openhuman/context/summarizer.rs
+++ b/src/openhuman/context/summarizer.rs
@@ -28,7 +28,7 @@
 //! complete turns.
 
 use super::microcompact::MicrocompactStats;
-use crate::openhuman::providers::{ChatMessage, ConversationMessage, Provider};
+use crate::openhuman::inference::provider::{ChatMessage, ConversationMessage, Provider};
 use anyhow::Result;
 use async_trait::async_trait;
 use std::fmt::Write as _;
diff --git a/src/openhuman/context/summarizer_tests.rs b/src/openhuman/context/summarizer_tests.rs
index 7812a55cee..bc81a343c6 100644
--- a/src/openhuman/context/summarizer_tests.rs
+++ b/src/openhuman/context/summarizer_tests.rs
@@ -1,5 +1,5 @@
 use super::*;
-use crate::openhuman::providers::{ChatResponse, ToolCall, ToolResultMessage};
+use crate::openhuman::inference::provider::{ChatResponse, ToolCall, ToolResultMessage};
 use async_trait::async_trait;
 use std::sync::Mutex;
 
@@ -75,7 +75,7 @@ impl Provider for StubProvider {
 
     async fn chat(
         &self,
-        _request: crate::openhuman::providers::ChatRequest<'_>,
+        _request: crate::openhuman::inference::provider::ChatRequest<'_>,
         _model: &str,
         _temperature: f64,
     ) -> anyhow::Result<ChatResponse> {
diff --git a/src/openhuman/credentials/ops.rs b/src/openhuman/credentials/ops.rs
index 0c27ccd53d..d29e32e18b 100644
--- a/src/openhuman/credentials/ops.rs
+++ b/src/openhuman/credentials/ops.rs
@@ -26,7 +26,7 @@ use crate::openhuman::memory::conversations;
 pub async fn start_login_gated_services(config: &Config) {
     // 1. Local AI (Ollama, whisper, embeddings)
     if config.local_ai.runtime_enabled {
-        let service = crate::openhuman::local_ai::global(config);
+        let service = crate::openhuman::inference::local::global(config);
         service.bootstrap(config).await;
         log::info!("[services] local AI bootstrapped after login");
     }
@@ -78,7 +78,7 @@ pub async fn stop_login_gated_services(config: &Config) {
     //    (it may be serving other clients or mid-download), but we clear
     //    the internal state so it re-bootstraps on next login.
     if config.local_ai.runtime_enabled {
-        let service = crate::openhuman::local_ai::global(config);
+        let service = crate::openhuman::inference::local::global(config);
         service.reset_to_idle(config);
         log::info!("[services] local AI reset to idle on logout");
     }
diff --git a/src/openhuman/cron/scheduler.rs b/src/openhuman/cron/scheduler.rs
index 3a90ab3596..35b4055203 100644
--- a/src/openhuman/cron/scheduler.rs
+++ b/src/openhuman/cron/scheduler.rs
@@ -252,7 +252,7 @@ async fn run_agent_job(config: &Config, job: &CronJob) -> (bool, String, Option<
                     .unwrap_or_else(|| crate::openhuman::config::DEFAULT_MODEL.to_string());
                 let resolved_model = match &def.model {
                     ModelSpec::Hint(workload) => {
-                        match crate::openhuman::providers::create_chat_provider(
+                        match crate::openhuman::inference::provider::create_chat_provider(
                             workload, &effective,
                         ) {
                             Ok((_, m)) => {
diff --git a/src/openhuman/doctor/core.rs b/src/openhuman/doctor/core.rs
index 100108a872..29a1325be2 100644
--- a/src/openhuman/doctor/core.rs
+++ b/src/openhuman/doctor/core.rs
@@ -123,7 +123,7 @@ pub struct ModelProbeReport {
 }
 
 fn doctor_model_targets() -> Vec<String> {
-    crate::openhuman::providers::list_providers()
+    crate::openhuman::inference::provider::list_providers()
         .into_iter()
         .map(|provider| provider.name.to_string())
         .collect()
diff --git a/src/openhuman/embeddings/cloud.rs b/src/openhuman/embeddings/cloud.rs
index 3b674036ca..83722a8383 100644
--- a/src/openhuman/embeddings/cloud.rs
+++ b/src/openhuman/embeddings/cloud.rs
@@ -9,7 +9,7 @@
 //!
 //! The JWT and API URL are resolved per call so a session refresh between
 //! embed batches is picked up transparently — matching
-//! [`crate::openhuman::providers::openhuman_backend::OpenHumanBackendProvider`].
+//! [`crate::openhuman::inference::provider::openhuman_backend::OpenHumanBackendProvider`].
 
 use std::path::PathBuf;
 
diff --git a/src/openhuman/embeddings/factory.rs b/src/openhuman/embeddings/factory.rs
index a3e7da13e0..00bcb69c74 100644
--- a/src/openhuman/embeddings/factory.rs
+++ b/src/openhuman/embeddings/factory.rs
@@ -30,7 +30,7 @@ pub fn create_embedding_provider(
             None, None, true, model, dims,
         ))),
         "ollama" => {
-            let base_url = crate::openhuman::local_ai::ollama_base_url();
+            let base_url = crate::openhuman::inference::local::ollama_base_url();
             Ok(Box::new(OllamaEmbedding::try_new(&base_url, model, dims)?))
         }
         "openai" => Ok(Box::new(OpenAiEmbedding::new(
diff --git a/src/openhuman/local_ai/device.rs b/src/openhuman/inference/device.rs
similarity index 100%
rename from src/openhuman/local_ai/device.rs
rename to src/openhuman/inference/device.rs
diff --git a/src/openhuman/inference/http/mod.rs b/src/openhuman/inference/http/mod.rs
new file mode 100644
index 0000000000..775ee9c359
--- /dev/null
+++ b/src/openhuman/inference/http/mod.rs
@@ -0,0 +1,14 @@
+//! OpenAI-compatible HTTP endpoint at `/v1/chat/completions` and `/v1/models`.
+//!
+//! ## Mounting
+//!
+//! The router is mounted by `src/core/jsonrpc.rs`:
+//! ```ignore
+//! .nest("/v1", crate::openhuman::inference::http::router())
+//! ```
+//! It inherits the same bearer-token auth middleware that guards `/rpc`.
+
+pub mod server;
+pub mod types;
+
+pub use server::router;
diff --git a/src/openhuman/inference/http/server.rs b/src/openhuman/inference/http/server.rs
new file mode 100644
index 0000000000..a12da7bdba
--- /dev/null
+++ b/src/openhuman/inference/http/server.rs
@@ -0,0 +1,281 @@
+//! OpenAI-compatible HTTP handlers for `/v1/chat/completions` and `/v1/models`.
+//!
+//! ## Mounting
+//!
+//! The router returned by [`router()`] is merged into the core axum server
+//! in `src/core/jsonrpc.rs` via `.nest("/v1", inference::http::router())`.
+//! It reuses the same bearer-token auth middleware that guards `/rpc`.
+//!
+//! ## Authentication
+//!
+//! All routes require `Authorization: Bearer <OPENHUMAN_CORE_TOKEN>` — the
+//! same per-launch token used by the JSON-RPC endpoint. Missing or wrong
+//! tokens get a `401 Unauthorized` from the shared middleware.
+//!
+//! ## Provider routing
+//!
+//! The `model` field in the request selects the provider:
+//! - `"ollama:<model>"` or a bare model name → local Ollama
+//! - `"<slug>:<model>"` → cloud provider entry by slug
+//! - everything else → OpenHuman backend (session JWT)
+
+use axum::http::StatusCode;
+use axum::response::sse::{Event, KeepAlive, Sse};
+use axum::response::{IntoResponse, Response};
+use axum::routing::{get, post};
+use axum::{extract::State, Json, Router};
+use futures_util::stream::{self, StreamExt};
+use serde_json::json;
+use tracing::{debug, error};
+
+use crate::core::types::AppState;
+use crate::openhuman::config::Config;
+use crate::openhuman::inference::provider;
+use crate::openhuman::inference::provider::traits::ChatMessage;
+
+use super::types::{
+    ChatCompletionChoice, ChatCompletionChunk, ChatCompletionChunkChoice, ChatCompletionDelta,
+    ChatCompletionMessage, ChatCompletionRequest, ChatCompletionResponse, ChatCompletionUsage,
+    ModelObject, ModelsResponse,
+};
+
+const LOG_PREFIX: &str = "[inference::http]";
+
+/// Build the `/v1` axum sub-router.
+pub fn router() -> Router<AppState> {
+    Router::new()
+        .route("/chat/completions", post(chat_completions_handler))
+        .route("/models", get(models_handler))
+}
+
+/// `POST /v1/chat/completions`
+///
+/// Accepts an OpenAI-compatible request body. Routes through the unified
+/// `Provider` trait — local (Ollama) for `ollama:*` model names, cloud otherwise.
+async fn chat_completions_handler(
+    State(_state): State<AppState>,
+    Json(req): Json<ChatCompletionRequest>,
+) -> Response {
+    debug!(
+        model = %req.model,
+        stream = req.stream,
+        message_count = req.messages.len(),
+        "{LOG_PREFIX} chat_completions: start"
+    );
+
+    let config = match Config::load_or_init().await {
+        Ok(c) => c,
+        Err(e) => {
+            error!("{LOG_PREFIX} chat_completions: config load failed: {e}");
+            return (
+                StatusCode::INTERNAL_SERVER_ERROR,
+                Json(json!({ "error": { "message": format!("config load failed: {e}"), "type": "internal_error" }})),
+            )
+                .into_response();
+        }
+    };
+
+    // Build provider string from model name.
+    // If the model already looks like a provider string, use it directly.
+    // Otherwise treat a bare model name as an Ollama model.
+    let provider_string = if req.model.starts_with("ollama:")
+        || req.model.contains(':')
+        || req.model == "openhuman"
+    {
+        req.model.clone()
+    } else {
+        // Bare model name (no colon) — route to Ollama local runtime.
+        format!("ollama:{}", req.model)
+    };
+
+    let (provider_box, model_id) = match provider::factory::create_chat_provider_from_string(
+        "agentic",
+        &provider_string,
+        &config,
+    ) {
+        Ok(pair) => pair,
+        Err(e) => {
+            error!("{LOG_PREFIX} chat_completions: provider build failed: {e}");
+            return (
+                    StatusCode::BAD_REQUEST,
+                    Json(json!({ "error": { "message": format!("provider error: {e}"), "type": "invalid_request_error" }})),
+                )
+                    .into_response();
+        }
+    };
+
+    // Map request messages to provider ChatMessage type.
+    let messages: Vec<ChatMessage> = req
+        .messages
+        .iter()
+        .map(|m| ChatMessage {
+            id: None,
+            role: m.role.clone(),
+            content: m.content.clone(),
+            extra_metadata: None,
+        })
+        .collect();
+
+    let temperature = req.temperature.unwrap_or(config.default_temperature);
+    let completion_id = format!("chatcmpl-{}", uuid::Uuid::new_v4());
+    let created = chrono::Utc::now().timestamp();
+    let model_name = req.model.clone();
+
+    if req.stream {
+        // Streaming response via SSE
+        let options = provider::traits::StreamOptions::new(true);
+        let stream =
+            provider_box.stream_chat_with_history(&messages, &model_id, temperature, options);
+
+        let cid = completion_id.clone();
+        let model_clone = model_name.clone();
+        let event_stream = stream
+            .enumerate()
+            .map(move |(i, chunk_result)| {
+                let cid = cid.clone();
+                let model_clone = model_clone.clone();
+                match chunk_result {
+                    Ok(chunk) => {
+                        let finish_reason = if chunk.is_final { Some("stop") } else { None };
+                        let content = if chunk.delta.is_empty() && chunk.is_final {
+                            None
+                        } else {
+                            Some(chunk.delta)
+                        };
+                        let sse_chunk = ChatCompletionChunk {
+                            id: cid,
+                            object: "chat.completion.chunk",
+                            created,
+                            model: model_clone,
+                            choices: vec![ChatCompletionChunkChoice {
+                                index: 0,
+                                delta: ChatCompletionDelta {
+                                    role: if i == 0 {
+                                        Some("assistant".to_string())
+                                    } else {
+                                        None
+                                    },
+                                    content,
+                                },
+                                finish_reason,
+                            }],
+                        };
+                        let data =
+                            serde_json::to_string(&sse_chunk).unwrap_or_else(|_| "{}".to_string());
+                        Ok::<Event, std::convert::Infallible>(Event::default().data(data))
+                    }
+                    Err(e) => {
+                        let err_event = json!({
+                            "error": { "message": e.to_string(), "type": "stream_error" }
+                        });
+                        Ok(Event::default()
+                            .data(serde_json::to_string(&err_event).unwrap_or_default()))
+                    }
+                }
+            })
+            .chain(stream::once(async {
+                Ok::<Event, std::convert::Infallible>(Event::default().data("[DONE]"))
+            }));
+
+        debug!("{LOG_PREFIX} chat_completions: streaming response started");
+        return Sse::new(event_stream)
+            .keep_alive(KeepAlive::default())
+            .into_response();
+    }
+
+    // Non-streaming: call chat_with_history
+    match provider_box
+        .chat_with_history(&messages, &model_id, temperature)
+        .await
+    {
+        Ok(content) => {
+            debug!("{LOG_PREFIX} chat_completions: non-streaming ok");
+            let response = ChatCompletionResponse {
+                id: completion_id,
+                object: "chat.completion",
+                created,
+                model: model_name,
+                choices: vec![ChatCompletionChoice {
+                    index: 0,
+                    message: ChatCompletionMessage {
+                        role: "assistant".to_string(),
+                        content,
+                    },
+                    finish_reason: "stop",
+                }],
+                usage: ChatCompletionUsage {
+                    prompt_tokens: 0,
+                    completion_tokens: 0,
+                    total_tokens: 0,
+                },
+            };
+            (StatusCode::OK, Json(response)).into_response()
+        }
+        Err(e) => {
+            error!("{LOG_PREFIX} chat_completions: inference failed: {e}");
+            (
+                StatusCode::INTERNAL_SERVER_ERROR,
+                Json(json!({ "error": { "message": format!("inference error: {e}"), "type": "internal_error" }})),
+            )
+                .into_response()
+        }
+    }
+}
+
+/// `GET /v1/models`
+///
+/// Lists all configured models (local Ollama + cloud providers).
+async fn models_handler(State(_state): State<AppState>) -> Response {
+    debug!("{LOG_PREFIX} models: start");
+
+    let config = match Config::load_or_init().await {
+        Ok(c) => c,
+        Err(e) => {
+            error!("{LOG_PREFIX} models: config load failed: {e}");
+            return (
+                StatusCode::INTERNAL_SERVER_ERROR,
+                Json(json!({ "error": { "message": format!("config load failed: {e}") }})),
+            )
+                .into_response();
+        }
+    };
+
+    let created = chrono::Utc::now().timestamp();
+    let mut data: Vec<ModelObject> = Vec::new();
+
+    // Cloud provider default models
+    for cp in &config.cloud_providers {
+        if let Some(ref model) = cp.default_model {
+            data.push(ModelObject {
+                id: format!("{}:{}", cp.slug, model),
+                object: "model",
+                created,
+                owned_by: cp.slug.clone(),
+            });
+        }
+    }
+
+    // Configured local chat model (Ollama)
+    if !config.local_ai.chat_model_id.is_empty() {
+        data.push(ModelObject {
+            id: format!("ollama:{}", config.local_ai.chat_model_id),
+            object: "model",
+            created,
+            owned_by: "ollama".to_string(),
+        });
+    }
+
+    debug!(model_count = data.len(), "{LOG_PREFIX} models: ok");
+    (
+        StatusCode::OK,
+        Json(ModelsResponse {
+            object: "list",
+            data,
+        }),
+    )
+        .into_response()
+}
+
+#[cfg(test)]
+#[path = "tests.rs"]
+mod tests;
diff --git a/src/openhuman/inference/http/tests.rs b/src/openhuman/inference/http/tests.rs
new file mode 100644
index 0000000000..d25268413a
--- /dev/null
+++ b/src/openhuman/inference/http/tests.rs
@@ -0,0 +1,103 @@
+//! Integration tests for the OpenAI-compatible `/v1` HTTP endpoint.
+//!
+//! These tests spin up an in-process axum router (no network), send
+//! crafted HTTP requests via `tower::ServiceExt::oneshot`, and assert on
+//! the response status codes.
+//!
+//! A running inference backend is NOT required — the tests exercise the
+//! routing and auth-middleware layers only.
+
+use axum::body::Body;
+use axum::http::{header, Method, Request, StatusCode};
+use tower::ServiceExt;
+
+use crate::core::jsonrpc::build_core_http_router;
+
+/// Build the test router (Socket.IO disabled — no real runtime needed).
+fn test_router() -> axum::Router {
+    build_core_http_router(false)
+}
+
+/// Convenience: dispatch a single request through the in-process router.
+async fn dispatch(req: Request<Body>) -> axum::response::Response {
+    test_router().oneshot(req).await.unwrap()
+}
+
+// ── Tests ─────────────────────────────────────────────────────────────────────
+
+/// Requests to `POST /v1/chat/completions` without any `Authorization` header
+/// must be rejected with `401 Unauthorized`.
+#[tokio::test]
+async fn test_chat_completions_no_bearer_returns_401() {
+    let body = serde_json::json!({
+        "model": "ollama:llama3",
+        "messages": [{ "role": "user", "content": "hello" }]
+    });
+    let req = Request::builder()
+        .method(Method::POST)
+        .uri("/v1/chat/completions")
+        .header(header::CONTENT_TYPE, "application/json")
+        .body(Body::from(serde_json::to_string(&body).unwrap()))
+        .unwrap();
+
+    let resp = dispatch(req).await;
+    assert_eq!(
+        resp.status(),
+        StatusCode::UNAUTHORIZED,
+        "POST /v1/chat/completions without bearer must return 401"
+    );
+}
+
+/// Requests to `GET /v1/models` without any `Authorization` header must be
+/// rejected with `401 Unauthorized`.
+#[tokio::test]
+async fn test_models_no_bearer_returns_401() {
+    let req = Request::builder()
+        .method(Method::GET)
+        .uri("/v1/models")
+        .body(Body::empty())
+        .unwrap();
+
+    let resp = dispatch(req).await;
+    assert_eq!(
+        resp.status(),
+        StatusCode::UNAUTHORIZED,
+        "GET /v1/models without bearer must return 401"
+    );
+}
+
+/// A request with a bearer token must not be rejected as 401/403. The actual
+/// response code depends on whether a live inference backend is running; the
+/// test only asserts that auth passed.
+#[tokio::test]
+async fn test_chat_completions_with_bearer_not_rejected_as_auth_error() {
+    // Use the env var if set (CI with a real core token), otherwise use any
+    // non-empty string — the test-support middleware accepts it.
+    let token = std::env::var("OPENHUMAN_CORE_TOKEN").unwrap_or_else(|_| "test-token".to_string());
+
+    let body = serde_json::json!({
+        "model": "ollama:llama3",
+        "messages": [{ "role": "user", "content": "ping" }],
+        "stream": false
+    });
+    let req = Request::builder()
+        .method(Method::POST)
+        .uri("/v1/chat/completions")
+        .header(header::CONTENT_TYPE, "application/json")
+        .header(header::AUTHORIZATION, format!("Bearer {}", token))
+        .body(Body::from(serde_json::to_string(&body).unwrap()))
+        .unwrap();
+
+    let resp = dispatch(req).await;
+    let status = resp.status();
+    assert_ne!(
+        status,
+        StatusCode::UNAUTHORIZED,
+        "401 must not fire when bearer is present"
+    );
+    assert_ne!(
+        status,
+        StatusCode::FORBIDDEN,
+        "403 must not fire when bearer is present"
+    );
+}
diff --git a/src/openhuman/inference/http/types.rs b/src/openhuman/inference/http/types.rs
new file mode 100644
index 0000000000..6bf58fe518
--- /dev/null
+++ b/src/openhuman/inference/http/types.rs
@@ -0,0 +1,93 @@
+//! OpenAI-compatible HTTP request / response types.
+
+use serde::{Deserialize, Serialize};
+
+// ── Chat Completions ──────────────────────────────────────────────────────────
+
+#[derive(Debug, Deserialize)]
+pub struct ChatCompletionRequest {
+    pub model: String,
+    pub messages: Vec<ChatCompletionMessage>,
+    #[serde(default)]
+    pub stream: bool,
+    #[serde(default)]
+    pub temperature: Option<f64>,
+    #[serde(default)]
+    pub max_tokens: Option<u32>,
+    /// Optional tool definitions (ignored if the provider doesn't support them).
+    #[serde(default)]
+    pub tools: Option<serde_json::Value>,
+}
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct ChatCompletionMessage {
+    pub role: String,
+    pub content: String,
+}
+
+#[derive(Debug, Serialize)]
+pub struct ChatCompletionResponse {
+    pub id: String,
+    pub object: &'static str,
+    pub created: i64,
+    pub model: String,
+    pub choices: Vec<ChatCompletionChoice>,
+    pub usage: ChatCompletionUsage,
+}
+
+#[derive(Debug, Serialize)]
+pub struct ChatCompletionChoice {
+    pub index: u32,
+    pub message: ChatCompletionMessage,
+    pub finish_reason: &'static str,
+}
+
+#[derive(Debug, Serialize)]
+pub struct ChatCompletionUsage {
+    pub prompt_tokens: u64,
+    pub completion_tokens: u64,
+    pub total_tokens: u64,
+}
+
+// ── Streaming (SSE) ───────────────────────────────────────────────────────────
+
+#[derive(Debug, Serialize)]
+pub struct ChatCompletionChunk {
+    pub id: String,
+    pub object: &'static str,
+    pub created: i64,
+    pub model: String,
+    pub choices: Vec<ChatCompletionChunkChoice>,
+}
+
+#[derive(Debug, Serialize)]
+pub struct ChatCompletionChunkChoice {
+    pub index: u32,
+    pub delta: ChatCompletionDelta,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub finish_reason: Option<&'static str>,
+}
+
+#[derive(Debug, Serialize)]
+pub struct ChatCompletionDelta {
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub role: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub content: Option<String>,
+}
+
+// ── Models ────────────────────────────────────────────────────────────────────
+
+#[derive(Debug, Serialize)]
+pub struct ModelsResponse {
+    pub object: &'static str,
+    pub data: Vec<ModelObject>,
+}
+
+#[derive(Debug, Serialize)]
+pub struct ModelObject {
+    pub id: String,
+    pub object: &'static str,
+    pub created: i64,
+    pub owned_by: String,
+}
diff --git a/src/openhuman/local_ai/core.rs b/src/openhuman/inference/local/core.rs
similarity index 100%
rename from src/openhuman/local_ai/core.rs
rename to src/openhuman/inference/local/core.rs
diff --git a/src/openhuman/local_ai/install.rs b/src/openhuman/inference/local/install.rs
similarity index 99%
rename from src/openhuman/local_ai/install.rs
rename to src/openhuman/inference/local/install.rs
index 780dffe1f9..d6d8c6de6f 100644
--- a/src/openhuman/local_ai/install.rs
+++ b/src/openhuman/inference/local/install.rs
@@ -124,7 +124,7 @@ fn build_install_command(install_dir: &Path) -> Result<tokio::process::Command,
         // crash-resume detection in `is_ollama_installer_running` picks it
         // up on the next OpenHuman launch and waits.
         cmd.kill_on_drop(true);
-        crate::openhuman::local_ai::process_util::apply_no_window(&mut cmd);
+        crate::openhuman::inference::local::process_util::apply_no_window(&mut cmd);
         cmd.env("OPENHUMAN_OLLAMA_INSTALL_DIR", install_dir);
         cmd.args([
             "-NoProfile",
@@ -341,7 +341,7 @@ mod tests {
     /// calls and cause flakes.
 
     fn env_lock() -> std::sync::MutexGuard<'static, ()> {
-        crate::openhuman::local_ai::local_ai_test_guard()
+        crate::openhuman::inference::inference_test_guard()
     }
 
     /// RAII guard: records the prior value of `var` on construction and
diff --git a/src/openhuman/local_ai/install_piper.rs b/src/openhuman/inference/local/install_piper.rs
similarity index 99%
rename from src/openhuman/local_ai/install_piper.rs
rename to src/openhuman/inference/local/install_piper.rs
index c04ad6d1ea..ae975ed5be 100644
--- a/src/openhuman/local_ai/install_piper.rs
+++ b/src/openhuman/inference/local/install_piper.rs
@@ -142,7 +142,7 @@ fn decode_voice_id(voice_id: &str) -> (String, String, String, String) {
 /// to "installed" when on-disk artifacts pass validation.
 pub fn status(config: &Config) -> VoiceInstallStatus {
     let mut snapshot = read_status(ENGINE_PIPER);
-    let configured_voice = crate::openhuman::local_ai::model_ids::effective_tts_voice_id(config);
+    let configured_voice = crate::openhuman::inference::model_ids::effective_tts_voice_id(config);
     let configured_voice = configured_voice.trim_end_matches(".onnx").to_string();
     if matches!(snapshot.state, VoiceInstallState::Missing)
         && installed_artifacts_ok(config, &configured_voice)
@@ -469,7 +469,7 @@ pub(crate) fn find_workspace_piper_binary(config: &Config) -> Option<PathBuf> {
 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::openhuman::local_ai::voice_install_common::reset_status;
+    use crate::openhuman::inference::local::voice_install_common::reset_status;
 
     fn temp_config() -> (tempfile::TempDir, Config) {
         let dir = tempfile::tempdir().expect("tempdir");
@@ -553,7 +553,7 @@ mod tests {
     /// directory; reuses the module-wide `local_ai_test_guard` so paths +
     /// install_whisper tests are serialised through the same lock.
     fn shared_install_lock() -> std::sync::MutexGuard<'static, ()> {
-        crate::openhuman::local_ai::local_ai_test_guard()
+        crate::openhuman::inference::inference_test_guard()
     }
 
     fn wipe_shared_install_dir(config: &Config) {
diff --git a/src/openhuman/local_ai/install_whisper.rs b/src/openhuman/inference/local/install_whisper.rs
similarity index 98%
rename from src/openhuman/local_ai/install_whisper.rs
rename to src/openhuman/inference/local/install_whisper.rs
index 378fd16d33..b6a654d56e 100644
--- a/src/openhuman/local_ai/install_whisper.rs
+++ b/src/openhuman/inference/local/install_whisper.rs
@@ -21,7 +21,7 @@
 //!    moment a binary lands on PATH.
 //!
 //! Per-engine progress is reported via the shared
-//! [`crate::openhuman::local_ai::voice_install_common`] status table so
+//! [`crate::openhuman::inference::local::voice_install_common`] status table so
 //! the renderer can poll one RPC for state across both Whisper and Piper.
 
 use std::path::PathBuf;
@@ -106,7 +106,7 @@ pub fn status(config: &Config) -> VoiceInstallStatus {
     // artifacts so the UI doesn't show a perpetual "missing" after a
     // successful install across a process restart.
     if matches!(snapshot.state, VoiceInstallState::Missing) {
-        let configured = crate::openhuman::local_ai::model_ids::effective_stt_model_id(config);
+        let configured = crate::openhuman::inference::model_ids::effective_stt_model_id(config);
         if installed_artifacts_ok(config, &configured) {
             snapshot.state = VoiceInstallState::Installed;
             snapshot.stage = Some(format!("{configured} present"));
@@ -339,7 +339,7 @@ pub(crate) fn find_workspace_whisper_binary(config: &Config) -> Option<PathBuf>
 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::openhuman::local_ai::voice_install_common::reset_status;
+    use crate::openhuman::inference::local::voice_install_common::reset_status;
 
     fn temp_config() -> (tempfile::TempDir, Config) {
         let dir = tempfile::tempdir().expect("tempdir");
@@ -408,7 +408,7 @@ mod tests {
     /// in parallel. Reuses the module-wide `local_ai_test_guard` so paths
     /// + install_piper tests are serialised through the same lock.
     fn shared_install_lock() -> std::sync::MutexGuard<'static, ()> {
-        crate::openhuman::local_ai::local_ai_test_guard()
+        crate::openhuman::inference::inference_test_guard()
     }
 
     /// Wipe the shared-root install dir for whisper so the absence
diff --git a/src/openhuman/local_ai/lm_studio_api.rs b/src/openhuman/inference/local/lm_studio.rs
similarity index 100%
rename from src/openhuman/local_ai/lm_studio_api.rs
rename to src/openhuman/inference/local/lm_studio.rs
diff --git a/src/openhuman/inference/local/mod.rs b/src/openhuman/inference/local/mod.rs
new file mode 100644
index 0000000000..e158f06ee3
--- /dev/null
+++ b/src/openhuman/inference/local/mod.rs
@@ -0,0 +1,51 @@
+//! Local AI runtime — Ollama, LM Studio, Whisper, Piper sub-process management.
+//!
+//! This module was previously `src/openhuman/local_ai/`. It now lives under
+//! `inference/local/` so all inference concerns share a single domain root.
+
+#[cfg(test)]
+pub(crate) static INFERENCE_TEST_MUTEX: once_cell::sync::Lazy<std::sync::Mutex<()>> =
+    once_cell::sync::Lazy::new(|| std::sync::Mutex::new(()));
+
+#[cfg(test)]
+pub(crate) fn inference_test_guard() -> std::sync::MutexGuard<'static, ()> {
+    INFERENCE_TEST_MUTEX
+        .lock()
+        .unwrap_or_else(|p| p.into_inner())
+}
+
+mod core;
+pub mod ops;
+mod schemas;
+
+// Re-expose inference-level modules under `local::` so that files that
+// were moved from `local_ai/` and used `super::model_ids` etc. continue
+// to compile without rewriting every callsite.
+pub use super::device;
+pub use super::model_ids;
+pub use super::parse;
+pub use super::paths;
+pub use super::presets;
+pub use super::sentiment;
+pub use super::types;
+
+pub mod install;
+pub(crate) mod install_piper;
+pub(crate) mod install_whisper;
+pub(crate) mod lm_studio;
+mod ollama;
+mod process_util;
+pub(crate) mod provider;
+pub(crate) use ollama::{ollama_base_url, OLLAMA_BASE_URL};
+pub mod service;
+pub(crate) mod voice_install_common;
+
+pub use core::*;
+pub use ops as rpc;
+pub use ops::*;
+pub use schemas::{
+    all_controller_schemas as all_local_ai_controller_schemas,
+    all_registered_controllers as all_local_ai_registered_controllers,
+};
+pub(crate) use service::whisper_engine;
+pub use service::LocalAiService;
diff --git a/src/openhuman/local_ai/ollama_api.rs b/src/openhuman/inference/local/ollama.rs
similarity index 99%
rename from src/openhuman/local_ai/ollama_api.rs
rename to src/openhuman/inference/local/ollama.rs
index f794c433d7..cb96fb2756 100644
--- a/src/openhuman/local_ai/ollama_api.rs
+++ b/src/openhuman/inference/local/ollama.rs
@@ -341,7 +341,7 @@ mod tests {
     }
 
     fn test_lock() -> std::sync::MutexGuard<'static, ()> {
-        crate::openhuman::local_ai::local_ai_test_guard()
+        crate::openhuman::inference::inference_test_guard()
     }
 
     #[test]
diff --git a/src/openhuman/local_ai/ops.rs b/src/openhuman/inference/local/ops.rs
similarity index 96%
rename from src/openhuman/local_ai/ops.rs
rename to src/openhuman/inference/local/ops.rs
index 5a453b8031..06f1911817 100644
--- a/src/openhuman/local_ai/ops.rs
+++ b/src/openhuman/inference/local/ops.rs
@@ -8,14 +8,16 @@ use chrono::Utc;
 
 use crate::openhuman::agent::Agent;
 use crate::openhuman::config::Config;
-use crate::openhuman::local_ai::{
-    self, LocalAiAssetsStatus, LocalAiDownloadsProgress, LocalAiEmbeddingResult,
-    LocalAiSpeechResult, LocalAiTtsResult,
+use crate::openhuman::inference::local as local_ai;
+use crate::openhuman::inference::provider as providers;
+use crate::openhuman::inference::provider::ops::ProviderRuntimeOptions;
+use crate::openhuman::inference::{
+    LocalAiAssetsStatus, LocalAiDownloadsProgress, LocalAiEmbeddingResult, LocalAiSpeechResult,
+    LocalAiStatus, LocalAiTtsResult,
 };
 use crate::openhuman::prompt_injection::{
     enforce_prompt_input, PromptEnforcementAction, PromptEnforcementContext,
 };
-use crate::openhuman::providers::{self, ProviderRuntimeOptions};
 use crate::rpc::RpcOutcome;
 
 fn prompt_guard_user_message(action: PromptEnforcementAction) -> &'static str {
@@ -135,9 +137,7 @@ pub async fn agent_chat_simple(
 }
 
 /// Returns the current operational status of the local AI stack.
-pub async fn local_ai_status(
-    config: &Config,
-) -> Result<RpcOutcome<local_ai::LocalAiStatus>, String> {
+pub async fn local_ai_status(config: &Config) -> Result<RpcOutcome<LocalAiStatus>, String> {
     let service = local_ai::global(config);
     let status = service.status();
     if matches!(status.state.as_str(), "idle" | "degraded") {
@@ -378,7 +378,7 @@ pub async fn local_ai_chat(
         return Err("messages must not be empty".to_string());
     }
 
-    let mut ollama_messages: Vec<crate::openhuman::local_ai::ollama_api::OllamaChatMessage> =
+    let mut ollama_messages: Vec<crate::openhuman::inference::local::ollama::OllamaChatMessage> =
         Vec::with_capacity(messages.len());
 
     for msg in messages.into_iter() {
@@ -396,10 +396,12 @@ pub async fn local_ai_chat(
             }
         }
 
-        ollama_messages.push(crate::openhuman::local_ai::ollama_api::OllamaChatMessage {
-            role: normalized_role,
-            content: msg.content,
-        });
+        ollama_messages.push(
+            crate::openhuman::inference::local::ollama::OllamaChatMessage {
+                role: normalized_role,
+                content: msg.content,
+            },
+        );
     }
 
     let service = local_ai::global(config);
diff --git a/src/openhuman/local_ai/ops_tests.rs b/src/openhuman/inference/local/ops_tests.rs
similarity index 100%
rename from src/openhuman/local_ai/ops_tests.rs
rename to src/openhuman/inference/local/ops_tests.rs
diff --git a/src/openhuman/local_ai/process_util.rs b/src/openhuman/inference/local/process_util.rs
similarity index 100%
rename from src/openhuman/local_ai/process_util.rs
rename to src/openhuman/inference/local/process_util.rs
diff --git a/src/openhuman/local_ai/provider.rs b/src/openhuman/inference/local/provider.rs
similarity index 100%
rename from src/openhuman/local_ai/provider.rs
rename to src/openhuman/inference/local/provider.rs
diff --git a/src/openhuman/local_ai/schemas.rs b/src/openhuman/inference/local/schemas.rs
similarity index 82%
rename from src/openhuman/local_ai/schemas.rs
rename to src/openhuman/inference/local/schemas.rs
index e83efbf367..aefc7d5af0 100644
--- a/src/openhuman/local_ai/schemas.rs
+++ b/src/openhuman/inference/local/schemas.rs
@@ -271,7 +271,7 @@ fn handle_agent_chat(params: Map<String, Value>) -> ControllerFuture {
         let p = deserialize_params::<AgentChatParams>(params)?;
         let mut config = config_rpc::load_config_with_timeout().await?;
         to_json(
-            crate::openhuman::local_ai::rpc::agent_chat(
+            crate::openhuman::inference::local::ops::agent_chat(
                 &mut config,
                 &p.message,
                 p.model_override,
@@ -287,7 +287,7 @@ fn handle_agent_chat_simple(params: Map<String, Value>) -> ControllerFuture {
         let p = deserialize_params::<AgentChatParams>(params)?;
         let config = config_rpc::load_config_with_timeout().await?;
         to_json(
-            crate::openhuman::local_ai::rpc::agent_chat_simple(
+            crate::openhuman::inference::local::ops::agent_chat_simple(
                 &config,
                 &p.message,
                 p.model_override,
@@ -303,8 +303,11 @@ fn handle_local_ai_transcribe(params: Map<String, Value>) -> ControllerFuture {
         let p = deserialize_params::<LocalAiTranscribeParams>(params)?;
         let config = config_rpc::load_config_with_timeout().await?;
         to_json(
-            crate::openhuman::local_ai::rpc::local_ai_transcribe(&config, p.audio_path.trim())
-                .await?,
+            crate::openhuman::inference::local::ops::local_ai_transcribe(
+                &config,
+                p.audio_path.trim(),
+            )
+            .await?,
         )
     })
 }
@@ -314,7 +317,7 @@ fn handle_local_ai_transcribe_bytes(params: Map<String, Value>) -> ControllerFut
         let p = deserialize_params::<LocalAiTranscribeBytesParams>(params)?;
         let config = config_rpc::load_config_with_timeout().await?;
         to_json(
-            crate::openhuman::local_ai::rpc::local_ai_transcribe_bytes(
+            crate::openhuman::inference::local::ops::local_ai_transcribe_bytes(
                 &config,
                 &p.audio_bytes,
                 p.extension,
@@ -329,7 +332,7 @@ fn handle_local_ai_tts(params: Map<String, Value>) -> ControllerFuture {
         let p = deserialize_params::<LocalAiTtsParams>(params)?;
         let config = config_rpc::load_config_with_timeout().await?;
         to_json(
-            crate::openhuman::local_ai::rpc::local_ai_tts(
+            crate::openhuman::inference::local::ops::local_ai_tts(
                 &config,
                 &p.text,
                 p.output_path.as_deref(),
@@ -342,14 +345,16 @@ fn handle_local_ai_tts(params: Map<String, Value>) -> ControllerFuture {
 fn handle_local_ai_assets_status(_params: Map<String, Value>) -> ControllerFuture {
     Box::pin(async move {
         let config = config_rpc::load_config_with_timeout().await?;
-        to_json(crate::openhuman::local_ai::rpc::local_ai_assets_status(&config).await?)
+        to_json(crate::openhuman::inference::local::ops::local_ai_assets_status(&config).await?)
     })
 }
 
 fn handle_local_ai_downloads_progress(_params: Map<String, Value>) -> ControllerFuture {
     Box::pin(async move {
         let config = config_rpc::load_config_with_timeout().await?;
-        to_json(crate::openhuman::local_ai::rpc::local_ai_downloads_progress(&config).await?)
+        to_json(
+            crate::openhuman::inference::local::ops::local_ai_downloads_progress(&config).await?,
+        )
     })
 }
 
@@ -358,8 +363,11 @@ fn handle_local_ai_download_asset(params: Map<String, Value>) -> ControllerFutur
         let p = deserialize_params::<LocalAiDownloadAssetParams>(params)?;
         let config = config_rpc::load_config_with_timeout().await?;
         to_json(
-            crate::openhuman::local_ai::rpc::local_ai_download_asset(&config, p.capability.trim())
-                .await?,
+            crate::openhuman::inference::local::ops::local_ai_download_asset(
+                &config,
+                p.capability.trim(),
+            )
+            .await?,
         )
     })
 }
@@ -393,30 +401,31 @@ fn handle_local_ai_install_whisper(params: Map<String, Value>) -> ControllerFutu
         // write_status sequence was non-atomic and let two callers slip
         // through; `try_acquire_install_slot` does the check-and-claim
         // under a single mutex acquisition.
-        let slot = match crate::openhuman::local_ai::voice_install_common::try_acquire_install_slot(
-            crate::openhuman::local_ai::voice_install_common::ENGINE_WHISPER,
-        ) {
-            Some(slot) => slot,
-            None => {
-                tracing::debug!(
-                    "[voice-install:whisper] slot already held — returning current status"
+        let slot =
+            match crate::openhuman::inference::local::voice_install_common::try_acquire_install_slot(
+                crate::openhuman::inference::local::voice_install_common::ENGINE_WHISPER,
+            ) {
+                Some(slot) => slot,
+                None => {
+                    tracing::debug!(
+                        "[voice-install:whisper] slot already held — returning current status"
+                    );
+                    let current = crate::openhuman::inference::local::voice_install_common::read_status(
+                    crate::openhuman::inference::local::voice_install_common::ENGINE_WHISPER,
                 );
-                let current = crate::openhuman::local_ai::voice_install_common::read_status(
-                    crate::openhuman::local_ai::voice_install_common::ENGINE_WHISPER,
-                );
-                return serde_json::to_value(current)
-                    .map_err(|e| format!("serialize whisper status: {e}"));
-            }
-        };
+                    return serde_json::to_value(current)
+                        .map_err(|e| format!("serialize whisper status: {e}"));
+                }
+            };
 
         // Mark "installing" before the spawn so the very next status poll
         // (≤ 2s away) reflects the new state without a stale read.
-        crate::openhuman::local_ai::voice_install_common::write_status(
-            crate::openhuman::local_ai::voice_install_common::VoiceInstallStatus {
-                engine: crate::openhuman::local_ai::voice_install_common::ENGINE_WHISPER
+        crate::openhuman::inference::local::voice_install_common::write_status(
+            crate::openhuman::inference::local::voice_install_common::VoiceInstallStatus {
+                engine: crate::openhuman::inference::local::voice_install_common::ENGINE_WHISPER
                     .to_string(),
                 state:
-                    crate::openhuman::local_ai::voice_install_common::VoiceInstallState::Installing,
+                    crate::openhuman::inference::local::voice_install_common::VoiceInstallState::Installing,
                 progress: Some(0),
                 downloaded_bytes: None,
                 total_bytes: None,
@@ -437,7 +446,7 @@ fn handle_local_ai_install_whisper(params: Map<String, Value>) -> ControllerFutu
         // single-writer guard on task exit, including via panic.
         tokio::spawn(async move {
             let _slot = slot;
-            if let Err(e) = crate::openhuman::local_ai::install_whisper::install_whisper(
+            if let Err(e) = crate::openhuman::inference::local::install_whisper::install_whisper(
                 &config, model_size, force,
             )
             .await
@@ -446,8 +455,8 @@ fn handle_local_ai_install_whisper(params: Map<String, Value>) -> ControllerFutu
             }
         });
 
-        let status = crate::openhuman::local_ai::voice_install_common::read_status(
-            crate::openhuman::local_ai::voice_install_common::ENGINE_WHISPER,
+        let status = crate::openhuman::inference::local::voice_install_common::read_status(
+            crate::openhuman::inference::local::voice_install_common::ENGINE_WHISPER,
         );
         serde_json::to_value(status).map_err(|e| format!("serialize whisper status: {e}"))
     })
@@ -461,27 +470,29 @@ fn handle_local_ai_install_piper(params: Map<String, Value>) -> ControllerFuture
 
         // See the whisper handler above for why this is an atomic slot
         // acquisition rather than a read_status / write_status pair.
-        let slot = match crate::openhuman::local_ai::voice_install_common::try_acquire_install_slot(
-            crate::openhuman::local_ai::voice_install_common::ENGINE_PIPER,
-        ) {
-            Some(slot) => slot,
-            None => {
-                tracing::debug!(
-                    "[voice-install:piper] slot already held — returning current status"
-                );
-                let current = crate::openhuman::local_ai::voice_install_common::read_status(
-                    crate::openhuman::local_ai::voice_install_common::ENGINE_PIPER,
-                );
-                return serde_json::to_value(current)
-                    .map_err(|e| format!("serialize piper status: {e}"));
-            }
-        };
-
-        crate::openhuman::local_ai::voice_install_common::write_status(
-            crate::openhuman::local_ai::voice_install_common::VoiceInstallStatus {
-                engine: crate::openhuman::local_ai::voice_install_common::ENGINE_PIPER.to_string(),
+        let slot =
+            match crate::openhuman::inference::local::voice_install_common::try_acquire_install_slot(
+                crate::openhuman::inference::local::voice_install_common::ENGINE_PIPER,
+            ) {
+                Some(slot) => slot,
+                None => {
+                    tracing::debug!(
+                        "[voice-install:piper] slot already held — returning current status"
+                    );
+                    let current =
+                        crate::openhuman::inference::local::voice_install_common::read_status(
+                            crate::openhuman::inference::local::voice_install_common::ENGINE_PIPER,
+                        );
+                    return serde_json::to_value(current)
+                        .map_err(|e| format!("serialize piper status: {e}"));
+                }
+            };
+
+        crate::openhuman::inference::local::voice_install_common::write_status(
+            crate::openhuman::inference::local::voice_install_common::VoiceInstallStatus {
+                engine: crate::openhuman::inference::local::voice_install_common::ENGINE_PIPER.to_string(),
                 state:
-                    crate::openhuman::local_ai::voice_install_common::VoiceInstallState::Installing,
+                    crate::openhuman::inference::local::voice_install_common::VoiceInstallState::Installing,
                 progress: Some(0),
                 downloaded_bytes: None,
                 total_bytes: None,
@@ -500,16 +511,17 @@ fn handle_local_ai_install_piper(params: Map<String, Value>) -> ControllerFuture
         // whisper handler.
         tokio::spawn(async move {
             let _slot = slot;
-            if let Err(e) =
-                crate::openhuman::local_ai::install_piper::install_piper(&config, voice_id, force)
-                    .await
+            if let Err(e) = crate::openhuman::inference::local::install_piper::install_piper(
+                &config, voice_id, force,
+            )
+            .await
             {
                 log::warn!("[voice-install:piper] background install failed: {e}");
             }
         });
 
-        let status = crate::openhuman::local_ai::voice_install_common::read_status(
-            crate::openhuman::local_ai::voice_install_common::ENGINE_PIPER,
+        let status = crate::openhuman::inference::local::voice_install_common::read_status(
+            crate::openhuman::inference::local::voice_install_common::ENGINE_PIPER,
         );
         serde_json::to_value(status).map_err(|e| format!("serialize piper status: {e}"))
     })
@@ -518,7 +530,7 @@ fn handle_local_ai_install_piper(params: Map<String, Value>) -> ControllerFuture
 fn handle_local_ai_whisper_install_status(_params: Map<String, Value>) -> ControllerFuture {
     Box::pin(async move {
         let config = config_rpc::load_config_with_timeout().await?;
-        let status = crate::openhuman::local_ai::install_whisper::status(&config);
+        let status = crate::openhuman::inference::local::install_whisper::status(&config);
         serde_json::to_value(status).map_err(|e| format!("serialize whisper status: {e}"))
     })
 }
@@ -526,7 +538,7 @@ fn handle_local_ai_whisper_install_status(_params: Map<String, Value>) -> Contro
 fn handle_local_ai_piper_install_status(_params: Map<String, Value>) -> ControllerFuture {
     Box::pin(async move {
         let config = config_rpc::load_config_with_timeout().await?;
-        let status = crate::openhuman::local_ai::install_piper::status(&config);
+        let status = crate::openhuman::inference::local::install_piper::status(&config);
         serde_json::to_value(status).map_err(|e| format!("serialize piper status: {e}"))
     })
 }
diff --git a/src/openhuman/local_ai/schemas_tests.rs b/src/openhuman/inference/local/schemas_tests.rs
similarity index 82%
rename from src/openhuman/local_ai/schemas_tests.rs
rename to src/openhuman/inference/local/schemas_tests.rs
index 4f92528604..4308ce19e8 100644
--- a/src/openhuman/local_ai/schemas_tests.rs
+++ b/src/openhuman/inference/local/schemas_tests.rs
@@ -122,18 +122,18 @@ async fn install_whisper_handler_serializes_concurrent_calls() {
     // also means the handler under test will short-circuit immediately
     // rather than spawning a real install task that would try to hit
     // the network in CI.
-    let slot = crate::openhuman::local_ai::voice_install_common::try_acquire_install_slot(
-        crate::openhuman::local_ai::voice_install_common::ENGINE_WHISPER,
+    let slot = crate::openhuman::inference::local::voice_install_common::try_acquire_install_slot(
+        crate::openhuman::inference::local::voice_install_common::ENGINE_WHISPER,
     )
     .expect("test should be able to claim the slot first");
 
     // Mark the status table as `Installing` so the handler's
     // short-circuit branch (which reads current status to return) sees
     // a coherent snapshot.
-    crate::openhuman::local_ai::voice_install_common::write_status(
-        crate::openhuman::local_ai::voice_install_common::VoiceInstallStatus {
-            engine: crate::openhuman::local_ai::voice_install_common::ENGINE_WHISPER.to_string(),
-            state: crate::openhuman::local_ai::voice_install_common::VoiceInstallState::Installing,
+    crate::openhuman::inference::local::voice_install_common::write_status(
+        crate::openhuman::inference::local::voice_install_common::VoiceInstallStatus {
+            engine: crate::openhuman::inference::local::voice_install_common::ENGINE_WHISPER.to_string(),
+            state: crate::openhuman::inference::local::voice_install_common::VoiceInstallState::Installing,
             progress: Some(0),
             downloaded_bytes: None,
             total_bytes: None,
@@ -156,8 +156,8 @@ async fn install_whisper_handler_serializes_concurrent_calls() {
     }
     drop(slot);
     // Clean up so other tests see Missing.
-    crate::openhuman::local_ai::voice_install_common::reset_status(
-        crate::openhuman::local_ai::voice_install_common::ENGINE_WHISPER,
+    crate::openhuman::inference::local::voice_install_common::reset_status(
+        crate::openhuman::inference::local::voice_install_common::ENGINE_WHISPER,
     );
 
     let v1 = r1.expect("first call ok");
@@ -185,15 +185,15 @@ async fn install_piper_handler_serializes_concurrent_calls() {
         std::env::set_var("OPENHUMAN_WORKSPACE", tmp.path());
     }
 
-    let slot = crate::openhuman::local_ai::voice_install_common::try_acquire_install_slot(
-        crate::openhuman::local_ai::voice_install_common::ENGINE_PIPER,
+    let slot = crate::openhuman::inference::local::voice_install_common::try_acquire_install_slot(
+        crate::openhuman::inference::local::voice_install_common::ENGINE_PIPER,
     )
     .expect("test should be able to claim the slot first");
 
-    crate::openhuman::local_ai::voice_install_common::write_status(
-        crate::openhuman::local_ai::voice_install_common::VoiceInstallStatus {
-            engine: crate::openhuman::local_ai::voice_install_common::ENGINE_PIPER.to_string(),
-            state: crate::openhuman::local_ai::voice_install_common::VoiceInstallState::Installing,
+    crate::openhuman::inference::local::voice_install_common::write_status(
+        crate::openhuman::inference::local::voice_install_common::VoiceInstallStatus {
+            engine: crate::openhuman::inference::local::voice_install_common::ENGINE_PIPER.to_string(),
+            state: crate::openhuman::inference::local::voice_install_common::VoiceInstallState::Installing,
             progress: Some(0),
             downloaded_bytes: None,
             total_bytes: None,
@@ -211,8 +211,8 @@ async fn install_piper_handler_serializes_concurrent_calls() {
         std::env::remove_var("OPENHUMAN_WORKSPACE");
     }
     drop(slot);
-    crate::openhuman::local_ai::voice_install_common::reset_status(
-        crate::openhuman::local_ai::voice_install_common::ENGINE_PIPER,
+    crate::openhuman::inference::local::voice_install_common::reset_status(
+        crate::openhuman::inference::local::voice_install_common::ENGINE_PIPER,
     );
 
     let v1 = r1.expect("first call ok");
diff --git a/src/openhuman/local_ai/service/assets.rs b/src/openhuman/inference/local/service/assets.rs
similarity index 98%
rename from src/openhuman/local_ai/service/assets.rs
rename to src/openhuman/inference/local/service/assets.rs
index 8802843406..b45fbcf2e3 100644
--- a/src/openhuman/local_ai/service/assets.rs
+++ b/src/openhuman/inference/local/service/assets.rs
@@ -3,15 +3,15 @@ use std::path::Path;
 use futures_util::TryStreamExt;
 
 use crate::openhuman::config::Config;
-use crate::openhuman::local_ai::model_ids;
+use crate::openhuman::inference::model_ids;
 use tracing::{debug, trace};
 
-use crate::openhuman::local_ai::paths::{
+use crate::openhuman::inference::local::provider::{provider_from_config, LocalAiProvider};
+use crate::openhuman::inference::paths::{
     resolve_stt_model_path, resolve_tts_voice_path, stt_model_target_path, tts_model_target_path,
 };
-use crate::openhuman::local_ai::presets::{self, VisionMode};
-use crate::openhuman::local_ai::provider::{provider_from_config, LocalAiProvider};
-use crate::openhuman::local_ai::types::{
+use crate::openhuman::inference::presets::{self, VisionMode};
+use crate::openhuman::inference::types::{
     LocalAiAssetStatus, LocalAiAssetsStatus, LocalAiDownloadProgressItem, LocalAiDownloadsProgress,
 };
 
@@ -726,7 +726,7 @@ impl LocalAiService {
         self.assets_status(config).await
     }
 
-    pub(in crate::openhuman::local_ai::service) async fn ensure_stt_asset_available(
+    pub(in crate::openhuman::inference::local::service) async fn ensure_stt_asset_available(
         &self,
         config: &Config,
     ) -> Result<(), String> {
@@ -749,7 +749,7 @@ impl LocalAiService {
         Ok(())
     }
 
-    pub(in crate::openhuman::local_ai::service) async fn ensure_tts_asset_available(
+    pub(in crate::openhuman::inference::local::service) async fn ensure_tts_asset_available(
         &self,
         config: &Config,
     ) -> Result<(), String> {
diff --git a/src/openhuman/local_ai/service/bootstrap.rs b/src/openhuman/inference/local/service/bootstrap.rs
similarity index 95%
rename from src/openhuman/local_ai/service/bootstrap.rs
rename to src/openhuman/inference/local/service/bootstrap.rs
index 1220989d74..943c8cff6f 100644
--- a/src/openhuman/local_ai/service/bootstrap.rs
+++ b/src/openhuman/inference/local/service/bootstrap.rs
@@ -1,9 +1,9 @@
 use crate::openhuman::config::Config;
-use crate::openhuman::local_ai::device::DeviceProfile;
-use crate::openhuman::local_ai::model_ids;
-use crate::openhuman::local_ai::presets::{self, VisionMode};
-use crate::openhuman::local_ai::provider::{provider_from_config, LocalAiProvider};
-use crate::openhuman::local_ai::types::LocalAiStatus;
+use crate::openhuman::inference::device::DeviceProfile;
+use crate::openhuman::inference::local::provider::{provider_from_config, LocalAiProvider};
+use crate::openhuman::inference::model_ids;
+use crate::openhuman::inference::presets::{self, VisionMode};
+use crate::openhuman::inference::types::LocalAiStatus;
 
 use super::LocalAiService;
 
@@ -129,7 +129,7 @@ impl LocalAiService {
 
     pub async fn bootstrap(&self, config: &Config) {
         let _guard = self.bootstrap_lock.lock().await;
-        let device = crate::openhuman::local_ai::device::detect_device_profile();
+        let device = crate::openhuman::inference::device::detect_device_profile();
         let effective_config = config_with_recommended_tier_if_unselected(config, &device);
 
         if !effective_config.local_ai.runtime_enabled {
@@ -324,7 +324,7 @@ impl LocalAiService {
         // Pass GPU info from the device profile so whisper can use hardware acceleration.
         if effective_config.local_ai.whisper_in_process {
             if let Ok(model_path) =
-                crate::openhuman::local_ai::paths::resolve_stt_model_path(&effective_config)
+                crate::openhuman::inference::paths::resolve_stt_model_path(&effective_config)
             {
                 let model = std::path::PathBuf::from(&model_path);
                 let handle = self.whisper.clone();
@@ -401,7 +401,7 @@ impl LocalAiService {
 
 fn config_with_recommended_tier_if_unselected(config: &Config, device: &DeviceProfile) -> Config {
     let current_tier =
-        crate::openhuman::local_ai::presets::current_tier_from_config(&config.local_ai);
+        crate::openhuman::inference::presets::current_tier_from_config(&config.local_ai);
 
     // Local AI is opt-in on every device. The only way to keep it enabled
     // across a restart is an explicit opt-in (`apply_preset` on a real tier),
@@ -411,7 +411,7 @@ fn config_with_recommended_tier_if_unselected(config: &Config, device: &DevicePr
     if !config.local_ai.opt_in_confirmed {
         tracing::debug!(
             total_ram_gb = device.total_ram_gb(),
-            min_required_gb = crate::openhuman::local_ai::presets::MIN_RAM_GB_FOR_LOCAL_AI,
+            min_required_gb = crate::openhuman::inference::presets::MIN_RAM_GB_FOR_LOCAL_AI,
             ?current_tier,
             selected_tier = ?config.local_ai.selected_tier,
             "[local_ai] bootstrap: opt_in_confirmed=false, hard-overriding to disabled (cloud fallback)"
@@ -431,21 +431,21 @@ fn config_with_recommended_tier_if_unselected(config: &Config, device: &DevicePr
 }
 
 fn format_degraded_warning(err: &str, config: &Config) -> String {
-    let current = crate::openhuman::local_ai::presets::current_tier_from_config(&config.local_ai);
+    let current = crate::openhuman::inference::presets::current_tier_from_config(&config.local_ai);
     match current {
-        crate::openhuman::local_ai::presets::ModelTier::Ram16PlusGb => {
+        crate::openhuman::inference::presets::ModelTier::Ram16PlusGb => {
             format!(
                 "{err}. Hint: your device may not support the 16 GB+ tier model. \
                  Try switching to the 8-16 GB or 4-8 GB tier in Settings > Local AI Model."
             )
         }
-        crate::openhuman::local_ai::presets::ModelTier::Ram8To16Gb => {
+        crate::openhuman::inference::presets::ModelTier::Ram8To16Gb => {
             format!(
                 "{err}. Hint: your device may not support the 8-16 GB tier model. \
                  Try switching to the 4-8 GB or 2-4 GB tier in Settings > Local AI Model."
             )
         }
-        crate::openhuman::local_ai::presets::ModelTier::Ram4To8Gb => format!(
+        crate::openhuman::inference::presets::ModelTier::Ram4To8Gb => format!(
             "{err}. Hint: your device may not support the 4-8 GB tier vision sidecar. \
              Try switching to the 2-4 GB tier for text-only local AI."
         ),
@@ -531,9 +531,9 @@ mod tests {
         let mut config = Config::default();
         config.local_ai.selected_tier = Some("ram_2_4gb".to_string());
         config.local_ai.opt_in_confirmed = true;
-        crate::openhuman::local_ai::presets::apply_preset_to_config(
+        crate::openhuman::inference::presets::apply_preset_to_config(
             &mut config.local_ai,
-            crate::openhuman::local_ai::presets::ModelTier::Ram2To4Gb,
+            crate::openhuman::inference::presets::ModelTier::Ram2To4Gb,
         );
         let device = test_device(4);
 
@@ -550,9 +550,9 @@ mod tests {
         let mut config = Config::default();
         config.local_ai.selected_tier = Some("ram_2_4gb".to_string());
         config.local_ai.opt_in_confirmed = true;
-        crate::openhuman::local_ai::presets::apply_preset_to_config(
+        crate::openhuman::inference::presets::apply_preset_to_config(
             &mut config.local_ai,
-            crate::openhuman::local_ai::presets::ModelTier::Ram2To4Gb,
+            crate::openhuman::inference::presets::ModelTier::Ram2To4Gb,
         );
         let device = test_device(16);
 
diff --git a/src/openhuman/local_ai/service/lm_studio.rs b/src/openhuman/inference/local/service/lm_studio.rs
similarity index 92%
rename from src/openhuman/local_ai/service/lm_studio.rs
rename to src/openhuman/inference/local/service/lm_studio.rs
index 218d1b5d54..742d7c46b8 100644
--- a/src/openhuman/local_ai/service/lm_studio.rs
+++ b/src/openhuman/inference/local/service/lm_studio.rs
@@ -1,10 +1,10 @@
 use crate::openhuman::config::Config;
-use crate::openhuman::local_ai::lm_studio_api::{
+use crate::openhuman::inference::local::lm_studio::{
     apply_lm_studio_auth, lm_studio_base_url, LmStudioChatCompletionRequest,
     LmStudioChatCompletionResponse, LmStudioChatMessage, LmStudioModelsResponse,
 };
-use crate::openhuman::local_ai::model_ids;
-use crate::openhuman::local_ai::ollama_api::OllamaModelTag;
+use crate::openhuman::inference::local::ollama::OllamaModelTag;
+use crate::openhuman::inference::model_ids;
 
 use super::LocalAiService;
 
@@ -17,14 +17,14 @@ fn diagnostic_body_snippet(body: &str) -> String {
     snippet
 }
 
-pub(in crate::openhuman::local_ai::service) struct LmStudioCompletionOutcome {
+pub(in crate::openhuman::inference::local::service) struct LmStudioCompletionOutcome {
     pub reply: String,
     pub prompt_tokens: Option<u32>,
     pub completion_tokens: Option<u32>,
 }
 
 impl LocalAiService {
-    pub(in crate::openhuman::local_ai::service) async fn ensure_lm_studio_available(
+    pub(in crate::openhuman::inference::local::service) async fn ensure_lm_studio_available(
         &self,
         config: &Config,
     ) -> Result<(), String> {
@@ -37,7 +37,7 @@ impl LocalAiService {
         Ok(())
     }
 
-    pub(in crate::openhuman::local_ai::service) async fn list_lm_studio_models(
+    pub(in crate::openhuman::inference::local::service) async fn list_lm_studio_models(
         &self,
         config: &Config,
     ) -> Result<Vec<OllamaModelTag>, String> {
@@ -120,7 +120,7 @@ impl LocalAiService {
             .collect())
     }
 
-    pub(in crate::openhuman::local_ai::service) async fn has_lm_studio_model(
+    pub(in crate::openhuman::inference::local::service) async fn has_lm_studio_model(
         &self,
         config: &Config,
         model: &str,
@@ -133,7 +133,7 @@ impl LocalAiService {
             .any(|m| m.name.to_ascii_lowercase() == target))
     }
 
-    pub(in crate::openhuman::local_ai::service) async fn lm_studio_chat_completion(
+    pub(in crate::openhuman::inference::local::service) async fn lm_studio_chat_completion(
         &self,
         config: &Config,
         messages: Vec<LmStudioChatMessage>,
diff --git a/src/openhuman/local_ai/service/mod.rs b/src/openhuman/inference/local/service/mod.rs
similarity index 94%
rename from src/openhuman/local_ai/service/mod.rs
rename to src/openhuman/inference/local/service/mod.rs
index d9f03cd3ed..0da432d265 100644
--- a/src/openhuman/local_ai/service/mod.rs
+++ b/src/openhuman/inference/local/service/mod.rs
@@ -10,7 +10,7 @@ mod speech;
 mod vision_embed;
 pub(crate) mod whisper_engine;
 
-use crate::openhuman::local_ai::types::LocalAiStatus;
+use crate::openhuman::inference::types::LocalAiStatus;
 use parking_lot::Mutex;
 
 pub struct LocalAiService {
diff --git a/src/openhuman/local_ai/service/ollama_admin.rs b/src/openhuman/inference/local/service/ollama_admin.rs
similarity index 97%
rename from src/openhuman/local_ai/service/ollama_admin.rs
rename to src/openhuman/inference/local/service/ollama_admin.rs
index 417134ba36..5114ec7f39 100644
--- a/src/openhuman/local_ai/service/ollama_admin.rs
+++ b/src/openhuman/inference/local/service/ollama_admin.rs
@@ -3,17 +3,19 @@ use std::path::{Path, PathBuf};
 use futures_util::StreamExt;
 
 use crate::openhuman::config::Config;
-use crate::openhuman::local_ai::install::{find_system_ollama_binary, run_ollama_install_script};
-use crate::openhuman::local_ai::lm_studio_api::lm_studio_base_url;
-use crate::openhuman::local_ai::model_ids;
-use crate::openhuman::local_ai::ollama_api::{
+use crate::openhuman::inference::local::install::{
+    find_system_ollama_binary, run_ollama_install_script,
+};
+use crate::openhuman::inference::local::lm_studio::lm_studio_base_url;
+use crate::openhuman::inference::local::ollama::{
     ollama_base_url, OllamaModelTag, OllamaPullEvent, OllamaPullProgress, OllamaPullRequest,
     OllamaTagsResponse,
 };
-use crate::openhuman::local_ai::paths::{find_workspace_ollama_binary, workspace_ollama_binary};
-use crate::openhuman::local_ai::presets::{self, VisionMode};
-use crate::openhuman::local_ai::process_util::apply_no_window;
-use crate::openhuman::local_ai::provider::{provider_from_config, LocalAiProvider};
+use crate::openhuman::inference::local::process_util::apply_no_window;
+use crate::openhuman::inference::local::provider::{provider_from_config, LocalAiProvider};
+use crate::openhuman::inference::model_ids;
+use crate::openhuman::inference::paths::{find_workspace_ollama_binary, workspace_ollama_binary};
+use crate::openhuman::inference::presets::{self, VisionMode};
 
 use super::spawn_marker::{self, OllamaSpawnMarker};
 use super::LocalAiService;
@@ -23,7 +25,7 @@ fn lm_studio_models_error_means_unreachable(error: &str) -> bool {
 }
 
 impl LocalAiService {
-    pub(in crate::openhuman::local_ai::service) async fn ensure_ollama_server(
+    pub(in crate::openhuman::inference::local::service) async fn ensure_ollama_server(
         &self,
         _config: &Config,
     ) -> Result<(), String> {
@@ -45,7 +47,7 @@ impl LocalAiService {
 
     /// Like `ensure_ollama_server`, but forces a fresh install of the Ollama binary
     /// (ignoring cached/workspace binaries). Used as a retry after the first attempt fails.
-    pub(in crate::openhuman::local_ai::service) async fn ensure_ollama_server_fresh(
+    pub(in crate::openhuman::inference::local::service) async fn ensure_ollama_server_fresh(
         &self,
         config: &Config,
     ) -> Result<(), String> {
@@ -325,7 +327,7 @@ impl LocalAiService {
     }
 
     async fn download_and_install_ollama(&self, config: &Config) -> Result<(), String> {
-        let install_dir = crate::openhuman::local_ai::paths::workspace_ollama_dir(config);
+        let install_dir = crate::openhuman::inference::paths::workspace_ollama_dir(config);
         tokio::fs::create_dir_all(&install_dir)
             .await
             .map_err(|e| format!("failed to create Ollama install directory: {e}"))?;
@@ -337,7 +339,7 @@ impl LocalAiService {
         // OllamaSetup.exe running, wait for it instead of starting a
         // second one — two concurrent installers race on the same dir
         // and corrupt the install.
-        if crate::openhuman::local_ai::install::is_ollama_installer_running() {
+        if crate::openhuman::inference::local::install::is_ollama_installer_running() {
             log::info!(
                 "[local_ai] detected in-flight OllamaSetup.exe — \
                  waiting for it to finish before deciding whether to install"
@@ -357,7 +359,7 @@ impl LocalAiService {
             const INSTALLER_WAIT_TIMEOUT: std::time::Duration =
                 std::time::Duration::from_secs(5 * 60);
             let mut timed_out = false;
-            while crate::openhuman::local_ai::install::is_ollama_installer_running() {
+            while crate::openhuman::inference::local::install::is_ollama_installer_running() {
                 if wait_start.elapsed() >= INSTALLER_WAIT_TIMEOUT {
                     timed_out = true;
                     break;
@@ -472,7 +474,7 @@ impl LocalAiService {
         Ok(())
     }
 
-    pub(in crate::openhuman::local_ai::service) async fn ollama_healthy(&self) -> bool {
+    pub(in crate::openhuman::inference::local::service) async fn ollama_healthy(&self) -> bool {
         self.http
             .get(format!("{}/api/tags", ollama_base_url()))
             .timeout(std::time::Duration::from_secs(2))
@@ -489,7 +491,7 @@ impl LocalAiService {
     /// to `/api/tags` should consult this first. Returning `false` here means
     /// the UI should drive the user to install Ollama instead of polling for
     /// model state that can never appear.
-    pub(in crate::openhuman::local_ai::service) fn ollama_binary_present(
+    pub(in crate::openhuman::inference::local::service) fn ollama_binary_present(
         &self,
         config: &Config,
     ) -> bool {
@@ -512,7 +514,7 @@ impl LocalAiService {
         find_system_ollama_binary().is_some()
     }
 
-    pub(in crate::openhuman::local_ai::service) async fn ensure_models_available(
+    pub(in crate::openhuman::inference::local::service) async fn ensure_models_available(
         &self,
         config: &Config,
     ) -> Result<(), String> {
@@ -553,7 +555,7 @@ impl LocalAiService {
         Ok(())
     }
 
-    pub(in crate::openhuman::local_ai::service) async fn ensure_ollama_model_available(
+    pub(in crate::openhuman::inference::local::service) async fn ensure_ollama_model_available(
         &self,
         model_id: &str,
         label: &str,
@@ -1136,7 +1138,7 @@ impl LocalAiService {
         }
 
         // 5. Platform-specific well-known locations (macOS bundles, Windows, Linux).
-        crate::openhuman::local_ai::install::find_system_ollama_binary()
+        crate::openhuman::inference::local::install::find_system_ollama_binary()
             .map(|p| p.display().to_string())
     }
 
@@ -1227,7 +1229,7 @@ impl LocalAiService {
         spawn_marker::clear_marker(config);
     }
 
-    pub(in crate::openhuman::local_ai::service) async fn has_model(
+    pub(in crate::openhuman::inference::local::service) async fn has_model(
         &self,
         model: &str,
     ) -> Result<bool, String> {
diff --git a/src/openhuman/local_ai/service/ollama_admin_tests.rs b/src/openhuman/inference/local/service/ollama_admin_tests.rs
similarity index 92%
rename from src/openhuman/local_ai/service/ollama_admin_tests.rs
rename to src/openhuman/inference/local/service/ollama_admin_tests.rs
index b821301d90..d85e9356f2 100644
--- a/src/openhuman/local_ai/service/ollama_admin_tests.rs
+++ b/src/openhuman/inference/local/service/ollama_admin_tests.rs
@@ -11,7 +11,7 @@ fn interrupted_pull_does_not_wait_before_any_progress() {
 }
 
 use crate::openhuman::config::Config;
-use crate::openhuman::local_ai::service::LocalAiService;
+use crate::openhuman::inference::local::service::LocalAiService;
 use axum::{routing::get, Json, Router};
 use serde_json::json;
 
@@ -35,7 +35,7 @@ fn lm_studio_config(base: &str) -> Config {
 
 #[tokio::test]
 async fn has_model_detects_exact_and_prefixed_tag() {
-    let _guard = crate::openhuman::local_ai::local_ai_test_guard();
+    let _guard = crate::openhuman::inference::inference_test_guard();
 
     let app = Router::new().route(
         "/api/tags",
@@ -67,7 +67,7 @@ async fn has_model_detects_exact_and_prefixed_tag() {
 
 #[tokio::test]
 async fn has_model_errors_on_non_success_tags_response() {
-    let _guard = crate::openhuman::local_ai::local_ai_test_guard();
+    let _guard = crate::openhuman::inference::inference_test_guard();
 
     let app = Router::new().route(
         "/api/tags",
@@ -90,7 +90,7 @@ async fn has_model_errors_on_non_success_tags_response() {
 
 #[tokio::test]
 async fn ollama_healthy_returns_true_on_200_tags_response() {
-    let _guard = crate::openhuman::local_ai::local_ai_test_guard();
+    let _guard = crate::openhuman::inference::inference_test_guard();
 
     let app = Router::new().route("/api/tags", get(|| async { Json(json!({ "models": [] })) }));
     let base = spawn_mock(app).await;
@@ -109,7 +109,7 @@ async fn ollama_healthy_returns_true_on_200_tags_response() {
 
 #[tokio::test]
 async fn ollama_healthy_returns_false_on_unreachable_url() {
-    let _guard = crate::openhuman::local_ai::local_ai_test_guard();
+    let _guard = crate::openhuman::inference::inference_test_guard();
 
     // Point at a port we never bind → connect fails → healthy = false.
     unsafe {
@@ -125,7 +125,7 @@ async fn ollama_healthy_returns_false_on_unreachable_url() {
 
 #[tokio::test]
 async fn ensure_ollama_server_requires_external_runtime_when_unreachable() {
-    let _guard = crate::openhuman::local_ai::local_ai_test_guard();
+    let _guard = crate::openhuman::inference::inference_test_guard();
 
     unsafe {
         std::env::set_var("OPENHUMAN_OLLAMA_BASE_URL", "http://127.0.0.1:1");
@@ -150,7 +150,7 @@ async fn ensure_ollama_server_requires_external_runtime_when_unreachable() {
 
 #[tokio::test]
 async fn ensure_ollama_server_reports_broken_external_runner_without_restart_attempt() {
-    let _guard = crate::openhuman::local_ai::local_ai_test_guard();
+    let _guard = crate::openhuman::inference::inference_test_guard();
 
     let app = Router::new()
         .route("/api/tags", get(|| async { Json(json!({ "models": [] })) }))
@@ -187,7 +187,7 @@ async fn ensure_ollama_server_reports_broken_external_runner_without_restart_att
 
 #[tokio::test]
 async fn assets_status_marks_ollama_unavailable_when_runtime_is_down_even_if_binary_exists() {
-    let _guard = crate::openhuman::local_ai::local_ai_test_guard();
+    let _guard = crate::openhuman::inference::inference_test_guard();
 
     unsafe {
         std::env::set_var("OPENHUMAN_OLLAMA_BASE_URL", "http://127.0.0.1:1");
@@ -219,7 +219,7 @@ async fn assets_status_marks_ollama_unavailable_when_runtime_is_down_even_if_bin
 
 #[tokio::test]
 async fn diagnostics_reports_server_unreachable_when_url_unbound() {
-    let _guard = crate::openhuman::local_ai::local_ai_test_guard();
+    let _guard = crate::openhuman::inference::inference_test_guard();
 
     unsafe {
         std::env::set_var("OPENHUMAN_OLLAMA_BASE_URL", "http://127.0.0.1:1");
@@ -255,7 +255,7 @@ async fn diagnostics_reports_server_unreachable_when_url_unbound() {
 
 #[tokio::test]
 async fn diagnostics_with_running_server_but_missing_models_flags_issues() {
-    let _guard = crate::openhuman::local_ai::local_ai_test_guard();
+    let _guard = crate::openhuman::inference::inference_test_guard();
 
     let app = Router::new().route("/api/tags", get(|| async { Json(json!({ "models": [] })) }));
     let base = spawn_mock(app).await;
@@ -290,11 +290,11 @@ async fn diagnostics_with_running_server_but_missing_models_flags_issues() {
 
 #[tokio::test]
 async fn diagnostics_ok_when_expected_models_are_present() {
-    let _guard = crate::openhuman::local_ai::local_ai_test_guard();
+    let _guard = crate::openhuman::inference::inference_test_guard();
 
     let config = Config::default();
-    let chat = crate::openhuman::local_ai::model_ids::effective_chat_model_id(&config);
-    let embedding = crate::openhuman::local_ai::model_ids::effective_embedding_model_id(&config);
+    let chat = crate::openhuman::inference::model_ids::effective_chat_model_id(&config);
+    let embedding = crate::openhuman::inference::model_ids::effective_embedding_model_id(&config);
     let chat_tag = format!("{}:latest", chat);
     let embed_tag = format!("{}:latest", embedding);
     let app = Router::new().route(
@@ -345,7 +345,7 @@ async fn diagnostics_ok_when_expected_models_are_present() {
 
 #[tokio::test]
 async fn resolve_binary_path_finds_binary_via_ollama_bin_env() {
-    let _guard = crate::openhuman::local_ai::local_ai_test_guard();
+    let _guard = crate::openhuman::inference::inference_test_guard();
 
     let tmp = tempfile::tempdir().unwrap();
     let fake_bin = tmp.path().join(if cfg!(windows) {
@@ -378,7 +378,7 @@ async fn resolve_binary_path_finds_binary_via_ollama_bin_env() {
 
 #[tokio::test]
 async fn diagnostics_repair_actions_are_empty_when_binary_is_known_but_server_is_down() {
-    let _guard = crate::openhuman::local_ai::local_ai_test_guard();
+    let _guard = crate::openhuman::inference::inference_test_guard();
 
     let tmp = tempfile::tempdir().unwrap();
     let fake_bin = tmp.path().join(if cfg!(windows) {
@@ -417,7 +417,7 @@ async fn diagnostics_repair_actions_are_empty_when_binary_is_known_but_server_is
 async fn diagnostics_repair_actions_field_always_present() {
     // Verifies that the "repair_actions" key is always present in the diagnostics
     // JSON, regardless of the server state, so the UI can always iterate over it.
-    let _guard = crate::openhuman::local_ai::local_ai_test_guard();
+    let _guard = crate::openhuman::inference::inference_test_guard();
 
     unsafe {
         std::env::set_var("OPENHUMAN_OLLAMA_BASE_URL", "http://127.0.0.1:1");
@@ -438,7 +438,7 @@ async fn diagnostics_repair_actions_field_always_present() {
 
 #[tokio::test]
 async fn list_models_returns_parsed_payload() {
-    let _guard = crate::openhuman::local_ai::local_ai_test_guard();
+    let _guard = crate::openhuman::inference::inference_test_guard();
 
     let app = Router::new().route(
         "/api/tags",
@@ -469,7 +469,7 @@ async fn list_models_returns_parsed_payload() {
 
 #[tokio::test]
 async fn list_models_errors_on_non_success() {
-    let _guard = crate::openhuman::local_ai::local_ai_test_guard();
+    let _guard = crate::openhuman::inference::inference_test_guard();
 
     let app = Router::new().route(
         "/api/tags",
@@ -491,7 +491,7 @@ async fn list_models_errors_on_non_success() {
 
 #[tokio::test]
 async fn lm_studio_list_models_returns_loaded_models() {
-    let _guard = crate::openhuman::local_ai::local_ai_test_guard();
+    let _guard = crate::openhuman::inference::inference_test_guard();
 
     let app = Router::new().route(
         "/v1/models",
@@ -524,7 +524,7 @@ async fn lm_studio_list_models_returns_loaded_models() {
 
 #[tokio::test]
 async fn lm_studio_diagnostics_reports_loaded_chat_model() {
-    let _guard = crate::openhuman::local_ai::local_ai_test_guard();
+    let _guard = crate::openhuman::inference::inference_test_guard();
 
     let app = Router::new().route(
         "/v1/models",
@@ -550,7 +550,7 @@ async fn lm_studio_diagnostics_reports_loaded_chat_model() {
 
 #[tokio::test]
 async fn lm_studio_diagnostics_flags_missing_chat_model() {
-    let _guard = crate::openhuman::local_ai::local_ai_test_guard();
+    let _guard = crate::openhuman::inference::inference_test_guard();
 
     let app = Router::new().route(
         "/v1/models",
@@ -580,7 +580,7 @@ async fn lm_studio_diagnostics_flags_missing_chat_model() {
 
 #[tokio::test]
 async fn lm_studio_diagnostics_surfaces_reachable_model_list_errors() {
-    let _guard = crate::openhuman::local_ai::local_ai_test_guard();
+    let _guard = crate::openhuman::inference::inference_test_guard();
 
     let app = Router::new().route("/v1/models", get(|| async { "not json" }));
     let base = spawn_mock(app).await;
@@ -605,7 +605,7 @@ async fn lm_studio_diagnostics_surfaces_reachable_model_list_errors() {
 
 #[tokio::test]
 async fn lm_studio_assets_reports_embedding_as_ollama_managed() {
-    let _guard = crate::openhuman::local_ai::local_ai_test_guard();
+    let _guard = crate::openhuman::inference::inference_test_guard();
 
     let app = Router::new().route(
         "/v1/models",
@@ -659,9 +659,7 @@ async fn lm_studio_assets_reports_embedding_as_ollama_managed() {
 
 #[tokio::test]
 async fn kill_ollama_server_with_no_owned_child_is_noop() {
-    let _guard = crate::openhuman::local_ai::LOCAL_AI_TEST_MUTEX
-        .lock()
-        .expect("local ai mutex");
+    let _guard = crate::openhuman::inference::inference_test_guard();
 
     let config = Config::default();
     let service = LocalAiService::new(&config);
@@ -684,9 +682,7 @@ async fn kill_ollama_server_with_no_owned_child_is_noop() {
 
 #[tokio::test]
 async fn kill_ollama_server_kills_owned_child() {
-    let _guard = crate::openhuman::local_ai::LOCAL_AI_TEST_MUTEX
-        .lock()
-        .expect("local ai mutex");
+    let _guard = crate::openhuman::inference::inference_test_guard();
 
     let config = Config::default();
     let service = LocalAiService::new(&config);
@@ -711,7 +707,7 @@ async fn kill_ollama_server_kills_owned_child() {
 
     // Sanity: child should be alive immediately after spawn.
     assert!(
-        crate::openhuman::local_ai::service::spawn_marker::pid_is_alive(pid),
+        crate::openhuman::inference::local::service::spawn_marker::pid_is_alive(pid),
         "child pid {pid} should be alive right after spawn"
     );
 
@@ -727,7 +723,7 @@ async fn kill_ollama_server_kills_owned_child() {
     // update its process table — the kill is signalled but reap is async.
     let mut still_alive = true;
     for _ in 0..40 {
-        if !crate::openhuman::local_ai::service::spawn_marker::pid_is_alive(pid) {
+        if !crate::openhuman::inference::local::service::spawn_marker::pid_is_alive(pid) {
             still_alive = false;
             break;
         }
@@ -741,9 +737,7 @@ async fn kill_ollama_server_kills_owned_child() {
 
 #[tokio::test]
 async fn shutdown_owned_ollama_clears_marker_and_kills_child() {
-    let _guard = crate::openhuman::local_ai::LOCAL_AI_TEST_MUTEX
-        .lock()
-        .expect("local ai mutex");
+    let _guard = crate::openhuman::inference::inference_test_guard();
 
     // Redirect the workspace root to a tempdir so the marker file doesn't
     // touch the real `~/.openhuman/`. Per `paths::shared_root_dir`, when
@@ -781,12 +775,12 @@ async fn shutdown_owned_ollama_clears_marker_and_kills_child() {
     // resolves to a writable temp path, the write is exercised. On hosts
     // where `default_root_openhuman_dir()` succeeds against the real home
     // dir, we skip the marker assertion to avoid touching `~/.openhuman/`.
-    let marker_path = crate::openhuman::local_ai::paths::ollama_spawn_marker_path(&config);
+    let marker_path = crate::openhuman::inference::paths::ollama_spawn_marker_path(&config);
     let marker_writable = marker_path.starts_with(tmp.path());
     if marker_writable {
-        crate::openhuman::local_ai::service::spawn_marker::write_marker_at(
+        crate::openhuman::inference::local::service::spawn_marker::write_marker_at(
             &marker_path,
-            &crate::openhuman::local_ai::service::spawn_marker::OllamaSpawnMarker::new(
+            &crate::openhuman::inference::local::service::spawn_marker::OllamaSpawnMarker::new(
                 pid,
                 std::path::Path::new("test-stub"),
             ),
@@ -810,7 +804,7 @@ async fn shutdown_owned_ollama_clears_marker_and_kills_child() {
     // And the spawned process is dead.
     let mut still_alive = true;
     for _ in 0..40 {
-        if !crate::openhuman::local_ai::service::spawn_marker::pid_is_alive(pid) {
+        if !crate::openhuman::inference::local::service::spawn_marker::pid_is_alive(pid) {
             still_alive = false;
             break;
         }
@@ -827,7 +821,7 @@ async fn shutdown_owned_ollama_clears_marker_and_kills_child() {
 /// `ollama_available: false` immediately.
 #[tokio::test]
 async fn assets_status_sets_ollama_available_false_when_binary_missing() {
-    let _guard = crate::openhuman::local_ai::local_ai_test_guard();
+    let _guard = crate::openhuman::inference::inference_test_guard();
 
     let tmp = tempfile::tempdir().unwrap();
     let mut config = Config::default();
@@ -925,7 +919,7 @@ fn binary_present_uses_ollama_bin_env_var_when_set() {
     // When OLLAMA_BIN points to a real file, it must be preferred over the
     // workspace/system lookup. Use the current test binary itself as the
     // "fake ollama" — it's guaranteed to be a real file.
-    let _guard = crate::openhuman::local_ai::local_ai_test_guard();
+    let _guard = crate::openhuman::inference::inference_test_guard();
 
     let real_file = std::env::current_exe().expect("current test exe path");
     let prev = std::env::var_os("OLLAMA_BIN");
diff --git a/src/openhuman/local_ai/service/public_infer.rs b/src/openhuman/inference/local/service/public_infer.rs
similarity index 94%
rename from src/openhuman/local_ai/service/public_infer.rs
rename to src/openhuman/inference/local/service/public_infer.rs
index ef49c75049..ad3aee8773 100644
--- a/src/openhuman/local_ai/service/public_infer.rs
+++ b/src/openhuman/inference/local/service/public_infer.rs
@@ -1,10 +1,10 @@
 use crate::openhuman::config::Config;
-use crate::openhuman::local_ai::model_ids;
-use crate::openhuman::local_ai::ollama_api::{
+use crate::openhuman::inference::local::ollama::{
     ns_to_tps, ollama_base_url, OllamaGenerateOptions, OllamaGenerateRequest,
 };
-use crate::openhuman::local_ai::parse::sanitize_inline_completion;
-use crate::openhuman::local_ai::provider::{provider_from_config, LocalAiProvider};
+use crate::openhuman::inference::local::provider::{provider_from_config, LocalAiProvider};
+use crate::openhuman::inference::model_ids;
+use crate::openhuman::inference::parse::sanitize_inline_completion;
 
 use super::LocalAiService;
 
@@ -182,7 +182,7 @@ impl LocalAiService {
     pub(crate) async fn chat_with_history(
         &self,
         config: &Config,
-        messages: Vec<crate::openhuman::local_ai::ollama_api::OllamaChatMessage>,
+        messages: Vec<crate::openhuman::inference::local::ollama::OllamaChatMessage>,
         max_tokens: Option<u32>,
     ) -> Result<String, String> {
         if !config.local_ai.runtime_enabled {
@@ -205,7 +205,7 @@ impl LocalAiService {
             let lm_messages = messages
                 .into_iter()
                 .map(
-                    |message| crate::openhuman::local_ai::lm_studio_api::LmStudioChatMessage {
+                    |message| crate::openhuman::inference::local::lm_studio::LmStudioChatMessage {
                         role: message.role,
                         content: message.content,
                     },
@@ -241,18 +241,18 @@ impl LocalAiService {
 
         tracing::debug!(
             message_count = messages.len(),
-            model = %crate::openhuman::local_ai::model_ids::effective_chat_model_id(config),
+            model = %crate::openhuman::inference::model_ids::effective_chat_model_id(config),
             "[local_ai:chat] sending to ollama /api/chat"
         );
 
         let started = std::time::Instant::now();
 
-        let body = crate::openhuman::local_ai::ollama_api::OllamaChatRequest {
-            model: crate::openhuman::local_ai::model_ids::effective_chat_model_id(config),
+        let body = crate::openhuman::inference::local::ollama::OllamaChatRequest {
+            model: crate::openhuman::inference::model_ids::effective_chat_model_id(config),
             messages,
             stream: false,
             options: Some(
-                crate::openhuman::local_ai::ollama_api::OllamaGenerateOptions {
+                crate::openhuman::inference::local::ollama::OllamaGenerateOptions {
                     temperature: Some(config.default_temperature as f32),
                     top_k: Some(40),
                     top_p: Some(0.9),
@@ -284,7 +284,7 @@ impl LocalAiService {
             ));
         }
 
-        let payload: crate::openhuman::local_ai::ollama_api::OllamaChatResponse = response
+        let payload: crate::openhuman::inference::local::ollama::OllamaChatResponse = response
             .json()
             .await
             .map_err(|e| format!("ollama chat response parse failed: {e}"))?;
@@ -466,11 +466,11 @@ impl LocalAiService {
 
         if provider_from_config(config) == LocalAiProvider::LmStudio {
             let messages = vec![
-                crate::openhuman::local_ai::lm_studio_api::LmStudioChatMessage {
+                crate::openhuman::inference::local::lm_studio::LmStudioChatMessage {
                     role: "system".to_string(),
                     content: effective_system,
                 },
-                crate::openhuman::local_ai::lm_studio_api::LmStudioChatMessage {
+                crate::openhuman::inference::local::lm_studio::LmStudioChatMessage {
                     role: "user".to_string(),
                     content: prompt.to_string(),
                 },
@@ -533,7 +533,7 @@ impl LocalAiService {
             ));
         }
 
-        let payload: crate::openhuman::local_ai::ollama_api::OllamaGenerateResponse = response
+        let payload: crate::openhuman::inference::local::ollama::OllamaGenerateResponse = response
             .json()
             .await
             .map_err(|e| format!("ollama response parse failed: {e}"))?;
diff --git a/src/openhuman/local_ai/service/public_infer_tests.rs b/src/openhuman/inference/local/service/public_infer_tests.rs
similarity index 94%
rename from src/openhuman/local_ai/service/public_infer_tests.rs
rename to src/openhuman/inference/local/service/public_infer_tests.rs
index 44b62cdc90..8931438e2c 100644
--- a/src/openhuman/local_ai/service/public_infer_tests.rs
+++ b/src/openhuman/inference/local/service/public_infer_tests.rs
@@ -38,7 +38,7 @@ fn ready_service(config: &Config) -> LocalAiService {
 
 #[tokio::test]
 async fn inference_hits_ollama_generate_and_returns_response() {
-    let _guard = crate::openhuman::local_ai::local_ai_test_guard();
+    let _guard = crate::openhuman::inference::inference_test_guard();
 
     let app = Router::new().route(
         "/api/generate",
@@ -75,7 +75,7 @@ async fn inference_hits_ollama_generate_and_returns_response() {
 
 #[tokio::test]
 async fn inference_errors_on_non_success_status() {
-    let _guard = crate::openhuman::local_ai::local_ai_test_guard();
+    let _guard = crate::openhuman::inference::inference_test_guard();
 
     let app = Router::new().route(
         "/api/generate",
@@ -98,7 +98,7 @@ async fn inference_errors_on_non_success_status() {
 
 #[tokio::test]
 async fn inference_connection_failure_mentions_external_ollama_runtime() {
-    let _guard = crate::openhuman::local_ai::local_ai_test_guard();
+    let _guard = crate::openhuman::inference::inference_test_guard();
 
     unsafe {
         std::env::set_var("OPENHUMAN_OLLAMA_BASE_URL", "http://127.0.0.1:1");
@@ -121,7 +121,7 @@ async fn inference_connection_failure_mentions_external_ollama_runtime() {
 
 #[tokio::test]
 async fn inference_errors_on_empty_response_when_allow_empty_false() {
-    let _guard = crate::openhuman::local_ai::local_ai_test_guard();
+    let _guard = crate::openhuman::inference::inference_test_guard();
 
     let app = Router::new().route(
         "/api/generate",
@@ -158,7 +158,7 @@ async fn inference_errors_on_empty_response_when_allow_empty_false() {
 
 #[tokio::test]
 async fn lm_studio_prompt_hits_openai_chat_completions() {
-    let _guard = crate::openhuman::local_ai::local_ai_test_guard();
+    let _guard = crate::openhuman::inference::inference_test_guard();
 
     let app = Router::new().route(
         "/v1/chat/completions",
@@ -197,7 +197,7 @@ async fn lm_studio_prompt_hits_openai_chat_completions() {
 
 #[tokio::test]
 async fn lm_studio_chat_with_history_returns_response() {
-    let _guard = crate::openhuman::local_ai::local_ai_test_guard();
+    let _guard = crate::openhuman::inference::inference_test_guard();
 
     let app = Router::new().route(
         "/v1/chat/completions",
@@ -219,11 +219,11 @@ async fn lm_studio_chat_with_history_returns_response() {
         .chat_with_history(
             &config,
             vec![
-                crate::openhuman::local_ai::ollama_api::OllamaChatMessage {
+                crate::openhuman::inference::local::ollama::OllamaChatMessage {
                     role: "system".to_string(),
                     content: "be terse".to_string(),
                 },
-                crate::openhuman::local_ai::ollama_api::OllamaChatMessage {
+                crate::openhuman::inference::local::ollama::OllamaChatMessage {
                     role: "user".to_string(),
                     content: "hi".to_string(),
                 },
@@ -238,7 +238,7 @@ async fn lm_studio_chat_with_history_returns_response() {
 
 #[tokio::test]
 async fn lm_studio_prompt_errors_on_non_success_status() {
-    let _guard = crate::openhuman::local_ai::local_ai_test_guard();
+    let _guard = crate::openhuman::inference::inference_test_guard();
 
     let app = Router::new().route(
         "/v1/chat/completions",
@@ -307,7 +307,7 @@ async fn inline_complete_interactive_disabled_returns_empty_string() {
 /// the permit it would deadlock or time out.
 #[tokio::test]
 async fn inline_complete_interactive_does_not_block_on_held_permit() {
-    let _guard = crate::openhuman::local_ai::local_ai_test_guard();
+    let _guard = crate::openhuman::inference::inference_test_guard();
 
     // Hold the global LLM permit for the duration of the test.
     let _held = crate::openhuman::scheduler_gate::gate::try_acquire_llm_permit()
@@ -364,7 +364,7 @@ async fn inline_complete_interactive_does_not_block_on_held_permit() {
 // safer trade-off. See PR #1524.
 #[ignore = "flaky timing under full-suite load — see PR #1524"]
 async fn gated_inline_complete_blocks_on_held_permit() {
-    let _guard = crate::openhuman::local_ai::local_ai_test_guard();
+    let _guard = crate::openhuman::inference::inference_test_guard();
 
     let held = crate::openhuman::scheduler_gate::gate::try_acquire_llm_permit()
         .expect("test must start with a free permit");
diff --git a/src/openhuman/local_ai/service/spawn_marker.rs b/src/openhuman/inference/local/service/spawn_marker.rs
similarity index 99%
rename from src/openhuman/local_ai/service/spawn_marker.rs
rename to src/openhuman/inference/local/service/spawn_marker.rs
index e8f5dd59e8..cc791d6e49 100644
--- a/src/openhuman/local_ai/service/spawn_marker.rs
+++ b/src/openhuman/inference/local/service/spawn_marker.rs
@@ -17,7 +17,7 @@ use std::path::Path;
 use serde::{Deserialize, Serialize};
 
 use crate::openhuman::config::Config;
-use crate::openhuman::local_ai::paths::ollama_spawn_marker_path;
+use crate::openhuman::inference::paths::ollama_spawn_marker_path;
 
 /// On-disk record of an openhuman-spawned `ollama serve` process.
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
diff --git a/src/openhuman/local_ai/service/speech.rs b/src/openhuman/inference/local/service/speech.rs
similarity index 97%
rename from src/openhuman/local_ai/service/speech.rs
rename to src/openhuman/inference/local/service/speech.rs
index 5cbf1a0c4e..79b56a0a26 100644
--- a/src/openhuman/local_ai/service/speech.rs
+++ b/src/openhuman/inference/local/service/speech.rs
@@ -4,12 +4,12 @@ use std::time::Instant;
 use log::{debug, warn};
 
 use crate::openhuman::config::Config;
-use crate::openhuman::local_ai::model_ids;
-use crate::openhuman::local_ai::paths::{
+use crate::openhuman::inference::model_ids;
+use crate::openhuman::inference::paths::{
     config_root_dir, resolve_piper_binary, resolve_stt_model_path, resolve_tts_voice_path,
     resolve_whisper_binary,
 };
-use crate::openhuman::local_ai::types::{LocalAiSpeechResult, LocalAiTtsResult};
+use crate::openhuman::inference::types::{LocalAiSpeechResult, LocalAiTtsResult};
 
 use super::whisper_engine;
 use super::LocalAiService;
@@ -54,7 +54,7 @@ impl LocalAiService {
                         "{LOG_PREFIX} whisper in-process enabled but unloaded; loading model lazily"
                     );
                     // Detect GPU at lazy-load time so whisper can use acceleration.
-                    let device = crate::openhuman::local_ai::device::detect_device_profile();
+                    let device = crate::openhuman::inference::device::detect_device_profile();
                     let gpu = device.has_gpu;
                     let gpu_desc = device.gpu_description.clone();
                     let load_result = tokio::task::spawn_blocking(move || {
diff --git a/src/openhuman/local_ai/service/vision_embed.rs b/src/openhuman/inference/local/service/vision_embed.rs
similarity index 95%
rename from src/openhuman/local_ai/service/vision_embed.rs
rename to src/openhuman/inference/local/service/vision_embed.rs
index 950f70b842..0ed010cfa1 100644
--- a/src/openhuman/local_ai/service/vision_embed.rs
+++ b/src/openhuman/inference/local/service/vision_embed.rs
@@ -1,12 +1,12 @@
 use crate::openhuman::agent::multimodal;
 use crate::openhuman::config::Config;
-use crate::openhuman::local_ai::model_ids;
-use crate::openhuman::local_ai::ollama_api::{
+use crate::openhuman::inference::local::ollama::{
     ollama_base_url, OllamaEmbedRequest, OllamaEmbedResponse, OllamaGenerateOptions,
     OllamaGenerateRequest,
 };
-use crate::openhuman::local_ai::presets::{self, VisionMode};
-use crate::openhuman::local_ai::types::LocalAiEmbeddingResult;
+use crate::openhuman::inference::model_ids;
+use crate::openhuman::inference::presets::{self, VisionMode};
+use crate::openhuman::inference::types::LocalAiEmbeddingResult;
 
 use super::LocalAiService;
 
@@ -118,7 +118,7 @@ impl LocalAiService {
             ));
         }
 
-        let payload: crate::openhuman::local_ai::ollama_api::OllamaGenerateResponse = response
+        let payload: crate::openhuman::inference::local::ollama::OllamaGenerateResponse = response
             .json()
             .await
             .map_err(|e| format!("ollama vision response parse failed: {e}"))?;
@@ -250,7 +250,7 @@ mod tests {
 
     #[tokio::test]
     async fn embed_against_mock_returns_vectors_with_dimensions() {
-        let _guard = crate::openhuman::local_ai::local_ai_test_guard();
+        let _guard = crate::openhuman::inference::inference_test_guard();
 
         let app = mock_with_tags_and(
             "/api/embed",
@@ -281,7 +281,7 @@ mod tests {
 
     #[tokio::test]
     async fn embed_rejects_all_empty_inputs_before_network_call() {
-        let _guard = crate::openhuman::local_ai::local_ai_test_guard();
+        let _guard = crate::openhuman::inference::inference_test_guard();
 
         // Even without a working mock server, entirely-empty inputs must be
         // rejected before any HTTP call.
diff --git a/src/openhuman/local_ai/service/whisper_engine.rs b/src/openhuman/inference/local/service/whisper_engine.rs
similarity index 100%
rename from src/openhuman/local_ai/service/whisper_engine.rs
rename to src/openhuman/inference/local/service/whisper_engine.rs
diff --git a/src/openhuman/local_ai/voice_install_common.rs b/src/openhuman/inference/local/voice_install_common.rs
similarity index 100%
rename from src/openhuman/local_ai/voice_install_common.rs
rename to src/openhuman/inference/local/voice_install_common.rs
diff --git a/src/openhuman/inference/mod.rs b/src/openhuman/inference/mod.rs
index 79ec294b50..7fb5549e0f 100644
--- a/src/openhuman/inference/mod.rs
+++ b/src/openhuman/inference/mod.rs
@@ -1,15 +1,50 @@
-//! External inference domain.
+//! Unified inference domain.
 //!
-//! This module is the canonical controller surface for text / vision /
-//! embedding inference. The underlying implementation still reuses the
-//! existing local-runtime service during the migration away from the
-//! `local_ai` catch-all namespace.
+//! This module is the canonical home for all inference concerns:
+//! - `local/`    — Ollama / LM Studio / Whisper / Piper runtime management
+//!                 (was `src/openhuman/local_ai/`)
+//! - `provider/` — cloud + local provider trait, routing, reliability
+//!                 (was `src/openhuman/providers/`)
+//! - `voice/`    — transcription (STT) and TTS inference implementations
+//!                 (moved from `src/openhuman/voice/`)
+//! - `http/`     — OpenAI-compatible `/v1/chat/completions` endpoint
+//!
+//! The RPC surface remains under the `inference.*` and `local_ai.*` namespaces
+//! for backwards compatibility.
 
+pub mod device;
+pub mod http;
+pub mod local;
+pub mod model_ids;
 pub mod ops;
+pub mod parse;
+pub mod paths;
+pub mod presets;
+pub mod provider;
 mod schemas;
+pub mod sentiment;
+pub mod types;
+pub mod voice;
 
 pub use ops as rpc;
 pub use schemas::{
     all_controller_schemas as all_inference_controller_schemas,
     all_registered_controllers as all_inference_registered_controllers,
 };
+
+// Re-export the types that external callers (voice, agent, etc.) import from inference
+pub use device::DeviceProfile;
+pub use local::all_local_ai_controller_schemas;
+pub use local::all_local_ai_registered_controllers;
+pub use presets::{ModelPreset, ModelTier, VisionMode};
+pub use sentiment::SentimentResult;
+pub use types::{
+    LocalAiAssetStatus, LocalAiAssetsStatus, LocalAiDownloadProgressItem, LocalAiDownloadsProgress,
+    LocalAiEmbeddingResult, LocalAiSpeechResult, LocalAiStatus, LocalAiTtsResult,
+};
+
+// Test helpers (re-exported for sibling test files that use inference_test_guard)
+#[cfg(test)]
+pub(crate) fn inference_test_guard() -> std::sync::MutexGuard<'static, ()> {
+    local::inference_test_guard()
+}
diff --git a/src/openhuman/local_ai/model_ids.rs b/src/openhuman/inference/model_ids.rs
similarity index 99%
rename from src/openhuman/local_ai/model_ids.rs
rename to src/openhuman/inference/model_ids.rs
index 754a835ae9..a407df65d2 100644
--- a/src/openhuman/local_ai/model_ids.rs
+++ b/src/openhuman/inference/model_ids.rs
@@ -8,7 +8,7 @@
 //! tier restriction for OpenHuman-managed Ollama assets.
 
 use crate::openhuman::config::Config;
-use crate::openhuman::local_ai::provider::{provider_from_config, LocalAiProvider};
+use crate::openhuman::inference::local::provider::{provider_from_config, LocalAiProvider};
 
 pub(crate) const DEFAULT_OLLAMA_MODEL: &str = "gemma3:1b-it-qat";
 pub(crate) const DEFAULT_OLLAMA_VISION_MODEL: &str = "";
diff --git a/src/openhuman/inference/ops.rs b/src/openhuman/inference/ops.rs
index 56d6258a78..f64580344b 100644
--- a/src/openhuman/inference/ops.rs
+++ b/src/openhuman/inference/ops.rs
@@ -2,11 +2,11 @@
 
 use crate::openhuman::config::rpc as config_rpc;
 use crate::openhuman::config::Config;
-use crate::openhuman::local_ai;
-use crate::openhuman::local_ai::ops::{LocalAiChatMessage, ReactionDecision};
-use crate::openhuman::local_ai::sentiment::SentimentResult;
-use crate::openhuman::local_ai::{LocalAiEmbeddingResult, LocalAiStatus};
-use crate::openhuman::providers;
+use crate::openhuman::inference::local as local_runtime;
+use crate::openhuman::inference::local::ops::{LocalAiChatMessage, ReactionDecision};
+use crate::openhuman::inference::provider as providers;
+use crate::openhuman::inference::{device, presets, sentiment, SentimentResult};
+use crate::openhuman::inference::{LocalAiEmbeddingResult, LocalAiStatus};
 use crate::rpc::RpcOutcome;
 use serde_json::{json, Value};
 use tracing::{debug, error};
@@ -15,7 +15,7 @@ const LOG_PREFIX: &str = "[inference::ops]";
 
 pub async fn inference_status(config: &Config) -> Result<RpcOutcome<LocalAiStatus>, String> {
     debug!("{LOG_PREFIX} status:start");
-    let result = local_ai::rpc::local_ai_status(config).await;
+    let result = local_runtime::rpc::local_ai_status(config).await;
     match &result {
         Ok(outcome) => debug!(state = %outcome.value.state, "{LOG_PREFIX} status:ok"),
         Err(err) => error!(error = %err, "{LOG_PREFIX} status:error"),
@@ -33,7 +33,7 @@ pub async fn inference_summarize(
         ?max_tokens,
         "{LOG_PREFIX} summarize:start"
     );
-    let result = local_ai::rpc::local_ai_summarize(config, text, max_tokens).await;
+    let result = local_runtime::rpc::local_ai_summarize(config, text, max_tokens).await;
     match &result {
         Ok(outcome) => debug!(
             output_len = outcome.value.len(),
@@ -56,7 +56,7 @@ pub async fn inference_prompt(
         ?no_think,
         "{LOG_PREFIX} prompt:start"
     );
-    let result = local_ai::rpc::local_ai_prompt(config, prompt, max_tokens, no_think).await;
+    let result = local_runtime::rpc::local_ai_prompt(config, prompt, max_tokens, no_think).await;
     match &result {
         Ok(outcome) => debug!(output_len = outcome.value.len(), "{LOG_PREFIX} prompt:ok"),
         Err(err) => error!(error = %err, "{LOG_PREFIX} prompt:error"),
@@ -77,7 +77,7 @@ pub async fn inference_vision_prompt(
         "{LOG_PREFIX} vision_prompt:start"
     );
     let result =
-        local_ai::rpc::local_ai_vision_prompt(config, prompt, image_refs, max_tokens).await;
+        local_runtime::rpc::local_ai_vision_prompt(config, prompt, image_refs, max_tokens).await;
     match &result {
         Ok(outcome) => debug!(
             output_len = outcome.value.len(),
@@ -93,7 +93,7 @@ pub async fn inference_embed(
     inputs: &[String],
 ) -> Result<RpcOutcome<LocalAiEmbeddingResult>, String> {
     debug!(input_count = inputs.len(), "{LOG_PREFIX} embed:start");
-    let result = local_ai::rpc::local_ai_embed(config, inputs).await;
+    let result = local_runtime::rpc::local_ai_embed(config, inputs).await;
     match &result {
         Ok(outcome) => debug!(
             vector_count = outcome.value.vectors.len(),
@@ -115,7 +115,7 @@ pub async fn inference_chat(
         ?max_tokens,
         "{LOG_PREFIX} chat:start"
     );
-    let result = local_ai::rpc::local_ai_chat(config, messages, max_tokens).await;
+    let result = local_runtime::rpc::local_ai_chat(config, messages, max_tokens).await;
     match &result {
         Ok(outcome) => debug!(output_len = outcome.value.len(), "{LOG_PREFIX} chat:ok"),
         Err(err) => error!(error = %err, "{LOG_PREFIX} chat:error"),
@@ -132,7 +132,7 @@ pub async fn inference_should_react(
         message_len = message.len(),
         channel_type, "{LOG_PREFIX} should_react:start"
     );
-    let result = local_ai::rpc::local_ai_should_react(config, message, channel_type).await;
+    let result = local_runtime::rpc::local_ai_should_react(config, message, channel_type).await;
     match &result {
         Ok(outcome) => debug!(
             should_react = outcome.value.should_react,
@@ -151,7 +151,7 @@ pub async fn inference_analyze_sentiment(
         message_len = message.len(),
         "{LOG_PREFIX} analyze_sentiment:start"
     );
-    let result = local_ai::sentiment::local_ai_analyze_sentiment(config, message).await;
+    let result = sentiment::local_ai_analyze_sentiment(config, message).await;
     match &result {
         Ok(outcome) => {
             debug!(valence = %outcome.value.valence, "{LOG_PREFIX} analyze_sentiment:ok")
@@ -207,7 +207,7 @@ pub async fn inference_list_models(provider_id: &str) -> Result<RpcOutcome<Value
 
 pub async fn inference_device_profile() -> Result<RpcOutcome<Value>, String> {
     debug!("{LOG_PREFIX} device_profile:start");
-    let profile = local_ai::device::detect_device_profile();
+    let profile = device::detect_device_profile();
     let result = Ok(RpcOutcome::single_log(
         serde_json::to_value(profile).map_err(|e| format!("serialize: {e}"))?,
         "inference device profile fetched",
@@ -219,17 +219,17 @@ pub async fn inference_device_profile() -> Result<RpcOutcome<Value>, String> {
 pub async fn inference_presets() -> Result<RpcOutcome<Value>, String> {
     debug!("{LOG_PREFIX} presets:start");
     let config = config_rpc::load_config_with_timeout().await?;
-    let device = local_ai::device::detect_device_profile();
-    let recommended = local_ai::presets::recommend_tier(&device);
-    let current = local_ai::presets::current_tier_from_config(&config.local_ai);
+    let device = device::detect_device_profile();
+    let recommended = presets::recommend_tier(&device);
+    let current = presets::current_tier_from_config(&config.local_ai);
     let selected_tier = config.local_ai.selected_tier.as_ref().and_then(|value| {
         let normalized = value.trim().to_ascii_lowercase();
-        local_ai::presets::ModelTier::from_str_opt(&normalized)
+        presets::ModelTier::from_str_opt(&normalized)
             .map(|tier| tier.as_str().to_string())
             .or_else(|| (!normalized.is_empty()).then_some(normalized))
     });
-    let presets = local_ai::presets::mvp_presets();
-    let recommend_disabled = local_ai::presets::should_default_to_cloud_fallback(&device);
+    let presets = presets::mvp_presets();
+    let recommend_disabled = presets::should_default_to_cloud_fallback(&device);
     let result = Ok(RpcOutcome::single_log(
         json!({
             "presets": presets,
@@ -269,14 +269,14 @@ pub async fn inference_apply_preset(tier: &str) -> Result<RpcOutcome<Value>, Str
         ));
     }
 
-    let tier = local_ai::presets::ModelTier::from_str_opt(&tier_str).ok_or_else(|| {
+    let tier = presets::ModelTier::from_str_opt(&tier_str).ok_or_else(|| {
         format!(
             "invalid tier '{}': expected one of disabled or ram_2_4gb",
             tier_str
         )
     })?;
 
-    if tier == local_ai::presets::ModelTier::Custom {
+    if tier == presets::ModelTier::Custom {
         return Err("cannot apply 'custom' tier; set model IDs directly".to_string());
     }
     if !tier.is_mvp_allowed() {
@@ -289,7 +289,7 @@ pub async fn inference_apply_preset(tier: &str) -> Result<RpcOutcome<Value>, Str
     let mut config = config_rpc::load_config_with_timeout().await?;
     config.local_ai.runtime_enabled = true;
     config.local_ai.opt_in_confirmed = true;
-    local_ai::presets::apply_preset_to_config(&mut config.local_ai, tier);
+    presets::apply_preset_to_config(&mut config.local_ai, tier);
     config
         .save()
         .await
@@ -303,7 +303,7 @@ pub async fn inference_apply_preset(tier: &str) -> Result<RpcOutcome<Value>, Str
             "vision_model_id": config.local_ai.vision_model_id,
             "embedding_model_id": config.local_ai.embedding_model_id,
             "quantization": config.local_ai.quantization,
-            "vision_mode": local_ai::presets::vision_mode_for_config(&config.local_ai),
+            "vision_mode": presets::vision_mode_for_config(&config.local_ai),
             "local_ai_enabled": true,
         }),
         "inference preset applied",
@@ -312,7 +312,7 @@ pub async fn inference_apply_preset(tier: &str) -> Result<RpcOutcome<Value>, Str
 
 pub async fn inference_diagnostics(config: &Config) -> Result<RpcOutcome<Value>, String> {
     debug!("{LOG_PREFIX} diagnostics:start");
-    let service = local_ai::global(config);
+    let service = local_runtime::global(config);
     let result = service
         .diagnostics(config)
         .await
diff --git a/src/openhuman/local_ai/parse.rs b/src/openhuman/inference/parse.rs
similarity index 100%
rename from src/openhuman/local_ai/parse.rs
rename to src/openhuman/inference/parse.rs
diff --git a/src/openhuman/local_ai/paths.rs b/src/openhuman/inference/paths.rs
similarity index 98%
rename from src/openhuman/local_ai/paths.rs
rename to src/openhuman/inference/paths.rs
index 2ce7a50576..848cf9f24e 100644
--- a/src/openhuman/local_ai/paths.rs
+++ b/src/openhuman/inference/paths.rs
@@ -154,7 +154,9 @@ pub(crate) fn resolve_whisper_binary() -> Option<PathBuf> {
 /// `Config` reference (e.g. the bare-process voice STT subprocess code)
 /// stay compiling without rewiring.
 pub(crate) fn resolve_whisper_binary_with_config(config: &Config) -> Option<PathBuf> {
-    if let Some(workspace) = super::install_whisper::find_workspace_whisper_binary(config) {
+    if let Some(workspace) =
+        crate::openhuman::inference::local::install_whisper::find_workspace_whisper_binary(config)
+    {
         return Some(workspace);
     }
     resolve_whisper_binary()
@@ -203,7 +205,9 @@ pub(crate) fn resolve_piper_binary() -> Option<PathBuf> {
 /// `resolve_whisper_binary_with_config` — workspace install first, env
 /// second, PATH third.
 pub(crate) fn resolve_piper_binary_with_config(config: &Config) -> Option<PathBuf> {
-    if let Some(workspace) = super::install_piper::find_workspace_piper_binary(config) {
+    if let Some(workspace) =
+        crate::openhuman::inference::local::install_piper::find_workspace_piper_binary(config)
+    {
         return Some(workspace);
     }
     resolve_piper_binary()
@@ -620,7 +624,7 @@ mod tests {
     /// the existing module-wide guard so all readers/writers go through
     /// one critical section.
     fn shared_install_lock() -> std::sync::MutexGuard<'static, ()> {
-        crate::openhuman::local_ai::local_ai_test_guard()
+        crate::openhuman::inference::inference_test_guard()
     }
 
     #[test]
diff --git a/src/openhuman/local_ai/presets.rs b/src/openhuman/inference/presets.rs
similarity index 100%
rename from src/openhuman/local_ai/presets.rs
rename to src/openhuman/inference/presets.rs
diff --git a/src/openhuman/local_ai/presets_tests.rs b/src/openhuman/inference/presets_tests.rs
similarity index 100%
rename from src/openhuman/local_ai/presets_tests.rs
rename to src/openhuman/inference/presets_tests.rs
diff --git a/src/openhuman/providers/billing_error.rs b/src/openhuman/inference/provider/billing_error.rs
similarity index 100%
rename from src/openhuman/providers/billing_error.rs
rename to src/openhuman/inference/provider/billing_error.rs
diff --git a/src/openhuman/providers/compatible.rs b/src/openhuman/inference/provider/compatible.rs
similarity index 98%
rename from src/openhuman/providers/compatible.rs
rename to src/openhuman/inference/provider/compatible.rs
index 1701855230..8d3c3f3813 100644
--- a/src/openhuman/providers/compatible.rs
+++ b/src/openhuman/inference/provider/compatible.rs
@@ -18,7 +18,7 @@ pub(crate) use compatible_parse::{
 #[cfg(test)]
 pub(crate) use compatible_types::ResponsesResponse;
 
-use crate::openhuman::providers::traits::{
+use crate::openhuman::inference::provider::traits::{
     ChatMessage, ChatRequest as ProviderChatRequest, ChatResponse as ProviderChatResponse,
     Provider, StreamChunk, StreamError, StreamOptions, StreamResult, ToolCall as ProviderToolCall,
     UsageInfo as ProviderUsageInfo,
@@ -543,7 +543,8 @@ impl OpenAiCompatibleProvider {
             return messages.to_vec();
         }
 
-        let instructions = crate::openhuman::providers::traits::build_tool_instructions_text(tools);
+        let instructions =
+            crate::openhuman::inference::provider::traits::build_tool_instructions_text(tools);
         let mut modified_messages = messages.to_vec();
 
         if let Some(system_message) = modified_messages.iter_mut().find(|m| m.role == "system") {
@@ -712,7 +713,7 @@ impl OpenAiCompatibleProvider {
         &self,
         credential: Option<&str>,
         native_request: &NativeChatRequest,
-        delta_tx: &tokio::sync::mpsc::Sender<crate::openhuman::providers::ProviderDelta>,
+        delta_tx: &tokio::sync::mpsc::Sender<crate::openhuman::inference::provider::ProviderDelta>,
         dump_seq: u64,
     ) -> anyhow::Result<ProviderChatResponse> {
         use futures_util::StreamExt;
@@ -858,7 +859,7 @@ impl OpenAiCompatibleProvider {
                             if !content.is_empty() {
                                 text_accum.push_str(content);
                                 let _ = delta_tx
-                                    .send(crate::openhuman::providers::ProviderDelta::TextDelta {
+                                    .send(crate::openhuman::inference::provider::ProviderDelta::TextDelta {
                                         delta: content.clone(),
                                     })
                                     .await;
@@ -870,7 +871,7 @@ impl OpenAiCompatibleProvider {
                                 thinking_accum.push_str(reasoning);
                                 let _ = delta_tx
                                     .send(
-                                        crate::openhuman::providers::ProviderDelta::ThinkingDelta {
+                                        crate::openhuman::inference::provider::ProviderDelta::ThinkingDelta {
                                             delta: reasoning.clone(),
                                         },
                                     )
@@ -950,7 +951,7 @@ impl OpenAiCompatibleProvider {
                                             name,
                                         );
                                         let _ = delta_tx
-                                            .send(crate::openhuman::providers::ProviderDelta::ToolCallStart {
+                                            .send(crate::openhuman::inference::provider::ProviderDelta::ToolCallStart {
                                                 call_id: id.clone(),
                                                 tool_name: name.clone(),
                                             })
@@ -968,7 +969,7 @@ impl OpenAiCompatibleProvider {
                                             );
                                             let buffered = entry.arguments.clone();
                                             let _ = delta_tx
-                                                .send(crate::openhuman::providers::ProviderDelta::ToolCallArgsDelta {
+                                                .send(crate::openhuman::inference::provider::ProviderDelta::ToolCallArgsDelta {
                                                     call_id: id.clone(),
                                                     delta: buffered,
                                                 })
@@ -984,7 +985,7 @@ impl OpenAiCompatibleProvider {
                                         let fresh =
                                             entry.arguments[entry.emitted_chars..].to_string();
                                         let _ = delta_tx
-                                            .send(crate::openhuman::providers::ProviderDelta::ToolCallArgsDelta {
+                                            .send(crate::openhuman::inference::provider::ProviderDelta::ToolCallArgsDelta {
                                                 call_id: id.clone(),
                                                 delta: fresh,
                                             })
@@ -1102,8 +1103,8 @@ impl OpenAiCompatibleProvider {
 
 #[async_trait]
 impl Provider for OpenAiCompatibleProvider {
-    fn capabilities(&self) -> crate::openhuman::providers::traits::ProviderCapabilities {
-        crate::openhuman::providers::traits::ProviderCapabilities {
+    fn capabilities(&self) -> crate::openhuman::inference::provider::traits::ProviderCapabilities {
+        crate::openhuman::inference::provider::traits::ProviderCapabilities {
             native_tool_calling: true,
             vision: false,
         }
diff --git a/src/openhuman/providers/compatible_dump.rs b/src/openhuman/inference/provider/compatible_dump.rs
similarity index 100%
rename from src/openhuman/providers/compatible_dump.rs
rename to src/openhuman/inference/provider/compatible_dump.rs
diff --git a/src/openhuman/providers/compatible_parse.rs b/src/openhuman/inference/provider/compatible_parse.rs
similarity index 98%
rename from src/openhuman/providers/compatible_parse.rs
rename to src/openhuman/inference/provider/compatible_parse.rs
index 1d8e070e45..086482e31e 100644
--- a/src/openhuman/providers/compatible_parse.rs
+++ b/src/openhuman/inference/provider/compatible_parse.rs
@@ -3,7 +3,7 @@
 //! All functions here are stateless transforms — no I/O, no HTTP. They take
 //! raw strings or deserialized values and return structured results.
 
-use crate::openhuman::providers::traits::{
+use crate::openhuman::inference::provider::traits::{
     ChatMessage, StreamError, StreamResult, ToolCall as ProviderToolCall,
 };
 
@@ -82,7 +82,7 @@ pub(crate) fn parse_sse_line(line: &str) -> StreamResult<Option<String>> {
 
 pub(crate) fn compact_sanitized_body_snippet(body: &str) -> String {
     // super = compatible module; super::super = providers module (where sanitize_api_error lives)
-    super::super::sanitize_api_error(body)
+    crate::openhuman::inference::provider::sanitize_api_error(body)
         .split_whitespace()
         .collect::<Vec<_>>()
         .join(" ")
diff --git a/src/openhuman/providers/compatible_stream.rs b/src/openhuman/inference/provider/compatible_stream.rs
similarity index 97%
rename from src/openhuman/providers/compatible_stream.rs
rename to src/openhuman/inference/provider/compatible_stream.rs
index f19c53a0b7..4209e13571 100644
--- a/src/openhuman/providers/compatible_stream.rs
+++ b/src/openhuman/inference/provider/compatible_stream.rs
@@ -3,7 +3,7 @@
 //! Converts a raw `reqwest::Response` byte stream into a typed
 //! `StreamChunk` stream via Server-Sent Events parsing.
 
-use crate::openhuman::providers::traits::{StreamChunk, StreamError, StreamResult};
+use crate::openhuman::inference::provider::traits::{StreamChunk, StreamError, StreamResult};
 use futures_util::{stream, StreamExt};
 
 use super::compatible_parse::parse_sse_line;
diff --git a/src/openhuman/providers/compatible_tests.rs b/src/openhuman/inference/provider/compatible_tests.rs
similarity index 99%
rename from src/openhuman/providers/compatible_tests.rs
rename to src/openhuman/inference/provider/compatible_tests.rs
index a8e43d4e0f..e28296db68 100644
--- a/src/openhuman/providers/compatible_tests.rs
+++ b/src/openhuman/inference/provider/compatible_tests.rs
@@ -135,7 +135,7 @@ fn non_streaming_request_omits_stream_options() {
 
 #[tokio::test]
 async fn outbound_thread_id_is_gated_per_provider() {
-    use crate::openhuman::providers::thread_context::with_thread_id;
+    use crate::openhuman::inference::provider::thread_context::with_thread_id;
 
     let third_party = make_provider("Venice", "https://api.venice.ai", None);
     let openhuman =
diff --git a/src/openhuman/providers/compatible_types.rs b/src/openhuman/inference/provider/compatible_types.rs
similarity index 98%
rename from src/openhuman/providers/compatible_types.rs
rename to src/openhuman/inference/provider/compatible_types.rs
index 87f0ac0631..b0bb07f38d 100644
--- a/src/openhuman/providers/compatible_types.rs
+++ b/src/openhuman/inference/provider/compatible_types.rs
@@ -44,7 +44,7 @@ pub(crate) struct NativeChatRequest {
     /// when serialising for vanilla OpenAI-compatible providers that
     /// don't recognise it (most reject only unknown *required* fields,
     /// but emitting it here is gated on the ambient task-local being
-    /// set — see `crate::openhuman::providers::thread_context`).
+    /// set — see `crate::openhuman::inference::provider::thread_context`).
     #[serde(skip_serializing_if = "Option::is_none")]
     pub(crate) thread_id: Option<String>,
     /// OpenAI streaming `stream_options`. Set to `{"include_usage": true}`
@@ -59,7 +59,7 @@ pub(crate) struct NativeChatRequest {
 }
 
 /// OpenAI-spec `stream_options` payload (sent on the wire). Distinct from
-/// `crate::openhuman::providers::traits::StreamOptions`, which is the
+/// `crate::openhuman::inference::provider::traits::StreamOptions`, which is the
 /// caller-side knob set on `ChatRequest` to toggle agent streaming.
 #[derive(Debug, Serialize)]
 pub(crate) struct OpenAiStreamOptions {
diff --git a/src/openhuman/providers/factory.rs b/src/openhuman/inference/provider/factory.rs
similarity index 97%
rename from src/openhuman/providers/factory.rs
rename to src/openhuman/inference/provider/factory.rs
index 7f10cb094d..3e6af1544d 100644
--- a/src/openhuman/providers/factory.rs
+++ b/src/openhuman/inference/provider/factory.rs
@@ -20,12 +20,12 @@
 use crate::openhuman::config::schema::cloud_providers::AuthStyle;
 use crate::openhuman::config::Config;
 use crate::openhuman::credentials::AuthService;
-use crate::openhuman::providers::compatible::{
+use crate::openhuman::inference::provider::compatible::{
     AuthStyle as CompatAuthStyle, OpenAiCompatibleProvider,
 };
-use crate::openhuman::providers::openhuman_backend::OpenHumanBackendProvider;
-use crate::openhuman::providers::traits::Provider;
-use crate::openhuman::providers::ProviderRuntimeOptions;
+use crate::openhuman::inference::provider::openhuman_backend::OpenHumanBackendProvider;
+use crate::openhuman::inference::provider::traits::Provider;
+use crate::openhuman::inference::provider::ProviderRuntimeOptions;
 
 /// Sentinel meaning "use the OpenHuman backend session JWT".
 pub const PROVIDER_OPENHUMAN: &str = "openhuman";
diff --git a/src/openhuman/providers/factory_test.rs b/src/openhuman/inference/provider/factory_test.rs
similarity index 100%
rename from src/openhuman/providers/factory_test.rs
rename to src/openhuman/inference/provider/factory_test.rs
diff --git a/src/openhuman/providers/mod.rs b/src/openhuman/inference/provider/mod.rs
similarity index 56%
rename from src/openhuman/providers/mod.rs
rename to src/openhuman/inference/provider/mod.rs
index 65324c2a2d..79573c99e8 100644
--- a/src/openhuman/providers/mod.rs
+++ b/src/openhuman/inference/provider/mod.rs
@@ -1,5 +1,15 @@
+//! Unified provider abstraction — cloud + local chat, embedding, and streaming.
+//!
+//! This module was previously `src/openhuman/providers/`. It now lives under
+//! `inference/provider/` so all inference concerns (local runtime, cloud
+//! providers, HTTP endpoint) share a single domain root.
+
 pub mod billing_error;
 pub mod compatible;
+pub mod compatible_dump;
+pub mod compatible_parse;
+pub mod compatible_stream;
+pub mod compatible_types;
 pub mod factory;
 pub mod openhuman_backend;
 pub mod ops;
diff --git a/src/openhuman/providers/openhuman_backend.rs b/src/openhuman/inference/provider/openhuman_backend.rs
similarity index 100%
rename from src/openhuman/providers/openhuman_backend.rs
rename to src/openhuman/inference/provider/openhuman_backend.rs
diff --git a/src/openhuman/providers/ops.rs b/src/openhuman/inference/provider/ops.rs
similarity index 98%
rename from src/openhuman/providers/ops.rs
rename to src/openhuman/inference/provider/ops.rs
index 8e64226052..a48339127b 100644
--- a/src/openhuman/providers/ops.rs
+++ b/src/openhuman/inference/provider/ops.rs
@@ -55,8 +55,9 @@ pub async fn list_configured_models(
         entry.slug
     );
 
-    let api_key = crate::openhuman::providers::factory::lookup_key_for_slug(&entry.slug, &config)
-        .unwrap_or_default();
+    let api_key =
+        crate::openhuman::inference::provider::factory::lookup_key_for_slug(&entry.slug, &config)
+            .unwrap_or_default();
 
     let client = crate::openhuman::config::build_runtime_proxy_client_with_timeouts(
         "providers.list_models",
@@ -375,11 +376,11 @@ pub fn create_backend_inference_provider(
             key.len()
         );
         Ok(Box::new(
-            crate::openhuman::providers::compatible::OpenAiCompatibleProvider::new(
+            crate::openhuman::inference::provider::compatible::OpenAiCompatibleProvider::new(
                 "custom_openai",
                 url,
                 Some(key),
-                crate::openhuman::providers::compatible::AuthStyle::Bearer,
+                crate::openhuman::inference::provider::compatible::AuthStyle::Bearer,
             ),
         ))
     } else {
diff --git a/src/openhuman/providers/reliable.rs b/src/openhuman/inference/provider/reliable.rs
similarity index 100%
rename from src/openhuman/providers/reliable.rs
rename to src/openhuman/inference/provider/reliable.rs
diff --git a/src/openhuman/providers/reliable_tests.rs b/src/openhuman/inference/provider/reliable_tests.rs
similarity index 100%
rename from src/openhuman/providers/reliable_tests.rs
rename to src/openhuman/inference/provider/reliable_tests.rs
diff --git a/src/openhuman/providers/router.rs b/src/openhuman/inference/provider/router.rs
similarity index 100%
rename from src/openhuman/providers/router.rs
rename to src/openhuman/inference/provider/router.rs
diff --git a/src/openhuman/providers/router_test.rs b/src/openhuman/inference/provider/router_test.rs
similarity index 100%
rename from src/openhuman/providers/router_test.rs
rename to src/openhuman/inference/provider/router_test.rs
diff --git a/src/openhuman/providers/schemas.rs b/src/openhuman/inference/provider/schemas.rs
similarity index 95%
rename from src/openhuman/providers/schemas.rs
rename to src/openhuman/inference/provider/schemas.rs
index a9f5f12f3a..fd81d65b53 100644
--- a/src/openhuman/providers/schemas.rs
+++ b/src/openhuman/inference/provider/schemas.rs
@@ -69,6 +69,9 @@ struct ListModelsRequest {
 fn handle_list_models(params: Map<String, Value>) -> ControllerFuture {
     Box::pin(async move {
         let req: ListModelsRequest = deserialize_params(params)?;
-        to_json(crate::openhuman::providers::ops::list_configured_models(&req.provider_id).await?)
+        to_json(
+            crate::openhuman::inference::provider::ops::list_configured_models(&req.provider_id)
+                .await?,
+        )
     })
 }
diff --git a/src/openhuman/providers/thread_context.rs b/src/openhuman/inference/provider/thread_context.rs
similarity index 97%
rename from src/openhuman/providers/thread_context.rs
rename to src/openhuman/inference/provider/thread_context.rs
index b70b25b4fb..c8e23088e3 100644
--- a/src/openhuman/providers/thread_context.rs
+++ b/src/openhuman/inference/provider/thread_context.rs
@@ -14,7 +14,7 @@
 //! it.
 //!
 //! ```ignore
-//! use crate::openhuman::providers::thread_context::{with_thread_id, current_thread_id};
+//! use crate::openhuman::inference::provider::thread_context::{with_thread_id, current_thread_id};
 //!
 //! with_thread_id("abc123", async {
 //!     // any provider.chat() call inside this future sees thread_id=Some("abc123")
diff --git a/src/openhuman/providers/traits.rs b/src/openhuman/inference/provider/traits.rs
similarity index 100%
rename from src/openhuman/providers/traits.rs
rename to src/openhuman/inference/provider/traits.rs
diff --git a/src/openhuman/providers/traits_tests.rs b/src/openhuman/inference/provider/traits_tests.rs
similarity index 100%
rename from src/openhuman/providers/traits_tests.rs
rename to src/openhuman/inference/provider/traits_tests.rs
diff --git a/src/openhuman/inference/schemas.rs b/src/openhuman/inference/schemas.rs
index bf4308998a..7db5f85f71 100644
--- a/src/openhuman/inference/schemas.rs
+++ b/src/openhuman/inference/schemas.rs
@@ -668,7 +668,7 @@ fn handle_inference_chat(params: Map<String, Value>) -> ControllerFuture {
             .messages
             .into_iter()
             .map(
-                |message| crate::openhuman::local_ai::ops::LocalAiChatMessage {
+                |message| crate::openhuman::inference::local::ops::LocalAiChatMessage {
                     role: message.role,
                     content: message.content,
                 },
diff --git a/src/openhuman/local_ai/sentiment.rs b/src/openhuman/inference/sentiment.rs
similarity index 99%
rename from src/openhuman/local_ai/sentiment.rs
rename to src/openhuman/inference/sentiment.rs
index c94bd8411c..7d15e6d8b3 100644
--- a/src/openhuman/local_ai/sentiment.rs
+++ b/src/openhuman/inference/sentiment.rs
@@ -1,7 +1,7 @@
 //! Emotion / sentiment analysis via the bundled local AI model.
 
 use crate::openhuman::config::Config;
-use crate::openhuman::local_ai;
+use crate::openhuman::inference::local as local_ai;
 use crate::rpc::RpcOutcome;
 
 /// Result of sentiment / emotion analysis on a user message.
diff --git a/src/openhuman/local_ai/types.rs b/src/openhuman/inference/types.rs
similarity index 98%
rename from src/openhuman/local_ai/types.rs
rename to src/openhuman/inference/types.rs
index ca4b9425bf..b1cfdbf39e 100644
--- a/src/openhuman/local_ai/types.rs
+++ b/src/openhuman/inference/types.rs
@@ -3,9 +3,9 @@
 use crate::openhuman::config::Config;
 use serde::{Deserialize, Serialize};
 
+use super::local::provider::provider_from_config;
 use super::model_ids;
 use super::presets;
-use super::provider::provider_from_config;
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct LocalAiStatus {
@@ -173,7 +173,7 @@ mod tests {
 
     #[test]
     fn disabled_status_reflects_lm_studio_provider() {
-        use crate::openhuman::local_ai::provider::LocalAiProvider;
+        use crate::openhuman::inference::local::provider::LocalAiProvider;
 
         let mut config = Config::default();
         config.local_ai.provider = LocalAiProvider::LmStudio.as_str().to_string();
diff --git a/src/openhuman/voice/cloud_transcribe.rs b/src/openhuman/inference/voice/cloud_transcribe.rs
similarity index 100%
rename from src/openhuman/voice/cloud_transcribe.rs
rename to src/openhuman/inference/voice/cloud_transcribe.rs
diff --git a/src/openhuman/voice/hallucination.rs b/src/openhuman/inference/voice/hallucination.rs
similarity index 100%
rename from src/openhuman/voice/hallucination.rs
rename to src/openhuman/inference/voice/hallucination.rs
diff --git a/src/openhuman/voice/local_speech.rs b/src/openhuman/inference/voice/local_speech.rs
similarity index 97%
rename from src/openhuman/voice/local_speech.rs
rename to src/openhuman/inference/voice/local_speech.rs
index 716f32d95d..ed7bc75ba7 100644
--- a/src/openhuman/voice/local_speech.rs
+++ b/src/openhuman/inference/voice/local_speech.rs
@@ -26,7 +26,7 @@
 //! 2. `piper` / `piper.exe` on `$PATH`
 //!
 //! Both branches share the same resolution helper as the legacy voice
-//! pipeline ([`crate::openhuman::local_ai::paths::resolve_piper_binary`]),
+//! pipeline ([`crate::openhuman::inference::paths::resolve_piper_binary`]),
 //! so STT availability checks, the installer UI, and the factory dispatch
 //! all agree on what counts as "installed".
 //!
@@ -34,7 +34,7 @@
 //!
 //! **Easy path:** click "Install Piper" in `Settings → Voice → Voice
 //! Providers`. That triggers
-//! [`crate::openhuman::local_ai::install_piper`] which downloads the
+//! [`crate::openhuman::inference::local::install_piper`] which downloads the
 //! Piper binary archive (`.zip` on Windows, `.tar.gz` on macOS / Linux)
 //! into `~/.openhuman/bin/piper/`, extracts it, and stages the bundled
 //! `en_US-lessac-medium` voice (`.onnx` + `.onnx.json`) alongside via a
@@ -67,10 +67,12 @@ use base64::{engine::general_purpose::STANDARD as BASE64, Engine};
 use log::debug;
 
 use crate::openhuman::config::Config;
-use crate::openhuman::local_ai::paths::{resolve_piper_binary_with_config, resolve_tts_voice_path};
+use crate::openhuman::inference::paths::{
+    resolve_piper_binary_with_config, resolve_tts_voice_path,
+};
 use crate::rpc::RpcOutcome;
 
-use super::reply_speech::{ReplySpeechResult, VisemeFrame};
+use crate::openhuman::voice::reply_speech::{ReplySpeechResult, VisemeFrame};
 
 const LOG_PREFIX: &str = "[voice-tts]";
 
diff --git a/src/openhuman/voice/local_transcribe.rs b/src/openhuman/inference/voice/local_transcribe.rs
similarity index 97%
rename from src/openhuman/voice/local_transcribe.rs
rename to src/openhuman/inference/voice/local_transcribe.rs
index c2a9106a27..7822b4f742 100644
--- a/src/openhuman/voice/local_transcribe.rs
+++ b/src/openhuman/inference/voice/local_transcribe.rs
@@ -8,14 +8,14 @@
 //!
 //! When neither resolves, transcription fails with a clear, actionable
 //! error pointing the user at the install path. Resolution lives in
-//! [`crate::openhuman::local_ai::paths::resolve_whisper_binary`] — kept in
+//! [`crate::openhuman::inference::paths::resolve_whisper_binary`] — kept in
 //! one place so STT, voice-status, and the installer all agree.
 //!
 //! ## Where to get the binary
 //!
 //! **Easy path:** click "Install Whisper" in `Settings → Voice → Voice
 //! Providers`. That triggers
-//! [`crate::openhuman::local_ai::install_whisper`] which streams the
+//! [`crate::openhuman::inference::local::install_whisper`] which streams the
 //! GGML model file (`ggml-<size>.bin`) into
 //! `~/.openhuman/bin/whisper/` via a `.part` file + atomic rename, plus
 //! the `whisper-cli` binary on Windows where upstream ships a release
@@ -55,7 +55,7 @@ use log::{debug, warn};
 use serde::{Deserialize, Serialize};
 
 use crate::openhuman::config::Config;
-use crate::openhuman::local_ai::paths::resolve_whisper_binary_with_config;
+use crate::openhuman::inference::paths::resolve_whisper_binary_with_config;
 use crate::rpc::RpcOutcome;
 
 const LOG_PREFIX: &str = "[voice-stt]";
@@ -169,7 +169,7 @@ pub async fn transcribe_whisper(
     // config default is, producing a mismatch between the returned model_id
     // and the model actually used for transcription.
     let model_path =
-        crate::openhuman::local_ai::paths::resolve_stt_model_path_by_id(&model_id, config)
+        crate::openhuman::inference::paths::resolve_stt_model_path_by_id(&model_id, config)
             .map_err(|e| format!("{LOG_PREFIX} {e}"))?;
     debug!("{LOG_PREFIX} resolved STT model path={model_path}");
 
diff --git a/src/openhuman/inference/voice/mod.rs b/src/openhuman/inference/voice/mod.rs
new file mode 100644
index 0000000000..b436408640
--- /dev/null
+++ b/src/openhuman/inference/voice/mod.rs
@@ -0,0 +1,12 @@
+//! Inference-side voice: local/cloud transcription (STT) and local TTS.
+//!
+//! Audio I/O, hotkeys, dictation, and the voice RPC surface remain in
+//! `crate::openhuman::voice`. The files here are the actual inference
+//! implementations that `voice/` imports.
+
+pub mod cloud_transcribe;
+pub mod hallucination;
+pub mod local_speech;
+pub mod local_transcribe;
+pub mod postprocess;
+pub mod streaming;
diff --git a/src/openhuman/voice/postprocess.rs b/src/openhuman/inference/voice/postprocess.rs
similarity index 96%
rename from src/openhuman/voice/postprocess.rs
rename to src/openhuman/inference/voice/postprocess.rs
index 52b17ab353..3e6934f359 100644
--- a/src/openhuman/voice/postprocess.rs
+++ b/src/openhuman/inference/voice/postprocess.rs
@@ -8,7 +8,7 @@ use log::{debug, info, warn};
 use std::time::Instant;
 
 use crate::openhuman::config::Config;
-use crate::openhuman::local_ai;
+use crate::openhuman::inference::local as local_ai;
 
 const LOG_PREFIX: &str = "[voice_postprocess]";
 
@@ -225,7 +225,7 @@ mod tests {
 
     #[tokio::test]
     async fn disabled_cleanup_returns_raw_text() {
-        let _g = crate::openhuman::local_ai::local_ai_test_guard();
+        let _g = crate::openhuman::inference::inference_test_guard();
         let mut config = Config::default();
         config.local_ai.voice_llm_cleanup_enabled = false;
         let service = local_ai::global(&config);
@@ -241,7 +241,7 @@ mod tests {
         // Covers the branch where cleanup is enabled in config but the
         // local LLM hasn't reached the ready/degraded state yet —
         // cleanup must gracefully fall back to the raw Whisper output.
-        let _g = crate::openhuman::local_ai::local_ai_test_guard();
+        let _g = crate::openhuman::inference::inference_test_guard();
         let config = Config::default(); // voice_llm_cleanup_enabled = true by default
         let service = local_ai::global(&config);
         let previous = service.status.lock().state.clone();
@@ -282,7 +282,7 @@ mod tests {
 
     #[tokio::test]
     async fn ready_llm_returns_trimmed_cleanup_or_falls_back() {
-        let _g = crate::openhuman::local_ai::local_ai_test_guard();
+        let _g = crate::openhuman::inference::inference_test_guard();
         let app = Router::new().route(
             "/api/generate",
             post(|| async {
@@ -305,7 +305,7 @@ mod tests {
 
     #[tokio::test]
     async fn ready_llm_empty_response_falls_back_to_raw_text() {
-        let _g = crate::openhuman::local_ai::local_ai_test_guard();
+        let _g = crate::openhuman::inference::inference_test_guard();
         let app = Router::new().route(
             "/api/generate",
             post(|| async { Json(json!({"model":"test","response":"   ","done": true})) }),
@@ -323,7 +323,7 @@ mod tests {
 
     #[tokio::test]
     async fn ready_llm_error_response_falls_back_to_raw_text() {
-        let _g = crate::openhuman::local_ai::local_ai_test_guard();
+        let _g = crate::openhuman::inference::inference_test_guard();
         let app = Router::new().route(
             "/api/generate",
             post(|| async {
@@ -349,7 +349,7 @@ mod tests {
         // glued the conversation context in front of the raw text when
         // the LLM ran. If the global state raced away from "ready" the
         // call short-circuits to raw — still valid, just the other branch.
-        let _g = crate::openhuman::local_ai::local_ai_test_guard();
+        let _g = crate::openhuman::inference::inference_test_guard();
         #[derive(serde::Deserialize)]
         struct Body {
             prompt: String,
@@ -386,7 +386,7 @@ mod tests {
         // "Conversation context:" header regardless of which branch
         // runs — the LLM path uses the raw-text-only prompt, and the
         // short-circuit path never builds a prompt at all.
-        let _g = crate::openhuman::local_ai::local_ai_test_guard();
+        let _g = crate::openhuman::inference::inference_test_guard();
         #[derive(serde::Deserialize)]
         struct Body {
             prompt: String,
diff --git a/src/openhuman/voice/streaming.rs b/src/openhuman/inference/voice/streaming.rs
similarity index 98%
rename from src/openhuman/voice/streaming.rs
rename to src/openhuman/inference/voice/streaming.rs
index df7e6861b0..266341d6c5 100644
--- a/src/openhuman/voice/streaming.rs
+++ b/src/openhuman/inference/voice/streaming.rs
@@ -22,8 +22,8 @@ use tokio::sync::Mutex;
 
 use super::postprocess;
 use crate::openhuman::config::Config;
-use crate::openhuman::local_ai;
-use crate::openhuman::local_ai::whisper_engine;
+use crate::openhuman::inference::local as local_ai;
+use crate::openhuman::inference::local::service::whisper_engine;
 use crate::openhuman::util::utf8_safe_prefix_at_byte_boundary;
 
 const LOG_PREFIX: &str = "[voice-stream]";
diff --git a/src/openhuman/learning/linkedin_enrichment.rs b/src/openhuman/learning/linkedin_enrichment.rs
index 081b768b4a..3d8dd6c063 100644
--- a/src/openhuman/learning/linkedin_enrichment.rs
+++ b/src/openhuman/learning/linkedin_enrichment.rs
@@ -309,7 +309,7 @@ async fn write_profile_md(
 /// Ask the backend LLM to distil the raw LinkedIn Markdown into a
 /// concise, high-signal profile document suitable for agent context.
 pub async fn summarise_profile_with_llm(config: &Config, raw_md: &str) -> anyhow::Result<String> {
-    use crate::openhuman::providers::ops::{
+    use crate::openhuman::inference::provider::ops::{
         create_backend_inference_provider, ProviderRuntimeOptions,
     };
 
diff --git a/src/openhuman/learning/reflection.rs b/src/openhuman/learning/reflection.rs
index a9ec12a27f..7ad8f11ffd 100644
--- a/src/openhuman/learning/reflection.rs
+++ b/src/openhuman/learning/reflection.rs
@@ -44,7 +44,7 @@ pub struct ReflectionHook {
     config: LearningConfig,
     full_config: Arc<Config>,
     memory: Arc<dyn Memory>,
-    provider: Option<Arc<dyn crate::openhuman::providers::Provider>>,
+    provider: Option<Arc<dyn crate::openhuman::inference::provider::Provider>>,
     /// Per-session reflection counts for throttling. Key is session_id (or "__global__").
     session_counts: Mutex<HashMap<String, usize>>,
 }
@@ -54,7 +54,7 @@ impl ReflectionHook {
         config: LearningConfig,
         full_config: Arc<Config>,
         memory: Arc<dyn Memory>,
-        provider: Option<Arc<dyn crate::openhuman::providers::Provider>>,
+        provider: Option<Arc<dyn crate::openhuman::inference::provider::Provider>>,
     ) -> Self {
         Self {
             config,
@@ -186,7 +186,7 @@ impl ReflectionHook {
                 log::debug!(
                     "[learning::reflection] local route — gate permit acquired via LocalAiService"
                 );
-                let service = crate::openhuman::local_ai::global(&self.full_config);
+                let service = crate::openhuman::inference::local::global(&self.full_config);
                 service
                     .prompt(&self.full_config, prompt, Some(512), true)
                     .await
diff --git a/src/openhuman/learning/reflection_tests.rs b/src/openhuman/learning/reflection_tests.rs
index 9d9d65f850..08c84dbfd7 100644
--- a/src/openhuman/learning/reflection_tests.rs
+++ b/src/openhuman/learning/reflection_tests.rs
@@ -288,7 +288,7 @@ async fn persist_reflection_writes_to_dedicated_namespace_and_category() {
 
 #[tokio::test]
 async fn on_turn_complete_dedupes_reflections_across_heuristic_and_llm_paths() {
-    use crate::openhuman::providers::Provider;
+    use crate::openhuman::inference::provider::Provider;
     use async_trait::async_trait;
 
     // Stub provider returning a reflection LLM response whose
@@ -480,8 +480,8 @@ async fn on_turn_complete_emits_candidates_to_buffer_for_heuristic_cues() {
 
 #[tokio::test]
 async fn on_turn_complete_emits_style_candidates_from_llm_preferences() {
+    use crate::openhuman::inference::provider::Provider;
     use crate::openhuman::learning::candidate::{self, FacetClass};
-    use crate::openhuman::providers::Provider;
 
     struct StubPrefProvider;
     #[async_trait]
diff --git a/src/openhuman/learning/transcript_ingest/extract.rs b/src/openhuman/learning/transcript_ingest/extract.rs
index bf1f57dd9d..0d456ad46e 100644
--- a/src/openhuman/learning/transcript_ingest/extract.rs
+++ b/src/openhuman/learning/transcript_ingest/extract.rs
@@ -18,7 +18,7 @@
 //! - Tool messages are never mined — they're high-noise and fully
 //!   reconstructable from the transcript itself.
 
-use crate::openhuman::providers::ChatMessage;
+use crate::openhuman::inference::provider::ChatMessage;
 
 use super::types::{CandidateKind, ConversationReflection, Importance, MemoryCandidate};
 
diff --git a/src/openhuman/learning/transcript_ingest/tests.rs b/src/openhuman/learning/transcript_ingest/tests.rs
index 7536ac3bb7..03197653e4 100644
--- a/src/openhuman/learning/transcript_ingest/tests.rs
+++ b/src/openhuman/learning/transcript_ingest/tests.rs
@@ -5,8 +5,8 @@
 
 use super::*;
 use crate::openhuman::agent::harness::session::transcript::{SessionTranscript, TranscriptMeta};
+use crate::openhuman::inference::provider::ChatMessage;
 use crate::openhuman::memory::{Memory, MemoryCategory, MemoryEntry, NamespaceSummary, RecallOpts};
-use crate::openhuman::providers::ChatMessage;
 use async_trait::async_trait;
 use std::path::PathBuf;
 use std::sync::Mutex;
diff --git a/src/openhuman/local_ai/README.md b/src/openhuman/local_ai/README.md
deleted file mode 100644
index 58da878577..0000000000
--- a/src/openhuman/local_ai/README.md
+++ /dev/null
@@ -1,42 +0,0 @@
-# Local AI
-
-Local asset/runtime support for speech models and localhost-style integrations. This module no longer owns the public LLM inference contract. `src/openhuman/inference/` is the core namespace for prompt/chat/embed/status routing, while `local_ai` keeps the local speech/download/device-profile pieces and the lower-level helpers those inference adapters may delegate to.
-
-## Public surface
-
-- `pub struct LocalAiService` — `service/mod.rs` — singleton for Ollama/LM Studio health checks plus whisper/Piper helpers.
-- `pub fn global(config: &Config) -> Arc<LocalAiService>` — `core.rs` — singleton accessor.
-- `pub fn model_artifact_path(config: &Config) -> PathBuf` — `core.rs` — resolve on-disk model path.
-- `pub struct DeviceProfile` — `device.rs` — RAM / VRAM / CPU classification used for preset selection.
-- `pub struct ModelPreset` / `pub enum ModelTier` / `pub enum VisionMode` — `presets.rs` — bundled preset matrix.
-- `pub struct SentimentResult` — `sentiment.rs` — internal sentiment result type used by inference delegates.
-- Status / progress / result types: `pub struct LocalAiStatus`, `LocalAiAssetStatus`, `LocalAiAssetsStatus`, `LocalAiDownloadProgressItem`, `LocalAiDownloadsProgress`, `LocalAiEmbeddingResult`, `LocalAiSpeechResult`, `LocalAiTtsResult` — `types.rs`.
-- `pub mod ops` (re-exported as `rpc`) — `ops.rs` — typed Rust wrappers. Public `local_ai.*` RPCs are limited to local speech/assets flows; prompt/chat/embed/status helpers remain available for internal delegation from `inference`.
-- RPC `local_ai.{agent_chat, agent_chat_simple, local_ai_transcribe, local_ai_transcribe_bytes, local_ai_tts, local_ai_assets_status, local_ai_downloads_progress, local_ai_download_asset, local_ai_install_whisper, local_ai_install_piper, local_ai_whisper_install_status, local_ai_piper_install_status}` — `schemas.rs`.
-
-## Calls into
-
-- `src/openhuman/config/` — provider selection, model IDs, localhost base URL override, device-profile inputs.
-- Bundled binaries and assets for whisper.cpp and Piper.
-- External Ollama / LM Studio endpoints for diagnostics and model-state checks.
-- Filesystem under `~/.openhuman/local-ai/` for downloaded speech/model artifacts.
-
-## Called by
-
-- `src/openhuman/inference/` — delegates LLM/provider-facing status, prompt, chat, embed, reaction, and sentiment flows here as an implementation detail.
-- `src/openhuman/voice/{streaming,postprocess,ops,types}.rs` — speech-to-text + text-to-speech.
-- `src/openhuman/screen_intelligence/processing_worker.rs` — local multimodal helpers.
-- `src/openhuman/autocomplete/core/engine.rs` — local completions.
-- `src/openhuman/tree_summarizer/ops.rs` — summarisation backend.
-- `src/openhuman/app_state/ops.rs` — runtime snapshot support.
-- `src/core/all.rs` — registers `all_local_ai_*`.
-
-## Tests
-
-- Unit: `ops_tests.rs`, `schemas_tests.rs`, plus `service/ollama_admin_tests.rs`, `service/public_infer_tests.rs`.
-- Domain mutex: `LOCAL_AI_TEST_MUTEX` (`mod.rs`) serializes tests that mutate the singleton or env vars.
-- Routing: `agent/triage/routing_tests.rs` covers local-vs-remote escalation.
-
-## Provider notes
-
-OpenHuman does not ship or launch Ollama. The UI talks to the core, the core talks to `inference`, and `inference` can route to an external Ollama-compatible endpoint when configured. `local_ai` still exposes diagnostics and asset state so the UI can guide users through speech-model downloads and localhost runtime setup without treating Ollama as an app-managed runtime.
diff --git a/src/openhuman/local_ai/mod.rs b/src/openhuman/local_ai/mod.rs
deleted file mode 100644
index ca628e9d98..0000000000
--- a/src/openhuman/local_ai/mod.rs
+++ /dev/null
@@ -1,51 +0,0 @@
-//! Local asset/runtime support for speech models and localhost-backed AI integrations.
-
-#[cfg(test)]
-pub(crate) static LOCAL_AI_TEST_MUTEX: once_cell::sync::Lazy<std::sync::Mutex<()>> =
-    once_cell::sync::Lazy::new(|| std::sync::Mutex::new(()));
-
-#[cfg(test)]
-pub(crate) fn local_ai_test_guard() -> std::sync::MutexGuard<'static, ()> {
-    LOCAL_AI_TEST_MUTEX
-        .lock()
-        .unwrap_or_else(|p| p.into_inner())
-}
-
-mod core;
-pub mod device;
-pub mod ops;
-pub mod presets;
-mod schemas;
-pub mod sentiment;
-
-mod install;
-pub(crate) mod install_piper;
-pub(crate) mod install_whisper;
-pub(crate) mod lm_studio_api;
-pub(crate) mod model_ids;
-mod ollama_api;
-mod process_util;
-pub(crate) use ollama_api::{ollama_base_url, OLLAMA_BASE_URL};
-mod parse;
-pub(crate) mod paths;
-pub(crate) mod provider;
-mod service;
-mod types;
-pub(crate) mod voice_install_common;
-
-pub use core::*;
-pub use device::DeviceProfile;
-pub use ops as rpc;
-pub use ops::*;
-pub use presets::{ModelPreset, ModelTier, VisionMode};
-pub use schemas::{
-    all_controller_schemas as all_local_ai_controller_schemas,
-    all_registered_controllers as all_local_ai_registered_controllers,
-};
-pub use sentiment::SentimentResult;
-pub(crate) use service::whisper_engine;
-pub use service::LocalAiService;
-pub use types::{
-    LocalAiAssetStatus, LocalAiAssetsStatus, LocalAiDownloadProgressItem, LocalAiDownloadsProgress,
-    LocalAiEmbeddingResult, LocalAiSpeechResult, LocalAiStatus, LocalAiTtsResult,
-};
diff --git a/src/openhuman/mcp_server/tools.rs b/src/openhuman/mcp_server/tools.rs
index 40fefae77f..dc76b0bfc9 100644
--- a/src/openhuman/mcp_server/tools.rs
+++ b/src/openhuman/mcp_server/tools.rs
@@ -4,7 +4,7 @@ use crate::core::all;
 use crate::openhuman::agent::harness::AgentDefinitionRegistry;
 use crate::openhuman::agent::Agent;
 use crate::openhuman::config::rpc as config_rpc;
-use crate::openhuman::providers::traits::build_tool_instructions_text;
+use crate::openhuman::inference::provider::traits::build_tool_instructions_text;
 use crate::openhuman::security::{SecurityPolicy, ToolOperation};
 
 const DEFAULT_LIMIT: u64 = 10;
diff --git a/src/openhuman/memory/store/factories.rs b/src/openhuman/memory/store/factories.rs
index 7f889ed30b..f2d42bc6ae 100644
--- a/src/openhuman/memory/store/factories.rs
+++ b/src/openhuman/memory/store/factories.rs
@@ -85,12 +85,12 @@ fn reset_health_gate_for_test() {
 
 /// Effective Ollama base URL.
 ///
-/// Delegates to [`crate::openhuman::local_ai::ollama_base_url`] so the probe
+/// Delegates to [`crate::openhuman::inference::local::ollama_base_url`] so the probe
 /// always agrees with the rest of the Ollama machinery on the daemon address.
 /// If a future change adds another env-var override or shifts precedence, the
 /// memory health-gate picks it up automatically.
 fn ollama_base_url_for_probe() -> String {
-    crate::openhuman::local_ai::ollama_base_url()
+    crate::openhuman::inference::local::ollama_base_url()
 }
 
 /// Canonical `(provider, model, dimensions)` tuple used everywhere the
@@ -465,7 +465,7 @@ mod tests {
 
     impl EnvGuard {
         fn set(value: &str) -> Self {
-            let lock = crate::openhuman::local_ai::local_ai_test_guard();
+            let lock = crate::openhuman::inference::local::inference_test_guard();
             let prev = std::env::var_os("OPENHUMAN_OLLAMA_BASE_URL");
             // SAFETY: env mutation is wrapped because Rust 2024 marks it
             // unsafe; the call is gated by the local-AI domain mutex so no
@@ -709,7 +709,7 @@ mod tests {
     /// fresh "first", flaking the suppression assertion.
     #[test]
     fn ollama_health_gate_reports_at_most_once_per_process() {
-        let _lock = crate::openhuman::local_ai::local_ai_test_guard();
+        let _lock = crate::openhuman::inference::local::inference_test_guard();
         reset_health_gate_for_test();
 
         assert!(
diff --git a/src/openhuman/memory/tree/chat/cloud.rs b/src/openhuman/memory/tree/chat/cloud.rs
index 210c7c55a9..a95a979e00 100644
--- a/src/openhuman/memory/tree/chat/cloud.rs
+++ b/src/openhuman/memory/tree/chat/cloud.rs
@@ -1,6 +1,6 @@
 //! Cloud chat provider — routes through the OpenHuman backend's
 //! `/openai/v1/chat/completions` surface using the existing
-//! [`crate::openhuman::providers::openhuman_backend::OpenHumanBackendProvider`].
+//! [`crate::openhuman::inference::provider::openhuman_backend::OpenHumanBackendProvider`].
 //!
 //! Used when `memory_tree.llm_backend = "cloud"` (the default). The
 //! request shape is the standard OpenAI-compatible chat-completions
@@ -16,9 +16,9 @@ use std::path::PathBuf;
 use anyhow::{Context, Result};
 use async_trait::async_trait;
 
-use crate::openhuman::providers::openhuman_backend::OpenHumanBackendProvider;
-use crate::openhuman::providers::traits::{ChatMessage, Provider};
-use crate::openhuman::providers::ProviderRuntimeOptions;
+use crate::openhuman::inference::provider::openhuman_backend::OpenHumanBackendProvider;
+use crate::openhuman::inference::provider::traits::{ChatMessage, Provider};
+use crate::openhuman::inference::provider::ProviderRuntimeOptions;
 
 use super::{ChatPrompt, ChatProvider};
 
diff --git a/src/openhuman/memory/tree/chat/mod.rs b/src/openhuman/memory/tree/chat/mod.rs
index 087648a6bb..0efded0388 100644
--- a/src/openhuman/memory/tree/chat/mod.rs
+++ b/src/openhuman/memory/tree/chat/mod.rs
@@ -15,7 +15,7 @@
 //!
 //! ## Why a memory-tree-local trait
 //!
-//! The existing top-level [`crate::openhuman::providers::Provider`] trait
+//! The existing top-level [`crate::openhuman::inference::provider::Provider`] trait
 //! is rich (streaming, native tool calling, vision, …) and depends on the
 //! agent's full conversation surface. The extractor and summariser only
 //! need:
diff --git a/src/openhuman/memory/tree/score/embed/factory.rs b/src/openhuman/memory/tree/score/embed/factory.rs
index d570041858..e3fb73a969 100644
--- a/src/openhuman/memory/tree/score/embed/factory.rs
+++ b/src/openhuman/memory/tree/score/embed/factory.rs
@@ -33,7 +33,7 @@ use anyhow::Result;
 
 use super::{CloudEmbedder, Embedder, InertEmbedder, OllamaEmbedder};
 use crate::openhuman::config::Config;
-use crate::openhuman::local_ai::ollama_base_url;
+use crate::openhuman::inference::local::ollama_base_url;
 
 /// Cheap heuristic for "is a backend session reachable?" — the cloud
 /// embedder needs one and bails on first embed call without it. We use
diff --git a/src/openhuman/migrations/mod_tests.rs b/src/openhuman/migrations/mod_tests.rs
index 3256c2089b..defe5651d2 100644
--- a/src/openhuman/migrations/mod_tests.rs
+++ b/src/openhuman/migrations/mod_tests.rs
@@ -2,7 +2,7 @@ use super::*;
 use crate::openhuman::agent::harness::session::transcript::{
     read_transcript, write_transcript, TranscriptMeta,
 };
-use crate::openhuman::providers::ChatMessage;
+use crate::openhuman::inference::provider::ChatMessage;
 use std::fs;
 use std::path::Path;
 use tempfile::TempDir;
diff --git a/src/openhuman/migrations/phase_out_profile_md_tests.rs b/src/openhuman/migrations/phase_out_profile_md_tests.rs
index 48f191e10b..a084cee523 100644
--- a/src/openhuman/migrations/phase_out_profile_md_tests.rs
+++ b/src/openhuman/migrations/phase_out_profile_md_tests.rs
@@ -2,7 +2,7 @@ use super::*;
 use crate::openhuman::agent::harness::session::transcript::{
     read_transcript, write_transcript, TranscriptMeta,
 };
-use crate::openhuman::providers::ChatMessage;
+use crate::openhuman::inference::provider::ChatMessage;
 use std::fs;
 use tempfile::TempDir;
 
diff --git a/src/openhuman/migrations/unify_ai_provider_settings.rs b/src/openhuman/migrations/unify_ai_provider_settings.rs
index bb51eba657..a45ef05fe1 100644
--- a/src/openhuman/migrations/unify_ai_provider_settings.rs
+++ b/src/openhuman/migrations/unify_ai_provider_settings.rs
@@ -14,7 +14,7 @@
 //! - `memory_tree.llm_backend` (+ `cloud_llm_model`) — memory summariser
 //!
 //! After this migration there is one grammar — provider strings parsed by
-//! [`crate::openhuman::providers::factory`] — addressing all eight workloads
+//! [`crate::openhuman::inference::provider::factory`] — addressing all eight workloads
 //! uniformly:
 //!
 //! ```text
diff --git a/src/openhuman/mod.rs b/src/openhuman/mod.rs
index 6194c6ae7e..fa22f907e6 100644
--- a/src/openhuman/mod.rs
+++ b/src/openhuman/mod.rs
@@ -41,7 +41,6 @@ pub mod inference;
 pub mod integrations;
 pub mod javascript;
 pub mod learning;
-pub mod local_ai;
 pub mod mcp_client;
 pub mod mcp_server;
 pub mod meet;
@@ -54,7 +53,6 @@ pub mod overlay;
 pub mod people;
 pub mod prompt_injection;
 pub mod provider_surfaces;
-pub mod providers;
 pub mod redirect_links;
 pub mod referral;
 pub mod routing;
diff --git a/src/openhuman/routing/factory.rs b/src/openhuman/routing/factory.rs
index c03b6c3ecc..07941d65e0 100644
--- a/src/openhuman/routing/factory.rs
+++ b/src/openhuman/routing/factory.rs
@@ -2,11 +2,11 @@ use std::sync::Arc;
 use std::time::Duration;
 
 use crate::openhuman::config::LocalAiConfig;
-use crate::openhuman::local_ai::lm_studio_api::lm_studio_base_url_from_local_ai;
-use crate::openhuman::local_ai::ollama_base_url;
-use crate::openhuman::local_ai::provider::normalize_provider;
-use crate::openhuman::providers::compatible::{AuthStyle, OpenAiCompatibleProvider};
-use crate::openhuman::providers::Provider;
+use crate::openhuman::inference::local::lm_studio::lm_studio_base_url_from_local_ai;
+use crate::openhuman::inference::local::ollama_base_url;
+use crate::openhuman::inference::local::provider::normalize_provider;
+use crate::openhuman::inference::provider::compatible::{AuthStyle, OpenAiCompatibleProvider};
+use crate::openhuman::inference::provider::Provider;
 
 use super::health::LocalHealthChecker;
 use super::provider::IntelligentRoutingProvider;
@@ -135,7 +135,7 @@ pub fn new_provider(
 mod tests {
     use super::*;
     use crate::openhuman::config::LocalAiConfig;
-    use crate::openhuman::providers::traits::{ProviderCapabilities, ToolsPayload};
+    use crate::openhuman::inference::provider::traits::{ProviderCapabilities, ToolsPayload};
     use crate::openhuman::tools::ToolSpec;
     use async_trait::async_trait;
 
@@ -242,7 +242,7 @@ mod tests {
         // OPENHUMAN_LOCAL_INFERENCE_URL env var must override config.base_url.
         // This is tested by ensuring construction succeeds when the env var
         // is set — a real URL check would require a running server.
-        let _guard = crate::openhuman::local_ai::local_ai_test_guard();
+        let _guard = crate::openhuman::inference::local::inference_test_guard();
         unsafe {
             std::env::set_var("OPENHUMAN_LOCAL_INFERENCE_URL", "http://127.0.0.1:9999/v1");
         }
diff --git a/src/openhuman/routing/mod.rs b/src/openhuman/routing/mod.rs
index ffff8c5771..2fa6fba166 100644
--- a/src/openhuman/routing/mod.rs
+++ b/src/openhuman/routing/mod.rs
@@ -24,8 +24,8 @@
 //! ```rust,ignore
 //! use std::sync::Arc;
 //! use crate::openhuman::routing;
-//! use crate::openhuman::providers::create_backend_inference_provider;
-//! use crate::openhuman::providers::compatible::{AuthStyle, OpenAiCompatibleProvider};
+//! use crate::openhuman::inference::provider::create_backend_inference_provider;
+//! use crate::openhuman::inference::provider::compatible::{AuthStyle, OpenAiCompatibleProvider};
 //!
 //! let remote = create_backend_inference_provider(api_url, &opts)?;
 //! let provider = routing::new_provider(remote, &config.local_ai, &config.default_model);
diff --git a/src/openhuman/routing/provider.rs b/src/openhuman/routing/provider.rs
index cfbf8043fb..f595d55184 100644
--- a/src/openhuman/routing/provider.rs
+++ b/src/openhuman/routing/provider.rs
@@ -20,7 +20,7 @@ use async_trait::async_trait;
 use crate::openhuman::config::{
     MODEL_AGENTIC_V1, MODEL_CODING_V1, MODEL_REASONING_QUICK_V1, MODEL_REASONING_V1,
 };
-use crate::openhuman::providers::traits::{
+use crate::openhuman::inference::provider::traits::{
     ChatMessage, ChatRequest, ChatResponse, Provider, ProviderCapabilities, StreamChunk,
     StreamError, StreamOptions, StreamResult, ToolsPayload,
 };
diff --git a/src/openhuman/routing/provider_tests.rs b/src/openhuman/routing/provider_tests.rs
index 0ceee1039a..90094f30cc 100644
--- a/src/openhuman/routing/provider_tests.rs
+++ b/src/openhuman/routing/provider_tests.rs
@@ -1,5 +1,5 @@
 use super::*;
-use crate::openhuman::providers::traits::ProviderCapabilities;
+use crate::openhuman::inference::provider::traits::ProviderCapabilities;
 use crate::openhuman::routing::health::LocalHealthChecker;
 use crate::openhuman::routing::policy::RoutingHints;
 use std::sync::{
@@ -463,7 +463,7 @@ async fn capabilities_delegate_to_remote() {
 
 #[tokio::test]
 async fn history_lightweight_uses_local_when_healthy() {
-    use crate::openhuman::providers::traits::ChatMessage;
+    use crate::openhuman::inference::provider::traits::ChatMessage;
     let local = MockProvider::new("local", "local history answer");
     let remote = MockProvider::new("remote", "remote answer");
     let health = LocalHealthChecker::seeded(true);
@@ -487,7 +487,7 @@ async fn history_lightweight_uses_local_when_healthy() {
 
 #[tokio::test]
 async fn history_local_error_falls_back_to_remote() {
-    use crate::openhuman::providers::traits::ChatMessage;
+    use crate::openhuman::inference::provider::traits::ChatMessage;
     let local = MockProvider::new("local", "never");
     local.set_fail(true);
     let remote = MockProvider::new("remote", "remote recovery");
@@ -512,7 +512,7 @@ async fn history_local_error_falls_back_to_remote() {
 
 #[tokio::test]
 async fn history_low_quality_local_falls_back_to_remote() {
-    use crate::openhuman::providers::traits::ChatMessage;
+    use crate::openhuman::inference::provider::traits::ChatMessage;
     // "I cannot help with that." is a known low-quality refusal phrase.
     let local = MockProvider::new("local", "I cannot help with that.");
     let remote = MockProvider::new("remote", "proper answer from remote");
@@ -537,7 +537,7 @@ async fn history_low_quality_local_falls_back_to_remote() {
 
 #[tokio::test]
 async fn history_privacy_required_suppresses_fallback_even_on_error() {
-    use crate::openhuman::providers::traits::ChatMessage;
+    use crate::openhuman::inference::provider::traits::ChatMessage;
     let local = MockProvider::new("local", "blocked");
     local.set_fail(true);
     let remote = MockProvider::new("remote", "should not be called");
@@ -567,7 +567,7 @@ async fn history_privacy_required_suppresses_fallback_even_on_error() {
 
 #[tokio::test]
 async fn tools_present_forces_remote_even_when_local_healthy_and_lightweight() {
-    use crate::openhuman::providers::traits::{ChatMessage, ChatRequest};
+    use crate::openhuman::inference::provider::traits::{ChatMessage, ChatRequest};
     use crate::openhuman::tools::ToolSpec;
 
     let local = MockProvider::new("local", "local answer");
diff --git a/src/openhuman/screen_intelligence/processing_worker.rs b/src/openhuman/screen_intelligence/processing_worker.rs
index ef2784b6ae..a088922f5b 100644
--- a/src/openhuman/screen_intelligence/processing_worker.rs
+++ b/src/openhuman/screen_intelligence/processing_worker.rs
@@ -12,7 +12,7 @@ use std::path::PathBuf;
 use std::sync::Arc;
 
 use crate::openhuman::config::Config;
-use crate::openhuman::local_ai;
+use crate::openhuman::inference::local as local_ai;
 
 use super::helpers::{persist_vision_summary, push_ephemeral_vision_summary, truncate_tail};
 use super::state::AccessibilityEngine;
diff --git a/src/openhuman/subconscious/executor.rs b/src/openhuman/subconscious/executor.rs
index db2e99a09a..43a8054773 100644
--- a/src/openhuman/subconscious/executor.rs
+++ b/src/openhuman/subconscious/executor.rs
@@ -199,11 +199,11 @@ async fn execute_with_local_model(
     let prompt_text = prompt::build_text_execution_prompt(task, situation_report, identity_context);
 
     let messages = vec![
-        crate::openhuman::local_ai::ops::LocalAiChatMessage {
+        crate::openhuman::inference::local::ops::LocalAiChatMessage {
             role: "system".to_string(),
             content: prompt_text,
         },
-        crate::openhuman::local_ai::ops::LocalAiChatMessage {
+        crate::openhuman::inference::local::ops::LocalAiChatMessage {
             role: "user".to_string(),
             content: "Execute the task now.".to_string(),
         },
@@ -259,7 +259,8 @@ async fn agent_chat_with_retry(
 
     loop {
         let result =
-            crate::openhuman::local_ai::ops::agent_chat(config, prompt, None, Some(0.3)).await;
+            crate::openhuman::inference::local::ops::agent_chat(config, prompt, None, Some(0.3))
+                .await;
 
         match result {
             Ok(outcome) => return Ok(outcome.value),
diff --git a/src/openhuman/threads/ops.rs b/src/openhuman/threads/ops.rs
index 4cc569d902..7bcec8de0c 100644
--- a/src/openhuman/threads/ops.rs
+++ b/src/openhuman/threads/ops.rs
@@ -2,6 +2,7 @@
 
 use crate::openhuman::channels::providers::web as web_channel;
 use crate::openhuman::config::Config;
+use crate::openhuman::inference::provider::{self, ProviderRuntimeOptions};
 use crate::openhuman::memory::conversations::{
     self, ConversationMessage, ConversationMessagePatch, ConversationThread,
     CreateConversationThread,
@@ -15,7 +16,6 @@ use crate::openhuman::memory::{
     UpdateConversationMessageRequest, UpdateConversationThreadLabelsRequest,
     UpsertConversationThreadRequest,
 };
-use crate::openhuman::providers::{self, ProviderRuntimeOptions};
 use crate::openhuman::threads::title::{
     build_title_prompt, is_auto_generated_thread_title, sanitize_generated_title,
     title_from_user_message, title_log_fingerprint, THREAD_TITLE_LOG_PREFIX,
@@ -322,7 +322,7 @@ pub async fn thread_generate_title(
         reasoning_enabled: config.runtime.reasoning_enabled,
     };
 
-    let provider = match providers::create_intelligent_routing_provider(
+    let provider = match provider::create_intelligent_routing_provider(
         config.inference_url.as_deref(),
         config.api_url.as_deref(),
         config.api_key.as_deref(),
diff --git a/src/openhuman/tools/impl/agent/delegate.rs b/src/openhuman/tools/impl/agent/delegate.rs
index a122966556..dd43b6479f 100644
--- a/src/openhuman/tools/impl/agent/delegate.rs
+++ b/src/openhuman/tools/impl/agent/delegate.rs
@@ -1,5 +1,7 @@
 use crate::openhuman::config::DelegateAgentConfig;
-use crate::openhuman::providers::{self, Provider};
+use crate::openhuman::inference::provider::{
+    create_backend_inference_provider, Provider, ProviderRuntimeOptions, INFERENCE_BACKEND_ID,
+};
 use crate::openhuman::security::policy::ToolOperation;
 use crate::openhuman::security::SecurityPolicy;
 use crate::openhuman::tool_timeout::tool_execution_timeout_secs;
@@ -18,7 +20,7 @@ pub struct DelegateTool {
     agents: Arc<HashMap<String, DelegateAgentConfig>>,
     security: Arc<SecurityPolicy>,
     /// Provider runtime options inherited from root config.
-    provider_runtime_options: providers::ProviderRuntimeOptions,
+    provider_runtime_options: ProviderRuntimeOptions,
     /// Depth at which this tool instance lives in the delegation chain.
     depth: u32,
 }
@@ -28,17 +30,13 @@ impl DelegateTool {
         agents: HashMap<String, DelegateAgentConfig>,
         security: Arc<SecurityPolicy>,
     ) -> Self {
-        Self::new_with_options(
-            agents,
-            security,
-            providers::ProviderRuntimeOptions::default(),
-        )
+        Self::new_with_options(agents, security, ProviderRuntimeOptions::default())
     }
 
     pub fn new_with_options(
         agents: HashMap<String, DelegateAgentConfig>,
         security: Arc<SecurityPolicy>,
-        provider_runtime_options: providers::ProviderRuntimeOptions,
+        provider_runtime_options: ProviderRuntimeOptions,
     ) -> Self {
         Self {
             agents: Arc::new(agents),
@@ -56,19 +54,14 @@ impl DelegateTool {
         security: Arc<SecurityPolicy>,
         depth: u32,
     ) -> Self {
-        Self::with_depth_and_options(
-            agents,
-            security,
-            depth,
-            providers::ProviderRuntimeOptions::default(),
-        )
+        Self::with_depth_and_options(agents, security, depth, ProviderRuntimeOptions::default())
     }
 
     pub fn with_depth_and_options(
         agents: HashMap<String, DelegateAgentConfig>,
         security: Arc<SecurityPolicy>,
         depth: u32,
-        provider_runtime_options: providers::ProviderRuntimeOptions,
+        provider_runtime_options: ProviderRuntimeOptions,
     ) -> Self {
         Self {
             agents: Arc::new(agents),
@@ -184,7 +177,7 @@ impl Tool for DelegateTool {
             return Ok(ToolResult::error(error));
         }
 
-        let provider: Box<dyn Provider> = match providers::create_backend_inference_provider(
+        let provider: Box<dyn Provider> = match create_backend_inference_provider(
             None,
             None,
             None,
@@ -238,8 +231,7 @@ impl Tool for DelegateTool {
 
                 Ok(ToolResult::success(format!(
                     "[Agent '{agent_name}' ({}/{})]\n{rendered}",
-                    providers::INFERENCE_BACKEND_ID,
-                    agent_config.model
+                    INFERENCE_BACKEND_ID, agent_config.model
                 )))
             }
             Err(e) => Ok(ToolResult::error(format!(
diff --git a/src/openhuman/tools/impl/agent/spawn_parallel_agents_test.rs b/src/openhuman/tools/impl/agent/spawn_parallel_agents_test.rs
index aa364457d5..eaa228fc1a 100644
--- a/src/openhuman/tools/impl/agent/spawn_parallel_agents_test.rs
+++ b/src/openhuman/tools/impl/agent/spawn_parallel_agents_test.rs
@@ -5,11 +5,11 @@ use crate::openhuman::agent::harness::fork_context::{with_parent_context, Parent
 use crate::openhuman::agent::Agent;
 use crate::openhuman::config::AgentConfig;
 use crate::openhuman::context::prompt::ToolCallFormat;
-use crate::openhuman::memory::{Memory, MemoryCategory, MemoryEntry, NamespaceSummary, RecallOpts};
-use crate::openhuman::providers::traits::ProviderCapabilities;
-use crate::openhuman::providers::{
+use crate::openhuman::inference::provider::traits::ProviderCapabilities;
+use crate::openhuman::inference::provider::{
     ChatRequest, ChatResponse, ConversationMessage, Provider, ToolCall,
 };
+use crate::openhuman::memory::{Memory, MemoryCategory, MemoryEntry, NamespaceSummary, RecallOpts};
 use crate::openhuman::tools::{PermissionLevel, Tool, ToolResult};
 use async_trait::async_trait;
 use parking_lot::Mutex;
diff --git a/src/openhuman/tools/impl/agent/spawn_worker_thread.rs b/src/openhuman/tools/impl/agent/spawn_worker_thread.rs
index c015b08dc5..82077f5d43 100644
--- a/src/openhuman/tools/impl/agent/spawn_worker_thread.rs
+++ b/src/openhuman/tools/impl/agent/spawn_worker_thread.rs
@@ -135,8 +135,9 @@ impl Tool for SpawnWorkerThreadTool {
 
         // ── Depth Guard ────────────────────────────────────────────────
         // Check if the current thread is already a worker thread.
-        let current_thread_id = crate::openhuman::providers::thread_context::current_thread_id()
-            .unwrap_or_else(|| "unknown".to_string());
+        let current_thread_id =
+            crate::openhuman::inference::provider::thread_context::current_thread_id()
+                .unwrap_or_else(|| "unknown".to_string());
 
         tracing::info!(
             agent_id = %agent_id,
@@ -288,7 +289,7 @@ mod tests {
 
     struct MockProvider;
     #[async_trait]
-    impl crate::openhuman::providers::Provider for MockProvider {
+    impl crate::openhuman::inference::provider::Provider for MockProvider {
         async fn chat_with_system(
             &self,
             _: Option<&str>,
@@ -300,11 +301,11 @@ mod tests {
         }
         async fn chat(
             &self,
-            _: crate::openhuman::providers::ChatRequest<'_>,
+            _: crate::openhuman::inference::provider::ChatRequest<'_>,
             _: &str,
             _: f64,
-        ) -> anyhow::Result<crate::openhuman::providers::ChatResponse> {
-            Ok(crate::openhuman::providers::ChatResponse {
+        ) -> anyhow::Result<crate::openhuman::inference::provider::ChatResponse> {
+            Ok(crate::openhuman::inference::provider::ChatResponse {
                 text: Some("done".into()),
                 tool_calls: vec![],
                 usage: None,
@@ -408,26 +409,29 @@ mod tests {
         )
         .unwrap();
 
-        crate::openhuman::providers::thread_context::with_thread_id(thread_id.to_string(), async {
-            let parent = test_parent_ctx(temp.path().to_path_buf());
-            with_parent_context(parent, async {
-                let tool = SpawnWorkerThreadTool::new();
-                let result = tool
-                    .execute(json!({
-                        "agent_id": "researcher",
-                        "prompt": "do it",
-                        "task_title": "Task"
-                    }))
-                    .await
-                    .unwrap();
-
-                assert!(result.is_error);
-                assert!(result
-                    .output()
-                    .contains("cannot spawn other worker threads"));
-            })
-            .await;
-        })
+        crate::openhuman::inference::provider::thread_context::with_thread_id(
+            thread_id.to_string(),
+            async {
+                let parent = test_parent_ctx(temp.path().to_path_buf());
+                with_parent_context(parent, async {
+                    let tool = SpawnWorkerThreadTool::new();
+                    let result = tool
+                        .execute(json!({
+                            "agent_id": "researcher",
+                            "prompt": "do it",
+                            "task_title": "Task"
+                        }))
+                        .await
+                        .unwrap();
+
+                    assert!(result.is_error);
+                    assert!(result
+                        .output()
+                        .contains("cannot spawn other worker threads"));
+                })
+                .await;
+            },
+        )
         .await;
     }
 
@@ -447,26 +451,29 @@ mod tests {
         )
         .unwrap();
 
-        crate::openhuman::providers::thread_context::with_thread_id(thread_id.to_string(), async {
-            let parent = test_parent_ctx(temp.path().to_path_buf());
-            with_parent_context(parent, async {
-                let tool = SpawnWorkerThreadTool::new();
-                let result = tool
-                    .execute(json!({
-                        "agent_id": "researcher",
-                        "prompt": "do it",
-                        "task_title": "Task"
-                    }))
-                    .await
-                    .unwrap();
-
-                assert!(result.is_error);
-                assert!(result
-                    .output()
-                    .contains("cannot spawn other worker threads"));
-            })
-            .await;
-        })
+        crate::openhuman::inference::provider::thread_context::with_thread_id(
+            thread_id.to_string(),
+            async {
+                let parent = test_parent_ctx(temp.path().to_path_buf());
+                with_parent_context(parent, async {
+                    let tool = SpawnWorkerThreadTool::new();
+                    let result = tool
+                        .execute(json!({
+                            "agent_id": "researcher",
+                            "prompt": "do it",
+                            "task_title": "Task"
+                        }))
+                        .await
+                        .unwrap();
+
+                    assert!(result.is_error);
+                    assert!(result
+                        .output()
+                        .contains("cannot spawn other worker threads"));
+                })
+                .await;
+            },
+        )
         .await;
     }
 }
diff --git a/src/openhuman/tools/impl/agent/todo_write.rs b/src/openhuman/tools/impl/agent/todo_write.rs
index 388ac27263..3ee3f92dd5 100644
--- a/src/openhuman/tools/impl/agent/todo_write.rs
+++ b/src/openhuman/tools/impl/agent/todo_write.rs
@@ -8,7 +8,7 @@ use crate::openhuman::agent::progress::AgentProgress;
 use crate::openhuman::agent::task_board::{
     TaskBoard, TaskBoardCard, TaskBoardStore, TaskCardStatus,
 };
-use crate::openhuman::providers::thread_context;
+use crate::openhuman::inference::provider::thread_context;
 use crate::openhuman::tools::traits::{PermissionLevel, Tool, ToolResult};
 use async_trait::async_trait;
 use parking_lot::Mutex;
@@ -288,11 +288,11 @@ mod tests {
         with_parent_context, ParentExecutionContext,
     };
     use crate::openhuman::context::prompt::ToolCallFormat;
+    use crate::openhuman::inference::provider::thread_context::with_thread_id;
+    use crate::openhuman::inference::provider::{ChatRequest, ChatResponse, Provider};
     use crate::openhuman::memory::{
         Memory, MemoryCategory, MemoryEntry, NamespaceSummary, RecallOpts,
     };
-    use crate::openhuman::providers::thread_context::with_thread_id;
-    use crate::openhuman::providers::{ChatRequest, ChatResponse, Provider};
 
     #[tokio::test]
     async fn todowrite_basic() {
diff --git a/src/openhuman/tools/ops.rs b/src/openhuman/tools/ops.rs
index 0e656d3b38..0caff68c50 100644
--- a/src/openhuman/tools/ops.rs
+++ b/src/openhuman/tools/ops.rs
@@ -338,7 +338,7 @@ pub fn all_tools_with_runtime(
         tools.push(Box::new(DelegateTool::new_with_options(
             delegate_agents,
             security.clone(),
-            crate::openhuman::providers::ProviderRuntimeOptions {
+            crate::openhuman::inference::provider::ProviderRuntimeOptions {
                 auth_profile_override: None,
                 openhuman_dir: root_config
                     .config_path
diff --git a/src/openhuman/tree_summarizer/engine.rs b/src/openhuman/tree_summarizer/engine.rs
index 8398604e55..83ca2425a9 100644
--- a/src/openhuman/tree_summarizer/engine.rs
+++ b/src/openhuman/tree_summarizer/engine.rs
@@ -7,7 +7,7 @@ use std::collections::BTreeMap;
 
 use crate::core::event_bus::{publish_global, DomainEvent};
 use crate::openhuman::config::Config;
-use crate::openhuman::providers::traits::Provider;
+use crate::openhuman::inference::provider::traits::Provider;
 use crate::openhuman::tree_summarizer::store;
 use crate::openhuman::tree_summarizer::types::{
     derive_node_ids, derive_parent_id, estimate_tokens, level_from_node_id, NodeLevel, TreeNode,
diff --git a/src/openhuman/tree_summarizer/ops.rs b/src/openhuman/tree_summarizer/ops.rs
index 667c9c1fd2..87ad921296 100644
--- a/src/openhuman/tree_summarizer/ops.rs
+++ b/src/openhuman/tree_summarizer/ops.rs
@@ -146,7 +146,7 @@ pub async fn tree_summarizer_rebuild(
 
 fn create_provider(
     config: &Config,
-) -> Result<Box<dyn crate::openhuman::providers::traits::Provider>, String> {
+) -> Result<Box<dyn crate::openhuman::inference::provider::traits::Provider>, String> {
     // Tree summarization runs exclusively on local AI to keep memory
     // processing private and offline — no backend calls.
     if !config.local_ai.runtime_enabled {
@@ -159,10 +159,10 @@ fn create_provider(
 /// wrapped in `ReliableProvider` for retry/backoff on transient failures.
 fn create_local_ai_provider(
     config: &Config,
-) -> Result<Box<dyn crate::openhuman::providers::traits::Provider>, String> {
-    use crate::openhuman::local_ai::OLLAMA_BASE_URL;
-    use crate::openhuman::providers::compatible::{AuthStyle, OpenAiCompatibleProvider};
-    use crate::openhuman::providers::reliable::ReliableProvider;
+) -> Result<Box<dyn crate::openhuman::inference::provider::traits::Provider>, String> {
+    use crate::openhuman::inference::local::OLLAMA_BASE_URL;
+    use crate::openhuman::inference::provider::compatible::{AuthStyle, OpenAiCompatibleProvider};
+    use crate::openhuman::inference::provider::reliable::ReliableProvider;
 
     let base_url = format!("{}/v1", OLLAMA_BASE_URL);
     let inner = OpenAiCompatibleProvider::new_no_responses_fallback(
@@ -174,7 +174,7 @@ fn create_local_ai_provider(
 
     let providers: Vec<(
         String,
-        Box<dyn crate::openhuman::providers::traits::Provider>,
+        Box<dyn crate::openhuman::inference::provider::traits::Provider>,
     )> = vec![("ollama-local".to_string(), Box::new(inner))];
     let reliable = ReliableProvider::new(
         providers,
diff --git a/src/openhuman/voice/mod.rs b/src/openhuman/voice/mod.rs
index ffcc043ce1..6d713dcfbe 100644
--- a/src/openhuman/voice/mod.rs
+++ b/src/openhuman/voice/mod.rs
@@ -3,25 +3,33 @@
 //! Provides RPC endpoints under the `openhuman.voice_*` namespace for
 //! transcription, synthesis, proactive availability checking, and a
 //! standalone voice dictation server (hotkey → record → transcribe → insert).
+//!
+//! Inference implementations (local_speech, local_transcribe, cloud_transcribe,
+//! hallucination, streaming, postprocess) now live under
+//! `crate::openhuman::inference::voice` so all inference concerns share a
+//! single domain root.
 
 pub mod audio_capture;
 pub(crate) mod cli;
-pub mod cloud_transcribe;
 pub mod dictation_listener;
 pub mod factory;
-pub mod hallucination;
 pub mod hotkey;
-pub mod local_speech;
-pub mod local_transcribe;
 mod ops;
-mod postprocess;
 pub mod reply_speech;
 mod schemas;
 pub mod server;
-pub mod streaming;
 pub mod text_input;
 mod types;
 
+// Re-export the inference-side voice modules so `voice::local_speech`,
+// `voice::local_transcribe`, etc. continue to resolve for existing callers.
+pub use crate::openhuman::inference::voice::cloud_transcribe;
+pub use crate::openhuman::inference::voice::hallucination;
+pub use crate::openhuman::inference::voice::local_speech;
+pub use crate::openhuman::inference::voice::local_transcribe;
+pub use crate::openhuman::inference::voice::postprocess;
+pub use crate::openhuman::inference::voice::streaming;
+
 pub use factory::{
     create_stt_provider, create_tts_provider, default_stt_provider, default_tts_provider,
     SttProvider, SttResult, TtsProvider, DEFAULT_PIPER_VOICE, DEFAULT_WHISPER_MODEL,
diff --git a/src/openhuman/voice/ops.rs b/src/openhuman/voice/ops.rs
index 5f25e0570f..85e59ea559 100644
--- a/src/openhuman/voice/ops.rs
+++ b/src/openhuman/voice/ops.rs
@@ -8,12 +8,12 @@ use log::{debug, warn};
 use std::time::Instant;
 
 use crate::openhuman::config::Config;
-use crate::openhuman::local_ai;
-use crate::openhuman::local_ai::model_ids;
-use crate::openhuman::local_ai::paths::{
+use crate::openhuman::inference::local as local_ai;
+use crate::openhuman::inference::local::model_ids;
+use crate::openhuman::inference::local::paths::{
     resolve_piper_binary, resolve_stt_model_path, resolve_tts_voice_path, resolve_whisper_binary,
 };
-use crate::openhuman::local_ai::whisper_engine;
+use crate::openhuman::inference::local::whisper_engine;
 use crate::rpc::RpcOutcome;
 
 use super::hallucination::{is_hallucinated_output, HallucinationMode};
diff --git a/src/openhuman/voice/types.rs b/src/openhuman/voice/types.rs
index 1103090633..b4136647f5 100644
--- a/src/openhuman/voice/types.rs
+++ b/src/openhuman/voice/types.rs
@@ -2,7 +2,7 @@
 
 use serde::{Deserialize, Serialize};
 
-use crate::openhuman::local_ai::{LocalAiSpeechResult, LocalAiTtsResult};
+use crate::openhuman::inference::{LocalAiSpeechResult, LocalAiTtsResult};
 
 /// Result of a speech-to-text transcription.
 #[derive(Debug, Clone, Serialize, Deserialize)]
diff --git a/tests/agent_builder_public.rs b/tests/agent_builder_public.rs
index bb16e0206b..46433459b5 100644
--- a/tests/agent_builder_public.rs
+++ b/tests/agent_builder_public.rs
@@ -3,8 +3,8 @@ use async_trait::async_trait;
 use openhuman_core::openhuman::agent::dispatcher::XmlToolDispatcher;
 use openhuman_core::openhuman::agent::Agent;
 use openhuman_core::openhuman::context::prompt::SystemPromptBuilder;
+use openhuman_core::openhuman::inference::provider::{ChatRequest, ChatResponse, Provider};
 use openhuman_core::openhuman::memory::{Memory, MemoryCategory, MemoryEntry};
-use openhuman_core::openhuman::providers::{ChatRequest, ChatResponse, Provider};
 use openhuman_core::openhuman::tools::{Tool, ToolResult};
 use std::collections::HashSet;
 use std::sync::Arc;
diff --git a/tests/agent_harness_public.rs b/tests/agent_harness_public.rs
index 6eb9e21ccc..5e1fd96c3a 100644
--- a/tests/agent_harness_public.rs
+++ b/tests/agent_harness_public.rs
@@ -7,8 +7,10 @@ use openhuman_core::openhuman::agent::hooks::{
     fire_hooks, sanitize_tool_output, PostTurnHook, ToolCallRecord, TurnContext,
 };
 use openhuman_core::openhuman::config::AgentConfig;
+use openhuman_core::openhuman::inference::provider::{
+    ChatMessage, ChatRequest, ChatResponse, Provider,
+};
 use openhuman_core::openhuman::memory::{Memory, MemoryCategory, MemoryEntry};
-use openhuman_core::openhuman::providers::{ChatMessage, ChatRequest, ChatResponse, Provider};
 use parking_lot::Mutex;
 use std::sync::atomic::Ordering;
 use std::sync::Arc;
diff --git a/tests/agent_multimodal_public.rs b/tests/agent_multimodal_public.rs
index 7d67da8703..5b17ca276f 100644
--- a/tests/agent_multimodal_public.rs
+++ b/tests/agent_multimodal_public.rs
@@ -4,7 +4,7 @@ use openhuman_core::openhuman::agent::multimodal::{
     prepare_messages_for_provider,
 };
 use openhuman_core::openhuman::config::MultimodalConfig;
-use openhuman_core::openhuman::providers::ChatMessage;
+use openhuman_core::openhuman::inference::provider::ChatMessage;
 
 #[test]
 fn marker_helpers_cover_mixed_content_and_payload_extraction() {
diff --git a/tests/calendar_grounding_e2e.rs b/tests/calendar_grounding_e2e.rs
index cdd3664b58..20df3900ab 100644
--- a/tests/calendar_grounding_e2e.rs
+++ b/tests/calendar_grounding_e2e.rs
@@ -2,7 +2,7 @@ use anyhow::Result;
 use async_trait::async_trait;
 use openhuman_core::openhuman::agent::dispatcher::NativeToolDispatcher;
 use openhuman_core::openhuman::agent::Agent;
-use openhuman_core::openhuman::providers::{
+use openhuman_core::openhuman::inference::provider::{
     ChatMessage, ChatRequest, ChatResponse, Provider, ToolCall,
 };
 use openhuman_core::openhuman::tools::{PermissionLevel, Tool, ToolResult};
diff --git a/tests/json_rpc_e2e.rs b/tests/json_rpc_e2e.rs
index 55f9aaeafa..13215581cb 100644
--- a/tests/json_rpc_e2e.rs
+++ b/tests/json_rpc_e2e.rs
@@ -2247,9 +2247,11 @@ async fn json_rpc_web_chat_custom_reasoning_provider_with_auth_none_omits_auth_h
     let loaded_config = openhuman_core::openhuman::config::load_config_with_timeout()
         .await
         .expect("load_config after auth-none update");
-    let (provider, model) =
-        openhuman_core::openhuman::providers::create_chat_provider("reasoning", &loaded_config)
-            .expect("custom auth-none provider should build");
+    let (provider, model) = openhuman_core::openhuman::inference::provider::create_chat_provider(
+        "reasoning",
+        &loaded_config,
+    )
+    .expect("custom auth-none provider should build");
     let direct = provider
         .simple_chat("direct custom-provider smoke test", &model, 0.0)
         .await

From 2eab968689eab5afe51914b55df294ae7bbacaaf Mon Sep 17 00:00:00 2001
From: Steven Enamakel <enamakel@tinyhumans.ai>
Date: Sat, 16 May 2026 20:45:44 -0700
Subject: [PATCH 14/18] feat(inference): per-model temperature toggle, provider
 E2E tests, docker wiring

- temperature_unsupported_models config field (glob, default ["o1*","o3*","o4*","gpt-5*"])
  added to Config; serde default and Config::default() both initialize it
- temperature.rs: glob_match helper + temperature_for_model; 13 unit tests cover
  prefix/suffix/contains globs and all default model patterns
- ApiChatRequest and NativeChatRequest temperature fields changed to Option<f64>
  with skip_serializing_if, so matched models get no temperature key on the wire
- OpenAiCompatibleProvider gains temperature_unsupported_models field and
  effective_temperature() helper; all six request-body construction sites updated
- Factory threads the config list through make_cloud_provider_by_slug and
  make_ollama_provider via new with_temperature_unsupported_models builder
- server.rs logs a warning when the caller supplies temperature for an unsupported model
- tests/inference_provider_e2e.rs: 14 tests via wiremock covering OpenAI-compat chat,
  temperature present/absent, Anthropic auth style, streaming, Ollama, /v1 HTTP auth
- scripts/test-rust-inference-e2e.sh + e2e/docker-compose.yml inference-e2e service
---
 Cargo.lock                                    |  34 ++
 Cargo.toml                                    |   2 +
 e2e/docker-compose.yml                        |  13 +
 gitbooks/developing/e2e-testing.md            |  14 +
 scripts/test-rust-inference-e2e.sh            |  22 +
 src/openhuman/config/schema/types.rs          |  19 +
 src/openhuman/inference/http/server.rs        |  19 +-
 src/openhuman/inference/http/tests.rs         |  27 +-
 .../inference/provider/compatible.rs          |  46 +-
 .../inference/provider/compatible_tests.rs    |  12 +-
 .../inference/provider/compatible_types.rs    |   6 +-
 src/openhuman/inference/provider/factory.rs   |  49 +-
 src/openhuman/inference/provider/mod.rs       |   1 +
 .../inference/provider/temperature.rs         | 200 ++++++
 tests/inference_provider_e2e.rs               | 569 ++++++++++++++++++
 15 files changed, 1006 insertions(+), 27 deletions(-)
 create mode 100755 scripts/test-rust-inference-e2e.sh
 create mode 100644 src/openhuman/inference/provider/temperature.rs
 create mode 100644 tests/inference_provider_e2e.rs

diff --git a/Cargo.lock b/Cargo.lock
index 1a15232824..b12897f9ae 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -257,6 +257,16 @@ version = "1.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9dbc3a507a82b17ba0d98f6ce8fd6954ea0c8152e98009d36a40d8dcc8ce078a"
 
+[[package]]
+name = "assert-json-diff"
+version = "2.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "47e4f2b81832e72834d7518d8487a0396a28cc408186a2e8854c0f98011faf12"
+dependencies = [
+ "serde",
+ "serde_json",
+]
+
 [[package]]
 name = "assign"
 version = "1.1.1"
@@ -5062,6 +5072,7 @@ dependencies = [
  "whatsapp-rust-tokio-transport",
  "whatsapp-rust-ureq-http-client",
  "whisper-rs",
+ "wiremock",
  "xz2",
  "zip",
 ]
@@ -9471,6 +9482,29 @@ dependencies = [
  "memchr",
 ]
 
+[[package]]
+name = "wiremock"
+version = "0.6.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "08db1edfb05d9b3c1542e521aea074442088292f00b5f28e435c714a98f85031"
+dependencies = [
+ "assert-json-diff",
+ "base64 0.22.1",
+ "deadpool",
+ "futures",
+ "http 1.4.0",
+ "http-body-util",
+ "hyper",
+ "hyper-util",
+ "log",
+ "once_cell",
+ "regex",
+ "serde",
+ "serde_json",
+ "tokio",
+ "url",
+]
+
 [[package]]
 name = "wit-bindgen"
 version = "0.51.0"
diff --git a/Cargo.toml b/Cargo.toml
index 6ca3331df9..031cf9d4bc 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -169,6 +169,8 @@ rppal = { version = "0.22", optional = true }
 # crates we never use (and that bloat the dev Cargo.lock noticeably).
 # TestTransport only needs the `test` feature.
 sentry = { version = "0.47.0", default-features = false, features = ["test"] }
+# Mock HTTP server for provider E2E tests (inference_provider_e2e).
+wiremock = "0.6"
 
 [features]
 sandbox-landlock = ["dep:landlock"]
diff --git a/e2e/docker-compose.yml b/e2e/docker-compose.yml
index fbdd02b572..97bf1a55ba 100644
--- a/e2e/docker-compose.yml
+++ b/e2e/docker-compose.yml
@@ -76,6 +76,19 @@ services:
     # ubuntu-22.04 runners give CEF.
     shm_size: 2gb
 
+  # Rust inference provider E2E tests (wiremock-based, no live LLM needed).
+  # Run: docker compose -f e2e/docker-compose.yml run --rm inference-e2e
+  inference-e2e:
+    image: ${OPENHUMAN_CI_IMAGE:-ghcr.io/tinyhumansai/openhuman_ci:latest}
+    working_dir: /workspace
+    volumes:
+      - ../:/workspace
+      - e2e-cargo-registry:/usr/local/cargo/registry
+      - e2e-cargo-git:/usr/local/cargo/git
+      - e2e-rust-target:/workspace/target
+    entrypoint: ["bash", "-lc"]
+    command: ["./scripts/test-rust-inference-e2e.sh"]
+
 volumes:
   e2e-cargo-registry:
   e2e-cargo-git:
diff --git a/gitbooks/developing/e2e-testing.md b/gitbooks/developing/e2e-testing.md
index 79e1fc6e82..5371200ff8 100644
--- a/gitbooks/developing/e2e-testing.md
+++ b/gitbooks/developing/e2e-testing.md
@@ -215,3 +215,17 @@ bash app/scripts/e2e-agent-review.sh
 ```
 
 Artifacts land in `app/test/e2e/artifacts/<timestamp>-agent-review/`. Full details + helper API: [`AGENT-OBSERVABILITY.md`](AGENT-OBSERVABILITY.md). Any failing test triggers `wdio.conf.ts`'s `afterTest` hook, which writes `failure-*.png` + `failure-*.source.xml` into the same run dir.
+
+---
+
+## Rust inference provider E2E
+
+These tests (`tests/inference_provider_e2e.rs`) use **wiremock** to mock HTTP upstreams and require no live LLM API calls. They cover OpenAI-compat chat, Anthropic auth style, per-model temperature suppression, Ollama local provider, and the `/v1` HTTP endpoint auth layer.
+
+```bash
+# Local:
+bash scripts/test-rust-inference-e2e.sh
+
+# Via Docker (Linux, same image as CI):
+docker compose -f e2e/docker-compose.yml run --rm inference-e2e
+```
diff --git a/scripts/test-rust-inference-e2e.sh b/scripts/test-rust-inference-e2e.sh
new file mode 100755
index 0000000000..2760b7082a
--- /dev/null
+++ b/scripts/test-rust-inference-e2e.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+# Run the inference provider E2E tests (tests/inference_provider_e2e.rs).
+#
+# These tests use wiremock to mock HTTP upstreams — no live LLM API is needed.
+# They exercise:
+#   - OpenAI-compatible chat and streaming paths
+#   - Anthropic auth style header verification
+#   - Per-model temperature suppression (o1/o3/o4/gpt-5 patterns)
+#   - Ollama local provider (via OpenAI-compat /v1 endpoint)
+#   - /v1/chat/completions and /v1/models HTTP endpoint auth layer
+#
+# Usage:
+#   bash scripts/test-rust-inference-e2e.sh
+#
+# Via Docker (Linux):
+#   docker compose -f e2e/docker-compose.yml run --rm inference-e2e
+#
+# The shared mock backend is NOT required by these tests (they use wiremock
+# directly), but this script delegates to test-rust-with-mock.sh for
+# consistency with the rest of the Rust test runner tooling.
+set -euo pipefail
+exec bash "$(dirname "$0")/test-rust-with-mock.sh" --test inference_provider_e2e "$@"
diff --git a/src/openhuman/config/schema/types.rs b/src/openhuman/config/schema/types.rs
index d409bd24d6..d783353b82 100644
--- a/src/openhuman/config/schema/types.rs
+++ b/src/openhuman/config/schema/types.rs
@@ -56,6 +56,12 @@ pub struct Config {
     #[serde(default = "default_temperature_value")]
     pub default_temperature: f64,
 
+    /// Models (by exact ID match OR shell-style glob like `gpt-5*`, `o1-*`) that
+    /// MUST NOT receive a `temperature` parameter. Used for reasoning models
+    /// that error out when temperature is set (OpenAI o-series, GPT-5).
+    #[serde(default = "default_temperature_unsupported_models")]
+    pub temperature_unsupported_models: Vec<String>,
+
     #[serde(default)]
     pub observability: ObservabilityConfig,
 
@@ -315,6 +321,18 @@ fn default_temperature_value() -> f64 {
     DEFAULT_TEMPERATURE
 }
 
+/// Returns the default list of model glob patterns that do not support the
+/// `temperature` parameter. These cover OpenAI o-series and GPT-5 reasoning
+/// models that return an error when `temperature` is included in the request.
+fn default_temperature_unsupported_models() -> Vec<String> {
+    vec![
+        "o1*".to_string(),
+        "o3*".to_string(),
+        "o4*".to_string(),
+        "gpt-5*".to_string(),
+    ]
+}
+
 impl Config {
     /// Resolve the root directory where chunk `.md` files are stored.
     ///
@@ -398,6 +416,7 @@ impl Default for Config {
             inference_url: None,
             default_model: Some(DEFAULT_MODEL.to_string()),
             default_temperature: DEFAULT_TEMPERATURE,
+            temperature_unsupported_models: default_temperature_unsupported_models(),
             observability: ObservabilityConfig::default(),
             autonomy: AutonomyConfig::default(),
             runtime: RuntimeConfig::default(),
diff --git a/src/openhuman/inference/http/server.rs b/src/openhuman/inference/http/server.rs
index a12da7bdba..5c4f877de8 100644
--- a/src/openhuman/inference/http/server.rs
+++ b/src/openhuman/inference/http/server.rs
@@ -116,7 +116,24 @@ async fn chat_completions_handler(
         })
         .collect();
 
-    let temperature = req.temperature.unwrap_or(config.default_temperature);
+    // If the caller supplied a temperature but the model is on the unsupported
+    // list, log a warning and drop it — sending temperature to o1/o3/o4/gpt-5
+    // reasoning models causes an API error. The provider layer applies the same
+    // check on the outbound body, so this is belt-and-suspenders for logging.
+    let temperature = {
+        let raw = req.temperature.unwrap_or(config.default_temperature);
+        let suppressed = crate::openhuman::inference::provider::temperature::temperature_for_model(
+            &model_id, raw, &config,
+        );
+        if suppressed.is_none() && req.temperature.is_some() {
+            tracing::warn!(
+                model = %model_id,
+                requested_temperature = req.temperature.unwrap_or(0.0),
+                "{LOG_PREFIX} dropping caller-supplied temperature — model is on temperature_unsupported_models list"
+            );
+        }
+        raw // the Provider layer handles omission; we pass the value through
+    };
     let completion_id = format!("chatcmpl-{}", uuid::Uuid::new_v4());
     let created = chrono::Utc::now().timestamp();
     let model_name = req.model.clone();
diff --git a/src/openhuman/inference/http/tests.rs b/src/openhuman/inference/http/tests.rs
index d25268413a..987ceaffd6 100644
--- a/src/openhuman/inference/http/tests.rs
+++ b/src/openhuman/inference/http/tests.rs
@@ -7,14 +7,35 @@
 //! A running inference backend is NOT required — the tests exercise the
 //! routing and auth-middleware layers only.
 
+use std::sync::Once;
+
 use axum::body::Body;
 use axum::http::{header, Method, Request, StatusCode};
 use tower::ServiceExt;
 
+use crate::core::auth::CORE_TOKEN_ENV_VAR;
 use crate::core::jsonrpc::build_core_http_router;
 
+const TEST_RPC_TOKEN: &str = "inference-http-tests-token";
+
+/// Initialize the per-process RPC bearer token exactly once, so that the
+/// auth middleware can answer 401 instead of 500 ("auth subsystem not
+/// initialized") in tests that don't spin up a real core.
+fn ensure_test_rpc_auth() {
+    static INIT: Once = Once::new();
+    INIT.call_once(|| {
+        // SAFETY: test-only init; we serialize via `Once`, and live_routing_e2e
+        // uses its own env lock + a different token value so the two test
+        // binaries don't collide (they run in separate processes anyway).
+        unsafe { std::env::set_var(CORE_TOKEN_ENV_VAR, TEST_RPC_TOKEN) };
+        let tmp = tempfile::tempdir().expect("tempdir for token file");
+        crate::core::auth::init_rpc_token(tmp.path()).expect("init rpc auth token for http tests");
+    });
+}
+
 /// Build the test router (Socket.IO disabled — no real runtime needed).
 fn test_router() -> axum::Router {
+    ensure_test_rpc_auth();
     build_core_http_router(false)
 }
 
@@ -71,9 +92,9 @@ async fn test_models_no_bearer_returns_401() {
 /// test only asserts that auth passed.
 #[tokio::test]
 async fn test_chat_completions_with_bearer_not_rejected_as_auth_error() {
-    // Use the env var if set (CI with a real core token), otherwise use any
-    // non-empty string — the test-support middleware accepts it.
-    let token = std::env::var("OPENHUMAN_CORE_TOKEN").unwrap_or_else(|_| "test-token".to_string());
+    // Use the same token that `ensure_test_rpc_auth` installed via the
+    // `Once` initializer in this module.
+    let token = TEST_RPC_TOKEN.to_string();
 
     let body = serde_json::json!({
         "model": "ollama:llama3",
diff --git a/src/openhuman/inference/provider/compatible.rs b/src/openhuman/inference/provider/compatible.rs
index 8d3c3f3813..80d7711023 100644
--- a/src/openhuman/inference/provider/compatible.rs
+++ b/src/openhuman/inference/provider/compatible.rs
@@ -66,6 +66,11 @@ pub struct OpenAiCompatibleProvider {
     /// never see an unrecognized field that could trip strict input
     /// validation.
     emit_openhuman_thread_id: bool,
+    /// Shell-style glob patterns (`*` only) for model IDs that MUST NOT
+    /// receive a `temperature` field. Matches are done by
+    /// `temperature::glob_match`. Defaults to empty (all models support
+    /// temperature); populated by the factory when the config has entries.
+    pub(crate) temperature_unsupported_models: Vec<String>,
 }
 
 /// How the provider expects the API key to be sent.
@@ -165,6 +170,35 @@ impl OpenAiCompatibleProvider {
             user_agent: user_agent.map(ToString::to_string),
             merge_system_into_user,
             emit_openhuman_thread_id: false,
+            temperature_unsupported_models: Vec::new(),
+        }
+    }
+
+    /// Set the list of model glob patterns for which temperature must be
+    /// omitted from request bodies. Called by the provider factory to
+    /// propagate `config.temperature_unsupported_models`.
+    pub fn with_temperature_unsupported_models(mut self, patterns: Vec<String>) -> Self {
+        self.temperature_unsupported_models = patterns;
+        self
+    }
+
+    /// Resolve the effective temperature for `model`. Returns `None` when the
+    /// model matches a pattern in `temperature_unsupported_models` (causing the
+    /// field to be omitted from the serialised request).
+    fn effective_temperature(&self, model: &str, temperature: f64) -> Option<f64> {
+        if self
+            .temperature_unsupported_models
+            .iter()
+            .any(|pat| super::temperature::glob_match(pat, model))
+        {
+            tracing::debug!(
+                "[provider:{}] model='{}' matched temperature_unsupported_models — omitting temperature",
+                self.name,
+                model
+            );
+            None
+        } else {
+            Some(temperature)
         }
     }
 
@@ -1146,7 +1180,7 @@ impl Provider for OpenAiCompatibleProvider {
         let request = ApiChatRequest {
             model: model.to_string(),
             messages,
-            temperature,
+            temperature: self.effective_temperature(model, temperature),
             stream: Some(false),
             tools: None,
             tool_choice: None,
@@ -1280,7 +1314,7 @@ impl Provider for OpenAiCompatibleProvider {
         let request = ApiChatRequest {
             model: model.to_string(),
             messages: api_messages,
-            temperature,
+            temperature: self.effective_temperature(model, temperature),
             stream: Some(false),
             tools: None,
             tool_choice: None,
@@ -1380,7 +1414,7 @@ impl Provider for OpenAiCompatibleProvider {
         let request = ApiChatRequest {
             model: model.to_string(),
             messages: api_messages,
-            temperature,
+            temperature: self.effective_temperature(model, temperature),
             stream: Some(false),
             tools: if tools.is_empty() {
                 None
@@ -1477,7 +1511,7 @@ impl Provider for OpenAiCompatibleProvider {
             let native_request = NativeChatRequest {
                 model: model.to_string(),
                 messages: Self::convert_messages_for_native(&effective_messages),
-                temperature,
+                temperature: self.effective_temperature(model, temperature),
                 stream: Some(true),
                 tool_choice: tools.as_ref().map(|_| "auto".to_string()),
                 tools: tools.clone(),
@@ -1519,7 +1553,7 @@ impl Provider for OpenAiCompatibleProvider {
         let native_request = NativeChatRequest {
             model: model.to_string(),
             messages: Self::convert_messages_for_native(&effective_messages),
-            temperature,
+            temperature: self.effective_temperature(model, temperature),
             stream: Some(false),
             tool_choice: tools.as_ref().map(|_| "auto".to_string()),
             tools,
@@ -1670,7 +1704,7 @@ impl Provider for OpenAiCompatibleProvider {
         let request = ApiChatRequest {
             model: model.to_string(),
             messages,
-            temperature,
+            temperature: self.effective_temperature(model, temperature),
             stream: Some(options.enabled),
             tools: None,
             tool_choice: None,
diff --git a/src/openhuman/inference/provider/compatible_tests.rs b/src/openhuman/inference/provider/compatible_tests.rs
index e28296db68..fb242db30e 100644
--- a/src/openhuman/inference/provider/compatible_tests.rs
+++ b/src/openhuman/inference/provider/compatible_tests.rs
@@ -55,7 +55,7 @@ fn native_request_emits_thread_id_when_present() {
     let req = super::NativeChatRequest {
         model: "sonnet".to_string(),
         messages: Vec::new(),
-        temperature: 0.7,
+        temperature: Some(0.7),
         stream: Some(false),
         tools: None,
         tool_choice: None,
@@ -72,7 +72,7 @@ fn native_request_emits_thread_id_when_present() {
     let req_no_thread = super::NativeChatRequest {
         model: "sonnet".to_string(),
         messages: Vec::new(),
-        temperature: 0.7,
+        temperature: Some(0.7),
         stream: Some(false),
         tools: None,
         tool_choice: None,
@@ -96,7 +96,7 @@ fn streaming_request_sets_stream_options_include_usage() {
     let req = super::NativeChatRequest {
         model: "sonnet".to_string(),
         messages: Vec::new(),
-        temperature: 0.0,
+        temperature: Some(0.0),
         stream: Some(true),
         tools: None,
         tool_choice: None,
@@ -119,7 +119,7 @@ fn non_streaming_request_omits_stream_options() {
     let req = super::NativeChatRequest {
         model: "sonnet".to_string(),
         messages: Vec::new(),
-        temperature: 0.0,
+        temperature: Some(0.0),
         stream: Some(false),
         tools: None,
         tool_choice: None,
@@ -171,7 +171,7 @@ fn request_serializes_correctly() {
                 content: "hello".to_string(),
             },
         ],
-        temperature: 0.4,
+        temperature: Some(0.4),
         stream: Some(false),
         tools: None,
         tool_choice: None,
@@ -771,7 +771,7 @@ fn request_serializes_with_tools() {
             role: "user".to_string(),
             content: "What is the weather?".to_string(),
         }],
-        temperature: 0.7,
+        temperature: Some(0.7),
         stream: Some(false),
         tools: Some(tools),
         tool_choice: Some("auto".to_string()),
diff --git a/src/openhuman/inference/provider/compatible_types.rs b/src/openhuman/inference/provider/compatible_types.rs
index b0bb07f38d..b956b1b83d 100644
--- a/src/openhuman/inference/provider/compatible_types.rs
+++ b/src/openhuman/inference/provider/compatible_types.rs
@@ -12,7 +12,8 @@ use serde::{Deserialize, Serialize};
 pub(crate) struct ApiChatRequest {
     pub(crate) model: String,
     pub(crate) messages: Vec<Message>,
-    pub(crate) temperature: f64,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub(crate) temperature: Option<f64>,
     #[serde(skip_serializing_if = "Option::is_none")]
     pub(crate) stream: Option<bool>,
     #[serde(skip_serializing_if = "Option::is_none")]
@@ -31,7 +32,8 @@ pub(crate) struct Message {
 pub(crate) struct NativeChatRequest {
     pub(crate) model: String,
     pub(crate) messages: Vec<NativeMessage>,
-    pub(crate) temperature: f64,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub(crate) temperature: Option<f64>,
     #[serde(skip_serializing_if = "Option::is_none")]
     pub(crate) stream: Option<bool>,
     #[serde(skip_serializing_if = "Option::is_none")]
diff --git a/src/openhuman/inference/provider/factory.rs b/src/openhuman/inference/provider/factory.rs
index 3e6af1544d..08522bda03 100644
--- a/src/openhuman/inference/provider/factory.rs
+++ b/src/openhuman/inference/provider/factory.rs
@@ -213,7 +213,12 @@ fn make_ollama_provider(
         model,
         redact_endpoint(&endpoint)
     );
-    let p = make_openai_compatible_provider(&endpoint, "", CompatAuthStyle::None)?;
+    let p = make_openai_compatible_provider_with_config(
+        &endpoint,
+        "",
+        CompatAuthStyle::None,
+        &config.temperature_unsupported_models,
+    )?;
     Ok((p, model.to_string()))
 }
 
@@ -260,10 +265,15 @@ fn make_cloud_provider_by_slug(
 
     let key = lookup_key_for_slug(slug, config)?;
 
+    let unsupported = &config.temperature_unsupported_models;
     match entry.auth_style {
         AuthStyle::Anthropic => {
-            let p =
-                make_openai_compatible_provider(&entry.endpoint, &key, CompatAuthStyle::Anthropic)?;
+            let p = make_openai_compatible_provider_with_config(
+                &entry.endpoint,
+                &key,
+                CompatAuthStyle::Anthropic,
+                unsupported,
+            )?;
             Ok((p, effective_model))
         }
         AuthStyle::OpenhumanJwt => {
@@ -276,12 +286,21 @@ fn make_cloud_provider_by_slug(
             make_openhuman_backend(config)
         }
         AuthStyle::None => {
-            let p = make_openai_compatible_provider(&entry.endpoint, "", CompatAuthStyle::None)?;
+            let p = make_openai_compatible_provider_with_config(
+                &entry.endpoint,
+                "",
+                CompatAuthStyle::None,
+                unsupported,
+            )?;
             Ok((p, effective_model))
         }
         AuthStyle::Bearer => {
-            let p =
-                make_openai_compatible_provider(&entry.endpoint, &key, CompatAuthStyle::Bearer)?;
+            let p = make_openai_compatible_provider_with_config(
+                &entry.endpoint,
+                &key,
+                CompatAuthStyle::Bearer,
+                unsupported,
+            )?;
             Ok((p, effective_model))
         }
     }
@@ -331,15 +350,27 @@ fn make_openai_compatible_provider(
     endpoint: &str,
     api_key: &str,
     auth_style: CompatAuthStyle,
+) -> anyhow::Result<Box<dyn Provider>> {
+    make_openai_compatible_provider_with_config(endpoint, api_key, auth_style, &[])
+}
+
+/// Build an `OpenAiCompatibleProvider` with auth style and temperature
+/// suppression list from config.
+fn make_openai_compatible_provider_with_config(
+    endpoint: &str,
+    api_key: &str,
+    auth_style: CompatAuthStyle,
+    temperature_unsupported_models: &[String],
 ) -> anyhow::Result<Box<dyn Provider>> {
     let key = if api_key.trim().is_empty() {
         None
     } else {
         Some(api_key)
     };
-    Ok(Box::new(OpenAiCompatibleProvider::new(
-        "cloud", endpoint, key, auth_style,
-    )))
+    Ok(Box::new(
+        OpenAiCompatibleProvider::new("cloud", endpoint, key, auth_style)
+            .with_temperature_unsupported_models(temperature_unsupported_models.to_vec()),
+    ))
 }
 
 /// Return a safe-to-log representation of a URL endpoint: `scheme://host` only.
diff --git a/src/openhuman/inference/provider/mod.rs b/src/openhuman/inference/provider/mod.rs
index 79573c99e8..e98b51659b 100644
--- a/src/openhuman/inference/provider/mod.rs
+++ b/src/openhuman/inference/provider/mod.rs
@@ -16,6 +16,7 @@ pub mod ops;
 pub mod reliable;
 pub mod router;
 pub mod schemas;
+pub mod temperature;
 pub mod thread_context;
 pub mod traits;
 
diff --git a/src/openhuman/inference/provider/temperature.rs b/src/openhuman/inference/provider/temperature.rs
new file mode 100644
index 0000000000..40b088fda1
--- /dev/null
+++ b/src/openhuman/inference/provider/temperature.rs
@@ -0,0 +1,200 @@
+//! Per-model temperature suppression helpers.
+//!
+//! Some models (OpenAI o-series, GPT-5 reasoning variants) reject the
+//! `temperature` field in the request body and return an error when it is
+//! present. `temperature_for_model` consults the config's
+//! `temperature_unsupported_models` list (which accepts shell-style `*`
+//! globs) and returns `None` when the model matches, causing the
+//! serialisation layer to omit the field via `skip_serializing_if`.
+
+use crate::openhuman::config::Config;
+
+/// Returns the effective temperature for `model`, or `None` if the model
+/// is listed in `config.temperature_unsupported_models`.
+///
+/// The list entries support shell-style `*` wildcard matching (no `?` or
+/// `[]`). Matching is case-sensitive and done against the full model ID.
+///
+/// # Examples
+///
+/// ```
+/// // model "o1-preview" matches pattern "o1*" → None
+/// // model "gpt-4o-mini" matches no pattern   → Some(0.7)
+/// ```
+pub fn temperature_for_model(model: &str, default: f64, config: &Config) -> Option<f64> {
+    if config
+        .temperature_unsupported_models
+        .iter()
+        .any(|pat| glob_match(pat, model))
+    {
+        tracing::debug!(
+            "[inference][temperature] model='{}' matched unsupported-temperature list — omitting temperature field",
+            model
+        );
+        None
+    } else {
+        Some(default)
+    }
+}
+
+/// Minimal shell-style glob matcher supporting only `*` (match any sequence
+/// of characters, including empty). Does not support `?` or `[...]`.
+///
+/// This avoids pulling in the `glob` crate for what is effectively a
+/// starts-with / ends-with / contains check.
+pub fn glob_match(pattern: &str, text: &str) -> bool {
+    // Split on `*` and consume the text segment by segment.
+    let parts: Vec<&str> = pattern.split('*').collect();
+
+    if parts.is_empty() {
+        // Pattern is purely `*` — matches everything.
+        return true;
+    }
+
+    let mut remaining = text;
+
+    for (i, part) in parts.iter().enumerate() {
+        if part.is_empty() {
+            // Consecutive stars or leading/trailing star — skip.
+            continue;
+        }
+
+        if i == 0 {
+            // First segment: must match the start of `text`.
+            if !remaining.starts_with(part) {
+                return false;
+            }
+            remaining = &remaining[part.len()..];
+        } else {
+            // Middle or last segment: find first occurrence in `remaining`.
+            match remaining.find(part) {
+                Some(pos) => {
+                    remaining = &remaining[pos + part.len()..];
+                }
+                None => return false,
+            }
+        }
+    }
+
+    // If the pattern did NOT end with `*`, the remaining text must be empty.
+    if !pattern.ends_with('*') && !remaining.is_empty() {
+        return false;
+    }
+
+    true
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::openhuman::config::Config;
+
+    // ── glob_match unit tests ─────────────────────────────────────────────────
+
+    #[test]
+    fn glob_exact_match() {
+        assert!(glob_match("o1-preview", "o1-preview"));
+    }
+
+    #[test]
+    fn glob_prefix_star() {
+        assert!(glob_match("o1*", "o1-preview"));
+        assert!(glob_match("o1*", "o1-mini"));
+        assert!(glob_match("o1*", "o1"));
+        assert!(!glob_match("o1*", "gpt-4o"));
+    }
+
+    #[test]
+    fn glob_suffix_star() {
+        assert!(glob_match("*mini", "gpt-4o-mini"));
+        assert!(!glob_match("*mini", "gpt-4o-large"));
+    }
+
+    #[test]
+    fn glob_contains_star() {
+        assert!(glob_match("gpt*mini", "gpt-4o-mini"));
+        assert!(!glob_match("gpt*mini", "gpt-4o-large"));
+    }
+
+    #[test]
+    fn glob_pure_star() {
+        assert!(glob_match("*", "anything"));
+        assert!(glob_match("*", ""));
+    }
+
+    #[test]
+    fn glob_no_star_mismatch() {
+        assert!(!glob_match("o1", "o1-preview"));
+        assert!(glob_match("o1", "o1"));
+    }
+
+    #[test]
+    fn glob_gpt5_pattern() {
+        assert!(glob_match("gpt-5*", "gpt-5"));
+        assert!(glob_match("gpt-5*", "gpt-5-turbo"));
+        assert!(!glob_match("gpt-5*", "gpt-4o"));
+    }
+
+    // ── temperature_for_model tests ───────────────────────────────────────────
+
+    fn config_with_unsupported(patterns: Vec<String>) -> Config {
+        let mut config = Config::default();
+        config.temperature_unsupported_models = patterns;
+        config
+    }
+
+    #[test]
+    fn temperature_returned_for_normal_model() {
+        let config = Config::default(); // has ["o1*","o3*","o4*","gpt-5*"] by default
+        assert_eq!(
+            temperature_for_model("gpt-4o-mini", 0.7, &config),
+            Some(0.7)
+        );
+        assert_eq!(
+            temperature_for_model("claude-3-opus", 0.5, &config),
+            Some(0.5)
+        );
+    }
+
+    #[test]
+    fn temperature_suppressed_for_o1_model() {
+        let config = Config::default();
+        assert_eq!(temperature_for_model("o1-preview", 0.7, &config), None);
+        assert_eq!(temperature_for_model("o1-mini", 0.7, &config), None);
+        assert_eq!(temperature_for_model("o1", 0.7, &config), None);
+    }
+
+    #[test]
+    fn temperature_suppressed_for_o3_o4() {
+        let config = Config::default();
+        assert_eq!(temperature_for_model("o3", 0.7, &config), None);
+        assert_eq!(temperature_for_model("o3-mini", 0.7, &config), None);
+        assert_eq!(temperature_for_model("o4-mini", 0.7, &config), None);
+    }
+
+    #[test]
+    fn temperature_suppressed_for_gpt5() {
+        let config = Config::default();
+        assert_eq!(temperature_for_model("gpt-5", 0.7, &config), None);
+        assert_eq!(temperature_for_model("gpt-5-turbo", 0.7, &config), None);
+    }
+
+    #[test]
+    fn temperature_uses_custom_unsupported_list() {
+        let config = config_with_unsupported(vec!["custom-*".to_string()]);
+        assert_eq!(temperature_for_model("custom-model", 0.7, &config), None);
+        assert_eq!(
+            temperature_for_model("gpt-4o-mini", 0.7, &config),
+            Some(0.7)
+        );
+        // Default patterns no longer apply when list is replaced.
+        assert_eq!(temperature_for_model("o1-preview", 0.7, &config), Some(0.7));
+    }
+
+    #[test]
+    fn temperature_empty_list_always_returns_some() {
+        let config = config_with_unsupported(vec![]);
+        assert_eq!(temperature_for_model("o1-preview", 0.7, &config), Some(0.7));
+        assert_eq!(temperature_for_model("gpt-5", 0.3, &config), Some(0.3));
+    }
+}
diff --git a/tests/inference_provider_e2e.rs b/tests/inference_provider_e2e.rs
new file mode 100644
index 0000000000..7aadc3a0ae
--- /dev/null
+++ b/tests/inference_provider_e2e.rs
@@ -0,0 +1,569 @@
+//! Inference provider end-to-end tests using wiremock.
+//!
+//! These tests spin up a wiremock HTTP server on a random port and verify
+//! that `OpenAiCompatibleProvider` sends correct request bodies and correctly
+//! interprets responses for the major provider shapes (OpenAI-compat,
+//! Anthropic auth, streaming, temperature suppression, Ollama endpoint).
+//!
+//! The `/v1/chat/completions` and `/v1/models` HTTP endpoint tests verify the
+//! full axum router layer (auth middleware + provider routing) end-to-end.
+//!
+//! No live LLM API calls are made.
+
+use std::sync::{Mutex, OnceLock};
+
+use axum::body::Body;
+use axum::http::{header, Method, Request, StatusCode};
+use serde_json::{json, Value};
+use tempfile::tempdir;
+use tower::ServiceExt;
+use wiremock::matchers::{header as wm_header, method, path};
+use wiremock::{Mock, MockServer, ResponseTemplate};
+
+use openhuman_core::core::auth::{init_rpc_token, CORE_TOKEN_ENV_VAR};
+use openhuman_core::core::jsonrpc::build_core_http_router;
+use openhuman_core::openhuman::inference::provider::compatible::{
+    AuthStyle, OpenAiCompatibleProvider,
+};
+use openhuman_core::openhuman::inference::provider::traits::{ChatMessage, Provider};
+
+// ── Environment serialisation lock ───────────────────────────────────────────
+//
+// Tests that mutate OPENHUMAN_WORKSPACE or OPENHUMAN_CORE_TOKEN must acquire
+// this lock first to prevent races when cargo runs tests in parallel threads
+// within the same process.
+
+static ENV_LOCK: OnceLock<Mutex<()>> = OnceLock::new();
+static RPC_AUTH_INIT: OnceLock<()> = OnceLock::new();
+
+fn env_lock() -> std::sync::MutexGuard<'static, ()> {
+    let m = ENV_LOCK.get_or_init(|| Mutex::new(()));
+    match m.lock() {
+        Ok(g) => g,
+        Err(p) => p.into_inner(),
+    }
+}
+
+const TEST_RPC_TOKEN: &str = "inference-provider-e2e-token";
+
+fn ensure_rpc_auth() {
+    RPC_AUTH_INIT.get_or_init(|| {
+        // SAFETY: test-only, serialised by OnceLock.
+        unsafe { std::env::set_var(CORE_TOKEN_ENV_VAR, TEST_RPC_TOKEN) };
+        let tmp = tempdir().expect("tempdir");
+        init_rpc_token(tmp.path()).expect("init rpc auth token");
+        // Keep tmp alive for the process duration by leaking it — the token
+        // file must remain readable for all subsequent auth checks.
+        std::mem::forget(tmp);
+    });
+}
+
+// ── Canned OpenAI-compatible response body ────────────────────────────────────
+
+fn openai_chat_response(content: &str) -> Value {
+    json!({
+        "id": "chatcmpl-test",
+        "object": "chat.completion",
+        "created": 1_700_000_000_u64,
+        "model": "gpt-4o-mini",
+        "choices": [{
+            "index": 0,
+            "message": { "role": "assistant", "content": content },
+            "finish_reason": "stop"
+        }],
+        "usage": { "prompt_tokens": 5, "completion_tokens": 10, "total_tokens": 15 }
+    })
+}
+
+// ── Helper: build an env-isolated Config pointing at tempdir ─────────────────
+
+/// Sets OPENHUMAN_WORKSPACE to `dir` and returns an `EnvVarGuard` that
+/// restores the previous value on drop.  Must be called under `env_lock()`.
+struct EnvGuard {
+    key: &'static str,
+    prev: Option<String>,
+}
+
+impl EnvGuard {
+    fn set(key: &'static str, val: &str) -> Self {
+        let prev = std::env::var(key).ok();
+        // SAFETY: caller holds env_lock().
+        unsafe { std::env::set_var(key, val) };
+        Self { key, prev }
+    }
+}
+
+impl Drop for EnvGuard {
+    fn drop(&mut self) {
+        match &self.prev {
+            // SAFETY: caller's env_lock guard is still alive during drop.
+            Some(v) => unsafe { std::env::set_var(self.key, v) },
+            None => unsafe { std::env::remove_var(self.key) },
+        }
+    }
+}
+
+// ── Test 1: OpenAI-compat chat returns canned text ───────────────────────────
+
+#[tokio::test]
+async fn openai_compat_chat_returns_canned_text() {
+    let server = MockServer::start().await;
+
+    Mock::given(method("POST"))
+        .and(path("/v1/chat/completions"))
+        .respond_with(ResponseTemplate::new(200).set_body_json(openai_chat_response("Hello!")))
+        .mount(&server)
+        .await;
+
+    let provider = OpenAiCompatibleProvider::new(
+        "test",
+        &format!("{}/v1", server.uri()),
+        Some("test-key"),
+        AuthStyle::Bearer,
+    );
+
+    let messages = vec![ChatMessage::user("hi")];
+    let result = provider
+        .chat_with_history(&messages, "gpt-4o-mini", 0.7)
+        .await
+        .expect("chat_with_history should succeed");
+
+    assert_eq!(result, "Hello!");
+}
+
+// ── Test 2: Temperature present for normal model ──────────────────────────────
+
+#[tokio::test]
+async fn openai_compat_temperature_present_for_normal_model() {
+    let server = MockServer::start().await;
+
+    Mock::given(method("POST"))
+        .and(path("/v1/chat/completions"))
+        .respond_with(ResponseTemplate::new(200).set_body_json(openai_chat_response("ok")))
+        .mount(&server)
+        .await;
+
+    let provider = OpenAiCompatibleProvider::new(
+        "test",
+        &format!("{}/v1", server.uri()),
+        Some("key"),
+        AuthStyle::Bearer,
+    );
+
+    provider
+        .chat_with_history(&[ChatMessage::user("hi")], "gpt-4o-mini", 0.7)
+        .await
+        .expect("should succeed");
+
+    let requests = server.received_requests().await.unwrap();
+    assert_eq!(requests.len(), 1);
+    let body: Value = serde_json::from_slice(&requests[0].body).unwrap();
+    assert!(
+        body.get("temperature").is_some(),
+        "temperature should be present for gpt-4o-mini; body={body}"
+    );
+    assert_eq!(body["temperature"].as_f64().unwrap(), 0.7);
+}
+
+// ── Test 3: Temperature omitted for o1 models ────────────────────────────────
+
+#[tokio::test]
+async fn openai_compat_omits_temperature_for_o1_models() {
+    let server = MockServer::start().await;
+
+    Mock::given(method("POST"))
+        .and(path("/v1/chat/completions"))
+        .respond_with(ResponseTemplate::new(200).set_body_json(openai_chat_response("done")))
+        .mount(&server)
+        .await;
+
+    let provider = OpenAiCompatibleProvider::new(
+        "test",
+        &format!("{}/v1", server.uri()),
+        Some("key"),
+        AuthStyle::Bearer,
+    )
+    .with_temperature_unsupported_models(vec!["o1*".to_string()]);
+
+    provider
+        .chat_with_history(&[ChatMessage::user("reason")], "o1-preview", 0.7)
+        .await
+        .expect("should succeed");
+
+    let requests = server.received_requests().await.unwrap();
+    assert_eq!(requests.len(), 1);
+    let body: Value = serde_json::from_slice(&requests[0].body).unwrap();
+    assert!(
+        body.get("temperature").is_none(),
+        "temperature must be absent for o1-preview; body={body}"
+    );
+    // Response should still be returned correctly.
+}
+
+// ── Test 4: Temperature omitted for gpt-5 models ─────────────────────────────
+
+#[tokio::test]
+async fn openai_compat_omits_temperature_for_gpt5_models() {
+    let server = MockServer::start().await;
+
+    Mock::given(method("POST"))
+        .and(path("/v1/chat/completions"))
+        .respond_with(ResponseTemplate::new(200).set_body_json(openai_chat_response("done")))
+        .mount(&server)
+        .await;
+
+    let provider = OpenAiCompatibleProvider::new(
+        "test",
+        &format!("{}/v1", server.uri()),
+        Some("key"),
+        AuthStyle::Bearer,
+    )
+    .with_temperature_unsupported_models(vec![
+        "o1*".to_string(),
+        "o3*".to_string(),
+        "o4*".to_string(),
+        "gpt-5*".to_string(),
+    ]);
+
+    for model in &["gpt-5", "gpt-5-turbo", "o3-mini", "o4-preview"] {
+        server.reset().await;
+        Mock::given(method("POST"))
+            .and(path("/v1/chat/completions"))
+            .respond_with(ResponseTemplate::new(200).set_body_json(openai_chat_response("done")))
+            .mount(&server)
+            .await;
+
+        provider
+            .chat_with_history(&[ChatMessage::user("test")], model, 0.7)
+            .await
+            .expect("should succeed");
+
+        let requests = server.received_requests().await.unwrap();
+        assert_eq!(requests.len(), 1, "model={model}");
+        let body: Value = serde_json::from_slice(&requests[0].body).unwrap();
+        assert!(
+            body.get("temperature").is_none(),
+            "temperature must be absent for model={model}; body={body}"
+        );
+    }
+}
+
+// ── Test 5: Anthropic auth style ─────────────────────────────────────────────
+
+#[tokio::test]
+async fn openai_compat_anthropic_auth_uses_x_api_key_header() {
+    let server = MockServer::start().await;
+
+    Mock::given(method("POST"))
+        .and(path("/v1/chat/completions"))
+        .and(wm_header("x-api-key", "sk-ant-test"))
+        .and(wm_header("anthropic-version", "2023-06-01"))
+        .respond_with(ResponseTemplate::new(200).set_body_json(openai_chat_response("hi")))
+        .mount(&server)
+        .await;
+
+    let provider = OpenAiCompatibleProvider::new(
+        "anthropic",
+        &format!("{}/v1", server.uri()),
+        Some("sk-ant-test"),
+        AuthStyle::Anthropic,
+    );
+
+    let result = provider
+        .chat_with_history(&[ChatMessage::user("hello")], "claude-3-haiku", 0.5)
+        .await
+        .expect("Anthropic auth chat should succeed");
+
+    assert_eq!(result, "hi");
+
+    // Verify Bearer header was NOT sent.
+    let requests = server.received_requests().await.unwrap();
+    assert_eq!(requests.len(), 1);
+    let auth = requests[0].headers.get("authorization");
+    assert!(
+        auth.is_none(),
+        "Authorization header must NOT be set for Anthropic auth; found {:?}",
+        auth
+    );
+}
+
+// ── Test 6: Streaming response returns ordered deltas ────────────────────────
+
+#[tokio::test]
+async fn openai_compat_streaming_returns_ordered_deltas() {
+    let server = MockServer::start().await;
+
+    let sse_body = concat!(
+        "data: {\"id\":\"x\",\"choices\":[{\"delta\":{\"role\":\"assistant\",\"content\":\"Hel\"},\"finish_reason\":null}]}\n\n",
+        "data: {\"id\":\"x\",\"choices\":[{\"delta\":{\"content\":\"lo\"},\"finish_reason\":null}]}\n\n",
+        "data: {\"id\":\"x\",\"choices\":[{\"delta\":{\"content\":\"!\"},\"finish_reason\":\"stop\"}]}\n\n",
+        "data: [DONE]\n\n",
+    );
+
+    Mock::given(method("POST"))
+        .and(path("/v1/chat/completions"))
+        .respond_with(
+            ResponseTemplate::new(200)
+                .insert_header("content-type", "text/event-stream")
+                .set_body_string(sse_body),
+        )
+        .mount(&server)
+        .await;
+
+    let provider = OpenAiCompatibleProvider::new(
+        "test",
+        &format!("{}/v1", server.uri()),
+        Some("key"),
+        AuthStyle::Bearer,
+    );
+
+    // stream_chat_with_system is the implemented streaming method on this provider.
+    let options = openhuman_core::openhuman::inference::provider::traits::StreamOptions::new(true);
+    use futures_util::StreamExt;
+    let mut stream = provider.stream_chat_with_system(
+        Some("You are helpful."),
+        "Say Hello!",
+        "gpt-4o-mini",
+        0.7,
+        options,
+    );
+
+    let mut deltas = Vec::new();
+    while let Some(result) = stream.next().await {
+        let chunk = result.expect("stream chunk should be Ok");
+        if !chunk.delta.is_empty() {
+            deltas.push(chunk.delta);
+        }
+    }
+
+    let combined = deltas.join("");
+    assert_eq!(
+        combined, "Hello!",
+        "combined stream deltas should equal 'Hello!'; got '{combined}'"
+    );
+}
+
+// ── Test 7: Ollama endpoint shape ────────────────────────────────────────────
+
+#[tokio::test]
+async fn ollama_compat_chat_via_openai_v1_endpoint() {
+    let server = MockServer::start().await;
+
+    // Ollama via OpenAI-compat /v1 endpoint — wiremock pretends to be Ollama.
+    Mock::given(method("POST"))
+        .and(path("/v1/chat/completions"))
+        .respond_with(ResponseTemplate::new(200).set_body_json(openai_chat_response("Bonjour!")))
+        .mount(&server)
+        .await;
+
+    // Factory builds Ollama provider via OpenAiCompatibleProvider at /v1.
+    let base = server.uri();
+    let endpoint = format!("{}/v1", base.trim_end_matches('/'));
+    let provider = OpenAiCompatibleProvider::new("ollama", &endpoint, None, AuthStyle::None);
+
+    let result = provider
+        .chat_with_history(&[ChatMessage::user("Bonjour?")], "llama3", 0.7)
+        .await
+        .expect("Ollama compat chat should succeed");
+
+    assert_eq!(result, "Bonjour!");
+}
+
+// ── Test 8: /v1/chat/completions HTTP endpoint — unauthorized ─────────────────
+
+#[tokio::test]
+async fn http_endpoint_chat_completions_no_bearer_returns_401() {
+    let _lock = env_lock();
+    ensure_rpc_auth();
+
+    let body = json!({
+        "model": "ollama:llama3",
+        "messages": [{ "role": "user", "content": "hello" }]
+    });
+    let req = Request::builder()
+        .method(Method::POST)
+        .uri("/v1/chat/completions")
+        .header(header::CONTENT_TYPE, "application/json")
+        .body(Body::from(serde_json::to_string(&body).unwrap()))
+        .unwrap();
+
+    let resp = build_core_http_router(false).oneshot(req).await.unwrap();
+    assert_eq!(resp.status(), StatusCode::UNAUTHORIZED);
+}
+
+// ── Test 9: /v1/models — unauthorized ────────────────────────────────────────
+
+#[tokio::test]
+async fn http_endpoint_models_no_bearer_returns_401() {
+    let _lock = env_lock();
+    ensure_rpc_auth();
+
+    let req = Request::builder()
+        .method(Method::GET)
+        .uri("/v1/models")
+        .body(Body::empty())
+        .unwrap();
+
+    let resp = build_core_http_router(false).oneshot(req).await.unwrap();
+    assert_eq!(resp.status(), StatusCode::UNAUTHORIZED);
+}
+
+// ── Test 10: /v1/models with bearer returns non-empty list ───────────────────
+
+#[tokio::test]
+async fn http_endpoint_models_with_bearer_returns_model_list() {
+    let _lock = env_lock();
+    ensure_rpc_auth();
+
+    let tmp = tempdir().expect("tempdir");
+    let _workspace_guard = EnvGuard::set("OPENHUMAN_WORKSPACE", tmp.path().to_str().unwrap());
+
+    let req = Request::builder()
+        .method(Method::GET)
+        .uri("/v1/models")
+        .header(header::AUTHORIZATION, format!("Bearer {TEST_RPC_TOKEN}"))
+        .body(Body::empty())
+        .unwrap();
+
+    let resp = build_core_http_router(false).oneshot(req).await.unwrap();
+    assert_ne!(
+        resp.status(),
+        StatusCode::UNAUTHORIZED,
+        "401 must not fire when bearer is present"
+    );
+    assert_ne!(
+        resp.status(),
+        StatusCode::FORBIDDEN,
+        "403 must not fire when bearer is present"
+    );
+
+    if resp.status().is_success() {
+        let body = axum::body::to_bytes(resp.into_body(), usize::MAX)
+            .await
+            .unwrap();
+        let json: Value = serde_json::from_slice(&body).unwrap();
+        let models = json.get("data").and_then(Value::as_array);
+        if let Some(list) = models {
+            assert!(
+                !list.is_empty(),
+                "/v1/models should return at least one model"
+            );
+        }
+    }
+}
+
+// ── Test 11: /v1/chat/completions with bearer passes auth ────────────────────
+
+#[tokio::test]
+async fn http_endpoint_chat_completions_with_bearer_passes_auth() {
+    let _lock = env_lock();
+    ensure_rpc_auth();
+
+    let body = json!({
+        "model": "ollama:llama3",
+        "messages": [{ "role": "user", "content": "ping" }],
+        "stream": false
+    });
+    let req = Request::builder()
+        .method(Method::POST)
+        .uri("/v1/chat/completions")
+        .header(header::CONTENT_TYPE, "application/json")
+        .header(header::AUTHORIZATION, format!("Bearer {TEST_RPC_TOKEN}"))
+        .body(Body::from(serde_json::to_string(&body).unwrap()))
+        .unwrap();
+
+    let resp = build_core_http_router(false).oneshot(req).await.unwrap();
+    assert_ne!(
+        resp.status(),
+        StatusCode::UNAUTHORIZED,
+        "401 must not fire when bearer is present"
+    );
+    assert_ne!(
+        resp.status(),
+        StatusCode::FORBIDDEN,
+        "403 must not fire when bearer is present"
+    );
+}
+
+// ── Test 12: Request model field is preserved ─────────────────────────────────
+
+#[tokio::test]
+async fn openai_compat_request_body_contains_correct_model() {
+    let server = MockServer::start().await;
+
+    Mock::given(method("POST"))
+        .and(path("/v1/chat/completions"))
+        .respond_with(ResponseTemplate::new(200).set_body_json(openai_chat_response("ok")))
+        .mount(&server)
+        .await;
+
+    let provider = OpenAiCompatibleProvider::new(
+        "test",
+        &format!("{}/v1", server.uri()),
+        Some("key"),
+        AuthStyle::Bearer,
+    );
+
+    provider
+        .chat_with_history(&[ChatMessage::user("hi")], "claude-3-sonnet", 0.5)
+        .await
+        .expect("should succeed");
+
+    let requests = server.received_requests().await.unwrap();
+    let body: Value = serde_json::from_slice(&requests[0].body).unwrap();
+    assert_eq!(body["model"].as_str().unwrap(), "claude-3-sonnet");
+}
+
+// ── Test 13: Bearer token is sent in Authorization header ────────────────────
+
+#[tokio::test]
+async fn openai_compat_bearer_auth_sends_authorization_header() {
+    let server = MockServer::start().await;
+
+    Mock::given(method("POST"))
+        .and(path("/v1/chat/completions"))
+        .and(wm_header("authorization", "Bearer secret-key"))
+        .respond_with(ResponseTemplate::new(200).set_body_json(openai_chat_response("ok")))
+        .mount(&server)
+        .await;
+
+    let provider = OpenAiCompatibleProvider::new(
+        "test",
+        &format!("{}/v1", server.uri()),
+        Some("secret-key"),
+        AuthStyle::Bearer,
+    );
+
+    let result = provider
+        .chat_with_history(&[ChatMessage::user("hi")], "gpt-4o", 0.7)
+        .await
+        .expect("should succeed");
+
+    assert_eq!(result, "ok");
+}
+
+// ── Test 14: temperature_for_model helper ────────────────────────────────────
+
+#[test]
+fn temperature_helper_suppresses_o1_by_default_config() {
+    use openhuman_core::openhuman::config::Config;
+    use openhuman_core::openhuman::inference::provider::temperature::temperature_for_model;
+
+    let config = Config::default();
+
+    // Normal model → temperature returned
+    assert_eq!(
+        temperature_for_model("gpt-4o-mini", 0.7, &config),
+        Some(0.7)
+    );
+    assert_eq!(
+        temperature_for_model("claude-3-sonnet", 0.5, &config),
+        Some(0.5)
+    );
+
+    // o1/o3/o4/gpt-5 → temperature suppressed
+    assert_eq!(temperature_for_model("o1-preview", 0.7, &config), None);
+    assert_eq!(temperature_for_model("o3-mini", 0.7, &config), None);
+    assert_eq!(temperature_for_model("o4-turbo", 0.7, &config), None);
+    assert_eq!(temperature_for_model("gpt-5-turbo", 0.7, &config), None);
+}

From acaf8b5ba49a030e57e615484a19b74d4adeedd1 Mon Sep 17 00:00:00 2001
From: Steven Enamakel <enamakel@tinyhumans.ai>
Date: Sat, 16 May 2026 20:55:58 -0700
Subject: [PATCH 15/18] chore: apply prettier auto-fix on rpcMethods test

---
 app/src/services/__tests__/rpcMethods.test.ts | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/app/src/services/__tests__/rpcMethods.test.ts b/app/src/services/__tests__/rpcMethods.test.ts
index dae81085ff..e7baf67b0f 100644
--- a/app/src/services/__tests__/rpcMethods.test.ts
+++ b/app/src/services/__tests__/rpcMethods.test.ts
@@ -81,9 +81,9 @@ describe('rpcMethods catalog', () => {
         ? 'screen_intelligence'
         : methodRoot.startsWith('inference_')
           ? 'inference'
-        : methodRoot.startsWith('providers_')
-          ? 'providers'
-          : 'config';
+          : methodRoot.startsWith('providers_')
+            ? 'providers'
+            : 'config';
       const fnName = methodRoot.slice(`${namespace}_`.length);
       expect(schemaSources).toContain(`namespace: "${namespace}"`);
       expect(schemaSources).toContain(`function: "${fnName}"`);

From c6850815dccf9a2be89d4326371d1d6a4643c054 Mon Sep 17 00:00:00 2001
From: Steven Enamakel <enamakel@tinyhumans.ai>
Date: Sat, 16 May 2026 21:41:19 -0700
Subject: [PATCH 16/18] fix: address CodeRabbit review on PR #1975

- ollama_admin: correct stale docstring on `ensure_ollama_server_fresh`
  to reflect external-runtime mode (no auto install/start anymore)
- public_infer: redact userinfo/query/fragment from ollama_base_url
  before embedding in error payloads; add unit tests
- inference::provider::ops: route AuthStyle::OpenhumanJwt through the
  Bearer header path instead of falling into the unauthenticated branch
- inference::provider::ops: sanitize upstream provider error bodies via
  sanitize_api_error before returning them in RPC errors
- tests/json_rpc_e2e: add coverage for openhuman.inference_list_models
  and openhuman.inference_update_model_settings over the RPC transport
---
 .../inference/local/service/ollama_admin.rs   |  5 +-
 .../inference/local/service/public_infer.rs   | 44 ++++++++++++++++-
 src/openhuman/inference/provider/ops.rs       | 12 ++++-
 tests/json_rpc_e2e.rs                         | 48 +++++++++++++++++++
 4 files changed, 103 insertions(+), 6 deletions(-)

diff --git a/src/openhuman/inference/local/service/ollama_admin.rs b/src/openhuman/inference/local/service/ollama_admin.rs
index 5114ec7f39..3741b5a390 100644
--- a/src/openhuman/inference/local/service/ollama_admin.rs
+++ b/src/openhuman/inference/local/service/ollama_admin.rs
@@ -45,8 +45,9 @@ impl LocalAiService {
         ))
     }
 
-    /// Like `ensure_ollama_server`, but forces a fresh install of the Ollama binary
-    /// (ignoring cached/workspace binaries). Used as a retry after the first attempt fails.
+    /// Alias of `ensure_ollama_server` in external-runtime mode.
+    /// OpenHuman no longer installs or starts Ollama automatically; the
+    /// "fresh" retry path is a no-op that defers to the standard check.
     pub(in crate::openhuman::inference::local::service) async fn ensure_ollama_server_fresh(
         &self,
         config: &Config,
diff --git a/src/openhuman/inference/local/service/public_infer.rs b/src/openhuman/inference/local/service/public_infer.rs
index ad3aee8773..2eb40d0f12 100644
--- a/src/openhuman/inference/local/service/public_infer.rs
+++ b/src/openhuman/inference/local/service/public_infer.rs
@@ -8,14 +8,54 @@ use crate::openhuman::inference::parse::sanitize_inline_completion;
 
 use super::LocalAiService;
 
+fn redact_ollama_base_url(raw: &str) -> String {
+    // Strip userinfo, query, and fragment so error payloads + logs don't
+    // leak `user:pass@host` style credentials embedded in the endpoint.
+    reqwest::Url::parse(raw)
+        .map(|mut url| {
+            let _ = url.set_username("");
+            let _ = url.set_password(None);
+            url.set_query(None);
+            url.set_fragment(None);
+            url.to_string()
+        })
+        .unwrap_or_else(|_| "<invalid-endpoint>".to_string())
+}
+
 fn external_ollama_request_error(prefix: &str, error: &reqwest::Error) -> String {
-    let base_url = ollama_base_url();
+    let safe_base_url = redact_ollama_base_url(&ollama_base_url());
     format!(
         "{prefix}: OpenHuman routes inference through an external Ollama endpoint. \
-         Make sure Ollama is already running and reachable at {base_url} ({error})"
+         Make sure Ollama is already running and reachable at {safe_base_url} ({error})"
     )
 }
 
+#[cfg(test)]
+mod redact_tests {
+    use super::redact_ollama_base_url;
+
+    #[test]
+    fn redact_strips_userinfo_query_and_fragment() {
+        assert_eq!(
+            redact_ollama_base_url("http://user:pass@host:11434/api?token=abc#frag"),
+            "http://host:11434/api"
+        );
+    }
+
+    #[test]
+    fn redact_keeps_plain_url() {
+        assert_eq!(
+            redact_ollama_base_url("http://127.0.0.1:11434/"),
+            "http://127.0.0.1:11434/"
+        );
+    }
+
+    #[test]
+    fn redact_handles_invalid_url() {
+        assert_eq!(redact_ollama_base_url("not a url"), "<invalid-endpoint>");
+    }
+}
+
 impl LocalAiService {
     pub async fn summarize(
         &self,
diff --git a/src/openhuman/inference/provider/ops.rs b/src/openhuman/inference/provider/ops.rs
index a48339127b..cb3533de8a 100644
--- a/src/openhuman/inference/provider/ops.rs
+++ b/src/openhuman/inference/provider/ops.rs
@@ -83,7 +83,14 @@ pub async fn list_configured_models(
             }
             r
         }
-        AuthStyle::OpenhumanJwt | AuthStyle::None => request,
+        AuthStyle::OpenhumanJwt => {
+            if !api_key.is_empty() {
+                request.header("Authorization", format!("Bearer {}", api_key))
+            } else {
+                request
+            }
+        }
+        AuthStyle::None => request,
     };
 
     let response = request
@@ -94,7 +101,8 @@ pub async fn list_configured_models(
     let status = response.status();
     if !status.is_success() {
         let body = response.text().await.unwrap_or_default();
-        let truncated = crate::openhuman::util::truncate_with_ellipsis(&body, 300);
+        let sanitized = sanitize_api_error(&body);
+        let truncated = crate::openhuman::util::truncate_with_ellipsis(&sanitized, 300);
         return Err(format!(
             "provider returned {}: {}",
             status.as_u16(),
diff --git a/tests/json_rpc_e2e.rs b/tests/json_rpc_e2e.rs
index 13215581cb..d1a82d1058 100644
--- a/tests/json_rpc_e2e.rs
+++ b/tests/json_rpc_e2e.rs
@@ -3740,6 +3740,54 @@ async fn json_rpc_inference_namespace_lm_studio_prompt_and_status() {
         "hello from inference namespace"
     );
 
+    // openhuman.inference_update_model_settings — mutate `default_model`
+    // through the RPC transport so a controller-registration or param-shape
+    // regression surfaces here instead of in the settings-save UI flow.
+    // (We assert on `default_model` because that field is exposed by
+    // `inference_get_client_config`; `default_temperature` is not.)
+    let model_update = post_json_rpc(
+        &rpc_base,
+        366,
+        "openhuman.inference_update_model_settings",
+        json!({ "default_model": "e2e-updated-model" }),
+    )
+    .await;
+    assert_no_jsonrpc_error(&model_update, "inference_update_model_settings");
+    let client_cfg = post_json_rpc(
+        &rpc_base,
+        367,
+        "openhuman.inference_get_client_config",
+        json!({}),
+    )
+    .await;
+    let client_cfg_result = assert_no_jsonrpc_error(&client_cfg, "inference_get_client_config");
+    let updated_model = client_cfg_result
+        .pointer("/result/default_model")
+        .or_else(|| client_cfg_result.get("default_model"))
+        .and_then(Value::as_str);
+    assert_eq!(
+        updated_model,
+        Some("e2e-updated-model"),
+        "inference_get_client_config did not reflect updated default_model: {client_cfg_result}"
+    );
+
+    // openhuman.inference_list_models — no cloud provider configured for this
+    // local-only test, so we expect a structured error rather than a panic.
+    // Asserting an error here proves the controller is registered and reaches
+    // its handler over the RPC transport (the empty-picker symptom CodeRabbit
+    // flagged would surface as a controller-not-found error instead).
+    let list_models = post_json_rpc(
+        &rpc_base,
+        368,
+        "openhuman.inference_list_models",
+        json!({ "provider_id": "does-not-exist" }),
+    )
+    .await;
+    let _ = assert_jsonrpc_error(
+        &list_models,
+        "inference_list_models with unknown provider id",
+    );
+
     lm_join.abort();
     mock_join.abort();
     rpc_join.abort();

From 9b8fcffb4a82b0b7a16b72bcb6c16e24a36bf9be Mon Sep 17 00:00:00 2001
From: Steven Enamakel <enamakel@tinyhumans.ai>
Date: Sun, 17 May 2026 02:05:13 -0700
Subject: [PATCH 17/18] fix(inference): unwrap diagnostics RpcOutcome + fix
 rpcMethods drift-guard

- inference_diagnostics: return value with empty logs (`RpcOutcome::new`)
  instead of `single_log` so callers see the diagnostics object directly,
  matching the legacy `local_ai_diagnostics` shape that the UI and
  json_rpc_e2e tests assert against (`provider`, `lm_studio_running`,
  `expected.chat_found`, etc.).
- rpcMethods drift guard: read schemas from `inference/provider/schemas.rs`
  rather than the deleted `providers/schemas.rs`.
---
 app/src/services/__tests__/rpcMethods.test.ts | 2 +-
 src/openhuman/inference/ops.rs                | 6 +++++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/app/src/services/__tests__/rpcMethods.test.ts b/app/src/services/__tests__/rpcMethods.test.ts
index e7baf67b0f..581cf58554 100644
--- a/app/src/services/__tests__/rpcMethods.test.ts
+++ b/app/src/services/__tests__/rpcMethods.test.ts
@@ -64,7 +64,7 @@ describe('rpcMethods catalog', () => {
         'utf8'
       ),
       fs.readFileSync(
-        path.resolve(__dirname, '../../../../src/openhuman/providers/schemas.rs'),
+        path.resolve(__dirname, '../../../../src/openhuman/inference/provider/schemas.rs'),
         'utf8'
       ),
       fs.readFileSync(
diff --git a/src/openhuman/inference/ops.rs b/src/openhuman/inference/ops.rs
index f64580344b..06459931ce 100644
--- a/src/openhuman/inference/ops.rs
+++ b/src/openhuman/inference/ops.rs
@@ -313,10 +313,14 @@ pub async fn inference_apply_preset(tier: &str) -> Result<RpcOutcome<Value>, Str
 pub async fn inference_diagnostics(config: &Config) -> Result<RpcOutcome<Value>, String> {
     debug!("{LOG_PREFIX} diagnostics:start");
     let service = local_runtime::global(config);
+    // Return the diagnostics payload directly (no `{result, logs}` wrap) so
+    // callers (UI + json_rpc_e2e tests) can read `provider`, `lm_studio_running`,
+    // etc. straight off the response — mirrors the legacy
+    // `local_ai_diagnostics` shape that the test asserts against.
     let result = service
         .diagnostics(config)
         .await
-        .map(|value| RpcOutcome::single_log(value, "inference diagnostics fetched"));
+        .map(|value| RpcOutcome::new(value, Vec::new()));
     match &result {
         Ok(_) => debug!("{LOG_PREFIX} diagnostics:ok"),
         Err(err) => error!(error = %err, "{LOG_PREFIX} diagnostics:error"),

From 0b92c50472163b3f69172bd7e8b8fe575b12a757 Mon Sep 17 00:00:00 2001
From: Steven Enamakel <enamakel@tinyhumans.ai>
Date: Sun, 17 May 2026 02:20:49 -0700
Subject: [PATCH 18/18] fix(inference): refresh provider field on status
 snapshot from current config
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`LocalAiService` is a process-wide `OnceCell` singleton whose cached
`provider` field is set at first init and never refreshed. After an
`inference_update_local_settings` call swaps providers (ollama → lm_studio)
the cached value goes stale, so `inference_status` returns the previous
provider even though the on-disk config has the new one.

CI ran this test as part of the full suite where an earlier ollama-config
test had already initialized the singleton, so the lm_studio assertion
failed; locally in isolation the singleton picked lm_studio first and the
test passed.

Overlay the current config's provider on the returned snapshot so
`local_ai_status` reflects on-disk config without disturbing other
service state (state machine, model ids, etc.).
---
 src/openhuman/inference/local/ops.rs | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/src/openhuman/inference/local/ops.rs b/src/openhuman/inference/local/ops.rs
index 06f1911817..f3b64ecb57 100644
--- a/src/openhuman/inference/local/ops.rs
+++ b/src/openhuman/inference/local/ops.rs
@@ -147,10 +147,16 @@ pub async fn local_ai_status(config: &Config) -> Result<RpcOutcome<LocalAiStatus
             service_clone.bootstrap(&config_clone).await;
         });
     }
-    Ok(RpcOutcome::single_log(
-        service.status(),
-        "local ai status fetched",
-    ))
+    // `LocalAiService` is a process-wide singleton whose cached `provider`
+    // field was set at first init from whichever config it saw. After an
+    // `inference_update_local_settings` call that swaps providers
+    // (e.g. ollama → lm_studio) the cached value is stale, so we overlay
+    // the current config's provider on the status snapshot before returning.
+    let mut snapshot = service.status();
+    snapshot.provider = local_ai::provider::provider_from_config(config)
+        .as_str()
+        .to_string();
+    Ok(RpcOutcome::single_log(snapshot, "local ai status fetched"))
 }
 
 /// Generates a summary of the provided text using local AI models.