Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,9 @@ OPENHUMAN_PROXY_SERVICES=
# [optional] Override selected model tier: low, medium, high
# Applies the corresponding preset at config load time (overrides config.toml).
OPENHUMAN_LOCAL_AI_TIER=
# [optional] Override Ollama's HTTP server base URL (default: http://localhost:11434).
# Useful when Ollama runs in another container, on another host, or on a non-default port.
# OPENHUMAN_OLLAMA_BASE_URL=http://127.0.0.1:11434
# [optional] Override LM Studio's OpenAI-compatible local server base URL.
# Defaults to http://localhost:1234/v1 when local_ai.provider = "lm_studio".
OPENHUMAN_LM_STUDIO_BASE_URL=
Expand Down
63 changes: 62 additions & 1 deletion src/openhuman/inference/local/ops.rs
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,7 @@ pub async fn local_ai_prompt(
service.bootstrap(config).await;
}
let output = service
.prompt(config, prompt.trim(), max_tokens, no_think.unwrap_or(true))
.prompt_interactive(config, prompt.trim(), max_tokens, no_think.unwrap_or(true))
.await
.map_err(|e| e.to_string())?;
Ok(RpcOutcome::single_log(output, "local ai prompt completed"))
Expand Down Expand Up @@ -360,6 +360,67 @@ pub async fn local_ai_download_asset(
))
}

/// A single message in a local AI chat conversation.
#[derive(Debug, serde::Deserialize)]
pub struct LocalAiChatMessage {
/// The role of the message sender (e.g., "user", "assistant").
pub role: String,
/// The text content of the message.
pub content: String,
}

/// Executes a multi-turn chat conversation using the local model.
pub async fn local_ai_chat(
config: &Config,
messages: Vec<LocalAiChatMessage>,
max_tokens: Option<u32>,
) -> Result<RpcOutcome<String>, String> {
tracing::debug!(
message_count = messages.len(),
"[local_ai:chat] local_ai_chat op: validating"
);

if messages.is_empty() {
return Err("messages must not be empty".to_string());
}

let mut ollama_messages: Vec<crate::openhuman::inference::local::ollama::OllamaChatMessage> =
Vec::with_capacity(messages.len());

for msg in messages.into_iter() {
let normalized_role = msg.role.trim().to_ascii_lowercase();
match normalized_role.as_str() {
"user" => {
enforce_user_prompt_or_reject(msg.content.as_str(), "local_ai.ops.local_ai_chat")?;
}
"system" | "assistant" => {}
_ => {
return Err(format!(
"unsupported message role: '{}'; expected one of: user, system, assistant",
msg.role.trim()
));
}
}

ollama_messages.push(
crate::openhuman::inference::local::ollama::OllamaChatMessage {
role: normalized_role,
content: msg.content,
},
);
}

let service = local_ai::global(config);
let reply = service
.chat_with_history_interactive(config, ollama_messages, max_tokens)
.await?;

tracing::debug!(
reply_len = reply.len(),
"[local_ai:chat] local_ai_chat op: done"
);
Ok(RpcOutcome::single_log(reply, "local ai chat completed"))
}
/// Result of the reaction-decision prompt.
#[derive(Debug, serde::Serialize)]
pub struct ReactionDecision {
Expand Down
2 changes: 1 addition & 1 deletion src/openhuman/inference/local/service/ollama_admin.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1302,7 +1302,7 @@ impl LocalAiService {
async fn ollama_runner_ok_at(&self, base_url: &str) -> bool {
let resp = self
.http
.post(format!("{base_url}/api/tags"))
.get(format!("{base_url}/api/tags"))
.timeout(std::time::Duration::from_secs(3))
.send()
.await;
Expand Down
32 changes: 32 additions & 0 deletions src/openhuman/inference/local/service/ollama_admin_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,38 @@ async fn ensure_ollama_server_reports_broken_external_runner_without_restart_att
);
}

#[tokio::test]
async fn ensure_ollama_server_accepts_healthy_external_runner() {
let _guard = crate::openhuman::inference::inference_test_guard();

let app = Router::new()
.route("/api/tags", get(|| async { Json(json!({ "models": [] })) }))
.route(
"/api/show",
axum::routing::post(|| async {
(
axum::http::StatusCode::NOT_FOUND,
Json(json!({ "error": "model '___nonexistent_probe___' not found" })),
)
}),
);
let base = spawn_mock(app).await;
unsafe {
std::env::set_var("OPENHUMAN_OLLAMA_BASE_URL", &base);
}

let config = Config::default();
let service = LocalAiService::new(&config);
service
.ensure_ollama_server(&config)
.await
.expect("healthy external runner should pass");

unsafe {
std::env::remove_var("OPENHUMAN_OLLAMA_BASE_URL");
}
}

#[tokio::test]
async fn assets_status_marks_ollama_unavailable_when_runtime_is_down_even_if_binary_exists() {
let _guard = crate::openhuman::inference::inference_test_guard();
Expand Down
202 changes: 199 additions & 3 deletions src/openhuman/inference/local/service/public_infer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,26 @@ impl LocalAiService {
.await
}

pub async fn prompt_interactive(
&self,
config: &Config,
prompt: &str,
max_tokens: Option<u32>,
no_think: bool,
) -> Result<String, String> {
log::trace!("[local_ai] prompt_interactive bypasses scheduler_gate permit");
if !config.local_ai.runtime_enabled {
return Err("local ai is disabled".to_string());
}
let system = if no_think {
"You are a concise assistant. Return only the final answer. Do not include reasoning or chain-of-thought."
} else {
"You are a helpful assistant."
};
Comment thread
senamakel marked this conversation as resolved.
self.inference_interactive(config, system, prompt, max_tokens.or(Some(160)), no_think)
.await
}
Comment thread
senamakel marked this conversation as resolved.

pub async fn inline_complete(
&self,
config: &Config,
Expand Down Expand Up @@ -94,9 +114,11 @@ impl LocalAiService {
/// turn against it than show stale or empty completions for the
/// duration of the backfill.
///
/// This is the only path inside [`LocalAiService`] that opts out of
/// the gate. Every other entry point (`inference`, `prompt`,
/// `summarize`, `inline_complete`, `vision_prompt`, `embed`)
/// Along with [`Self::prompt_interactive`] and
/// [`Self::chat_with_history_interactive`], this is one of the paths
/// inside [`LocalAiService`] that opts out of the gate. Every other
/// entry point (`inference`, `prompt`, `summarize`,
/// `inline_complete`, `vision_prompt`, `embed`, `chat_with_history`)
/// acquires before talking to Ollama.
pub async fn inline_complete_interactive(
&self,
Expand Down Expand Up @@ -185,6 +207,180 @@ impl LocalAiService {
Ok(sanitize_inline_completion(&raw, context))
}

/// Multi-turn chat completion via Ollama /api/chat.
/// Messages are `[{role: "user"|"assistant"|"system", content: "..."}]`.
/// Returns the assistant reply string.
pub(crate) async fn chat_with_history(
&self,
config: &Config,
messages: Vec<crate::openhuman::inference::local::ollama::OllamaChatMessage>,
max_tokens: Option<u32>,
) -> Result<String, String> {
self.chat_with_history_internal(config, messages, max_tokens, true)
.await
}

pub(crate) async fn chat_with_history_interactive(
&self,
config: &Config,
messages: Vec<crate::openhuman::inference::local::ollama::OllamaChatMessage>,
max_tokens: Option<u32>,
) -> Result<String, String> {
log::trace!("[local_ai] chat_with_history_interactive bypasses scheduler_gate permit");
self.chat_with_history_internal(config, messages, max_tokens, false)
.await
}
Comment thread
senamakel marked this conversation as resolved.

async fn chat_with_history_internal(
&self,
config: &Config,
messages: Vec<crate::openhuman::inference::local::ollama::OllamaChatMessage>,
max_tokens: Option<u32>,
gated: bool,
) -> Result<String, String> {
if !config.local_ai.runtime_enabled {
return Err("local ai is disabled".to_string());
}

if !matches!(self.status.lock().state.as_str(), "ready") {
self.bootstrap(config).await;
}

if messages.is_empty() {
return Err("messages must not be empty".to_string());
}

let _gate_permit = if gated {
crate::openhuman::scheduler_gate::wait_for_capacity().await
} else {
None
};

if provider_from_config(config) == LocalAiProvider::LmStudio {
let started = std::time::Instant::now();
let lm_messages = messages
.into_iter()
.map(
|message| crate::openhuman::inference::local::lm_studio::LmStudioChatMessage {
role: message.role,
content: message.content,
},
)
.collect();
let outcome = self
.lm_studio_chat_completion(
config,
lm_messages,
max_tokens,
config.default_temperature as f32,
false,
)
.await?;
let elapsed_ms = started.elapsed().as_millis() as u64;
{
let mut status = self.status.lock();
status.state = "ready".to_string();
status.last_latency_ms = Some(elapsed_ms);
status.prompt_toks_per_sec = None;
status.gen_toks_per_sec = None;
status.warning = None;
}
tracing::debug!(
elapsed_ms,
prompt_tokens = ?outcome.prompt_tokens,
completion_tokens = ?outcome.completion_tokens,
reply_len = outcome.reply.len(),
"[local_ai:chat] lm studio /v1/chat/completions done"
);
return Ok(outcome.reply);
}

tracing::debug!(
message_count = messages.len(),
model = %crate::openhuman::inference::model_ids::effective_chat_model_id(config),
"[local_ai:chat] sending to ollama /api/chat"
);

let started = std::time::Instant::now();

let body = crate::openhuman::inference::local::ollama::OllamaChatRequest {
model: crate::openhuman::inference::model_ids::effective_chat_model_id(config),
messages,
stream: false,
options: Some(
crate::openhuman::inference::local::ollama::OllamaGenerateOptions {
temperature: Some(config.default_temperature as f32),
top_k: Some(40),
top_p: Some(0.9),
num_predict: max_tokens.map(|v| v as i32),
},
),
};

let base_url = ollama_base_url_from_config(config);
let response = self
.http
.post(format!("{base_url}/api/chat"))
.json(&body)
.send()
.await
.map_err(|e| {
external_ollama_request_error_with_url("ollama chat request failed", &e, &base_url)
})?;

if !response.status().is_success() {
let status = response.status();
let body = response.text().await.unwrap_or_default();
let detail = body.trim();
return Err(format!(
"ollama chat failed with status {}{}",
status,
if detail.is_empty() {
String::new()
} else {
format!(": {detail}")
}
));
}

let payload: crate::openhuman::inference::local::ollama::OllamaChatResponse = response
.json()
.await
.map_err(|e| format!("ollama chat response parse failed: {e}"))?;

let elapsed_ms = started.elapsed().as_millis() as u64;
let prompt_tps = payload
.prompt_eval_count
.zip(payload.prompt_eval_duration)
.and_then(|(count, dur_ns)| ns_to_tps(count as f32, dur_ns));
let gen_tps = payload
.eval_count
.zip(payload.eval_duration)
.and_then(|(count, dur_ns)| ns_to_tps(count as f32, dur_ns));

{
let mut status = self.status.lock();
status.state = "ready".to_string();
status.last_latency_ms = Some(elapsed_ms);
status.prompt_toks_per_sec = prompt_tps;
status.gen_toks_per_sec = gen_tps;
status.warning = None;
}

tracing::debug!(
elapsed_ms,
reply_len = payload.message.content.len(),
"[local_ai:chat] ollama /api/chat done"
);

let reply = payload.message.content.trim().to_string();
if reply.is_empty() {
Err("ollama returned empty reply".to_string())
} else {
Ok(reply)
}
}

pub(crate) async fn inference(
&self,
config: &Config,
Expand Down
Loading
Loading