From 253c845fbefc8c3c62c04285f94df29ab22ecf49 Mon Sep 17 00:00:00 2001
From: Alex Fournier <afournier@nvidia.com>
Date: Sun, 31 May 2026 09:21:20 -0700
Subject: [PATCH 1/8] feat: add local backend for built-in nemo guardrails

Signed-off-by: Alex Fournier <afournier@nvidia.com>
---
 .../src/plugins/nemo_guardrails/component.rs  |  20 +-
 .../core/src/plugins/nemo_guardrails/local.rs |  51 ++
 .../nemo_guardrails/component_tests.rs        |  48 +-
 crates/python/src/lib.rs                      |  83 +++
 crates/python/src/py_plugin.rs                |  39 +-
 .../python/tests/coverage/coverage_tests.rs   | 648 +++++++++++++++++-
 docs/about-nemo-relay/concepts/plugins.mdx    |   7 +-
 docs/build-plugins/nemoguardrails.mdx         |   1 -
 docs/nemo-guardrails-plugin/about.mdx         | 108 ++-
 docs/nemo-guardrails-plugin/configuration.mdx | 205 ++++--
 python/nemo_relay/_guardrails_local.py        | 589 ++++++++++++++++
 11 files changed, 1654 insertions(+), 145 deletions(-)
 create mode 100644 crates/core/src/plugins/nemo_guardrails/local.rs
 create mode 100644 python/nemo_relay/_guardrails_local.py

diff --git a/crates/core/src/plugins/nemo_guardrails/component.rs b/crates/core/src/plugins/nemo_guardrails/component.rs
index 13695405..28decfbe 100644
--- a/crates/core/src/plugins/nemo_guardrails/component.rs
+++ b/crates/core/src/plugins/nemo_guardrails/component.rs
@@ -17,9 +17,13 @@ use crate::plugin::{
     register_plugin,
 };
 
+#[path = "local.rs"]
+mod local;
 #[cfg(all(feature = "guardrails-remote", not(target_arch = "wasm32")))]
 #[path = "remote.rs"]
 mod remote;
+use local::register_local_backend;
+pub use local::{clear_local_backend_provider, register_local_backend_provider};
 #[cfg(all(feature = "guardrails-remote", not(target_arch = "wasm32")))]
 use remote::register_remote_backend;
 
@@ -447,9 +451,7 @@ fn register_nemo_guardrails_backend(
 ) -> PluginResult<()> {
     match config.mode.as_str() {
         "remote" => register_remote_backend(config, ctx),
-        "local" => Err(PluginError::RegistrationFailed(
-            "built-in NeMo Guardrails local backend is not implemented yet".to_string(),
-        )),
+        "local" => register_local_backend(config, ctx),
         other => Err(PluginError::InvalidConfig(format!(
             "unsupported NeMo Guardrails mode '{other}'"
         ))),
@@ -955,6 +957,18 @@ fn validate_request_defaults(
         return;
     };
 
+    if config.mode == "local" {
+        push_policy_diag(
+            diagnostics,
+            policy.unsupported_value,
+            "nemo_guardrails.unsupported_value",
+            Some(NEMO_GUARDRAILS_PLUGIN_KIND.to_string()),
+            Some("request_defaults".to_string()),
+            "local mode does not currently support request_defaults".to_string(),
+        );
+        return;
+    }
+
     validate_json_object_field(
         diagnostics,
         policy,
diff --git a/crates/core/src/plugins/nemo_guardrails/local.rs b/crates/core/src/plugins/nemo_guardrails/local.rs
new file mode 100644
index 00000000..31f4e1c8
--- /dev/null
+++ b/crates/core/src/plugins/nemo_guardrails/local.rs
@@ -0,0 +1,51 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+use std::sync::{Arc, LazyLock, Mutex, MutexGuard};
+
+use crate::plugin::{PluginError, PluginRegistrationContext, Result as PluginResult};
+
+use super::NeMoGuardrailsConfig;
+
+type LocalBackendProvider = Arc<
+    dyn Fn(NeMoGuardrailsConfig, &mut PluginRegistrationContext) -> PluginResult<()> + Send + Sync,
+>;
+
+static LOCAL_BACKEND_PROVIDER: LazyLock<Mutex<Option<LocalBackendProvider>>> =
+    LazyLock::new(|| Mutex::new(None));
+
+fn local_backend_provider_guard() -> PluginResult<MutexGuard<'static, Option<LocalBackendProvider>>> {
+    // Report a poisoned lock as an internal plugin error instead of panicking.
+    LOCAL_BACKEND_PROVIDER.lock().map_err(|poison| {
+        let message = format!("NeMo Guardrails local backend provider lock poisoned: {poison}");
+        PluginError::Internal(message)
+    })
+}
+
+#[doc(hidden)]
+pub fn register_local_backend_provider(provider: LocalBackendProvider) -> PluginResult<()> {
+    // Installing a provider overwrites whichever one was stored before.
+    *local_backend_provider_guard()? = Some(provider);
+    Ok(())
+}
+
+#[doc(hidden)]
+pub fn clear_local_backend_provider() -> PluginResult<()> {
+    // With no provider stored, local-mode registration fails as unavailable.
+    *local_backend_provider_guard()? = None;
+    Ok(())
+}
+
+pub(super) fn register_local_backend(
+    config: NeMoGuardrailsConfig,
+    ctx: &mut PluginRegistrationContext,
+) -> PluginResult<()> {
+    // Clone the Arc so the registry lock is released before the provider runs.
+    let Some(provider) = local_backend_provider_guard()?.clone() else {
+        return Err(PluginError::RegistrationFailed(
+            "built-in NeMo Guardrails local backend is unavailable in this runtime".to_string(),
+        ));
+    };
+
+    provider(config, ctx)
+}
diff --git a/crates/core/tests/unit/plugins/nemo_guardrails/component_tests.rs b/crates/core/tests/unit/plugins/nemo_guardrails/component_tests.rs
index 852b8928..0823bbac 100644
--- a/crates/core/tests/unit/plugins/nemo_guardrails/component_tests.rs
+++ b/crates/core/tests/unit/plugins/nemo_guardrails/component_tests.rs
@@ -42,6 +42,7 @@ const TEST_TIMEOUT: Duration = Duration::from_secs(5);
 
 fn reset_runtime() {
     let _ = clear_plugin_configuration();
+    crate::plugins::nemo_guardrails::component::clear_local_backend_provider().unwrap();
     crate::shared_runtime::reset_runtime_owner_for_tests();
     let context = global_context();
     *context.write().unwrap() = NemoRelayContextState::new();
@@ -789,6 +790,22 @@ fn invalid_shapes_and_values_are_reported() {
             .any(|diag| diag.field.as_deref() == Some("local.python_module"))
     );
 
+    let local_request_defaults = validate_plugin_config(&plugin_config(json!({
+        "mode": "local",
+        "codec": "openai_chat",
+        "config_path": "./rails",
+        "request_defaults": {
+            "context": {"tenant": "demo"}
+        }
+    })));
+    assert!(local_request_defaults.has_errors());
+    assert!(local_request_defaults.diagnostics.iter().any(|diag| {
+        diag.field.as_deref() == Some("request_defaults")
+            && diag
+                .message
+                .contains("local mode does not currently support request_defaults")
+    }));
+
     let invalid_request_defaults = validate_plugin_config(&plugin_config(json!({
         "mode": "remote",
         "codec": "openai_chat",
@@ -975,7 +992,7 @@ fn enabled_local_initialization_fails_fast_until_backend_exists() {
 
     match error {
         crate::plugin::PluginError::RegistrationFailed(message) => {
-            assert!(message.contains("local backend"));
+            assert!(message.contains("unavailable in this runtime"));
         }
         other => panic!("unexpected error: {other}"),
     }
@@ -1007,5 +1024,34 @@ fn enabled_unknown_mode_initialization_fails_fast_when_policy_ignores_validation
     }
 }
 
+#[test]
+fn enabled_local_initialization_dispatches_through_installed_provider() {
+    let _guard = crate::plugins::nemo_guardrails::test_mutex()
+        .lock()
+        .unwrap_or_else(|err| err.into_inner());
+    reset_runtime();
+
+    let provider_called = Arc::new(AtomicBool::new(false));
+    let provider_called_clone = Arc::clone(&provider_called);
+    crate::plugins::nemo_guardrails::component::register_local_backend_provider(Arc::new(
+        move |config, _ctx| {
+            provider_called_clone.store(true, Ordering::SeqCst);
+            assert_eq!(config.mode, "local");
+            assert_eq!(config.config_path.as_deref(), Some("./rails"));
+            Ok(())
+        },
+    ))
+    .unwrap();
+
+    futures::executor::block_on(initialize_plugins(plugin_config(json!({
+        "mode": "local",
+        "codec": "openai_chat",
+        "config_path": "./rails"
+    }))))
+    .unwrap();
+
+    assert!(provider_called.load(Ordering::SeqCst));
+}
+
 #[path = "remote_tests.rs"]
 mod remote_tests;
diff --git a/crates/python/src/lib.rs b/crates/python/src/lib.rs
index d11df353..13d0c29f 100644
--- a/crates/python/src/lib.rs
+++ b/crates/python/src/lib.rs
@@ -20,9 +20,16 @@
 //! - `py_adaptive` — Python-facing adaptive helpers (`set_latency_sensitivity`)
 //! - `py_plugin` — Python-facing generic plugin config/registration helpers
 //! - `convert` — JSON ↔ Python conversion utilities
+use nemo_relay::plugin::{PluginRegistrationContext, Result as PluginResult};
+use nemo_relay::plugins::nemo_guardrails::component::{
+    NeMoGuardrailsConfig, register_local_backend_provider,
+};
 use nemo_relay::shared_runtime::initialize_shared_runtime_binding;
 use nemo_relay_adaptive::plugin_component::register_adaptive_component;
 use pyo3::prelude::*;
+use serde_json::Value as Json;
+use std::path::{Path, PathBuf};
+use std::sync::Arc;
 
 mod convert;
 #[doc(hidden)]
@@ -52,6 +59,13 @@ fn _native(m: &Bound<'_, PyModule>) -> PyResult<()> {
             "failed to register adaptive plugin component: {e}"
         ))
     })?;
+    register_local_backend_provider(Arc::new(register_python_local_guardrails_backend)).map_err(
+        |e| {
+            pyo3::exceptions::PyRuntimeError::new_err(format!(
+                "failed to register NeMo Guardrails local backend provider: {e}"
+            ))
+        },
+    )?;
     py_types::register(m)?;
     py_api::register(m)?;
     py_plugin::register(m)?;
@@ -59,6 +73,75 @@ fn _native(m: &Bound<'_, PyModule>) -> PyResult<()> {
     Ok(())
 }
 
+fn register_python_local_guardrails_backend(
+    config: NeMoGuardrailsConfig,
+    ctx: &mut PluginRegistrationContext,
+) -> PluginResult<()> {
+    use nemo_relay::plugin::PluginError;
+
+    // Convert the typed config into a JSON object map for the Python call.
+    let serialized = serde_json::to_value(config).map_err(|err| {
+        PluginError::Internal(format!(
+            "failed to serialize NeMo Guardrails local config: {err}"
+        ))
+    })?;
+    let Json::Object(plugin_config) = serialized else {
+        return Err(PluginError::Internal(
+            "NeMo Guardrails local config did not serialize to a JSON object".to_string(),
+        ));
+    };
+
+    let registrations = Python::attach(|py| {
+        let register_fn = load_guardrails_local_register_fn(py)?;
+        let namespace_prefix = ctx.qualify_name("");
+        crate::py_plugin::invoke_python_plugin_register(
+            py,
+            "nemo_guardrails",
+            &register_fn,
+            &plugin_config,
+            namespace_prefix,
+        )
+    })
+    .map_err(|err| PluginError::RegistrationFailed(err.to_string()))?;
+
+    ctx.extend_registrations(registrations);
+    Ok(())
+}
+
+fn load_guardrails_local_register_fn(py: Python<'_>) -> PyResult<Bound<'_, PyAny>> {
+    const HELPER_MODULE: &str = "nemo_relay._guardrails_local";
+
+    let module = py.import(HELPER_MODULE).or_else(|import_err| {
+        // Retry from the source tree; keep the first error if it is absent.
+        let source_python_dir = guardrails_local_source_python_dir();
+        if !source_python_dir.exists() {
+            return Err(import_err);
+        }
+        prepend_python_path_if_missing(py, &source_python_dir)?;
+        py.import(HELPER_MODULE)
+    })?;
+
+    module.getattr("register_local_backend")
+}
+
+fn guardrails_local_source_python_dir() -> PathBuf {
+    Path::new(env!("CARGO_MANIFEST_DIR")).join("../../python")
+}
+
+fn prepend_python_path_if_missing(py: Python<'_>, path: &Path) -> PyResult<()> {
+    let sys_path = py.import("sys")?.getattr("path")?;
+    let entry = path.to_string_lossy();
+
+    // Source-tree fallback for local development and in-repo tests where the
+    // Python package has not been installed into the active environment yet.
+    if sys_path.contains(entry.as_ref())? {
+        return Ok(());
+    }
+
+    sys_path.call_method1("insert", (0, entry.as_ref()))?;
+    Ok(())
+}
+
 #[cfg(test)]
 #[path = "../tests/coverage/coverage_tests.rs"]
 mod coverage_tests;
diff --git a/crates/python/src/py_plugin.rs b/crates/python/src/py_plugin.rs
index d483375b..ee668ea1 100644
--- a/crates/python/src/py_plugin.rs
+++ b/crates/python/src/py_plugin.rs
@@ -160,6 +160,27 @@ fn new_py_plugin_context(
     )
 }
 
+pub(crate) fn invoke_python_plugin_register(
+    py: Python<'_>,
+    plugin_kind: &str,
+    register_fn: &Bound<'_, PyAny>,
+    plugin_config: &Map<String, Json>,
+    namespace_prefix: String,
+) -> PyResult<Vec<PluginRegistration>> {
+    // Build a fresh plugin context for this call; its registration list
+    // starts out empty.
+    let collected = Arc::new(Mutex::new(Vec::new()));
+    let py_ctx = new_py_plugin_context(py, plugin_kind, collected, namespace_prefix)?;
+
+    let config_py = plugin_config_to_py(py, plugin_kind, plugin_config)?;
+    let call_args = (config_py, py_ctx.clone_ref(py));
+    register_fn.call1(call_args)?;
+
+    // Drain the registrations held by the context and return them.
+    let ctx_ref = py_ctx.bind(py).borrow();
+    ctx_ref.drain_registrations()
+}
+
 #[pyclass(name = "PluginContext")]
 pub struct PyPluginContext {
     registrations: Arc<Mutex<Vec<PluginRegistration>>>,
@@ -695,22 +716,14 @@ impl Plugin for PyPlugin {
         let plugin_config = plugin_config.clone();
         Box::pin(async move {
             let registrations = Python::attach(|py| -> PyResult<Vec<PluginRegistration>> {
-                let py_ctx = new_py_plugin_context(
+                let register_fn = self.plugin.getattr(py, "register")?.into_bound(py);
+                invoke_python_plugin_register(
                     py,
                     &self.plugin_kind,
-                    Arc::new(Mutex::new(vec![])),
+                    &register_fn,
+                    &plugin_config,
                     namespace_prefix,
-                )?;
-                let plugin_config_py = json_to_py(py, &Json::Object(plugin_config.clone()))?;
-                self.plugin.call_method1(
-                    py,
-                    "register",
-                    (plugin_config_py, py_ctx.clone_ref(py)),
-                )?;
-                {
-                    let py_ctx_ref = py_ctx.bind(py).borrow();
-                    py_ctx_ref.drain_registrations()
-                }
+                )
             })
             .map_err(|err| PluginError::RegistrationFailed(err.to_string()))?;
 
diff --git a/crates/python/tests/coverage/coverage_tests.rs b/crates/python/tests/coverage/coverage_tests.rs
index 6c3205e0..3e553341 100644
--- a/crates/python/tests/coverage/coverage_tests.rs
+++ b/crates/python/tests/coverage/coverage_tests.rs
@@ -4,11 +4,13 @@
 //! Coverage tests for coverage in the NeMo Relay Python crate.
 
 use std::ffi::CString;
+use std::path::PathBuf;
 use std::pin::Pin;
 use std::sync::Arc;
 
+use pyo3::ffi::c_str;
 use pyo3::prelude::*;
-use pyo3::types::PyModule;
+use pyo3::types::{IntoPyDict, PyModule};
 use serde_json::{Value as Json, json};
 use tokio_stream::Stream;
 use tokio_stream::StreamExt;
@@ -24,7 +26,13 @@ use crate::py_callable::{
 };
 use nemo_relay::api::event::{BaseEvent, Event, EventCategory, ScopeCategory, ScopeEvent};
 use nemo_relay::api::llm::LlmRequest;
-use nemo_relay::api::runtime::{LlmExecutionNextFn, LlmStreamExecutionNextFn, ToolExecutionNextFn};
+use nemo_relay::api::runtime::{
+    LlmExecutionNextFn, LlmStreamExecutionNextFn, NemoRelayContextState, ToolExecutionNextFn,
+    global_context,
+};
+use nemo_relay::plugin::{
+    PluginComponentSpec, PluginConfig, clear_plugin_configuration, initialize_plugins,
+};
 
 fn load_module<'py>(py: Python<'py>, code: &str) -> Bound<'py, PyModule> {
     let code = CString::new(code).unwrap();
@@ -65,6 +73,13 @@ fn with_event_loop<T>(py: Python<'_>, f: impl FnOnce(Bound<'_, PyAny>) -> T) ->
     result
 }
 
+fn reset_runtime_state() {
+    let _ = clear_plugin_configuration();
+    nemo_relay::plugins::nemo_guardrails::component::clear_local_backend_provider().unwrap();
+    let context = global_context();
+    *context.write().unwrap() = NemoRelayContextState::new();
+}
+
 #[test]
 fn test_native_module_registers_types_and_api_functions() {
     let _python = crate::test_support::init_python_test();
@@ -94,6 +109,635 @@ fn test_native_pymodule_entrypoint_registers_bindings() {
     });
 }
 
+#[test]
+fn test_native_pymodule_entrypoint_installs_nemo_guardrails_local_provider() {
+    let _python = crate::test_support::init_python_test();
+    Python::attach(|py| {
+        let module = PyModule::new(py, "_native_guardrails_provider").unwrap();
+        crate::_native(&module).unwrap();
+    });
+
+    let runtime = tokio::runtime::Runtime::new().unwrap();
+    let error = runtime
+        .block_on(initialize_plugins(PluginConfig {
+            version: 1,
+            components: vec![PluginComponentSpec {
+                kind: "nemo_guardrails".to_string(),
+                enabled: true,
+                config: serde_json::from_value(json!({
+                    "mode": "local",
+                    "codec": "openai_chat",
+                    "config_path": "./rails"
+                }))
+                .unwrap(),
+            }],
+            policy: Default::default(),
+        }))
+        .unwrap_err();
+
+    let _ = clear_plugin_configuration();
+    match error {
+        nemo_relay::plugin::PluginError::RegistrationFailed(message) => {
+            assert!(
+                message.contains(
+                    "NeMo Guardrails is required for the built-in NeMo Guardrails local backend"
+                ),
+                "unexpected message: {message}"
+            );
+        }
+        other => panic!("unexpected error: {other}"),
+    }
+}
+
+#[test]
+fn test_guardrails_local_helper_registers_and_enforces_llm_and_tool_checks() {
+    let _python = crate::test_support::init_python_test();
+    Python::attach(|py| {
+        let python_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../../python");
+        let module = load_module(
+            py,
+            &format!(
+                r#"
+import pathlib
+import sys
+import types
+
+sys.path.insert(0, {python_dir:?})
+
+MODULE_NAME = "fake_guardrails_local_helper"
+
+fake_root = types.ModuleType(MODULE_NAME)
+fake_options = types.ModuleType(MODULE_NAME + ".rails.llm.options")
+
+class Result:
+    def __init__(self, status, content=None, rail=None):
+        self.status = status
+        self.content = content
+        self.rail = rail
+
+class RailType:
+    INPUT = "input"
+    OUTPUT = "output"
+
+class RailStatus:
+    BLOCKED = "blocked"
+    MODIFIED = "modified"
+    PASSED = "passed"
+
+class RailsConfig:
+    @staticmethod
+    def from_content(*, colang_content=None, yaml_content=None):
+        return {{"yaml": yaml_content, "colang": colang_content}}
+
+    @staticmethod
+    def from_path(path):
+        return {{"path": path}}
+
+check_results = []
+check_calls = []
+
+class LLMRails:
+    def __init__(self, config):
+        self.config = config
+
+    async def check_async(self, messages, rail_types):
+        check_calls.append((messages, rail_types))
+        return check_results.pop(0)
+
+fake_root.RailsConfig = RailsConfig
+fake_root.LLMRails = LLMRails
+fake_options.RailType = RailType
+fake_options.RailStatus = RailStatus
+
+sys.modules[MODULE_NAME] = fake_root
+sys.modules[MODULE_NAME + ".rails"] = types.ModuleType(MODULE_NAME + ".rails")
+sys.modules[MODULE_NAME + ".rails.llm"] = types.ModuleType(MODULE_NAME + ".rails.llm")
+sys.modules[MODULE_NAME + ".rails.llm.options"] = fake_options
+
+from nemo_relay._native import LLMRequest
+from nemo_relay._guardrails_local import register_local_backend
+
+class Context:
+    def register_llm_execution_intercept(self, name, priority, callback):
+        self.llm = callback
+
+    def register_llm_stream_execution_intercept(self, name, priority, callback):
+        self.stream = callback
+
+    def register_tool_execution_intercept(self, name, priority, callback):
+        self.tool = callback
+
+async def run_case():
+    ctx = Context()
+    event_log = []
+    register_local_backend(
+        {{
+            "mode": "local",
+            "codec": "openai_chat",
+            "config_yaml": "models: []",
+            "input": True,
+            "output": True,
+            "tool_input": True,
+            "tool_output": True,
+            "local": {{"python_module": MODULE_NAME}},
+        }},
+        ctx,
+    )
+
+    request = LLMRequest(
+        {{}},
+        {{
+            "model": "gpt-4o-mini",
+            "messages": [{{"role": "user", "content": "unsafe"}}],
+        }},
+    )
+    seen_request_messages = []
+
+    async def next_call(req):
+        seen_request_messages.append(req.content["messages"][-1]["content"])
+        return {{
+            "choices": [{{"message": {{"role": "assistant", "content": "safe reply"}}}}],
+            "id": "resp_1",
+            "model": "gpt-4o-mini",
+        }}
+
+    check_results.extend(
+        [
+            Result(RailStatus.MODIFIED, content="sanitized user"),
+            Result(RailStatus.PASSED),
+        ]
+    )
+    llm_result = await ctx.llm("demo", request, next_call)
+
+    seen_tool_args = []
+
+    async def next_tool(args):
+        seen_tool_args.append(args)
+        return {{"raw": True}}
+
+    check_results.extend(
+        [
+            Result(RailStatus.MODIFIED, content='{{"arguments": {{"city": "Boston"}}}}'),
+            Result(RailStatus.MODIFIED, content='{{"result": {{"ok": true}}}}'),
+        ]
+    )
+    tool_result = await ctx.tool("weather_lookup", {{"city": "Phoenix"}}, next_tool)
+
+    return {{
+        "llm_result": llm_result,
+        "tool_result": tool_result,
+        "seen_request_messages": seen_request_messages,
+        "seen_tool_args": seen_tool_args,
+        "check_calls": check_calls,
+    }}
+"#,
+                python_dir = python_dir.display(),
+            ),
+        );
+
+        let result_json = with_event_loop(py, |event_loop| {
+            let coroutine = module.getattr("run_case").unwrap().call0().unwrap();
+            let result = event_loop
+                .call_method1("run_until_complete", (coroutine,))
+                .unwrap();
+            crate::convert::py_to_json(&result).unwrap()
+        });
+
+        assert_eq!(
+            result_json["seen_request_messages"][0],
+            json!("sanitized user")
+        );
+        assert_eq!(result_json["tool_result"], json!({ "ok": true }));
+        assert_eq!(
+            result_json["seen_tool_args"][0],
+            json!({ "city": "Boston" })
+        );
+        assert_eq!(
+            result_json["llm_result"]["choices"][0]["message"]["content"],
+            json!("safe reply")
+        );
+        assert_eq!(result_json["check_calls"].as_array().unwrap().len(), 4);
+    });
+}
+
+#[test]
+fn test_guardrails_local_helper_enforces_streamed_output_rails() {
+    let _python = crate::test_support::init_python_test();
+    Python::attach(|py| {
+        let native_module = PyModule::new(py, "_native_guardrails_streaming").unwrap();
+        crate::_native(&native_module).unwrap();
+        let sys = py.import("sys").unwrap();
+        let modules = sys.getattr("modules").unwrap();
+        modules
+            .set_item("nemo_relay._native", native_module.clone())
+            .unwrap();
+
+        let python_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../../python");
+        let module = load_module(
+            py,
+            &format!(
+                r#"
+import sys
+import types
+
+sys.path.insert(0, {python_dir:?})
+
+MODULE_NAME = "fake_guardrails_streaming"
+
+fake_root = types.ModuleType(MODULE_NAME)
+fake_options = types.ModuleType(MODULE_NAME + ".rails.llm.options")
+
+class Result:
+    def __init__(self, status, content=None, rail=None):
+        self.status = status
+        self.content = content
+        self.rail = rail
+
+class RailType:
+    INPUT = "input"
+    OUTPUT = "output"
+
+class RailStatus:
+    BLOCKED = "blocked"
+    MODIFIED = "modified"
+    PASSED = "passed"
+
+class RailsConfig:
+    @staticmethod
+    def from_content(*, colang_content=None, yaml_content=None):
+        return {{"yaml": yaml_content}}
+
+stream_results = []
+event_log = []
+
+class LLMRails:
+    def __init__(self, config):
+        self.config = types.SimpleNamespace(
+            rails=types.SimpleNamespace(
+                output=types.SimpleNamespace(
+                    flows=["self check output"],
+                    streaming=types.SimpleNamespace(enabled=True, stream_first=True),
+                )
+            )
+        )
+
+    async def check_async(self, messages, rail_types):
+        return Result(RailStatus.PASSED)
+
+    def stream_async(self, *, messages=None, generator=None, include_metadata=False):
+        async def _run():
+            outcome = stream_results.pop(0)
+            async for chunk in generator:
+                event_log.append(f"guardrails-sees:{{chunk}}")
+                if outcome == "pass":
+                    yield chunk
+            if outcome == "block":
+                yield '{{"error": {{"message": "Blocked by output rails: output-policy", "type": "guardrails_violation"}}}}'
+        return _run()
+
+fake_root.RailsConfig = RailsConfig
+fake_root.LLMRails = LLMRails
+fake_options.RailType = RailType
+fake_options.RailStatus = RailStatus
+
+sys.modules[MODULE_NAME] = fake_root
+sys.modules[MODULE_NAME + ".rails"] = types.ModuleType(MODULE_NAME + ".rails")
+sys.modules[MODULE_NAME + ".rails.llm"] = types.ModuleType(MODULE_NAME + ".rails.llm")
+sys.modules[MODULE_NAME + ".rails.llm.options"] = fake_options
+
+from nemo_relay._native import LLMRequest
+from nemo_relay._guardrails_local import register_local_backend
+
+class Context:
+    def register_llm_execution_intercept(self, name, priority, callback):
+        self.llm = callback
+
+    def register_llm_stream_execution_intercept(self, name, priority, callback):
+        self.stream = callback
+
+    def register_tool_execution_intercept(self, name, priority, callback):
+        self.tool = callback
+
+async def run_case():
+    ctx = Context()
+    event_log.clear()
+    register_local_backend(
+        {{
+            "mode": "local",
+            "codec": "openai_chat",
+            "config_yaml": "models: []",
+            "input": False,
+            "output": True,
+            "local": {{"python_module": MODULE_NAME}},
+        }},
+        ctx,
+    )
+
+    request = LLMRequest(
+        {{}},
+        {{
+            "model": "gpt-4o-mini",
+            "messages": [{{"role": "user", "content": "hello"}}],
+        }},
+    )
+
+    async def next_call(req):
+        async def _stream():
+            event_log.append("source:hello")
+            yield {{"choices": [{{"delta": {{"content": "hello"}}}}]}}
+            event_log.append("source:world")
+            yield {{"choices": [{{"delta": {{"content": "world"}}}}]}}
+        return _stream()
+
+    stream_results.append("pass")
+    allowed_stream = await ctx.stream(request, next_call)
+    allowed_chunks = []
+    async for chunk in allowed_stream:
+        event_log.append(f"yield:{{chunk['choices'][0]['delta']['content']}}")
+        allowed_chunks.append(chunk)
+
+    stream_results.append("block")
+    try:
+        blocked_stream = await ctx.stream(request, next_call)
+        async for _chunk in blocked_stream:
+            pass
+    except RuntimeError as error:
+        blocked = str(error)
+    else:
+        raise AssertionError("expected streamed output block")
+
+    ctx_stream_first_false = Context()
+    fake_root.LLMRails = lambda config: types.SimpleNamespace(
+        config=types.SimpleNamespace(
+            rails=types.SimpleNamespace(
+                output=types.SimpleNamespace(
+                    flows=["self check output"],
+                    streaming=types.SimpleNamespace(enabled=True, stream_first=False),
+                )
+            )
+        ),
+        check_async=LLMRails(config).check_async,
+        stream_async=LLMRails(config).stream_async,
+    )
+    register_local_backend(
+        {{
+            "mode": "local",
+            "codec": "openai_chat",
+            "config_yaml": "models: []",
+            "input": False,
+            "output": True,
+            "local": {{"python_module": MODULE_NAME}},
+        }},
+        ctx_stream_first_false,
+    )
+    try:
+        failing_stream = await ctx_stream_first_false.stream(request, next_call)
+        async for _chunk in failing_stream:
+            pass
+    except RuntimeError as error:
+        modified = str(error)
+    else:
+        raise AssertionError("expected stream_first=false error")
+
+    return {{
+        "allowed_chunks": allowed_chunks,
+        "blocked": blocked,
+        "event_log": event_log,
+        "modified": modified,
+    }}
+"#,
+                python_dir = python_dir.display(),
+            ),
+        );
+
+        let result = with_event_loop(py, |event_loop| {
+            let coroutine = module.getattr("run_case").unwrap().call0().unwrap();
+            let result = event_loop
+                .call_method1("run_until_complete", (coroutine,))
+                .unwrap();
+            crate::convert::py_to_json(&result).unwrap()
+        });
+        assert_eq!(
+            result["allowed_chunks"],
+            json!([
+                {"choices": [{"delta": {"content": "hello"}}]},
+                {"choices": [{"delta": {"content": "world"}}]}
+            ])
+        );
+        let event_log = result["event_log"].as_array().unwrap();
+        assert_eq!(
+            &event_log[..6],
+            json!([
+                "source:hello",
+                "yield:hello",
+                "source:world",
+                "yield:world",
+                "guardrails-sees:hello",
+                "guardrails-sees:world",
+            ])
+            .as_array()
+            .unwrap()
+        );
+        assert!(
+            result["blocked"]
+                .as_str()
+                .unwrap()
+                .contains("output rail blocked the LLM call")
+        );
+        assert!(
+            result["modified"]
+                .as_str()
+                .unwrap()
+                .contains("stream_first = true")
+        );
+    });
+}
+
+#[test]
+fn test_local_guardrails_provider_initializes_and_enforces_managed_core_calls() {
+    let _python = crate::test_support::init_python_test();
+    reset_runtime_state(); // reset before the run; called again after the assertions at the end
+
+    Python::attach(|py| {
+        let native_module = PyModule::new(py, "_native_guardrails_e2e").unwrap();
+        crate::_native(&native_module).unwrap(); // NOTE(review): presumably registers the native bindings on the module — confirm
+        let sys = py.import("sys").unwrap();
+        let modules = sys.getattr("modules").unwrap();
+        let module_names = py
+            .eval(
+                c_str!("list(sys.modules.keys())"),
+                None,
+                Some(&[(c_str!("sys"), sys)].into_py_dict(py).unwrap()),
+            )
+            .unwrap()
+            .extract::<Vec<String>>()
+            .unwrap();
+        for name in module_names {
+            if name == "nemo_relay" || name.starts_with("nemo_relay.") {
+                modules.del_item(name).unwrap(); // evict cached entries so the script's `import nemo_relay` imports afresh
+            }
+        }
+        modules
+            .set_item("nemo_relay._native", native_module.clone()) // the module built above stands in for nemo_relay._native
+            .unwrap();
+
+        let python_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../../python"); // inserted at sys.path[0] by the script below
+        let module = load_module(
+            py,
+            &format!(
+                r#"
+import sys
+import types
+
+sys.path.insert(0, {python_dir:?})
+
+MODULE_NAME = "fake_guardrails_local_e2e"
+
+fake_root = types.ModuleType(MODULE_NAME)
+fake_options = types.ModuleType(MODULE_NAME + ".rails.llm.options")
+
+class Result:
+    def __init__(self, status, content=None, rail=None):
+        self.status = status
+        self.content = content
+        self.rail = rail
+
+class RailType:
+    INPUT = "input"
+    OUTPUT = "output"
+
+class RailStatus:
+    BLOCKED = "blocked"
+    MODIFIED = "modified"
+    PASSED = "passed"
+
+class RailsConfig:
+    @staticmethod
+    def from_content(*, colang_content=None, yaml_content=None):
+        return {{"yaml": yaml_content}}
+
+check_results = []
+
+class LLMRails:
+    def __init__(self, config):
+        self.config = config
+
+    async def check_async(self, messages, rail_types):
+        return check_results.pop(0)
+
+fake_root.RailsConfig = RailsConfig
+fake_root.LLMRails = LLMRails
+fake_options.RailType = RailType
+fake_options.RailStatus = RailStatus
+
+sys.modules[MODULE_NAME] = fake_root
+sys.modules[MODULE_NAME + ".rails"] = types.ModuleType(MODULE_NAME + ".rails")
+sys.modules[MODULE_NAME + ".rails.llm"] = types.ModuleType(MODULE_NAME + ".rails.llm")
+sys.modules[MODULE_NAME + ".rails.llm.options"] = fake_options
+
+import nemo_relay
+
+async def run_case():
+    stack = nemo_relay.create_scope_stack()
+    nemo_relay.set_thread_scope_stack(stack)
+
+    await nemo_relay.plugin.initialize(
+        {{
+            "version": 1,
+            "components": [
+                {{
+                    "kind": "nemo_guardrails",
+                    "enabled": True,
+                    "config": {{
+                        "mode": "local",
+                        "codec": "openai_chat",
+                        "config_yaml": "models: []",
+                        "input": True,
+                        "output": True,
+                        "tool_input": True,
+                        "tool_output": True,
+                        "local": {{"python_module": MODULE_NAME}},
+                    }},
+                }}
+            ],
+        }}
+    )
+
+    check_results.extend(
+        [
+            Result(RailStatus.MODIFIED, content="sanitized user"),
+            Result(RailStatus.PASSED),
+            Result(RailStatus.MODIFIED, content='{{"arguments": {{"city": "Boston"}}}}'),
+            Result(RailStatus.MODIFIED, content='{{"result": {{"ok": true}}}}'),
+        ]
+    )
+
+    request = nemo_relay.LLMRequest(
+        {{}},
+        {{
+            "model": "gpt-4o-mini",
+            "messages": [{{"role": "user", "content": "unsafe"}}],
+        }},
+    )
+
+    seen_request_messages = []
+    async def llm_impl(req):
+        seen_request_messages.append(req.content["messages"][-1]["content"])
+        return {{
+            "choices": [{{"message": {{"role": "assistant", "content": "safe reply"}}}}],
+            "id": "resp_1",
+            "model": req.content["model"],
+        }}
+
+    llm_result = await nemo_relay.llm.execute(
+        "demo",
+        request,
+        llm_impl,
+        response_codec=nemo_relay.codecs.OpenAIChatCodec(),
+    )
+
+    seen_tool_args = []
+    async def tool_impl(args):
+        seen_tool_args.append(args)
+        return {{"raw": True}}
+
+    tool_result = await nemo_relay.tools.execute("weather_lookup", {{"city": "Phoenix"}}, tool_impl)
+    return {{
+        "llm_result": llm_result,
+        "tool_result": tool_result,
+        "seen_request_messages": seen_request_messages,
+        "seen_tool_args": seen_tool_args,
+    }}
+"#,
+                python_dir = python_dir.display(),
+            ),
+        );
+        let result_json = with_event_loop(py, |event_loop| {
+            let coroutine = module.getattr("run_case").unwrap().call0().unwrap();
+            let result = event_loop
+                .call_method1("run_until_complete", (coroutine,))
+                .unwrap();
+            crate::convert::py_to_json(&result).unwrap() // the dict returned by run_case(), as JSON for the assertions below
+        });
+
+        assert_eq!(
+            result_json["llm_result"]["choices"][0]["message"]["content"],
+            json!("safe reply")
+        );
+        assert_eq!(result_json["tool_result"], json!({ "ok": true })); // tool_impl returned {"raw": true}; the queued MODIFIED result replaced it
+        assert_eq!(
+            result_json["seen_request_messages"][0],
+            json!("sanitized user")
+        );
+        assert_eq!(
+            result_json["seen_tool_args"][0],
+            json!({ "city": "Boston" })
+        );
+    });
+
+    reset_runtime_state();
+}
+
 #[test]
 fn test_python_test_guard_restores_existing_runtime_env() {
     let lock = crate::test_support::lock_python_test();
diff --git a/docs/about-nemo-relay/concepts/plugins.mdx b/docs/about-nemo-relay/concepts/plugins.mdx
index b9c412e9..065b4b96 100644
--- a/docs/about-nemo-relay/concepts/plugins.mdx
+++ b/docs/about-nemo-relay/concepts/plugins.mdx
@@ -171,9 +171,10 @@ The core crate also ships a built-in `nemo_guardrails` plugin component. It is
 the first-party Guardrails integration point that NeMo Relay owns through the
 shared plugin system.
 
-The current shipped user-facing lane is the remote backend. It gives NeMo Relay
-one canonical plugin kind and config shape for Guardrails-backed managed LLM
-and tool checks while broader backend parity work remains separate.
+The current shipped user-facing lanes are:
+
+- the remote backend for Guardrails-service integration
+- the Python-backed local backend for in-process `nemoguardrails` integration
 
 Detailed Guardrails plugin configuration belongs in
 [NeMo Guardrails Configuration](/nemo-guardrails-plugin/configuration).
diff --git a/docs/build-plugins/nemoguardrails.mdx b/docs/build-plugins/nemoguardrails.mdx
index e5517612..a347c3f7 100644
--- a/docs/build-plugins/nemoguardrails.mdx
+++ b/docs/build-plugins/nemoguardrails.mdx
@@ -15,7 +15,6 @@ first-party `nemo_guardrails` component, see
 [NeMo Guardrails Plugin](/nemo-guardrails-plugin/about).
 
 </Note>
-
 The example lives under `examples/nemoguardrails`. The single-file plugin
 implementation, runnable agent, and Guardrails config artifacts are under
 `example`.
diff --git a/docs/nemo-guardrails-plugin/about.mdx b/docs/nemo-guardrails-plugin/about.mdx
index aa1c6925..5c0cd2f0 100644
--- a/docs/nemo-guardrails-plugin/about.mdx
+++ b/docs/nemo-guardrails-plugin/about.mdx
@@ -17,12 +17,11 @@ first-party NeMo Relay plugin.
 The plugin is designed around backend modes:
 
 - `remote`
-  - Implemented now.
   - Calls a Guardrails service over HTTP(S), including streaming over the same
     remote contract.
 - `local`
-  - Planned.
-  - Reserved for a future in-process Python `nemoguardrails` backend.
+  - Calls `nemoguardrails` in process through the Python runtime instead of a
+    separate Guardrails service.
 
 ## Use This Plugin When
 
@@ -30,39 +29,43 @@ Start here when you need to:
 
 - Apply Guardrails input and output checks around managed `llm.execute(...)`
   calls.
-- Apply Guardrails policy around managed tool execution, including the current
-  remote managed `tool_output` lane.
+- Apply Guardrails policy around managed tool execution.
 - Configure Guardrails behavior through the same plugin config surface used by
   other first-party NeMo Relay components.
-- Keep Guardrails behavior in a reusable process-level config document instead
-  of wiring provider-specific checks into each application call site.
+- Keep Guardrails policy authoring in Guardrails-native config while NeMo Relay
+  owns when those checks run around managed execution.
 
 ## Current Scope
 
-The current shipped user-facing lane is the built-in `remote` backend.
+The built-in plugin currently exposes two user-facing modes with
+different boundaries.
 
-That lane supports:
+| Area | `remote` | `local` |
+|---|---|---|
+| Managed non-streaming LLM `input` | Supported | Supported |
+| Managed non-streaming LLM `output` | Supported | Supported |
+| Managed streaming LLM execution | Supported over the remote HTTP(S) contract | Supported for managed input checks and Guardrails-native output streaming when `rails.output.streaming.enabled = true`; with `stream_first = true`, output rails can stop the stream after some chunks have already been delivered; `stream_first = false` is not supported yet |
+| Managed `tool_input` | Not supported against the stock Guardrails remote contract | Supported |
+| Managed `tool_output` | Supported | Supported |
+| `request_defaults` | Supported as backend pass-through request semantics | Not supported |
+| Codec support | `openai_chat` | `openai_chat`, `openai_responses`, `anthropic_messages` |
+| Runtime availability | Any runtime that includes the remote backend | Python-enabled runtimes that can import `nemoguardrails` |
 
-- Managed non-streaming LLM `input` checks.
-- Managed non-streaming LLM `output` checks.
-- Managed streaming LLM execution over the remote HTTP(S) path.
-- Managed tool-result checks through `tool_output`.
-- Request-time Guardrails defaults passed through to the remote backend.
-
-The current built-in remote backend does not support:
-
-- Managed `tool_input` checks against the stock Guardrails remote contract.
-- `local` mode.
-- Remote managed LLM parity beyond `codec = "openai_chat"`.
+The `local` backend is a Python-backed runtime feature, not a universal
+cross-binding backend. Runtimes that do not install the local backend provider
+report `local` mode as unavailable during plugin initialization.
 
 ## Managed Surfaces Versus Request Defaults
 
-The NeMo Guardrails plugin model uses two different concepts:
+Both `remote` mode and `local` mode share the same top-level plugin model, but
+they do not implement every part of that model in the same way.
+
+At the plugin-model level, the NeMo Guardrails plugin uses two different concepts:
 
-- Currently supported managed NeMo Relay execution surfaces in the shipped
-  remote backend:
+- Top-level managed NeMo Relay execution surfaces:
   - `input`
   - `output`
+  - `tool_input`
   - `tool_output`
 - Guardrails backend request defaults:
   - `request_defaults.context`
@@ -78,62 +81,43 @@ This distinction matters:
 
 - Managed surfaces wrap real NeMo Relay execution boundaries such as
   `llm.execute(...)` and `tools.execute(...)`.
-- Managed surfaces let NeMo Relay enforce behavior around those boundaries.
-  Depending on the surface, Relay can block work, allow it, or apply managed
-  request or result handling before the application sees the final outcome.
+- Managed surfaces give NeMo Relay an owned enforcement point around a known
+  runtime step. Depending on the backend and surface, Relay can block work,
+  allow it, or apply managed request or result handling before the application
+  sees the outcome.
 - Managed surfaces also give NeMo Relay a stable runtime boundary for its own
-  middleware ordering, lifecycle behavior, and observability marks. Relay knows
-  exactly which step is being wrapped and can attach policy and telemetry to
-  that step directly.
-- `request_defaults` fields are forwarded to the selected Guardrails backend as
-  request semantics. They do not create new NeMo Relay-native execution
-  surfaces.
-- `request_defaults` can still influence Guardrails behavior, but they do not
-  give NeMo Relay a new local runtime step to wrap. Relay is passing backend
-  options along with a request, not creating a new middleware boundary of its
-  own.
-- `request_defaults` are also backend-contract dependent. A selected Guardrails
-  backend can use them when evaluating a request, but the exact effect depends
-  on what that backend supports. Relay is not creating a separate local
-  retrieval, dialog, or tool boundary just because those fields exist in the
-  request.
-
-In practice, the tradeoff is:
-
-- Managed surfaces give you a Relay-owned enforcement point around a known
-  runtime step, with Relay-owned enforcement, ordering, and marks around that
-  step.
-- `request_defaults` give you backend-level configuration for a request, but
-  not a separate Relay-owned interception point, runtime boundary, or
-  middleware surface.
-
-Another way to think about it:
+  middleware ordering, lifecycle behavior, and observability marks.
+
+In practice:
 
 - Managed surfaces are places where NeMo Relay is holding the steering wheel.
-- `request_defaults` are notes that NeMo Relay passes to the Guardrails backend
-  with a request.
 
-Top-level `tool_input` is still part of the built-in plugin contract, but it is
-not supported by the current stock-remote backend.
+Request-default handling, by contrast, depends on the mode:
+
+- In `remote` mode, `request_defaults` fields are forwarded to the selected
+  Guardrails backend as request semantics. They do not create new NeMo
+  Relay-native execution surfaces.
+- In `local` mode, `request_defaults` is rejected instead of passed through.
 
-The overlap in names is important:
+The overlap in names is important in `remote` mode:
 
 - Top-level `input` is a managed NeMo Relay execution surface.
 - `request_defaults.rails.input` is a backend pass-through option.
 - Top-level `output` is a managed NeMo Relay execution surface.
 - `request_defaults.rails.output` is a backend pass-through option.
-- Top-level `tool_input` is part of the built-in plugin model, but the current
-  stock-remote backend rejects it.
+- Top-level `tool_input` is a managed NeMo Relay execution surface in the
+  plugin contract. The current stock-remote backend rejects it, while the local
+  backend supports it.
 - `request_defaults.rails.tool_input` is a backend pass-through option.
 - Top-level `tool_output` is a managed NeMo Relay execution surface.
 - `request_defaults.rails.tool_output` is a backend pass-through option.
 
 In particular, `request_defaults.rails.dialog` and
-`request_defaults.rails.retrieval` are simple pass-through options. They are
-not separate managed middleware surfaces in NeMo Relay.
+`request_defaults.rails.retrieval` are pass-through options. They are not
+separate managed middleware surfaces in NeMo Relay.
 
 ## Pages
 
 - [NeMo Guardrails Configuration](/nemo-guardrails-plugin/configuration)
-  documents the built-in component shape, remote-mode boundaries, and current
+  documents the built-in component shape, mode boundaries, and current
   support matrix.
diff --git a/docs/nemo-guardrails-plugin/configuration.mdx b/docs/nemo-guardrails-plugin/configuration.mdx
index ddaa1fb0..b1554e1c 100644
--- a/docs/nemo-guardrails-plugin/configuration.mdx
+++ b/docs/nemo-guardrails-plugin/configuration.mdx
@@ -10,9 +10,6 @@ SPDX-License-Identifier: Apache-2.0 */}
 Use this page when you want to configure the built-in NeMo Guardrails plugin
 component. The component kind is `nemo_guardrails`.
 
-The current shipped user-facing backend is `mode = "remote"`. `local` remains
-part of the config model, but it is not yet a finished user-facing backend.
-
 For plugin file discovery, precedence, merge behavior, editor controls, and
 gateway conflict rules, see
 [Plugin Configuration Files](/build-plugins/plugin-configuration-files).
@@ -37,32 +34,36 @@ The top-level NeMo Guardrails object contains:
 | `codec` | Managed LLM provider codec. |
 | `input` | Enables managed LLM input checks. |
 | `output` | Enables managed LLM output checks. |
-| `tool_input` | Part of the built-in plugin model for managed tool-argument checks before execution. The current stock-remote backend rejects it. |
+| `tool_input` | Enables managed tool-argument checks before execution. |
 | `tool_output` | Enables managed tool-result checks after execution. |
 | `priority` | Middleware priority for installed execution intercepts. |
 | `remote` | Remote backend settings. |
-| `local` | Local backend settings for future local mode. |
-| `request_defaults` | Default request-time Guardrails semantics passed to the backend. |
+| `local` | Local backend settings. |
+| `request_defaults` | Default request-time Guardrails semantics passed to the remote backend. |
 | `policy` | Component-local handling for unknown fields and unsupported values. |
 
 At least one managed Guardrails surface must be enabled.
 
-## Current Remote Support
+## Backend Support
 
-The current built-in remote backend supports:
+| Area | `remote` | `local` |
+|---|---|---|
+| Built-in component kind and config validation | Supported | Supported |
+| Managed LLM `input` | Supported | Supported |
+| Managed LLM `output` | Supported | Supported |
+| Managed streaming LLM execution | Supported over the remote HTTP(S) contract | Supported for managed input checks and Guardrails-native output streaming when `rails.output.streaming.enabled = true`; with `stream_first = true`, output rails can stop the stream after some chunks have already been delivered; `stream_first = false` is not supported yet |
+| Managed `tool_input` | Not supported against the stock Guardrails remote contract | Supported |
+| Managed `tool_output` | Supported | Supported |
+| `request_defaults` pass-through | Supported | Not supported |
+| Codec support | `openai_chat` | `openai_chat`, `openai_responses`, `anthropic_messages` |
+| Runtime availability | Any runtime that includes the remote backend | Python-enabled runtimes that can import `nemoguardrails` |
 
-| Area | Support |
-|---|---|
-| Built-in component kind and config validation | Supported |
-| Managed LLM `input` | Supported |
-| Managed LLM `output` | Supported |
-| Managed streaming LLM execution over the remote HTTP(S) contract | Supported |
-| Managed `tool_output` | Supported |
-| Managed `tool_input` | Not supported against the stock Guardrails remote contract |
-| `request_defaults` pass-through | Supported |
-| `local` mode | Not implemented yet |
+## Remote Mode
+
+Use `remote` mode when NeMo Relay should call a Guardrails service over
+HTTP(S).
 
-## Remote Requirements
+### Requirements
 
 To use `mode = "remote"`, the configured `remote.endpoint` must point at a
 Guardrails service that NeMo Relay can reach from the running process and that
@@ -73,7 +74,7 @@ Guardrails service still owns the actual policy content. In practice, NeMo
 Relay decides when managed checks run, while the Guardrails config decides what
 to block, allow, or rewrite.
 
-## `plugins.toml` Example
+### `plugins.toml` Example
 
 ```toml
 version = 1
@@ -108,32 +109,12 @@ unknown_field = "warn"
 unsupported_value = "error"
 ```
 
-This example configures the built-in remote backend for a Guardrails service
-that uses `codec = "openai_chat"`, managed LLM `input` and `output`, managed
+This example configures the built-in remote mode for a Guardrails service that
+uses `codec = "openai_chat"`, managed LLM `input` and `output`, managed
 `tool_output`, and request-default pass-through for backend context plus
 backend `input` and `output` rail selection.
 
-In that setup, the NeMo Relay plugin chose the managed surfaces to wrap, while
-the Guardrails config defined the actual blocking policy, such as rejecting
-secret-seeking prompts, bypass attempts, specific blocked tokens, or
-private-key-like output.
-
-For example, the Guardrails-side policy can look like this:
-
-```yaml
-rails:
-  input:
-    flows:
-      - self check input
-  output:
-    flows:
-      - self check output
-```
-
-This Guardrails-side config defines the policy logic. The NeMo Relay plugin
-config decides when those checks run.
-
-## Remote Mode Rules
+### Rules
 
 When `mode = "remote"`:
 
@@ -146,24 +127,24 @@ When `mode = "remote"`:
 
 ### Codec Boundary
 
-The current built-in remote backend supports managed LLM execution only with:
+The current built-in remote mode supports managed LLM execution only with:
 
 - `openai_chat`
 
-## Managed Tool Boundary
+### Managed Tool Boundary
 
-The current remote backend supports managed `tool_output`.
+The current remote mode supports managed `tool_output`.
 
-The current remote backend rejects managed `tool_input` explicitly because the
+The current remote mode rejects managed `tool_input` explicitly because the
 stock Guardrails remote contract does not activate pre-execution tool-call
 rails from externally submitted `/v1/chat/completions` history. NeMo Relay
 rejects `tool_input` in remote mode rather than leaving a silent
 non-enforcing path.
 
-## Request Defaults
+### Request Defaults
 
 `request_defaults` lets the built-in plugin pass request-time semantics through
-to the selected backend.
+to the selected remote backend.
 
 Supported request-default fields are:
 
@@ -201,11 +182,12 @@ The `rails` section can include:
 - `tool_output`
 - `tool_input`
 
-Those values are forwarded to the backend as request semantics. They do not
-mean NeMo Relay owns separate managed retrieval or dialog execution surfaces.
-`dialog` and `retrieval` are pass-through request options only. Likewise,
-`request_defaults.rails.tool_input` is only a backend pass-through selector. It
-does not make managed remote `tool_input` supported in the stock-remote lane.
+Those values are forwarded to the remote backend as request semantics. They do
+not mean NeMo Relay owns separate managed retrieval or dialog execution
+surfaces. `dialog` and `retrieval` are pass-through request options only.
+Likewise, `request_defaults.rails.tool_input` is only a backend pass-through
+selector. It does not make managed remote `tool_input` supported in the
+stock-remote lane.
 
 For more targeted request-time pass-through, the remote backend also forwards
 selectors like these:
@@ -219,11 +201,7 @@ dialog = true
 tool_output = ["validate_tool_output"]
 ```
 
-This richer selector shape demonstrates how request-time Guardrails semantics
-can be forwarded even when NeMo Relay does not own a separate native managed
-surface for that category.
-
-## Observability
+### Observability
 
 The current remote backend emits coarse backend-level marks for remote
 Guardrails activity:
@@ -232,4 +210,111 @@ Guardrails activity:
 - `nemo_guardrails.remote.end`
 - `nemo_guardrails.remote.error`
 
-These marks cover managed LLM remote execution and managed tool-result checks.
+## Local Mode
+
+Use `local` mode when NeMo Relay should call `nemoguardrails` in process
+through the Python runtime instead of a separate Guardrails service.
+
+### Requirements
+
+To use `mode = "local"`, the running Python environment must be able to import
+`nemoguardrails`.
+
+The built-in local backend is installed by the Python binding and runs
+Guardrails in process. Use it when the runtime has direct access to the Python
+Guardrails dependency and configuration files rather than a separate Guardrails
+service.
+
+The same ownership boundary still applies:
+
+- NeMo Relay decides when managed checks run.
+- Guardrails-native config still decides what to block, allow, or rewrite.
+
+### `plugins.toml` Example
+
+```toml
+version = 1
+
+[[components]]
+kind = "nemo_guardrails"
+enabled = true
+
+[components.config]
+version = 1
+mode = "local"
+codec = "openai_chat"
+input = true
+output = true
+tool_input = true
+tool_output = true
+config_path = "./rails"
+
+[components.config.policy]
+unknown_component = "warn"
+unknown_field = "warn"
+unsupported_value = "error"
+```
+
+This example configures the built-in local mode for a Python-enabled runtime
+that can import `nemoguardrails` and read a native Guardrails config directory
+from `./rails`.
+
+For example, the Guardrails-side policy can look like this:
+
+```yaml
+rails:
+  input:
+    flows:
+      - self check input
+  output:
+    flows:
+      - self check output
+```
+
+This Guardrails-side config defines the policy logic. The NeMo Relay plugin
+config decides when those checks run.
+
+### Rules
+
+When `mode = "local"`:
+
+- Exactly one of `config_path` or `config_yaml` is required.
+- `colang_content` can only be used with `config_yaml`.
+- `remote` settings cannot be present.
+- `request_defaults` is rejected.
+- `local.python_module` is optional and only needed when the runtime should
+  import the Guardrails dependency under a custom importable module name
+  instead of the default `nemoguardrails` package.
+
+### Codec Boundary
+
+The current built-in local mode supports managed LLM execution with:
+
+- `openai_chat`
+- `openai_responses`
+- `anthropic_messages`
+
+### Managed Tool Boundary
+
+The current local mode supports both:
+
+- managed `tool_input`
+- managed `tool_output`
+
+### Streaming Boundary
+
+The current local mode supports streaming LLM input checks before the stream
+callback runs.
+
+When output rails are configured, the current local mode uses Guardrails-native
+streaming output rails instead of buffering the full provider stream. That
+requires `rails.output.streaming.enabled = true` in the Guardrails config.
+
+The current local mode supports the `stream_first = true` streaming semantics:
+provider chunks can still flow to the caller while Guardrails evaluates the
+stream in parallel. If Guardrails later blocks the stream, the call fails at
+that point even though some chunks may already have been delivered.
+
+The current local mode does not support `rails.output.streaming.stream_first = false`
+yet, because that would require converting guarded text chunks back into valid
+provider-shaped stream chunks.
diff --git a/python/nemo_relay/_guardrails_local.py b/python/nemo_relay/_guardrails_local.py
new file mode 100644
index 00000000..5f30eb49
--- /dev/null
+++ b/python/nemo_relay/_guardrails_local.py
@@ -0,0 +1,589 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Internal helpers for the built-in NeMo Guardrails local backend."""
+
+from __future__ import annotations
+
+import asyncio
+import importlib
+import json
+from collections.abc import Callable
+from typing import Any, Protocol, cast
+
+from nemo_relay import Json, LLMRequest
+from nemo_relay.codecs import (
+    AnthropicMessagesCodec,
+    LlmCodec,
+    LlmResponseCodec,
+    OpenAIChatCodec,
+    OpenAIResponsesCodec,
+)
+from nemo_relay.plugin import PluginContext
+
+_DEFAULT_PRIORITY = 100
+
+
+class NeMoGuardrailsDependencyError(RuntimeError):
+    """Raised when ``nemoguardrails`` or a module it depends on cannot be imported."""
+
+
+class NeMoGuardrailsViolation(RuntimeError):
+    """Raised when NeMo Guardrails blocks or cannot safely apply a rail result."""
+
+    def __init__(
+        self,
+        message: str,
+        *,
+        rail_type: str,
+        rail: str | None = None,
+        content: str | None = None,
+    ) -> None:
+        super().__init__(message)
+        self.rail_type = rail_type  # surface label set by the raiser, e.g. "input", "output" or "tool_<field>"
+        self.rail = rail  # rail name, when the raiser supplies one
+        self.content = content  # content string attached by the raiser, if any
+
+
+class _GuardrailsCodec(LlmCodec, LlmResponseCodec, Protocol):
+    """Codec shape required by the local backend: both ``LlmCodec`` and ``LlmResponseCodec``."""
+
+
+_CODECS: dict[str, Callable[[], _GuardrailsCodec]] = {  # codec name -> zero-argument codec factory
+    "openai_chat": OpenAIChatCodec,
+    "openai_responses": OpenAIResponsesCodec,
+    "anthropic_messages": AnthropicMessagesCodec,
+}
+
+
+def _load_nemoguardrails(module_name: str | None):
+    """Import Guardrails and return ``(RailsConfig, LLMRails, RailType, RailStatus)``.
+
+    ``module_name`` overrides the ``nemoguardrails`` root; ``ImportError`` becomes ``NeMoGuardrailsDependencyError``.
+    """
+    package = module_name or "nemoguardrails"
+    try:
+        root: Any = importlib.import_module(package)
+        llm_options: Any = importlib.import_module(f"{package}.rails.llm.options")
+    except ImportError as error:
+        if error.name != package:
+            message = (
+                "NeMo Guardrails local backend could not import a required dependency: "
+                f"{error.name or error}. Install the full NeMo Guardrails runtime dependencies."
+            )
+        else:
+            message = (
+                "NeMo Guardrails is required for the built-in NeMo Guardrails local backend. "
+                "Install it with: pip install nemoguardrails"
+            )
+        raise NeMoGuardrailsDependencyError(message) from error
+    return root.RailsConfig, root.LLMRails, llm_options.RailType, llm_options.RailStatus
+
+
+def _status_value(status: Any) -> str:  # lower-cased text of status.value if present, else of status
+    return str(getattr(status, "value", status)).lower()
+
+
+def _messages_from_annotated(annotated: Any) -> list[dict[str, Any]]:  # shallow dict copy of each annotated message
+    return [dict(message) for message in annotated.messages]
+
+
+async def _apply_input_rails(
+    rails: Any,
+    rail_type: Any,
+    rail_status: Any,
+    codec: _GuardrailsCodec,
+    request: LLMRequest,
+) -> tuple[LLMRequest, list[dict[str, Any]]]:
+    """Run input rails on ``request`` and return the request to send plus its decoded messages.
+
+    A blocked result goes to ``_raise_blocked``; a modified result overwrites the last ``user`` message.
+    """
+    decoded = codec.decode(request)
+    history = _messages_from_annotated(decoded)
+    verdict = await rails.check_async(history, rail_types=[rail_type.INPUT])
+    status = _status_value(verdict.status)
+    if status == _status_value(rail_status.BLOCKED):
+        _raise_blocked(verdict, "input")
+    if status != _status_value(rail_status.MODIFIED):
+        return request, history
+    rewritten = getattr(verdict, "content", "")
+    decoded.messages = _replace_last_role_content(history, "user", "" if rewritten is None else str(rewritten))
+    request = codec.encode(decoded, request)
+    return request, _messages_from_annotated(decoded)
+
+
+def _replace_last_role_content(messages: list[dict[str, Any]], role: str, content: str) -> list[dict[str, Any]]:
+    """Copy ``messages`` and overwrite the content of the last message whose role is ``role``."""
+    copies = [dict(entry) for entry in messages]
+    target = next((entry for entry in reversed(copies) if entry.get("role") == role), None)
+    if target is not None:
+        target["content"] = content
+        return copies
+    # No message carries the requested role, so the rewritten content has nowhere to go.
+    surface = "input" if role == "user" else "output"
+    message = f"NeMo Guardrails returned modified {role} content but no {role} message was present."
+    raise NeMoGuardrailsViolation(message, rail_type=surface, content=content)
+
+
+def _tool_input_content(name: str, args: Json) -> str:
+    """Serialize a pending tool call as compact, key-sorted JSON.
+
+    The object holds the tool name under ``tool_name`` and the call
+    arguments under ``arguments``.
+    """
+    payload = {"tool_name": name, "arguments": args}
+    compact = (",", ":")
+    return json.dumps(payload, sort_keys=True, separators=compact)
+
+
+def _tool_output_content(name: str, args: Json, result: Json) -> str:
+    """Serialize a completed tool call as compact, key-sorted JSON.
+
+    The object holds three keys: ``tool_name``, the call ``arguments`` and
+    the ``result`` value. Keys are sorted and no whitespace is emitted
+    around the separators.
+    """
+    payload = {"tool_name": name, "arguments": args, "result": result}
+    compact = (",", ":")
+    return json.dumps(payload, sort_keys=True, separators=compact)
+
+
+def _modified_tool_payload(content: str, field: str) -> Json:
+    """Parse rewritten tool content as JSON and return the value stored under ``field``.
+
+    Raises ``NeMoGuardrailsViolation`` when the content is not valid JSON or is not a
+    JSON object that contains ``field``.
+    """
+    surface = f"tool_{field}"
+    try:
+        parsed = json.loads(content)
+    except json.JSONDecodeError as error:
+        reason = f"NeMo Guardrails returned modified tool {field} content that is not valid JSON."
+        raise NeMoGuardrailsViolation(reason, rail_type=surface, content=content) from error
+
+    if isinstance(parsed, dict) and field in parsed:
+        return cast(Json, parsed[field])
+    reason = f"NeMo Guardrails returned modified tool {field} content without a '{field}' field."
+    raise NeMoGuardrailsViolation(reason, rail_type=surface, content=content)
+
+
+def _raise_modified_output_not_supported(result: Any) -> None:
+    """Always raise: the local backend cannot apply output content that a rail rewrote."""
+    body, rail = getattr(result, "content", ""), getattr(result, "rail", None)
+    message = (
+        "NeMo Guardrails output rail returned modified content, but the local backend "
+        "does not rewrite provider responses yet."
+    )
+    rail_name = None if rail is None else str(rail)
+    body = "" if body is None else str(body)
+    raise NeMoGuardrailsViolation(message, rail_type="output", rail=rail_name, content=body)
+
+
+async def _check_output_rails(
+    rails: Any,
+    rail_type: Any,
+    rail_status: Any,
+    messages: list[dict[str, Any]],
+    response_text: str | None,
+) -> None:
+    """Run output rails over ``messages`` plus the assistant ``response_text``; ``None`` text skips the check."""
+    if response_text is None:
+        return
+    transcript = [*messages, {"role": "assistant", "content": response_text}]
+    verdict = await rails.check_async(transcript, rail_types=[rail_type.OUTPUT])
+    status = _status_value(verdict.status)
+    if status == _status_value(rail_status.BLOCKED):
+        _raise_blocked(verdict, "output")
+    if status == _status_value(rail_status.MODIFIED):
+        _raise_modified_output_not_supported(verdict)
+
+
+def _has_streaming_output_rails(rails: Any) -> bool:  # True when the output rails config has a non-empty "flows"
+    return bool(getattr(rails.config.rails.output, "flows", []))
+
+
+def _output_streaming_config(rails: Any) -> Any | None:  # output rails' "streaming" attribute, or None if absent
+    return getattr(rails.config.rails.output, "streaming", None)
+
+
+def _guardrails_streaming_enabled(rails: Any) -> bool:  # streaming config present with a truthy "enabled"
+    streaming = _output_streaming_config(rails)
+    return bool(streaming is not None and getattr(streaming, "enabled", False))
+
+
+def _extract_stream_text(codec_name: str, chunk: Json) -> str | None:
+    if not isinstance(chunk, dict):
+        return None
+
+    if codec_name == "openai_chat":
+        choices = chunk.get("choices")
+        if not isinstance(choices, list):
+            return None
+        parts: list[str] = []
+        for choice in choices:
+            if not isinstance(choice, dict):
+                continue
+            delta = choice.get("delta")
+            if not isinstance(delta, dict):
+                continue
+            content = delta.get("content")
+            if isinstance(content, str) and content:
+                parts.append(content)
+        return "".join(parts) if parts else None
+
+    if codec_name == "openai_responses":
+        if chunk.get("type") == "response.output_text.delta":
+            delta = chunk.get("delta")
+            return delta if isinstance(delta, str) and delta else None
+        return None
+
+    if codec_name == "anthropic_messages":
+        if chunk.get("type") != "content_block_delta":
+            return None
+        delta = chunk.get("delta")
+        if not isinstance(delta, dict):
+            return None
+        if delta.get("type") != "text_delta":
+            return None
+        text = delta.get("text")
+        return text if isinstance(text, str) and text else None
+
+    return None
+
+
+def _guardrails_stream_error_message(chunk: str) -> str | None:
+    try:
+        payload = json.loads(chunk)
+    except json.JSONDecodeError:
+        return None
+    if not isinstance(payload, dict):
+        return None
+    error = payload.get("error")
+    if not isinstance(error, dict):
+        return None
+    if error.get("type") != "guardrails_violation":
+        return None
+    message = error.get("message")
+    return message if isinstance(message, str) and message else "Blocked by output rails."
+
+
+async def _queue_string_stream(queue: "asyncio.Queue[str | None]"):
+    while True:
+        item = await queue.get()
+        if item is None:
+            return
+        yield item
+
+
+async def _monitor_streaming_output_rails(
+    *,
+    rails: Any,
+    messages: list[dict[str, Any]],
+    text_queue: "asyncio.Queue[str | None]",
+    blocked: dict[str, str | None],
+) -> None:
+    guarded_stream = rails.stream_async(
+        messages=messages,
+        generator=_queue_string_stream(text_queue),
+        include_metadata=False,
+    )
+    async for chunk in guarded_stream:
+        if isinstance(chunk, str):
+            message = _guardrails_stream_error_message(chunk)
+            if message is not None:
+                blocked["message"] = message
+                return
+
+
+def _raise_streaming_output_blocked(blocked_message: str) -> None:
+    raise NeMoGuardrailsViolation(
+        f"NeMo Guardrails output rail blocked the LLM call: {blocked_message}",
+        rail_type="output",
+        content=blocked_message,
+    )
+
+
+def _build_guardrails_config(config: dict[str, Any], rails_config_cls: Any) -> Any:
+    if config.get("config_path") is not None:
+        return rails_config_cls.from_path(cast(str, config["config_path"]))
+    return rails_config_cls.from_content(
+        colang_content=cast(str | None, config.get("colang_content")),
+        yaml_content=cast(str, config["config_yaml"]),
+    )
+
+
+def _resolve_codec(config: dict[str, Any]) -> tuple[str, _GuardrailsCodec]:
+    codec_name = cast(str | None, config.get("codec"))
+    if codec_name is None or codec_name not in _CODECS:
+        raise RuntimeError("local NeMo Guardrails backend requires a supported codec")
+    return codec_name, _CODECS[codec_name]()
+
+
+async def _check_tool_input(
+    rails: Any,
+    rail_type: Any,
+    rail_status: Any,
+    tool_name: str,
+    args: Json,
+) -> Json:
+    input_result = await rails.check_async(
+        [{"role": "user", "content": _tool_input_content(tool_name, args)}],
+        rail_types=[rail_type.INPUT],
+    )
+    input_status = _status_value(input_result.status)
+    if input_status == _status_value(rail_status.BLOCKED):
+        _raise_blocked(input_result, "tool_input")
+    if input_status == _status_value(rail_status.MODIFIED):
+        input_content = getattr(input_result, "content", "")
+        return _modified_tool_payload(
+            "" if input_content is None else str(input_content),
+            "arguments",
+        )
+    return args
+
+
+async def _check_tool_output(
+    rails: Any,
+    rail_type: Any,
+    rail_status: Any,
+    tool_name: str,
+    args: Json,
+    result: Json,
+) -> Json:
+    output_result = await rails.check_async(
+        [
+            {"role": "user", "content": _tool_input_content(tool_name, args)},
+            {
+                "role": "assistant",
+                "content": _tool_output_content(tool_name, args, result),
+            },
+        ],
+        rail_types=[rail_type.OUTPUT],
+    )
+    output_status = _status_value(output_result.status)
+    if output_status == _status_value(rail_status.BLOCKED):
+        _raise_blocked(output_result, "tool_output")
+    if output_status == _status_value(rail_status.MODIFIED):
+        output_content = getattr(output_result, "content", "")
+        return _modified_tool_payload(
+            "" if output_content is None else str(output_content),
+            "result",
+        )
+    return result
+
+
+def _make_llm_intercept(
+    *,
+    rails: Any,
+    rail_type: Any,
+    rail_status: Any,
+    codec: _GuardrailsCodec,
+    enable_input: bool,
+    enable_output: bool,
+):
+    async def intercept(_name: str, request: LLMRequest, next_call):
+        current_request = request
+        messages = _messages_from_annotated(codec.decode(current_request))
+
+        if enable_input:
+            current_request, messages = await _apply_input_rails(
+                rails,
+                rail_type,
+                rail_status,
+                codec,
+                current_request,
+            )
+
+        response = await next_call(current_request)
+        if not enable_output:
+            return response
+
+        annotated_response = codec.decode_response(response)
+        await _check_output_rails(
+            rails,
+            rail_type,
+            rail_status,
+            messages,
+            annotated_response.response_text(),
+        )
+        return response
+
+    return intercept
+
+
+def _make_llm_stream_intercept(
+    *,
+    rails: Any,
+    rail_type: Any,
+    rail_status: Any,
+    codec_name: str,
+    codec: _GuardrailsCodec,
+    enable_input: bool,
+    enable_output: bool,
+):
+    async def stream_intercept(request: LLMRequest, next_call):
+        current_request = request
+        messages = _messages_from_annotated(codec.decode(current_request))
+        if enable_input:
+            current_request, messages = await _apply_input_rails(
+                rails,
+                rail_type,
+                rail_status,
+                codec,
+                current_request,
+            )
+
+        stream = await next_call(current_request)
+        if not enable_output:
+            return stream
+        if not _has_streaming_output_rails(rails):
+            return stream
+        if not _guardrails_streaming_enabled(rails):
+            raise RuntimeError(
+                "local NeMo Guardrails streaming output rails require "
+                "rails.output.streaming.enabled = true in the Guardrails config."
+            )
+
+        streaming_config = _output_streaming_config(rails)
+        if streaming_config is None or not getattr(streaming_config, "stream_first", True):
+            raise RuntimeError(
+                "local NeMo Guardrails streaming output rails currently require "
+                "rails.output.streaming.stream_first = true."
+            )
+
+        text_queue: asyncio.Queue[str | None] = asyncio.Queue()
+        blocked: dict[str, str | None] = {"message": None}
+        monitor = asyncio.create_task(
+            _monitor_streaming_output_rails(
+                rails=rails,
+                messages=messages,
+                text_queue=text_queue,
+                blocked=blocked,
+            )
+        )
+
+        async def guarded_provider_stream():
+            try:
+                async for chunk in stream:
+                    if blocked["message"] is not None:
+                        _raise_streaming_output_blocked(blocked["message"])
+
+                    text = _extract_stream_text(codec_name, chunk)
+                    if text is not None:
+                        await text_queue.put(text)
+
+                    yield chunk
+
+                    if blocked["message"] is not None:
+                        _raise_streaming_output_blocked(blocked["message"])
+            finally:
+                await text_queue.put(None)
+                await monitor
+                if blocked["message"] is not None:
+                    _raise_streaming_output_blocked(blocked["message"])
+
+        return guarded_provider_stream()
+
+    return stream_intercept
+
+
+def _make_tool_intercept(
+    *,
+    rails: Any,
+    rail_type: Any,
+    rail_status: Any,
+    enable_tool_input: bool,
+    enable_tool_output: bool,
+):
+    async def tool_intercept(tool_name: str, args: Json, next_call):
+        current_args = args
+
+        if enable_tool_input:
+            current_args = await _check_tool_input(
+                rails,
+                rail_type,
+                rail_status,
+                tool_name,
+                current_args,
+            )
+
+        tool_result = await next_call(current_args)
+        if not enable_tool_output:
+            return tool_result
+
+        return await _check_tool_output(
+            rails,
+            rail_type,
+            rail_status,
+            tool_name,
+            current_args,
+            tool_result,
+        )
+
+    return tool_intercept
+
+
+def _raise_blocked(result: Any, rail_type: str) -> None:
+    rail_value = getattr(result, "rail", None)
+    rail = None if rail_value is None else str(rail_value)
+    content = getattr(result, "content", "")
+    detail = f" by rail '{rail}'" if rail else ""
+    subject = "LLM call" if rail_type in {"input", "output"} else "tool call"
+    raise NeMoGuardrailsViolation(
+        f"NeMo Guardrails {rail_type} rail blocked the {subject}{detail}.",
+        rail_type=rail_type,
+        rail=rail,
+        content="" if content is None else str(content),
+    )
+
+
+def register_local_backend(config: dict[str, Any], context: PluginContext) -> None:
+    """Build the local Guardrails runtime and register the enabled intercepts on ``context``."""
+
+    local = cast(dict[str, Any], config.get("local") or {})
+    module_name = cast(str | None, local.get("python_module"))
+    RailsConfig, LLMRails, RailType, RailStatus = _load_nemoguardrails(module_name)
+    guardrails_config = _build_guardrails_config(config, RailsConfig)
+    rails = LLMRails(guardrails_config)
+    enable_input = bool(config.get("input", True))
+    enable_output = bool(config.get("output", True))
+    enable_tool_input = bool(config.get("tool_input", False))
+    enable_tool_output = bool(config.get("tool_output", False))
+    priority = int(config.get("priority", _DEFAULT_PRIORITY))
+
+    if enable_input or enable_output:
+        codec_name, codec = _resolve_codec(config)
+        intercept = _make_llm_intercept(
+            rails=rails,
+            rail_type=RailType,
+            rail_status=RailStatus,
+            codec=codec,
+            enable_input=enable_input,
+            enable_output=enable_output,
+        )
+        stream_intercept = _make_llm_stream_intercept(
+            rails=rails,
+            rail_type=RailType,
+            rail_status=RailStatus,
+            codec_name=codec_name,
+            codec=codec,
+            enable_input=enable_input,
+            enable_output=enable_output,
+        )
+        context.register_llm_execution_intercept("nemo_guardrails_local", priority, intercept)
+        context.register_llm_stream_execution_intercept(
+            "nemo_guardrails_local_stream",
+            priority,
+            stream_intercept,
+        )
+
+    if enable_tool_input or enable_tool_output:
+        tool_intercept = _make_tool_intercept(
+            rails=rails,
+            rail_type=RailType,
+            rail_status=RailStatus,
+            enable_tool_input=enable_tool_input,
+            enable_tool_output=enable_tool_output,
+        )
+        context.register_tool_execution_intercept("nemo_guardrails_local", priority, tool_intercept)

From ec49259df6bc42e2c70c344c4741c4c322693171 Mon Sep 17 00:00:00 2001
From: Alex Fournier <afournier@nvidia.com>
Date: Mon, 1 Jun 2026 06:48:08 -0700
Subject: [PATCH 2/8] docs: refine local guardrails mode docs

Signed-off-by: Alex Fournier <afournier@nvidia.com>
---
 docs/nemo-guardrails-plugin/about.mdx         | 27 +++++++------
 docs/nemo-guardrails-plugin/configuration.mdx | 39 +++++++++++++++----
 2 files changed, 45 insertions(+), 21 deletions(-)

diff --git a/docs/nemo-guardrails-plugin/about.mdx b/docs/nemo-guardrails-plugin/about.mdx
index 5c0cd2f0..d346fcb0 100644
--- a/docs/nemo-guardrails-plugin/about.mdx
+++ b/docs/nemo-guardrails-plugin/about.mdx
@@ -37,19 +37,18 @@ Start here when you need to:
 
 ## Current Scope
 
-The built-in plugin currently exposes two user-facing modes with
-different boundaries.
-
-| Area | `remote` | `local` |
-|---|---|---|
-| Managed non-streaming LLM `input` | Supported | Supported |
-| Managed non-streaming LLM `output` | Supported | Supported |
-| Managed streaming LLM execution | Supported over the remote HTTP(S) contract | Supported for managed input checks and Guardrails-native output streaming when `rails.output.streaming.enabled = true`; with `stream_first = true`, output rails can stop the stream after some chunks have already been delivered; `stream_first = false` is not supported yet |
-| Managed `tool_input` | Not supported against the stock Guardrails remote contract | Supported |
-| Managed `tool_output` | Supported | Supported |
-| `request_defaults` | Supported as backend pass-through request semantics | Not supported |
-| Codec support | `openai_chat` | `openai_chat`, `openai_responses`, `anthropic_messages` |
-| Runtime availability | Any runtime that includes the remote backend | Python-enabled runtimes that can import `nemoguardrails` |
+The built-in plugin currently exposes two user-facing modes:
+
+- `remote` for Guardrails-service integration over HTTP(S)
+- `local` for in-process `nemoguardrails` integration through the Python runtime
+
+Both modes support managed LLM `input` and `output`. The current mode-specific
+differences are:
+
+- `remote` supports `request_defaults` pass-through but does not support managed
+  `tool_input`
+- `local` supports managed `tool_input` and broader LLM codec coverage, but it
+  does not support `request_defaults`
 
 The `local` backend is a Python-backed runtime feature, not a universal
 cross-binding backend. Runtimes that do not install the local backend provider
@@ -119,5 +118,5 @@ separate managed middleware surfaces in NeMo Relay.
 ## Pages
 
 - [NeMo Guardrails Configuration](/nemo-guardrails-plugin/configuration)
-  documents the built-in component shape, mode boundaries, and current
+  documents the built-in component shape, mode boundaries, and the detailed
   support matrix.
diff --git a/docs/nemo-guardrails-plugin/configuration.mdx b/docs/nemo-guardrails-plugin/configuration.mdx
index b1554e1c..24cc12c6 100644
--- a/docs/nemo-guardrails-plugin/configuration.mdx
+++ b/docs/nemo-guardrails-plugin/configuration.mdx
@@ -61,7 +61,9 @@ At least one managed Guardrails surface must be enabled.
 ## Remote Mode
 
 Use `remote` mode when NeMo Relay should call a Guardrails service over
-HTTP(S).
+HTTP(S), especially when Guardrails must be shared across runtimes, used from
+non-Python environments, or deployed independently from the application
+process.
 
 ### Requirements
 
@@ -76,6 +78,11 @@ to block, allow, or rewrite.
 
 ### `plugins.toml` Example
 
+You can write this config directly in `plugins.toml`, or create and edit it
+through the CLI with `nemo-relay plugins edit`. For plugin file discovery,
+precedence, merge behavior, and editor controls, see
+[Plugin Configuration Files](/build-plugins/plugin-configuration-files).
+
 ```toml
 version = 1
 
@@ -232,6 +239,11 @@ The same ownership boundary still applies:
 
 ### `plugins.toml` Example
 
+You can write this config directly in `plugins.toml`, or create and edit it
+through the CLI with `nemo-relay plugins edit`. For plugin file discovery,
+precedence, merge behavior, and editor controls, see
+[Plugin Configuration Files](/build-plugins/plugin-configuration-files).
+
 ```toml
 version = 1
 
@@ -310,11 +322,24 @@ When output rails are configured, the current local mode uses Guardrails-native
 streaming output rails instead of buffering the full provider stream. That
 requires `rails.output.streaming.enabled = true` in the Guardrails config.
 
-The current local mode supports the `stream_first = true` streaming semantics:
-provider chunks can still flow to the caller while Guardrails evaluates the
-stream in parallel. If Guardrails later blocks the stream, the call fails at
-that point even though some chunks may already have been delivered.
+Guardrails selects the streaming-output ordering with
+`rails.output.streaming.stream_first`.
+
+When `stream_first = true`, the current local mode uses pass-through-first
+streaming semantics:
+
+- provider chunks can flow to the caller immediately
+- Guardrails evaluates the streamed text in parallel
+- if Guardrails later blocks the stream, the call fails at that point even
+  though some chunks may already have been delivered
 
 The current local mode does not support `rails.output.streaming.stream_first = false`
-yet, because that would require converting guarded text chunks back into valid
-provider-shaped stream chunks.
+yet. That mode would be Guardrails-first streaming semantics:
+
+- Guardrails would need to evaluate streamed text before chunks are released to
+  the caller
+- the local backend would then need to convert Guardrails-approved text back
+  into valid provider-shaped stream chunks
+
+That guarded-text-to-provider-chunk adapter does not exist yet in the current
+local backend.

From 98d49155906b6ccab70710c854faa22e719660b0 Mon Sep 17 00:00:00 2001
From: Alex Fournier <afournier@nvidia.com>
Date: Mon, 1 Jun 2026 07:03:26 -0700
Subject: [PATCH 3/8] test: factor local guardrails coverage fixtures

Signed-off-by: Alex Fournier <afournier@nvidia.com>
---
 .../python/tests/coverage/coverage_tests.rs   | 256 ++++++++----------
 1 file changed, 112 insertions(+), 144 deletions(-)

diff --git a/crates/python/tests/coverage/coverage_tests.rs b/crates/python/tests/coverage/coverage_tests.rs
index 3e553341..90b792f8 100644
--- a/crates/python/tests/coverage/coverage_tests.rs
+++ b/crates/python/tests/coverage/coverage_tests.rs
@@ -41,6 +41,80 @@ fn load_module<'py>(py: Python<'py>, code: &str) -> Bound<'py, PyModule> {
     PyModule::from_code(py, &code, &file_name, &module_name).unwrap()
 }
 
+fn python_package_dir() -> PathBuf {
+    PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../../python") // two levels above this crate
+}
+
+fn fake_guardrails_module_prelude(module_name: &str, python_dir: &str) -> String {
+    format!(
+        r#"
+import sys
+import types
+
+sys.path.insert(0, {python_dir:?})
+
+MODULE_NAME = {module_name:?}
+
+fake_root = types.ModuleType(MODULE_NAME)
+fake_options = types.ModuleType(MODULE_NAME + ".rails.llm.options")
+
+class Result:
+    def __init__(self, status, content=None, rail=None):
+        self.status = status
+        self.content = content
+        self.rail = rail
+
+class RailType:
+    INPUT = "input"
+    OUTPUT = "output"
+
+class RailStatus:
+    BLOCKED = "blocked"
+    MODIFIED = "modified"
+    PASSED = "passed"
+
+class RailsConfig:
+    @staticmethod
+    def from_content(*, colang_content=None, yaml_content=None):
+        return {{"yaml": yaml_content, "colang": colang_content}}
+
+    @staticmethod
+    def from_path(path):
+        return {{"path": path}}
+"#,
+        python_dir = python_dir, // `{python_dir:?}` emits it Debug-quoted into the script
+        module_name = module_name, // `{module_name:?}` likewise becomes MODULE_NAME
+    )
+}
+
+fn register_fake_guardrails_module_epilogue() -> &'static str {
+    r#"
+fake_root.RailsConfig = RailsConfig
+fake_root.LLMRails = LLMRails
+fake_options.RailType = RailType
+fake_options.RailStatus = RailStatus
+
+sys.modules[MODULE_NAME] = fake_root
+sys.modules[MODULE_NAME + ".rails"] = types.ModuleType(MODULE_NAME + ".rails")
+sys.modules[MODULE_NAME + ".rails.llm"] = types.ModuleType(MODULE_NAME + ".rails.llm")
+sys.modules[MODULE_NAME + ".rails.llm.options"] = fake_options
+"# // Python snippet: expects `LLMRails` to be defined by the embedding script
+}
+
+fn local_plugin_context_python() -> &'static str {
+    r#"
+class Context:
+    def register_llm_execution_intercept(self, name, priority, callback):
+        self.llm = callback
+
+    def register_llm_stream_execution_intercept(self, name, priority, callback):
+        self.stream = callback
+
+    def register_tool_execution_intercept(self, name, priority, callback):
+        self.tool = callback
+"# // Python stub context: keeps only the callback from each register_* call
+}
+
 fn make_request() -> LlmRequest {
     LlmRequest {
         headers: serde_json::Map::from_iter([("x-trace".into(), json!("1"))]),
@@ -153,45 +227,24 @@ fn test_native_pymodule_entrypoint_installs_nemo_guardrails_local_provider() {
 fn test_guardrails_local_helper_registers_and_enforces_llm_and_tool_checks() {
     let _python = crate::test_support::init_python_test();
     Python::attach(|py| {
-        let python_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../../python");
+        let native_module = PyModule::new(py, "_native_guardrails_helper").unwrap();
+        crate::_native(&native_module).unwrap();
+        let sys = py.import("sys").unwrap();
+        let modules = sys.getattr("modules").unwrap();
+        modules
+            .set_item("nemo_relay._native", native_module.clone())
+            .unwrap();
+
+        let python_dir = python_package_dir();
+        let prelude =
+            fake_guardrails_module_prelude("fake_guardrails_local_helper", &python_dir.display().to_string());
+        let epilogue = register_fake_guardrails_module_epilogue();
+        let context_class = local_plugin_context_python();
         let module = load_module(
             py,
             &format!(
                 r#"
-import pathlib
-import sys
-import types
-
-sys.path.insert(0, {python_dir:?})
-
-MODULE_NAME = "fake_guardrails_local_helper"
-
-fake_root = types.ModuleType(MODULE_NAME)
-fake_options = types.ModuleType(MODULE_NAME + ".rails.llm.options")
-
-class Result:
-    def __init__(self, status, content=None, rail=None):
-        self.status = status
-        self.content = content
-        self.rail = rail
-
-class RailType:
-    INPUT = "input"
-    OUTPUT = "output"
-
-class RailStatus:
-    BLOCKED = "blocked"
-    MODIFIED = "modified"
-    PASSED = "passed"
-
-class RailsConfig:
-    @staticmethod
-    def from_content(*, colang_content=None, yaml_content=None):
-        return {{"yaml": yaml_content, "colang": colang_content}}
-
-    @staticmethod
-    def from_path(path):
-        return {{"path": path}}
+{prelude}
 
 check_results = []
 check_calls = []
@@ -204,32 +257,15 @@ class LLMRails:
         check_calls.append((messages, rail_types))
         return check_results.pop(0)
 
-fake_root.RailsConfig = RailsConfig
-fake_root.LLMRails = LLMRails
-fake_options.RailType = RailType
-fake_options.RailStatus = RailStatus
-
-sys.modules[MODULE_NAME] = fake_root
-sys.modules[MODULE_NAME + ".rails"] = types.ModuleType(MODULE_NAME + ".rails")
-sys.modules[MODULE_NAME + ".rails.llm"] = types.ModuleType(MODULE_NAME + ".rails.llm")
-sys.modules[MODULE_NAME + ".rails.llm.options"] = fake_options
+{epilogue}
 
 from nemo_relay._native import LLMRequest
 from nemo_relay._guardrails_local import register_local_backend
 
-class Context:
-    def register_llm_execution_intercept(self, name, priority, callback):
-        self.llm = callback
-
-    def register_llm_stream_execution_intercept(self, name, priority, callback):
-        self.stream = callback
-
-    def register_tool_execution_intercept(self, name, priority, callback):
-        self.tool = callback
+{context_class}
 
 async def run_case():
     ctx = Context()
-    event_log = []
     register_local_backend(
         {{
             "mode": "local",
@@ -291,7 +327,9 @@ async def run_case():
         "check_calls": check_calls,
     }}
 "#,
-                python_dir = python_dir.display(),
+                prelude = prelude,
+                epilogue = epilogue,
+                context_class = context_class,
             ),
         );
 
@@ -332,40 +370,16 @@ fn test_guardrails_local_helper_enforces_streamed_output_rails() {
             .set_item("nemo_relay._native", native_module.clone())
             .unwrap();
 
-        let python_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../../python");
+        let python_dir = python_package_dir();
+        let prelude =
+            fake_guardrails_module_prelude("fake_guardrails_streaming", &python_dir.display().to_string());
+        let epilogue = register_fake_guardrails_module_epilogue();
+        let context_class = local_plugin_context_python();
         let module = load_module(
             py,
             &format!(
                 r#"
-import sys
-import types
-
-sys.path.insert(0, {python_dir:?})
-
-MODULE_NAME = "fake_guardrails_streaming"
-
-fake_root = types.ModuleType(MODULE_NAME)
-fake_options = types.ModuleType(MODULE_NAME + ".rails.llm.options")
-
-class Result:
-    def __init__(self, status, content=None, rail=None):
-        self.status = status
-        self.content = content
-        self.rail = rail
-
-class RailType:
-    INPUT = "input"
-    OUTPUT = "output"
-
-class RailStatus:
-    BLOCKED = "blocked"
-    MODIFIED = "modified"
-    PASSED = "passed"
-
-class RailsConfig:
-    @staticmethod
-    def from_content(*, colang_content=None, yaml_content=None):
-        return {{"yaml": yaml_content}}
+{prelude}
 
 stream_results = []
 event_log = []
@@ -395,28 +409,12 @@ class LLMRails:
                 yield '{{"error": {{"message": "Blocked by output rails: output-policy", "type": "guardrails_violation"}}}}'
         return _run()
 
-fake_root.RailsConfig = RailsConfig
-fake_root.LLMRails = LLMRails
-fake_options.RailType = RailType
-fake_options.RailStatus = RailStatus
-
-sys.modules[MODULE_NAME] = fake_root
-sys.modules[MODULE_NAME + ".rails"] = types.ModuleType(MODULE_NAME + ".rails")
-sys.modules[MODULE_NAME + ".rails.llm"] = types.ModuleType(MODULE_NAME + ".rails.llm")
-sys.modules[MODULE_NAME + ".rails.llm.options"] = fake_options
+{epilogue}
 
 from nemo_relay._native import LLMRequest
 from nemo_relay._guardrails_local import register_local_backend
 
-class Context:
-    def register_llm_execution_intercept(self, name, priority, callback):
-        self.llm = callback
-
-    def register_llm_stream_execution_intercept(self, name, priority, callback):
-        self.stream = callback
-
-    def register_tool_execution_intercept(self, name, priority, callback):
-        self.tool = callback
+{context_class}
 
 async def run_case():
     ctx = Context()
@@ -506,7 +504,9 @@ async def run_case():
         "modified": modified,
     }}
 "#,
-                python_dir = python_dir.display(),
+                prelude = prelude,
+                epilogue = epilogue,
+                context_class = context_class,
             ),
         );
 
@@ -581,40 +581,15 @@ fn test_local_guardrails_provider_initializes_and_enforces_managed_core_calls()
             .set_item("nemo_relay._native", native_module.clone())
             .unwrap();
 
-        let python_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../../python");
+        let python_dir = python_package_dir();
+        let prelude =
+            fake_guardrails_module_prelude("fake_guardrails_local_e2e", &python_dir.display().to_string());
+        let epilogue = register_fake_guardrails_module_epilogue();
         let module = load_module(
             py,
             &format!(
                 r#"
-import sys
-import types
-
-sys.path.insert(0, {python_dir:?})
-
-MODULE_NAME = "fake_guardrails_local_e2e"
-
-fake_root = types.ModuleType(MODULE_NAME)
-fake_options = types.ModuleType(MODULE_NAME + ".rails.llm.options")
-
-class Result:
-    def __init__(self, status, content=None, rail=None):
-        self.status = status
-        self.content = content
-        self.rail = rail
-
-class RailType:
-    INPUT = "input"
-    OUTPUT = "output"
-
-class RailStatus:
-    BLOCKED = "blocked"
-    MODIFIED = "modified"
-    PASSED = "passed"
-
-class RailsConfig:
-    @staticmethod
-    def from_content(*, colang_content=None, yaml_content=None):
-        return {{"yaml": yaml_content}}
+{prelude}
 
 check_results = []
 
@@ -625,15 +600,7 @@ class LLMRails:
     async def check_async(self, messages, rail_types):
         return check_results.pop(0)
 
-fake_root.RailsConfig = RailsConfig
-fake_root.LLMRails = LLMRails
-fake_options.RailType = RailType
-fake_options.RailStatus = RailStatus
-
-sys.modules[MODULE_NAME] = fake_root
-sys.modules[MODULE_NAME + ".rails"] = types.ModuleType(MODULE_NAME + ".rails")
-sys.modules[MODULE_NAME + ".rails.llm"] = types.ModuleType(MODULE_NAME + ".rails.llm")
-sys.modules[MODULE_NAME + ".rails.llm.options"] = fake_options
+{epilogue}
 
 import nemo_relay
 
@@ -709,7 +676,8 @@ async def run_case():
         "seen_tool_args": seen_tool_args,
     }}
 "#,
-                python_dir = python_dir.display(),
+                prelude = prelude,
+                epilogue = epilogue,
             ),
         );
         let result_json = with_event_loop(py, |event_loop| {

From 244f29f022929b395fe64ace2e4b0dc428ccea5d Mon Sep 17 00:00:00 2001
From: Alex Fournier <afournier@nvidia.com>
Date: Mon, 1 Jun 2026 09:16:28 -0700
Subject: [PATCH 4/8] style: apply rustfmt for local guardrails tests

Signed-off-by: Alex Fournier <afournier@nvidia.com>
---
 .../core/src/plugins/nemo_guardrails/local.rs  |  3 ++-
 crates/python/tests/coverage/coverage_tests.rs | 18 ++++++++++++------
 2 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/crates/core/src/plugins/nemo_guardrails/local.rs b/crates/core/src/plugins/nemo_guardrails/local.rs
index 31f4e1c8..240ed186 100644
--- a/crates/core/src/plugins/nemo_guardrails/local.rs
+++ b/crates/core/src/plugins/nemo_guardrails/local.rs
@@ -14,7 +14,8 @@ type LocalBackendProvider = Arc<
 static LOCAL_BACKEND_PROVIDER: LazyLock<Mutex<Option<LocalBackendProvider>>> =
     LazyLock::new(|| Mutex::new(None));
 
-fn local_backend_provider_guard() -> PluginResult<MutexGuard<'static, Option<LocalBackendProvider>>> {
+fn local_backend_provider_guard() -> PluginResult<MutexGuard<'static, Option<LocalBackendProvider>>>
+{
     LOCAL_BACKEND_PROVIDER.lock().map_err(|e| {
         PluginError::Internal(format!(
             "NeMo Guardrails local backend provider lock poisoned: {e}"
diff --git a/crates/python/tests/coverage/coverage_tests.rs b/crates/python/tests/coverage/coverage_tests.rs
index 90b792f8..029eee58 100644
--- a/crates/python/tests/coverage/coverage_tests.rs
+++ b/crates/python/tests/coverage/coverage_tests.rs
@@ -236,8 +236,10 @@ fn test_guardrails_local_helper_registers_and_enforces_llm_and_tool_checks() {
             .unwrap();
 
         let python_dir = python_package_dir();
-        let prelude =
-            fake_guardrails_module_prelude("fake_guardrails_local_helper", &python_dir.display().to_string());
+        let prelude = fake_guardrails_module_prelude(
+            "fake_guardrails_local_helper",
+            &python_dir.display().to_string(),
+        );
         let epilogue = register_fake_guardrails_module_epilogue();
         let context_class = local_plugin_context_python();
         let module = load_module(
@@ -371,8 +373,10 @@ fn test_guardrails_local_helper_enforces_streamed_output_rails() {
             .unwrap();
 
         let python_dir = python_package_dir();
-        let prelude =
-            fake_guardrails_module_prelude("fake_guardrails_streaming", &python_dir.display().to_string());
+        let prelude = fake_guardrails_module_prelude(
+            "fake_guardrails_streaming",
+            &python_dir.display().to_string(),
+        );
         let epilogue = register_fake_guardrails_module_epilogue();
         let context_class = local_plugin_context_python();
         let module = load_module(
@@ -582,8 +586,10 @@ fn test_local_guardrails_provider_initializes_and_enforces_managed_core_calls()
             .unwrap();
 
         let python_dir = python_package_dir();
-        let prelude =
-            fake_guardrails_module_prelude("fake_guardrails_local_e2e", &python_dir.display().to_string());
+        let prelude = fake_guardrails_module_prelude(
+            "fake_guardrails_local_e2e",
+            &python_dir.display().to_string(),
+        );
         let epilogue = register_fake_guardrails_module_epilogue();
         let module = load_module(
             py,

From ffa88dcee41143e3bfb4131162155a57ef922876 Mon Sep 17 00:00:00 2001
From: Alex Fournier <afournier@nvidia.com>
Date: Mon, 1 Jun 2026 09:24:48 -0700
Subject: [PATCH 5/8] refactor: name local guardrails imports

Signed-off-by: Alex Fournier <afournier@nvidia.com>
---
 python/nemo_relay/_guardrails_local.py | 41 ++++++++++++++++----------
 1 file changed, 25 insertions(+), 16 deletions(-)

diff --git a/python/nemo_relay/_guardrails_local.py b/python/nemo_relay/_guardrails_local.py
index 5f30eb49..86f5946c 100644
--- a/python/nemo_relay/_guardrails_local.py
+++ b/python/nemo_relay/_guardrails_local.py
@@ -9,7 +9,7 @@
 import importlib
 import json
 from collections.abc import Callable
-from typing import Any, Protocol, cast
+from typing import Any, NamedTuple, Protocol, cast
 
 from nemo_relay import Json, LLMRequest
 from nemo_relay.codecs import (
@@ -49,6 +49,15 @@ class _GuardrailsCodec(LlmCodec, LlmResponseCodec, Protocol):
     """Codec shape required by the local backend."""
 
 
+class _GuardrailsRuntimeImports(NamedTuple):
+    """Resolved Python symbols required by the local Guardrails backend."""
+
+    rails_config_cls: Any
+    llm_rails_cls: Any
+    rail_type: Any
+    rail_status: Any
+
+
 _CODECS: dict[str, Callable[[], _GuardrailsCodec]] = {
     "openai_chat": OpenAIChatCodec,
     "openai_responses": OpenAIResponsesCodec,
@@ -56,7 +65,7 @@ class _GuardrailsCodec(LlmCodec, LlmResponseCodec, Protocol):
 }
 
 
-def _load_nemoguardrails(module_name: str | None):
+def _load_nemoguardrails(module_name: str | None) -> _GuardrailsRuntimeImports:
     root_module = module_name or "nemoguardrails"
     try:
         guardrails = cast(Any, importlib.import_module(root_module))
@@ -72,11 +81,11 @@ def _load_nemoguardrails(module_name: str | None):
             f"{error.name or error}. Install the full NeMo Guardrails runtime dependencies."
         ) from error
 
-    return (
-        guardrails.RailsConfig,
-        guardrails.LLMRails,
-        options.RailType,
-        options.RailStatus,
+    return _GuardrailsRuntimeImports(
+        rails_config_cls=guardrails.RailsConfig,
+        llm_rails_cls=guardrails.LLMRails,
+        rail_type=options.RailType,
+        rail_status=options.RailStatus,
     )
 
 
@@ -543,9 +552,9 @@ def register_local_backend(config: dict[str, Any], context: PluginContext) -> No
 
     local = cast(dict[str, Any], config.get("local") or {})
     module_name = cast(str | None, local.get("python_module"))
-    RailsConfig, LLMRails, RailType, RailStatus = _load_nemoguardrails(module_name)
-    guardrails_config = _build_guardrails_config(config, RailsConfig)
-    rails = LLMRails(guardrails_config)
+    runtime_imports = _load_nemoguardrails(module_name)
+    guardrails_config = _build_guardrails_config(config, runtime_imports.rails_config_cls)
+    rails = runtime_imports.llm_rails_cls(guardrails_config)
     enable_input = bool(config.get("input", True))
     enable_output = bool(config.get("output", True))
     enable_tool_input = bool(config.get("tool_input", False))
@@ -556,16 +565,16 @@ def register_local_backend(config: dict[str, Any], context: PluginContext) -> No
         codec_name, codec = _resolve_codec(config)
         intercept = _make_llm_intercept(
             rails=rails,
-            rail_type=RailType,
-            rail_status=RailStatus,
+            rail_type=runtime_imports.rail_type,
+            rail_status=runtime_imports.rail_status,
             codec=codec,
             enable_input=enable_input,
             enable_output=enable_output,
         )
         stream_intercept = _make_llm_stream_intercept(
             rails=rails,
-            rail_type=RailType,
-            rail_status=RailStatus,
+            rail_type=runtime_imports.rail_type,
+            rail_status=runtime_imports.rail_status,
             codec_name=codec_name,
             codec=codec,
             enable_input=enable_input,
@@ -581,8 +590,8 @@ def register_local_backend(config: dict[str, Any], context: PluginContext) -> No
     if enable_tool_input or enable_tool_output:
         tool_intercept = _make_tool_intercept(
             rails=rails,
-            rail_type=RailType,
-            rail_status=RailStatus,
+            rail_type=runtime_imports.rail_type,
+            rail_status=runtime_imports.rail_status,
             enable_tool_input=enable_tool_input,
             enable_tool_output=enable_tool_output,
         )

From f8dead5c41a81bfc58b6c895e396181674b2833a Mon Sep 17 00:00:00 2001
From: Alex Fournier <afournier@nvidia.com>
Date: Mon, 1 Jun 2026 09:36:47 -0700
Subject: [PATCH 6/8] fix: address local guardrails review nits

Signed-off-by: Alex Fournier <afournier@nvidia.com>
---
 crates/python/src/lib.rs                      |  21 +
 crates/python/src/py_plugin.rs                |  18 +-
 .../python/tests/coverage/coverage_tests.rs   | 370 ++++++++++--------
 .../coverage/py_plugin_coverage_tests.rs      |  45 +++
 python/nemo_relay/_guardrails_local.py        |  30 +-
 5 files changed, 307 insertions(+), 177 deletions(-)

diff --git a/crates/python/src/lib.rs b/crates/python/src/lib.rs
index 13d0c29f..4a40eaf8 100644
--- a/crates/python/src/lib.rs
+++ b/crates/python/src/lib.rs
@@ -112,6 +112,10 @@ fn load_guardrails_local_register_fn(py: Python<'_>) -> PyResult<Bound<'_, PyAny
     let module = match py.import("nemo_relay._guardrails_local") {
         Ok(module) => module,
         Err(err) => {
+            if !is_missing_guardrails_local_module(py, &err)? {
+                return Err(err);
+            }
+
             let source_python_dir = guardrails_local_source_python_dir();
             if !source_python_dir.exists() {
                 return Err(err);
@@ -124,6 +128,23 @@ fn load_guardrails_local_register_fn(py: Python<'_>) -> PyResult<Bound<'_, PyAny
     module.getattr("register_local_backend")
 }
 
+fn is_missing_guardrails_local_module(py: Python<'_>, err: &PyErr) -> PyResult<bool> {
+    if !err.is_instance_of::<pyo3::exceptions::PyModuleNotFoundError>(py) {
+        return Ok(false);
+    }
+
+    let err_value = err.value(py);
+    let module_name = err_value
+        .getattr("name")
+        .ok()
+        .and_then(|name| name.extract::<String>().ok());
+
+    Ok(matches!(
+        module_name.as_deref(),
+        Some("nemo_relay") | Some("nemo_relay._guardrails_local")
+    ))
+}
+
 fn guardrails_local_source_python_dir() -> PathBuf {
     PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../../python")
 }
diff --git a/crates/python/src/py_plugin.rs b/crates/python/src/py_plugin.rs
index ee668ea1..09209b03 100644
--- a/crates/python/src/py_plugin.rs
+++ b/crates/python/src/py_plugin.rs
@@ -29,7 +29,8 @@ use nemo_relay::api::subscriber::{deregister_subscriber, register_subscriber};
 use nemo_relay::plugin::{
     ConfigDiagnostic, DiagnosticLevel, Plugin, PluginConfig, PluginError, PluginRegistration,
     PluginRegistrationContext, active_plugin_report, clear_plugin_configuration, deregister_plugin,
-    initialize_plugins, list_plugin_kinds, register_plugin, validate_plugin_config,
+    initialize_plugins, list_plugin_kinds, register_plugin, rollback_registrations,
+    validate_plugin_config,
 };
 
 use crate::convert::{json_to_py, py_to_json};
@@ -174,10 +175,17 @@ pub(crate) fn invoke_python_plugin_register(
         namespace_prefix,
     )?;
     let plugin_config_py = plugin_config_to_py(py, plugin_kind, plugin_config)?;
-    register_fn.call1((plugin_config_py, py_ctx.clone_ref(py)))?;
-    {
-        let py_ctx_ref = py_ctx.bind(py).borrow();
-        py_ctx_ref.drain_registrations()
+    match register_fn.call1((plugin_config_py, py_ctx.clone_ref(py))) {
+        Ok(_) => {
+            let py_ctx_ref = py_ctx.bind(py).borrow();
+            py_ctx_ref.drain_registrations()
+        }
+        Err(err) => {
+            if let Ok(mut registrations) = py_ctx.bind(py).borrow().drain_registrations() {
+                rollback_registrations(&mut registrations);
+            }
+            Err(err)
+        }
     }
 }
 
diff --git a/crates/python/tests/coverage/coverage_tests.rs b/crates/python/tests/coverage/coverage_tests.rs
index 029eee58..a104d68c 100644
--- a/crates/python/tests/coverage/coverage_tests.rs
+++ b/crates/python/tests/coverage/coverage_tests.rs
@@ -4,13 +4,13 @@
 //! Coverage tests for coverage in the NeMo Relay Python crate.
 
 use std::ffi::CString;
+use std::panic::{AssertUnwindSafe, catch_unwind};
 use std::path::PathBuf;
 use std::pin::Pin;
 use std::sync::Arc;
 
-use pyo3::ffi::c_str;
 use pyo3::prelude::*;
-use pyo3::types::{IntoPyDict, PyModule};
+use pyo3::types::{PyDict, PyModule};
 use serde_json::{Value as Json, json};
 use tokio_stream::Stream;
 use tokio_stream::StreamExt;
@@ -115,6 +115,59 @@ class Context:
 "#
 }
 
+fn with_isolated_nemo_relay_modules<T>(
+    py: Python<'_>,
+    native_module: &Bound<'_, PyModule>,
+    f: impl FnOnce() -> T,
+) -> T {
+    let sys = py.import("sys").unwrap();
+    let modules = sys
+        .getattr("modules")
+        .unwrap()
+        .cast_into::<PyDict>()
+        .unwrap();
+    let saved_modules = modules
+        .iter()
+        .filter_map(|(name, module)| {
+            let name = name.extract::<String>().ok()?;
+            if name == "nemo_relay" || name.starts_with("nemo_relay.") {
+                Some((name, module.unbind()))
+            } else {
+                None
+            }
+        })
+        .collect::<Vec<_>>();
+
+    clear_nemo_relay_modules(&modules);
+    modules
+        .set_item("nemo_relay._native", native_module.clone())
+        .unwrap();
+
+    let result = catch_unwind(AssertUnwindSafe(f));
+
+    clear_nemo_relay_modules(&modules);
+    for (name, module) in saved_modules {
+        modules.set_item(name, module).unwrap();
+    }
+
+    match result {
+        Ok(value) => value,
+        Err(payload) => std::panic::resume_unwind(payload),
+    }
+}
+
+fn clear_nemo_relay_modules(modules: &Bound<'_, PyDict>) {
+    let module_names = modules
+        .iter()
+        .filter_map(|(name, _)| name.extract::<String>().ok())
+        .filter(|name| name == "nemo_relay" || name.starts_with("nemo_relay."))
+        .collect::<Vec<_>>();
+
+    for name in module_names {
+        modules.del_item(name).unwrap();
+    }
+}
+
 fn make_request() -> LlmRequest {
     LlmRequest {
         headers: serde_json::Map::from_iter([("x-trace".into(), json!("1"))]),
@@ -229,23 +282,19 @@ fn test_guardrails_local_helper_registers_and_enforces_llm_and_tool_checks() {
     Python::attach(|py| {
         let native_module = PyModule::new(py, "_native_guardrails_helper").unwrap();
         crate::_native(&native_module).unwrap();
-        let sys = py.import("sys").unwrap();
-        let modules = sys.getattr("modules").unwrap();
-        modules
-            .set_item("nemo_relay._native", native_module.clone())
-            .unwrap();
 
-        let python_dir = python_package_dir();
-        let prelude = fake_guardrails_module_prelude(
-            "fake_guardrails_local_helper",
-            &python_dir.display().to_string(),
-        );
-        let epilogue = register_fake_guardrails_module_epilogue();
-        let context_class = local_plugin_context_python();
-        let module = load_module(
-            py,
-            &format!(
-                r#"
+        with_isolated_nemo_relay_modules(py, &native_module, || {
+            let python_dir = python_package_dir();
+            let prelude = fake_guardrails_module_prelude(
+                "fake_guardrails_local_helper",
+                &python_dir.display().to_string(),
+            );
+            let epilogue = register_fake_guardrails_module_epilogue();
+            let context_class = local_plugin_context_python();
+            let module = load_module(
+                py,
+                &format!(
+                    r#"
 {prelude}
 
 check_results = []
@@ -329,34 +378,61 @@ async def run_case():
         "check_calls": check_calls,
     }}
 "#,
-                prelude = prelude,
-                epilogue = epilogue,
-                context_class = context_class,
-            ),
-        );
+                    prelude = prelude,
+                    epilogue = epilogue,
+                    context_class = context_class,
+                ),
+            );
 
-        let result_json = with_event_loop(py, |event_loop| {
-            let coroutine = module.getattr("run_case").unwrap().call0().unwrap();
-            let result = event_loop
-                .call_method1("run_until_complete", (coroutine,))
-                .unwrap();
-            crate::convert::py_to_json(&result).unwrap()
-        });
+            let result_json = with_event_loop(py, |event_loop| {
+                let coroutine = module.getattr("run_case").unwrap().call0().unwrap();
+                let result = event_loop
+                    .call_method1("run_until_complete", (coroutine,))
+                    .unwrap();
+                crate::convert::py_to_json(&result).unwrap()
+            });
 
-        assert_eq!(
-            result_json["seen_request_messages"][0],
-            json!("sanitized user")
-        );
-        assert_eq!(result_json["tool_result"], json!({ "ok": true }));
-        assert_eq!(
-            result_json["seen_tool_args"][0],
-            json!({ "city": "Boston" })
-        );
-        assert_eq!(
-            result_json["llm_result"]["choices"][0]["message"]["content"],
-            json!("safe reply")
-        );
-        assert_eq!(result_json["check_calls"].as_array().unwrap().len(), 4);
+            assert_eq!(
+                result_json["seen_request_messages"][0],
+                json!("sanitized user")
+            );
+            assert_eq!(result_json["tool_result"], json!({ "ok": true }));
+            assert_eq!(
+                result_json["seen_tool_args"][0],
+                json!({ "city": "Boston" })
+            );
+            assert_eq!(
+                result_json["llm_result"]["choices"][0]["message"]["content"],
+                json!("safe reply")
+            );
+            assert_eq!(
+                result_json["check_calls"],
+                json!([
+                    [
+                        [{"role": "user", "content": "unsafe"}],
+                        ["input"]
+                    ],
+                    [
+                        [
+                            {"role": "user", "content": "sanitized user"},
+                            {"role": "assistant", "content": "safe reply"}
+                        ],
+                        ["output"]
+                    ],
+                    [
+                        [{"role": "user", "content": "{\"arguments\":{\"city\":\"Phoenix\"},\"tool_name\":\"weather_lookup\"}"}],
+                        ["input"]
+                    ],
+                    [
+                        [
+                            {"role": "user", "content": "{\"arguments\":{\"city\":\"Boston\"},\"tool_name\":\"weather_lookup\"}"},
+                            {"role": "assistant", "content": "{\"arguments\":{\"city\":\"Boston\"},\"result\":{\"raw\":true},\"tool_name\":\"weather_lookup\"}"}
+                        ],
+                        ["output"]
+                    ]
+                ])
+            );
+        });
     });
 }
 
@@ -366,23 +442,19 @@ fn test_guardrails_local_helper_enforces_streamed_output_rails() {
     Python::attach(|py| {
         let native_module = PyModule::new(py, "_native_guardrails_streaming").unwrap();
         crate::_native(&native_module).unwrap();
-        let sys = py.import("sys").unwrap();
-        let modules = sys.getattr("modules").unwrap();
-        modules
-            .set_item("nemo_relay._native", native_module.clone())
-            .unwrap();
 
-        let python_dir = python_package_dir();
-        let prelude = fake_guardrails_module_prelude(
-            "fake_guardrails_streaming",
-            &python_dir.display().to_string(),
-        );
-        let epilogue = register_fake_guardrails_module_epilogue();
-        let context_class = local_plugin_context_python();
-        let module = load_module(
-            py,
-            &format!(
-                r#"
+        with_isolated_nemo_relay_modules(py, &native_module, || {
+            let python_dir = python_package_dir();
+            let prelude = fake_guardrails_module_prelude(
+                "fake_guardrails_streaming",
+                &python_dir.display().to_string(),
+            );
+            let epilogue = register_fake_guardrails_module_epilogue();
+            let context_class = local_plugin_context_python();
+            let module = load_module(
+                py,
+                &format!(
+                    r#"
 {prelude}
 
 stream_results = []
@@ -508,52 +580,53 @@ async def run_case():
         "modified": modified,
     }}
 "#,
-                prelude = prelude,
-                epilogue = epilogue,
-                context_class = context_class,
-            ),
-        );
+                    prelude = prelude,
+                    epilogue = epilogue,
+                    context_class = context_class,
+                ),
+            );
 
-        let result = with_event_loop(py, |event_loop| {
-            let coroutine = module.getattr("run_case").unwrap().call0().unwrap();
-            let result = event_loop
-                .call_method1("run_until_complete", (coroutine,))
-                .unwrap();
-            crate::convert::py_to_json(&result).unwrap()
-        });
-        assert_eq!(
-            result["allowed_chunks"],
-            json!([
-                {"choices": [{"delta": {"content": "hello"}}]},
-                {"choices": [{"delta": {"content": "world"}}]}
-            ])
-        );
-        let event_log = result["event_log"].as_array().unwrap();
-        assert_eq!(
-            &event_log[..6],
-            json!([
-                "source:hello",
-                "yield:hello",
-                "source:world",
-                "yield:world",
-                "guardrails-sees:hello",
-                "guardrails-sees:world",
-            ])
-            .as_array()
-            .unwrap()
-        );
-        assert!(
-            result["blocked"]
-                .as_str()
-                .unwrap()
-                .contains("output rail blocked the LLM call")
-        );
-        assert!(
-            result["modified"]
-                .as_str()
+            let result = with_event_loop(py, |event_loop| {
+                let coroutine = module.getattr("run_case").unwrap().call0().unwrap();
+                let result = event_loop
+                    .call_method1("run_until_complete", (coroutine,))
+                    .unwrap();
+                crate::convert::py_to_json(&result).unwrap()
+            });
+            assert_eq!(
+                result["allowed_chunks"],
+                json!([
+                    {"choices": [{"delta": {"content": "hello"}}]},
+                    {"choices": [{"delta": {"content": "world"}}]}
+                ])
+            );
+            let event_log = result["event_log"].as_array().unwrap();
+            assert_eq!(
+                &event_log[..6],
+                json!([
+                    "source:hello",
+                    "yield:hello",
+                    "source:world",
+                    "yield:world",
+                    "guardrails-sees:hello",
+                    "guardrails-sees:world",
+                ])
+                .as_array()
                 .unwrap()
-                .contains("stream_first = true")
-        );
+            );
+            assert!(
+                result["blocked"]
+                    .as_str()
+                    .unwrap()
+                    .contains("output rail blocked the LLM call")
+            );
+            assert!(
+                result["modified"]
+                    .as_str()
+                    .unwrap()
+                    .contains("stream_first = true")
+            );
+        });
     });
 }
 
@@ -565,36 +638,18 @@ fn test_local_guardrails_provider_initializes_and_enforces_managed_core_calls()
     Python::attach(|py| {
         let native_module = PyModule::new(py, "_native_guardrails_e2e").unwrap();
         crate::_native(&native_module).unwrap();
-        let sys = py.import("sys").unwrap();
-        let modules = sys.getattr("modules").unwrap();
-        let module_names = py
-            .eval(
-                c_str!("list(sys.modules.keys())"),
-                None,
-                Some(&[(c_str!("sys"), sys)].into_py_dict(py).unwrap()),
-            )
-            .unwrap()
-            .extract::<Vec<String>>()
-            .unwrap();
-        for name in module_names {
-            if name == "nemo_relay" || name.starts_with("nemo_relay.") {
-                modules.del_item(name).unwrap();
-            }
-        }
-        modules
-            .set_item("nemo_relay._native", native_module.clone())
-            .unwrap();
 
-        let python_dir = python_package_dir();
-        let prelude = fake_guardrails_module_prelude(
-            "fake_guardrails_local_e2e",
-            &python_dir.display().to_string(),
-        );
-        let epilogue = register_fake_guardrails_module_epilogue();
-        let module = load_module(
-            py,
-            &format!(
-                r#"
+        with_isolated_nemo_relay_modules(py, &native_module, || {
+            let python_dir = python_package_dir();
+            let prelude = fake_guardrails_module_prelude(
+                "fake_guardrails_local_e2e",
+                &python_dir.display().to_string(),
+            );
+            let epilogue = register_fake_guardrails_module_epilogue();
+            let module = load_module(
+                py,
+                &format!(
+                    r#"
 {prelude}
 
 check_results = []
@@ -682,31 +737,32 @@ async def run_case():
         "seen_tool_args": seen_tool_args,
     }}
 "#,
-                prelude = prelude,
-                epilogue = epilogue,
-            ),
-        );
-        let result_json = with_event_loop(py, |event_loop| {
-            let coroutine = module.getattr("run_case").unwrap().call0().unwrap();
-            let result = event_loop
-                .call_method1("run_until_complete", (coroutine,))
-                .unwrap();
-            crate::convert::py_to_json(&result).unwrap()
-        });
+                    prelude = prelude,
+                    epilogue = epilogue,
+                ),
+            );
+            let result_json = with_event_loop(py, |event_loop| {
+                let coroutine = module.getattr("run_case").unwrap().call0().unwrap();
+                let result = event_loop
+                    .call_method1("run_until_complete", (coroutine,))
+                    .unwrap();
+                crate::convert::py_to_json(&result).unwrap()
+            });
 
-        assert_eq!(
-            result_json["llm_result"]["choices"][0]["message"]["content"],
-            json!("safe reply")
-        );
-        assert_eq!(result_json["tool_result"], json!({ "ok": true }));
-        assert_eq!(
-            result_json["seen_request_messages"][0],
-            json!("sanitized user")
-        );
-        assert_eq!(
-            result_json["seen_tool_args"][0],
-            json!({ "city": "Boston" })
-        );
+            assert_eq!(
+                result_json["llm_result"]["choices"][0]["message"]["content"],
+                json!("safe reply")
+            );
+            assert_eq!(result_json["tool_result"], json!({ "ok": true }));
+            assert_eq!(
+                result_json["seen_request_messages"][0],
+                json!("sanitized user")
+            );
+            assert_eq!(
+                result_json["seen_tool_args"][0],
+                json!({ "city": "Boston" })
+            );
+        });
     });
 
     reset_runtime_state();
diff --git a/crates/python/tests/coverage/py_plugin_coverage_tests.rs b/crates/python/tests/coverage/py_plugin_coverage_tests.rs
index f774d5ea..dbb8a21b 100644
--- a/crates/python/tests/coverage/py_plugin_coverage_tests.rs
+++ b/crates/python/tests/coverage/py_plugin_coverage_tests.rs
@@ -792,6 +792,51 @@ async def initialize_plugins(module, config):
     });
 }
 
+#[test]
+fn invoke_python_plugin_register_rolls_back_partial_registrations_on_error() {
+    let _python = crate::test_support::init_python_test();
+    Python::attach(|py| {
+        let helpers = load_module(
+            py,
+            r#"
+def subscriber(event):
+    return None
+
+class FailingPlugin:
+    def register(self, plugin_config, context):
+        context.register_subscriber("sub", subscriber)
+        raise RuntimeError("boom")
+"#,
+        );
+
+        let plugin = helpers.getattr("FailingPlugin").unwrap().call0().unwrap();
+        let register_fn = plugin.getattr("register").unwrap();
+        let namespace_prefix = "rollback.".to_string();
+
+        for _ in 0..2 {
+            let err = invoke_python_plugin_register(
+                py,
+                "demo.rollback",
+                &register_fn,
+                &serde_json::Map::new(),
+                namespace_prefix.clone(),
+            )
+            .unwrap_err();
+            assert!(err.to_string().contains("boom"), "{err}");
+
+            let context = PyPluginContext {
+                registrations: Arc::new(Mutex::new(vec![])),
+                namespace_prefix: namespace_prefix.clone(),
+            };
+            context
+                .register_subscriber("sub", helpers.getattr("subscriber").unwrap().unbind())
+                .unwrap();
+            let mut registrations = context.drain_registrations().unwrap();
+            rollback_registrations(&mut registrations);
+        }
+    });
+}
+
 #[test]
 fn plugin_context_lock_poisoning_covers_error_paths() {
     let _python = crate::test_support::init_python_test();
diff --git a/python/nemo_relay/_guardrails_local.py b/python/nemo_relay/_guardrails_local.py
index 86f5946c..f16f839a 100644
--- a/python/nemo_relay/_guardrails_local.py
+++ b/python/nemo_relay/_guardrails_local.py
@@ -462,21 +462,21 @@ async def stream_intercept(request: LLMRequest, next_call):
             )
 
         text_queue: asyncio.Queue[str | None] = asyncio.Queue()
-        blocked: dict[str, str | None] = {"message": None}
-        monitor = asyncio.create_task(
-            _monitor_streaming_output_rails(
-                rails=rails,
-                messages=messages,
-                text_queue=text_queue,
-                blocked=blocked,
-            )
-        )
+        block_state: dict[str, str | None] = {"message": None}
 
         async def guarded_provider_stream():
+            monitor = asyncio.create_task(
+                _monitor_streaming_output_rails(
+                    rails=rails,
+                    messages=messages,
+                    text_queue=text_queue,
+                    blocked=block_state,
+                )
+            )
             try:
                 async for chunk in stream:
-                    if blocked["message"] is not None:
-                        _raise_streaming_output_blocked(blocked["message"])
+                    if block_state["message"] is not None:
+                        _raise_streaming_output_blocked(block_state["message"])
 
                     text = _extract_stream_text(codec_name, chunk)
                     if text is not None:
@@ -484,13 +484,13 @@ async def guarded_provider_stream():
 
                     yield chunk
 
-                    if blocked["message"] is not None:
-                        _raise_streaming_output_blocked(blocked["message"])
+                    if block_state["message"] is not None:
+                        _raise_streaming_output_blocked(block_state["message"])
             finally:
                 await text_queue.put(None)
                 await monitor
-                if blocked["message"] is not None:
-                    _raise_streaming_output_blocked(blocked["message"])
+                if block_state["message"] is not None:
+                    _raise_streaming_output_blocked(block_state["message"])
 
         return guarded_provider_stream()
 

From 67fd1b912e4da86c8362f6d93e0437933591dd20 Mon Sep 17 00:00:00 2001
From: Alex Fournier <afournier@nvidia.com>
Date: Mon, 1 Jun 2026 12:11:06 -0700
Subject: [PATCH 7/8] test: extend local guardrails cli coverage

Signed-off-by: Alex Fournier <afournier@nvidia.com>
---
 crates/cli/tests/coverage/plugins_tests.rs | 243 ++++++++++++++++++++-
 1 file changed, 242 insertions(+), 1 deletion(-)

diff --git a/crates/cli/tests/coverage/plugins_tests.rs b/crates/cli/tests/coverage/plugins_tests.rs
index 28bf0fd2..502e7b07 100644
--- a/crates/cli/tests/coverage/plugins_tests.rs
+++ b/crates/cli/tests/coverage/plugins_tests.rs
@@ -7,7 +7,7 @@ use nemo_relay::config_editor::{EditorConfig, EditorSchema};
 use nemo_relay::observability::plugin_component::{OBSERVABILITY_PLUGIN_KIND, ObservabilityConfig};
 use nemo_relay::plugin::{ConfigPolicy, PluginComponentSpec, PluginConfig};
 use nemo_relay::plugins::nemo_guardrails::component::{
-    NEMO_GUARDRAILS_PLUGIN_KIND, NeMoGuardrailsConfig, RemoteBackendConfig,
+    LocalBackendConfig, NEMO_GUARDRAILS_PLUGIN_KIND, NeMoGuardrailsConfig, RemoteBackendConfig,
 };
 use nemo_relay_adaptive::AdaptiveConfig;
 use nemo_relay_adaptive::plugin_component::ADAPTIVE_PLUGIN_KIND;
@@ -50,6 +50,40 @@ fn guardrails_component_config(config_id: &str) -> serde_json::Map<String, Value
     .clone()
 }
 
+fn local_guardrails_component_config(config_path: &str) -> serde_json::Map<String, Value> {
+    json!({
+        "mode": "local",
+        "input": false,
+        "output": false,
+        "config_path": config_path,
+        "tool_input": true,
+        "tool_output": true,
+        "local": {
+            "python_module": "custom_guardrails"
+        }
+    })
+    .as_object()
+    .unwrap()
+    .clone()
+}
+
+fn local_llm_guardrails_component_config(config_yaml: &str) -> serde_json::Map<String, Value> {
+    json!({
+        "mode": "local",
+        "codec": "openai_chat",
+        "input": true,
+        "output": true,
+        "config_yaml": config_yaml,
+        "colang_content": "define flow noop\n  pass",
+        "local": {
+            "python_module": "custom_guardrails"
+        }
+    })
+    .as_object()
+    .unwrap()
+    .clone()
+}
+
 #[test]
 fn target_scope_defaults_to_user_and_rejects_conflicts() {
     assert_eq!(
@@ -160,6 +194,24 @@ fn typed_editor_model_contains_nemo_guardrails_options() {
         EditorFieldKind::StringMap
     );
 
+    let local = schema.field("local").unwrap().schema().unwrap();
+    assert_eq!(
+        local.field("python_module").unwrap().kind,
+        EditorFieldKind::String
+    );
+    assert_eq!(
+        schema.field("config_path").unwrap().kind,
+        EditorFieldKind::String
+    );
+    assert_eq!(
+        schema.field("config_yaml").unwrap().kind,
+        EditorFieldKind::String
+    );
+    assert_eq!(
+        schema.field("colang_content").unwrap().kind,
+        EditorFieldKind::String
+    );
+
     let request_defaults = schema.field("request_defaults").unwrap().schema().unwrap();
     let rails = request_defaults.field("rails").unwrap().schema().unwrap();
     assert_eq!(
@@ -1137,6 +1189,98 @@ fn validate_config_accepts_nemo_guardrails_component() {
     validate_config(&config).unwrap();
 }
 
+#[test]
+fn validate_config_accepts_local_tool_only_nemo_guardrails_component() {
+    let config = PluginConfig {
+        components: vec![PluginComponentSpec {
+            kind: NEMO_GUARDRAILS_PLUGIN_KIND.to_string(),
+            enabled: true,
+            config: local_guardrails_component_config("./rails"),
+        }],
+        ..PluginConfig::default()
+    };
+
+    validate_config(&config).unwrap();
+}
+
+#[test]
+fn validate_config_rejects_local_nemo_guardrails_request_defaults() {
+    let config = PluginConfig {
+        components: vec![PluginComponentSpec {
+            kind: NEMO_GUARDRAILS_PLUGIN_KIND.to_string(),
+            enabled: true,
+            config: json!({
+                "mode": "local",
+                "codec": "openai_chat",
+                "input": true,
+                "output": true,
+                "config_yaml": "models: []",
+                "request_defaults": {
+                    "context": {"tenant": "demo"}
+                }
+            })
+            .as_object()
+            .unwrap()
+            .clone(),
+        }],
+        ..PluginConfig::default()
+    };
+
+    let error = validate_config(&config).unwrap_err().to_string();
+    assert!(error.contains("request_defaults"), "error was: {error}");
+    assert!(error.contains("local mode"), "error was: {error}");
+}
+
+#[test]
+fn validate_config_rejects_local_nemo_guardrails_multiple_config_sources() {
+    let config = PluginConfig {
+        components: vec![PluginComponentSpec {
+            kind: NEMO_GUARDRAILS_PLUGIN_KIND.to_string(),
+            enabled: true,
+            config: json!({
+                "mode": "local",
+                "config_path": "./rails",
+                "config_yaml": "models: []"
+            })
+            .as_object()
+            .unwrap()
+            .clone(),
+        }],
+        ..PluginConfig::default()
+    };
+
+    let error = validate_config(&config).unwrap_err().to_string();
+    assert!(
+        error.contains("exactly one of config_path or config_yaml"),
+        "error was: {error}"
+    );
+}
+
+#[test]
+fn validate_config_rejects_local_nemo_guardrails_colang_without_yaml() {
+    let config = PluginConfig {
+        components: vec![PluginComponentSpec {
+            kind: NEMO_GUARDRAILS_PLUGIN_KIND.to_string(),
+            enabled: true,
+            config: json!({
+                "mode": "local",
+                "config_path": "./rails",
+                "colang_content": "define flow noop\n  pass"
+            })
+            .as_object()
+            .unwrap()
+            .clone(),
+        }],
+        ..PluginConfig::default()
+    };
+
+    let error = validate_config(&config).unwrap_err().to_string();
+    assert!(
+        error.contains("colang_content can only be used with config_yaml"),
+        "error was: {error}"
+    );
+}
+
 #[test]
 fn nemo_guardrails_config_map_prunes_default_version() {
     let map = nemo_guardrails_config_map(&NeMoGuardrailsConfig {
@@ -1155,6 +1299,103 @@ fn nemo_guardrails_config_map_prunes_default_version() {
     assert_eq!(map["remote"]["config_id"], json!("default"));
 }
 
+#[test]
+fn write_plugin_config_round_trips_local_nemo_guardrails_component() {
+    let temp = tempfile::tempdir().unwrap();
+    let path = temp.path().join("plugins.toml");
+    let config = PluginConfig {
+        components: vec![PluginComponentSpec {
+            kind: NEMO_GUARDRAILS_PLUGIN_KIND.to_string(),
+            enabled: true,
+            config: local_guardrails_component_config("./rails"),
+        }],
+        ..PluginConfig::default()
+    };
+
+    write_plugin_config(&path, &config).unwrap();
+
+    let rendered = std::fs::read_to_string(&path).unwrap();
+    assert!(rendered.contains("mode = \"local\""));
+    assert!(rendered.contains("config_path = \"./rails\""));
+    assert!(rendered.contains("tool_input = true"));
+    assert!(rendered.contains("python_module = \"custom_guardrails\""));
+
+    let round_tripped = read_plugin_config(&path).unwrap();
+    let guardrails = round_tripped
+        .components
+        .iter()
+        .find(|component| component.kind == NEMO_GUARDRAILS_PLUGIN_KIND)
+        .unwrap();
+    assert!(guardrails.enabled);
+    assert_eq!(guardrails.config["mode"], json!("local"));
+    assert_eq!(guardrails.config["config_path"], json!("./rails"));
+    assert_eq!(guardrails.config["tool_input"], json!(true));
+    assert_eq!(
+        guardrails.config["local"]["python_module"],
+        json!("custom_guardrails")
+    );
+}
+
+#[test]
+fn nemo_guardrails_config_map_serializes_local_mode_fields() {
+    let map = nemo_guardrails_config_map(&NeMoGuardrailsConfig {
+        mode: "local".into(),
+        config_path: Some("./rails".into()),
+        tool_input: true,
+        tool_output: true,
+        local: Some(LocalBackendConfig {
+            python_module: Some("custom_guardrails".into()),
+        }),
+        ..NeMoGuardrailsConfig::default()
+    })
+    .unwrap();
+
+    assert!(!map.contains_key("version"));
+    assert_eq!(map.get("mode"), Some(&json!("local")));
+    assert_eq!(map.get("config_path"), Some(&json!("./rails")));
+    assert_eq!(map.get("tool_input"), Some(&json!(true)));
+    assert_eq!(map["local"]["python_module"], json!("custom_guardrails"));
+}
+
+#[test]
+fn write_plugin_config_round_trips_local_llm_nemo_guardrails_component() {
+    let temp = tempfile::tempdir().unwrap();
+    let path = temp.path().join("plugins.toml");
+    let config = PluginConfig {
+        components: vec![PluginComponentSpec {
+            kind: NEMO_GUARDRAILS_PLUGIN_KIND.to_string(),
+            enabled: true,
+            config: local_llm_guardrails_component_config("models: []"),
+        }],
+        ..PluginConfig::default()
+    };
+
+    write_plugin_config(&path, &config).unwrap();
+
+    let rendered = std::fs::read_to_string(&path).unwrap();
+    assert!(rendered.contains("mode = \"local\""));
+    assert!(rendered.contains("codec = \"openai_chat\""));
+    assert!(rendered.contains("input = true"));
+    assert!(rendered.contains("output = true"));
+    assert!(rendered.contains("config_yaml = \"models: []\""));
+
+    let round_tripped = read_plugin_config(&path).unwrap();
+    let guardrails = round_tripped
+        .components
+        .iter()
+        .find(|component| component.kind == NEMO_GUARDRAILS_PLUGIN_KIND)
+        .unwrap();
+    assert_eq!(guardrails.config["mode"], json!("local"));
+    assert_eq!(guardrails.config["codec"], json!("openai_chat"));
+    assert_eq!(guardrails.config["input"], json!(true));
+    assert_eq!(guardrails.config["output"], json!(true));
+    assert_eq!(guardrails.config["config_yaml"], json!("models: []"));
+    assert_eq!(
+        guardrails.config["colang_content"],
+        json!("define flow noop\n  pass")
+    );
+}
+
 #[test]
 fn display_helpers_render_scalars_json_and_defaults() {
     assert_eq!(display_value(&json!("logs")), "logs");

From e86ae58ef5e8b4d5dffab6b710db1b08d42614fc Mon Sep 17 00:00:00 2001
From: Alex Fournier <afournier@nvidia.com>
Date: Wed, 3 Jun 2026 17:11:46 -0700
Subject: [PATCH 8/8] refactor: embed local guardrails helper snapshot

Signed-off-by: Alex Fournier <afournier@nvidia.com>
---
 .../embedded_python}/_guardrails_local.py     |  11 +-
 crates/python/src/lib.rs                      |  80 +++++++----
 .../python/tests/coverage/coverage_tests.rs   | 128 ++++++++++++++++++
 docs/nemo-guardrails-plugin/configuration.mdx |   5 +-
 4 files changed, 191 insertions(+), 33 deletions(-)
 rename {python/nemo_relay => crates/python/embedded_python}/_guardrails_local.py (97%)

diff --git a/python/nemo_relay/_guardrails_local.py b/crates/python/embedded_python/_guardrails_local.py
similarity index 97%
rename from python/nemo_relay/_guardrails_local.py
rename to crates/python/embedded_python/_guardrails_local.py
index f16f839a..9f93367c 100644
--- a/python/nemo_relay/_guardrails_local.py
+++ b/crates/python/embedded_python/_guardrails_local.py
@@ -22,6 +22,7 @@
 from nemo_relay.plugin import PluginContext
 
 _DEFAULT_PRIORITY = 100
+_SUPPORTED_NEMOGUARDRAILS_VERSION = "0.22.0"
 
 
 class NeMoGuardrailsDependencyError(RuntimeError):
@@ -74,13 +75,21 @@ def _load_nemoguardrails(module_name: str | None) -> _GuardrailsRuntimeImports:
         if error.name == root_module:
             raise NeMoGuardrailsDependencyError(
                 "NeMo Guardrails is required for the built-in NeMo Guardrails local backend. "
-                "Install it with: pip install nemoguardrails"
+                f"Install it with: pip install nemoguardrails=={_SUPPORTED_NEMOGUARDRAILS_VERSION}"
             ) from error
         raise NeMoGuardrailsDependencyError(
             "NeMo Guardrails local backend could not import a required dependency: "
             f"{error.name or error}. Install the full NeMo Guardrails runtime dependencies."
         ) from error
 
+    version = getattr(guardrails, "__version__", None)
+    if version != _SUPPORTED_NEMOGUARDRAILS_VERSION:
+        raise NeMoGuardrailsDependencyError(
+            "NeMo Guardrails local backend requires nemoguardrails=="
+            f"{_SUPPORTED_NEMOGUARDRAILS_VERSION}, but found {version!r}. "
+            f"Install it with: pip install nemoguardrails=={_SUPPORTED_NEMOGUARDRAILS_VERSION}"
+        )
+
     return _GuardrailsRuntimeImports(
         rails_config_cls=guardrails.RailsConfig,
         llm_rails_cls=guardrails.LLMRails,
diff --git a/crates/python/src/lib.rs b/crates/python/src/lib.rs
index 4a40eaf8..8328c265 100644
--- a/crates/python/src/lib.rs
+++ b/crates/python/src/lib.rs
@@ -27,7 +27,9 @@ use nemo_relay::plugins::nemo_guardrails::component::{
 use nemo_relay::shared_runtime::initialize_shared_runtime_binding;
 use nemo_relay_adaptive::plugin_component::register_adaptive_component;
 use pyo3::prelude::*;
+use pyo3::types::{PyDict, PyModule};
 use serde_json::Value as Json;
+use std::ffi::CString;
 use std::path::{Path, PathBuf};
 use std::sync::Arc;
 
@@ -46,6 +48,11 @@ pub mod py_types;
 #[cfg(test)]
 mod test_support;
 
+const EMBEDDED_GUARDRAILS_LOCAL_MODULE_NAME: &str = "nemo_relay._guardrails_local";
+const EMBEDDED_GUARDRAILS_LOCAL_FILENAME: &str = "nemo_relay/_guardrails_local.py";
+const EMBEDDED_GUARDRAILS_LOCAL_SOURCE: &str =
+    include_str!("../embedded_python/_guardrails_local.py");
+
 /// The `_native` PyO3 module entry point. Registers all types and functions.
 #[pymodule]
 fn _native(m: &Bound<'_, PyModule>) -> PyResult<()> {
@@ -70,6 +77,18 @@ fn _native(m: &Bound<'_, PyModule>) -> PyResult<()> {
     py_api::register(m)?;
     py_plugin::register(m)?;
     py_adaptive::register(m)?;
+    install_native_module_alias(m)?;
+    Ok(())
+}
+
+fn install_native_module_alias(m: &Bound<'_, PyModule>) -> PyResult<()> {
+    let py = m.py();
+    let sys = py.import("sys")?;
+    let modules = sys.getattr("modules")?.cast_into::<PyDict>()?;
+    modules.set_item("nemo_relay._native", m)?;
+    if let Ok(package) = py.import("nemo_relay") {
+        let _ = package.setattr("_native", m);
+    }
     Ok(())
 }
 
@@ -109,43 +128,46 @@ fn register_python_local_guardrails_backend(
 }
 
 fn load_guardrails_local_register_fn(py: Python<'_>) -> PyResult<Bound<'_, PyAny>> {
-    let module = match py.import("nemo_relay._guardrails_local") {
-        Ok(module) => module,
-        Err(err) => {
-            if !is_missing_guardrails_local_module(py, &err)? {
-                return Err(err);
-            }
+    let module = load_embedded_guardrails_local_module(py)?;
+    module.getattr("register_local_backend")
+}
 
-            let source_python_dir = guardrails_local_source_python_dir();
-            if !source_python_dir.exists() {
-                return Err(err);
-            }
+fn load_embedded_guardrails_local_module(py: Python<'_>) -> PyResult<Bound<'_, PyModule>> {
+    ensure_nemo_relay_package_importable(py)?;
 
-            prepend_python_path_if_missing(py, &source_python_dir)?;
-            py.import("nemo_relay._guardrails_local")?
-        }
-    };
-    module.getattr("register_local_backend")
+    let sys = py.import("sys")?;
+    let modules = sys.getattr("modules")?.cast_into::<PyDict>()?;
+    if let Some(existing) = modules.get_item(EMBEDDED_GUARDRAILS_LOCAL_MODULE_NAME)? {
+        return Ok(existing.cast_into::<PyModule>()?);
+    }
+
+    let source = CString::new(EMBEDDED_GUARDRAILS_LOCAL_SOURCE)?;
+    let filename = CString::new(EMBEDDED_GUARDRAILS_LOCAL_FILENAME)?;
+    let module_name = CString::new(EMBEDDED_GUARDRAILS_LOCAL_MODULE_NAME)?;
+    let module = PyModule::from_code(py, &source, &filename, &module_name)?;
+    modules.set_item(EMBEDDED_GUARDRAILS_LOCAL_MODULE_NAME, &module)?;
+    if let Ok(package) = py.import("nemo_relay") {
+        let _ = package.setattr("_guardrails_local", &module);
+    }
+    Ok(module)
 }
 
-fn is_missing_guardrails_local_module(py: Python<'_>, err: &PyErr) -> PyResult<bool> {
-    if !err.is_instance_of::<pyo3::exceptions::PyModuleNotFoundError>(py) {
-        return Ok(false);
+fn ensure_nemo_relay_package_importable(py: Python<'_>) -> PyResult<()> {
+    if py.import("nemo_relay").is_ok() {
+        return Ok(());
     }
 
-    let err_value = err.value(py);
-    let module_name = err_value
-        .getattr("name")
-        .ok()
-        .and_then(|name| name.extract::<String>().ok());
+    let source_python_dir = embedded_guardrails_source_python_dir();
+    if !source_python_dir.exists() {
+        return Ok(());
+    }
 
-    Ok(matches!(
-        module_name.as_deref(),
-        Some("nemo_relay") | Some("nemo_relay._guardrails_local")
-    ))
+    prepend_python_path_if_missing(py, &source_python_dir)?;
+    let _ = py.import("nemo_relay")?;
+    Ok(())
 }
 
-fn guardrails_local_source_python_dir() -> PathBuf {
+fn embedded_guardrails_source_python_dir() -> PathBuf {
     PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../../python")
 }
 
@@ -155,8 +177,6 @@ fn prepend_python_path_if_missing(py: Python<'_>, path: &Path) -> PyResult<()> {
     let path_str = path.to_string_lossy();
 
     if !sys_path.contains(path_str.as_ref())? {
-        // Source-tree fallback for local development and in-repo tests where the
-        // Python package has not been installed into the active environment yet.
         sys_path.call_method1("insert", (0, path_str.as_ref()))?;
     }
 
diff --git a/crates/python/tests/coverage/coverage_tests.rs b/crates/python/tests/coverage/coverage_tests.rs
index a104d68c..75a38ea1 100644
--- a/crates/python/tests/coverage/coverage_tests.rs
+++ b/crates/python/tests/coverage/coverage_tests.rs
@@ -45,6 +45,10 @@ fn python_package_dir() -> PathBuf {
     PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../../python")
 }
 
+fn embedded_guardrails_local_source_path() -> PathBuf {
+    PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("embedded_python/_guardrails_local.py")
+}
+
 fn fake_guardrails_module_prelude(module_name: &str, python_dir: &str) -> String {
     format!(
         r#"
@@ -115,6 +119,34 @@ class Context:
 "#
 }
 
+fn embedded_guardrails_local_loader_python(source_path: &str) -> String {
+    format!(
+        r#"
+import pathlib
+import sys
+import types
+
+import nemo_relay
+
+GUARDRAILS_LOCAL_SOURCE_PATH = pathlib.Path({source_path:?})
+guardrails_local_module = types.ModuleType("nemo_relay._guardrails_local")
+guardrails_local_module.__file__ = str(GUARDRAILS_LOCAL_SOURCE_PATH)
+guardrails_local_module.__package__ = "nemo_relay"
+sys.modules["nemo_relay._guardrails_local"] = guardrails_local_module
+setattr(nemo_relay, "_guardrails_local", guardrails_local_module)
+exec(
+    compile(
+        GUARDRAILS_LOCAL_SOURCE_PATH.read_text(encoding="utf-8"),
+        str(GUARDRAILS_LOCAL_SOURCE_PATH),
+        "exec",
+    ),
+    guardrails_local_module.__dict__,
+)
+"#,
+        source_path = source_path,
+    )
+}
+
 fn with_isolated_nemo_relay_modules<T>(
     py: Python<'_>,
     native_module: &Bound<'_, PyModule>,
@@ -291,6 +323,11 @@ fn test_guardrails_local_helper_registers_and_enforces_llm_and_tool_checks() {
             );
             let epilogue = register_fake_guardrails_module_epilogue();
             let context_class = local_plugin_context_python();
+            let embedded_loader = embedded_guardrails_local_loader_python(
+                &embedded_guardrails_local_source_path()
+                    .display()
+                    .to_string(),
+            );
             let module = load_module(
                 py,
                 &format!(
@@ -310,6 +347,8 @@ class LLMRails:
 
 {epilogue}
 
+{embedded_loader}
+
 from nemo_relay._native import LLMRequest
 from nemo_relay._guardrails_local import register_local_backend
 
@@ -381,6 +420,7 @@ async def run_case():
                     prelude = prelude,
                     epilogue = epilogue,
                     context_class = context_class,
+                    embedded_loader = embedded_loader,
                 ),
             );
 
@@ -436,6 +476,86 @@ async def run_case():
     });
 }
 
+#[test]
+fn test_guardrails_local_helper_rejects_unsupported_nemoguardrails_version() {
+    let _python = crate::test_support::init_python_test();
+    Python::attach(|py| {
+        let native_module = PyModule::new(py, "_native_guardrails_version").unwrap();
+        crate::_native(&native_module).unwrap();
+
+        with_isolated_nemo_relay_modules(py, &native_module, || {
+            let python_dir = python_package_dir();
+            let prelude = fake_guardrails_module_prelude(
+                "fake_guardrails_bad_version",
+                &python_dir.display().to_string(),
+            );
+            let epilogue = register_fake_guardrails_module_epilogue();
+            let context_class = local_plugin_context_python();
+            let embedded_loader = embedded_guardrails_local_loader_python(
+                &embedded_guardrails_local_source_path()
+                    .display()
+                    .to_string(),
+            );
+            let module = load_module(
+                py,
+                &format!(
+                    r#"
+{prelude}
+
+fake_root.__version__ = "0.21.0"
+
+class LLMRails:
+    def __init__(self, config):
+        self.config = config
+
+    async def check_async(self, messages, rail_types):
+        return Result(RailStatus.PASSED)
+
+{epilogue}
+
+{embedded_loader}
+
+from nemo_relay._guardrails_local import register_local_backend
+
+{context_class}
+
+async def run_case():
+    ctx = Context()
+    register_local_backend(
+        {{
+            "mode": "local",
+            "codec": "openai_chat",
+            "config_yaml": "models: []",
+            "input": True,
+            "local": {{"python_module": MODULE_NAME}},
+        }},
+        ctx,
+    )
+"#,
+                    prelude = prelude,
+                    epilogue = epilogue,
+                    embedded_loader = embedded_loader,
+                    context_class = context_class,
+                ),
+            );
+
+            let error = with_event_loop(py, |event_loop| {
+                let coroutine = module.getattr("run_case").unwrap().call0().unwrap();
+                event_loop
+                    .call_method1("run_until_complete", (coroutine,))
+                    .unwrap_err()
+                    .to_string()
+            });
+
+            assert!(
+                error.contains("requires nemoguardrails==0.22.0"),
+                "unexpected error: {error}"
+            );
+            assert!(error.contains("0.21.0"), "unexpected error: {error}");
+        });
+    });
+}
+
 #[test]
 fn test_guardrails_local_helper_enforces_streamed_output_rails() {
     let _python = crate::test_support::init_python_test();
@@ -451,6 +571,11 @@ fn test_guardrails_local_helper_enforces_streamed_output_rails() {
             );
             let epilogue = register_fake_guardrails_module_epilogue();
             let context_class = local_plugin_context_python();
+            let embedded_loader = embedded_guardrails_local_loader_python(
+                &embedded_guardrails_local_source_path()
+                    .display()
+                    .to_string(),
+            );
             let module = load_module(
                 py,
                 &format!(
@@ -487,6 +612,8 @@ class LLMRails:
 
 {epilogue}
 
+{embedded_loader}
+
 from nemo_relay._native import LLMRequest
 from nemo_relay._guardrails_local import register_local_backend
 
@@ -583,6 +710,7 @@ async def run_case():
                     prelude = prelude,
                     epilogue = epilogue,
                     context_class = context_class,
+                    embedded_loader = embedded_loader,
                 ),
             );
 
diff --git a/docs/nemo-guardrails-plugin/configuration.mdx b/docs/nemo-guardrails-plugin/configuration.mdx
index 24cc12c6..16245f24 100644
--- a/docs/nemo-guardrails-plugin/configuration.mdx
+++ b/docs/nemo-guardrails-plugin/configuration.mdx
@@ -225,12 +225,13 @@ through the Python runtime instead of a separate Guardrails service.
 ### Requirements
 
 To use `mode = "local"`, the running Python environment must be able to import
-`nemoguardrails`.
+`nemoguardrails`, and the installed version must be exactly 0.22.0.
 
 The built-in local backend is installed by the Python binding and runs
 Guardrails in process. Use it when the runtime has direct access to the Python
 Guardrails dependency and configuration files rather than a separate Guardrails
-service.
+service. Install the tested local-mode Guardrails dependency with
+`pip install nemoguardrails==0.22.0`.
 
 The same ownership boundary still applies: