Merged
2 changes: 2 additions & 0 deletions .github/workflows/run_tests.yml
@@ -84,6 +84,8 @@ jobs:
sudo rm -rf "/usr/local/lib/android" || true
sudo rm -rf "/usr/local/share/boost" || true
sudo rm -rf "$AGENT_TOOLSDIRECTORY" || true
- name: Install system dependencies
run: sudo apt-get update && sudo apt-get install -y libportaudio2
- name: Install dependencies
uses: ./.github/actions/python-uv-setup
- name: Run core tests
2 changes: 2 additions & 0 deletions agents-core/pyproject.toml
@@ -73,6 +73,7 @@ turbopuffer = ["vision-agents-plugins-turbopuffer"]
mistral = ["vision-agents-plugins-mistral"]
assemblyai = ["vision-agents-plugins-assemblyai"]
redis = ["redis[hiredis]>=5.0.0"]
local = ["vision-agents-plugins-local"]

all-plugins = [
"vision-agents-plugins-anthropic",
@@ -107,6 +108,7 @@ all-plugins = [
"vision-agents-plugins-turbopuffer",
"vision-agents-plugins-mistral",
"vision-agents-plugins-assemblyai",
"vision-agents-plugins-local",
]

[tool.hatch.metadata]
104 changes: 104 additions & 0 deletions examples/10_local_transport_example/README.md
@@ -0,0 +1,104 @@
# Local Transport Example

This example demonstrates how to run a vision agent using local audio/video I/O (microphone, speakers, and camera) instead of a cloud-based edge network.

## Overview

The LocalEdge provides:

- **Microphone input**: Captures audio from your microphone
- **Speaker output**: Plays AI responses on your speakers
- **Camera input**: Captures video from your camera (optional)
- **Local media handling**: Audio and video stay on your machine; only the LLM, STT, and TTS services go over the network

## Running

This example uses the Gemini LLM with Deepgram STT and TTS for a voice experience, with optional camera input.

```bash
uv run python local_transport_example.py
```

## Prerequisites

1. A working microphone and speakers
2. A camera (optional, for video input)
3. API keys:
- Google AI (for Gemini LLM)
- Deepgram (for STT and TTS)

## Setup

1. Create a `.env` file with your API keys:

```bash
GOOGLE_API_KEY=your_google_api_key
DEEPGRAM_API_KEY=your_deepgram_api_key
```

2. Install dependencies:

```bash
cd examples/10_local_transport_example
uv sync
```
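Since the agent needs both keys before it can start, it can help to fail fast when one is missing. A minimal sketch of that check (`missing_keys` is an illustrative helper, not part of the plugin):

```python
def missing_keys(env, required=("GOOGLE_API_KEY", "DEEPGRAM_API_KEY")):
    """Return the names of required API keys that are absent or empty in `env`."""
    return [k for k in required if not env.get(k)]

# Typically called with os.environ after python-dotenv has loaded the .env file.
```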

## Device Selection

The example will prompt you to select:

1. **Input device** (microphone)
2. **Output device** (speakers)
3. **Video device** (camera) - can be skipped by entering 'n'

Press Enter to use the default device, or enter a number to select a specific device.
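That selection flow can be sketched as follows; `parse_device_choice` is a hypothetical helper shown only to illustrate the behavior described above:

```python
def parse_device_choice(raw: str, default_index: int, allow_skip: bool = False):
    """Interpret one menu entry: Enter -> default, 'n' -> skip, digits -> index."""
    raw = raw.strip().lower()
    if allow_skip and raw == "n":
        return None  # user skipped an optional device (e.g. the camera)
    if raw == "":
        return default_index  # plain Enter keeps the default device
    return int(raw)  # a number selects that device index
```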

Press `Ctrl+C` to stop the agent.

## Listing Audio Devices

To see available audio devices on your system:

```python
from vision_agents.plugins.local.devices import list_audio_input_devices, list_audio_output_devices

list_audio_input_devices()
list_audio_output_devices()
```

## Configuration

You can customize the audio settings when creating the LocalEdge:

```python
from vision_agents.plugins.local import LocalEdge
from vision_agents.plugins.local.devices import (
select_audio_input_device,
select_audio_output_device,
)

input_device = select_audio_input_device()
output_device = select_audio_output_device()

edge = LocalEdge(
audio_input=input_device, # AudioInputDevice (microphone)
audio_output=output_device, # AudioOutputDevice (speakers)
)
```

## Troubleshooting

### No audio input/output

1. Check that your microphone and speakers are properly connected
2. Run `list_audio_input_devices()` or `list_audio_output_devices()` to see available devices
3. Try specifying explicit device indices in the LocalEdge constructor
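One way to find an explicit index is to match on the device name. The sketch below assumes devices shaped like the entries `sounddevice.query_devices()` returns (`name`, `max_input_channels`, `max_output_channels`); `pick_by_name` itself is an illustrative helper, not part of the plugin:

```python
def pick_by_name(devices, substring, kind="input"):
    """Index of the first device whose name contains `substring` (case-insensitive)
    and that has at least one channel of the requested kind, else None."""
    key = "max_input_channels" if kind == "input" else "max_output_channels"
    for idx, dev in enumerate(devices):
        if substring.lower() in dev["name"].lower() and dev.get(key, 0) > 0:
            return idx
    return None
```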

### Audio quality issues

- Try increasing the `blocksize` parameter for smoother audio
- Ensure your microphone isn't picking up too much background noise
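The `blocksize` tradeoff is simple arithmetic: larger blocks are less likely to underrun, but each block adds more latency. A hypothetical helper for reasoning about it (assumes a 48 kHz sample rate unless stated otherwise):

```python
def block_latency_ms(blocksize: int, sample_rate: int = 48_000) -> float:
    """Latency contributed by one audio block, in milliseconds."""
    return blocksize / sample_rate * 1000
```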

### Permission errors

On macOS, you may need to grant microphone permissions to your terminal application.
1 change: 1 addition & 0 deletions examples/10_local_transport_example/__init__.py
@@ -0,0 +1 @@
# Local Transport Example
106 changes: 106 additions & 0 deletions examples/10_local_transport_example/local_transport_example.py
@@ -0,0 +1,106 @@
"""
Local Transport Example
Demonstrates using LocalTransport for local audio/video I/O with vision agents.
This enables running agents using your microphone, speakers, and camera without
cloud-based edge infrastructure.
Usage:
uv run python local_transport_example.py run
Requirements:
- Working microphone and speakers
- Optional: Camera for video input
- API keys for Gemini, Deepgram, and ElevenLabs in .env file
"""

import logging
from typing import Any

from dotenv import load_dotenv
from vision_agents.core import Agent, AgentLauncher, Runner, User
from vision_agents.core.utils.examples import get_weather_by_location
from vision_agents.plugins import deepgram, gemini
from vision_agents.plugins.local import LocalEdge
from vision_agents.plugins.local.devices import (
select_audio_input_device,
select_audio_output_device,
select_video_device,
)

logger = logging.getLogger(__name__)

load_dotenv()

INSTRUCTIONS = (
"You're a helpful voice AI assistant running on the user's local machine. "
"Keep responses short and conversational. Don't use special characters or "
"formatting. Be friendly and helpful."
)


def setup_llm(model: str = "gemini-3.1-flash-lite-preview") -> gemini.LLM:
llm = gemini.LLM(model)

@llm.register_function(description="Get current weather for a location")
async def get_weather(location: str) -> dict[str, Any]:
return await get_weather_by_location(location)

return llm


async def create_agent() -> Agent:
    llm = setup_llm()

    # input_device, output_device, and video_device are module-level globals
    # assigned in the __main__ block before the Runner invokes create_agent.
    if input_device is None:
raise RuntimeError("No audio input device available")
if output_device is None:
raise RuntimeError("No audio output device available")

logger.info(f"Using input: {input_device.name} ({input_device.sample_rate}Hz)")
logger.info(f"Using output: {output_device.name} ({output_device.sample_rate}Hz)")
if video_device:
logger.info(f"Using video device: {video_device.name}")

transport = LocalEdge(
audio_input=input_device,
audio_output=output_device,
video_input=video_device,
)

agent = Agent(
edge=transport,
agent_user=User(name="Local AI Assistant", id="local-agent"),
instructions=INSTRUCTIONS,
processors=[],
llm=llm,
tts=deepgram.TTS(),
stt=deepgram.STT(),
)

return agent


async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs: Any) -> None:
call = await agent.edge.create_call(call_id)
async with agent.join(call=call, participant_wait_timeout=0):
await agent.simple_response("Greet the user briefly")
await agent.finish()


if __name__ == "__main__":
print("\n" + "=" * 60)
print("Local Transport Voice Agent")
print("=" * 60)
print("\nThis agent uses your local microphone, speakers, and optionally camera.")

input_device = select_audio_input_device()
output_device = select_audio_output_device()
video_device = select_video_device()

print("Speak into your microphone to interact with the AI.")
if video_device:
print("Camera is enabled for video input.")
print("Press Ctrl+C to stop.\n")

Runner(AgentLauncher(create_agent=create_agent, join_call=join_call)).cli()
18 changes: 18 additions & 0 deletions examples/10_local_transport_example/pyproject.toml
@@ -0,0 +1,18 @@
[project]
name = "local-transport-example"
version = "0.0.0"
requires-python = ">=3.10"

# Dependencies for local audio transport
dependencies = [
"python-dotenv>=1.0",
"vision-agents-plugins-deepgram",
"vision-agents-plugins-gemini",
"vision-agents-plugins-local",
]

[tool.uv.sources]
"vision-agents-plugins-deepgram" = { path = "../../plugins/deepgram", editable = true }
"vision-agents-plugins-gemini" = { path = "../../plugins/gemini", editable = true }
"vision-agents-plugins-local" = { path = "../../plugins/local", editable = true }
"vision-agents" = { path = "../../agents-core", editable = true }
13 changes: 9 additions & 4 deletions plugins/deepgram/vision_agents/plugins/deepgram/deepgram_stt.py
Expand Up @@ -2,7 +2,7 @@
import logging
import os
import time
from typing import Any, Optional
from typing import Any, AsyncContextManager, Optional

from deepgram import AsyncDeepgramClient
from deepgram.core import EventType
@@ -79,7 +79,9 @@ def __init__(
self._current_participant: Optional[Participant] = None
self.connection: Optional[AsyncV2SocketClient] = None
self._connection_ready = asyncio.Event()
self._connection_context: Optional[Any] = None
self._connection_context: Optional[AsyncContextManager[AsyncV2SocketClient]] = (
None
)
self._listen_task: Optional[asyncio.Task[Any]] = None
# Track when audio processing started for latency measurement
self._audio_start_time: Optional[float] = None
@@ -297,12 +299,15 @@ async def close(self):
# Close connection
if self.connection and self._connection_context:
try:
# Handle API differences between deepgram-sdk versions
close_msg = ListenV2CloseStream(type="CloseStream")
await self.connection.send_close_stream(close_msg)
except Exception as exc:
logger.warning(f"Error sending close stream to Deepgram: {exc}")

try:
await self._connection_context.__aexit__(None, None, None)
except Exception as exc:
logger.warning(f"Error closing Deepgram websocket connection: {exc}")
logger.warning(f"Error closing Deepgram connection context: {exc}")
finally:
self.connection = None
self._connection_context = None
Empty file added plugins/local/README.md
Empty file.
Empty file added plugins/local/py.typed
Empty file.
42 changes: 42 additions & 0 deletions plugins/local/pyproject.toml
@@ -0,0 +1,42 @@
[build-system]
requires = ["hatchling", "hatch-vcs"]
build-backend = "hatchling.build"

[project]
name = "vision-agents-plugins-local"
dynamic = ["version"]
description = "Local audio & video integration for Vision Agents"
readme = "README.md"
keywords = ["local", "AI", "voice agents", "agents"]
requires-python = ">=3.10"
license = "MIT"
dependencies = [
"vision-agents",
"sounddevice>=0.5.0",
"aiortc>=1.14.0, <1.15.0",
"av>=14.2.0, <17",
]

[project.urls]
Documentation = "https://visionagents.ai/"
Website = "https://visionagents.ai/"
Source = "https://github.com/GetStream/Vision-Agents"

[tool.hatch.version]
source = "vcs"
raw-options = { root = "..", search_parent_directories = true, fallback_version = "0.0.0" }

[tool.hatch.build.targets.wheel]
packages = ["."]

[tool.hatch.build.targets.sdist]
include = ["/vision_agents"]

[tool.uv.sources]
vision-agents = { workspace = true }

[dependency-groups]
dev = [
"pytest>=8.4.1",
"pytest-asyncio>=1.0.0",
]
Empty file added plugins/local/tests/__init__.py
Empty file.