xCatG · xCatG · Aug 26, 2025 · Aug 16, 2025 · Aug 24, 2025 · Aug 24, 2025
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -231,6 +231,7 @@ make test-specific TEST_PATH="test/python/unit/chat/test_chat_logger.py"
 - [x] `make test` - Full test suite with coverage reporting and 25% minimum threshold
 - [x] `make test-quiet` - Quiet mode execution for faster feedback
 - [x] `make test-chat` - Chat module specific testing with dedicated coverage
+- [x] `make test-voice` - Voice module specific testing with dedicated coverage
 - [x] `make test-unit` - Unit tests only for focused testing
 - [x] `make test-integration` - Integration tests for service interactions
 - [x] `make test-coverage-html` - HTML coverage reports for detailed analysis
@@ -259,6 +260,21 @@ make test-specific TEST_PATH="test/python/unit/chat/test_chat_logger.py"
 - [x] **Impact**: ~300 lines duplicate code eliminated, better maintainability, all 241 tests passing
 - [x] **Critical Fix**: Resolved frontend data loading issues in Phase 3 refactoring (API response handling bugs)
 
+### Voice Chat & Audio Debugging (Completed)
+- [x] **WebSocket Voice Handler**: Real-time bidirectional audio streaming with ADK integration
+- [x] **PCM Audio Logging**: Environment-based audio recording for debugging (dev/beta only)
+- [x] **Binary Data Fix**: Corrected PCM audio storage from `write()` to `write_bytes()` for proper binary handling
+- [x] **Audio Debug Utility**: `debug_audio.py` script for reassembling and analyzing recorded PCM chunks:
+  - `info <session_dir>` - Show audio session statistics and timing
+  - `reassemble <session_dir>` - Combine PCM chunks into playable WAV files
+  - `play <session_dir>` - Playback reassembled audio for debugging
+- [x] **Audio Format Support**: 16-bit PCM, 16kHz, mono format matching Gemini Live API requirements
+- [x] **Testing Infrastructure**: Comprehensive voice backend testing suite with WebSocket simulation
+- [x] **Unit Tests**: Comprehensive test coverage for voice handler methods, PCM logging, audio processing
+- [x] **Integration Tests**: WebSocket connection testing, mixed text/audio sessions, PCM file verification
+- [x] **Debug Utility Tests**: Complete test suite for audio reassembly and WAV generation functions
+- [x] **Documentation**: Complete README.md with usage examples and troubleshooting guides
+
 
 ## Implementation Phases
 1. Core Infrastructure → 2. Authentication → 3. Handlers → 4. WebSocket/Audio → 5. Polish
@@ -270,6 +286,7 @@ make test-specific TEST_PATH="test/python/unit/chat/test_chat_logger.py"
 - **Server**: FastAPI with stateless handlers, JWT auth, CORS, environment configs
 - **Auth**: RoleChecker pattern (replaced decorators), role hierarchy, proper HTTP codes, language preferences
 - **Chat**: ADK integration, JSONL logging, singleton services, POC endpoints, language-aware content, refactored for maintainability, centralized agent configuration
+- **Voice Chat**: WebSocket-based real-time audio streaming with Gemini Live API integration, PCM audio debugging utilities
 - **Evaluation**: AI agent evaluation system with persistent storage, comprehensive error handling, session validation, and resource cleanup
 - **Testing**: 260+ tests, language functionality coverage (ContentLoader, auth, models), evaluation module unit tests, ResourceLoader version validation tests, comprehensive Makefile targets
 - **Frontend**: Vue.js auth UI, i18n with Traditional Chinese, language switcher, reusable composables, dual-flow session creation

diff --git a/Makefile b/Makefile
@@ -484,6 +484,11 @@ test-no-coverage:
 	@echo "Running tests without coverage (faster)..."
 	@bash -c "source venv/bin/activate && python -m pytest test/python/ -v"
 
+.PHONY: test-voice
+test-voice:
+	@echo "Running voice-related tests with coverage..."
+	@bash -c "source venv/bin/activate && python -m pytest test/python/ -k 'voice' --cov=src/python/role_play/voice --cov-report=term-missing --cov-fail-under=0"
+
 .PHONY: test-specific
 test-specific:
 ifndef TEST_PATH

diff --git a/config/dev.yaml b/config/dev.yaml
@@ -61,6 +61,7 @@ enabled_handlers:
   evaluation: "role_play.evaluation.handler.EvaluationHandler"
   # Add more handlers as they're implemented:
   # scripter: "role_play.scripter.handler.ScripterHandler"
+  voice: "role_play.voice.handler.VoiceHandler"
 
 # Language configuration
 supported_languages:

diff --git a/data/dev_data b/data/dev_data
@@ -0,0 +1 @@
+/home/yenchi/data/rps_dev
diff --git a/src/python/requirements-test.txt b/src/python/requirements-test.txt
@@ -4,3 +4,4 @@ pytest-asyncio
 pytest-cov
 httpx
 factory_boy
+websockets
diff --git a/src/python/role_play/chat/chat_logger.py b/src/python/role_play/chat/chat_logger.py
@@ -1,5 +1,6 @@
 """Service for logging chat sessions to JSONL files using storage backend."""
 import json
+import os
 import uuid
 from typing import Dict, List, Tuple, Any, Optional
 import logging
@@ -456,4 +457,147 @@ async def export_session_text(self, user_id: str, session_id: str, export_format
             lines.append("SESSION ACTIVE OR NOT PROPERLY ENDED")
         lines.append("=" * 70)
 
-        return "\n".join(lines)
+        return "\n".join(lines)
+
+    async def log_voice_message(
+            self,
+            user_id: str,
+            session_id: str,
+            role: str,
+            transcript_text: str,
+            duration_ms: int,
+            confidence: float,
+            message_number: int,
+            voice_metadata: Optional[Dict[str, Any]] = None
+    ) -> None:
+        """
+        Appends a ``voice_message`` event to an existing session log.
+
+        Args:
+            user_id: The user ID who owns the session.
+            session_id: The application session ID.
+            role: The role of the message sender (e.g., "participant", "character").
+            transcript_text: The transcribed message content.
+            duration_ms: The message duration in ms.
+            confidence: Confidence score of the transcription (0.0-1.0).
+            message_number: The sequential number of the message in the session.
+            voice_metadata: Optional extra fields; they never override the values above.
+
+        Raises:
+            StorageError: If the session log file does not exist.
+        """
+        storage_path = self._get_chat_log_path(user_id, session_id)
+        if not await self.storage.exists(storage_path):
+            logger.error(f"Log file {storage_path} does not exist. Cannot log message.")
+            raise StorageError(f"Session log file not found: {storage_path}")
+
+        # The explicit arguments are authoritative: caller-supplied metadata may
+        # add keys but must not clobber duration_ms / confidence / is_voice.
+        metadata = {"duration_ms": duration_ms, "confidence": confidence, "is_voice": True}
+        for key, value in (voice_metadata or {}).items():
+            metadata.setdefault(key, value)
+        message_event = {
+            "type": "voice_message",
+            "timestamp": utc_now_isoformat(),
+            "app_session_id": session_id,
+            "role": role,
+            "content": transcript_text,
+            "message_number": message_number,
+            "voice_metadata": metadata,
+        }
+
+        try:
+            async with self.storage.lock(storage_path):
+                event_line = json.dumps(message_event) + '\n'
+                await self.storage.append(storage_path, event_line)
+            logger.debug(f"Logged voice message to {storage_path} (Msg#: {message_number}, Role: {role}, Duration: {duration_ms}ms)")
+        except Exception as e:
+            logger.error(f"Error logging message to {storage_path}: {e}")
+            raise
+
+    async def log_pcm_audio(
+        self,
+        user_id: str,
+        session_id: str,
+        audio_data: bytes
+    ) -> None:
+        """
+        Stores one raw PCM audio chunk as its own file, for debugging only.
+
+        Nothing is written when the ``ENV`` variable (default "dev", compared
+        case-insensitively) is "prod" or "production". Exceptions raised while
+        writing are logged and swallowed rather than propagated to the caller.
+
+        Args:
+            user_id: The user ID who owns the session.
+            session_id: The application session ID.
+            audio_data: The raw PCM audio data as bytes.
+        """
+        # Defence in depth: refuse to record audio in production even when a
+        # caller's own environment check was skipped or bypassed.
+        current_env = os.environ.get("ENV", "dev").lower()
+        if current_env in ("prod", "production"):
+            logger.debug("PCM audio logging disabled in production environment")
+            return
+
+        # Swap ":" and "+" out of the ISO timestamp before using it in a filename.
+        file_stamp = utc_now_isoformat().replace(":", "-").replace("+", "_")
+        chunk_path = f"users/{user_id}/voice_logs/{session_id}/audio_in_{file_stamp}.pcm"
+
+        try:
+            # No lock is taken: the path is timestamped, so it is assumed to be
+            # a new file rather than one shared with another writer.
+            await self.storage.write_bytes(chunk_path, audio_data)
+            logger.debug(f"Logged {len(audio_data)} bytes of PCM audio to {chunk_path}")
+        except Exception as e:
+            # Best-effort debug aid: a failure here is reported but never
+            # re-raised, so it cannot interrupt the main voice flow.
+            logger.error(f"Error logging PCM audio to {chunk_path}: {e}")
+
+    async def log_voice_session_start(self, user_id: str, session_id: str, voice_config: Dict[str, str]) -> None:
+        """Appends a ``voice_session_start`` event (with ``voice_config``) to the session log.
+
+        Raises StorageError if the session log file does not exist.
+        """
+        log_path = self._get_chat_log_path(user_id, session_id)
+        if not await self.storage.exists(log_path):
+            logger.error(f"Log file {log_path} does not exist. Cannot log message.")
+            raise StorageError(f"Session log file not found: {log_path}")
+
+        start_event = {
+            "type": "voice_session_start",
+            "timestamp": utc_now_isoformat(),
+            "app_session_id": session_id,
+            "voice_config": voice_config,
+        }
+        try:
+            async with self.storage.lock(log_path):
+                await self.storage.append(log_path, json.dumps(start_event) + '\n')
+            logger.info(f"Logged voice session start for {session_id}")
+        except Exception as e:
+            logger.error(f"Error logging voice session start for {session_id}: {e}")
+            raise
+
+    async def log_voice_session_end(self, user_id: str, session_id: str, voice_stats: dict) -> None:
+        """Appends a ``voice_session_end`` event (with ``voice_stats``) to the session log.
+
+        A missing log file is only warned about, never raised; append errors are re-raised.
+        """
+        log_path = self._get_chat_log_path(user_id, session_id)
+        if not await self.storage.exists(log_path):
+            logger.warning(f"Log file {log_path} does not exist for voice session end.")
+            return  # Not an error here: the session might already be deleted
+
+        end_event = {
+            "type": "voice_session_end",
+            "timestamp": utc_now_isoformat(),
+            "app_session_id": session_id,
+            "voice_stats": voice_stats,
+        }
+        try:
+            async with self.storage.lock(log_path):
+                await self.storage.append(log_path, json.dumps(end_event) + '\n')
+            logger.info(f"Logged voice session end for {session_id}")
+        except Exception as e:
+            logger.error(f"Error logging voice session end for {session_id}: {e}")
+            raise
diff --git a/src/python/role_play/chat/models.py b/src/python/role_play/chat/models.py
@@ -71,6 +71,8 @@ class CharacterInfo(BaseModel):
     id: str
     name: str = Field(description="The name of the character.")
     description: str = Field(description="The description of the character. Could contain age, gender, character traits or brief bio")
+    # TODO: add voice_id to the character JSON data; also decide how voice IDs map between different voice services
+    voice_id: Optional[str] = Field(default=None, description="Optional voice ID, only used in voice sessions")
 
 class CharacterListResponse(BaseResponse):
     """Response containing list of characters."""

diff --git a/src/python/role_play/common/models.py b/src/python/role_play/common/models.py
@@ -130,3 +130,10 @@ class Environment(str, Enum):
     DEV = "dev"
     BETA = "beta"
     PROD = "prod"
+
+
+class EnvironmentInfo(BaseModel):
+    """Describes the current deployment environment: its name plus production/development flags."""
+    name: Environment  # the Environment member this info describes
+    is_production: bool = Field(description="True if the environment is production, False otherwise.")
+    is_development: bool = Field(description="True if the environment is development, False otherwise.")
diff --git a/src/python/role_play/dev_agents/roleplay_agent/agent.py b/src/python/role_play/dev_agents/roleplay_agent/agent.py
@@ -46,7 +46,7 @@ def __init__(self, **kwargs):
 
 # --- Configuration Export for Production ---
 
-async def get_production_agent(character_id: str, scenario_id: str, language: str = "en", scripted: bool = False) -> Optional[Agent]:
+async def get_production_agent(character_id: str, scenario_id: str, language: str = "en", scripted: bool = False, agent_model: str = AGENT_MODEL) -> Optional[Agent]:
     """
     Creates a production-ready RolePlayAgent for a specific
     character, scenario, and language.
@@ -55,6 +55,8 @@ async def get_production_agent(character_id: str, scenario_id: str, language: st
         character_id: The ID of the character
         scenario_id: The ID of the scenario
         language: The language code (e.g., "en", "zh-TW", "ja")
+        scripted: whether the session is scripted or not
+        agent_model: id of the llm model to use
 
     Returns:
         A configured RolePlayAgent instance or None if character/scenario not found
@@ -103,7 +105,7 @@ async def get_production_agent(character_id: str, scenario_id: str, language: st
     # Create and return the configured agent
     return RolePlayAgent(
         name=f"roleplay_{character_id}_{scenario_id}",
-        model=AGENT_MODEL,
+        model=agent_model,
         description=f"Roleplay agent for {character.get('name', 'Unknown Character')} in {scenario.get('name', 'Unknown Scenario')}",
         instruction=prod_prompt
     )

diff --git a/src/python/role_play/server/dependencies.py b/src/python/role_play/server/dependencies.py
@@ -11,7 +11,7 @@
 from ..common.auth import AuthManager
 from ..common.storage import StorageBackend, FileStorage, FileStorageConfig, LockConfig
 from ..common.storage_factory import create_storage_backend
-from ..common.models import User, UserRole, Environment
+from ..common.models import User, UserRole, Environment, EnvironmentInfo
 from ..common.exceptions import AuthenticationError, TokenExpiredError
 from .config_loader import get_config, ServerConfig
 from ..chat.chat_logger import ChatLogger
@@ -21,6 +21,26 @@
 logger = logging.getLogger(__name__)
 
 
+@lru_cache(maxsize=None)
+def get_environment_info() -> EnvironmentInfo:
+    """Describe the deployment environment named by ``ENV`` (default "dev"); cached after first call.
+
+    ``ENV`` is trimmed and lower-cased first, so ``PROD`` resolves to PROD, not the unknown-name DEV fallback.
+    """
+    env_str = os.getenv("ENV", "dev").strip().lower()
+    try:
+        env_enum = Environment(env_str)
+    except ValueError:
+        logger.warning(f"Unknown environment '{env_str}', defaulting to DEV")
+        env_enum = Environment.DEV
+
+    return EnvironmentInfo(
+        name=env_enum,
+        is_production=(env_enum == Environment.PROD),
+        is_development=(env_enum == Environment.DEV),
+    )
+
+
 @lru_cache(maxsize=None)
 def get_server_config() -> ServerConfig:
     """Provides the global server configuration."""

diff --git a/src/python/role_play/voice/__init__.py b/src/python/role_play/voice/__init__.py