diff --git a/.gitignore b/.gitignore index b5baafb..08b8c91 100644 --- a/.gitignore +++ b/.gitignore @@ -10,10 +10,24 @@ .env* !.env*.example -# # Old expo -# .expo/ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +venv/ +.venv/ # uv .uv/ -# Agent environment variables -agents/.env + +# SSL certificates +*.pem + +# Database +*.db +*.sqlite + +# Logs +*.log diff --git a/README.md b/README.md index d6b53f1..e6b87b1 100644 --- a/README.md +++ b/README.md @@ -1,151 +1,125 @@ -# Blindsighted (Sample App) +# Julie -**A hackathon-ready template for building AI-powered experiences with Ray-Ban Meta smart glasses.** +**AI-powered shopping assistant for visually impaired users.** -Blindsighted is a **sample app** that connects Ray-Ban Meta smart glasses to AI agents via LiveKit. The context is for a visual assistance app for blind/visually impaired users, but the architecture works for any AI-powered glasses experience. +## The Problem -The integration setup with Meta's wearables SDK and LiveKit streaming was finicky to get right. This template gives you a working foundation so you can skip that part and jump straight to the interesting bits. +Grocery shopping is a significant challenge for blind and visually impaired individuals. Identifying products on shelves, reading labels, and locating specific items typically requires assistance from others—limiting independence and privacy. -## Architecture Overview +## The Solution -``` -iOS App (Swift) → LiveKit Cloud (WebRTC) → AI Agents (Python) - ↓ ↑ - └──────→ FastAPI Backend (optional) ───────┘ - (sessions, storage, etc.) 
-``` - -**Three independent components:** - -- **`ios/`** - Native iOS app using Meta Wearables DAT SDK - - - Streams video/audio from Ray-Ban Meta glasses to LiveKit - - Receives audio/data responses from agents - - Works standalone if you just want to test the glasses SDK +Julie combines **Ray-Ban Meta smart glasses** with **AI vision and voice** to give users complete autonomy when shopping, providing them with enough information to make qualitative, subjective choices about product selection. No screen interaction required—everything works through natural voice and audio feedback. -- **`agents/`** - LiveKit agents (Python) +## How It Works - - Join LiveKit rooms as peers - - Process live video/audio streams with AI models - - Send responses back via audio/video/data channels - - **This is where the magic happens** - build your AI features here +1. **Point** — User faces a shelf wearing the glasses +2. **Scan** — Gemini [via ElevenLabs TTS] guides positioning until the full shelf is visible +3. **Identify** — Gemini identifies all products +4. **Discuss** — User has a back-and-forth conversation with the ElevenLabs Agent to choose an item +5. **Reach** — AI guides their hand directly to the product using real-time camera feedback -- **`api/`** - FastAPI backend (Python) - - Session management and room creation - - R2 storage for life logs and replays - - Optional but useful for anything ad hoc you need a backend for +The entire experience is **eyes-free**. -**You can work on just one part.** Want to build a cool agent but not touch iOS? Great. Want to experiment with the glasses SDK without running agents? Also fine. Want to add interesting storage/indexing features? The backend's there for you. 
+## Key Features +- **Voice-first interaction** — No buttons, no screens, just conversation +- **Real-time guidance** — Continuous audio feedback using clock positions ("move to 2 o'clock") +- **Product identification** — Recognizes items, brands, prices, and shelf locations +- **Hand guidance** — Guides user's hand to the exact product location +- **Works with existing hardware** — Ray-Ban Meta glasses + iPhone -## Quick Start - -### iOS App +## System Architecture -```bash -cd ios -open Blindsighted.xcodeproj -# Build and run in Xcode (⌘R) ``` - -**Requirements**: Xcode 26.2+, iOS 17.0+, Swift 6.2+ - -See [ios/README.md](ios/README.md) for detailed setup. - -### Agents - -```bash -cd agents -uv sync -uv run example_agent.py dev + 👓 RAY-BAN META GLASSES + │ + │ photos + ▼ + ┌─────────────┐ + │ iOS App │ + └──────┬──────┘ + │ + ┌───────────────────────┼───────────────────────┐ + │ │ │ + ▼ ▼ ▼ + ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ + │ LOW photos │ │ HIGH photo │ │ LOW photos │ + │ (position) │ │ (identify) │ │ (guidance) │ + └──────┬──────┘ └──────┬──────┘ └──────┬──────┘ + │ │ │ + ▼ ▼ ▼ +┌──────────────────────────────────────────────────────────────────────────────┐ +│ GEMINI VISION AI │ +│ │ +│ ① Navigation Mode ② Identification Mode ③ Hand Guidance Mode │ +│ "Move camera right" "Found 12 products" "Move hand to 2 o'clock" │ +│ │ │ │ │ +│ ▼ ▼ ▼ │ +│ 🔊 TTS Audio CSV Product List 🔊 TTS Audio │ +└─────────┬───────────────────────┬───────────────────────────┬────────────────┘ + │ │ │ + │ ▼ │ + │ ┌─────────────────────────────────┐ │ + │ │ FASTAPI BACKEND │ │ + │ │ │ │ + │ │ POST /csv/upload ←── Gemini │ │ + │ │ GET /csv/get-summary ──→ 11L │ │ + │ │ POST /user-choice ←── 11L │◄──────────┘ + │ │ GET /user-choice/latest ──→ Gemini │ + │ │ │ │ + │ └────────────────┬────────────────┘ │ + │ │ │ + │ ▼ │ + │ ┌─────────────────────────────────┐ │ + │ │ ELEVENLABS CONVERSATIONAL AI │ │ + │ │ │ │ + │ │ 🎤 User: "What's available?" 
│ │ + │ │ 📋 Agent: Reads product list │ │ + │ │ 🎤 User: "I want the Coca Cola"│ │ + │ │ ✅ Agent: Posts choice to API ─┼───────────┘ + │ │ │ triggers ③ + │ └─────────────────────────────────┘ + │ + ▼ + 🔊 AUDIO OUTPUT (via glasses speakers) ``` -**Test without hardware**: Use the [LiveKit Agents Playground](https://agents-playground.livekit.io/) to test agents with your webcam/microphone instead of glasses. +**Flow Summary:** +1. **LOW photos** → Gemini guides camera positioning → Audio feedback +2. **HIGH photo** → Gemini identifies products → CSV uploaded to API +3. **ElevenLabs Agent** reads products, user selects via voice → Choice posted to API +4. **LOW photos** → Gemini reads user choice from API → Hand guidance mode → Audio feedback -See [agents/README.md](agents/README.md) for agent development. +| Component | Purpose | +|-----------|---------| +| `ios/` | Captures photos from Ray-Ban Meta glasses | +| `agents/` | Gemini AI for vision analysis + ElevenLabs TTS for audio output | +| `api/` | Backend storing product data and user selections | -### API Backend (Optional) +## Quick Start ```bash -cd api -uv sync -uv run main.py -``` - -API docs at `http://localhost:8000/docs` - -## What's Included - -### iOS App Features - -- Live video streaming from Ray-Ban Meta glasses -- Audio routing to/from glasses (left/right channel testing) -- Photo capture during streaming -- Video recording and local storage -- Video gallery with playback -- LiveKit integration with WebRTC -- Share videos/photos +# API +cd api && uv sync && uv run main.py -### Agent Template +# Agent +cd agents && uv sync && uv run shelf_assistant.py -- LiveKit room auto-join based on session -- Audio/video stream processing -- AI model integration examples (vision, TTS) -- Bidirectional communication (receive video, send audio) - -### Backend API - -- Session management endpoints -- LiveKit room creation with tokens -- R2 storage integration for life logs -- FastAPI with dependency injection patterns 
- -## Use It Your Way - -**Feel free to:** - -- Rip out everything you don't need -- Replace the AI models with your own -- Change the entire agent architecture -- Use a different backend (or no backend) -- Build something completely different on top of the glasses SDK - -**This is over-engineered for a hackathon.** The three-component architecture exists because I found the initial setup painful and wanted to provide options. If you have a better approach or this feels too complicated, throw it away! The point is to give you working examples to learn from, not to force an architecture on you. - -## Environment Variables & API Keys - -The app needs a few API keys to work: - -- **LiveKit**: Server URL, API key, API secret (for WebRTC streaming) -- **OpenRouter API Key** (optional, for AI models) -- **ElevenLabs API Key** (optional, for TTS) - -**Having trouble getting something running?** Reach out and I'll unblock you. +# iOS +cd ios && open Blindsighted.xcodeproj +``` -See `ios/Config.xcconfig.example` and `api/.env.example` for configuration details. +**Required API keys** (in `.env` files): +- `GOOGLE_API_KEY` — Gemini vision AI +- `ELEVENLABS_API_KEY` — Voice synthesis -## Documentation +## Accessibility by Design -- **CLAUDE.md** - Full development guide with architecture details, code patterns, troubleshooting -- **ios/README.md** - iOS-specific setup and configuration -- **agents/README.md** - Agent development guide -- **api/** - Backend API with OpenAPI docs at `/docs` +- **No visual interface required** — All feedback is audio +- **Natural language** — "I want the orange juice" not menu navigation +- **Spatial audio cues** — Clock positions for intuitive direction +- **Confirmation feedback** — "Got it!" when item is reached +- **Error recovery** — Graceful re-prompting if something goes wrong ## License -**In short:** Keep it open source, it's fine to make money with it. I'd love to see what you build with it. 
- -**Exception**: The iOS app incorporates sample code from Meta's [meta-wearables-dat-ios](https://github.com/facebook/meta-wearables-dat-ios) repository, which has its own license terms. Check that repo for Meta's SDK license. - -## Why Does This Exist? - -I built this because: - -1. Getting Meta's wearables SDK working took a bit of time without being 'fun'. -2. Originally I had custom WebRTC streaming (which took a lot of time); Pentaform showed me LiveKit which seems much more suitable for a hackathon use-case so I swapped over to that for this project, but also has it's own pain points. -3. Unlikely typical hackathons which are one-and-done, it'd be great to have something people can iterate on. - -If this helps you build something cool, that's awesome. If you find a better way to do any of this, even better. - -## Contributing - -Found a bug? Have a better pattern? PRs welcome. This is meant to help people, so improvements that make it easier to use or understand are great. +MIT License — See [LICENSE](LICENSE) diff --git "a/Screenshot 2026-01-17 at 1.46.20\342\200\257PM.png" "b/Screenshot 2026-01-17 at 1.46.20\342\200\257PM.png" new file mode 100644 index 0000000..e8621eb Binary files /dev/null and "b/Screenshot 2026-01-17 at 1.46.20\342\200\257PM.png" differ diff --git a/agents/__init__.py b/agents/__init__.py index 1fade03..d7a3cb1 100644 --- a/agents/__init__.py +++ b/agents/__init__.py @@ -1,5 +1,5 @@ -"""LiveKit Agents for Blindsighted - Vision-based AI assistance.""" +"""Julie Agents - Gemini-powered vision assistance for supermarket shopping.""" -from agents.vision_agent import VisionAssistant, vision_agent, server +from shelf_assistant import ShelfAssistant, LocalPhotoManager -__all__ = ["VisionAssistant", "vision_agent", "server"] +__all__ = ["ShelfAssistant", "LocalPhotoManager"] diff --git a/agents/config.py b/agents/config.py index dcc2041..403cfd3 100644 --- a/agents/config.py +++ b/agents/config.py @@ -10,22 +10,15 @@ class 
Settings(BaseSettings): extra="ignore", ) - # LiveKit Agent Configuration - livekit_agent_name: str = "vision-agent" - livekit_url: str = "" - livekit_api_key: str = "" - livekit_api_secret: str = "" + # Google AI API (for Gemini) + google_api_key: str = "" - # OpenRouter API - openrouter_api_key: str = "" - openrouter_base_url: str = "https://openrouter.ai/api/v1" + # API Backend URL + api_base_url: str = "https://localhost:8000" - # ElevenLabs API + # ElevenLabs Conversational AI (for reference) elevenlabs_api_key: str = "" - elevenlabs_voice_id: str = "21m00Tcm4TlvDq8ikWAM" # Rachel voice - - # Deepgram API - deepgram_api_key: str = "" + elevenlabs_agent_id: str = "agent_0701kf5rm5s6f7jtnh7swk9nkx0a" settings = Settings() diff --git a/agents/example_agent.py b/agents/example_agent.py deleted file mode 100644 index 9466b3a..0000000 --- a/agents/example_agent.py +++ /dev/null @@ -1,274 +0,0 @@ -"""LiveKit Agent for vision-based scene description using streaming STT-LLM-TTS pipeline.""" - -import asyncio -import logging - -from livekit import rtc -from livekit.agents import ( - Agent, - AgentServer, - AgentSession, - JobContext, - JobRequest, - WorkerOptions, - cli, - get_job_context, - llm, -) -from livekit.agents.metrics.base import TTSMetrics -from livekit.agents.voice.events import ConversationItemAddedEvent, SpeechCreatedEvent -from livekit.plugins import deepgram, elevenlabs, openai, silero -from loguru import logger - -from config import settings - -# Enable debug logging -logging.basicConfig(level=logging.INFO) - - -class VisionAssistant(Agent): - """Vision-enabled AI assistant that processes video frames with each user turn.""" - - def __init__(self) -> None: - super().__init__( - instructions="""You are a helpful AI assistant for visually impaired users. - You have access to their camera feed and describe what you see in the environment. - Your responses are conversational, concise, and focused on what's most relevant or interesting. 
- When describing scenes, prioritize: people, objects, text, spatial layout, and potential hazards. - Be natural and friendly, avoiding robotic or overly technical language. - If the user asks about something specific, focus on that in your description. - Do not be afraid to say that you don't know - either because you can't see any images in your context. - """ - ) - self._latest_frame: rtc.VideoFrame | None = None - self._video_stream: rtc.VideoStream | None = None - self._tasks: list[asyncio.Task] = [] - logger.info("VisionAssistant initialized") - - async def on_enter(self) -> None: - """Called when the agent enters a room. Sets up video stream monitoring.""" - ctx = get_job_context() - room = ctx.room - - logger.info(f"Agent entered room: {room.name}") - - # Find the first video track from remote participant (if any) - if room.remote_participants: - remote_participant = list(room.remote_participants.values())[0] - video_tracks = [ - publication.track - for publication in list(remote_participant.track_publications.values()) - if publication.track and publication.track.kind == rtc.TrackKind.KIND_VIDEO - ] - if video_tracks: - self._create_video_stream(video_tracks[0]) - logger.info( - f"Subscribed to existing video track from {remote_participant.identity}" - ) - - # Watch for new video tracks not yet published - @room.on("track_subscribed") - def on_track_subscribed( - track: rtc.Track, - publication: rtc.RemoteTrackPublication, - participant: rtc.RemoteParticipant, - ) -> None: - """Handle new track subscription.""" - if track.kind == rtc.TrackKind.KIND_VIDEO: - logger.info(f"New video track subscribed from {participant.identity}") - self._create_video_stream(track) - - async def on_user_turn_completed( - self, chat_ctx: llm.ChatContext, new_message: llm.ChatMessage - ) -> None: - """Add the latest video frame to the user's message for vision context.""" - if self._latest_frame: - logger.info("Attaching latest video frame to user message") - 
new_message.content.append(llm.ImageContent(image=self._latest_frame)) - # Don't clear the frame - keep it for next turn if user speaks again quickly - else: - logger.warning("No video frame available - video is not streaming") - # Add a system note for the LLM to inform the user about missing video - new_message.content.append( - "[SYSTEM: No video frame available. The user's camera feed is not currently streaming. Please inform them that you cannot see their camera at the moment.]" - ) - - def _create_video_stream(self, track: rtc.Track) -> None: - """Create a video stream to buffer the latest frame from user's camera.""" - # Close any existing stream (we only want one at a time) - if self._video_stream is not None: - logger.info("Closing existing video stream") - # Cancel existing stream - for task in self._tasks: - if not task.done(): - task.cancel() - self._tasks.clear() - - # Create a new stream to receive frames - self._video_stream = rtc.VideoStream(track) - logger.info("Created new video stream") - - async def read_stream() -> None: - """Continuously read and buffer the latest video frame.""" - if not self._video_stream: - logger.error("No video stream available") - return - frame_count = 0 - async for event in self._video_stream: - # Store the latest frame for use in next user turn - self._latest_frame = event.frame - frame_count += 1 - if frame_count % 100 == 0: - logger.debug(f"Buffered video frame '{track.name}#{frame_count}'") - - # Store the async task - task = asyncio.create_task(read_stream()) - task.add_done_callback(lambda t: self._tasks.remove(t) if t in self._tasks else None) - self._tasks.append(task) - logger.info("Started video frame buffering task") - - -# Create agent server -server = AgentServer() - - -async def should_accept_job(job_request: JobRequest) -> None: - """Filter function to accept only jobs matching this agent's name. 
- - The agent name is configured via settings.livekit_agent_name - and should match the agent_id stored in the room metadata by the API. - """ - agent_name = settings.livekit_agent_name - room_metadata = job_request.room.metadata - - # If no agent name is configured in the room metadata, accept all jobs (backward compatibility) - if not room_metadata: - logger.warning( - f"Room {job_request.room.name} has no metadata - accepting job for backward compatibility" - ) - await job_request.accept() - return - - # Accept job if room metadata matches our agent name - should_accept = room_metadata == agent_name - if should_accept: - logger.info(f"Accepting job for room {job_request.room.name} (agent: {agent_name})") - await job_request.accept() - return - - logger.info( - f"Skipping job for room {job_request.room.name} (expected: {agent_name}, got: {room_metadata})" - ) - return - - -async def entrypoint(ctx: JobContext) -> None: - """Entry point for the vision assistant agent. - - Uses streaming STT-LLM-TTS pipeline with vision capabilities. 
- """ - logger.info(f"Starting vision agent for room: {ctx.room.name}") - - await ctx.connect() - - tts_instance = deepgram.TTS( - api_key=settings.deepgram_api_key, - model="aura-asteria-en", - encoding="linear16", - sample_rate=24000, - ) - - # tts_instance = elevenlabs.TTS( - # api_key=settings.elevenlabs_api_key, - # voice_id=settings.elevenlabs_voice_id, - # model="eleven_turbo_v2_5", - # ) - - # tts_instance = elevenlabs.TTS( - # api_key=settings.elevenlabs_api_key, - # voice_id=settings.elevenlabs_voice_id, - # model="eleven_turbo_v2_5", - # ) - - # Configure the agent session with STT-LLM-TTS pipeline - session = AgentSession( - # Speech-to-Text: Use Deepgram for fast, accurate transcription - stt=deepgram.STT( - model="nova-3", - api_key=settings.deepgram_api_key, - ), - llm=openai.LLM( - model="google/gemini-2.5-flash", - base_url=settings.openrouter_base_url, - api_key=settings.openrouter_api_key, - max_completion_tokens=500, # Ensure longer responses aren't truncated - ), - # Text-to-Speech: Use Deepgram TTS - tts=tts_instance, - # Voice Activity Detection - vad=silero.VAD.load(), - # Interruption settings - ensure user doesn't accidentally interrupt during pauses - min_interruption_duration=1.0, # Require 1 second of speech to interrupt (default 0.5) - allow_interruptions=True, - use_tts_aligned_transcript=True, - ) - - # Start the agent session - agent = VisionAssistant() - await session.start( - room=ctx.room, - agent=agent, - ) - - # Add event listeners for debugging - @session.on("user_input_transcribed") - def _on_user_input(text: str) -> None: - logger.info(f"User said: {text}") - - @session.on("speech_created") - def _on_speech_created(event: SpeechCreatedEvent) -> None: - handle = event.speech_handle - logger.info(f"Speech from {event.source} with handle #{handle.id}") - - @session.on("conversation_item_added") - def _on_conversation_item(event: ConversationItemAddedEvent) -> None: - # event.item is a ChatMessage object - item = event.item - if 
not isinstance(item, llm.ChatMessage): - logger.debug(f"Unknown conversation item added: {item}") - return - content = item.content[0] if item.content else "" - logger.info( - f"Conversation item added: role={item.role}, content: '{content}', interrupted={item.interrupted}" - ) - - # Add session TTS event listeners - if session.tts: - logger.info("Setting up session TTS event listeners") - - @session.tts.on("error") - def _on_session_tts_error(error: Exception) -> None: - logger.warning(f"Session TTS error: {error}") - - @session.tts.on("metrics_collected") - def _on_session_tts_metrics(metrics: TTSMetrics) -> None: - logger.info(f"Session TTS metrics: {metrics}") - - # Generate initial greeting - await session.generate_reply(instructions="Say 'blind-sighted connected'.") - - logger.info("Vision agent session started successfully") - - -if __name__ == "__main__": - logger.info("Starting vision agent worker") - # Run the agent worker - cli.run_app( - WorkerOptions( - entrypoint_fnc=entrypoint, - request_fnc=should_accept_job, - ws_url=settings.livekit_url, - api_key=settings.livekit_api_key, - api_secret=settings.livekit_api_secret, - ) - ) diff --git a/agents/pyproject.toml b/agents/pyproject.toml index e240700..b2693f2 100644 --- a/agents/pyproject.toml +++ b/agents/pyproject.toml @@ -3,17 +3,21 @@ requires = ["setuptools>=61.0"] build-backend = "setuptools.build_meta" [project] -name = "blindsighted-agents" +name = "julie-agents" version = "0.1.0" -description = "LiveKit agents for Blindsighted vision assistance" +description = "Julie - Gemini-powered shelf assistant for visually impaired supermarket shopping" requires-python = ">=3.11" dependencies = [ - "livekit==1.0.23", - "livekit-agents[openai,silero,deepgram,turn-detector,elevenlabs]~=1.0", - "loguru==0.7.3", - "pydantic-settings==2.7.1", + "google-generativeai>=0.8.0", + "httpx>=0.27.0", + "pydantic-settings>=2.7.0", + "python-dotenv>=1.0.0", + "watchdog>=4.0.0", ] +[tool.setuptools] +py-modules = 
["config", "shelf_assistant"] + [tool.ruff] line-length = 100 target-version = "py311" diff --git a/agents/shelf_assistant.py b/agents/shelf_assistant.py new file mode 100644 index 0000000..c565851 --- /dev/null +++ b/agents/shelf_assistant.py @@ -0,0 +1,594 @@ +""" +Julie Shelf Assistant - Gemini-powered vision assistant for visually impaired supermarket shopping. + +Processes photos from JuliePhotos directory and routes them based on filename flags: +- _low: Navigation mode - guide user positioning OR guide user's hand to selected item +- _high: Identification mode - list all items as CSV and upload to API + +Audio responses are converted to speech via ElevenLabs and saved to JulieAudio folder. + +Flow: +1. LOW photos guide camera positioning until view is good +2. HIGH photo triggers item identification, generates CSV, uploads to API +3. ElevenLabs voice call uses CSV to help user select an item +4. ElevenLabs posts user choice to API +5. New LOW photos guide user's hand to the selected item +""" + +import asyncio +import os +import time +from datetime import datetime +from pathlib import Path + +import google.generativeai as genai +import httpx +from dotenv import load_dotenv +from watchdog.events import FileSystemEvent, FileSystemEventHandler +from watchdog.observers import Observer + +# Load env - look for .env in current or agents/ directory +if os.path.exists(".env"): + load_dotenv(".env") +elif os.path.exists("agents/.env"): + load_dotenv("agents/.env") +else: + load_dotenv() + +# API base URL +API_BASE_URL = os.getenv("API_BASE_URL", "https://localhost:8000") + +# ElevenLabs settings +ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY", "") +ELEVENLABS_VOICE_ID = os.getenv("ELEVENLABS_VOICE_ID", "21m00Tcm4TlvDq8ikWAM") # Rachel voice + + +class ElevenLabsTTS: + """ElevenLabs text-to-speech client.""" + + def __init__(self, api_key: str = ELEVENLABS_API_KEY, voice_id: str = ELEVENLABS_VOICE_ID): + self.api_key = api_key + self.voice_id = voice_id + 
self.base_url = "https://api.elevenlabs.io/v1" + + async def text_to_speech(self, text: str) -> bytes | None: + """Convert text to speech and return audio bytes.""" + if not self.api_key: + print("[TTS] Warning: ELEVENLABS_API_KEY not set, skipping TTS") + return None + + if not text or not text.strip(): + return None + + url = f"{self.base_url}/text-to-speech/{self.voice_id}" + + headers = { + "xi-api-key": self.api_key, + "Content-Type": "application/json", + } + + payload = { + "text": text, + "model_id": "eleven_turbo_v2_5", # Fast model for real-time use + "voice_settings": { + "stability": 0.5, + "similarity_boost": 0.75, + }, + } + + try: + async with httpx.AsyncClient(timeout=30.0) as client: + response = await client.post(url, json=payload, headers=headers) + response.raise_for_status() + print(f"[TTS] Generated {len(response.content)} bytes of audio") + return response.content + except Exception as e: + print(f"[TTS] Error generating speech: {e}") + return None + + +class AudioManager: + """Manages audio output files for the iOS app to play.""" + + def __init__(self): + self.directory_path = os.path.expanduser("~/Documents/JulieAudio") + print(f"[AudioManager] Output Path: {self.directory_path}") + + # Create directory if it doesn't exist + Path(self.directory_path).mkdir(parents=True, exist_ok=True) + + def save_audio(self, audio_bytes: bytes, prefix: str = "response") -> str | None: + """Save audio bytes to a file and return the path.""" + if not audio_bytes: + return None + + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f") + filename = f"{prefix}_{timestamp}.mp3" + filepath = os.path.join(self.directory_path, filename) + + try: + with open(filepath, "wb") as f: + f.write(audio_bytes) + print(f"[AudioManager] Saved: {filename}") + return filepath + except Exception as e: + print(f"[AudioManager] Error saving audio: {e}") + return None + + def cleanup_old_files(self, max_age_seconds: int = 300) -> None: + """Remove audio files older than 
max_age_seconds.""" + try: + now = time.time() + for filename in os.listdir(self.directory_path): + filepath = os.path.join(self.directory_path, filename) + if os.path.isfile(filepath): + file_age = now - os.path.getmtime(filepath) + if file_age > max_age_seconds: + os.remove(filepath) + print(f"[AudioManager] Cleaned up: {filename}") + except Exception as e: + print(f"[AudioManager] Cleanup error: {e}") + + +class LocalPhotoManager: + """ + Manages photo files from ~/Documents/JuliePhotos/. + Mirrors the Swift PhotoFileManager functionality. + """ + + def __init__(self): + self.directory_path = os.path.expanduser("~/Documents/JuliePhotos") + print(f"[LocalPhotoManager] Storage Path: {self.directory_path}") + + # Create directory if it doesn't exist + Path(self.directory_path).mkdir(parents=True, exist_ok=True) + + def list_photos(self, flag: str | None = None) -> list[str]: + """Returns a list of filenames in the directory.""" + if not os.path.exists(self.directory_path): + return [] + + photos = [ + f + for f in os.listdir(self.directory_path) + if f.lower().endswith((".jpg", ".jpeg", ".png")) + ] + + if flag: + photos = [p for p in photos if self.get_flag(p) == flag] + + return photos + + def get_flag(self, filename: str) -> str | None: + """Parse flag from filename (e.g., photo_..._low.jpg -> 'low').""" + filename_lower = filename.lower() + if "_low." in filename_lower: + return "low" + if "_high." 
class APIClient:
    """Client for communicating with the FastAPI backend.

    TLS certificate verification used to be switched off for every request.
    It is now decided once per client: see ``verify`` on :meth:`__init__`.
    """

    # Hosts for which an unverifiable (e.g. self-signed) certificate is tolerated
    # by default, matching the default backend URL https://localhost:8000.
    _LOOPBACK_HOSTS = frozenset({"localhost", "127.0.0.1", "::1"})

    def __init__(self, base_url: str = API_BASE_URL, verify: bool | str | None = None):
        """Create a client for the backend at ``base_url``.

        Args:
            base_url: Root URL of the API. A trailing ``/`` is stripped so the
                request paths below never contain ``//``.
            verify: Passed to ``httpx.AsyncClient(verify=...)``. ``True`` or
                ``False`` forces certificate verification on or off; a string is
                treated by httpx as a CA bundle path. When left as ``None``
                verification is disabled only if the host is a loopback
                address and enabled for every other host.
        """
        from urllib.parse import urlsplit  # local import: only needed here

        self.base_url = base_url.rstrip("/")
        if verify is None:
            host = urlsplit(self.base_url).hostname or ""
            verify = host not in self._LOOPBACK_HOSTS
        self.verify = verify

    def _client(self) -> httpx.AsyncClient:
        """Build a short-lived HTTP client carrying this instance's TLS policy."""
        return httpx.AsyncClient(verify=self.verify)

    async def upload_csv(self, csv_content: str, filename: str | None = None) -> dict:
        """Upload CSV content to ``POST /csv/upload`` as a multipart file.

        Args:
            csv_content: CSV text; it is encoded to bytes before sending.
            filename: Name reported for the uploaded file. Defaults to
                ``shelf_items_<timestamp>.csv``.

        Returns:
            The decoded JSON body of the response.

        Raises:
            httpx.HTTPStatusError: If the server answers with a 4xx/5xx status.
        """
        if filename is None:
            filename = f"shelf_items_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"

        async with self._client() as client:
            files = {"file": (filename, csv_content.encode(), "text/csv")}
            response = await client.post(f"{self.base_url}/csv/upload", files=files)
            response.raise_for_status()
            return response.json()

    async def get_latest_user_choice(self) -> dict | None:
        """Fetch the latest unprocessed user choice from ``GET /user-choice/latest``.

        Returns:
            The decoded JSON body when the server answers 200, otherwise
            ``None``. Server-side failures (5xx) still return ``None`` but are
            printed so they are not mistaken for "no choice yet".
        """
        async with self._client() as client:
            response = await client.get(
                f"{self.base_url}/user-choice/latest",
                params={"unprocessed_only": True},
            )
            if response.status_code == 200:
                return response.json()
            if response.status_code >= 500:
                print(f"[APIClient] /user-choice/latest failed with HTTP {response.status_code}")
            return None

    async def mark_choice_processed(self, choice_id: str) -> None:
        """Mark a user choice as processed via ``PATCH /user-choice/<id>/processed``.

        Args:
            choice_id: Identifier of the choice; it is percent-encoded before
                being placed in the URL path.

        Raises:
            httpx.HTTPStatusError: If the server answers with a 4xx/5xx status.
        """
        from urllib.parse import quote  # local import: only needed here

        async with self._client() as client:
            response = await client.patch(
                f"{self.base_url}/user-choice/{quote(str(choice_id), safe='')}/processed"
            )
            response.raise_for_status()
api_key = os.getenv("GOOGLE_API_KEY") + if not api_key: + raise ValueError("GOOGLE_API_KEY environment variable required") + + genai.configure(api_key=api_key) + self.model = genai.GenerativeModel( + model_name="gemini-2.0-flash-exp", + system_instruction=system_prompt, + ) + + def generate(self, user_content: str, image_bytes: bytes | None = None, mime_type: str = "image/jpeg") -> str: + """Generate a response from the agent (synchronous for simplicity).""" + print(f"[{self.role_name}] Processing request...") + + try: + content_parts = [user_content] + + if image_bytes: + content_parts.append({ + "mime_type": mime_type, + "data": image_bytes, + }) + + response = self.model.generate_content(content_parts) + return response.text + + except Exception as e: + return f"Error: {e}" + + +# Agent system prompts +NAVIGATION_PROMPT = """You are a navigation assistant helping a visually impaired user position their camera to see a store shelf. + +Your job is to analyze the current camera view and guide the user to adjust their position so ALL products on the shelf are visible in the frame. + +Instructions: +- Describe what's currently visible in the frame +- Identify if any products appear cut off at the edges (left, right, top, bottom) +- Give clear, concise directions to adjust camera position: + - "Move left/right" for horizontal adjustments + - "Move up/down" or "tilt up/down" for vertical adjustments + - "Step back" if too close, "step forward" if too far +- Use clock positions (12 o'clock = up, 3 o'clock = right, etc.) when helpful +- Keep responses SHORT and actionable - the user needs quick guidance +- If the view looks good and shows the full shelf, say "View looks good - ready for capture" + +Example response: "I can see canned goods on the left, but products are cut off on the right. Move your camera slightly to the right to include more of the shelf." +""" + +IDENTIFICATION_PROMPT = """You are an item identification assistant for visually impaired users. 
+ +You are looking at a store shelf. Your job is to identify ALL products visible and output them as CSV data. + +OUTPUT FORMAT - You MUST output ONLY a valid CSV with these exact columns: +item_number,product_name,brand,location,price + +Rules: +- First line must be the header: item_number,product_name,brand,location,price +- List EVERY distinct product you can see +- item_number: sequential number starting from 1 +- product_name: the product type/name (e.g., "Cola 330ml", "Sparkling Water 500ml") +- brand: the brand name if visible, otherwise "Unknown" +- location: describe using "top/middle/bottom shelf" and "left/center/right" (e.g., "top shelf, left") +- price: the price if visible, otherwise "N/A" +- Do NOT include any text before or after the CSV +- Do NOT use markdown code blocks + +Example output: +item_number,product_name,brand,location,price +1,Cola 330ml,Coca-Cola,top shelf left,$1.99 +2,Sparkling Water 500ml,Perrier,top shelf center,$2.49 +3,Orange Juice 1L,Tropicana,middle shelf left,$3.99 +""" + +GUIDANCE_PROMPT = """You are a hand-guidance assistant helping a visually impaired user reach for a specific item on a shelf. + +The user has selected an item they want. You can see their current camera view (likely showing their hand near the shelf). + +Your job is to guide their hand to the exact location of the item. + +Instructions: +- The user wants to find: {item_name} +- Its known location is: {item_location} +- Guide their hand using clock positions (12 o'clock = up, 3 o'clock = right, etc.) +- Give distance estimates in centimeters or inches +- If you can see their hand, guide relative to it +- Keep responses SHORT and actionable +- When they're very close, say "You're almost there" or "Got it!" + +Example responses: +- "Move your hand to 2 o'clock, about 20 centimeters" +- "Up and slightly right, about 10 centimeters" +- "You're very close - just a bit more to the left" +- "Your hand is right on it!" 
+""" + + +class ShelfAssistant: + """Main assistant that routes images and manages the shopping flow.""" + + def __init__(self, enable_tts: bool = True): + self.photo_manager = LocalPhotoManager() + self.api_client = APIClient() + self.audio_manager = AudioManager() + self.tts = ElevenLabsTTS() if enable_tts else None + self.enable_tts = enable_tts + + # Initialize agents + self.navigation_agent = GeminiAgent("NAVIGATION", NAVIGATION_PROMPT) + self.identification_agent = GeminiAgent("IDENTIFICATION", IDENTIFICATION_PROMPT) + + # Track state + self.processed_files: set[str] = set() + self.current_user_choice: dict | None = None + self.guidance_agent: GeminiAgent | None = None + + async def speak(self, text: str, prefix: str = "response") -> str | None: + """Convert text to speech and save to audio folder.""" + if not self.enable_tts or not self.tts: + return None + + # Don't speak CSV data + if text.startswith("item_number,") or "," in text.split("\n")[0]: + print("[Assistant] Skipping TTS for CSV data") + return None + + audio_bytes = await self.tts.text_to_speech(text) + if audio_bytes: + return self.audio_manager.save_audio(audio_bytes, prefix) + return None + + async def check_for_user_choice(self) -> None: + """Check if there's a pending user choice for guidance mode.""" + try: + choice = await self.api_client.get_latest_user_choice() + if choice and choice != self.current_user_choice: + self.current_user_choice = choice + print(f"\n[Assistant] User selected: {choice['item_name']}") + if choice.get("item_location"): + print(f"[Assistant] Location: {choice['item_location']}") + + # Create guidance agent with item-specific prompt + guidance_prompt = GUIDANCE_PROMPT.format( + item_name=choice["item_name"], + item_location=choice.get("item_location", "unknown"), + ) + self.guidance_agent = GeminiAgent("GUIDANCE", guidance_prompt) + + # Announce the selection + announcement = f"Got it! Looking for {choice['item_name']}. 
async def _process_high_image(self, image_bytes: bytes, mime_type: str) -> str:
    """Process a HIGH image: identify items, upload the CSV, announce the count.

    Args:
        image_bytes: Raw image data handed to the identification agent.
        mime_type: MIME type of ``image_bytes``.

    Returns:
        The identification agent's raw response text, unchanged, whether or
        not the upload succeeded.
    """
    response = self.identification_agent.generate(
        "Identify all products on this shelf and output as CSV.",
        image_bytes,
        mime_type,
    )

    print(f"\n[IDENTIFICATION RESPONSE]\n{response}\n")

    try:
        # Drop blank lines and markdown fence lines (```/```csv) wherever
        # they appear, so trailing whitespace after a fence cannot leak
        # into the CSV or the item count.
        rows = [
            line.strip()
            for line in response.splitlines()
            if line.strip() and not line.strip().startswith("```")
        ]

        # The agent reports failures as plain "Error: ..." text, which must
        # not be uploaded as if it were a product list. Require the header
        # the identification prompt mandates and ignore any preamble.
        header_index = next(
            (i for i, line in enumerate(rows) if line.lower().startswith("item_number,")),
            None,
        )
        if header_index is None:
            raise ValueError("response does not contain the expected CSV header")

        rows = rows[header_index:]
        csv_content = "\n".join(rows)

        result = await self.api_client.upload_csv(csv_content)
        print(f"[Assistant] CSV uploaded successfully: {result}")

        # Count data rows only (everything after the header).
        item_count = len(rows) - 1
        noun = "product" if item_count == 1 else "products"
        announcement = f"I found {item_count} {noun} on the shelf. The list has been sent to your phone."
        await self.speak(announcement, "identification")

    except Exception as e:
        print(f"[Assistant] Failed to upload CSV: {e}")
        await self.speak("Sorry, I had trouble identifying the products. Please try again.", "error")

    return response
class PhotoEventHandler(FileSystemEventHandler):
    """Forwards newly created photo files to the assistant for processing.

    ``on_created`` hands work to the asyncio loop via
    ``asyncio.run_coroutine_threadsafe``, i.e. it is expected to run on a
    thread other than the loop's own.
    """

    def __init__(self, assistant: ShelfAssistant, loop: asyncio.AbstractEventLoop):
        """Store the assistant and the event loop its coroutines run on."""
        self.assistant = assistant
        self.loop = loop

    def on_created(self, event: FileSystemEvent) -> None:
        """Schedule processing for a newly created .jpg/.jpeg/.png file.

        Directories and files with other extensions are ignored.
        """
        if event.is_directory:
            return

        filepath = event.src_path
        filename = os.path.basename(filepath)

        if not filename.lower().endswith((".jpg", ".jpeg", ".png")):
            return

        print(f"[Watcher] New photo detected: {filename}")

        # Small delay to ensure file is fully written
        time.sleep(0.5)

        future = asyncio.run_coroutine_threadsafe(
            self.assistant.process_image(filename), self.loop
        )
        # Without a callback, an exception raised inside process_image is
        # stored on the future and never surfaced anywhere.
        future.add_done_callback(
            lambda done, name=filename: self._report_failure(name, done)
        )

    @staticmethod
    def _report_failure(filename, future) -> None:
        """Print the exception, if any, that ended processing of ``filename``."""
        if future.cancelled():
            # exception() would raise CancelledError on a cancelled future.
            return
        error = future.exception()
        if error is not None:
            print(f"[Watcher] Failed to process {filename}: {error!r}")
observer.stop() + observer.join() + + +async def process_existing_photos(assistant: ShelfAssistant) -> None: + """Process any existing unprocessed photos.""" + print("\n[Startup] Checking for existing photos...") + + for flag in ["low", "high"]: + photos = assistant.photo_manager.list_photos(flag=flag) + if photos: + print(f"[Startup] Found {len(photos)} existing {flag} photos") + for photo in sorted(photos): + await assistant.process_image(photo) + + +async def main() -> None: + """Main entry point for the shelf assistant.""" + api_key = os.getenv("GOOGLE_API_KEY") + if not api_key: + print("Error: GOOGLE_API_KEY not found in environment.") + print("Please set GOOGLE_API_KEY in your .env file.") + return + + # Check for ElevenLabs API key + enable_tts = bool(ELEVENLABS_API_KEY) + if not enable_tts: + print("Warning: ELEVENLABS_API_KEY not set. TTS disabled.") + + assistant = ShelfAssistant(enable_tts=enable_tts) + + print("\n" + "=" * 60) + print("Julie Shelf Assistant") + print("=" * 60) + print(f"Photos directory: {assistant.photo_manager.directory_path}") + print(f"Audio output: {assistant.audio_manager.directory_path}") + print(f"API endpoint: {API_BASE_URL}") + print(f"TTS enabled: {enable_tts}") + print("\nModes:") + print(" - LOW flag: Camera positioning / Hand guidance") + print(" - HIGH flag: Item identification (CSV output)") + print("=" * 60) + + # Check for existing user choice + await assistant.check_for_user_choice() + + # Process any existing photos first + await process_existing_photos(assistant) + + # Start watching for new photos + await watch_for_photos(assistant) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/api/CSV_SETUP.md b/api/CSV_SETUP.md new file mode 100644 index 0000000..3ea4a89 --- /dev/null +++ b/api/CSV_SETUP.md @@ -0,0 +1,111 @@ +# CSV File Storage Setup + +This backend now includes a CSV file storage system with HTTPS support. 
+ +## Features + +- **Database storage**: CSV files are stored in a PostgreSQL database +- **HTTPS endpoint**: `/csv/get-summary` endpoint accessible via HTTPS +- **File upload**: Upload CSV files via `/csv/upload` endpoint + +## Setup Instructions + +### 1. Run Database Migration + +Apply the database migration to create the `csv_files` table: + +```bash +cd api +alembic upgrade head +``` + +### 2. Generate SSL Certificates for HTTPS + +**On Windows (PowerShell):** +```powershell +cd api +.\generate_ssl_cert.ps1 +``` + +**On Linux/Mac:** +```bash +cd api +chmod +x generate_ssl_cert.sh +./generate_ssl_cert.sh +``` + +**Manual (if OpenSSL is installed):** +```bash +openssl req -x509 -newkey rsa:4096 \ + -keyout localhost-key.pem \ + -out localhost.pem \ + -days 365 \ + -nodes \ + -subj "/C=US/ST=State/L=City/O=Organization/CN=localhost" +``` + +This will create: +- `localhost-key.pem` (private key) +- `localhost.pem` (certificate) + +### 3. Start the Server + +The server will automatically use HTTPS if the certificates are found: + +```bash +cd api +python main.py +``` + +Or set environment variables (PowerShell): +```powershell +$env:JULIE_API_PORT=8000 +$env:SSL_KEYFILE="localhost-key.pem" +$env:SSL_CERTFILE="localhost.pem" +python main.py +``` + +The server will run on `https://localhost:8000` (or `http://localhost:8000` if certificates are not found). + +## API Endpoints + +### GET `/csv/get-summary` + +Returns the latest CSV file from the database. + +**Response:** +```json +{ + "id": "uuid", + "filename": "example.csv", + "content": "col1,col2\nval1,val2", + "file_size_bytes": 20, + "created_at": "2026-01-20T12:00:00Z", + "updated_at": "2026-01-20T12:00:00Z" +} +``` + +**Example (HTTPS):** +```bash +curl -k https://localhost:8000/csv/get-summary +``` + +### POST `/csv/upload` + +Upload a CSV file to the database.
+ +**Request:** +- Content-Type: `multipart/form-data` +- Body: CSV file + +**Example:** +```bash +curl -k -X POST https://localhost:8000/csv/upload \ + -F "file=@example.csv" +``` + +## Notes + +- The `-k` flag in curl is needed for self-signed certificates +- Browsers will show a security warning for self-signed certificates - this is normal for localhost development +- For production, use proper SSL certificates from a Certificate Authority diff --git a/api/ENDPOINT_URLS.md b/api/ENDPOINT_URLS.md new file mode 100644 index 0000000..97cbc84 --- /dev/null +++ b/api/ENDPOINT_URLS.md @@ -0,0 +1,81 @@ +# CSV Endpoint URLs + +## HTTPS Endpoint (Recommended) + +The CSV endpoints are accessible via HTTPS on port 8000: + +### Get Latest CSV File +``` +https://localhost:8000/csv/get-summary +``` + +**Full URL:** +``` +https://localhost:8000/csv/get-summary +``` + +**Example using curl:** +```bash +curl -k https://localhost:8000/csv/get-summary +``` + +**Example using PowerShell:** +```powershell +Invoke-WebRequest -Uri "https://localhost:8000/csv/get-summary" -SkipCertificateCheck +``` + +### Upload CSV File +``` +https://localhost:8000/csv/upload +``` + +**Full URL:** +``` +https://localhost:8000/csv/upload +``` + +**Example using curl:** +```bash +curl -k -X POST https://localhost:8000/csv/upload -F "file=@example.csv" +``` + +**Example using PowerShell:** +```powershell +$file = Get-Item "example.csv" +Invoke-WebRequest -Uri "https://localhost:8000/csv/upload" -Method Post -Form @{file=$file} -SkipCertificateCheck +``` + +## HTTP Endpoint (Fallback) + +If SSL certificates are not found, the server will run on HTTP: + +``` +http://localhost:8000/csv/get-summary +http://localhost:8000/csv/upload +``` + +## Interactive API Documentation + +Once the server is running, you can access the interactive API documentation: + +- **Swagger UI (HTTPS):** https://localhost:8000/docs +- **ReDoc (HTTPS):** https://localhost:8000/redoc + +## Important Notes + +1. 
**Self-Signed Certificate Warning:** Browsers and some HTTP clients will show a security warning for self-signed certificates. This is normal for localhost development. + +2. **Skip Certificate Check:** When using `curl` or `Invoke-WebRequest`, you may need to: + - Use `-k` flag with curl (skips certificate verification) + - Use `-SkipCertificateCheck` with PowerShell's `Invoke-WebRequest` + +3. **Port Configuration:** The default port is 8000. You can change it by setting the environment variable: + ```powershell + $env:JULIE_API_PORT=8001 + ``` + +4. **Server Status:** Check if the server is running: + ``` + https://localhost:8000/ + ``` + Should return: `{"message":"Welcome to Julie API","status":"healthy"}` diff --git a/api/TROUBLESHOOTING.md b/api/TROUBLESHOOTING.md new file mode 100644 index 0000000..05d09c3 --- /dev/null +++ b/api/TROUBLESHOOTING.md @@ -0,0 +1,152 @@ +# Troubleshooting Guide + +## Common Errors and Solutions + +### 1. Migration Error: "Target database is not up to date" + +**Error:** +``` +alembic.util.exc.CommandError: Target database is not up to date +``` + +**Solution:** +```bash +cd api +alembic upgrade head +``` + +### 2. Import Error: "No module named 'models'" + +**Error:** +``` +ModuleNotFoundError: No module named 'models' +``` + +**Solution:** +Make sure you're running commands from the `api` directory: +```bash +cd api +python main.py +``` + +### 3. Database Connection Error + +**Error:** +``` +sqlalchemy.exc.OperationalError: could not connect to server +``` + +**Solution:** +- Check your database is running +- Verify `DATABASE_URL` in `.env` file is correct +- Default: `postgresql+psycopg://localhost/julie` + +### 4. SSL Certificate Error + +**Error:** +``` +Warning: SSL certificates not found. Running HTTP only. +``` + +**Solution:** +Generate SSL certificates: +```powershell +# Windows +cd api +.\generate_ssl_cert.ps1 + +# Linux/Mac +cd api +./generate_ssl_cert.sh +``` + +### 5.
Migration Revision Error + +**Error:** +``` +alembic.util.exc.CommandError: Multiple heads detected +``` + +**Solution:** +Check current migration status: +```bash +cd api +alembic heads +alembic current +``` + +If there are multiple heads, merge them: +```bash +alembic merge heads -m "merge_csv_migration" +alembic upgrade head +``` + +### 6. Port Already in Use + +**Error:** +``` +OSError: [WinError 10048] Only one usage of each socket address +``` + +**Solution:** +- Change the port: `$env:JULIE_API_PORT=8001` +- Or stop the process using port 8000 + +### 7. CSV Router Not Found + +**Error:** +``` +404 Not Found: /csv/get-summary +``` + +**Solution:** +- Verify the router is included in `main.py` +- Check the server is running: `python main.py` +- Verify the endpoint: `https://localhost:8000/docs` (Swagger UI) + +## Step-by-Step Setup Verification + +1. **Check database connection:** + ```bash + cd api + python -c "from database import engine; import asyncio; asyncio.run(engine.connect())" + ``` + +2. **Check models are importable:** + ```bash + cd api + python -c "from models import CSVFile; print('OK')" + ``` + +3. **Check migration status:** + ```bash + cd api + alembic current + alembic heads + ``` + +4. **Run migration:** + ```bash + cd api + alembic upgrade head + ``` + +5. **Start server:** + ```bash + cd api + python main.py + ``` + +6. **Test endpoint:** + ```bash + curl -k https://localhost:8000/csv/get-summary + ``` + +## Getting Help + +If you're still experiencing issues: + +1. Check the full error message and stack trace +2. Verify all dependencies are installed: `pip install -r requirements.txt` +3. Check Python version: `python --version` (should be 3.11+) +4.
Verify database is accessible and migrations are up to date diff --git a/api/alembic/env.py b/api/alembic/env.py index 076ccc1..70fc987 100644 --- a/api/alembic/env.py +++ b/api/alembic/env.py @@ -12,6 +12,8 @@ # Import config and models from config import settings from database import Base +# Import all models so Alembic can detect them +import models # noqa: F401 # this is the Alembic Config object, which provides # access to the values within the .ini file in use. diff --git a/api/alembic/versions/a1b2c3d4e5f6_add_csv_files_table.py b/api/alembic/versions/a1b2c3d4e5f6_add_csv_files_table.py new file mode 100644 index 0000000..84a75e2 --- /dev/null +++ b/api/alembic/versions/a1b2c3d4e5f6_add_csv_files_table.py @@ -0,0 +1,37 @@ +"""Add CSVFile model for storing CSV files + +Revision ID: a1b2c3d4e5f6 +Revises: 6aff89fb1214 +Create Date: 2026-01-20 12:00:00.000000 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + + +# revision identifiers, used by Alembic. 
+revision: str = 'a1b2c3d4e5f6' +down_revision: Union[str, None] = '6aff89fb1214' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # Create csv_files table + op.create_table( + 'csv_files', + sa.Column('id', postgresql.UUID(as_uuid=True), nullable=False), + sa.Column('filename', sa.String(length=255), nullable=False), + sa.Column('file_content', sa.Text(), nullable=False), + sa.Column('file_size_bytes', sa.Integer(), nullable=False), + sa.Column('created_at', sa.DateTime(timezone=True), nullable=False), + sa.Column('updated_at', sa.DateTime(timezone=True), nullable=False), + sa.PrimaryKeyConstraint('id') + ) + + +def downgrade() -> None: + op.drop_table('csv_files') diff --git a/api/alembic/versions/b2c3d4e5f6g7_add_user_choices_table.py b/api/alembic/versions/b2c3d4e5f6g7_add_user_choices_table.py new file mode 100644 index 0000000..cd96d42 --- /dev/null +++ b/api/alembic/versions/b2c3d4e5f6g7_add_user_choices_table.py @@ -0,0 +1,38 @@ +"""Add UserChoice model for storing user item selections + +Revision ID: b2c3d4e5f6g7 +Revises: a1b2c3d4e5f6 +Create Date: 2026-01-17 18:00:00.000000 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + + +# revision identifiers, used by Alembic. 
class GeminiClient:
    """Client for calling Gemini via the Google AI REST API."""

    def __init__(self) -> None:
        """Read the API key from settings and fix the model and base URL."""
        self.api_key = settings.google_api_key
        self.model = "gemini-2.0-flash-exp"
        self.base_url = "https://generativelanguage.googleapis.com/v1beta"

    async def analyze_image(self, image_base64: str, prompt: str) -> str:
        """
        Send image to Gemini and return text response.

        Args:
            image_base64: Base64 encoded image string (without data URL prefix)
            prompt: System/user prompt for the analysis

        Returns:
            Text response from Gemini

        Raises:
            ValueError: If no API key is configured, or the response carries
                no text part.
            httpx.HTTPStatusError: If the API answers with an error status.
        """
        if not self.api_key:
            raise ValueError("Google API key not configured (GOOGLE_API_KEY)")

        url = f"{self.base_url}/models/{self.model}:generateContent"

        payload = {
            "contents": [
                {
                    "parts": [
                        {"text": prompt},
                        {
                            "inline_data": {
                                "mime_type": "image/jpeg",
                                "data": image_base64,
                            }
                        },
                    ]
                }
            ]
        }

        async with httpx.AsyncClient(timeout=30.0) as client:
            response = await client.post(
                url,
                # Send the key as a header, not a `?key=` query parameter:
                # the URL is embedded in HTTP error messages, and callers
                # that surface str(exc) would leak the key to their clients.
                headers={"x-goog-api-key": self.api_key},
                json=payload,
            )
            response.raise_for_status()
            data = response.json()

        return self._extract_text(data)

    @staticmethod
    def _extract_text(data: dict) -> str:
        """Return the concatenated text parts of the first candidate.

        Raises:
            ValueError: If ``data`` has no candidate with a text part.
        """
        if isinstance(data, dict):
            candidates = data.get("candidates") or []
            if candidates and isinstance(candidates[0], dict):
                content = candidates[0].get("content") or {}
                parts = content.get("parts") or []
                # A reply may be split across several parts; join them
                # rather than returning only the first.
                texts = [
                    part["text"]
                    for part in parts
                    if isinstance(part, dict) and "text" in part
                ]
                if texts:
                    return "".join(texts)

        raise ValueError("No valid response from Gemini")
gemini_model: str = "google/gemini-2.0-flash-exp:free" + # Google AI API (for direct Gemini calls) + google_api_key: str = "" + # ElevenLabs API elevenlabs_api_key: str = "" elevenlabs_voice_id: str = "21m00Tcm4TlvDq8ikWAM" # Rachel voice @@ -38,8 +39,8 @@ def normalize_database_url(cls, v: str) -> str: cloudflare_account_id: str = "78a27224f8a5e611fbb1a5999e2a77eb" r2_access_key_id: str = "" r2_secret_access_key: str = "" - r2_bucket_name: str = "blindsighted" - r2_public_url: str = "https://cdn.blindsighted.hails.info" + r2_bucket_name: str = "julie" + r2_public_url: str = "https://cdn.julie.hails.info" # CORS cors_origins: str = "http://localhost:8081,exp://" diff --git a/api/generate_ssl_cert.ps1 b/api/generate_ssl_cert.ps1 new file mode 100644 index 0000000..857067b --- /dev/null +++ b/api/generate_ssl_cert.ps1 @@ -0,0 +1,33 @@ +# Generate self-signed SSL certificate for localhost (PowerShell) +# Requires OpenSSL to be installed + +$certName = "localhost" +$keyFile = "$certName-key.pem" +$certFile = "$certName.pem" + +# Check if OpenSSL is available +$opensslPath = Get-Command openssl -ErrorAction SilentlyContinue +if (-not $opensslPath) { + Write-Host "Error: OpenSSL is not installed or not in PATH" -ForegroundColor Red + Write-Host "Install OpenSSL from: https://slproweb.com/products/Win32OpenSSL.html" -ForegroundColor Yellow + exit 1 +} + +# Generate certificate +openssl req -x509 -newkey rsa:4096 ` + -keyout $keyFile ` + -out $certFile ` + -days 365 ` + -nodes ` + -subj "/C=US/ST=State/L=City/O=Organization/CN=localhost" + +if ($LASTEXITCODE -eq 0) { + Write-Host "SSL certificates generated successfully:" -ForegroundColor Green + Write-Host " - $keyFile (private key)" -ForegroundColor Cyan + Write-Host " - $certFile (certificate)" -ForegroundColor Cyan + Write-Host "" + Write-Host "Note: You may need to accept the self-signed certificate in your browser." 
-ForegroundColor Yellow +} else { + Write-Host "Error generating certificates" -ForegroundColor Red + exit 1 +} diff --git a/api/generate_ssl_cert.sh b/api/generate_ssl_cert.sh new file mode 100644 index 0000000..b9ae2ec --- /dev/null +++ b/api/generate_ssl_cert.sh @@ -0,0 +1,15 @@ +#!/bin/bash +# Generate self-signed SSL certificate for localhost + +openssl req -x509 -newkey rsa:4096 \ + -keyout localhost-key.pem \ + -out localhost.pem \ + -days 365 \ + -nodes \ + -subj "/C=US/ST=State/L=City/O=Organization/CN=localhost" + +echo "SSL certificates generated:" +echo " - localhost-key.pem (private key)" +echo " - localhost.pem (certificate)" +echo "" +echo "Note: You may need to accept the self-signed certificate in your browser." diff --git a/api/main.py b/api/main.py index 1190542..61563c5 100644 --- a/api/main.py +++ b/api/main.py @@ -11,7 +11,7 @@ from clients.elevenlabs import ElevenLabsClient from clients.openrouter import OpenRouterClient from config import settings -from routers import lifelog, preview, sessions +from routers import csv_files, lifelog, photos, preview, user_choice @asynccontextmanager @@ -22,7 +22,7 @@ async def lifespan(app: FastAPI) -> AsyncIterator[None]: # Shutdown: cleanup if needed -app = FastAPI(title="Blindsighted API", lifespan=lifespan) +app = FastAPI(title="Julie API", lifespan=lifespan) # Configure CORS for Expo app app.add_middleware( @@ -34,22 +34,42 @@ async def lifespan(app: FastAPI) -> AsyncIterator[None]: ) # Include routers -app.include_router(sessions.router) +app.include_router(user_choice.router) app.include_router(preview.router) app.include_router(lifelog.router) +app.include_router(photos.router) +app.include_router(csv_files.router) @app.get("/") async def root() -> dict[str, str]: - return {"message": "Welcome to Blindsighted API", "status": "healthy"} + return {"message": "Welcome to Julie API", "status": "healthy"} if __name__ == "__main__": import uvicorn + # SSL certificate paths for HTTPS + ssl_keyfile = 
os.getenv("SSL_KEYFILE", "localhost-key.pem") + ssl_certfile = os.getenv("SSL_CERTFILE", "localhost.pem") + + # Check if SSL certificates exist + has_ssl = os.path.exists(ssl_keyfile) and os.path.exists(ssl_certfile) + + if has_ssl: + print(f"HTTPS enabled with certificates: {ssl_certfile}, {ssl_keyfile}") + else: + print(f"Warning: SSL certificates not found. Running HTTP only.") + print(f"Expected files: {ssl_keyfile}, {ssl_certfile}") + print("Generate certificates using: openssl req -x509 -newkey rsa:4096 -keyout localhost-key.pem -out localhost.pem -days 365 -nodes") + + port = int(os.getenv("JULIE_API_PORT", 8000)) + uvicorn.run( "main:app", host="0.0.0.0", - port=int(os.getenv("BLINDSIGHTED_API_PORT", 9999)), + port=port, reload=True, + ssl_keyfile=ssl_keyfile if has_ssl else None, + ssl_certfile=ssl_certfile if has_ssl else None, ) diff --git a/api/models.py b/api/models.py index d0896e2..75d2d5a 100644 --- a/api/models.py +++ b/api/models.py @@ -1,10 +1,10 @@ -"""Database models for Blindsighted""" +"""Database models for Julie""" from datetime import UTC, datetime from enum import Enum from uuid import UUID from uuid_utils.compat import uuid7 -from sqlalchemy import String, DateTime, Integer, Text, Float, ForeignKey +from sqlalchemy import String, DateTime, Integer, Text, Float, ForeignKey, Boolean from sqlalchemy.orm import Mapped, mapped_column, relationship from database import Base @@ -168,3 +168,42 @@ class LifelogEntry(Base): # Relationships user: Mapped["User"] = relationship(back_populates="lifelog_entries") + + +class CSVFile(Base): + """Model for storing CSV files in the database""" + + __tablename__ = "csv_files" + + id: Mapped[UUID] = mapped_column(primary_key=True, default=uuid7) + filename: Mapped[str] = mapped_column(String(255), nullable=False) + file_content: Mapped[str] = mapped_column(Text, nullable=False) # Store CSV content as text + file_size_bytes: Mapped[int] = mapped_column(Integer, nullable=False) + + # Timestamps + created_at: 
Mapped[datetime] = mapped_column( + DateTime(timezone=True), nullable=False, default=lambda: datetime.now(UTC) + ) + updated_at: Mapped[datetime] = mapped_column( + DateTime(timezone=True), + nullable=False, + default=lambda: datetime.now(UTC), + onupdate=lambda: datetime.now(UTC), + ) + + +class UserChoice(Base): + """Model for storing user's selected item from ElevenLabs voice call""" + + __tablename__ = "user_choices" + + id: Mapped[UUID] = mapped_column(primary_key=True, default=uuid7) + item_name: Mapped[str] = mapped_column(String(255), nullable=False) + item_location: Mapped[str | None] = mapped_column(String(255), nullable=True) + csv_file_id: Mapped[UUID | None] = mapped_column(ForeignKey("csv_files.id"), nullable=True) + processed: Mapped[bool] = mapped_column(Boolean, default=False) # Has Gemini acted on this? + + # Timestamps + created_at: Mapped[datetime] = mapped_column( + DateTime(timezone=True), nullable=False, default=lambda: datetime.now(UTC) + ) \ No newline at end of file diff --git a/api/pyproject.toml b/api/pyproject.toml index 31e0367..7d8e74d 100644 --- a/api/pyproject.toml +++ b/api/pyproject.toml @@ -3,9 +3,9 @@ requires = ["setuptools>=61.0"] build-backend = "setuptools.build_meta" [project] -name = "blindsighted-api" +name = "julie-api" version = "0.1.0" -description = "FastAPI backend for Blindsighted app" +description = "FastAPI backend for Julie - AI shopping assistant" requires-python = ">=3.11" dependencies = [ "fastapi==0.115.6", @@ -19,8 +19,6 @@ dependencies = [ "sqlalchemy[asyncio]==2.0.36", "psycopg[binary]==3.2.3", "alembic==1.14.0", - "livekit>=0.18.1", - "livekit-agents==0.12.3", "loguru==0.7.3", "python-multipart==0.0.20", "uuid-utils==0.9.0", diff --git a/api/routers/csv_files.py b/api/routers/csv_files.py new file mode 100644 index 0000000..36678da --- /dev/null +++ b/api/routers/csv_files.py @@ -0,0 +1,93 @@ +"""API routes for CSV file management.""" + +from typing import Annotated + +from fastapi import APIRouter, 
@router.post("/upload")
async def upload_csv(
    file: Annotated[UploadFile, File()],
    db: Annotated[AsyncSession, Depends(get_db)],
) -> dict[str, str]:
    """Upload a CSV file to the database.

    Args:
        file: CSV file to upload
        db: Database session

    Returns:
        A dict with a confirmation ``message``, the new record's ``id``
        and its stored ``filename``.

    Raises:
        HTTPException: 400 if the upload is not valid UTF-8 text or
            contains nothing but whitespace.
    """
    # Read file content
    content = await file.read()

    try:
        content_str = content.decode("utf-8")
    except UnicodeDecodeError as exc:
        # A badly encoded upload is a client error, not a server fault.
        raise HTTPException(
            status_code=400,
            detail="File must be UTF-8 encoded text"
        ) from exc

    # Whitespace-only uploads carry no CSV data either.
    if not content_str.strip():
        raise HTTPException(
            status_code=400,
            detail="File content is empty"
        )

    # Create new CSV file record
    csv_file = CSVFile(
        filename=file.filename or "uploaded.csv",
        file_content=content_str,
        file_size_bytes=len(content),
    )

    db.add(csv_file)
    await db.commit()
    await db.refresh(csv_file)

    return {
        "message": "CSV file uploaded successfully",
        "id": str(csv_file.id),
        "filename": csv_file.filename,
    }
+ +If you can see a clear, full view of a shelf section, respond with: +"READY: [brief description of what you see]" + +Otherwise, give brief, actionable instructions: +- "move left" - if shelf extends to the right out of view +- "move right" - if shelf extends to the left out of view +- "step back" - if too close to see full shelf +- "step forward" - if too far away +- "tilt up" or "tilt down" - if viewing angle needs adjustment + +Be concise. One instruction at a time.""" + +IDENTIFICATION_PROMPT = """You are helping a blind user identify items on a grocery shelf. +List all visible products clearly and concisely. + +For each item include: +- Product name and brand (if visible) +- Position: left/center/right, top/middle/bottom shelf +- Size or quantity if visible + +Format as a numbered list. Be thorough but concise.""" + + +@router.post("/analyze", response_model=PhotoAnalysisResponse) +async def analyze_photo(request: PhotoAnalysisRequest) -> PhotoAnalysisResponse: + """Analyze a photo using the appropriate Gemini instance based on flag. 
+ + Args: + request: Contains base64 image and flag (low/high) + + Returns: + Gemini's analysis response + """ + gemini = GeminiClient() + + # Select prompt based on flag + if request.flag == PhotoFlag.LOW: + prompt = NAVIGATION_PROMPT + else: + prompt = IDENTIFICATION_PROMPT + + try: + response = await gemini.analyze_image( + image_base64=request.image_base64, + prompt=prompt, + ) + + return PhotoAnalysisResponse( + response=response, + flag=request.flag, + ) + except ValueError as e: + raise HTTPException(status_code=500, detail=str(e)) + except Exception as e: + raise HTTPException(status_code=500, detail=f"Gemini API error: {str(e)}") diff --git a/api/routers/sessions.py b/api/routers/sessions.py deleted file mode 100644 index 25ef670..0000000 --- a/api/routers/sessions.py +++ /dev/null @@ -1,159 +0,0 @@ -"""API routes for streaming session management.""" - -from datetime import UTC, datetime -from typing import Annotated -from uuid import UUID - -from fastapi import APIRouter, Depends, HTTPException -from pydantic import BaseModel -from sqlalchemy import select -from sqlalchemy.ext.asyncio import AsyncSession - -from database import get_db -from models import SessionStatus, StreamSession -from services.lk import LiveKitService - -router = APIRouter(prefix="/sessions", tags=["sessions"]) - - -class StartSessionRequest(BaseModel): - """Request to start a new streaming session.""" - - user_id: str | None = None - device_id: str | None = None - agent_id: str | None = None # Custom agent prefix for this session - - -class StartSessionResponse(BaseModel): - """Response after starting a session.""" - - session_id: UUID - room_name: str - token: str - livekit_url: str - - -class StopSessionRequest(BaseModel): - """Request to stop a streaming session.""" - - session_id: UUID - - -def get_livekit_service() -> LiveKitService: - """Dependency for LiveKit service.""" - return LiveKitService() - - -@router.post("/start", response_model=StartSessionResponse) -async def 
start_session( - request: StartSessionRequest, - db: Annotated[AsyncSession, Depends(get_db)], - livekit: Annotated[LiveKitService, Depends(get_livekit_service)], -) -> StartSessionResponse: - """Start a new streaming session with LiveKit. - - Creates a room, generates an access token, and optionally starts recording. - """ - try: - # Generate unique room name - room_name = livekit.generate_room_name() - - # Create LiveKit room with agent_id in metadata - room = await livekit.create_room(room_name, agent_id=request.agent_id) - - # Create database session record - session = StreamSession( - room_name=room_name, - room_sid=room.sid, - user_id=request.user_id, - device_id=request.device_id, - agent_id=request.agent_id, - status=SessionStatus.CREATED, - ) - db.add(session) - await db.commit() - await db.refresh(session) - - # Generate access token for client - participant_identity = request.device_id or f"user-{session.id}" - token = livekit.create_access_token( - room_name=room_name, - participant_identity=participant_identity, - participant_name=request.user_id, - ) - - return StartSessionResponse( - session_id=session.id, - room_name=room_name, - token=token, - livekit_url=livekit.url, - ) - - except Exception as e: - raise HTTPException(status_code=500, detail=f"Failed to start session: {str(e)}") - - -@router.post("/stop") -async def stop_session( - request: StopSessionRequest, - db: Annotated[AsyncSession, Depends(get_db)], - livekit: Annotated[LiveKitService, Depends(get_livekit_service)], -) -> dict[str, str]: - """Stop an active streaming session.""" - try: - # Get session from database - result = await db.execute( - select(StreamSession).where(StreamSession.id == request.session_id) - ) - session = result.scalar_one_or_none() - - if not session: - raise HTTPException(status_code=404, detail="Session not found") - - if session.status == SessionStatus.ENDED: - return {"message": "Session already ended"} - - # Update session status - session.status = 
SessionStatus.ENDED - session.ended_at = datetime.now(UTC) - await db.commit() - - # Delete LiveKit room - try: - await livekit.delete_room(session.room_name) - except Exception as e: - # Log error but don't fail the request - print(f"Failed to delete room: {e}") - - return {"message": "Session stopped successfully"} - - except HTTPException: - raise - except Exception as e: - raise HTTPException(status_code=500, detail=f"Failed to stop session: {str(e)}") - - -@router.get("/{session_id}") -async def get_session( - session_id: int, - db: Annotated[AsyncSession, Depends(get_db)], -) -> dict: - """Get session details.""" - result = await db.execute(select(StreamSession).where(StreamSession.id == session_id)) - session = result.scalar_one_or_none() - - if not session: - raise HTTPException(status_code=404, detail="Session not found") - - return { - "id": session.id, - "room_name": session.room_name, - "room_sid": session.room_sid, - "user_id": session.user_id, - "device_id": session.device_id, - "agent_id": session.agent_id, - "status": session.status, - "created_at": session.created_at.isoformat(), - "started_at": session.started_at.isoformat() if session.started_at else None, - "ended_at": session.ended_at.isoformat() if session.ended_at else None, - } diff --git a/api/routers/user_choice.py b/api/routers/user_choice.py new file mode 100644 index 0000000..5b582c9 --- /dev/null +++ b/api/routers/user_choice.py @@ -0,0 +1,130 @@ +"""API routes for user choice management (ElevenLabs integration).""" + +from typing import Annotated +from uuid import UUID + +from fastapi import APIRouter, Depends, HTTPException +from pydantic import BaseModel +from sqlalchemy import select, desc +from sqlalchemy.ext.asyncio import AsyncSession + +from database import get_db +from models import UserChoice, CSVFile + +router = APIRouter(prefix="/user-choice", tags=["user-choice"]) + + +class UserChoiceRequest(BaseModel): + """Request body for storing user's item selection.""" + + 
item_name: str + item_location: str | None = None + + +class UserChoiceResponse(BaseModel): + """Response after storing user's choice.""" + + message: str + id: str + + +class UserChoiceDetail(BaseModel): + """Detailed user choice response.""" + + id: str + item_name: str + item_location: str | None + processed: bool + created_at: str + + +@router.post("", response_model=UserChoiceResponse) +async def create_user_choice( + request: UserChoiceRequest, + db: Annotated[AsyncSession, Depends(get_db)], +) -> UserChoiceResponse: + """Store the user's selected item from ElevenLabs voice call. + + This endpoint is called by the ElevenLabs agent when the user + has selected an item they want to find on the shelf. + """ + # Get the latest CSV file ID (optional association) + csv_result = await db.execute( + select(CSVFile).order_by(desc(CSVFile.created_at)).limit(1) + ) + latest_csv = csv_result.scalar_one_or_none() + + # Create the user choice record + user_choice = UserChoice( + item_name=request.item_name, + item_location=request.item_location, + csv_file_id=latest_csv.id if latest_csv else None, + processed=False, + ) + + db.add(user_choice) + await db.commit() + await db.refresh(user_choice) + + return UserChoiceResponse( + message="Choice recorded", + id=str(user_choice.id), + ) + + +@router.get("/latest", response_model=UserChoiceDetail | None) +async def get_latest_user_choice( + db: Annotated[AsyncSession, Depends(get_db)], + unprocessed_only: bool = True, +) -> UserChoiceDetail | None: + """Get the latest user choice. + + Args: + unprocessed_only: If True (default), only return unprocessed choices. + Set to False to get the latest choice regardless of status. + + Returns: + The latest user choice, or None if no choices exist. 
+ """ + query = select(UserChoice).order_by(desc(UserChoice.created_at)) + + if unprocessed_only: + query = query.where(UserChoice.processed == False) # noqa: E712 + + query = query.limit(1) + result = await db.execute(query) + user_choice = result.scalar_one_or_none() + + if not user_choice: + return None + + return UserChoiceDetail( + id=str(user_choice.id), + item_name=user_choice.item_name, + item_location=user_choice.item_location, + processed=user_choice.processed, + created_at=user_choice.created_at.isoformat(), + ) + + +@router.patch("/{choice_id}/processed") +async def mark_choice_processed( + choice_id: UUID, + db: Annotated[AsyncSession, Depends(get_db)], +) -> dict[str, str]: + """Mark a user choice as processed. + + Called by the Gemini agent after it has guided the user to the item. + """ + result = await db.execute( + select(UserChoice).where(UserChoice.id == choice_id) + ) + user_choice = result.scalar_one_or_none() + + if not user_choice: + raise HTTPException(status_code=404, detail="User choice not found") + + user_choice.processed = True + await db.commit() + + return {"message": "Choice marked as processed"} diff --git a/api/services/lk.py b/api/services/lk.py deleted file mode 100644 index a3ee79c..0000000 --- a/api/services/lk.py +++ /dev/null @@ -1,145 +0,0 @@ -"""LiveKit service for managing rooms, tokens, and egress.""" - -import secrets - -from livekit import api - -from config import settings - - -class LiveKitService: - """Service for interacting with LiveKit server.""" - - def __init__(self) -> None: - """Initialize the LiveKit service.""" - self.api_key = settings.livekit_api_key - self.api_secret = settings.livekit_api_secret - self.url = settings.livekit_url - - def generate_room_name(self) -> str: - """Generate a unique room name for a streaming session. 
- - Returns: - A unique room name string - """ - return f"blindsighted-{secrets.token_urlsafe(16)}" - - async def create_room(self, room_name: str, agent_id: str | None = None) -> api.Room: - """Create a new LiveKit room. - - Args: - room_name: Name of the room to create - agent_id: Optional agent identifier to store in room metadata - - Returns: - The created Room object - """ - lkapi = api.LiveKitAPI(self.url, self.api_key, self.api_secret) - room_service = lkapi.room - - room = await room_service.create_room( - api.CreateRoomRequest(name=room_name, metadata=agent_id or "") - ) - await lkapi.aclose() - return room - - def create_access_token( - self, - room_name: str, - participant_identity: str, - participant_name: str | None = None, - ) -> str: - """Create an access token for a participant to join a room. - - Args: - room_name: Name of the room - participant_identity: Unique identity for the participant - participant_name: Display name for the participant - - Returns: - Access token string - """ - token = api.AccessToken(self.api_key, self.api_secret) - token.with_identity(participant_identity) - token.with_name(participant_name or participant_identity) - token.with_grants( - api.VideoGrants( - room_join=True, - room=room_name, - can_publish=True, - can_subscribe=True, - ) - ) - return token.to_jwt() - - async def start_room_composite_egress( - self, - room_name: str, - r2_key: str, - ) -> api.EgressInfo: - """Start room composite egress to record the entire room. 
- - Args: - room_name: Name of the room to record - r2_key: S3/R2 key path for the recording - - Returns: - EgressInfo object with egress details - """ - lkapi = api.LiveKitAPI(self.url, self.api_key, self.api_secret) - egress_service = lkapi.egress - - # Configure S3 (R2-compatible) upload - s3_upload = api.S3Upload( - access_key=settings.r2_access_key_id, - secret=settings.r2_secret_access_key, - region="auto", - endpoint=f"https://{settings.cloudflare_account_id}.r2.cloudflarestorage.com", - bucket=settings.r2_bucket_name, - ) - - # Start room composite egress - egress_info = await egress_service.start_room_composite_egress( - api.RoomCompositeEgressRequest( - room_name=room_name, - file_outputs=[ - api.EncodedFileOutput( - file_type=api.EncodedFileType.MP4, - filepath=r2_key, - s3=s3_upload, - ) - ], - ) - ) - - await lkapi.aclose() - return egress_info - - async def stop_egress(self, egress_id: str) -> api.EgressInfo: - """Stop an active egress. - - Args: - egress_id: ID of the egress to stop - - Returns: - Updated EgressInfo object - """ - lkapi = api.LiveKitAPI(self.url, self.api_key, self.api_secret) - egress_service = lkapi.egress - - egress_info = await egress_service.stop_egress(api.StopEgressRequest(egress_id=egress_id)) - - await lkapi.aclose() - return egress_info - - async def delete_room(self, room_name: str) -> None: - """Delete a LiveKit room. 
- - Args: - room_name: Name of the room to delete - """ - lkapi = api.LiveKitAPI(self.url, self.api_key, self.api_secret) - room_service = lkapi.room - - await room_service.delete_room(api.DeleteRoomRequest(room=room_name)) - await lkapi.aclose() diff --git a/ios/Blindsighted.xcodeproj/project.pbxproj b/ios/Blindsighted.xcodeproj/project.pbxproj index 4a73536..071610d 100644 --- a/ios/Blindsighted.xcodeproj/project.pbxproj +++ b/ios/Blindsighted.xcodeproj/project.pbxproj @@ -14,6 +14,7 @@ 0ADFF2BC65A91AA27F154CAC /* APIClient.swift in Sources */ = {isa = PBXBuildFile; fileRef = 6DB9AF0994BD6E62E7F77EB0 /* APIClient.swift */; }; 19E457CD71E5D4AF4E94D3F7 /* LifelogSyncManager.swift in Sources */ = {isa = PBXBuildFile; fileRef = 52B78A156B637E320CC7C498 /* LifelogSyncManager.swift */; }; 2658C6B816296F40AF610C8C /* LocationManager.swift in Sources */ = {isa = PBXBuildFile; fileRef = 24CAA0CF5F78EFD5AA49AD32 /* LocationManager.swift */; }; + 326E0940E9DAFAE85E9CFB55 /* PhotoGalleryViewModel.swift in Sources */ = {isa = PBXBuildFile; fileRef = D6C9991AB19A9B2A8D38891F /* PhotoGalleryViewModel.swift */; }; 36D904DCCAF867D40E5E3EEC /* AudioCaptureManager.swift in Sources */ = {isa = PBXBuildFile; fileRef = 3F5BDB9829320DD5FD1D86C5 /* AudioCaptureManager.swift */; }; 3EF0E5D9F4F3939E0328E694 /* AudioManager.swift in Sources */ = {isa = PBXBuildFile; fileRef = 97FBF1B9A4137446FC49EE27 /* AudioManager.swift */; }; 4D7D8EA119AE050C9044FFB2 /* AudioTestView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 0100FB0CC48442082755E922 /* AudioTestView.swift */; }; @@ -40,6 +41,8 @@ 8FFD60612E84A2F70035E446 /* DebugMenuView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8FFD605E2E84A2F70035E446 /* DebugMenuView.swift */; }; 9CDE2A402E6F101100B7F891 /* LiveKit in Frameworks */ = {isa = PBXBuildFile; productRef = 9CDE2A3F2E6F101100B7F891 /* LiveKit */; }; A7F63494631A1366EC67F4D4 /* LiveKitSettingsView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 
FCE29742DFA877E6A55FDE1B /* LiveKitSettingsView.swift */; }; + BFC56F3CD7F28459B63E8669 /* PhotoFileManager.swift in Sources */ = {isa = PBXBuildFile; fileRef = 2060D72BCDBE068F8B0F5553 /* PhotoFileManager.swift */; }; + C8242E6839C21A37FC1E157F /* PhotoGalleryView.swift in Sources */ = {isa = PBXBuildFile; fileRef = A2AD5CFFF8CC5DAF77E0441B /* PhotoGalleryView.swift */; }; E3404C0C2F197F100060330D /* VideoFileManager.swift in Sources */ = {isa = PBXBuildFile; fileRef = E3404C0A2F197F100060330D /* VideoFileManager.swift */; }; E3404C0D2F197F100060330D /* VideoRecorder.swift in Sources */ = {isa = PBXBuildFile; fileRef = E3404C0B2F197F100060330D /* VideoRecorder.swift */; }; E3404C102F197F280060330D /* VideoPlayerView.swift in Sources */ = {isa = PBXBuildFile; fileRef = E3404C0F2F197F280060330D /* VideoPlayerView.swift */; }; @@ -82,6 +85,7 @@ 0100FB0CC48442082755E922 /* AudioTestView.swift */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = sourcecode.swift; path = AudioTestView.swift; sourceTree = ""; }; 0866ED6D8B295B34C2E33BA0 /* LiveKitManager.swift */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = sourcecode.swift; path = LiveKitManager.swift; sourceTree = ""; }; 0B0028DE23F5C74C1828B9AA /* Config.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; path = Config.xcconfig; sourceTree = ""; }; + 2060D72BCDBE068F8B0F5553 /* PhotoFileManager.swift */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = sourcecode.swift; path = PhotoFileManager.swift; sourceTree = ""; }; 24CAA0CF5F78EFD5AA49AD32 /* LocationManager.swift */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = sourcecode.swift; path = LocationManager.swift; sourceTree = ""; }; 2A0F08BAC0F3290D10A14384 /* AudioTestView.swift */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = sourcecode.swift; path = AudioTestView.swift; sourceTree = ""; }; 3A3A3A3A3A3A3A3A3A3A3A3A /* 
Blindsighted.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = Blindsighted.app; sourceTree = BUILT_PRODUCTS_DIR; }; @@ -110,9 +114,11 @@ 8FFD605E2E84A2F70035E446 /* DebugMenuView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = DebugMenuView.swift; sourceTree = ""; }; 8FFD605F2E84A2F70035E446 /* MainAppView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = MainAppView.swift; sourceTree = ""; }; 97FBF1B9A4137446FC49EE27 /* AudioManager.swift */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = sourcecode.swift; path = AudioManager.swift; sourceTree = ""; }; + A2AD5CFFF8CC5DAF77E0441B /* PhotoGalleryView.swift */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = sourcecode.swift; path = PhotoGalleryView.swift; sourceTree = ""; }; AC2624254004236E8AAC8C71 /* LiveKitConfig.swift */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = sourcecode.swift; path = LiveKitConfig.swift; sourceTree = ""; }; D201422AD8220CEFF11C5BB8 /* PreviewVideoFileManager.swift */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = sourcecode.swift; path = PreviewVideoFileManager.swift; sourceTree = ""; }; D44CA48B3558E3BBC2F5925C /* AudioManager.swift */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = sourcecode.swift; path = AudioManager.swift; sourceTree = ""; }; + D6C9991AB19A9B2A8D38891F /* PhotoGalleryViewModel.swift */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = sourcecode.swift; path = PhotoGalleryViewModel.swift; sourceTree = ""; }; E3404C0A2F197F100060330D /* VideoFileManager.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = VideoFileManager.swift; sourceTree = ""; }; E3404C0B2F197F100060330D /* VideoRecorder.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = VideoRecorder.swift; sourceTree = ""; }; 
E3404C0E2F197F280060330D /* LifelogView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LifelogView.swift; sourceTree = ""; }; @@ -192,6 +198,7 @@ 8F2D237F2E856711002D0588 /* DebugMenuViewModel.swift */, 8FD96B6F2E6F0A9800F56AB1 /* StreamSessionViewModel.swift */, 8F8F00772E8ACB4500A4BDAF /* WearablesViewModel.swift */, + D6C9991AB19A9B2A8D38891F /* PhotoGalleryViewModel.swift */, ); path = ViewModels; sourceTree = ""; @@ -215,6 +222,7 @@ 0100FB0CC48442082755E922 /* AudioTestView.swift */, FCE29742DFA877E6A55FDE1B /* LiveKitSettingsView.swift */, 4AFEB3EC3CB48EFDBA6D4593 /* LifelogDayView.swift */, + A2AD5CFFF8CC5DAF77E0441B /* PhotoGalleryView.swift */, ); path = Views; sourceTree = ""; @@ -260,6 +268,7 @@ D201422AD8220CEFF11C5BB8 /* PreviewVideoFileManager.swift */, 4E728DACFA5D93E9D63D4F96 /* PreviewVideoSeeder.swift */, 52B78A156B637E320CC7C498 /* LifelogSyncManager.swift */, + 2060D72BCDBE068F8B0F5553 /* PhotoFileManager.swift */, ); path = Utils; sourceTree = ""; @@ -429,6 +438,9 @@ F5CA9E6B4755AAE815951ACE /* LifelogDayView.swift in Sources */, F9B689D748BAF0A123792860 /* PreviewVideoSeeder.swift in Sources */, 19E457CD71E5D4AF4E94D3F7 /* LifelogSyncManager.swift in Sources */, + BFC56F3CD7F28459B63E8669 /* PhotoFileManager.swift in Sources */, + 326E0940E9DAFAE85E9CFB55 /* PhotoGalleryViewModel.swift in Sources */, + C8242E6839C21A37FC1E157F /* PhotoGalleryView.swift in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; @@ -462,7 +474,7 @@ CODE_SIGN_STYLE = Automatic; CURRENT_PROJECT_VERSION = 1; DEVELOPMENT_ASSET_PATHS = ""; - DEVELOPMENT_TEAM = 59L345955F; + DEVELOPMENT_TEAM = FZ26Z8W5FZ; ENABLE_PREVIEWS = YES; FRAMEWORK_SEARCH_PATHS = ""; INFOPLIST_FILE = Blindsighted/Info.plist; @@ -473,7 +485,7 @@ "@executable_path/Frameworks", ); MARKETING_VERSION = 1.0; - PRODUCT_BUNDLE_IDENTIFIER = info.hails.blindsighted; + PRODUCT_BUNDLE_IDENTIFIER = com.w3joe.blindsighted; PRODUCT_NAME = "$(TARGET_NAME)"; 
PROVISIONING_PROFILE_SPECIFIER = ""; SWIFT_VERSION = 5.0; @@ -493,7 +505,7 @@ CODE_SIGN_STYLE = Automatic; CURRENT_PROJECT_VERSION = 1; DEVELOPMENT_ASSET_PATHS = ""; - DEVELOPMENT_TEAM = 59L345955F; + DEVELOPMENT_TEAM = FZ26Z8W5FZ; ENABLE_PREVIEWS = YES; INFOPLIST_FILE = Blindsighted/Info.plist; INFOPLIST_KEY_CFBundleDisplayName = Blindsighted; @@ -503,7 +515,7 @@ "@executable_path/Frameworks", ); MARKETING_VERSION = 1.0; - PRODUCT_BUNDLE_IDENTIFIER = info.hails.blindsighted; + PRODUCT_BUNDLE_IDENTIFIER = com.w3joe.blindsighted; PRODUCT_NAME = "$(TARGET_NAME)"; PROVISIONING_PROFILE_SPECIFIER = ""; SWIFT_VERSION = 5.0; diff --git a/ios/Blindsighted.xcodeproj/project.xcworkspace/xcuserdata/w3joe.xcuserdatad/UserInterfaceState.xcuserstate b/ios/Blindsighted.xcodeproj/project.xcworkspace/xcuserdata/w3joe.xcuserdatad/UserInterfaceState.xcuserstate new file mode 100644 index 0000000..f4f420e Binary files /dev/null and b/ios/Blindsighted.xcodeproj/project.xcworkspace/xcuserdata/w3joe.xcuserdatad/UserInterfaceState.xcuserstate differ diff --git a/ios/Blindsighted/Utils/APIClient.swift b/ios/Blindsighted/Utils/APIClient.swift index 7f5df6a..7227864 100644 --- a/ios/Blindsighted/Utils/APIClient.swift +++ b/ios/Blindsighted/Utils/APIClient.swift @@ -1,4 +1,5 @@ import Foundation +import UIKit /// Request to start a streaming session struct StartSessionRequest: Codable { @@ -53,6 +54,41 @@ struct APIErrorResponse: Codable { let detail: String } +/// Request to analyze a photo +struct PhotoAnalysisRequest: Codable { + let imageBase64: String + let flag: PhotoFlag + + enum CodingKeys: String, CodingKey { + case imageBase64 = "image_base64" + case flag + } +} + +/// Response from /photos/analyze endpoint +struct PhotoAnalysisResponse: Codable { + let response: String + let flag: PhotoFlag +} + +/// Errors specific to photo operations +enum PhotoError: Error, LocalizedError { + case noFlag + case loadFailed + case encodingFailed + + var errorDescription: String? 
{ + switch self { + case .noFlag: + return "Photo has no flag in filename" + case .loadFailed: + return "Failed to load image from disk" + case .encodingFailed: + return "Failed to encode image as JPEG" + } + } +} + /// Errors from API client enum APIClientError: Error, LocalizedError { case invalidURL @@ -108,6 +144,44 @@ class APIClient: ObservableObject { return try await post(endpoint: "/sessions/stop", body: request) } + // MARK: - Photo Analysis + + /// Analyze a photo using Gemini based on its flag + /// - Parameter photo: CapturedPhoto with flag in filename + /// - Returns: PhotoAnalysisResponse with Gemini's analysis + func analyzePhoto(_ photo: CapturedPhoto) async throws -> PhotoAnalysisResponse { + guard let flag = photo.flag else { + throw PhotoError.noFlag + } + guard let image = PhotoFileManager.shared.loadImage(for: photo) else { + throw PhotoError.loadFailed + } + guard let data = image.jpegData(compressionQuality: 0.8) else { + throw PhotoError.encodingFailed + } + + let base64 = data.base64EncodedString() + let request = PhotoAnalysisRequest(imageBase64: base64, flag: flag) + + return try await post(endpoint: "/photos/analyze", body: request) + } + + /// Analyze a photo directly from image and flag + /// - Parameters: + /// - image: UIImage to analyze + /// - flag: PhotoFlag indicating analysis type (low/high) + /// - Returns: PhotoAnalysisResponse with Gemini's analysis + func analyzePhoto(image: UIImage, flag: PhotoFlag) async throws -> PhotoAnalysisResponse { + guard let data = image.jpegData(compressionQuality: 0.8) else { + throw PhotoError.encodingFailed + } + + let base64 = data.base64EncodedString() + let request = PhotoAnalysisRequest(imageBase64: base64, flag: flag) + + return try await post(endpoint: "/photos/analyze", body: request) + } + // MARK: - HTTP Methods private func post(endpoint: String, body: T) async throws -> R { diff --git a/ios/Blindsighted/Utils/PhotoFileManager.swift b/ios/Blindsighted/Utils/PhotoFileManager.swift new 
file mode 100644 index 0000000..a0de255 --- /dev/null +++ b/ios/Blindsighted/Utils/PhotoFileManager.swift @@ -0,0 +1,151 @@ +// +// PhotoFileManager.swift +// +// Manages photo file storage and retrieval in the app's Documents directory. +// Handles periodic photo capture storage during recording sessions. +// + +import Foundation +import UIKit + +let PHOTOS_DIRECTORY = "BlindsightedPhotos" + +/// Flag indicating the type of photo analysis needed +enum PhotoFlag: String, Codable { + case low = "low" // Navigation mode - guide user positioning + case high = "high" // Identification mode - list shelf items +} + +struct CapturedPhoto: Identifiable, Codable { + let id: UUID + let filename: String + let capturedAt: Date + let fileSize: Int64 + + var url: URL { + PhotoFileManager.shared.photoURL(for: filename) + } + + /// Parse flag from filename (e.g., "photo_2026-01-17_low.jpg" -> .low) + var flag: PhotoFlag? { + if filename.contains("_low.") { return .low } + if filename.contains("_high.") { return .high } + return nil + } +} + +class PhotoFileManager { + static let shared = PhotoFileManager() + + private let photosDirectory: URL + + private init() { + let documentsPath = FileManager.default.urls(for: .documentDirectory, in: .userDomainMask)[0] + self.photosDirectory = documentsPath.appendingPathComponent(PHOTOS_DIRECTORY, isDirectory: true) + + // Create photos directory if it doesn't exist + try? 
FileManager.default.createDirectory( + at: photosDirectory, + withIntermediateDirectories: true, + attributes: nil + ) + } + + /// Get the photos directory URL (for Files app access info) + var directoryURL: URL { + photosDirectory + } + + /// Save a photo and return its URL + /// - Parameters: + /// - image: The UIImage to save + /// - flag: Optional PhotoFlag to embed in filename (low/high) + /// - quality: JPEG compression quality (0.0 to 1.0) + /// - Returns: URL of saved photo, or nil if save failed + @discardableResult + func savePhoto(_ image: UIImage, flag: PhotoFlag? = nil, quality: CGFloat = 0.8) -> URL? { + guard let data = image.jpegData(compressionQuality: quality) else { + return nil + } + + let timestamp = ISO8601DateFormatter().string(from: Date()) + .replacingOccurrences(of: ":", with: "-") // Remove colons for filename compatibility + let flagSuffix = flag.map { "_\($0.rawValue)" } ?? "" + let filename = "photo_\(timestamp)\(flagSuffix).jpg" + let url = photosDirectory.appendingPathComponent(filename) + + do { + try data.write(to: url) + return url + } catch { + print("Failed to save photo: \(error)") + return nil + } + } + + /// Get URL for a specific filename + func photoURL(for filename: String) -> URL { + return photosDirectory.appendingPathComponent(filename) + } + + /// List all captured photos + /// - Parameter flag: Optional flag to filter by (nil returns all photos) + /// - Returns: Array of CapturedPhoto sorted by date (most recent first) + func listPhotos(withFlag flag: PhotoFlag? = nil) throws -> [CapturedPhoto] { + let fileURLs = try FileManager.default.contentsOfDirectory( + at: photosDirectory, + includingPropertiesForKeys: [.creationDateKey, .fileSizeKey], + options: [.skipsHiddenFiles] + ) + + let jpgFiles = fileURLs.filter { $0.pathExtension.lowercased() == "jpg" } + + let photos = try jpgFiles.compactMap { url -> CapturedPhoto? 
in + let attributes = try FileManager.default.attributesOfItem(atPath: url.path) + let fileSize = attributes[.size] as? Int64 ?? 0 + let creationDate = attributes[.creationDate] as? Date ?? Date() + + return CapturedPhoto( + id: UUID(), + filename: url.lastPathComponent, + capturedAt: creationDate, + fileSize: fileSize + ) + }.sorted { $0.capturedAt > $1.capturedAt } // Most recent first + + // Filter by flag if specified + if let flag = flag { + return photos.filter { $0.flag == flag } + } + return photos + } + + /// Delete a single photo + func deletePhoto(_ photo: CapturedPhoto) throws { + try FileManager.default.removeItem(at: photo.url) + } + + /// Delete all photos + func deleteAllPhotos() throws { + let photos = try listPhotos() + for photo in photos { + try? FileManager.default.removeItem(at: photo.url) + } + } + + /// Get total storage used by photos + func totalStorageUsed() throws -> Int64 { + let photos = try listPhotos() + return photos.reduce(0) { $0 + $1.fileSize } + } + + /// Get photo count + func photoCount() -> Int { + return (try? listPhotos().count) ?? 0 + } + + /// Load image from a captured photo + func loadImage(for photo: CapturedPhoto) -> UIImage? { + return UIImage(contentsOfFile: photo.url.path) + } +}
diff --git a/ios/Blindsighted/ViewModels/PhotoGalleryViewModel.swift b/ios/Blindsighted/ViewModels/PhotoGalleryViewModel.swift
new file mode 100644
index 0000000..65052a3
--- /dev/null
+++ b/ios/Blindsighted/ViewModels/PhotoGalleryViewModel.swift
@@ -0,0 +1,172 @@
+//
+// PhotoGalleryViewModel.swift
+//
+// View model for managing the photo gallery, displaying all periodic photos captured during recordings.
+//
+
+import Foundation
+import SwiftUI
+
+/// Backs the photo gallery screen: lists captured photos, caches grid thumbnails,
+/// and deletes photos through `PhotoFileManager`.
+@MainActor
+class PhotoGalleryViewModel: ObservableObject {
+    @Published var photos: [CapturedPhoto] = []
+    /// Grid thumbnails keyed by `CapturedPhoto.id`.
+    @Published var thumbnails: [UUID: UIImage] = [:]
+    @Published var selectedPhoto: CapturedPhoto?
+    @Published var isLoading: Bool = false
+    @Published var showError: Bool = false
+    @Published var errorMessage: String = ""
+    @Published var totalStorage: String = "0 MB"
+    @Published var showDeleteAllConfirmation: Bool = false
+
+    private let fileManager = PhotoFileManager.shared
+
+    /// The in-flight reload, kept so that a newer reload can cancel it.
+    private var loadTask: Task<Void, Never>?
+
+    init() {
+        loadPhotos()
+    }
+
+    /// Reloads the photo list, the storage total, and the thumbnails.
+    ///
+    /// Returns immediately; the work runs in a task on the main actor. A new call
+    /// cancels the previous reload so that two reloads never interleave.
+    func loadPhotos() {
+        loadTask?.cancel()
+        isLoading = true
+
+        loadTask = Task {
+            do {
+                let listed = try fileManager.listPhotos()
+                guard !Task.isCancelled else { return }
+                // Every listing builds its photos with a fresh `UUID()`, so cached
+                // thumbnails are re-keyed here; otherwise the cache would never hit
+                // and would keep growing with entries for old ids.
+                thumbnails = carriedOverThumbnails(for: listed)
+                photos = listed
+                updateTotalStorage()
+                await loadThumbnails()
+            } catch {
+                showError("Failed to load photos: \(error.localizedDescription)")
+            }
+
+            // A cancelled reload was superseded; the newer one owns `isLoading`.
+            if !Task.isCancelled {
+                isLoading = false
+            }
+        }
+    }
+
+    /// Moves cached thumbnails from the current `photos` onto `listed`, matching by
+    /// file URL and dropping thumbnails whose file is no longer listed.
+    private func carriedOverThumbnails(for listed: [CapturedPhoto]) -> [UUID: UIImage] {
+        var cachedByURL: [URL: UIImage] = [:]
+        for photo in photos {
+            if let thumbnail = thumbnails[photo.id] {
+                cachedByURL[photo.url] = thumbnail
+            }
+        }
+
+        var carried: [UUID: UIImage] = [:]
+        for photo in listed {
+            if let thumbnail = cachedByURL[photo.url] {
+                carried[photo.id] = thumbnail
+            }
+        }
+        return carried
+    }
+
+    /// Refreshes `totalStorage`; shows "Unknown" when the size cannot be read.
+    private func updateTotalStorage() {
+        do {
+            let bytes = try fileManager.totalStorageUsed()
+            totalStorage = bytes.formattedFileSize
+        } catch {
+            totalStorage = "Unknown"
+        }
+    }
+
+    /// Fills in missing thumbnails for the first 100 photos.
+    private func loadThumbnails() async {
+        // Load thumbnails for visible photos (limit to first 100 for performance)
+        for photo in photos.prefix(100) where thumbnails[photo.id] == nil {
+            guard let image = fileManager.loadImage(for: photo) else { continue }
+
+            // Create smaller thumbnail for grid display
+            let thumbnail = await createThumbnail(from: image, maxSize: 200)
+
+            // The list may have changed while suspended: stop if this reload was
+            // superseded, and skip photos that were deleted in the meantime.
+            if Task.isCancelled { return }
+            if photos.contains(where: { $0.id == photo.id }) {
+                thumbnails[photo.id] = thumbnail
+            }
+        }
+    }
+
+    /// Scales `image` down to fit within `maxSize` points on its longer side.
+    ///
+    /// Uses UIKit's asynchronous thumbnail preparation so decoding and scaling do not
+    /// run on the main actor. Returns `image` itself when it has no area, already
+    /// fits, or the thumbnail cannot be prepared.
+    private func createThumbnail(from image: UIImage, maxSize: CGFloat) async -> UIImage {
+        let size = image.size
+        // A zero dimension would make the scale factor infinite.
+        guard size.width > 0, size.height > 0 else { return image }
+
+        let scale = min(maxSize / size.width, maxSize / size.height)
+        // Never enlarge: a small photo is already a suitable thumbnail.
+        guard scale < 1 else { return image }
+
+        let newSize = CGSize(width: size.width * scale, height: size.height * scale)
+        return await image.byPreparingThumbnail(ofSize: newSize) ?? image
+    }
+
+    /// Deletes `photo` from disk and removes it from the published state.
+    func deletePhoto(_ photo: CapturedPhoto) {
+        do {
+            try fileManager.deletePhoto(photo)
+            photos.removeAll { $0.id == photo.id }
+            thumbnails.removeValue(forKey: photo.id)
+            updateTotalStorage()
+
+            if selectedPhoto?.id == photo.id {
+                selectedPhoto = nil
+            }
+        } catch {
+            showError("Failed to delete photo: \(error.localizedDescription)")
+        }
+    }
+
+    /// Asks the file manager to delete every photo, then clears the published state.
+    func deleteAllPhotos() {
+        do {
+            try fileManager.deleteAllPhotos()
+            photos.removeAll()
+            thumbnails.removeAll()
+            selectedPhoto = nil
+            updateTotalStorage()
+        } catch {
+            showError("Failed to delete photos: \(error.localizedDescription)")
+        }
+    }
+
+    /// Loads the full-size image for `photo`, or `nil` if it cannot be read.
+    func loadFullImage(for photo: CapturedPhoto) -> UIImage? {
+        return fileManager.loadImage(for: photo)
+    }
+
+    /// Publishes `message` and raises the error alert flag.
+    private func showError(_ message: String) {
+        errorMessage = message
+        showError = true
+    }
+
+    /// Get the directory path for Files app access
+    var photosDirectoryPath: String {
+        fileManager.directoryURL.path
+    }
+}
diff --git a/ios/Blindsighted/ViewModels/StreamSessionViewModel.swift b/ios/Blindsighted/ViewModels/StreamSessionViewModel.swift index aef02c5..19591f6 100644 --- a/ios/Blindsighted/ViewModels/StreamSessionViewModel.swift +++ b/ios/Blindsighted/ViewModels/StreamSessionViewModel.swift @@ -43,6 +43,11 @@ class StreamSessionViewModel: ObservableObject { private var recordingMetadata: VideoMetadata? private let locationManager = LocationManager.shared + // Periodic photo capture properties + @Published var capturedPhotoCount: Int = 0 + private var photoTimer: Timer?
+ private let photoCaptureInterval: TimeInterval = 1.0 // Capture every 1 second + // LiveKit streaming properties @Published var isLiveKitConnected: Bool = false @Published var isMicrophoneMuted: Bool = false @@ -342,6 +347,9 @@ class StreamSessionViewModel: ObservableObject { if let lat = metadata.latitude, let lon = metadata.longitude { NSLog("[Blindsighted] Location: \(lat), \(lon)") } + + // Start periodic photo capture + startPeriodicPhotoCapture() } catch { showError("Failed to start recording: \(error.localizedDescription)") } @@ -352,6 +360,9 @@ class StreamSessionViewModel: ObservableObject { isRecording = false + // Stop periodic photo capture + stopPeriodicPhotoCapture() + do { let savedURL = try await recorder.stopRecording() NSLog("[Blindsighted] Video saved to: \(savedURL.path)") @@ -376,6 +387,45 @@ class StreamSessionViewModel: ObservableObject { recordingMetadata = nil } + // MARK: - Periodic Photo Capture + + /// Start capturing photos every second during recording + private func startPeriodicPhotoCapture() { + capturedPhotoCount = 0 + + // Capture first photo immediately + captureAndSavePhoto() + + // Schedule timer for subsequent photos + photoTimer = Timer.scheduledTimer(withTimeInterval: photoCaptureInterval, repeats: true) { [weak self] _ in + Task { @MainActor [weak self] in + self?.captureAndSavePhoto() + } + } + + NSLog("[Blindsighted] Started periodic photo capture (every \(photoCaptureInterval)s)") + } + + /// Stop periodic photo capture + private func stopPeriodicPhotoCapture() { + photoTimer?.invalidate() + photoTimer = nil + NSLog("[Blindsighted] Stopped periodic photo capture. 
Total photos: \(capturedPhotoCount)") + } + + /// Capture current frame and save to photo storage + private func captureAndSavePhoto() { + guard let frame = currentVideoFrame else { + NSLog("[Blindsighted] No video frame available for photo capture") + return + } + + if let url = PhotoFileManager.shared.savePhoto(frame) { + capturedPhotoCount += 1 + NSLog("[Blindsighted] Captured photo #\(capturedPhotoCount): \(url.lastPathComponent)") + } + } + // MARK: - LiveKit Methods /// Start publishing video and audio to LiveKit after first frame diff --git a/ios/Blindsighted/Views/MainAppView.swift b/ios/Blindsighted/Views/MainAppView.swift index 57dcadc..4f1247e 100644 --- a/ios/Blindsighted/Views/MainAppView.swift +++ b/ios/Blindsighted/Views/MainAppView.swift @@ -38,11 +38,19 @@ struct MainAppView: View { .accessibilityLabel("Lifelog tab") .accessibilityHint("Browse your recorded memories") + PhotoGalleryView() + .tabItem { + Label("Photos", systemImage: "photo.on.rectangle") + } + .tag(2) + .accessibilityLabel("Photos tab") + .accessibilityHint("Browse photos captured during recording") + AudioTestView() .tabItem { Label("Audio", systemImage: "headphones") } - .tag(2) + .tag(3) .accessibilityLabel("Audio testing tab") .accessibilityHint("Test audio routing to your glasses") }
diff --git a/ios/Blindsighted/Views/PhotoGalleryView.swift b/ios/Blindsighted/Views/PhotoGalleryView.swift
new file mode 100644
index 0000000..c36cf78
--- /dev/null
+++ b/ios/Blindsighted/Views/PhotoGalleryView.swift
@@ -0,0 +1,255 @@
+//
+// PhotoGalleryView.swift
+//
+// Photo gallery view for browsing all periodic photos captured during recording sessions.
+// Displays photos in a grid layout with full-size preview on tap.
+//
+
+import SwiftUI
+
+/// Grid of captured photos with a per-photo context menu, a delete-all option, and a
+/// full-size preview sheet.
+struct PhotoGalleryView: View {
+    @StateObject private var viewModel = PhotoGalleryViewModel()
+
+    private let columns = [
+        GridItem(.flexible(), spacing: 2),
+        GridItem(.flexible(), spacing: 2),
+        GridItem(.flexible(), spacing: 2)
+    ]
+
+    var body: some View {
+        NavigationView {
+            ScrollView {
+                if viewModel.photos.isEmpty && !viewModel.isLoading {
+                    emptyStateView
+                } else {
+                    LazyVGrid(columns: columns, spacing: 2) {
+                        ForEach(viewModel.photos) { photo in
+                            photoCell(photo)
+                        }
+                    }
+                    .padding(2)
+                }
+            }
+            .background(Color(UIColor.systemBackground))
+            .navigationTitle("Photos")
+            .navigationBarTitleDisplayMode(.large)
+            .toolbar {
+                ToolbarItem(placement: .navigationBarTrailing) {
+                    Menu {
+                        Text("\(viewModel.photos.count) photos")
+                        Text("Storage: \(viewModel.totalStorage)")
+                        Divider()
+                        Button(role: .destructive, action: {
+                            viewModel.showDeleteAllConfirmation = true
+                        }) {
+                            Label("Delete All Photos", systemImage: "trash.fill")
+                        }
+                        .disabled(viewModel.photos.isEmpty)
+                    } label: {
+                        Image(systemName: "ellipsis.circle")
+                    }
+                    .accessibilityLabel("Options menu")
+                    .accessibilityHint("Opens photo count, storage info, and deletion options")
+                }
+            }
+            .refreshable {
+                viewModel.loadPhotos()
+            }
+            .alert("Error", isPresented: $viewModel.showError) {
+                Button("OK") {}
+            } message: {
+                Text(viewModel.errorMessage)
+            }
+            .confirmationDialog(
+                "Delete All Photos?",
+                isPresented: $viewModel.showDeleteAllConfirmation,
+                titleVisibility: .visible
+            ) {
+                Button("Delete All", role: .destructive) {
+                    viewModel.deleteAllPhotos()
+                }
+                Button("Cancel", role: .cancel) {}
+            } message: {
+                Text("This will permanently delete all \(viewModel.photos.count) photos. This action cannot be undone.")
+            }
+            .sheet(item: $viewModel.selectedPhoto) { photo in
+                PhotoDetailView(photo: photo, viewModel: viewModel)
+            }
+            .overlay {
+                if viewModel.isLoading && viewModel.photos.isEmpty {
+                    ProgressView()
+                        .scaleEffect(1.5)
+                        .accessibilityLabel("Loading photos")
+                }
+            }
+        }
+    }
+
+    /// One square grid cell: the thumbnail once it is cached, a spinner until then.
+    @ViewBuilder
+    private func photoCell(_ photo: CapturedPhoto) -> some View {
+        Button {
+            viewModel.selectedPhoto = photo
+        } label: {
+            // The square placeholder fixes the cell's shape; the thumbnail is overlaid
+            // and cropped, so non-square photos keep their proportions instead of
+            // being stretched to 1:1.
+            Rectangle()
+                .fill(Color.gray.opacity(0.3))
+                .aspectRatio(1, contentMode: .fit)
+                .overlay {
+                    if let thumbnail = viewModel.thumbnails[photo.id] {
+                        Image(uiImage: thumbnail)
+                            .resizable()
+                            .scaledToFill()
+                    } else {
+                        ProgressView()
+                            .scaleEffect(0.8)
+                    }
+                }
+                .clipped()
+        }
+        .buttonStyle(.plain)
+        .accessibilityLabel("Photo from \(formatDateTime(photo.capturedAt))")
+        .accessibilityHint("Double tap to view full size")
+        .contextMenu {
+            Button {
+                viewModel.selectedPhoto = photo
+            } label: {
+                Label("View Photo", systemImage: "eye")
+            }
+            Button(role: .destructive) {
+                viewModel.deletePhoto(photo)
+            } label: {
+                Label("Delete", systemImage: "trash")
+            }
+        }
+    }
+
+    /// Abbreviated date with a short time, used for VoiceOver labels.
+    private func formatDateTime(_ date: Date) -> String {
+        // Called once per grid cell, so avoid building a DateFormatter on every call.
+        date.formatted(date: .abbreviated, time: .shortened)
+    }
+
+    private var emptyStateView: some View {
+        VStack(spacing: 16) {
+            Image(systemName: "photo.on.rectangle.angled")
+                .font(.system(size: 60))
+                .foregroundColor(.secondary)
+                .accessibilityHidden(true)
+
+            Text("No Photos")
+                .font(.title2)
+                .fontWeight(.semibold)
+
+            Text("Photos are automatically captured every second during recording")
+                .font(.subheadline)
+                .foregroundColor(.secondary)
+                .multilineTextAlignment(.center)
+                .padding(.horizontal, 40)
+        }
+        .frame(maxWidth: .infinity, maxHeight: .infinity)
+        .padding(.top, 100)
+    }
+}
+
+// MARK: - Photo Detail View
+
+/// Full-screen preview of one photo with its capture date and a share button.
+struct PhotoDetailView: View {
+    let photo: CapturedPhoto
+    @ObservedObject var viewModel: PhotoGalleryViewModel
+    @Environment(\.dismiss) private var dismiss
+    @State private var fullImage: UIImage?
+    /// Set when the image could not be loaded, so the spinner can give way to a message.
+    @State private var loadFailed: Bool = false
+
+    var body: some View {
+        NavigationView {
+            GeometryReader { geometry in
+                ZStack {
+                    Color.black.ignoresSafeArea()
+
+                    if let image = fullImage {
+                        Image(uiImage: image)
+                            .resizable()
+                            .aspectRatio(contentMode: .fit)
+                            .frame(maxWidth: geometry.size.width, maxHeight: geometry.size.height)
+                            .accessibilityLabel("Full size photo from \(formatDateTime(photo.capturedAt))")
+                    } else if loadFailed {
+                        Label("Unable to load photo", systemImage: "exclamationmark.triangle")
+                            .foregroundColor(.white)
+                    } else {
+                        ProgressView()
+                            .scaleEffect(1.5)
+                            .tint(.white)
+                            .accessibilityLabel("Loading full size photo")
+                    }
+                }
+            }
+            .navigationBarTitleDisplayMode(.inline)
+            .toolbar {
+                ToolbarItem(placement: .navigationBarLeading) {
+                    Button("Done") {
+                        dismiss()
+                    }
+                    .foregroundColor(.white)
+                }
+
+                ToolbarItem(placement: .principal) {
+                    VStack(spacing: 2) {
+                        Text(formatDate(photo.capturedAt))
+                            .font(.subheadline)
+                            .fontWeight(.semibold)
+                        Text(formatTime(photo.capturedAt))
+                            .font(.caption)
+                            .foregroundColor(.secondary)
+                    }
+                    .foregroundColor(.white)
+                }
+
+                ToolbarItem(placement: .navigationBarTrailing) {
+                    if let image = fullImage {
+                        ShareLink(item: Image(uiImage: image), preview: SharePreview("Photo", image: Image(uiImage: image))) {
+                            Image(systemName: "square.and.arrow.up")
+                                .foregroundColor(.white)
+                        }
+                        .accessibilityLabel("Share photo")
+                        .accessibilityHint("Opens share options for this photo")
+                    }
+                }
+            }
+            .toolbarBackground(.black, for: .navigationBar)
+            .toolbarBackground(.visible, for: .navigationBar)
+        }
+        .task {
+            fullImage = viewModel.loadFullImage(for: photo)
+            // Without this, an unreadable or missing file would leave the spinner up forever.
+            loadFailed = fullImage == nil
+        }
+    }
+
+    /// Abbreviated date without a time.
+    private func formatDate(_ date: Date) -> String {
+        date.formatted(date: .abbreviated, time: .omitted)
+    }
+
+    /// Time of day including seconds, without a date.
+    private func formatTime(_ date: Date) -> String {
+        date.formatted(date: .omitted, time: .standard)
+    }
+
+    /// Abbreviated date with a short time, used for VoiceOver labels.
+    private func formatDateTime(_ date: Date) -> String {
+        date.formatted(date: .abbreviated, time: .shortened)
+    }
+}
+
+#if DEBUG
+#Preview("Empty State") {
+    PhotoGalleryView()
+}
+#endif