diff --git a/docs/AUTH_SERVER_ASR_IMPLEMENTATION.md b/docs/AUTH_SERVER_ASR_IMPLEMENTATION.md index 5deaa99cc..d9cd7cae6 100644 --- a/docs/AUTH_SERVER_ASR_IMPLEMENTATION.md +++ b/docs/AUTH_SERVER_ASR_IMPLEMENTATION.md @@ -1,402 +1,88 @@ # Auth Server ASR Proxy Implementation Guide -## Overview +> **This document was rewritten in 2026 to reflect the current OmniASR +> (HTTP POST) contract.** The previous WebSocket-based MMS proxy described +> here is no longer in use. -The Codex Editor client now supports authenticated ASR (Automatic Speech Recognition) transcription through the Frontier auth server. This document describes what needs to be implemented on the auth server side. +## Status -**Status**: Client implementation is complete and deployed. Auth server implementation is required to enable the feature. +- **Upstream service**: Meta Omnilingual ASR (`omniASR_LLM_1B_v2`), served + on Modal as `https://genesis-ai-dev--codex-asr-serve.modal.run` + (renamed from the historical `mms-zeroshot-asr` deployment — same + workload, model-agnostic name). +- **Client**: Codex Editor talks to the Frontier auth-proxy via plain + HTTP POST (multipart). No WebSocket. See + [`asr-proxy-endpoint.md`](./asr-proxy-endpoint.md) for the full wire spec + and reference FastAPI implementation. -## What You Need to Implement +## What the auth server must implement -### 1. Add `getAsrEndpoint()` Method to FrontierAPI +### 1. `getAsrEndpoint()` on FrontierAPI -The client expects a new method on the FrontierAPI interface that returns the authenticated ASR proxy endpoint. 
- -**Method Signature**: ```typescript getAsrEndpoint(): Promise ``` -**Returns**: The WebSocket URL for the authenticated ASR proxy (e.g., `wss://auth.frontier.com/ws/asr`) - -**Example Implementation**: -```typescript -async getAsrEndpoint(): Promise { - if (!this.isAuthenticated) { - return undefined; - } - - // Return your ASR proxy WebSocket URL - return "wss://auth.frontier.com/ws/asr"; - // OR from config: - // return this.config.asrProxyUrl; -} -``` - -**Pattern Reference**: This follows the exact same pattern as your existing `getLlmEndpoint()` method. - -### 2. Implement WebSocket Proxy Endpoint: `/ws/asr` - -Create a new WebSocket endpoint that: -1. Validates the JWT token from the query parameter -2. Proxies messages between the client and the actual ASR service (Ryder's Modal endpoint) -3. Logs usage for authenticated users - -#### Endpoint Details - -**URL Pattern**: `wss://your-auth-server.com/ws/asr?token=JWT_TOKEN` - -**Authentication**: JWT token passed as query parameter `token` - -**Upstream Service**: `wss://ryderwishart--asr-websocket-transcription-fastapi-asgi.modal.run/ws/transcribe` - -#### Message Flow - -``` -Client → Auth Server → ASR Service (Ryder's endpoint) - ↓ ↓ ↓ - ←─────────←──────────────← -``` - -1. Client sends metadata (JSON) -2. Auth server forwards to ASR service -3. Client sends audio (binary) -4. Auth server forwards to ASR service -5. ASR service sends progress/results (JSON) -6. 
Auth server forwards to client - -## Complete Python Implementation Example - -Here's a complete FastAPI implementation you can use as a reference: - -```python -from fastapi import FastAPI, WebSocket, WebSocketDisconnect, Query, HTTPException -from fastapi.responses import JSONResponse -import websockets -import jwt -import asyncio -import logging -from datetime import datetime - -app = FastAPI() -logger = logging.getLogger(__name__) - -# Configuration -ASR_UPSTREAM_URL = "wss://ryderwishart--asr-websocket-transcription-fastapi-asgi.modal.run/ws/transcribe" -JWT_SECRET = "your-jwt-secret-here" # Use your actual JWT secret -JWT_ALGORITHM = "HS256" - -def validate_token(token: str) -> dict: - """ - Validate JWT token and return decoded payload. - - Raises: - HTTPException: If token is invalid or expired - """ - try: - payload = jwt.decode(token, JWT_SECRET, algorithms=[JWT_ALGORITHM]) - return payload - except jwt.ExpiredSignatureError: - raise HTTPException(status_code=401, detail="Token expired") - except jwt.InvalidTokenError: - raise HTTPException(status_code=401, detail="Invalid token") - -@app.websocket("/ws/asr") -async def websocket_asr_proxy( - websocket: WebSocket, - token: str = Query(..., description="JWT authentication token") -): - """ - WebSocket proxy for ASR transcription with authentication. - - This endpoint: - 1. Validates the user's JWT token - 2. Establishes a connection to the upstream ASR service - 3. Proxies messages bidirectionally between client and ASR service - 4. 
Logs usage for monitoring - """ - - # Validate token before accepting connection - try: - user_payload = validate_token(token) - user_id = user_payload.get("sub") or user_payload.get("user_id") - username = user_payload.get("username") or user_payload.get("email") - except HTTPException as e: - await websocket.close(code=1008, reason=f"Authentication failed: {e.detail}") - logger.warning(f"Authentication failed: {e.detail}") - return - - # Accept client connection - await websocket.accept() - logger.info(f"User {username} (ID: {user_id}) started ASR session at {datetime.utcnow()}") - - # Connect to upstream ASR service - upstream_ws = None - try: - upstream_ws = await websockets.connect(ASR_UPSTREAM_URL) - logger.info(f"Connected to upstream ASR service for user {username}") - - async def forward_to_client(): - """Forward messages from ASR service to client""" - try: - async for message in upstream_ws: - await websocket.send_text(message) - logger.debug(f"Forwarded message to client {username}: {message[:100]}...") - except websockets.exceptions.ConnectionClosed: - logger.info(f"Upstream ASR connection closed for user {username}") - except Exception as e: - logger.error(f"Error forwarding to client {username}: {e}") - try: - await websocket.send_text( - '{"type": "error", "message": "Connection to transcription service lost"}' - ) - except: - pass - - async def forward_to_asr(): - """Forward messages from client to ASR service""" - try: - while True: - message = await websocket.receive() - - if "text" in message: - # Forward JSON metadata - await upstream_ws.send(message["text"]) - logger.debug(f"Forwarded metadata from {username}: {message['text'][:100]}...") - elif "bytes" in message: - # Forward binary audio data - audio_size = len(message["bytes"]) - await upstream_ws.send(message["bytes"]) - logger.info(f"Forwarded {audio_size} bytes of audio from {username}") - except WebSocketDisconnect: - logger.info(f"Client {username} disconnected") - except Exception as 
e: - logger.error(f"Error forwarding from client {username}: {e}") - - # Run both forwarding tasks concurrently - await asyncio.gather( - forward_to_client(), - forward_to_asr(), - return_exceptions=True - ) - - except Exception as e: - logger.error(f"Failed to connect to upstream ASR service for user {username}: {e}") - error_msg = { - "type": "error", - "message": f"Failed to connect to transcription service: {str(e)}" - } - try: - await websocket.send_json(error_msg) - except: - pass - finally: - # Cleanup - if upstream_ws: - await upstream_ws.close() - try: - await websocket.close() - except: - pass - logger.info(f"ASR session ended for user {username} (ID: {user_id})") - -@app.get("/health") -async def health_check(): - """Health check endpoint""" - return {"status": "healthy", "service": "asr-proxy"} - -if __name__ == "__main__": - import uvicorn - uvicorn.run(app, host="0.0.0.0", port=8000) -``` - -## WebSocket Protocol Details - -The client implements this protocol, which your proxy must support: - -### Client → ASR Service - -**Step 1**: Client sends JSON metadata -```json -{ - "type": "meta", - "provider": "mms", - "model": "facebook/mms-1b-all", - "mime": "audio/webm", - "language": "eng", - "task": "transcribe", - "phonetic": false -} -``` - -**Step 2**: Client sends binary audio data (Blob) - -### ASR Service → Client - -**Progress Updates** (during processing): -```json -{ - "type": "progress", - "data": "Processing audio...", - "percentage": 50 -} -``` - -**Final Result** (on completion): -```json -{ - "type": "done", - "text": "This is the transcribed text", - "language": "eng", - "provider": "mms", - "model": "facebook/mms-1b-all", - "phonetic": "ðɪs ɪz ðə trænskraɪbd tɛkst" -} -``` - -**Error Message** (on failure): -```json -{ - "type": "error", - "message": "Transcription failed: invalid audio format" -} -``` - -## Implementation Checklist - -- [ ] Add `getAsrEndpoint()` method to FrontierAPI class - - Returns `Promise` - - Returns your ASR 
proxy URL (e.g., `wss://auth.frontier.com/ws/asr`) - - Returns `undefined` if not authenticated - -- [ ] Create WebSocket endpoint at `/ws/asr` - - Accepts `token` as query parameter - - Validates JWT token - - Rejects with code 1008 if token invalid - -- [ ] Implement bidirectional proxy - - Forward JSON text messages - - Forward binary audio data - - Handle connection lifecycle - - Clean up resources on disconnect - -- [ ] Add logging - - Log successful authentications with user ID - - Log ASR session start/end times - - Log audio data sizes for monitoring - - Log errors and failures - -- [ ] Test the implementation - - Valid token → successful proxying - - Invalid token → rejection with code 1008 - - Missing token → rejection - - Large audio files → proper streaming - - Connection interruptions → graceful cleanup - -## Configuration - -You'll need to configure: - -1. **JWT Secret**: Same secret used for other JWT validation -2. **Upstream ASR URL**: `wss://ryderwishart--asr-websocket-transcription-fastapi-asgi.modal.run/ws/transcribe` -3. **Proxy Endpoint URL**: The URL you'll return from `getAsrEndpoint()` - -## Testing - -### Manual Test with wscat - -```bash -# Install wscat -npm install -g wscat - -# Test with valid token -wscat -c "wss://your-auth-server.com/ws/asr?token=YOUR_JWT_TOKEN" - -# Send metadata -> {"type":"meta","mime":"audio/webm"} - -# Observe responses -< {"type":"progress","data":"Processing...","percentage":50} -``` - -### Integration Test - -The Codex Editor client will automatically use your proxy when: -1. User is authenticated -2. `getAsrEndpoint()` returns a URL -3. User transcribes audio - -You can verify by checking your logs for authenticated transcription sessions. - -## Security Considerations - -1. **Token Validation**: Always validate JWT before accepting connection -2. **Rate Limiting**: Consider implementing per-user rate limits -3. **Timeout**: Set reasonable timeouts (30-60s) for transcription -4. 
**File Size Limits**: Consider limiting audio size if needed -5. **HTTPS/WSS**: Always use secure WebSocket in production -6. **Logging**: Log usage but respect user privacy (don't log audio content) - -## Monitoring Recommendations - -Track these metrics: -- Total ASR requests per day -- Active concurrent transcriptions -- Average transcription duration -- Error rate by error type -- Audio size distribution -- Per-user usage - -## Reference Implementation - -The LLM proxy endpoint on your auth server follows a similar pattern. You can use that as a reference for: -- JWT validation approach -- Error handling patterns -- Logging format -- Configuration management - -## Support - -If you need clarification on: -- Client behavior: See `docs/asr-proxy-endpoint.md` -- Message protocol: See examples above -- Client implementation: See `webviews/codex-webviews/src/CodexCellEditor/WhisperTranscriptionClient.ts` - -## Deployment Notes - -### Before Deployment -1. Test with a staging environment first -2. Verify JWT token validation works correctly -3. Test with large audio files (>10MB) -4. Confirm error handling works as expected - -### After Deployment -1. Monitor logs for authentication failures -2. Check for any proxy errors -3. Verify transcription quality unchanged -4. Monitor for rate limit needs - -## Timeline - -**Client Ready**: ✅ Implemented and deployed - -**Auth Server Required**: This implementation - -**User Impact**: None until auth server is deployed (users will continue using manual endpoint configuration) - -**Urgency**: Medium - allows transition away from Ryder's personal namespace - ---- - -## Questions? 
- -For questions about: -- **Client implementation**: Check `docs/asr-auth-proxy-implementation-summary.md` -- **Protocol details**: Check `docs/asr-proxy-endpoint.md` -- **Client code**: Check `webviews/codex-webviews/src/CodexCellEditor/WhisperTranscriptionClient.ts` - -## Version - -- **Client Version**: Implemented in v0.6.21+ -- **Last Updated**: 2025-10-14 - +Returns the **HTTPS** URL of the proxy's transcribe endpoint +(e.g. `https://auth.frontier.example/api/v1/asr/transcribe`). The client +performs a multipart POST against that URL. + +This mirrors the existing `getLlmEndpoint()`. + +### 2. `POST /api/v1/asr/transcribe` proxy endpoint + +A pass-through that: + +1. Validates the Frontier JWT (Authorization header or `?token=` query). +2. Forwards the multipart audio body to OmniASR. +3. **Forwards the optional `?lang=...` query parameter** when the client + supplies it (OmniASR `{iso639_3}_{Script}` format, e.g. `swh_Latn`). + In auto-detect mode the client omits `lang`; the proxy must also omit + it when calling upstream. +4. Returns OmniASR's JSON response verbatim (`text`, `duration_s`, + `inference_s`, and `lang` — echoed when one was sent, otherwise the + MMS-LID-resolved code, or absent if language ID failed — plus `lid_s` + in auto-detect mode). + +A complete reference FastAPI implementation is in +[`asr-proxy-endpoint.md`](./asr-proxy-endpoint.md#example-implementation-pythonfastapi). + +## Migration from the WebSocket / MMS era + +Anything the client used to send over WebSocket (provider, model, +language as bare ISO 639-3, phonetic flag, etc.) is gone: + +- **No more `provider` / `model` fields**: the upstream is OmniASR; the + client doesn't choose providers. +- **No more `phonetic`**: OmniASR doesn't support IPA output. +- **No more bare ISO 639-3 codes**: OmniASR requires `{iso639_3}_{Script}` + (e.g. `urd_Arab`, not `urd`). The client resolves this from the project + language using `sharedUtils/asrLanguageUtils.ts`. +- **No more `lang=auto` magic value**: omit `lang` entirely for + auto-detect.
+ +## Key references + +- Wire contract: [`docs/asr-proxy-endpoint.md`](./asr-proxy-endpoint.md) +- Client: `webviews/codex-webviews/src/CodexCellEditor/WhisperTranscriptionClient.ts` +- Lang resolver + supported codes: `sharedUtils/asrLanguageUtils.ts`, + `sharedUtils/omniAsrSupportedLangs.ts`, + `sharedUtils/omniAsrDefaultScripts.ts`, + `sharedUtils/omniAsrFriendlyNames.ts` +- Modal app (source of truth for the upstream): + [`docs/asr/codex_asr_modal.py`](./asr/codex_asr_modal.py) in this repo. + Logs and dashboards: + . + +## Action items for the Frontier auth proxy team + +1. Point the upstream ASR URL at the new app: + `https://genesis-ai-dev--codex-asr-serve.modal.run/transcribe` + (previously `…--mms-zeroshot-asr-…`). The legacy app is still up so + there's no urgency, but it should not be considered the source of + truth — only `codex-asr` will receive future updates. +2. Make sure the proxy forwards the optional `?lang=` query string + verbatim and does not synthesise one when the client omits it + (auto-detect mode). +3. Drop any `provider`, `model`, `phonetic`, `language` fields that + used to be part of the multipart/form body — they're no longer sent. +4. Once the proxy is migrated, we can decommission the + `mms-zeroshot-asr` Modal app. diff --git a/docs/asr-auth-proxy-implementation-summary.md b/docs/asr-auth-proxy-implementation-summary.md index 2f0515033..0748f84a4 100644 --- a/docs/asr-auth-proxy-implementation-summary.md +++ b/docs/asr-auth-proxy-implementation-summary.md @@ -1,5 +1,13 @@ # ASR Authentication Proxy Implementation Summary +> **Historical changelog.** This documents the initial WebSocket-era +> introduction of the Frontier auth proxy. The current contract is HTTP +> POST and the upstream is OmniASR (not MMS). 
For an up-to-date wire +> spec and reference implementation see +> [`asr-proxy-endpoint.md`](./asr-proxy-endpoint.md); for the auth-server +> integration points see +> [`AUTH_SERVER_ASR_IMPLEMENTATION.md`](./AUTH_SERVER_ASR_IMPLEMENTATION.md). + ## Overview Successfully migrated ASR transcription from Ryder's personal Modal namespace to an authenticated proxy architecture. The system now supports: diff --git a/docs/asr-proxy-endpoint.md b/docs/asr-proxy-endpoint.md index 8b1e40956..e05215410 100644 --- a/docs/asr-proxy-endpoint.md +++ b/docs/asr-proxy-endpoint.md @@ -1,22 +1,33 @@ # ASR HTTP POST Endpoint Specification -This document describes the HTTP POST protocol for implementing an ASR (Automatic Speech Recognition) transcription endpoint compatible with the Codex Editor. +This document describes the HTTP POST protocol the Codex Editor expects from +an ASR (Automatic Speech Recognition) endpoint. The reference upstream is +**Meta Omnilingual ASR** (`omniASR_LLM_1B_v2`), served on Modal as +`genesis-ai-dev--codex-asr-serve.modal.run` (renamed from the +historical `mms-zeroshot-asr` deployment). + +The Frontier auth server runs a thin **proxy** in front of that Modal +endpoint, adds JWT validation, and is what the Codex client actually talks to +in production. This spec covers the proxy's wire contract; the proxy in turn +forwards to OmniASR. ## Overview -The Codex Editor uses a simple HTTP POST request for audio transcription. This allows for straightforward integration without WebSocket complexity. +The client uses a simple multipart HTTP POST to the proxy URL. No +WebSockets, no streaming progress messages. One request → one transcription. ## Authentication -The client passes authentication via a JWT token as either: +The client passes a Frontier JWT via either: 1. **Authorization header**: `Authorization: Bearer ` 2. **Query parameter**: `?token=&source=codex` The server should: -1. Validate the JWT token before processing the request -2. 
Reject requests with invalid or missing tokens (401) -3. Establish a connection to the actual ASR service (e.g., Modal endpoint) -4. Forward the audio file and return the transcription result +1. Validate the JWT before processing. +2. Reject invalid/missing tokens with HTTP 401. +3. Forward the audio (and the optional `lang` query parameter, if present) + to the upstream OmniASR service. +4. Return the upstream's JSON response. ## Request Protocol @@ -35,20 +46,34 @@ Authorization: Bearer (optional if token in query) ### Query Parameters -- `source` (required): `"codex"` or `"langquest"` -- `token` (optional): JWT token if not in Authorization header +- `source` (required): `"codex"` or `"langquest"` — for logging. +- `token` (optional): JWT, if not in the Authorization header. +- `lang` (**optional**): OmniASR language code in + `{iso639_3}_{Script}` form (e.g. `swh_Latn`, `urd_Arab`, `cmn_Hans`). + Forward this directly to OmniASR. **Omit** it to engage the upstream's + built-in language ID — `codex-asr` runs MMS-LID first and feeds the + detected code into OmniASR (the resolved code is then included in the + response). The full list of accepted codes is bundled with the client + in `sharedUtils/omniAsrSupportedLangs.ts` (and is the live response of + OmniASR's `GET /languages`). 
### Request Body **Content-Type**: `multipart/form-data` **Form Fields**: -- `file`: Audio file (WAV, MP3, OGG, FLAC, WebM - max 50MB) +- `file`: Audio file (WAV, MP3, OGG, FLAC, WebM, M4A — max 50 MB, + max 40 s per chunk; OmniASR chunks longer audio internally) -### Example Request +### Example Requests ```bash -curl -X POST "http://localhost:8000/api/v1/asr/transcribe?source=codex&token=JWT_TOKEN" \ +# Auto-detect (no lang) +curl -X POST "https://auth.frontier.example/api/v1/asr/transcribe?source=codex&token=JWT_TOKEN" \ + -F "file=@audio.wav" + +# Project-language mode (Swahili, Latin script) +curl -X POST "https://auth.frontier.example/api/v1/asr/transcribe?source=codex&token=JWT_TOKEN&lang=swh_Latn" \ -F "file=@audio.wav" ``` @@ -60,10 +85,26 @@ curl -X POST "http://localhost:8000/api/v1/asr/transcribe?source=codex&token=JWT { "text": "This is the transcribed text", "duration_s": 4.94, - "inference_s": 1.72 + "inference_s": 1.72, + "lang": "swh_Latn" } ``` +The `lang` field reflects what was **actually used** for transcription: +- Request supplied `lang` → echoed verbatim. +- Request omitted `lang` → upstream ran MMS-LID and the resolved + `{iso639_3}_{Script}` code is returned here. If LID failed (silence, + unrecognised language, …) the field is omitted, but the response still + includes `lid_s` so callers can tell auto-detect actually ran. The + client renders an "Auto Detect" badge in that case. + +Auto-detect responses include an additional numeric `lid_s` field +(e.g. `"lid_s": 0.35`) with the LID inference time (useful for monitoring). + +The client also accepts a legacy field name `language` in place of `lang` +(this was the Frontier proxy's earlier convention) — either works. Prefer +`lang` going forward.
+ ### Error Response (4xx/5xx) ```json @@ -73,32 +114,30 @@ curl -X POST "http://localhost:8000/api/v1/asr/transcribe?source=codex&token=JWT ``` **Common Error Codes**: -- `400`: Bad Request (missing source parameter, invalid audio format) +- `400`: Bad request (missing source, invalid audio, unknown `lang` code) - `401`: Unauthorized (invalid or missing token) -- `502`: Bad Gateway (upstream service unavailable) -- `504`: Gateway Timeout (upstream service timeout) +- `502`: Bad gateway (upstream OmniASR unavailable) +- `504`: Gateway timeout (upstream timeout) ## Example Implementation (Python/FastAPI) -Here's a basic example of implementing the ASR proxy endpoint: - ```python from fastapi import FastAPI, UploadFile, File, HTTPException, Query, Header from fastapi.responses import JSONResponse import httpx import jwt +from typing import Optional app = FastAPI() -# Configuration -ASR_SERVICE_URL = "https://genesis-ai-dev--mms-zeroshot-asr-serve.modal.run/transcribe" +# Configuration (post-rename; the old URL was +# https://genesis-ai-dev--mms-zeroshot-asr-serve.modal.run/transcribe) +ASR_SERVICE_URL = "https://genesis-ai-dev--codex-asr-serve.modal.run/transcribe" JWT_SECRET = "your-jwt-secret" def validate_token(token: str) -> dict: - """Validate JWT token and return payload""" try: - payload = jwt.decode(token, JWT_SECRET, algorithms=["HS256"]) - return payload + return jwt.decode(token, JWT_SECRET, algorithms=["HS256"]) except jwt.InvalidTokenError: raise HTTPException(status_code=401, detail="Invalid token") @@ -107,75 +146,70 @@ async def transcribe_audio( file: UploadFile = File(...), authorization: Optional[str] = Header(None), token: Optional[str] = Query(None), - source: str = Query(...) 
+ source: str = Query(...), + lang: Optional[str] = Query(None), # OmniASR {iso639_3}_{Script} ): - """HTTP POST endpoint for ASR transcription with authentication""" - - # Extract token from header or query auth_token = None if authorization and authorization.startswith("Bearer "): auth_token = authorization[7:] elif token: auth_token = token - if not auth_token: raise HTTPException(status_code=401, detail="Token required") - - # Validate token - try: - user = validate_token(auth_token) - user_id = user.get("sub") - except HTTPException: - raise - - # Read audio file + validate_token(auth_token) + audio_content = await file.read() - - # Forward to upstream ASR service + async with httpx.AsyncClient(timeout=60.0) as client: files = {"file": (file.filename, audio_content, file.content_type)} - response = await client.post(ASR_SERVICE_URL, files=files) - + params = {} + if lang: + params["lang"] = lang + response = await client.post(ASR_SERVICE_URL, files=files, params=params) + if response.status_code != 200: raise HTTPException( status_code=response.status_code, - detail=f"Transcription service error: {response.text}" + detail=f"Transcription service error: {response.text}", ) - + + # Pass OmniASR's response through verbatim (it echoes `lang` when the + # client sent one; in auto-detect mode it returns the MMS-LID-resolved + # code, or omits `lang` if language ID failed).
return JSONResponse(content=response.json()) ``` ## Client Implementation Reference -The Codex Editor client implementation can be found in: - -- **TypeScript Client**: `webviews/codex-webviews/src/CodexCellEditor/WhisperTranscriptionClient.ts` -- **Integration**: `webviews/codex-webviews/src/CodexCellEditor/CodexCellEditor.tsx` +- **Client**: `webviews/codex-webviews/src/CodexCellEditor/WhisperTranscriptionClient.ts` +- **Code resolver** (project language → `{iso639_3}_{Script}`): + `sharedUtils/asrLanguageUtils.ts` +- **Supported codes**: `sharedUtils/omniAsrSupportedLangs.ts` +- **Default scripts**: `sharedUtils/omniAsrDefaultScripts.ts` +- **Friendly names**: `sharedUtils/omniAsrFriendlyNames.ts` -### Key Client Behavior +### Key Client Behaviour -1. Requests ASR config (including auth token) from VS Code extension -2. Creates FormData with audio blob -3. POSTs to endpoint URL with token in query parameter or Authorization header -4. Receives JSON response with transcription text -5. Handles errors and timeouts (default 60s) +1. Requests ASR config (endpoint + auth token + resolved OmniASR code) from the extension host. +2. POSTs `multipart/form-data` with the audio file; forwards `?lang=...` when in project mode. +3. Parses `lang` (or legacy `language`) from the JSON response and stores it + on the cell's audio attachment. +4. Renders the badge from the stored code via + `labelForTranscriptionLanguage()`. ## Testing Your Implementation -### Test Cases - -1. **Valid audio**: Should return transcription -2. **Invalid audio format**: Should return error message -3. **Missing token**: Should reject with 401 -4. **Invalid token**: Should reject with 401 -5. **Timeout**: Should handle gracefully (client has 60s timeout) -6. **Large audio files**: Should handle up to 50MB -7. **Network errors**: Should return appropriate error codes +1. **Project-mode request**: `?lang=swh_Latn` → expect 200 with + `"lang": "swh_Latn"` in response. +2. 
**Auto-detect**: no `lang` → expect 200 with `lid_s` and the MMS-LID-resolved `lang` in the response (`lang` is absent only if language ID failed). +3. **Unknown code**: `?lang=zzz_Zzzz` → expect 400 with descriptive error. +4. **Invalid token**: 401. +5. **Large audio (≤ 50 MB)**: 200. +6. **Long audio (> 40 s)**: OmniASR chunks it; expect 200 with full + concatenated transcription. +7. **Network error / upstream down**: 502/504 surfaced honestly. ## Supported Audio Formats -The endpoint should support common audio formats: - - `audio/webm` (recommended for browser recording) - `audio/wav` - `audio/mp3` @@ -185,28 +219,20 @@ The endpoint should support common audio formats: ## Security Considerations -1. **Token Validation**: Always validate JWT tokens before processing -2. **Rate Limiting**: Implement per-user rate limits to prevent abuse -3. **File Size Limits**: Set reasonable limits on audio file sizes (50MB recommended) -4. **Timeout**: Implement server-side timeouts to prevent hanging requests (60s recommended) -5. **Logging**: Log usage for monitoring and debugging (but respect privacy) -6. **HTTPS**: Always use secure connections in production - -## Performance Recommendations - -1. **Streaming**: For very large files, consider streaming uploads -2. **Caching**: Cache model loading to reduce cold starts (handled by upstream service) -3. **Resource Cleanup**: Properly close connections and free resources -4. **Concurrent Requests**: Handle multiple simultaneous transcriptions efficiently -5. **Timeout Handling**: Set reasonable timeouts for upstream requests +1. **Token validation**: validate JWT before processing. +2. **Rate limiting**: per-user limits to prevent abuse. +3. **File size limits**: 50 MB. +4. **Timeout**: server-side timeouts to prevent hanging requests (60 s recommended). +5. **Logging**: log usage for monitoring but respect privacy. +6. **HTTPS**: always. ## Integration with Frontier Auth Server The Frontier auth server should: -1. Provide `getAsrEndpoint()` method returning the proxy HTTP URL -2.
Generate short-lived JWT tokens for ASR requests -3. Include user identification in tokens for logging -4. Handle token refresh if needed for long transcriptions +1. Implement `getAsrEndpoint()` returning the proxy HTTPS URL. +2. Generate short-lived JWTs for ASR requests. +3. Include user identification in tokens for logging. +4. Handle token refresh for long transcriptions if needed. -This follows the same pattern as the existing `getLlmEndpoint()` implementation. +This follows the same pattern as the existing `getLlmEndpoint()`. diff --git a/docs/asr/README.md b/docs/asr/README.md new file mode 100644 index 000000000..344def767 --- /dev/null +++ b/docs/asr/README.md @@ -0,0 +1,69 @@ +# Codex ASR deployment + +Modal source for the ASR backend used by the Codex Translation Editor. + +| File | What it is | +|------|------------| +| [`codex_asr_modal.py`](./codex_asr_modal.py) | The Modal app source. Deploy with `modal deploy`. | + +## Live URLs + +- **Current (post-rename)**: `https://genesis-ai-dev--codex-asr-serve.modal.run` +- **Legacy (kept warm during migration)**: `https://genesis-ai-dev--mms-zeroshot-asr-serve.modal.run` + +The legacy URL serves the same workload — the app was renamed from +`mms-zeroshot-asr` to `codex-asr` so the URL no longer encodes the +model family. Both deployments will be active during the rollout; the +legacy one is decommissioned after the Frontier auth proxy and any +hard-coded client defaults are updated to the new URL. + +## Deploying + +You need `modal` CLI installed (`pipx install modal`) and authenticated +(`modal token new`) with access to the `genesis-ai-dev` workspace. 
+ +```bash +cd /path/to/repo-root +modal deploy docs/asr/codex_asr_modal.py +``` + +For local development against your own Modal workspace: + +```bash +modal serve docs/asr/codex_asr_modal.py +``` + +## Sanity-checking after deploy + +```bash +# Service identity +curl -s https://genesis-ai-dev--codex-asr-serve.modal.run/ + +# Full supported-langs list (used to regenerate the client snapshot) +curl -s https://genesis-ai-dev--codex-asr-serve.modal.run/languages | jq '.count' + +# Transcribe with language hint +curl -X POST -F "file=@some_audio.wav" \ + "https://genesis-ai-dev--codex-asr-serve.modal.run/transcribe?lang=eng_Latn" + +# Transcribe in auto-detect mode (response carries the MMS-LID-resolved `lang` plus `lid_s`) +curl -X POST -F "file=@some_audio.wav" \ + https://genesis-ai-dev--codex-asr-serve.modal.run/transcribe +``` + +## Wire spec + +See [`../asr-proxy-endpoint.md`](../asr-proxy-endpoint.md) for the full +HTTP POST contract the Codex client expects (this Modal app implements +it; the Frontier auth proxy sits in front and adds JWT validation). + +## Open follow-ups + +- **Server-side LID for auto-detect mode (landed with `codex_asr_modal.py`).** + OmniASR LLM doesn't return a detected language when run without `lang` + conditioning, so the app bakes `facebook/mms-lid-2048` into the image + and runs it before transcription when the client omits `lang`, then + passes the detected code through as the conditioning input and echoes + it back. ~+1 GB VRAM, ~+1–2 s latency, makes the badge honest in + auto-detect mode. When LID fails the `lang` field is omitted and the + client falls back to the "Auto Detect" badge. diff --git a/docs/asr/codex_asr_modal.py b/docs/asr/codex_asr_modal.py new file mode 100644 index 000000000..0ba08f52f --- /dev/null +++ b/docs/asr/codex_asr_modal.py @@ -0,0 +1,510 @@ +""" +codex-asr — Modal deployment for the Codex Translation Editor's ASR backend. + +This is the **source of truth** for the deployed Modal app at +`https://genesis-ai-dev--codex-asr-serve.modal.run`.
+ +Model: Meta Omnilingual ASR (`omniASR_LLM_1B_v2`). 1600+ languages. +Native-script output, optional language conditioning. + +Naming +~~~~~~ +The Modal app is named `codex-asr` (model-agnostic) rather than +`mms-zeroshot-asr` (the old name, when the upstream was MMS Zero-Shot). +This is so the URL stays stable when we change models. Do NOT rename +again casually — every consumer (Codex client default endpoint, +Frontier auth proxy upstream URL, docs, snapshot regen instructions) +hard-codes `codex-asr`. + +Migration plan (if `codex-asr` ever needs to change): + 1. Deploy the new name first, keep `codex-asr` running. + 2. Update the Frontier auth proxy's upstream URL. + 3. Update the client's default endpoint in `package.json` + (`codex-editor-extension.asrEndpoint`) and any docs. + 4. Decommission `codex-asr` after a release cycle. + +The old `mms-zeroshot-asr` deployment is kept warm for backward +compatibility during the transition. Both serve identical responses. + +Auto-detect language ID +~~~~~~~~~~~~~~~~~~~~~~~ +OmniASR LLM models don't have built-in LID. When the client omits +`lang` we run **Meta MMS-LID 2048** as a first pass to detect the +ISO 639-3 base, then pair it with a default script (see +`_DEFAULT_SCRIPT_FOR_BASE`) to produce an OmniASR-compatible +`{iso639_3}_{Script}` code that's fed to the OmniASR transcribe call. +The resolved code is echoed back in the response so the client can +render a real "detected language" badge. + +If LID fails (silence, gibberish, language not in MMS-LID's 2048-set, +or the detected base has no OmniASR mapping), we fall through to +unconditioned transcription and omit `lang` in the response so the +client renders an honest "Auto Detect" badge. 
+ +Deploy / Dev +~~~~~~~~~~~~ + modal deploy docs/asr/codex_asr_modal.py + modal serve docs/asr/codex_asr_modal.py # local dev + +Test +~~~~ + curl -X POST -F "file=@audio.wav" \\ + https://genesis-ai-dev--codex-asr-serve.modal.run/transcribe + + curl -X POST -F "file=@audio.wav" \\ + "https://genesis-ai-dev--codex-asr-serve.modal.run/transcribe?lang=urd_Arab" + +Endpoints +~~~~~~~~~ + GET / — service identity + GET /health — readiness probe + GET /languages — full list of supported {iso639_3}_{Script} codes + (used by the client snapshot in sharedUtils/) + POST /transcribe — transcription endpoint +""" + +import modal + +# Renamed from "mms-zeroshot-asr" to be model-agnostic. See module docstring +# for migration notes. +app = modal.App("codex-asr") + +MODEL_CARD = "omniASR_LLM_1B_v2" +MODEL_CACHE_DIR = "/root/model_cache" + +# MMS-LID variant for auto-detect mode. 2048 languages — all MMS-LID models +# share the same wav2vec2 backbone (~960M params), so picking a larger +# classification head doesn't meaningfully change cold-start memory. +# Outputs ISO 639-3 codes which we pair with our default-script table. 
LID_MODEL_ID = "facebook/mms-lid-2048"
HF_CACHE_DIR = "/root/hf_cache"


def download_model():
    """Fetch the OmniASR and MMS-LID weights into the in-image caches.

    Used as an image-build step (see the ``run_function`` call on the image
    definition). It points fairseq2 and Hugging Face at ``MODEL_CACHE_DIR``
    and ``HF_CACHE_DIR``, instantiates the OmniASR inference pipeline once so
    the checkpoint is downloaded and verified, and then loads the MMS-LID
    feature extractor and classifier so they land in the Hugging Face cache.
    Nothing is returned; the side effect is the populated cache directories.
    """
    import os

    # Set both cache locations first, ahead of the model-library imports below.
    os.environ.update(
        FAIRSEQ2_CACHE_DIR=MODEL_CACHE_DIR,
        HF_HOME=HF_CACHE_DIR,
    )

    from omnilingual_asr.models.inference.pipeline import ASRInferencePipeline

    print(f"Downloading and verifying {MODEL_CARD}...")
    asr_pipeline = ASRInferencePipeline(model_card=MODEL_CARD)
    print("Model downloaded and verified OK")
    # Only the cached files matter here; drop the in-memory pipeline.
    del asr_pipeline

    print(f"Downloading {LID_MODEL_ID}...")
    from transformers import AutoFeatureExtractor, Wav2Vec2ForSequenceClassification

    # Feature extractor first, then the classifier — same order as before.
    for hf_loader in (AutoFeatureExtractor, Wav2Vec2ForSequenceClassification):
        hf_loader.from_pretrained(LID_MODEL_ID)
    print("MMS-LID downloaded OK")


# Build the image with model weights baked in.
# The run_function step uses a T4 GPU so fairseq2 can fully verify the
# checkpoint. This only runs once — the resulting image is cached by Modal.
#
# Versions / CUDA notes:
# - omnilingual-asr 0.2.0 is the first release that ships the
#   `omniASR_LLM_1B_v2` model card; 0.1.0 only has `omniASR_LLM_1B`.
# - omnilingual-asr -> fairseq2[arrow]<=0.6 -> fairseq2n which pins
#   `torch==2.8.0` built specifically against CUDA 12.8 (it asserts this at
#   import time). Newer torch wheels are CUDA 13 and fail to load on Modal's
#   `debian_slim` (libcudart.so.13 missing).
# - We install everything in one pip call so the resolver lands on the
#   cu128 wheel of torch 2.8.0.
+image = ( + modal.Image.debian_slim(python_version="3.11") + .apt_install("ffmpeg", "libsndfile1") + .pip_install( + "torch==2.8.0", + "torchaudio==2.8.0", + "omnilingual-asr==0.2.0", + "transformers>=4.46,<5", + "huggingface_hub", + "fastapi", + "uvicorn", + "python-multipart", + "soundfile", + "numpy", + extra_index_url="https://download.pytorch.org/whl/cu128", + ) + .env({"FAIRSEQ2_CACHE_DIR": MODEL_CACHE_DIR, "HF_HOME": HF_CACHE_DIR}) + .run_function(download_model, gpu="T4") +) + +_pipeline = None +_lid_model = None +_lid_feature_extractor = None +_default_script_for_base: dict[str, str] | None = None + +# Hand-curated default script for the multi-script bases OmniASR serves. +# **Mirror of `sharedUtils/omniAsrDefaultScripts.ts`** — keep both in sync +# when adding entries (the client uses this for project-language → OmniASR +# code resolution; the server uses it after MMS-LID returns a bare ISO +# 639-3 base). Picked from Unicode CLDR likelySubtags cross-checked +# against modern majority usage. +_MULTI_SCRIPT_DEFAULTS: dict[str, str] = { + "aze": "Latn", # Azerbaijani — Latin in modern standard + "bcc": "Arab", # Southern Balochi + "cmn": "Hans", # Mandarin — Simplified default + "cmo": "Khmr", # Central Mnong — Khmer-script orthography + "crk": "Cans", # Plains Cree — Canadian Aboriginal Syllabics + "ell": "Grek", # Greek + "gag": "Latn", # Gagauz — modern Latin orthography + "kmr": "Latn", # Northern Kurdish — Latin (Hawar) + "lld": "Latn", # Ladin + "ojb": "Latn", # Northwestern Ojibwa + "rif": "Latn", # Tarifit Berber + "rmc": "Latn", # Carpathian Romani + "rmy": "Latn", # Vlax Romani + "tuk": "Latn", # Turkmen — modern Latin + "uig": "Arab", # Uyghur — Arabic-script + "urd": "Arab", # Urdu — Nastaliq + "uzb": "Latn", # Uzbek — modern Latin + "wal": "Ethi", # Wolaytta — Ethiopic + "yue": "Hant", # Cantonese — Traditional +} + + +def _ensure_gang_context() -> None: + """ + Initialise fairseq2's thread-local gang stack on the current thread. 
+ + fairseq2 0.6 stores the "current gangs" stack on a `threading.local()`, + but only initialises the underlying `current_gangs = []` attribute on + the importing thread. FastAPI dispatches sync request handlers on + worker threads where the attribute is missing, causing inference to + fail with:: + + AttributeError: '_thread._local' object has no attribute 'current_gangs' + + Cheap to call per-request — just sets a list on the thread-local if + it isn't already there. + """ + try: + from fairseq2.gang import _thread_local # type: ignore[attr-defined] + if not hasattr(_thread_local, "current_gangs"): + _thread_local.current_gangs = [] + except Exception: # pragma: no cover — defensive only + pass + + +def get_pipeline(): + """Load the ASR pipeline from baked-in weights (no download needed).""" + global _pipeline + if _pipeline is None: + import os + os.environ["FAIRSEQ2_CACHE_DIR"] = MODEL_CACHE_DIR + + from omnilingual_asr.models.inference.pipeline import ASRInferencePipeline + + print(f"Loading {MODEL_CARD} from image cache...") + _ensure_gang_context() + _pipeline = ASRInferencePipeline(model_card=MODEL_CARD) + print("Pipeline ready") + return _pipeline + + +def _default_script_table() -> dict[str, str]: + """ + Build (and cache) the base → default script lookup used by LID resolution. + + Layered on top of `_MULTI_SCRIPT_DEFAULTS`: + - Single-script bases get their sole script automatically. + - Multi-script bases without a hand-curated entry fall through to + Latin (when supported), otherwise alphabetical first. 
+ """ + global _default_script_for_base + if _default_script_for_base is not None: + return _default_script_for_base + + from omnilingual_asr.models.wav2vec2_llama.lang_ids import supported_langs + + scripts_per_base: dict[str, list[str]] = {} + for code in supported_langs: + base, script = code.split("_", 1) + scripts_per_base.setdefault(base, []).append(script) + + table: dict[str, str] = {} + for base, scripts in scripts_per_base.items(): + if len(scripts) == 1: + table[base] = scripts[0] + elif base in _MULTI_SCRIPT_DEFAULTS and _MULTI_SCRIPT_DEFAULTS[base] in scripts: + table[base] = _MULTI_SCRIPT_DEFAULTS[base] + elif "Latn" in scripts: + table[base] = "Latn" + else: + table[base] = sorted(scripts)[0] + + _default_script_for_base = table + return table + + +def get_lid(): + """Load the MMS-LID model + feature extractor from baked-in HF cache.""" + global _lid_model, _lid_feature_extractor + if _lid_model is None or _lid_feature_extractor is None: + import os + import torch + from transformers import AutoFeatureExtractor, Wav2Vec2ForSequenceClassification + + os.environ["HF_HOME"] = HF_CACHE_DIR + print(f"Loading {LID_MODEL_ID} from image cache...") + _lid_feature_extractor = AutoFeatureExtractor.from_pretrained(LID_MODEL_ID) + _lid_model = Wav2Vec2ForSequenceClassification.from_pretrained(LID_MODEL_ID) + if torch.cuda.is_available(): + _lid_model = _lid_model.to("cuda") + _lid_model.eval() + print("MMS-LID ready") + return _lid_model, _lid_feature_extractor + + +def detect_omniasr_code(waveform_16k) -> str | None: + """ + Run MMS-LID on a 16-kHz mono waveform and return an OmniASR-compatible + `{iso639_3}_{Script}` code, or `None` if we can't confidently map the + detected base into OmniASR's supported set. + + Strategy: MMS-LID outputs an ISO 639-3 base; pair it with the default + script for that base (`_default_script_table()`). If the detected base + isn't served by OmniASR at all, return None and let the caller fall + back to unconditioned transcription. 
+ """ + import torch + import numpy as np + + model, fx = get_lid() + # Cap LID input at 30 s — speech models don't benefit from longer + # context for identification and shorter input is much faster. + max_lid_samples = 30 * 16000 + snippet = waveform_16k[:max_lid_samples].astype(np.float32, copy=False) + + inputs = fx(snippet, sampling_rate=16000, return_tensors="pt") + device = next(model.parameters()).device + input_values = inputs.input_values.to(device) + + with torch.inference_mode(): + logits = model(input_values).logits + + predicted_id = int(torch.argmax(logits, dim=-1).item()) + label = model.config.id2label.get(predicted_id) if hasattr(model.config.id2label, "get") else model.config.id2label[predicted_id] + if not label: + return None + # MMS-LID labels are ISO 639-3 codes (e.g. "eng", "swh"). Be lenient + # about case/whitespace just in case. + base = label.strip().lower() + if len(base) != 3: + print(f"LID returned non-ISO-639-3 label {label!r}; skipping") + return None + + table = _default_script_table() + script = table.get(base) + if not script: + # Detected language isn't in OmniASR's supported set — give up and + # let the caller transcribe without conditioning. + return None + return f"{base}_{script}" + + +def transcribe_audio(audio_bytes: bytes, mime_type: str = "audio/wav", lang: str | None = None) -> dict: + """ + Transcribe audio bytes → text using OmniASR LLM 1B v2. + + Args: + audio_bytes: Raw audio file bytes. + mime_type: MIME type for format detection. + lang: Optional OmniASR language code (e.g. "eng_Latn", "urd_Arab"). + When provided we trust it and skip LID. When `None` we run + MMS-LID first to pick a code, then transcribe with it. + + Returns: + dict with text, duration_s, inference_s, and `lang` (the code we + ended up using — either the caller-supplied one or the LID-detected + one). `lang` is omitted only when LID failed and we transcribed + without conditioning. 
+ """ + import soundfile as sf + import numpy as np + import tempfile + import subprocess + import os + import time + + pipeline = get_pipeline() + _ensure_gang_context() + + # --- Convert to 16kHz mono WAV via ffmpeg --- + ext_map = { + "audio/wav": ".wav", "audio/x-wav": ".wav", + "audio/mpeg": ".mp3", "audio/mp3": ".mp3", + "audio/webm": ".webm", "audio/ogg": ".ogg", + "audio/flac": ".flac", "audio/mp4": ".m4a", + } + ext = ext_map.get(mime_type, ".wav") + + with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as f: + f.write(audio_bytes) + input_path = f.name + + output_path = input_path.rsplit(".", 1)[0] + "_16k.wav" + try: + result = subprocess.run( + ["ffmpeg", "-y", "-i", input_path, + "-ar", "16000", "-ac", "1", "-c:a", "pcm_s16le", + output_path], + capture_output=True, text=True, timeout=60, + ) + if result.returncode != 0: + raise RuntimeError(f"ffmpeg failed: {(result.stderr or '')[:500]}") + + waveform, sr = sf.read(output_path) + waveform = waveform.astype(np.float32) + if waveform.ndim > 1: + waveform = waveform.mean(axis=-1) + duration = len(waveform) / sr + + # --- Language ID (auto-detect mode only) --- + # If the caller supplied `lang` we trust it. Otherwise we run + # MMS-LID on the (already 16-kHz mono) waveform. 
+ lid_time = 0.0 + resolved_lang = lang + if resolved_lang is None: + lid_start = time.perf_counter() + try: + resolved_lang = detect_omniasr_code(waveform) + except Exception as e: + print(f"LID failed: {e}; falling back to unconditioned transcription") + resolved_lang = None + lid_time = time.perf_counter() - lid_start + + # --- Chunk if > 40s (model limitation) --- + max_samples = 40 * sr # 40 seconds + if len(waveform) > max_samples: + chunks = [] + for start in range(0, len(waveform), max_samples): + chunks.append(waveform[start : start + max_samples]) + else: + chunks = [waveform] + + # Build audio dicts for the pipeline + audio_inputs = [ + {"waveform": chunk, "sample_rate": sr} + for chunk in chunks + ] + + # Build lang list to match (one per chunk), or None + lang_list = [resolved_lang] * len(audio_inputs) if resolved_lang else None + + # --- Transcribe --- + start_t = time.perf_counter() + transcriptions = pipeline.transcribe( + audio_inputs, + lang=lang_list, + batch_size=1, + ) + inference_time = time.perf_counter() - start_t + + # Join chunks with space + full_text = " ".join(t.strip() for t in transcriptions if t.strip()) + + resp = { + "text": full_text, + "duration_s": round(duration, 2), + "inference_s": round(inference_time, 3), + } + if lid_time: + resp["lid_s"] = round(lid_time, 3) + # Echo the lang we actually used (caller-supplied or LID-resolved) + # so the client can render an honest badge. If LID failed and we + # transcribed without conditioning, omit the field entirely. 
+ if resolved_lang: + resp["lang"] = resolved_lang + + return resp + + finally: + os.unlink(input_path) + if os.path.exists(output_path): + os.unlink(output_path) + + +# ---------- Modal function ---------- + +@app.function( + image=image, + gpu="T4", + timeout=600, + scaledown_window=120, # keep warm 2 min after last request + max_containers=3, +) +@modal.asgi_app() +def serve(): + from fastapi import FastAPI, UploadFile, File, Query, HTTPException + from fastapi.middleware.cors import CORSMiddleware + + web_app = FastAPI(title="Codex ASR (OmniASR LLM 1B v2)") + web_app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_methods=["*"], + allow_headers=["*"], + ) + + @web_app.get("/") + def root(): + return { + "service": "codex-asr", + "model": MODEL_CARD, + "lid_model": LID_MODEL_ID, + "languages": "1600+", + "note": "Pass ?lang={iso639_3}_{Script} (e.g. eng_Latn) to skip LID. Omit to run MMS-LID first and use the detected language for transcription.", + } + + @web_app.get("/health") + def health(): + return {"status": "ok", "model_loaded": _pipeline is not None} + + @web_app.get("/languages") + def list_languages(): + """Return all supported language codes.""" + from omnilingual_asr.models.wav2vec2_llama.lang_ids import supported_langs + return {"count": len(supported_langs), "languages": sorted(supported_langs)} + + @web_app.post("/transcribe") + async def transcribe_endpoint( + file: UploadFile = File(...), + lang: str | None = Query( + default=None, + description="OmniASR language code in {iso639_3}_{Script} form, e.g. eng_Latn, urd_Arab, spa_Latn. Omit to run MMS-LID first and use the detected language for transcription.", + ), + ): + # Validate language code if provided + if lang is not None: + from omnilingual_asr.models.wav2vec2_llama.lang_ids import supported_langs + if lang not in supported_langs: + raise HTTPException( + 400, + f"Unknown language code: '{lang}'. " + f"Use GET /languages for the full list. 
" + f"Format: {{iso639_3}}_{{Script}}, e.g. eng_Latn", + ) + + try: + audio_bytes = await file.read() + if len(audio_bytes) > 50 * 1024 * 1024: + raise HTTPException(413, "File too large (50MB max)") + if len(audio_bytes) == 0: + raise HTTPException(400, "Empty file") + + mime = file.content_type or "audio/wav" + return transcribe_audio(audio_bytes, mime, lang=lang) + + except HTTPException: + raise + except Exception as e: + raise HTTPException(500, f"Transcription failed: {str(e)}") + + # Model loads lazily on first /transcribe request via get_pipeline(). + # Weights are baked into the image so loading takes ~15-20s (no download). + return web_app diff --git a/package.json b/package.json index e2da8ab79..13a9ab247 100644 --- a/package.json +++ b/package.json @@ -873,38 +873,48 @@ "description": "Model name selected for inference." }, "codex-editor-extension.asrEndpoint": { - "title": "ASR WebSocket Endpoint", + "title": "ASR Endpoint", "type": "string", - "default": "wss://ryderwishart--asr-websocket-transcription-fastapi-asgi.modal.run/ws/transcribe", - "description": "WebSocket endpoint for audio transcription. When authenticated with Frontier, the auth server endpoint is automatically used. This setting is used as fallback when not authenticated or for local development." + "default": "https://genesis-ai-dev--codex-asr-serve.modal.run/transcribe", + "description": "HTTPS endpoint for audio transcription (POST multipart with optional ?lang= query). When authenticated with Frontier, the auth server endpoint is automatically used; this setting is the fallback for unauthenticated / local development." }, "codex-editor-extension.asrProvider": { "title": "ASR Provider", "type": "string", "enum": [ - "mms", + "omniasr", "whisper" ], - "default": "mms", - "description": "Provider for transcription. MMS requires a language code; Whisper auto-detects." + "default": "omniasr", + "description": "Provider for transcription. 
OmniASR accepts an optional {iso639_3}_{Script} language hint; Whisper auto-detects." }, "codex-editor-extension.asrModel": { "title": "ASR Model", "type": "string", - "default": "facebook/mms-1b-all", - "description": "Model identifier to use for transcription (e.g., facebook/mms-1b-all)." + "default": "omniASR_LLM_1B_v2", + "description": "Model identifier used by the ASR service (e.g., omniASR_LLM_1B_v2)." }, "codex-editor-extension.asrLanguage": { "title": "ASR Language (ISO-639-3)", "type": "string", "default": "eng", - "description": "Language code for transcription. MMS requires ISO-639-3 (e.g., eng, fra, spa). 2-letter codes will be mapped where possible." + "description": "Legacy: ISO 639-3 hint for ASR providers. OmniASR uses the project's target language by default; configure via the gear menu on the Transcribe button." }, - "codex-editor-extension.asrPhonetic": { - "title": "Return Phonetic (IPA)", - "type": "boolean", - "default": false, - "description": "If enabled and supported by provider, also return phonetic (IPA) transcription." + "codex-editor-extension.asrLanguageMode": { + "title": "ASR Language Mode", + "type": "string", + "enum": [ + "project", + "auto" + ], + "default": "project", + "description": "Whether to send the project's target language as a hint to the ASR service (\"project\"), or let the model transcribe without language conditioning (\"auto\")." + }, + "codex-editor-extension.asrScriptPref": { + "title": "ASR Script Preference", + "type": "string", + "default": "auto", + "description": "Script subtag to pair with the ASR language code. \"auto\" picks the best-guess script per language; \"latin\" forces Latin where supported; any 4-letter ISO 15924 tag (e.g. \"Arab\", \"Cyrl\") overrides per-language." 
}, "codex-editor-extension.sourceBookWhitelist": { "title": "Source Book Whitelist", diff --git a/sharedUtils/asrLanguageUtils.ts b/sharedUtils/asrLanguageUtils.ts new file mode 100644 index 000000000..b050abb20 --- /dev/null +++ b/sharedUtils/asrLanguageUtils.ts @@ -0,0 +1,268 @@ +/** + * ASR language-utility functions + * ------------------------------ + * + * Pure helpers (no `vscode` imports → unit-testable, usable from both the + * extension host and the webviews) that: + * + * 1. **Resolve** a project's language metadata into an OmniASR-compatible + * `{iso639_3}_{Script}` code (or decide we should send no code, letting + * the server transcribe without language conditioning). + * 2. **Label** an OmniASR code with a friendly display name suitable for the + * post-transcription badge (e.g. `swh_Latn` → "Swahili"). + * + * Why this lives in `sharedUtils/` + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * Both the extension host (`src/providers/...`) and the webviews + * (`webviews/.../CodexCellEditor`) need it: the host builds the `asrConfig` + * payload from project settings, and the webview renders the badge after a + * transcription completes. + */ + +import { + OMNI_ASR_SUPPORTED_LANGS, + OMNI_ASR_SUPPORTED_LANG_SET, +} from "./omniAsrSupportedLangs"; +import { OMNI_ASR_DEFAULT_SCRIPTS } from "./omniAsrDefaultScripts"; +import { OMNI_ASR_FRIENDLY_NAMES } from "./omniAsrFriendlyNames"; + +/** + * Minimal shape of the project's language metadata that we consume here. + * Matches `codex-types`'s `LanguageMetadata` but we restate it so this file + * doesn't pull `codex-types` (and its transitive deps) into the webview + * bundle. + */ +export type AsrLanguageMetaInput = { + tag?: string; + iso1?: string; + iso2t?: string; + iso2b?: string; + refName?: string; +}; + +/** + * Macrolanguage → individual-language remaps used when the project's tag + * names a macrolanguage that OmniASR doesn't serve directly. 
Each pair maps + * a macro ISO 639-3 to the individual ISO 639-3 that OmniASR actually + * supports for the most widely-spoken variety. Sources: + * - SIL macrolanguage mappings (iso-639-3-macrolanguages.tab) + * - cross-checked against `OMNI_ASR_SUPPORTED_LANGS` + * + * Add only when (a) the macro is genuinely not in OmniASR's set and (b) the + * "right" individual is unambiguous. + */ +const MACRO_TO_INDIVIDUAL: Readonly> = { + swa: "swh", // Swahili → Coastal Swahili (Kenya/Tanzania majority) + ara: "arb", // Arabic → Modern Standard Arabic + msa: "zsm", // Malay → Standard Malay + zho: "cmn", // Chinese → Mandarin + ori: "ory", // Oriya → Odia + est: "ekk", // Estonian → Standard Estonian + sqi: "als", // Albanian → Tosk Albanian + kur: "kmr", // Kurdish → Northern Kurdish (largest speaker base) + nor: "nob", // Norwegian → Bokmål + oji: "ojb", // Ojibwa → Northwestern Ojibwa +}; + +/** ISO 639-1 (2-letter) → ISO 639-3 (3-letter). Common languages only; the + * project usually carries `iso2t` directly so this is just a fallback. */ +const ISO1_TO_ISO3: Readonly> = { + en: "eng", fr: "fra", es: "spa", de: "deu", pt: "por", it: "ita", + nl: "nld", ru: "rus", zh: "cmn", ja: "jpn", ko: "kor", ar: "arb", + sw: "swh", ur: "urd", hi: "hin", bn: "ben", id: "ind", tr: "tur", + th: "tha", vi: "vie", uk: "ukr", pl: "pol", fa: "pes", he: "heb", +}; + +/** + * Pull the ISO 639-3 base + optional Script subtag out of a project's + * language metadata, normalizing macrolanguages to OmniASR-served + * individuals. Returns `undefined` if we can't recover a 3-letter code. + */ +function extractBaseAndScript( + meta: AsrLanguageMetaInput | undefined +): { base: string; explicitScript?: string; } | undefined { + if (!meta) return undefined; + + // BCP-47-ish tag is the richest source: e.g. "swh", "ur-Arab", "zh-Hans". 
+ const tag = (meta.tag || "").trim(); + let base = ""; + let explicitScript: string | undefined; + + if (tag) { + const [primary, ...subtags] = tag.split(/[-_]/); + const lowered = (primary || "").toLowerCase(); + if (lowered.length === 3) { + base = lowered; + } else if (lowered.length === 2) { + base = ISO1_TO_ISO3[lowered] ?? ""; + } + // Script subtags are exactly 4 chars, title-case (Latn, Arab, Cyrl, ...). + const script = subtags.find((s) => s.length === 4); + if (script) { + explicitScript = script.charAt(0).toUpperCase() + script.slice(1).toLowerCase(); + } + } + + if (!base) { + base = (meta.iso2t || meta.iso2b || "").toLowerCase(); + } + if (!base) { + const i1 = (meta.iso1 || "").toLowerCase(); + base = ISO1_TO_ISO3[i1] ?? ""; + } + if (!base) return undefined; + + base = MACRO_TO_INDIVIDUAL[base] ?? base; + return { base, explicitScript }; +} + +/** + * `scriptPref` is what the user picked in the Script advanced setting. + * + * - `"auto"` → "best guess" (our default). Pick the script using + * `OMNI_ASR_DEFAULT_SCRIPTS`, falling back to Latin then + * the sole supported script. + * - `"latin"` → force Latin script when supported, otherwise fall back + * to auto behaviour. + * - any 4-char string (`"Arab"`, `"Cyrl"`, ...) → use that script. + */ +export type AsrScriptPref = "auto" | "latin" | string; + +/** + * Resolve a project's language metadata to an OmniASR-compatible + * `{iso639_3}_{Script}` code, or return `undefined` when we can't safely pick + * one (the caller should then omit the `lang` query param so the server + * transcribes without language conditioning). + * + * Selection priority: + * 1. Explicit `scriptPref` (4-letter ISO 15924 tag) → use as-is when + * `{base}_{Script}` is a supported code. + * 2. Script encoded in the project tag (e.g. `swa-Cyrl`) → ditto. + * 3. `scriptPref === "latin"` → Latin if supported. + * 4. `OMNI_ASR_DEFAULT_SCRIPTS[base]` (our hand-curated "best guess"). + * 5. Latin if supported. + * 6. 
Sole supported script for this base. + * 7. `undefined` (genuinely ambiguous → let the server pick). + * + * Future work: a per-cell script override could short-circuit step 1. + */ +export function resolveOmniAsrCode( + meta: AsrLanguageMetaInput | undefined, + scriptPref: AsrScriptPref = "auto" +): string | undefined { + const extracted = extractBaseAndScript(meta); + if (!extracted) return undefined; + const { base, explicitScript } = extracted; + + // Find every supported script for this base. + const supportedScripts = OMNI_ASR_SUPPORTED_LANGS + .filter((c) => c.startsWith(`${base}_`)) + .map((c) => c.split("_")[1]); + if (supportedScripts.length === 0) return undefined; + + const tryCode = (script: string): string | undefined => { + const code = `${base}_${script}`; + return OMNI_ASR_SUPPORTED_LANG_SET.has(code) ? code : undefined; + }; + + // 1. Explicit user-chosen script (4-letter custom tag from advanced setting) + if (scriptPref && scriptPref !== "auto" && scriptPref !== "latin" && scriptPref.length === 4) { + const normalized = scriptPref.charAt(0).toUpperCase() + scriptPref.slice(1).toLowerCase(); + const code = tryCode(normalized); + if (code) return code; + } + + // 2. Script encoded in the project tag + if (explicitScript) { + const code = tryCode(explicitScript); + if (code) return code; + } + + // 3. scriptPref === "latin" → Latin if supported + if (scriptPref === "latin") { + const code = tryCode("Latn"); + if (code) return code; + } + + // 4. Default script for this base + const defaultScript = OMNI_ASR_DEFAULT_SCRIPTS[base]; + if (defaultScript) { + const code = tryCode(defaultScript); + if (code) return code; + } + + // 5. Latin if supported + const latin = tryCode("Latn"); + if (latin) return latin; + + // 6. Sole supported script + if (supportedScripts.length === 1) { + return `${base}_${supportedScripts[0]}`; + } + + // 7. 
Genuinely ambiguous + return undefined; +} + +/** Split an OmniASR code like "swh_Latn" into base + script (or return null). */ +export function splitOmniAsrCode(code: string | undefined | null): { base: string; script: string; } | null { + if (!code) return null; + const m = /^([a-z]{2,3})_([A-Z][a-z]{3})$/.exec(code); + if (!m) return null; + return { base: m[1], script: m[2] }; +} + +/** + * SIL `Ref_Name` values are CamelCased with no spaces (e.g. "MinNanChinese"). + * Split on case changes for natural-looking display: "Min Nan Chinese". + */ +function prettifyRefName(name: string): string { + return name + // Insert a space before any uppercase letter that follows a lowercase one. + .replace(/([a-z])([A-Z])/g, "$1 $2") + // And before an uppercase letter that's followed by a lowercase one + // (handles runs of acronyms like "USA"). + .replace(/([A-Z])([A-Z][a-z])/g, "$1 $2") + .trim(); +} + +/** + * Friendly display name for a transcription's language badge. + * + * Inputs: + * - `serverLang` — the code OmniASR echoed back in its response (when we + * sent one). The primary source of truth. + * - `sentCode` — what we asked the server to use, in case it didn't + * echo (today the server only echoes when given a code). + * - `projectLanguageName` — `refName` of the project's target language, as + * a last-ditch fallback when we know we sent the + * project's code but the server omitted the echo. + * + * The badge returns `null` to mean "render nothing" (we have no honest label). + * The caller renders "Auto Detect" itself when in auto-detect mode and we + * have no detected-language info, so we never lie about it here. 
+ */ +export function labelForTranscriptionLanguage( + serverLang: string | undefined | null, + sentCode: string | undefined | null, + projectLanguageName: string | undefined | null +): string | null { + const friendly = (code: string | null | undefined): string | null => { + const parts = splitOmniAsrCode(code); + if (!parts) return null; + const refName = OMNI_ASR_FRIENDLY_NAMES[parts.base]; + return refName ? prettifyRefName(refName) : null; + }; + + // 1. Server's echo is always the most truthful signal. + const fromServer = friendly(serverLang); + if (fromServer) return fromServer; + + // 2. If we sent a code but the server didn't echo, the server still used + // what we sent — show that. + const fromSent = friendly(sentCode); + if (fromSent) return fromSent; + + // 3. Last-ditch fallback: project language name, if any. + return projectLanguageName ? prettifyRefName(projectLanguageName) : null; +} diff --git a/sharedUtils/omniAsrDefaultScripts.ts b/sharedUtils/omniAsrDefaultScripts.ts new file mode 100644 index 000000000..3155590fd --- /dev/null +++ b/sharedUtils/omniAsrDefaultScripts.ts @@ -0,0 +1,77 @@ +/** + * OmniASR multi-script default-script table + * ----------------------------------------- + * + * For each OmniASR language with **multiple supported scripts**, the script + * we should pick by default when the user has not specified one. + * + * Background + * ~~~~~~~~~~ + * OmniASR codes are `{iso639_3}_{Script}` (e.g. `urd_Arab`). Almost every + * supported base language (1631 of 1650 unique bases) supports exactly one + * script, so the script choice is trivial. This file only lists the 19 + * multi-script bases that need a real tiebreaker. + * + * Selection priority used by the resolver (`asrLanguageUtils.ts`): + * 1. Explicit script the user typed in the advanced setting + * 2. Script encoded in the project's language tag (e.g. `swa-Cyrl`) + * 3. **This table** (the "best guess") + * 4. Latin, if the language supports Latin + * 5. 
Sole supported script (if only one) + * 6. Omit `lang` (server runs without language conditioning) + * + * Source / rationale per entry + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * Picked using Unicode CLDR `likelySubtags.xml` (the official "if a user gives + * me a language tag with no script, what script should I assume?" table) + * cross-checked against modern majority usage. Macrolanguage → individual + * remaps (e.g. swa→swh, ara→arb, zho→cmn, kur→kmr) are handled in the + * resolver *before* lookup, so this table keys on the individual codes + * OmniASR actually serves. + * + * If you adjust an entry, leave a `// ←` note explaining why. + * + * Multi-script bases not listed here intentionally fall through to "Latin if + * supported, else sole script". Add an entry here only when CLDR or modern + * majority usage clearly disagrees with that default. + * + * Regenerating + * ~~~~~~~~~~~~ + * To rediscover which bases need entries (after a model update changes the + * supported set): + * + * curl -s "https://genesis-ai-dev--codex-asr-serve.modal.run/languages" \ + * | python3 -c " + * import json, sys + * d = json.load(sys.stdin) + * bases = {} + * for l in d['languages']: + * b, s = l.split('_') + * bases.setdefault(b, set()).add(s) + * for b, ss in sorted(bases.items()): + * if len(ss) > 1: + * print(b, sorted(ss)) + * " + */ + +export const OMNI_ASR_DEFAULT_SCRIPTS: Readonly> = { + aze: "Latn", // Azerbaijani — modern standard (Republic of Azerbaijan) is Latin + bcc: "Arab", // Southern Balochi — written in Arabic script + cmn: "Hans", // Mandarin Chinese — Simplified is the more common default + cmo: "Khmr", // Central Mnong — Khmer-script orthography (community standard) + crk: "Cans", // Plains Cree — Canadian Aboriginal Syllabics is the traditional script + ell: "Grek", // Greek — only one substantive script; entry exists for completeness + gag: "Latn", // Gagauz — modern orthography is Latin + kmr: "Latn", // Northern Kurdish — Latin (Hawar) is the predominant 
modern script + lld: "Latn", // Ladin — only Latin; entry exists for completeness + ojb: "Latn", // Northwestern Ojibwa — Latin (double-vowel) is most common in print + rif: "Latn", // Tarifit Berber — Latin in modern publications (Tifinagh not in OmniASR) + rmc: "Latn", // Carpathian Romani — Latin in modern orthographies + rmy: "Latn", // Vlax Romani — Latin in modern orthographies + tuk: "Latn", // Turkmen — modern standard (Turkmenistan) is Latin + uig: "Arab", // Uyghur — Arabic-script (Uyghur Ereb Yëziqi) is the predominant script + urd: "Arab", // Urdu — Arabic-script (Nastaliq) is the canonical script + uzb: "Latn", // Uzbek — modern standard (Uzbekistan) is Latin + wal: "Ethi", // Wolaytta — Ethiopic (Geʽez) script in modern orthographies + yue: "Hant", // Cantonese — Traditional Chinese (Hong Kong / Guangzhou default) +}; diff --git a/sharedUtils/omniAsrFriendlyNames.ts b/sharedUtils/omniAsrFriendlyNames.ts new file mode 100644 index 000000000..e39632bcb --- /dev/null +++ b/sharedUtils/omniAsrFriendlyNames.ts @@ -0,0 +1,1680 @@ +/** + * OmniASR friendly-name lookup + * ---------------------------- + * + * Maps each OmniASR-supported ISO 639-3 base (1650 entries) to its English + * "reference name" from the SIL ISO 639-3 registry. Used to render the + * language badge after a transcription completes (e.g. `swh_Latn` → "Swahili"). + * + * Notes + * ~~~~~ + * - Keyed on the **base** (ISO 639-3), not the full OmniASR code, because the + * friendly name is the same regardless of script. Callers should strip the + * `_{Script}` suffix before lookup. The resolver in `asrLanguageUtils.ts` + * handles that. + * - Names come straight from SIL's `Ref_Name` field, which is CamelCased and + * ASCII-only (e.g. "ArbëreshëAlbanian" → "ArbresheAlbanian"). The helper + * `prettifyRefName()` in `asrLanguageUtils.ts` splits these on case changes + * so they read naturally in the UI. 
+ * - The 'nan' entry is added by hand (Min Nan Chinese) — SIL leaves Ref_Name + * blank for that code in the version we parsed. + * + * Regenerating + * ~~~~~~~~~~~~ + * If OmniASR's supported set changes, regenerate from the SIL data already + * bundled in `src/utils/languageUtils.ts` using the snippet in + * `omniAsrSupportedLangs.ts`'s header (look up each base's `Ref_Name`). + */ + +export const OMNI_ASR_FRIENDLY_NAMES: Readonly> = { + aae: "ArbëreshëAlbanian", + aal: "Afade", + abb: "Bankon", + abi: "Abidji", + abk: "Abkhazian", + abn: "Abua", + abp: "AbellenAyta", + abr: "Abron", + abs: "AmboneseMalay", + aca: "Achagua", + acd: "Gikyode", + ace: "Achinese", + acf: "SaintLucianCreoleFrench", + ach: "Acoli", + acm: "MesopotamianArabic", + acn: "Achang", + acr: "Achi", + acu: "Achuar-Shiwiar", + acw: "HijaziArabic", + ade: "Adele", + adh: "Adhola", + adj: "Adioukrou", + adx: "AmdoTibetan", + ady: "Adyghe", + aeb: "TunisianArabic", + aec: "SaidiArabic", + aeu: "Akeu", + afb: "GulfArabic", + afo: "Eloyi", + afr: "Afrikaans", + agd: "Agarabi", + agg: "Angor", + agn: "Agutaynen", + agr: "Aguaruna", + agu: "Aguacateco", + agx: "Aghul", + aha: "Ahanta", + ahk: "Akha", + ahl: "Igo", + ahs: "Ashe", + aia: "Arosi", + ajg: "Aja(Benin)", + aka: "Akan", + akb: "BatakAngkola", + ake: "Akawaio", + akp: "Siwu", + ala: "Alago", + alj: "Alangan", + aln: "GhegAlbanian", + alo: "Larike-Wakasihu", + alp: "Alune", + als: "ToskAlbanian", + alt: "SouthernAltai", + alz: "Alur", + ame: "Yanesha'", + amf: "Hamer-Banna", + amh: "Amharic", + ami: "Amis", + amk: "Ambai", + amu: "GuerreroAmuzgo", + anc: "Ngas", + ank: "Goemai", + ann: "Obolo", + anp: "Angika", + anw: "Anaang", + any: "Anyin", + aom: "Ömie", + aoz: "UabMeto", + apb: "Sa'a", + apc: "LevantineArabic", + apd: "SudaneseArabic", + apr: "Arop-Lokep", + arb: "StandardArabic", + arg: "Aragonese", + arl: "Arabela", + arq: "AlgerianArabic", + ars: "NajdiArabic", + ary: "MoroccanArabic", + arz: "EgyptianArabic", + asa: "Asu(Tanzania)", 
+ asg: "Cishingini", + asm: "Assamese", + ast: "Asturian", + ata: "Pele-Ata", + atb: "Zaiwa", + atg: "IvbieNorth-Okpela-Arhe", + ati: "Attié", + atq: "Aralle-Tabulahan", + ava: "Avaric", + avn: "Avatime", + avu: "Avokaya", + awa: "Awadhi", + awb: "Awa(PapuaNewGuinea)", + awo: "Awak", + ayl: "LibyanArabic", + ayo: "Ayoreo", + ayp: "NorthMesopotamianArabic", + ayr: "CentralAymara", + ayz: "MaiBrat", + aze: "Azerbaijani", + azg: "SanPedroAmuzgosAmuzgo", + azz: "HighlandPueblaNahuatl", + bag: "Tuki", + bak: "Bashkir", + bam: "Bambara", + ban: "Balinese", + bao: "Waimaha", + bas: "Basa(Cameroon)", + bav: "Vengo", + bax: "Bamun", + bba: "Baatonum", + bbb: "Barai", + bbc: "BatakToba", + bbj: "Ghomálá'", + bbl: "Bats", + bbo: "NorthernBoboMadaré", + bbu: "Kulung(Nigeria)", + bcc: "SouthernBalochi", + bce: "Bamenyam", + bci: "Baoulé", + bcl: "CentralBikol", + bcs: "Kohumono", + bcw: "Bana", + bcy: "Bacama", + bcz: "Bainouk-Gunyaamolo", + bda: "Bayot", + bde: "Bade", + bdg: "Bonggi", + bdh: "Baka(SouthSudan)", + bdm: "Buduma", + bdq: "Bahnar", + bdu: "Oroko", + beb: "Bebele", + beh: "Biali", + bel: "Belarusian", + bem: "Bemba(Zambia)", + ben: "Bengali", + bep: "Besoa", + bew: "Betawi", + bex: "JurModo", + bfa: "Bari", + bfd: "Bafut", + bfo: "MalbaBirifor", + bft: "Balti", + bfy: "Bagheli", + bfz: "MahasuPahari", + bgc: "Haryanvi", + bgp: "EasternBalochi", + bgq: "Bagri", + bgr: "BawmChin", + bgt: "Bughotu", + bgw: "Bhatri", + bha: "Bharia", + bhb: "Bhili", + bhh: "Bukharic", + bho: "Bhojpuri", + bhp: "Bima", + bht: "Bhattiyali", + bhz: "Bada(Indonesia)", + bib: "Bissa", + bim: "Bimoba", + bis: "Bislama", + biv: "SouthernBirifor", + bjj: "Kanauji", + bjk: "Barok", + bjn: "Banjar", + bjr: "Binumarien", + bjt: "Balanta-Ganja", + bjv: "Bedjond", + bjw: "Bakwé", + bjz: "Baruga", + bkd: "Binukid", + bkh: "Bakoko", + bkm: "Kom(Cameroon)", + bkv: "Bekwarra", + bky: "Bokyi", + ble: "Balanta-Kentohe", + blh: "Kuwaa", + blt: "TaiDam", + blx: "Mag-IndiAyta", + blz: "Balantak", + bmm: 
"NorthernBetsimisarakaMalagasy", + bmq: "Bomu", + bmr: "Muinane", + bmu: "Somba-Siawari", + bmv: "Bum", + bng: "Benga", + bnm: "Batanga", + bnn: "Bunun", + bno: "Bantoanon", + bnp: "Bola", + bns: "Bundeli", + boa: "Bora", + bod: "Tibetan", + boj: "Anjam", + bom: "Berom", + bor: "Borôro", + bos: "Bosnian", + bou: "Bondei", + bov: "Tuwuli", + box: "Buamu", + bpr: "KoronadalBlaan", + bps: "SaranganiBlaan", + bqc: "Boko(Benin)", + bqg: "Bago-Kusuntu", + bqi: "Bakhtiari", + bqj: "Bandial", + bqp: "Busa", + bra: "Braj", + bre: "Breton", + brh: "Brahui", + bri: "Mokpwe", + bru: "EasternBru", + brx: "Bodo(India)", + bsc: "Bassari", + bsh: "Kati", + bsj: "Bangwinji", + bsk: "Burushaski", + bsq: "Bassa", + bss: "Akoose", + bsy: "SabahBisaya", + btd: "BatakDairi", + btm: "BatakMandailing", + bts: "BatakSimalungun", + btt: "Bete-Bendi", + btv: "Bateri", + btx: "BatakKaro", + bud: "Ntcham", + bug: "Buginese", + bul: "Bulgarian", + bum: "Bulu(Cameroon)", + buo: "Terei", + bus: "Bokobaru", + bux: "Boghom", + bvb: "Bube", + bvc: "Baelelea", + bvz: "Bauzi", + bwq: "SouthernBoboMadaré", + bwr: "Bura-Pabir", + bwu: "Buli(Ghana)", + bxf: "Bilur", + bxk: "Bukusu", + byc: "Ubaghara", + byr: "Baruya", + bys: "Burak", + byv: "Medumba", + byx: "Qaqet", + bzh: "MaposBuang", + bzi: "Bisu", + bzj: "BelizeKriolEnglish", + bzw: "Basa(Nigeria)", + caa: "Chortí", + cab: "Garifuna", + cac: "Chuj", + cak: "Kaqchikel", + cap: "Chipaya", + car: "GalibiCarib", + cas: "Tsimané", + cat: "Catalan", + cax: "Chiquitano", + cbc: "Carapana", + cbi: "Chachi", + cbr: "Cashibo-Cacataibo", + cbs: "Cashinahua", + cbt: "Chayahuita", + cbu: "Candoshi-Shapra", + cbv: "Cacua", + cce: "Chopi", + ccg: "SambaDaka", + cco: "ComaltepecChinantec", + cdj: "Churahi", + cdo: "MinDongChinese", + ceb: "Cebuano", + ceg: "Chamacoco", + cek: "EasternKhumiChin", + cen: "Cen", + ces: "Czech", + cfa: "Dijim-Bwilim", + cfm: "FalamChin", + cgc: "Kagayanen", + cgg: "Chiga", + che: "Chechen", + chf: "TabascoChontal", + chq: 
"QuiotepecChinantec", + chv: "Chuvash", + chz: "OzumacínChinantec", + cjk: "Chokwe", + cjo: "AshéninkaPajonal", + cjp: "Cabécar", + cjs: "Shor", + ckb: "CentralKurdish", + ckl: "Cibak", + cko: "Anufo", + ckr: "Kairak", + ckt: "Chukot", + cky: "Cakfem-Mushere", + cla: "Ron", + cle: "LealaoChinantec", + cly: "EasternHighlandChatino", + cme: "Cerma", + cmn: "MandarinChinese", + cmo: "CentralMnong", + cmr: "Mro-KhimiChin", + cnh: "HakhaChin", + cni: "Asháninka", + cnl: "LalanaChinantec", + cnt: "TepetotutlaChinantec", + coe: "Koreguaje", + cof: "Colorado", + cok: "SantaTeresaCora", + con: "Cofán", + cor: "Cornish", + cot: "Caquinte", + cou: "Wamey", + cpa: "PalantlaChinantec", + cpb: "Ucayali-YurúaAshéninka", + cpu: "PichisAshéninka", + cpx: "Pu-XianChinese", + cpy: "SouthUcayaliAshéninka", + crh: "CrimeanTatar", + crk: "PlainsCree", + crn: "ElNayarCora", + crq: "Iyo'wujwaChorote", + crs: "SeselwaCreoleFrench", + crt: "Iyojwa'jaChorote", + csk: "Jola-Kasa", + cso: "SochiapamChinantec", + ctd: "TedimChin", + cte: "TepinapaChinantec", + ctg: "Chittagonian", + ctl: "TlacoatzintepecChinantec", + cto: "Emberá-Catío", + ctu: "Chol", + cuc: "UsilaChinantec", + cui: "Cuiba", + cuk: "SanBlasKuna", + cul: "Culina", + cut: "TeutilaCuicatec", + cux: "TepeuxilaCuicatec", + cwa: "Kabwa", + cwe: "Kwere", + cwt: "Kuwaataay", + cya: "NopalaChatino", + cym: "Welsh", + daa: "Dangaléat", + dag: "Dagbani", + dah: "Gwahatike", + dan: "Danish", + dar: "Dargwa", + dav: "Taita", + dbd: "Dadiya", + dbj: "Ida'an", + dbq: "Daba", + dcc: "Deccan", + ddn: "Dendi(Benin)", + ded: "Dedua", + deg: "Degema", + des: "Desano", + deu: "German", + dga: "SouthernDagaare", + dgh: "Dghwede", + dgi: "NorthernDagara", + dgk: "Dagba", + dgo: "Dogri(individuallanguage)", + dgr: "Dogrib", + dhi: "Dhimal", + did: "Didinga", + dig: "Digo", + dik: "SouthwesternDinka", + dip: "NortheasternDinka", + div: "Dhivehi", + dje: "Zarma", + djk: "EasternMaroonCreole", + dmk: "Domaaki", + dml: "Dameli", + dnj: "Dan", + dnt: 
"MidGrandValleyDani", + dnw: "WesternDani", + dop: "Lukpa", + dos: "Dogosé", + dru: "Rukai", + dsb: "LowerSorbian", + dsh: "Daasanach", + dtp: "KadazanDusun", + dts: "ToroSoDogon", + dty: "Dotyali", + dua: "Duala", + dug: "Duruma", + dwr: "Dawro", + dyi: "DjiminiSenoufo", + dyo: "Jola-Fonyi", + dyu: "Dyula", + dzg: "Dazaga", + dzo: "Dzongkha", + ebu: "Embu", + ego: "Eggon", + eip: "Eipomek", + eiv: "Askopan", + eka: "Ekajuk", + ekk: "StandardEstonian", + eko: "Koti", + ekr: "Yace", + ell: "ModernGreek(1453-)", + elm: "Eleme", + emp: "NorthernEmberá", + enb: "Markweeta", + eng: "English", + enx: "Enxet", + epo: "Esperanto", + ese: "EseEjja", + ess: "CentralSiberianYupik", + esu: "CentralYupik", + eto: "Eton(Cameroon)", + ets: "Yekhee", + etu: "Ejagham", + eus: "Basque", + evn: "Evenki", + ewe: "Ewe", + ewo: "Ewondo", + eyo: "Keiyo", + eza: "Ezaa", + fal: "SouthFali", + fan: "Fang(EquatorialGuinea)", + fao: "Faroese", + far: "Fataleka", + fas: "Persian", + fat: "Fanti", + fia: "Nobiin", + fij: "Fijian", + fil: "Filipino", + fin: "Finnish", + fip: "Fipa", + fkk: "Kirya-Konzəl", + flr: "Fuliiru", + fmp: "Fe'fe'", + fmu: "FarWesternMuria", + fon: "Fon", + fra: "French", + frd: "Fordata", + fry: "WesternFrisian", + fub: "AdamawaFulfulde", + fuc: "Pulaar", + fue: "BorguFulfulde", + ful: "Fulah", + fuq: "Central-EasternNigerFulfulde", + fuv: "NigerianFulfulde", + gag: "Gagauz", + gai: "Borei", + gam: "Kandawo", + gau: "MudhiliGadaba", + gbi: "Galela", + gbk: "Gaddi", + gbm: "Garhwali", + gbo: "NorthernGrebo", + gbr: "Gbagyi", + gby: "Gbari", + gcc: "Mali", + gde: "Gude", + gdf: "Guduf-Gava", + geb: "Kire", + gej: "Gen", + ges: "Geser-Gorom", + ggg: "Gurgula", + gid: "Gidar", + gig: "Goaria", + gil: "Gilbertese", + giz: "SouthGiziga", + gjk: "KachiKoli", + gjn: "Gonja", + gju: "Gujari", + gkn: "Gokana", + gld: "Nanai", + gle: "Irish", + glg: "Galician", + glk: "Gilaki", + glv: "Manx", + glw: "Glavda", + gmv: "Gamo", + gna: "Kaansa", + gnd: "Zulgo-Gemzek", + gng: "Ngangam", 
+ gof: "Gofa", + gog: "Gogo", + gol: "Gola", + gom: "GoanKonkani", + gor: "Gorontalo", + gqr: "Gor", + grc: "AncientGreek(to1453)", + gri: "Ghari", + grn: "Guarani", + grt: "Garo", + gsl: "Gusilay", + gso: "SouthwestGbaya", + gub: "Guajajára", + guc: "Wayuu", + gud: "YocobouéDida", + gug: "ParaguayanGuaraní", + guh: "Guahibo", + gui: "EasternBolivianGuaraní", + guj: "Gujarati", + guk: "Gumuz", + gum: "Guambiano", + guo: "Guayabero", + guq: "Aché", + gur: "Farefare", + guu: "Yanomamö", + gux: "Gourmanchéma", + guz: "Gusii", + gvc: "Guanano", + gvl: "Gulay", + gwc: "Gawri", + gwe: "Gweno", + gwi: "Gwichʼin", + gwr: "Gwere", + gwt: "Gawar-Bati", + gym: "Ngäbere", + gyr: "Guarayu", + gyz: "Geji", + had: "Hatam", + hag: "Hanga", + hah: "Hahon", + hak: "HakkaChinese", + hao: "Hakö", + hap: "Hupla", + hat: "Haitian", + hau: "Hausa", + haw: "Hawaiian", + hay: "Haya", + hbb: "Huba", + hch: "Huichol", + heb: "Hebrew", + heh: "Hehe", + her: "Herero", + hia: "Lamang", + hif: "FijiHindi", + hig: "Kamwe", + hil: "Hiligaynon", + hin: "Hindi", + hkk: "Hunjara-KainaKe", + hla: "Halia", + hlb: "Halbi", + hlt: "MatuChin", + hne: "Chhattisgarhi", + hnn: "Hanunoo", + hno: "NorthernHindko", + hns: "CaribbeanHindustani", + hoc: "Ho", + hrv: "Croatian", + hsb: "UpperSorbian", + hto: "MinicaHuitoto", + hub: "Huambisa", + hue: "SanFranciscoDelMarHuave", + hui: "Huli", + hul: "Hula", + hun: "Hungarian", + hus: "Huastec", + huu: "MuruiHuitoto", + huv: "SanMateoDelMarHuave", + hux: "NüpodeHuitoto", + hvn: "Sabu", + hwc: "Hawai'iCreoleEnglish", + hwo: "Hwana", + hye: "Armenian", + hyw: "WesternArmenian", + iba: "Iban", + ibb: "Ibibio", + ibo: "Igbo", + icr: "IslanderCreoleEnglish", + ida: "Idakho-Isukha-Tiriki", + idd: "EdeIdaca", + idu: "Idoma", + ifa: "AmganadIfugao", + ifb: "BatadIfugao", + ife: "Ifè", + ifk: "TuwaliIfugao", + ifu: "MayoyaoIfugao", + ify: "Keley-IKallahan", + igl: "Igala", + ign: "Ignaciano", + ijc: "Izon", + ijn: "Kalabari", + ikk: "Ika", + ikw: "Ikwere", + ilb: "Ila", + 
ilo: "Iloko", + imo: "Imbongu", + ina: "Interlingua(InternationalAuxiliaryLanguageAssociation)", + inb: "Inga", + ind: "Indonesian", + iou: "Tuma-Irumu", + ipi: "Ipili", + ipk: "Inupiaq", + iqw: "Ikwo", + iri: "Rigwe", + irk: "Iraqw", + ish: "Esan", + isl: "Icelandic", + iso: "Isoko", + ita: "Italian", + itl: "Itelmen", + its: "Isekiri", + itv: "Itawit", + itw: "Ito", + itz: "Itzá", + ixl: "Ixil", + izr: "Izere", + izz: "Izii", + jac: "Popti'", + jal: "Yalahatan", + jam: "JamaicanCreoleEnglish", + jav: "Javanese", + jax: "JambiMalay", + jbu: "JukunTakum", + jen: "Dza", + jic: "Tol", + jiv: "Shuar", + jmc: "Machame", + jmd: "Yamdena", + jmx: "WesternJuxtlahuacaMixtec", + jpn: "Japanese", + jqr: "Jaqaru", + juk: "Wapan", + jun: "Juang", + juo: "Jiba", + jvn: "CaribbeanJavanese", + kaa: "Kara-Kalpak", + kab: "Kabyle", + kac: "Kachin", + kai: "Karekare", + kaj: "Jju", + kak: "Kalanguya", + kam: "Kamba(Kenya)", + kan: "Kannada", + kao: "Xaasongaxango", + kaq: "Capanahua", + kas: "Kashmiri", + kat: "Georgian", + kay: "Kamayurá", + kaz: "Kazakh", + kbd: "Kabardian", + kbl: "Kanembu", + kbo: "Keliko", + kbp: "Kabiyè", + kbq: "Kamano", + kbr: "Kafa", + kbt: "Abadi", + kby: "MangaKanuri", + kca: "Khanty", + kcg: "Tyap", + kcn: "Nubi", + kcq: "Kamo", + kdc: "Kutu", + kde: "Makonde", + kdh: "Tem", + kdi: "Kumam", + kdj: "Karamojong", + kdl: "Tsikimba", + kdn: "Kunda", + kdt: "Kuy", + kea: "Kabuverdianu", + kek: "Kekchí", + ken: "Kenyang", + keo: "Kakwa", + ker: "Kera", + keu: "Akebu", + key: "Kupia", + kez: "Kukele", + kfb: "NorthwesternKolami", + kff: "Koya", + kfk: "Kinnauri", + kfq: "Korku", + kfr: "Kachhi", + kfw: "KharamNaga", + kfx: "KulluPahari", + kha: "Khasi", + khg: "KhamsTibetan", + khk: "HalhMongolian", + khm: "Khmer", + khq: "KoyraChiiniSonghay", + khw: "Khowar", + kia: "Kim", + kij: "Kilivila", + kik: "Kikuyu", + kin: "Kinyarwanda", + kir: "Kirghiz", + kix: "KhiamniunganNaga", + kjb: "Q'anjob'al", + kjc: "CoastalKonjo", + kje: "Kisar", + kjg: "Khmu", + kjh: 
"Khakas", + kjk: "HighlandKonjo", + kki: "Kagulu", + kkj: "Kako", + kle: "Kulung(Nepal)", + kln: "Kalenjin", + kls: "Kalasha", + klu: "Klao", + klv: "Maskelynes", + klw: "Tado", + kma: "Konni", + kmd: "MajukayangKalinga", + kml: "TanudanKalinga", + kmr: "NorthernKurdish", + kmu: "Kanite", + kmy: "Koma", + kna: "Dera(Nigeria)", + knb: "LubuaganKalinga", + knc: "CentralKanuri", + kne: "Kankanaey", + knf: "Mankanya", + knj: "WesternKanjobal", + knk: "Kuranko", + knn: "Konkani(individuallanguage)", + kno: "Kono(SierraLeone)", + kog: "Cogui", + kol: "Kol(PapuaNewGuinea)", + koo: "Konzo", + kor: "Korean", + kpo: "Ikposo", + kpq: "Korupun-Sela", + kps: "Tehit", + kpv: "Komi-Zyrian", + kpy: "Koryak", + kpz: "Kupsabiny", + kqe: "Kalagan", + kqo: "EasternKrahn", + kqp: "Kimré", + kqr: "Kimaragang", + kqy: "Koorete", + krc: "Karachay-Balkar", + kri: "Krio", + krj: "Kinaray-A", + krl: "Karelian", + krr: "Krung", + krs: "Gbaya(Sudan)", + kru: "Kurukh", + krx: "Karon", + ksb: "Shambala", + ksd: "Kuanua", + ksf: "Bafia", + ksr: "Borong", + kss: "SouthernKisi", + ksz: "Kodaku", + ktb: "Kambaata", + ktj: "PlapoKrumen", + kto: "Kuot", + kua: "Kuanyama", + kub: "Kutep", + kue: "Kuman(PapuaNewGuinea)", + kuh: "Kushi", + kum: "Kumyk", + kur: "Kurdish", + kus: "Kusaal", + kvn: "BorderKuna", + kvw: "Wersing", + kvx: "ParkariKoli", + kwd: "Kwaio", + kwf: "Kwara'ae", + kwi: "Awa-Cuaiquer", + kwm: "Kwambi", + kxc: "Konso", + kxf: "ManumanawKaren", + kxm: "NorthernKhmer", + kxp: "WadiyaraKoli", + kyb: "ButbutKalinga", + kyc: "Kyaka", + kyf: "Kouya", + kyg: "Keyagana", + kyo: "Kelon", + kyq: "Kenga", + kyu: "WesternKayah", + kyx: "Rapoisi", + kyz: "Kayabí", + kzf: "Da'aKaili", + kzi: "Kelabit", + lac: "Lacandon", + lag: "Rangi", + laj: "Lango(Uganda)", + lam: "Lamba", + lao: "Lao", + las: "Lama(Togo)", + lat: "Latin", + lav: "Latvian", + law: "Lauje", + lbj: "Ladakhi", + lbw: "Tolaki", + lcm: "Tungag", + lcp: "WesternLawa", + ldb: "Dũya", + led: "Lendu", + lee: "Lyélé", + lef: "Lelemi", + 
lem: "Nomaande", + lew: "LedoKaili", + lex: "Luang", + lgg: "Lugbara", + lgl: "Wala", + lhu: "Lahu", + lia: "West-CentralLimba", + lid: "Nyindrou", + lif: "Limbu", + lij: "Ligurian", + lin: "Lingala", + lip: "Sekpele", + lir: "LiberianEnglish", + lis: "Lisu", + lit: "Lithuanian", + lje: "Rampi", + ljp: "LampungApi", + lkb: "Kabras", + lke: "Kenyi", + lla: "Lala-Roba", + lld: "Ladin", + llg: "Lole", + lln: "Lele(Chad)", + lme: "Pévé", + lnd: "Lundayeh", + lns: "Lamnso'", + lnu: "Longuda", + loa: "Loloda", + lob: "Lobi", + lok: "Loko", + lom: "Loma(Liberia)", + lon: "MalawiLomwe", + loq: "Lobala", + lrk: "Loarki", + lsi: "Lashi", + lsm: "Saamia", + lss: "Lasi", + ltg: "Latgalian", + lth: "Thur", + lto: "Tsotso", + ltz: "Luxembourgish", + lua: "Luba-Lulua", + luc: "Aringa", + lug: "Ganda", + luo: "Luo(KenyaandTanzania)", + lus: "Lushai", + lwg: "Wanga", + lwo: "Luwo", + lww: "Lewo", + lzz: "Laz", + maa: "SanJerónimoTecóatlMazatec", + mab: "YutanduchiMixtec", + mad: "Madurese", + maf: "Mafa", + mag: "Magahi", + mah: "Marshallese", + mai: "Maithili", + maj: "JalapaDeDíazMazatec", + mak: "Makasar", + mal: "Malayalam", + mam: "Mam", + maq: "ChiquihuitlánMazatec", + mar: "Marathi", + mau: "HuautlaMazatec", + maw: "Mampruli", + max: "NorthMoluccanMalay", + maz: "CentralMazahua", + mbb: "WesternBukidnonManobo", + mbc: "Macushi", + mbh: "Mangseng", + mbj: "Nadëb", + mbt: "MatigsalugManobo", + mbu: "Mbula-Bwazza", + mca: "Maca", + mcb: "Machiguenga", + mcd: "Sharanahua", + mcf: "Matsés", + mco: "CoatlánMixe", + mcp: "Makaa", + mcq: "Ese", + mcu: "CameroonMambila", + mcx: "Mpiemo", + mda: "Mada(Nigeria)", + mdd: "Mbum", + mdv: "SantaLucíaMonteverdeMixtec", + mdy: "Male(Ethiopia)", + med: "Melpa", + mee: "Mengen", + meh: "SouthwesternTlaxiacoMixtec", + mej: "Meyah", + mek: "Mekeo", + mel: "CentralMelanau", + men: "Mende(SierraLeone)", + meq: "Merey", + mer: "Meru", + met: "Mato", + meu: "Motu", + mev: "Mano", + mfe: "Morisyen", + mfh: "Matal", + mfi: "Wandala", + mfk: 
"NorthMofu", + mfm: "MarghiSouth", + mfn: "CrossRiverMbembe", + mfo: "Mbe", + mfq: "Moba", + mfv: "Mandjak", + mfy: "Mayo", + mfz: "Mabaan", + mgd: "Moru", + mge: "Mango", + mgg: "Mpumpong", + mgh: "Makhuwa-Meetto", + mgi: "Lijili", + mgo: "Meta'", + mhi: "Ma'di", + mhk: "Mungaka", + mhr: "EasternMari", + mhu: "Digaro-Mishmi", + mhx: "Maru", + mhy: "Ma'anyan", + mib: "AtatláhucaMixtec", + mie: "OcotepecMixtec", + mif: "Mofu-Gudur", + mig: "SanMiguelElGrandeMixtec", + mih: "ChayucoMixtec", + mil: "PeñolesMixtec", + mim: "AlacatlatzalaMixtec", + min: "Minangkabau", + mio: "PinotepaNacionalMixtec", + mip: "Apasco-ApoalaMixtec", + miq: "Mískito", + mit: "SouthernPueblaMixtec", + miu: "CacaloxtepecMixtec", + miy: "AyutlaMixtec", + miz: "CoatzospanMixtec", + mjl: "Mandeali", + mjv: "Mannan", + mkd: "Macedonian", + mkf: "Miya", + mki: "Dhatki", + mkl: "Mokole", + mkn: "KupangMalay", + mlg: "Malagasy", + mlq: "WesternManinkakan", + mlt: "Maltese", + mmc: "MichoacánMazahua", + mmg: "NorthAmbrym", + mnb: "Muna", + mne: "Naba", + mnf: "Mundani", + mni: "Manipuri", + mnk: "Mandinka", + mnw: "Mon", + mnx: "Manikion", + moa: "Mwan", + mog: "Mongondow", + mon: "Mongolian", + mop: "MopánMaya", + mor: "Moro", + mos: "Mossi", + mox: "Molima", + moz: "Mukulu", + mpg: "Marba", + mpm: "YosondúaMixtec", + mpp: "Migabac", + mpx: "Misima-Panaeati", + mqb: "Mbuko", + mqf: "Momuna", + mqj: "Mamasa", + mqn: "Moronene", + mqy: "Manggarai", + mri: "Maori", + mrj: "WesternMari", + mrr: "Maria(India)", + mrt: "MarghiCentral", + mrw: "Maranao", + msh: "MasikoroMalagasy", + msi: "SabahMalay", + msw: "Mansoanka", + msy: "Aruamu", + mtd: "Mualang", + mtj: "Moskona", + mto: "TotontepecMixe", + mtr: "Mewari", + mtu: "TututepecMixtec", + mtx: "TidaáMixtec", + mua: "Mundang", + mug: "Musgu", + muh: "Mündü", + mui: "Musi", + mup: "Malvi", + mur: "Murle", + muv: "Muthuvan", + muy: "Muyang", + mve: "Marwari(Pakistan)", + mvp: "Duri", + mvy: "IndusKohistani", + mwq: "MünChin", + mwv: "Mentawai", + mxb: 
"TezoatlánMixtec", + mxq: "JuquilaMixe", + mxs: "HuitepecMixtec", + mxt: "JamiltepecMixtec", + mxu: "Mada(Cameroon)", + mxv: "MetlatónocMixtec", + mxy: "SoutheasternNochixtlánMixtec", + mya: "Burmese", + myb: "Mbay", + myk: "MamaraSenoufo", + myv: "Erzya", + myx: "Masaaba", + myy: "Macuna", + mza: "SantaMaríaZacatepecMixtec", + mzi: "IxcatlánMazatec", + mzj: "Manya", + mzk: "NigeriaMambila", + mzl: "MazatlánMixe", + mzm: "Mumuye", + mzw: "Deg", + nab: "SouthernNambikuára", + nag: "NagaPidgin", + nal: "Nalik", + nan: "Min Nan Chinese", + nap: "Neapolitan", + nas: "Naasioi", + naw: "Nawuri", + nbh: "Ngamo", + nca: "Iyo", + ncf: "Notsi", + nch: "CentralHuastecaNahuatl", + ncj: "NorthernPueblaNahuatl", + ncl: "MichoacánNahuatl", + nco: "Sibe", + ncu: "Chumburung", + ncx: "CentralPueblaNahuatl", + ndi: "SambaLeko", + ndj: "Ndamba", + ndo: "Ndonga", + ndp: "Ndo", + ndv: "Ndut", + ndy: "Lutos", + ndz: "Ndogo", + neb: "Toura(Côted'Ivoire)", + nep: "Nepali(macrolanguage)", + new: "Newari", + nfa: "Dhao", + nfr: "Nafaanra", + nga: "Ngbaka", + ngi: "Ngizim", + ngl: "Lomwe", + ngp: "Ngulu", + ngu: "GuerreroNahuatl", + nhe: "EasternHuastecaNahuatl", + nhg: "TetelcingoNahuatl", + nhi: "Zacatlán-Ahuacatlán-TepetzintlaNahuatl", + nhn: "CentralNahuatl", + nhq: "HuaxcalecaNahuatl", + nhu: "Noone", + nhw: "WesternHuastecaNahuatl", + nhx: "Isthmus-MecayapanNahuatl", + nhy: "NorthernOaxacaNahuatl", + nia: "Nias", + nij: "Ngaju", + nim: "Nilamba", + nin: "Ninzo", + nja: "Nzanyi", + nko: "Nkonya", + nla: "Ngombale", + nlc: "Nalca", + nld: "Dutch", + nlg: "Gela", + nlk: "NiniaYali", + nlv: "OrizabaNahuatl", + nmg: "Kwasio", + nmz: "Nawdm", + nnb: "Nande", + nnh: "Ngiemboon", + nnq: "Ngindo", + nnw: "SouthernNuni", + noa: "WounMeu", + nob: "NorwegianBokmål", + nod: "NorthernThai", + noe: "Nimadi", + nog: "Nogai", + not: "Nomatsiguenga", + npl: "SoutheasternPueblaNahuatl", + npy: "Napu", + nso: "Pedi", + nst: "TaseNaga", + nsu: "SierraNegraNahuatl", + ntm: "Nateni", + ntr: "Delo", + nuj: 
"Nyole", + nup: "Nupe-Nupe-Tako", + nus: "Nuer", + nuz: "TlamacazapaNahuatl", + nwb: "Nyabwa", + nxq: "Naxi", + nya: "Nyanja", + nyf: "Giryama", + nyn: "Nyankole", + nyo: "Nyoro", + nyu: "Nyungwe", + nyy: "Nyakyusa-Ngonde", + nzi: "Nzima", + obo: "OboManobo", + oci: "Occitan(post1500)", + odk: "Od", + odu: "Odual", + ogo: "Khana", + ojb: "NorthwesternOjibwa", + oku: "Oku", + old: "Mochi", + omw: "SouthTairora", + onb: "Lingao", + ood: "TohonoO'odham", + orc: "Orma", + orm: "Oromo", + oru: "Ormuri", + ory: "Odia", + oss: "Ossetian", + ote: "MezquitalOtomi", + otq: "QuerétaroOtomi", + ozm: "Koonzime", + pab: "Parecís", + pad: "Paumarí", + pag: "Pangasinan", + pam: "Pampanga", + pan: "Panjabi", + pao: "NorthernPaiute", + pap: "Papiamento", + pau: "Palauan", + pbb: "Páez", + pbc: "Patamona", + pbi: "Parkwa", + pbs: "CentralPame", + pbt: "SouthernPashto", + pbu: "NorthernPashto", + pce: "RuchingPalaung", + pcm: "NigerianPidgin", + pex: "Petats", + pez: "EasternPenan", + phl: "Phalura", + phr: "Pahari-Potwari", + pib: "Yine", + pil: "Yom", + pip: "Pero", + pir: "Piratapuyo", + pis: "Pijin", + piy: "Piya-Kwonci", + pjt: "Pitjantjatjara", + pkb: "Pokomo", + pko: "Pökoot", + plk: "KohistaniShina", + pls: "SanMarcosTlacoyalcoPopoloca", + plt: "PlateauMalagasy", + plw: "Brooke'sPointPalawano", + pmf: "Pamona", + pmq: "NorthernPame", + pms: "Piemontese", + pmy: "PapuanMalay", + pnb: "WesternPanjabi", + pne: "WesternPenan", + pny: "Pinyin", + poc: "Poqomam", + poe: "SanJuanAtzingoPopoloca", + poh: "Poqomchi'", + poi: "HighlandPopoluca", + pol: "Polish", + por: "Portuguese", + pov: "UpperGuineaCrioulo", + pow: "SanFelipeOtlaltepecPopoloca", + poy: "Pogolo", + ppk: "Uma", + pps: "SanLuísTemalacayucaPopoloca", + prf: "Paranan", + prk: "Parauk", + prq: "AshéninkaPerené", + prt: "Phai", + pse: "CentralMalay", + pss: "Kaulong", + pst: "CentralPashto", + ptu: "Bambam", + pua: "WesternHighlandPurepecha", + pui: "Puinave", + pus: "Pushto", + pwg: "Gapapaiwa", + pwn: "Paiwan", + pww: 
"PwoNorthernKaren", + pxm: "QuetzaltepecMixe", + qub: "HuallagaHuánucoQuechua", + quc: "K'iche'", + quf: "LambayequeQuechua", + qug: "ChimborazoHighlandQuichua", + quh: "SouthBolivianQuechua", + qul: "NorthBolivianQuechua", + qum: "Sipacapense", + qup: "SouthernPastazaQuechua", + qur: "YanahuancaPascoQuechua", + qus: "SantiagodelEsteroQuichua", + quv: "Sacapulteco", + quw: "TenaLowlandQuichua", + qux: "YauyosQuechua", + quy: "AyacuchoQuechua", + quz: "CuscoQuechua", + qva: "Ambo-PascoQuechua", + qvc: "CajamarcaQuechua", + qve: "EasternApurímacQuechua", + qvh: "Huamalíes-DosdeMayoHuánucoQuechua", + qvi: "ImbaburaHighlandQuichua", + qvj: "LojaHighlandQuichua", + qvl: "CajatamboNorthLimaQuechua", + qvm: "Margos-Yarowilca-LauricochaQuechua", + qvn: "NorthJunínQuechua", + qvo: "NapoLowlandQuechua", + qvs: "SanMartínQuechua", + qvw: "HuayllaWancaQuechua", + qvz: "NorthernPastazaQuichua", + qwa: "CorongoAncashQuechua", + qwh: "HuaylasAncashQuechua", + qws: "SihuasAncashQuechua", + qxa: "ChiquiánAncashQuechua", + qxh: "PanaoHuánucoQuechua", + qxl: "SalasacaHighlandQuichua", + qxn: "NorthernConchucosAncashQuechua", + qxo: "SouthernConchucosAncashQuechua", + qxp: "PunoQuechua", + qxr: "CañarHighlandQuichua", + qxt: "SantaAnadeTusiPascoQuechua", + qxu: "Arequipa-LaUniónQuechua", + qxw: "JaujaWancaQuechua", + rag: "Logooli", + rah: "Rabha", + rai: "Ramoaaina", + rap: "Rapanui", + rav: "Sampang", + raw: "Rawang", + rej: "Rejang", + rel: "Rendille", + rgu: "Ringgou", + rhg: "Rohingya", + rif: "Tarifit", + rim: "Nyaturu", + rjs: "Rajbanshi", + rkt: "Rangpuri", + rmc: "CarpathianRomani", + rmo: "SinteRomani", + rmy: "VlaxRomani", + rng: "Ronga", + rnl: "Ranglong", + rob: "Tae'", + rof: "Rombo", + roh: "Romansh", + rol: "Romblomanon", + ron: "Romanian", + roo: "Rotokas", + rop: "Kriol", + rro: "Waima", + rth: "Ratahan", + rub: "Gungu", + ruc: "Ruuli", + ruf: "Luguru", + rug: "Roviana", + run: "Rundi", + rus: "Russian", + rwm: "Amba(Uganda)", + rwr: "Marwari(India)", + sab: 
"Buglere", + sag: "Sango", + sah: "Yakut", + saj: "Sahu", + saq: "Samburu", + sas: "Sasak", + sau: "Saleman", + say: "Saya", + sba: "Ngambay", + sbd: "SouthernSamo", + sbl: "BotolanSambal", + sbn: "SindhiBhil", + sbp: "Sangu(Tanzania)", + sch: "Sakachep", + sck: "Sadri", + scl: "Shina", + scn: "Sicilian", + sco: "Scots", + sda: "Toraja-Sa'dan", + sdo: "Bukar-SadungBidayuh", + sea: "Semai", + seh: "Sena", + sei: "Seri", + ses: "KoyraboroSenniSonghai", + sey: "Secoya", + sgb: "Mag-antsiAyta", + sgj: "Surgujia", + sgw: "SebatBetGurage", + shi: "Tachelhit", + shk: "Shilluk", + shn: "Shan", + sho: "Shanga", + shp: "Shipibo-Conibo", + sid: "Sidamo", + sig: "Paasaal", + sil: "TumulungSisaala", + sin: "Sinhala", + sip: "Sikkimese", + siw: "Siwai", + sja: "Epena", + sjm: "Mapun", + sjp: "Surjapuri", + sjr: "Siar-Lak", + skg: "SakalavaMalagasy", + skr: "Saraiki", + sld: "Sissala", + slk: "Slovak", + slu: "Selaru", + slv: "Slovenian", + sml: "CentralSama", + smo: "Samoan", + sna: "Shona", + snc: "Sinaugoro", + snd: "Sindhi", + sne: "BauBidayuh", + snk: "Soninke", + snn: "Siona", + snp: "Siane", + snv: "Sa'ban", + snw: "Selee", + sol: "Solos", + som: "Somali", + soy: "Miyobe", + spa: "Spanish", + spp: "SupyireSenoufo", + sps: "Saposa", + spy: "Sabaot", + src: "LogudoreseSardinian", + srd: "Sardinian", + sri: "Siriano", + srm: "Saramaccan", + srn: "SrananTongo", + sro: "CampidaneseSardinian", + srp: "Serbian", + srr: "Serer", + srx: "Sirmauri", + ssi: "Sansi", + ste: "Liana-Seti", + stn: "Owa", + stp: "SoutheasternTepehuan", + sua: "Sulka", + suc: "WesternSubanon", + suk: "Sukuma", + sun: "Sundanese", + sur: "Mwaghavul", + sus: "Susu", + suv: "Puroik", + suz: "Sunwar", + sva: "Svan", + swe: "Swedish", + swh: "Swahili(individuallanguage)", + swv: "Shekhawati", + sxb: "Suba", + sxn: "Sangir", + sya: "Siang", + syl: "Sylheti", + sza: "Semelai", + szy: "Sakizaya", + tac: "LowlandTarahumara", + taj: "EasternTamang", + tam: "Tamil", + tan: "Tangale", + tao: "Yami", + tap: "Taabwa", + 
taq: "Tamasheq", + tar: "CentralTarahumara", + tat: "Tatar", + tav: "Tatuyo", + tay: "Atayal", + tbc: "Takia", + tbf: "Mandara", + tbg: "NorthTairora", + tbk: "CalamianTagbanwa", + tbl: "Tboli", + tby: "Tabaru", + tbz: "Ditammari", + tca: "Ticuna", + tcc: "Datooga", + tcf: "MalinaltepecMe'phaa", + tcy: "Tulu", + tcz: "ThadoChin", + tdj: "Tajio", + tdn: "Tondano", + tdx: "Tandroy-MahafalyMalagasy", + ted: "TepoKrumen", + tee: "HuehuetlaTepehua", + tel: "Telugu", + tem: "Timne", + teo: "Teso", + ter: "Tereno", + tew: "Tewa(USA)", + tex: "Tennet", + tfr: "Teribe", + tgc: "Tigak", + tgj: "Tagin", + tgk: "Tajik", + tgl: "Tagalog", + tgo: "Sudest", + tgp: "Tangoa", + tha: "Thai", + the: "ChitwaniaTharu", + thk: "Tharaka", + thl: "DangauraTharu", + thq: "KochilaTharu", + thr: "RanaTharu", + thv: "TahaggartTamahaq", + tig: "Tigre", + tih: "TimugonMurut", + tik: "Tikar", + tio: "Teop", + tir: "Tigrinya", + tkg: "TesakaMalagasy", + tkr: "Tsakhur", + tkt: "KathoriyaTharu", + tlb: "Tobelo", + tli: "Tlingit", + tlj: "Talinga-Bwisi", + tlp: "FilomenaMata-CoahuitlánTotonac", + tly: "Talysh", + tmc: "Tumak", + tmf: "Toba-Maskoy", + tna: "Tacana", + tng: "Tobanga", + tnk: "Kwamera", + tnn: "NorthTanna", + tnp: "Whitesands", + tnr: "Ménik", + tnt: "Tontemboan", + tob: "Toba", + toc: "CoyutlaTotonac", + toh: "Gitonga", + tok: "TokiPona", + tom: "Tombulu", + top: "PapantlaTotonac", + tos: "HighlandTotonac", + tpi: "TokPisin", + tpl: "TlacoapaMe'phaa", + tpm: "Tampulma", + tpp: "PisafloresTepehua", + tpt: "TlachichilcoTepehua", + tpz: "Tinputz", + tqp: "Tomoip", + trc: "CopalaTriqui", + tri: "Trió", + trn: "Trinitario", + trp: "KokBorok", + trq: "SanMartínItunyosoTriqui", + trs: "ChicahuaxtlaTriqui", + trv: "Sediq", + trw: "Torwali", + tsn: "Tswana", + tso: "Tsonga", + tsz: "Purepecha", + ttc: "Tektiteko", + tte: "Bwanabwana", + ttj: "Tooro", + ttq: "TawallammatTamajaq", + ttr: "Tera", + ttu: "Torau", + tue: "Tuyuca", + tuf: "CentralTunebo", + tui: "Tupuri", + tuk: "Turkmen", + tul: 
"Tula", + tuo: "Tucano", + tuq: "Tedaga", + tur: "Turkish", + tuv: "Turkana", + tuy: "Tugen", + tvo: "Tidore", + tvu: "Tunen", + tvw: "Sedoa", + twb: "WesternTawbuid", + twe: "Tewa(Indonesia)", + twu: "Termanu", + txa: "Tombonuo", + txq: "Tii", + txs: "Tonsea", + txu: "Kayapó", + txy: "TanosyMalagasy", + tye: "Kyanga", + tzh: "Tzeltal", + tzj: "Tz'utujil", + tzo: "Tzotzil", + ubl: "Buhi'nonBikol", + ubu: "Umbu-Ungu", + udl: "Wuzlam", + udm: "Udmurt", + udu: "Uduk", + uig: "Uighur", + uki: "Kui(India)", + ukr: "Ukrainian", + ukv: "Kuku", + umb: "Umbundu", + upv: "Uripiv-Wala-Rano-Atchin", + ura: "Urarina", + urb: "Urubú-Kaapor", + urd: "Urdu", + urh: "Urhobo", + urk: "UrakLawoi'", + urt: "Urat", + ury: "Orya", + ush: "Ushojo", + usp: "Uspanteco", + uzb: "Uzbek", + uzn: "NorthernUzbek", + vag: "Vagla", + vah: "Varhadi-Nagpuri", + vai: "Vai", + var: "Huarijio", + ver: "MomJango", + vid: "Vidunda", + vie: "Vietnamese", + vif: "Vili", + vmc: "JuxtlahuacaMixtec", + vmj: "IxtayutlaMixtec", + vmm: "MitlatongoMixtec", + vmp: "SoyaltepecMazatec", + vmw: "Makhuwa", + vmy: "AyautlaMazatec", + vmz: "MazatlánMazatec", + vro: "Võro", + vun: "Vunjo", + vut: "Vute", + wal: "Wolaytta", + wap: "Wapishana", + war: "Waray(Philippines)", + waw: "Waiwai", + way: "Wayana", + wba: "Warao", + wbl: "Wakhi", + wbr: "Wagdi", + wci: "WaciGbe", + weo: "Wemale", + wes: "CameroonPidgin", + wja: "Waja", + wji: "Warji", + wlo: "Wolio", + wlx: "Wali(Ghana)", + wmw: "Mwani", + wob: "WèNorthern", + wof: "GambianWolof", + wol: "Wolof", + wsg: "AdilabadGondi", + wwa: "Waama", + xal: "Kalmyk", + xdy: "MalayicDayak", + xed: "Hdi", + xer: "Xerénte", + xhe: "Khetrani", + xho: "Xhosa", + xka: "Kalkoti", + xkl: "MainstreamKenyah", + xmf: "Mingrelian", + xmm: "ManadoMalay", + xmv: "AntankaranaMalagasy", + xnj: "Ngoni(Tanzania)", + xnr: "Kangri", + xog: "Soga", + xon: "Konkomba", + xpe: "LiberiaKpelle", + xrb: "EasternKaraboro", + xsb: "Sambal", + xsm: "Kasem", + xsr: "Sherpa", + xsu: "Sanumá", + xta: 
"AlcozaucaMixtec", + xtd: "Diuxi-TilantongoMixtec", + xte: "Ketengban", + xti: "SinicahuaMixtec", + xtm: "MagdalenaPeñascoMixtec", + xtn: "NorthernTlaxiacoMixtec", + xtu: "CuyamecalcoMixtec", + xua: "AluKurumba", + xuo: "Kuo", + yaa: "Yaminahua", + yad: "Yagua", + yal: "Yalunka", + yam: "Yamba", + yao: "Yao", + yaq: "Yaqui", + yas: "Nugunu(Cameroon)", + yat: "Yambeta", + yav: "Yangben", + yay: "Agwagwune", + yaz: "Lokaa", + yba: "Yala", + ybb: "Yemba", + ycl: "Lolopo", + ycn: "Yucuna", + ydd: "EasternYiddish", + ydg: "Yidgha", + yea: "Ravula", + yer: "Tarok", + yes: "Nyankpa", + yka: "Yakan", + yli: "AnggurukYali", + yor: "Yoruba", + yre: "Yaouré", + yua: "Yucateco", + yue: "YueChinese", + yuz: "Yuracare", + yva: "Yawa", + zaa: "SierradeJuárezZapotec", + zab: "WesternTlacolulaValleyZapotec", + zac: "OcotlánZapotec", + zad: "CajonosZapotec", + zae: "YareniZapotec", + zai: "IsthmusZapotec", + zam: "MiahuatlánZapotec", + zao: "OzolotepecZapotec", + zaq: "AloápamZapotec", + zar: "RincónZapotec", + zas: "SantoDomingoAlbarradasZapotec", + zav: "YatzachiZapotec", + zaw: "MitlaZapotec", + zca: "CoatecasAltasZapotec", + zga: "Kinga", + zim: "Mesme", + ziw: "Zigula", + zmz: "Mbandja", + zne: "Zande(individuallanguage)", + zoc: "CopainaláZoque", + zoh: "ChimalapaZoque", + zor: "RayónZoque", + zos: "FranciscoLeónZoque", + zpc: "ChoapanZapotec", + zpg: "GueveaDeHumboldtZapotec", + zpi: "SantaMaríaQuiegolaniZapotec", + zpl: "LachixíoZapotec", + zpm: "MixtepecZapotec", + zpo: "AmatlánZapotec", + zpt: "SanVicenteCoatlánZapotec", + zpu: "YalálagZapotec", + zpv: "ChichicapanZapotec", + zpy: "MazaltepecZapotec", + zpz: "TexmelucanZapotec", + zsm: "StandardMalay", + ztg: "XanaguíaZapotec", + ztn: "SantaCatarinaAlbarradasZapotec", + ztp: "LoxichaZapotec", + ztq: "Quioquitani-QuieríZapotec", + zts: "TilquiapanZapotec", + ztu: "GüiláZapotec", + zty: "YateeZapotec", + zul: "Zulu", + zyb: "YongbeiZhuang", + zyp: "ZypheChin", + zza: "Zaza", +}; \ No newline at end of file diff --git 
a/sharedUtils/omniAsrSupportedLangs.ts b/sharedUtils/omniAsrSupportedLangs.ts new file mode 100644 index 000000000..bafb5995a --- /dev/null +++ b/sharedUtils/omniAsrSupportedLangs.ts @@ -0,0 +1,315 @@ +/** + * OmniASR supported-language snapshot + * ----------------------------------- + * + * Static snapshot of the language codes supported by the OmniASR transcription + * service (Meta Omnilingual ASR — `omniASR_LLM_1B_v2`). Entries are in + * `{iso639_3}_{Script}` form (e.g. `eng_Latn`, `urd_Arab`); a few add a variant suffix (e.g. `ell_Grek_cypr1249`). + * + * We bundle this list so the extension can validate / resolve language codes + * offline, with no runtime network dependency. + * + * Regenerating + * ~~~~~~~~~~~~ + * If we change ASR providers or the underlying model, regenerate this file from + * the live `/languages` endpoint: + * + * curl -s "https://genesis-ai-dev--codex-asr-serve.modal.run/languages" \ + * | python3 -c " + * import json, sys + * d = json.load(sys.stdin) + * langs = sorted(set(d['languages'])) + * print('export const OMNI_ASR_SUPPORTED_LANGS: readonly string[] = [') + * for i in range(0, len(langs), 6): + * print(' ' + ', '.join(f'\"{c}\"' for c in langs[i:i+6]) + ',') + * print('];') + * " + * + * (Pre-rename, the host was `genesis-ai-dev--mms-zeroshot-asr-serve.modal.run`.) + * + * Snapshot taken: 2026-06-04. Server reported 1672 languages. 
+ */ + +export const OMNI_ASR_SUPPORTED_LANGS: readonly string[] = [ + "aae_Latn", "aal_Latn", "abb_Latn", "abi_Latn", "abk_Cyrl", "abn_Latn", + "abp_Latn", "abr_Latn", "abs_Latn", "aca_Latn", "acd_Latn", "ace_Latn", + "acf_Latn", "ach_Latn", "acm_Arab", "acn_Latn", "acr_Latn", "acu_Latn", + "acw_Arab", "ade_Latn", "adh_Latn", "adj_Latn", "adx_Tibt", "ady_Cyrl", + "aeb_Arab", "aec_Arab", "aeu_Latn", "afb_Arab", "afo_Latn", "afr_Latn", + "agd_Latn", "agg_Latn", "agn_Latn", "agr_Latn", "agu_Latn", "agx_Cyrl", + "aha_Latn", "ahk_Latn", "ahl_Latn", "ahs_Latn", "aia_Latn", "ajg_Latn", + "aka_Latn", "akb_Latn", "ake_Latn", "akp_Latn", "ala_Latn", "alj_Latn", + "aln_Latn", "alo_Latn", "alp_Latn", "als_Latn", "alt_Cyrl", "alz_Latn", + "ame_Latn", "amf_Latn", "amh_Ethi", "ami_Latn", "amk_Latn", "amu_Latn", + "anc_Latn", "ank_Latn", "ann_Latn", "anp_Deva", "anw_Latn", "any_Latn", + "aom_Latn", "aoz_Latn", "apb_Latn", "apc_Arab", "apd_Arab", "apr_Latn", + "arb_Arab", "arg_Latn", "arl_Latn", "arq_Arab", "ars_Arab", "ary_Arab", + "arz_Arab", "asa_Latn", "asg_Latn", "asm_Beng", "ast_Latn", "ata_Latn", + "atb_Latn", "atg_Latn", "ati_Latn", "atq_Latn", "ava_Cyrl", "avn_Latn", + "avu_Latn", "awa_Deva", "awb_Latn", "awo_Latn", "ayl_Arab", "ayo_Latn", + "ayp_Arab", "ayr_Latn", "ayz_Latn", "aze_Arab", "aze_Cyrl", "aze_Latn", + "azg_Latn", "azz_Latn", "bag_Latn", "bak_Cyrl", "bam_Latn", "ban_Latn", + "bao_Latn", "bas_Latn", "bav_Latn", "bax_Latn", "bba_Latn", "bbb_Latn", + "bbc_Latn", "bbj_Latn", "bbl_Geor", "bbo_Latn", "bbu_Latn", "bcc_Arab", + "bcc_Latn", "bce_Latn", "bci_Latn", "bcl_Latn", "bcs_Latn", "bcw_Latn", + "bcy_Latn", "bcz_Latn", "bda_Latn", "bde_Latn", "bdg_Latn", "bdh_Latn", + "bdm_Latn", "bdq_Latn", "bdu_Latn", "beb_Latn", "beh_Latn", "bel_Cyrl", + "bem_Latn", "ben_Beng", "bep_Latn", "bew_Latn", "bex_Latn", "bfa_Latn", + "bfd_Latn", "bfo_Latn", "bft_Arab", "bfy_Deva", "bfz_Deva", "bgc_Deva", + "bgp_Arab", "bgq_Deva", "bgr_Latn", "bgt_Latn", "bgw_Deva", "bha_Deva", + 
"bhb_Deva", "bhh_Cyrl", "bho_Deva", "bhp_Latn", "bht_Deva", "bhz_Latn", + "bib_Latn", "bim_Latn", "bis_Latn", "biv_Latn", "bjj_Deva", "bjk_Latn", + "bjn_Latn", "bjr_Latn", "bjt_Latn", "bjv_Latn", "bjw_Latn", "bjz_Latn", + "bkd_Latn", "bkh_Latn", "bkm_Latn", "bkv_Latn", "bky_Latn", "ble_Latn", + "blh_Latn", "blt_Latn", "blx_Latn", "blz_Latn", "bmm_Latn", "bmq_Latn", + "bmr_Latn", "bmu_Latn", "bmv_Latn", "bng_Beng", "bnm_Latn", "bnn_Latn", + "bno_Latn", "bnp_Latn", "bns_Deva", "boa_Latn", "bod_Tibt", "boj_Latn", + "bom_Latn", "bor_Latn", "bos_Latn", "bou_Latn", "bov_Latn", "box_Latn", + "bpr_Latn", "bps_Latn", "bqc_Latn", "bqg_Latn", "bqi_Arab", "bqj_Latn", + "bqp_Latn", "bra_Deva", "bre_Latn", "brh_Arab", "bri_Latn", "bru_Latn", + "brx_Deva", "bsc_Latn", "bsh_Arab", "bsj_Latn", "bsk_Latn", "bsq_Latn", + "bss_Latn", "bsy_Latn", "btd_Latn", "btm_Latn", "bts_Latn", "btt_Latn", + "btv_Arab", "btx_Latn", "bud_Latn", "bug_Latn", "bul_Cyrl", "bum_Latn", + "buo_Latn", "bus_Latn", "bux_Latn", "bvb_Latn", "bvc_Latn", "bvz_Latn", + "bwq_Latn", "bwr_Latn", "bwu_Latn", "bxf_Latn", "bxk_Latn", "byc_Latn", + "byr_Latn", "bys_Latn", "byv_Latn", "byx_Latn", "bzh_Latn", "bzi_Thai", + "bzj_Latn", "bzw_Latn", "caa_Latn", "cab_Latn", "cac_Latn", "cak_Latn", + "cap_Latn", "car_Latn", "cas_Latn", "cat_Latn", "cax_Latn", "cbc_Latn", + "cbi_Latn", "cbr_Latn", "cbs_Latn", "cbt_Latn", "cbu_Latn", "cbv_Latn", + "cce_Latn", "ccg_Latn", "cco_Latn", "cdj_Deva", "cdo_Hans", "ceb_Latn", + "ceg_Latn", "cek_Latn", "cen_Latn", "ces_Latn", "cfa_Latn", "cfm_Latn", + "cgc_Latn", "cgg_Latn", "che_Cyrl", "chf_Latn", "chq_Latn", "chv_Cyrl", + "chz_Latn", "cjk_Latn", "cjo_Latn", "cjp_Latn", "cjs_Cyrl", "ckb_Arab", + "ckl_Latn", "cko_Latn", "ckr_Latn", "ckt_Cyrl", "cky_Latn", "cla_Latn", + "cle_Latn", "cly_Latn", "cme_Latn", "cmn_Hans", "cmn_Hant", "cmo_Khmr", + "cmo_Latn", "cmr_Latn", "cnh_Latn", "cni_Latn", "cnl_Latn", "cnt_Latn", + "coe_Latn", "cof_Latn", "cok_Latn", "con_Latn", "cor_Latn", "cot_Latn", + 
"cou_Latn", "cpa_Latn", "cpb_Latn", "cpu_Latn", "cpx_Hans", "cpy_Latn", + "crh_Cyrl", "crk_Cans", "crk_Latn", "crn_Latn", "crq_Latn", "crs_Latn", + "crt_Latn", "csk_Latn", "cso_Latn", "ctd_Latn", "cte_Latn", "ctg_Beng", + "ctl_Latn", "cto_Latn", "ctu_Latn", "cuc_Latn", "cui_Latn", "cuk_Latn", + "cul_Latn", "cut_Latn", "cux_Latn", "cwa_Latn", "cwe_Latn", "cwt_Latn", + "cya_Latn", "cym_Latn", "daa_Latn", "dag_Latn", "dah_Latn", "dan_Latn", + "dar_Cyrl", "dav_Latn", "dbd_Latn", "dbj_Latn", "dbq_Latn", "dcc_Arab", + "ddn_Latn", "ded_Latn", "deg_Latn", "des_Latn", "deu_Latn", "dga_Latn", + "dgh_Latn", "dgi_Latn", "dgk_Latn", "dgo_Deva", "dgr_Latn", "dhi_Deva", + "did_Latn", "dig_Latn", "dik_Latn", "dip_Latn", "div_Thaa", "dje_Latn", + "djk_Latn", "dmk_Arab", "dml_Arab", "dnj_Latn", "dnt_Latn", "dnw_Latn", + "dop_Latn", "dos_Latn", "dru_Latn", "dsb_Latn", "dsh_Latn", "dtp_Latn", + "dts_Latn", "dty_Deva", "dua_Latn", "dug_Latn", "dwr_Latn", "dyi_Latn", + "dyo_Latn", "dyu_Latn", "dzg_Latn", "dzo_Tibt", "ebu_Latn", "ego_Latn", + "eip_Latn", "eiv_Latn", "eka_Latn", "ekk_Latn", "eko_Latn", "ekr_Latn", + "ell_Grek", "ell_Grek_cypr1249", "elm_Latn", "emp_Latn", "enb_Latn", "eng_Latn", + "enx_Latn", "epo_Latn", "ese_Latn", "ess_Latn", "esu_Latn", "eto_Latn", + "ets_Latn", "etu_Latn", "eus_Latn", "evn_Cyrl", "ewe_Latn", "ewo_Latn", + "eyo_Latn", "eza_Latn", "fal_Latn", "fan_Latn", "fao_Latn", "far_Latn", + "fas_Arab", "fat_Latn", "fia_Latn", "fij_Latn", "fil_Latn", "fin_Latn", + "fip_Latn", "fkk_Latn", "flr_Latn", "fmp_Latn", "fmu_Deva", "fon_Latn", + "fra_Latn", "frd_Latn", "fry_Latn", "fub_Latn", "fuc_Latn", "fue_Latn", + "ful_Latn", "fuq_Latn", "fuv_Latn", "gag_Cyrl", "gag_Latn", "gai_Latn", + "gam_Latn", "gau_Telu", "gbi_Latn", "gbk_Deva", "gbm_Deva", "gbo_Latn", + "gbr_Latn", "gby_Latn", "gcc_Latn", "gde_Latn", "gdf_Latn", "geb_Latn", + "gej_Latn", "ges_Latn", "ggg_Arab", "gid_Latn", "gig_Arab", "gil_Latn", + "giz_Latn", "gjk_Arab", "gjn_Latn", "gju_Arab", "gkn_Latn", 
"gld_Cyrl", + "gle_Latn", "glg_Latn", "glk_Arab", "glv_Latn", "glw_Latn", "gmv_Latn", + "gna_Latn", "gnd_Latn", "gng_Latn", "gof_Latn", "gog_Latn", "gol_Latn", + "gom_Deva", "gor_Latn", "gqr_Latn", "grc_Grek", "gri_Latn", "grn_Latn", + "grt_Beng", "gsl_Latn", "gso_Latn", "gub_Latn", "guc_Latn", "gud_Latn", + "gug_Latn", "guh_Latn", "gui_Latn", "guj_Gujr", "guk_Ethi", "gum_Latn", + "guo_Latn", "guq_Latn", "gur_Latn", "guu_Latn", "gux_Latn", "guz_Latn", + "gvc_Latn", "gvl_Latn", "gwc_Arab", "gwe_Latn", "gwi_Latn", "gwr_Latn", + "gwt_Arab", "gym_Latn", "gyr_Latn", "gyz_Latn", "had_Latn", "hag_Latn", + "hah_Latn", "hak_Latn", "hao_Latn", "hap_Latn", "hat_Latn", "hau_Latn", + "haw_Latn", "hay_Latn", "hbb_Latn", "hch_Latn", "heb_Hebr", "heh_Latn", + "her_Latn", "hia_Latn", "hif_Latn", "hig_Latn", "hil_Latn", "hin_Deva", + "hkk_Latn", "hla_Latn", "hlb_Deva", "hlt_Latn", "hne_Deva", "hnn_Latn", + "hno_Arab", "hns_Latn", "hoc_Orya", "hrv_Latn", "hsb_Latn", "hto_Latn", + "hub_Latn", "hue_Latn", "hui_Latn", "hul_Latn", "hun_Latn", "hus_Latn", + "huu_Latn", "huv_Latn", "hux_Latn", "hvn_Latn", "hwc_Latn", "hwo_Latn", + "hye_Armn", "hyw_Armn", "iba_Latn", "ibb_Latn", "ibo_Latn", "icr_Latn", + "ida_Latn", "idd_Latn", "idu_Latn", "ifa_Latn", "ifb_Latn", "ife_Latn", + "ifk_Latn", "ifu_Latn", "ify_Latn", "igl_Latn", "ign_Latn", "ijc_Latn", + "ijn_Latn", "ikk_Latn", "ikw_Latn", "ilb_Latn", "ilo_Latn", "imo_Latn", + "ina_Latn", "inb_Latn", "ind_Latn", "iou_Latn", "ipi_Latn", "ipk_Latn", + "iqw_Latn", "iri_Latn", "irk_Latn", "ish_Latn", "isl_Latn", "iso_Latn", + "ita_Latn", "itl_Cyrl", "its_Latn", "itv_Latn", "itw_Latn", "itz_Latn", + "ixl_Latn", "izr_Latn", "izz_Latn", "jac_Latn", "jal_Latn", "jam_Latn", + "jav_Latn", "jax_Latn", "jbu_Latn", "jen_Latn", "jic_Latn", "jiv_Latn", + "jmc_Latn", "jmd_Latn", "jmx_Latn", "jpn_Jpan", "jqr_Latn", "juk_Latn", + "jun_Orya", "juo_Latn", "jvn_Latn", "kaa_Cyrl", "kab_Latn", "kac_Latn", + "kai_Latn", "kaj_Latn", "kak_Latn", "kam_Latn", "kan_Knda", 
"kao_Latn", + "kaq_Latn", "kas_Arab", "kat_Geor", "kay_Latn", "kaz_Cyrl", "kbd_Cyrl", + "kbl_Latn", "kbo_Latn", "kbp_Latn", "kbq_Latn", "kbr_Latn", "kbt_Latn", + "kby_Latn", "kca_Cyrl", "kcg_Latn", "kcn_Latn", "kcq_Latn", "kdc_Latn", + "kde_Latn", "kdh_Latn", "kdi_Latn", "kdj_Latn", "kdl_Latn", "kdn_Latn", + "kdt_Khmr", "kea_Latn", "kek_Latn", "ken_Latn", "keo_Latn", "ker_Latn", + "keu_Latn", "key_Telu", "kez_Latn", "kfb_Deva", "kff_Telu", "kfk_Deva", + "kfq_Deva", "kfr_Gujr", "kfw_Latn", "kfx_Deva", "kha_Latn", "khg_Tibt", + "khk_Cyrl", "khm_Khmr", "khq_Latn", "khw_Arab", "kia_Latn", "kij_Latn", + "kik_Latn", "kin_Latn", "kir_Cyrl", "kix_Latn", "kjb_Latn", "kjc_Latn", + "kje_Latn", "kjg_Latn", "kjh_Cyrl", "kjk_Latn", "kki_Latn", "kkj_Latn", + "kle_Deva", "kln_Latn", "kls_Latn", "klu_Latn", "klv_Latn", "klw_Latn", + "kma_Latn", "kmd_Latn", "kml_Latn", "kmr_Arab", "kmr_Cyrl", "kmr_Latn", + "kmu_Latn", "kmy_Latn", "kna_Latn", "knb_Latn", "knc_Latn", "kne_Latn", + "knf_Latn", "knj_Latn", "knk_Latn", "knn_Deva", "kno_Latn", "kog_Latn", + "kol_Latn", "koo_Latn", "kor_Hang", "kpo_Latn", "kpq_Latn", "kps_Latn", + "kpv_Cyrl", "kpy_Cyrl", "kpz_Latn", "kqe_Latn", "kqo_Latn", "kqp_Latn", + "kqr_Latn", "kqy_Ethi", "krc_Cyrl", "kri_Latn", "krj_Latn", "krl_Latn", + "krr_Khmr", "krs_Latn", "kru_Deva", "krx_Latn", "ksb_Latn", "ksd_Latn", + "ksf_Latn", "ksr_Latn", "kss_Latn", "ksz_Deva", "ktb_Ethi", "ktj_Latn", + "kto_Latn", "kua_Latn", "kub_Latn", "kue_Latn", "kuh_Latn", "kum_Cyrl", + "kur_Arab", "kus_Latn", "kvn_Latn", "kvw_Latn", "kvx_Arab", "kwd_Latn", + "kwf_Latn", "kwi_Latn", "kwm_Latn", "kxc_Ethi", "kxf_Latn", "kxm_Thai", + "kxp_Arab", "kyb_Latn", "kyc_Latn", "kyf_Latn", "kyg_Latn", "kyo_Latn", + "kyq_Latn", "kyu_Kali", "kyx_Latn", "kyz_Latn", "kzf_Latn", "kzi_Latn", + "lac_Latn", "lag_Latn", "laj_Latn", "lam_Latn", "lao_Laoo", "las_Latn", + "lat_Latn", "lav_Latn", "law_Latn", "lbj_Tibt", "lbw_Latn", "lcm_Latn", + "lcp_Thai", "ldb_Latn", "led_Latn", "lee_Latn", "lef_Latn", 
"lem_Latn", + "lew_Latn", "lex_Latn", "lgg_Latn", "lgl_Latn", "lhu_Latn", "lia_Latn", + "lid_Latn", "lif_Deva", "lij_Latn", "lin_Latn", "lip_Latn", "lir_Latn", + "lis_Lisu", "lit_Latn", "lje_Latn", "ljp_Latn", "lkb_Latn", "lke_Latn", + "lla_Latn", "lld_Latn_gherd", "lld_Latn_valbadia", "llg_Latn", "lln_Latn", "lme_Latn", + "lnd_Latn", "lns_Latn", "lnu_Latn", "loa_Latn", "lob_Latn", "lok_Latn", + "lom_Latn", "lon_Latn", "loq_Latn", "lrk_Arab", "lsi_Latn", "lsm_Latn", + "lss_Arab", "ltg_Latn", "lth_Latn", "lto_Latn", "ltz_Latn", "lua_Latn", + "luc_Latn", "lug_Latn", "luo_Latn", "lus_Latn", "lwg_Latn", "lwo_Latn", + "lww_Latn", "lzz_Latn", "maa_Latn", "mab_Latn", "mad_Latn", "maf_Latn", + "mag_Deva", "mah_Latn", "mai_Deva", "maj_Latn", "mak_Latn", "mal_Mlym", + "mam_Latn", "maq_Latn", "mar_Deva", "mau_Latn", "maw_Latn", "max_Latn", + "maz_Latn", "mbb_Latn", "mbc_Latn", "mbh_Latn", "mbj_Latn", "mbt_Latn", + "mbu_Latn", "mca_Latn", "mcb_Latn", "mcd_Latn", "mcf_Latn", "mco_Latn", + "mcp_Latn", "mcq_Latn", "mcu_Latn", "mcx_Latn", "mda_Latn", "mdd_Latn", + "mdv_Latn", "mdy_Ethi", "med_Latn", "mee_Latn", "meh_Latn", "mej_Latn", + "mek_Latn", "mel_Latn", "men_Latn", "meq_Latn", "mer_Latn", "met_Latn", + "meu_Latn", "mev_Latn", "mfe_Latn", "mfh_Latn", "mfi_Latn", "mfk_Latn", + "mfm_Latn", "mfn_Latn", "mfo_Latn", "mfq_Latn", "mfv_Latn", "mfy_Latn", + "mfz_Latn", "mgd_Latn", "mge_Latn", "mgg_Latn", "mgh_Latn", "mgi_Latn", + "mgo_Latn", "mhi_Latn", "mhk_Latn", "mhr_Cyrl", "mhu_Latn", "mhx_Latn", + "mhy_Latn", "mib_Latn", "mie_Latn", "mif_Latn", "mig_Latn", "mih_Latn", + "mil_Latn", "mim_Latn", "min_Latn", "mio_Latn", "mip_Latn", "miq_Latn", + "mit_Latn", "miu_Latn", "miy_Latn", "miz_Latn", "mjl_Deva", "mjv_Mlym", + "mkd_Cyrl", "mkf_Latn", "mki_Arab", "mkl_Latn", "mkn_Latn", "mlg_Latn", + "mlq_Latn", "mlt_Latn", "mmc_Latn", "mmg_Latn", "mnb_Latn", "mne_Latn", + "mnf_Latn", "mni_Beng", "mnk_Latn", "mnw_Mymr", "mnx_Latn", "moa_Latn", + "mog_Latn", "mon_Cyrl", "mop_Latn", 
"mor_Latn", "mos_Latn", "mox_Latn", + "moz_Latn", "mpg_Latn", "mpm_Latn", "mpp_Latn", "mpx_Latn", "mqb_Latn", + "mqf_Latn", "mqj_Latn", "mqn_Latn", "mqy_Latn", "mri_Latn", "mrj_Cyrl", + "mrr_Deva", "mrt_Latn", "mrw_Latn", "msh_Latn", "msi_Latn", "msw_Latn", + "msy_Latn", "mtd_Latn", "mtj_Latn", "mto_Latn", "mtr_Deva", "mtu_Latn", + "mtx_Latn", "mua_Latn", "mug_Latn", "muh_Latn", "mui_Latn", "mup_Deva", + "mur_Latn", "muv_Mlym", "muy_Latn", "mve_Arab", "mvp_Latn", "mvy_Arab", + "mwq_Latn", "mwv_Latn", "mxb_Latn", "mxq_Latn", "mxs_Latn", "mxt_Latn", + "mxu_Latn", "mxv_Latn", "mxy_Latn", "mya_Mymr", "myb_Latn", "myk_Latn", + "myv_Cyrl", "myx_Latn", "myy_Latn", "mza_Latn", "mzi_Latn", "mzj_Latn", + "mzk_Latn", "mzl_Latn", "mzm_Latn", "mzw_Latn", "nab_Latn", "nag_Latn", + "nal_Latn", "nan_Latn", "nap_Latn", "nas_Latn", "naw_Latn", "nbh_Latn", + "nca_Latn", "ncf_Latn", "nch_Latn", "ncj_Latn", "ncl_Latn", "nco_Latn", + "ncu_Latn", "ncx_Latn", "ndi_Latn", "ndj_Latn", "ndo_Latn", "ndp_Latn", + "ndv_Latn", "ndy_Latn", "ndz_Latn", "neb_Latn", "nep_Deva", "new_Deva", + "nfa_Latn", "nfr_Latn", "nga_Latn", "ngi_Latn", "ngl_Latn", "ngp_Latn", + "ngu_Latn", "nhe_Latn", "nhg_Latn", "nhi_Latn", "nhn_Latn", "nhq_Latn", + "nhu_Latn", "nhw_Latn", "nhx_Latn", "nhy_Latn", "nia_Latn", "nij_Latn", + "nim_Latn", "nin_Latn", "nja_Latn", "nko_Latn", "nla_Latn", "nlc_Latn", + "nld_Latn", "nlg_Latn", "nlk_Latn", "nlv_Latn", "nmg_Latn", "nmz_Latn", + "nnb_Latn", "nnh_Latn", "nnq_Latn", "nnw_Latn", "noa_Latn", "nob_Latn", + "nod_Thai", "noe_Deva", "nog_Cyrl", "not_Latn", "npl_Latn", "npy_Latn", + "nso_Latn", "nst_Latn", "nsu_Latn", "ntm_Latn", "ntr_Latn", "nuj_Latn", + "nup_Latn", "nus_Latn", "nuz_Latn", "nwb_Latn", "nxq_Latn", "nya_Latn", + "nyf_Latn", "nyn_Latn", "nyo_Latn", "nyu_Latn", "nyy_Latn", "nzi_Latn", + "obo_Latn", "oci_Latn", "odk_Arab", "odu_Latn", "ogo_Latn", "ojb_Cans", + "ojb_Latn", "oku_Latn", "old_Latn", "omw_Latn", "onb_Latn", "ood_Latn", + "orc_Latn", "orm_Latn", "oru_Arab", 
"ory_Orya", "oss_Cyrl", "ote_Latn", + "otq_Latn", "ozm_Latn", "pab_Latn", "pad_Latn", "pag_Latn", "pam_Latn", + "pan_Guru", "pao_Latn", "pap_Latn", "pau_Latn", "pbb_Latn", "pbc_Latn", + "pbi_Latn", "pbs_Latn", "pbt_Arab", "pbu_Arab", "pce_Thai", "pcm_Latn", + "pex_Latn", "pez_Latn", "phl_Arab", "phr_Arab", "pib_Latn", "pil_Latn", + "pip_Latn", "pir_Latn", "pis_Latn", "piy_Latn", "pjt_Latn", "pkb_Latn", + "pko_Latn", "plk_Arab", "pls_Latn", "plt_Latn", "plw_Latn", "pmf_Latn", + "pmq_Latn", "pms_Latn", "pmy_Latn", "pnb_Arab", "pne_Latn", "pny_Latn", + "poc_Latn", "poe_Latn", "poh_Latn", "poi_Latn", "pol_Latn", "por_Latn", + "pov_Latn", "pow_Latn", "poy_Latn", "ppk_Latn", "pps_Latn", "prf_Latn", + "prk_Latn", "prq_Latn", "prt_Thai", "pse_Latn", "pss_Latn", "pst_Arab", + "ptu_Latn", "pua_Latn", "pui_Latn", "pus_Arab", "pwg_Latn", "pwn_Latn", + "pww_Thai", "pxm_Latn", "qub_Latn", "quc_Latn", "quf_Latn", "qug_Latn", + "quh_Latn", "qul_Latn", "qum_Latn", "qup_Latn", "qur_Latn", "qus_Latn", + "quv_Latn", "quw_Latn", "qux_Latn", "quy_Latn", "quz_Latn", "qva_Latn", + "qvc_Latn", "qve_Latn", "qvh_Latn", "qvi_Latn", "qvj_Latn", "qvl_Latn", + "qvm_Latn", "qvn_Latn", "qvo_Latn", "qvs_Latn", "qvw_Latn", "qvz_Latn", + "qwa_Latn", "qwh_Latn", "qws_Latn", "qxa_Latn", "qxh_Latn", "qxl_Latn", + "qxn_Latn", "qxo_Latn", "qxp_Latn", "qxr_Latn", "qxt_Latn", "qxu_Latn", + "qxw_Latn", "rag_Latn", "rah_Beng", "rai_Latn", "rap_Latn", "rav_Deva", + "raw_Latn", "rej_Latn", "rel_Latn", "rgu_Latn", "rhg_Latn", "rif_Arab", + "rif_Latn", "rim_Latn", "rjs_Deva", "rkt_Beng", "rmc_Cyrl", "rmc_Latn", + "rmo_Latn", "rmy_Cyrl", "rmy_Latn", "rng_Latn", "rnl_Latn", "rob_Latn", + "rof_Latn", "roh_Latn_surs1244", "rol_Latn", "ron_Latn", "roo_Latn", "rop_Latn", + "rro_Latn", "rth_Latn", "rub_Latn", "ruc_Latn", "ruf_Latn", "rug_Latn", + "run_Latn", "rus_Cyrl", "rwm_Latn", "rwr_Deva", "sab_Latn", "sag_Latn", + "sah_Cyrl", "saj_Latn", "saq_Latn", "sas_Latn", "sau_Latn", "say_Latn", + "sba_Latn", "sbd_Latn", 
"sbl_Latn", "sbn_Arab", "sbp_Latn", "sch_Latn", + "sck_Deva", "scl_Arab", "scn_Latn", "sco_Latn", "sda_Latn", "sdo_Latn", + "sea_Latn", "seh_Latn", "sei_Latn", "ses_Latn", "sey_Latn", "sgb_Latn", + "sgj_Deva", "sgw_Ethi", "shi_Latn", "shk_Latn", "shn_Mymr", "sho_Latn", + "shp_Latn", "sid_Latn", "sig_Latn", "sil_Latn", "sin_Sinh", "sip_Tibt", + "siw_Latn", "sja_Latn", "sjm_Latn", "sjp_Deva", "sjr_Latn", "skg_Latn", + "skr_Arab", "sld_Latn", "slk_Latn", "slu_Latn", "slv_Latn", "sml_Latn", + "smo_Latn", "sna_Latn", "snc_Latn", "snd_Arab", "sne_Latn", "snk_Latn", + "snn_Latn", "snp_Latn", "snv_Latn", "snw_Latn", "sol_Latn", "som_Latn", + "soy_Latn", "spa_Latn", "spp_Latn", "sps_Latn", "spy_Latn", "src_Latn", + "srd_Latn", "sri_Latn", "srm_Latn", "srn_Latn", "sro_Latn", "srp_Cyrl", + "srr_Latn", "srx_Deva", "ssi_Arab", "ste_Latn", "stn_Latn", "stp_Latn", + "sua_Latn", "suc_Latn", "suk_Latn", "sun_Latn", "sur_Latn", "sus_Latn", + "suv_Latn", "suz_Deva", "sva_Geor", "swe_Latn", "swh_Latn", "swv_Deva", + "sxb_Latn", "sxn_Latn", "sya_Latn", "syl_Latn", "sza_Latn", "szy_Latn", + "tac_Latn", "taj_Deva", "tam_Taml", "tan_Latn", "tao_Latn", "tap_Latn", + "taq_Latn", "tar_Latn", "tat_Cyrl", "tav_Latn", "tay_Latn", "tbc_Latn", + "tbf_Latn", "tbg_Latn", "tbk_Latn", "tbl_Latn", "tby_Latn", "tbz_Latn", + "tca_Latn", "tcc_Latn", "tcf_Latn", "tcy_Mlym", "tcz_Latn", "tdj_Latn", + "tdn_Latn", "tdx_Latn", "ted_Latn", "tee_Latn", "tel_Telu", "tem_Latn", + "teo_Latn", "ter_Latn", "tew_Latn", "tex_Latn", "tfr_Latn", "tgc_Latn", + "tgj_Latn", "tgk_Cyrl", "tgl_Latn", "tgo_Latn", "tgp_Latn", "tha_Thai", + "the_Deva", "thk_Latn", "thl_Deva", "thq_Deva", "thr_Deva", "thv_Tfng", + "tig_Ethi", "tih_Latn", "tik_Latn", "tio_Latn", "tir_Ethi", "tkg_Latn", + "tkr_Latn", "tkt_Deva", "tlb_Latn", "tli_Latn", "tlj_Latn", "tlp_Latn", + "tly_Latn", "tmc_Latn", "tmf_Latn", "tna_Latn", "tng_Latn", "tnk_Latn", + "tnn_Latn", "tnp_Latn", "tnr_Latn", "tnt_Latn", "tob_Latn", "toc_Latn", + "toh_Latn", "tok_Latn", 
"tom_Latn", "top_Latn", "tos_Latn", "tpi_Latn", + "tpl_Latn", "tpm_Latn", "tpp_Latn", "tpt_Latn", "tpz_Latn", "tqp_Latn", + "trc_Latn", "tri_Latn", "trn_Latn", "trp_Latn", "trq_Latn", "trs_Latn", + "trv_Latn", "trw_Arab", "tsn_Latn", "tso_Latn", "tsz_Latn", "ttc_Latn", + "tte_Latn", "ttj_Latn", "ttq_Tfng", "ttr_Latn", "ttu_Latn", "tue_Latn", + "tuf_Latn", "tui_Latn", "tuk_Arab", "tuk_Latn", "tul_Latn", "tuo_Latn", + "tuq_Latn", "tur_Latn", "tuv_Latn", "tuy_Latn", "tvo_Latn", "tvu_Latn", + "tvw_Latn", "twb_Latn", "twe_Latn", "twu_Latn", "txa_Latn", "txq_Latn", + "txs_Latn", "txu_Latn", "txy_Latn", "tye_Latn", "tzh_Latn", "tzj_Latn", + "tzo_Latn", "ubl_Latn", "ubu_Latn", "udl_Latn", "udm_Cyrl", "udu_Latn", + "uig_Arab", "uig_Cyrl", "uki_Orya", "ukr_Cyrl", "ukv_Latn", "umb_Latn", + "upv_Latn", "ura_Latn", "urb_Latn", "urd_Arab", "urd_Deva", "urd_Latn", + "urh_Latn", "urk_Thai", "urt_Latn", "ury_Latn", "ush_Arab", "usp_Latn", + "uzb_Cyrl", "uzb_Latn", "uzn_Latn", "vag_Latn", "vah_Deva", "vai_Latn", + "var_Latn", "ver_Latn", "vid_Latn", "vie_Latn", "vif_Latn", "vmc_Latn", + "vmj_Latn", "vmm_Latn", "vmp_Latn", "vmw_Latn", "vmy_Latn", "vmz_Latn", + "vro_Latn", "vun_Latn", "vut_Latn", "wal_Ethi", "wal_Latn", "wap_Latn", + "war_Latn", "waw_Latn", "way_Latn", "wba_Latn", "wbl_Latn", "wbr_Deva", + "wci_Latn", "weo_Latn", "wes_Latn", "wja_Latn", "wji_Latn", "wlo_Latn", + "wlx_Latn", "wmw_Latn", "wob_Latn", "wof_Latn", "wol_Latn", "wsg_Telu", + "wwa_Latn", "xal_Cyrl", "xdy_Latn", "xed_Latn", "xer_Latn", "xhe_Arab", + "xho_Latn", "xka_Arab", "xkl_Latn", "xmf_Geor", "xmm_Latn", "xmv_Latn", + "xnj_Latn", "xnr_Deva", "xog_Latn", "xon_Latn", "xpe_Latn", "xrb_Latn", + "xsb_Latn", "xsm_Latn", "xsr_Deva", "xsu_Latn", "xta_Latn", "xtd_Latn", + "xte_Latn", "xti_Latn", "xtm_Latn", "xtn_Latn", "xtu_Latn", "xua_Taml", + "xuo_Latn", "yaa_Latn", "yad_Latn", "yal_Latn", "yam_Latn", "yao_Latn", + "yaq_Latn", "yas_Latn", "yat_Latn", "yav_Latn", "yay_Latn", "yaz_Latn", + "yba_Latn", "ybb_Latn", 
"ycl_Latn", "ycn_Latn", "ydd_Hebr", "ydg_Arab", + "yea_Mlym", "yer_Latn", "yes_Latn", "yka_Latn", "yli_Latn", "yor_Latn", + "yre_Latn", "yua_Latn", "yue_Hans", "yue_Hant", "yuz_Latn", "yva_Latn", + "zaa_Latn", "zab_Latn", "zac_Latn", "zad_Latn", "zae_Latn", "zai_Latn", + "zam_Latn", "zao_Latn", "zaq_Latn", "zar_Latn", "zas_Latn", "zav_Latn", + "zaw_Latn", "zca_Latn", "zga_Latn", "zim_Latn", "ziw_Latn", "zmz_Latn", + "zne_Latn", "zoc_Latn", "zoh_Latn", "zor_Latn", "zos_Latn", "zpc_Latn", + "zpg_Latn", "zpi_Latn", "zpl_Latn", "zpm_Latn", "zpo_Latn", "zpt_Latn", + "zpu_Latn", "zpv_Latn", "zpy_Latn", "zpz_Latn", "zsm_Latn", "ztg_Latn", + "ztn_Latn", "ztp_Latn", "ztq_Latn", "zts_Latn", "ztu_Latn", "zty_Latn", + "zul_Latn", "zyb_Latn", "zyp_Latn", "zza_Latn", +]; + +export const OMNI_ASR_SUPPORTED_LANG_SET: ReadonlySet = new Set(OMNI_ASR_SUPPORTED_LANGS); diff --git a/src/copilotSettings/copilotSettings.ts b/src/copilotSettings/copilotSettings.ts index fbe927f47..073fd4953 100644 --- a/src/copilotSettings/copilotSettings.ts +++ b/src/copilotSettings/copilotSettings.ts @@ -122,11 +122,10 @@ export async function openSystemMessageEditor() { try { const config = vscode.workspace.getConfiguration("codex-editor-extension"); const settings = { - endpoint: config.get("asrEndpoint", "wss://ryderwishart--asr-websocket-transcription-fastapi-asgi.modal.run/ws/transcribe"), - provider: config.get("asrProvider", "mms"), - model: config.get("asrModel", "facebook/mms-1b-all"), + endpoint: config.get("asrEndpoint", "https://genesis-ai-dev--codex-asr-serve.modal.run/transcribe"), + provider: config.get("asrProvider", "omniasr"), + model: config.get("asrModel", "omniASR_LLM_1B_v2"), language: config.get("asrLanguage", "eng"), - phonetic: config.get("asrPhonetic", false), }; panel.webview.postMessage({ command: "asrSettings", data: settings }); } catch (error) { @@ -143,7 +142,6 @@ export async function openSystemMessageEditor() { await config.update("asrProvider", message.data?.provider, 
target); await config.update("asrModel", message.data?.model, target); await config.update("asrLanguage", message.data?.language, target); - await config.update("asrPhonetic", !!message.data?.phonetic, target); panel.webview.postMessage({ command: "asrSettingsSaved" }); } catch (error) { console.error("[CopilotSettings] Failed to save ASR settings:", error); diff --git a/src/providers/codexCellEditorProvider/codexCellEditorMessagehandling.ts b/src/providers/codexCellEditorProvider/codexCellEditorMessagehandling.ts index 39a560b99..bb8d18069 100644 --- a/src/providers/codexCellEditorProvider/codexCellEditorMessagehandling.ts +++ b/src/providers/codexCellEditorProvider/codexCellEditorMessagehandling.ts @@ -484,6 +484,25 @@ const messageHandlers: Record Promise("asrEndpoint", "http://localhost:8000/api/v1/asr/transcribe"); + // ASR language plumbing — see sharedUtils/asrLanguageUtils.ts for the resolver + // contract. The webview drives "auto-detect" vs "use project language" via the + // gear menu on the Transcribe button; that picker is persisted to the workspace + // setting `asrLanguageMode`. + const { resolveOmniAsrCode } = await import("../../../sharedUtils/asrLanguageUtils"); + const projectConfig = vscode.workspace.getConfiguration("codex-project-manager"); + const targetLanguage = projectConfig.get("targetLanguage") as + | { tag?: string; refName?: string; iso1?: string; iso2t?: string; iso2b?: string; } + | undefined; + const languageMode = (config.get("asrLanguageMode", "project") === "auto" + ? "auto" + : "project") as "auto" | "project"; + const scriptPref = config.get("asrScriptPref", "auto"); + const resolvedCode = + languageMode === "auto" + ? 
undefined + : resolveOmniAsrCode(targetLanguage, scriptPref); + const projectLanguageName = targetLanguage?.refName; + let authToken: string | undefined; // Try to get authenticated endpoint from FrontierAPI @@ -536,10 +555,17 @@ const messageHandlers: Record Promise Promise { + const typedEvent = event as Extract; + const mode = typedEvent.content?.mode === "auto" ? "auto" : "project"; + try { + await vscode.workspace + .getConfiguration("codex-editor-extension") + .update("asrLanguageMode", mode, vscode.ConfigurationTarget.Workspace); + } catch (err) { + console.warn("Failed to update asrLanguageMode", err); + } + // Rebroadcast so the webview can refresh its local asrConfig snapshot. + await messageHandlers.getAsrConfig({ webviewPanel } as any); + }, + + setAsrScriptPref: async ({ event, webviewPanel }) => { + const typedEvent = event as Extract; + const rawPref = typedEvent.content?.scriptPref; + // Accept "auto", "latin", or any 4-letter ISO 15924 tag. Anything else falls back to "auto". + const isFourLetter = typeof rawPref === "string" && /^[A-Za-z]{4}$/.test(rawPref); + const normalized = + rawPref === "auto" || rawPref === "latin" + ? rawPref + : isFourLetter + ? 
rawPref!.charAt(0).toUpperCase() + rawPref!.slice(1).toLowerCase() + : "auto"; + try { + await vscode.workspace + .getConfiguration("codex-editor-extension") + .update("asrScriptPref", normalized, vscode.ConfigurationTarget.Workspace); + } catch (err) { + console.warn("Failed to update asrScriptPref", err); + } + await messageHandlers.getAsrConfig({ webviewPanel } as any); + }, + updateCellAfterTranscription: async ({ event, document, webviewPanel, provider }) => { const typedEvent = event as Extract; const { cellId, transcribedText, language } = typedEvent.content; @@ -574,7 +632,12 @@ const messageHandlers: Record Promise - + Codex Cell Editor diff --git a/src/providers/mainMenu/mainMenuProvider.ts b/src/providers/mainMenu/mainMenuProvider.ts index 0395e4692..e8f60e9f4 100644 --- a/src/providers/mainMenu/mainMenuProvider.ts +++ b/src/providers/mainMenu/mainMenuProvider.ts @@ -705,7 +705,7 @@ export class MainMenuProvider extends BaseWebviewProvider { } case "getAsrSettings": { const config = vscode.workspace.getConfiguration("codex-editor-extension"); - let endpoint = config.get("asrEndpoint", "wss://ryderwishart--asr-websocket-transcription-fastapi-asgi.modal.run/ws/transcribe"); + let endpoint = config.get("asrEndpoint", "https://genesis-ai-dev--codex-asr-serve.modal.run/transcribe"); let authToken: string | undefined; // Try to get authenticated endpoint from FrontierAPI @@ -745,7 +745,7 @@ export class MainMenuProvider extends BaseWebviewProvider { new URL(endpoint); } catch (urlError) { console.error("Invalid ASR endpoint configuration:", endpoint, urlError); - endpoint = "wss://ryderwishart--asr-websocket-transcription-fastapi-asgi.modal.run/ws/transcribe"; + endpoint = "https://genesis-ai-dev--codex-asr-serve.modal.run/transcribe"; } // Warn if using authenticated endpoint without token @@ -756,10 +756,9 @@ export class MainMenuProvider extends BaseWebviewProvider { const settings = { endpoint, - provider: config.get("asrProvider", "mms"), - model: 
config.get("asrModel", "facebook/mms-1b-all"), + provider: config.get("asrProvider", "omniasr"), + model: config.get("asrModel", "omniASR_LLM_1B_v2"), language: config.get("asrLanguage", "eng"), - phonetic: config.get("asrPhonetic", false), authToken, }; if (this._view) { @@ -774,7 +773,6 @@ export class MainMenuProvider extends BaseWebviewProvider { await config.update("asrProvider", (message as any).data?.provider, target); await config.update("asrModel", (message as any).data?.model, target); await config.update("asrLanguage", (message as any).data?.language, target); - await config.update("asrPhonetic", !!(message as any).data?.phonetic, target); if (this._view) { safePostMessageToView(this._view, { command: "asrSettingsSaved" }, "MainMenu"); } diff --git a/types/index.d.ts b/types/index.d.ts index 4a62f3622..55445c080 100644 --- a/types/index.d.ts +++ b/types/index.d.ts @@ -576,10 +576,24 @@ export type EditorPostMessages = content: { cellId: string; transcribedText: string; - language: string; + /** OmniASR `{iso639_3}_{Script}` code the server reported (or that we sent and the server + * used silently). `null` when transcription ran in auto-detect mode and the server did + * not echo a language back. Persisted on the audio attachment so the badge survives + * re-renders. */ + language: string | null; }; } | { command: "getAsrConfig"; } + | { + command: "setAsrLanguageMode"; + content: { mode: "auto" | "project"; }; + } + | { + command: "setAsrScriptPref"; + /** `"auto"` (best guess), `"latin"` (force Latin where supported), or a 4-letter + * ISO 15924 tag (`"Arab"`, `"Cyrl"`, ...). 
*/ + content: { scriptPref: string; }; + } | { command: "mergeCellWithPrevious"; content: { @@ -2150,7 +2164,25 @@ type EditorReceiveMessages = milestoneIndex?: number; subsectionIndex?: number; } - | { type: "asrConfig"; content: { endpoint: string; authToken?: string; }; } + | { + type: "asrConfig"; + content: { + endpoint: string; + authToken?: string; + /** OmniASR `{iso639_3}_{Script}` code to send as `?lang=...`. Omitted when the + * user picks Auto-Detect or when we can't safely resolve a code. */ + lang?: string; + /** "project" (default) → send `lang`. "auto" → omit `lang`, let the server transcribe + * without language conditioning. Persisted as workspace setting `asrLanguageMode`. */ + languageMode: "auto" | "project"; + /** Script preference: "auto" (best guess), "latin", or a 4-letter ISO 15924 tag. + * Persisted as workspace setting `asrScriptPref`. */ + scriptPref?: string; + /** Project target-language refName, e.g. "Swahili". Used as the badge fallback when + * the server doesn't echo `lang` in the response. 
*/ + projectLanguageName?: string; + }; + } | { type: "startBatchTranscription"; content: { count: number; }; } | { type: "providerConfirmsBacktranslationSet"; diff --git a/webviews/codex-webviews/src/CodexCellEditor/AudioWaveformWithTranscription.tsx b/webviews/codex-webviews/src/CodexCellEditor/AudioWaveformWithTranscription.tsx index 65cb49161..4c7052466 100644 --- a/webviews/codex-webviews/src/CodexCellEditor/AudioWaveformWithTranscription.tsx +++ b/webviews/codex-webviews/src/CodexCellEditor/AudioWaveformWithTranscription.tsx @@ -2,10 +2,23 @@ import React, { useEffect, useState } from "react"; import { CustomWaveformCanvas } from "./CustomWaveformCanvas.tsx"; import { Button } from "../components/ui/button"; import { Badge } from "../components/ui/badge"; -import { MessageCircle, Copy, Loader2, Trash2, History, Mic } from "lucide-react"; +import { MessageCircle, Copy, Loader2, Trash2, History, Mic, Settings as SettingsIcon } from "lucide-react"; import type { ValidationStatusIconProps } from "./AudioValidationStatusIcon.tsx"; import { AudioValidationBadge } from "./AudioValidationBadge.tsx"; import type { AudioValidationPopoverProps } from "./AudioValidationBadge.tsx"; +import { + Popover, + PopoverContent, + PopoverTrigger, +} from "../components/ui/popover"; +import { + Select, + SelectContent, + SelectItem, + SelectTrigger, + SelectValue, +} from "../components/ui/select"; +import { Input } from "../components/ui/input"; interface AudioWaveformWithTranscriptionProps { audioUrl: string; @@ -15,6 +28,10 @@ interface AudioWaveformWithTranscriptionProps { timestamp: number; language?: string; } | null; + /** Pre-computed friendly label for the language badge ("Swahili", "Auto Detect", or null + * for "render nothing"). Computed by the caller via `labelForTranscriptionLanguage()` + * from sharedUtils/asrLanguageUtils.ts so this component stays presentational. 
*/ + transcriptionLanguageLabel?: string | null; isTranscribing: boolean; transcriptionProgress: number; onTranscribe: () => void; @@ -31,6 +48,17 @@ interface AudioWaveformWithTranscriptionProps { targetDuration?: number | null; // Target duration (in seconds) derived from cell timestamps. /** Total number of audio recordings for the cell (including soft-deleted). When > 0, a count badge is rendered on the History button. */ historyCount?: number; + // Advanced ASR settings (gear menu, next to the Transcribe button). + /** Whether to display the gear menu. Hide on source-text editors where the user can't drive transcription policy. */ + showAdvancedAsrMenu?: boolean; + /** Current language mode. Determines the chevron position in the gear menu. */ + asrLanguageMode?: "auto" | "project"; + /** Current script preference: "auto", "latin", or a 4-letter ISO 15924 tag (e.g. "Arab"). */ + asrScriptPref?: string; + /** Friendly project-language label for the "Project language" radio (e.g. "Swahili"). */ + projectLanguageName?: string; + onChangeAsrLanguageMode?: (mode: "auto" | "project") => void; + onChangeAsrScriptPref?: (pref: string) => void; } const AudioWaveformWithTranscription: React.FC = ({ @@ -52,10 +80,40 @@ const AudioWaveformWithTranscription: React.FC { const [audioSrc, setAudioSrc] = useState(""); const [audioDuration, setAudioDuration] = useState(null); + // The Script picker offers three "preset" choices plus a free-form 4-letter input for + // power users (e.g. someone wants `swa_Cyrl` even though the resolver would never pick + // it). We track the *dropdown* selection separately from the committed `asrScriptPref` + // so picking "Custom" reveals the input even before a valid tag has been entered. + type ScriptOption = "auto" | "latin" | "custom"; + const optionFromPref = (pref: string): ScriptOption => + pref === "auto" ? "auto" : pref === "latin" ? 
"latin" : "custom"; + const [scriptSelection, setScriptSelection] = useState( + optionFromPref(asrScriptPref) + ); + const [scriptCustomDraft, setScriptCustomDraft] = useState( + optionFromPref(asrScriptPref) === "custom" ? asrScriptPref : "" + ); + useEffect(() => { + const next = optionFromPref(asrScriptPref); + setScriptSelection(next); + if (next === "custom") setScriptCustomDraft(asrScriptPref); + }, [asrScriptPref]); + const commitCustomScript = () => { + const candidate = scriptCustomDraft.trim(); + if (/^[A-Za-z]{4}$/.test(candidate)) onChangeAsrScriptPref?.(candidate); + }; + // Prefer the provided URL (can be blob: or data:). Fall back to creating an object URL from the blob. useEffect(() => { if (audioUrl) { @@ -142,11 +200,17 @@ const AudioWaveformWithTranscription: React.FC {transcription.content}

- {transcription.language && ( - - {transcription.language} - - )} + {/* Language badge intentionally hidden in this PR. + The new `codex-asr` Modal app DOES run MMS-LID and echo back a + `lang` for auto-detect (and the plumbing all the way through + `transcriptionLanguageLabel` is wired and ready), but this PR + keeps the client pointed at the existing Frontier auth-proxy ASR + endpoint, which still forwards to the legacy `mms-zeroshot-asr` + Modal app — no LID, no `lang` echo. Showing the badge in that + world means falling back to "Auto Detect" (or worse, the project + language) instead of an honest detection, which is misleading. + Re-enable this badge once the auth-proxy upstream migrates + to `codex-asr` (see docs/AUTH_SERVER_ASR_IMPLEMENTATION.md). */} - )} + {/* Transcribe / Re-transcribe split-button. The gear is glued to the right + edge of the main button (shared border, no gap) so it visually belongs + to the transcribe control. The label flips to "Re-transcribe" once a + saved transcription exists so the user can re-run with different ASR + settings (e.g. flip to auto-detect). Grey out the whole group while a + transcription is in flight. */} + {(() => { + const sharedBtnClass = + "h-8 text-xs text-[var(--vscode-button-background)] border-[var(--vscode-button-background)]/20 hover:bg-[var(--vscode-button-background)]/10"; + const transcribeDisabled = + disabled || isTranscribing || (!audioUrl && !audioBlob); + return ( +
+ + {showAdvancedAsrMenu && ( + + + + + +
+
Language
+ +
+
+
Script
+ + {scriptSelection === "custom" && ( +
+ + setScriptCustomDraft(e.target.value) + } + onKeyDown={(e) => { + if (e.key === "Enter") { + e.preventDefault(); + commitCustomScript(); + } + }} + placeholder="e.g. Arab, Cyrl, Hans" + maxLength={4} + className="h-7 text-xs" + /> + +
+ )} +
+
+
+ )} +
+ ); + })()}