diff --git a/docs/AUTH_SERVER_ASR_IMPLEMENTATION.md b/docs/AUTH_SERVER_ASR_IMPLEMENTATION.md
index 5deaa99cc..d9cd7cae6 100644
--- a/docs/AUTH_SERVER_ASR_IMPLEMENTATION.md
+++ b/docs/AUTH_SERVER_ASR_IMPLEMENTATION.md
@@ -1,402 +1,88 @@
# Auth Server ASR Proxy Implementation Guide
-## Overview
+> **This document was rewritten in 2026 to reflect the current OmniASR
+> (HTTP POST) contract.** The previous WebSocket-based MMS proxy described
+> here is no longer in use.
-The Codex Editor client now supports authenticated ASR (Automatic Speech Recognition) transcription through the Frontier auth server. This document describes what needs to be implemented on the auth server side.
+## Status
-**Status**: Client implementation is complete and deployed. Auth server implementation is required to enable the feature.
+- **Upstream service**: Meta Omnilingual ASR (`omniASR_LLM_1B_v2`), served
+ on Modal as `https://genesis-ai-dev--codex-asr-serve.modal.run`
+ (renamed from the historical `mms-zeroshot-asr` deployment — same
+ workload, model-agnostic name).
+- **Client**: Codex Editor talks to the Frontier auth-proxy via plain
+ HTTP POST (multipart). No WebSocket. See
+ [`asr-proxy-endpoint.md`](./asr-proxy-endpoint.md) for the full wire spec
+ and reference FastAPI implementation.
-## What You Need to Implement
+## What the auth server must implement
-### 1. Add `getAsrEndpoint()` Method to FrontierAPI
+### 1. `getAsrEndpoint()` on FrontierAPI
-The client expects a new method on the FrontierAPI interface that returns the authenticated ASR proxy endpoint.
-
-**Method Signature**:
```typescript
getAsrEndpoint(): Promise<string | undefined>
```
-**Returns**: The WebSocket URL for the authenticated ASR proxy (e.g., `wss://auth.frontier.com/ws/asr`)
-
-**Example Implementation**:
-```typescript
-async getAsrEndpoint(): Promise {
- if (!this.isAuthenticated) {
- return undefined;
- }
-
- // Return your ASR proxy WebSocket URL
- return "wss://auth.frontier.com/ws/asr";
- // OR from config:
- // return this.config.asrProxyUrl;
-}
-```
-
-**Pattern Reference**: This follows the exact same pattern as your existing `getLlmEndpoint()` method.
-
-### 2. Implement WebSocket Proxy Endpoint: `/ws/asr`
-
-Create a new WebSocket endpoint that:
-1. Validates the JWT token from the query parameter
-2. Proxies messages between the client and the actual ASR service (Ryder's Modal endpoint)
-3. Logs usage for authenticated users
-
-#### Endpoint Details
-
-**URL Pattern**: `wss://your-auth-server.com/ws/asr?token=JWT_TOKEN`
-
-**Authentication**: JWT token passed as query parameter `token`
-
-**Upstream Service**: `wss://ryderwishart--asr-websocket-transcription-fastapi-asgi.modal.run/ws/transcribe`
-
-#### Message Flow
-
-```
-Client → Auth Server → ASR Service (Ryder's endpoint)
- ↓ ↓ ↓
- ←─────────←──────────────←
-```
-
-1. Client sends metadata (JSON)
-2. Auth server forwards to ASR service
-3. Client sends audio (binary)
-4. Auth server forwards to ASR service
-5. ASR service sends progress/results (JSON)
-6. Auth server forwards to client
-
-## Complete Python Implementation Example
-
-Here's a complete FastAPI implementation you can use as a reference:
-
-```python
-from fastapi import FastAPI, WebSocket, WebSocketDisconnect, Query, HTTPException
-from fastapi.responses import JSONResponse
-import websockets
-import jwt
-import asyncio
-import logging
-from datetime import datetime
-
-app = FastAPI()
-logger = logging.getLogger(__name__)
-
-# Configuration
-ASR_UPSTREAM_URL = "wss://ryderwishart--asr-websocket-transcription-fastapi-asgi.modal.run/ws/transcribe"
-JWT_SECRET = "your-jwt-secret-here" # Use your actual JWT secret
-JWT_ALGORITHM = "HS256"
-
-def validate_token(token: str) -> dict:
- """
- Validate JWT token and return decoded payload.
-
- Raises:
- HTTPException: If token is invalid or expired
- """
- try:
- payload = jwt.decode(token, JWT_SECRET, algorithms=[JWT_ALGORITHM])
- return payload
- except jwt.ExpiredSignatureError:
- raise HTTPException(status_code=401, detail="Token expired")
- except jwt.InvalidTokenError:
- raise HTTPException(status_code=401, detail="Invalid token")
-
-@app.websocket("/ws/asr")
-async def websocket_asr_proxy(
- websocket: WebSocket,
- token: str = Query(..., description="JWT authentication token")
-):
- """
- WebSocket proxy for ASR transcription with authentication.
-
- This endpoint:
- 1. Validates the user's JWT token
- 2. Establishes a connection to the upstream ASR service
- 3. Proxies messages bidirectionally between client and ASR service
- 4. Logs usage for monitoring
- """
-
- # Validate token before accepting connection
- try:
- user_payload = validate_token(token)
- user_id = user_payload.get("sub") or user_payload.get("user_id")
- username = user_payload.get("username") or user_payload.get("email")
- except HTTPException as e:
- await websocket.close(code=1008, reason=f"Authentication failed: {e.detail}")
- logger.warning(f"Authentication failed: {e.detail}")
- return
-
- # Accept client connection
- await websocket.accept()
- logger.info(f"User {username} (ID: {user_id}) started ASR session at {datetime.utcnow()}")
-
- # Connect to upstream ASR service
- upstream_ws = None
- try:
- upstream_ws = await websockets.connect(ASR_UPSTREAM_URL)
- logger.info(f"Connected to upstream ASR service for user {username}")
-
- async def forward_to_client():
- """Forward messages from ASR service to client"""
- try:
- async for message in upstream_ws:
- await websocket.send_text(message)
- logger.debug(f"Forwarded message to client {username}: {message[:100]}...")
- except websockets.exceptions.ConnectionClosed:
- logger.info(f"Upstream ASR connection closed for user {username}")
- except Exception as e:
- logger.error(f"Error forwarding to client {username}: {e}")
- try:
- await websocket.send_text(
- '{"type": "error", "message": "Connection to transcription service lost"}'
- )
- except:
- pass
-
- async def forward_to_asr():
- """Forward messages from client to ASR service"""
- try:
- while True:
- message = await websocket.receive()
-
- if "text" in message:
- # Forward JSON metadata
- await upstream_ws.send(message["text"])
- logger.debug(f"Forwarded metadata from {username}: {message['text'][:100]}...")
- elif "bytes" in message:
- # Forward binary audio data
- audio_size = len(message["bytes"])
- await upstream_ws.send(message["bytes"])
- logger.info(f"Forwarded {audio_size} bytes of audio from {username}")
- except WebSocketDisconnect:
- logger.info(f"Client {username} disconnected")
- except Exception as e:
- logger.error(f"Error forwarding from client {username}: {e}")
-
- # Run both forwarding tasks concurrently
- await asyncio.gather(
- forward_to_client(),
- forward_to_asr(),
- return_exceptions=True
- )
-
- except Exception as e:
- logger.error(f"Failed to connect to upstream ASR service for user {username}: {e}")
- error_msg = {
- "type": "error",
- "message": f"Failed to connect to transcription service: {str(e)}"
- }
- try:
- await websocket.send_json(error_msg)
- except:
- pass
- finally:
- # Cleanup
- if upstream_ws:
- await upstream_ws.close()
- try:
- await websocket.close()
- except:
- pass
- logger.info(f"ASR session ended for user {username} (ID: {user_id})")
-
-@app.get("/health")
-async def health_check():
- """Health check endpoint"""
- return {"status": "healthy", "service": "asr-proxy"}
-
-if __name__ == "__main__":
- import uvicorn
- uvicorn.run(app, host="0.0.0.0", port=8000)
-```
-
-## WebSocket Protocol Details
-
-The client implements this protocol, which your proxy must support:
-
-### Client → ASR Service
-
-**Step 1**: Client sends JSON metadata
-```json
-{
- "type": "meta",
- "provider": "mms",
- "model": "facebook/mms-1b-all",
- "mime": "audio/webm",
- "language": "eng",
- "task": "transcribe",
- "phonetic": false
-}
-```
-
-**Step 2**: Client sends binary audio data (Blob)
-
-### ASR Service → Client
-
-**Progress Updates** (during processing):
-```json
-{
- "type": "progress",
- "data": "Processing audio...",
- "percentage": 50
-}
-```
-
-**Final Result** (on completion):
-```json
-{
- "type": "done",
- "text": "This is the transcribed text",
- "language": "eng",
- "provider": "mms",
- "model": "facebook/mms-1b-all",
- "phonetic": "ðɪs ɪz ðə trænskraɪbd tɛkst"
-}
-```
-
-**Error Message** (on failure):
-```json
-{
- "type": "error",
- "message": "Transcription failed: invalid audio format"
-}
-```
-
-## Implementation Checklist
-
-- [ ] Add `getAsrEndpoint()` method to FrontierAPI class
- - Returns `Promise`
- - Returns your ASR proxy URL (e.g., `wss://auth.frontier.com/ws/asr`)
- - Returns `undefined` if not authenticated
-
-- [ ] Create WebSocket endpoint at `/ws/asr`
- - Accepts `token` as query parameter
- - Validates JWT token
- - Rejects with code 1008 if token invalid
-
-- [ ] Implement bidirectional proxy
- - Forward JSON text messages
- - Forward binary audio data
- - Handle connection lifecycle
- - Clean up resources on disconnect
-
-- [ ] Add logging
- - Log successful authentications with user ID
- - Log ASR session start/end times
- - Log audio data sizes for monitoring
- - Log errors and failures
-
-- [ ] Test the implementation
- - Valid token → successful proxying
- - Invalid token → rejection with code 1008
- - Missing token → rejection
- - Large audio files → proper streaming
- - Connection interruptions → graceful cleanup
-
-## Configuration
-
-You'll need to configure:
-
-1. **JWT Secret**: Same secret used for other JWT validation
-2. **Upstream ASR URL**: `wss://ryderwishart--asr-websocket-transcription-fastapi-asgi.modal.run/ws/transcribe`
-3. **Proxy Endpoint URL**: The URL you'll return from `getAsrEndpoint()`
-
-## Testing
-
-### Manual Test with wscat
-
-```bash
-# Install wscat
-npm install -g wscat
-
-# Test with valid token
-wscat -c "wss://your-auth-server.com/ws/asr?token=YOUR_JWT_TOKEN"
-
-# Send metadata
-> {"type":"meta","mime":"audio/webm"}
-
-# Observe responses
-< {"type":"progress","data":"Processing...","percentage":50}
-```
-
-### Integration Test
-
-The Codex Editor client will automatically use your proxy when:
-1. User is authenticated
-2. `getAsrEndpoint()` returns a URL
-3. User transcribes audio
-
-You can verify by checking your logs for authenticated transcription sessions.
-
-## Security Considerations
-
-1. **Token Validation**: Always validate JWT before accepting connection
-2. **Rate Limiting**: Consider implementing per-user rate limits
-3. **Timeout**: Set reasonable timeouts (30-60s) for transcription
-4. **File Size Limits**: Consider limiting audio size if needed
-5. **HTTPS/WSS**: Always use secure WebSocket in production
-6. **Logging**: Log usage but respect user privacy (don't log audio content)
-
-## Monitoring Recommendations
-
-Track these metrics:
-- Total ASR requests per day
-- Active concurrent transcriptions
-- Average transcription duration
-- Error rate by error type
-- Audio size distribution
-- Per-user usage
-
-## Reference Implementation
-
-The LLM proxy endpoint on your auth server follows a similar pattern. You can use that as a reference for:
-- JWT validation approach
-- Error handling patterns
-- Logging format
-- Configuration management
-
-## Support
-
-If you need clarification on:
-- Client behavior: See `docs/asr-proxy-endpoint.md`
-- Message protocol: See examples above
-- Client implementation: See `webviews/codex-webviews/src/CodexCellEditor/WhisperTranscriptionClient.ts`
-
-## Deployment Notes
-
-### Before Deployment
-1. Test with a staging environment first
-2. Verify JWT token validation works correctly
-3. Test with large audio files (>10MB)
-4. Confirm error handling works as expected
-
-### After Deployment
-1. Monitor logs for authentication failures
-2. Check for any proxy errors
-3. Verify transcription quality unchanged
-4. Monitor for rate limit needs
-
-## Timeline
-
-**Client Ready**: ✅ Implemented and deployed
-
-**Auth Server Required**: This implementation
-
-**User Impact**: None until auth server is deployed (users will continue using manual endpoint configuration)
-
-**Urgency**: Medium - allows transition away from Ryder's personal namespace
-
----
-
-## Questions?
-
-For questions about:
-- **Client implementation**: Check `docs/asr-auth-proxy-implementation-summary.md`
-- **Protocol details**: Check `docs/asr-proxy-endpoint.md`
-- **Client code**: Check `webviews/codex-webviews/src/CodexCellEditor/WhisperTranscriptionClient.ts`
-
-## Version
-
-- **Client Version**: Implemented in v0.6.21+
-- **Last Updated**: 2025-10-14
-
+Returns the **HTTPS** URL of the proxy's transcribe endpoint
+(e.g. `https://auth.frontier.example/api/v1/asr/transcribe`). The client
+performs a multipart POST against that URL.
+
+This mirrors the existing `getLlmEndpoint()`.
+
+### 2. `POST /api/v1/asr/transcribe` proxy endpoint
+
+A pass-through that:
+
+1. Validates the Frontier JWT (Authorization header or `?token=` query).
+2. Forwards the multipart audio body to OmniASR.
+3. **Forwards the optional `?lang=...` query parameter** when the client
+ supplies it (OmniASR `{iso639_3}_{Script}` format, e.g. `swh_Latn`).
+ In auto-detect mode the client omits `lang`; the proxy must also omit
+ it when calling upstream.
+4. Returns OmniASR's JSON response verbatim (`text`, `duration_s`,
+ `inference_s`, and `lang` — echoed when one was sent, or the
+ LID-resolved code in auto-detect mode; omitted only if LID failed).
+
+A complete reference FastAPI implementation is in
+[`asr-proxy-endpoint.md`](./asr-proxy-endpoint.md#example-implementation-pythonfastapi).
+
+## Migration from the WebSocket / MMS era
+
+Anything the client used to send over WebSocket (provider, model,
+language as bare ISO 639-3, phonetic flag, etc.) is gone:
+
+- **No more `provider` / `model` fields**: the upstream is OmniASR; the
+ client doesn't choose providers.
+- **No more `phonetic`**: OmniASR doesn't support IPA output.
+- **No more bare ISO 639-3 codes**: OmniASR requires `{iso639_3}_{Script}`
+ (e.g. `urd_Arab`, not `urd`). The client resolves this from the project
+ language using `sharedUtils/asrLanguageUtils.ts`.
+- **No more `lang=auto` magic value**: omit `lang` entirely for
+ auto-detect.
+
+## Key references
+
+- Wire contract: [`docs/asr-proxy-endpoint.md`](./asr-proxy-endpoint.md)
+- Client: `webviews/codex-webviews/src/CodexCellEditor/WhisperTranscriptionClient.ts`
+- Lang resolver + supported codes: `sharedUtils/asrLanguageUtils.ts`,
+ `sharedUtils/omniAsrSupportedLangs.ts`,
+ `sharedUtils/omniAsrDefaultScripts.ts`,
+ `sharedUtils/omniAsrFriendlyNames.ts`
+- Modal app (source of truth for the upstream):
+ [`docs/asr/codex_asr_modal.py`](./asr/codex_asr_modal.py) in this repo.
+ Logs and dashboards: see the `codex-asr` app in the `genesis-ai-dev`
+ Modal workspace.
+
+## Action items for the Frontier auth proxy team
+
+1. Point the upstream ASR URL at the new app:
+ `https://genesis-ai-dev--codex-asr-serve.modal.run/transcribe`
+ (previously `…--mms-zeroshot-asr-…`). The legacy app is still up so
+ there's no urgency, but it should not be considered the source of
+ truth — only `codex-asr` will receive future updates.
+2. Make sure the proxy forwards the optional `?lang=` query string
+ verbatim and does not synthesise one when the client omits it
+ (auto-detect mode).
+3. Drop any `provider`, `model`, `phonetic`, `language` fields that
+ used to be part of the multipart/form body — they're no longer sent.
+4. Once the proxy is migrated, we can decommission the
+ `mms-zeroshot-asr` Modal app.
diff --git a/docs/asr-auth-proxy-implementation-summary.md b/docs/asr-auth-proxy-implementation-summary.md
index 2f0515033..0748f84a4 100644
--- a/docs/asr-auth-proxy-implementation-summary.md
+++ b/docs/asr-auth-proxy-implementation-summary.md
@@ -1,5 +1,13 @@
# ASR Authentication Proxy Implementation Summary
+> **Historical changelog.** This documents the initial WebSocket-era
+> introduction of the Frontier auth proxy. The current contract is HTTP
+> POST and the upstream is OmniASR (not MMS). For an up-to-date wire
+> spec and reference implementation see
+> [`asr-proxy-endpoint.md`](./asr-proxy-endpoint.md); for the auth-server
+> integration points see
+> [`AUTH_SERVER_ASR_IMPLEMENTATION.md`](./AUTH_SERVER_ASR_IMPLEMENTATION.md).
+
## Overview
Successfully migrated ASR transcription from Ryder's personal Modal namespace to an authenticated proxy architecture. The system now supports:
diff --git a/docs/asr-proxy-endpoint.md b/docs/asr-proxy-endpoint.md
index 8b1e40956..e05215410 100644
--- a/docs/asr-proxy-endpoint.md
+++ b/docs/asr-proxy-endpoint.md
@@ -1,22 +1,33 @@
# ASR HTTP POST Endpoint Specification
-This document describes the HTTP POST protocol for implementing an ASR (Automatic Speech Recognition) transcription endpoint compatible with the Codex Editor.
+This document describes the HTTP POST protocol the Codex Editor expects from
+an ASR (Automatic Speech Recognition) endpoint. The reference upstream is
+**Meta Omnilingual ASR** (`omniASR_LLM_1B_v2`), served on Modal as
+`genesis-ai-dev--codex-asr-serve.modal.run` (renamed from the
+historical `mms-zeroshot-asr` deployment).
+
+The Frontier auth server runs a thin **proxy** in front of that Modal
+endpoint, adds JWT validation, and is what the Codex client actually talks to
+in production. This spec covers the proxy's wire contract; the proxy in turn
+forwards to OmniASR.
## Overview
-The Codex Editor uses a simple HTTP POST request for audio transcription. This allows for straightforward integration without WebSocket complexity.
+The client uses a simple multipart HTTP POST to the proxy URL. No
+WebSockets, no streaming progress messages. One request → one transcription.
## Authentication
-The client passes authentication via a JWT token as either:
+The client passes a Frontier JWT via either:
1. **Authorization header**: `Authorization: Bearer <token>`
2. **Query parameter**: `?token=<token>&source=codex`
The server should:
-1. Validate the JWT token before processing the request
-2. Reject requests with invalid or missing tokens (401)
-3. Establish a connection to the actual ASR service (e.g., Modal endpoint)
-4. Forward the audio file and return the transcription result
+1. Validate the JWT before processing.
+2. Reject invalid/missing tokens with HTTP 401.
+3. Forward the audio (and the optional `lang` query parameter, if present)
+ to the upstream OmniASR service.
+4. Return the upstream's JSON response.
## Request Protocol
@@ -35,20 +46,34 @@ Authorization: Bearer (optional if token in query)
### Query Parameters
-- `source` (required): `"codex"` or `"langquest"`
-- `token` (optional): JWT token if not in Authorization header
+- `source` (required): `"codex"` or `"langquest"` — for logging.
+- `token` (optional): JWT, if not in the Authorization header.
+- `lang` (**optional**): OmniASR language code in
+ `{iso639_3}_{Script}` form (e.g. `swh_Latn`, `urd_Arab`, `cmn_Hans`).
+ Forward this directly to OmniASR. **Omit** it to engage the upstream's
+ built-in language ID — `codex-asr` runs MMS-LID first and feeds the
+ detected code into OmniASR (the resolved code is then included in the
+ response). The full list of accepted codes is bundled with the client
+ in `sharedUtils/omniAsrSupportedLangs.ts` (and is the live response of
+ OmniASR's `GET /languages`).
### Request Body
**Content-Type**: `multipart/form-data`
**Form Fields**:
-- `file`: Audio file (WAV, MP3, OGG, FLAC, WebM - max 50MB)
+- `file`: Audio file (WAV, MP3, OGG, FLAC, WebM, M4A — max 50 MB;
+ audio longer than 40 s is chunked internally by OmniASR)
-### Example Request
+### Example Requests
```bash
-curl -X POST "http://localhost:8000/api/v1/asr/transcribe?source=codex&token=JWT_TOKEN" \
+# Auto-detect (no lang)
+curl -X POST "https://auth.frontier.example/api/v1/asr/transcribe?source=codex&token=JWT_TOKEN" \
+ -F "file=@audio.wav"
+
+# Project-language mode (Swahili, Latin script)
+curl -X POST "https://auth.frontier.example/api/v1/asr/transcribe?source=codex&token=JWT_TOKEN&lang=swh_Latn" \
-F "file=@audio.wav"
```
@@ -60,10 +85,26 @@ curl -X POST "http://localhost:8000/api/v1/asr/transcribe?source=codex&token=JWT
{
"text": "This is the transcribed text",
"duration_s": 4.94,
- "inference_s": 1.72
+ "inference_s": 1.72,
+ "lang": "swh_Latn"
}
```
+The `lang` field reflects what was **actually used** for transcription:
+- Request supplied `lang` → echoed verbatim.
+- Request omitted `lang` → upstream ran MMS-LID and the resolved
+ `{iso639_3}_{Script}` code is returned here. If LID failed (silence,
+ unrecognised language, …) the field is omitted and the response also
+ includes `lid_s` so callers can tell auto-detect actually ran. The
+ client renders an "Auto Detect" badge in that case.
+
+Auto-detect responses include an additional `"lid_s": <float>` field
+with the LID inference time (useful for monitoring).
+
+The client also accepts a legacy field name `language` in place of `lang`
+(this was the Frontier proxy's earlier convention) — either works. Prefer
+`lang` going forward.
+
### Error Response (4xx/5xx)
```json
@@ -73,32 +114,30 @@ curl -X POST "http://localhost:8000/api/v1/asr/transcribe?source=codex&token=JWT
```
**Common Error Codes**:
-- `400`: Bad Request (missing source parameter, invalid audio format)
+- `400`: Bad request (missing source, invalid audio, unknown `lang` code)
- `401`: Unauthorized (invalid or missing token)
-- `502`: Bad Gateway (upstream service unavailable)
-- `504`: Gateway Timeout (upstream service timeout)
+- `502`: Bad gateway (upstream OmniASR unavailable)
+- `504`: Gateway timeout (upstream timeout)
## Example Implementation (Python/FastAPI)
-Here's a basic example of implementing the ASR proxy endpoint:
-
```python
from fastapi import FastAPI, UploadFile, File, HTTPException, Query, Header
from fastapi.responses import JSONResponse
import httpx
import jwt
+from typing import Optional
app = FastAPI()
-# Configuration
-ASR_SERVICE_URL = "https://genesis-ai-dev--mms-zeroshot-asr-serve.modal.run/transcribe"
+# Configuration (post-rename; the old URL was
+# https://genesis-ai-dev--mms-zeroshot-asr-serve.modal.run/transcribe)
+ASR_SERVICE_URL = "https://genesis-ai-dev--codex-asr-serve.modal.run/transcribe"
JWT_SECRET = "your-jwt-secret"
def validate_token(token: str) -> dict:
- """Validate JWT token and return payload"""
try:
- payload = jwt.decode(token, JWT_SECRET, algorithms=["HS256"])
- return payload
+ return jwt.decode(token, JWT_SECRET, algorithms=["HS256"])
except jwt.InvalidTokenError:
raise HTTPException(status_code=401, detail="Invalid token")
@@ -107,75 +146,70 @@ async def transcribe_audio(
file: UploadFile = File(...),
authorization: Optional[str] = Header(None),
token: Optional[str] = Query(None),
- source: str = Query(...)
+ source: str = Query(...),
+ lang: Optional[str] = Query(None), # OmniASR {iso639_3}_{Script}
):
- """HTTP POST endpoint for ASR transcription with authentication"""
-
- # Extract token from header or query
auth_token = None
if authorization and authorization.startswith("Bearer "):
auth_token = authorization[7:]
elif token:
auth_token = token
-
if not auth_token:
raise HTTPException(status_code=401, detail="Token required")
-
- # Validate token
- try:
- user = validate_token(auth_token)
- user_id = user.get("sub")
- except HTTPException:
- raise
-
- # Read audio file
+ validate_token(auth_token)
+
audio_content = await file.read()
-
- # Forward to upstream ASR service
+
async with httpx.AsyncClient(timeout=60.0) as client:
files = {"file": (file.filename, audio_content, file.content_type)}
- response = await client.post(ASR_SERVICE_URL, files=files)
-
+ params = {}
+ if lang:
+ params["lang"] = lang
+ response = await client.post(ASR_SERVICE_URL, files=files, params=params)
+
if response.status_code != 200:
raise HTTPException(
status_code=response.status_code,
- detail=f"Transcription service error: {response.text}"
+ detail=f"Transcription service error: {response.text}",
)
-
+
+ # Pass OmniASR's response through verbatim (it already echoes `lang`
+ # when supplied; in auto-detect mode it returns the LID-resolved code,
+ # or omits `lang` if LID failed).
return JSONResponse(content=response.json())
```
## Client Implementation Reference
-The Codex Editor client implementation can be found in:
-
-- **TypeScript Client**: `webviews/codex-webviews/src/CodexCellEditor/WhisperTranscriptionClient.ts`
-- **Integration**: `webviews/codex-webviews/src/CodexCellEditor/CodexCellEditor.tsx`
+- **Client**: `webviews/codex-webviews/src/CodexCellEditor/WhisperTranscriptionClient.ts`
+- **Code resolver** (project language → `{iso639_3}_{Script}`):
+ `sharedUtils/asrLanguageUtils.ts`
+- **Supported codes**: `sharedUtils/omniAsrSupportedLangs.ts`
+- **Default scripts**: `sharedUtils/omniAsrDefaultScripts.ts`
+- **Friendly names**: `sharedUtils/omniAsrFriendlyNames.ts`
-### Key Client Behavior
+### Key Client Behaviour
-1. Requests ASR config (including auth token) from VS Code extension
-2. Creates FormData with audio blob
-3. POSTs to endpoint URL with token in query parameter or Authorization header
-4. Receives JSON response with transcription text
-5. Handles errors and timeouts (default 60s)
+1. Requests ASR config (endpoint + auth token + resolved OmniASR code) from the extension host.
+2. POSTs `multipart/form-data` with the audio file; forwards `?lang=...` when in project mode.
+3. Parses `lang` (or legacy `language`) from the JSON response and stores it
+ on the cell's audio attachment.
+4. Renders the badge from the stored code via
+ `labelForTranscriptionLanguage()`.
## Testing Your Implementation
-### Test Cases
-
-1. **Valid audio**: Should return transcription
-2. **Invalid audio format**: Should return error message
-3. **Missing token**: Should reject with 401
-4. **Invalid token**: Should reject with 401
-5. **Timeout**: Should handle gracefully (client has 60s timeout)
-6. **Large audio files**: Should handle up to 50MB
-7. **Network errors**: Should return appropriate error codes
+1. **Project-mode request**: `?lang=swh_Latn` → expect 200 with
+ `"lang": "swh_Latn"` in response.
+2. **Auto-detect**: no `lang` → expect 200 with `lid_s` and the LID-resolved `lang` in the response (`lang` is omitted only if LID failed).
+3. **Unknown code**: `?lang=zzz_Zzzz` → expect 400 with descriptive error.
+4. **Invalid token**: 401.
+5. **Large audio (≤ 50 MB)**: 200.
+6. **Long audio (> 40 s)**: OmniASR chunks it; expect 200 with full
+ concatenated transcription.
+7. **Network error / upstream down**: 502/504 surfaced honestly.
## Supported Audio Formats
-The endpoint should support common audio formats:
-
- `audio/webm` (recommended for browser recording)
- `audio/wav`
- `audio/mp3`
@@ -185,28 +219,20 @@ The endpoint should support common audio formats:
## Security Considerations
-1. **Token Validation**: Always validate JWT tokens before processing
-2. **Rate Limiting**: Implement per-user rate limits to prevent abuse
-3. **File Size Limits**: Set reasonable limits on audio file sizes (50MB recommended)
-4. **Timeout**: Implement server-side timeouts to prevent hanging requests (60s recommended)
-5. **Logging**: Log usage for monitoring and debugging (but respect privacy)
-6. **HTTPS**: Always use secure connections in production
-
-## Performance Recommendations
-
-1. **Streaming**: For very large files, consider streaming uploads
-2. **Caching**: Cache model loading to reduce cold starts (handled by upstream service)
-3. **Resource Cleanup**: Properly close connections and free resources
-4. **Concurrent Requests**: Handle multiple simultaneous transcriptions efficiently
-5. **Timeout Handling**: Set reasonable timeouts for upstream requests
+1. **Token validation**: validate JWT before processing.
+2. **Rate limiting**: per-user limits to prevent abuse.
+3. **File size limits**: 50 MB.
+4. **Timeout**: server-side timeouts to prevent hanging requests (60 s recommended).
+5. **Logging**: log usage for monitoring but respect privacy.
+6. **HTTPS**: always.
## Integration with Frontier Auth Server
The Frontier auth server should:
-1. Provide `getAsrEndpoint()` method returning the proxy HTTP URL
-2. Generate short-lived JWT tokens for ASR requests
-3. Include user identification in tokens for logging
-4. Handle token refresh if needed for long transcriptions
+1. Implement `getAsrEndpoint()` returning the proxy HTTPS URL.
+2. Generate short-lived JWTs for ASR requests.
+3. Include user identification in tokens for logging.
+4. Handle token refresh for long transcriptions if needed.
-This follows the same pattern as the existing `getLlmEndpoint()` implementation.
+This follows the same pattern as the existing `getLlmEndpoint()`.
diff --git a/docs/asr/README.md b/docs/asr/README.md
new file mode 100644
index 000000000..344def767
--- /dev/null
+++ b/docs/asr/README.md
@@ -0,0 +1,69 @@
+# Codex ASR deployment
+
+Modal source for the ASR backend used by the Codex Translation Editor.
+
+| File | What it is |
+|------|------------|
+| [`codex_asr_modal.py`](./codex_asr_modal.py) | The Modal app source. Deploy with `modal deploy`. |
+
+## Live URLs
+
+- **Current (post-rename)**: `https://genesis-ai-dev--codex-asr-serve.modal.run`
+- **Legacy (kept warm during migration)**: `https://genesis-ai-dev--mms-zeroshot-asr-serve.modal.run`
+
+The legacy URL serves the same workload — the app was renamed from
+`mms-zeroshot-asr` to `codex-asr` so the URL no longer encodes the
+model family. Both deployments will be active during the rollout; the
+legacy one is decommissioned after the Frontier auth proxy and any
+hard-coded client defaults are updated to the new URL.
+
+## Deploying
+
+You need `modal` CLI installed (`pipx install modal`) and authenticated
+(`modal token new`) with access to the `genesis-ai-dev` workspace.
+
+```bash
+cd <repo-root>
+modal deploy docs/asr/codex_asr_modal.py
+```
+
+For local development against your own Modal workspace:
+
+```bash
+modal serve docs/asr/codex_asr_modal.py
+```
+
+## Sanity-checking after deploy
+
+```bash
+# Service identity
+curl -s https://genesis-ai-dev--codex-asr-serve.modal.run/
+
+# Full supported-langs list (used to regenerate the client snapshot)
+curl -s https://genesis-ai-dev--codex-asr-serve.modal.run/languages | jq '.count'
+
+# Transcribe with language hint
+curl -X POST -F "file=@some_audio.wav" \
+ "https://genesis-ai-dev--codex-asr-serve.modal.run/transcribe?lang=eng_Latn"
+
+# Transcribe in auto-detect mode (response carries the LID-resolved `lang`, or no `lang` if LID failed)
+curl -X POST -F "file=@some_audio.wav" \
+ https://genesis-ai-dev--codex-asr-serve.modal.run/transcribe
+```
+
+## Wire spec
+
+See [`../asr-proxy-endpoint.md`](../asr-proxy-endpoint.md) for the full
+HTTP POST contract the Codex client expects (this Modal app implements
+it; the Frontier auth proxy sits in front and adds JWT validation).
+
+## Open follow-ups
+
+- **Server-side LID for auto-detect mode.** OmniASR LLM doesn't return a
+ detected language when run without `lang` conditioning, so
+ `codex_asr_modal.py` bakes `facebook/mms-lid-2048` into the image and
+ runs it before transcription when the client omits `lang`, then passes
+ the detected code through as the conditioning input and echoes it back
+ (see the module docstring). Expected cost: ~+1 GB VRAM, ~+1–2 s
+ latency; it makes the badge honest in auto-detect mode. The client
+ already consumes the field; when LID fails the field is omitted.
diff --git a/docs/asr/codex_asr_modal.py b/docs/asr/codex_asr_modal.py
new file mode 100644
index 000000000..0ba08f52f
--- /dev/null
+++ b/docs/asr/codex_asr_modal.py
@@ -0,0 +1,510 @@
+"""
+codex-asr — Modal deployment for the Codex Translation Editor's ASR backend.
+
+This is the **source of truth** for the deployed Modal app at
+`https://genesis-ai-dev--codex-asr-serve.modal.run`.
+
+Model: Meta Omnilingual ASR (`omniASR_LLM_1B_v2`). 1600+ languages.
+Native-script output, optional language conditioning.
+
+Naming
+~~~~~~
+The Modal app is named `codex-asr` (model-agnostic) rather than
+`mms-zeroshot-asr` (the old name, when the upstream was MMS Zero-Shot).
+This is so the URL stays stable when we change models. Do NOT rename
+again casually — every consumer (Codex client default endpoint,
+Frontier auth proxy upstream URL, docs, snapshot regen instructions)
+hard-codes `codex-asr`.
+
+Migration plan (if `codex-asr` ever needs to change):
+ 1. Deploy the new name first, keep `codex-asr` running.
+ 2. Update the Frontier auth proxy's upstream URL.
+ 3. Update the client's default endpoint in `package.json`
+ (`codex-editor-extension.asrEndpoint`) and any docs.
+ 4. Decommission `codex-asr` after a release cycle.
+
+The old `mms-zeroshot-asr` deployment is kept warm for backward
+compatibility during the transition. Both serve identical responses.
+
+Auto-detect language ID
+~~~~~~~~~~~~~~~~~~~~~~~
+OmniASR LLM models don't have built-in LID. When the client omits
+`lang` we run **Meta MMS-LID 2048** as a first pass to detect the
+ISO 639-3 base, then pair it with a default script (see
+`_DEFAULT_SCRIPT_FOR_BASE`) to produce an OmniASR-compatible
+`{iso639_3}_{Script}` code that's fed to the OmniASR transcribe call.
+The resolved code is echoed back in the response so the client can
+render a real "detected language" badge.
+
+If LID fails (silence, gibberish, language not in MMS-LID's 2048-set,
+or the detected base has no OmniASR mapping), we fall through to
+unconditioned transcription and omit `lang` in the response so the
+client renders an honest "Auto Detect" badge.
+
+Deploy / Dev
+~~~~~~~~~~~~
+ modal deploy docs/asr/codex_asr_modal.py
+ modal serve docs/asr/codex_asr_modal.py # local dev
+
+Test
+~~~~
+ curl -X POST -F "file=@audio.wav" \\
+ https://genesis-ai-dev--codex-asr-serve.modal.run/transcribe
+
+ curl -X POST -F "file=@audio.wav" \\
+ "https://genesis-ai-dev--codex-asr-serve.modal.run/transcribe?lang=urd_Arab"
+
+Endpoints
+~~~~~~~~~
+ GET / — service identity
+ GET /health — readiness probe
+ GET /languages — full list of supported {iso639_3}_{Script} codes
+ (used by the client snapshot in sharedUtils/)
+ POST /transcribe — transcription endpoint
+"""
+
+import modal
+
+# Renamed from "mms-zeroshot-asr" to be model-agnostic. See module docstring
+# for migration notes.
+app = modal.App("codex-asr")
+
+MODEL_CARD = "omniASR_LLM_1B_v2"
+MODEL_CACHE_DIR = "/root/model_cache"
+
+# MMS-LID variant for auto-detect mode. 2048 languages — all MMS-LID models
+# share the same wav2vec2 backbone (~960M params), so picking a larger
+# classification head doesn't meaningfully change cold-start memory.
+# Outputs ISO 639-3 codes which we pair with our default-script table.
+LID_MODEL_ID = "facebook/mms-lid-2048"
+HF_CACHE_DIR = "/root/hf_cache"
+
+
+def download_model():
+ """Download model weights during image build (runs with GPU so fairseq2 can verify)."""
+ import os
+ os.environ["FAIRSEQ2_CACHE_DIR"] = MODEL_CACHE_DIR
+ os.environ["HF_HOME"] = HF_CACHE_DIR
+
+ from omnilingual_asr.models.inference.pipeline import ASRInferencePipeline
+
+ print(f"Downloading and verifying {MODEL_CARD}...")
+ pipeline = ASRInferencePipeline(model_card=MODEL_CARD)
+ print("Model downloaded and verified OK")
+ del pipeline
+
+ print(f"Downloading {LID_MODEL_ID}...")
+ from transformers import AutoFeatureExtractor, Wav2Vec2ForSequenceClassification
+ AutoFeatureExtractor.from_pretrained(LID_MODEL_ID)
+ Wav2Vec2ForSequenceClassification.from_pretrained(LID_MODEL_ID)
+ print("MMS-LID downloaded OK")
+
+
+# Build the image with model weights baked in.
+# The run_function step uses a T4 GPU so fairseq2 can fully verify the
+# checkpoint. This only runs once — the resulting image is cached by Modal.
+#
+# Versions / CUDA notes:
+# - omnilingual-asr 0.2.0 is the first release that ships the
+# `omniASR_LLM_1B_v2` model card; 0.1.0 only has `omniASR_LLM_1B`.
+# - omnilingual-asr -> fairseq2[arrow]<=0.6 -> fairseq2n which pins
+# `torch==2.8.0` built specifically against CUDA 12.8 (it asserts this at
+# import time). Newer torch wheels are CUDA 13 and fail to load on Modal's
+# `debian_slim` (libcudart.so.13 missing).
+# - We install everything in one pip call so the resolver lands on the
+# cu128 wheel of torch 2.8.0.
+image = (
+ modal.Image.debian_slim(python_version="3.11")
+ .apt_install("ffmpeg", "libsndfile1")
+ .pip_install(
+ "torch==2.8.0",
+ "torchaudio==2.8.0",
+ "omnilingual-asr==0.2.0",
+ "transformers>=4.46,<5",
+ "huggingface_hub",
+ "fastapi",
+ "uvicorn",
+ "python-multipart",
+ "soundfile",
+ "numpy",
+ extra_index_url="https://download.pytorch.org/whl/cu128",
+ )
+ .env({"FAIRSEQ2_CACHE_DIR": MODEL_CACHE_DIR, "HF_HOME": HF_CACHE_DIR})
+ .run_function(download_model, gpu="T4")
+)
+
+_pipeline = None
+_lid_model = None
+_lid_feature_extractor = None
+_default_script_for_base: dict[str, str] | None = None
+
+# Hand-curated default script for the multi-script bases OmniASR serves.
+# **Mirror of `sharedUtils/omniAsrDefaultScripts.ts`** — keep both in sync
+# when adding entries (the client uses this for project-language → OmniASR
+# code resolution; the server uses it after MMS-LID returns a bare ISO
+# 639-3 base). Picked from Unicode CLDR likelySubtags cross-checked
+# against modern majority usage.
+_MULTI_SCRIPT_DEFAULTS: dict[str, str] = {
+ "aze": "Latn", # Azerbaijani — Latin in modern standard
+ "bcc": "Arab", # Southern Balochi
+ "cmn": "Hans", # Mandarin — Simplified default
+ "cmo": "Khmr", # Central Mnong — Khmer-script orthography
+ "crk": "Cans", # Plains Cree — Canadian Aboriginal Syllabics
+ "ell": "Grek", # Greek
+ "gag": "Latn", # Gagauz — modern Latin orthography
+ "kmr": "Latn", # Northern Kurdish — Latin (Hawar)
+ "lld": "Latn", # Ladin
+ "ojb": "Latn", # Northwestern Ojibwa
+ "rif": "Latn", # Tarifit Berber
+ "rmc": "Latn", # Carpathian Romani
+ "rmy": "Latn", # Vlax Romani
+ "tuk": "Latn", # Turkmen — modern Latin
+ "uig": "Arab", # Uyghur — Arabic-script
+ "urd": "Arab", # Urdu — Nastaliq
+ "uzb": "Latn", # Uzbek — modern Latin
+ "wal": "Ethi", # Wolaytta — Ethiopic
+ "yue": "Hant", # Cantonese — Traditional
+}
+
+
+def _ensure_gang_context() -> None:
+ """
+ Initialise fairseq2's thread-local gang stack on the current thread.
+
+ fairseq2 0.6 stores the "current gangs" stack on a `threading.local()`,
+ but only initialises the underlying `current_gangs = []` attribute on
+ the importing thread. FastAPI dispatches sync request handlers on
+ worker threads where the attribute is missing, causing inference to
+ fail with::
+
+ AttributeError: '_thread._local' object has no attribute 'current_gangs'
+
+ Cheap to call per-request — just sets a list on the thread-local if
+ it isn't already there.
+ """
+ try:
+ from fairseq2.gang import _thread_local # type: ignore[attr-defined]
+ if not hasattr(_thread_local, "current_gangs"):
+ _thread_local.current_gangs = []
+ except Exception: # pragma: no cover — defensive only
+ pass
+
+
+def get_pipeline():
+ """Load the ASR pipeline from baked-in weights (no download needed)."""
+ global _pipeline
+ if _pipeline is None:
+ import os
+ os.environ["FAIRSEQ2_CACHE_DIR"] = MODEL_CACHE_DIR
+
+ from omnilingual_asr.models.inference.pipeline import ASRInferencePipeline
+
+ print(f"Loading {MODEL_CARD} from image cache...")
+ _ensure_gang_context()
+ _pipeline = ASRInferencePipeline(model_card=MODEL_CARD)
+ print("Pipeline ready")
+ return _pipeline
+
+
+def _default_script_table() -> dict[str, str]:
+ """
+ Build (and cache) the base → default script lookup used by LID resolution.
+
+ Layered on top of `_MULTI_SCRIPT_DEFAULTS`:
+ - Single-script bases get their sole script automatically.
+ - Multi-script bases without a hand-curated entry fall through to
+ Latin (when supported), otherwise alphabetical first.
+ """
+ global _default_script_for_base
+ if _default_script_for_base is not None:
+ return _default_script_for_base
+
+ from omnilingual_asr.models.wav2vec2_llama.lang_ids import supported_langs
+
+ scripts_per_base: dict[str, list[str]] = {}
+ for code in supported_langs:
+ base, script = code.split("_", 1)
+ scripts_per_base.setdefault(base, []).append(script)
+
+ table: dict[str, str] = {}
+ for base, scripts in scripts_per_base.items():
+ if len(scripts) == 1:
+ table[base] = scripts[0]
+ elif base in _MULTI_SCRIPT_DEFAULTS and _MULTI_SCRIPT_DEFAULTS[base] in scripts:
+ table[base] = _MULTI_SCRIPT_DEFAULTS[base]
+ elif "Latn" in scripts:
+ table[base] = "Latn"
+ else:
+ table[base] = sorted(scripts)[0]
+
+ _default_script_for_base = table
+ return table
+
+
+def get_lid():
+ """Load the MMS-LID model + feature extractor from baked-in HF cache."""
+ global _lid_model, _lid_feature_extractor
+ if _lid_model is None or _lid_feature_extractor is None:
+ import os
+ import torch
+ from transformers import AutoFeatureExtractor, Wav2Vec2ForSequenceClassification
+
+ os.environ["HF_HOME"] = HF_CACHE_DIR
+ print(f"Loading {LID_MODEL_ID} from image cache...")
+ _lid_feature_extractor = AutoFeatureExtractor.from_pretrained(LID_MODEL_ID)
+ _lid_model = Wav2Vec2ForSequenceClassification.from_pretrained(LID_MODEL_ID)
+ if torch.cuda.is_available():
+ _lid_model = _lid_model.to("cuda")
+ _lid_model.eval()
+ print("MMS-LID ready")
+ return _lid_model, _lid_feature_extractor
+
+
+def detect_omniasr_code(waveform_16k) -> str | None:
+ """
+ Run MMS-LID on a 16-kHz mono waveform and return an OmniASR-compatible
+ `{iso639_3}_{Script}` code, or `None` if we can't confidently map the
+ detected base into OmniASR's supported set.
+
+ Strategy: MMS-LID outputs an ISO 639-3 base; pair it with the default
+ script for that base (`_default_script_table()`). If the detected base
+ isn't served by OmniASR at all, return None and let the caller fall
+ back to unconditioned transcription.
+ """
+ import torch
+ import numpy as np
+
+ model, fx = get_lid()
+ # Cap LID input at 30 s — speech models don't benefit from longer
+ # context for identification and shorter input is much faster.
+ max_lid_samples = 30 * 16000
+ snippet = waveform_16k[:max_lid_samples].astype(np.float32, copy=False)
+
+ inputs = fx(snippet, sampling_rate=16000, return_tensors="pt")
+ device = next(model.parameters()).device
+ input_values = inputs.input_values.to(device)
+
+ with torch.inference_mode():
+ logits = model(input_values).logits
+
+ predicted_id = int(torch.argmax(logits, dim=-1).item())
+ label = model.config.id2label.get(predicted_id) if hasattr(model.config.id2label, "get") else model.config.id2label[predicted_id]
+ if not label:
+ return None
+ # MMS-LID labels are ISO 639-3 codes (e.g. "eng", "swh"). Be lenient
+ # about case/whitespace just in case.
+ base = label.strip().lower()
+ if len(base) != 3:
+ print(f"LID returned non-ISO-639-3 label {label!r}; skipping")
+ return None
+
+ table = _default_script_table()
+ script = table.get(base)
+ if not script:
+ # Detected language isn't in OmniASR's supported set — give up and
+ # let the caller transcribe without conditioning.
+ return None
+ return f"{base}_{script}"
+
+
+def transcribe_audio(audio_bytes: bytes, mime_type: str = "audio/wav", lang: str | None = None) -> dict:
+ """
+ Transcribe audio bytes → text using OmniASR LLM 1B v2.
+
+ Args:
+ audio_bytes: Raw audio file bytes.
+ mime_type: MIME type for format detection.
+ lang: Optional OmniASR language code (e.g. "eng_Latn", "urd_Arab").
+ When provided we trust it and skip LID. When `None` we run
+ MMS-LID first to pick a code, then transcribe with it.
+
+ Returns:
+ dict with text, duration_s, inference_s, and `lang` (the code we
+ ended up using — either the caller-supplied one or the LID-detected
+ one). `lang` is omitted only when LID failed and we transcribed
+ without conditioning.
+ """
+ import soundfile as sf
+ import numpy as np
+ import tempfile
+ import subprocess
+ import os
+ import time
+
+ pipeline = get_pipeline()
+ _ensure_gang_context()
+
+ # --- Convert to 16kHz mono WAV via ffmpeg ---
+ ext_map = {
+ "audio/wav": ".wav", "audio/x-wav": ".wav",
+ "audio/mpeg": ".mp3", "audio/mp3": ".mp3",
+ "audio/webm": ".webm", "audio/ogg": ".ogg",
+ "audio/flac": ".flac", "audio/mp4": ".m4a",
+ }
+ ext = ext_map.get(mime_type, ".wav")
+
+ with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as f:
+ f.write(audio_bytes)
+ input_path = f.name
+
+ output_path = input_path.rsplit(".", 1)[0] + "_16k.wav"
+ try:
+ result = subprocess.run(
+ ["ffmpeg", "-y", "-i", input_path,
+ "-ar", "16000", "-ac", "1", "-c:a", "pcm_s16le",
+ output_path],
+ capture_output=True, text=True, timeout=60,
+ )
+ if result.returncode != 0:
+ raise RuntimeError(f"ffmpeg failed: {(result.stderr or '')[:500]}")
+
+ waveform, sr = sf.read(output_path)
+ waveform = waveform.astype(np.float32)
+ if waveform.ndim > 1:
+ waveform = waveform.mean(axis=-1)
+ duration = len(waveform) / sr
+
+ # --- Language ID (auto-detect mode only) ---
+ # If the caller supplied `lang` we trust it. Otherwise we run
+ # MMS-LID on the (already 16-kHz mono) waveform.
+ lid_time = 0.0
+ resolved_lang = lang
+ if resolved_lang is None:
+ lid_start = time.perf_counter()
+ try:
+ resolved_lang = detect_omniasr_code(waveform)
+ except Exception as e:
+ print(f"LID failed: {e}; falling back to unconditioned transcription")
+ resolved_lang = None
+ lid_time = time.perf_counter() - lid_start
+
+ # --- Chunk if > 40s (model limitation) ---
+ max_samples = 40 * sr # 40 seconds
+ if len(waveform) > max_samples:
+ chunks = []
+ for start in range(0, len(waveform), max_samples):
+ chunks.append(waveform[start : start + max_samples])
+ else:
+ chunks = [waveform]
+
+ # Build audio dicts for the pipeline
+ audio_inputs = [
+ {"waveform": chunk, "sample_rate": sr}
+ for chunk in chunks
+ ]
+
+ # Build lang list to match (one per chunk), or None
+ lang_list = [resolved_lang] * len(audio_inputs) if resolved_lang else None
+
+ # --- Transcribe ---
+ start_t = time.perf_counter()
+ transcriptions = pipeline.transcribe(
+ audio_inputs,
+ lang=lang_list,
+ batch_size=1,
+ )
+ inference_time = time.perf_counter() - start_t
+
+ # Join chunks with space
+ full_text = " ".join(t.strip() for t in transcriptions if t.strip())
+
+ resp = {
+ "text": full_text,
+ "duration_s": round(duration, 2),
+ "inference_s": round(inference_time, 3),
+ }
+ if lid_time:
+ resp["lid_s"] = round(lid_time, 3)
+ # Echo the lang we actually used (caller-supplied or LID-resolved)
+ # so the client can render an honest badge. If LID failed and we
+ # transcribed without conditioning, omit the field entirely.
+ if resolved_lang:
+ resp["lang"] = resolved_lang
+
+ return resp
+
+ finally:
+ os.unlink(input_path)
+ if os.path.exists(output_path):
+ os.unlink(output_path)
+
+
+# ---------- Modal function ----------
+
+@app.function(
+ image=image,
+ gpu="T4",
+ timeout=600,
+ scaledown_window=120, # keep warm 2 min after last request
+ max_containers=3,
+)
+@modal.asgi_app()
+def serve():
+ from fastapi import FastAPI, UploadFile, File, Query, HTTPException
+ from fastapi.middleware.cors import CORSMiddleware
+
+ web_app = FastAPI(title="Codex ASR (OmniASR LLM 1B v2)")
+ web_app.add_middleware(
+ CORSMiddleware,
+ allow_origins=["*"],
+ allow_methods=["*"],
+ allow_headers=["*"],
+ )
+
+ @web_app.get("/")
+ def root():
+ return {
+ "service": "codex-asr",
+ "model": MODEL_CARD,
+ "lid_model": LID_MODEL_ID,
+ "languages": "1600+",
+ "note": "Pass ?lang={iso639_3}_{Script} (e.g. eng_Latn) to skip LID. Omit to run MMS-LID first and use the detected language for transcription.",
+ }
+
+ @web_app.get("/health")
+ def health():
+ return {"status": "ok", "model_loaded": _pipeline is not None}
+
+ @web_app.get("/languages")
+ def list_languages():
+ """Return all supported language codes."""
+ from omnilingual_asr.models.wav2vec2_llama.lang_ids import supported_langs
+ return {"count": len(supported_langs), "languages": sorted(supported_langs)}
+
+ @web_app.post("/transcribe")
+ async def transcribe_endpoint(
+ file: UploadFile = File(...),
+ lang: str | None = Query(
+ default=None,
+ description="OmniASR language code in {iso639_3}_{Script} form, e.g. eng_Latn, urd_Arab, spa_Latn. Omit to run MMS-LID first and use the detected language for transcription.",
+ ),
+ ):
+ # Validate language code if provided
+ if lang is not None:
+ from omnilingual_asr.models.wav2vec2_llama.lang_ids import supported_langs
+ if lang not in supported_langs:
+ raise HTTPException(
+ 400,
+ f"Unknown language code: '{lang}'. "
+ f"Use GET /languages for the full list. "
+ f"Format: {{iso639_3}}_{{Script}}, e.g. eng_Latn",
+ )
+
+ try:
+ audio_bytes = await file.read()
+ if len(audio_bytes) > 50 * 1024 * 1024:
+ raise HTTPException(413, "File too large (50MB max)")
+ if len(audio_bytes) == 0:
+ raise HTTPException(400, "Empty file")
+
+ mime = file.content_type or "audio/wav"
+ return transcribe_audio(audio_bytes, mime, lang=lang)
+
+ except HTTPException:
+ raise
+ except Exception as e:
+ raise HTTPException(500, f"Transcription failed: {str(e)}")
+
+ # Model loads lazily on first /transcribe request via get_pipeline().
+ # Weights are baked into the image so loading takes ~15-20s (no download).
+ return web_app
diff --git a/package.json b/package.json
index e2da8ab79..13a9ab247 100644
--- a/package.json
+++ b/package.json
@@ -873,38 +873,48 @@
"description": "Model name selected for inference."
},
"codex-editor-extension.asrEndpoint": {
- "title": "ASR WebSocket Endpoint",
+ "title": "ASR Endpoint",
"type": "string",
- "default": "wss://ryderwishart--asr-websocket-transcription-fastapi-asgi.modal.run/ws/transcribe",
- "description": "WebSocket endpoint for audio transcription. When authenticated with Frontier, the auth server endpoint is automatically used. This setting is used as fallback when not authenticated or for local development."
+ "default": "https://genesis-ai-dev--codex-asr-serve.modal.run/transcribe",
+ "description": "HTTPS endpoint for audio transcription (POST multipart with optional ?lang= query). When authenticated with Frontier, the auth server endpoint is automatically used; this setting is the fallback for unauthenticated / local development."
},
"codex-editor-extension.asrProvider": {
"title": "ASR Provider",
"type": "string",
"enum": [
- "mms",
+ "omniasr",
"whisper"
],
- "default": "mms",
- "description": "Provider for transcription. MMS requires a language code; Whisper auto-detects."
+ "default": "omniasr",
+ "description": "Provider for transcription. OmniASR accepts an optional {iso639_3}_{Script} language hint; Whisper auto-detects."
},
"codex-editor-extension.asrModel": {
"title": "ASR Model",
"type": "string",
- "default": "facebook/mms-1b-all",
- "description": "Model identifier to use for transcription (e.g., facebook/mms-1b-all)."
+ "default": "omniASR_LLM_1B_v2",
+ "description": "Model identifier used by the ASR service (e.g., omniASR_LLM_1B_v2)."
},
"codex-editor-extension.asrLanguage": {
"title": "ASR Language (ISO-639-3)",
"type": "string",
"default": "eng",
- "description": "Language code for transcription. MMS requires ISO-639-3 (e.g., eng, fra, spa). 2-letter codes will be mapped where possible."
+ "description": "Legacy: ISO 639-3 hint for ASR providers. OmniASR uses the project's target language by default; configure via the gear menu on the Transcribe button."
},
- "codex-editor-extension.asrPhonetic": {
- "title": "Return Phonetic (IPA)",
- "type": "boolean",
- "default": false,
- "description": "If enabled and supported by provider, also return phonetic (IPA) transcription."
+ "codex-editor-extension.asrLanguageMode": {
+ "title": "ASR Language Mode",
+ "type": "string",
+ "enum": [
+ "project",
+ "auto"
+ ],
+ "default": "project",
+ "description": "Whether to send the project's target language as a hint to the ASR service (\"project\"), or let the model transcribe without language conditioning (\"auto\")."
+ },
+ "codex-editor-extension.asrScriptPref": {
+ "title": "ASR Script Preference",
+ "type": "string",
+ "default": "auto",
+ "description": "Script subtag to pair with the ASR language code. \"auto\" picks the best-guess script per language; \"latin\" forces Latin where supported; any 4-letter ISO 15924 tag (e.g. \"Arab\", \"Cyrl\") overrides per-language."
},
"codex-editor-extension.sourceBookWhitelist": {
"title": "Source Book Whitelist",
diff --git a/sharedUtils/asrLanguageUtils.ts b/sharedUtils/asrLanguageUtils.ts
new file mode 100644
index 000000000..b050abb20
--- /dev/null
+++ b/sharedUtils/asrLanguageUtils.ts
@@ -0,0 +1,268 @@
+/**
+ * ASR language-utility functions
+ * ------------------------------
+ *
+ * Pure helpers (no `vscode` imports → unit-testable, usable from both the
+ * extension host and the webviews) that:
+ *
+ * 1. **Resolve** a project's language metadata into an OmniASR-compatible
+ * `{iso639_3}_{Script}` code (or decide we should send no code, letting
+ * the server transcribe without language conditioning).
+ * 2. **Label** an OmniASR code with a friendly display name suitable for the
+ * post-transcription badge (e.g. `swh_Latn` → "Swahili").
+ *
+ * Why this lives in `sharedUtils/`
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * Both the extension host (`src/providers/...`) and the webviews
+ * (`webviews/.../CodexCellEditor`) need it: the host builds the `asrConfig`
+ * payload from project settings, and the webview renders the badge after a
+ * transcription completes.
+ */
+
+import {
+ OMNI_ASR_SUPPORTED_LANGS,
+ OMNI_ASR_SUPPORTED_LANG_SET,
+} from "./omniAsrSupportedLangs";
+import { OMNI_ASR_DEFAULT_SCRIPTS } from "./omniAsrDefaultScripts";
+import { OMNI_ASR_FRIENDLY_NAMES } from "./omniAsrFriendlyNames";
+
+/**
+ * Minimal shape of the project's language metadata that we consume here.
+ * Matches `codex-types`'s `LanguageMetadata` but we restate it so this file
+ * doesn't pull `codex-types` (and its transitive deps) into the webview
+ * bundle.
+ */
+export type AsrLanguageMetaInput = {
+ tag?: string;
+ iso1?: string;
+ iso2t?: string;
+ iso2b?: string;
+ refName?: string;
+};
+
+/**
+ * Macrolanguage → individual-language remaps used when the project's tag
+ * names a macrolanguage that OmniASR doesn't serve directly. Each pair maps
+ * a macro ISO 639-3 to the individual ISO 639-3 that OmniASR actually
+ * supports for the most widely-spoken variety. Sources:
+ * - SIL macrolanguage mappings (iso-639-3-macrolanguages.tab)
+ * - cross-checked against `OMNI_ASR_SUPPORTED_LANGS`
+ *
+ * Add only when (a) the macro is genuinely not in OmniASR's set and (b) the
+ * "right" individual is unambiguous.
+ */
+const MACRO_TO_INDIVIDUAL: Readonly<Record<string, string>> = {
+    swa: "swh", // Swahili → Coastal Swahili (Kenya/Tanzania majority)
+    ara: "arb", // Arabic → Modern Standard Arabic
+    msa: "zsm", // Malay → Standard Malay
+    zho: "cmn", // Chinese → Mandarin
+    ori: "ory", // Oriya → Odia
+    est: "ekk", // Estonian → Standard Estonian
+    sqi: "als", // Albanian → Tosk Albanian
+    kur: "kmr", // Kurdish → Northern Kurdish (largest speaker base)
+    nor: "nob", // Norwegian → Bokmål
+    oji: "ojb", // Ojibwa → Northwestern Ojibwa
+};
+
+/** ISO 639-1 (2-letter) → ISO 639-3 (3-letter). Common languages only; the
+ * project usually carries `iso2t` directly so this is just a fallback. */
+const ISO1_TO_ISO3: Readonly<Record<string, string>> = {
+    en: "eng", fr: "fra", es: "spa", de: "deu", pt: "por", it: "ita",
+    nl: "nld", ru: "rus", zh: "cmn", ja: "jpn", ko: "kor", ar: "arb",
+    sw: "swh", ur: "urd", hi: "hin", bn: "ben", id: "ind", tr: "tur",
+    th: "tha", vi: "vie", uk: "ukr", pl: "pol", fa: "pes", he: "heb",
+};
+
+/**
+ * Pull the ISO 639-3 base + optional Script subtag out of a project's
+ * language metadata, normalizing macrolanguages to OmniASR-served
+ * individuals. Returns `undefined` if we can't recover a 3-letter code.
+ */
+function extractBaseAndScript(
+ meta: AsrLanguageMetaInput | undefined
+): { base: string; explicitScript?: string; } | undefined {
+ if (!meta) return undefined;
+
+ // BCP-47-ish tag is the richest source: e.g. "swh", "ur-Arab", "zh-Hans".
+ const tag = (meta.tag || "").trim();
+ let base = "";
+ let explicitScript: string | undefined;
+
+ if (tag) {
+ const [primary, ...subtags] = tag.split(/[-_]/);
+ const lowered = (primary || "").toLowerCase();
+ if (lowered.length === 3) {
+ base = lowered;
+ } else if (lowered.length === 2) {
+ base = ISO1_TO_ISO3[lowered] ?? "";
+ }
+ // Script subtags are exactly 4 chars, title-case (Latn, Arab, Cyrl, ...).
+ const script = subtags.find((s) => s.length === 4);
+ if (script) {
+ explicitScript = script.charAt(0).toUpperCase() + script.slice(1).toLowerCase();
+ }
+ }
+
+ if (!base) {
+ base = (meta.iso2t || meta.iso2b || "").toLowerCase();
+ }
+ if (!base) {
+ const i1 = (meta.iso1 || "").toLowerCase();
+ base = ISO1_TO_ISO3[i1] ?? "";
+ }
+ if (!base) return undefined;
+
+ base = MACRO_TO_INDIVIDUAL[base] ?? base;
+ return { base, explicitScript };
+}
+
+/**
+ * `scriptPref` is what the user picked in the Script advanced setting.
+ *
+ * - `"auto"` → "best guess" (our default). Pick the script using
+ * `OMNI_ASR_DEFAULT_SCRIPTS`, falling back to Latin then
+ * the sole supported script.
+ * - `"latin"` → force Latin script when supported, otherwise fall back
+ * to auto behaviour.
+ * - any 4-char string (`"Arab"`, `"Cyrl"`, ...) → use that script.
+ */
+export type AsrScriptPref = "auto" | "latin" | string;
+
+/**
+ * Resolve a project's language metadata to an OmniASR-compatible
+ * `{iso639_3}_{Script}` code, or return `undefined` when we can't safely pick
+ * one (the caller should then omit the `lang` query param so the server
+ * transcribes without language conditioning).
+ *
+ * Selection priority:
+ * 1. Explicit `scriptPref` (4-letter ISO 15924 tag) → use as-is when
+ * `{base}_{Script}` is a supported code.
+ * 2. Script encoded in the project tag (e.g. `swa-Cyrl`) → ditto.
+ * 3. `scriptPref === "latin"` → Latin if supported.
+ * 4. `OMNI_ASR_DEFAULT_SCRIPTS[base]` (our hand-curated "best guess").
+ * 5. Latin if supported.
+ * 6. Sole supported script for this base.
+ * 7. `undefined` (genuinely ambiguous → let the server pick).
+ *
+ * Future work: a per-cell script override could short-circuit step 1.
+ */
+export function resolveOmniAsrCode(
+ meta: AsrLanguageMetaInput | undefined,
+ scriptPref: AsrScriptPref = "auto"
+): string | undefined {
+ const extracted = extractBaseAndScript(meta);
+ if (!extracted) return undefined;
+ const { base, explicitScript } = extracted;
+
+ // Find every supported script for this base.
+ const supportedScripts = OMNI_ASR_SUPPORTED_LANGS
+ .filter((c) => c.startsWith(`${base}_`))
+ .map((c) => c.split("_")[1]);
+ if (supportedScripts.length === 0) return undefined;
+
+ const tryCode = (script: string): string | undefined => {
+ const code = `${base}_${script}`;
+ return OMNI_ASR_SUPPORTED_LANG_SET.has(code) ? code : undefined;
+ };
+
+ // 1. Explicit user-chosen script (4-letter custom tag from advanced setting)
+ if (scriptPref && scriptPref !== "auto" && scriptPref !== "latin" && scriptPref.length === 4) {
+ const normalized = scriptPref.charAt(0).toUpperCase() + scriptPref.slice(1).toLowerCase();
+ const code = tryCode(normalized);
+ if (code) return code;
+ }
+
+ // 2. Script encoded in the project tag
+ if (explicitScript) {
+ const code = tryCode(explicitScript);
+ if (code) return code;
+ }
+
+ // 3. scriptPref === "latin" → Latin if supported
+ if (scriptPref === "latin") {
+ const code = tryCode("Latn");
+ if (code) return code;
+ }
+
+ // 4. Default script for this base
+ const defaultScript = OMNI_ASR_DEFAULT_SCRIPTS[base];
+ if (defaultScript) {
+ const code = tryCode(defaultScript);
+ if (code) return code;
+ }
+
+ // 5. Latin if supported
+ const latin = tryCode("Latn");
+ if (latin) return latin;
+
+ // 6. Sole supported script
+ if (supportedScripts.length === 1) {
+ return `${base}_${supportedScripts[0]}`;
+ }
+
+ // 7. Genuinely ambiguous
+ return undefined;
+}
+
+/** Split an OmniASR code like "swh_Latn" into base + script (or return null). */
+export function splitOmniAsrCode(code: string | undefined | null): { base: string; script: string; } | null {
+ if (!code) return null;
+ const m = /^([a-z]{2,3})_([A-Z][a-z]{3})$/.exec(code);
+ if (!m) return null;
+ return { base: m[1], script: m[2] };
+}
+
+/**
+ * SIL `Ref_Name` values are CamelCased with no spaces (e.g. "MinNanChinese").
+ * Split on case changes for natural-looking display: "Min Nan Chinese".
+ */
+function prettifyRefName(name: string): string {
+ return name
+ // Insert a space before any uppercase letter that follows a lowercase one.
+ .replace(/([a-z])([A-Z])/g, "$1 $2")
+ // And before an uppercase letter that's followed by a lowercase one
+ // (handles runs of acronyms like "USA").
+ .replace(/([A-Z])([A-Z][a-z])/g, "$1 $2")
+ .trim();
+}
+
+/**
+ * Friendly display name for a transcription's language badge.
+ *
+ * Inputs:
+ * - `serverLang` — the code OmniASR echoed back in its response (when we
+ * sent one). The primary source of truth.
+ * - `sentCode` — what we asked the server to use, in case it didn't
+ * echo (today the server only echoes when given a code).
+ * - `projectLanguageName` — `refName` of the project's target language, as
+ * a last-ditch fallback when we know we sent the
+ * project's code but the server omitted the echo.
+ *
+ * Returns `null` to mean "render no badge" (we have no honest label).
+ * The caller renders "Auto Detect" itself when in auto-detect mode and we
+ * have no detected-language info, so we never lie about it here.
+ */
+export function labelForTranscriptionLanguage(
+ serverLang: string | undefined | null,
+ sentCode: string | undefined | null,
+ projectLanguageName: string | undefined | null
+): string | null {
+ const friendly = (code: string | null | undefined): string | null => {
+ const parts = splitOmniAsrCode(code);
+ if (!parts) return null;
+ const refName = OMNI_ASR_FRIENDLY_NAMES[parts.base];
+ return refName ? prettifyRefName(refName) : null;
+ };
+
+ // 1. Server's echo is always the most truthful signal.
+ const fromServer = friendly(serverLang);
+ if (fromServer) return fromServer;
+
+ // 2. If we sent a code but the server didn't echo, the server still used
+ // what we sent — show that.
+ const fromSent = friendly(sentCode);
+ if (fromSent) return fromSent;
+
+ // 3. Last-ditch fallback: project language name, if any.
+ return projectLanguageName ? prettifyRefName(projectLanguageName) : null;
+}
diff --git a/sharedUtils/omniAsrDefaultScripts.ts b/sharedUtils/omniAsrDefaultScripts.ts
new file mode 100644
index 000000000..3155590fd
--- /dev/null
+++ b/sharedUtils/omniAsrDefaultScripts.ts
@@ -0,0 +1,77 @@
+/**
+ * OmniASR multi-script default-script table
+ * -----------------------------------------
+ *
+ * For each OmniASR language with **multiple supported scripts**, the script
+ * we should pick by default when the user has not specified one.
+ *
+ * Background
+ * ~~~~~~~~~~
+ * OmniASR codes are `{iso639_3}_{Script}` (e.g. `urd_Arab`). Almost every
+ * supported base language (1631 of 1650 unique bases) supports exactly one
+ * script, so the script choice is trivial. This file only lists the 19
+ * multi-script bases that need a real tiebreaker.
+ *
+ * Selection priority used by the resolver (`asrLanguageUtils.ts`):
+ * 1. Explicit script the user typed in the advanced setting
+ * 2. Script encoded in the project's language tag (e.g. `swa-Cyrl`)
+ * 3. **This table** (the "best guess")
+ * 4. Latin, if the language supports Latin
+ * 5. Sole supported script (if only one)
+ * 6. Omit `lang` (server runs without language conditioning)
+ *
+ * Source / rationale per entry
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * Picked using Unicode CLDR `likelySubtags.xml` (the official "if a user gives
+ * me a language tag with no script, what script should I assume?" table)
+ * cross-checked against modern majority usage. Macrolanguage → individual
+ * remaps (e.g. swa→swh, ara→arb, zho→cmn, kur→kmr) are handled in the
+ * resolver *before* lookup, so this table keys on the individual codes
+ * OmniASR actually serves.
+ *
+ * If you adjust an entry, leave a `// ←` note explaining why.
+ *
+ * Multi-script bases not listed here intentionally fall through to the later
+ * steps above (Latin if supported, otherwise omit `lang`). Add an entry here
+ * only when CLDR or modern majority usage clearly disagrees with that default.
+ *
+ * Regenerating
+ * ~~~~~~~~~~~~
+ * To rediscover which bases need entries (after a model update changes the
+ * supported set):
+ *
+ * curl -s "https://genesis-ai-dev--codex-asr-serve.modal.run/languages" \
+ * | python3 -c "
+ * import json, sys
+ * d = json.load(sys.stdin)
+ * bases = {}
+ * for l in d['languages']:
+ * b, s = l.split('_')
+ * bases.setdefault(b, set()).add(s)
+ * for b, ss in sorted(bases.items()):
+ * if len(ss) > 1:
+ * print(b, sorted(ss))
+ * "
+ */
+
+export const OMNI_ASR_DEFAULT_SCRIPTS: Readonly<Record<string, string>> = {
+    aze: "Latn", // Azerbaijani — modern standard (Republic of Azerbaijan) is Latin
+    bcc: "Arab", // Southern Balochi — written in Arabic script
+    cmn: "Hans", // Mandarin Chinese — Simplified is the more common default
+    cmo: "Khmr", // Central Mnong — Khmer-script orthography (community standard)
+    crk: "Cans", // Plains Cree — Canadian Aboriginal Syllabics is the traditional script
+    ell: "Grek", // Greek — only one substantive script; entry exists for completeness
+    gag: "Latn", // Gagauz — modern orthography is Latin
+    kmr: "Latn", // Northern Kurdish — Latin (Hawar) is the predominant modern script
+    lld: "Latn", // Ladin — only Latin; entry exists for completeness
+    ojb: "Latn", // Northwestern Ojibwa — Latin (double-vowel) is most common in print
+    rif: "Latn", // Tarifit Berber — Latin in modern publications (Tifinagh not in OmniASR)
+    rmc: "Latn", // Carpathian Romani — Latin in modern orthographies
+    rmy: "Latn", // Vlax Romani — Latin in modern orthographies
+    tuk: "Latn", // Turkmen — modern standard (Turkmenistan) is Latin
+    uig: "Arab", // Uyghur — Arabic-script (Uyghur Ereb Yëziqi) is the predominant script
+    urd: "Arab", // Urdu — Arabic-script (Nastaliq) is the canonical script
+    uzb: "Latn", // Uzbek — modern standard (Uzbekistan) is Latin
+    wal: "Ethi", // Wolaytta — Ethiopic (Geʽez) script in modern orthographies
+    yue: "Hant", // Cantonese — Traditional Chinese (Hong Kong / Guangzhou default)
+};
diff --git a/sharedUtils/omniAsrFriendlyNames.ts b/sharedUtils/omniAsrFriendlyNames.ts
new file mode 100644
index 000000000..e39632bcb
--- /dev/null
+++ b/sharedUtils/omniAsrFriendlyNames.ts
@@ -0,0 +1,1680 @@
+/**
+ * OmniASR friendly-name lookup
+ * ----------------------------
+ *
+ * Maps each OmniASR-supported ISO 639-3 base (1650 entries) to its English
+ * "reference name" from the SIL ISO 639-3 registry. Used to render the
+ * language badge after a transcription completes (e.g. `swh_Latn` → "Swahili").
+ *
+ * Notes
+ * ~~~~~
+ * - Keyed on the **base** (ISO 639-3), not the full OmniASR code, because the
+ * friendly name is the same regardless of script. Callers should strip the
+ * `_{Script}` suffix before lookup. The resolver in `asrLanguageUtils.ts`
+ * handles that.
+ * - Names are derived from SIL's `Ref_Name` field and stored with the spaces
+ *   removed, diacritics preserved (e.g. "ArbëreshëAlbanian"). The helper
+ *   `prettifyRefName()` in `asrLanguageUtils.ts` splits these on case changes
+ *   so they read naturally in the UI.
+ * - The 'nan' entry is added by hand (Min Nan Chinese) — SIL leaves Ref_Name
+ * blank for that code in the version we parsed.
+ *
+ * Regenerating
+ * ~~~~~~~~~~~~
+ * If OmniASR's supported set changes, regenerate from the SIL data already
+ * bundled in `src/utils/languageUtils.ts` using the snippet in
+ * `omniAsrSupportedLangs.ts`'s header (look up each base's `Ref_Name`).
+ */
+
+export const OMNI_ASR_FRIENDLY_NAMES: Readonly> = {
+ aae: "ArbëreshëAlbanian",
+ aal: "Afade",
+ abb: "Bankon",
+ abi: "Abidji",
+ abk: "Abkhazian",
+ abn: "Abua",
+ abp: "AbellenAyta",
+ abr: "Abron",
+ abs: "AmboneseMalay",
+ aca: "Achagua",
+ acd: "Gikyode",
+ ace: "Achinese",
+ acf: "SaintLucianCreoleFrench",
+ ach: "Acoli",
+ acm: "MesopotamianArabic",
+ acn: "Achang",
+ acr: "Achi",
+ acu: "Achuar-Shiwiar",
+ acw: "HijaziArabic",
+ ade: "Adele",
+ adh: "Adhola",
+ adj: "Adioukrou",
+ adx: "AmdoTibetan",
+ ady: "Adyghe",
+ aeb: "TunisianArabic",
+ aec: "SaidiArabic",
+ aeu: "Akeu",
+ afb: "GulfArabic",
+ afo: "Eloyi",
+ afr: "Afrikaans",
+ agd: "Agarabi",
+ agg: "Angor",
+ agn: "Agutaynen",
+ agr: "Aguaruna",
+ agu: "Aguacateco",
+ agx: "Aghul",
+ aha: "Ahanta",
+ ahk: "Akha",
+ ahl: "Igo",
+ ahs: "Ashe",
+ aia: "Arosi",
+ ajg: "Aja(Benin)",
+ aka: "Akan",
+ akb: "BatakAngkola",
+ ake: "Akawaio",
+ akp: "Siwu",
+ ala: "Alago",
+ alj: "Alangan",
+ aln: "GhegAlbanian",
+ alo: "Larike-Wakasihu",
+ alp: "Alune",
+ als: "ToskAlbanian",
+ alt: "SouthernAltai",
+ alz: "Alur",
+ ame: "Yanesha'",
+ amf: "Hamer-Banna",
+ amh: "Amharic",
+ ami: "Amis",
+ amk: "Ambai",
+ amu: "GuerreroAmuzgo",
+ anc: "Ngas",
+ ank: "Goemai",
+ ann: "Obolo",
+ anp: "Angika",
+ anw: "Anaang",
+ any: "Anyin",
+ aom: "Ömie",
+ aoz: "UabMeto",
+ apb: "Sa'a",
+ apc: "LevantineArabic",
+ apd: "SudaneseArabic",
+ apr: "Arop-Lokep",
+ arb: "StandardArabic",
+ arg: "Aragonese",
+ arl: "Arabela",
+ arq: "AlgerianArabic",
+ ars: "NajdiArabic",
+ ary: "MoroccanArabic",
+ arz: "EgyptianArabic",
+ asa: "Asu(Tanzania)",
+ asg: "Cishingini",
+ asm: "Assamese",
+ ast: "Asturian",
+ ata: "Pele-Ata",
+ atb: "Zaiwa",
+ atg: "IvbieNorth-Okpela-Arhe",
+ ati: "Attié",
+ atq: "Aralle-Tabulahan",
+ ava: "Avaric",
+ avn: "Avatime",
+ avu: "Avokaya",
+ awa: "Awadhi",
+ awb: "Awa(PapuaNewGuinea)",
+ awo: "Awak",
+ ayl: "LibyanArabic",
+ ayo: "Ayoreo",
+ ayp: "NorthMesopotamianArabic",
+ ayr: "CentralAymara",
+ ayz: "MaiBrat",
+ aze: "Azerbaijani",
+ azg: "SanPedroAmuzgosAmuzgo",
+ azz: "HighlandPueblaNahuatl",
+ bag: "Tuki",
+ bak: "Bashkir",
+ bam: "Bambara",
+ ban: "Balinese",
+ bao: "Waimaha",
+ bas: "Basa(Cameroon)",
+ bav: "Vengo",
+ bax: "Bamun",
+ bba: "Baatonum",
+ bbb: "Barai",
+ bbc: "BatakToba",
+ bbj: "Ghomálá'",
+ bbl: "Bats",
+ bbo: "NorthernBoboMadaré",
+ bbu: "Kulung(Nigeria)",
+ bcc: "SouthernBalochi",
+ bce: "Bamenyam",
+ bci: "Baoulé",
+ bcl: "CentralBikol",
+ bcs: "Kohumono",
+ bcw: "Bana",
+ bcy: "Bacama",
+ bcz: "Bainouk-Gunyaamolo",
+ bda: "Bayot",
+ bde: "Bade",
+ bdg: "Bonggi",
+ bdh: "Baka(SouthSudan)",
+ bdm: "Buduma",
+ bdq: "Bahnar",
+ bdu: "Oroko",
+ beb: "Bebele",
+ beh: "Biali",
+ bel: "Belarusian",
+ bem: "Bemba(Zambia)",
+ ben: "Bengali",
+ bep: "Besoa",
+ bew: "Betawi",
+ bex: "JurModo",
+ bfa: "Bari",
+ bfd: "Bafut",
+ bfo: "MalbaBirifor",
+ bft: "Balti",
+ bfy: "Bagheli",
+ bfz: "MahasuPahari",
+ bgc: "Haryanvi",
+ bgp: "EasternBalochi",
+ bgq: "Bagri",
+ bgr: "BawmChin",
+ bgt: "Bughotu",
+ bgw: "Bhatri",
+ bha: "Bharia",
+ bhb: "Bhili",
+ bhh: "Bukharic",
+ bho: "Bhojpuri",
+ bhp: "Bima",
+ bht: "Bhattiyali",
+ bhz: "Bada(Indonesia)",
+ bib: "Bissa",
+ bim: "Bimoba",
+ bis: "Bislama",
+ biv: "SouthernBirifor",
+ bjj: "Kanauji",
+ bjk: "Barok",
+ bjn: "Banjar",
+ bjr: "Binumarien",
+ bjt: "Balanta-Ganja",
+ bjv: "Bedjond",
+ bjw: "Bakwé",
+ bjz: "Baruga",
+ bkd: "Binukid",
+ bkh: "Bakoko",
+ bkm: "Kom(Cameroon)",
+ bkv: "Bekwarra",
+ bky: "Bokyi",
+ ble: "Balanta-Kentohe",
+ blh: "Kuwaa",
+ blt: "TaiDam",
+ blx: "Mag-IndiAyta",
+ blz: "Balantak",
+ bmm: "NorthernBetsimisarakaMalagasy",
+ bmq: "Bomu",
+ bmr: "Muinane",
+ bmu: "Somba-Siawari",
+ bmv: "Bum",
+ bng: "Benga",
+ bnm: "Batanga",
+ bnn: "Bunun",
+ bno: "Bantoanon",
+ bnp: "Bola",
+ bns: "Bundeli",
+ boa: "Bora",
+ bod: "Tibetan",
+ boj: "Anjam",
+ bom: "Berom",
+ bor: "Borôro",
+ bos: "Bosnian",
+ bou: "Bondei",
+ bov: "Tuwuli",
+ box: "Buamu",
+ bpr: "KoronadalBlaan",
+ bps: "SaranganiBlaan",
+ bqc: "Boko(Benin)",
+ bqg: "Bago-Kusuntu",
+ bqi: "Bakhtiari",
+ bqj: "Bandial",
+ bqp: "Busa",
+ bra: "Braj",
+ bre: "Breton",
+ brh: "Brahui",
+ bri: "Mokpwe",
+ bru: "EasternBru",
+ brx: "Bodo(India)",
+ bsc: "Bassari",
+ bsh: "Kati",
+ bsj: "Bangwinji",
+ bsk: "Burushaski",
+ bsq: "Bassa",
+ bss: "Akoose",
+ bsy: "SabahBisaya",
+ btd: "BatakDairi",
+ btm: "BatakMandailing",
+ bts: "BatakSimalungun",
+ btt: "Bete-Bendi",
+ btv: "Bateri",
+ btx: "BatakKaro",
+ bud: "Ntcham",
+ bug: "Buginese",
+ bul: "Bulgarian",
+ bum: "Bulu(Cameroon)",
+ buo: "Terei",
+ bus: "Bokobaru",
+ bux: "Boghom",
+ bvb: "Bube",
+ bvc: "Baelelea",
+ bvz: "Bauzi",
+ bwq: "SouthernBoboMadaré",
+ bwr: "Bura-Pabir",
+ bwu: "Buli(Ghana)",
+ bxf: "Bilur",
+ bxk: "Bukusu",
+ byc: "Ubaghara",
+ byr: "Baruya",
+ bys: "Burak",
+ byv: "Medumba",
+ byx: "Qaqet",
+ bzh: "MaposBuang",
+ bzi: "Bisu",
+ bzj: "BelizeKriolEnglish",
+ bzw: "Basa(Nigeria)",
+ caa: "Chortí",
+ cab: "Garifuna",
+ cac: "Chuj",
+ cak: "Kaqchikel",
+ cap: "Chipaya",
+ car: "GalibiCarib",
+ cas: "Tsimané",
+ cat: "Catalan",
+ cax: "Chiquitano",
+ cbc: "Carapana",
+ cbi: "Chachi",
+ cbr: "Cashibo-Cacataibo",
+ cbs: "Cashinahua",
+ cbt: "Chayahuita",
+ cbu: "Candoshi-Shapra",
+ cbv: "Cacua",
+ cce: "Chopi",
+ ccg: "SambaDaka",
+ cco: "ComaltepecChinantec",
+ cdj: "Churahi",
+ cdo: "MinDongChinese",
+ ceb: "Cebuano",
+ ceg: "Chamacoco",
+ cek: "EasternKhumiChin",
+ cen: "Cen",
+ ces: "Czech",
+ cfa: "Dijim-Bwilim",
+ cfm: "FalamChin",
+ cgc: "Kagayanen",
+ cgg: "Chiga",
+ che: "Chechen",
+ chf: "TabascoChontal",
+ chq: "QuiotepecChinantec",
+ chv: "Chuvash",
+ chz: "OzumacínChinantec",
+ cjk: "Chokwe",
+ cjo: "AshéninkaPajonal",
+ cjp: "Cabécar",
+ cjs: "Shor",
+ ckb: "CentralKurdish",
+ ckl: "Cibak",
+ cko: "Anufo",
+ ckr: "Kairak",
+ ckt: "Chukot",
+ cky: "Cakfem-Mushere",
+ cla: "Ron",
+ cle: "LealaoChinantec",
+ cly: "EasternHighlandChatino",
+ cme: "Cerma",
+ cmn: "MandarinChinese",
+ cmo: "CentralMnong",
+ cmr: "Mro-KhimiChin",
+ cnh: "HakhaChin",
+ cni: "Asháninka",
+ cnl: "LalanaChinantec",
+ cnt: "TepetotutlaChinantec",
+ coe: "Koreguaje",
+ cof: "Colorado",
+ cok: "SantaTeresaCora",
+ con: "Cofán",
+ cor: "Cornish",
+ cot: "Caquinte",
+ cou: "Wamey",
+ cpa: "PalantlaChinantec",
+ cpb: "Ucayali-YurúaAshéninka",
+ cpu: "PichisAshéninka",
+ cpx: "Pu-XianChinese",
+ cpy: "SouthUcayaliAshéninka",
+ crh: "CrimeanTatar",
+ crk: "PlainsCree",
+ crn: "ElNayarCora",
+ crq: "Iyo'wujwaChorote",
+ crs: "SeselwaCreoleFrench",
+ crt: "Iyojwa'jaChorote",
+ csk: "Jola-Kasa",
+ cso: "SochiapamChinantec",
+ ctd: "TedimChin",
+ cte: "TepinapaChinantec",
+ ctg: "Chittagonian",
+ ctl: "TlacoatzintepecChinantec",
+ cto: "Emberá-Catío",
+ ctu: "Chol",
+ cuc: "UsilaChinantec",
+ cui: "Cuiba",
+ cuk: "SanBlasKuna",
+ cul: "Culina",
+ cut: "TeutilaCuicatec",
+ cux: "TepeuxilaCuicatec",
+ cwa: "Kabwa",
+ cwe: "Kwere",
+ cwt: "Kuwaataay",
+ cya: "NopalaChatino",
+ cym: "Welsh",
+ daa: "Dangaléat",
+ dag: "Dagbani",
+ dah: "Gwahatike",
+ dan: "Danish",
+ dar: "Dargwa",
+ dav: "Taita",
+ dbd: "Dadiya",
+ dbj: "Ida'an",
+ dbq: "Daba",
+ dcc: "Deccan",
+ ddn: "Dendi(Benin)",
+ ded: "Dedua",
+ deg: "Degema",
+ des: "Desano",
+ deu: "German",
+ dga: "SouthernDagaare",
+ dgh: "Dghwede",
+ dgi: "NorthernDagara",
+ dgk: "Dagba",
+ dgo: "Dogri(individuallanguage)",
+ dgr: "Dogrib",
+ dhi: "Dhimal",
+ did: "Didinga",
+ dig: "Digo",
+ dik: "SouthwesternDinka",
+ dip: "NortheasternDinka",
+ div: "Dhivehi",
+ dje: "Zarma",
+ djk: "EasternMaroonCreole",
+ dmk: "Domaaki",
+ dml: "Dameli",
+ dnj: "Dan",
+ dnt: "MidGrandValleyDani",
+ dnw: "WesternDani",
+ dop: "Lukpa",
+ dos: "Dogosé",
+ dru: "Rukai",
+ dsb: "LowerSorbian",
+ dsh: "Daasanach",
+ dtp: "KadazanDusun",
+ dts: "ToroSoDogon",
+ dty: "Dotyali",
+ dua: "Duala",
+ dug: "Duruma",
+ dwr: "Dawro",
+ dyi: "DjiminiSenoufo",
+ dyo: "Jola-Fonyi",
+ dyu: "Dyula",
+ dzg: "Dazaga",
+ dzo: "Dzongkha",
+ ebu: "Embu",
+ ego: "Eggon",
+ eip: "Eipomek",
+ eiv: "Askopan",
+ eka: "Ekajuk",
+ ekk: "StandardEstonian",
+ eko: "Koti",
+ ekr: "Yace",
+ ell: "ModernGreek(1453-)",
+ elm: "Eleme",
+ emp: "NorthernEmberá",
+ enb: "Markweeta",
+ eng: "English",
+ enx: "Enxet",
+ epo: "Esperanto",
+ ese: "EseEjja",
+ ess: "CentralSiberianYupik",
+ esu: "CentralYupik",
+ eto: "Eton(Cameroon)",
+ ets: "Yekhee",
+ etu: "Ejagham",
+ eus: "Basque",
+ evn: "Evenki",
+ ewe: "Ewe",
+ ewo: "Ewondo",
+ eyo: "Keiyo",
+ eza: "Ezaa",
+ fal: "SouthFali",
+ fan: "Fang(EquatorialGuinea)",
+ fao: "Faroese",
+ far: "Fataleka",
+ fas: "Persian",
+ fat: "Fanti",
+ fia: "Nobiin",
+ fij: "Fijian",
+ fil: "Filipino",
+ fin: "Finnish",
+ fip: "Fipa",
+ fkk: "Kirya-Konzəl",
+ flr: "Fuliiru",
+ fmp: "Fe'fe'",
+ fmu: "FarWesternMuria",
+ fon: "Fon",
+ fra: "French",
+ frd: "Fordata",
+ fry: "WesternFrisian",
+ fub: "AdamawaFulfulde",
+ fuc: "Pulaar",
+ fue: "BorguFulfulde",
+ ful: "Fulah",
+ fuq: "Central-EasternNigerFulfulde",
+ fuv: "NigerianFulfulde",
+ gag: "Gagauz",
+ gai: "Borei",
+ gam: "Kandawo",
+ gau: "MudhiliGadaba",
+ gbi: "Galela",
+ gbk: "Gaddi",
+ gbm: "Garhwali",
+ gbo: "NorthernGrebo",
+ gbr: "Gbagyi",
+ gby: "Gbari",
+ gcc: "Mali",
+ gde: "Gude",
+ gdf: "Guduf-Gava",
+ geb: "Kire",
+ gej: "Gen",
+ ges: "Geser-Gorom",
+ ggg: "Gurgula",
+ gid: "Gidar",
+ gig: "Goaria",
+ gil: "Gilbertese",
+ giz: "SouthGiziga",
+ gjk: "KachiKoli",
+ gjn: "Gonja",
+ gju: "Gujari",
+ gkn: "Gokana",
+ gld: "Nanai",
+ gle: "Irish",
+ glg: "Galician",
+ glk: "Gilaki",
+ glv: "Manx",
+ glw: "Glavda",
+ gmv: "Gamo",
+ gna: "Kaansa",
+ gnd: "Zulgo-Gemzek",
+ gng: "Ngangam",
+ gof: "Gofa",
+ gog: "Gogo",
+ gol: "Gola",
+ gom: "GoanKonkani",
+ gor: "Gorontalo",
+ gqr: "Gor",
+ grc: "AncientGreek(to1453)",
+ gri: "Ghari",
+ grn: "Guarani",
+ grt: "Garo",
+ gsl: "Gusilay",
+ gso: "SouthwestGbaya",
+ gub: "Guajajára",
+ guc: "Wayuu",
+ gud: "YocobouéDida",
+ gug: "ParaguayanGuaraní",
+ guh: "Guahibo",
+ gui: "EasternBolivianGuaraní",
+ guj: "Gujarati",
+ guk: "Gumuz",
+ gum: "Guambiano",
+ guo: "Guayabero",
+ guq: "Aché",
+ gur: "Farefare",
+ guu: "Yanomamö",
+ gux: "Gourmanchéma",
+ guz: "Gusii",
+ gvc: "Guanano",
+ gvl: "Gulay",
+ gwc: "Gawri",
+ gwe: "Gweno",
+ gwi: "Gwichʼin",
+ gwr: "Gwere",
+ gwt: "Gawar-Bati",
+ gym: "Ngäbere",
+ gyr: "Guarayu",
+ gyz: "Geji",
+ had: "Hatam",
+ hag: "Hanga",
+ hah: "Hahon",
+ hak: "HakkaChinese",
+ hao: "Hakö",
+ hap: "Hupla",
+ hat: "Haitian",
+ hau: "Hausa",
+ haw: "Hawaiian",
+ hay: "Haya",
+ hbb: "Huba",
+ hch: "Huichol",
+ heb: "Hebrew",
+ heh: "Hehe",
+ her: "Herero",
+ hia: "Lamang",
+ hif: "FijiHindi",
+ hig: "Kamwe",
+ hil: "Hiligaynon",
+ hin: "Hindi",
+ hkk: "Hunjara-KainaKe",
+ hla: "Halia",
+ hlb: "Halbi",
+ hlt: "MatuChin",
+ hne: "Chhattisgarhi",
+ hnn: "Hanunoo",
+ hno: "NorthernHindko",
+ hns: "CaribbeanHindustani",
+ hoc: "Ho",
+ hrv: "Croatian",
+ hsb: "UpperSorbian",
+ hto: "MinicaHuitoto",
+ hub: "Huambisa",
+ hue: "SanFranciscoDelMarHuave",
+ hui: "Huli",
+ hul: "Hula",
+ hun: "Hungarian",
+ hus: "Huastec",
+ huu: "MuruiHuitoto",
+ huv: "SanMateoDelMarHuave",
+ hux: "NüpodeHuitoto",
+ hvn: "Sabu",
+ hwc: "Hawai'iCreoleEnglish",
+ hwo: "Hwana",
+ hye: "Armenian",
+ hyw: "WesternArmenian",
+ iba: "Iban",
+ ibb: "Ibibio",
+ ibo: "Igbo",
+ icr: "IslanderCreoleEnglish",
+ ida: "Idakho-Isukha-Tiriki",
+ idd: "EdeIdaca",
+ idu: "Idoma",
+ ifa: "AmganadIfugao",
+ ifb: "BatadIfugao",
+ ife: "Ifè",
+ ifk: "TuwaliIfugao",
+ ifu: "MayoyaoIfugao",
+ ify: "Keley-IKallahan",
+ igl: "Igala",
+ ign: "Ignaciano",
+ ijc: "Izon",
+ ijn: "Kalabari",
+ ikk: "Ika",
+ ikw: "Ikwere",
+ ilb: "Ila",
+ ilo: "Iloko",
+ imo: "Imbongu",
+ ina: "Interlingua(InternationalAuxiliaryLanguageAssociation)",
+ inb: "Inga",
+ ind: "Indonesian",
+ iou: "Tuma-Irumu",
+ ipi: "Ipili",
+ ipk: "Inupiaq",
+ iqw: "Ikwo",
+ iri: "Rigwe",
+ irk: "Iraqw",
+ ish: "Esan",
+ isl: "Icelandic",
+ iso: "Isoko",
+ ita: "Italian",
+ itl: "Itelmen",
+ its: "Isekiri",
+ itv: "Itawit",
+ itw: "Ito",
+ itz: "Itzá",
+ ixl: "Ixil",
+ izr: "Izere",
+ izz: "Izii",
+ jac: "Popti'",
+ jal: "Yalahatan",
+ jam: "JamaicanCreoleEnglish",
+ jav: "Javanese",
+ jax: "JambiMalay",
+ jbu: "JukunTakum",
+ jen: "Dza",
+ jic: "Tol",
+ jiv: "Shuar",
+ jmc: "Machame",
+ jmd: "Yamdena",
+ jmx: "WesternJuxtlahuacaMixtec",
+ jpn: "Japanese",
+ jqr: "Jaqaru",
+ juk: "Wapan",
+ jun: "Juang",
+ juo: "Jiba",
+ jvn: "CaribbeanJavanese",
+ kaa: "Kara-Kalpak",
+ kab: "Kabyle",
+ kac: "Kachin",
+ kai: "Karekare",
+ kaj: "Jju",
+ kak: "Kalanguya",
+ kam: "Kamba(Kenya)",
+ kan: "Kannada",
+ kao: "Xaasongaxango",
+ kaq: "Capanahua",
+ kas: "Kashmiri",
+ kat: "Georgian",
+ kay: "Kamayurá",
+ kaz: "Kazakh",
+ kbd: "Kabardian",
+ kbl: "Kanembu",
+ kbo: "Keliko",
+ kbp: "Kabiyè",
+ kbq: "Kamano",
+ kbr: "Kafa",
+ kbt: "Abadi",
+ kby: "MangaKanuri",
+ kca: "Khanty",
+ kcg: "Tyap",
+ kcn: "Nubi",
+ kcq: "Kamo",
+ kdc: "Kutu",
+ kde: "Makonde",
+ kdh: "Tem",
+ kdi: "Kumam",
+ kdj: "Karamojong",
+ kdl: "Tsikimba",
+ kdn: "Kunda",
+ kdt: "Kuy",
+ kea: "Kabuverdianu",
+ kek: "Kekchí",
+ ken: "Kenyang",
+ keo: "Kakwa",
+ ker: "Kera",
+ keu: "Akebu",
+ key: "Kupia",
+ kez: "Kukele",
+ kfb: "NorthwesternKolami",
+ kff: "Koya",
+ kfk: "Kinnauri",
+ kfq: "Korku",
+ kfr: "Kachhi",
+ kfw: "KharamNaga",
+ kfx: "KulluPahari",
+ kha: "Khasi",
+ khg: "KhamsTibetan",
+ khk: "HalhMongolian",
+ khm: "Khmer",
+ khq: "KoyraChiiniSonghay",
+ khw: "Khowar",
+ kia: "Kim",
+ kij: "Kilivila",
+ kik: "Kikuyu",
+ kin: "Kinyarwanda",
+ kir: "Kirghiz",
+ kix: "KhiamniunganNaga",
+ kjb: "Q'anjob'al",
+ kjc: "CoastalKonjo",
+ kje: "Kisar",
+ kjg: "Khmu",
+ kjh: "Khakas",
+ kjk: "HighlandKonjo",
+ kki: "Kagulu",
+ kkj: "Kako",
+ kle: "Kulung(Nepal)",
+ kln: "Kalenjin",
+ kls: "Kalasha",
+ klu: "Klao",
+ klv: "Maskelynes",
+ klw: "Tado",
+ kma: "Konni",
+ kmd: "MajukayangKalinga",
+ kml: "TanudanKalinga",
+ kmr: "NorthernKurdish",
+ kmu: "Kanite",
+ kmy: "Koma",
+ kna: "Dera(Nigeria)",
+ knb: "LubuaganKalinga",
+ knc: "CentralKanuri",
+ kne: "Kankanaey",
+ knf: "Mankanya",
+ knj: "WesternKanjobal",
+ knk: "Kuranko",
+ knn: "Konkani(individuallanguage)",
+ kno: "Kono(SierraLeone)",
+ kog: "Cogui",
+ kol: "Kol(PapuaNewGuinea)",
+ koo: "Konzo",
+ kor: "Korean",
+ kpo: "Ikposo",
+ kpq: "Korupun-Sela",
+ kps: "Tehit",
+ kpv: "Komi-Zyrian",
+ kpy: "Koryak",
+ kpz: "Kupsabiny",
+ kqe: "Kalagan",
+ kqo: "EasternKrahn",
+ kqp: "Kimré",
+ kqr: "Kimaragang",
+ kqy: "Koorete",
+ krc: "Karachay-Balkar",
+ kri: "Krio",
+ krj: "Kinaray-A",
+ krl: "Karelian",
+ krr: "Krung",
+ krs: "Gbaya(Sudan)",
+ kru: "Kurukh",
+ krx: "Karon",
+ ksb: "Shambala",
+ ksd: "Kuanua",
+ ksf: "Bafia",
+ ksr: "Borong",
+ kss: "SouthernKisi",
+ ksz: "Kodaku",
+ ktb: "Kambaata",
+ ktj: "PlapoKrumen",
+ kto: "Kuot",
+ kua: "Kuanyama",
+ kub: "Kutep",
+ kue: "Kuman(PapuaNewGuinea)",
+ kuh: "Kushi",
+ kum: "Kumyk",
+ kur: "Kurdish",
+ kus: "Kusaal",
+ kvn: "BorderKuna",
+ kvw: "Wersing",
+ kvx: "ParkariKoli",
+ kwd: "Kwaio",
+ kwf: "Kwara'ae",
+ kwi: "Awa-Cuaiquer",
+ kwm: "Kwambi",
+ kxc: "Konso",
+ kxf: "ManumanawKaren",
+ kxm: "NorthernKhmer",
+ kxp: "WadiyaraKoli",
+ kyb: "ButbutKalinga",
+ kyc: "Kyaka",
+ kyf: "Kouya",
+ kyg: "Keyagana",
+ kyo: "Kelon",
+ kyq: "Kenga",
+ kyu: "WesternKayah",
+ kyx: "Rapoisi",
+ kyz: "Kayabí",
+ kzf: "Da'aKaili",
+ kzi: "Kelabit",
+ lac: "Lacandon",
+ lag: "Rangi",
+ laj: "Lango(Uganda)",
+ lam: "Lamba",
+ lao: "Lao",
+ las: "Lama(Togo)",
+ lat: "Latin",
+ lav: "Latvian",
+ law: "Lauje",
+ lbj: "Ladakhi",
+ lbw: "Tolaki",
+ lcm: "Tungag",
+ lcp: "WesternLawa",
+ ldb: "Dũya",
+ led: "Lendu",
+ lee: "Lyélé",
+ lef: "Lelemi",
+ lem: "Nomaande",
+ lew: "LedoKaili",
+ lex: "Luang",
+ lgg: "Lugbara",
+ lgl: "Wala",
+ lhu: "Lahu",
+ lia: "West-CentralLimba",
+ lid: "Nyindrou",
+ lif: "Limbu",
+ lij: "Ligurian",
+ lin: "Lingala",
+ lip: "Sekpele",
+ lir: "LiberianEnglish",
+ lis: "Lisu",
+ lit: "Lithuanian",
+ lje: "Rampi",
+ ljp: "LampungApi",
+ lkb: "Kabras",
+ lke: "Kenyi",
+ lla: "Lala-Roba",
+ lld: "Ladin",
+ llg: "Lole",
+ lln: "Lele(Chad)",
+ lme: "Pévé",
+ lnd: "Lundayeh",
+ lns: "Lamnso'",
+ lnu: "Longuda",
+ loa: "Loloda",
+ lob: "Lobi",
+ lok: "Loko",
+ lom: "Loma(Liberia)",
+ lon: "MalawiLomwe",
+ loq: "Lobala",
+ lrk: "Loarki",
+ lsi: "Lashi",
+ lsm: "Saamia",
+ lss: "Lasi",
+ ltg: "Latgalian",
+ lth: "Thur",
+ lto: "Tsotso",
+ ltz: "Luxembourgish",
+ lua: "Luba-Lulua",
+ luc: "Aringa",
+ lug: "Ganda",
+ luo: "Luo(KenyaandTanzania)",
+ lus: "Lushai",
+ lwg: "Wanga",
+ lwo: "Luwo",
+ lww: "Lewo",
+ lzz: "Laz",
+ maa: "SanJerónimoTecóatlMazatec",
+ mab: "YutanduchiMixtec",
+ mad: "Madurese",
+ maf: "Mafa",
+ mag: "Magahi",
+ mah: "Marshallese",
+ mai: "Maithili",
+ maj: "JalapaDeDíazMazatec",
+ mak: "Makasar",
+ mal: "Malayalam",
+ mam: "Mam",
+ maq: "ChiquihuitlánMazatec",
+ mar: "Marathi",
+ mau: "HuautlaMazatec",
+ maw: "Mampruli",
+ max: "NorthMoluccanMalay",
+ maz: "CentralMazahua",
+ mbb: "WesternBukidnonManobo",
+ mbc: "Macushi",
+ mbh: "Mangseng",
+ mbj: "Nadëb",
+ mbt: "MatigsalugManobo",
+ mbu: "Mbula-Bwazza",
+ mca: "Maca",
+ mcb: "Machiguenga",
+ mcd: "Sharanahua",
+ mcf: "Matsés",
+ mco: "CoatlánMixe",
+ mcp: "Makaa",
+ mcq: "Ese",
+ mcu: "CameroonMambila",
+ mcx: "Mpiemo",
+ mda: "Mada(Nigeria)",
+ mdd: "Mbum",
+ mdv: "SantaLucíaMonteverdeMixtec",
+ mdy: "Male(Ethiopia)",
+ med: "Melpa",
+ mee: "Mengen",
+ meh: "SouthwesternTlaxiacoMixtec",
+ mej: "Meyah",
+ mek: "Mekeo",
+ mel: "CentralMelanau",
+ men: "Mende(SierraLeone)",
+ meq: "Merey",
+ mer: "Meru",
+ met: "Mato",
+ meu: "Motu",
+ mev: "Mano",
+ mfe: "Morisyen",
+ mfh: "Matal",
+ mfi: "Wandala",
+ mfk: "NorthMofu",
+ mfm: "MarghiSouth",
+ mfn: "CrossRiverMbembe",
+ mfo: "Mbe",
+ mfq: "Moba",
+ mfv: "Mandjak",
+ mfy: "Mayo",
+ mfz: "Mabaan",
+ mgd: "Moru",
+ mge: "Mango",
+ mgg: "Mpumpong",
+ mgh: "Makhuwa-Meetto",
+ mgi: "Lijili",
+ mgo: "Meta'",
+ mhi: "Ma'di",
+ mhk: "Mungaka",
+ mhr: "EasternMari",
+ mhu: "Digaro-Mishmi",
+ mhx: "Maru",
+ mhy: "Ma'anyan",
+ mib: "AtatláhucaMixtec",
+ mie: "OcotepecMixtec",
+ mif: "Mofu-Gudur",
+ mig: "SanMiguelElGrandeMixtec",
+ mih: "ChayucoMixtec",
+ mil: "PeñolesMixtec",
+ mim: "AlacatlatzalaMixtec",
+ min: "Minangkabau",
+ mio: "PinotepaNacionalMixtec",
+ mip: "Apasco-ApoalaMixtec",
+ miq: "Mískito",
+ mit: "SouthernPueblaMixtec",
+ miu: "CacaloxtepecMixtec",
+ miy: "AyutlaMixtec",
+ miz: "CoatzospanMixtec",
+ mjl: "Mandeali",
+ mjv: "Mannan",
+ mkd: "Macedonian",
+ mkf: "Miya",
+ mki: "Dhatki",
+ mkl: "Mokole",
+ mkn: "KupangMalay",
+ mlg: "Malagasy",
+ mlq: "WesternManinkakan",
+ mlt: "Maltese",
+ mmc: "MichoacánMazahua",
+ mmg: "NorthAmbrym",
+ mnb: "Muna",
+ mne: "Naba",
+ mnf: "Mundani",
+ mni: "Manipuri",
+ mnk: "Mandinka",
+ mnw: "Mon",
+ mnx: "Manikion",
+ moa: "Mwan",
+ mog: "Mongondow",
+ mon: "Mongolian",
+ mop: "MopánMaya",
+ mor: "Moro",
+ mos: "Mossi",
+ mox: "Molima",
+ moz: "Mukulu",
+ mpg: "Marba",
+ mpm: "YosondúaMixtec",
+ mpp: "Migabac",
+ mpx: "Misima-Panaeati",
+ mqb: "Mbuko",
+ mqf: "Momuna",
+ mqj: "Mamasa",
+ mqn: "Moronene",
+ mqy: "Manggarai",
+ mri: "Maori",
+ mrj: "WesternMari",
+ mrr: "Maria(India)",
+ mrt: "MarghiCentral",
+ mrw: "Maranao",
+ msh: "MasikoroMalagasy",
+ msi: "SabahMalay",
+ msw: "Mansoanka",
+ msy: "Aruamu",
+ mtd: "Mualang",
+ mtj: "Moskona",
+ mto: "TotontepecMixe",
+ mtr: "Mewari",
+ mtu: "TututepecMixtec",
+ mtx: "TidaáMixtec",
+ mua: "Mundang",
+ mug: "Musgu",
+ muh: "Mündü",
+ mui: "Musi",
+ mup: "Malvi",
+ mur: "Murle",
+ muv: "Muthuvan",
+ muy: "Muyang",
+ mve: "Marwari(Pakistan)",
+ mvp: "Duri",
+ mvy: "IndusKohistani",
+ mwq: "MünChin",
+ mwv: "Mentawai",
+ mxb: "TezoatlánMixtec",
+ mxq: "JuquilaMixe",
+ mxs: "HuitepecMixtec",
+ mxt: "JamiltepecMixtec",
+ mxu: "Mada(Cameroon)",
+ mxv: "MetlatónocMixtec",
+ mxy: "SoutheasternNochixtlánMixtec",
+ mya: "Burmese",
+ myb: "Mbay",
+ myk: "MamaraSenoufo",
+ myv: "Erzya",
+ myx: "Masaaba",
+ myy: "Macuna",
+ mza: "SantaMaríaZacatepecMixtec",
+ mzi: "IxcatlánMazatec",
+ mzj: "Manya",
+ mzk: "NigeriaMambila",
+ mzl: "MazatlánMixe",
+ mzm: "Mumuye",
+ mzw: "Deg",
+ nab: "SouthernNambikuára",
+ nag: "NagaPidgin",
+ nal: "Nalik",
+ nan: "Min Nan Chinese",
+ nap: "Neapolitan",
+ nas: "Naasioi",
+ naw: "Nawuri",
+ nbh: "Ngamo",
+ nca: "Iyo",
+ ncf: "Notsi",
+ nch: "CentralHuastecaNahuatl",
+ ncj: "NorthernPueblaNahuatl",
+ ncl: "MichoacánNahuatl",
+ nco: "Sibe",
+ ncu: "Chumburung",
+ ncx: "CentralPueblaNahuatl",
+ ndi: "SambaLeko",
+ ndj: "Ndamba",
+ ndo: "Ndonga",
+ ndp: "Ndo",
+ ndv: "Ndut",
+ ndy: "Lutos",
+ ndz: "Ndogo",
+ neb: "Toura(Côted'Ivoire)",
+ nep: "Nepali(macrolanguage)",
+ new: "Newari",
+ nfa: "Dhao",
+ nfr: "Nafaanra",
+ nga: "Ngbaka",
+ ngi: "Ngizim",
+ ngl: "Lomwe",
+ ngp: "Ngulu",
+ ngu: "GuerreroNahuatl",
+ nhe: "EasternHuastecaNahuatl",
+ nhg: "TetelcingoNahuatl",
+ nhi: "Zacatlán-Ahuacatlán-TepetzintlaNahuatl",
+ nhn: "CentralNahuatl",
+ nhq: "HuaxcalecaNahuatl",
+ nhu: "Noone",
+ nhw: "WesternHuastecaNahuatl",
+ nhx: "Isthmus-MecayapanNahuatl",
+ nhy: "NorthernOaxacaNahuatl",
+ nia: "Nias",
+ nij: "Ngaju",
+ nim: "Nilamba",
+ nin: "Ninzo",
+ nja: "Nzanyi",
+ nko: "Nkonya",
+ nla: "Ngombale",
+ nlc: "Nalca",
+ nld: "Dutch",
+ nlg: "Gela",
+ nlk: "NiniaYali",
+ nlv: "OrizabaNahuatl",
+ nmg: "Kwasio",
+ nmz: "Nawdm",
+ nnb: "Nande",
+ nnh: "Ngiemboon",
+ nnq: "Ngindo",
+ nnw: "SouthernNuni",
+ noa: "WounMeu",
+ nob: "NorwegianBokmål",
+ nod: "NorthernThai",
+ noe: "Nimadi",
+ nog: "Nogai",
+ not: "Nomatsiguenga",
+ npl: "SoutheasternPueblaNahuatl",
+ npy: "Napu",
+ nso: "Pedi",
+ nst: "TaseNaga",
+ nsu: "SierraNegraNahuatl",
+ ntm: "Nateni",
+ ntr: "Delo",
+ nuj: "Nyole",
+ nup: "Nupe-Nupe-Tako",
+ nus: "Nuer",
+ nuz: "TlamacazapaNahuatl",
+ nwb: "Nyabwa",
+ nxq: "Naxi",
+ nya: "Nyanja",
+ nyf: "Giryama",
+ nyn: "Nyankole",
+ nyo: "Nyoro",
+ nyu: "Nyungwe",
+ nyy: "Nyakyusa-Ngonde",
+ nzi: "Nzima",
+ obo: "OboManobo",
+ oci: "Occitan(post1500)",
+ odk: "Od",
+ odu: "Odual",
+ ogo: "Khana",
+ ojb: "NorthwesternOjibwa",
+ oku: "Oku",
+ old: "Mochi",
+ omw: "SouthTairora",
+ onb: "Lingao",
+ ood: "TohonoO'odham",
+ orc: "Orma",
+ orm: "Oromo",
+ oru: "Ormuri",
+ ory: "Odia",
+ oss: "Ossetian",
+ ote: "MezquitalOtomi",
+ otq: "QuerétaroOtomi",
+ ozm: "Koonzime",
+ pab: "Parecís",
+ pad: "Paumarí",
+ pag: "Pangasinan",
+ pam: "Pampanga",
+ pan: "Panjabi",
+ pao: "NorthernPaiute",
+ pap: "Papiamento",
+ pau: "Palauan",
+ pbb: "Páez",
+ pbc: "Patamona",
+ pbi: "Parkwa",
+ pbs: "CentralPame",
+ pbt: "SouthernPashto",
+ pbu: "NorthernPashto",
+ pce: "RuchingPalaung",
+ pcm: "NigerianPidgin",
+ pex: "Petats",
+ pez: "EasternPenan",
+ phl: "Phalura",
+ phr: "Pahari-Potwari",
+ pib: "Yine",
+ pil: "Yom",
+ pip: "Pero",
+ pir: "Piratapuyo",
+ pis: "Pijin",
+ piy: "Piya-Kwonci",
+ pjt: "Pitjantjatjara",
+ pkb: "Pokomo",
+ pko: "Pökoot",
+ plk: "KohistaniShina",
+ pls: "SanMarcosTlacoyalcoPopoloca",
+ plt: "PlateauMalagasy",
+ plw: "Brooke'sPointPalawano",
+ pmf: "Pamona",
+ pmq: "NorthernPame",
+ pms: "Piemontese",
+ pmy: "PapuanMalay",
+ pnb: "WesternPanjabi",
+ pne: "WesternPenan",
+ pny: "Pinyin",
+ poc: "Poqomam",
+ poe: "SanJuanAtzingoPopoloca",
+ poh: "Poqomchi'",
+ poi: "HighlandPopoluca",
+ pol: "Polish",
+ por: "Portuguese",
+ pov: "UpperGuineaCrioulo",
+ pow: "SanFelipeOtlaltepecPopoloca",
+ poy: "Pogolo",
+ ppk: "Uma",
+ pps: "SanLuísTemalacayucaPopoloca",
+ prf: "Paranan",
+ prk: "Parauk",
+ prq: "AshéninkaPerené",
+ prt: "Phai",
+ pse: "CentralMalay",
+ pss: "Kaulong",
+ pst: "CentralPashto",
+ ptu: "Bambam",
+ pua: "WesternHighlandPurepecha",
+ pui: "Puinave",
+ pus: "Pushto",
+ pwg: "Gapapaiwa",
+ pwn: "Paiwan",
+ pww: "PwoNorthernKaren",
+ pxm: "QuetzaltepecMixe",
+ qub: "HuallagaHuánucoQuechua",
+ quc: "K'iche'",
+ quf: "LambayequeQuechua",
+ qug: "ChimborazoHighlandQuichua",
+ quh: "SouthBolivianQuechua",
+ qul: "NorthBolivianQuechua",
+ qum: "Sipacapense",
+ qup: "SouthernPastazaQuechua",
+ qur: "YanahuancaPascoQuechua",
+ qus: "SantiagodelEsteroQuichua",
+ quv: "Sacapulteco",
+ quw: "TenaLowlandQuichua",
+ qux: "YauyosQuechua",
+ quy: "AyacuchoQuechua",
+ quz: "CuscoQuechua",
+ qva: "Ambo-PascoQuechua",
+ qvc: "CajamarcaQuechua",
+ qve: "EasternApurímacQuechua",
+ qvh: "Huamalíes-DosdeMayoHuánucoQuechua",
+ qvi: "ImbaburaHighlandQuichua",
+ qvj: "LojaHighlandQuichua",
+ qvl: "CajatamboNorthLimaQuechua",
+ qvm: "Margos-Yarowilca-LauricochaQuechua",
+ qvn: "NorthJunínQuechua",
+ qvo: "NapoLowlandQuechua",
+ qvs: "SanMartínQuechua",
+ qvw: "HuayllaWancaQuechua",
+ qvz: "NorthernPastazaQuichua",
+ qwa: "CorongoAncashQuechua",
+ qwh: "HuaylasAncashQuechua",
+ qws: "SihuasAncashQuechua",
+ qxa: "ChiquiánAncashQuechua",
+ qxh: "PanaoHuánucoQuechua",
+ qxl: "SalasacaHighlandQuichua",
+ qxn: "NorthernConchucosAncashQuechua",
+ qxo: "SouthernConchucosAncashQuechua",
+ qxp: "PunoQuechua",
+ qxr: "CañarHighlandQuichua",
+ qxt: "SantaAnadeTusiPascoQuechua",
+ qxu: "Arequipa-LaUniónQuechua",
+ qxw: "JaujaWancaQuechua",
+ rag: "Logooli",
+ rah: "Rabha",
+ rai: "Ramoaaina",
+ rap: "Rapanui",
+ rav: "Sampang",
+ raw: "Rawang",
+ rej: "Rejang",
+ rel: "Rendille",
+ rgu: "Ringgou",
+ rhg: "Rohingya",
+ rif: "Tarifit",
+ rim: "Nyaturu",
+ rjs: "Rajbanshi",
+ rkt: "Rangpuri",
+ rmc: "CarpathianRomani",
+ rmo: "SinteRomani",
+ rmy: "VlaxRomani",
+ rng: "Ronga",
+ rnl: "Ranglong",
+ rob: "Tae'",
+ rof: "Rombo",
+ roh: "Romansh",
+ rol: "Romblomanon",
+ ron: "Romanian",
+ roo: "Rotokas",
+ rop: "Kriol",
+ rro: "Waima",
+ rth: "Ratahan",
+ rub: "Gungu",
+ ruc: "Ruuli",
+ ruf: "Luguru",
+ rug: "Roviana",
+ run: "Rundi",
+ rus: "Russian",
+ rwm: "Amba(Uganda)",
+ rwr: "Marwari(India)",
+ sab: "Buglere",
+ sag: "Sango",
+ sah: "Yakut",
+ saj: "Sahu",
+ saq: "Samburu",
+ sas: "Sasak",
+ sau: "Saleman",
+ say: "Saya",
+ sba: "Ngambay",
+ sbd: "SouthernSamo",
+ sbl: "BotolanSambal",
+ sbn: "SindhiBhil",
+ sbp: "Sangu(Tanzania)",
+ sch: "Sakachep",
+ sck: "Sadri",
+ scl: "Shina",
+ scn: "Sicilian",
+ sco: "Scots",
+ sda: "Toraja-Sa'dan",
+ sdo: "Bukar-SadungBidayuh",
+ sea: "Semai",
+ seh: "Sena",
+ sei: "Seri",
+ ses: "KoyraboroSenniSonghai",
+ sey: "Secoya",
+ sgb: "Mag-antsiAyta",
+ sgj: "Surgujia",
+ sgw: "SebatBetGurage",
+ shi: "Tachelhit",
+ shk: "Shilluk",
+ shn: "Shan",
+ sho: "Shanga",
+ shp: "Shipibo-Conibo",
+ sid: "Sidamo",
+ sig: "Paasaal",
+ sil: "TumulungSisaala",
+ sin: "Sinhala",
+ sip: "Sikkimese",
+ siw: "Siwai",
+ sja: "Epena",
+ sjm: "Mapun",
+ sjp: "Surjapuri",
+ sjr: "Siar-Lak",
+ skg: "SakalavaMalagasy",
+ skr: "Saraiki",
+ sld: "Sissala",
+ slk: "Slovak",
+ slu: "Selaru",
+ slv: "Slovenian",
+ sml: "CentralSama",
+ smo: "Samoan",
+ sna: "Shona",
+ snc: "Sinaugoro",
+ snd: "Sindhi",
+ sne: "BauBidayuh",
+ snk: "Soninke",
+ snn: "Siona",
+ snp: "Siane",
+ snv: "Sa'ban",
+ snw: "Selee",
+ sol: "Solos",
+ som: "Somali",
+ soy: "Miyobe",
+ spa: "Spanish",
+ spp: "SupyireSenoufo",
+ sps: "Saposa",
+ spy: "Sabaot",
+ src: "LogudoreseSardinian",
+ srd: "Sardinian",
+ sri: "Siriano",
+ srm: "Saramaccan",
+ srn: "SrananTongo",
+ sro: "CampidaneseSardinian",
+ srp: "Serbian",
+ srr: "Serer",
+ srx: "Sirmauri",
+ ssi: "Sansi",
+ ste: "Liana-Seti",
+ stn: "Owa",
+ stp: "SoutheasternTepehuan",
+ sua: "Sulka",
+ suc: "WesternSubanon",
+ suk: "Sukuma",
+ sun: "Sundanese",
+ sur: "Mwaghavul",
+ sus: "Susu",
+ suv: "Puroik",
+ suz: "Sunwar",
+ sva: "Svan",
+ swe: "Swedish",
+ swh: "Swahili(individuallanguage)",
+ swv: "Shekhawati",
+ sxb: "Suba",
+ sxn: "Sangir",
+ sya: "Siang",
+ syl: "Sylheti",
+ sza: "Semelai",
+ szy: "Sakizaya",
+ tac: "LowlandTarahumara",
+ taj: "EasternTamang",
+ tam: "Tamil",
+ tan: "Tangale",
+ tao: "Yami",
+ tap: "Taabwa",
+ taq: "Tamasheq",
+ tar: "CentralTarahumara",
+ tat: "Tatar",
+ tav: "Tatuyo",
+ tay: "Atayal",
+ tbc: "Takia",
+ tbf: "Mandara",
+ tbg: "NorthTairora",
+ tbk: "CalamianTagbanwa",
+ tbl: "Tboli",
+ tby: "Tabaru",
+ tbz: "Ditammari",
+ tca: "Ticuna",
+ tcc: "Datooga",
+ tcf: "MalinaltepecMe'phaa",
+ tcy: "Tulu",
+ tcz: "ThadoChin",
+ tdj: "Tajio",
+ tdn: "Tondano",
+ tdx: "Tandroy-MahafalyMalagasy",
+ ted: "TepoKrumen",
+ tee: "HuehuetlaTepehua",
+ tel: "Telugu",
+ tem: "Timne",
+ teo: "Teso",
+ ter: "Tereno",
+ tew: "Tewa(USA)",
+ tex: "Tennet",
+ tfr: "Teribe",
+ tgc: "Tigak",
+ tgj: "Tagin",
+ tgk: "Tajik",
+ tgl: "Tagalog",
+ tgo: "Sudest",
+ tgp: "Tangoa",
+ tha: "Thai",
+ the: "ChitwaniaTharu",
+ thk: "Tharaka",
+ thl: "DangauraTharu",
+ thq: "KochilaTharu",
+ thr: "RanaTharu",
+ thv: "TahaggartTamahaq",
+ tig: "Tigre",
+ tih: "TimugonMurut",
+ tik: "Tikar",
+ tio: "Teop",
+ tir: "Tigrinya",
+ tkg: "TesakaMalagasy",
+ tkr: "Tsakhur",
+ tkt: "KathoriyaTharu",
+ tlb: "Tobelo",
+ tli: "Tlingit",
+ tlj: "Talinga-Bwisi",
+ tlp: "FilomenaMata-CoahuitlánTotonac",
+ tly: "Talysh",
+ tmc: "Tumak",
+ tmf: "Toba-Maskoy",
+ tna: "Tacana",
+ tng: "Tobanga",
+ tnk: "Kwamera",
+ tnn: "NorthTanna",
+ tnp: "Whitesands",
+ tnr: "Ménik",
+ tnt: "Tontemboan",
+ tob: "Toba",
+ toc: "CoyutlaTotonac",
+ toh: "Gitonga",
+ tok: "TokiPona",
+ tom: "Tombulu",
+ top: "PapantlaTotonac",
+ tos: "HighlandTotonac",
+ tpi: "TokPisin",
+ tpl: "TlacoapaMe'phaa",
+ tpm: "Tampulma",
+ tpp: "PisafloresTepehua",
+ tpt: "TlachichilcoTepehua",
+ tpz: "Tinputz",
+ tqp: "Tomoip",
+ trc: "CopalaTriqui",
+ tri: "Trió",
+ trn: "Trinitario",
+ trp: "KokBorok",
+ trq: "SanMartínItunyosoTriqui",
+ trs: "ChicahuaxtlaTriqui",
+ trv: "Sediq",
+ trw: "Torwali",
+ tsn: "Tswana",
+ tso: "Tsonga",
+ tsz: "Purepecha",
+ ttc: "Tektiteko",
+ tte: "Bwanabwana",
+ ttj: "Tooro",
+ ttq: "TawallammatTamajaq",
+ ttr: "Tera",
+ ttu: "Torau",
+ tue: "Tuyuca",
+ tuf: "CentralTunebo",
+ tui: "Tupuri",
+ tuk: "Turkmen",
+ tul: "Tula",
+ tuo: "Tucano",
+ tuq: "Tedaga",
+ tur: "Turkish",
+ tuv: "Turkana",
+ tuy: "Tugen",
+ tvo: "Tidore",
+ tvu: "Tunen",
+ tvw: "Sedoa",
+ twb: "WesternTawbuid",
+ twe: "Tewa(Indonesia)",
+ twu: "Termanu",
+ txa: "Tombonuo",
+ txq: "Tii",
+ txs: "Tonsea",
+ txu: "Kayapó",
+ txy: "TanosyMalagasy",
+ tye: "Kyanga",
+ tzh: "Tzeltal",
+ tzj: "Tz'utujil",
+ tzo: "Tzotzil",
+ ubl: "Buhi'nonBikol",
+ ubu: "Umbu-Ungu",
+ udl: "Wuzlam",
+ udm: "Udmurt",
+ udu: "Uduk",
+ uig: "Uighur",
+ uki: "Kui(India)",
+ ukr: "Ukrainian",
+ ukv: "Kuku",
+ umb: "Umbundu",
+ upv: "Uripiv-Wala-Rano-Atchin",
+ ura: "Urarina",
+ urb: "Urubú-Kaapor",
+ urd: "Urdu",
+ urh: "Urhobo",
+ urk: "UrakLawoi'",
+ urt: "Urat",
+ ury: "Orya",
+ ush: "Ushojo",
+ usp: "Uspanteco",
+ uzb: "Uzbek",
+ uzn: "NorthernUzbek",
+ vag: "Vagla",
+ vah: "Varhadi-Nagpuri",
+ vai: "Vai",
+ var: "Huarijio",
+ ver: "MomJango",
+ vid: "Vidunda",
+ vie: "Vietnamese",
+ vif: "Vili",
+ vmc: "JuxtlahuacaMixtec",
+ vmj: "IxtayutlaMixtec",
+ vmm: "MitlatongoMixtec",
+ vmp: "SoyaltepecMazatec",
+ vmw: "Makhuwa",
+ vmy: "AyautlaMazatec",
+ vmz: "MazatlánMazatec",
+ vro: "Võro",
+ vun: "Vunjo",
+ vut: "Vute",
+ wal: "Wolaytta",
+ wap: "Wapishana",
+ war: "Waray(Philippines)",
+ waw: "Waiwai",
+ way: "Wayana",
+ wba: "Warao",
+ wbl: "Wakhi",
+ wbr: "Wagdi",
+ wci: "WaciGbe",
+ weo: "Wemale",
+ wes: "CameroonPidgin",
+ wja: "Waja",
+ wji: "Warji",
+ wlo: "Wolio",
+ wlx: "Wali(Ghana)",
+ wmw: "Mwani",
+ wob: "WèNorthern",
+ wof: "GambianWolof",
+ wol: "Wolof",
+ wsg: "AdilabadGondi",
+ wwa: "Waama",
+ xal: "Kalmyk",
+ xdy: "MalayicDayak",
+ xed: "Hdi",
+ xer: "Xerénte",
+ xhe: "Khetrani",
+ xho: "Xhosa",
+ xka: "Kalkoti",
+ xkl: "MainstreamKenyah",
+ xmf: "Mingrelian",
+ xmm: "ManadoMalay",
+ xmv: "AntankaranaMalagasy",
+ xnj: "Ngoni(Tanzania)",
+ xnr: "Kangri",
+ xog: "Soga",
+ xon: "Konkomba",
+ xpe: "LiberiaKpelle",
+ xrb: "EasternKaraboro",
+ xsb: "Sambal",
+ xsm: "Kasem",
+ xsr: "Sherpa",
+ xsu: "Sanumá",
+ xta: "AlcozaucaMixtec",
+ xtd: "Diuxi-TilantongoMixtec",
+ xte: "Ketengban",
+ xti: "SinicahuaMixtec",
+ xtm: "MagdalenaPeñascoMixtec",
+ xtn: "NorthernTlaxiacoMixtec",
+ xtu: "CuyamecalcoMixtec",
+ xua: "AluKurumba",
+ xuo: "Kuo",
+ yaa: "Yaminahua",
+ yad: "Yagua",
+ yal: "Yalunka",
+ yam: "Yamba",
+ yao: "Yao",
+ yaq: "Yaqui",
+ yas: "Nugunu(Cameroon)",
+ yat: "Yambeta",
+ yav: "Yangben",
+ yay: "Agwagwune",
+ yaz: "Lokaa",
+ yba: "Yala",
+ ybb: "Yemba",
+ ycl: "Lolopo",
+ ycn: "Yucuna",
+ ydd: "EasternYiddish",
+ ydg: "Yidgha",
+ yea: "Ravula",
+ yer: "Tarok",
+ yes: "Nyankpa",
+ yka: "Yakan",
+ yli: "AnggurukYali",
+ yor: "Yoruba",
+ yre: "Yaouré",
+ yua: "Yucateco",
+ yue: "YueChinese",
+ yuz: "Yuracare",
+ yva: "Yawa",
+ zaa: "SierradeJuárezZapotec",
+ zab: "WesternTlacolulaValleyZapotec",
+ zac: "OcotlánZapotec",
+ zad: "CajonosZapotec",
+ zae: "YareniZapotec",
+ zai: "IsthmusZapotec",
+ zam: "MiahuatlánZapotec",
+ zao: "OzolotepecZapotec",
+ zaq: "AloápamZapotec",
+ zar: "RincónZapotec",
+ zas: "SantoDomingoAlbarradasZapotec",
+ zav: "YatzachiZapotec",
+ zaw: "MitlaZapotec",
+ zca: "CoatecasAltasZapotec",
+ zga: "Kinga",
+ zim: "Mesme",
+ ziw: "Zigula",
+ zmz: "Mbandja",
+ zne: "Zande(individuallanguage)",
+ zoc: "CopainaláZoque",
+ zoh: "ChimalapaZoque",
+ zor: "RayónZoque",
+ zos: "FranciscoLeónZoque",
+ zpc: "ChoapanZapotec",
+ zpg: "GueveaDeHumboldtZapotec",
+ zpi: "SantaMaríaQuiegolaniZapotec",
+ zpl: "LachixíoZapotec",
+ zpm: "MixtepecZapotec",
+ zpo: "AmatlánZapotec",
+ zpt: "SanVicenteCoatlánZapotec",
+ zpu: "YalálagZapotec",
+ zpv: "ChichicapanZapotec",
+ zpy: "MazaltepecZapotec",
+ zpz: "TexmelucanZapotec",
+ zsm: "StandardMalay",
+ ztg: "XanaguíaZapotec",
+ ztn: "SantaCatarinaAlbarradasZapotec",
+ ztp: "LoxichaZapotec",
+ ztq: "Quioquitani-QuieríZapotec",
+ zts: "TilquiapanZapotec",
+ ztu: "GüiláZapotec",
+ zty: "YateeZapotec",
+ zul: "Zulu",
+ zyb: "YongbeiZhuang",
+ zyp: "ZypheChin",
+ zza: "Zaza",
+};
\ No newline at end of file
diff --git a/sharedUtils/omniAsrSupportedLangs.ts b/sharedUtils/omniAsrSupportedLangs.ts
new file mode 100644
index 000000000..bafb5995a
--- /dev/null
+++ b/sharedUtils/omniAsrSupportedLangs.ts
@@ -0,0 +1,315 @@
+/**
+ * OmniASR supported-language snapshot
+ * -----------------------------------
+ *
+ * Static snapshot of the language codes supported by the OmniASR transcription
+ * service (Meta Omnilingual ASR — `omniASR_LLM_1B_v2`). Each entry is in
+ * `{iso639_3}_{Script}` form, e.g. `eng_Latn`, `swh_Latn`, `urd_Arab`.
+ *
+ * We bundle this list so the extension can validate / resolve language codes
+ * offline, with no runtime network dependency.
+ *
+ * Regenerating
+ * ~~~~~~~~~~~~
+ * If we change ASR providers or the underlying model, regenerate this file from
+ * the live `/languages` endpoint:
+ *
+ * curl -s "https://genesis-ai-dev--codex-asr-serve.modal.run/languages" \
+ * | python3 -c "
+ * import json, sys
+ * d = json.load(sys.stdin)
+ * langs = sorted(set(d['languages']))
+ * print('export const OMNI_ASR_SUPPORTED_LANGS: readonly string[] = [')
+ * for i in range(0, len(langs), 6):
+ *     print(' ' + ', '.join(f'\"{c}\"' for c in langs[i:i+6]) + ',')
+ * print('];')
+ * "
+ *
+ * (Pre-rename, the host was `genesis-ai-dev--mms-zeroshot-asr-serve.modal.run`.)
+ *
+ * Snapshot taken: 2026-06-04. Server reported 1672 languages.
+ */
+
+export const OMNI_ASR_SUPPORTED_LANGS: readonly string[] = [
+ "aae_Latn", "aal_Latn", "abb_Latn", "abi_Latn", "abk_Cyrl", "abn_Latn",
+ "abp_Latn", "abr_Latn", "abs_Latn", "aca_Latn", "acd_Latn", "ace_Latn",
+ "acf_Latn", "ach_Latn", "acm_Arab", "acn_Latn", "acr_Latn", "acu_Latn",
+ "acw_Arab", "ade_Latn", "adh_Latn", "adj_Latn", "adx_Tibt", "ady_Cyrl",
+ "aeb_Arab", "aec_Arab", "aeu_Latn", "afb_Arab", "afo_Latn", "afr_Latn",
+ "agd_Latn", "agg_Latn", "agn_Latn", "agr_Latn", "agu_Latn", "agx_Cyrl",
+ "aha_Latn", "ahk_Latn", "ahl_Latn", "ahs_Latn", "aia_Latn", "ajg_Latn",
+ "aka_Latn", "akb_Latn", "ake_Latn", "akp_Latn", "ala_Latn", "alj_Latn",
+ "aln_Latn", "alo_Latn", "alp_Latn", "als_Latn", "alt_Cyrl", "alz_Latn",
+ "ame_Latn", "amf_Latn", "amh_Ethi", "ami_Latn", "amk_Latn", "amu_Latn",
+ "anc_Latn", "ank_Latn", "ann_Latn", "anp_Deva", "anw_Latn", "any_Latn",
+ "aom_Latn", "aoz_Latn", "apb_Latn", "apc_Arab", "apd_Arab", "apr_Latn",
+ "arb_Arab", "arg_Latn", "arl_Latn", "arq_Arab", "ars_Arab", "ary_Arab",
+ "arz_Arab", "asa_Latn", "asg_Latn", "asm_Beng", "ast_Latn", "ata_Latn",
+ "atb_Latn", "atg_Latn", "ati_Latn", "atq_Latn", "ava_Cyrl", "avn_Latn",
+ "avu_Latn", "awa_Deva", "awb_Latn", "awo_Latn", "ayl_Arab", "ayo_Latn",
+ "ayp_Arab", "ayr_Latn", "ayz_Latn", "aze_Arab", "aze_Cyrl", "aze_Latn",
+ "azg_Latn", "azz_Latn", "bag_Latn", "bak_Cyrl", "bam_Latn", "ban_Latn",
+ "bao_Latn", "bas_Latn", "bav_Latn", "bax_Latn", "bba_Latn", "bbb_Latn",
+ "bbc_Latn", "bbj_Latn", "bbl_Geor", "bbo_Latn", "bbu_Latn", "bcc_Arab",
+ "bcc_Latn", "bce_Latn", "bci_Latn", "bcl_Latn", "bcs_Latn", "bcw_Latn",
+ "bcy_Latn", "bcz_Latn", "bda_Latn", "bde_Latn", "bdg_Latn", "bdh_Latn",
+ "bdm_Latn", "bdq_Latn", "bdu_Latn", "beb_Latn", "beh_Latn", "bel_Cyrl",
+ "bem_Latn", "ben_Beng", "bep_Latn", "bew_Latn", "bex_Latn", "bfa_Latn",
+ "bfd_Latn", "bfo_Latn", "bft_Arab", "bfy_Deva", "bfz_Deva", "bgc_Deva",
+ "bgp_Arab", "bgq_Deva", "bgr_Latn", "bgt_Latn", "bgw_Deva", "bha_Deva",
+ "bhb_Deva", "bhh_Cyrl", "bho_Deva", "bhp_Latn", "bht_Deva", "bhz_Latn",
+ "bib_Latn", "bim_Latn", "bis_Latn", "biv_Latn", "bjj_Deva", "bjk_Latn",
+ "bjn_Latn", "bjr_Latn", "bjt_Latn", "bjv_Latn", "bjw_Latn", "bjz_Latn",
+ "bkd_Latn", "bkh_Latn", "bkm_Latn", "bkv_Latn", "bky_Latn", "ble_Latn",
+ "blh_Latn", "blt_Latn", "blx_Latn", "blz_Latn", "bmm_Latn", "bmq_Latn",
+ "bmr_Latn", "bmu_Latn", "bmv_Latn", "bng_Beng", "bnm_Latn", "bnn_Latn",
+ "bno_Latn", "bnp_Latn", "bns_Deva", "boa_Latn", "bod_Tibt", "boj_Latn",
+ "bom_Latn", "bor_Latn", "bos_Latn", "bou_Latn", "bov_Latn", "box_Latn",
+ "bpr_Latn", "bps_Latn", "bqc_Latn", "bqg_Latn", "bqi_Arab", "bqj_Latn",
+ "bqp_Latn", "bra_Deva", "bre_Latn", "brh_Arab", "bri_Latn", "bru_Latn",
+ "brx_Deva", "bsc_Latn", "bsh_Arab", "bsj_Latn", "bsk_Latn", "bsq_Latn",
+ "bss_Latn", "bsy_Latn", "btd_Latn", "btm_Latn", "bts_Latn", "btt_Latn",
+ "btv_Arab", "btx_Latn", "bud_Latn", "bug_Latn", "bul_Cyrl", "bum_Latn",
+ "buo_Latn", "bus_Latn", "bux_Latn", "bvb_Latn", "bvc_Latn", "bvz_Latn",
+ "bwq_Latn", "bwr_Latn", "bwu_Latn", "bxf_Latn", "bxk_Latn", "byc_Latn",
+ "byr_Latn", "bys_Latn", "byv_Latn", "byx_Latn", "bzh_Latn", "bzi_Thai",
+ "bzj_Latn", "bzw_Latn", "caa_Latn", "cab_Latn", "cac_Latn", "cak_Latn",
+ "cap_Latn", "car_Latn", "cas_Latn", "cat_Latn", "cax_Latn", "cbc_Latn",
+ "cbi_Latn", "cbr_Latn", "cbs_Latn", "cbt_Latn", "cbu_Latn", "cbv_Latn",
+ "cce_Latn", "ccg_Latn", "cco_Latn", "cdj_Deva", "cdo_Hans", "ceb_Latn",
+ "ceg_Latn", "cek_Latn", "cen_Latn", "ces_Latn", "cfa_Latn", "cfm_Latn",
+ "cgc_Latn", "cgg_Latn", "che_Cyrl", "chf_Latn", "chq_Latn", "chv_Cyrl",
+ "chz_Latn", "cjk_Latn", "cjo_Latn", "cjp_Latn", "cjs_Cyrl", "ckb_Arab",
+ "ckl_Latn", "cko_Latn", "ckr_Latn", "ckt_Cyrl", "cky_Latn", "cla_Latn",
+ "cle_Latn", "cly_Latn", "cme_Latn", "cmn_Hans", "cmn_Hant", "cmo_Khmr",
+ "cmo_Latn", "cmr_Latn", "cnh_Latn", "cni_Latn", "cnl_Latn", "cnt_Latn",
+ "coe_Latn", "cof_Latn", "cok_Latn", "con_Latn", "cor_Latn", "cot_Latn",
+ "cou_Latn", "cpa_Latn", "cpb_Latn", "cpu_Latn", "cpx_Hans", "cpy_Latn",
+ "crh_Cyrl", "crk_Cans", "crk_Latn", "crn_Latn", "crq_Latn", "crs_Latn",
+ "crt_Latn", "csk_Latn", "cso_Latn", "ctd_Latn", "cte_Latn", "ctg_Beng",
+ "ctl_Latn", "cto_Latn", "ctu_Latn", "cuc_Latn", "cui_Latn", "cuk_Latn",
+ "cul_Latn", "cut_Latn", "cux_Latn", "cwa_Latn", "cwe_Latn", "cwt_Latn",
+ "cya_Latn", "cym_Latn", "daa_Latn", "dag_Latn", "dah_Latn", "dan_Latn",
+ "dar_Cyrl", "dav_Latn", "dbd_Latn", "dbj_Latn", "dbq_Latn", "dcc_Arab",
+ "ddn_Latn", "ded_Latn", "deg_Latn", "des_Latn", "deu_Latn", "dga_Latn",
+ "dgh_Latn", "dgi_Latn", "dgk_Latn", "dgo_Deva", "dgr_Latn", "dhi_Deva",
+ "did_Latn", "dig_Latn", "dik_Latn", "dip_Latn", "div_Thaa", "dje_Latn",
+ "djk_Latn", "dmk_Arab", "dml_Arab", "dnj_Latn", "dnt_Latn", "dnw_Latn",
+ "dop_Latn", "dos_Latn", "dru_Latn", "dsb_Latn", "dsh_Latn", "dtp_Latn",
+ "dts_Latn", "dty_Deva", "dua_Latn", "dug_Latn", "dwr_Latn", "dyi_Latn",
+ "dyo_Latn", "dyu_Latn", "dzg_Latn", "dzo_Tibt", "ebu_Latn", "ego_Latn",
+ "eip_Latn", "eiv_Latn", "eka_Latn", "ekk_Latn", "eko_Latn", "ekr_Latn",
+ "ell_Grek", "ell_Grek_cypr1249", "elm_Latn", "emp_Latn", "enb_Latn", "eng_Latn",
+ "enx_Latn", "epo_Latn", "ese_Latn", "ess_Latn", "esu_Latn", "eto_Latn",
+ "ets_Latn", "etu_Latn", "eus_Latn", "evn_Cyrl", "ewe_Latn", "ewo_Latn",
+ "eyo_Latn", "eza_Latn", "fal_Latn", "fan_Latn", "fao_Latn", "far_Latn",
+ "fas_Arab", "fat_Latn", "fia_Latn", "fij_Latn", "fil_Latn", "fin_Latn",
+ "fip_Latn", "fkk_Latn", "flr_Latn", "fmp_Latn", "fmu_Deva", "fon_Latn",
+ "fra_Latn", "frd_Latn", "fry_Latn", "fub_Latn", "fuc_Latn", "fue_Latn",
+ "ful_Latn", "fuq_Latn", "fuv_Latn", "gag_Cyrl", "gag_Latn", "gai_Latn",
+ "gam_Latn", "gau_Telu", "gbi_Latn", "gbk_Deva", "gbm_Deva", "gbo_Latn",
+ "gbr_Latn", "gby_Latn", "gcc_Latn", "gde_Latn", "gdf_Latn", "geb_Latn",
+ "gej_Latn", "ges_Latn", "ggg_Arab", "gid_Latn", "gig_Arab", "gil_Latn",
+ "giz_Latn", "gjk_Arab", "gjn_Latn", "gju_Arab", "gkn_Latn", "gld_Cyrl",
+ "gle_Latn", "glg_Latn", "glk_Arab", "glv_Latn", "glw_Latn", "gmv_Latn",
+ "gna_Latn", "gnd_Latn", "gng_Latn", "gof_Latn", "gog_Latn", "gol_Latn",
+ "gom_Deva", "gor_Latn", "gqr_Latn", "grc_Grek", "gri_Latn", "grn_Latn",
+ "grt_Beng", "gsl_Latn", "gso_Latn", "gub_Latn", "guc_Latn", "gud_Latn",
+ "gug_Latn", "guh_Latn", "gui_Latn", "guj_Gujr", "guk_Ethi", "gum_Latn",
+ "guo_Latn", "guq_Latn", "gur_Latn", "guu_Latn", "gux_Latn", "guz_Latn",
+ "gvc_Latn", "gvl_Latn", "gwc_Arab", "gwe_Latn", "gwi_Latn", "gwr_Latn",
+ "gwt_Arab", "gym_Latn", "gyr_Latn", "gyz_Latn", "had_Latn", "hag_Latn",
+ "hah_Latn", "hak_Latn", "hao_Latn", "hap_Latn", "hat_Latn", "hau_Latn",
+ "haw_Latn", "hay_Latn", "hbb_Latn", "hch_Latn", "heb_Hebr", "heh_Latn",
+ "her_Latn", "hia_Latn", "hif_Latn", "hig_Latn", "hil_Latn", "hin_Deva",
+ "hkk_Latn", "hla_Latn", "hlb_Deva", "hlt_Latn", "hne_Deva", "hnn_Latn",
+ "hno_Arab", "hns_Latn", "hoc_Orya", "hrv_Latn", "hsb_Latn", "hto_Latn",
+ "hub_Latn", "hue_Latn", "hui_Latn", "hul_Latn", "hun_Latn", "hus_Latn",
+ "huu_Latn", "huv_Latn", "hux_Latn", "hvn_Latn", "hwc_Latn", "hwo_Latn",
+ "hye_Armn", "hyw_Armn", "iba_Latn", "ibb_Latn", "ibo_Latn", "icr_Latn",
+ "ida_Latn", "idd_Latn", "idu_Latn", "ifa_Latn", "ifb_Latn", "ife_Latn",
+ "ifk_Latn", "ifu_Latn", "ify_Latn", "igl_Latn", "ign_Latn", "ijc_Latn",
+ "ijn_Latn", "ikk_Latn", "ikw_Latn", "ilb_Latn", "ilo_Latn", "imo_Latn",
+ "ina_Latn", "inb_Latn", "ind_Latn", "iou_Latn", "ipi_Latn", "ipk_Latn",
+ "iqw_Latn", "iri_Latn", "irk_Latn", "ish_Latn", "isl_Latn", "iso_Latn",
+ "ita_Latn", "itl_Cyrl", "its_Latn", "itv_Latn", "itw_Latn", "itz_Latn",
+ "ixl_Latn", "izr_Latn", "izz_Latn", "jac_Latn", "jal_Latn", "jam_Latn",
+ "jav_Latn", "jax_Latn", "jbu_Latn", "jen_Latn", "jic_Latn", "jiv_Latn",
+ "jmc_Latn", "jmd_Latn", "jmx_Latn", "jpn_Jpan", "jqr_Latn", "juk_Latn",
+ "jun_Orya", "juo_Latn", "jvn_Latn", "kaa_Cyrl", "kab_Latn", "kac_Latn",
+ "kai_Latn", "kaj_Latn", "kak_Latn", "kam_Latn", "kan_Knda", "kao_Latn",
+ "kaq_Latn", "kas_Arab", "kat_Geor", "kay_Latn", "kaz_Cyrl", "kbd_Cyrl",
+ "kbl_Latn", "kbo_Latn", "kbp_Latn", "kbq_Latn", "kbr_Latn", "kbt_Latn",
+ "kby_Latn", "kca_Cyrl", "kcg_Latn", "kcn_Latn", "kcq_Latn", "kdc_Latn",
+ "kde_Latn", "kdh_Latn", "kdi_Latn", "kdj_Latn", "kdl_Latn", "kdn_Latn",
+ "kdt_Khmr", "kea_Latn", "kek_Latn", "ken_Latn", "keo_Latn", "ker_Latn",
+ "keu_Latn", "key_Telu", "kez_Latn", "kfb_Deva", "kff_Telu", "kfk_Deva",
+ "kfq_Deva", "kfr_Gujr", "kfw_Latn", "kfx_Deva", "kha_Latn", "khg_Tibt",
+ "khk_Cyrl", "khm_Khmr", "khq_Latn", "khw_Arab", "kia_Latn", "kij_Latn",
+ "kik_Latn", "kin_Latn", "kir_Cyrl", "kix_Latn", "kjb_Latn", "kjc_Latn",
+ "kje_Latn", "kjg_Latn", "kjh_Cyrl", "kjk_Latn", "kki_Latn", "kkj_Latn",
+ "kle_Deva", "kln_Latn", "kls_Latn", "klu_Latn", "klv_Latn", "klw_Latn",
+ "kma_Latn", "kmd_Latn", "kml_Latn", "kmr_Arab", "kmr_Cyrl", "kmr_Latn",
+ "kmu_Latn", "kmy_Latn", "kna_Latn", "knb_Latn", "knc_Latn", "kne_Latn",
+ "knf_Latn", "knj_Latn", "knk_Latn", "knn_Deva", "kno_Latn", "kog_Latn",
+ "kol_Latn", "koo_Latn", "kor_Hang", "kpo_Latn", "kpq_Latn", "kps_Latn",
+ "kpv_Cyrl", "kpy_Cyrl", "kpz_Latn", "kqe_Latn", "kqo_Latn", "kqp_Latn",
+ "kqr_Latn", "kqy_Ethi", "krc_Cyrl", "kri_Latn", "krj_Latn", "krl_Latn",
+ "krr_Khmr", "krs_Latn", "kru_Deva", "krx_Latn", "ksb_Latn", "ksd_Latn",
+ "ksf_Latn", "ksr_Latn", "kss_Latn", "ksz_Deva", "ktb_Ethi", "ktj_Latn",
+ "kto_Latn", "kua_Latn", "kub_Latn", "kue_Latn", "kuh_Latn", "kum_Cyrl",
+ "kur_Arab", "kus_Latn", "kvn_Latn", "kvw_Latn", "kvx_Arab", "kwd_Latn",
+ "kwf_Latn", "kwi_Latn", "kwm_Latn", "kxc_Ethi", "kxf_Latn", "kxm_Thai",
+ "kxp_Arab", "kyb_Latn", "kyc_Latn", "kyf_Latn", "kyg_Latn", "kyo_Latn",
+ "kyq_Latn", "kyu_Kali", "kyx_Latn", "kyz_Latn", "kzf_Latn", "kzi_Latn",
+ "lac_Latn", "lag_Latn", "laj_Latn", "lam_Latn", "lao_Laoo", "las_Latn",
+ "lat_Latn", "lav_Latn", "law_Latn", "lbj_Tibt", "lbw_Latn", "lcm_Latn",
+ "lcp_Thai", "ldb_Latn", "led_Latn", "lee_Latn", "lef_Latn", "lem_Latn",
+ "lew_Latn", "lex_Latn", "lgg_Latn", "lgl_Latn", "lhu_Latn", "lia_Latn",
+ "lid_Latn", "lif_Deva", "lij_Latn", "lin_Latn", "lip_Latn", "lir_Latn",
+ "lis_Lisu", "lit_Latn", "lje_Latn", "ljp_Latn", "lkb_Latn", "lke_Latn",
+ "lla_Latn", "lld_Latn_gherd", "lld_Latn_valbadia", "llg_Latn", "lln_Latn", "lme_Latn",
+ "lnd_Latn", "lns_Latn", "lnu_Latn", "loa_Latn", "lob_Latn", "lok_Latn",
+ "lom_Latn", "lon_Latn", "loq_Latn", "lrk_Arab", "lsi_Latn", "lsm_Latn",
+ "lss_Arab", "ltg_Latn", "lth_Latn", "lto_Latn", "ltz_Latn", "lua_Latn",
+ "luc_Latn", "lug_Latn", "luo_Latn", "lus_Latn", "lwg_Latn", "lwo_Latn",
+ "lww_Latn", "lzz_Latn", "maa_Latn", "mab_Latn", "mad_Latn", "maf_Latn",
+ "mag_Deva", "mah_Latn", "mai_Deva", "maj_Latn", "mak_Latn", "mal_Mlym",
+ "mam_Latn", "maq_Latn", "mar_Deva", "mau_Latn", "maw_Latn", "max_Latn",
+ "maz_Latn", "mbb_Latn", "mbc_Latn", "mbh_Latn", "mbj_Latn", "mbt_Latn",
+ "mbu_Latn", "mca_Latn", "mcb_Latn", "mcd_Latn", "mcf_Latn", "mco_Latn",
+ "mcp_Latn", "mcq_Latn", "mcu_Latn", "mcx_Latn", "mda_Latn", "mdd_Latn",
+ "mdv_Latn", "mdy_Ethi", "med_Latn", "mee_Latn", "meh_Latn", "mej_Latn",
+ "mek_Latn", "mel_Latn", "men_Latn", "meq_Latn", "mer_Latn", "met_Latn",
+ "meu_Latn", "mev_Latn", "mfe_Latn", "mfh_Latn", "mfi_Latn", "mfk_Latn",
+ "mfm_Latn", "mfn_Latn", "mfo_Latn", "mfq_Latn", "mfv_Latn", "mfy_Latn",
+ "mfz_Latn", "mgd_Latn", "mge_Latn", "mgg_Latn", "mgh_Latn", "mgi_Latn",
+ "mgo_Latn", "mhi_Latn", "mhk_Latn", "mhr_Cyrl", "mhu_Latn", "mhx_Latn",
+ "mhy_Latn", "mib_Latn", "mie_Latn", "mif_Latn", "mig_Latn", "mih_Latn",
+ "mil_Latn", "mim_Latn", "min_Latn", "mio_Latn", "mip_Latn", "miq_Latn",
+ "mit_Latn", "miu_Latn", "miy_Latn", "miz_Latn", "mjl_Deva", "mjv_Mlym",
+ "mkd_Cyrl", "mkf_Latn", "mki_Arab", "mkl_Latn", "mkn_Latn", "mlg_Latn",
+ "mlq_Latn", "mlt_Latn", "mmc_Latn", "mmg_Latn", "mnb_Latn", "mne_Latn",
+ "mnf_Latn", "mni_Beng", "mnk_Latn", "mnw_Mymr", "mnx_Latn", "moa_Latn",
+ "mog_Latn", "mon_Cyrl", "mop_Latn", "mor_Latn", "mos_Latn", "mox_Latn",
+ "moz_Latn", "mpg_Latn", "mpm_Latn", "mpp_Latn", "mpx_Latn", "mqb_Latn",
+ "mqf_Latn", "mqj_Latn", "mqn_Latn", "mqy_Latn", "mri_Latn", "mrj_Cyrl",
+ "mrr_Deva", "mrt_Latn", "mrw_Latn", "msh_Latn", "msi_Latn", "msw_Latn",
+ "msy_Latn", "mtd_Latn", "mtj_Latn", "mto_Latn", "mtr_Deva", "mtu_Latn",
+ "mtx_Latn", "mua_Latn", "mug_Latn", "muh_Latn", "mui_Latn", "mup_Deva",
+ "mur_Latn", "muv_Mlym", "muy_Latn", "mve_Arab", "mvp_Latn", "mvy_Arab",
+ "mwq_Latn", "mwv_Latn", "mxb_Latn", "mxq_Latn", "mxs_Latn", "mxt_Latn",
+ "mxu_Latn", "mxv_Latn", "mxy_Latn", "mya_Mymr", "myb_Latn", "myk_Latn",
+ "myv_Cyrl", "myx_Latn", "myy_Latn", "mza_Latn", "mzi_Latn", "mzj_Latn",
+ "mzk_Latn", "mzl_Latn", "mzm_Latn", "mzw_Latn", "nab_Latn", "nag_Latn",
+ "nal_Latn", "nan_Latn", "nap_Latn", "nas_Latn", "naw_Latn", "nbh_Latn",
+ "nca_Latn", "ncf_Latn", "nch_Latn", "ncj_Latn", "ncl_Latn", "nco_Latn",
+ "ncu_Latn", "ncx_Latn", "ndi_Latn", "ndj_Latn", "ndo_Latn", "ndp_Latn",
+ "ndv_Latn", "ndy_Latn", "ndz_Latn", "neb_Latn", "nep_Deva", "new_Deva",
+ "nfa_Latn", "nfr_Latn", "nga_Latn", "ngi_Latn", "ngl_Latn", "ngp_Latn",
+ "ngu_Latn", "nhe_Latn", "nhg_Latn", "nhi_Latn", "nhn_Latn", "nhq_Latn",
+ "nhu_Latn", "nhw_Latn", "nhx_Latn", "nhy_Latn", "nia_Latn", "nij_Latn",
+ "nim_Latn", "nin_Latn", "nja_Latn", "nko_Latn", "nla_Latn", "nlc_Latn",
+ "nld_Latn", "nlg_Latn", "nlk_Latn", "nlv_Latn", "nmg_Latn", "nmz_Latn",
+ "nnb_Latn", "nnh_Latn", "nnq_Latn", "nnw_Latn", "noa_Latn", "nob_Latn",
+ "nod_Thai", "noe_Deva", "nog_Cyrl", "not_Latn", "npl_Latn", "npy_Latn",
+ "nso_Latn", "nst_Latn", "nsu_Latn", "ntm_Latn", "ntr_Latn", "nuj_Latn",
+ "nup_Latn", "nus_Latn", "nuz_Latn", "nwb_Latn", "nxq_Latn", "nya_Latn",
+ "nyf_Latn", "nyn_Latn", "nyo_Latn", "nyu_Latn", "nyy_Latn", "nzi_Latn",
+ "obo_Latn", "oci_Latn", "odk_Arab", "odu_Latn", "ogo_Latn", "ojb_Cans",
+ "ojb_Latn", "oku_Latn", "old_Latn", "omw_Latn", "onb_Latn", "ood_Latn",
+ "orc_Latn", "orm_Latn", "oru_Arab", "ory_Orya", "oss_Cyrl", "ote_Latn",
+ "otq_Latn", "ozm_Latn", "pab_Latn", "pad_Latn", "pag_Latn", "pam_Latn",
+ "pan_Guru", "pao_Latn", "pap_Latn", "pau_Latn", "pbb_Latn", "pbc_Latn",
+ "pbi_Latn", "pbs_Latn", "pbt_Arab", "pbu_Arab", "pce_Thai", "pcm_Latn",
+ "pex_Latn", "pez_Latn", "phl_Arab", "phr_Arab", "pib_Latn", "pil_Latn",
+ "pip_Latn", "pir_Latn", "pis_Latn", "piy_Latn", "pjt_Latn", "pkb_Latn",
+ "pko_Latn", "plk_Arab", "pls_Latn", "plt_Latn", "plw_Latn", "pmf_Latn",
+ "pmq_Latn", "pms_Latn", "pmy_Latn", "pnb_Arab", "pne_Latn", "pny_Latn",
+ "poc_Latn", "poe_Latn", "poh_Latn", "poi_Latn", "pol_Latn", "por_Latn",
+ "pov_Latn", "pow_Latn", "poy_Latn", "ppk_Latn", "pps_Latn", "prf_Latn",
+ "prk_Latn", "prq_Latn", "prt_Thai", "pse_Latn", "pss_Latn", "pst_Arab",
+ "ptu_Latn", "pua_Latn", "pui_Latn", "pus_Arab", "pwg_Latn", "pwn_Latn",
+ "pww_Thai", "pxm_Latn", "qub_Latn", "quc_Latn", "quf_Latn", "qug_Latn",
+ "quh_Latn", "qul_Latn", "qum_Latn", "qup_Latn", "qur_Latn", "qus_Latn",
+ "quv_Latn", "quw_Latn", "qux_Latn", "quy_Latn", "quz_Latn", "qva_Latn",
+ "qvc_Latn", "qve_Latn", "qvh_Latn", "qvi_Latn", "qvj_Latn", "qvl_Latn",
+ "qvm_Latn", "qvn_Latn", "qvo_Latn", "qvs_Latn", "qvw_Latn", "qvz_Latn",
+ "qwa_Latn", "qwh_Latn", "qws_Latn", "qxa_Latn", "qxh_Latn", "qxl_Latn",
+ "qxn_Latn", "qxo_Latn", "qxp_Latn", "qxr_Latn", "qxt_Latn", "qxu_Latn",
+ "qxw_Latn", "rag_Latn", "rah_Beng", "rai_Latn", "rap_Latn", "rav_Deva",
+ "raw_Latn", "rej_Latn", "rel_Latn", "rgu_Latn", "rhg_Latn", "rif_Arab",
+ "rif_Latn", "rim_Latn", "rjs_Deva", "rkt_Beng", "rmc_Cyrl", "rmc_Latn",
+ "rmo_Latn", "rmy_Cyrl", "rmy_Latn", "rng_Latn", "rnl_Latn", "rob_Latn",
+ "rof_Latn", "roh_Latn_surs1244", "rol_Latn", "ron_Latn", "roo_Latn", "rop_Latn",
+ "rro_Latn", "rth_Latn", "rub_Latn", "ruc_Latn", "ruf_Latn", "rug_Latn",
+ "run_Latn", "rus_Cyrl", "rwm_Latn", "rwr_Deva", "sab_Latn", "sag_Latn",
+ "sah_Cyrl", "saj_Latn", "saq_Latn", "sas_Latn", "sau_Latn", "say_Latn",
+ "sba_Latn", "sbd_Latn", "sbl_Latn", "sbn_Arab", "sbp_Latn", "sch_Latn",
+ "sck_Deva", "scl_Arab", "scn_Latn", "sco_Latn", "sda_Latn", "sdo_Latn",
+ "sea_Latn", "seh_Latn", "sei_Latn", "ses_Latn", "sey_Latn", "sgb_Latn",
+ "sgj_Deva", "sgw_Ethi", "shi_Latn", "shk_Latn", "shn_Mymr", "sho_Latn",
+ "shp_Latn", "sid_Latn", "sig_Latn", "sil_Latn", "sin_Sinh", "sip_Tibt",
+ "siw_Latn", "sja_Latn", "sjm_Latn", "sjp_Deva", "sjr_Latn", "skg_Latn",
+ "skr_Arab", "sld_Latn", "slk_Latn", "slu_Latn", "slv_Latn", "sml_Latn",
+ "smo_Latn", "sna_Latn", "snc_Latn", "snd_Arab", "sne_Latn", "snk_Latn",
+ "snn_Latn", "snp_Latn", "snv_Latn", "snw_Latn", "sol_Latn", "som_Latn",
+ "soy_Latn", "spa_Latn", "spp_Latn", "sps_Latn", "spy_Latn", "src_Latn",
+ "srd_Latn", "sri_Latn", "srm_Latn", "srn_Latn", "sro_Latn", "srp_Cyrl",
+ "srr_Latn", "srx_Deva", "ssi_Arab", "ste_Latn", "stn_Latn", "stp_Latn",
+ "sua_Latn", "suc_Latn", "suk_Latn", "sun_Latn", "sur_Latn", "sus_Latn",
+ "suv_Latn", "suz_Deva", "sva_Geor", "swe_Latn", "swh_Latn", "swv_Deva",
+ "sxb_Latn", "sxn_Latn", "sya_Latn", "syl_Latn", "sza_Latn", "szy_Latn",
+ "tac_Latn", "taj_Deva", "tam_Taml", "tan_Latn", "tao_Latn", "tap_Latn",
+ "taq_Latn", "tar_Latn", "tat_Cyrl", "tav_Latn", "tay_Latn", "tbc_Latn",
+ "tbf_Latn", "tbg_Latn", "tbk_Latn", "tbl_Latn", "tby_Latn", "tbz_Latn",
+ "tca_Latn", "tcc_Latn", "tcf_Latn", "tcy_Mlym", "tcz_Latn", "tdj_Latn",
+ "tdn_Latn", "tdx_Latn", "ted_Latn", "tee_Latn", "tel_Telu", "tem_Latn",
+ "teo_Latn", "ter_Latn", "tew_Latn", "tex_Latn", "tfr_Latn", "tgc_Latn",
+ "tgj_Latn", "tgk_Cyrl", "tgl_Latn", "tgo_Latn", "tgp_Latn", "tha_Thai",
+ "the_Deva", "thk_Latn", "thl_Deva", "thq_Deva", "thr_Deva", "thv_Tfng",
+ "tig_Ethi", "tih_Latn", "tik_Latn", "tio_Latn", "tir_Ethi", "tkg_Latn",
+ "tkr_Latn", "tkt_Deva", "tlb_Latn", "tli_Latn", "tlj_Latn", "tlp_Latn",
+ "tly_Latn", "tmc_Latn", "tmf_Latn", "tna_Latn", "tng_Latn", "tnk_Latn",
+ "tnn_Latn", "tnp_Latn", "tnr_Latn", "tnt_Latn", "tob_Latn", "toc_Latn",
+ "toh_Latn", "tok_Latn", "tom_Latn", "top_Latn", "tos_Latn", "tpi_Latn",
+ "tpl_Latn", "tpm_Latn", "tpp_Latn", "tpt_Latn", "tpz_Latn", "tqp_Latn",
+ "trc_Latn", "tri_Latn", "trn_Latn", "trp_Latn", "trq_Latn", "trs_Latn",
+ "trv_Latn", "trw_Arab", "tsn_Latn", "tso_Latn", "tsz_Latn", "ttc_Latn",
+ "tte_Latn", "ttj_Latn", "ttq_Tfng", "ttr_Latn", "ttu_Latn", "tue_Latn",
+ "tuf_Latn", "tui_Latn", "tuk_Arab", "tuk_Latn", "tul_Latn", "tuo_Latn",
+ "tuq_Latn", "tur_Latn", "tuv_Latn", "tuy_Latn", "tvo_Latn", "tvu_Latn",
+ "tvw_Latn", "twb_Latn", "twe_Latn", "twu_Latn", "txa_Latn", "txq_Latn",
+ "txs_Latn", "txu_Latn", "txy_Latn", "tye_Latn", "tzh_Latn", "tzj_Latn",
+ "tzo_Latn", "ubl_Latn", "ubu_Latn", "udl_Latn", "udm_Cyrl", "udu_Latn",
+ "uig_Arab", "uig_Cyrl", "uki_Orya", "ukr_Cyrl", "ukv_Latn", "umb_Latn",
+ "upv_Latn", "ura_Latn", "urb_Latn", "urd_Arab", "urd_Deva", "urd_Latn",
+ "urh_Latn", "urk_Thai", "urt_Latn", "ury_Latn", "ush_Arab", "usp_Latn",
+ "uzb_Cyrl", "uzb_Latn", "uzn_Latn", "vag_Latn", "vah_Deva", "vai_Latn",
+ "var_Latn", "ver_Latn", "vid_Latn", "vie_Latn", "vif_Latn", "vmc_Latn",
+ "vmj_Latn", "vmm_Latn", "vmp_Latn", "vmw_Latn", "vmy_Latn", "vmz_Latn",
+ "vro_Latn", "vun_Latn", "vut_Latn", "wal_Ethi", "wal_Latn", "wap_Latn",
+ "war_Latn", "waw_Latn", "way_Latn", "wba_Latn", "wbl_Latn", "wbr_Deva",
+ "wci_Latn", "weo_Latn", "wes_Latn", "wja_Latn", "wji_Latn", "wlo_Latn",
+ "wlx_Latn", "wmw_Latn", "wob_Latn", "wof_Latn", "wol_Latn", "wsg_Telu",
+ "wwa_Latn", "xal_Cyrl", "xdy_Latn", "xed_Latn", "xer_Latn", "xhe_Arab",
+ "xho_Latn", "xka_Arab", "xkl_Latn", "xmf_Geor", "xmm_Latn", "xmv_Latn",
+ "xnj_Latn", "xnr_Deva", "xog_Latn", "xon_Latn", "xpe_Latn", "xrb_Latn",
+ "xsb_Latn", "xsm_Latn", "xsr_Deva", "xsu_Latn", "xta_Latn", "xtd_Latn",
+ "xte_Latn", "xti_Latn", "xtm_Latn", "xtn_Latn", "xtu_Latn", "xua_Taml",
+ "xuo_Latn", "yaa_Latn", "yad_Latn", "yal_Latn", "yam_Latn", "yao_Latn",
+ "yaq_Latn", "yas_Latn", "yat_Latn", "yav_Latn", "yay_Latn", "yaz_Latn",
+ "yba_Latn", "ybb_Latn", "ycl_Latn", "ycn_Latn", "ydd_Hebr", "ydg_Arab",
+ "yea_Mlym", "yer_Latn", "yes_Latn", "yka_Latn", "yli_Latn", "yor_Latn",
+ "yre_Latn", "yua_Latn", "yue_Hans", "yue_Hant", "yuz_Latn", "yva_Latn",
+ "zaa_Latn", "zab_Latn", "zac_Latn", "zad_Latn", "zae_Latn", "zai_Latn",
+ "zam_Latn", "zao_Latn", "zaq_Latn", "zar_Latn", "zas_Latn", "zav_Latn",
+ "zaw_Latn", "zca_Latn", "zga_Latn", "zim_Latn", "ziw_Latn", "zmz_Latn",
+ "zne_Latn", "zoc_Latn", "zoh_Latn", "zor_Latn", "zos_Latn", "zpc_Latn",
+ "zpg_Latn", "zpi_Latn", "zpl_Latn", "zpm_Latn", "zpo_Latn", "zpt_Latn",
+ "zpu_Latn", "zpv_Latn", "zpy_Latn", "zpz_Latn", "zsm_Latn", "ztg_Latn",
+ "ztn_Latn", "ztp_Latn", "ztq_Latn", "zts_Latn", "ztu_Latn", "zty_Latn",
+ "zul_Latn", "zyb_Latn", "zyp_Latn", "zza_Latn",
+];
+
+export const OMNI_ASR_SUPPORTED_LANG_SET: ReadonlySet<string> = new Set(OMNI_ASR_SUPPORTED_LANGS); // Set form of the snapshot, for membership checks.
diff --git a/src/copilotSettings/copilotSettings.ts b/src/copilotSettings/copilotSettings.ts
index fbe927f47..073fd4953 100644
--- a/src/copilotSettings/copilotSettings.ts
+++ b/src/copilotSettings/copilotSettings.ts
@@ -122,11 +122,10 @@ export async function openSystemMessageEditor() {
try {
const config = vscode.workspace.getConfiguration("codex-editor-extension");
const settings = {
- endpoint: config.get("asrEndpoint", "wss://ryderwishart--asr-websocket-transcription-fastapi-asgi.modal.run/ws/transcribe"),
- provider: config.get("asrProvider", "mms"),
- model: config.get("asrModel", "facebook/mms-1b-all"),
+ endpoint: config.get("asrEndpoint", "https://genesis-ai-dev--codex-asr-serve.modal.run/transcribe"),
+ provider: config.get("asrProvider", "omniasr"),
+ model: config.get("asrModel", "omniASR_LLM_1B_v2"),
language: config.get("asrLanguage", "eng"),
- phonetic: config.get("asrPhonetic", false),
};
panel.webview.postMessage({ command: "asrSettings", data: settings });
} catch (error) {
@@ -143,7 +142,6 @@ export async function openSystemMessageEditor() {
await config.update("asrProvider", message.data?.provider, target);
await config.update("asrModel", message.data?.model, target);
await config.update("asrLanguage", message.data?.language, target);
- await config.update("asrPhonetic", !!message.data?.phonetic, target);
panel.webview.postMessage({ command: "asrSettingsSaved" });
} catch (error) {
console.error("[CopilotSettings] Failed to save ASR settings:", error);
diff --git a/src/providers/codexCellEditorProvider/codexCellEditorMessagehandling.ts b/src/providers/codexCellEditorProvider/codexCellEditorMessagehandling.ts
index 39a560b99..bb8d18069 100644
--- a/src/providers/codexCellEditorProvider/codexCellEditorMessagehandling.ts
+++ b/src/providers/codexCellEditorProvider/codexCellEditorMessagehandling.ts
@@ -484,6 +484,25 @@ const messageHandlers: Record Promise("asrEndpoint", "http://localhost:8000/api/v1/asr/transcribe");
+ // ASR language plumbing — see sharedUtils/asrLanguageUtils.ts for the resolver
+ // contract. The webview drives "auto-detect" vs "use project language" via the
+ // gear menu on the Transcribe button; that picker is persisted to the workspace
+ // setting `asrLanguageMode`.
+ const { resolveOmniAsrCode } = await import("../../../sharedUtils/asrLanguageUtils");
+ const projectConfig = vscode.workspace.getConfiguration("codex-project-manager");
+ const targetLanguage = projectConfig.get("targetLanguage") as
+ | { tag?: string; refName?: string; iso1?: string; iso2t?: string; iso2b?: string; }
+ | undefined;
+ const languageMode = (config.get("asrLanguageMode", "project") === "auto"
+ ? "auto"
+ : "project") as "auto" | "project";
+ const scriptPref = config.get("asrScriptPref", "auto");
+ const resolvedCode =
+ languageMode === "auto"
+ ? undefined
+ : resolveOmniAsrCode(targetLanguage, scriptPref);
+ const projectLanguageName = targetLanguage?.refName;
+
let authToken: string | undefined;
// Try to get authenticated endpoint from FrontierAPI
@@ -536,10 +555,17 @@ const messageHandlers: Record Promise Promise {
+ const typedEvent = event as Extract;
+ const mode = typedEvent.content?.mode === "auto" ? "auto" : "project";
+ try {
+ await vscode.workspace
+ .getConfiguration("codex-editor-extension")
+ .update("asrLanguageMode", mode, vscode.ConfigurationTarget.Workspace);
+ } catch (err) {
+ console.warn("Failed to update asrLanguageMode", err);
+ }
+ // Rebroadcast so the webview can refresh its local asrConfig snapshot.
+ await messageHandlers.getAsrConfig({ webviewPanel } as any);
+ },
+
+    /**
+     * Persist the webview's ASR script preference to the workspace setting
+     * `asrScriptPref` (under `codex-editor-extension`), then rebroadcast the ASR config
+     * so the webview can refresh its local snapshot.
+     *
+     * Accepted values: the keywords "auto" / "latin" (matched case-insensitively and
+     * stored lower-case) or a 4-letter ISO 15924 tag, stored title-cased
+     * (e.g. "arab" -> "Arab"). Anything else falls back to "auto".
+     */
+    setAsrScriptPref: async ({ event, webviewPanel }) => {
+        const typedEvent = event as Extract<EditorPostMessages, { command: "setAsrScriptPref"; }>;
+        const rawPref = typedEvent.content?.scriptPref;
+        const normalized = (() => {
+            if (typeof rawPref !== "string") {
+                return "auto";
+            }
+            // Keywords are checked first and case-insensitively: "Auto"/"AUTO" would
+            // otherwise satisfy the 4-letter regex below and be persisted as a bogus
+            // script tag "Auto", while "Latin" (five letters) would silently become "auto".
+            const keyword = rawPref.toLowerCase();
+            if (keyword === "auto" || keyword === "latin") {
+                return keyword;
+            }
+            if (/^[A-Za-z]{4}$/.test(rawPref)) {
+                // Canonical ISO 15924 casing: first letter upper, remaining three lower.
+                return rawPref.charAt(0).toUpperCase() + rawPref.slice(1).toLowerCase();
+            }
+            return "auto";
+        })();
+        try {
+            await vscode.workspace
+                .getConfiguration("codex-editor-extension")
+                .update("asrScriptPref", normalized, vscode.ConfigurationTarget.Workspace);
+        } catch (err) {
+            // Best-effort: a failed write is only logged; the rebroadcast below still runs.
+            console.warn("Failed to update asrScriptPref", err);
+        }
+        await messageHandlers.getAsrConfig({ webviewPanel } as any);
+    },
+
updateCellAfterTranscription: async ({ event, document, webviewPanel, provider }) => {
const typedEvent = event as Extract;
const { cellId, transcribedText, language } = typedEvent.content;
@@ -574,7 +632,12 @@ const messageHandlers: Record Promise
-
+
Codex Cell Editor
diff --git a/src/providers/mainMenu/mainMenuProvider.ts b/src/providers/mainMenu/mainMenuProvider.ts
index 0395e4692..e8f60e9f4 100644
--- a/src/providers/mainMenu/mainMenuProvider.ts
+++ b/src/providers/mainMenu/mainMenuProvider.ts
@@ -705,7 +705,7 @@ export class MainMenuProvider extends BaseWebviewProvider {
}
case "getAsrSettings": {
const config = vscode.workspace.getConfiguration("codex-editor-extension");
- let endpoint = config.get("asrEndpoint", "wss://ryderwishart--asr-websocket-transcription-fastapi-asgi.modal.run/ws/transcribe");
+ let endpoint = config.get("asrEndpoint", "https://genesis-ai-dev--codex-asr-serve.modal.run/transcribe");
let authToken: string | undefined;
// Try to get authenticated endpoint from FrontierAPI
@@ -745,7 +745,7 @@ export class MainMenuProvider extends BaseWebviewProvider {
new URL(endpoint);
} catch (urlError) {
console.error("Invalid ASR endpoint configuration:", endpoint, urlError);
- endpoint = "wss://ryderwishart--asr-websocket-transcription-fastapi-asgi.modal.run/ws/transcribe";
+ endpoint = "https://genesis-ai-dev--codex-asr-serve.modal.run/transcribe";
}
// Warn if using authenticated endpoint without token
@@ -756,10 +756,9 @@ export class MainMenuProvider extends BaseWebviewProvider {
const settings = {
endpoint,
- provider: config.get("asrProvider", "mms"),
- model: config.get("asrModel", "facebook/mms-1b-all"),
+ provider: config.get("asrProvider", "omniasr"),
+ model: config.get("asrModel", "omniASR_LLM_1B_v2"),
language: config.get("asrLanguage", "eng"),
- phonetic: config.get("asrPhonetic", false),
authToken,
};
if (this._view) {
@@ -774,7 +773,6 @@ export class MainMenuProvider extends BaseWebviewProvider {
await config.update("asrProvider", (message as any).data?.provider, target);
await config.update("asrModel", (message as any).data?.model, target);
await config.update("asrLanguage", (message as any).data?.language, target);
- await config.update("asrPhonetic", !!(message as any).data?.phonetic, target);
if (this._view) {
safePostMessageToView(this._view, { command: "asrSettingsSaved" }, "MainMenu");
}
diff --git a/types/index.d.ts b/types/index.d.ts
index 4a62f3622..55445c080 100644
--- a/types/index.d.ts
+++ b/types/index.d.ts
@@ -576,10 +576,24 @@ export type EditorPostMessages =
content: {
cellId: string;
transcribedText: string;
- language: string;
+ /** OmniASR `{iso639_3}_{Script}` code the server reported (or that we sent and the server
+ * used silently). `null` when transcription ran in auto-detect mode and the server did
+ * not echo a language back. Persisted on the audio attachment so the badge survives
+ * re-renders. */
+ language: string | null;
};
}
| { command: "getAsrConfig"; }
+ | {
+ command: "setAsrLanguageMode";
+ content: { mode: "auto" | "project"; };
+ }
+ | {
+ command: "setAsrScriptPref";
+ /** `"auto"` (best guess), `"latin"` (force Latin where supported), or a 4-letter
+ * ISO 15924 tag (`"Arab"`, `"Cyrl"`, ...). */
+ content: { scriptPref: string; };
+ }
| {
command: "mergeCellWithPrevious";
content: {
@@ -2150,7 +2164,25 @@ type EditorReceiveMessages =
milestoneIndex?: number;
subsectionIndex?: number;
}
- | { type: "asrConfig"; content: { endpoint: string; authToken?: string; }; }
+ | {
+ type: "asrConfig";
+ content: {
+ endpoint: string;
+ authToken?: string;
+ /** OmniASR `{iso639_3}_{Script}` code to send as `?lang=...`. Omitted when the
+ * user picks Auto-Detect or when we can't safely resolve a code. */
+ lang?: string;
+ /** "project" (default) → send `lang`. "auto" → omit `lang`, let the server transcribe
+ * without language conditioning. Persisted as workspace setting `asrLanguageMode`. */
+ languageMode: "auto" | "project";
+ /** Script preference: "auto" (best guess), "latin", or a 4-letter ISO 15924 tag.
+ * Persisted as workspace setting `asrScriptPref`. */
+ scriptPref?: string;
+ /** Project target-language refName, e.g. "Swahili". Used as the badge fallback when
+ * the server doesn't echo `lang` in the response. */
+ projectLanguageName?: string;
+ };
+ }
| { type: "startBatchTranscription"; content: { count: number; }; }
| {
type: "providerConfirmsBacktranslationSet";
diff --git a/webviews/codex-webviews/src/CodexCellEditor/AudioWaveformWithTranscription.tsx b/webviews/codex-webviews/src/CodexCellEditor/AudioWaveformWithTranscription.tsx
index 65cb49161..4c7052466 100644
--- a/webviews/codex-webviews/src/CodexCellEditor/AudioWaveformWithTranscription.tsx
+++ b/webviews/codex-webviews/src/CodexCellEditor/AudioWaveformWithTranscription.tsx
@@ -2,10 +2,23 @@ import React, { useEffect, useState } from "react";
import { CustomWaveformCanvas } from "./CustomWaveformCanvas.tsx";
import { Button } from "../components/ui/button";
import { Badge } from "../components/ui/badge";
-import { MessageCircle, Copy, Loader2, Trash2, History, Mic } from "lucide-react";
+import { MessageCircle, Copy, Loader2, Trash2, History, Mic, Settings as SettingsIcon } from "lucide-react";
import type { ValidationStatusIconProps } from "./AudioValidationStatusIcon.tsx";
import { AudioValidationBadge } from "./AudioValidationBadge.tsx";
import type { AudioValidationPopoverProps } from "./AudioValidationBadge.tsx";
+import {
+ Popover,
+ PopoverContent,
+ PopoverTrigger,
+} from "../components/ui/popover";
+import {
+ Select,
+ SelectContent,
+ SelectItem,
+ SelectTrigger,
+ SelectValue,
+} from "../components/ui/select";
+import { Input } from "../components/ui/input";
interface AudioWaveformWithTranscriptionProps {
audioUrl: string;
@@ -15,6 +28,10 @@ interface AudioWaveformWithTranscriptionProps {
timestamp: number;
language?: string;
} | null;
+ /** Pre-computed friendly label for the language badge ("Swahili", "Auto Detect", or null
+ * for "render nothing"). Computed by the caller via `labelForTranscriptionLanguage()`
+ * from sharedUtils/asrLanguageUtils.ts so this component stays presentational. */
+ transcriptionLanguageLabel?: string | null;
isTranscribing: boolean;
transcriptionProgress: number;
onTranscribe: () => void;
@@ -31,6 +48,17 @@ interface AudioWaveformWithTranscriptionProps {
targetDuration?: number | null; // Target duration (in seconds) derived from cell timestamps.
/** Total number of audio recordings for the cell (including soft-deleted). When > 0, a count badge is rendered on the History button. */
historyCount?: number;
+ // Advanced ASR settings (gear menu, next to the Transcribe button).
+ /** Whether to display the gear menu. Hide on source-text editors where the user can't drive transcription policy. */
+ showAdvancedAsrMenu?: boolean;
+ /** Current language mode. Determines the chevron position in the gear menu. */
+ asrLanguageMode?: "auto" | "project";
+ /** Current script preference: "auto", "latin", or a 4-letter ISO 15924 tag (e.g. "Arab"). */
+ asrScriptPref?: string;
+ /** Friendly project-language label for the "Project language" radio (e.g. "Swahili"). */
+ projectLanguageName?: string;
+ onChangeAsrLanguageMode?: (mode: "auto" | "project") => void;
+ onChangeAsrScriptPref?: (pref: string) => void;
}
const AudioWaveformWithTranscription: React.FC = ({
@@ -52,10 +80,40 @@ const AudioWaveformWithTranscription: React.FC {
const [audioSrc, setAudioSrc] = useState("");
const [audioDuration, setAudioDuration] = useState(null);
+ // The Script picker offers two presets ("auto", "latin") plus a "Custom" option that
+ // reveals a free-form 4-letter input for power users (e.g. someone wants `swa_Cyrl` even
+ // though the resolver would never pick it). We track the *dropdown* selection separately
+ // from the committed `asrScriptPref` so "Custom" shows the input before a valid tag exists.
+ type ScriptOption = "auto" | "latin" | "custom";
+ const optionFromPref = (pref: string): ScriptOption =>
+ pref === "auto" ? "auto" : pref === "latin" ? "latin" : "custom";
+ const [scriptSelection, setScriptSelection] = useState(
+ optionFromPref(asrScriptPref)
+ );
+ const [scriptCustomDraft, setScriptCustomDraft] = useState(
+ optionFromPref(asrScriptPref) === "custom" ? asrScriptPref : ""
+ );
+ useEffect(() => {
+ const next = optionFromPref(asrScriptPref);
+ setScriptSelection(next);
+ if (next === "custom") setScriptCustomDraft(asrScriptPref);
+ }, [asrScriptPref]);
+ const commitCustomScript = () => {
+ const candidate = scriptCustomDraft.trim();
+ if (/^[A-Za-z]{4}$/.test(candidate)) onChangeAsrScriptPref?.(candidate);
+ };
+
// Prefer the provided URL (can be blob: or data:). Fall back to creating an object URL from the blob.
useEffect(() => {
if (audioUrl) {
@@ -142,11 +200,17 @@ const AudioWaveformWithTranscription: React.FC
{transcription.content}
- {transcription.language && (
-
- {transcription.language}
-
- )}
+ {/* Language badge intentionally hidden in this PR.
+ The new `codex-asr` Modal app DOES run MMS-LID and echo back a
+ `lang` for auto-detect (and the plumbing all the way through
+ `transcriptionLanguageLabel` is wired and ready), but this PR
+ keeps the client pointed at the existing Frontier auth-proxy ASR
+ endpoint, which still forwards to the legacy `mms-zeroshot-asr`
+ Modal app — no LID, no `lang` echo. Showing the badge in that
+ world means falling back to "Auto Detect" (or worse, the project
+ language) instead of an honest detection, which is misleading.
+ Re-enable this `<Badge>` once the auth-proxy upstream migrates
+ to `codex-asr` (see docs/AUTH_SERVER_ASR_IMPLEMENTATION.md). */}