From 251b301622fd2fedbc41f100449ba23de79f645c Mon Sep 17 00:00:00 2001
From: Luke-Bilhorn <luke.bilhorn@my.wheaton.edu>
Date: Thu, 4 Jun 2026 15:19:55 -0500
Subject: [PATCH 01/12] Add OmniASR language data + resolver/labeller utilities
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds four new files in sharedUtils/, all importable from both the
extension host and the webview bundles:

  - omniAsrSupportedLangs.ts: 1672 supported {iso639_3}_{Script} codes,
    snapshotted from the live GET /languages endpoint.
  - omniAsrDefaultScripts.ts: per-base "best guess" script for the 19
    multi-script bases (urd→Arab, cmn→Hans, uig→Arab, yue→Hant, ...).
    All others have exactly one supported script so no entry needed.
  - omniAsrFriendlyNames.ts: 1650 base→Ref_Name map, for rendering the
    transcription badge.
  - asrLanguageUtils.ts: pure helpers — resolveOmniAsrCode(meta,
    scriptPref) and labelForTranscriptionLanguage(serverLang, sentCode,
    projectLanguageName).

Nothing is wired up yet; that comes in subsequent commits. Each file
has a header explaining how to regenerate after a model/endpoint
change.
---
 sharedUtils/asrLanguageUtils.ts      |  268 ++++
 sharedUtils/omniAsrDefaultScripts.ts |   77 ++
 sharedUtils/omniAsrFriendlyNames.ts  | 1680 ++++++++++++++++++++++++++
 sharedUtils/omniAsrSupportedLangs.ts |  315 +++++
 4 files changed, 2340 insertions(+)
 create mode 100644 sharedUtils/asrLanguageUtils.ts
 create mode 100644 sharedUtils/omniAsrDefaultScripts.ts
 create mode 100644 sharedUtils/omniAsrFriendlyNames.ts
 create mode 100644 sharedUtils/omniAsrSupportedLangs.ts

diff --git a/sharedUtils/asrLanguageUtils.ts b/sharedUtils/asrLanguageUtils.ts
new file mode 100644
index 000000000..b050abb20
--- /dev/null
+++ b/sharedUtils/asrLanguageUtils.ts
@@ -0,0 +1,268 @@
+/**
+ * ASR language-utility functions
+ * ------------------------------
+ *
+ * Pure helpers (no `vscode` imports → unit-testable, usable from both the
+ * extension host and the webviews) that:
+ *
+ *   1. **Resolve** a project's language metadata into an OmniASR-compatible
+ *      `{iso639_3}_{Script}` code (or decide we should send no code, letting
+ *      the server transcribe without language conditioning).
+ *   2. **Label** an OmniASR code with a friendly display name suitable for the
+ *      post-transcription badge (e.g. `swh_Latn` → "Swahili").
+ *
+ * Why this lives in `sharedUtils/`
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * Both the extension host (`src/providers/...`) and the webviews
+ * (`webviews/.../CodexCellEditor`) need it: the host builds the `asrConfig`
+ * payload from project settings, and the webview renders the badge after a
+ * transcription completes.
+ */
+
+import {
+    OMNI_ASR_SUPPORTED_LANGS,
+    OMNI_ASR_SUPPORTED_LANG_SET,
+} from "./omniAsrSupportedLangs";
+import { OMNI_ASR_DEFAULT_SCRIPTS } from "./omniAsrDefaultScripts";
+import { OMNI_ASR_FRIENDLY_NAMES } from "./omniAsrFriendlyNames";
+
+/**
+ * Minimal shape of the project's language metadata that we consume here.
+ * Matches `codex-types`'s `LanguageMetadata` but we restate it so this file
+ * doesn't pull `codex-types` (and its transitive deps) into the webview
+ * bundle.
+ */
+export type AsrLanguageMetaInput = {
+    tag?: string; // BCP-47-ish tag ("swh", "ur-Arab", "zh-Hans"); the first source tried
+    iso1?: string; // 2-letter code; looked up in ISO1_TO_ISO3 as the last resort
+    iso2t?: string; // lower-cased and used as the 3-letter base when `tag` yields none
+    iso2b?: string; // consulted only when `iso2t` is empty
+    refName?: string; // not read by any helper in this file
+};
+
+/**
+ * Macrolanguage → individual-language remaps: each pair maps a macro ISO 639-3
+ * code OmniASR doesn't serve directly to the individual code OmniASR supports
+ * for its most widely-spoken variety. `extractBaseAndScript()` applies the table
+ * as its last step, whatever field (`tag`, `iso2t`/`iso2b`, `iso1`) the base came from. Sources:
+ *   - SIL macrolanguage mappings (iso-639-3-macrolanguages.tab)
+ *   - cross-checked against `OMNI_ASR_SUPPORTED_LANGS`
+ *
+ * Add only when (a) the macro is genuinely not in OmniASR's set and (b) the
+ * "right" individual is unambiguous.
+ */
+const MACRO_TO_INDIVIDUAL: Readonly<Record<string, string>> = {
+    swa: "swh", // Swahili → Coastal Swahili (Kenya/Tanzania majority)
+    ara: "arb", // Arabic → Modern Standard Arabic
+    msa: "zsm", // Malay → Standard Malay
+    zho: "cmn", // Chinese → Mandarin
+    ori: "ory", // Oriya → Odia
+    est: "ekk", // Estonian → Standard Estonian
+    sqi: "als", // Albanian → Tosk Albanian
+    kur: "kmr", // Kurdish → Northern Kurdish (largest speaker base)
+    nor: "nob", // Norwegian → Bokmål
+    oji: "ojb", // Ojibwa → Northwestern Ojibwa
+};
+
+/** ISO 639-1 (2-letter) → ISO 639-3 (3-letter). Common languages only; the project usually carries
+ * `iso2t` directly so this is just a fallback. Also consulted for a 2-letter primary subtag of `tag`. */
+const ISO1_TO_ISO3: Readonly<Record<string, string>> = {
+    en: "eng", fr: "fra", es: "spa", de: "deu", pt: "por", it: "ita",
+    nl: "nld", ru: "rus", zh: "cmn", ja: "jpn", ko: "kor", ar: "arb",
+    sw: "swh", ur: "urd", hi: "hin", bn: "ben", id: "ind", tr: "tur",
+    th: "tha", vi: "vie", uk: "ukr", pl: "pol", fa: "pes", he: "heb", // NOTE(review): OMNI_ASR_FRIENDLY_NAMES lists "fas"; confirm "pes" is served too, else map fa→fas
+};
+
+/**
+ * Pull the ISO 639-3 base + optional Script subtag out of a project's
+ * language metadata, normalizing macrolanguages to OmniASR-served
+ * individuals. Sources are tried in order: `tag`, `iso2t`, `iso2b`, `iso1`.
+ * Returns `undefined` when `meta` is absent or no source yields a base;
+ * `explicitScript` (title-cased) is set only when the tag carries a script.
+ */
+function extractBaseAndScript(
+    meta: AsrLanguageMetaInput | undefined
+): { base: string; explicitScript?: string; } | undefined {
+    if (!meta) return undefined;
+
+    // BCP-47-ish tag is the richest source: e.g. "swh", "ur-Arab", "zh-Hans".
+    const tag = (meta.tag || "").trim();
+    let base = "";
+    let explicitScript: string | undefined;
+
+    if (tag) {
+        const [primary, ...subtags] = tag.split(/[-_]/);
+        const lowered = (primary || "").toLowerCase();
+        if (lowered.length === 3) {
+            base = lowered;
+        } else if (lowered.length === 2) {
+            base = ISO1_TO_ISO3[lowered] ?? "";
+        }
+        // A script subtag is four ASCII letters (BCP 47: `script = 4ALPHA`). Testing length
+        // alone would also accept 4-char variants such as the "1996" in "de-1996".
+        const script = subtags.find((s) => /^[A-Za-z]{4}$/.test(s));
+        if (script) {
+            explicitScript = script.charAt(0).toUpperCase() + script.slice(1).toLowerCase();
+        }
+    }
+
+    // ISO fallbacks. Each field is trimmed first so a blank value falls through to the next.
+    if (!base) base = ((meta.iso2t || "").trim() || (meta.iso2b || "").trim()).toLowerCase();
+    if (!base) base = ISO1_TO_ISO3[(meta.iso1 || "").trim().toLowerCase()] ?? "";
+    if (!base) return undefined;
+
+    // Macrolanguage → the individual language OmniASR serves (e.g. swa → swh).
+    base = MACRO_TO_INDIVIDUAL[base] ?? base;
+    return { base, explicitScript };
+}
+
+/**
+ * `scriptPref` is what the user picked in the Script advanced setting.
+ *
+ *   - `"auto"`     → "best guess" (our default): a supported script named in
+ *                    the project tag, else `OMNI_ASR_DEFAULT_SCRIPTS`, else
+ *                    Latin, else the sole supported script.
+ *   - `"latin"`    → prefer Latin when supported, but a supported script named
+ *                    in the project tag still wins; otherwise behaves like auto.
+ *   - any 4-char string (`"Arab"`, `"Cyrl"`, ...) → use that script if supported.
+ */
+export type AsrScriptPref = "auto" | "latin" | string; // the union widens to plain `string`; the literals are documentation only
+
+/**
+ * Resolve a project's language metadata to an OmniASR-compatible `{iso639_3}_{Script}`
+ * code, or return `undefined` when we can't safely pick one (the caller should then omit
+ * the `lang` query param so the server transcribes without language conditioning).
+ *
+ * `scriptPref` is trimmed and compared case-insensitively, so hand-typed values such as
+ * `"Latin"` or `" arab"` are honoured rather than silently treated as "auto".
+ *
+ * Selection priority:
+ *   1. Explicit `scriptPref` (4-letter ISO 15924 tag), when `{base}_{Script}` is supported.
+ *   2. Script encoded in the project tag (e.g. `swa-Cyrl`) → ditto.
+ *   3. `scriptPref` of "latin" → Latin if supported.
+ *   4. `OMNI_ASR_DEFAULT_SCRIPTS[base]` (our hand-curated "best guess").
+ *   5. Latin if supported.
+ *   6. Sole supported script for this base.
+ *   7. `undefined` (genuinely ambiguous → let the server pick).
+ *
+ * Future work: a per-cell script override could short-circuit step 1.
+ */
+export function resolveOmniAsrCode(
+    meta: AsrLanguageMetaInput | undefined,
+    scriptPref: AsrScriptPref = "auto"
+): string | undefined {
+    const extracted = extractBaseAndScript(meta);
+    if (!extracted) return undefined;
+    const { base, explicitScript } = extracted;
+
+    // Find every supported script for this base.
+    const supportedScripts = OMNI_ASR_SUPPORTED_LANGS
+        .filter((c) => c.startsWith(`${base}_`))
+        .map((c) => c.split("_")[1]);
+    if (supportedScripts.length === 0) return undefined;
+
+    const tryCode = (script: string): string | undefined => {
+        const code = `${base}_${script}`;
+        return OMNI_ASR_SUPPORTED_LANG_SET.has(code) ? code : undefined;
+    };
+
+    // The preference is user-typed: tolerate stray whitespace / casing, and treat a
+    // non-string (e.g. a missing setting) as "auto" rather than throwing.
+    const pref = typeof scriptPref === "string" ? scriptPref.trim() : "auto";
+    const prefKey = pref.toLowerCase();
+
+    // 1. Explicit user-chosen script (4-letter custom tag from advanced setting)
+    if (pref.length === 4 && prefKey !== "auto") {
+        const code = tryCode(pref.charAt(0).toUpperCase() + prefKey.slice(1));
+        if (code) return code;
+    }
+
+    // 2. Script encoded in the project tag
+    if (explicitScript) {
+        const code = tryCode(explicitScript);
+        if (code) return code;
+    }
+
+    // 3. "latin" preference → Latin if supported
+    if (prefKey === "latin") {
+        const code = tryCode("Latn");
+        if (code) return code;
+    }
+
+    // 4. Default script for this base
+    const defaultScript = OMNI_ASR_DEFAULT_SCRIPTS[base];
+    if (defaultScript) {
+        const code = tryCode(defaultScript);
+        if (code) return code;
+    }
+
+    // 5. Latin if supported
+    const latin = tryCode("Latn");
+    if (latin) return latin;
+
+    // 6. Sole supported script; 7. otherwise genuinely ambiguous → no code.
+    return supportedScripts.length === 1 ? `${base}_${supportedScripts[0]}` : undefined;
+}
+
+/**
+ * Break an OmniASR code ("swh_Latn") into `{ base, script }`; `null` when `code` is empty or malformed.
+ */
+export function splitOmniAsrCode(code: string | undefined | null): { base: string; script: string; } | null {
+    const parsed = code ? /^([a-z]{2,3})_([A-Z][a-z]{3})$/.exec(code) : null;
+    return parsed ? { base: parsed[1], script: parsed[2] } : null;
+}
+
+/**
+ * Make a run-together SIL `Ref_Name` readable, e.g. "MinNanChinese" → "Min Nan Chinese",
+ * "YocobouéDida" → "Yocoboué Dida", "Aja(Benin)" → "Aja (Benin)". Letter case is tested
+ * with Unicode properties rather than `[a-z]`/`[A-Z]` so accented letters split correctly.
+ * Returns the trimmed result; an already-spaced name ("Min Nan Chinese") passes through unchanged.
+ */
+function prettifyRefName(name: string): string {
+    return name
+        .replace(/(\p{Ll})(\p{Lu})/gu, "$1 $2") // lower→Upper boundary starts a new word
+        .replace(/(\p{Lu})(\p{Lu}\p{Ll})/gu, "$1 $2") // acronym run then a word: "USAToday" → "USA Today"
+        .replace(/(\S)\(/g, "$1 (") // qualifier glued to the name: "Aja(Benin)" → "Aja (Benin)"
+        .trim();
+}
+
+/**
+ * Friendly display name for a transcription's language badge.
+ *
+ * Inputs:
+ *   - `serverLang`  — the code OmniASR echoed back in its response (when we
+ *                     sent one). The primary source of truth.
+ *   - `sentCode`    — what we asked the server to use, in case it didn't
+ *                     echo (today the server only echoes when given a code).
+ *   - `projectLanguageName` — `refName` of the project's target language, as
+ *                             a last-ditch fallback when we know we sent the
+ *                             project's code but the server omitted the echo.
+ *
+ * A code that is malformed, or whose base has no `OMNI_ASR_FRIENDLY_NAMES` entry, is skipped.
+ * Returns `null` to mean "render nothing" (we have no honest label).
+ * The caller renders "Auto Detect" itself when in auto-detect mode and we
+ * have no detected-language info, so we never lie about it here.
+ */
+export function labelForTranscriptionLanguage(
+    serverLang: string | undefined | null,
+    sentCode: string | undefined | null,
+    projectLanguageName: string | undefined | null
+): string | null {
+    const friendly = (code: string | null | undefined): string | null => {
+        const parts = splitOmniAsrCode(code);
+        if (!parts) return null;
+        const refName = OMNI_ASR_FRIENDLY_NAMES[parts.base];
+        return refName ? prettifyRefName(refName) : null;
+    };
+
+    // 1. The server's echo is the most truthful signal.
+    // 2. No usable echo: the server still used the code we sent, so show that.
+    // 3. Last resort: the project's language name. A blank or whitespace-only
+    //    name prettifies to "", which `|| null` turns into "render nothing".
+    return (
+        friendly(serverLang) ||
+        friendly(sentCode) ||
+        (projectLanguageName ? prettifyRefName(projectLanguageName) : "") ||
+        null
+    );
+}
diff --git a/sharedUtils/omniAsrDefaultScripts.ts b/sharedUtils/omniAsrDefaultScripts.ts
new file mode 100644
index 000000000..3155590fd
--- /dev/null
+++ b/sharedUtils/omniAsrDefaultScripts.ts
@@ -0,0 +1,77 @@
+/**
+ * OmniASR multi-script default-script table
+ * -----------------------------------------
+ *
+ * For each OmniASR language with **multiple supported scripts**, the script
+ * we should pick by default when the user has not specified one.
+ *
+ * Background
+ * ~~~~~~~~~~
+ * OmniASR codes are `{iso639_3}_{Script}` (e.g. `urd_Arab`). Almost every
+ * supported base language (1631 of 1650 unique bases) supports exactly one
+ * script, so the script choice is trivial. This file only lists the 19
+ * multi-script bases that need a real tiebreaker.
+ *
+ * Selection priority used by the resolver (`asrLanguageUtils.ts`):
+ *   1. Explicit script the user typed in the advanced setting
+ *   2. Script encoded in the project's language tag (e.g. `swa-Cyrl`)
+ *   3. Latin, when the user picked "latin" and the language supports it
+ *   4. **This table** (the "best guess")
+ *   5. Latin, if the language supports Latin
+ *   6. Sole supported script (if only one); else omit `lang` (no language conditioning)
+ *
+ * Source / rationale per entry
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * Picked using Unicode CLDR `likelySubtags.xml` (the official "if a user gives
+ * me a language tag with no script, what script should I assume?" table)
+ * cross-checked against modern majority usage. Macrolanguage → individual
+ * remaps (e.g. swa→swh, ara→arb, zho→cmn, kur→kmr) are handled in the
+ * resolver *before* lookup, so this table keys on the individual codes
+ * OmniASR actually serves.
+ *
+ * If you adjust an entry, leave a `// ←` note explaining why.
+ *
+ * Multi-script bases not listed here intentionally fall through to "Latin if
+ * supported, else sole script". Add an entry here only when CLDR or modern
+ * majority usage clearly disagrees with that default.
+ *
+ * Regenerating
+ * ~~~~~~~~~~~~
+ * To rediscover which bases need entries (after a model update changes the
+ * supported set):
+ *
+ *   curl -s "https://genesis-ai-dev--codex-asr-serve.modal.run/languages" \
+ *     | python3 -c "
+ *   import json, sys
+ *   d = json.load(sys.stdin)
+ *   bases = {}
+ *   for l in d['languages']:
+ *       b, s = l.split('_')
+ *       bases.setdefault(b, set()).add(s)
+ *   for b, ss in sorted(bases.items()):
+ *       if len(ss) > 1:
+ *           print(b, sorted(ss))
+ *   "
+ */
+
+export const OMNI_ASR_DEFAULT_SCRIPTS: Readonly<Record<string, string>> = {
+    aze: "Latn", // Azerbaijani — modern standard (Republic of Azerbaijan) is Latin
+    bcc: "Arab", // Southern Balochi — written in Arabic script
+    cmn: "Hans", // Mandarin Chinese — Simplified is the more common default
+    cmo: "Khmr", // Central Mnong — Khmer-script orthography (community standard)
+    crk: "Cans", // Plains Cree — Canadian Aboriginal Syllabics is the traditional script
+    ell: "Grek", // Greek — only one substantive script; entry exists for completeness. NOTE(review): confirm which second script OmniASR lists for ell
+    gag: "Latn", // Gagauz — modern orthography is Latin
+    kmr: "Latn", // Northern Kurdish — Latin (Hawar) is the predominant modern script
+    lld: "Latn", // Ladin — only Latin; entry exists for completeness. NOTE(review): header limits this table to multi-script bases — confirm lld has a second served script
+    ojb: "Latn", // Northwestern Ojibwa — Latin (double-vowel) is most common in print
+    rif: "Latn", // Tarifit Berber — Latin in modern publications (Tifinagh not in OmniASR)
+    rmc: "Latn", // Carpathian Romani — Latin in modern orthographies
+    rmy: "Latn", // Vlax Romani — Latin in modern orthographies
+    tuk: "Latn", // Turkmen — modern standard (Turkmenistan) is Latin
+    uig: "Arab", // Uyghur — Arabic-script (Uyghur Ereb Yëziqi) is the predominant script
+    urd: "Arab", // Urdu — Arabic-script (Nastaliq) is the canonical script
+    uzb: "Latn", // Uzbek — modern standard (Uzbekistan) is Latin
+    wal: "Ethi", // Wolaytta — Ethiopic (Geʽez) script in modern orthographies
+    yue: "Hant", // Cantonese — Traditional Chinese (Hong Kong / Guangzhou default)
+};
diff --git a/sharedUtils/omniAsrFriendlyNames.ts b/sharedUtils/omniAsrFriendlyNames.ts
new file mode 100644
index 000000000..e39632bcb
--- /dev/null
+++ b/sharedUtils/omniAsrFriendlyNames.ts
@@ -0,0 +1,1680 @@
+/**
+ * OmniASR friendly-name lookup
+ * ----------------------------
+ *
+ * Maps each OmniASR-supported ISO 639-3 base (1650 entries) to its English
+ * "reference name" from the SIL ISO 639-3 registry. Used to render the
+ * language badge after a transcription completes (e.g. `swh_Latn` → "Swahili").
+ *
+ * Notes
+ * ~~~~~
+ * - Keyed on the **base** (ISO 639-3), not the full OmniASR code, because the
+ *   friendly name is the same regardless of script. Callers should strip the
+ *   `_{Script}` suffix before lookup. `labelForTranscriptionLanguage()` in
+ *   `asrLanguageUtils.ts` does that via `splitOmniAsrCode()`.
+ * - Names come from SIL's `Ref_Name` field and are stored here CamelCased (no
+ *   spaces). They are NOT ASCII-only: diacritics and punctuation are kept
+ *   (e.g. "ArbëreshëAlbanian", "Aja(Benin)"). The helper `prettifyRefName()` in
+ *   `asrLanguageUtils.ts` splits these on case changes so they read naturally in the UI.
+ * - The 'nan' entry is added by hand (Min Nan Chinese) — SIL leaves Ref_Name
+ *   blank for that code in the version we parsed.
+ *
+ * Regenerating
+ * ~~~~~~~~~~~~
+ * If OmniASR's supported set changes, regenerate from the SIL data already
+ * bundled in `src/utils/languageUtils.ts` using the snippet in
+ * `omniAsrSupportedLangs.ts`'s header (look up each base's `Ref_Name`).
+ */
+
+export const OMNI_ASR_FRIENDLY_NAMES: Readonly<Record<string, string>> = {
+    aae: "ArbëreshëAlbanian",
+    aal: "Afade",
+    abb: "Bankon",
+    abi: "Abidji",
+    abk: "Abkhazian",
+    abn: "Abua",
+    abp: "AbellenAyta",
+    abr: "Abron",
+    abs: "AmboneseMalay",
+    aca: "Achagua",
+    acd: "Gikyode",
+    ace: "Achinese",
+    acf: "SaintLucianCreoleFrench",
+    ach: "Acoli",
+    acm: "MesopotamianArabic",
+    acn: "Achang",
+    acr: "Achi",
+    acu: "Achuar-Shiwiar",
+    acw: "HijaziArabic",
+    ade: "Adele",
+    adh: "Adhola",
+    adj: "Adioukrou",
+    adx: "AmdoTibetan",
+    ady: "Adyghe",
+    aeb: "TunisianArabic",
+    aec: "SaidiArabic",
+    aeu: "Akeu",
+    afb: "GulfArabic",
+    afo: "Eloyi",
+    afr: "Afrikaans",
+    agd: "Agarabi",
+    agg: "Angor",
+    agn: "Agutaynen",
+    agr: "Aguaruna",
+    agu: "Aguacateco",
+    agx: "Aghul",
+    aha: "Ahanta",
+    ahk: "Akha",
+    ahl: "Igo",
+    ahs: "Ashe",
+    aia: "Arosi",
+    ajg: "Aja(Benin)",
+    aka: "Akan",
+    akb: "BatakAngkola",
+    ake: "Akawaio",
+    akp: "Siwu",
+    ala: "Alago",
+    alj: "Alangan",
+    aln: "GhegAlbanian",
+    alo: "Larike-Wakasihu",
+    alp: "Alune",
+    als: "ToskAlbanian",
+    alt: "SouthernAltai",
+    alz: "Alur",
+    ame: "Yanesha'",
+    amf: "Hamer-Banna",
+    amh: "Amharic",
+    ami: "Amis",
+    amk: "Ambai",
+    amu: "GuerreroAmuzgo",
+    anc: "Ngas",
+    ank: "Goemai",
+    ann: "Obolo",
+    anp: "Angika",
+    anw: "Anaang",
+    any: "Anyin",
+    aom: "Ömie",
+    aoz: "UabMeto",
+    apb: "Sa'a",
+    apc: "LevantineArabic",
+    apd: "SudaneseArabic",
+    apr: "Arop-Lokep",
+    arb: "StandardArabic",
+    arg: "Aragonese",
+    arl: "Arabela",
+    arq: "AlgerianArabic",
+    ars: "NajdiArabic",
+    ary: "MoroccanArabic",
+    arz: "EgyptianArabic",
+    asa: "Asu(Tanzania)",
+    asg: "Cishingini",
+    asm: "Assamese",
+    ast: "Asturian",
+    ata: "Pele-Ata",
+    atb: "Zaiwa",
+    atg: "IvbieNorth-Okpela-Arhe",
+    ati: "Attié",
+    atq: "Aralle-Tabulahan",
+    ava: "Avaric",
+    avn: "Avatime",
+    avu: "Avokaya",
+    awa: "Awadhi",
+    awb: "Awa(PapuaNewGuinea)",
+    awo: "Awak",
+    ayl: "LibyanArabic",
+    ayo: "Ayoreo",
+    ayp: "NorthMesopotamianArabic",
+    ayr: "CentralAymara",
+    ayz: "MaiBrat",
+    aze: "Azerbaijani",
+    azg: "SanPedroAmuzgosAmuzgo",
+    azz: "HighlandPueblaNahuatl",
+    bag: "Tuki",
+    bak: "Bashkir",
+    bam: "Bambara",
+    ban: "Balinese",
+    bao: "Waimaha",
+    bas: "Basa(Cameroon)",
+    bav: "Vengo",
+    bax: "Bamun",
+    bba: "Baatonum",
+    bbb: "Barai",
+    bbc: "BatakToba",
+    bbj: "Ghomálá'",
+    bbl: "Bats",
+    bbo: "NorthernBoboMadaré",
+    bbu: "Kulung(Nigeria)",
+    bcc: "SouthernBalochi",
+    bce: "Bamenyam",
+    bci: "Baoulé",
+    bcl: "CentralBikol",
+    bcs: "Kohumono",
+    bcw: "Bana",
+    bcy: "Bacama",
+    bcz: "Bainouk-Gunyaamolo",
+    bda: "Bayot",
+    bde: "Bade",
+    bdg: "Bonggi",
+    bdh: "Baka(SouthSudan)",
+    bdm: "Buduma",
+    bdq: "Bahnar",
+    bdu: "Oroko",
+    beb: "Bebele",
+    beh: "Biali",
+    bel: "Belarusian",
+    bem: "Bemba(Zambia)",
+    ben: "Bengali",
+    bep: "Besoa",
+    bew: "Betawi",
+    bex: "JurModo",
+    bfa: "Bari",
+    bfd: "Bafut",
+    bfo: "MalbaBirifor",
+    bft: "Balti",
+    bfy: "Bagheli",
+    bfz: "MahasuPahari",
+    bgc: "Haryanvi",
+    bgp: "EasternBalochi",
+    bgq: "Bagri",
+    bgr: "BawmChin",
+    bgt: "Bughotu",
+    bgw: "Bhatri",
+    bha: "Bharia",
+    bhb: "Bhili",
+    bhh: "Bukharic",
+    bho: "Bhojpuri",
+    bhp: "Bima",
+    bht: "Bhattiyali",
+    bhz: "Bada(Indonesia)",
+    bib: "Bissa",
+    bim: "Bimoba",
+    bis: "Bislama",
+    biv: "SouthernBirifor",
+    bjj: "Kanauji",
+    bjk: "Barok",
+    bjn: "Banjar",
+    bjr: "Binumarien",
+    bjt: "Balanta-Ganja",
+    bjv: "Bedjond",
+    bjw: "Bakwé",
+    bjz: "Baruga",
+    bkd: "Binukid",
+    bkh: "Bakoko",
+    bkm: "Kom(Cameroon)",
+    bkv: "Bekwarra",
+    bky: "Bokyi",
+    ble: "Balanta-Kentohe",
+    blh: "Kuwaa",
+    blt: "TaiDam",
+    blx: "Mag-IndiAyta",
+    blz: "Balantak",
+    bmm: "NorthernBetsimisarakaMalagasy",
+    bmq: "Bomu",
+    bmr: "Muinane",
+    bmu: "Somba-Siawari",
+    bmv: "Bum",
+    bng: "Benga",
+    bnm: "Batanga",
+    bnn: "Bunun",
+    bno: "Bantoanon",
+    bnp: "Bola",
+    bns: "Bundeli",
+    boa: "Bora",
+    bod: "Tibetan",
+    boj: "Anjam",
+    bom: "Berom",
+    bor: "Borôro",
+    bos: "Bosnian",
+    bou: "Bondei",
+    bov: "Tuwuli",
+    box: "Buamu",
+    bpr: "KoronadalBlaan",
+    bps: "SaranganiBlaan",
+    bqc: "Boko(Benin)",
+    bqg: "Bago-Kusuntu",
+    bqi: "Bakhtiari",
+    bqj: "Bandial",
+    bqp: "Busa",
+    bra: "Braj",
+    bre: "Breton",
+    brh: "Brahui",
+    bri: "Mokpwe",
+    bru: "EasternBru",
+    brx: "Bodo(India)",
+    bsc: "Bassari",
+    bsh: "Kati",
+    bsj: "Bangwinji",
+    bsk: "Burushaski",
+    bsq: "Bassa",
+    bss: "Akoose",
+    bsy: "SabahBisaya",
+    btd: "BatakDairi",
+    btm: "BatakMandailing",
+    bts: "BatakSimalungun",
+    btt: "Bete-Bendi",
+    btv: "Bateri",
+    btx: "BatakKaro",
+    bud: "Ntcham",
+    bug: "Buginese",
+    bul: "Bulgarian",
+    bum: "Bulu(Cameroon)",
+    buo: "Terei",
+    bus: "Bokobaru",
+    bux: "Boghom",
+    bvb: "Bube",
+    bvc: "Baelelea",
+    bvz: "Bauzi",
+    bwq: "SouthernBoboMadaré",
+    bwr: "Bura-Pabir",
+    bwu: "Buli(Ghana)",
+    bxf: "Bilur",
+    bxk: "Bukusu",
+    byc: "Ubaghara",
+    byr: "Baruya",
+    bys: "Burak",
+    byv: "Medumba",
+    byx: "Qaqet",
+    bzh: "MaposBuang",
+    bzi: "Bisu",
+    bzj: "BelizeKriolEnglish",
+    bzw: "Basa(Nigeria)",
+    caa: "Chortí",
+    cab: "Garifuna",
+    cac: "Chuj",
+    cak: "Kaqchikel",
+    cap: "Chipaya",
+    car: "GalibiCarib",
+    cas: "Tsimané",
+    cat: "Catalan",
+    cax: "Chiquitano",
+    cbc: "Carapana",
+    cbi: "Chachi",
+    cbr: "Cashibo-Cacataibo",
+    cbs: "Cashinahua",
+    cbt: "Chayahuita",
+    cbu: "Candoshi-Shapra",
+    cbv: "Cacua",
+    cce: "Chopi",
+    ccg: "SambaDaka",
+    cco: "ComaltepecChinantec",
+    cdj: "Churahi",
+    cdo: "MinDongChinese",
+    ceb: "Cebuano",
+    ceg: "Chamacoco",
+    cek: "EasternKhumiChin",
+    cen: "Cen",
+    ces: "Czech",
+    cfa: "Dijim-Bwilim",
+    cfm: "FalamChin",
+    cgc: "Kagayanen",
+    cgg: "Chiga",
+    che: "Chechen",
+    chf: "TabascoChontal",
+    chq: "QuiotepecChinantec",
+    chv: "Chuvash",
+    chz: "OzumacínChinantec",
+    cjk: "Chokwe",
+    cjo: "AshéninkaPajonal",
+    cjp: "Cabécar",
+    cjs: "Shor",
+    ckb: "CentralKurdish",
+    ckl: "Cibak",
+    cko: "Anufo",
+    ckr: "Kairak",
+    ckt: "Chukot",
+    cky: "Cakfem-Mushere",
+    cla: "Ron",
+    cle: "LealaoChinantec",
+    cly: "EasternHighlandChatino",
+    cme: "Cerma",
+    cmn: "MandarinChinese",
+    cmo: "CentralMnong",
+    cmr: "Mro-KhimiChin",
+    cnh: "HakhaChin",
+    cni: "Asháninka",
+    cnl: "LalanaChinantec",
+    cnt: "TepetotutlaChinantec",
+    coe: "Koreguaje",
+    cof: "Colorado",
+    cok: "SantaTeresaCora",
+    con: "Cofán",
+    cor: "Cornish",
+    cot: "Caquinte",
+    cou: "Wamey",
+    cpa: "PalantlaChinantec",
+    cpb: "Ucayali-YurúaAshéninka",
+    cpu: "PichisAshéninka",
+    cpx: "Pu-XianChinese",
+    cpy: "SouthUcayaliAshéninka",
+    crh: "CrimeanTatar",
+    crk: "PlainsCree",
+    crn: "ElNayarCora",
+    crq: "Iyo'wujwaChorote",
+    crs: "SeselwaCreoleFrench",
+    crt: "Iyojwa'jaChorote",
+    csk: "Jola-Kasa",
+    cso: "SochiapamChinantec",
+    ctd: "TedimChin",
+    cte: "TepinapaChinantec",
+    ctg: "Chittagonian",
+    ctl: "TlacoatzintepecChinantec",
+    cto: "Emberá-Catío",
+    ctu: "Chol",
+    cuc: "UsilaChinantec",
+    cui: "Cuiba",
+    cuk: "SanBlasKuna",
+    cul: "Culina",
+    cut: "TeutilaCuicatec",
+    cux: "TepeuxilaCuicatec",
+    cwa: "Kabwa",
+    cwe: "Kwere",
+    cwt: "Kuwaataay",
+    cya: "NopalaChatino",
+    cym: "Welsh",
+    daa: "Dangaléat",
+    dag: "Dagbani",
+    dah: "Gwahatike",
+    dan: "Danish",
+    dar: "Dargwa",
+    dav: "Taita",
+    dbd: "Dadiya",
+    dbj: "Ida'an",
+    dbq: "Daba",
+    dcc: "Deccan",
+    ddn: "Dendi(Benin)",
+    ded: "Dedua",
+    deg: "Degema",
+    des: "Desano",
+    deu: "German",
+    dga: "SouthernDagaare",
+    dgh: "Dghwede",
+    dgi: "NorthernDagara",
+    dgk: "Dagba",
+    dgo: "Dogri(individuallanguage)",
+    dgr: "Dogrib",
+    dhi: "Dhimal",
+    did: "Didinga",
+    dig: "Digo",
+    dik: "SouthwesternDinka",
+    dip: "NortheasternDinka",
+    div: "Dhivehi",
+    dje: "Zarma",
+    djk: "EasternMaroonCreole",
+    dmk: "Domaaki",
+    dml: "Dameli",
+    dnj: "Dan",
+    dnt: "MidGrandValleyDani",
+    dnw: "WesternDani",
+    dop: "Lukpa",
+    dos: "Dogosé",
+    dru: "Rukai",
+    dsb: "LowerSorbian",
+    dsh: "Daasanach",
+    dtp: "KadazanDusun",
+    dts: "ToroSoDogon",
+    dty: "Dotyali",
+    dua: "Duala",
+    dug: "Duruma",
+    dwr: "Dawro",
+    dyi: "DjiminiSenoufo",
+    dyo: "Jola-Fonyi",
+    dyu: "Dyula",
+    dzg: "Dazaga",
+    dzo: "Dzongkha",
+    ebu: "Embu",
+    ego: "Eggon",
+    eip: "Eipomek",
+    eiv: "Askopan",
+    eka: "Ekajuk",
+    ekk: "StandardEstonian",
+    eko: "Koti",
+    ekr: "Yace",
+    ell: "ModernGreek(1453-)",
+    elm: "Eleme",
+    emp: "NorthernEmberá",
+    enb: "Markweeta",
+    eng: "English",
+    enx: "Enxet",
+    epo: "Esperanto",
+    ese: "EseEjja",
+    ess: "CentralSiberianYupik",
+    esu: "CentralYupik",
+    eto: "Eton(Cameroon)",
+    ets: "Yekhee",
+    etu: "Ejagham",
+    eus: "Basque",
+    evn: "Evenki",
+    ewe: "Ewe",
+    ewo: "Ewondo",
+    eyo: "Keiyo",
+    eza: "Ezaa",
+    fal: "SouthFali",
+    fan: "Fang(EquatorialGuinea)",
+    fao: "Faroese",
+    far: "Fataleka",
+    fas: "Persian",
+    fat: "Fanti",
+    fia: "Nobiin",
+    fij: "Fijian",
+    fil: "Filipino",
+    fin: "Finnish",
+    fip: "Fipa",
+    fkk: "Kirya-Konzəl",
+    flr: "Fuliiru",
+    fmp: "Fe'fe'",
+    fmu: "FarWesternMuria",
+    fon: "Fon",
+    fra: "French",
+    frd: "Fordata",
+    fry: "WesternFrisian",
+    fub: "AdamawaFulfulde",
+    fuc: "Pulaar",
+    fue: "BorguFulfulde",
+    ful: "Fulah",
+    fuq: "Central-EasternNigerFulfulde",
+    fuv: "NigerianFulfulde",
+    gag: "Gagauz",
+    gai: "Borei",
+    gam: "Kandawo",
+    gau: "MudhiliGadaba",
+    gbi: "Galela",
+    gbk: "Gaddi",
+    gbm: "Garhwali",
+    gbo: "NorthernGrebo",
+    gbr: "Gbagyi",
+    gby: "Gbari",
+    gcc: "Mali",
+    gde: "Gude",
+    gdf: "Guduf-Gava",
+    geb: "Kire",
+    gej: "Gen",
+    ges: "Geser-Gorom",
+    ggg: "Gurgula",
+    gid: "Gidar",
+    gig: "Goaria",
+    gil: "Gilbertese",
+    giz: "SouthGiziga",
+    gjk: "KachiKoli",
+    gjn: "Gonja",
+    gju: "Gujari",
+    gkn: "Gokana",
+    gld: "Nanai",
+    gle: "Irish",
+    glg: "Galician",
+    glk: "Gilaki",
+    glv: "Manx",
+    glw: "Glavda",
+    gmv: "Gamo",
+    gna: "Kaansa",
+    gnd: "Zulgo-Gemzek",
+    gng: "Ngangam",
+    gof: "Gofa",
+    gog: "Gogo",
+    gol: "Gola",
+    gom: "GoanKonkani",
+    gor: "Gorontalo",
+    gqr: "Gor",
+    grc: "AncientGreek(to1453)",
+    gri: "Ghari",
+    grn: "Guarani",
+    grt: "Garo",
+    gsl: "Gusilay",
+    gso: "SouthwestGbaya",
+    gub: "Guajajára",
+    guc: "Wayuu",
+    gud: "YocobouéDida",
+    gug: "ParaguayanGuaraní",
+    guh: "Guahibo",
+    gui: "EasternBolivianGuaraní",
+    guj: "Gujarati",
+    guk: "Gumuz",
+    gum: "Guambiano",
+    guo: "Guayabero",
+    guq: "Aché",
+    gur: "Farefare",
+    guu: "Yanomamö",
+    gux: "Gourmanchéma",
+    guz: "Gusii",
+    gvc: "Guanano",
+    gvl: "Gulay",
+    gwc: "Gawri",
+    gwe: "Gweno",
+    gwi: "Gwichʼin",
+    gwr: "Gwere",
+    gwt: "Gawar-Bati",
+    gym: "Ngäbere",
+    gyr: "Guarayu",
+    gyz: "Geji",
+    had: "Hatam",
+    hag: "Hanga",
+    hah: "Hahon",
+    hak: "HakkaChinese",
+    hao: "Hakö",
+    hap: "Hupla",
+    hat: "Haitian",
+    hau: "Hausa",
+    haw: "Hawaiian",
+    hay: "Haya",
+    hbb: "Huba",
+    hch: "Huichol",
+    heb: "Hebrew",
+    heh: "Hehe",
+    her: "Herero",
+    hia: "Lamang",
+    hif: "FijiHindi",
+    hig: "Kamwe",
+    hil: "Hiligaynon",
+    hin: "Hindi",
+    hkk: "Hunjara-KainaKe",
+    hla: "Halia",
+    hlb: "Halbi",
+    hlt: "MatuChin",
+    hne: "Chhattisgarhi",
+    hnn: "Hanunoo",
+    hno: "NorthernHindko",
+    hns: "CaribbeanHindustani",
+    hoc: "Ho",
+    hrv: "Croatian",
+    hsb: "UpperSorbian",
+    hto: "MinicaHuitoto",
+    hub: "Huambisa",
+    hue: "SanFranciscoDelMarHuave",
+    hui: "Huli",
+    hul: "Hula",
+    hun: "Hungarian",
+    hus: "Huastec",
+    huu: "MuruiHuitoto",
+    huv: "SanMateoDelMarHuave",
+    hux: "NüpodeHuitoto",
+    hvn: "Sabu",
+    hwc: "Hawai'iCreoleEnglish",
+    hwo: "Hwana",
+    hye: "Armenian",
+    hyw: "WesternArmenian",
+    iba: "Iban",
+    ibb: "Ibibio",
+    ibo: "Igbo",
+    icr: "IslanderCreoleEnglish",
+    ida: "Idakho-Isukha-Tiriki",
+    idd: "EdeIdaca",
+    idu: "Idoma",
+    ifa: "AmganadIfugao",
+    ifb: "BatadIfugao",
+    ife: "Ifè",
+    ifk: "TuwaliIfugao",
+    ifu: "MayoyaoIfugao",
+    ify: "Keley-IKallahan",
+    igl: "Igala",
+    ign: "Ignaciano",
+    ijc: "Izon",
+    ijn: "Kalabari",
+    ikk: "Ika",
+    ikw: "Ikwere",
+    ilb: "Ila",
+    ilo: "Iloko",
+    imo: "Imbongu",
+    ina: "Interlingua(InternationalAuxiliaryLanguageAssociation)",
+    inb: "Inga",
+    ind: "Indonesian",
+    iou: "Tuma-Irumu",
+    ipi: "Ipili",
+    ipk: "Inupiaq",
+    iqw: "Ikwo",
+    iri: "Rigwe",
+    irk: "Iraqw",
+    ish: "Esan",
+    isl: "Icelandic",
+    iso: "Isoko",
+    ita: "Italian",
+    itl: "Itelmen",
+    its: "Isekiri",
+    itv: "Itawit",
+    itw: "Ito",
+    itz: "Itzá",
+    ixl: "Ixil",
+    izr: "Izere",
+    izz: "Izii",
+    jac: "Popti'",
+    jal: "Yalahatan",
+    jam: "JamaicanCreoleEnglish",
+    jav: "Javanese",
+    jax: "JambiMalay",
+    jbu: "JukunTakum",
+    jen: "Dza",
+    jic: "Tol",
+    jiv: "Shuar",
+    jmc: "Machame",
+    jmd: "Yamdena",
+    jmx: "WesternJuxtlahuacaMixtec",
+    jpn: "Japanese",
+    jqr: "Jaqaru",
+    juk: "Wapan",
+    jun: "Juang",
+    juo: "Jiba",
+    jvn: "CaribbeanJavanese",
+    kaa: "Kara-Kalpak",
+    kab: "Kabyle",
+    kac: "Kachin",
+    kai: "Karekare",
+    kaj: "Jju",
+    kak: "Kalanguya",
+    kam: "Kamba(Kenya)",
+    kan: "Kannada",
+    kao: "Xaasongaxango",
+    kaq: "Capanahua",
+    kas: "Kashmiri",
+    kat: "Georgian",
+    kay: "Kamayurá",
+    kaz: "Kazakh",
+    kbd: "Kabardian",
+    kbl: "Kanembu",
+    kbo: "Keliko",
+    kbp: "Kabiyè",
+    kbq: "Kamano",
+    kbr: "Kafa",
+    kbt: "Abadi",
+    kby: "MangaKanuri",
+    kca: "Khanty",
+    kcg: "Tyap",
+    kcn: "Nubi",
+    kcq: "Kamo",
+    kdc: "Kutu",
+    kde: "Makonde",
+    kdh: "Tem",
+    kdi: "Kumam",
+    kdj: "Karamojong",
+    kdl: "Tsikimba",
+    kdn: "Kunda",
+    kdt: "Kuy",
+    kea: "Kabuverdianu",
+    kek: "Kekchí",
+    ken: "Kenyang",
+    keo: "Kakwa",
+    ker: "Kera",
+    keu: "Akebu",
+    key: "Kupia",
+    kez: "Kukele",
+    kfb: "NorthwesternKolami",
+    kff: "Koya",
+    kfk: "Kinnauri",
+    kfq: "Korku",
+    kfr: "Kachhi",
+    kfw: "KharamNaga",
+    kfx: "KulluPahari",
+    kha: "Khasi",
+    khg: "KhamsTibetan",
+    khk: "HalhMongolian",
+    khm: "Khmer",
+    khq: "KoyraChiiniSonghay",
+    khw: "Khowar",
+    kia: "Kim",
+    kij: "Kilivila",
+    kik: "Kikuyu",
+    kin: "Kinyarwanda",
+    kir: "Kirghiz",
+    kix: "KhiamniunganNaga",
+    kjb: "Q'anjob'al",
+    kjc: "CoastalKonjo",
+    kje: "Kisar",
+    kjg: "Khmu",
+    kjh: "Khakas",
+    kjk: "HighlandKonjo",
+    kki: "Kagulu",
+    kkj: "Kako",
+    kle: "Kulung(Nepal)",
+    kln: "Kalenjin",
+    kls: "Kalasha",
+    klu: "Klao",
+    klv: "Maskelynes",
+    klw: "Tado",
+    kma: "Konni",
+    kmd: "MajukayangKalinga",
+    kml: "TanudanKalinga",
+    kmr: "NorthernKurdish",
+    kmu: "Kanite",
+    kmy: "Koma",
+    kna: "Dera(Nigeria)",
+    knb: "LubuaganKalinga",
+    knc: "CentralKanuri",
+    kne: "Kankanaey",
+    knf: "Mankanya",
+    knj: "WesternKanjobal",
+    knk: "Kuranko",
+    knn: "Konkani(individuallanguage)",
+    kno: "Kono(SierraLeone)",
+    kog: "Cogui",
+    kol: "Kol(PapuaNewGuinea)",
+    koo: "Konzo",
+    kor: "Korean",
+    kpo: "Ikposo",
+    kpq: "Korupun-Sela",
+    kps: "Tehit",
+    kpv: "Komi-Zyrian",
+    kpy: "Koryak",
+    kpz: "Kupsabiny",
+    kqe: "Kalagan",
+    kqo: "EasternKrahn",
+    kqp: "Kimré",
+    kqr: "Kimaragang",
+    kqy: "Koorete",
+    krc: "Karachay-Balkar",
+    kri: "Krio",
+    krj: "Kinaray-A",
+    krl: "Karelian",
+    krr: "Krung",
+    krs: "Gbaya(Sudan)",
+    kru: "Kurukh",
+    krx: "Karon",
+    ksb: "Shambala",
+    ksd: "Kuanua",
+    ksf: "Bafia",
+    ksr: "Borong",
+    kss: "SouthernKisi",
+    ksz: "Kodaku",
+    ktb: "Kambaata",
+    ktj: "PlapoKrumen",
+    kto: "Kuot",
+    kua: "Kuanyama",
+    kub: "Kutep",
+    kue: "Kuman(PapuaNewGuinea)",
+    kuh: "Kushi",
+    kum: "Kumyk",
+    kur: "Kurdish",
+    kus: "Kusaal",
+    kvn: "BorderKuna",
+    kvw: "Wersing",
+    kvx: "ParkariKoli",
+    kwd: "Kwaio",
+    kwf: "Kwara'ae",
+    kwi: "Awa-Cuaiquer",
+    kwm: "Kwambi",
+    kxc: "Konso",
+    kxf: "ManumanawKaren",
+    kxm: "NorthernKhmer",
+    kxp: "WadiyaraKoli",
+    kyb: "ButbutKalinga",
+    kyc: "Kyaka",
+    kyf: "Kouya",
+    kyg: "Keyagana",
+    kyo: "Kelon",
+    kyq: "Kenga",
+    kyu: "WesternKayah",
+    kyx: "Rapoisi",
+    kyz: "Kayabí",
+    kzf: "Da'aKaili",
+    kzi: "Kelabit",
+    lac: "Lacandon",
+    lag: "Rangi",
+    laj: "Lango(Uganda)",
+    lam: "Lamba",
+    lao: "Lao",
+    las: "Lama(Togo)",
+    lat: "Latin",
+    lav: "Latvian",
+    law: "Lauje",
+    lbj: "Ladakhi",
+    lbw: "Tolaki",
+    lcm: "Tungag",
+    lcp: "WesternLawa",
+    ldb: "Dũya",
+    led: "Lendu",
+    lee: "Lyélé",
+    lef: "Lelemi",
+    lem: "Nomaande",
+    lew: "LedoKaili",
+    lex: "Luang",
+    lgg: "Lugbara",
+    lgl: "Wala",
+    lhu: "Lahu",
+    lia: "West-CentralLimba",
+    lid: "Nyindrou",
+    lif: "Limbu",
+    lij: "Ligurian",
+    lin: "Lingala",
+    lip: "Sekpele",
+    lir: "LiberianEnglish",
+    lis: "Lisu",
+    lit: "Lithuanian",
+    lje: "Rampi",
+    ljp: "LampungApi",
+    lkb: "Kabras",
+    lke: "Kenyi",
+    lla: "Lala-Roba",
+    lld: "Ladin",
+    llg: "Lole",
+    lln: "Lele(Chad)",
+    lme: "Pévé",
+    lnd: "Lundayeh",
+    lns: "Lamnso'",
+    lnu: "Longuda",
+    loa: "Loloda",
+    lob: "Lobi",
+    lok: "Loko",
+    lom: "Loma(Liberia)",
+    lon: "MalawiLomwe",
+    loq: "Lobala",
+    lrk: "Loarki",
+    lsi: "Lashi",
+    lsm: "Saamia",
+    lss: "Lasi",
+    ltg: "Latgalian",
+    lth: "Thur",
+    lto: "Tsotso",
+    ltz: "Luxembourgish",
+    lua: "Luba-Lulua",
+    luc: "Aringa",
+    lug: "Ganda",
+    luo: "Luo(KenyaandTanzania)",
+    lus: "Lushai",
+    lwg: "Wanga",
+    lwo: "Luwo",
+    lww: "Lewo",
+    lzz: "Laz",
+    maa: "SanJerónimoTecóatlMazatec",
+    mab: "YutanduchiMixtec",
+    mad: "Madurese",
+    maf: "Mafa",
+    mag: "Magahi",
+    mah: "Marshallese",
+    mai: "Maithili",
+    maj: "JalapaDeDíazMazatec",
+    mak: "Makasar",
+    mal: "Malayalam",
+    mam: "Mam",
+    maq: "ChiquihuitlánMazatec",
+    mar: "Marathi",
+    mau: "HuautlaMazatec",
+    maw: "Mampruli",
+    max: "NorthMoluccanMalay",
+    maz: "CentralMazahua",
+    mbb: "WesternBukidnonManobo",
+    mbc: "Macushi",
+    mbh: "Mangseng",
+    mbj: "Nadëb",
+    mbt: "MatigsalugManobo",
+    mbu: "Mbula-Bwazza",
+    mca: "Maca",
+    mcb: "Machiguenga",
+    mcd: "Sharanahua",
+    mcf: "Matsés",
+    mco: "CoatlánMixe",
+    mcp: "Makaa",
+    mcq: "Ese",
+    mcu: "CameroonMambila",
+    mcx: "Mpiemo",
+    mda: "Mada(Nigeria)",
+    mdd: "Mbum",
+    mdv: "SantaLucíaMonteverdeMixtec",
+    mdy: "Male(Ethiopia)",
+    med: "Melpa",
+    mee: "Mengen",
+    meh: "SouthwesternTlaxiacoMixtec",
+    mej: "Meyah",
+    mek: "Mekeo",
+    mel: "CentralMelanau",
+    men: "Mende(SierraLeone)",
+    meq: "Merey",
+    mer: "Meru",
+    met: "Mato",
+    meu: "Motu",
+    mev: "Mano",
+    mfe: "Morisyen",
+    mfh: "Matal",
+    mfi: "Wandala",
+    mfk: "NorthMofu",
+    mfm: "MarghiSouth",
+    mfn: "CrossRiverMbembe",
+    mfo: "Mbe",
+    mfq: "Moba",
+    mfv: "Mandjak",
+    mfy: "Mayo",
+    mfz: "Mabaan",
+    mgd: "Moru",
+    mge: "Mango",
+    mgg: "Mpumpong",
+    mgh: "Makhuwa-Meetto",
+    mgi: "Lijili",
+    mgo: "Meta'",
+    mhi: "Ma'di",
+    mhk: "Mungaka",
+    mhr: "EasternMari",
+    mhu: "Digaro-Mishmi",
+    mhx: "Maru",
+    mhy: "Ma'anyan",
+    mib: "AtatláhucaMixtec",
+    mie: "OcotepecMixtec",
+    mif: "Mofu-Gudur",
+    mig: "SanMiguelElGrandeMixtec",
+    mih: "ChayucoMixtec",
+    mil: "PeñolesMixtec",
+    mim: "AlacatlatzalaMixtec",
+    min: "Minangkabau",
+    mio: "PinotepaNacionalMixtec",
+    mip: "Apasco-ApoalaMixtec",
+    miq: "Mískito",
+    mit: "SouthernPueblaMixtec",
+    miu: "CacaloxtepecMixtec",
+    miy: "AyutlaMixtec",
+    miz: "CoatzospanMixtec",
+    mjl: "Mandeali",
+    mjv: "Mannan",
+    mkd: "Macedonian",
+    mkf: "Miya",
+    mki: "Dhatki",
+    mkl: "Mokole",
+    mkn: "KupangMalay",
+    mlg: "Malagasy",
+    mlq: "WesternManinkakan",
+    mlt: "Maltese",
+    mmc: "MichoacánMazahua",
+    mmg: "NorthAmbrym",
+    mnb: "Muna",
+    mne: "Naba",
+    mnf: "Mundani",
+    mni: "Manipuri",
+    mnk: "Mandinka",
+    mnw: "Mon",
+    mnx: "Manikion",
+    moa: "Mwan",
+    mog: "Mongondow",
+    mon: "Mongolian",
+    mop: "MopánMaya",
+    mor: "Moro",
+    mos: "Mossi",
+    mox: "Molima",
+    moz: "Mukulu",
+    mpg: "Marba",
+    mpm: "YosondúaMixtec",
+    mpp: "Migabac",
+    mpx: "Misima-Panaeati",
+    mqb: "Mbuko",
+    mqf: "Momuna",
+    mqj: "Mamasa",
+    mqn: "Moronene",
+    mqy: "Manggarai",
+    mri: "Maori",
+    mrj: "WesternMari",
+    mrr: "Maria(India)",
+    mrt: "MarghiCentral",
+    mrw: "Maranao",
+    msh: "MasikoroMalagasy",
+    msi: "SabahMalay",
+    msw: "Mansoanka",
+    msy: "Aruamu",
+    mtd: "Mualang",
+    mtj: "Moskona",
+    mto: "TotontepecMixe",
+    mtr: "Mewari",
+    mtu: "TututepecMixtec",
+    mtx: "TidaáMixtec",
+    mua: "Mundang",
+    mug: "Musgu",
+    muh: "Mündü",
+    mui: "Musi",
+    mup: "Malvi",
+    mur: "Murle",
+    muv: "Muthuvan",
+    muy: "Muyang",
+    mve: "Marwari(Pakistan)",
+    mvp: "Duri",
+    mvy: "IndusKohistani",
+    mwq: "MünChin",
+    mwv: "Mentawai",
+    mxb: "TezoatlánMixtec",
+    mxq: "JuquilaMixe",
+    mxs: "HuitepecMixtec",
+    mxt: "JamiltepecMixtec",
+    mxu: "Mada(Cameroon)",
+    mxv: "MetlatónocMixtec",
+    mxy: "SoutheasternNochixtlánMixtec",
+    mya: "Burmese",
+    myb: "Mbay",
+    myk: "MamaraSenoufo",
+    myv: "Erzya",
+    myx: "Masaaba",
+    myy: "Macuna",
+    mza: "SantaMaríaZacatepecMixtec",
+    mzi: "IxcatlánMazatec",
+    mzj: "Manya",
+    mzk: "NigeriaMambila",
+    mzl: "MazatlánMixe",
+    mzm: "Mumuye",
+    mzw: "Deg",
+    nab: "SouthernNambikuára",
+    nag: "NagaPidgin",
+    nal: "Nalik",
+    nan: "Min Nan Chinese",
+    nap: "Neapolitan",
+    nas: "Naasioi",
+    naw: "Nawuri",
+    nbh: "Ngamo",
+    nca: "Iyo",
+    ncf: "Notsi",
+    nch: "CentralHuastecaNahuatl",
+    ncj: "NorthernPueblaNahuatl",
+    ncl: "MichoacánNahuatl",
+    nco: "Sibe",
+    ncu: "Chumburung",
+    ncx: "CentralPueblaNahuatl",
+    ndi: "SambaLeko",
+    ndj: "Ndamba",
+    ndo: "Ndonga",
+    ndp: "Ndo",
+    ndv: "Ndut",
+    ndy: "Lutos",
+    ndz: "Ndogo",
+    neb: "Toura(Côted'Ivoire)",
+    nep: "Nepali(macrolanguage)",
+    new: "Newari",
+    nfa: "Dhao",
+    nfr: "Nafaanra",
+    nga: "Ngbaka",
+    ngi: "Ngizim",
+    ngl: "Lomwe",
+    ngp: "Ngulu",
+    ngu: "GuerreroNahuatl",
+    nhe: "EasternHuastecaNahuatl",
+    nhg: "TetelcingoNahuatl",
+    nhi: "Zacatlán-Ahuacatlán-TepetzintlaNahuatl",
+    nhn: "CentralNahuatl",
+    nhq: "HuaxcalecaNahuatl",
+    nhu: "Noone",
+    nhw: "WesternHuastecaNahuatl",
+    nhx: "Isthmus-MecayapanNahuatl",
+    nhy: "NorthernOaxacaNahuatl",
+    nia: "Nias",
+    nij: "Ngaju",
+    nim: "Nilamba",
+    nin: "Ninzo",
+    nja: "Nzanyi",
+    nko: "Nkonya",
+    nla: "Ngombale",
+    nlc: "Nalca",
+    nld: "Dutch",
+    nlg: "Gela",
+    nlk: "NiniaYali",
+    nlv: "OrizabaNahuatl",
+    nmg: "Kwasio",
+    nmz: "Nawdm",
+    nnb: "Nande",
+    nnh: "Ngiemboon",
+    nnq: "Ngindo",
+    nnw: "SouthernNuni",
+    noa: "WounMeu",
+    nob: "NorwegianBokmål",
+    nod: "NorthernThai",
+    noe: "Nimadi",
+    nog: "Nogai",
+    not: "Nomatsiguenga",
+    npl: "SoutheasternPueblaNahuatl",
+    npy: "Napu",
+    nso: "Pedi",
+    nst: "TaseNaga",
+    nsu: "SierraNegraNahuatl",
+    ntm: "Nateni",
+    ntr: "Delo",
+    nuj: "Nyole",
+    nup: "Nupe-Nupe-Tako",
+    nus: "Nuer",
+    nuz: "TlamacazapaNahuatl",
+    nwb: "Nyabwa",
+    nxq: "Naxi",
+    nya: "Nyanja",
+    nyf: "Giryama",
+    nyn: "Nyankole",
+    nyo: "Nyoro",
+    nyu: "Nyungwe",
+    nyy: "Nyakyusa-Ngonde",
+    nzi: "Nzima",
+    obo: "OboManobo",
+    oci: "Occitan(post1500)",
+    odk: "Od",
+    odu: "Odual",
+    ogo: "Khana",
+    ojb: "NorthwesternOjibwa",
+    oku: "Oku",
+    old: "Mochi",
+    omw: "SouthTairora",
+    onb: "Lingao",
+    ood: "TohonoO'odham",
+    orc: "Orma",
+    orm: "Oromo",
+    oru: "Ormuri",
+    ory: "Odia",
+    oss: "Ossetian",
+    ote: "MezquitalOtomi",
+    otq: "QuerétaroOtomi",
+    ozm: "Koonzime",
+    pab: "Parecís",
+    pad: "Paumarí",
+    pag: "Pangasinan",
+    pam: "Pampanga",
+    pan: "Panjabi",
+    pao: "NorthernPaiute",
+    pap: "Papiamento",
+    pau: "Palauan",
+    pbb: "Páez",
+    pbc: "Patamona",
+    pbi: "Parkwa",
+    pbs: "CentralPame",
+    pbt: "SouthernPashto",
+    pbu: "NorthernPashto",
+    pce: "RuchingPalaung",
+    pcm: "NigerianPidgin",
+    pex: "Petats",
+    pez: "EasternPenan",
+    phl: "Phalura",
+    phr: "Pahari-Potwari",
+    pib: "Yine",
+    pil: "Yom",
+    pip: "Pero",
+    pir: "Piratapuyo",
+    pis: "Pijin",
+    piy: "Piya-Kwonci",
+    pjt: "Pitjantjatjara",
+    pkb: "Pokomo",
+    pko: "Pökoot",
+    plk: "KohistaniShina",
+    pls: "SanMarcosTlacoyalcoPopoloca",
+    plt: "PlateauMalagasy",
+    plw: "Brooke'sPointPalawano",
+    pmf: "Pamona",
+    pmq: "NorthernPame",
+    pms: "Piemontese",
+    pmy: "PapuanMalay",
+    pnb: "WesternPanjabi",
+    pne: "WesternPenan",
+    pny: "Pinyin",
+    poc: "Poqomam",
+    poe: "SanJuanAtzingoPopoloca",
+    poh: "Poqomchi'",
+    poi: "HighlandPopoluca",
+    pol: "Polish",
+    por: "Portuguese",
+    pov: "UpperGuineaCrioulo",
+    pow: "SanFelipeOtlaltepecPopoloca",
+    poy: "Pogolo",
+    ppk: "Uma",
+    pps: "SanLuísTemalacayucaPopoloca",
+    prf: "Paranan",
+    prk: "Parauk",
+    prq: "AshéninkaPerené",
+    prt: "Phai",
+    pse: "CentralMalay",
+    pss: "Kaulong",
+    pst: "CentralPashto",
+    ptu: "Bambam",
+    pua: "WesternHighlandPurepecha",
+    pui: "Puinave",
+    pus: "Pushto",
+    pwg: "Gapapaiwa",
+    pwn: "Paiwan",
+    pww: "PwoNorthernKaren",
+    pxm: "QuetzaltepecMixe",
+    qub: "HuallagaHuánucoQuechua",
+    quc: "K'iche'",
+    quf: "LambayequeQuechua",
+    qug: "ChimborazoHighlandQuichua",
+    quh: "SouthBolivianQuechua",
+    qul: "NorthBolivianQuechua",
+    qum: "Sipacapense",
+    qup: "SouthernPastazaQuechua",
+    qur: "YanahuancaPascoQuechua",
+    qus: "SantiagodelEsteroQuichua",
+    quv: "Sacapulteco",
+    quw: "TenaLowlandQuichua",
+    qux: "YauyosQuechua",
+    quy: "AyacuchoQuechua",
+    quz: "CuscoQuechua",
+    qva: "Ambo-PascoQuechua",
+    qvc: "CajamarcaQuechua",
+    qve: "EasternApurímacQuechua",
+    qvh: "Huamalíes-DosdeMayoHuánucoQuechua",
+    qvi: "ImbaburaHighlandQuichua",
+    qvj: "LojaHighlandQuichua",
+    qvl: "CajatamboNorthLimaQuechua",
+    qvm: "Margos-Yarowilca-LauricochaQuechua",
+    qvn: "NorthJunínQuechua",
+    qvo: "NapoLowlandQuechua",
+    qvs: "SanMartínQuechua",
+    qvw: "HuayllaWancaQuechua",
+    qvz: "NorthernPastazaQuichua",
+    qwa: "CorongoAncashQuechua",
+    qwh: "HuaylasAncashQuechua",
+    qws: "SihuasAncashQuechua",
+    qxa: "ChiquiánAncashQuechua",
+    qxh: "PanaoHuánucoQuechua",
+    qxl: "SalasacaHighlandQuichua",
+    qxn: "NorthernConchucosAncashQuechua",
+    qxo: "SouthernConchucosAncashQuechua",
+    qxp: "PunoQuechua",
+    qxr: "CañarHighlandQuichua",
+    qxt: "SantaAnadeTusiPascoQuechua",
+    qxu: "Arequipa-LaUniónQuechua",
+    qxw: "JaujaWancaQuechua",
+    rag: "Logooli",
+    rah: "Rabha",
+    rai: "Ramoaaina",
+    rap: "Rapanui",
+    rav: "Sampang",
+    raw: "Rawang",
+    rej: "Rejang",
+    rel: "Rendille",
+    rgu: "Ringgou",
+    rhg: "Rohingya",
+    rif: "Tarifit",
+    rim: "Nyaturu",
+    rjs: "Rajbanshi",
+    rkt: "Rangpuri",
+    rmc: "CarpathianRomani",
+    rmo: "SinteRomani",
+    rmy: "VlaxRomani",
+    rng: "Ronga",
+    rnl: "Ranglong",
+    rob: "Tae'",
+    rof: "Rombo",
+    roh: "Romansh",
+    rol: "Romblomanon",
+    ron: "Romanian",
+    roo: "Rotokas",
+    rop: "Kriol",
+    rro: "Waima",
+    rth: "Ratahan",
+    rub: "Gungu",
+    ruc: "Ruuli",
+    ruf: "Luguru",
+    rug: "Roviana",
+    run: "Rundi",
+    rus: "Russian",
+    rwm: "Amba(Uganda)",
+    rwr: "Marwari(India)",
+    sab: "Buglere",
+    sag: "Sango",
+    sah: "Yakut",
+    saj: "Sahu",
+    saq: "Samburu",
+    sas: "Sasak",
+    sau: "Saleman",
+    say: "Saya",
+    sba: "Ngambay",
+    sbd: "SouthernSamo",
+    sbl: "BotolanSambal",
+    sbn: "SindhiBhil",
+    sbp: "Sangu(Tanzania)",
+    sch: "Sakachep",
+    sck: "Sadri",
+    scl: "Shina",
+    scn: "Sicilian",
+    sco: "Scots",
+    sda: "Toraja-Sa'dan",
+    sdo: "Bukar-SadungBidayuh",
+    sea: "Semai",
+    seh: "Sena",
+    sei: "Seri",
+    ses: "KoyraboroSenniSonghai",
+    sey: "Secoya",
+    sgb: "Mag-antsiAyta",
+    sgj: "Surgujia",
+    sgw: "SebatBetGurage",
+    shi: "Tachelhit",
+    shk: "Shilluk",
+    shn: "Shan",
+    sho: "Shanga",
+    shp: "Shipibo-Conibo",
+    sid: "Sidamo",
+    sig: "Paasaal",
+    sil: "TumulungSisaala",
+    sin: "Sinhala",
+    sip: "Sikkimese",
+    siw: "Siwai",
+    sja: "Epena",
+    sjm: "Mapun",
+    sjp: "Surjapuri",
+    sjr: "Siar-Lak",
+    skg: "SakalavaMalagasy",
+    skr: "Saraiki",
+    sld: "Sissala",
+    slk: "Slovak",
+    slu: "Selaru",
+    slv: "Slovenian",
+    sml: "CentralSama",
+    smo: "Samoan",
+    sna: "Shona",
+    snc: "Sinaugoro",
+    snd: "Sindhi",
+    sne: "BauBidayuh",
+    snk: "Soninke",
+    snn: "Siona",
+    snp: "Siane",
+    snv: "Sa'ban",
+    snw: "Selee",
+    sol: "Solos",
+    som: "Somali",
+    soy: "Miyobe",
+    spa: "Spanish",
+    spp: "SupyireSenoufo",
+    sps: "Saposa",
+    spy: "Sabaot",
+    src: "LogudoreseSardinian",
+    srd: "Sardinian",
+    sri: "Siriano",
+    srm: "Saramaccan",
+    srn: "SrananTongo",
+    sro: "CampidaneseSardinian",
+    srp: "Serbian",
+    srr: "Serer",
+    srx: "Sirmauri",
+    ssi: "Sansi",
+    ste: "Liana-Seti",
+    stn: "Owa",
+    stp: "SoutheasternTepehuan",
+    sua: "Sulka",
+    suc: "WesternSubanon",
+    suk: "Sukuma",
+    sun: "Sundanese",
+    sur: "Mwaghavul",
+    sus: "Susu",
+    suv: "Puroik",
+    suz: "Sunwar",
+    sva: "Svan",
+    swe: "Swedish",
+    swh: "Swahili(individuallanguage)",
+    swv: "Shekhawati",
+    sxb: "Suba",
+    sxn: "Sangir",
+    sya: "Siang",
+    syl: "Sylheti",
+    sza: "Semelai",
+    szy: "Sakizaya",
+    tac: "LowlandTarahumara",
+    taj: "EasternTamang",
+    tam: "Tamil",
+    tan: "Tangale",
+    tao: "Yami",
+    tap: "Taabwa",
+    taq: "Tamasheq",
+    tar: "CentralTarahumara",
+    tat: "Tatar",
+    tav: "Tatuyo",
+    tay: "Atayal",
+    tbc: "Takia",
+    tbf: "Mandara",
+    tbg: "NorthTairora",
+    tbk: "CalamianTagbanwa",
+    tbl: "Tboli",
+    tby: "Tabaru",
+    tbz: "Ditammari",
+    tca: "Ticuna",
+    tcc: "Datooga",
+    tcf: "MalinaltepecMe'phaa",
+    tcy: "Tulu",
+    tcz: "ThadoChin",
+    tdj: "Tajio",
+    tdn: "Tondano",
+    tdx: "Tandroy-MahafalyMalagasy",
+    ted: "TepoKrumen",
+    tee: "HuehuetlaTepehua",
+    tel: "Telugu",
+    tem: "Timne",
+    teo: "Teso",
+    ter: "Tereno",
+    tew: "Tewa(USA)",
+    tex: "Tennet",
+    tfr: "Teribe",
+    tgc: "Tigak",
+    tgj: "Tagin",
+    tgk: "Tajik",
+    tgl: "Tagalog",
+    tgo: "Sudest",
+    tgp: "Tangoa",
+    tha: "Thai",
+    the: "ChitwaniaTharu",
+    thk: "Tharaka",
+    thl: "DangauraTharu",
+    thq: "KochilaTharu",
+    thr: "RanaTharu",
+    thv: "TahaggartTamahaq",
+    tig: "Tigre",
+    tih: "TimugonMurut",
+    tik: "Tikar",
+    tio: "Teop",
+    tir: "Tigrinya",
+    tkg: "TesakaMalagasy",
+    tkr: "Tsakhur",
+    tkt: "KathoriyaTharu",
+    tlb: "Tobelo",
+    tli: "Tlingit",
+    tlj: "Talinga-Bwisi",
+    tlp: "FilomenaMata-CoahuitlánTotonac",
+    tly: "Talysh",
+    tmc: "Tumak",
+    tmf: "Toba-Maskoy",
+    tna: "Tacana",
+    tng: "Tobanga",
+    tnk: "Kwamera",
+    tnn: "NorthTanna",
+    tnp: "Whitesands",
+    tnr: "Ménik",
+    tnt: "Tontemboan",
+    tob: "Toba",
+    toc: "CoyutlaTotonac",
+    toh: "Gitonga",
+    tok: "TokiPona",
+    tom: "Tombulu",
+    top: "PapantlaTotonac",
+    tos: "HighlandTotonac",
+    tpi: "TokPisin",
+    tpl: "TlacoapaMe'phaa",
+    tpm: "Tampulma",
+    tpp: "PisafloresTepehua",
+    tpt: "TlachichilcoTepehua",
+    tpz: "Tinputz",
+    tqp: "Tomoip",
+    trc: "CopalaTriqui",
+    tri: "Trió",
+    trn: "Trinitario",
+    trp: "KokBorok",
+    trq: "SanMartínItunyosoTriqui",
+    trs: "ChicahuaxtlaTriqui",
+    trv: "Sediq",
+    trw: "Torwali",
+    tsn: "Tswana",
+    tso: "Tsonga",
+    tsz: "Purepecha",
+    ttc: "Tektiteko",
+    tte: "Bwanabwana",
+    ttj: "Tooro",
+    ttq: "TawallammatTamajaq",
+    ttr: "Tera",
+    ttu: "Torau",
+    tue: "Tuyuca",
+    tuf: "CentralTunebo",
+    tui: "Tupuri",
+    tuk: "Turkmen",
+    tul: "Tula",
+    tuo: "Tucano",
+    tuq: "Tedaga",
+    tur: "Turkish",
+    tuv: "Turkana",
+    tuy: "Tugen",
+    tvo: "Tidore",
+    tvu: "Tunen",
+    tvw: "Sedoa",
+    twb: "WesternTawbuid",
+    twe: "Tewa(Indonesia)",
+    twu: "Termanu",
+    txa: "Tombonuo",
+    txq: "Tii",
+    txs: "Tonsea",
+    txu: "Kayapó",
+    txy: "TanosyMalagasy",
+    tye: "Kyanga",
+    tzh: "Tzeltal",
+    tzj: "Tz'utujil",
+    tzo: "Tzotzil",
+    ubl: "Buhi'nonBikol",
+    ubu: "Umbu-Ungu",
+    udl: "Wuzlam",
+    udm: "Udmurt",
+    udu: "Uduk",
+    uig: "Uighur",
+    uki: "Kui(India)",
+    ukr: "Ukrainian",
+    ukv: "Kuku",
+    umb: "Umbundu",
+    upv: "Uripiv-Wala-Rano-Atchin",
+    ura: "Urarina",
+    urb: "Urubú-Kaapor",
+    urd: "Urdu",
+    urh: "Urhobo",
+    urk: "UrakLawoi'",
+    urt: "Urat",
+    ury: "Orya",
+    ush: "Ushojo",
+    usp: "Uspanteco",
+    uzb: "Uzbek",
+    uzn: "NorthernUzbek",
+    vag: "Vagla",
+    vah: "Varhadi-Nagpuri",
+    vai: "Vai",
+    var: "Huarijio",
+    ver: "MomJango",
+    vid: "Vidunda",
+    vie: "Vietnamese",
+    vif: "Vili",
+    vmc: "JuxtlahuacaMixtec",
+    vmj: "IxtayutlaMixtec",
+    vmm: "MitlatongoMixtec",
+    vmp: "SoyaltepecMazatec",
+    vmw: "Makhuwa",
+    vmy: "AyautlaMazatec",
+    vmz: "MazatlánMazatec",
+    vro: "Võro",
+    vun: "Vunjo",
+    vut: "Vute",
+    wal: "Wolaytta",
+    wap: "Wapishana",
+    war: "Waray(Philippines)",
+    waw: "Waiwai",
+    way: "Wayana",
+    wba: "Warao",
+    wbl: "Wakhi",
+    wbr: "Wagdi",
+    wci: "WaciGbe",
+    weo: "Wemale",
+    wes: "CameroonPidgin",
+    wja: "Waja",
+    wji: "Warji",
+    wlo: "Wolio",
+    wlx: "Wali(Ghana)",
+    wmw: "Mwani",
+    wob: "WèNorthern",
+    wof: "GambianWolof",
+    wol: "Wolof",
+    wsg: "AdilabadGondi",
+    wwa: "Waama",
+    xal: "Kalmyk",
+    xdy: "MalayicDayak",
+    xed: "Hdi",
+    xer: "Xerénte",
+    xhe: "Khetrani",
+    xho: "Xhosa",
+    xka: "Kalkoti",
+    xkl: "MainstreamKenyah",
+    xmf: "Mingrelian",
+    xmm: "ManadoMalay",
+    xmv: "AntankaranaMalagasy",
+    xnj: "Ngoni(Tanzania)",
+    xnr: "Kangri",
+    xog: "Soga",
+    xon: "Konkomba",
+    xpe: "LiberiaKpelle",
+    xrb: "EasternKaraboro",
+    xsb: "Sambal",
+    xsm: "Kasem",
+    xsr: "Sherpa",
+    xsu: "Sanumá",
+    xta: "AlcozaucaMixtec",
+    xtd: "Diuxi-TilantongoMixtec",
+    xte: "Ketengban",
+    xti: "SinicahuaMixtec",
+    xtm: "MagdalenaPeñascoMixtec",
+    xtn: "NorthernTlaxiacoMixtec",
+    xtu: "CuyamecalcoMixtec",
+    xua: "AluKurumba",
+    xuo: "Kuo",
+    yaa: "Yaminahua",
+    yad: "Yagua",
+    yal: "Yalunka",
+    yam: "Yamba",
+    yao: "Yao",
+    yaq: "Yaqui",
+    yas: "Nugunu(Cameroon)",
+    yat: "Yambeta",
+    yav: "Yangben",
+    yay: "Agwagwune",
+    yaz: "Lokaa",
+    yba: "Yala",
+    ybb: "Yemba",
+    ycl: "Lolopo",
+    ycn: "Yucuna",
+    ydd: "EasternYiddish",
+    ydg: "Yidgha",
+    yea: "Ravula",
+    yer: "Tarok",
+    yes: "Nyankpa",
+    yka: "Yakan",
+    yli: "AnggurukYali",
+    yor: "Yoruba",
+    yre: "Yaouré",
+    yua: "Yucateco",
+    yue: "YueChinese",
+    yuz: "Yuracare",
+    yva: "Yawa",
+    zaa: "SierradeJuárezZapotec",
+    zab: "WesternTlacolulaValleyZapotec",
+    zac: "OcotlánZapotec",
+    zad: "CajonosZapotec",
+    zae: "YareniZapotec",
+    zai: "IsthmusZapotec",
+    zam: "MiahuatlánZapotec",
+    zao: "OzolotepecZapotec",
+    zaq: "AloápamZapotec",
+    zar: "RincónZapotec",
+    zas: "SantoDomingoAlbarradasZapotec",
+    zav: "YatzachiZapotec",
+    zaw: "MitlaZapotec",
+    zca: "CoatecasAltasZapotec",
+    zga: "Kinga",
+    zim: "Mesme",
+    ziw: "Zigula",
+    zmz: "Mbandja",
+    zne: "Zande(individuallanguage)",
+    zoc: "CopainaláZoque",
+    zoh: "ChimalapaZoque",
+    zor: "RayónZoque",
+    zos: "FranciscoLeónZoque",
+    zpc: "ChoapanZapotec",
+    zpg: "GueveaDeHumboldtZapotec",
+    zpi: "SantaMaríaQuiegolaniZapotec",
+    zpl: "LachixíoZapotec",
+    zpm: "MixtepecZapotec",
+    zpo: "AmatlánZapotec",
+    zpt: "SanVicenteCoatlánZapotec",
+    zpu: "YalálagZapotec",
+    zpv: "ChichicapanZapotec",
+    zpy: "MazaltepecZapotec",
+    zpz: "TexmelucanZapotec",
+    zsm: "StandardMalay",
+    ztg: "XanaguíaZapotec",
+    ztn: "SantaCatarinaAlbarradasZapotec",
+    ztp: "LoxichaZapotec",
+    ztq: "Quioquitani-QuieríZapotec",
+    zts: "TilquiapanZapotec",
+    ztu: "GüiláZapotec",
+    zty: "YateeZapotec",
+    zul: "Zulu",
+    zyb: "YongbeiZhuang",
+    zyp: "ZypheChin",
+    zza: "Zaza",
+};
\ No newline at end of file
diff --git a/sharedUtils/omniAsrSupportedLangs.ts b/sharedUtils/omniAsrSupportedLangs.ts
new file mode 100644
index 000000000..bafb5995a
--- /dev/null
+++ b/sharedUtils/omniAsrSupportedLangs.ts
@@ -0,0 +1,315 @@
+/**
+ * OmniASR supported-language snapshot
+ * -----------------------------------
+ *
+ * Static snapshot of the language codes supported by the OmniASR transcription
+ * service (Meta Omnilingual ASR — `omniASR_LLM_1B_v2`). Entries are normally
+ * `{iso639_3}_{Script}` (e.g. `eng_Latn`); a few add a third part (`lld_Latn_gherd`).
+ *
+ * We bundle this list so the extension can validate / resolve language codes
+ * offline, with no runtime network dependency.
+ *
+ * Regenerating
+ * ~~~~~~~~~~~~
+ * If we change ASR providers or the underlying model, regenerate the array
+ * below (keep this header and the Set export) from the live `/languages` endpoint:
+ *
+ *   curl -s "https://genesis-ai-dev--codex-asr-serve.modal.run/languages" \
+ *     | python3 -c "
+ *   import json, sys
+ *   d = json.load(sys.stdin)
+ *   langs = sorted(set(d['languages']))
+ *   print('export const OMNI_ASR_SUPPORTED_LANGS: readonly string[] = [')
+ *   for i in range(0, len(langs), 6):
+ *       print('    ' + ', '.join(f'\"{c}\"' for c in langs[i:i+6]) + ',')
+ *   print('];')
+ *   "
+ *
+ * (Pre-rename, the host was `genesis-ai-dev--mms-zeroshot-asr-serve.modal.run`.)
+ *
+ * Snapshot taken: 2026-06-04. Server reported 1672 languages.
+ */
+
+export const OMNI_ASR_SUPPORTED_LANGS: readonly string[] = [
+    "aae_Latn", "aal_Latn", "abb_Latn", "abi_Latn", "abk_Cyrl", "abn_Latn",
+    "abp_Latn", "abr_Latn", "abs_Latn", "aca_Latn", "acd_Latn", "ace_Latn",
+    "acf_Latn", "ach_Latn", "acm_Arab", "acn_Latn", "acr_Latn", "acu_Latn",
+    "acw_Arab", "ade_Latn", "adh_Latn", "adj_Latn", "adx_Tibt", "ady_Cyrl",
+    "aeb_Arab", "aec_Arab", "aeu_Latn", "afb_Arab", "afo_Latn", "afr_Latn",
+    "agd_Latn", "agg_Latn", "agn_Latn", "agr_Latn", "agu_Latn", "agx_Cyrl",
+    "aha_Latn", "ahk_Latn", "ahl_Latn", "ahs_Latn", "aia_Latn", "ajg_Latn",
+    "aka_Latn", "akb_Latn", "ake_Latn", "akp_Latn", "ala_Latn", "alj_Latn",
+    "aln_Latn", "alo_Latn", "alp_Latn", "als_Latn", "alt_Cyrl", "alz_Latn",
+    "ame_Latn", "amf_Latn", "amh_Ethi", "ami_Latn", "amk_Latn", "amu_Latn",
+    "anc_Latn", "ank_Latn", "ann_Latn", "anp_Deva", "anw_Latn", "any_Latn",
+    "aom_Latn", "aoz_Latn", "apb_Latn", "apc_Arab", "apd_Arab", "apr_Latn",
+    "arb_Arab", "arg_Latn", "arl_Latn", "arq_Arab", "ars_Arab", "ary_Arab",
+    "arz_Arab", "asa_Latn", "asg_Latn", "asm_Beng", "ast_Latn", "ata_Latn",
+    "atb_Latn", "atg_Latn", "ati_Latn", "atq_Latn", "ava_Cyrl", "avn_Latn",
+    "avu_Latn", "awa_Deva", "awb_Latn", "awo_Latn", "ayl_Arab", "ayo_Latn",
+    "ayp_Arab", "ayr_Latn", "ayz_Latn", "aze_Arab", "aze_Cyrl", "aze_Latn",
+    "azg_Latn", "azz_Latn", "bag_Latn", "bak_Cyrl", "bam_Latn", "ban_Latn",
+    "bao_Latn", "bas_Latn", "bav_Latn", "bax_Latn", "bba_Latn", "bbb_Latn",
+    "bbc_Latn", "bbj_Latn", "bbl_Geor", "bbo_Latn", "bbu_Latn", "bcc_Arab",
+    "bcc_Latn", "bce_Latn", "bci_Latn", "bcl_Latn", "bcs_Latn", "bcw_Latn",
+    "bcy_Latn", "bcz_Latn", "bda_Latn", "bde_Latn", "bdg_Latn", "bdh_Latn",
+    "bdm_Latn", "bdq_Latn", "bdu_Latn", "beb_Latn", "beh_Latn", "bel_Cyrl",
+    "bem_Latn", "ben_Beng", "bep_Latn", "bew_Latn", "bex_Latn", "bfa_Latn",
+    "bfd_Latn", "bfo_Latn", "bft_Arab", "bfy_Deva", "bfz_Deva", "bgc_Deva",
+    "bgp_Arab", "bgq_Deva", "bgr_Latn", "bgt_Latn", "bgw_Deva", "bha_Deva",
+    "bhb_Deva", "bhh_Cyrl", "bho_Deva", "bhp_Latn", "bht_Deva", "bhz_Latn",
+    "bib_Latn", "bim_Latn", "bis_Latn", "biv_Latn", "bjj_Deva", "bjk_Latn",
+    "bjn_Latn", "bjr_Latn", "bjt_Latn", "bjv_Latn", "bjw_Latn", "bjz_Latn",
+    "bkd_Latn", "bkh_Latn", "bkm_Latn", "bkv_Latn", "bky_Latn", "ble_Latn",
+    "blh_Latn", "blt_Latn", "blx_Latn", "blz_Latn", "bmm_Latn", "bmq_Latn",
+    "bmr_Latn", "bmu_Latn", "bmv_Latn", "bng_Beng", "bnm_Latn", "bnn_Latn",
+    "bno_Latn", "bnp_Latn", "bns_Deva", "boa_Latn", "bod_Tibt", "boj_Latn",
+    "bom_Latn", "bor_Latn", "bos_Latn", "bou_Latn", "bov_Latn", "box_Latn",
+    "bpr_Latn", "bps_Latn", "bqc_Latn", "bqg_Latn", "bqi_Arab", "bqj_Latn",
+    "bqp_Latn", "bra_Deva", "bre_Latn", "brh_Arab", "bri_Latn", "bru_Latn",
+    "brx_Deva", "bsc_Latn", "bsh_Arab", "bsj_Latn", "bsk_Latn", "bsq_Latn",
+    "bss_Latn", "bsy_Latn", "btd_Latn", "btm_Latn", "bts_Latn", "btt_Latn",
+    "btv_Arab", "btx_Latn", "bud_Latn", "bug_Latn", "bul_Cyrl", "bum_Latn",
+    "buo_Latn", "bus_Latn", "bux_Latn", "bvb_Latn", "bvc_Latn", "bvz_Latn",
+    "bwq_Latn", "bwr_Latn", "bwu_Latn", "bxf_Latn", "bxk_Latn", "byc_Latn",
+    "byr_Latn", "bys_Latn", "byv_Latn", "byx_Latn", "bzh_Latn", "bzi_Thai",
+    "bzj_Latn", "bzw_Latn", "caa_Latn", "cab_Latn", "cac_Latn", "cak_Latn",
+    "cap_Latn", "car_Latn", "cas_Latn", "cat_Latn", "cax_Latn", "cbc_Latn",
+    "cbi_Latn", "cbr_Latn", "cbs_Latn", "cbt_Latn", "cbu_Latn", "cbv_Latn",
+    "cce_Latn", "ccg_Latn", "cco_Latn", "cdj_Deva", "cdo_Hans", "ceb_Latn",
+    "ceg_Latn", "cek_Latn", "cen_Latn", "ces_Latn", "cfa_Latn", "cfm_Latn",
+    "cgc_Latn", "cgg_Latn", "che_Cyrl", "chf_Latn", "chq_Latn", "chv_Cyrl",
+    "chz_Latn", "cjk_Latn", "cjo_Latn", "cjp_Latn", "cjs_Cyrl", "ckb_Arab",
+    "ckl_Latn", "cko_Latn", "ckr_Latn", "ckt_Cyrl", "cky_Latn", "cla_Latn",
+    "cle_Latn", "cly_Latn", "cme_Latn", "cmn_Hans", "cmn_Hant", "cmo_Khmr",
+    "cmo_Latn", "cmr_Latn", "cnh_Latn", "cni_Latn", "cnl_Latn", "cnt_Latn",
+    "coe_Latn", "cof_Latn", "cok_Latn", "con_Latn", "cor_Latn", "cot_Latn",
+    "cou_Latn", "cpa_Latn", "cpb_Latn", "cpu_Latn", "cpx_Hans", "cpy_Latn",
+    "crh_Cyrl", "crk_Cans", "crk_Latn", "crn_Latn", "crq_Latn", "crs_Latn",
+    "crt_Latn", "csk_Latn", "cso_Latn", "ctd_Latn", "cte_Latn", "ctg_Beng",
+    "ctl_Latn", "cto_Latn", "ctu_Latn", "cuc_Latn", "cui_Latn", "cuk_Latn",
+    "cul_Latn", "cut_Latn", "cux_Latn", "cwa_Latn", "cwe_Latn", "cwt_Latn",
+    "cya_Latn", "cym_Latn", "daa_Latn", "dag_Latn", "dah_Latn", "dan_Latn",
+    "dar_Cyrl", "dav_Latn", "dbd_Latn", "dbj_Latn", "dbq_Latn", "dcc_Arab",
+    "ddn_Latn", "ded_Latn", "deg_Latn", "des_Latn", "deu_Latn", "dga_Latn",
+    "dgh_Latn", "dgi_Latn", "dgk_Latn", "dgo_Deva", "dgr_Latn", "dhi_Deva",
+    "did_Latn", "dig_Latn", "dik_Latn", "dip_Latn", "div_Thaa", "dje_Latn",
+    "djk_Latn", "dmk_Arab", "dml_Arab", "dnj_Latn", "dnt_Latn", "dnw_Latn",
+    "dop_Latn", "dos_Latn", "dru_Latn", "dsb_Latn", "dsh_Latn", "dtp_Latn",
+    "dts_Latn", "dty_Deva", "dua_Latn", "dug_Latn", "dwr_Latn", "dyi_Latn",
+    "dyo_Latn", "dyu_Latn", "dzg_Latn", "dzo_Tibt", "ebu_Latn", "ego_Latn",
+    "eip_Latn", "eiv_Latn", "eka_Latn", "ekk_Latn", "eko_Latn", "ekr_Latn",
+    "ell_Grek", "ell_Grek_cypr1249", "elm_Latn", "emp_Latn", "enb_Latn", "eng_Latn",
+    "enx_Latn", "epo_Latn", "ese_Latn", "ess_Latn", "esu_Latn", "eto_Latn",
+    "ets_Latn", "etu_Latn", "eus_Latn", "evn_Cyrl", "ewe_Latn", "ewo_Latn",
+    "eyo_Latn", "eza_Latn", "fal_Latn", "fan_Latn", "fao_Latn", "far_Latn",
+    "fas_Arab", "fat_Latn", "fia_Latn", "fij_Latn", "fil_Latn", "fin_Latn",
+    "fip_Latn", "fkk_Latn", "flr_Latn", "fmp_Latn", "fmu_Deva", "fon_Latn",
+    "fra_Latn", "frd_Latn", "fry_Latn", "fub_Latn", "fuc_Latn", "fue_Latn",
+    "ful_Latn", "fuq_Latn", "fuv_Latn", "gag_Cyrl", "gag_Latn", "gai_Latn",
+    "gam_Latn", "gau_Telu", "gbi_Latn", "gbk_Deva", "gbm_Deva", "gbo_Latn",
+    "gbr_Latn", "gby_Latn", "gcc_Latn", "gde_Latn", "gdf_Latn", "geb_Latn",
+    "gej_Latn", "ges_Latn", "ggg_Arab", "gid_Latn", "gig_Arab", "gil_Latn",
+    "giz_Latn", "gjk_Arab", "gjn_Latn", "gju_Arab", "gkn_Latn", "gld_Cyrl",
+    "gle_Latn", "glg_Latn", "glk_Arab", "glv_Latn", "glw_Latn", "gmv_Latn",
+    "gna_Latn", "gnd_Latn", "gng_Latn", "gof_Latn", "gog_Latn", "gol_Latn",
+    "gom_Deva", "gor_Latn", "gqr_Latn", "grc_Grek", "gri_Latn", "grn_Latn",
+    "grt_Beng", "gsl_Latn", "gso_Latn", "gub_Latn", "guc_Latn", "gud_Latn",
+    "gug_Latn", "guh_Latn", "gui_Latn", "guj_Gujr", "guk_Ethi", "gum_Latn",
+    "guo_Latn", "guq_Latn", "gur_Latn", "guu_Latn", "gux_Latn", "guz_Latn",
+    "gvc_Latn", "gvl_Latn", "gwc_Arab", "gwe_Latn", "gwi_Latn", "gwr_Latn",
+    "gwt_Arab", "gym_Latn", "gyr_Latn", "gyz_Latn", "had_Latn", "hag_Latn",
+    "hah_Latn", "hak_Latn", "hao_Latn", "hap_Latn", "hat_Latn", "hau_Latn",
+    "haw_Latn", "hay_Latn", "hbb_Latn", "hch_Latn", "heb_Hebr", "heh_Latn",
+    "her_Latn", "hia_Latn", "hif_Latn", "hig_Latn", "hil_Latn", "hin_Deva",
+    "hkk_Latn", "hla_Latn", "hlb_Deva", "hlt_Latn", "hne_Deva", "hnn_Latn",
+    "hno_Arab", "hns_Latn", "hoc_Orya", "hrv_Latn", "hsb_Latn", "hto_Latn",
+    "hub_Latn", "hue_Latn", "hui_Latn", "hul_Latn", "hun_Latn", "hus_Latn",
+    "huu_Latn", "huv_Latn", "hux_Latn", "hvn_Latn", "hwc_Latn", "hwo_Latn",
+    "hye_Armn", "hyw_Armn", "iba_Latn", "ibb_Latn", "ibo_Latn", "icr_Latn",
+    "ida_Latn", "idd_Latn", "idu_Latn", "ifa_Latn", "ifb_Latn", "ife_Latn",
+    "ifk_Latn", "ifu_Latn", "ify_Latn", "igl_Latn", "ign_Latn", "ijc_Latn",
+    "ijn_Latn", "ikk_Latn", "ikw_Latn", "ilb_Latn", "ilo_Latn", "imo_Latn",
+    "ina_Latn", "inb_Latn", "ind_Latn", "iou_Latn", "ipi_Latn", "ipk_Latn",
+    "iqw_Latn", "iri_Latn", "irk_Latn", "ish_Latn", "isl_Latn", "iso_Latn",
+    "ita_Latn", "itl_Cyrl", "its_Latn", "itv_Latn", "itw_Latn", "itz_Latn",
+    "ixl_Latn", "izr_Latn", "izz_Latn", "jac_Latn", "jal_Latn", "jam_Latn",
+    "jav_Latn", "jax_Latn", "jbu_Latn", "jen_Latn", "jic_Latn", "jiv_Latn",
+    "jmc_Latn", "jmd_Latn", "jmx_Latn", "jpn_Jpan", "jqr_Latn", "juk_Latn",
+    "jun_Orya", "juo_Latn", "jvn_Latn", "kaa_Cyrl", "kab_Latn", "kac_Latn",
+    "kai_Latn", "kaj_Latn", "kak_Latn", "kam_Latn", "kan_Knda", "kao_Latn",
+    "kaq_Latn", "kas_Arab", "kat_Geor", "kay_Latn", "kaz_Cyrl", "kbd_Cyrl",
+    "kbl_Latn", "kbo_Latn", "kbp_Latn", "kbq_Latn", "kbr_Latn", "kbt_Latn",
+    "kby_Latn", "kca_Cyrl", "kcg_Latn", "kcn_Latn", "kcq_Latn", "kdc_Latn",
+    "kde_Latn", "kdh_Latn", "kdi_Latn", "kdj_Latn", "kdl_Latn", "kdn_Latn",
+    "kdt_Khmr", "kea_Latn", "kek_Latn", "ken_Latn", "keo_Latn", "ker_Latn",
+    "keu_Latn", "key_Telu", "kez_Latn", "kfb_Deva", "kff_Telu", "kfk_Deva",
+    "kfq_Deva", "kfr_Gujr", "kfw_Latn", "kfx_Deva", "kha_Latn", "khg_Tibt",
+    "khk_Cyrl", "khm_Khmr", "khq_Latn", "khw_Arab", "kia_Latn", "kij_Latn",
+    "kik_Latn", "kin_Latn", "kir_Cyrl", "kix_Latn", "kjb_Latn", "kjc_Latn",
+    "kje_Latn", "kjg_Latn", "kjh_Cyrl", "kjk_Latn", "kki_Latn", "kkj_Latn",
+    "kle_Deva", "kln_Latn", "kls_Latn", "klu_Latn", "klv_Latn", "klw_Latn",
+    "kma_Latn", "kmd_Latn", "kml_Latn", "kmr_Arab", "kmr_Cyrl", "kmr_Latn",
+    "kmu_Latn", "kmy_Latn", "kna_Latn", "knb_Latn", "knc_Latn", "kne_Latn",
+    "knf_Latn", "knj_Latn", "knk_Latn", "knn_Deva", "kno_Latn", "kog_Latn",
+    "kol_Latn", "koo_Latn", "kor_Hang", "kpo_Latn", "kpq_Latn", "kps_Latn",
+    "kpv_Cyrl", "kpy_Cyrl", "kpz_Latn", "kqe_Latn", "kqo_Latn", "kqp_Latn",
+    "kqr_Latn", "kqy_Ethi", "krc_Cyrl", "kri_Latn", "krj_Latn", "krl_Latn",
+    "krr_Khmr", "krs_Latn", "kru_Deva", "krx_Latn", "ksb_Latn", "ksd_Latn",
+    "ksf_Latn", "ksr_Latn", "kss_Latn", "ksz_Deva", "ktb_Ethi", "ktj_Latn",
+    "kto_Latn", "kua_Latn", "kub_Latn", "kue_Latn", "kuh_Latn", "kum_Cyrl",
+    "kur_Arab", "kus_Latn", "kvn_Latn", "kvw_Latn", "kvx_Arab", "kwd_Latn",
+    "kwf_Latn", "kwi_Latn", "kwm_Latn", "kxc_Ethi", "kxf_Latn", "kxm_Thai",
+    "kxp_Arab", "kyb_Latn", "kyc_Latn", "kyf_Latn", "kyg_Latn", "kyo_Latn",
+    "kyq_Latn", "kyu_Kali", "kyx_Latn", "kyz_Latn", "kzf_Latn", "kzi_Latn",
+    "lac_Latn", "lag_Latn", "laj_Latn", "lam_Latn", "lao_Laoo", "las_Latn",
+    "lat_Latn", "lav_Latn", "law_Latn", "lbj_Tibt", "lbw_Latn", "lcm_Latn",
+    "lcp_Thai", "ldb_Latn", "led_Latn", "lee_Latn", "lef_Latn", "lem_Latn",
+    "lew_Latn", "lex_Latn", "lgg_Latn", "lgl_Latn", "lhu_Latn", "lia_Latn",
+    "lid_Latn", "lif_Deva", "lij_Latn", "lin_Latn", "lip_Latn", "lir_Latn",
+    "lis_Lisu", "lit_Latn", "lje_Latn", "ljp_Latn", "lkb_Latn", "lke_Latn",
+    "lla_Latn", "lld_Latn_gherd", "lld_Latn_valbadia", "llg_Latn", "lln_Latn", "lme_Latn",
+    "lnd_Latn", "lns_Latn", "lnu_Latn", "loa_Latn", "lob_Latn", "lok_Latn",
+    "lom_Latn", "lon_Latn", "loq_Latn", "lrk_Arab", "lsi_Latn", "lsm_Latn",
+    "lss_Arab", "ltg_Latn", "lth_Latn", "lto_Latn", "ltz_Latn", "lua_Latn",
+    "luc_Latn", "lug_Latn", "luo_Latn", "lus_Latn", "lwg_Latn", "lwo_Latn",
+    "lww_Latn", "lzz_Latn", "maa_Latn", "mab_Latn", "mad_Latn", "maf_Latn",
+    "mag_Deva", "mah_Latn", "mai_Deva", "maj_Latn", "mak_Latn", "mal_Mlym",
+    "mam_Latn", "maq_Latn", "mar_Deva", "mau_Latn", "maw_Latn", "max_Latn",
+    "maz_Latn", "mbb_Latn", "mbc_Latn", "mbh_Latn", "mbj_Latn", "mbt_Latn",
+    "mbu_Latn", "mca_Latn", "mcb_Latn", "mcd_Latn", "mcf_Latn", "mco_Latn",
+    "mcp_Latn", "mcq_Latn", "mcu_Latn", "mcx_Latn", "mda_Latn", "mdd_Latn",
+    "mdv_Latn", "mdy_Ethi", "med_Latn", "mee_Latn", "meh_Latn", "mej_Latn",
+    "mek_Latn", "mel_Latn", "men_Latn", "meq_Latn", "mer_Latn", "met_Latn",
+    "meu_Latn", "mev_Latn", "mfe_Latn", "mfh_Latn", "mfi_Latn", "mfk_Latn",
+    "mfm_Latn", "mfn_Latn", "mfo_Latn", "mfq_Latn", "mfv_Latn", "mfy_Latn",
+    "mfz_Latn", "mgd_Latn", "mge_Latn", "mgg_Latn", "mgh_Latn", "mgi_Latn",
+    "mgo_Latn", "mhi_Latn", "mhk_Latn", "mhr_Cyrl", "mhu_Latn", "mhx_Latn",
+    "mhy_Latn", "mib_Latn", "mie_Latn", "mif_Latn", "mig_Latn", "mih_Latn",
+    "mil_Latn", "mim_Latn", "min_Latn", "mio_Latn", "mip_Latn", "miq_Latn",
+    "mit_Latn", "miu_Latn", "miy_Latn", "miz_Latn", "mjl_Deva", "mjv_Mlym",
+    "mkd_Cyrl", "mkf_Latn", "mki_Arab", "mkl_Latn", "mkn_Latn", "mlg_Latn",
+    "mlq_Latn", "mlt_Latn", "mmc_Latn", "mmg_Latn", "mnb_Latn", "mne_Latn",
+    "mnf_Latn", "mni_Beng", "mnk_Latn", "mnw_Mymr", "mnx_Latn", "moa_Latn",
+    "mog_Latn", "mon_Cyrl", "mop_Latn", "mor_Latn", "mos_Latn", "mox_Latn",
+    "moz_Latn", "mpg_Latn", "mpm_Latn", "mpp_Latn", "mpx_Latn", "mqb_Latn",
+    "mqf_Latn", "mqj_Latn", "mqn_Latn", "mqy_Latn", "mri_Latn", "mrj_Cyrl",
+    "mrr_Deva", "mrt_Latn", "mrw_Latn", "msh_Latn", "msi_Latn", "msw_Latn",
+    "msy_Latn", "mtd_Latn", "mtj_Latn", "mto_Latn", "mtr_Deva", "mtu_Latn",
+    "mtx_Latn", "mua_Latn", "mug_Latn", "muh_Latn", "mui_Latn", "mup_Deva",
+    "mur_Latn", "muv_Mlym", "muy_Latn", "mve_Arab", "mvp_Latn", "mvy_Arab",
+    "mwq_Latn", "mwv_Latn", "mxb_Latn", "mxq_Latn", "mxs_Latn", "mxt_Latn",
+    "mxu_Latn", "mxv_Latn", "mxy_Latn", "mya_Mymr", "myb_Latn", "myk_Latn",
+    "myv_Cyrl", "myx_Latn", "myy_Latn", "mza_Latn", "mzi_Latn", "mzj_Latn",
+    "mzk_Latn", "mzl_Latn", "mzm_Latn", "mzw_Latn", "nab_Latn", "nag_Latn",
+    "nal_Latn", "nan_Latn", "nap_Latn", "nas_Latn", "naw_Latn", "nbh_Latn",
+    "nca_Latn", "ncf_Latn", "nch_Latn", "ncj_Latn", "ncl_Latn", "nco_Latn",
+    "ncu_Latn", "ncx_Latn", "ndi_Latn", "ndj_Latn", "ndo_Latn", "ndp_Latn",
+    "ndv_Latn", "ndy_Latn", "ndz_Latn", "neb_Latn", "nep_Deva", "new_Deva",
+    "nfa_Latn", "nfr_Latn", "nga_Latn", "ngi_Latn", "ngl_Latn", "ngp_Latn",
+    "ngu_Latn", "nhe_Latn", "nhg_Latn", "nhi_Latn", "nhn_Latn", "nhq_Latn",
+    "nhu_Latn", "nhw_Latn", "nhx_Latn", "nhy_Latn", "nia_Latn", "nij_Latn",
+    "nim_Latn", "nin_Latn", "nja_Latn", "nko_Latn", "nla_Latn", "nlc_Latn",
+    "nld_Latn", "nlg_Latn", "nlk_Latn", "nlv_Latn", "nmg_Latn", "nmz_Latn",
+    "nnb_Latn", "nnh_Latn", "nnq_Latn", "nnw_Latn", "noa_Latn", "nob_Latn",
+    "nod_Thai", "noe_Deva", "nog_Cyrl", "not_Latn", "npl_Latn", "npy_Latn",
+    "nso_Latn", "nst_Latn", "nsu_Latn", "ntm_Latn", "ntr_Latn", "nuj_Latn",
+    "nup_Latn", "nus_Latn", "nuz_Latn", "nwb_Latn", "nxq_Latn", "nya_Latn",
+    "nyf_Latn", "nyn_Latn", "nyo_Latn", "nyu_Latn", "nyy_Latn", "nzi_Latn",
+    "obo_Latn", "oci_Latn", "odk_Arab", "odu_Latn", "ogo_Latn", "ojb_Cans",
+    "ojb_Latn", "oku_Latn", "old_Latn", "omw_Latn", "onb_Latn", "ood_Latn",
+    "orc_Latn", "orm_Latn", "oru_Arab", "ory_Orya", "oss_Cyrl", "ote_Latn",
+    "otq_Latn", "ozm_Latn", "pab_Latn", "pad_Latn", "pag_Latn", "pam_Latn",
+    "pan_Guru", "pao_Latn", "pap_Latn", "pau_Latn", "pbb_Latn", "pbc_Latn",
+    "pbi_Latn", "pbs_Latn", "pbt_Arab", "pbu_Arab", "pce_Thai", "pcm_Latn",
+    "pex_Latn", "pez_Latn", "phl_Arab", "phr_Arab", "pib_Latn", "pil_Latn",
+    "pip_Latn", "pir_Latn", "pis_Latn", "piy_Latn", "pjt_Latn", "pkb_Latn",
+    "pko_Latn", "plk_Arab", "pls_Latn", "plt_Latn", "plw_Latn", "pmf_Latn",
+    "pmq_Latn", "pms_Latn", "pmy_Latn", "pnb_Arab", "pne_Latn", "pny_Latn",
+    "poc_Latn", "poe_Latn", "poh_Latn", "poi_Latn", "pol_Latn", "por_Latn",
+    "pov_Latn", "pow_Latn", "poy_Latn", "ppk_Latn", "pps_Latn", "prf_Latn",
+    "prk_Latn", "prq_Latn", "prt_Thai", "pse_Latn", "pss_Latn", "pst_Arab",
+    "ptu_Latn", "pua_Latn", "pui_Latn", "pus_Arab", "pwg_Latn", "pwn_Latn",
+    "pww_Thai", "pxm_Latn", "qub_Latn", "quc_Latn", "quf_Latn", "qug_Latn",
+    "quh_Latn", "qul_Latn", "qum_Latn", "qup_Latn", "qur_Latn", "qus_Latn",
+    "quv_Latn", "quw_Latn", "qux_Latn", "quy_Latn", "quz_Latn", "qva_Latn",
+    "qvc_Latn", "qve_Latn", "qvh_Latn", "qvi_Latn", "qvj_Latn", "qvl_Latn",
+    "qvm_Latn", "qvn_Latn", "qvo_Latn", "qvs_Latn", "qvw_Latn", "qvz_Latn",
+    "qwa_Latn", "qwh_Latn", "qws_Latn", "qxa_Latn", "qxh_Latn", "qxl_Latn",
+    "qxn_Latn", "qxo_Latn", "qxp_Latn", "qxr_Latn", "qxt_Latn", "qxu_Latn",
+    "qxw_Latn", "rag_Latn", "rah_Beng", "rai_Latn", "rap_Latn", "rav_Deva",
+    "raw_Latn", "rej_Latn", "rel_Latn", "rgu_Latn", "rhg_Latn", "rif_Arab",
+    "rif_Latn", "rim_Latn", "rjs_Deva", "rkt_Beng", "rmc_Cyrl", "rmc_Latn",
+    "rmo_Latn", "rmy_Cyrl", "rmy_Latn", "rng_Latn", "rnl_Latn", "rob_Latn",
+    "rof_Latn", "roh_Latn_surs1244", "rol_Latn", "ron_Latn", "roo_Latn", "rop_Latn",
+    "rro_Latn", "rth_Latn", "rub_Latn", "ruc_Latn", "ruf_Latn", "rug_Latn",
+    "run_Latn", "rus_Cyrl", "rwm_Latn", "rwr_Deva", "sab_Latn", "sag_Latn",
+    "sah_Cyrl", "saj_Latn", "saq_Latn", "sas_Latn", "sau_Latn", "say_Latn",
+    "sba_Latn", "sbd_Latn", "sbl_Latn", "sbn_Arab", "sbp_Latn", "sch_Latn",
+    "sck_Deva", "scl_Arab", "scn_Latn", "sco_Latn", "sda_Latn", "sdo_Latn",
+    "sea_Latn", "seh_Latn", "sei_Latn", "ses_Latn", "sey_Latn", "sgb_Latn",
+    "sgj_Deva", "sgw_Ethi", "shi_Latn", "shk_Latn", "shn_Mymr", "sho_Latn",
+    "shp_Latn", "sid_Latn", "sig_Latn", "sil_Latn", "sin_Sinh", "sip_Tibt",
+    "siw_Latn", "sja_Latn", "sjm_Latn", "sjp_Deva", "sjr_Latn", "skg_Latn",
+    "skr_Arab", "sld_Latn", "slk_Latn", "slu_Latn", "slv_Latn", "sml_Latn",
+    "smo_Latn", "sna_Latn", "snc_Latn", "snd_Arab", "sne_Latn", "snk_Latn",
+    "snn_Latn", "snp_Latn", "snv_Latn", "snw_Latn", "sol_Latn", "som_Latn",
+    "soy_Latn", "spa_Latn", "spp_Latn", "sps_Latn", "spy_Latn", "src_Latn",
+    "srd_Latn", "sri_Latn", "srm_Latn", "srn_Latn", "sro_Latn", "srp_Cyrl",
+    "srr_Latn", "srx_Deva", "ssi_Arab", "ste_Latn", "stn_Latn", "stp_Latn",
+    "sua_Latn", "suc_Latn", "suk_Latn", "sun_Latn", "sur_Latn", "sus_Latn",
+    "suv_Latn", "suz_Deva", "sva_Geor", "swe_Latn", "swh_Latn", "swv_Deva",
+    "sxb_Latn", "sxn_Latn", "sya_Latn", "syl_Latn", "sza_Latn", "szy_Latn",
+    "tac_Latn", "taj_Deva", "tam_Taml", "tan_Latn", "tao_Latn", "tap_Latn",
+    "taq_Latn", "tar_Latn", "tat_Cyrl", "tav_Latn", "tay_Latn", "tbc_Latn",
+    "tbf_Latn", "tbg_Latn", "tbk_Latn", "tbl_Latn", "tby_Latn", "tbz_Latn",
+    "tca_Latn", "tcc_Latn", "tcf_Latn", "tcy_Mlym", "tcz_Latn", "tdj_Latn",
+    "tdn_Latn", "tdx_Latn", "ted_Latn", "tee_Latn", "tel_Telu", "tem_Latn",
+    "teo_Latn", "ter_Latn", "tew_Latn", "tex_Latn", "tfr_Latn", "tgc_Latn",
+    "tgj_Latn", "tgk_Cyrl", "tgl_Latn", "tgo_Latn", "tgp_Latn", "tha_Thai",
+    "the_Deva", "thk_Latn", "thl_Deva", "thq_Deva", "thr_Deva", "thv_Tfng",
+    "tig_Ethi", "tih_Latn", "tik_Latn", "tio_Latn", "tir_Ethi", "tkg_Latn",
+    "tkr_Latn", "tkt_Deva", "tlb_Latn", "tli_Latn", "tlj_Latn", "tlp_Latn",
+    "tly_Latn", "tmc_Latn", "tmf_Latn", "tna_Latn", "tng_Latn", "tnk_Latn",
+    "tnn_Latn", "tnp_Latn", "tnr_Latn", "tnt_Latn", "tob_Latn", "toc_Latn",
+    "toh_Latn", "tok_Latn", "tom_Latn", "top_Latn", "tos_Latn", "tpi_Latn",
+    "tpl_Latn", "tpm_Latn", "tpp_Latn", "tpt_Latn", "tpz_Latn", "tqp_Latn",
+    "trc_Latn", "tri_Latn", "trn_Latn", "trp_Latn", "trq_Latn", "trs_Latn",
+    "trv_Latn", "trw_Arab", "tsn_Latn", "tso_Latn", "tsz_Latn", "ttc_Latn",
+    "tte_Latn", "ttj_Latn", "ttq_Tfng", "ttr_Latn", "ttu_Latn", "tue_Latn",
+    "tuf_Latn", "tui_Latn", "tuk_Arab", "tuk_Latn", "tul_Latn", "tuo_Latn",
+    "tuq_Latn", "tur_Latn", "tuv_Latn", "tuy_Latn", "tvo_Latn", "tvu_Latn",
+    "tvw_Latn", "twb_Latn", "twe_Latn", "twu_Latn", "txa_Latn", "txq_Latn",
+    "txs_Latn", "txu_Latn", "txy_Latn", "tye_Latn", "tzh_Latn", "tzj_Latn",
+    "tzo_Latn", "ubl_Latn", "ubu_Latn", "udl_Latn", "udm_Cyrl", "udu_Latn",
+    "uig_Arab", "uig_Cyrl", "uki_Orya", "ukr_Cyrl", "ukv_Latn", "umb_Latn",
+    "upv_Latn", "ura_Latn", "urb_Latn", "urd_Arab", "urd_Deva", "urd_Latn",
+    "urh_Latn", "urk_Thai", "urt_Latn", "ury_Latn", "ush_Arab", "usp_Latn",
+    "uzb_Cyrl", "uzb_Latn", "uzn_Latn", "vag_Latn", "vah_Deva", "vai_Latn",
+    "var_Latn", "ver_Latn", "vid_Latn", "vie_Latn", "vif_Latn", "vmc_Latn",
+    "vmj_Latn", "vmm_Latn", "vmp_Latn", "vmw_Latn", "vmy_Latn", "vmz_Latn",
+    "vro_Latn", "vun_Latn", "vut_Latn", "wal_Ethi", "wal_Latn", "wap_Latn",
+    "war_Latn", "waw_Latn", "way_Latn", "wba_Latn", "wbl_Latn", "wbr_Deva",
+    "wci_Latn", "weo_Latn", "wes_Latn", "wja_Latn", "wji_Latn", "wlo_Latn",
+    "wlx_Latn", "wmw_Latn", "wob_Latn", "wof_Latn", "wol_Latn", "wsg_Telu",
+    "wwa_Latn", "xal_Cyrl", "xdy_Latn", "xed_Latn", "xer_Latn", "xhe_Arab",
+    "xho_Latn", "xka_Arab", "xkl_Latn", "xmf_Geor", "xmm_Latn", "xmv_Latn",
+    "xnj_Latn", "xnr_Deva", "xog_Latn", "xon_Latn", "xpe_Latn", "xrb_Latn",
+    "xsb_Latn", "xsm_Latn", "xsr_Deva", "xsu_Latn", "xta_Latn", "xtd_Latn",
+    "xte_Latn", "xti_Latn", "xtm_Latn", "xtn_Latn", "xtu_Latn", "xua_Taml",
+    "xuo_Latn", "yaa_Latn", "yad_Latn", "yal_Latn", "yam_Latn", "yao_Latn",
+    "yaq_Latn", "yas_Latn", "yat_Latn", "yav_Latn", "yay_Latn", "yaz_Latn",
+    "yba_Latn", "ybb_Latn", "ycl_Latn", "ycn_Latn", "ydd_Hebr", "ydg_Arab",
+    "yea_Mlym", "yer_Latn", "yes_Latn", "yka_Latn", "yli_Latn", "yor_Latn",
+    "yre_Latn", "yua_Latn", "yue_Hans", "yue_Hant", "yuz_Latn", "yva_Latn",
+    "zaa_Latn", "zab_Latn", "zac_Latn", "zad_Latn", "zae_Latn", "zai_Latn",
+    "zam_Latn", "zao_Latn", "zaq_Latn", "zar_Latn", "zas_Latn", "zav_Latn",
+    "zaw_Latn", "zca_Latn", "zga_Latn", "zim_Latn", "ziw_Latn", "zmz_Latn",
+    "zne_Latn", "zoc_Latn", "zoh_Latn", "zor_Latn", "zos_Latn", "zpc_Latn",
+    "zpg_Latn", "zpi_Latn", "zpl_Latn", "zpm_Latn", "zpo_Latn", "zpt_Latn",
+    "zpu_Latn", "zpv_Latn", "zpy_Latn", "zpz_Latn", "zsm_Latn", "ztg_Latn",
+    "ztn_Latn", "ztp_Latn", "ztq_Latn", "zts_Latn", "ztu_Latn", "zty_Latn",
+    "zul_Latn", "zyb_Latn", "zyp_Latn", "zza_Latn",
+];
+
+export const OMNI_ASR_SUPPORTED_LANG_SET: ReadonlySet<string> = new Set(OMNI_ASR_SUPPORTED_LANGS);

From ed5cc7a3862b7b5e93e8911c94e0ba30c3cd13a6 Mon Sep 17 00:00:00 2001
From: Luke-Bilhorn <luke.bilhorn@my.wheaton.edu>
Date: Thu, 4 Jun 2026 15:21:51 -0500
Subject: [PATCH 02/12] Remove dead phonetic/IPA ASR plumbing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

OmniASR doesn't support IPA output (only its now-deprecated MMS
predecessor's eSpeak companion did), so the phonetic flag was never
doing anything. Removes:

  - `codex-editor-extension.asrPhonetic` workspace setting
  - `phonetic` field from the asrConfig payload and the inline types
    in TextCellEditor + CodexCellEditor's batch transcription path
  - `phonetic` read/write in the Copilot + MainMenu settings panels
    (settings panel UI itself is unchanged — the field just stops
    being wired)

Also nudges the stale default endpoints / provider / model strings
toward OmniASR-correct values (the endpoint default is unused in
production — the live endpoint comes from getAsrEndpoint() — but the
old default leaked the deprecated WebSocket URL).
---
 package.json                                           |  6 ------
 src/copilotSettings/copilotSettings.ts                 |  8 +++-----
 .../codexCellEditorMessagehandling.ts                  |  6 +-----
 src/providers/mainMenu/mainMenuProvider.ts             | 10 ++++------
 .../src/CodexCellEditor/CodexCellEditor.tsx            |  7 +++----
 .../src/CodexCellEditor/TextCellEditor.tsx             | 10 ++++++----
 6 files changed, 17 insertions(+), 30 deletions(-)

diff --git a/package.json b/package.json
index e2da8ab79..148a20168 100644
--- a/package.json
+++ b/package.json
@@ -900,12 +900,6 @@
                         "default": "eng",
                         "description": "Language code for transcription. MMS requires ISO-639-3 (e.g., eng, fra, spa). 2-letter codes will be mapped where possible."
                     },
-                    "codex-editor-extension.asrPhonetic": {
-                        "title": "Return Phonetic (IPA)",
-                        "type": "boolean",
-                        "default": false,
-                        "description": "If enabled and supported by provider, also return phonetic (IPA) transcription."
-                    },
                     "codex-editor-extension.sourceBookWhitelist": {
                         "title": "Source Book Whitelist",
                         "type": "string",
diff --git a/src/copilotSettings/copilotSettings.ts b/src/copilotSettings/copilotSettings.ts
index fbe927f47..073fd4953 100644
--- a/src/copilotSettings/copilotSettings.ts
+++ b/src/copilotSettings/copilotSettings.ts
@@ -122,11 +122,10 @@ export async function openSystemMessageEditor() {
                 try {
                     const config = vscode.workspace.getConfiguration("codex-editor-extension");
                     const settings = {
-                        endpoint: config.get<string>("asrEndpoint", "wss://ryderwishart--asr-websocket-transcription-fastapi-asgi.modal.run/ws/transcribe"),
-                        provider: config.get<string>("asrProvider", "mms"),
-                        model: config.get<string>("asrModel", "facebook/mms-1b-all"),
+                        endpoint: config.get<string>("asrEndpoint", "https://genesis-ai-dev--codex-asr-serve.modal.run/transcribe"),
+                        provider: config.get<string>("asrProvider", "omniasr"),
+                        model: config.get<string>("asrModel", "omniASR_LLM_1B_v2"),
                         language: config.get<string>("asrLanguage", "eng"),
-                        phonetic: config.get<boolean>("asrPhonetic", false),
                     };
                     panel.webview.postMessage({ command: "asrSettings", data: settings });
                 } catch (error) {
@@ -143,7 +142,6 @@ export async function openSystemMessageEditor() {
                     await config.update("asrProvider", message.data?.provider, target);
                     await config.update("asrModel", message.data?.model, target);
                     await config.update("asrLanguage", message.data?.language, target);
-                    await config.update("asrPhonetic", !!message.data?.phonetic, target);
                     panel.webview.postMessage({ command: "asrSettingsSaved" });
                 } catch (error) {
                     console.error("[CopilotSettings] Failed to save ASR settings:", error);
diff --git a/src/providers/codexCellEditorProvider/codexCellEditorMessagehandling.ts b/src/providers/codexCellEditorProvider/codexCellEditorMessagehandling.ts
index 39a560b99..2f9ffe3c4 100644
--- a/src/providers/codexCellEditorProvider/codexCellEditorMessagehandling.ts
+++ b/src/providers/codexCellEditorProvider/codexCellEditorMessagehandling.ts
@@ -549,11 +549,7 @@ const messageHandlers: Record<string, (ctx: MessageHandlerContext) => Promise<vo
                 type: "asrConfig",
                 content: {
                     endpoint: fallbackEndpoint,
-                    provider: "mms",
-                    model: "facebook/mms-1b-all",
-                    language: "eng",
-                    phonetic: false,
-                    authToken: undefined
+                    authToken: undefined,
                 }
             });
         }
diff --git a/src/providers/mainMenu/mainMenuProvider.ts b/src/providers/mainMenu/mainMenuProvider.ts
index 0395e4692..e8f60e9f4 100644
--- a/src/providers/mainMenu/mainMenuProvider.ts
+++ b/src/providers/mainMenu/mainMenuProvider.ts
@@ -705,7 +705,7 @@ export class MainMenuProvider extends BaseWebviewProvider {
             }
             case "getAsrSettings": {
                 const config = vscode.workspace.getConfiguration("codex-editor-extension");
-                let endpoint = config.get<string>("asrEndpoint", "wss://ryderwishart--asr-websocket-transcription-fastapi-asgi.modal.run/ws/transcribe");
+                let endpoint = config.get<string>("asrEndpoint", "https://genesis-ai-dev--codex-asr-serve.modal.run/transcribe");
                 let authToken: string | undefined;
 
                 // Try to get authenticated endpoint from FrontierAPI
@@ -745,7 +745,7 @@ export class MainMenuProvider extends BaseWebviewProvider {
                     new URL(endpoint);
                 } catch (urlError) {
                     console.error("Invalid ASR endpoint configuration:", endpoint, urlError);
-                    endpoint = "wss://ryderwishart--asr-websocket-transcription-fastapi-asgi.modal.run/ws/transcribe";
+                    endpoint = "https://genesis-ai-dev--codex-asr-serve.modal.run/transcribe";
                 }
 
                 // Warn if using authenticated endpoint without token
@@ -756,10 +756,9 @@ export class MainMenuProvider extends BaseWebviewProvider {
 
                 const settings = {
                     endpoint,
-                    provider: config.get<string>("asrProvider", "mms"),
-                    model: config.get<string>("asrModel", "facebook/mms-1b-all"),
+                    provider: config.get<string>("asrProvider", "omniasr"),
+                    model: config.get<string>("asrModel", "omniASR_LLM_1B_v2"),
                     language: config.get<string>("asrLanguage", "eng"),
-                    phonetic: config.get<boolean>("asrPhonetic", false),
                     authToken,
                 };
                 if (this._view) {
@@ -774,7 +773,6 @@ export class MainMenuProvider extends BaseWebviewProvider {
                 await config.update("asrProvider", (message as any).data?.provider, target);
                 await config.update("asrModel", (message as any).data?.model, target);
                 await config.update("asrLanguage", (message as any).data?.language, target);
-                await config.update("asrPhonetic", !!(message as any).data?.phonetic, target);
                 if (this._view) {
                     safePostMessageToView(this._view, { command: "asrSettingsSaved" }, "MainMenu");
                 }
diff --git a/webviews/codex-webviews/src/CodexCellEditor/CodexCellEditor.tsx b/webviews/codex-webviews/src/CodexCellEditor/CodexCellEditor.tsx
index 3f5b452f7..d468104fd 100755
--- a/webviews/codex-webviews/src/CodexCellEditor/CodexCellEditor.tsx
+++ b/webviews/codex-webviews/src/CodexCellEditor/CodexCellEditor.tsx
@@ -458,11 +458,10 @@ const CodexCellEditor: React.FC = () => {
                     // Fetch ASR config
                     const asrConfig = await new Promise<{
                         endpoint: string;
-                        provider: string;
-                        model: string;
-                        language: string;
-                        phonetic: boolean;
                         authToken?: string;
+                        lang?: string;
+                        languageMode?: "auto" | "project";
+                        projectLanguageName?: string;
                     }>((resolve, reject) => {
                         let resolved = false;
                         const onMsg = (ev: MessageEvent) => {
diff --git a/webviews/codex-webviews/src/CodexCellEditor/TextCellEditor.tsx b/webviews/codex-webviews/src/CodexCellEditor/TextCellEditor.tsx
index df4389851..9536c0073 100644
--- a/webviews/codex-webviews/src/CodexCellEditor/TextCellEditor.tsx
+++ b/webviews/codex-webviews/src/CodexCellEditor/TextCellEditor.tsx
@@ -497,11 +497,13 @@ const CellEditor: React.FC<CellEditorProps> = ({
     const transcriptionClientRef = useRef<WhisperTranscriptionClient | null>(null);
     const [asrConfig, setAsrConfig] = useState<{
         endpoint: string;
-        provider: string;
-        model: string;
-        language: string; // ISO-639-3 expected by MMS; may be ISO-639-1 and mapped
-        phonetic: boolean;
         authToken?: string;
+        /** OmniASR code (e.g. `swh_Latn`) to send as `?lang=...`. Omitted in auto-detect mode. */
+        lang?: string;
+        /** What the user picked in the gear menu: "project" (default) or "auto". */
+        languageMode?: "auto" | "project";
+        /** Project's target-language refName, used as fallback when the server doesn't echo `lang`. */
+        projectLanguageName?: string;
     } | null>(null);
 
     // Helper to smoothly center the editor. Coalesces multiple calls and

From 820886d5d144bd4bbb948679fa1c81801218aea6 Mon Sep 17 00:00:00 2001
From: Luke-Bilhorn <luke.bilhorn@my.wheaton.edu>
Date: Thu, 4 Jun 2026 15:24:26 -0500
Subject: [PATCH 03/12] Wire OmniASR language settings into asrConfig
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Extension-host side. The webview now receives `lang` (resolved OmniASR
code), `languageMode` ("project" | "auto"), and `projectLanguageName`
in the `asrConfig` message; updateCellAfterTranscription stops
defaulting `language` to "unknown" — the field now carries the actual
OmniASR code the server echoed (or `null` when auto-detect gave us
nothing to report).

New workspace settings persisting user choices from the gear menu:
  - `codex-editor-extension.asrLanguageMode` ("project" | "auto")
  - `codex-editor-extension.asrScriptPref` ("auto" | "latin" | 4-letter
    ISO 15924 tag)

New message commands the webview calls when the user toggles these:
  - `setAsrLanguageMode`
  - `setAsrScriptPref`

Both rebroadcast `asrConfig` so the live webview state stays in sync
without a reload.
---
 package.json                                  | 18 ++++-
 .../codexCellEditorMessagehandling.ts         | 72 ++++++++++++++++++-
 types/index.d.ts                              | 33 ++++++++-
 3 files changed, 117 insertions(+), 6 deletions(-)

diff --git a/package.json b/package.json
index 148a20168..8fe329d44 100644
--- a/package.json
+++ b/package.json
@@ -898,7 +898,23 @@
                         "title": "ASR Language (ISO-639-3)",
                         "type": "string",
                         "default": "eng",
-                        "description": "Language code for transcription. MMS requires ISO-639-3 (e.g., eng, fra, spa). 2-letter codes will be mapped where possible."
+                        "description": "Legacy: ISO 639-3 hint for ASR providers. OmniASR uses the project's target language by default; configure via the gear menu on the Transcribe button."
+                    },
+                    "codex-editor-extension.asrLanguageMode": {
+                        "title": "ASR Language Mode",
+                        "type": "string",
+                        "enum": [
+                            "project",
+                            "auto"
+                        ],
+                        "default": "project",
+                        "description": "Whether to send the project's target language as a hint to the ASR service (\"project\"), or let the model transcribe without language conditioning (\"auto\")."
+                    },
+                    "codex-editor-extension.asrScriptPref": {
+                        "title": "ASR Script Preference",
+                        "type": "string",
+                        "default": "auto",
+                        "description": "Script subtag to pair with the ASR language code. \"auto\" picks the best-guess script per language; \"latin\" forces Latin where supported; any 4-letter ISO 15924 tag (e.g. \"Arab\", \"Cyrl\") overrides per-language."
                     },
                     "codex-editor-extension.sourceBookWhitelist": {
                         "title": "Source Book Whitelist",
diff --git a/src/providers/codexCellEditorProvider/codexCellEditorMessagehandling.ts b/src/providers/codexCellEditorProvider/codexCellEditorMessagehandling.ts
index 2f9ffe3c4..8c5ca19a0 100644
--- a/src/providers/codexCellEditorProvider/codexCellEditorMessagehandling.ts
+++ b/src/providers/codexCellEditorProvider/codexCellEditorMessagehandling.ts
@@ -484,6 +484,25 @@ const messageHandlers: Record<string, (ctx: MessageHandlerContext) => Promise<vo
             const config = vscode.workspace.getConfiguration("codex-editor-extension");
             let endpoint = config.get<string>("asrEndpoint", "http://localhost:8000/api/v1/asr/transcribe");
 
+            // ASR language plumbing — see sharedUtils/asrLanguageUtils.ts for the resolver
+            // contract. The webview drives "auto-detect" vs "use project language" via the
+            // gear menu on the Transcribe button; that picker is persisted to the workspace
+            // setting `asrLanguageMode`.
+            const { resolveOmniAsrCode } = await import("../../../sharedUtils/asrLanguageUtils");
+            const projectConfig = vscode.workspace.getConfiguration("codex-project-manager");
+            const targetLanguage = projectConfig.get<any>("targetLanguage") as
+                | { tag?: string; refName?: string; iso1?: string; iso2t?: string; iso2b?: string; }
+                | undefined;
+            const languageMode = (config.get<string>("asrLanguageMode", "project") === "auto"
+                ? "auto"
+                : "project") as "auto" | "project";
+            const scriptPref = config.get<string>("asrScriptPref", "auto");
+            const resolvedCode =
+                languageMode === "auto"
+                    ? undefined
+                    : resolveOmniAsrCode(targetLanguage, scriptPref);
+            const projectLanguageName = targetLanguage?.refName;
+
             let authToken: string | undefined;
 
             // Try to get authenticated endpoint from FrontierAPI
@@ -536,10 +555,16 @@ const messageHandlers: Record<string, (ctx: MessageHandlerContext) => Promise<vo
                 console.error(`[getAsrConfig] This will cause transcription to fail. Please check authentication status.`);
             }
 
-            debug(`[getAsrConfig] Sending config: endpoint=${endpoint}, hasToken=${!!authToken}`);
+            debug(`[getAsrConfig] Sending config: endpoint=${endpoint}, hasToken=${!!authToken}, lang=${resolvedCode}, mode=${languageMode}`);
             safePostMessageToPanel(webviewPanel, {
                 type: "asrConfig",
-                content: { endpoint, authToken }
+                content: {
+                    endpoint,
+                    authToken,
+                    lang: resolvedCode,
+                    languageMode,
+                    projectLanguageName,
+                },
             });
         } catch (error) {
             console.error("Error sending ASR config:", error);
@@ -550,11 +575,47 @@ const messageHandlers: Record<string, (ctx: MessageHandlerContext) => Promise<vo
                 content: {
                     endpoint: fallbackEndpoint,
                     authToken: undefined,
+                    languageMode: "project",
                 }
             });
         }
     },
 
+    setAsrLanguageMode: async ({ event, webviewPanel }) => {
+        const typedEvent = event as Extract<EditorPostMessages, { command: "setAsrLanguageMode"; }>;
+        const mode = typedEvent.content?.mode === "auto" ? "auto" : "project";
+        try {
+            await vscode.workspace
+                .getConfiguration("codex-editor-extension")
+                .update("asrLanguageMode", mode, vscode.ConfigurationTarget.Workspace);
+        } catch (err) {
+            console.warn("Failed to update asrLanguageMode", err);
+        }
+        // Rebroadcast so the webview can refresh its local asrConfig snapshot.
+        await messageHandlers.getAsrConfig({ webviewPanel } as any);
+    },
+
+    setAsrScriptPref: async ({ event, webviewPanel }) => {
+        const typedEvent = event as Extract<EditorPostMessages, { command: "setAsrScriptPref"; }>;
+        const rawPref = typedEvent.content?.scriptPref;
+        // Accept "auto", "latin", or any 4-letter ISO 15924 tag. Anything else falls back to "auto".
+        const isFourLetter = typeof rawPref === "string" && /^[A-Za-z]{4}$/.test(rawPref);
+        const normalized =
+            rawPref === "auto" || rawPref === "latin"
+                ? rawPref
+                : isFourLetter
+                    ? rawPref!.charAt(0).toUpperCase() + rawPref!.slice(1).toLowerCase()
+                    : "auto";
+        try {
+            await vscode.workspace
+                .getConfiguration("codex-editor-extension")
+                .update("asrScriptPref", normalized, vscode.ConfigurationTarget.Workspace);
+        } catch (err) {
+            console.warn("Failed to update asrScriptPref", err);
+        }
+        await messageHandlers.getAsrConfig({ webviewPanel } as any);
+    },
+
     updateCellAfterTranscription: async ({ event, document, webviewPanel, provider }) => {
         const typedEvent = event as Extract<EditorPostMessages, { command: "updateCellAfterTranscription"; }>;
         const { cellId, transcribedText, language } = typedEvent.content;
@@ -570,7 +631,12 @@ const messageHandlers: Record<string, (ctx: MessageHandlerContext) => Promise<vo
                 ...(attachment || {}),
                 transcription: {
                     content: transcribedText,
-                    language: language || "unknown",
+                    // `language` is the OmniASR `{iso639_3}_{Script}` code the server reported
+                    // (or null when the server ran in auto-detect mode and didn't echo one).
+                    // The webview labels the badge with `labelForTranscriptionLanguage()` from
+                    // sharedUtils/asrLanguageUtils.ts — never trust "language" to be a human
+                    // string here.
+                    language: language ?? null,
                     timestamp: Date.now(),
                 },
                 updatedAt: Date.now(),
diff --git a/types/index.d.ts b/types/index.d.ts
index 4a62f3622..827f4360c 100644
--- a/types/index.d.ts
+++ b/types/index.d.ts
@@ -576,10 +576,24 @@ export type EditorPostMessages =
         content: {
             cellId: string;
             transcribedText: string;
-            language: string;
+            /** OmniASR `{iso639_3}_{Script}` code the server reported (or that we sent and the server
+             *  used silently). `null` when transcription ran in auto-detect mode and the server did
+             *  not echo a language back. Persisted on the audio attachment so the badge survives
+             *  re-renders. */
+            language: string | null;
         };
     }
     | { command: "getAsrConfig"; }
+    | {
+        command: "setAsrLanguageMode";
+        content: { mode: "auto" | "project"; };
+    }
+    | {
+        command: "setAsrScriptPref";
+        /** `"auto"` (best guess), `"latin"` (force Latin where supported), or a 4-letter
+         *  ISO 15924 tag (`"Arab"`, `"Cyrl"`, ...). */
+        content: { scriptPref: string; };
+    }
     | {
         command: "mergeCellWithPrevious";
         content: {
@@ -2150,7 +2164,22 @@ type EditorReceiveMessages =
         milestoneIndex?: number;
         subsectionIndex?: number;
     }
-    | { type: "asrConfig"; content: { endpoint: string; authToken?: string; }; }
+    | {
+        type: "asrConfig";
+        content: {
+            endpoint: string;
+            authToken?: string;
+            /** OmniASR `{iso639_3}_{Script}` code to send as `?lang=...`. Omitted when the
+             *  user picks Auto-Detect or when we can't safely resolve a code. */
+            lang?: string;
+            /** "project" (default) → send `lang`. "auto" → omit `lang`, let the server transcribe
+             *  without language conditioning. Persisted as workspace setting `asrLanguageMode`. */
+            languageMode: "auto" | "project";
+            /** Project target-language refName, e.g. "Swahili". Used as the badge fallback when
+             *  the server doesn't echo `lang` in the response. */
+            projectLanguageName?: string;
+        };
+    }
     | { type: "startBatchTranscription"; content: { count: number; }; }
     | {
         type: "providerConfirmsBacktranslationSet";

From 37af7c89b2e4ffb1ef0bf35b7a8dc25837970021 Mon Sep 17 00:00:00 2001
From: Luke-Bilhorn <luke.bilhorn@my.wheaton.edu>
Date: Thu, 4 Jun 2026 15:30:47 -0500
Subject: [PATCH 04/12] Wire ASR client to OmniASR lang plumbing; add
 Re-transcribe + gear menu
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

WhisperTranscriptionClient
  - transcribe() now takes { lang?, timeoutMs? } and forwards lang as
    ?lang= when provided
  - parses back result.lang (or result.language, for back-compat with
    the Frontier proxy's earlier field name) and returns it alongside
    the text

CodexCellEditor / TextCellEditor
  - both transcription paths (per-cell button + batch run) now send
    the resolved OmniASR code in project mode, omit it in auto-detect
    mode, and persist whatever the server echoes (or what they sent)
    via updateCellAfterTranscription
  - the badge label is now computed via labelForTranscriptionLanguage:
    server echo → sent code → project name → "Auto Detect" (only when
    that's the user's chosen mode); falls through to nothing when in
    project mode and we have no signal — never lies about the language
  - deletes the dead toIso3() lookup table; the resolver handles
    macrolang/ISO-1→3 mapping now

AudioWaveformWithTranscription
  - Transcribe button is always visible (no longer hidden once a
    transcription exists); flips label to "Re-transcribe" and stays
    disabled while transcribing — mirrors the Re-record button
  - new gear-icon popover next to it surfaces two advanced settings:
      Language: Project (default) / Auto-detect
      Script:   Best guess (default) / Latin / Custom (ISO 15924 tag)
    Hidden on source editors where transcription policy isn't user-
    driven. Selections post back to the host (setAsrLanguageMode /
    setAsrScriptPref) which persists them to workspace settings and
    rebroadcasts asrConfig so the live state stays in sync.

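Types
  - asrConfig content gains scriptPref in this commit; lang,
    languageMode and projectLanguageName were already added by the
    previous commit
  - updateCellAfterTranscription.content.language is `string | null`
    (typed in the previous commit); the webview now fills it with the
    echoed or sent code, or null, instead of the hardcoded "unknown"
  - the EditorPostMessages setAsrLanguageMode and setAsrScriptPref
    (declared in the previous commit) are now posted from the gear menu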
---
 .../codexCellEditorMessagehandling.ts         |   3 +-
 types/index.d.ts                              |   3 +
 .../AudioWaveformWithTranscription.tsx        | 175 ++++++++++++++++--
 .../src/CodexCellEditor/CodexCellEditor.tsx   |  28 +--
 .../src/CodexCellEditor/TextCellEditor.tsx    |  64 ++++++-
 .../WhisperTranscriptionClient.ts             |  34 +++-
 6 files changed, 262 insertions(+), 45 deletions(-)

diff --git a/src/providers/codexCellEditorProvider/codexCellEditorMessagehandling.ts b/src/providers/codexCellEditorProvider/codexCellEditorMessagehandling.ts
index 8c5ca19a0..bb8d18069 100644
--- a/src/providers/codexCellEditorProvider/codexCellEditorMessagehandling.ts
+++ b/src/providers/codexCellEditorProvider/codexCellEditorMessagehandling.ts
@@ -555,7 +555,7 @@ const messageHandlers: Record<string, (ctx: MessageHandlerContext) => Promise<vo
                 console.error(`[getAsrConfig] This will cause transcription to fail. Please check authentication status.`);
             }
 
-            debug(`[getAsrConfig] Sending config: endpoint=${endpoint}, hasToken=${!!authToken}, lang=${resolvedCode}, mode=${languageMode}`);
+            debug(`[getAsrConfig] Sending config: endpoint=${endpoint}, hasToken=${!!authToken}, lang=${resolvedCode}, mode=${languageMode}, scriptPref=${scriptPref}`);
             safePostMessageToPanel(webviewPanel, {
                 type: "asrConfig",
                 content: {
@@ -563,6 +563,7 @@ const messageHandlers: Record<string, (ctx: MessageHandlerContext) => Promise<vo
                     authToken,
                     lang: resolvedCode,
                     languageMode,
+                    scriptPref,
                     projectLanguageName,
                 },
             });
diff --git a/types/index.d.ts b/types/index.d.ts
index 827f4360c..55445c080 100644
--- a/types/index.d.ts
+++ b/types/index.d.ts
@@ -2175,6 +2175,9 @@ type EditorReceiveMessages =
             /** "project" (default) → send `lang`. "auto" → omit `lang`, let the server transcribe
              *  without language conditioning. Persisted as workspace setting `asrLanguageMode`. */
             languageMode: "auto" | "project";
+            /** Script preference: "auto" (best guess), "latin", or a 4-letter ISO 15924 tag.
+             *  Persisted as workspace setting `asrScriptPref`. */
+            scriptPref?: string;
             /** Project target-language refName, e.g. "Swahili". Used as the badge fallback when
              *  the server doesn't echo `lang` in the response. */
             projectLanguageName?: string;
diff --git a/webviews/codex-webviews/src/CodexCellEditor/AudioWaveformWithTranscription.tsx b/webviews/codex-webviews/src/CodexCellEditor/AudioWaveformWithTranscription.tsx
index 65cb49161..a546b8fe9 100644
--- a/webviews/codex-webviews/src/CodexCellEditor/AudioWaveformWithTranscription.tsx
+++ b/webviews/codex-webviews/src/CodexCellEditor/AudioWaveformWithTranscription.tsx
@@ -2,10 +2,23 @@ import React, { useEffect, useState } from "react";
 import { CustomWaveformCanvas } from "./CustomWaveformCanvas.tsx";
 import { Button } from "../components/ui/button";
 import { Badge } from "../components/ui/badge";
-import { MessageCircle, Copy, Loader2, Trash2, History, Mic } from "lucide-react";
+import { MessageCircle, Copy, Loader2, Trash2, History, Mic, Settings as SettingsIcon } from "lucide-react";
 import type { ValidationStatusIconProps } from "./AudioValidationStatusIcon.tsx";
 import { AudioValidationBadge } from "./AudioValidationBadge.tsx";
 import type { AudioValidationPopoverProps } from "./AudioValidationBadge.tsx";
+import {
+    Popover,
+    PopoverContent,
+    PopoverTrigger,
+} from "../components/ui/popover";
+import {
+    Select,
+    SelectContent,
+    SelectItem,
+    SelectTrigger,
+    SelectValue,
+} from "../components/ui/select";
+import { Input } from "../components/ui/input";
 
 interface AudioWaveformWithTranscriptionProps {
     audioUrl: string;
@@ -15,6 +28,10 @@ interface AudioWaveformWithTranscriptionProps {
         timestamp: number;
         language?: string;
     } | null;
+    /** Pre-computed friendly label for the language badge ("Swahili", "Auto Detect", or null
+     *  for "render nothing"). Computed by the caller via `labelForTranscriptionLanguage()`
+     *  from sharedUtils/asrLanguageUtils.ts so this component stays presentational. */
+    transcriptionLanguageLabel?: string | null;
     isTranscribing: boolean;
     transcriptionProgress: number;
     onTranscribe: () => void;
@@ -31,6 +48,17 @@ interface AudioWaveformWithTranscriptionProps {
     targetDuration?: number | null; // Target duration (in seconds) derived from cell timestamps.
     /** Total number of audio recordings for the cell (including soft-deleted). When > 0, a count badge is rendered on the History button. */
     historyCount?: number;
+    // Advanced ASR settings (gear menu, next to the Transcribe button).
+    /** Whether to display the gear menu. Hide on source-text editors where the user can't drive transcription policy. */
+    showAdvancedAsrMenu?: boolean;
+    /** Current language mode. Determines the chevron position in the gear menu. */
+    asrLanguageMode?: "auto" | "project";
+    /** Current script preference: "auto", "latin", or a 4-letter ISO 15924 tag (e.g. "Arab"). */
+    asrScriptPref?: string;
+    /** Friendly project-language label for the "Project language" radio (e.g. "Swahili"). */
+    projectLanguageName?: string;
+    onChangeAsrLanguageMode?: (mode: "auto" | "project") => void;
+    onChangeAsrScriptPref?: (pref: string) => void;
 }
 
 const AudioWaveformWithTranscription: React.FC<AudioWaveformWithTranscriptionProps> = ({
@@ -52,10 +80,28 @@ const AudioWaveformWithTranscription: React.FC<AudioWaveformWithTranscriptionPro
     targetDuration,
     author,
     historyCount,
+    transcriptionLanguageLabel,
+    showAdvancedAsrMenu = true,
+    asrLanguageMode = "project",
+    asrScriptPref = "auto",
+    projectLanguageName,
+    onChangeAsrLanguageMode,
+    onChangeAsrScriptPref,
 }) => {
     const [audioSrc, setAudioSrc] = useState<string>("");
     const [audioDuration, setAudioDuration] = useState<number | null>(null);
 
+    // The Script picker offers three "preset" choices plus a free-form 4-letter input for
+    // power users (e.g. someone wants `swa_Cyrl` even though the resolver would never pick
+    // it). We surface "Custom" only when the current value isn't one of the presets.
+    const isPresetScript = asrScriptPref === "auto" || asrScriptPref === "latin";
+    const [scriptCustomDraft, setScriptCustomDraft] = useState<string>(
+        isPresetScript ? "" : asrScriptPref
+    );
+    useEffect(() => {
+        if (!isPresetScript) setScriptCustomDraft(asrScriptPref);
+    }, [asrScriptPref, isPresetScript]);
+
     // Prefer the provided URL (can be blob: or data:). Fall back to creating an object URL from the blob.
     useEffect(() => {
         if (audioUrl) {
@@ -142,9 +188,9 @@ const AudioWaveformWithTranscription: React.FC<AudioWaveformWithTranscriptionPro
                             >
                                 {transcription.content}
                             </p>
-                            {transcription.language && (
+                            {transcriptionLanguageLabel && (
                                 <Badge variant="secondary" className="text-xs">
-                                    {transcription.language}
+                                    {transcriptionLanguageLabel}
                                 </Badge>
                             )}
                         </div>
@@ -224,17 +270,118 @@ const AudioWaveformWithTranscription: React.FC<AudioWaveformWithTranscriptionPro
 
             {/* Action buttons at bottom */}
             <div className="flex flex-wrap items-center justify-center gap-2 px-2">
-                {!transcription && !isTranscribing && (
-                    <Button
-                        onClick={onTranscribe}
-                        disabled={disabled || (!audioUrl && !audioBlob)}
-                        variant="outline"
-                        className="h-8 px-2 text-xs text-[var(--vscode-button-background)] border-[var(--vscode-button-background)]/20 hover:bg-[var(--vscode-button-background)]/10"
-                        title="Transcribe Audio"
-                    >
-                        <MessageCircle className="h-3 w-3" />
-                        <span className="ml-1">Transcribe</span>
-                    </Button>
+                {/* Transcribe / Re-transcribe button — always visible (mirrors Re-record),
+                    grey-out while a transcription is in flight. The label flips to
+                    "Re-transcribe" once we have a saved transcription so the user can
+                    re-run with different ASR settings (e.g. flip to auto-detect). */}
+                <Button
+                    onClick={onTranscribe}
+                    disabled={disabled || isTranscribing || (!audioUrl && !audioBlob)}
+                    variant="outline"
+                    className="h-8 px-2 text-xs text-[var(--vscode-button-background)] border-[var(--vscode-button-background)]/20 hover:bg-[var(--vscode-button-background)]/10"
+                    title={transcription ? "Re-transcribe audio with current settings" : "Transcribe Audio"}
+                >
+                    <MessageCircle className="h-3 w-3" />
+                    <span className="ml-1">{transcription ? "Re-transcribe" : "Transcribe"}</span>
+                </Button>
+                {/* Gear menu — Language (auto-detect vs project) + Script (auto/Latin/custom).
+                    Hidden on source-text editors where transcription policy isn't user-driven. */}
+                {showAdvancedAsrMenu && (
+                    <Popover>
+                        <PopoverTrigger asChild>
+                            <Button
+                                variant="outline"
+                                size="sm"
+                                className="h-8 w-8 p-0 text-xs"
+                                title="Advanced ASR settings (Language / Script)"
+                                aria-label="Advanced ASR settings"
+                                disabled={isTranscribing}
+                            >
+                                <SettingsIcon className="h-3 w-3" />
+                            </Button>
+                        </PopoverTrigger>
+                        <PopoverContent className="w-72 space-y-3" align="start">
+                            <div className="space-y-1">
+                                <div className="text-xs font-semibold">Language</div>
+                                <Select
+                                    value={asrLanguageMode}
+                                    onValueChange={(v) =>
+                                        onChangeAsrLanguageMode?.(v === "auto" ? "auto" : "project")
+                                    }
+                                >
+                                    <SelectTrigger className="h-7 text-xs">
+                                        <SelectValue />
+                                    </SelectTrigger>
+                                    <SelectContent>
+                                        <SelectItem value="project">
+                                            {projectLanguageName ? `Project (${projectLanguageName})` : "Project language"}
+                                        </SelectItem>
+                                        <SelectItem value="auto">Auto-detect</SelectItem>
+                                    </SelectContent>
+                                </Select>
+                                <p className="text-[10px] text-muted-foreground leading-snug">
+                                    "Project" sends the language code to OmniASR for better accuracy.
+                                    "Auto-detect" omits it — OmniASR transcribes without language conditioning.
+                                </p>
+                            </div>
+                            <div className="space-y-1">
+                                <div className="text-xs font-semibold">Script</div>
+                                <Select
+                                    value={isPresetScript ? asrScriptPref : "custom"}
+                                    onValueChange={(v) => {
+                                        if (v === "auto" || v === "latin") {
+                                            onChangeAsrScriptPref?.(v);
+                                        } else {
+                                            // "custom" — keep whatever 4-letter tag is in the input,
+                                            // or fall back to "auto" if the input is empty/invalid.
+                                            const candidate = scriptCustomDraft.trim();
+                                            const isValid = /^[A-Za-z]{4}$/.test(candidate);
+                                            onChangeAsrScriptPref?.(isValid ? candidate : "auto");
+                                        }
+                                    }}
+                                >
+                                    <SelectTrigger className="h-7 text-xs">
+                                        <SelectValue />
+                                    </SelectTrigger>
+                                    <SelectContent>
+                                        <SelectItem value="auto">Best guess (default)</SelectItem>
+                                        <SelectItem value="latin">Latin (where supported)</SelectItem>
+                                        <SelectItem value="custom">Custom (ISO 15924 tag)</SelectItem>
+                                    </SelectContent>
+                                </Select>
+                                {/* Free-form 4-letter input shown only when "Custom" is selected.
+                                    Validation happens on commit so users can type. */}
+                                {!isPresetScript ? (
+                                    <div className="flex items-center gap-1">
+                                        <Input
+                                            value={scriptCustomDraft}
+                                            onChange={(e) => setScriptCustomDraft(e.target.value)}
+                                            placeholder="e.g. Arab, Cyrl, Hans"
+                                            maxLength={4}
+                                            className="h-7 text-xs"
+                                        />
+                                        <Button
+                                            size="sm"
+                                            variant="outline"
+                                            className="h-7 px-2 text-xs"
+                                            onClick={() => {
+                                                const candidate = scriptCustomDraft.trim();
+                                                if (/^[A-Za-z]{4}$/.test(candidate)) {
+                                                    onChangeAsrScriptPref?.(candidate);
+                                                }
+                                            }}
+                                        >
+                                            Apply
+                                        </Button>
+                                    </div>
+                                ) : null}
+                                <p className="text-[10px] text-muted-foreground leading-snug">
+                                    Script subtag paired with the language. Best guess covers Urdu→Arabic,
+                                    Mandarin→Simplified, Cantonese→Traditional, etc.
+                                </p>
+                            </div>
+                        </PopoverContent>
+                    </Popover>
                 )}
                 <Button
                     variant="outline"
diff --git a/webviews/codex-webviews/src/CodexCellEditor/CodexCellEditor.tsx b/webviews/codex-webviews/src/CodexCellEditor/CodexCellEditor.tsx
index d468104fd..fabd1af81 100755
--- a/webviews/codex-webviews/src/CodexCellEditor/CodexCellEditor.tsx
+++ b/webviews/codex-webviews/src/CodexCellEditor/CodexCellEditor.tsx
@@ -493,28 +493,9 @@ const CodexCellEditor: React.FC = () => {
                         }, 5000);
                     });
 
-                    const toIso3 = (code?: string) => {
-                        const ISO2_TO_ISO3: Record<string, string> = {
-                            en: "eng",
-                            fr: "fra",
-                            es: "spa",
-                            de: "deu",
-                            pt: "por",
-                            it: "ita",
-                            nl: "nld",
-                            ru: "rus",
-                            zh: "zho",
-                            ja: "jpn",
-                            ko: "kor",
-                        };
-                        if (!code) return "eng";
-                        const norm = code.toLowerCase();
-                        return norm.length === 2 ? ISO2_TO_ISO3[norm] ?? "eng" : norm;
-                    };
-
                     const wsEndpoint =
                         asrConfig.endpoint ||
-                        "wss://ryderwishart--asr-websocket-transcription-fastapi-asgi.modal.run/ws/transcribe";
+                        "https://genesis-ai-dev--codex-asr-serve.modal.run/transcribe";
 
                     const targetCount = Math.max(0, message.content.count | 0);
                     const specificCellId: string | undefined = (message as any)?.content?.cellId;
@@ -583,7 +564,10 @@ const CodexCellEditor: React.FC = () => {
                                 next.add(cellId);
                                 return next;
                             });
-                            const result = await client.transcribe(blob);
+                            // Same lang-mode handling as the per-cell button: omit lang in
+                            // auto-detect mode, send the resolved code in project mode.
+                            const sentLang = asrConfig.languageMode === "auto" ? undefined : asrConfig.lang;
+                            const result = await client.transcribe(blob, { lang: sentLang });
                             const text = (result.text || "").trim();
                             if (text) {
                                 vscode.postMessage({
@@ -591,7 +575,7 @@ const CodexCellEditor: React.FC = () => {
                                     content: {
                                         cellId,
                                         transcribedText: text,
-                                        language: "unknown",
+                                        language: result.lang ?? sentLang ?? null,
                                     },
                                 } as unknown as EditorPostMessages);
 
diff --git a/webviews/codex-webviews/src/CodexCellEditor/TextCellEditor.tsx b/webviews/codex-webviews/src/CodexCellEditor/TextCellEditor.tsx
index 9536c0073..7d167af24 100644
--- a/webviews/codex-webviews/src/CodexCellEditor/TextCellEditor.tsx
+++ b/webviews/codex-webviews/src/CodexCellEditor/TextCellEditor.tsx
@@ -20,6 +20,7 @@ import { generateChildCellId } from "../../../../src/providers/codexCellEditorPr
 import ScrollToContentContext from "./contextProviders/ScrollToContentContext";
 import { WhisperTranscriptionClient } from "./WhisperTranscriptionClient";
 import AudioWaveformWithTranscription from "./AudioWaveformWithTranscription";
+import { labelForTranscriptionLanguage } from "@sharedUtils/asrLanguageUtils";
 import { AudioValidationBadge } from "./AudioValidationBadge";
 import { useAudioValidationStatus } from "./hooks/useAudioValidationStatus";
 import SourceTextDisplay from "./SourceTextDisplay";
@@ -502,10 +503,41 @@ const CellEditor: React.FC<CellEditorProps> = ({
         lang?: string;
         /** What the user picked in the gear menu: "project" (default) or "auto". */
         languageMode?: "auto" | "project";
+        /** Script preference: "auto" (best guess), "latin", or a 4-letter ISO 15924 tag. */
+        scriptPref?: string;
         /** Project's target-language refName, used as fallback when the server doesn't echo `lang`. */
         projectLanguageName?: string;
     } | null>(null);
 
+    /**
+     * Friendly label shown on the transcription badge.
+     *
+     * Source priority (delegated to labelForTranscriptionLanguage):
+     *   1. Language the server echoed in the response (saved as `transcription.language`)
+     *   2. Language we *sent* — when in project mode the server uses it silently
+     *   3. Project target-language refName, as a last-ditch fallback
+     *   4. "Auto Detect" — only when in auto-detect mode and (1)/(2)/(3) all returned null
+     *
+     * Returns null → render no badge.
+     */
+    const transcriptionBadgeLabel: string | null = useMemo(() => {
+        if (!savedTranscription) return null;
+        const serverLang = savedTranscription.language ?? null;
+        const sentLang = asrConfig?.languageMode === "auto" ? null : asrConfig?.lang ?? null;
+        const projectName = asrConfig?.projectLanguageName ?? null;
+        const friendly = labelForTranscriptionLanguage(serverLang, sentLang, projectName);
+        if (friendly) return friendly;
+        // No language info at all → only honest answer is "Auto Detect" (and only when
+        // that's what the user picked; in project mode we'd rather render nothing than lie).
+        if (asrConfig?.languageMode === "auto") return "Auto Detect";
+        return null;
+    }, [
+        savedTranscription,
+        asrConfig?.lang,
+        asrConfig?.languageMode,
+        asrConfig?.projectLanguageName,
+    ]);
+
     // Helper to smoothly center the editor. Coalesces multiple calls and
     // performs a single smooth scroll after layout settles.
     const scrollTimeoutRef = useRef<number | null>(null);
@@ -3076,18 +3108,25 @@ const CellEditor: React.FC<CellEditorProps> = ({
                 setTranscriptionStatus(`Error: ${error}`);
             };
 
-            // Perform transcription
-            const result = await client.transcribe(audioBlob);
+            // Perform transcription. In project-language mode we send `asrConfig.lang`, the
+            // OmniASR code the server should condition on. In auto-detect mode we send no
+            // language at all, so the server transcribes without language conditioning.
+            const sentLang = asrConfig?.languageMode === "auto" ? undefined : asrConfig?.lang;
+            const result = await client.transcribe(audioBlob, { lang: sentLang });
 
             // Success - save transcription but don't automatically insert
             const transcribedText = result.text.trim();
             if (transcribedText) {
-                // Save transcription to cell metadata
+                // Save transcription to cell metadata. Prefer the language the server echoed
+                // back; fall back to what we sent (the server used it silently). Both can be
+                // null in auto-detect mode — that's fine, the badge code handles that.
+                const echoedOrSentLang = result.lang ?? sentLang ?? null;
                 const audioId = sessionStorage.getItem(`audio-id-${cellMarkers[0]}`);
                 if (audioId) {
                     const transcriptionData = {
                         content: transcribedText,
                         timestamp: Date.now(),
+                        language: echoedOrSentLang ?? undefined,
                     };
 
                     // Save to cell metadata via provider
@@ -3096,7 +3135,7 @@ const CellEditor: React.FC<CellEditorProps> = ({
                         content: {
                             cellId: cellMarkers[0],
                             transcribedText: transcribedText,
-                            language: "unknown",
+                            language: echoedOrSentLang,
                         },
                     };
                     window.vscodeApi.postMessage(messageContent);
@@ -5544,6 +5583,23 @@ const CellEditor: React.FC<CellEditorProps> = ({
                                                             ? audioDuration ?? undefined
                                                             : undefined
                                                     }
+                                                    transcriptionLanguageLabel={transcriptionBadgeLabel}
+                                                    showAdvancedAsrMenu={!isSourceText}
+                                                    asrLanguageMode={asrConfig?.languageMode ?? "project"}
+                                                    asrScriptPref={asrConfig?.scriptPref ?? "auto"}
+                                                    projectLanguageName={asrConfig?.projectLanguageName}
+                                                    onChangeAsrLanguageMode={(mode) => {
+                                                        window.vscodeApi.postMessage({
+                                                            command: "setAsrLanguageMode",
+                                                            content: { mode },
+                                                        } as EditorPostMessages);
+                                                    }}
+                                                    onChangeAsrScriptPref={(scriptPref) => {
+                                                        window.vscodeApi.postMessage({
+                                                            command: "setAsrScriptPref",
+                                                            content: { scriptPref },
+                                                        } as EditorPostMessages);
+                                                    }}
                                                 />
 
                                                 {confirmingDiscard && (
diff --git a/webviews/codex-webviews/src/CodexCellEditor/WhisperTranscriptionClient.ts b/webviews/codex-webviews/src/CodexCellEditor/WhisperTranscriptionClient.ts
index 69fdb1434..de3c161e4 100644
--- a/webviews/codex-webviews/src/CodexCellEditor/WhisperTranscriptionClient.ts
+++ b/webviews/codex-webviews/src/CodexCellEditor/WhisperTranscriptionClient.ts
@@ -1,3 +1,10 @@
+/**
+ * HTTP client for the OmniASR transcription service.
+ *
+ * Despite the class name (kept for git-history continuity), this talks to
+ * Meta Omnilingual ASR through the Frontier auth-proxy. Contract spec lives
+ * at `docs/asr-proxy-endpoint.md`.
+ */
 export class WhisperTranscriptionClient {
     private url: string;
     private authToken?: string;
@@ -9,10 +16,17 @@ export class WhisperTranscriptionClient {
         this.authToken = authToken;
     }
 
+    /**
+     * @param audioBlob audio bytes (WebM, WAV, MP3, OGG, FLAC, ...).
+     * @param options.lang   OmniASR `{iso639_3}_{Script}` code (e.g. `swh_Latn`).
+     *                       Omit to let the server transcribe without language conditioning.
+     * @param options.timeoutMs request timeout in ms. Default 60s.
+     */
     async transcribe(
         audioBlob: Blob,
-        timeoutMs: number = 60000
-    ): Promise<{ text: string; }> {
+        options: { lang?: string; timeoutMs?: number; } = {}
+    ): Promise<{ text: string; lang: string | null; }> {
+        const { lang, timeoutMs = 60000 } = options;
         try {
             // Create FormData with audio file
             const formData = new FormData();
@@ -25,6 +39,12 @@ export class WhisperTranscriptionClient {
             if (this.authToken) {
                 url.searchParams.set("token", this.authToken);
             }
+            // OmniASR-specific: forward the language hint when provided. Omitting it tells
+            // the model to transcribe without conditioning (no internal language-ID step,
+            // just the model's autoregressive guess).
+            if (lang) {
+                url.searchParams.set("lang", lang);
+            }
 
             // Prepare headers
             const headers: HeadersInit = {};
@@ -81,9 +101,15 @@ export class WhisperTranscriptionClient {
                     throw new Error(errorMsg);
                 }
 
-                // Parse response
+                // Parse response. OmniASR echoes `lang` when one was sent; in auto-detect
+                // mode it omits the field. The Frontier proxy used to call this field
+                // `language`, so we accept either.
                 const result = await response.json();
-                return { text: result.text || "" };
+                const echoedLang: string | null =
+                    (typeof result?.lang === "string" && result.lang) ||
+                    (typeof result?.language === "string" && result.language) ||
+                    null;
+                return { text: result.text || "", lang: echoedLang };
             } catch (error) {
                 clearTimeout(timeoutId);
 

From fa40fbb5714ee0e9d2ac3dcdb931315b3a369336 Mon Sep 17 00:00:00 2001
From: Luke-Bilhorn <luke.bilhorn@my.wheaton.edu>
Date: Thu, 4 Jun 2026 15:33:28 -0500
Subject: [PATCH 05/12] Rewrite ASR docs for the OmniASR HTTP POST contract
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The two specs (asr-proxy-endpoint.md, AUTH_SERVER_ASR_IMPLEMENTATION.md)
were stuck describing the WebSocket / MMS / phonetic era and were
actively misleading. Replaces them with the current contract:

  - Multipart HTTP POST, not WebSocket.
  - OmniASR upstream, not MMS — and explicitly future-proofed via the
    model-agnostic Modal app name (codex-asr), with a note on the
    historical mms-zeroshot-asr URL.
  - Language is sent as ?lang={iso639_3}_{Script} (e.g. swh_Latn) and
    omitted for auto-detect. The proxy passes it through verbatim.
  - Reference FastAPI implementation updated to match.

The third doc (asr-auth-proxy-implementation-summary.md) is left intact
as a changelog of the original WebSocket-era work, with a header
pointing readers to the current spec.

Also tightens the webview CSP — drops the dead
`wss://ryderwishart--...` allow entry and adds `https://*.modal.run` so
the new HTTPS endpoint works under the policy.
---
 docs/AUTH_SERVER_ASR_IMPLEMENTATION.md        | 452 +++---------------
 docs/asr-auth-proxy-implementation-summary.md |   8 +
 docs/asr-proxy-endpoint.md                    | 185 +++----
 .../codexCellEditorProvider.ts                |   2 +-
 4 files changed, 172 insertions(+), 475 deletions(-)

diff --git a/docs/AUTH_SERVER_ASR_IMPLEMENTATION.md b/docs/AUTH_SERVER_ASR_IMPLEMENTATION.md
index 5deaa99cc..16486b174 100644
--- a/docs/AUTH_SERVER_ASR_IMPLEMENTATION.md
+++ b/docs/AUTH_SERVER_ASR_IMPLEMENTATION.md
@@ -1,402 +1,72 @@
 # Auth Server ASR Proxy Implementation Guide
 
-## Overview
+> **This document was rewritten in 2026 to reflect the current OmniASR
+> (HTTP POST) contract.** The previous WebSocket-based MMS proxy described
+> here is no longer in use.
 
-The Codex Editor client now supports authenticated ASR (Automatic Speech Recognition) transcription through the Frontier auth server. This document describes what needs to be implemented on the auth server side.
+## Status
 
-**Status**: Client implementation is complete and deployed. Auth server implementation is required to enable the feature.
+- **Upstream service**: Meta Omnilingual ASR (`omniASR_LLM_1B_v2`), served
+  on Modal as `https://genesis-ai-dev--codex-asr-serve.modal.run`
+  (renamed from the historical `mms-zeroshot-asr` deployment — same
+  workload, model-agnostic name).
+- **Client**: Codex Editor talks to the Frontier auth-proxy via plain
+  HTTP POST (multipart). No WebSocket. See
+  [`asr-proxy-endpoint.md`](./asr-proxy-endpoint.md) for the full wire spec
+  and reference FastAPI implementation.
 
-## What You Need to Implement
+## What the auth server must implement
 
-### 1. Add `getAsrEndpoint()` Method to FrontierAPI
+### 1. `getAsrEndpoint()` on FrontierAPI
 
-The client expects a new method on the FrontierAPI interface that returns the authenticated ASR proxy endpoint.
-
-**Method Signature**:
 ```typescript
 getAsrEndpoint(): Promise<string | undefined>
 ```
 
-**Returns**: The WebSocket URL for the authenticated ASR proxy (e.g., `wss://auth.frontier.com/ws/asr`)
-
-**Example Implementation**:
-```typescript
-async getAsrEndpoint(): Promise<string | undefined> {
-    if (!this.isAuthenticated) {
-        return undefined;
-    }
-    
-    // Return your ASR proxy WebSocket URL
-    return "wss://auth.frontier.com/ws/asr";
-    // OR from config:
-    // return this.config.asrProxyUrl;
-}
-```
-
-**Pattern Reference**: This follows the exact same pattern as your existing `getLlmEndpoint()` method.
-
-### 2. Implement WebSocket Proxy Endpoint: `/ws/asr`
-
-Create a new WebSocket endpoint that:
-1. Validates the JWT token from the query parameter
-2. Proxies messages between the client and the actual ASR service (Ryder's Modal endpoint)
-3. Logs usage for authenticated users
-
-#### Endpoint Details
-
-**URL Pattern**: `wss://your-auth-server.com/ws/asr?token=JWT_TOKEN`
-
-**Authentication**: JWT token passed as query parameter `token`
-
-**Upstream Service**: `wss://ryderwishart--asr-websocket-transcription-fastapi-asgi.modal.run/ws/transcribe`
-
-#### Message Flow
-
-```
-Client → Auth Server → ASR Service (Ryder's endpoint)
-   ↓         ↓              ↓
-   ←─────────←──────────────←
-```
-
-1. Client sends metadata (JSON)
-2. Auth server forwards to ASR service
-3. Client sends audio (binary)
-4. Auth server forwards to ASR service
-5. ASR service sends progress/results (JSON)
-6. Auth server forwards to client
-
-## Complete Python Implementation Example
-
-Here's a complete FastAPI implementation you can use as a reference:
-
-```python
-from fastapi import FastAPI, WebSocket, WebSocketDisconnect, Query, HTTPException
-from fastapi.responses import JSONResponse
-import websockets
-import jwt
-import asyncio
-import logging
-from datetime import datetime
-
-app = FastAPI()
-logger = logging.getLogger(__name__)
-
-# Configuration
-ASR_UPSTREAM_URL = "wss://ryderwishart--asr-websocket-transcription-fastapi-asgi.modal.run/ws/transcribe"
-JWT_SECRET = "your-jwt-secret-here"  # Use your actual JWT secret
-JWT_ALGORITHM = "HS256"
-
-def validate_token(token: str) -> dict:
-    """
-    Validate JWT token and return decoded payload.
-    
-    Raises:
-        HTTPException: If token is invalid or expired
-    """
-    try:
-        payload = jwt.decode(token, JWT_SECRET, algorithms=[JWT_ALGORITHM])
-        return payload
-    except jwt.ExpiredSignatureError:
-        raise HTTPException(status_code=401, detail="Token expired")
-    except jwt.InvalidTokenError:
-        raise HTTPException(status_code=401, detail="Invalid token")
-
-@app.websocket("/ws/asr")
-async def websocket_asr_proxy(
-    websocket: WebSocket,
-    token: str = Query(..., description="JWT authentication token")
-):
-    """
-    WebSocket proxy for ASR transcription with authentication.
-    
-    This endpoint:
-    1. Validates the user's JWT token
-    2. Establishes a connection to the upstream ASR service
-    3. Proxies messages bidirectionally between client and ASR service
-    4. Logs usage for monitoring
-    """
-    
-    # Validate token before accepting connection
-    try:
-        user_payload = validate_token(token)
-        user_id = user_payload.get("sub") or user_payload.get("user_id")
-        username = user_payload.get("username") or user_payload.get("email")
-    except HTTPException as e:
-        await websocket.close(code=1008, reason=f"Authentication failed: {e.detail}")
-        logger.warning(f"Authentication failed: {e.detail}")
-        return
-    
-    # Accept client connection
-    await websocket.accept()
-    logger.info(f"User {username} (ID: {user_id}) started ASR session at {datetime.utcnow()}")
-    
-    # Connect to upstream ASR service
-    upstream_ws = None
-    try:
-        upstream_ws = await websockets.connect(ASR_UPSTREAM_URL)
-        logger.info(f"Connected to upstream ASR service for user {username}")
-        
-        async def forward_to_client():
-            """Forward messages from ASR service to client"""
-            try:
-                async for message in upstream_ws:
-                    await websocket.send_text(message)
-                    logger.debug(f"Forwarded message to client {username}: {message[:100]}...")
-            except websockets.exceptions.ConnectionClosed:
-                logger.info(f"Upstream ASR connection closed for user {username}")
-            except Exception as e:
-                logger.error(f"Error forwarding to client {username}: {e}")
-                try:
-                    await websocket.send_text(
-                        '{"type": "error", "message": "Connection to transcription service lost"}'
-                    )
-                except:
-                    pass
-        
-        async def forward_to_asr():
-            """Forward messages from client to ASR service"""
-            try:
-                while True:
-                    message = await websocket.receive()
-                    
-                    if "text" in message:
-                        # Forward JSON metadata
-                        await upstream_ws.send(message["text"])
-                        logger.debug(f"Forwarded metadata from {username}: {message['text'][:100]}...")
-                    elif "bytes" in message:
-                        # Forward binary audio data
-                        audio_size = len(message["bytes"])
-                        await upstream_ws.send(message["bytes"])
-                        logger.info(f"Forwarded {audio_size} bytes of audio from {username}")
-            except WebSocketDisconnect:
-                logger.info(f"Client {username} disconnected")
-            except Exception as e:
-                logger.error(f"Error forwarding from client {username}: {e}")
-        
-        # Run both forwarding tasks concurrently
-        await asyncio.gather(
-            forward_to_client(),
-            forward_to_asr(),
-            return_exceptions=True
-        )
-        
-    except Exception as e:
-        logger.error(f"Failed to connect to upstream ASR service for user {username}: {e}")
-        error_msg = {
-            "type": "error",
-            "message": f"Failed to connect to transcription service: {str(e)}"
-        }
-        try:
-            await websocket.send_json(error_msg)
-        except:
-            pass
-    finally:
-        # Cleanup
-        if upstream_ws:
-            await upstream_ws.close()
-        try:
-            await websocket.close()
-        except:
-            pass
-        logger.info(f"ASR session ended for user {username} (ID: {user_id})")
-
-@app.get("/health")
-async def health_check():
-    """Health check endpoint"""
-    return {"status": "healthy", "service": "asr-proxy"}
-
-if __name__ == "__main__":
-    import uvicorn
-    uvicorn.run(app, host="0.0.0.0", port=8000)
-```
-
-## WebSocket Protocol Details
-
-The client implements this protocol, which your proxy must support:
-
-### Client → ASR Service
-
-**Step 1**: Client sends JSON metadata
-```json
-{
-  "type": "meta",
-  "provider": "mms",
-  "model": "facebook/mms-1b-all",
-  "mime": "audio/webm",
-  "language": "eng",
-  "task": "transcribe",
-  "phonetic": false
-}
-```
-
-**Step 2**: Client sends binary audio data (Blob)
-
-### ASR Service → Client
-
-**Progress Updates** (during processing):
-```json
-{
-  "type": "progress",
-  "data": "Processing audio...",
-  "percentage": 50
-}
-```
-
-**Final Result** (on completion):
-```json
-{
-  "type": "done",
-  "text": "This is the transcribed text",
-  "language": "eng",
-  "provider": "mms",
-  "model": "facebook/mms-1b-all",
-  "phonetic": "ðɪs ɪz ðə trænskraɪbd tɛkst"
-}
-```
-
-**Error Message** (on failure):
-```json
-{
-  "type": "error",
-  "message": "Transcription failed: invalid audio format"
-}
-```
-
-## Implementation Checklist
-
-- [ ] Add `getAsrEndpoint()` method to FrontierAPI class
-  - Returns `Promise<string | undefined>`
-  - Returns your ASR proxy URL (e.g., `wss://auth.frontier.com/ws/asr`)
-  - Returns `undefined` if not authenticated
-
-- [ ] Create WebSocket endpoint at `/ws/asr`
-  - Accepts `token` as query parameter
-  - Validates JWT token
-  - Rejects with code 1008 if token invalid
-
-- [ ] Implement bidirectional proxy
-  - Forward JSON text messages
-  - Forward binary audio data
-  - Handle connection lifecycle
-  - Clean up resources on disconnect
-
-- [ ] Add logging
-  - Log successful authentications with user ID
-  - Log ASR session start/end times
-  - Log audio data sizes for monitoring
-  - Log errors and failures
-
-- [ ] Test the implementation
-  - Valid token → successful proxying
-  - Invalid token → rejection with code 1008
-  - Missing token → rejection
-  - Large audio files → proper streaming
-  - Connection interruptions → graceful cleanup
-
-## Configuration
-
-You'll need to configure:
-
-1. **JWT Secret**: Same secret used for other JWT validation
-2. **Upstream ASR URL**: `wss://ryderwishart--asr-websocket-transcription-fastapi-asgi.modal.run/ws/transcribe`
-3. **Proxy Endpoint URL**: The URL you'll return from `getAsrEndpoint()`
-
-## Testing
-
-### Manual Test with wscat
-
-```bash
-# Install wscat
-npm install -g wscat
-
-# Test with valid token
-wscat -c "wss://your-auth-server.com/ws/asr?token=YOUR_JWT_TOKEN"
-
-# Send metadata
-> {"type":"meta","mime":"audio/webm"}
-
-# Observe responses
-< {"type":"progress","data":"Processing...","percentage":50}
-```
-
-### Integration Test
-
-The Codex Editor client will automatically use your proxy when:
-1. User is authenticated
-2. `getAsrEndpoint()` returns a URL
-3. User transcribes audio
-
-You can verify by checking your logs for authenticated transcription sessions.
-
-## Security Considerations
-
-1. **Token Validation**: Always validate JWT before accepting connection
-2. **Rate Limiting**: Consider implementing per-user rate limits
-3. **Timeout**: Set reasonable timeouts (30-60s) for transcription
-4. **File Size Limits**: Consider limiting audio size if needed
-5. **HTTPS/WSS**: Always use secure WebSocket in production
-6. **Logging**: Log usage but respect user privacy (don't log audio content)
-
-## Monitoring Recommendations
-
-Track these metrics:
-- Total ASR requests per day
-- Active concurrent transcriptions
-- Average transcription duration
-- Error rate by error type
-- Audio size distribution
-- Per-user usage
-
-## Reference Implementation
-
-The LLM proxy endpoint on your auth server follows a similar pattern. You can use that as a reference for:
-- JWT validation approach
-- Error handling patterns
-- Logging format
-- Configuration management
-
-## Support
-
-If you need clarification on:
-- Client behavior: See `docs/asr-proxy-endpoint.md`
-- Message protocol: See examples above
-- Client implementation: See `webviews/codex-webviews/src/CodexCellEditor/WhisperTranscriptionClient.ts`
-
-## Deployment Notes
-
-### Before Deployment
-1. Test with a staging environment first
-2. Verify JWT token validation works correctly
-3. Test with large audio files (>10MB)
-4. Confirm error handling works as expected
-
-### After Deployment
-1. Monitor logs for authentication failures
-2. Check for any proxy errors
-3. Verify transcription quality unchanged
-4. Monitor for rate limit needs
-
-## Timeline
-
-**Client Ready**: ✅ Implemented and deployed
-
-**Auth Server Required**: This implementation
-
-**User Impact**: None until auth server is deployed (users will continue using manual endpoint configuration)
-
-**Urgency**: Medium - allows transition away from Ryder's personal namespace
-
----
-
-## Questions?
-
-For questions about:
-- **Client implementation**: Check `docs/asr-auth-proxy-implementation-summary.md`
-- **Protocol details**: Check `docs/asr-proxy-endpoint.md`
-- **Client code**: Check `webviews/codex-webviews/src/CodexCellEditor/WhisperTranscriptionClient.ts`
-
-## Version
-
-- **Client Version**: Implemented in v0.6.21+
-- **Last Updated**: 2025-10-14
-
+Returns the **HTTPS** URL of the proxy's transcribe endpoint
+(e.g. `https://auth.frontier.example/api/v1/asr/transcribe`). The client
+performs a multipart POST against that URL.
+
+This mirrors the existing `getLlmEndpoint()`.
+
+### 2. `POST /api/v1/asr/transcribe` proxy endpoint
+
+A pass-through that:
+
+1. Validates the Frontier JWT (Authorization header or `?token=` query).
+2. Forwards the multipart audio body to OmniASR.
+3. **Forwards the optional `?lang=...` query parameter** when the client
+   supplies it (OmniASR `{iso639_3}_{Script}` format, e.g. `swh_Latn`).
+   In auto-detect mode the client omits `lang`; the proxy must also omit
+   it when calling upstream.
+4. Returns OmniASR's JSON response verbatim (`text`, `duration_s`,
+   `inference_s`, and `lang` when one was sent).
+
+A complete reference FastAPI implementation is in
+[`asr-proxy-endpoint.md`](./asr-proxy-endpoint.md#example-implementation-pythonfastapi).
+
+## Migration from the WebSocket / MMS era
+
+Anything the client used to send over WebSocket (provider, model,
+language as bare ISO 639-3, phonetic flag, etc.) is gone:
+
+- **No more `provider` / `model` fields**: the upstream is OmniASR; the
+  client doesn't choose providers.
+- **No more `phonetic`**: OmniASR doesn't support IPA output.
+- **No more bare ISO 639-3 codes**: OmniASR requires `{iso639_3}_{Script}`
+  (e.g. `urd_Arab`, not `urd`). The client resolves this from the project
+  language using `sharedUtils/asrLanguageUtils.ts`.
+- **No more `lang=auto` magic value**: omit `lang` entirely for
+  auto-detect.
+
+## Key references
+
+- Wire contract: [`docs/asr-proxy-endpoint.md`](./asr-proxy-endpoint.md)
+- Client: `webviews/codex-webviews/src/CodexCellEditor/WhisperTranscriptionClient.ts`
+- Lang resolver + supported codes: `sharedUtils/asrLanguageUtils.ts`,
+  `sharedUtils/omniAsrSupportedLangs.ts`,
+  `sharedUtils/omniAsrDefaultScripts.ts`,
+  `sharedUtils/omniAsrFriendlyNames.ts`
+- Modal app (source of truth for the upstream): `omniasr_llm_1b.py` in
+  the Modal deployment repo. Logs and dashboards:
+  <https://modal.com/apps/genesis-ai-dev/main>.
diff --git a/docs/asr-auth-proxy-implementation-summary.md b/docs/asr-auth-proxy-implementation-summary.md
index 2f0515033..0748f84a4 100644
--- a/docs/asr-auth-proxy-implementation-summary.md
+++ b/docs/asr-auth-proxy-implementation-summary.md
@@ -1,5 +1,13 @@
 # ASR Authentication Proxy Implementation Summary
 
+> **Historical changelog.** This documents the initial WebSocket-era
+> introduction of the Frontier auth proxy. The current contract is HTTP
+> POST and the upstream is OmniASR (not MMS). For an up-to-date wire
+> spec and reference implementation see
+> [`asr-proxy-endpoint.md`](./asr-proxy-endpoint.md); for the auth-server
+> integration points see
+> [`AUTH_SERVER_ASR_IMPLEMENTATION.md`](./AUTH_SERVER_ASR_IMPLEMENTATION.md).
+
 ## Overview
 
 Successfully migrated ASR transcription from Ryder's personal Modal namespace to an authenticated proxy architecture. The system now supports:
diff --git a/docs/asr-proxy-endpoint.md b/docs/asr-proxy-endpoint.md
index 8b1e40956..11c240c12 100644
--- a/docs/asr-proxy-endpoint.md
+++ b/docs/asr-proxy-endpoint.md
@@ -1,22 +1,33 @@
 # ASR HTTP POST Endpoint Specification
 
-This document describes the HTTP POST protocol for implementing an ASR (Automatic Speech Recognition) transcription endpoint compatible with the Codex Editor.
+This document describes the HTTP POST protocol the Codex Editor expects from
+an ASR (Automatic Speech Recognition) endpoint. The reference upstream is
+**Meta Omnilingual ASR** (`omniASR_LLM_1B_v2`), served on Modal as
+`genesis-ai-dev--codex-asr-serve.modal.run` (renamed from the
+historical `mms-zeroshot-asr` deployment).
+
+The Frontier auth server runs a thin **proxy** in front of that Modal
+endpoint, adds JWT validation, and is what the Codex client actually talks to
+in production. This spec covers the proxy's wire contract; the proxy in turn
+forwards to OmniASR.
 
 ## Overview
 
-The Codex Editor uses a simple HTTP POST request for audio transcription. This allows for straightforward integration without WebSocket complexity.
+The client uses a simple multipart HTTP POST to the proxy URL. No
+WebSockets, no streaming progress messages. One request → one transcription.
 
 ## Authentication
 
-The client passes authentication via a JWT token as either:
+The client passes a Frontier JWT via either:
 1. **Authorization header**: `Authorization: Bearer <token>`
 2. **Query parameter**: `?token=<token>&source=codex`
 
 The server should:
-1. Validate the JWT token before processing the request
-2. Reject requests with invalid or missing tokens (401)
-3. Establish a connection to the actual ASR service (e.g., Modal endpoint)
-4. Forward the audio file and return the transcription result
+1. Validate the JWT before processing.
+2. Reject invalid/missing tokens with HTTP 401.
+3. Forward the audio (and the optional `lang` query parameter, if present)
+   to the upstream OmniASR service.
+4. Return the upstream's JSON response.
 
 ## Request Protocol
 
@@ -35,20 +46,33 @@ Authorization: Bearer <token>  (optional if token in query)
 
 ### Query Parameters
 
-- `source` (required): `"codex"` or `"langquest"`
-- `token` (optional): JWT token if not in Authorization header
+- `source` (required): `"codex"` or `"langquest"` — for logging.
+- `token` (optional): JWT, if not in the Authorization header.
+- `lang` (**optional, new**): OmniASR language code in
+  `{iso639_3}_{Script}` form (e.g. `swh_Latn`, `urd_Arab`, `cmn_Hans`).
+  Forward this directly to OmniASR. **Omit** it to let OmniASR
+  transcribe without language conditioning. The full list of accepted
+  codes is bundled with the client in
+  `sharedUtils/omniAsrSupportedLangs.ts` (and is the live response of
+  OmniASR's `GET /languages`).
 
 ### Request Body
 
 **Content-Type**: `multipart/form-data`
 
 **Form Fields**:
-- `file`: Audio file (WAV, MP3, OGG, FLAC, WebM - max 50MB)
+- `file`: Audio file (WAV, MP3, OGG, FLAC, WebM, M4A — max 50 MB; the
+  upstream OmniASR service splits audio longer than 40 s into 40 s chunks)
 
-### Example Request
+### Example Requests
 
 ```bash
-curl -X POST "http://localhost:8000/api/v1/asr/transcribe?source=codex&token=JWT_TOKEN" \
+# Auto-detect (no lang)
+curl -X POST "https://auth.frontier.example/api/v1/asr/transcribe?source=codex&token=JWT_TOKEN" \
+  -F "file=@audio.wav"
+
+# Project-language mode (Swahili, Latin script)
+curl -X POST "https://auth.frontier.example/api/v1/asr/transcribe?source=codex&token=JWT_TOKEN&lang=swh_Latn" \
   -F "file=@audio.wav"
 ```
 
@@ -60,10 +84,20 @@ curl -X POST "http://localhost:8000/api/v1/asr/transcribe?source=codex&token=JWT
 {
   "text": "This is the transcribed text",
   "duration_s": 4.94,
-  "inference_s": 1.72
+  "inference_s": 1.72,
+  "lang": "swh_Latn"
 }
 ```
 
+The `lang` field is **echoed only when the request supplied one**. In
+auto-detect mode (no `lang` on the request) OmniASR omits the field and the
+proxy should do the same. The client renders an "Auto Detect" badge in that
+case (it does not lie about what language was actually used).
+
+The client also accepts a legacy field name `language` in place of `lang`
+(this was the Frontier proxy's earlier convention) — either works. Prefer
+`lang` going forward.
+
 ### Error Response (4xx/5xx)
 
 ```json
@@ -73,32 +107,30 @@ curl -X POST "http://localhost:8000/api/v1/asr/transcribe?source=codex&token=JWT
 ```
 
 **Common Error Codes**:
-- `400`: Bad Request (missing source parameter, invalid audio format)
+- `400`: Bad request (missing source, invalid audio, unknown `lang` code)
 - `401`: Unauthorized (invalid or missing token)
-- `502`: Bad Gateway (upstream service unavailable)
-- `504`: Gateway Timeout (upstream service timeout)
+- `502`: Bad gateway (upstream OmniASR unavailable)
+- `504`: Gateway timeout (upstream timeout)
 
 ## Example Implementation (Python/FastAPI)
 
-Here's a basic example of implementing the ASR proxy endpoint:
-
 ```python
 from fastapi import FastAPI, UploadFile, File, HTTPException, Query, Header
 from fastapi.responses import JSONResponse
 import httpx
 import jwt
+from typing import Optional
 
 app = FastAPI()
 
-# Configuration
-ASR_SERVICE_URL = "https://genesis-ai-dev--mms-zeroshot-asr-serve.modal.run/transcribe"
+# Configuration (post-rename; the old URL was
+# https://genesis-ai-dev--mms-zeroshot-asr-serve.modal.run/transcribe)
+ASR_SERVICE_URL = "https://genesis-ai-dev--codex-asr-serve.modal.run/transcribe"
 JWT_SECRET = "your-jwt-secret"
 
 def validate_token(token: str) -> dict:
-    """Validate JWT token and return payload"""
     try:
-        payload = jwt.decode(token, JWT_SECRET, algorithms=["HS256"])
-        return payload
+        return jwt.decode(token, JWT_SECRET, algorithms=["HS256"])
     except jwt.InvalidTokenError:
         raise HTTPException(status_code=401, detail="Invalid token")
 
@@ -107,75 +139,70 @@ async def transcribe_audio(
     file: UploadFile = File(...),
     authorization: Optional[str] = Header(None),
     token: Optional[str] = Query(None),
-    source: str = Query(...)
+    source: str = Query(...),
+    lang: Optional[str] = Query(None),  # OmniASR {iso639_3}_{Script}
 ):
-    """HTTP POST endpoint for ASR transcription with authentication"""
-    
-    # Extract token from header or query
     auth_token = None
     if authorization and authorization.startswith("Bearer "):
         auth_token = authorization[7:]
     elif token:
         auth_token = token
-    
     if not auth_token:
         raise HTTPException(status_code=401, detail="Token required")
-    
-    # Validate token
-    try:
-        user = validate_token(auth_token)
-        user_id = user.get("sub")
-    except HTTPException:
-        raise
-    
-    # Read audio file
+    validate_token(auth_token)
+
     audio_content = await file.read()
-    
-    # Forward to upstream ASR service
+
     async with httpx.AsyncClient(timeout=60.0) as client:
         files = {"file": (file.filename, audio_content, file.content_type)}
-        response = await client.post(ASR_SERVICE_URL, files=files)
-        
+        params = {}
+        if lang:
+            params["lang"] = lang
+        response = await client.post(ASR_SERVICE_URL, files=files, params=params)
+
         if response.status_code != 200:
             raise HTTPException(
                 status_code=response.status_code,
-                detail=f"Transcription service error: {response.text}"
+                detail=f"Transcription service error: {response.text}",
             )
-        
+
+        # Pass OmniASR's response through verbatim (it already echoes `lang`
+        # when present, and omits it in auto-detect mode).
         return JSONResponse(content=response.json())
 ```
 
 ## Client Implementation Reference
 
-The Codex Editor client implementation can be found in:
-
-- **TypeScript Client**: `webviews/codex-webviews/src/CodexCellEditor/WhisperTranscriptionClient.ts`
-- **Integration**: `webviews/codex-webviews/src/CodexCellEditor/CodexCellEditor.tsx`
+- **Client**: `webviews/codex-webviews/src/CodexCellEditor/WhisperTranscriptionClient.ts`
+- **Code resolver** (project language → `{iso639_3}_{Script}`):
+  `sharedUtils/asrLanguageUtils.ts`
+- **Supported codes**: `sharedUtils/omniAsrSupportedLangs.ts`
+- **Default scripts**: `sharedUtils/omniAsrDefaultScripts.ts`
+- **Friendly names**: `sharedUtils/omniAsrFriendlyNames.ts`
 
-### Key Client Behavior
+### Key Client Behaviour
 
-1. Requests ASR config (including auth token) from VS Code extension
-2. Creates FormData with audio blob
-3. POSTs to endpoint URL with token in query parameter or Authorization header
-4. Receives JSON response with transcription text
-5. Handles errors and timeouts (default 60s)
+1. Requests ASR config (endpoint + auth token + resolved OmniASR code) from the extension host.
+2. POSTs `multipart/form-data` with the audio file; forwards `?lang=...` when in project mode.
+3. Parses `lang` (or legacy `language`) from the JSON response and stores it
+   on the cell's audio attachment.
+4. Renders the badge from the stored code via
+   `labelForTranscriptionLanguage()`.
 
 ## Testing Your Implementation
 
-### Test Cases
-
-1. **Valid audio**: Should return transcription
-2. **Invalid audio format**: Should return error message
-3. **Missing token**: Should reject with 401
-4. **Invalid token**: Should reject with 401
-5. **Timeout**: Should handle gracefully (client has 60s timeout)
-6. **Large audio files**: Should handle up to 50MB
-7. **Network errors**: Should return appropriate error codes
+1. **Project-mode request**: `?lang=swh_Latn` → expect 200 with
+   `"lang": "swh_Latn"` in response.
+2. **Auto-detect**: no `lang` → expect 200, **no** `lang` in response.
+3. **Unknown code**: `?lang=zzz_Zzzz` → expect 400 with descriptive error.
+4. **Invalid token**: 401.
+5. **Large audio (≤ 50 MB)**: 200.
+6. **Long audio (> 40 s)**: OmniASR chunks it; expect 200 with full
+   concatenated transcription.
+7. **Network error / upstream down**: expect 502 / 504 returned to the client.
 
 ## Supported Audio Formats
 
-The endpoint should support common audio formats:
-
 - `audio/webm` (recommended for browser recording)
 - `audio/wav`
 - `audio/mp3`
@@ -185,28 +212,20 @@ The endpoint should support common audio formats:
 
 ## Security Considerations
 
-1. **Token Validation**: Always validate JWT tokens before processing
-2. **Rate Limiting**: Implement per-user rate limits to prevent abuse
-3. **File Size Limits**: Set reasonable limits on audio file sizes (50MB recommended)
-4. **Timeout**: Implement server-side timeouts to prevent hanging requests (60s recommended)
-5. **Logging**: Log usage for monitoring and debugging (but respect privacy)
-6. **HTTPS**: Always use secure connections in production
-
-## Performance Recommendations
-
-1. **Streaming**: For very large files, consider streaming uploads
-2. **Caching**: Cache model loading to reduce cold starts (handled by upstream service)
-3. **Resource Cleanup**: Properly close connections and free resources
-4. **Concurrent Requests**: Handle multiple simultaneous transcriptions efficiently
-5. **Timeout Handling**: Set reasonable timeouts for upstream requests
+1. **Token validation**: validate JWT before processing.
+2. **Rate limiting**: per-user limits to prevent abuse.
+3. **File size limits**: 50 MB.
+4. **Timeout**: server-side timeouts to prevent hanging requests (60 s recommended).
+5. **Logging**: log usage for monitoring but respect privacy.
+6. **HTTPS**: always.
 
 ## Integration with Frontier Auth Server
 
 The Frontier auth server should:
 
-1. Provide `getAsrEndpoint()` method returning the proxy HTTP URL
-2. Generate short-lived JWT tokens for ASR requests
-3. Include user identification in tokens for logging
-4. Handle token refresh if needed for long transcriptions
+1. Implement `getAsrEndpoint()` returning the proxy HTTPS URL.
+2. Generate short-lived JWTs for ASR requests.
+3. Include user identification in tokens for logging.
+4. Handle token refresh for long transcriptions if needed.
 
-This follows the same pattern as the existing `getLlmEndpoint()` implementation.
+This follows the same pattern as the existing `getLlmEndpoint()`.
diff --git a/src/providers/codexCellEditorProvider/codexCellEditorProvider.ts b/src/providers/codexCellEditorProvider/codexCellEditorProvider.ts
index af4ebe313..ff09fe6ef 100755
--- a/src/providers/codexCellEditorProvider/codexCellEditorProvider.ts
+++ b/src/providers/codexCellEditorProvider/codexCellEditorProvider.ts
@@ -1802,7 +1802,7 @@ export class CodexCellEditorProvider implements vscode.CustomEditorProvider<Code
             <head>
                 <meta charset="UTF-8">
                 <meta name="viewport" content="width=device-width, initial-scale=1.0">
-                                <meta http-equiv="Content-Security-Policy" content="default-src 'none'; style-src ${webview.cspSource} 'unsafe-inline'; script-src 'nonce-${nonce}' 'strict-dynamic' https://www.youtube.com https://static.cloudflareinsights.com; frame-src https://www.youtube.com; worker-src ${webview.cspSource} blob:; connect-src https://*.vscode-cdn.net https://*.frontierrnd.com wss://*.frontierrnd.com https://languagetool.org/api/ https://*.workers.dev https://*.fastly.net https://*.thechosen.media data: wss://ryderwishart--whisper-websocket-transcription-websocket-transcribe.modal.run wss://*.modal.run; img-src 'self' data: ${webview.cspSource} https:; font-src ${webview.cspSource} data:; media-src ${webview.cspSource} https: blob: data:;">
+                                <meta http-equiv="Content-Security-Policy" content="default-src 'none'; style-src ${webview.cspSource} 'unsafe-inline'; script-src 'nonce-${nonce}' 'strict-dynamic' https://www.youtube.com https://static.cloudflareinsights.com; frame-src https://www.youtube.com; worker-src ${webview.cspSource} blob:; connect-src https://*.vscode-cdn.net https://*.frontierrnd.com wss://*.frontierrnd.com https://languagetool.org/api/ https://*.workers.dev https://*.fastly.net https://*.thechosen.media data: https://*.modal.run wss://*.modal.run; img-src 'self' data: ${webview.cspSource} https:; font-src ${webview.cspSource} data:; media-src ${webview.cspSource} https: blob: data:;">
                 <link href="${styleResetUriWithBuster}" rel="stylesheet" nonce="${nonce}">
                 <link href="${codiconsUriWithBuster}" rel="stylesheet" nonce="${nonce}" />
                 <title>Codex Cell Editor</title>

From 78fa0d799a792c30a19f3f0d95a47fc5eb174d58 Mon Sep 17 00:00:00 2001
From: Luke-Bilhorn <luke.bilhorn@my.wheaton.edu>
Date: Thu, 4 Jun 2026 15:35:16 -0500
Subject: [PATCH 06/12] Bring OmniASR Modal source into the tree as codex-asr
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The Modal app source was living only on the deployed instance — this
commit makes it discoverable and reviewable in-repo at
`docs/asr/codex_asr_modal.py`, with a README describing the deploy
workflow and migration plan.

Substantive changes vs. the currently-deployed source:

  - `modal.App("mms-zeroshot-asr")` → `modal.App("codex-asr")` so the
    Modal URL stops hard-coding the (long-since-replaced) model family.
    Deploying this file creates
    `genesis-ai-dev--codex-asr-serve.modal.run`. The old `mms-zeroshot-asr`
    deployment stays warm during the rollout — both serve identical responses.
  - Module docstring spells out the naming rationale, migration
    plan, and the auto-detect LID gap.
  - Comments in transcribe_audio() clarify that the absent `lang`
    field on auto-detect responses is intentional (no built-in LID),
    not a bug.

Functional contract is unchanged — same `/`, `/health`, `/languages`,
`/transcribe` endpoints, same response shape.

Deployment is a follow-up step (requires `modal token new` and a
deploy). The Frontier auth proxy + the client's default endpoint must
be updated to the new URL once deployed — see the handoff note
(separate commit).
---
 docs/asr/README.md          |  69 +++++++++
 docs/asr/codex_asr_modal.py | 299 ++++++++++++++++++++++++++++++++++++
 2 files changed, 368 insertions(+)
 create mode 100644 docs/asr/README.md
 create mode 100644 docs/asr/codex_asr_modal.py

diff --git a/docs/asr/README.md b/docs/asr/README.md
new file mode 100644
index 000000000..344def767
--- /dev/null
+++ b/docs/asr/README.md
@@ -0,0 +1,69 @@
+# Codex ASR deployment
+
+Modal source for the ASR backend used by the Codex Translation Editor.
+
+| File | What it is |
+|------|------------|
+| [`codex_asr_modal.py`](./codex_asr_modal.py) | The Modal app source. Deploy with `modal deploy`. |
+
+## Live URLs
+
+- **Current (post-rename)**: `https://genesis-ai-dev--codex-asr-serve.modal.run`
+- **Legacy (kept warm during migration)**: `https://genesis-ai-dev--mms-zeroshot-asr-serve.modal.run`
+
+The legacy URL serves the same workload — the app was renamed from
+`mms-zeroshot-asr` to `codex-asr` so the URL no longer encodes the
+model family. Both deployments will be active during the rollout; the
+legacy one is decommissioned after the Frontier auth proxy and any
+hard-coded client defaults are updated to the new URL.
+
+## Deploying
+
+You need the `modal` CLI installed (`pipx install modal`) and authenticated
+(`modal token new`) with access to the `genesis-ai-dev` workspace.
+
+```bash
+cd <repo-root>
+modal deploy docs/asr/codex_asr_modal.py
+```
+
+For local development against your own Modal workspace:
+
+```bash
+modal serve docs/asr/codex_asr_modal.py
+```
+
+## Sanity-checking after deploy
+
+```bash
+# Service identity
+curl -s https://genesis-ai-dev--codex-asr-serve.modal.run/
+
+# Full supported-langs list (used to regenerate the client snapshot)
+curl -s https://genesis-ai-dev--codex-asr-serve.modal.run/languages | jq '.count'
+
+# Transcribe with language hint
+curl -X POST -F "file=@some_audio.wav" \
+  "https://genesis-ai-dev--codex-asr-serve.modal.run/transcribe?lang=eng_Latn"
+
+# Transcribe in auto-detect mode (no `lang` field in response)
+curl -X POST -F "file=@some_audio.wav" \
+  https://genesis-ai-dev--codex-asr-serve.modal.run/transcribe
+```
+
+## Wire spec
+
+See [`../asr-proxy-endpoint.md`](../asr-proxy-endpoint.md) for the full
+HTTP POST contract the Codex client expects (this Modal app implements
+it; the Frontier auth proxy sits in front and adds JWT validation).
+
+## Open follow-ups
+
+- **Server-side LID for auto-detect mode.** OmniASR LLM doesn't return a
+  detected language when run without `lang` conditioning. The plan is to
+  bake `facebook/mms-lid-2048` into the image and run it before
+  transcription when the client omits `lang`, then pass the detected
+  code through as the conditioning input and echo it back. ~+1 GB VRAM,
+  ~+1–2 s latency, makes the badge honest in auto-detect mode. Deferred
+  to a follow-up PR; the client is already prepared to consume the
+  field if/when it appears.
diff --git a/docs/asr/codex_asr_modal.py b/docs/asr/codex_asr_modal.py
new file mode 100644
index 000000000..7662935ae
--- /dev/null
+++ b/docs/asr/codex_asr_modal.py
@@ -0,0 +1,299 @@
+"""
+codex-asr — Modal deployment for the Codex Translation Editor's ASR backend.
+
+This is the **source of truth** for the deployed Modal app at
+`https://genesis-ai-dev--codex-asr-serve.modal.run`.
+
+Model: Meta Omnilingual ASR (`omniASR_LLM_1B_v2`). 1600+ languages.
+Native-script output, optional language conditioning.
+
+Naming
+~~~~~~
+The Modal app is named `codex-asr` (model-agnostic) rather than
+`mms-zeroshot-asr` (the old name, when the upstream was MMS Zero-Shot).
+This is so the URL stays stable when we change models. Do NOT rename
+again casually — every consumer (Codex client default endpoint,
+Frontier auth proxy upstream URL, docs, snapshot regen instructions)
+hard-codes `codex-asr`.
+
+Migration plan (if `codex-asr` ever needs to change):
+  1. Deploy the new name first, keep `codex-asr` running.
+  2. Update the Frontier auth proxy's upstream URL.
+  3. Update the client's default endpoint in `package.json`
+     (`codex-editor-extension.asrEndpoint`) and any docs.
+  4. Decommission `codex-asr` after a release cycle.
+
+The old `mms-zeroshot-asr` deployment is kept warm for backward
+compatibility during the transition. Both serve identical responses.
+
+Auto-detect language ID
+~~~~~~~~~~~~~~~~~~~~~~~
+OmniASR LLM models don't have built-in LID — without a `lang`
+parameter they generate without conditioning and the response has no
+"detected language" field. Adding a separate LID model (e.g.
+`facebook/mms-lid-2048`) is a planned follow-up. For now, auto-detect
+mode returns no `lang` and the client renders an honest "Auto Detect"
+badge.
+
+Deploy / Dev
+~~~~~~~~~~~~
+  modal deploy docs/asr/codex_asr_modal.py
+  modal serve  docs/asr/codex_asr_modal.py   # local dev
+
+Test
+~~~~
+  curl -X POST -F "file=@audio.wav" \\
+    https://genesis-ai-dev--codex-asr-serve.modal.run/transcribe
+
+  curl -X POST -F "file=@audio.wav" \\
+    "https://genesis-ai-dev--codex-asr-serve.modal.run/transcribe?lang=urd_Arab"
+
+Endpoints
+~~~~~~~~~
+  GET  /            — service identity
+  GET  /health      — readiness probe
+  GET  /languages   — full list of supported {iso639_3}_{Script} codes
+                      (used by the client snapshot in sharedUtils/)
+  POST /transcribe  — transcription endpoint
+"""
+
+import modal
+
+# Renamed from "mms-zeroshot-asr" to be model-agnostic. See module docstring
+# for migration notes.
+app = modal.App("codex-asr")
+
+MODEL_CARD = "omniASR_LLM_1B_v2"
+MODEL_CACHE_DIR = "/root/model_cache"
+
+
+def download_model():
+    """Download model weights during image build (runs with GPU so fairseq2 can verify)."""
+    import os
+    os.environ["FAIRSEQ2_CACHE_DIR"] = MODEL_CACHE_DIR
+
+    from omnilingual_asr.models.inference.pipeline import ASRInferencePipeline
+
+    print(f"Downloading and verifying {MODEL_CARD}...")
+    pipeline = ASRInferencePipeline(model_card=MODEL_CARD)
+    print("Model downloaded and verified OK")
+    del pipeline
+
+
+# Build the image with model weights baked in.
+# The run_function step uses a T4 GPU so fairseq2 can fully verify the
+# checkpoint. This only runs once — the resulting image is cached by Modal.
+image = (
+    modal.Image.debian_slim(python_version="3.11")
+    .apt_install("ffmpeg", "libsndfile1")
+    .pip_install(
+        "omnilingual-asr",
+        "fastapi",
+        "uvicorn",
+        "python-multipart",
+        "soundfile",
+        "numpy",
+    )
+    .env({"FAIRSEQ2_CACHE_DIR": MODEL_CACHE_DIR})
+    .run_function(download_model, gpu="T4")
+)
+
+_pipeline = None
+
+
+def get_pipeline():
+    """Load the ASR pipeline from baked-in weights (no download needed)."""
+    global _pipeline
+    if _pipeline is None:
+        import os
+        os.environ["FAIRSEQ2_CACHE_DIR"] = MODEL_CACHE_DIR
+
+        from omnilingual_asr.models.inference.pipeline import ASRInferencePipeline
+
+        print(f"Loading {MODEL_CARD} from image cache...")
+        _pipeline = ASRInferencePipeline(model_card=MODEL_CARD)
+        print("Pipeline ready")
+    return _pipeline
+
+
+def transcribe_audio(audio_bytes: bytes, mime_type: str = "audio/wav", lang: str | None = None) -> dict:
+    """
+    Transcribe audio bytes → text using OmniASR LLM 1B v2.
+
+    Args:
+        audio_bytes: Raw audio file bytes.
+        mime_type: MIME type for format detection.
+        lang: Optional OmniASR language code (e.g. "eng_Latn", "urd_Arab").
+              If None, the model runs without language conditioning. The model
+              does NOT do internal LID, so the response will not contain a
+              `lang` field when this is None.
+
+    Returns:
+        dict with text, duration_s, inference_s, and lang (only when one was provided).
+    """
+    import soundfile as sf
+    import numpy as np
+    import tempfile
+    import subprocess
+    import os
+    import time
+
+    pipeline = get_pipeline()
+
+    # --- Convert to 16kHz mono WAV via ffmpeg ---
+    ext_map = {
+        "audio/wav": ".wav", "audio/x-wav": ".wav",
+        "audio/mpeg": ".mp3", "audio/mp3": ".mp3",
+        "audio/webm": ".webm", "audio/ogg": ".ogg",
+        "audio/flac": ".flac", "audio/mp4": ".m4a",
+    }
+    ext = ext_map.get(mime_type, ".wav")
+
+    with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as f:
+        f.write(audio_bytes)
+        input_path = f.name
+
+    output_path = input_path.rsplit(".", 1)[0] + "_16k.wav"
+    try:
+        result = subprocess.run(
+            ["ffmpeg", "-y", "-i", input_path,
+             "-ar", "16000", "-ac", "1", "-c:a", "pcm_s16le",
+             output_path],
+            capture_output=True, text=True, timeout=60,
+        )
+        if result.returncode != 0:
+            raise RuntimeError(f"ffmpeg failed: {(result.stderr or '')[:500]}")
+
+        waveform, sr = sf.read(output_path)
+        waveform = waveform.astype(np.float32)
+        if waveform.ndim > 1:
+            waveform = waveform.mean(axis=-1)
+        duration = len(waveform) / sr
+
+        # --- Chunk if > 40s (model limitation) ---
+        max_samples = 40 * sr  # 40 seconds
+        if len(waveform) > max_samples:
+            chunks = []
+            for start in range(0, len(waveform), max_samples):
+                chunks.append(waveform[start : start + max_samples])
+        else:
+            chunks = [waveform]
+
+        # Build audio dicts for the pipeline
+        audio_inputs = [
+            {"waveform": chunk, "sample_rate": sr}
+            for chunk in chunks
+        ]
+
+        # Build lang list to match (one per chunk), or None
+        lang_list = [lang] * len(audio_inputs) if lang else None
+
+        # --- Transcribe ---
+        start_t = time.perf_counter()
+        transcriptions = pipeline.transcribe(
+            audio_inputs,
+            lang=lang_list,
+            batch_size=1,
+        )
+        inference_time = time.perf_counter() - start_t
+
+        # Join chunks with space
+        full_text = " ".join(t.strip() for t in transcriptions if t.strip())
+
+        resp = {
+            "text": full_text,
+            "duration_s": round(duration, 2),
+            "inference_s": round(inference_time, 3),
+        }
+        # Echo the lang we used so the client can render the badge. In auto-detect
+        # mode (lang is None) we have no detected language to report — omit the
+        # field and let the client render "Auto Detect" honestly.
+        if lang:
+            resp["lang"] = lang
+
+        return resp
+
+    finally:
+        os.unlink(input_path)
+        if os.path.exists(output_path):
+            os.unlink(output_path)
+
+
+# ---------- Modal function ----------
+
+@app.function(
+    image=image,
+    gpu="T4",
+    timeout=600,
+    scaledown_window=120,     # keep warm 2 min after last request
+    max_containers=3,
+)
+@modal.asgi_app()
+def serve():
+    from fastapi import FastAPI, UploadFile, File, Query, HTTPException
+    from fastapi.middleware.cors import CORSMiddleware
+
+    web_app = FastAPI(title="Codex ASR (OmniASR LLM 1B v2)")
+    web_app.add_middleware(
+        CORSMiddleware,
+        allow_origins=["*"],
+        allow_methods=["*"],
+        allow_headers=["*"],
+    )
+
+    @web_app.get("/")
+    def root():
+        return {
+            "service": "codex-asr",
+            "model": MODEL_CARD,
+            "languages": "1600+",
+            "note": "Pass ?lang={iso639_3}_{Script} (e.g. eng_Latn) for best accuracy. Omit for autodetect (no LID, lower accuracy).",
+        }
+
+    @web_app.get("/health")
+    def health():
+        return {"status": "ok", "model_loaded": _pipeline is not None}
+
+    @web_app.get("/languages")
+    def list_languages():
+        """Return all supported language codes."""
+        from omnilingual_asr.models.wav2vec2_llama.lang_ids import supported_langs
+        return {"count": len(supported_langs), "languages": sorted(supported_langs)}
+
+    @web_app.post("/transcribe")
+    async def transcribe_endpoint(
+        file: UploadFile = File(...),
+        lang: str | None = Query(
+            default=None,
+            description="OmniASR language code in {iso639_3}_{Script} form, e.g. eng_Latn, urd_Arab, spa_Latn. Omit to let the model transcribe without language conditioning.",
+        ),
+    ):
+        # Validate language code if provided
+        if lang is not None:
+            from omnilingual_asr.models.wav2vec2_llama.lang_ids import supported_langs
+            if lang not in supported_langs:
+                raise HTTPException(
+                    400,
+                    f"Unknown language code: '{lang}'. "
+                    f"Use GET /languages for the full list. "
+                    f"Format: {{iso639_3}}_{{Script}}, e.g. eng_Latn",
+                )
+
+        try:
+            audio_bytes = await file.read()
+            if len(audio_bytes) > 50 * 1024 * 1024:
+                raise HTTPException(413, "File too large (50MB max)")
+            if len(audio_bytes) == 0:
+                raise HTTPException(400, "Empty file")
+
+            mime = file.content_type or "audio/wav"
+            return transcribe_audio(audio_bytes, mime, lang=lang)
+
+        except HTTPException:
+            raise
+        except Exception as e:
+            raise HTTPException(500, f"Transcription failed: {str(e)}")
+
+    # Model loads lazily on first /transcribe request via get_pipeline().
+    # Weights are baked into the image so loading takes ~15-20s (no download).
+    return web_app

From b9f2d3c51eb2b9d48e8cb17ed453857cda875cb3 Mon Sep 17 00:00:00 2001
From: Luke-Bilhorn <luke.bilhorn@my.wheaton.edu>
Date: Thu, 4 Jun 2026 16:18:27 -0500
Subject: [PATCH 07/12] Pin Modal ASR image to torch 2.8.0 cu128 +
 omnilingual-asr 0.2.0

The unpinned image was pulling torch wheels built against CUDA 13,
which fails to load on Modal's debian_slim (libcudart.so.13 missing).
omnilingual-asr 0.2.0 also exposes the omniASR_LLM_1B_v2 model card
the legacy mms-zeroshot-asr deployment was using.

Add a small _ensure_gang_context() shim around the inference path:
fairseq2 0.6 stores the current-gangs stack on a threading.local()
that is only initialised on the importing thread, so FastAPI worker
threads otherwise blow up with AttributeError on every request.
---
 docs/asr/codex_asr_modal.py | 40 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 39 insertions(+), 1 deletion(-)

diff --git a/docs/asr/codex_asr_modal.py b/docs/asr/codex_asr_modal.py
index 7662935ae..05ec75696 100644
--- a/docs/asr/codex_asr_modal.py
+++ b/docs/asr/codex_asr_modal.py
@@ -83,16 +83,29 @@ def download_model():
 # Build the image with model weights baked in.
 # The run_function step uses a T4 GPU so fairseq2 can fully verify the
 # checkpoint. This only runs once — the resulting image is cached by Modal.
+#
+# Versions / CUDA notes:
+#  - omnilingual-asr 0.2.0 is the first release that ships the
+#    `omniASR_LLM_1B_v2` model card; 0.1.0 only has `omniASR_LLM_1B`.
+#  - omnilingual-asr -> fairseq2[arrow]<=0.6 -> fairseq2n which pins
+#    `torch==2.8.0` built specifically against CUDA 12.8 (it asserts this at
+#    import time). Newer torch wheels are CUDA 13 and fail to load on Modal's
+#    `debian_slim` (libcudart.so.13 missing).
+#  - We install everything in one pip call so the resolver lands on the
+#    cu128 wheel of torch 2.8.0.
 image = (
     modal.Image.debian_slim(python_version="3.11")
     .apt_install("ffmpeg", "libsndfile1")
     .pip_install(
-        "omnilingual-asr",
+        "torch==2.8.0",
+        "torchaudio==2.8.0",
+        "omnilingual-asr==0.2.0",
         "fastapi",
         "uvicorn",
         "python-multipart",
         "soundfile",
         "numpy",
+        extra_index_url="https://download.pytorch.org/whl/cu128",
     )
     .env({"FAIRSEQ2_CACHE_DIR": MODEL_CACHE_DIR})
     .run_function(download_model, gpu="T4")
@@ -101,6 +114,29 @@ def download_model():
 _pipeline = None
 
 
+def _ensure_gang_context() -> None:
+    """
+    Initialise fairseq2's thread-local gang stack on the current thread.
+
+    fairseq2 0.6 stores the "current gangs" stack on a `threading.local()`,
+    but only initialises the underlying `current_gangs = []` attribute on
+    the importing thread. FastAPI dispatches sync request handlers on
+    worker threads where the attribute is missing, causing inference to
+    fail with::
+
+        AttributeError: '_thread._local' object has no attribute 'current_gangs'
+
+    Cheap to call per-request — just sets a list on the thread-local if
+    it isn't already there.
+    """
+    try:
+        from fairseq2.gang import _thread_local  # type: ignore[attr-defined]
+        if not hasattr(_thread_local, "current_gangs"):
+            _thread_local.current_gangs = []
+    except Exception:  # pragma: no cover — defensive only
+        pass
+
+
 def get_pipeline():
     """Load the ASR pipeline from baked-in weights (no download needed)."""
     global _pipeline
@@ -111,6 +147,7 @@ def get_pipeline():
         from omnilingual_asr.models.inference.pipeline import ASRInferencePipeline
 
         print(f"Loading {MODEL_CARD} from image cache...")
+        _ensure_gang_context()
         _pipeline = ASRInferencePipeline(model_card=MODEL_CARD)
         print("Pipeline ready")
     return _pipeline
@@ -139,6 +176,7 @@ def transcribe_audio(audio_bytes: bytes, mime_type: str = "audio/wav", lang: str
     import time
 
     pipeline = get_pipeline()
+    _ensure_gang_context()
 
     # --- Convert to 16kHz mono WAV via ffmpeg ---
     ext_map = {

From 62b1cff84ea93a4466fb68d9390365fc7d8bf837 Mon Sep 17 00:00:00 2001
From: Luke-Bilhorn <luke.bilhorn@my.wheaton.edu>
Date: Thu, 4 Jun 2026 16:18:33 -0500
Subject: [PATCH 08/12] Update ASR setting defaults + auth-proxy action items
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- package.json: point default asrEndpoint at the new codex-asr Modal
  app, default asrProvider to "omniasr" (was "mms"), and asrModel to
  omniASR_LLM_1B_v2. Drop the "ASR WebSocket Endpoint" framing — the
  endpoint is HTTPS multipart.
- docs/AUTH_SERVER_ASR_IMPLEMENTATION.md: add an action-items section
  for the Frontier auth-proxy team (new upstream URL, forward ?lang=
  verbatim, drop legacy fields) and link the in-repo Modal source.
---
 docs/AUTH_SERVER_ASR_IMPLEMENTATION.md | 22 +++++++++++++++++++---
 package.json                           | 16 ++++++++--------
 2 files changed, 27 insertions(+), 11 deletions(-)

diff --git a/docs/AUTH_SERVER_ASR_IMPLEMENTATION.md b/docs/AUTH_SERVER_ASR_IMPLEMENTATION.md
index 16486b174..d9cd7cae6 100644
--- a/docs/AUTH_SERVER_ASR_IMPLEMENTATION.md
+++ b/docs/AUTH_SERVER_ASR_IMPLEMENTATION.md
@@ -67,6 +67,22 @@ language as bare ISO 639-3, phonetic flag, etc.) is gone:
   `sharedUtils/omniAsrSupportedLangs.ts`,
   `sharedUtils/omniAsrDefaultScripts.ts`,
   `sharedUtils/omniAsrFriendlyNames.ts`
-- Modal app (source of truth for the upstream): `omniasr_llm_1b.py` in
-  the Modal deployment repo. Logs and dashboards:
-  <https://modal.com/apps/genesis-ai-dev/main>.
+- Modal app (source of truth for the upstream):
+  [`docs/asr/codex_asr_modal.py`](./asr/codex_asr_modal.py) in this repo.
+  Logs and dashboards:
+  <https://modal.com/apps/genesis-ai-dev/main/deployed/codex-asr>.
+
+## Action items for the Frontier auth proxy team
+
+1. Point the upstream ASR URL at the new app:
+   `https://genesis-ai-dev--codex-asr-serve.modal.run/transcribe`
+   (previously `…--mms-zeroshot-asr-…`). The legacy app is still up so
+   there's no urgency, but it should not be considered the source of
+   truth — only `codex-asr` will receive future updates.
+2. Make sure the proxy forwards the optional `?lang=` query string
+   verbatim and does not synthesise one when the client omits it
+   (auto-detect mode).
+3. Drop any `provider`, `model`, `phonetic`, `language` fields that
+   used to be part of the multipart/form body — they're no longer sent.
+4. Once the proxy is migrated, we can decommission the
+   `mms-zeroshot-asr` Modal app.
diff --git a/package.json b/package.json
index 8fe329d44..13a9ab247 100644
--- a/package.json
+++ b/package.json
@@ -873,26 +873,26 @@
                         "description": "Model name selected for inference."
                     },
                     "codex-editor-extension.asrEndpoint": {
-                        "title": "ASR WebSocket Endpoint",
+                        "title": "ASR Endpoint",
                         "type": "string",
-                        "default": "wss://ryderwishart--asr-websocket-transcription-fastapi-asgi.modal.run/ws/transcribe",
-                        "description": "WebSocket endpoint for audio transcription. When authenticated with Frontier, the auth server endpoint is automatically used. This setting is used as fallback when not authenticated or for local development."
+                        "default": "https://genesis-ai-dev--codex-asr-serve.modal.run/transcribe",
+                        "description": "HTTPS endpoint for audio transcription (POST multipart with optional ?lang= query). When authenticated with Frontier, the auth server endpoint is automatically used; this setting is the fallback for unauthenticated / local development."
                     },
                     "codex-editor-extension.asrProvider": {
                         "title": "ASR Provider",
                         "type": "string",
                         "enum": [
-                            "mms",
+                            "omniasr",
                             "whisper"
                         ],
-                        "default": "mms",
-                        "description": "Provider for transcription. MMS requires a language code; Whisper auto-detects."
+                        "default": "omniasr",
+                        "description": "Provider for transcription. OmniASR accepts an optional {iso639_3}_{Script} language hint; Whisper auto-detects."
                     },
                     "codex-editor-extension.asrModel": {
                         "title": "ASR Model",
                         "type": "string",
-                        "default": "facebook/mms-1b-all",
-                        "description": "Model identifier to use for transcription (e.g., facebook/mms-1b-all)."
+                        "default": "omniASR_LLM_1B_v2",
+                        "description": "Model identifier used by the ASR service (e.g., omniASR_LLM_1B_v2)."
                     },
                     "codex-editor-extension.asrLanguage": {
                         "title": "ASR Language (ISO-639-3)",

From a7664fc6fd21963296f63dd521e48005a301e451 Mon Sep 17 00:00:00 2001
From: Luke-Bilhorn <luke.bilhorn@my.wheaton.edu>
Date: Thu, 4 Jun 2026 22:15:23 -0500
Subject: [PATCH 09/12] Add MMS-LID auto-detect to codex-asr Modal endpoint
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When the client omits ?lang= we now run facebook/mms-lid-2048 first
to detect the ISO 639-3 base, pair it with the default script for
that base, and feed the resolved {iso639_3}_{Script} code into
OmniASR. The same code is echoed back in the response so the client
can render a real "detected language" badge.

LID adds ~70-130 ms when warm, ~12 s on the first call after a
cold-start. If LID fails for any reason (silence, unrecognised
language, base not in OmniASR's set) we fall through to
unconditioned transcription and omit `lang` in the response.

The script-default table is a Python mirror of
sharedUtils/omniAsrDefaultScripts.ts — keep both in sync.
---
 docs/asr-proxy-endpoint.md  |  25 +++--
 docs/asr/codex_asr_modal.py | 211 ++++++++++++++++++++++++++++++++----
 2 files changed, 208 insertions(+), 28 deletions(-)

diff --git a/docs/asr-proxy-endpoint.md b/docs/asr-proxy-endpoint.md
index 11c240c12..e05215410 100644
--- a/docs/asr-proxy-endpoint.md
+++ b/docs/asr-proxy-endpoint.md
@@ -48,12 +48,13 @@ Authorization: Bearer <token>  (optional if token in query)
 
 - `source` (required): `"codex"` or `"langquest"` — for logging.
 - `token` (optional): JWT, if not in the Authorization header.
-- `lang` (**optional, new**): OmniASR language code in
+- `lang` (**optional**): OmniASR language code in
   `{iso639_3}_{Script}` form (e.g. `swh_Latn`, `urd_Arab`, `cmn_Hans`).
-  Forward this directly to OmniASR. **Omit** it to let OmniASR
-  transcribe without language conditioning. The full list of accepted
-  codes is bundled with the client in
-  `sharedUtils/omniAsrSupportedLangs.ts` (and is the live response of
+  Forward this directly to OmniASR. **Omit** it to engage the upstream's
+  built-in language ID — `codex-asr` runs MMS-LID first and feeds the
+  detected code into OmniASR (the resolved code is then included in the
+  response). The full list of accepted codes is bundled with the client
+  in `sharedUtils/omniAsrSupportedLangs.ts` (and is the live response of
   OmniASR's `GET /languages`).
 
 ### Request Body
@@ -89,10 +90,16 @@ curl -X POST "https://auth.frontier.example/api/v1/asr/transcribe?source=codex&t
 }
 ```
 
-The `lang` field is **echoed only when the request supplied one**. In
-auto-detect mode (no `lang` on the request) OmniASR omits the field and the
-proxy should do the same. The client renders an "Auto Detect" badge in that
-case (it does not lie about what language was actually used).
+The `lang` field reflects what was **actually used** for transcription:
+- Request supplied `lang` → echoed verbatim.
+- Request omitted `lang` → upstream ran MMS-LID and the resolved
+  `{iso639_3}_{Script}` code is returned here. If LID failed (silence,
+  unrecognised language, …) the field is omitted and the response also
+  includes `lid_s` so callers can tell auto-detect actually ran. The
+  client renders an "Auto Detect" badge in that case.
+
+Auto-detect responses include an additional `"lid_s": <float>` field
+with the LID inference time (useful for monitoring).
 
 The client also accepts a legacy field name `language` in place of `lang`
 (this was the Frontier proxy's earlier convention) — either works. Prefer
diff --git a/docs/asr/codex_asr_modal.py b/docs/asr/codex_asr_modal.py
index 05ec75696..0ba08f52f 100644
--- a/docs/asr/codex_asr_modal.py
+++ b/docs/asr/codex_asr_modal.py
@@ -28,12 +28,18 @@
 
 Auto-detect language ID
 ~~~~~~~~~~~~~~~~~~~~~~~
-OmniASR LLM models don't have built-in LID — without a `lang`
-parameter they generate without conditioning and the response has no
-"detected language" field. Adding a separate LID model (e.g.
-`facebook/mms-lid-2048`) is a planned follow-up. For now, auto-detect
-mode returns no `lang` and the client renders an honest "Auto Detect"
-badge.
+OmniASR LLM models don't have built-in LID. When the client omits
+`lang` we run **Meta MMS-LID 2048** as a first pass to detect the
+ISO 639-3 base, then pair it with a default script (see
+`_default_script_table()`) to produce an OmniASR-compatible
+`{iso639_3}_{Script}` code that's fed to the OmniASR transcribe call.
+The resolved code is echoed back in the response so the client can
+render a real "detected language" badge.
+
+If LID fails (silence, gibberish, language not in MMS-LID's 2048-set,
+or the detected base has no OmniASR mapping), we fall through to
+unconditioned transcription and omit `lang` in the response so the
+client renders an honest "Auto Detect" badge.
 
 Deploy / Dev
 ~~~~~~~~~~~~
@@ -66,11 +72,19 @@
 MODEL_CARD = "omniASR_LLM_1B_v2"
 MODEL_CACHE_DIR = "/root/model_cache"
 
+# MMS-LID variant for auto-detect mode. 2048 languages — all MMS-LID models
+# share the same wav2vec2 backbone (~960M params), so picking a larger
+# classification head doesn't meaningfully change cold-start memory.
+# Outputs ISO 639-3 codes which we pair with our default-script table.
+LID_MODEL_ID = "facebook/mms-lid-2048"
+HF_CACHE_DIR = "/root/hf_cache"
+
 
 def download_model():
     """Download model weights during image build (runs with GPU so fairseq2 can verify)."""
     import os
     os.environ["FAIRSEQ2_CACHE_DIR"] = MODEL_CACHE_DIR
+    os.environ["HF_HOME"] = HF_CACHE_DIR
 
     from omnilingual_asr.models.inference.pipeline import ASRInferencePipeline
 
@@ -79,6 +93,12 @@ def download_model():
     print("Model downloaded and verified OK")
     del pipeline
 
+    print(f"Downloading {LID_MODEL_ID}...")
+    from transformers import AutoFeatureExtractor, Wav2Vec2ForSequenceClassification
+    AutoFeatureExtractor.from_pretrained(LID_MODEL_ID)
+    Wav2Vec2ForSequenceClassification.from_pretrained(LID_MODEL_ID)
+    print("MMS-LID downloaded OK")
+
 
 # Build the image with model weights baked in.
 # The run_function step uses a T4 GPU so fairseq2 can fully verify the
@@ -100,6 +120,8 @@ def download_model():
         "torch==2.8.0",
         "torchaudio==2.8.0",
         "omnilingual-asr==0.2.0",
+        "transformers>=4.46,<5",
+        "huggingface_hub",
         "fastapi",
         "uvicorn",
         "python-multipart",
@@ -107,11 +129,42 @@ def download_model():
         "numpy",
         extra_index_url="https://download.pytorch.org/whl/cu128",
     )
-    .env({"FAIRSEQ2_CACHE_DIR": MODEL_CACHE_DIR})
+    .env({"FAIRSEQ2_CACHE_DIR": MODEL_CACHE_DIR, "HF_HOME": HF_CACHE_DIR})
     .run_function(download_model, gpu="T4")
 )
 
 _pipeline = None
+_lid_model = None
+_lid_feature_extractor = None
+_default_script_for_base: dict[str, str] | None = None
+
+# Hand-curated default script for the multi-script bases OmniASR serves.
+# **Mirror of `sharedUtils/omniAsrDefaultScripts.ts`** — keep both in sync
+# when adding entries (the client uses this for project-language → OmniASR
+# code resolution; the server uses it after MMS-LID returns a bare ISO
+# 639-3 base). Picked from Unicode CLDR likelySubtags cross-checked
+# against modern majority usage.
+_MULTI_SCRIPT_DEFAULTS: dict[str, str] = {
+    "aze": "Latn",  # Azerbaijani — Latin in modern standard
+    "bcc": "Arab",  # Southern Balochi
+    "cmn": "Hans",  # Mandarin — Simplified default
+    "cmo": "Khmr",  # Central Mnong — Khmer-script orthography
+    "crk": "Cans",  # Plains Cree — Canadian Aboriginal Syllabics
+    "ell": "Grek",  # Greek
+    "gag": "Latn",  # Gagauz — modern Latin orthography
+    "kmr": "Latn",  # Northern Kurdish — Latin (Hawar)
+    "lld": "Latn",  # Ladin
+    "ojb": "Latn",  # Northwestern Ojibwa
+    "rif": "Latn",  # Tarifit Berber
+    "rmc": "Latn",  # Carpathian Romani
+    "rmy": "Latn",  # Vlax Romani
+    "tuk": "Latn",  # Turkmen — modern Latin
+    "uig": "Arab",  # Uyghur — Arabic-script
+    "urd": "Arab",  # Urdu — Nastaliq
+    "uzb": "Latn",  # Uzbek — modern Latin
+    "wal": "Ethi",  # Wolaytta — Ethiopic
+    "yue": "Hant",  # Cantonese — Traditional
+}
 
 
 def _ensure_gang_context() -> None:
@@ -153,6 +206,107 @@ def get_pipeline():
     return _pipeline
 
 
+def _default_script_table() -> dict[str, str]:
+    """
+    Build (and cache) the base → default script lookup used by LID resolution.
+
+    Layered on top of `_MULTI_SCRIPT_DEFAULTS`:
+      - Single-script bases get their sole script automatically.
+      - Multi-script bases use their hand-curated entry when that script is
+        supported; otherwise Latin (when supported), else alphabetical first.
+    """
+    global _default_script_for_base
+    if _default_script_for_base is not None:
+        return _default_script_for_base
+
+    from omnilingual_asr.models.wav2vec2_llama.lang_ids import supported_langs
+    # Group scripts by base; assumes every code has the `{base}_{Script}` shape.
+    scripts_per_base: dict[str, list[str]] = {}
+    for code in supported_langs:
+        base, script = code.split("_", 1)
+        scripts_per_base.setdefault(base, []).append(script)
+    # Choose exactly one script per base, in the priority order described above.
+    table: dict[str, str] = {}
+    for base, scripts in scripts_per_base.items():
+        if len(scripts) == 1:
+            table[base] = scripts[0]
+        elif base in _MULTI_SCRIPT_DEFAULTS and _MULTI_SCRIPT_DEFAULTS[base] in scripts:
+            table[base] = _MULTI_SCRIPT_DEFAULTS[base]
+        elif "Latn" in scripts:
+            table[base] = "Latn"
+        else:
+            table[base] = sorted(scripts)[0]
+
+    _default_script_for_base = table
+    return table
+
+
+def get_lid():
+    """Load once (then reuse) the MMS-LID model + feature extractor from the baked-in HF cache."""
+    global _lid_model, _lid_feature_extractor
+    if _lid_model is None or _lid_feature_extractor is None:
+        import os
+        # huggingface_hub resolves HF_HOME at import time, so export it first.
+        os.environ["HF_HOME"] = HF_CACHE_DIR
+        import torch
+        from transformers import AutoFeatureExtractor, Wav2Vec2ForSequenceClassification
+        print(f"Loading {LID_MODEL_ID} from image cache...")
+        _lid_feature_extractor = AutoFeatureExtractor.from_pretrained(LID_MODEL_ID)
+        _lid_model = Wav2Vec2ForSequenceClassification.from_pretrained(LID_MODEL_ID)
+        if torch.cuda.is_available():
+            _lid_model = _lid_model.to("cuda")
+        _lid_model.eval()
+        print("MMS-LID ready")
+    return _lid_model, _lid_feature_extractor
+
+
+def detect_omniasr_code(waveform_16k) -> str | None:
+    """
+    Run MMS-LID on a 16-kHz mono waveform and return an OmniASR-compatible
+    `{iso639_3}_{Script}` code, or `None` if the detected base can't be
+    mapped into OmniASR's supported set.
+
+    Strategy: take MMS-LID's top-1 label (no confidence threshold) as the
+    ISO 639-3 base and pair it with that base's default script
+    (`_default_script_table()`). If the base isn't served by OmniASR at
+    all, return None so the caller can transcribe without conditioning.
+    """
+    import torch
+    import numpy as np
+
+    model, fx = get_lid()
+    # Cap LID input at 30 s — speech models don't benefit from longer
+    # context for identification and shorter input is much faster.
+    max_lid_samples = 30 * 16000
+    snippet = waveform_16k[:max_lid_samples].astype(np.float32, copy=False)
+
+    inputs = fx(snippet, sampling_rate=16000, return_tensors="pt")
+    device = next(model.parameters()).device
+    input_values = inputs.input_values.to(device)
+
+    with torch.inference_mode():
+        logits = model(input_values).logits
+    # Top-1 class index → label; id2label may be a dict or a plain sequence.
+    predicted_id = int(torch.argmax(logits, dim=-1).item())
+    label = model.config.id2label.get(predicted_id) if hasattr(model.config.id2label, "get") else model.config.id2label[predicted_id]
+    if not label:
+        return None
+    # MMS-LID labels are ISO 639-3 codes (e.g. "eng", "swh"). Be lenient
+    # about case/whitespace just in case.
+    base = label.strip().lower()
+    if len(base) != 3:
+        print(f"LID returned non-ISO-639-3 label {label!r}; skipping")
+        return None
+
+    table = _default_script_table()
+    script = table.get(base)
+    if not script:
+        # Detected language isn't in OmniASR's supported set — give up and
+        # let the caller transcribe without conditioning.
+        return None
+    return f"{base}_{script}"
+
+
 def transcribe_audio(audio_bytes: bytes, mime_type: str = "audio/wav", lang: str | None = None) -> dict:
     """
     Transcribe audio bytes → text using OmniASR LLM 1B v2.
@@ -161,12 +315,14 @@ def transcribe_audio(audio_bytes: bytes, mime_type: str = "audio/wav", lang: str
         audio_bytes: Raw audio file bytes.
         mime_type: MIME type for format detection.
         lang: Optional OmniASR language code (e.g. "eng_Latn", "urd_Arab").
-              If None, the model runs without language conditioning. The model
-              does NOT do internal LID, so the response will not contain a
-              `lang` field when this is None.
+              When provided we trust it and skip LID. When `None` we run
+              MMS-LID first to pick a code, then transcribe with it.
 
     Returns:
-        dict with text, duration_s, inference_s, and lang (only when one was provided).
+        dict with text, duration_s, inference_s, and `lang` (the code we
+        ended up using — either the caller-supplied one or the LID-detected
+        one). `lang` is omitted only when LID failed and we transcribed
+        without conditioning.
     """
     import soundfile as sf
     import numpy as np
@@ -208,6 +364,20 @@ def transcribe_audio(audio_bytes: bytes, mime_type: str = "audio/wav", lang: str
             waveform = waveform.mean(axis=-1)
         duration = len(waveform) / sr
 
+        # --- Language ID (auto-detect mode only) ---
+        # If the caller supplied `lang` we trust it. Otherwise we run
+        # MMS-LID on the (already 16-kHz mono) waveform.
+        lid_time = 0.0
+        resolved_lang = lang
+        if resolved_lang is None:
+            lid_start = time.perf_counter()
+            try:
+                resolved_lang = detect_omniasr_code(waveform)
+            except Exception as e:
+                print(f"LID failed: {e}; falling back to unconditioned transcription")
+                resolved_lang = None
+            lid_time = time.perf_counter() - lid_start
+
         # --- Chunk if > 40s (model limitation) ---
         max_samples = 40 * sr  # 40 seconds
         if len(waveform) > max_samples:
@@ -224,7 +394,7 @@ def transcribe_audio(audio_bytes: bytes, mime_type: str = "audio/wav", lang: str
         ]
 
         # Build lang list to match (one per chunk), or None
-        lang_list = [lang] * len(audio_inputs) if lang else None
+        lang_list = [resolved_lang] * len(audio_inputs) if resolved_lang else None
 
         # --- Transcribe ---
         start_t = time.perf_counter()
@@ -243,11 +413,13 @@ def transcribe_audio(audio_bytes: bytes, mime_type: str = "audio/wav", lang: str
             "duration_s": round(duration, 2),
             "inference_s": round(inference_time, 3),
         }
-        # Echo the lang we used so the client can render the badge. In auto-detect
-        # mode (lang is None) we have no detected language to report — omit the
-        # field and let the client render "Auto Detect" honestly.
-        if lang:
-            resp["lang"] = lang
+        if lid_time:
+            resp["lid_s"] = round(lid_time, 3)
+        # Echo the lang we actually used (caller-supplied or LID-resolved)
+        # so the client can render an honest badge. If LID failed and we
+        # transcribed without conditioning, omit the field entirely.
+        if resolved_lang:
+            resp["lang"] = resolved_lang
 
         return resp
 
@@ -284,8 +456,9 @@ def root():
         return {
             "service": "codex-asr",
             "model": MODEL_CARD,
+            "lid_model": LID_MODEL_ID,
             "languages": "1600+",
-            "note": "Pass ?lang={iso639_3}_{Script} (e.g. eng_Latn) for best accuracy. Omit for autodetect (no LID, lower accuracy).",
+            "note": "Pass ?lang={iso639_3}_{Script} (e.g. eng_Latn) to skip LID. Omit to run MMS-LID first and use the detected language for transcription.",
         }
 
     @web_app.get("/health")
@@ -303,7 +476,7 @@ async def transcribe_endpoint(
         file: UploadFile = File(...),
         lang: str | None = Query(
             default=None,
-            description="OmniASR language code in {iso639_3}_{Script} form, e.g. eng_Latn, urd_Arab, spa_Latn. Omit to let the model transcribe without language conditioning.",
+            description="OmniASR language code in {iso639_3}_{Script} form, e.g. eng_Latn, urd_Arab, spa_Latn. Omit to run MMS-LID first and use the detected language for transcription.",
         ),
     ):
         # Validate language code if provided

From 1dc8745c9937ee03b15b178ad09c740f7b509a13 Mon Sep 17 00:00:00 2001
From: Luke-Bilhorn <luke.bilhorn@my.wheaton.edu>
Date: Thu, 4 Jun 2026 22:15:33 -0500
Subject: [PATCH 10/12] Re-enable transcription badge, rename script options
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Now that the Modal endpoint runs MMS-LID in auto-detect mode, the
language badge on transcribed text is informative again (it reflects
the server's resolved code, which is the LID result in auto-detect
mode and the user-supplied code in project mode).

Relabel the Script dropdown to use plainer language:
- "Best guess (default)"  → "Default"
- "Custom (ISO 15924 tag)" → "Other (ISO 15924 tag)"
---
 .../AudioWaveformWithTranscription.tsx        | 256 ++++++++++--------
 1 file changed, 146 insertions(+), 110 deletions(-)

diff --git a/webviews/codex-webviews/src/CodexCellEditor/AudioWaveformWithTranscription.tsx b/webviews/codex-webviews/src/CodexCellEditor/AudioWaveformWithTranscription.tsx
index a546b8fe9..3a529caa8 100644
--- a/webviews/codex-webviews/src/CodexCellEditor/AudioWaveformWithTranscription.tsx
+++ b/webviews/codex-webviews/src/CodexCellEditor/AudioWaveformWithTranscription.tsx
@@ -93,14 +93,26 @@ const AudioWaveformWithTranscription: React.FC<AudioWaveformWithTranscriptionPro
 
     // The Script picker offers three "preset" choices plus a free-form 4-letter input for
     // power users (e.g. someone wants `swa_Cyrl` even though the resolver would never pick
-    // it). We surface "Custom" only when the current value isn't one of the presets.
-    const isPresetScript = asrScriptPref === "auto" || asrScriptPref === "latin";
+    // it). We track the *dropdown* selection separately from the committed `asrScriptPref`
+    // so picking "Custom" reveals the input even before a valid tag has been entered.
+    type ScriptOption = "auto" | "latin" | "custom";
+    const optionFromPref = (pref: string): ScriptOption =>
+        pref === "auto" ? "auto" : pref === "latin" ? "latin" : "custom";
+    const [scriptSelection, setScriptSelection] = useState<ScriptOption>(
+        optionFromPref(asrScriptPref)
+    );
     const [scriptCustomDraft, setScriptCustomDraft] = useState<string>(
-        isPresetScript ? "" : asrScriptPref
+        optionFromPref(asrScriptPref) === "custom" ? asrScriptPref : ""
     );
     useEffect(() => {
-        if (!isPresetScript) setScriptCustomDraft(asrScriptPref);
-    }, [asrScriptPref, isPresetScript]);
+        const next = optionFromPref(asrScriptPref);
+        setScriptSelection(next);
+        if (next === "custom") setScriptCustomDraft(asrScriptPref);
+    }, [asrScriptPref]);
+    const commitCustomScript = () => {
+        const candidate = scriptCustomDraft.trim();
+        if (/^[A-Za-z]{4}$/.test(candidate)) onChangeAsrScriptPref?.(candidate);
+    };
 
     // Prefer the provided URL (can be blob: or data:). Fall back to creating an object URL from the blob.
     useEffect(() => {
@@ -270,119 +282,143 @@ const AudioWaveformWithTranscription: React.FC<AudioWaveformWithTranscriptionPro
 
             {/* Action buttons at bottom */}
             <div className="flex flex-wrap items-center justify-center gap-2 px-2">
-                {/* Transcribe / Re-transcribe button — always visible (mirrors Re-record),
-                    grey-out while a transcription is in flight. The label flips to
-                    "Re-transcribe" once we have a saved transcription so the user can
-                    re-run with different ASR settings (e.g. flip to auto-detect). */}
-                <Button
-                    onClick={onTranscribe}
-                    disabled={disabled || isTranscribing || (!audioUrl && !audioBlob)}
-                    variant="outline"
-                    className="h-8 px-2 text-xs text-[var(--vscode-button-background)] border-[var(--vscode-button-background)]/20 hover:bg-[var(--vscode-button-background)]/10"
-                    title={transcription ? "Re-transcribe audio with current settings" : "Transcribe Audio"}
-                >
-                    <MessageCircle className="h-3 w-3" />
-                    <span className="ml-1">{transcription ? "Re-transcribe" : "Transcribe"}</span>
-                </Button>
-                {/* Gear menu — Language (auto-detect vs project) + Script (auto/Latin/custom).
-                    Hidden on source-text editors where transcription policy isn't user-driven. */}
-                {showAdvancedAsrMenu && (
-                    <Popover>
-                        <PopoverTrigger asChild>
+                {/* Transcribe / Re-transcribe split-button. The gear is glued to the right
+                    edge of the main button (shared border, no gap) so it visually belongs
+                    to the transcribe control. The label flips to "Re-transcribe" once a
+                    saved transcription exists so the user can re-run with different ASR
+                    settings (e.g. flip to auto-detect). Grey-out the whole group while a
+                    transcription is in flight. */}
+                {(() => {
+                    const sharedBtnClass =
+                        "h-8 text-xs text-[var(--vscode-button-background)] border-[var(--vscode-button-background)]/20 hover:bg-[var(--vscode-button-background)]/10";
+                    const transcribeDisabled =
+                        disabled || isTranscribing || (!audioUrl && !audioBlob);
+                    return (
+                        <div className="inline-flex items-stretch">
                             <Button
+                                onClick={onTranscribe}
+                                disabled={transcribeDisabled}
                                 variant="outline"
-                                size="sm"
-                                className="h-8 w-8 p-0 text-xs"
-                                title="Advanced ASR settings (Language / Script)"
-                                aria-label="Advanced ASR settings"
-                                disabled={isTranscribing}
+                                className={`${sharedBtnClass} px-2 ${
+                                    showAdvancedAsrMenu ? "rounded-r-none border-r-0" : ""
+                                }`}
+                                title={
+                                    transcription
+                                        ? "Re-transcribe audio with current settings"
+                                        : "Transcribe Audio"
+                                }
                             >
-                                <SettingsIcon className="h-3 w-3" />
+                                <MessageCircle className="h-3 w-3" />
+                                <span className="ml-1">
+                                    {transcription ? "Re-transcribe" : "Transcribe"}
+                                </span>
                             </Button>
-                        </PopoverTrigger>
-                        <PopoverContent className="w-72 space-y-3" align="start">
-                            <div className="space-y-1">
-                                <div className="text-xs font-semibold">Language</div>
-                                <Select
-                                    value={asrLanguageMode}
-                                    onValueChange={(v) =>
-                                        onChangeAsrLanguageMode?.(v === "auto" ? "auto" : "project")
-                                    }
-                                >
-                                    <SelectTrigger className="h-7 text-xs">
-                                        <SelectValue />
-                                    </SelectTrigger>
-                                    <SelectContent>
-                                        <SelectItem value="project">
-                                            {projectLanguageName ? `Project (${projectLanguageName})` : "Project language"}
-                                        </SelectItem>
-                                        <SelectItem value="auto">Auto-detect</SelectItem>
-                                    </SelectContent>
-                                </Select>
-                                <p className="text-[10px] text-muted-foreground leading-snug">
-                                    "Project" sends the language code to OmniASR for better accuracy.
-                                    "Auto-detect" omits it — OmniASR transcribes without language conditioning.
-                                </p>
-                            </div>
-                            <div className="space-y-1">
-                                <div className="text-xs font-semibold">Script</div>
-                                <Select
-                                    value={isPresetScript ? asrScriptPref : "custom"}
-                                    onValueChange={(v) => {
-                                        if (v === "auto" || v === "latin") {
-                                            onChangeAsrScriptPref?.(v);
-                                        } else {
-                                            // "custom" — keep whatever 4-letter tag is in the input,
-                                            // or fall back to "auto" if the input is empty/invalid.
-                                            const candidate = scriptCustomDraft.trim();
-                                            const isValid = /^[A-Za-z]{4}$/.test(candidate);
-                                            onChangeAsrScriptPref?.(isValid ? candidate : "auto");
-                                        }
-                                    }}
-                                >
-                                    <SelectTrigger className="h-7 text-xs">
-                                        <SelectValue />
-                                    </SelectTrigger>
-                                    <SelectContent>
-                                        <SelectItem value="auto">Best guess (default)</SelectItem>
-                                        <SelectItem value="latin">Latin (where supported)</SelectItem>
-                                        <SelectItem value="custom">Custom (ISO 15924 tag)</SelectItem>
-                                    </SelectContent>
-                                </Select>
-                                {/* Free-form 4-letter input shown only when "Custom" is selected.
-                                    Validation happens on commit so users can type. */}
-                                {!isPresetScript ? (
-                                    <div className="flex items-center gap-1">
-                                        <Input
-                                            value={scriptCustomDraft}
-                                            onChange={(e) => setScriptCustomDraft(e.target.value)}
-                                            placeholder="e.g. Arab, Cyrl, Hans"
-                                            maxLength={4}
-                                            className="h-7 text-xs"
-                                        />
+                            {showAdvancedAsrMenu && (
+                                <Popover>
+                                    <PopoverTrigger asChild>
                                         <Button
-                                            size="sm"
+                                            type="button"
                                             variant="outline"
-                                            className="h-7 px-2 text-xs"
-                                            onClick={() => {
-                                                const candidate = scriptCustomDraft.trim();
-                                                if (/^[A-Za-z]{4}$/.test(candidate)) {
-                                                    onChangeAsrScriptPref?.(candidate);
-                                                }
-                                            }}
+                                            disabled={isTranscribing}
+                                            className={`${sharedBtnClass} px-1.5 rounded-l-none`}
+                                            title="Advanced ASR settings (Language / Script)"
+                                            aria-label="Advanced ASR settings"
                                         >
-                                            Apply
+                                            <SettingsIcon className="h-3 w-3 opacity-70" />
                                         </Button>
-                                    </div>
-                                ) : null}
-                                <p className="text-[10px] text-muted-foreground leading-snug">
-                                    Script subtag paired with the language. Best guess covers Urdu→Arabic,
-                                    Mandarin→Simplified, Cantonese→Traditional, etc.
-                                </p>
-                            </div>
-                        </PopoverContent>
-                    </Popover>
-                )}
+                                    </PopoverTrigger>
+                                    <PopoverContent className="w-64 space-y-3" align="end">
+                                        <div className="space-y-1">
+                                            <div className="text-xs font-semibold">Language</div>
+                                            <Select
+                                                value={asrLanguageMode}
+                                                onValueChange={(v) =>
+                                                    onChangeAsrLanguageMode?.(
+                                                        v === "auto" ? "auto" : "project"
+                                                    )
+                                                }
+                                            >
+                                                <SelectTrigger className="h-7 text-xs">
+                                                    <SelectValue />
+                                                </SelectTrigger>
+                                                <SelectContent>
+                                                    <SelectItem value="project">
+                                                        {projectLanguageName
+                                                            ? `Project (${projectLanguageName})`
+                                                            : "Project language"}
+                                                    </SelectItem>
+                                                    <SelectItem value="auto">
+                                                        Auto-detect
+                                                    </SelectItem>
+                                                </SelectContent>
+                                            </Select>
+                                        </div>
+                                        <div className="space-y-1">
+                                            <div className="text-xs font-semibold">Script</div>
+                                            <Select
+                                                value={scriptSelection}
+                                                onValueChange={(v) => {
+                                                    const next = v as ScriptOption;
+                                                    setScriptSelection(next);
+                                                    if (next === "auto" || next === "latin") {
+                                                        onChangeAsrScriptPref?.(next);
+                                                    }
+                                                }}
+                                            >
+                                                <SelectTrigger className="h-7 text-xs">
+                                                    <SelectValue />
+                                                </SelectTrigger>
+                                                <SelectContent>
+                                                    <SelectItem value="auto">
+                                                        Default
+                                                    </SelectItem>
+                                                    <SelectItem value="latin">
+                                                        Latin (where supported)
+                                                    </SelectItem>
+                                                    <SelectItem value="custom">
+                                                        Other (ISO 15924 tag)
+                                                    </SelectItem>
+                                                </SelectContent>
+                                            </Select>
+                                            {scriptSelection === "custom" && (
+                                                <div className="flex items-center gap-1">
+                                                    <Input
+                                                        value={scriptCustomDraft}
+                                                        onChange={(e) =>
+                                                            setScriptCustomDraft(e.target.value)
+                                                        }
+                                                        onKeyDown={(e) => {
+                                                            if (e.key === "Enter") {
+                                                                e.preventDefault();
+                                                                commitCustomScript();
+                                                            }
+                                                        }}
+                                                        placeholder="e.g. Arab, Cyrl, Hans"
+                                                        maxLength={4}
+                                                        className="h-7 text-xs"
+                                                    />
+                                                    <Button
+                                                        size="sm"
+                                                        variant="outline"
+                                                        className="h-7 px-2 text-xs"
+                                                        disabled={
+                                                            !/^[A-Za-z]{4}$/.test(
+                                                                scriptCustomDraft.trim()
+                                                            )
+                                                        }
+                                                        onClick={commitCustomScript}
+                                                    >
+                                                        Apply
+                                                    </Button>
+                                                </div>
+                                            )}
+                                        </div>
+                                    </PopoverContent>
+                                </Popover>
+                            )}
+                        </div>
+                    );
+                })()}
                 <Button
                     variant="outline"
                     size="sm"

From 2760a8b21d11f10175804ac7bc12f75f61f54c88 Mon Sep 17 00:00:00 2001
From: Luke-Bilhorn <luke.bilhorn@my.wheaton.edu>
Date: Thu, 4 Jun 2026 23:00:07 -0500
Subject: [PATCH 11/12] Route around legacy auth-proxy ASR upstream; fix
 auto-mode badge
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two compounding issues made auto-detect render the project language
("Arabic") even when the user spoke clear English:

1. Badge labeller bug: in auto-detect mode we still passed
   `projectLanguageName` to `labelForTranscriptionLanguage`, so when
   the server didn't echo a `lang` (e.g. the legacy endpoint without
   LID) the labeller fell through to its project-name last-ditch
   fallback. Now we pass null for both `sentLang` and `projectName`
   in auto mode, so the only label source is the server's echo, and
   the explicit "Auto Detect" branch handles the missing-echo case.

2. The Frontier auth proxy still points its ASR upstream at the
   legacy `mms-zeroshot-asr` Modal app, which doesn't run LID. When
   the proxy hands us that URL we now detect the legacy host and
   fall back to the configured `asrEndpoint` (defaulted to
   `codex-asr`, which does run LID). The bypass becomes a no-op once
   the proxy migrates its upstream — see
   docs/AUTH_SERVER_ASR_IMPLEMENTATION.md.
---
 .../codexCellEditorMessagehandling.ts         | 24 ++++++++++++--
 .../src/CodexCellEditor/TextCellEditor.tsx    | 32 +++++++++++++------
 2 files changed, 44 insertions(+), 12 deletions(-)

diff --git a/src/providers/codexCellEditorProvider/codexCellEditorMessagehandling.ts b/src/providers/codexCellEditorProvider/codexCellEditorMessagehandling.ts
index bb8d18069..2d716fa18 100644
--- a/src/providers/codexCellEditorProvider/codexCellEditorMessagehandling.ts
+++ b/src/providers/codexCellEditorProvider/codexCellEditorMessagehandling.ts
@@ -505,7 +505,21 @@ const messageHandlers: Record<string, (ctx: MessageHandlerContext) => Promise<vo
 
             let authToken: string | undefined;
 
-            // Try to get authenticated endpoint from FrontierAPI
+            // Try to get authenticated endpoint from FrontierAPI.
+            //
+            // TRANSITIONAL: the Frontier auth-proxy currently points its ASR
+            // upstream at the legacy `mms-zeroshot-asr` Modal app, which does
+            // **not** run LID and so cannot return a `lang` for auto-detect
+            // mode. While that's true we ignore the proxy's URL and fall back
+            // to the configured `asrEndpoint` (default: `codex-asr`, which
+            // does LID). Once the auth-proxy migrates its upstream to
+            // `codex-asr` (see docs/AUTH_SERVER_ASR_IMPLEMENTATION.md) this
+            // bypass becomes a no-op and can be removed.
+            //
+            // We still pull the JWT (when present) so the configured endpoint
+            // can validate it if it ever moves behind auth — the direct
+            // `codex-asr` Modal app simply ignores Authorization headers.
+            const LEGACY_ASR_HOST_PATTERN = /mms-?zeroshot-?asr/i;
             try {
                 const frontierApi = getAuthApi();
                 if (frontierApi) {
@@ -516,7 +530,13 @@ const messageHandlers: Record<string, (ctx: MessageHandlerContext) => Promise<vo
                         if (asrEndpoint && asrEndpoint.trim()) {
                             try {
                                 new URL(asrEndpoint);
-                                endpoint = asrEndpoint;
+                                if (LEGACY_ASR_HOST_PATTERN.test(asrEndpoint)) {
+                                    debug(
+                                        `[getAsrConfig] Auth proxy still points at legacy mms-zeroshot-asr (${asrEndpoint}); using configured endpoint instead so LID-enabled auto-detect works.`
+                                    );
+                                } else {
+                                    endpoint = asrEndpoint;
+                                }
                             } catch (urlError) {
                                 console.warn("Invalid ASR endpoint URL from auth API:", asrEndpoint, urlError);
                                 // Fall back to default endpoint
diff --git a/webviews/codex-webviews/src/CodexCellEditor/TextCellEditor.tsx b/webviews/codex-webviews/src/CodexCellEditor/TextCellEditor.tsx
index 7d167af24..e3f273ffa 100644
--- a/webviews/codex-webviews/src/CodexCellEditor/TextCellEditor.tsx
+++ b/webviews/codex-webviews/src/CodexCellEditor/TextCellEditor.tsx
@@ -512,24 +512,36 @@ const CellEditor: React.FC<CellEditorProps> = ({
     /**
      * Friendly label shown on the transcription badge.
      *
-     * Source priority (delegated to labelForTranscriptionLanguage):
-     *   1. Language the server echoed in the response (saved as `transcription.language`)
-     *   2. Language we *sent* — when in project mode the server uses it silently
-     *   3. Project target-language refName, as a last-ditch fallback
-     *   4. "Auto Detect" — only when in auto-detect mode and (1)/(2)/(3) all returned null
+     * Auto-detect mode:
+     *   - Server echoed `lang` (LID succeeded) → show that language's friendly name.
+     *   - Server returned no `lang` (LID failed, OR client is talking to a legacy
+     *     endpoint without LID like the old `mms-zeroshot-asr` Modal app) →
+     *     **"Auto Detect"**. We deliberately do NOT fall back to the project
+     *     language here — the whole point of auto-detect is that we're not
+     *     assuming it's the project language.
+     *
+     * Project mode:
+     *   - Server echoed `lang` → show that.
+     *   - Server didn't echo but we sent a code → show what we sent.
+     *   - Otherwise fall back to the project language refName.
      *
      * Returns null → render no badge.
      */
     const transcriptionBadgeLabel: string | null = useMemo(() => {
         if (!savedTranscription) return null;
+        const isAuto = asrConfig?.languageMode === "auto";
         const serverLang = savedTranscription.language ?? null;
-        const sentLang = asrConfig?.languageMode === "auto" ? null : asrConfig?.lang ?? null;
-        const projectName = asrConfig?.projectLanguageName ?? null;
+        // In auto mode neither sentLang nor projectName is a meaningful label —
+        // we didn't send a code and we don't know that the speaker used the
+        // project's language. Force the labeller down its server-echo path only.
+        const sentLang = isAuto ? null : asrConfig?.lang ?? null;
+        const projectName = isAuto ? null : asrConfig?.projectLanguageName ?? null;
         const friendly = labelForTranscriptionLanguage(serverLang, sentLang, projectName);
         if (friendly) return friendly;
-        // No language info at all → only honest answer is "Auto Detect" (and only when
-        // that's what the user picked; in project mode we'd rather render nothing than lie).
-        if (asrConfig?.languageMode === "auto") return "Auto Detect";
+        // No usable label and we're in auto mode → display "Auto Detect" rather
+        // than nothing, so the user can tell auto-detect ran but failed (or the
+        // endpoint they're hitting doesn't do server-side LID).
+        if (isAuto) return "Auto Detect";
         return null;
     }, [
         savedTranscription,

From 42b6ac81568fa71496c6562bcb470fbb3fd53681 Mon Sep 17 00:00:00 2001
From: Luke-Bilhorn <luke.bilhorn@my.wheaton.edu>
Date: Thu, 4 Jun 2026 23:32:41 -0500
Subject: [PATCH 12/12] Keep using Frontier auth-proxy ASR endpoint; hide
 language badge
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Revert the auth-proxy bypass added in 2760a8b2 — this PR should
behave like main and route ASR through the Frontier auth-proxy,
which still forwards to the legacy `mms-zeroshot-asr` Modal app.
Changing that routing is a separate decision that needs sign-off.

Since the legacy upstream doesn't run LID and never echoes `lang`,
the transcription language badge would either silently say
"Auto Detect" or fall back to the project language — neither of
which is honest. Hide the badge with a TODO comment pointing at
the auth-proxy migration; the `transcriptionLanguageLabel` prop
and all the plumbing through it stay wired so re-enabling is a
one-line change once the proxy upstream moves to `codex-asr`.

The `codex-asr` Modal app (with MMS-LID baked in) stays deployed
and ready — see docs/asr/codex_asr_modal.py and
docs/AUTH_SERVER_ASR_IMPLEMENTATION.md.
---
 .../codexCellEditorMessagehandling.ts         | 24 ++-----------------
 .../AudioWaveformWithTranscription.tsx        | 16 +++++++++----
 2 files changed, 13 insertions(+), 27 deletions(-)

diff --git a/src/providers/codexCellEditorProvider/codexCellEditorMessagehandling.ts b/src/providers/codexCellEditorProvider/codexCellEditorMessagehandling.ts
index 2d716fa18..bb8d18069 100644
--- a/src/providers/codexCellEditorProvider/codexCellEditorMessagehandling.ts
+++ b/src/providers/codexCellEditorProvider/codexCellEditorMessagehandling.ts
@@ -505,21 +505,7 @@ const messageHandlers: Record<string, (ctx: MessageHandlerContext) => Promise<vo
 
             let authToken: string | undefined;
 
-            // Try to get authenticated endpoint from FrontierAPI.
-            //
-            // TRANSITIONAL: the Frontier auth-proxy currently points its ASR
-            // upstream at the legacy `mms-zeroshot-asr` Modal app, which does
-            // **not** run LID and so cannot return a `lang` for auto-detect
-            // mode. While that's true we ignore the proxy's URL and fall back
-            // to the configured `asrEndpoint` (default: `codex-asr`, which
-            // does LID). Once the auth-proxy migrates its upstream to
-            // `codex-asr` (see docs/AUTH_SERVER_ASR_IMPLEMENTATION.md) this
-            // bypass becomes a no-op and can be removed.
-            //
-            // We still pull the JWT (when present) so the configured endpoint
-            // can validate it if it ever moves behind auth — the direct
-            // `codex-asr` Modal app simply ignores Authorization headers.
-            const LEGACY_ASR_HOST_PATTERN = /mms-?zeroshot-?asr/i;
+            // Try to get authenticated endpoint from FrontierAPI
             try {
                 const frontierApi = getAuthApi();
                 if (frontierApi) {
@@ -530,13 +516,7 @@ const messageHandlers: Record<string, (ctx: MessageHandlerContext) => Promise<vo
                         if (asrEndpoint && asrEndpoint.trim()) {
                             try {
                                 new URL(asrEndpoint);
-                                if (LEGACY_ASR_HOST_PATTERN.test(asrEndpoint)) {
-                                    debug(
-                                        `[getAsrConfig] Auth proxy still points at legacy mms-zeroshot-asr (${asrEndpoint}); using configured endpoint instead so LID-enabled auto-detect works.`
-                                    );
-                                } else {
-                                    endpoint = asrEndpoint;
-                                }
+                                endpoint = asrEndpoint;
                             } catch (urlError) {
                                 console.warn("Invalid ASR endpoint URL from auth API:", asrEndpoint, urlError);
                                 // Fall back to default endpoint
diff --git a/webviews/codex-webviews/src/CodexCellEditor/AudioWaveformWithTranscription.tsx b/webviews/codex-webviews/src/CodexCellEditor/AudioWaveformWithTranscription.tsx
index 3a529caa8..4c7052466 100644
--- a/webviews/codex-webviews/src/CodexCellEditor/AudioWaveformWithTranscription.tsx
+++ b/webviews/codex-webviews/src/CodexCellEditor/AudioWaveformWithTranscription.tsx
@@ -200,11 +200,17 @@ const AudioWaveformWithTranscription: React.FC<AudioWaveformWithTranscriptionPro
                             >
                                 {transcription.content}
                             </p>
-                            {transcriptionLanguageLabel && (
-                                <Badge variant="secondary" className="text-xs">
-                                    {transcriptionLanguageLabel}
-                                </Badge>
-                            )}
+                            {/* Language badge intentionally hidden in this PR.
+                                The new `codex-asr` Modal app DOES run MMS-LID and echo back a
+                                `lang` for auto-detect (and the plumbing all the way through
+                                `transcriptionLanguageLabel` is wired and ready), but this PR
+                                keeps the client pointed at the existing Frontier auth-proxy ASR
+                                endpoint, which still forwards to the legacy `mms-zeroshot-asr`
+                                Modal app — no LID, no `lang` echo. Showing the badge in that
+                                world means falling back to "Auto Detect" (or worse, the project
+                                language) instead of an honest detection, which is misleading.
+                                Re-enable this `<Badge>` once the auth-proxy upstream migrates
+                                to `codex-asr` (see docs/AUTH_SERVER_ASR_IMPLEMENTATION.md). */}
                         </div>
                         <Button
                             onClick={onInsertTranscription}