genesis-ai-dev · Luke-Bilhorn · Apr 16, 2026 · Apr 17, 2026 · Apr 17, 2026 · Apr 21, 2026
diff --git a/src/providers/NewSourceUploader/NewSourceUploaderProvider.ts b/src/providers/NewSourceUploader/NewSourceUploaderProvider.ts
@@ -129,10 +129,13 @@ export class NewSourceUploaderProvider implements vscode.CustomTextEditorProvide
                     const intentMatch = uriQuery.match(/intent=(source|target)/);
                     const initialIntent = intentMatch ? intentMatch[1] : undefined;
 
+                    const sourceLanguageTag = await this.getSourceLanguageTag();
+
                     webviewPanel.webview.postMessage({
                         command: "projectInventory",
                         inventory: inventory,
                         initialIntent,
+                        sourceLanguageTag,
                     });
                 } else if (message.command === "metadata.check") {
                     // Handle metadata check request
@@ -2000,6 +2003,39 @@ export class NewSourceUploaderProvider implements vscode.CustomTextEditorProvide
         return confirmOverwriteWithDetails(items);
     }
 
+    /**
+     * Reads the tag of the project's source language from `metadata.json` in the
+     * first workspace folder.
+     *
+     * Best-effort: resolves to `undefined` when no workspace folder is open, the
+     * file cannot be read or parsed, no language entry has
+     * `projectStatus === "source"`, or that entry's `tag` is not a string.
+     *
+     * @returns The source language tag, or `undefined` if it cannot be determined.
+     */
+    private async getSourceLanguageTag(): Promise<string | undefined> {
+        try {
+            const workspaceFolders = vscode.workspace.workspaceFolders;
+            if (!workspaceFolders?.length) return undefined;
+            const metadataUri = vscode.Uri.joinPath(workspaceFolders[0].uri, "metadata.json");
+            const raw = await vscode.workspace.fs.readFile(metadataUri);
+            // readFile is typed as Uint8Array; decode explicitly, because plain
+            // Uint8Array#toString() yields comma-separated byte values, not text.
+            const metadata = JSON.parse(new TextDecoder().decode(raw));
+            const languages: unknown = metadata?.languages;
+            if (!Array.isArray(languages)) return undefined;
+            const sourceLang = languages.find(
+                (l: { projectStatus?: string } | null | undefined) =>
+                    l?.projectStatus === "source"
+            );
+            const tag: unknown = sourceLang?.tag;
+            return typeof tag === "string" ? tag : undefined;
+        } catch {
+            // Deliberately best-effort: any read/parse failure means the tag is unknown.
+            return undefined;
+        }
+    }
+
     private async fetchProjectInventory(): Promise<{
         sourceFiles: Array<{
             name: string;

diff --git a/webviews/codex-webviews/src/NewSourceUploader/NewSourceUploader.tsx b/webviews/codex-webviews/src/NewSourceUploader/NewSourceUploader.tsx
@@ -291,6 +291,7 @@ const NewSourceUploader: React.FC = () => {
                     targetFiles: [],
                     translationPairs: [],
                 };
+                const sourceLanguageTag: string | undefined = message.sourceLanguageTag;
 
                 const initialIntent: ImportIntent | undefined = message.initialIntent;
 
@@ -300,6 +301,7 @@ const NewSourceUploader: React.FC = () => {
                     const base = {
                         ...prev,
                         projectInventory: inventory,
+                        sourceLanguageTag: sourceLanguageTag ?? prev.sourceLanguageTag,
                         isLoadingInventory: false,
                     };
 
@@ -628,6 +630,7 @@ const NewSourceUploader: React.FC = () => {
             selectedSource: wizardState.selectedSourceForTarget,
             selectedSourceDetails: wizardState.selectedSourceDetails,
             projectInventory: wizardState.projectInventory,
+            sourceLanguageTag: wizardState.sourceLanguageTag,
         };
 
         // For target imports, we need detailed source info and should use translation completion

diff --git a/webviews/codex-webviews/src/NewSourceUploader/components/UnifiedImporterForm.tsx b/webviews/codex-webviews/src/NewSourceUploader/components/UnifiedImporterForm.tsx
@@ -17,6 +17,7 @@ import {
     Eye,
     BarChart3,
     AlertCircle,
+    ChevronDown,
 } from "lucide-react";
 import { ImporterComponentProps, CellAligner, AlignedCell } from "../types/plugin";
 import { NotebookPair, ImportProgress } from "../types/common";
@@ -82,6 +83,13 @@ export interface UnifiedImporterFormProps {
      * above the import button (e.g. timestamp corruption warnings).
      */
     analyzeWarnings?: (files: File[]) => Promise<string[]>;
+
+    /**
+     * Optional custom controls rendered in a collapsible "Advanced Settings"
+     * section below the file-selection card. Importer-specific knobs
+     * (e.g. ideal cell length for DOCX splitting) can live here.
+     */
+    advancedSettings?: React.ReactNode;
 }
 
 export const UnifiedImporterForm: React.FC<UnifiedImporterFormProps> = ({
@@ -99,7 +107,9 @@ export const UnifiedImporterForm: React.FC<UnifiedImporterFormProps> = ({
     onSourceImportComplete,
     showEnforceStructure = false,
     analyzeWarnings,
+    advancedSettings,
 }) => {
+    const [showAdvanced, setShowAdvanced] = useState(false);
     const [files, setFiles] = useState<File[]>([]);
     const [enforceStructure, setEnforceStructure] = useState(showEnforceStructure);
     const [previewContent, setPreviewContent] = useState<string>("");
@@ -428,6 +438,27 @@ export const UnifiedImporterForm: React.FC<UnifiedImporterFormProps> = ({
                 </CardContent>
             </Card>
 
+            {/* Advanced Settings (optional, importer-specific) */}
+            {advancedSettings && (
+                <div className="border border-gray-300 rounded-md">
+                    <button
+                        type="button"
+                        onClick={() => setShowAdvanced((v) => !v)}
+                        className="flex w-full items-center justify-between px-3 py-2 text-sm font-medium text-gray-500 hover:text-gray-800 transition-colors"
+                    >
+                        <span>Advanced Settings</span>
+                        <ChevronDown
+                            className={`h-4 w-4 transition-transform ${showAdvanced ? "rotate-180" : ""}`}
+                        />
+                    </button>
+                    {showAdvanced && (
+                        <div className="px-3 pb-3 pt-1 space-y-2">
+                            {advancedSettings}
+                        </div>
+                    )}
+                </div>
+            )}
+
             {/* Enforce HTML Structure Checkbox */}
             {showEnforceStructure && hasFiles && (
                 <EnforceStructureCheckbox

diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/docx/DocxImporterForm.tsx b/webviews/codex-webviews/src/NewSourceUploader/importers/docx/DocxImporterForm.tsx
@@ -1,11 +1,15 @@
-import React, { useCallback } from "react";
+import React, { useCallback, useState } from "react";
 import { FileText } from "lucide-react";
 import { UnifiedImporterForm, type FileAnalysisStat } from "../../components/UnifiedImporterForm";
 import { type ImporterComponentProps, sequentialCellAligner } from "../../types/plugin";
 import type { NotebookPair, ImportProgress } from "../../types/common";
 import { validateFile, parseFile } from "./index";
+import { DEFAULT_IDEAL_CELL_LENGTH } from "../../utils/textSplitter";
 
 export const DocxImporterForm: React.FC<ImporterComponentProps> = (props) => {
+    const [idealCellLength, setIdealCellLength] = useState<number>(DEFAULT_IDEAL_CELL_LENGTH);
+    const locale = props.wizardContext?.sourceLanguageTag;
+
     const analyzeFiles = useCallback(async (files: File[]): Promise<FileAnalysisStat[]> => {
         const totalBytes = files.reduce((sum, f) => sum + f.size, 0);
         return [
@@ -38,7 +42,7 @@ export const DocxImporterForm: React.FC<ImporterComponentProps> = (props) => {
                     throw new Error(`${file.name}: ${validation.errors.join(", ")}`);
                 }
 
-                const importResult = await parseFile(file, onProgress);
+                const importResult = await parseFile(file, onProgress, { idealCellLength, locale });
                 if (!importResult.success || !importResult.notebookPair) {
                     throw new Error(importResult.error || `Failed to parse ${file.name}`);
                 }
@@ -48,7 +52,36 @@ export const DocxImporterForm: React.FC<ImporterComponentProps> = (props) => {
 
             return results.length === 1 ? results[0]! : results;
         },
-        []
+        [idealCellLength, locale]
+    );
+
+    const advancedSettings = (
+        <>
+            <div className="flex items-center gap-3">
+                <label
+                    htmlFor="ideal-cell-length"
+                    className="text-sm font-medium whitespace-nowrap"
+                >
+                    Ideal cell length (in characters)
+                </label>
+                <input
+                    id="ideal-cell-length"
+                    type="number"
+                    min={0}
+                    step={10}
+                    value={idealCellLength}
+                    onChange={(e) => {
+                        const v = parseInt(e.target.value, 10);
+                        if (!isNaN(v) && v >= 0) setIdealCellLength(v);
+                    }}
+                    className="w-24 rounded-md border border-input bg-background px-3 py-1 text-sm shadow-sm focus-visible:outline-none focus-visible:ring-1 focus-visible:ring-ring disabled:opacity-50"
+                />
+            </div>
+            <p className="text-xs text-gray-500">
+                Long paragraphs are split into smaller cells at sentence boundaries. Set to 0 to
+                disable splitting.
+            </p>
+        </>
     );
 
     return (
@@ -65,6 +98,7 @@ export const DocxImporterForm: React.FC<ImporterComponentProps> = (props) => {
             cellAligner={sequentialCellAligner}
             showPreview={false}
             showEnforceStructure
+            advancedSettings={advancedSettings}
         />
     );
 };
diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/docx/cellMetadata.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/docx/cellMetadata.ts
@@ -19,14 +19,24 @@ export interface DocxCellMetadataParams {
     paragraph: DocxParagraph;
     docxDoc: DocxDocument;
     fileName: string;
+    /**
+     * When a paragraph is split into multiple cells, this is the 0-based index
+     * of this cell within that paragraph.  Undefined for unsplit paragraphs.
+     */
+    segmentIndex?: number;
+    /**
+     * Total number of cells this paragraph was split into.
+     * Undefined for unsplit paragraphs.
+     */
+    segmentCount?: number;
 }
 
 /**
  * Creates metadata for a DOCX paragraph cell
  * Generates a UUID for the cell ID
  */
 export function createDocxCellMetadata(params: DocxCellMetadataParams): { metadata: any; cellId: string; } {
-    const { paragraphId, paragraphIndex, originalContent, paragraph, docxDoc, fileName } = params;
+    const { paragraphId, paragraphIndex, originalContent, segmentIndex, segmentCount } = params;
 
     // Generate UUID for cell ID
     const cellId = uuidv4();
@@ -39,6 +49,10 @@ export function createDocxCellMetadata(params: DocxCellMetadataParams): { metada
      *
      * To keep `.source`/`.codex` small, we only persist what we need to map a Codex cell
      * back to a paragraph in `word/document.xml`.
+     *
+     * For split paragraphs, segmentIndex/segmentCount allow the exporter to
+     * recombine the translated segments in order before writing them back to
+     * the original <w:p>.
      */
     const cellMetadata = {
         id: cellId,
@@ -47,6 +61,10 @@ export function createDocxCellMetadata(params: DocxCellMetadataParams): { metada
         paragraphId,
         paragraphIndex,
 
+        // Present only when the paragraph was split into multiple cells
+        ...(segmentIndex !== undefined && { segmentIndex }),
+        ...(segmentCount !== undefined && { segmentCount }),
+
         // Data object for consistency with other importers
         data: {
             originalText: originalContent,

diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/docx/docxExporter.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/docx/docxExporter.ts
@@ -100,59 +100,82 @@ export async function exportDocxWithTranslations(
 }
 
 /**
- * Collect translations from Codex cells
+ * Collect translations from Codex cells into a map keyed by DOCX paragraph index.
+ * Cells whose value is empty after removeHtmlTags() and trim() are skipped.
+ * Handles three cell shapes:
+ *  1. Table cells  – one Codex cell maps to multiple DOCX paragraphs (paragraphIndices[]).
+ *  2. Split cells  – one DOCX paragraph was split into N Codex cells (segmentIndex present).
+ *     Segments are sorted by segmentIndex and joined with a single space; skipped ones are absent.
+ *  3. Normal cells – one Codex cell ↔ one DOCX paragraph (paragraphIndex, else a "p-<n>" paragraphId).
  */
 function collectTranslations(
     codexCells: Array<{ kind: number; value: string; metadata: any; }>
 ): Map<number, string> {
-    const translations = new Map<number, string>();
-
     console.log(`[Exporter] Processing ${codexCells.length} cells for translations`);
 
-    for (let i = 0; i < codexCells.length; i++) {
-        const cell = codexCells[i];
-        const meta = cell.metadata;
+    // Accumulate per-paragraph segments: paragraphIndex → list of {segmentIndex, text} (sorted later, just before joining)
+    const segmentsByParagraph = new Map<number, Array<{ segmentIndex: number; text: string }>>();
+    // Table cells bypass the segment system; they are collected here and seed the final map
+    const tableTranslations = new Map<number, string>();
 
-        // Only DOCX cells have paragraphIndex/paragraphId; everything else is skipped naturally.
-        // (Don't rely on kind/type here; it varies by host and we only need the mapping fields.)
+    for (const cell of codexCells) {
+        const meta = cell.metadata;
 
-        // Get translated content (strip HTML tags)
         const translated = removeHtmlTags(cell.value).trim();
-        if (!translated) {
-            continue;
-        }
+        if (!translated) continue;
 
-        // Get paragraph identifier
         const paragraphId = meta?.paragraphId;
         const paragraphIndex = meta?.paragraphIndex;
         const paragraphIndices = meta?.paragraphIndices;
+        const segmentIndex: number | undefined = meta?.segmentIndex;
 
         if (Array.isArray(paragraphIndices) && paragraphIndices.length > 0) {
-            // Table-cell case: a single Codex cell maps to multiple DOCX paragraphs.
-            // We map lines of the translation to each paragraph index (preserves paragraph count).
+            // Table-cell case: translation line j goes to paragraphIndices[j] ('' when there is no such line).
             const parts = translated.split(/\r?\n/);
             for (let j = 0; j < paragraphIndices.length; j++) {
                 const idx = paragraphIndices[j];
-                if (typeof idx !== "number") continue;
-                translations.set(idx, parts[j] ?? '');
+                if (typeof idx !== 'number') continue;
+                tableTranslations.set(idx, parts[j] ?? '');
             }
-        } else if (typeof paragraphIndex === 'number') {
-            translations.set(paragraphIndex, translated);
-            // Keep logs light; large documents can have thousands of cells.
+            continue;
+        }
+
+        // Resolve the paragraph index: numeric paragraphIndex first, else parsed from a "p-<n>" paragraphId; otherwise skip the cell
+        let paraIdx: number | undefined;
+        if (typeof paragraphIndex === 'number') {
+            paraIdx = paragraphIndex;
         } else if (typeof paragraphId === 'string') {
             const m = paragraphId.match(/^p-(\d+)$/);
             if (m) {
-                const idx = Number(m[1]);
-                translations.set(idx, translated);
+                paraIdx = Number(m[1]);
             } else {
                 console.warn(`[Exporter] ⚠ Unrecognized paragraphId format: ${paragraphId}`);
+                continue;
             }
+        } else {
+            continue;
         }
+
+        if (!segmentsByParagraph.has(paraIdx)) {
+            segmentsByParagraph.set(paraIdx, []);
+        }
+        segmentsByParagraph.get(paraIdx)!.push({
+            // Unsplit paragraphs have no segmentIndex; treat them as the sole segment (index 0).
+            segmentIndex: segmentIndex ?? 0,
+            text: translated,
+        });
     }
 
-    console.log(`[Exporter] Collected ${translations.size} translations total`);
-    // Avoid dumping thousands of IDs in logs.
+    // Build the final map: seed with table entries, then set each paragraph's joined segments (overwriting a table entry with the same index).
+    const translations = new Map<number, string>(tableTranslations);
+
+    for (const [paraIdx, segments] of segmentsByParagraph) {
+        segments.sort((a, b) => a.segmentIndex - b.segmentIndex);
+        const combined = segments.map(s => s.text).join(' ');
+        translations.set(paraIdx, combined);
+    }
 
+    console.log(`[Exporter] Collected ${translations.size} translations total`);
     return translations;
 }
 

diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/docx/docxParser.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/docx/docxParser.ts
@@ -75,6 +75,7 @@ export class DocxParser {
             extractTables: false, // TODO: Implement table support
             segmentationStrategy: 'paragraph',
             validateStructure: true,
+            idealCellLength: 160,
             ...config,
         };
 

diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/docx/docxTypes.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/docx/docxTypes.ts
@@ -276,6 +276,13 @@ export interface DocxParseConfig {
     extractTables: boolean;
     segmentationStrategy: 'paragraph' | 'sentence' | 'run';
     validateStructure: boolean;
+    /**
+     * Ideal cell length in characters.
+     * Paragraphs longer than ~N*1.1 are recursively split at sentence
+     * boundaries (L1), then sub-sentence stops (L2), then whitespace (L3).
+     * Defaults to 160. Set to 0 to disable splitting.
+     */
+    idealCellLength: number;
 }
 
 // Error types