diff --git a/src/providers/NewSourceUploader/NewSourceUploaderProvider.ts b/src/providers/NewSourceUploader/NewSourceUploaderProvider.ts
index 5668f2923..147653fc5 100644
--- a/src/providers/NewSourceUploader/NewSourceUploaderProvider.ts
+++ b/src/providers/NewSourceUploader/NewSourceUploaderProvider.ts
@@ -129,10 +129,13 @@ export class NewSourceUploaderProvider implements vscode.CustomTextEditorProvide
                     const intentMatch = uriQuery.match(/intent=(source|target)/);
                     const initialIntent = intentMatch ? intentMatch[1] : undefined;
 
+                    const sourceLanguageTag = await this.getSourceLanguageTag();
+
                     webviewPanel.webview.postMessage({
                         command: "projectInventory",
                         inventory: inventory,
                         initialIntent,
+                        sourceLanguageTag,
                     });
                 } else if (message.command === "metadata.check") {
                     // Handle metadata check request
@@ -2000,6 +2003,22 @@ export class NewSourceUploaderProvider implements vscode.CustomTextEditorProvide
         return confirmOverwriteWithDetails(items);
     }
 
+    /** Best-effort: tag of the `projectStatus: "source"` language in the first workspace folder's metadata.json. */
+    private async getSourceLanguageTag(): Promise<string | undefined> {
+        try {
+            const folder = vscode.workspace.workspaceFolders?.[0];
+            if (!folder) return undefined;
+            const raw = await vscode.workspace.fs.readFile(vscode.Uri.joinPath(folder.uri, "metadata.json"));
+            // readFile is typed as Uint8Array, whose toString() yields "123,34,…"; decode the bytes as UTF-8 instead.
+            const languages: unknown = JSON.parse(new TextDecoder().decode(raw))?.languages;
+            if (!Array.isArray(languages)) return undefined;
+            const tag: unknown = languages.find((l) => l?.projectStatus === "source")?.tag;
+            return typeof tag === "string" ? tag : undefined;
+        } catch {
+            return undefined; // missing, unreadable or malformed metadata simply means "no locale hint"
+        }
+    }
+
     private async fetchProjectInventory(): Promise<{
         sourceFiles: Array<{
             name: string;
diff --git a/webviews/codex-webviews/src/NewSourceUploader/NewSourceUploader.tsx b/webviews/codex-webviews/src/NewSourceUploader/NewSourceUploader.tsx
index a4759e3ef..84a5c50cb 100644
--- a/webviews/codex-webviews/src/NewSourceUploader/NewSourceUploader.tsx
+++ b/webviews/codex-webviews/src/NewSourceUploader/NewSourceUploader.tsx
@@ -291,6 +291,7 @@ const NewSourceUploader: React.FC = () => {
                     targetFiles: [],
                     translationPairs: [],
                 };
+                const sourceLanguageTag: string | undefined = message.sourceLanguageTag;
 
                 const initialIntent: ImportIntent | undefined = message.initialIntent;
 
@@ -300,6 +301,7 @@ const NewSourceUploader: React.FC = () => {
                     const base = {
                         ...prev,
                         projectInventory: inventory,
+                        sourceLanguageTag: sourceLanguageTag ?? prev.sourceLanguageTag,
                         isLoadingInventory: false,
                     };
 
@@ -628,6 +630,7 @@ const NewSourceUploader: React.FC = () => {
             selectedSource: wizardState.selectedSourceForTarget,
             selectedSourceDetails: wizardState.selectedSourceDetails,
             projectInventory: wizardState.projectInventory,
+            sourceLanguageTag: wizardState.sourceLanguageTag,
         };
 
         // For target imports, we need detailed source info and should use translation completion
diff --git a/webviews/codex-webviews/src/NewSourceUploader/components/UnifiedImporterForm.tsx b/webviews/codex-webviews/src/NewSourceUploader/components/UnifiedImporterForm.tsx
index 7d68226a7..cd6b0bc08 100644
--- a/webviews/codex-webviews/src/NewSourceUploader/components/UnifiedImporterForm.tsx
+++ b/webviews/codex-webviews/src/NewSourceUploader/components/UnifiedImporterForm.tsx
@@ -17,6 +17,7 @@ import {
     Eye,
     BarChart3,
     AlertCircle,
+    ChevronDown,
 } from "lucide-react";
 import { ImporterComponentProps, CellAligner, AlignedCell } from "../types/plugin";
 import { NotebookPair, ImportProgress } from "../types/common";
@@ -82,6 +83,13 @@ export interface UnifiedImporterFormProps {
      * above the import button (e.g. timestamp corruption warnings).
      */
     analyzeWarnings?: (files: File[]) => Promise<string[]>;
+
+    /**
+     * Optional custom controls rendered in a collapsible "Advanced Settings"
+     * section below the file-selection card. Importer-specific knobs
+     * (e.g. ideal cell length for DOCX splitting) can live here.
+     */
+    advancedSettings?: React.ReactNode;
 }
 
 export const UnifiedImporterForm: React.FC<UnifiedImporterFormProps> = ({
@@ -99,7 +107,9 @@ export const UnifiedImporterForm: React.FC<UnifiedImporterFormProps> = ({
     onSourceImportComplete,
     showEnforceStructure = false,
     analyzeWarnings,
+    advancedSettings,
 }) => {
+    const [showAdvanced, setShowAdvanced] = useState(false);
     const [files, setFiles] = useState<File[]>([]);
     const [enforceStructure, setEnforceStructure] = useState(showEnforceStructure);
     const [previewContent, setPreviewContent] = useState<string>("");
@@ -428,6 +438,27 @@ export const UnifiedImporterForm: React.FC<UnifiedImporterFormProps> = ({
                 </CardContent>
             </Card>
 
+            {/* Advanced Settings (optional, importer-specific) */}
+            {advancedSettings && (
+                <div className="border border-gray-300 rounded-md">
+                    <button
+                        type="button"
+                        onClick={() => setShowAdvanced((v) => !v)}
+                        className="flex w-full items-center justify-between px-3 py-2 text-sm font-medium text-gray-500 hover:text-gray-800 transition-colors"
+                    >
+                        <span>Advanced Settings</span>
+                        <ChevronDown
+                            className={`h-4 w-4 transition-transform ${showAdvanced ? "rotate-180" : ""}`}
+                        />
+                    </button>
+                    {showAdvanced && (
+                        <div className="px-3 pb-3 pt-1 space-y-2">
+                            {advancedSettings}
+                        </div>
+                    )}
+                </div>
+            )}
+
             {/* Enforce HTML Structure Checkbox */}
             {showEnforceStructure && hasFiles && (
                 <EnforceStructureCheckbox
diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/docx/DocxImporterForm.tsx b/webviews/codex-webviews/src/NewSourceUploader/importers/docx/DocxImporterForm.tsx
index b7d0ce843..b33c3af3e 100644
--- a/webviews/codex-webviews/src/NewSourceUploader/importers/docx/DocxImporterForm.tsx
+++ b/webviews/codex-webviews/src/NewSourceUploader/importers/docx/DocxImporterForm.tsx
@@ -1,11 +1,15 @@
-import React, { useCallback } from "react";
+import React, { useCallback, useState } from "react";
 import { FileText } from "lucide-react";
 import { UnifiedImporterForm, type FileAnalysisStat } from "../../components/UnifiedImporterForm";
 import { type ImporterComponentProps, sequentialCellAligner } from "../../types/plugin";
 import type { NotebookPair, ImportProgress } from "../../types/common";
 import { validateFile, parseFile } from "./index";
+import { DEFAULT_IDEAL_CELL_LENGTH } from "../../utils/textSplitter";
 
 export const DocxImporterForm: React.FC<ImporterComponentProps> = (props) => {
+    const [idealCellLength, setIdealCellLength] = useState<number>(DEFAULT_IDEAL_CELL_LENGTH);
+    const locale = props.wizardContext?.sourceLanguageTag;
+
     const analyzeFiles = useCallback(async (files: File[]): Promise<FileAnalysisStat[]> => {
         const totalBytes = files.reduce((sum, f) => sum + f.size, 0);
         return [
@@ -38,7 +42,7 @@ export const DocxImporterForm: React.FC<ImporterComponentProps> = (props) => {
                     throw new Error(`${file.name}: ${validation.errors.join(", ")}`);
                 }
 
-                const importResult = await parseFile(file, onProgress);
+                const importResult = await parseFile(file, onProgress, { idealCellLength, locale });
                 if (!importResult.success || !importResult.notebookPair) {
                     throw new Error(importResult.error || `Failed to parse ${file.name}`);
                 }
@@ -48,7 +52,36 @@ export const DocxImporterForm: React.FC<ImporterComponentProps> = (props) => {
 
             return results.length === 1 ? results[0]! : results;
         },
-        []
+        [idealCellLength, locale]
+    );
+
+    const advancedSettings = (
+        <>
+            <div className="flex items-center gap-3">
+                <label
+                    htmlFor="ideal-cell-length"
+                    className="text-sm font-medium whitespace-nowrap"
+                >
+                    Ideal cell length (in characters)
+                </label>
+                <input
+                    id="ideal-cell-length"
+                    type="number"
+                    min={0}
+                    step={10}
+                    value={idealCellLength}
+                    onChange={(e) => {
+                        const v = parseInt(e.target.value, 10);
+                        if (!isNaN(v) && v >= 0) setIdealCellLength(v);
+                    }}
+                    className="w-24 rounded-md border border-input bg-background px-3 py-1 text-sm shadow-sm focus-visible:outline-none focus-visible:ring-1 focus-visible:ring-ring disabled:opacity-50"
+                />
+            </div>
+            <p className="text-xs text-gray-500">
+                Long paragraphs are split into smaller cells at sentence boundaries. Set to 0 to
+                disable splitting.
+            </p>
+        </>
     );
 
     return (
@@ -65,6 +98,7 @@ export const DocxImporterForm: React.FC<ImporterComponentProps> = (props) => {
             cellAligner={sequentialCellAligner}
             showPreview={false}
             showEnforceStructure
+            advancedSettings={advancedSettings}
         />
     );
 };
diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/docx/cellMetadata.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/docx/cellMetadata.ts
index 8de996c74..ec8ac6eaf 100644
--- a/webviews/codex-webviews/src/NewSourceUploader/importers/docx/cellMetadata.ts
+++ b/webviews/codex-webviews/src/NewSourceUploader/importers/docx/cellMetadata.ts
@@ -19,6 +19,16 @@ export interface DocxCellMetadataParams {
     paragraph: DocxParagraph;
     docxDoc: DocxDocument;
     fileName: string;
+    /**
+     * When a paragraph is split into multiple cells, this is the 0-based index
+     * of this cell within that paragraph.  Undefined for unsplit paragraphs.
+     */
+    segmentIndex?: number;
+    /**
+     * Total number of cells this paragraph was split into.
+     * Undefined for unsplit paragraphs.
+     */
+    segmentCount?: number;
 }
 
 /**
@@ -26,7 +36,7 @@ export interface DocxCellMetadataParams {
  * Generates a UUID for the cell ID
  */
 export function createDocxCellMetadata(params: DocxCellMetadataParams): { metadata: any; cellId: string; } {
-    const { paragraphId, paragraphIndex, originalContent, paragraph, docxDoc, fileName } = params;
+    const { paragraphId, paragraphIndex, originalContent, segmentIndex, segmentCount } = params;
 
     // Generate UUID for cell ID
     const cellId = uuidv4();
@@ -39,6 +49,10 @@ export function createDocxCellMetadata(params: DocxCellMetadataParams): { metada
      *
      * To keep `.source`/`.codex` small, we only persist what we need to map a Codex cell
      * back to a paragraph in `word/document.xml`.
+     *
+     * For split paragraphs, segmentIndex/segmentCount allow the exporter to
+     * recombine the translated segments in order before writing them back to
+     * the original <w:p>.
      */
     const cellMetadata = {
         id: cellId,
@@ -47,6 +61,10 @@ export function createDocxCellMetadata(params: DocxCellMetadataParams): { metada
         paragraphId,
         paragraphIndex,
 
+        // Present only when the paragraph was split into multiple cells
+        ...(segmentIndex !== undefined && { segmentIndex }),
+        ...(segmentCount !== undefined && { segmentCount }),
+
         // Data object for consistency with other importers
         data: {
             originalText: originalContent,
diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/docx/docxExporter.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/docx/docxExporter.ts
index 8404d3667..ce4540a12 100644
--- a/webviews/codex-webviews/src/NewSourceUploader/importers/docx/docxExporter.ts
+++ b/webviews/codex-webviews/src/NewSourceUploader/importers/docx/docxExporter.ts
@@ -100,59 +100,82 @@ export async function exportDocxWithTranslations(
 }
 
 /**
- * Collect translations from Codex cells
+ * Collect translations from Codex cells.
+ *
+ * Handles three cell shapes:
+ *  1. Table cells  – one Codex cell maps to multiple DOCX paragraphs (paragraphIndices[]).
+ *  2. Split cells  – one DOCX paragraph was split into N Codex cells (segmentIndex present).
+ *     The per-segment translations are joined in order before writing to the <w:p>.
+ *  3. Normal cells – one Codex cell ↔ one DOCX paragraph (paragraphIndex only).
  */
 function collectTranslations(
     codexCells: Array<{ kind: number; value: string; metadata: any; }>
 ): Map<number, string> {
-    const translations = new Map<number, string>();
-
     console.log(`[Exporter] Processing ${codexCells.length} cells for translations`);
 
-    for (let i = 0; i < codexCells.length; i++) {
-        const cell = codexCells[i];
-        const meta = cell.metadata;
+    // Accumulate per-paragraph segments: paragraphIndex → list of {segmentIndex, text} in cell order (sorted later, before joining)
+    const segmentsByParagraph = new Map<number, Array<{ segmentIndex: number; text: string }>>();
+    // Table cells bypass the segment system entirely
+    const tableTranslations = new Map<number, string>();
 
-        // Only DOCX cells have paragraphIndex/paragraphId; everything else is skipped naturally.
-        // (Don't rely on kind/type here; it varies by host and we only need the mapping fields.)
+    for (const cell of codexCells) {
+        const meta = cell.metadata;
 
-        // Get translated content (strip HTML tags)
         const translated = removeHtmlTags(cell.value).trim();
-        if (!translated) {
-            continue;
-        }
+        if (!translated) continue; // cells whose value is empty after tag stripping and trimming are ignored
 
-        // Get paragraph identifier
         const paragraphId = meta?.paragraphId;
         const paragraphIndex = meta?.paragraphIndex;
         const paragraphIndices = meta?.paragraphIndices;
+        const segmentIndex: number | undefined = meta?.segmentIndex;
 
         if (Array.isArray(paragraphIndices) && paragraphIndices.length > 0) {
-            // Table-cell case: a single Codex cell maps to multiple DOCX paragraphs.
-            // We map lines of the translation to each paragraph index (preserves paragraph count).
+            // Table-cell case: map lines of translation to each paragraph index.
             const parts = translated.split(/\r?\n/);
             for (let j = 0; j < paragraphIndices.length; j++) {
                 const idx = paragraphIndices[j];
-                if (typeof idx !== "number") continue;
-                translations.set(idx, parts[j] ?? '');
+                if (typeof idx !== 'number') continue;
+                tableTranslations.set(idx, parts[j] ?? '');
             }
-        } else if (typeof paragraphIndex === 'number') {
-            translations.set(paragraphIndex, translated);
-            // Keep logs light; large documents can have thousands of cells.
+            continue;
+        }
+
+        // Resolve the paragraph index (numeric or from paragraphId string)
+        let paraIdx: number | undefined;
+        if (typeof paragraphIndex === 'number') {
+            paraIdx = paragraphIndex;
         } else if (typeof paragraphId === 'string') {
             const m = paragraphId.match(/^p-(\d+)$/);
             if (m) {
-                const idx = Number(m[1]);
-                translations.set(idx, translated);
+                paraIdx = Number(m[1]);
             } else {
                 console.warn(`[Exporter] ⚠ Unrecognized paragraphId format: ${paragraphId}`);
+                continue;
             }
+        } else {
+            continue;
         }
+
+        if (!segmentsByParagraph.has(paraIdx)) {
+            segmentsByParagraph.set(paraIdx, []);
+        }
+        segmentsByParagraph.get(paraIdx)!.push({
+            // Unsplit paragraphs have no segmentIndex; treat them as the sole segment (index 0).
+            segmentIndex: segmentIndex ?? 0,
+            text: translated,
+        });
     }
 
-    console.log(`[Exporter] Collected ${translations.size} translations total`);
-    // Avoid dumping thousands of IDs in logs.
+    // Build the final map, starting from the table entries; each paragraph's segments are joined in segmentIndex order.
+    const translations = new Map<number, string>(tableTranslations);
+
+    for (const [paraIdx, segments] of segmentsByParagraph) {
+        segments.sort((a, b) => a.segmentIndex - b.segmentIndex);
+        const combined = segments.map(s => s.text).join(' '); // NOTE(review): segments skipped above as empty are simply absent here, and ' ' is inserted even for scripts without inter-word spaces — confirm both are intended
+        translations.set(paraIdx, combined);
+    }
 
+    console.log(`[Exporter] Collected ${translations.size} translations total`);
     return translations;
 }
 
diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/docx/docxParser.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/docx/docxParser.ts
index 25ce28d27..e87a1487b 100644
--- a/webviews/codex-webviews/src/NewSourceUploader/importers/docx/docxParser.ts
+++ b/webviews/codex-webviews/src/NewSourceUploader/importers/docx/docxParser.ts
@@ -75,6 +75,7 @@ export class DocxParser {
             extractTables: false, // TODO: Implement table support
             segmentationStrategy: 'paragraph',
             validateStructure: true,
+            idealCellLength: 160,
             ...config,
         };
 
diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/docx/docxTypes.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/docx/docxTypes.ts
index 766dbaf9e..919e1c1e6 100644
--- a/webviews/codex-webviews/src/NewSourceUploader/importers/docx/docxTypes.ts
+++ b/webviews/codex-webviews/src/NewSourceUploader/importers/docx/docxTypes.ts
@@ -276,6 +276,13 @@ export interface DocxParseConfig {
     extractTables: boolean;
     segmentationStrategy: 'paragraph' | 'sentence' | 'run';
     validateStructure: boolean;
+    /**
+     * Ideal cell length in characters.
+     * Paragraphs longer than ~N*1.1 are recursively split at sentence
+     * boundaries (L1), then sub-sentence stops (L2), then whitespace (L3).
+     * Defaults to 160. Set to 0 to disable splitting.
+     */
+    idealCellLength: number;
 }
 
 // Error types
diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/docx/index.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/docx/index.ts
index 45665e4d2..a589164ba 100644
--- a/webviews/codex-webviews/src/NewSourceUploader/importers/docx/index.ts
+++ b/webviews/codex-webviews/src/NewSourceUploader/importers/docx/index.ts
@@ -24,6 +24,8 @@ import { DocxParser } from './docxParser';
 import type { DocxDocument, DocxParagraph, DocxRun } from './docxTypes';
 import { createDocxCellMetadata, createDocxTableCellMetadata } from './cellMetadata';
 import { extractTableCellParagraphGroups } from './utils/tableSegmentation';
+import { splitTextIntoRanges, DEFAULT_IDEAL_CELL_LENGTH } from '../../utils/textSplitter';
+import type { TextRange } from '../../utils/textSplitter';
 const SUPPORTED_EXTENSIONS = ['docx'];
 
 /**
@@ -76,8 +78,12 @@ export const validateFile = async (file: File): Promise<FileValidationResult> =>
  */
 export const parseFile = async (
     file: File,
-    onProgress?: ProgressCallback
+    onProgress?: ProgressCallback,
+    options?: { idealCellLength?: number; locale?: string }
 ): Promise<ImportResult> => {
+    const idealCellLength = options?.idealCellLength ?? DEFAULT_IDEAL_CELL_LENGTH;
+    const locale = options?.locale;
+
     try {
         onProgress?.(createProgress('Reading File', 'Reading DOCX file...', 10));
 
@@ -88,6 +94,7 @@ export const parseFile = async (
             extractFootnotes: true,
             segmentationStrategy: 'paragraph',
             validateStructure: true,
+            idealCellLength,
         });
 
         // Set up debug logging - pass through to progress callback
@@ -105,7 +112,7 @@ export const parseFile = async (
         onProgress?.(createProgress('Creating Cells', 'Converting paragraphs to cells...', 60));
 
         // Convert document content to cells (paragraphs + table cells)
-        const cells = createCellsFromDocx(docxDoc, file.name);
+        const cells = createCellsFromDocx(docxDoc, file.name, idealCellLength, locale);
 
         onProgress?.(createProgress('Creating Notebooks', 'Creating source and codex notebooks...', 80));
 
@@ -196,9 +203,17 @@ export const parseFile = async (
 };
 
 /**
- * Convert DOCX paragraphs to Codex cells with complete metadata for round-trip
+ * Convert DOCX paragraphs to Codex cells with complete metadata for round-trip.
+ * Paragraphs whose plain text exceeds idealCellLength are split into multiple
+ * cells (one per segment).  Each segment's cell carries segmentIndex/segmentCount
+ * so the exporter can recombine translations before writing them back.
  */
-const createCellsFromDocx = (docxDoc: DocxDocument, fileName: string): any[] => {
+const createCellsFromDocx = (
+    docxDoc: DocxDocument,
+    fileName: string,
+    idealCellLength: number = DEFAULT_IDEAL_CELL_LENGTH,
+    locale?: string
+): any[] => {
     const cells: any[] = [];
 
     // Group paragraph indices by <w:tc> (table cells), using XML order to match exporter indices.
@@ -260,22 +275,33 @@ const createCellsFromDocx = (docxDoc: DocxDocument, fileName: string): any[] =>
         const fullText = paragraph.runs.map((r) => r.content).join('');
         if (!fullText.trim()) continue;
 
-        const htmlContent = convertParagraphToHtml(paragraph);
-        const { cellId, metadata: cellMetadata } = createDocxCellMetadata({
-            paragraphId: paragraph.id,
-            paragraphIndex: paragraph.paragraphIndex,
-            originalContent: fullText,
-            paragraph,
-            docxDoc,
-            fileName,
-        });
+        const ranges = splitTextIntoRanges(fullText, idealCellLength, locale);
+        const charRanges = buildRunCharRanges(paragraph.runs);
+        const isMultiSegment = ranges.length > 1;
+
+        for (let segIdx = 0; segIdx < ranges.length; segIdx++) {
+            const segmentRuns = sliceRunsForRange(charRanges, ranges[segIdx]);
+            const segmentText = segmentRuns.map((r) => r.content).join('');
+
+            const htmlContent = convertRunGroupToHtml(segmentRuns, paragraph);
+            const { cellId, metadata: cellMetadata } = createDocxCellMetadata({
+                paragraphId: paragraph.id,
+                paragraphIndex: paragraph.paragraphIndex,
+                originalContent: segmentText,
+                paragraph,
+                docxDoc,
+                fileName,
+                segmentIndex: isMultiSegment ? segIdx : undefined,
+                segmentCount: isMultiSegment ? ranges.length : undefined,
+            });
 
-        cells.push(
-            createProcessedCell(cellId, htmlContent, {
-                ...cellMetadata,
-                type: 'text',
-            })
-        );
+            cells.push(
+                createProcessedCell(cellId, htmlContent, {
+                    ...cellMetadata,
+                    type: 'text',
+                })
+            );
+        }
     }
 
     console.log(
@@ -285,13 +311,52 @@ const createCellsFromDocx = (docxDoc: DocxDocument, fileName: string): any[] =>
     return cells;
 };
 
+// ---------------------------------------------------------------------------
+// Run-slicing helpers (DOCX-specific, used to map text ranges back to runs)
+// ---------------------------------------------------------------------------
+
+interface RunCharRange { // a run together with the span it occupies in the paragraph's concatenated run text
+    run: DocxRun;
+    charStart: number; // offset of the run's first character (inclusive)
+    charEnd: number; // charStart + run.content.length (exclusive)
+}
+
+const buildRunCharRanges = (runs: DocxRun[]): RunCharRange[] => {
+    // Each run occupies [charStart, charEnd) of the concatenated run contents, measured by string length.
+    let offset = 0;
+    return runs.map((run) => {
+        const charStart = offset;
+        offset += run.content.length;
+        return { run, charStart, charEnd: offset };
+    });
+};
+
+const sliceRunsForRange = (charRanges: RunCharRange[], range: TextRange): DocxRun[] =>
+    // In order, copy every run overlapping [range.start, range.end), keeping only the overlapping characters.
+    charRanges.reduce<DocxRun[]>((kept, { run, charStart, charEnd }) => {
+        if (charEnd <= range.start || charStart >= range.end) return kept;
+        const from = Math.max(charStart, range.start) - charStart;
+        const to = Math.min(charEnd, range.end) - charStart;
+        const piece = run.content.slice(from, to);
+        if (piece.length > 0) {
+            kept.push({
+                ...run,
+                id: `${run.id}:${range.start}-${range.end}`, // suffix is the segment range, not the run-local slice
+                content: piece,
+            });
+        }
+        return kept;
+    }, []);
+
 /**
- * Convert a DOCX paragraph to HTML for display in Codex
+ * Convert a specific set of runs (a segment) to HTML, applying the parent
+ * paragraph's block-level properties (style, alignment, indentation, spacing).
+ * Used both for whole paragraphs and for sub-segments after splitting.
  */
-const convertParagraphToHtml = (paragraph: DocxParagraph): string => {
+const convertRunGroupToHtml = (runs: DocxRun[], paragraph: DocxParagraph): string => {
     let html = '<p';
 
-    // Add data attributes for paragraph properties
+    // Block-level attributes from the parent paragraph
     if (paragraph.paragraphProperties.styleId) {
         html += ` data-style-id="${escapeHtml(paragraph.paragraphProperties.styleId)}"`;
     }
@@ -299,7 +364,6 @@ const convertParagraphToHtml = (paragraph: DocxParagraph): string => {
         html += ` data-alignment="${paragraph.paragraphProperties.alignment}"`;
     }
 
-    // Add inline styles
     const styles: string[] = [];
     if (paragraph.paragraphProperties.alignment) {
         styles.push(`text-align: ${paragraph.paragraphProperties.alignment}`);
@@ -323,16 +387,21 @@ const convertParagraphToHtml = (paragraph: DocxParagraph): string => {
 
     html += '>';
 
-    // Add runs
-    for (const run of paragraph.runs) {
+    for (const run of runs) {
         html += convertRunToHtml(run);
     }
 
     html += '</p>';
-
     return html;
 };
 
+/**
+ * Convert a whole DOCX paragraph to HTML by handing all of its runs to convertRunGroupToHtml.
+ * Used by the table-cell path where no splitting is applied.
+ */
+const convertParagraphToHtml = (paragraph: DocxParagraph): string =>
+    convertRunGroupToHtml(paragraph.runs, paragraph);
+
 /**
  * Convert a DOCX run to HTML
  */
diff --git a/webviews/codex-webviews/src/NewSourceUploader/types/wizard.ts b/webviews/codex-webviews/src/NewSourceUploader/types/wizard.ts
index 36d48d8c5..51bd0c945 100644
--- a/webviews/codex-webviews/src/NewSourceUploader/types/wizard.ts
+++ b/webviews/codex-webviews/src/NewSourceUploader/types/wizard.ts
@@ -63,6 +63,8 @@ export interface WizardState {
     selectedSourceDetails?: DetailedFileInfo;
     selectedPlugin?: string;
     projectInventory: ProjectInventory;
+    /** BCP-47 tag of the project's source language (from metadata.json). */
+    sourceLanguageTag?: string;
     isLoadingInventory: boolean;
     isLoadingFileDetails: boolean;
     fileDetailsError?: string;
@@ -82,6 +84,8 @@ export interface WizardContext {
     selectedSource?: BasicFileInfo;
     selectedSourceDetails?: DetailedFileInfo;
     projectInventory: ProjectInventory;
+    /** BCP-47 tag of the project's source language (from metadata.json). */
+    sourceLanguageTag?: string;
 }
 
 /**
diff --git a/webviews/codex-webviews/src/NewSourceUploader/utils/textSplitter.ts b/webviews/codex-webviews/src/NewSourceUploader/utils/textSplitter.ts
new file mode 100644
index 000000000..e8b7daa77
--- /dev/null
+++ b/webviews/codex-webviews/src/NewSourceUploader/utils/textSplitter.ts
@@ -0,0 +1,317 @@
+/**
+ * Generic recursive text splitter for paragraph segmentation.
+ *
+ * Works on plain strings — no dependency on DOCX types, runs, or any importer.
+ * Any importer (DOCX, PDF, plain text, etc.) can use this to break long
+ * paragraphs into translator-friendly cell-sized chunks.
+ *
+ * Algorithm:
+ *   Given an ideal cell length N, the splitter asks "is this segment too long?"
+ *   and if so, finds the best boundary closest to the midpoint:
+ *
+ *   L1 (sentence boundaries)  — tried when length > N * THRESHOLD_SPLIT
+ *   L2 (sub-sentence breaks)  — tried when there is no suitable L1 boundary and length > N * THRESHOLD_L2
+ *   L3 (whitespace / word)    — tried when there is no suitable L1 or L2 boundary and length > N * THRESHOLD_L3
+ *
+ *   Each split is rejected if it would leave either side shorter than N * MIN_SIDE_RATIO.
+ *   After a successful split, both halves are recursively re-evaluated.
+ *
+ * Locale-aware mode:
+ *   When the caller passes a BCP-47 `locale` (e.g. "th", "ja", "fa") AND the
+ *   runtime supports `Intl.Segmenter` for it, L1 sources its candidate
+ *   boundaries from `Intl.Segmenter(locale, { granularity: 'sentence' })`
+ *   and L3 from `granularity: 'word'`.  This handles abbreviations
+ *   ("Mr. Smith") correctly at L1 and produces real word boundaries for
+ *   scripts without word-spacing (Thai, Khmer, Lao, Myanmar, CJK) at L3.
+ *   L2 has no Intl equivalent and always uses the regex below.
+ *   If the locale is missing, or a segmenter cannot be constructed for it,
+ *   every tier falls back to regex, exactly as in the locale-less call.
+ */
+
+
+/**
+ * Default target length of one cell, measured like string `.length` (UTF-16 code units).
+ * 160 seems to be a good default for English; other languages may need another value, so callers can pass their own.
+ */
+export const DEFAULT_IDEAL_CELL_LENGTH = 160;
+
+// ---------------------------------------------------------------------------
+// Threshold multipliers (applied to idealLength)
+// ---------------------------------------------------------------------------
+
+const THRESHOLD_SPLIT = 1.1;  // segments no longer than idealLength * this are never split
+const THRESHOLD_L2 = 1.5;     // L2 (sub-sentence) is tried only above idealLength * this
+const THRESHOLD_L3 = 2.4;     // L3 (word / whitespace) is tried only above idealLength * this
+const MIN_SIDE_RATIO = 0.3;   // each side of a split must be at least idealLength * this
+
+// ---------------------------------------------------------------------------
+// Boundary patterns (multilingual)
+// ---------------------------------------------------------------------------
+
+/**
+ * L1 — Sentence-ending boundaries.
+ *
+ * Latin marks (. ! ?) must be followed by whitespace or end-of-string, so dots
+ * inside a token ("3.14", "U.S.A") never match; "Mr. Smith" still does. A "."
+ * directly preceded by a digit ("in 2020. Next") is excluded by the lookbehind.
+ *
+ * Script-specific marks (Devanagari, CJK, Arabic, Urdu, Ethiopic, Myanmar,
+ * Khmer, Tibetan, Armenian, Balinese, full-width variants) match standalone,
+ * with no trailing whitespace required.
+ */
+const L1_RE = /(?:(?<!\d)[.．]+|[!?]+)(?=\s|$)|[।॥。！？؟۔።᭞᭟፧။។།༎։՞՜‽⁇⁈⁉｡]+/g;
+
+/**
+ * L2 — Sub-sentence boundaries: commas, semicolons, colons, em/en dashes,
+ * ellipsis, straight quotes, right-pointing guillemets, closing brackets.
+ * Latin marks must be followed by whitespace (end-of-string does not count).
+ * CJK / Arabic marks match standalone.
+ */
+const L2_RE = /(?:[,;:)\]—–…"'»›]+)(?=\s)|[،؛、，；：）〉》]+/g;
+
+/**
+ * L3 — Any run of whitespace; the last-resort tier when no word segmenter is in use.
+ */
+const L3_RE = /\s+/g;
+
+// ---------------------------------------------------------------------------
+// Public types
+// ---------------------------------------------------------------------------
+
+export interface TextRange {
+    start: number; // offset of the first character in the range (inclusive)
+    end: number;   // offset just past the last character (exclusive), as for `slice`
+}
+
+// ---------------------------------------------------------------------------
+// Internal helpers (defined before use)
+// ---------------------------------------------------------------------------
+
+/**
+ * Minimal structural type for a constructed segmenter (only what this file reads).
+ * Declared locally so the file compiles under tsconfigs that lack `ES2022.Intl`
+ * (the root webpack build pulls this module in via a cross-importing test).
+ */
+type SegmenterHandle = {
+    segment(input: string): Iterable<{ segment: string; index: number; isWordLike?: boolean }>;
+};
+
+/**
+ * Build an `Intl.Segmenter` for `locale` at the requested granularity, or yield
+ * null when no locale is given, the runtime lacks the API, or the constructor throws.
+ * NOTE(review): a well-formed but unsupported tag may not throw at all — confirm.
+ */
+function tryCreateSegmenter(
+    locale: string | undefined,
+    granularity: 'sentence' | 'word'
+): SegmenterHandle | null {
+    // Looked up by name so the file needs no `Intl.Segmenter` typings; skipped without a locale.
+    const segmenterCtor = locale ? (Intl as Record<string, unknown>).Segmenter : undefined;
+    if (typeof segmenterCtor !== 'function') return null;
+    try {
+        // eslint-disable-next-line @typescript-eslint/no-explicit-any
+        return new (segmenterCtor as any)(locale, { granularity }) as SegmenterHandle;
+    } catch {
+        return null;
+    }
+}
+
+/**
+ * Candidate sentence-level split offsets for `text`, taken from a locale-aware
+ * sentence segmenter: the start index of every segment that begins strictly
+ * inside the string, so the first segment (offset 0) is never a candidate.
+ * Offsets come back in the order the segmenter yields its segments.
+ */
+function findSentenceSplitPointsIntl(text: string, segmenter: SegmenterHandle): number[] {
+    const limit = text.length;
+    // A usable boundary lies strictly between the two ends of the string.
+    const isInterior = (offset: number): boolean => offset > 0 && offset < limit;
+    const starts = Array.from(segmenter.segment(text), (piece) => piece.index);
+    // filter() preserves order, so the result matches a push-as-you-go loop.
+    const interior = starts.filter(isInterior);
+    return interior;
+}
+
+/**
+ * Candidate word-level split offsets for `text`, taken from a locale-aware
+ * word segmenter: the start index of every segment flagged `isWordLike` that
+ * begins strictly inside the string. This is the segmenter-based counterpart
+ * of "split at whitespace", intended for scripts (Thai, Khmer, Lao, Myanmar,
+ * CJK) whose words are not space-separated.
+ */
+function findWordSplitPointsIntl(text: string, segmenter: SegmenterHandle): number[] {
+    const wordStarts: number[] = [];
+    for (const { index, isWordLike } of segmenter.segment(text)) {
+        // Only word-like segments that start strictly inside the string qualify.
+        const interior = index > 0 && index < text.length;
+        if (isWordLike && interior) wordStarts.push(index);
+    }
+    return wordStarts;
+}
+
+/**
+ * Offsets where a new segment could BEGIN: just past each match of `re` and
+ * any whitespace after it (0 and end-of-text excluded). Empty matches are safe.
+ */
+function findSplitPoints(text: string, re: RegExp): number[] {
+    const points: number[] = [];
+    const localRe = new RegExp(re.source, re.flags.includes('g') ? re.flags : re.flags + 'g');
+    for (let m = localRe.exec(text); m !== null; m = localRe.exec(text)) {
+        // An empty match does not advance lastIndex; step past it or exec() never ends.
+        if (m[0].length === 0) localRe.lastIndex++;
+        let nextWord = m.index + m[0].length;
+        // Any whitespace (newline, tab, NBSP) stays on the left side, as plain spaces do.
+        while (nextWord < text.length && /\s/.test(text[nextWord])) nextWord++;
+        if (nextWord > 0 && nextWord < text.length) points.push(nextWord);
+    }
+    return points;
+}
+
+/**
+ * Choose, among candidate split offsets, the one nearest to `mid` that leaves
+ * both sides at least `minSide` characters long; on equal distance the
+ * candidate listed first in `points` wins. Null when none qualifies.
+ */
+function pickBestPoint(
+    points: number[],
+    mid: number,
+    textLength: number,
+    minSide: number
+): number | null {
+    let chosen: number | null = null;
+    let chosenGap = Infinity;
+    for (const candidate of points) {
+        if (!(candidate >= minSide && textLength - candidate >= minSide)) continue;
+        const gap = Math.abs(candidate - mid);
+        // Strict "<" keeps the earliest candidate on ties, like a stable sort would.
+        if (gap < chosenGap) {
+            chosen = candidate;
+            chosenGap = gap;
+        }
+    }
+    return chosen;
+}
+
+/**
+ * Per-invocation context passed unchanged through the recursion. Both
+ * segmenter slots are filled once at the public entry point, so recursive
+ * calls reuse them instead of constructing new ones.
+ */
+interface SplitContext {
+    idealLength: number;                        // target cell length that every threshold scales from
+    sentenceSegmenter: SegmenterHandle | null;  // L1 boundary source; null means use the regex
+    wordSegmenter: SegmenterHandle | null;      // L3 boundary source; null means use the regex
+}
+
+/**
+ * Core recursive splitter.  Operates on the offset range [start, end) of
+ * `fullText` and returns, in order, the ranges that range was divided into.
+ *
+ * A segment no longer than idealLength * THRESHOLD_SPLIT is returned whole.
+ * Otherwise the L1, L2 and L3 tiers are tried in that order; the first tier
+ * that yields an acceptable boundary splits the segment, and both halves are
+ * then processed recursively. A segment no tier can split is returned whole.
+ */
+function splitRecursive(
+    fullText: string,
+    start: number,
+    end: number,
+    ctx: SplitContext
+): TextRange[] {
+    const span = end - start;
+    const ideal = ctx.idealLength;
+
+    // Short enough already — keep as-is.
+    if (span <= ideal * THRESHOLD_SPLIT) {
+        return [{ start, end }];
+    }
+
+    const piece = fullText.slice(start, end);
+    const center = Math.floor(piece.length / 2);
+    const shortestSide = ideal * MIN_SIDE_RATIO;
+
+    // Tiers in priority order. Each entry pairs the length a segment must
+    // exceed for the tier to apply with a lazy producer of candidate offsets
+    // (relative to `piece`), so a lower tier is only scanned when needed.
+    const tiers: Array<{ floor: number; candidates: () => number[] }> = [
+        {
+            // L1: sentence boundaries — segmenter when available, else regex.
+            floor: -Infinity, // no length gate beyond the early return above
+            candidates: () =>
+                ctx.sentenceSegmenter
+                    ? findSentenceSplitPointsIntl(piece, ctx.sentenceSegmenter)
+                    : findSplitPoints(piece, L1_RE),
+        },
+        {
+            // L2: sub-sentence boundaries — no Intl equivalent, regex always.
+            floor: ideal * THRESHOLD_L2,
+            candidates: () => findSplitPoints(piece, L2_RE),
+        },
+        {
+            // L3: word boundaries — segmenter when available, else whitespace.
+            floor: ideal * THRESHOLD_L3,
+            candidates: () =>
+                ctx.wordSegmenter
+                    ? findWordSplitPointsIntl(piece, ctx.wordSegmenter)
+                    : findSplitPoints(piece, L3_RE),
+        },
+    ];
+
+    for (const tier of tiers) {
+        if (!(span > tier.floor)) continue;
+        const cut = pickBestPoint(tier.candidates(), center, piece.length, shortestSide);
+        if (cut === null) continue;
+        // Split here, then re-evaluate both halves starting again from L1.
+        const boundary = start + cut;
+        return [
+            ...splitRecursive(fullText, start, boundary, ctx),
+            ...splitRecursive(fullText, boundary, end, ctx),
+        ];
+    }
+
+    // Cannot split further
+    return [{ start, end }];
+}
+
+// ---------------------------------------------------------------------------
+// Public API
+// ---------------------------------------------------------------------------
+
+/**
+ * Split `text` into character ranges, each roughly `idealLength` characters.
+ * Returns a single range when no split is needed: empty or missing text,
+ * `idealLength` <= 0 (splitting disabled), or text that is already short enough.
+ *
+ * @param locale Optional BCP-47 tag (e.g. "th", "ja", "fa-IR"). When an
+ *   `Intl.Segmenter` can be constructed for it, L1 (sentence) and L3 (word)
+ *   boundaries come from the segmenter; otherwise every tier uses regex.
+ */
+export function splitTextIntoRanges(
+    text: string,
+    idealLength: number = DEFAULT_IDEAL_CELL_LENGTH,
+    locale?: string
+): TextRange[] {
+    // Untyped callers may pass null/undefined; treat that like "" instead of throwing on `.length`.
+    const length = text?.length ?? 0;
+    if (length === 0 || idealLength <= 0 || length <= idealLength * THRESHOLD_SPLIT) {
+        return [{ start: 0, end: length }];
+    }
+    const ctx: SplitContext = {
+        idealLength,
+        sentenceSegmenter: tryCreateSegmenter(locale, 'sentence'),
+        wordSegmenter: tryCreateSegmenter(locale, 'word'),
+    };
+    return splitRecursive(text, 0, text.length, ctx);
+}
+
+/**
+ * Same as `splitTextIntoRanges`, but yields the substrings instead of offsets.
+ */
+export function splitText(
+    text: string,
+    idealLength: number = DEFAULT_IDEAL_CELL_LENGTH,
+    locale?: string
+): string[] {
+    const ranges = splitTextIntoRanges(text, idealLength, locale);
+    return ranges.map(({ start, end }) => text.slice(start, end));
+}