From 87734bdfb44a6cff8aed3cf61956c6855e6df4f0 Mon Sep 17 00:00:00 2001 From: Luke-Bilhorn Date: Thu, 16 Apr 2026 12:39:04 -0500 Subject: [PATCH 1/6] feat(docx): split long paragraphs into sentence-level cells on import MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Paragraphs in imported DOCX files are now recursively split into translator-friendly cells. A new general-purpose text splitter (NewSourceUploader/utils/textSplitter.ts) takes a plain string and an ideal cell length, then bisects at the boundary nearest the midpoint: sentence ends (L1), sub-sentence stops like commas/dashes/ellipsis (L2), and whitespace as a last resort (L3). Each tier has its own length threshold (×1.1 / ×1.5 / ×2.4) and a minimum side-length guard (×0.3) to prevent fragments. Multilingual punctuation is supported across Latin, CJK, Arabic, Urdu, Devanagari, Ethiopic, and several other scripts. The DOCX importer uses this splitter via run-aware helpers that preserve inline formatting (bold, italic, font, color, etc.) even when a split falls mid-run. Split cells carry segmentIndex/segmentCount metadata so the round-trip exporter can recombine translated segments in order before writing them back to the original <w:p>, keeping the output DOCX structurally identical to the input. Cell metadata now contains optional fields segmentIndex and segmentCount to support these subdivisions. The ideal cell length defaults to 160 characters and is user-adjustable via a collapsible Advanced Settings panel in the DOCX import UI. 
--- .../importers/docx/cellMetadata.ts | 20 +- .../importers/docx/docxExporter.ts | 71 ++++-- .../importers/docx/docxParser.ts | 1 + .../importers/docx/docxTypes.ts | 7 + .../NewSourceUploader/importers/docx/index.ts | 119 +++++++--- .../NewSourceUploader/utils/textSplitter.ts | 207 ++++++++++++++++++ 6 files changed, 374 insertions(+), 51 deletions(-) create mode 100644 webviews/codex-webviews/src/NewSourceUploader/utils/textSplitter.ts diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/docx/cellMetadata.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/docx/cellMetadata.ts index 8de996c74..ec8ac6eaf 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/docx/cellMetadata.ts +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/docx/cellMetadata.ts @@ -19,6 +19,16 @@ export interface DocxCellMetadataParams { paragraph: DocxParagraph; docxDoc: DocxDocument; fileName: string; + /** + * When a paragraph is split into multiple cells, this is the 0-based index + * of this cell within that paragraph. Undefined for unsplit paragraphs. + */ + segmentIndex?: number; + /** + * Total number of cells this paragraph was split into. + * Undefined for unsplit paragraphs. + */ + segmentCount?: number; } /** @@ -26,7 +36,7 @@ export interface DocxCellMetadataParams { * Generates a UUID for the cell ID */ export function createDocxCellMetadata(params: DocxCellMetadataParams): { metadata: any; cellId: string; } { - const { paragraphId, paragraphIndex, originalContent, paragraph, docxDoc, fileName } = params; + const { paragraphId, paragraphIndex, originalContent, segmentIndex, segmentCount } = params; // Generate UUID for cell ID const cellId = uuidv4(); @@ -39,6 +49,10 @@ export function createDocxCellMetadata(params: DocxCellMetadataParams): { metada * * To keep `.source`/`.codex` small, we only persist what we need to map a Codex cell * back to a paragraph in `word/document.xml`. 
+ * + * For split paragraphs, segmentIndex/segmentCount allow the exporter to + * recombine the translated segments in order before writing them back to + * the original <w:p>. */ const cellMetadata = { id: cellId, @@ -47,6 +61,10 @@ export function createDocxCellMetadata(params: DocxCellMetadataParams): { metada paragraphId, paragraphIndex, + // Present only when the paragraph was split into multiple cells + ...(segmentIndex !== undefined && { segmentIndex }), + ...(segmentCount !== undefined && { segmentCount }), + // Data object for consistency with other importers data: { originalText: originalContent, diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/docx/docxExporter.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/docx/docxExporter.ts index 8404d3667..ce4540a12 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/docx/docxExporter.ts +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/docx/docxExporter.ts @@ -100,59 +100,82 @@ export async function exportDocxWithTranslations( } /** - * Collect translations from Codex cells + * Collect translations from Codex cells. + * + * Handles three cell shapes: + * 1. Table cells – one Codex cell maps to multiple DOCX paragraphs (paragraphIndices[]). + * 2. Split cells – one DOCX paragraph was split into N Codex cells (segmentIndex present). + * The per-segment translations are joined in order before writing to the <w:p>. + * 3. Normal cells – one Codex cell ↔ one DOCX paragraph (paragraphIndex only). 
*/ function collectTranslations( codexCells: Array<{ kind: number; value: string; metadata: any; }> ): Map { - const translations = new Map(); - console.log(`[Exporter] Processing ${codexCells.length} cells for translations`); - for (let i = 0; i < codexCells.length; i++) { - const cell = codexCells[i]; - const meta = cell.metadata; + // Accumulate per-paragraph segments: paragraphIndex → sorted list of {segmentIndex, text} + const segmentsByParagraph = new Map>(); + // Table cells bypass the segment system entirely + const tableTranslations = new Map(); - // Only DOCX cells have paragraphIndex/paragraphId; everything else is skipped naturally. - // (Don't rely on kind/type here; it varies by host and we only need the mapping fields.) + for (const cell of codexCells) { + const meta = cell.metadata; - // Get translated content (strip HTML tags) const translated = removeHtmlTags(cell.value).trim(); - if (!translated) { - continue; - } + if (!translated) continue; - // Get paragraph identifier const paragraphId = meta?.paragraphId; const paragraphIndex = meta?.paragraphIndex; const paragraphIndices = meta?.paragraphIndices; + const segmentIndex: number | undefined = meta?.segmentIndex; if (Array.isArray(paragraphIndices) && paragraphIndices.length > 0) { - // Table-cell case: a single Codex cell maps to multiple DOCX paragraphs. - // We map lines of the translation to each paragraph index (preserves paragraph count). + // Table-cell case: map lines of translation to each paragraph index. const parts = translated.split(/\r?\n/); for (let j = 0; j < paragraphIndices.length; j++) { const idx = paragraphIndices[j]; - if (typeof idx !== "number") continue; - translations.set(idx, parts[j] ?? ''); + if (typeof idx !== 'number') continue; + tableTranslations.set(idx, parts[j] ?? ''); } - } else if (typeof paragraphIndex === 'number') { - translations.set(paragraphIndex, translated); - // Keep logs light; large documents can have thousands of cells. 
+ continue; + } + + // Resolve the paragraph index (numeric or from paragraphId string) + let paraIdx: number | undefined; + if (typeof paragraphIndex === 'number') { + paraIdx = paragraphIndex; } else if (typeof paragraphId === 'string') { const m = paragraphId.match(/^p-(\d+)$/); if (m) { - const idx = Number(m[1]); - translations.set(idx, translated); + paraIdx = Number(m[1]); } else { console.warn(`[Exporter] ⚠ Unrecognized paragraphId format: ${paragraphId}`); + continue; } + } else { + continue; } + + if (!segmentsByParagraph.has(paraIdx)) { + segmentsByParagraph.set(paraIdx, []); + } + segmentsByParagraph.get(paraIdx)!.push({ + // Unsplit paragraphs have no segmentIndex; treat them as the sole segment (index 0). + segmentIndex: segmentIndex ?? 0, + text: translated, + }); } - console.log(`[Exporter] Collected ${translations.size} translations total`); - // Avoid dumping thousands of IDs in logs. + // Build the final map: for split paragraphs, join segments in order. + const translations = new Map(tableTranslations); + + for (const [paraIdx, segments] of segmentsByParagraph) { + segments.sort((a, b) => a.segmentIndex - b.segmentIndex); + const combined = segments.map(s => s.text).join(' '); + translations.set(paraIdx, combined); + } + console.log(`[Exporter] Collected ${translations.size} translations total`); return translations; } diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/docx/docxParser.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/docx/docxParser.ts index 25ce28d27..544492962 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/docx/docxParser.ts +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/docx/docxParser.ts @@ -75,6 +75,7 @@ export class DocxParser { extractTables: false, // TODO: Implement table support segmentationStrategy: 'paragraph', validateStructure: true, + targetCellLength: 160, ...config, }; diff --git 
a/webviews/codex-webviews/src/NewSourceUploader/importers/docx/docxTypes.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/docx/docxTypes.ts index 766dbaf9e..68767e05c 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/docx/docxTypes.ts +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/docx/docxTypes.ts @@ -276,6 +276,13 @@ export interface DocxParseConfig { extractTables: boolean; segmentationStrategy: 'paragraph' | 'sentence' | 'run'; validateStructure: boolean; + /** + * Ideal cell length in characters. + * Paragraphs longer than ~N*1.1 are recursively split at sentence + * boundaries (L1), then sub-sentence stops (L2), then whitespace (L3). + * Defaults to 160. Set to 0 to disable splitting. + */ + targetCellLength: number; } // Error types diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/docx/index.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/docx/index.ts index 086583ec5..861e1121d 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/docx/index.ts +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/docx/index.ts @@ -23,6 +23,8 @@ import { DocxParser } from './docxParser'; import type { DocxDocument, DocxParagraph, DocxRun } from './docxTypes'; import { createDocxCellMetadata, createDocxTableCellMetadata } from './cellMetadata'; import { extractTableCellParagraphGroups } from './utils/tableSegmentation'; +import { splitTextIntoRanges, DEFAULT_IDEAL_CELL_LENGTH } from '../../utils/textSplitter'; +import type { TextRange } from '../../utils/textSplitter'; const SUPPORTED_EXTENSIONS = ['docx']; /** @@ -75,8 +77,11 @@ export const validateFile = async (file: File): Promise => */ export const parseFile = async ( file: File, - onProgress?: ProgressCallback + onProgress?: ProgressCallback, + options?: { targetCellLength?: number } ): Promise => { + const targetCellLength = options?.targetCellLength ?? 
DEFAULT_IDEAL_CELL_LENGTH; + try { onProgress?.(createProgress('Reading File', 'Reading DOCX file...', 10)); @@ -87,6 +92,7 @@ export const parseFile = async ( extractFootnotes: true, segmentationStrategy: 'paragraph', validateStructure: true, + targetCellLength, }); // Set up debug logging - pass through to progress callback @@ -104,7 +110,7 @@ export const parseFile = async ( onProgress?.(createProgress('Creating Cells', 'Converting paragraphs to cells...', 60)); // Convert document content to cells (paragraphs + table cells) - const cells = createCellsFromDocx(docxDoc, file.name); + const cells = createCellsFromDocx(docxDoc, file.name, targetCellLength); onProgress?.(createProgress('Creating Notebooks', 'Creating source and codex notebooks...', 80)); @@ -202,9 +208,16 @@ export const parseFile = async ( }; /** - * Convert DOCX paragraphs to Codex cells with complete metadata for round-trip + * Convert DOCX paragraphs to Codex cells with complete metadata for round-trip. + * Paragraphs whose plain text exceeds targetCellLength are split into multiple + * cells (one per segment). Each segment's cell carries segmentIndex/segmentCount + * so the exporter can recombine translations before writing them back. */ -const createCellsFromDocx = (docxDoc: DocxDocument, fileName: string): any[] => { +const createCellsFromDocx = ( + docxDoc: DocxDocument, + fileName: string, + targetCellLength: number = DEFAULT_IDEAL_CELL_LENGTH +): any[] => { const cells: any[] = []; // Group paragraph indices by (table cells), using XML order to match exporter indices. 
@@ -266,22 +279,33 @@ const createCellsFromDocx = (docxDoc: DocxDocument, fileName: string): any[] => const fullText = paragraph.runs.map((r) => r.content).join(''); if (!fullText.trim()) continue; - const htmlContent = convertParagraphToHtml(paragraph); - const { cellId, metadata: cellMetadata } = createDocxCellMetadata({ - paragraphId: paragraph.id, - paragraphIndex: paragraph.paragraphIndex, - originalContent: fullText, - paragraph, - docxDoc, - fileName, - }); + const ranges = splitTextIntoRanges(fullText, targetCellLength); + const charRanges = buildRunCharRanges(paragraph.runs); + const isMultiSegment = ranges.length > 1; + + for (let segIdx = 0; segIdx < ranges.length; segIdx++) { + const segmentRuns = sliceRunsForRange(charRanges, ranges[segIdx]); + const segmentText = segmentRuns.map((r) => r.content).join(''); + + const htmlContent = convertRunGroupToHtml(segmentRuns, paragraph); + const { cellId, metadata: cellMetadata } = createDocxCellMetadata({ + paragraphId: paragraph.id, + paragraphIndex: paragraph.paragraphIndex, + originalContent: segmentText, + paragraph, + docxDoc, + fileName, + segmentIndex: isMultiSegment ? segIdx : undefined, + segmentCount: isMultiSegment ? 
ranges.length : undefined, + }); - cells.push( - createProcessedCell(cellId, htmlContent, { - ...cellMetadata, - type: 'text', - }) - ); + cells.push( + createProcessedCell(cellId, htmlContent, { + ...cellMetadata, + type: 'text', + }) + ); + } } console.log( @@ -291,13 +315,52 @@ const createCellsFromDocx = (docxDoc: DocxDocument, fileName: string): any[] => return cells; }; +// --------------------------------------------------------------------------- +// Run-slicing helpers (DOCX-specific, used to map text ranges back to runs) +// --------------------------------------------------------------------------- + +interface RunCharRange { + run: DocxRun; + charStart: number; + charEnd: number; +} + +const buildRunCharRanges = (runs: DocxRun[]): RunCharRange[] => { + const ranges: RunCharRange[] = []; + let pos = 0; + for (const run of runs) { + ranges.push({ run, charStart: pos, charEnd: pos + run.content.length }); + pos += run.content.length; + } + return ranges; +}; + +const sliceRunsForRange = (charRanges: RunCharRange[], range: TextRange): DocxRun[] => { + const result: DocxRun[] = []; + for (const { run, charStart, charEnd } of charRanges) { + if (charEnd <= range.start || charStart >= range.end) continue; + const localStart = Math.max(charStart, range.start) - charStart; + const localEnd = Math.min(charEnd, range.end) - charStart; + const slicedContent = run.content.slice(localStart, localEnd); + if (slicedContent.length === 0) continue; + result.push({ + ...run, + id: `${run.id}:${range.start}-${range.end}`, + content: slicedContent, + }); + } + return result; +}; + /** - * Convert a DOCX paragraph to HTML for display in Codex + * Convert a specific set of runs (a segment) to HTML, applying the parent + * paragraph's block-level properties (style, alignment, indentation, spacing). + * Used both for whole paragraphs and for sub-segments after splitting. 
*/ -const convertParagraphToHtml = (paragraph: DocxParagraph): string => { +const convertRunGroupToHtml = (runs: DocxRun[], paragraph: DocxParagraph): string => { let html = ' { html += ` data-alignment="${paragraph.paragraphProperties.alignment}"`; } - // Add inline styles const styles: string[] = []; if (paragraph.paragraphProperties.alignment) { styles.push(`text-align: ${paragraph.paragraphProperties.alignment}`); @@ -329,16 +391,21 @@ const convertParagraphToHtml = (paragraph: DocxParagraph): string => { html += '>'; - // Add runs - for (const run of paragraph.runs) { + for (const run of runs) { html += convertRunToHtml(run); } html += '

'; - return html; }; +/** + * Convert a DOCX paragraph to HTML for display in Codex. + * Used by the table-cell path where no splitting is applied. + */ +const convertParagraphToHtml = (paragraph: DocxParagraph): string => + convertRunGroupToHtml(paragraph.runs, paragraph); + /** * Convert a DOCX run to HTML */ diff --git a/webviews/codex-webviews/src/NewSourceUploader/utils/textSplitter.ts b/webviews/codex-webviews/src/NewSourceUploader/utils/textSplitter.ts new file mode 100644 index 000000000..998394b03 --- /dev/null +++ b/webviews/codex-webviews/src/NewSourceUploader/utils/textSplitter.ts @@ -0,0 +1,207 @@ +/** + * Generic recursive text splitter for paragraph segmentation. + * + * Works on plain strings — no dependency on DOCX types, runs, or any importer. + * Any importer (DOCX, PDF, plain text, etc.) can use this to break long + * paragraphs into translator-friendly cell-sized chunks. + * + * Algorithm: + * Given an ideal cell length N, the splitter asks "is this segment too long?" + * and if so, finds the best boundary closest to the midpoint: + * + * L1 (sentence boundaries) — tried when length > N * THRESHOLD_SPLIT + * L2 (sub-sentence breaks) — tried when length > N * THRESHOLD_L2 + * L3 (whitespace) — tried when length > N * THRESHOLD_L3 + * + * Each split is rejected if it would leave either side shorter than N * MIN_SIDE_RATIO. + * After a successful split, both halves are recursively re-evaluated. 
+ */ + +export const DEFAULT_IDEAL_CELL_LENGTH = 160; + +// --------------------------------------------------------------------------- +// Threshold multipliers (applied to idealLength) +// --------------------------------------------------------------------------- + +const THRESHOLD_SPLIT = 1.1; // a – minimum length to attempt any split +const THRESHOLD_L2 = 1.5; // b – minimum length for L2 (sub-sentence) splits +const THRESHOLD_L3 = 2.4; // c – minimum length for L3 (whitespace) splits +const MIN_SIDE_RATIO = 0.3; // d – minimum side length as fraction of idealLength + +// --------------------------------------------------------------------------- +// Boundary patterns (multilingual) +// --------------------------------------------------------------------------- + +/** + * L1 — Sentence-ending boundaries. + * + * Latin marks (. ! ?) require following whitespace or end-of-string to avoid + * splitting numbers ("3.14") and abbreviations. A negative lookbehind excludes + * digits immediately before ".". + * + * Script-specific marks (Devanagari, CJK, Arabic, Urdu, Ethiopic, Myanmar, + * Khmer, Tibetan, Armenian, Balinese, full-width variants) match standalone + * since they are unambiguous sentence terminators even without trailing space. + */ +const L1_RE = /(?:(? 0 && nextWord < text.length) { + points.push(nextWord); + } + } + return points; +} + +/** + * From a set of candidate split points, pick the one closest to `mid` that + * keeps both resulting sides at least `minSide` characters long. + * Returns the chosen point, or null if none qualifies. + */ +function pickBestPoint( + points: number[], + mid: number, + textLength: number, + minSide: number +): number | null { + if (points.length === 0) return null; + + const sorted = [...points].sort( + (a, b) => Math.abs(a - mid) - Math.abs(b - mid) + ); + + for (const p of sorted) { + if (p >= minSide && textLength - p >= minSide) { + return p; + } + } + return null; +} + +/** + * Core recursive splitter. 
Operates on character offsets within `fullText`. + */ +function splitRecursive( + fullText: string, + start: number, + end: number, + idealLength: number +): TextRange[] { + const length = end - start; + const minSide = idealLength * MIN_SIDE_RATIO; + + // Below the split threshold — keep as-is + if (length <= idealLength * THRESHOLD_SPLIT) { + return [{ start, end }]; + } + + const segText = fullText.slice(start, end); + const mid = Math.floor(segText.length / 2); + + // --- L1: sentence boundaries --- + const l1Points = findSplitPoints(segText, L1_RE); + const l1Pick = pickBestPoint(l1Points, mid, segText.length, minSide); + if (l1Pick !== null) { + const g = start + l1Pick; + return [ + ...splitRecursive(fullText, start, g, idealLength), + ...splitRecursive(fullText, g, end, idealLength), + ]; + } + + // --- L2: sub-sentence boundaries (only if long enough) --- + if (length > idealLength * THRESHOLD_L2) { + const l2Points = findSplitPoints(segText, L2_RE); + const l2Pick = pickBestPoint(l2Points, mid, segText.length, minSide); + if (l2Pick !== null) { + const g = start + l2Pick; + return [ + ...splitRecursive(fullText, start, g, idealLength), + ...splitRecursive(fullText, g, end, idealLength), + ]; + } + } + + // --- L3: whitespace (only if very long) --- + if (length > idealLength * THRESHOLD_L3) { + const l3Points = findSplitPoints(segText, L3_RE); + const l3Pick = pickBestPoint(l3Points, mid, segText.length, minSide); + if (l3Pick !== null) { + const g = start + l3Pick; + return [ + ...splitRecursive(fullText, start, g, idealLength), + ...splitRecursive(fullText, g, end, idealLength), + ]; + } + } + + // Cannot split further + return [{ start, end }]; +} + +// --------------------------------------------------------------------------- +// Public API +// --------------------------------------------------------------------------- + +/** + * Split `text` into character ranges, each roughly `idealLength` characters. 
+ * + * Returns a single-element array when no split is needed. + * Set `idealLength` to 0 to disable splitting entirely. + */ +export function splitTextIntoRanges( + text: string, + idealLength: number = DEFAULT_IDEAL_CELL_LENGTH +): TextRange[] { + if (!text || idealLength <= 0 || text.length <= idealLength * THRESHOLD_SPLIT) { + return [{ start: 0, end: text.length }]; + } + return splitRecursive(text, 0, text.length, idealLength); +} + +/** + * Convenience wrapper that returns the actual substrings. + */ +export function splitText( + text: string, + idealLength: number = DEFAULT_IDEAL_CELL_LENGTH +): string[] { + return splitTextIntoRanges(text, idealLength) + .map(r => text.slice(r.start, r.end)); +} From 359d8b0fd3bc91f375da338c89a33ba4a93291ee Mon Sep 17 00:00:00 2001 From: Luke-Bilhorn Date: Fri, 17 Apr 2026 15:14:40 -0500 Subject: [PATCH 2/6] Added notes on splitting choices. --- .../NewSourceUploader/utils/textSplitter.ts | 20 +++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/webviews/codex-webviews/src/NewSourceUploader/utils/textSplitter.ts b/webviews/codex-webviews/src/NewSourceUploader/utils/textSplitter.ts index 998394b03..b0768243d 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/utils/textSplitter.ts +++ b/webviews/codex-webviews/src/NewSourceUploader/utils/textSplitter.ts @@ -10,13 +10,18 @@ * and if so, finds the best boundary closest to the midpoint: * * L1 (sentence boundaries) — tried when length > N * THRESHOLD_SPLIT - * L2 (sub-sentence breaks) — tried when length > N * THRESHOLD_L2 - * L3 (whitespace) — tried when length > N * THRESHOLD_L3 + * L2 (sub-sentence breaks) — tried when there is no suitable L1 boundary and length > N * THRESHOLD_L2 + * L3 (whitespace) — tried when there is no suitable L1 or L2 boundary and length > N * THRESHOLD_L3 * * Each split is rejected if it would leave either side shorter than N * MIN_SIDE_RATIO. * After a successful split, both halves are recursively re-evaluated. 
*/ + +/** + * 160 seems to be a good default for English, but depending on the language and how it is encoded, it will surely vary. + * Thus, I've made it adjustable to the user. + */ export const DEFAULT_IDEAL_CELL_LENGTH = 160; // --------------------------------------------------------------------------- @@ -170,6 +175,17 @@ function splitRecursive( ]; } } + /** + * Really, I'd like to implement some much more sophisticated techniques + * for splitting L3. Whitespace is a last resort, but it's not great. + * Short of making calls to an llm, a better approach might use stats to + * make decent guesses at where clauses start and end, for languages that + * use whitespace. + * + * For those that don't use whitespace (e.g. CJK, Thai, Khmer, Lao, Myanmar), + * even this wouldn't help at all. As it is, we should try to implement + * something to prefer splitting at word boundaries rather than within compounds. + */ // Cannot split further return [{ start, end }]; From ddb5734d38b957679026341885d129ebe66aef8c Mon Sep 17 00:00:00 2001 From: Luke-Bilhorn Date: Fri, 17 Apr 2026 16:16:13 -0500 Subject: [PATCH 3/6] Changed "targetCellLength" to "idealCellLength" --- .../NewSourceUploader/importers/docx/docxParser.ts | 2 +- .../NewSourceUploader/importers/docx/docxTypes.ts | 2 +- .../src/NewSourceUploader/importers/docx/index.ts | 14 +++++++------- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/docx/docxParser.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/docx/docxParser.ts index 544492962..e87a1487b 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/docx/docxParser.ts +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/docx/docxParser.ts @@ -75,7 +75,7 @@ export class DocxParser { extractTables: false, // TODO: Implement table support segmentationStrategy: 'paragraph', validateStructure: true, - targetCellLength: 160, + idealCellLength: 160, ...config, }; diff 
--git a/webviews/codex-webviews/src/NewSourceUploader/importers/docx/docxTypes.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/docx/docxTypes.ts index 68767e05c..919e1c1e6 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/docx/docxTypes.ts +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/docx/docxTypes.ts @@ -282,7 +282,7 @@ export interface DocxParseConfig { * boundaries (L1), then sub-sentence stops (L2), then whitespace (L3). * Defaults to 160. Set to 0 to disable splitting. */ - targetCellLength: number; + idealCellLength: number; } // Error types diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/docx/index.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/docx/index.ts index 861e1121d..e5d1f08b0 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/docx/index.ts +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/docx/index.ts @@ -78,9 +78,9 @@ export const validateFile = async (file: File): Promise => export const parseFile = async ( file: File, onProgress?: ProgressCallback, - options?: { targetCellLength?: number } + options?: { idealCellLength?: number } ): Promise => { - const targetCellLength = options?.targetCellLength ?? DEFAULT_IDEAL_CELL_LENGTH; + const idealCellLength = options?.idealCellLength ?? 
DEFAULT_IDEAL_CELL_LENGTH; try { onProgress?.(createProgress('Reading File', 'Reading DOCX file...', 10)); @@ -92,7 +92,7 @@ export const parseFile = async ( extractFootnotes: true, segmentationStrategy: 'paragraph', validateStructure: true, - targetCellLength, + idealCellLength, }); // Set up debug logging - pass through to progress callback @@ -110,7 +110,7 @@ export const parseFile = async ( onProgress?.(createProgress('Creating Cells', 'Converting paragraphs to cells...', 60)); // Convert document content to cells (paragraphs + table cells) - const cells = createCellsFromDocx(docxDoc, file.name, targetCellLength); + const cells = createCellsFromDocx(docxDoc, file.name, idealCellLength); onProgress?.(createProgress('Creating Notebooks', 'Creating source and codex notebooks...', 80)); @@ -209,14 +209,14 @@ export const parseFile = async ( /** * Convert DOCX paragraphs to Codex cells with complete metadata for round-trip. - * Paragraphs whose plain text exceeds targetCellLength are split into multiple + * Paragraphs whose plain text exceeds idealCellLength are split into multiple * cells (one per segment). Each segment's cell carries segmentIndex/segmentCount * so the exporter can recombine translations before writing them back. 
*/ const createCellsFromDocx = ( docxDoc: DocxDocument, fileName: string, - targetCellLength: number = DEFAULT_IDEAL_CELL_LENGTH + idealCellLength: number = DEFAULT_IDEAL_CELL_LENGTH ): any[] => { const cells: any[] = []; @@ -279,7 +279,7 @@ const createCellsFromDocx = ( const fullText = paragraph.runs.map((r) => r.content).join(''); if (!fullText.trim()) continue; - const ranges = splitTextIntoRanges(fullText, targetCellLength); + const ranges = splitTextIntoRanges(fullText, idealCellLength); const charRanges = buildRunCharRanges(paragraph.runs); const isMultiSegment = ranges.length > 1; From f375caf07d41a767d481a8b6f6448d6b2cd8d6ed Mon Sep 17 00:00:00 2001 From: Luke-Bilhorn Date: Mon, 20 Apr 2026 22:24:25 -0500 Subject: [PATCH 4/6] feat(docx): expose ideal cell length via Advanced Settings Adds an `advancedSettings` prop slot to UnifiedImporterForm, rendered as a collapsible panel below the file-selection card, and uses it from the DOCX importer to let the user override the ideal cell length used by the recursive paragraph splitter. Defaults to 160 characters. Ports the intent of the old experiment-layout "Made Ideal Segment Length Button visible" commit onto the new UnifiedImporterForm-based DOCX form. 
Made-with: Cursor --- .../components/UnifiedImporterForm.tsx | 31 +++++++++++++++ .../importers/docx/DocxImporterForm.tsx | 39 +++++++++++++++++-- 2 files changed, 67 insertions(+), 3 deletions(-) diff --git a/webviews/codex-webviews/src/NewSourceUploader/components/UnifiedImporterForm.tsx b/webviews/codex-webviews/src/NewSourceUploader/components/UnifiedImporterForm.tsx index 7d68226a7..cd6b0bc08 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/components/UnifiedImporterForm.tsx +++ b/webviews/codex-webviews/src/NewSourceUploader/components/UnifiedImporterForm.tsx @@ -17,6 +17,7 @@ import { Eye, BarChart3, AlertCircle, + ChevronDown, } from "lucide-react"; import { ImporterComponentProps, CellAligner, AlignedCell } from "../types/plugin"; import { NotebookPair, ImportProgress } from "../types/common"; @@ -82,6 +83,13 @@ export interface UnifiedImporterFormProps { * above the import button (e.g. timestamp corruption warnings). */ analyzeWarnings?: (files: File[]) => Promise; + + /** + * Optional custom controls rendered in a collapsible "Advanced Settings" + * section below the file-selection card. Importer-specific knobs + * (e.g. ideal cell length for DOCX splitting) can live here. + */ + advancedSettings?: React.ReactNode; } export const UnifiedImporterForm: React.FC = ({ @@ -99,7 +107,9 @@ export const UnifiedImporterForm: React.FC = ({ onSourceImportComplete, showEnforceStructure = false, analyzeWarnings, + advancedSettings, }) => { + const [showAdvanced, setShowAdvanced] = useState(false); const [files, setFiles] = useState([]); const [enforceStructure, setEnforceStructure] = useState(showEnforceStructure); const [previewContent, setPreviewContent] = useState(""); @@ -428,6 +438,27 @@ export const UnifiedImporterForm: React.FC = ({ + {/* Advanced Settings (optional, importer-specific) */} + {advancedSettings && ( +
+ + {showAdvanced && ( +
+ {advancedSettings} +
+ )} +
+ )} + {/* Enforce HTML Structure Checkbox */} {showEnforceStructure && hasFiles && ( = (props) => { + const [idealCellLength, setIdealCellLength] = useState(DEFAULT_IDEAL_CELL_LENGTH); + const analyzeFiles = useCallback(async (files: File[]): Promise => { const totalBytes = files.reduce((sum, f) => sum + f.size, 0); return [ @@ -38,7 +41,7 @@ export const DocxImporterForm: React.FC = (props) => { throw new Error(`${file.name}: ${validation.errors.join(", ")}`); } - const importResult = await parseFile(file, onProgress); + const importResult = await parseFile(file, onProgress, { idealCellLength }); if (!importResult.success || !importResult.notebookPair) { throw new Error(importResult.error || `Failed to parse ${file.name}`); } @@ -48,7 +51,36 @@ export const DocxImporterForm: React.FC = (props) => { return results.length === 1 ? results[0]! : results; }, - [] + [idealCellLength] + ); + + const advancedSettings = ( + <> +
+ + { + const v = parseInt(e.target.value, 10); + if (!isNaN(v) && v >= 0) setIdealCellLength(v); + }} + className="w-24 rounded-md border border-input bg-background px-3 py-1 text-sm shadow-sm focus-visible:outline-none focus-visible:ring-1 focus-visible:ring-ring disabled:opacity-50" + /> +
+

+ Long paragraphs are split into smaller cells at sentence boundaries. Set to 0 to + disable splitting. +

+ ); return ( @@ -65,6 +97,7 @@ export const DocxImporterForm: React.FC = (props) => { cellAligner={sequentialCellAligner} showPreview={false} showEnforceStructure + advancedSettings={advancedSettings} /> ); }; From fcc9531d23cea87aa8a8049fb68de258cb26f549 Mon Sep 17 00:00:00 2001 From: Luke-Bilhorn Date: Fri, 22 May 2026 00:20:32 -0500 Subject: [PATCH 5/6] feat(splitter): use Intl.Segmenter for locale-aware L1/L3 boundaries When a BCP-47 locale is supplied (e.g. derived from w:lang on DOCX runs), the recursive splitter now sources sentence (L1) and word (L3) split candidates from Intl.Segmenter, falling back to the existing regex when the locale is missing or unsupported. Sub-sentence stops (L2) remain regex-only because Intl has no clause granularity. Thresholds, midpoint preference, and min-side guard are unchanged. The DOCX importer picks each paragraph's dominant run-level lang (weighted by content length) and passes it through, giving correct word boundaries for scripts without space-separated words (Thai, Khmer, Lao, Myanmar, CJK) and smarter sentence detection. Uses an opaque SegmenterHandle type rather than Intl.Segmenter directly, so the file compiles under both the webview tsconfig (Vite) and the root tsconfig (webpack/ts-loader) which lacks ES2022.Intl. 
--- .../NewSourceUploader/importers/docx/index.ts | 36 ++++- .../NewSourceUploader/utils/textSplitter.ts | 146 ++++++++++++++---- 2 files changed, 155 insertions(+), 27 deletions(-) diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/docx/index.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/docx/index.ts index 875781705..9a0181457 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/docx/index.ts +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/docx/index.ts @@ -273,7 +273,8 @@ const createCellsFromDocx = ( const fullText = paragraph.runs.map((r) => r.content).join(''); if (!fullText.trim()) continue; - const ranges = splitTextIntoRanges(fullText, idealCellLength); + const locale = pickParagraphLocale(paragraph); + const ranges = splitTextIntoRanges(fullText, idealCellLength, locale); const charRanges = buildRunCharRanges(paragraph.runs); const isMultiSegment = ranges.length > 1; @@ -313,6 +314,39 @@ const createCellsFromDocx = ( // Run-slicing helpers (DOCX-specific, used to map text ranges back to runs) // --------------------------------------------------------------------------- +/** + * Pick the dominant BCP-47 locale for a paragraph by tallying the non-empty + * w:lang values across its runs, weighted by run content length so a one-word + * code-switched span doesn't outvote the rest of the paragraph. Ties are + * broken by first appearance. Returns undefined when no run carries a lang + * attribute, which makes the splitter fall back to its regex path. + */ +const pickParagraphLocale = (paragraph: DocxParagraph): string | undefined => { + const weights = new Map(); + const firstSeen = new Map(); + for (let i = 0; i < paragraph.runs.length; i++) { + const lang = paragraph.runs[i].runProperties?.lang; + if (!lang) continue; + const weight = paragraph.runs[i].content.length || 1; + weights.set(lang, (weights.get(lang) ?? 
0) + weight); + if (!firstSeen.has(lang)) firstSeen.set(lang, i); + } + if (weights.size === 0) return undefined; + + let best: string | undefined; + let bestWeight = -1; + for (const [lang, w] of weights) { + if ( + w > bestWeight || + (w === bestWeight && (firstSeen.get(lang) ?? Infinity) < (firstSeen.get(best ?? '') ?? Infinity)) + ) { + best = lang; + bestWeight = w; + } + } + return best; +}; + interface RunCharRange { run: DocxRun; charStart: number; diff --git a/webviews/codex-webviews/src/NewSourceUploader/utils/textSplitter.ts b/webviews/codex-webviews/src/NewSourceUploader/utils/textSplitter.ts index b0768243d..e8b7daa77 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/utils/textSplitter.ts +++ b/webviews/codex-webviews/src/NewSourceUploader/utils/textSplitter.ts @@ -11,10 +11,21 @@ * * L1 (sentence boundaries) — tried when length > N * THRESHOLD_SPLIT * L2 (sub-sentence breaks) — tried when there is no suitable L1 boundary and length > N * THRESHOLD_L2 - * L3 (whitespace) — tried when there is no suitable L1 or L2 boundary and length > N * THRESHOLD_L3 + * L3 (whitespace / word) — tried when there is no suitable L1 or L2 boundary and length > N * THRESHOLD_L3 * * Each split is rejected if it would leave either side shorter than N * MIN_SIDE_RATIO. * After a successful split, both halves are recursively re-evaluated. + * + * Locale-aware mode: + * When the caller passes a BCP-47 `locale` (e.g. "th", "ja", "fa") AND the + * runtime supports `Intl.Segmenter` for it, L1 sources its candidate + * boundaries from `Intl.Segmenter(locale, { granularity: 'sentence' })` + * and L3 from `granularity: 'word'`. This handles abbreviations + * ("Mr. Smith") correctly at L1 and produces real word boundaries for + * scripts without word-spacing (Thai, Khmer, Lao, Myanmar, CJK) at L3. + * L2 has no Intl equivalent and always uses the regex below. 
+ * If the locale is missing or unsupported, every tier falls back to regex + * and behavior is byte-identical to the locale-less call. */ @@ -76,6 +87,68 @@ export interface TextRange { // Internal helpers (defined before use) // --------------------------------------------------------------------------- +/** + * Opaque handle for a constructed segmenter. Typed loosely so this file + * compiles under tsconfigs that lack `ES2022.Intl` (the root webpack build + * pulls this module in via a test that cross-imports from the webview tree). + */ +type SegmenterHandle = { + segment(input: string): Iterable<{ segment: string; index: number; isWordLike?: boolean }>; +}; + +/** + * Try to construct an `Intl.Segmenter` for the given locale and granularity. + * Returns null when no locale was supplied, when the runtime lacks the API, + * or when the locale is unrecognized (RangeError). + */ +function tryCreateSegmenter( + locale: string | undefined, + granularity: 'sentence' | 'word' +): SegmenterHandle | null { + if (!locale) return null; + const IntlAny = Intl as Record; + if (typeof IntlAny.Segmenter !== 'function') return null; + try { + // eslint-disable-next-line @typescript-eslint/no-explicit-any + return new (IntlAny.Segmenter as any)(locale, { granularity }) as SegmenterHandle; + } catch { + return null; + } +} + +/** + * Boundary positions between sentences in `text`, derived from a locale-aware + * sentence segmenter. Returns the start index of every sentence after the + * first (each `seg.index` already accounts for trailing whitespace of the + * previous sentence, matching the regex-based `findSplitPoints` semantics). 
+ */ +function findSentenceSplitPointsIntl(text: string, segmenter: SegmenterHandle): number[] { + const points: number[] = []; + for (const seg of segmenter.segment(text)) { + if (seg.index > 0 && seg.index < text.length) { + points.push(seg.index); + } + } + return points; +} + +/** + * Boundary positions between words in `text`, derived from a locale-aware + * word segmenter. Returns the start index of every word-like segment after + * position 0 — this is the locale-aware analogue of "split at whitespace" + * and works for scripts (Thai, Khmer, Lao, Myanmar, CJK) where words are + * not space-separated. + */ +function findWordSplitPointsIntl(text: string, segmenter: SegmenterHandle): number[] { + const points: number[] = []; + for (const seg of segmenter.segment(text)) { + if (seg.isWordLike && seg.index > 0 && seg.index < text.length) { + points.push(seg.index); + } + } + return points; +} + /** * Find all character positions where a new segment could BEGIN after a * boundary match (i.e. right after the punctuation + any trailing spaces). @@ -119,6 +192,17 @@ function pickBestPoint( return null; } +/** + * Per-invocation context passed through recursion. The two segmenter slots + * are constructed once at the public entry point so we don't re-construct + * them at every recursive call. + */ +interface SplitContext { + idealLength: number; + sentenceSegmenter: SegmenterHandle | null; + wordSegmenter: SegmenterHandle | null; +} + /** * Core recursive splitter. Operates on character offsets within `fullText`. 
*/ @@ -126,8 +210,9 @@ function splitRecursive( fullText: string, start: number, end: number, - idealLength: number + ctx: SplitContext ): TextRange[] { + const { idealLength, sentenceSegmenter, wordSegmenter } = ctx; const length = end - start; const minSide = idealLength * MIN_SIDE_RATIO; @@ -140,52 +225,49 @@ function splitRecursive( const mid = Math.floor(segText.length / 2); // --- L1: sentence boundaries --- - const l1Points = findSplitPoints(segText, L1_RE); + // Locale-aware when a sentence segmenter is available; regex otherwise. + const l1Points = sentenceSegmenter + ? findSentenceSplitPointsIntl(segText, sentenceSegmenter) + : findSplitPoints(segText, L1_RE); const l1Pick = pickBestPoint(l1Points, mid, segText.length, minSide); if (l1Pick !== null) { const g = start + l1Pick; return [ - ...splitRecursive(fullText, start, g, idealLength), - ...splitRecursive(fullText, g, end, idealLength), + ...splitRecursive(fullText, start, g, ctx), + ...splitRecursive(fullText, g, end, ctx), ]; } // --- L2: sub-sentence boundaries (only if long enough) --- + // No Intl equivalent for clause-level boundaries — regex always. if (length > idealLength * THRESHOLD_L2) { const l2Points = findSplitPoints(segText, L2_RE); const l2Pick = pickBestPoint(l2Points, mid, segText.length, minSide); if (l2Pick !== null) { const g = start + l2Pick; return [ - ...splitRecursive(fullText, start, g, idealLength), - ...splitRecursive(fullText, g, end, idealLength), + ...splitRecursive(fullText, start, g, ctx), + ...splitRecursive(fullText, g, end, ctx), ]; } } - // --- L3: whitespace (only if very long) --- + // --- L3: word boundaries (only if very long) --- + // Locale-aware when a word segmenter is available — essential for + // scripts without space-separated words. Falls back to whitespace regex. if (length > idealLength * THRESHOLD_L3) { - const l3Points = findSplitPoints(segText, L3_RE); + const l3Points = wordSegmenter + ? 
findWordSplitPointsIntl(segText, wordSegmenter) + : findSplitPoints(segText, L3_RE); const l3Pick = pickBestPoint(l3Points, mid, segText.length, minSide); if (l3Pick !== null) { const g = start + l3Pick; return [ - ...splitRecursive(fullText, start, g, idealLength), - ...splitRecursive(fullText, g, end, idealLength), + ...splitRecursive(fullText, start, g, ctx), + ...splitRecursive(fullText, g, end, ctx), ]; } } - /** - * Really, I'd like to implement some much more sophisticated techniques - * for splitting L3. Whitespace is a last resort, but it's not great. - * Short of making calls to an llm, a better approach might use stats to - * make decent guesses at where clauses start and end, for languages that - * use whitespace. - * - * For those that don't use whitespace (e.g. CJK, Thai, Khmer, Lao, Myanmar), - * even this wouldn't help at all. As it is, we should try to implement - * something to prefer splitting at word boundaries rather than within compounds. - */ // Cannot split further return [{ start, end }]; @@ -200,15 +282,26 @@ function splitRecursive( * * Returns a single-element array when no split is needed. * Set `idealLength` to 0 to disable splitting entirely. + * + * @param locale Optional BCP-47 tag (e.g. "th", "ja", "fa-IR"). When supplied + * and supported by the runtime's `Intl.Segmenter`, L1 (sentence) and L3 + * (word) boundaries are derived from the segmenter instead of regex. + * Unknown or unsupported tags transparently fall back to regex. 
*/ export function splitTextIntoRanges( text: string, - idealLength: number = DEFAULT_IDEAL_CELL_LENGTH + idealLength: number = DEFAULT_IDEAL_CELL_LENGTH, + locale?: string ): TextRange[] { if (!text || idealLength <= 0 || text.length <= idealLength * THRESHOLD_SPLIT) { return [{ start: 0, end: text.length }]; } - return splitRecursive(text, 0, text.length, idealLength); + const ctx: SplitContext = { + idealLength, + sentenceSegmenter: tryCreateSegmenter(locale, 'sentence'), + wordSegmenter: tryCreateSegmenter(locale, 'word'), + }; + return splitRecursive(text, 0, text.length, ctx); } /** @@ -216,8 +309,9 @@ export function splitTextIntoRanges( */ export function splitText( text: string, - idealLength: number = DEFAULT_IDEAL_CELL_LENGTH + idealLength: number = DEFAULT_IDEAL_CELL_LENGTH, + locale?: string ): string[] { - return splitTextIntoRanges(text, idealLength) + return splitTextIntoRanges(text, idealLength, locale) .map(r => text.slice(r.start, r.end)); } From c1db5ee1b2b82106080dd9a8bc26dbed9592c1c1 Mon Sep 17 00:00:00 2001 From: Luke-Bilhorn Date: Sun, 24 May 2026 22:54:36 -0500 Subject: [PATCH 6/6] feat(docx import): use project source language as Intl.Segmenter locale Replaces the previous w:lang sniffing in the DOCX importer with the project's source language tag (from metadata.json), threaded through the wizard context. This gives Intl.Segmenter a stable, predictable locale for L1/L3 splitting that does not depend on whether the source DOCX (e.g. Google Docs exports) carries w:lang attributes. - Provider now reads metadata.json and includes sourceLanguageTag in the projectInventory message. - WizardContext / WizardState carry sourceLanguageTag through to importer components. - DocxImporterForm reads it from wizardContext and passes it as a locale option to parseFile. - parseFile / createCellsFromDocx accept the locale and forward it to splitTextIntoRanges; pickParagraphLocale (the w:lang tally helper) is removed. 
--- .../NewSourceUploaderProvider.ts | 19 +++++++++ .../NewSourceUploader/NewSourceUploader.tsx | 3 ++ .../importers/docx/DocxImporterForm.tsx | 5 ++- .../NewSourceUploader/importers/docx/index.ts | 42 +++---------------- .../src/NewSourceUploader/types/wizard.ts | 4 ++ 5 files changed, 34 insertions(+), 39 deletions(-) diff --git a/src/providers/NewSourceUploader/NewSourceUploaderProvider.ts b/src/providers/NewSourceUploader/NewSourceUploaderProvider.ts index 5668f2923..147653fc5 100644 --- a/src/providers/NewSourceUploader/NewSourceUploaderProvider.ts +++ b/src/providers/NewSourceUploader/NewSourceUploaderProvider.ts @@ -129,10 +129,13 @@ export class NewSourceUploaderProvider implements vscode.CustomTextEditorProvide const intentMatch = uriQuery.match(/intent=(source|target)/); const initialIntent = intentMatch ? intentMatch[1] : undefined; + const sourceLanguageTag = await this.getSourceLanguageTag(); + webviewPanel.webview.postMessage({ command: "projectInventory", inventory: inventory, initialIntent, + sourceLanguageTag, }); } else if (message.command === "metadata.check") { // Handle metadata check request @@ -2000,6 +2003,22 @@ export class NewSourceUploaderProvider implements vscode.CustomTextEditorProvide return confirmOverwriteWithDetails(items); } + private async getSourceLanguageTag(): Promise { + try { + const workspaceFolders = vscode.workspace.workspaceFolders; + if (!workspaceFolders?.length) return undefined; + const metadataUri = vscode.Uri.joinPath(workspaceFolders[0].uri, "metadata.json"); + const raw = await vscode.workspace.fs.readFile(metadataUri); + const metadata = JSON.parse(raw.toString()); + const sourceLang = metadata.languages?.find( + (l: { projectStatus?: string }) => l.projectStatus === "source" + ); + return sourceLang?.tag as string | undefined; + } catch { + return undefined; + } + } + private async fetchProjectInventory(): Promise<{ sourceFiles: Array<{ name: string; diff --git 
a/webviews/codex-webviews/src/NewSourceUploader/NewSourceUploader.tsx b/webviews/codex-webviews/src/NewSourceUploader/NewSourceUploader.tsx index a4759e3ef..84a5c50cb 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/NewSourceUploader.tsx +++ b/webviews/codex-webviews/src/NewSourceUploader/NewSourceUploader.tsx @@ -291,6 +291,7 @@ const NewSourceUploader: React.FC = () => { targetFiles: [], translationPairs: [], }; + const sourceLanguageTag: string | undefined = message.sourceLanguageTag; const initialIntent: ImportIntent | undefined = message.initialIntent; @@ -300,6 +301,7 @@ const NewSourceUploader: React.FC = () => { const base = { ...prev, projectInventory: inventory, + sourceLanguageTag: sourceLanguageTag ?? prev.sourceLanguageTag, isLoadingInventory: false, }; @@ -628,6 +630,7 @@ const NewSourceUploader: React.FC = () => { selectedSource: wizardState.selectedSourceForTarget, selectedSourceDetails: wizardState.selectedSourceDetails, projectInventory: wizardState.projectInventory, + sourceLanguageTag: wizardState.sourceLanguageTag, }; // For target imports, we need detailed source info and should use translation completion diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/docx/DocxImporterForm.tsx b/webviews/codex-webviews/src/NewSourceUploader/importers/docx/DocxImporterForm.tsx index 96d936899..b33c3af3e 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/docx/DocxImporterForm.tsx +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/docx/DocxImporterForm.tsx @@ -8,6 +8,7 @@ import { DEFAULT_IDEAL_CELL_LENGTH } from "../../utils/textSplitter"; export const DocxImporterForm: React.FC = (props) => { const [idealCellLength, setIdealCellLength] = useState(DEFAULT_IDEAL_CELL_LENGTH); + const locale = props.wizardContext?.sourceLanguageTag; const analyzeFiles = useCallback(async (files: File[]): Promise => { const totalBytes = files.reduce((sum, f) => sum + f.size, 0); @@ -41,7 +42,7 @@ export const 
DocxImporterForm: React.FC = (props) => { throw new Error(`${file.name}: ${validation.errors.join(", ")}`); } - const importResult = await parseFile(file, onProgress, { idealCellLength }); + const importResult = await parseFile(file, onProgress, { idealCellLength, locale }); if (!importResult.success || !importResult.notebookPair) { throw new Error(importResult.error || `Failed to parse ${file.name}`); } @@ -51,7 +52,7 @@ export const DocxImporterForm: React.FC = (props) => { return results.length === 1 ? results[0]! : results; }, - [idealCellLength] + [idealCellLength, locale] ); const advancedSettings = ( diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/docx/index.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/docx/index.ts index 9a0181457..a589164ba 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/docx/index.ts +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/docx/index.ts @@ -79,9 +79,10 @@ export const validateFile = async (file: File): Promise => export const parseFile = async ( file: File, onProgress?: ProgressCallback, - options?: { idealCellLength?: number } + options?: { idealCellLength?: number; locale?: string } ): Promise => { const idealCellLength = options?.idealCellLength ?? 
DEFAULT_IDEAL_CELL_LENGTH; + const locale = options?.locale; try { onProgress?.(createProgress('Reading File', 'Reading DOCX file...', 10)); @@ -111,7 +112,7 @@ export const parseFile = async ( onProgress?.(createProgress('Creating Cells', 'Converting paragraphs to cells...', 60)); // Convert document content to cells (paragraphs + table cells) - const cells = createCellsFromDocx(docxDoc, file.name, idealCellLength); + const cells = createCellsFromDocx(docxDoc, file.name, idealCellLength, locale); onProgress?.(createProgress('Creating Notebooks', 'Creating source and codex notebooks...', 80)); @@ -210,7 +211,8 @@ export const parseFile = async ( const createCellsFromDocx = ( docxDoc: DocxDocument, fileName: string, - idealCellLength: number = DEFAULT_IDEAL_CELL_LENGTH + idealCellLength: number = DEFAULT_IDEAL_CELL_LENGTH, + locale?: string ): any[] => { const cells: any[] = []; @@ -273,7 +275,6 @@ const createCellsFromDocx = ( const fullText = paragraph.runs.map((r) => r.content).join(''); if (!fullText.trim()) continue; - const locale = pickParagraphLocale(paragraph); const ranges = splitTextIntoRanges(fullText, idealCellLength, locale); const charRanges = buildRunCharRanges(paragraph.runs); const isMultiSegment = ranges.length > 1; @@ -314,39 +315,6 @@ const createCellsFromDocx = ( // Run-slicing helpers (DOCX-specific, used to map text ranges back to runs) // --------------------------------------------------------------------------- -/** - * Pick the dominant BCP-47 locale for a paragraph by tallying the non-empty - * w:lang values across its runs, weighted by run content length so a one-word - * code-switched span doesn't outvote the rest of the paragraph. Ties are - * broken by first appearance. Returns undefined when no run carries a lang - * attribute, which makes the splitter fall back to its regex path. 
- */ -const pickParagraphLocale = (paragraph: DocxParagraph): string | undefined => { - const weights = new Map(); - const firstSeen = new Map(); - for (let i = 0; i < paragraph.runs.length; i++) { - const lang = paragraph.runs[i].runProperties?.lang; - if (!lang) continue; - const weight = paragraph.runs[i].content.length || 1; - weights.set(lang, (weights.get(lang) ?? 0) + weight); - if (!firstSeen.has(lang)) firstSeen.set(lang, i); - } - if (weights.size === 0) return undefined; - - let best: string | undefined; - let bestWeight = -1; - for (const [lang, w] of weights) { - if ( - w > bestWeight || - (w === bestWeight && (firstSeen.get(lang) ?? Infinity) < (firstSeen.get(best ?? '') ?? Infinity)) - ) { - best = lang; - bestWeight = w; - } - } - return best; -}; - interface RunCharRange { run: DocxRun; charStart: number; diff --git a/webviews/codex-webviews/src/NewSourceUploader/types/wizard.ts b/webviews/codex-webviews/src/NewSourceUploader/types/wizard.ts index 36d48d8c5..51bd0c945 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/types/wizard.ts +++ b/webviews/codex-webviews/src/NewSourceUploader/types/wizard.ts @@ -63,6 +63,8 @@ export interface WizardState { selectedSourceDetails?: DetailedFileInfo; selectedPlugin?: string; projectInventory: ProjectInventory; + /** BCP-47 tag of the project's source language (from metadata.json). */ + sourceLanguageTag?: string; isLoadingInventory: boolean; isLoadingFileDetails: boolean; fileDetailsError?: string; @@ -82,6 +84,8 @@ export interface WizardContext { selectedSource?: BasicFileInfo; selectedSourceDetails?: DetailedFileInfo; projectInventory: ProjectInventory; + /** BCP-47 tag of the project's source language (from metadata.json). */ + sourceLanguageTag?: string; } /**