diff --git a/src/providers/NewSourceUploader/NewSourceUploaderProvider.ts b/src/providers/NewSourceUploader/NewSourceUploaderProvider.ts index 5668f2923..147653fc5 100644 --- a/src/providers/NewSourceUploader/NewSourceUploaderProvider.ts +++ b/src/providers/NewSourceUploader/NewSourceUploaderProvider.ts @@ -129,10 +129,13 @@ export class NewSourceUploaderProvider implements vscode.CustomTextEditorProvide const intentMatch = uriQuery.match(/intent=(source|target)/); const initialIntent = intentMatch ? intentMatch[1] : undefined; + const sourceLanguageTag = await this.getSourceLanguageTag(); + webviewPanel.webview.postMessage({ command: "projectInventory", inventory: inventory, initialIntent, + sourceLanguageTag, }); } else if (message.command === "metadata.check") { // Handle metadata check request @@ -2000,6 +2003,22 @@ export class NewSourceUploaderProvider implements vscode.CustomTextEditorProvide return confirmOverwriteWithDetails(items); } + private async getSourceLanguageTag(): Promise { + try { + const workspaceFolders = vscode.workspace.workspaceFolders; + if (!workspaceFolders?.length) return undefined; + const metadataUri = vscode.Uri.joinPath(workspaceFolders[0].uri, "metadata.json"); + const raw = await vscode.workspace.fs.readFile(metadataUri); + const metadata = JSON.parse(raw.toString()); + const sourceLang = metadata.languages?.find( + (l: { projectStatus?: string }) => l.projectStatus === "source" + ); + return sourceLang?.tag as string | undefined; + } catch { + return undefined; + } + } + private async fetchProjectInventory(): Promise<{ sourceFiles: Array<{ name: string; diff --git a/webviews/codex-webviews/src/NewSourceUploader/NewSourceUploader.tsx b/webviews/codex-webviews/src/NewSourceUploader/NewSourceUploader.tsx index a4759e3ef..84a5c50cb 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/NewSourceUploader.tsx +++ b/webviews/codex-webviews/src/NewSourceUploader/NewSourceUploader.tsx @@ -291,6 +291,7 @@ const NewSourceUploader: React.FC = () => { targetFiles: [], translationPairs: [], }; + const sourceLanguageTag: string | undefined = message.sourceLanguageTag; const initialIntent: ImportIntent | undefined = message.initialIntent; @@ -300,6 +301,7 @@ const NewSourceUploader: React.FC = () => { const base = { ...prev, projectInventory: inventory, + sourceLanguageTag: sourceLanguageTag ?? prev.sourceLanguageTag, isLoadingInventory: false, }; @@ -628,6 +630,7 @@ const NewSourceUploader: React.FC = () => { selectedSource: wizardState.selectedSourceForTarget, selectedSourceDetails: wizardState.selectedSourceDetails, projectInventory: wizardState.projectInventory, + sourceLanguageTag: wizardState.sourceLanguageTag, }; // For target imports, we need detailed source info and should use translation completion diff --git a/webviews/codex-webviews/src/NewSourceUploader/components/UnifiedImporterForm.tsx b/webviews/codex-webviews/src/NewSourceUploader/components/UnifiedImporterForm.tsx index 7d68226a7..cd6b0bc08 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/components/UnifiedImporterForm.tsx +++ b/webviews/codex-webviews/src/NewSourceUploader/components/UnifiedImporterForm.tsx @@ -17,6 +17,7 @@ import { Eye, BarChart3, AlertCircle, + ChevronDown, } from "lucide-react"; import { ImporterComponentProps, CellAligner, AlignedCell } from "../types/plugin"; import { NotebookPair, ImportProgress } from "../types/common"; @@ -82,6 +83,13 @@ export interface UnifiedImporterFormProps { * above the import button (e.g. timestamp corruption warnings). */ analyzeWarnings?: (files: File[]) => Promise; + + /** + * Optional custom controls rendered in a collapsible "Advanced Settings" + * section below the file-selection card. Importer-specific knobs + * (e.g. ideal cell length for DOCX splitting) can live here. + */ + advancedSettings?: React.ReactNode; } export const UnifiedImporterForm: React.FC = ({ @@ -99,7 +107,9 @@ export const UnifiedImporterForm: React.FC = ({ onSourceImportComplete, showEnforceStructure = false, analyzeWarnings, + advancedSettings, }) => { + const [showAdvanced, setShowAdvanced] = useState(false); const [files, setFiles] = useState([]); const [enforceStructure, setEnforceStructure] = useState(showEnforceStructure); const [previewContent, setPreviewContent] = useState(""); @@ -428,6 +438,27 @@ export const UnifiedImporterForm: React.FC = ({ + {/* Advanced Settings (optional, importer-specific) */} + {advancedSettings && ( +
+ + {showAdvanced && ( +
+ {advancedSettings} +
+ )} +
+ )} + {/* Enforce HTML Structure Checkbox */} {showEnforceStructure && hasFiles && ( = (props) => { + const [idealCellLength, setIdealCellLength] = useState(DEFAULT_IDEAL_CELL_LENGTH); + const locale = props.wizardContext?.sourceLanguageTag; + const analyzeFiles = useCallback(async (files: File[]): Promise => { const totalBytes = files.reduce((sum, f) => sum + f.size, 0); return [ @@ -38,7 +42,7 @@ export const DocxImporterForm: React.FC = (props) => { throw new Error(`${file.name}: ${validation.errors.join(", ")}`); } - const importResult = await parseFile(file, onProgress); + const importResult = await parseFile(file, onProgress, { idealCellLength, locale }); if (!importResult.success || !importResult.notebookPair) { throw new Error(importResult.error || `Failed to parse ${file.name}`); } @@ -48,7 +52,36 @@ export const DocxImporterForm: React.FC = (props) => { return results.length === 1 ? results[0]! : results; }, - [] + [idealCellLength, locale] + ); + + const advancedSettings = ( + <> +
+ + { + const v = parseInt(e.target.value, 10); + if (!isNaN(v) && v >= 0) setIdealCellLength(v); + }} + className="w-24 rounded-md border border-input bg-background px-3 py-1 text-sm shadow-sm focus-visible:outline-none focus-visible:ring-1 focus-visible:ring-ring disabled:opacity-50" + /> +
+

+ Long paragraphs are split into smaller cells at sentence boundaries. Set to 0 to + disable splitting. +

+ ); return ( @@ -65,6 +98,7 @@ export const DocxImporterForm: React.FC = (props) => { cellAligner={sequentialCellAligner} showPreview={false} showEnforceStructure + advancedSettings={advancedSettings} /> ); }; diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/docx/cellMetadata.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/docx/cellMetadata.ts index 8de996c74..ec8ac6eaf 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/docx/cellMetadata.ts +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/docx/cellMetadata.ts @@ -19,6 +19,16 @@ export interface DocxCellMetadataParams { paragraph: DocxParagraph; docxDoc: DocxDocument; fileName: string; + /** + * When a paragraph is split into multiple cells, this is the 0-based index + * of this cell within that paragraph. Undefined for unsplit paragraphs. + */ + segmentIndex?: number; + /** + * Total number of cells this paragraph was split into. + * Undefined for unsplit paragraphs. + */ + segmentCount?: number; } /** @@ -26,7 +36,7 @@ export interface DocxCellMetadataParams { * Generates a UUID for the cell ID */ export function createDocxCellMetadata(params: DocxCellMetadataParams): { metadata: any; cellId: string; } { - const { paragraphId, paragraphIndex, originalContent, paragraph, docxDoc, fileName } = params; + const { paragraphId, paragraphIndex, originalContent, segmentIndex, segmentCount } = params; // Generate UUID for cell ID const cellId = uuidv4(); @@ -39,6 +49,10 @@ export function createDocxCellMetadata(params: DocxCellMetadataParams): { metada * * To keep `.source`/`.codex` small, we only persist what we need to map a Codex cell * back to a paragraph in `word/document.xml`. + * + * For split paragraphs, segmentIndex/segmentCount allow the exporter to + * recombine the translated segments in order before writing them back to + * the original . */ const cellMetadata = { id: cellId, @@ -47,6 +61,10 @@ export function createDocxCellMetadata(params: DocxCellMetadataParams): { metada paragraphId, paragraphIndex, + // Present only when the paragraph was split into multiple cells + ...(segmentIndex !== undefined && { segmentIndex }), + ...(segmentCount !== undefined && { segmentCount }), + // Data object for consistency with other importers data: { originalText: originalContent, diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/docx/docxExporter.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/docx/docxExporter.ts index 8404d3667..ce4540a12 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/docx/docxExporter.ts +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/docx/docxExporter.ts @@ -100,59 +100,82 @@ export async function exportDocxWithTranslations( } /** - * Collect translations from Codex cells + * Collect translations from Codex cells. + * + * Handles three cell shapes: + * 1. Table cells – one Codex cell maps to multiple DOCX paragraphs (paragraphIndices[]). + * 2. Split cells – one DOCX paragraph was split into N Codex cells (segmentIndex present). + * The per-segment translations are joined in order before writing to the . + * 3. Normal cells – one Codex cell ↔ one DOCX paragraph (paragraphIndex only). */ function collectTranslations( codexCells: Array<{ kind: number; value: string; metadata: any; }> ): Map { - const translations = new Map(); - console.log(`[Exporter] Processing ${codexCells.length} cells for translations`); - for (let i = 0; i < codexCells.length; i++) { - const cell = codexCells[i]; - const meta = cell.metadata; + // Accumulate per-paragraph segments: paragraphIndex → sorted list of {segmentIndex, text} + const segmentsByParagraph = new Map>(); + // Table cells bypass the segment system entirely + const tableTranslations = new Map(); - // Only DOCX cells have paragraphIndex/paragraphId; everything else is skipped naturally. - // (Don't rely on kind/type here; it varies by host and we only need the mapping fields.) + for (const cell of codexCells) { + const meta = cell.metadata; - // Get translated content (strip HTML tags) const translated = removeHtmlTags(cell.value).trim(); - if (!translated) { - continue; - } + if (!translated) continue; - // Get paragraph identifier const paragraphId = meta?.paragraphId; const paragraphIndex = meta?.paragraphIndex; const paragraphIndices = meta?.paragraphIndices; + const segmentIndex: number | undefined = meta?.segmentIndex; if (Array.isArray(paragraphIndices) && paragraphIndices.length > 0) { - // Table-cell case: a single Codex cell maps to multiple DOCX paragraphs. - // We map lines of the translation to each paragraph index (preserves paragraph count). + // Table-cell case: map lines of translation to each paragraph index. const parts = translated.split(/\r?\n/); for (let j = 0; j < paragraphIndices.length; j++) { const idx = paragraphIndices[j]; - if (typeof idx !== "number") continue; - translations.set(idx, parts[j] ?? ''); + if (typeof idx !== 'number') continue; + tableTranslations.set(idx, parts[j] ?? ''); } - } else if (typeof paragraphIndex === 'number') { - translations.set(paragraphIndex, translated); - // Keep logs light; large documents can have thousands of cells. + continue; + } + + // Resolve the paragraph index (numeric or from paragraphId string) + let paraIdx: number | undefined; + if (typeof paragraphIndex === 'number') { + paraIdx = paragraphIndex; } else if (typeof paragraphId === 'string') { const m = paragraphId.match(/^p-(\d+)$/); if (m) { - const idx = Number(m[1]); - translations.set(idx, translated); + paraIdx = Number(m[1]); } else { console.warn(`[Exporter] ⚠ Unrecognized paragraphId format: ${paragraphId}`); + continue; } + } else { + continue; } + + if (!segmentsByParagraph.has(paraIdx)) { + segmentsByParagraph.set(paraIdx, []); + } + segmentsByParagraph.get(paraIdx)!.push({ + // Unsplit paragraphs have no segmentIndex; treat them as the sole segment (index 0). + segmentIndex: segmentIndex ?? 0, + text: translated, + }); } - console.log(`[Exporter] Collected ${translations.size} translations total`); - // Avoid dumping thousands of IDs in logs. + // Build the final map: for split paragraphs, join segments in order. + const translations = new Map(tableTranslations); + + for (const [paraIdx, segments] of segmentsByParagraph) { + segments.sort((a, b) => a.segmentIndex - b.segmentIndex); + const combined = segments.map(s => s.text).join(' '); + translations.set(paraIdx, combined); + } + console.log(`[Exporter] Collected ${translations.size} translations total`); return translations; } diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/docx/docxParser.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/docx/docxParser.ts index 25ce28d27..e87a1487b 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/docx/docxParser.ts +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/docx/docxParser.ts @@ -75,6 +75,7 @@ export class DocxParser { extractTables: false, // TODO: Implement table support segmentationStrategy: 'paragraph', validateStructure: true, + idealCellLength: 160, ...config, }; diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/docx/docxTypes.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/docx/docxTypes.ts index 766dbaf9e..919e1c1e6 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/docx/docxTypes.ts +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/docx/docxTypes.ts @@ -276,6 +276,13 @@ export interface DocxParseConfig { extractTables: boolean; segmentationStrategy: 'paragraph' | 'sentence' | 'run'; validateStructure: boolean; + /** + * Ideal cell length in characters. + * Paragraphs longer than ~N*1.1 are recursively split at sentence + * boundaries (L1), then sub-sentence stops (L2), then whitespace (L3). + * Defaults to 160. Set to 0 to disable splitting. + */ + idealCellLength: number; } // Error types diff --git a/webviews/codex-webviews/src/NewSourceUploader/importers/docx/index.ts b/webviews/codex-webviews/src/NewSourceUploader/importers/docx/index.ts index 45665e4d2..a589164ba 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/importers/docx/index.ts +++ b/webviews/codex-webviews/src/NewSourceUploader/importers/docx/index.ts @@ -24,6 +24,8 @@ import { DocxParser } from './docxParser'; import type { DocxDocument, DocxParagraph, DocxRun } from './docxTypes'; import { createDocxCellMetadata, createDocxTableCellMetadata } from './cellMetadata'; import { extractTableCellParagraphGroups } from './utils/tableSegmentation'; +import { splitTextIntoRanges, DEFAULT_IDEAL_CELL_LENGTH } from '../../utils/textSplitter'; +import type { TextRange } from '../../utils/textSplitter'; const SUPPORTED_EXTENSIONS = ['docx']; /** @@ -76,8 +78,12 @@ export const validateFile = async (file: File): Promise => */ export const parseFile = async ( file: File, - onProgress?: ProgressCallback + onProgress?: ProgressCallback, + options?: { idealCellLength?: number; locale?: string } ): Promise => { + const idealCellLength = options?.idealCellLength ?? DEFAULT_IDEAL_CELL_LENGTH; + const locale = options?.locale; + try { onProgress?.(createProgress('Reading File', 'Reading DOCX file...', 10)); @@ -88,6 +94,7 @@ export const parseFile = async ( extractFootnotes: true, segmentationStrategy: 'paragraph', validateStructure: true, + idealCellLength, }); // Set up debug logging - pass through to progress callback @@ -105,7 +112,7 @@ export const parseFile = async ( onProgress?.(createProgress('Creating Cells', 'Converting paragraphs to cells...', 60)); // Convert document content to cells (paragraphs + table cells) - const cells = createCellsFromDocx(docxDoc, file.name); + const cells = createCellsFromDocx(docxDoc, file.name, idealCellLength, locale); onProgress?.(createProgress('Creating Notebooks', 'Creating source and codex notebooks...', 80)); @@ -196,9 +203,17 @@ export const parseFile = async ( }; /** - * Convert DOCX paragraphs to Codex cells with complete metadata for round-trip + * Convert DOCX paragraphs to Codex cells with complete metadata for round-trip. + * Paragraphs whose plain text exceeds idealCellLength are split into multiple + * cells (one per segment). Each segment's cell carries segmentIndex/segmentCount + * so the exporter can recombine translations before writing them back. */ -const createCellsFromDocx = (docxDoc: DocxDocument, fileName: string): any[] => { +const createCellsFromDocx = ( + docxDoc: DocxDocument, + fileName: string, + idealCellLength: number = DEFAULT_IDEAL_CELL_LENGTH, + locale?: string +): any[] => { const cells: any[] = []; // Group paragraph indices by (table cells), using XML order to match exporter indices. @@ -260,22 +275,33 @@ const createCellsFromDocx = (docxDoc: DocxDocument, fileName: string): any[] => const fullText = paragraph.runs.map((r) => r.content).join(''); if (!fullText.trim()) continue; - const htmlContent = convertParagraphToHtml(paragraph); - const { cellId, metadata: cellMetadata } = createDocxCellMetadata({ - paragraphId: paragraph.id, - paragraphIndex: paragraph.paragraphIndex, - originalContent: fullText, - paragraph, - docxDoc, - fileName, - }); + const ranges = splitTextIntoRanges(fullText, idealCellLength, locale); + const charRanges = buildRunCharRanges(paragraph.runs); + const isMultiSegment = ranges.length > 1; + + for (let segIdx = 0; segIdx < ranges.length; segIdx++) { + const segmentRuns = sliceRunsForRange(charRanges, ranges[segIdx]); + const segmentText = segmentRuns.map((r) => r.content).join(''); + + const htmlContent = convertRunGroupToHtml(segmentRuns, paragraph); + const { cellId, metadata: cellMetadata } = createDocxCellMetadata({ + paragraphId: paragraph.id, + paragraphIndex: paragraph.paragraphIndex, + originalContent: segmentText, + paragraph, + docxDoc, + fileName, + segmentIndex: isMultiSegment ? segIdx : undefined, + segmentCount: isMultiSegment ? ranges.length : undefined, + }); - cells.push( - createProcessedCell(cellId, htmlContent, { - ...cellMetadata, - type: 'text', - }) - ); + cells.push( + createProcessedCell(cellId, htmlContent, { + ...cellMetadata, + type: 'text', + }) + ); + } } console.log( @@ -285,13 +311,52 @@ const createCellsFromDocx = (docxDoc: DocxDocument, fileName: string): any[] => return cells; }; +// --------------------------------------------------------------------------- +// Run-slicing helpers (DOCX-specific, used to map text ranges back to runs) +// --------------------------------------------------------------------------- + +interface RunCharRange { + run: DocxRun; + charStart: number; + charEnd: number; +} + +const buildRunCharRanges = (runs: DocxRun[]): RunCharRange[] => { + const ranges: RunCharRange[] = []; + let pos = 0; + for (const run of runs) { + ranges.push({ run, charStart: pos, charEnd: pos + run.content.length }); + pos += run.content.length; + } + return ranges; +}; + +const sliceRunsForRange = (charRanges: RunCharRange[], range: TextRange): DocxRun[] => { + const result: DocxRun[] = []; + for (const { run, charStart, charEnd } of charRanges) { + if (charEnd <= range.start || charStart >= range.end) continue; + const localStart = Math.max(charStart, range.start) - charStart; + const localEnd = Math.min(charEnd, range.end) - charStart; + const slicedContent = run.content.slice(localStart, localEnd); + if (slicedContent.length === 0) continue; + result.push({ + ...run, + id: `${run.id}:${range.start}-${range.end}`, + content: slicedContent, + }); + } + return result; +}; + /** - * Convert a DOCX paragraph to HTML for display in Codex + * Convert a specific set of runs (a segment) to HTML, applying the parent + * paragraph's block-level properties (style, alignment, indentation, spacing). + * Used both for whole paragraphs and for sub-segments after splitting. */ -const convertParagraphToHtml = (paragraph: DocxParagraph): string => { +const convertRunGroupToHtml = (runs: DocxRun[], paragraph: DocxParagraph): string => { let html = ' { html += ` data-alignment="${paragraph.paragraphProperties.alignment}"`; } - // Add inline styles const styles: string[] = []; if (paragraph.paragraphProperties.alignment) { styles.push(`text-align: ${paragraph.paragraphProperties.alignment}`); @@ -323,16 +387,21 @@ const convertParagraphToHtml = (paragraph: DocxParagraph): string => { html += '>'; - // Add runs - for (const run of paragraph.runs) { + for (const run of runs) { html += convertRunToHtml(run); } html += '

'; - return html; }; +/** + * Convert a DOCX paragraph to HTML for display in Codex. + * Used by the table-cell path where no splitting is applied. + */ +const convertParagraphToHtml = (paragraph: DocxParagraph): string => + convertRunGroupToHtml(paragraph.runs, paragraph); + /** * Convert a DOCX run to HTML */ diff --git a/webviews/codex-webviews/src/NewSourceUploader/types/wizard.ts b/webviews/codex-webviews/src/NewSourceUploader/types/wizard.ts index 36d48d8c5..51bd0c945 100644 --- a/webviews/codex-webviews/src/NewSourceUploader/types/wizard.ts +++ b/webviews/codex-webviews/src/NewSourceUploader/types/wizard.ts @@ -63,6 +63,8 @@ export interface WizardState { selectedSourceDetails?: DetailedFileInfo; selectedPlugin?: string; projectInventory: ProjectInventory; + /** BCP-47 tag of the project's source language (from metadata.json). */ + sourceLanguageTag?: string; isLoadingInventory: boolean; isLoadingFileDetails: boolean; fileDetailsError?: string; @@ -82,6 +84,8 @@ export interface WizardContext { selectedSource?: BasicFileInfo; selectedSourceDetails?: DetailedFileInfo; projectInventory: ProjectInventory; + /** BCP-47 tag of the project's source language (from metadata.json). */ + sourceLanguageTag?: string; } /** diff --git a/webviews/codex-webviews/src/NewSourceUploader/utils/textSplitter.ts b/webviews/codex-webviews/src/NewSourceUploader/utils/textSplitter.ts new file mode 100644 index 000000000..e8b7daa77 --- /dev/null +++ b/webviews/codex-webviews/src/NewSourceUploader/utils/textSplitter.ts @@ -0,0 +1,317 @@ +/** + * Generic recursive text splitter for paragraph segmentation. + * + * Works on plain strings — no dependency on DOCX types, runs, or any importer. + * Any importer (DOCX, PDF, plain text, etc.) can use this to break long + * paragraphs into translator-friendly cell-sized chunks. + * + * Algorithm: + * Given an ideal cell length N, the splitter asks "is this segment too long?" + * and if so, finds the best boundary closest to the midpoint: + * + * L1 (sentence boundaries) — tried when length > N * THRESHOLD_SPLIT + * L2 (sub-sentence breaks) — tried when there is no suitable L1 boundary and length > N * THRESHOLD_L2 + * L3 (whitespace / word) — tried when there is no suitable L1 or L2 boundary and length > N * THRESHOLD_L3 + * + * Each split is rejected if it would leave either side shorter than N * MIN_SIDE_RATIO. + * After a successful split, both halves are recursively re-evaluated. + * + * Locale-aware mode: + * When the caller passes a BCP-47 `locale` (e.g. "th", "ja", "fa") AND the + * runtime supports `Intl.Segmenter` for it, L1 sources its candidate + * boundaries from `Intl.Segmenter(locale, { granularity: 'sentence' })` + * and L3 from `granularity: 'word'`. This handles abbreviations + * ("Mr. Smith") correctly at L1 and produces real word boundaries for + * scripts without word-spacing (Thai, Khmer, Lao, Myanmar, CJK) at L3. + * L2 has no Intl equivalent and always uses the regex below. + * If the locale is missing or unsupported, every tier falls back to regex + * and behavior is byte-identical to the locale-less call. + */ + + +/** + * 160 seems to be a good default for English, but depending on the language and how it is encoded, it will surely vary. + * Thus, I've made it adjustable to the user. + */ +export const DEFAULT_IDEAL_CELL_LENGTH = 160; + +// --------------------------------------------------------------------------- +// Threshold multipliers (applied to idealLength) +// --------------------------------------------------------------------------- + +const THRESHOLD_SPLIT = 1.1; // a – minimum length to attempt any split +const THRESHOLD_L2 = 1.5; // b – minimum length for L2 (sub-sentence) splits +const THRESHOLD_L3 = 2.4; // c – minimum length for L3 (whitespace) splits +const MIN_SIDE_RATIO = 0.3; // d – minimum side length as fraction of idealLength + +// --------------------------------------------------------------------------- +// Boundary patterns (multilingual) +// --------------------------------------------------------------------------- + +/** + * L1 — Sentence-ending boundaries. + * + * Latin marks (. ! ?) require following whitespace or end-of-string to avoid + * splitting numbers ("3.14") and abbreviations. A negative lookbehind excludes + * digits immediately before ".". + * + * Script-specific marks (Devanagari, CJK, Arabic, Urdu, Ethiopic, Myanmar, + * Khmer, Tibetan, Armenian, Balinese, full-width variants) match standalone + * since they are unambiguous sentence terminators even without trailing space. + */ +const L1_RE = /(?:(?; +}; + +/** + * Try to construct an `Intl.Segmenter` for the given locale and granularity. + * Returns null when no locale was supplied, when the runtime lacks the API, + * or when the locale is unrecognized (RangeError). + */ +function tryCreateSegmenter( + locale: string | undefined, + granularity: 'sentence' | 'word' +): SegmenterHandle | null { + if (!locale) return null; + const IntlAny = Intl as Record; + if (typeof IntlAny.Segmenter !== 'function') return null; + try { + // eslint-disable-next-line @typescript-eslint/no-explicit-any + return new (IntlAny.Segmenter as any)(locale, { granularity }) as SegmenterHandle; + } catch { + return null; + } +} + +/** + * Boundary positions between sentences in `text`, derived from a locale-aware + * sentence segmenter. Returns the start index of every sentence after the + * first (each `seg.index` already accounts for trailing whitespace of the + * previous sentence, matching the regex-based `findSplitPoints` semantics). + */ +function findSentenceSplitPointsIntl(text: string, segmenter: SegmenterHandle): number[] { + const points: number[] = []; + for (const seg of segmenter.segment(text)) { + if (seg.index > 0 && seg.index < text.length) { + points.push(seg.index); + } + } + return points; +} + +/** + * Boundary positions between words in `text`, derived from a locale-aware + * word segmenter. Returns the start index of every word-like segment after + * position 0 — this is the locale-aware analogue of "split at whitespace" + * and works for scripts (Thai, Khmer, Lao, Myanmar, CJK) where words are + * not space-separated. + */ +function findWordSplitPointsIntl(text: string, segmenter: SegmenterHandle): number[] { + const points: number[] = []; + for (const seg of segmenter.segment(text)) { + if (seg.isWordLike && seg.index > 0 && seg.index < text.length) { + points.push(seg.index); + } + } + return points; +} + +/** + * Find all character positions where a new segment could BEGIN after a + * boundary match (i.e. right after the punctuation + any trailing spaces). + */ +function findSplitPoints(text: string, re: RegExp): number[] { + const points: number[] = []; + const localRe = new RegExp(re.source, re.flags.includes('g') ? re.flags : re.flags + 'g'); + let m: RegExpExecArray | null; + while ((m = localRe.exec(text)) !== null) { + let nextWord = m.index + m[0].length; + while (nextWord < text.length && text[nextWord] === ' ') nextWord++; + if (nextWord > 0 && nextWord < text.length) { + points.push(nextWord); + } + } + return points; +} + +/** + * From a set of candidate split points, pick the one closest to `mid` that + * keeps both resulting sides at least `minSide` characters long. + * Returns the chosen point, or null if none qualifies. + */ +function pickBestPoint( + points: number[], + mid: number, + textLength: number, + minSide: number +): number | null { + if (points.length === 0) return null; + + const sorted = [...points].sort( + (a, b) => Math.abs(a - mid) - Math.abs(b - mid) + ); + + for (const p of sorted) { + if (p >= minSide && textLength - p >= minSide) { + return p; + } + } + return null; +} + +/** + * Per-invocation context passed through recursion. The two segmenter slots + * are constructed once at the public entry point so we don't re-construct + * them at every recursive call. + */ +interface SplitContext { + idealLength: number; + sentenceSegmenter: SegmenterHandle | null; + wordSegmenter: SegmenterHandle | null; +} + +/** + * Core recursive splitter. Operates on character offsets within `fullText`. + */ +function splitRecursive( + fullText: string, + start: number, + end: number, + ctx: SplitContext +): TextRange[] { + const { idealLength, sentenceSegmenter, wordSegmenter } = ctx; + const length = end - start; + const minSide = idealLength * MIN_SIDE_RATIO; + + // Below the split threshold — keep as-is + if (length <= idealLength * THRESHOLD_SPLIT) { + return [{ start, end }]; + } + + const segText = fullText.slice(start, end); + const mid = Math.floor(segText.length / 2); + + // --- L1: sentence boundaries --- + // Locale-aware when a sentence segmenter is available; regex otherwise. + const l1Points = sentenceSegmenter + ? findSentenceSplitPointsIntl(segText, sentenceSegmenter) + : findSplitPoints(segText, L1_RE); + const l1Pick = pickBestPoint(l1Points, mid, segText.length, minSide); + if (l1Pick !== null) { + const g = start + l1Pick; + return [ + ...splitRecursive(fullText, start, g, ctx), + ...splitRecursive(fullText, g, end, ctx), + ]; + } + + // --- L2: sub-sentence boundaries (only if long enough) --- + // No Intl equivalent for clause-level boundaries — regex always. + if (length > idealLength * THRESHOLD_L2) { + const l2Points = findSplitPoints(segText, L2_RE); + const l2Pick = pickBestPoint(l2Points, mid, segText.length, minSide); + if (l2Pick !== null) { + const g = start + l2Pick; + return [ + ...splitRecursive(fullText, start, g, ctx), + ...splitRecursive(fullText, g, end, ctx), + ]; + } + } + + // --- L3: word boundaries (only if very long) --- + // Locale-aware when a word segmenter is available — essential for + // scripts without space-separated words. Falls back to whitespace regex. + if (length > idealLength * THRESHOLD_L3) { + const l3Points = wordSegmenter + ? findWordSplitPointsIntl(segText, wordSegmenter) + : findSplitPoints(segText, L3_RE); + const l3Pick = pickBestPoint(l3Points, mid, segText.length, minSide); + if (l3Pick !== null) { + const g = start + l3Pick; + return [ + ...splitRecursive(fullText, start, g, ctx), + ...splitRecursive(fullText, g, end, ctx), + ]; + } + } + + // Cannot split further + return [{ start, end }]; +} + +// --------------------------------------------------------------------------- +// Public API +// --------------------------------------------------------------------------- + +/** + * Split `text` into character ranges, each roughly `idealLength` characters. + * + * Returns a single-element array when no split is needed. + * Set `idealLength` to 0 to disable splitting entirely. + * + * @param locale Optional BCP-47 tag (e.g. "th", "ja", "fa-IR"). When supplied + * and supported by the runtime's `Intl.Segmenter`, L1 (sentence) and L3 + * (word) boundaries are derived from the segmenter instead of regex. + * Unknown or unsupported tags transparently fall back to regex. + */ +export function splitTextIntoRanges( + text: string, + idealLength: number = DEFAULT_IDEAL_CELL_LENGTH, + locale?: string +): TextRange[] { + if (!text || idealLength <= 0 || text.length <= idealLength * THRESHOLD_SPLIT) { + return [{ start: 0, end: text.length }]; + } + const ctx: SplitContext = { + idealLength, + sentenceSegmenter: tryCreateSegmenter(locale, 'sentence'), + wordSegmenter: tryCreateSegmenter(locale, 'word'), + }; + return splitRecursive(text, 0, text.length, ctx); +} + +/** + * Convenience wrapper that returns the actual substrings. + */ +export function splitText( + text: string, + idealLength: number = DEFAULT_IDEAL_CELL_LENGTH, + locale?: string +): string[] { + return splitTextIntoRanges(text, idealLength, locale) + .map(r => text.slice(r.start, r.end)); +}