Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions src/providers/NewSourceUploader/NewSourceUploaderProvider.ts
Original file line number Diff line number Diff line change
Expand Up @@ -129,10 +129,13 @@ export class NewSourceUploaderProvider implements vscode.CustomTextEditorProvide
const intentMatch = uriQuery.match(/intent=(source|target)/);
const initialIntent = intentMatch ? intentMatch[1] : undefined;

const sourceLanguageTag = await this.getSourceLanguageTag();

webviewPanel.webview.postMessage({
command: "projectInventory",
inventory: inventory,
initialIntent,
sourceLanguageTag,
});
} else if (message.command === "metadata.check") {
// Handle metadata check request
Expand Down Expand Up @@ -2000,6 +2003,22 @@ export class NewSourceUploaderProvider implements vscode.CustomTextEditorProvide
return confirmOverwriteWithDetails(items);
}

/**
 * Looks up the BCP-47-style tag of the project's source language.
 *
 * Reads `metadata.json` from the root of the first workspace folder and
 * returns the `tag` of the first entry in `languages` whose
 * `projectStatus` is `"source"`.
 *
 * This is a best-effort lookup: it never throws.
 *
 * @returns The source language tag, or `undefined` when no workspace folder
 * is open, the file cannot be read or parsed, `languages` is not an array,
 * no entry is marked as source, or the entry's `tag` is not a string.
 */
private async getSourceLanguageTag(): Promise<string | undefined> {
    const workspaceFolders = vscode.workspace.workspaceFolders;
    if (!workspaceFolders?.length) return undefined;

    try {
        const metadataUri = vscode.Uri.joinPath(workspaceFolders[0].uri, "metadata.json");
        const raw = await vscode.workspace.fs.readFile(metadataUri);

        // readFile resolves to a Uint8Array. Calling toString() on a plain
        // Uint8Array yields "123,34,..." rather than the file text, so decode
        // the bytes explicitly. TextDecoder also drops a leading UTF-8 BOM,
        // which JSON.parse would otherwise reject.
        const metadata: unknown = JSON.parse(new TextDecoder("utf-8").decode(raw));

        const languages = (metadata as { languages?: unknown } | null)?.languages;
        if (!Array.isArray(languages)) return undefined;

        const sourceLang = languages.find(
            (l: { projectStatus?: unknown } | null | undefined) => l?.projectStatus === "source"
        );

        // Only hand a real string to the webview; anything else counts as "unknown".
        const tag: unknown = sourceLang?.tag;
        return typeof tag === "string" ? tag : undefined;
    } catch {
        // Missing file, unreadable file, or malformed JSON: the tag is optional,
        // so fall back to "unknown" instead of failing the inventory response.
        return undefined;
    }
}

private async fetchProjectInventory(): Promise<{
sourceFiles: Array<{
name: string;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,7 @@ const NewSourceUploader: React.FC = () => {
targetFiles: [],
translationPairs: [],
};
const sourceLanguageTag: string | undefined = message.sourceLanguageTag;

const initialIntent: ImportIntent | undefined = message.initialIntent;

Expand All @@ -300,6 +301,7 @@ const NewSourceUploader: React.FC = () => {
const base = {
...prev,
projectInventory: inventory,
sourceLanguageTag: sourceLanguageTag ?? prev.sourceLanguageTag,
isLoadingInventory: false,
};

Expand Down Expand Up @@ -628,6 +630,7 @@ const NewSourceUploader: React.FC = () => {
selectedSource: wizardState.selectedSourceForTarget,
selectedSourceDetails: wizardState.selectedSourceDetails,
projectInventory: wizardState.projectInventory,
sourceLanguageTag: wizardState.sourceLanguageTag,
};

// For target imports, we need detailed source info and should use translation completion
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import {
Eye,
BarChart3,
AlertCircle,
ChevronDown,
} from "lucide-react";
import { ImporterComponentProps, CellAligner, AlignedCell } from "../types/plugin";
import { NotebookPair, ImportProgress } from "../types/common";
Expand Down Expand Up @@ -82,6 +83,13 @@ export interface UnifiedImporterFormProps {
* above the import button (e.g. timestamp corruption warnings).
*/
analyzeWarnings?: (files: File[]) => Promise<string[]>;

/**
* Optional custom controls rendered in a collapsible "Advanced Settings"
* section below the file-selection card. Importer-specific knobs
* (e.g. ideal cell length for DOCX splitting) can live here.
*/
advancedSettings?: React.ReactNode;
}

export const UnifiedImporterForm: React.FC<UnifiedImporterFormProps> = ({
Expand All @@ -99,7 +107,9 @@ export const UnifiedImporterForm: React.FC<UnifiedImporterFormProps> = ({
onSourceImportComplete,
showEnforceStructure = false,
analyzeWarnings,
advancedSettings,
}) => {
const [showAdvanced, setShowAdvanced] = useState(false);
const [files, setFiles] = useState<File[]>([]);
const [enforceStructure, setEnforceStructure] = useState(showEnforceStructure);
const [previewContent, setPreviewContent] = useState<string>("");
Expand Down Expand Up @@ -428,6 +438,27 @@ export const UnifiedImporterForm: React.FC<UnifiedImporterFormProps> = ({
</CardContent>
</Card>

{/* Advanced Settings (optional, importer-specific) */}
{advancedSettings && (
<div className="border border-gray-300 rounded-md">
<button
type="button"
onClick={() => setShowAdvanced((v) => !v)}
className="flex w-full items-center justify-between px-3 py-2 text-sm font-medium text-gray-500 hover:text-gray-800 transition-colors"
>
<span>Advanced Settings</span>
<ChevronDown
className={`h-4 w-4 transition-transform ${showAdvanced ? "rotate-180" : ""}`}
/>
</button>
{showAdvanced && (
<div className="px-3 pb-3 pt-1 space-y-2">
{advancedSettings}
</div>
)}
</div>
)}

{/* Enforce HTML Structure Checkbox */}
{showEnforceStructure && hasFiles && (
<EnforceStructureCheckbox
Expand Down
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
import React, { useCallback } from "react";
import React, { useCallback, useState } from "react";
import { FileText } from "lucide-react";
import { UnifiedImporterForm, type FileAnalysisStat } from "../../components/UnifiedImporterForm";
import { type ImporterComponentProps, sequentialCellAligner } from "../../types/plugin";
import type { NotebookPair, ImportProgress } from "../../types/common";
import { validateFile, parseFile } from "./index";
import { DEFAULT_IDEAL_CELL_LENGTH } from "../../utils/textSplitter";

export const DocxImporterForm: React.FC<ImporterComponentProps> = (props) => {
const [idealCellLength, setIdealCellLength] = useState<number>(DEFAULT_IDEAL_CELL_LENGTH);
const locale = props.wizardContext?.sourceLanguageTag;

const analyzeFiles = useCallback(async (files: File[]): Promise<FileAnalysisStat[]> => {
const totalBytes = files.reduce((sum, f) => sum + f.size, 0);
return [
Expand Down Expand Up @@ -38,7 +42,7 @@ export const DocxImporterForm: React.FC<ImporterComponentProps> = (props) => {
throw new Error(`${file.name}: ${validation.errors.join(", ")}`);
}

const importResult = await parseFile(file, onProgress);
const importResult = await parseFile(file, onProgress, { idealCellLength, locale });
if (!importResult.success || !importResult.notebookPair) {
throw new Error(importResult.error || `Failed to parse ${file.name}`);
}
Expand All @@ -48,7 +52,36 @@ export const DocxImporterForm: React.FC<ImporterComponentProps> = (props) => {

return results.length === 1 ? results[0]! : results;
},
[]
[idealCellLength, locale]
);

const advancedSettings = (
<>
<div className="flex items-center gap-3">
<label
htmlFor="ideal-cell-length"
className="text-sm font-medium whitespace-nowrap"
>
Ideal cell length (in characters)
</label>
<input
id="ideal-cell-length"
type="number"
min={0}
step={10}
value={idealCellLength}
onChange={(e) => {
const v = parseInt(e.target.value, 10);
if (!isNaN(v) && v >= 0) setIdealCellLength(v);
}}
className="w-24 rounded-md border border-input bg-background px-3 py-1 text-sm shadow-sm focus-visible:outline-none focus-visible:ring-1 focus-visible:ring-ring disabled:opacity-50"
/>
</div>
<p className="text-xs text-gray-500">
Long paragraphs are split into smaller cells at sentence boundaries. Set to 0 to
disable splitting.
</p>
</>
);

return (
Expand All @@ -65,6 +98,7 @@ export const DocxImporterForm: React.FC<ImporterComponentProps> = (props) => {
cellAligner={sequentialCellAligner}
showPreview={false}
showEnforceStructure
advancedSettings={advancedSettings}
/>
);
};
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,24 @@ export interface DocxCellMetadataParams {
paragraph: DocxParagraph;
docxDoc: DocxDocument;
fileName: string;
/**
* When a paragraph is split into multiple cells, this is the 0-based index
* of this cell within that paragraph. Undefined for unsplit paragraphs.
*/
segmentIndex?: number;
/**
* Total number of cells this paragraph was split into.
* Undefined for unsplit paragraphs.
*/
segmentCount?: number;
}

/**
* Creates metadata for a DOCX paragraph cell
* Generates a UUID for the cell ID
*/
export function createDocxCellMetadata(params: DocxCellMetadataParams): { metadata: any; cellId: string; } {
const { paragraphId, paragraphIndex, originalContent, paragraph, docxDoc, fileName } = params;
const { paragraphId, paragraphIndex, originalContent, segmentIndex, segmentCount } = params;

// Generate UUID for cell ID
const cellId = uuidv4();
Expand All @@ -39,6 +49,10 @@ export function createDocxCellMetadata(params: DocxCellMetadataParams): { metada
*
* To keep `.source`/`.codex` small, we only persist what we need to map a Codex cell
* back to a paragraph in `word/document.xml`.
*
* For split paragraphs, segmentIndex/segmentCount allow the exporter to
* recombine the translated segments in order before writing them back to
* the original <w:p>.
*/
const cellMetadata = {
id: cellId,
Expand All @@ -47,6 +61,10 @@ export function createDocxCellMetadata(params: DocxCellMetadataParams): { metada
paragraphId,
paragraphIndex,

// Present only when the paragraph was split into multiple cells
...(segmentIndex !== undefined && { segmentIndex }),
...(segmentCount !== undefined && { segmentCount }),

// Data object for consistency with other importers
data: {
originalText: originalContent,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -100,59 +100,82 @@ export async function exportDocxWithTranslations(
}

/**
 * Collect translations from Codex cells.
 *
 * Handles three cell shapes:
 *   1. Table cells  – one Codex cell maps to multiple DOCX paragraphs (paragraphIndices[]).
 *   2. Split cells  – one DOCX paragraph was split into N Codex cells (segmentIndex present).
 *                     The per-segment translations are joined in order before writing to the <w:p>.
 *   3. Normal cells – one Codex cell ↔ one DOCX paragraph (paragraphIndex only).
 *
 * Cells whose value is empty after `removeHtmlTags` + trim are skipped, as are
 * cells that carry neither a numeric `paragraphIndex` nor a `p-<n>` `paragraphId`.
 *
 * @param codexCells Cells read from the Codex notebook; only `value` and `metadata` are used.
 * @returns Map from DOCX paragraph index to the text to write into that paragraph.
 */
function collectTranslations(
    codexCells: Array<{ kind: number; value: string; metadata: any; }>
): Map<number, string> {
    console.log(`[Exporter] Processing ${codexCells.length} cells for translations`);

    // Accumulate per-paragraph segments: paragraphIndex → list of {segmentIndex, text}
    const segmentsByParagraph = new Map<number, Array<{ segmentIndex: number; text: string }>>();
    // Table cells bypass the segment system entirely
    const tableTranslations = new Map<number, string>();

    for (const cell of codexCells) {
        const meta = cell.metadata;

        // Get translated content (strip HTML tags)
        const translated = removeHtmlTags(cell.value).trim();
        if (!translated) continue;

        // Get paragraph identifier
        const paragraphId = meta?.paragraphId;
        const paragraphIndex = meta?.paragraphIndex;
        const paragraphIndices = meta?.paragraphIndices;

        if (Array.isArray(paragraphIndices) && paragraphIndices.length > 0) {
            // Table-cell case: map lines of translation to each paragraph index.
            // Paragraphs beyond the number of translated lines receive ''.
            const parts = translated.split(/\r?\n/);
            for (let j = 0; j < paragraphIndices.length; j++) {
                const idx = paragraphIndices[j];
                if (typeof idx !== 'number') continue;
                tableTranslations.set(idx, parts[j] ?? '');
            }
            continue;
        }

        // Resolve the paragraph index (numeric or from paragraphId string)
        let paraIdx: number;
        if (typeof paragraphIndex === 'number') {
            paraIdx = paragraphIndex;
        } else if (typeof paragraphId === 'string') {
            const m = paragraphId.match(/^p-(\d+)$/);
            if (!m) {
                console.warn(`[Exporter] ⚠ Unrecognized paragraphId format: ${paragraphId}`);
                continue;
            }
            paraIdx = Number(m[1]);
        } else {
            // No paragraph mapping at all: not a cell this function can place.
            continue;
        }

        // Unsplit paragraphs have no segmentIndex; treat them as the sole segment (index 0).
        // A non-numeric or non-finite value is also treated as 0 so the sort
        // comparator below never produces NaN.
        const segmentIndex: number = Number.isFinite(meta?.segmentIndex) ? meta.segmentIndex : 0;

        let segments = segmentsByParagraph.get(paraIdx);
        if (!segments) {
            segments = [];
            segmentsByParagraph.set(paraIdx, segments);
        }
        segments.push({ segmentIndex, text: translated });
    }

    // Build the final map: for split paragraphs, join segments in order.
    // Table entries are copied first, so a segment entry for the same
    // paragraph index overwrites the table entry.
    const translations = new Map<number, string>(tableTranslations);

    for (const [paraIdx, segments] of segmentsByParagraph) {
        segments.sort((a, b) => a.segmentIndex - b.segmentIndex);
        const combined = segments.map(s => s.text).join(' ');
        translations.set(paraIdx, combined);
    }

    console.log(`[Exporter] Collected ${translations.size} translations total`);
    return translations;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ export class DocxParser {
extractTables: false, // TODO: Implement table support
segmentationStrategy: 'paragraph',
validateStructure: true,
idealCellLength: 160,
...config,
};

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,13 @@ export interface DocxParseConfig {
extractTables: boolean;
segmentationStrategy: 'paragraph' | 'sentence' | 'run';
validateStructure: boolean;
/**
* Ideal cell length in characters.
* Paragraphs longer than ~N*1.1 are recursively split at sentence
* boundaries (L1), then sub-sentence stops (L2), then whitespace (L3).
* Defaults to 160. Set to 0 to disable splitting.
*/
idealCellLength: number;
}

// Error types
Expand Down
Loading
Loading