#!/usr/bin/env python3
"""
Translation sync script for ComfyUI docs.

Usage:
    python translate-sync.py --before <before_sha> --after <after_sha> [--dry-run]

For each changed English .mdx file, checks which language translations are
out of sync and translates only the changed sections using DeepSeek API.

Section matching strategy:
    Uses unified diff hunk headers (@@ -a,b +c,d @@) to get the exact line
    ranges that changed in the new English file. These are mapped to section
    indices by comparing line numbers against each section's start/end lines.
    This is language-agnostic and correctly handles inserted, deleted, or
    reordered sections.

Robustness:
    - Per-section error isolation: if a section fails, keeps original translation
      and logs [FALLBACK]. Other sections are unaffected.
    - API retry with exponential backoff (3 attempts).
    - MDX tag count validation: if translated output has mismatched MDX tags,
      falls back to original translation.
    - Frontmatter integrity check: validates --- delimiters after translation.
    - File-level error isolation: one file failing doesn't stop others.
    - Empty response guard: keeps original if API returns empty content.
    - Fallback to full-file translation if diff parsing yields no changed sections
      but the file is confirmed changed.
"""

import argparse
import json
import os
import re
import subprocess
import sys
import time
from pathlib import Path
from typing import Optional

try:
    from openai import OpenAI
except ImportError:
    print("Error: openai package required. Run: pip install openai")
    sys.exit(1)

SCRIPT_DIR = Path(__file__).parent
REPO_ROOT = SCRIPT_DIR.parent.parent
CONFIG_PATH = SCRIPT_DIR / "translation-config.json"

# MDX component tags to validate (opening tags)
MDX_TAG_PATTERN = re.compile(r'<(Card|Accordion|AccordionGroup|Note|Warning|Tip|Steps|Step|Tab|Tabs|Columns|video|CardGroup|Frame|ResponseField|ParamField)\b')


def load_config() -> dict:
    """Load the translation settings stored next to this script.

    Returns:
        The parsed JSON document found at ``CONFIG_PATH``.

    Raises:
        OSError: if the config file cannot be opened.
        json.JSONDecodeError: if the file does not contain valid JSON.
    """
    # Decode as UTF-8 explicitly instead of relying on the platform's
    # default text encoding, which is not UTF-8 everywhere.
    with open(CONFIG_PATH, encoding="utf-8") as f:
        return json.load(f)


def _run_git_diff(*args: str) -> str:
    """Run ``git diff <args>`` from the repository root and return its stdout.

    A failing git invocation (unknown SHA, not a repository, ...) is reported
    on stderr instead of being silently indistinguishable from "no changes".
    The (usually empty) stdout is still returned so callers keep their
    best-effort behavior.
    """
    result = subprocess.run(
        # core.quotepath=false keeps non-ASCII path names readable; with the
        # default setting git emits them C-quoted ("\346\226\207.mdx"), which
        # would never match a suffix or prefix check.
        ["git", "-c", "core.quotepath=false", "diff", *args],
        capture_output=True,
        text=True,
        encoding="utf-8",
        errors="replace",
        cwd=REPO_ROOT,
    )
    if result.returncode != 0:
        print(
            f"[WARN] git diff {' '.join(args)} failed "
            f"(exit {result.returncode}): {result.stderr.strip()}",
            file=sys.stderr,
        )
    return result.stdout


def get_changed_files(before_sha: str, after_sha: str, exclude_dirs: list[str]) -> list[str]:
    """Get English .mdx files changed between two commits.

    Args:
        before_sha: Commit to diff from.
        after_sha: Commit to diff to.
        exclude_dirs: Top-level directory names whose files are skipped; a
            path is excluded when it starts with ``<dir>/``.

    Returns:
        Repository-relative paths ending in ``.mdx`` that git reports with
        status A, C, M, R or T, in the order git lists them.
    """
    output = _run_git_diff("--name-only", "--diff-filter=ACMRT", before_sha, after_sha)
    # Tolerate configured directories written with a trailing slash.
    excluded_prefixes = tuple(d.rstrip("/") + "/" for d in exclude_dirs)
    return [
        path
        for path in output.splitlines()
        if path.endswith(".mdx") and not path.startswith(excluded_prefixes)
    ]


def get_files_changed_in_range(before_sha: str, after_sha: str) -> set[str]:
    """Get all files (any language) changed in the commit range."""
    output = _run_git_diff("--name-only", before_sha, after_sha)
    return {path for path in output.splitlines() if path}


def get_file_diff(before_sha: str, after_sha: str, filepath: str) -> str:
    """Get the unified diff of a file between two commits.

    Returns an empty string when git produces no diff for ``filepath``.
    """
    return _run_git_diff(before_sha, after_sha, "--", filepath)
# A level-2 or level-3 ATX heading starting in column 0.
_HEADING_RE = re.compile(r'^(#{2,3})\s+(.+)')
# A code-fence line: optional indentation, 3+ backticks or tildes, then the
# info string (group 2).
_FENCE_RE = re.compile(r'^\s*(`{3,}|~{3,})(.*)$')


def parse_sections(content: str) -> dict:
    """
    Parse MDX content into frontmatter + sections split by ## / ### headings.

    Lines inside fenced code blocks (``` or ~~~) are never treated as
    headings, so a shell comment or markdown sample such as ``## foo`` inside
    a code block stays within its enclosing section instead of splitting the
    code block in two.

    Frontmatter is recognised only when the very first line is ``---``; it
    runs up to and including the next ``---`` line. If no closing delimiter
    exists, the whole document is reported as frontmatter and no sections
    are returned.

    Returns:
        {
            "frontmatter": str,
            "frontmatter_lines": int,
            "sections": [
                {
                    "heading": str | None,   # None for text before the first heading
                    "level": int,            # 2 or 3; 0 for the leading section
                    "content": str,          # heading line included
                    "start_line": int,       # 1-based, inclusive
                    "end_line": int,         # 1-based, inclusive
                }
            ]
        }
    """
    lines = content.splitlines(keepends=True)

    # Locate the end of the frontmatter block, if the document opens with one.
    fm_line_count = 0
    if lines and lines[0].strip() == "---":
        fm_line_count = len(lines)  # unterminated: everything is frontmatter
        for idx, line in enumerate(lines[1:], start=1):
            if line.strip() == "---":
                fm_line_count = idx + 1
                break

    sections = []
    heading = None
    level = 0
    buffer = []
    start_line = fm_line_count + 1  # 1-based
    open_fence = None  # (fence character, fence length) while inside a code block

    for lineno, line in enumerate(lines[fm_line_count:], start=fm_line_count + 1):
        fence = _FENCE_RE.match(line)

        if open_fence is not None:
            # Inside a code block: only a bare fence of the same character,
            # at least as long as the opener, closes it.
            if fence and not fence.group(2).strip():
                marker = fence.group(1)
                if marker[0] == open_fence[0] and len(marker) >= open_fence[1]:
                    open_fence = None
            buffer.append(line)
            continue

        # A backtick run followed by more backticks on the same line is
        # inline code, not the start of a fenced block.
        if fence and not (fence.group(1)[0] == "`" and "`" in fence.group(2)):
            open_fence = (fence.group(1)[0], len(fence.group(1)))
            buffer.append(line)
            continue

        heading_match = _HEADING_RE.match(line)
        if heading_match is None:
            buffer.append(line)
            continue

        # A new heading closes the section collected so far.
        if buffer:
            sections.append({
                "heading": heading,
                "level": level,
                "content": "".join(buffer),
                "start_line": start_line,
                "end_line": lineno - 1,
            })
        heading = heading_match.group(2).strip()
        level = len(heading_match.group(1))
        buffer = [line]
        start_line = lineno

    if buffer:
        sections.append({
            "heading": heading,
            "level": level,
            "content": "".join(buffer),
            "start_line": start_line,
            "end_line": len(lines),
        })

    return {
        "frontmatter": "".join(lines[:fm_line_count]),
        "frontmatter_lines": fm_line_count,
        "sections": sections,
    }
# Unified-diff hunk header: "@@ -a[,b] +c[,d] @@"; captures c and d.
_HUNK_HEADER_RE = re.compile(r'^@@ -\d+(?:,\d+)? \+(\d+)(?:,(\d+))? @@')


def get_changed_line_ranges(diff: str) -> list[tuple[int, int]]:
    """
    Parse unified diff hunk headers to extract changed line ranges in the NEW file.

    Returns list of (start_line, end_line) — 1-based, inclusive.
    Pure deletions (count=0) are excluded since they add no new lines.

    A hunk's range covers every new-file line of the hunk, so any context
    lines carried by the diff are part of the reported range.
    """
    ranges = []
    for line in diff.splitlines():
        header = _HUNK_HEADER_RE.match(line)
        if header is None:
            continue
        start = int(header.group(1))
        # An omitted count in a hunk header means exactly one line.
        count = 1 if header.group(2) is None else int(header.group(2))
        if count > 0:
            ranges.append((start, start + count - 1))
    return ranges


def get_changed_section_indices(diff: str, sections: list[dict]) -> set[int]:
    """
    Map changed line ranges (from diff hunk headers) onto section indices.

    A section is marked as changed if any of its lines overlap with a
    changed range. Only the "start_line" and "end_line" keys of each section
    are read, so the mapping does not depend on the section text or language.

    Returns:
        The set of indices into ``sections`` that overlap a changed range;
        empty when the diff contains no usable hunk header.
    """
    changed_ranges = get_changed_line_ranges(diff)
    if not changed_ranges:
        return set()

    return {
        index
        for index, section in enumerate(sections)
        if any(
            r_start <= section["end_line"] and r_end >= section["start_line"]
            for r_start, r_end in changed_ranges
        )
    }


def count_mdx_tags(content: str) -> dict[str, int]:
    """Count occurrences of known MDX component tags.

    Only tags matched by ``MDX_TAG_PATTERN`` are counted; tags that do not
    occur are absent from the result rather than mapped to 0.
    """
    counts: dict[str, int] = {}
    for match in MDX_TAG_PATTERN.finditer(content):
        tag = match.group(1)
        counts[tag] = counts.get(tag, 0) + 1
    return counts


def validate_mdx_tags(original: str, translated: str) -> bool:
    """
    Return True if translated content has the same MDX tag counts as original.

    The comparison is symmetric: a tag that is missing from the translation,
    appears a different number of times, or appears only in the translation
    (i.e. was invented by the translator) all make the check fail.
    """
    # count_mdx_tags never stores zero counts, so plain dict equality
    # compares exactly the set of tags that occur and how often.
    return count_mdx_tags(original) == count_mdx_tags(translated)


def validate_frontmatter(text: str) -> bool:
    """Return True if text starts and ends with --- delimiters.

    Surrounding whitespace is ignored, and at least two lines are required
    so that a lone ``---`` does not count as both delimiters.
    """
    lines = text.strip().splitlines()
    if len(lines) < 2:
        return False
    return lines[0].strip() == "---" and lines[-1].strip() == "---"


def call_api_with_retry(fn, max_retries: int = 3, initial_delay: float = 2.0):
    """
    Call fn() with exponential backoff on failure.

    Args:
        fn: Zero-argument callable to invoke.
        max_retries: Total number of attempts (not additional retries);
            must be at least 1.
        initial_delay: Seconds to wait before the second attempt; the wait
            doubles after every further failure.

    Returns:
        Whatever ``fn()`` returns on its first successful call.

    Raises:
        ValueError: if ``max_retries`` is less than 1.
        Exception: the last exception raised by ``fn`` if all attempts fail.
    """
    if max_retries < 1:
        # Without this guard the loop never runs and the function would end
        # in `raise None`, i.e. an unrelated TypeError.
        raise ValueError(f"max_retries must be at least 1, got {max_retries}")

    delay = initial_delay
    last_exc = None
    for attempt in range(max_retries):
        try:
            return fn()
        except Exception as e:  # retry boundary: any failure of fn is retryable
            last_exc = e
            if attempt < max_retries - 1:
                print(f" [RETRY {attempt + 1}/{max_retries}] API error: {e}. Retrying in {delay:.0f}s...")
                time.sleep(delay)
                delay *= 2
    raise last_exc