diff --git a/.gitignore b/.gitignore index 0f40381..c2868b4 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,8 @@ node_modules/ .DS_Store .env .env.local +.cursor + apps/frontend/dist/ apps/frontend/.vite/ apps/backend/.cache/ diff --git a/apps/backend/package.json b/apps/backend/package.json index dc37e85..5a863cc 100644 --- a/apps/backend/package.json +++ b/apps/backend/package.json @@ -20,6 +20,8 @@ "langchain": "~0.3.37", "lib0": "^0.2.95", "localtunnel": "^2.0.2", + "pdf-lib": "^1.17.1", + "sharp": "^0.34.5", "tar": "^6.2.1", "unzipper": "^0.10.14", "y-protocols": "^1.0.6", diff --git a/apps/backend/src/config/constants.js b/apps/backend/src/config/constants.js index 8ae3268..7fb7bbd 100644 --- a/apps/backend/src/config/constants.js +++ b/apps/backend/src/config/constants.js @@ -5,6 +5,9 @@ const __dirname = path.dirname(fileURLToPath(import.meta.url)); export const REPO_ROOT = path.resolve(__dirname, '..', '..', '..', '..'); export const DATA_DIR = process.env.OPENPRISM_DATA_DIR || path.join(REPO_ROOT, 'data'); +const TRANSFER_AGENT_DIR = path.join(REPO_ROOT, 'apps', 'backend', 'src', 'services', 'transferAgent'); +/** Venue handbook markdown for transfer agent (neurips.md, acl.md, …) */ +export const RULES_DIR = path.join(TRANSFER_AGENT_DIR, 'rules'); export const TEMPLATE_DIR = path.join(REPO_ROOT, 'templates'); export const TEMPLATE_MANIFEST = path.join(TEMPLATE_DIR, 'manifest.json'); export const PORT = Number(process.env.PORT || 8787); diff --git a/apps/backend/src/index.js b/apps/backend/src/index.js index 2a86173..a1245ff 100644 --- a/apps/backend/src/index.js +++ b/apps/backend/src/index.js @@ -24,6 +24,7 @@ const fastify = Fastify({ logger: true }); await fastify.register(cors, { origin: true }); await fastify.register(multipart, { + preservePath: true, limits: { fileSize: 200 * 1024 * 1024 } diff --git a/apps/backend/src/routes/projects.js b/apps/backend/src/routes/projects.js index 60f39fe..1b179c7 100644 --- a/apps/backend/src/routes/projects.js +++ b/apps/backend/src/routes/projects.js @@ -321,7 +321,13 @@ export function registerProjectRoutes(fastify) { const parts = req.parts(); for await (const part of parts) { if (part.type !== 'file') continue; - const relPath = sanitizeUploadPath(part.filename); + // With preservePath (busboy), filename keeps relative dirs (e.g. figs/a.png). + // Fallback for odd clients: basename-only still works. 
+ const rawRel = + (typeof part.filename === 'string' && part.filename.trim()) || + (typeof part.filepath === 'string' && part.filepath.trim()) || + ''; + const relPath = sanitizeUploadPath(rawRel); if (!relPath) continue; const abs = safeJoin(projectRoot, relPath); await ensureDir(path.dirname(abs)); diff --git a/apps/backend/src/routes/transfer.js b/apps/backend/src/routes/transfer.js index afb467c..e2d847b 100644 --- a/apps/backend/src/routes/transfer.js +++ b/apps/backend/src/routes/transfer.js @@ -2,17 +2,97 @@ import crypto from 'crypto'; import path from 'path'; import { promises as fs } from 'fs'; import { buildTransferGraph } from '../services/transferAgent/graph.js'; +import { buildNeuripsLatexGraph } from '../services/transferAgent/graphNeurips.js'; +import { buildVenueAgentGraph } from '../services/transferAgent/graphVenueAgent.js'; import { buildMineruTransferGraph } from '../services/transferAgent/graphMineru.js'; +import { buildMineruAgentGraph } from '../services/transferAgent/graphMineruAgent.js'; +import { buildRuleBaseTransferGraph } from '../services/transferAgent/graphRuleBaseTransfer.js'; import { resolveLLMConfig } from '../services/llmService.js'; import { resolveMineruConfig } from '../services/mineruService.js'; import { readTemplateManifest } from '../services/templateService.js'; import { DATA_DIR, TEMPLATE_DIR } from '../config/constants.js'; import { ensureDir, readJson, writeJson, copyDir } from '../utils/fsUtils.js'; +import { + transferDebugLog, + transferDebugProgressDelta, + transferDebugEntriesDelta, + announceTransferDebugOnce, +} from '../services/transferAgent/transferDebugLog.js'; +import { pushToolTraceRecent } from '../services/transferAgent/toolTrace.js'; +import { TransferNodeError } from '../services/transferAgent/transferNodeError.js'; // In-memory job store: jobId → { graph, state, status, progressLog } const jobs = new Map(); +const INVOKE_OPTS = { recursionLimit: 120 }; + +function isGraphInterruptErr(e) { + return e !== undefined && ['GraphInterrupt', 'NodeInterrupt'].includes(e?.name); +} + +function logTransferStepResult(jobId, job, st) { + if (!st || typeof st !== 'object') return; + announceTransferDebugOnce(); + transferDebugProgressDelta(jobId, job, st.progressLog); + transferDebugEntriesDelta(jobId, job, st.progressLogEntries); + transferDebugLog(jobId, 'log', 'step snapshot', { + status: st.status, + lastCompletedNode: st.lastCompletedNode, + currentPhase: st.currentPhase, + transferGraphKind: st.transferGraphKind, + completedNodesLen: Array.isArray(st.completedNodes) ? st.completedNodes.length : 0, + pendingQA: Array.isArray(st.pendingQA) ? st.pendingQA.length : st.pendingQA ? 
1 : 0, + compileOk: st.compileResult?.ok, + compileExit: st.compileResult?.status, + verifyBuildOk: st.verifyBuildResult?.ok, + verifyPattern: st.verifyBuildResult?.pattern, + layoutCheckOk: st.layoutCheckResult?.ok, + }); + if (st.compileResult && !st.compileResult.ok && st.compileResult.log) { + transferDebugLog( + jobId, + 'error', + 'compile failed — log tail', + String(st.compileResult.log).slice(-12000), + ); + } + if (st.verifyBuildResult && !st.verifyBuildResult.ok) { + transferDebugLog(jobId, 'warn', 'verifyBuild failed', st.verifyBuildResult); + if (st.compileResult?.log) { + transferDebugLog( + jobId, + 'warn', + 'compile log tail (for verify debug)', + String(st.compileResult.log).slice(-8000), + ); + } + } +} + +function buildTransferApiPayload(job, state) { + const st = state || job.state || {}; + const log = st.progressLog || job.progressLog || []; + return { + status: st.status || job.status || 'running', + progressLog: Array.isArray(log) ? log : [], + progressLogEntries: st.progressLogEntries || [], + currentNode: st.lastCompletedNode || '', + phase: st.currentPhase || '', + agentPhase: st.agentPhase || null, + currentIteration: st.currentIteration ?? null, + interruptedBeforeNode: st.interruptedBeforeNode || '', + completedNodes: st.completedNodes || [], + pendingQA: st.pendingQA ?? null, + error: job.error || st.error || null, + bundleNotes: st.bundleNotes || null, + transferGraphKind: st.transferGraphKind || job.state?.transferGraphKind || 'legacy', + liveProgress: job.liveProgress || null, + toolTraceRecent: job.toolTraceRecent || [], + }; +} + export function registerTransferRoutes(fastify) { + console.log('[transfer] Routes registered — venue agent graph (neurips/icml/cvpr/acl): ENABLED'); /** * POST /api/transfer/start @@ -27,7 +107,13 @@ export function registerTransferRoutes(fastify) { targetTemplateId, targetMainFile, engine = 'pdflatex', layoutCheck = false, + enableSensitiveMask = false, + useAgent = false, llmConfig, + venue, + doubleBlind, + preprint, + outputNotes, } = request.body || {}; if (!sourceProjectId || !sourceMainFile || !targetTemplateId || !targetMainFile) { @@ -41,7 +127,7 @@ export function registerTransferRoutes(fastify) { return reply.code(400).send({ error: `Unknown template: ${targetTemplateId}` }); } - // Create a new project from the template + // Create a new project container await ensureDir(DATA_DIR); const newProjectId = crypto.randomUUID(); const projectRoot = path.join(DATA_DIR, newProjectId); @@ -61,23 +147,49 @@ export function registerTransferRoutes(fastify) { }; await writeJson(path.join(projectRoot, 'project.json'), meta); - // Copy template files into the new project - const templateRoot = path.join(TEMPLATE_DIR, targetTemplateId); - await copyDir(templateRoot, projectRoot); + // Build graph + decide whether to pre-copy the template. + // useAgent=true : keep existing behavior (pre-copy template, run LLM graph). + // useAgent=false: run the rule-based transfer pipeline which re- + // populates `projectRoot` from the source and overlays + // the target template itself, so we MUST NOT pre-copy. + const AGENT_ENABLED_VENUES = ['neurips', 'icml', 'cvpr', 'acl']; + let graph; + let transferGraphKind; + if (useAgent) { + const templateRoot = path.join(TEMPLATE_DIR, targetTemplateId); + await copyDir(templateRoot, projectRoot); + const useAgentGraph = AGENT_ENABLED_VENUES.includes(targetTemplateId); + graph = useAgentGraph ? buildVenueAgentGraph() : buildTransferGraph(); + transferGraphKind = useAgentGraph ? 
targetTemplateId : 'legacy'; + } else { + graph = buildRuleBaseTransferGraph(); + transferGraphKind = 'rulebasetransfer'; + } - // Build transfer graph const jobId = crypto.randomUUID(); - const graph = buildTransferGraph(); + + const transferIntake = { + venue: venue || targetTemplateId || 'neurips', + doubleBlind: doubleBlind !== false, + preprint: !!preprint, + outputNotes: outputNotes || '', + }; const initialState = { sourceProjectId, sourceMainFile, targetProjectId: newProjectId, targetMainFile, + targetTemplateId, engine, layoutCheck, + enableSensitiveMask: !!enableSensitiveMask, + useAgent: !!useAgent, llmConfig: resolveLLMConfig(llmConfig), jobId, + transferGraphKind, + transferIntake, + userConfirmations: {}, }; jobs.set(jobId, { @@ -87,6 +199,23 @@ export function registerTransferRoutes(fastify) { progressLog: [], hasStarted: false, iterator: null, + liveProgress: null, + toolTraceRecent: [], + _transferDebugLogLen: 0, + _transferDebugEntriesLen: 0, + }); + + announceTransferDebugOnce(); + transferDebugLog(jobId, 'log', 'POST /transfer/start', { + targetTemplateId, + transferGraphKind, + useAgent: !!useAgent, + newProjectId, + sourceProjectId, + sourceMainFile, + targetMainFile, + engine, + layoutCheck, }); return { jobId, newProjectId }; @@ -105,33 +234,122 @@ export function registerTransferRoutes(fastify) { return reply.code(404).send({ error: 'Job not found.' }); } - // If waiting for images, don't proceed if (job.status === 'waiting_images') { - return { status: 'waiting_images', progressLog: job.progressLog }; + return buildTransferApiPayload(job, job.state); + } + + if (job.status === 'waiting_confirm') { + return buildTransferApiPayload(job, job.state); } try { job.status = 'running'; - const runConfig = { configurable: { thread_id: jobId } }; + // Initialize live progress for tool-level granularity + job.liveProgress = { + activeRole: '', + toolName: '', + toolArgs: '', + toolRound: 0, + maxToolRounds: 0, + seq: 0, + lastUpdate: Date.now(), + }; + const runConfig = { + configurable: { + thread_id: jobId, + _liveProgress: job.liveProgress, + _recordToolTrace: (entry) => { + if (!job.toolTraceRecent) job.toolTraceRecent = []; + pushToolTraceRecent(job, entry); + }, + }, + ...INVOKE_OPTS, + }; const input = job.hasStarted ? null : job.state; - const result = await job.graph.invoke(input, runConfig); + let result; + try { + // Use graph.stream() instead of graph.invoke() for node-level granularity. + // streamMode 'values' yields the full accumulated state after each node completes, + // allowing the SSE poll to pick up intermediate progress. + const stream = await job.graph.stream(input, { ...runConfig, streamMode: 'values' }); + for await (const snapshot of stream) { + // snapshot is the full accumulated state after this node completed + job.state = { ...job.state, ...snapshot }; + job.progressLog = job.state.progressLog || job.progressLog || []; + job.hasStarted = true; + transferDebugLog(jobId, 'log', `stream node completed: ${job.state.lastCompletedNode || '?'}`); + } + result = job.state; + } catch (invokeErr) { + if (isGraphInterruptErr(invokeErr)) { + const snap = await job.graph.getState(runConfig); + const values = snap?.values || {}; + job.hasStarted = true; + job.state = { ...job.state, ...values }; + job.progressLog = values.progressLog || job.progressLog || []; + + // Handle raiseQuestion interrupt from agentic nodes: + // The interrupt() call passes { type: 'raiseQuestion', pendingQA: [...] 
} + const interruptValues = snap?.tasks?.[0]?.interrupts?.[0]?.value; + if (interruptValues?.type === 'raiseQuestion' && interruptValues.pendingQA) { + job.state.pendingQA = interruptValues.pendingQA; + job.state.status = 'waiting_confirm'; + job.status = 'waiting_confirm'; + } else { + job.status = values.status || job.state.status || 'running'; + } + + job.error = undefined; + transferDebugLog(jobId, 'log', 'LangGraph interrupt — paused before next node (checkpoint saved)'); + logTransferStepResult(jobId, job, job.state); + return buildTransferApiPayload(job, job.state); + } + throw invokeErr; + } + // Also check for interrupt after stream completes normally (some LangGraph versions + // don't throw on interrupt when using stream) + try { + const snap = await job.graph.getState(runConfig); + const interruptValues = snap?.tasks?.[0]?.interrupts?.[0]?.value; + if (interruptValues?.type === 'raiseQuestion' && interruptValues.pendingQA) { + job.state.pendingQA = interruptValues.pendingQA; + job.state.status = 'waiting_confirm'; + job.status = 'waiting_confirm'; + job.error = undefined; + transferDebugLog(jobId, 'log', 'LangGraph interrupt detected after stream (checkpoint saved)'); + logTransferStepResult(jobId, job, job.state); + return buildTransferApiPayload(job, job.state); + } + } catch { /* getState may fail if graph fully completed — that's fine */ } + job.hasStarted = true; job.state = result; job.progressLog = result.progressLog || []; job.status = result.status || 'running'; + job.error = undefined; + // Clear live progress when step finishes + job.liveProgress = null; - return { - status: job.status, - progressLog: job.progressLog, - }; + logTransferStepResult(jobId, job, result); + + return buildTransferApiPayload(job, result); } catch (err) { const msg = err?.message || String(err || 'Unknown error'); job.status = 'error'; job.error = msg; - return reply.code(500).send({ + transferDebugLog(jobId, 'error', `POST /transfer/step failed: ${msg}`, err?.stack); + const payload = { error: msg, - progressLog: job.progressLog, - }); + ...buildTransferApiPayload(job, job.state), + }; + if (err instanceof TransferNodeError) { + payload.failedNode = err.node; + payload.failedPhase = err.phase; + payload.failedDetail = err.detail; + if (err.debugRelPath) payload.failedDebugPath = err.debugRelPath; + if (typeof err.inputChars === 'number') payload.failedInputChars = err.inputChars; + } + return reply.code(500).send(payload); } }); @@ -166,6 +384,44 @@ export function registerTransferRoutes(fastify) { job.state = { ...job.state, ...updated }; job.status = 'running'; + transferDebugLog(jobId, 'log', `submit-images: ${(images || []).length} page(s)`); + + return { ok: true }; + }); + + /** + * POST /api/transfer/submit-confirm + * Body: { jobId, answers: { [qaId]: string | string[] } } + */ + fastify.post('/api/transfer/submit-confirm', async (request, reply) => { + const { jobId, answers = {} } = request.body || {}; + const job = jobs.get(jobId); + if (!job) { + return reply.code(404).send({ error: 'Job not found.' }); + } + + if (job.status !== 'waiting_confirm') { + return reply.code(400).send({ error: 'Job is not waiting for confirmations.' 
}); + } + + const prev = job.state?.userConfirmations || {}; + const merged = { ...prev, ...answers }; + const updated = { userConfirmations: merged, status: 'running', pendingQA: null }; + + try { + if (job.hasStarted && typeof job.graph.updateState === 'function') { + await job.graph.updateState( + { configurable: { thread_id: jobId }, ...INVOKE_OPTS }, + updated, + ); + } + } catch { /* fallback below */ } + + job.state = { ...job.state, ...updated }; + job.status = 'running'; + + transferDebugLog(jobId, 'log', 'submit-confirm', { answerKeys: Object.keys(answers || {}) }); + return { ok: true }; }); @@ -180,12 +436,151 @@ export function registerTransferRoutes(fastify) { } return { - status: job.status, - progressLog: job.progressLog, - error: job.error || null, + transferGraphKind: job.state?.transferGraphKind || 'legacy', + ...buildTransferApiPayload(job, job.state), }; }); + /** + * GET /api/transfer/stream/:jobId + * SSE endpoint — pushes real-time progress events to the frontend. + * + * Events emitted: + * event: progress — full payload (same shape as /status) + * event: done — final payload when job finishes (success/failed/error) + * + * The connection stays open and polls the in-memory job state + * every 500 ms, emitting an event whenever the state has changed + * (new completedNodes, phase change, status change, new log entries). + */ + fastify.get('/api/transfer/stream/:jobId', async (request, reply) => { + const { jobId } = request.params; + const job = jobs.get(jobId); + if (!job) { + return reply.code(404).send({ error: 'Job not found.' }); + } + + // SSE headers + reply.raw.writeHead(200, { + 'Content-Type': 'text/event-stream', + 'Cache-Control': 'no-cache', + Connection: 'keep-alive', + 'X-Accel-Buffering': 'no', // disable nginx buffering + }); + + // Tracking: only send when something changed + let lastCompletedLen = 0; + let lastEntriesLen = 0; + let lastStatus = ''; + let lastPhase = ''; + let lastNode = ''; + let lastLpToolName = ''; + let lastLpToolArgs = ''; + let lastLpToolRound = -1; + let lastLpActiveRole = ''; + let lastLpSeq = -1; + let lastToolTraceLen = -1; + let closed = false; + + request.raw.on('close', () => { closed = true; }); + + function sendEvent(eventName, data) { + if (closed) return; + try { + reply.raw.write(`event: ${eventName}\ndata: ${JSON.stringify(data)}\n\n`); + } catch { closed = true; } + } + + // Send initial state immediately + const initialPayload = { + transferGraphKind: job.state?.transferGraphKind || 'legacy', + ...buildTransferApiPayload(job, job.state), + }; + sendEvent('progress', initialPayload); + lastCompletedLen = (initialPayload.completedNodes || []).length; + lastEntriesLen = (initialPayload.progressLogEntries || []).length; + lastStatus = initialPayload.status; + lastPhase = initialPayload.phase; + lastNode = initialPayload.currentNode; + if (initialPayload.liveProgress) { + lastLpToolName = initialPayload.liveProgress.toolName || ''; + lastLpToolArgs = initialPayload.liveProgress.toolArgs || ''; + lastLpToolRound = initialPayload.liveProgress.toolRound ?? -1; + lastLpActiveRole = initialPayload.liveProgress.activeRole || ''; + lastLpSeq = initialPayload.liveProgress.seq ?? 
-1; + } + lastToolTraceLen = (initialPayload.toolTraceRecent || []).length; + + // Poll loop + const interval = setInterval(() => { + if (closed) { clearInterval(interval); return; } + + const j = jobs.get(jobId); + if (!j) { sendEvent('done', { status: 'not_found' }); clearInterval(interval); reply.raw.end(); return; } + + const payload = { + transferGraphKind: j.state?.transferGraphKind || 'legacy', + ...buildTransferApiPayload(j, j.state), + }; + + const completedLen = (payload.completedNodes || []).length; + const entriesLen = (payload.progressLogEntries || []).length; + const lp = payload.liveProgress; + + const lpSeq = lp?.seq ?? -1; + const traceLen = (payload.toolTraceRecent || []).length; + const changed = + payload.status !== lastStatus || + payload.phase !== lastPhase || + payload.currentNode !== lastNode || + completedLen !== lastCompletedLen || + entriesLen !== lastEntriesLen || + traceLen !== lastToolTraceLen || + (lp && ( + lpSeq !== lastLpSeq || + lp.toolName !== lastLpToolName || + lp.toolArgs !== lastLpToolArgs || + lp.toolRound !== lastLpToolRound || + lp.activeRole !== lastLpActiveRole + )); + + if (changed) { + sendEvent('progress', payload); + lastCompletedLen = completedLen; + lastEntriesLen = entriesLen; + lastStatus = payload.status; + lastPhase = payload.phase; + lastNode = payload.currentNode; + lastToolTraceLen = traceLen; + if (lp) { + lastLpToolName = lp.toolName; + lastLpToolArgs = lp.toolArgs || ''; + lastLpToolRound = lp.toolRound; + lastLpActiveRole = lp.activeRole; + lastLpSeq = lpSeq; + } + } + + // Terminal states — send done and close + if (['success', 'failed', 'error'].includes(payload.status)) { + sendEvent('done', payload); + clearInterval(interval); + if (!closed) reply.raw.end(); + } + }, 500); + + // Keep-alive: send comment every 15s to prevent proxy timeout + const keepAlive = setInterval(() => { + if (closed) { clearInterval(keepAlive); return; } + try { reply.raw.write(': keepalive\n\n'); } catch { closed = true; } + }, 15000); + + request.raw.on('close', () => { + clearInterval(interval); + clearInterval(keepAlive); + }); + }); + /** * POST /api/transfer/start-mineru * Body: { sourceProjectId?, sourceMainFile?, targetTemplateId, targetMainFile, @@ -201,6 +596,7 @@ export function registerTransferRoutes(fastify) { targetTemplateId, targetMainFile, engine = 'pdflatex', layoutCheck = false, + enableSensitiveMask = false, llmConfig, mineruConfig, } = request.body || {}; @@ -245,21 +641,33 @@ export function registerTransferRoutes(fastify) { const templateRoot = path.join(TEMPLATE_DIR, targetTemplateId); await copyDir(templateRoot, projectRoot); - // Build MinerU transfer graph + // Build MinerU transfer graph — use agent hybrid for supported venues const jobId = crypto.randomUUID(); - const graph = buildMineruTransferGraph(); + const AGENT_ENABLED_VENUES = ['neurips', 'icml', 'cvpr', 'acl']; + const useAgentBackend = AGENT_ENABLED_VENUES.includes(targetTemplateId); + const graph = useAgentBackend ? buildMineruAgentGraph() : buildMineruTransferGraph(); const initialState = { sourceProjectId: sourceProjectId || '', sourceMainFile: sourceMainFile || '', targetProjectId: newProjectId, targetMainFile, + targetTemplateId, engine, layoutCheck, + enableSensitiveMask: !!enableSensitiveMask, llmConfig: resolveLLMConfig(llmConfig), mineruConfig: resolveMineruConfig(mineruConfig), transferMode: 'mineru', jobId, + transferGraphKind: useAgentBackend ? 
targetTemplateId : 'legacy', + transferIntake: { + venue: targetTemplateId || 'neurips', + doubleBlind: true, + preprint: false, + outputNotes: '', + }, + userConfirmations: {}, }; jobs.set(jobId, { @@ -269,6 +677,18 @@ export function registerTransferRoutes(fastify) { progressLog: [], hasStarted: false, iterator: null, + liveProgress: null, + toolTraceRecent: [], + _transferDebugLogLen: 0, + _transferDebugEntriesLen: 0, + }); + + announceTransferDebugOnce(); + transferDebugLog(jobId, 'log', 'POST /transfer/start-mineru', { + targetTemplateId, + newProjectId, + transferGraphKind: useAgentBackend ? targetTemplateId : 'legacy', + hasSourceProject: !!sourceProjectId, }); return { jobId, newProjectId }; diff --git a/apps/backend/src/services/compileService.js b/apps/backend/src/services/compileService.js index 6e598df..cfd46a4 100644 --- a/apps/backend/src/services/compileService.js +++ b/apps/backend/src/services/compileService.js @@ -8,16 +8,16 @@ import { getProjectRoot } from './projectService.js'; const SUPPORTED_ENGINES = ['pdflatex', 'xelatex', 'lualatex', 'latexmk', 'tectonic']; -function buildCommand(engine, outDir, mainFile) { +function buildCommand(engine, mainFile) { switch (engine) { case 'pdflatex': case 'xelatex': case 'lualatex': - return { cmd: engine, args: ['-interaction=nonstopmode', `-output-directory=${outDir}`, mainFile] }; + return { cmd: engine, args: ['-interaction=nonstopmode', mainFile] }; case 'latexmk': - return { cmd: 'latexmk', args: ['-pdf', '-interaction=nonstopmode', `-outdir=${outDir}`, mainFile] }; + return { cmd: 'latexmk', args: ['-pdf', '-interaction=nonstopmode', mainFile] }; case 'tectonic': - return { cmd: 'tectonic', args: ['--outdir', outDir, mainFile] }; + return { cmd: 'tectonic', args: [mainFile] }; default: return null; } @@ -64,56 +64,182 @@ export async function runCompile({ projectId, mainFile, engine = 'pdflatex' }) { logChunks.push(next.slice(0, remaining)); }; - const { cmd, args } = buildCommand(engine, outDir, mainFile); + const { cmd, args } = buildCommand(engine, mainFile); const needsBibPass = MULTI_PASS_ENGINES.includes(engine); + // Copy all project files to output directory + try { + const files = await fs.readdir(projectRoot, { withFileTypes: true }); + for (const file of files) { + if (file.name === '.compile' || file.name === 'node_modules') continue; + const srcPath = path.join(projectRoot, file.name); + const dstPath = path.join(outDir, file.name); + if (file.isDirectory()) { + await fs.cp(srcPath, dstPath, { recursive: true }); + } else { + await fs.copyFile(srcPath, dstPath); + } + } + pushLog(Buffer.from('[info] Copied project files to build directory.\n')); + } catch (err) { + await fs.rm(outDir, { recursive: true, force: true }); + return { ok: false, error: `Failed to copy project files: ${err.message}` }; + } + let code; try { // Pass 1: generate .aux with \citation{} entries - code = await runSpawn(cmd, args, projectRoot, pushLog); + code = await runSpawn(cmd, args, outDir, pushLog); if (needsBibPass) { const base = path.basename(mainFile, path.extname(mainFile)); const auxPath = path.join(outDir, `${base}.aux`); - // Detect whether to use biber or bibtex by checking .aux / source for biblatex - let useBiber = false; + // Check if user-provided .bib files exist in the project. + // IMPORTANT: check against the ORIGINAL project files (projectRoot), + // not outDir, because Pass 1 may auto-generate .bib files + // (e.g. revtex4-1 + apsrev4-1.bst creates *Notes.bib). 
+ // Using outDir would give a false positive and cause bibtex to run, + // which overwrites the existing .bbl with an empty one. + // Also skip empty/placeholder .bib files (< 50 bytes, e.g. template stubs). + let hasUserBibFiles = false; try { - const auxContent = await fs.readFile(auxPath, 'utf8'); - // biblatex writes \abx@aux@... commands in .aux; traditional bibtex does not - useBiber = auxContent.includes('\\abx@aux@'); - } catch { /* .aux missing — skip bib pass */ } + const projFiles = await fs.readdir(projectRoot); + const bibFiles = projFiles.filter(f => f.endsWith('.bib')); + for (const bf of bibFiles) { + try { + const st = await fs.stat(path.join(projectRoot, bf)); + if (st.size >= 50) { hasUserBibFiles = true; break; } + } catch { /* ignore */ } + } + } catch { /* ignore */ } - // Also check the source .tex for \usepackage{biblatex} as a fallback - if (!useBiber) { + // Also check if a pre-compiled .bbl already exists in the project + const mainBblPath = path.join(outDir, `${base}.bbl`); + let hasPrecompiledBbl = false; + try { + await fs.access(mainBblPath); + hasPrecompiledBbl = true; + } catch { /* no main.bbl yet */ } + + // If no .bbl with the main name, look for one that matches + // \bibliography{} or \addbibresource{} or \input{*.bbl} references + if (!hasPrecompiledBbl) { try { - const texContent = await fs.readFile(safeJoin(projectRoot, mainFile), 'utf8'); - useBiber = /\\usepackage(\[.*?\])?\{biblatex\}/.test(texContent); + const texContent = await fs.readFile(path.join(outDir, mainFile), 'utf8'); + + // Check if tex uses \input{something.bbl} — in that case, + // the bbl is loaded directly and we don't need main.bbl at all. + const inputBblRe = /\\input\s*\{\s*([^}]*\.bbl)\s*\}/g; + let inputBblMatch; + while ((inputBblMatch = inputBblRe.exec(texContent)) !== null) { + const bblName = inputBblMatch[1].trim(); + const bblPath = path.join(outDir, bblName); + try { + const st = await fs.stat(bblPath); + if (st.size > 100) { + // The tex directly \input's a real .bbl file — skip bibtex entirely + hasPrecompiledBbl = true; + pushLog(Buffer.from(`[info] Found \\input{${bblName}} (${st.size} bytes); using it directly.\n`)); + break; + } + } catch { /* file not found, continue */ } + } + + if (!hasPrecompiledBbl) { + // Collect bibliography names from \bibliography{a,b} and \addbibresource{a.bib} + const bibNames = []; + const bibPatternTrad = /\\bibliography\{([^}]+)\}/g; + let m; + while ((m = bibPatternTrad.exec(texContent)) !== null) { + m[1].split(',').forEach(ref => bibNames.push(ref.trim())); + } + const bibPatternRes = /\\addbibresource\{([^}]+)\}/g; + while ((m = bibPatternRes.exec(texContent)) !== null) { + bibNames.push(m[1].trim().replace(/\.bib$/i, '')); + } + + // Try each matched name to find a corresponding .bbl and copy it + for (const bibName of bibNames) { + if (bibName === base) continue; + const candidateBbl = path.join(outDir, `${bibName}.bbl`); + try { + await fs.access(candidateBbl); + await fs.copyFile(candidateBbl, mainBblPath); + hasPrecompiledBbl = true; + pushLog(Buffer.from(`[info] Copied ${bibName}.bbl to ${base}.bbl for LaTeX to use.\n`)); + break; + } catch { /* this .bbl not found, try next */ } + } + } + + // Fallback: if still no main.bbl, try any lone .bbl in the directory + if (!hasPrecompiledBbl) { + const allFiles = await fs.readdir(outDir); + const bblFiles = allFiles.filter(f => f.endsWith('.bbl')); + // If there's exactly one .bbl that's substantial (> 100 bytes), use it + const realBbls = []; + for (const bf of bblFiles) { + 
try { + const st = await fs.stat(path.join(outDir, bf)); + if (st.size > 100) realBbls.push(bf); + } catch { /* ignore */ } + } + if (realBbls.length === 1) { + await fs.copyFile(path.join(outDir, realBbls[0]), mainBblPath); + hasPrecompiledBbl = true; + pushLog(Buffer.from(`[info] Copied ${realBbls[0]} to ${base}.bbl (only substantial .bbl found).\n`)); + } + } } catch { /* ignore */ } } - const bibCmd = useBiber ? 'biber' : 'bibtex'; - const bibEnv = { - ...process.env, - BIBINPUTS: `${projectRoot}:`, - BSTINPUTS: `${projectRoot}:`, - }; - // Run bibtex/biber with cwd=outDir and relative base name to avoid - // openout_any=p blocking writes to absolute paths. - const bibArgs = useBiber - ? [`--input-directory=${projectRoot}`, base] - : [base]; + if (!hasUserBibFiles) { + // No user .bib files — do NOT run bibtex/biber as it would overwrite + // the existing .bbl with an empty one. Just use whatever .bbl is available. + if (hasPrecompiledBbl) { + pushLog(Buffer.from(`[info] No .bib files found; using existing ${base}.bbl (skipping bibtex/biber).\n`)); + } else { + pushLog(Buffer.from('[warn] No .bib or .bbl files found, citations will not resolve.\n')); + } + } else { + // Detect whether to use biber or bibtex by checking .aux / source for biblatex + let useBiber = false; + try { + const auxContent = await fs.readFile(auxPath, 'utf8'); + // biblatex writes \abx@aux@... commands in .aux; traditional bibtex does not + useBiber = auxContent.includes('\\abx@aux@'); + } catch { /* .aux missing — skip bib pass */ } - try { - await runSpawn(bibCmd, bibArgs, outDir, pushLog, bibEnv); - } catch { - // bibtex/biber not installed or failed — continue without it - pushLog(Buffer.from(`[warn] ${bibCmd} not available, skipping bibliography pass.\n`)); + // Also check the source .tex for \usepackage{biblatex} as a fallback + if (!useBiber) { + try { + const texContent = await fs.readFile(path.join(outDir, mainFile), 'utf8'); + useBiber = /\\usepackage(\[.*?\])?\{biblatex\}/.test(texContent); + } catch { /* ignore */ } + } + + const bibCmd = useBiber + ? 'biber' + : 'bibtex'; + const bibEnv = { + ...process.env, + BIBINPUTS: `${outDir}:`, + BSTINPUTS: `${outDir}:`, + }; + const bibArgs = useBiber ? 
[base] : [base]; + + try { + await runSpawn(bibCmd, bibArgs, outDir, pushLog, bibEnv); + } catch { + // bibtex/biber not installed or failed — continue without it + pushLog(Buffer.from(`[warn] ${bibCmd} not available, skipping bibliography pass.\n`)); + } } // Pass 2 + 3: resolve citations and cross-references - code = await runSpawn(cmd, args, projectRoot, pushLog); - code = await runSpawn(cmd, args, projectRoot, pushLog); + code = await runSpawn(cmd, args, outDir, pushLog); + code = await runSpawn(cmd, args, outDir, pushLog); } } catch (err) { await fs.rm(outDir, { recursive: true, force: true }); @@ -129,7 +255,35 @@ export async function runCompile({ projectId, mainFile, engine = 'pdflatex' }) { } catch { pdfBase64 = ''; } + + // Copy all .bbl files back to project root + try { + const files = await fs.readdir(outDir); + const bblFiles = files.filter(f => f.endsWith('.bbl')); + for (const bblFile of bblFiles) { + const srcPath = path.join(outDir, bblFile); + const dstPath = path.join(projectRoot, bblFile); + await fs.copyFile(srcPath, dstPath); + } + if (bblFiles.length > 0) { + pushLog(Buffer.from(`[info] Copied ${bblFiles.length} .bbl file(s) back to project.\n`)); + } + } catch { + // Ignore errors copying .bbl files + } + const log = logChunks.join(''); + + // Save compile log to project directory + try { + const logPath = path.join(projectRoot, 'compile.log'); + const timestamp = new Date().toISOString(); + const logContent = `=== Compile Log (${timestamp}) ===\nEngine: ${engine}\nMain File: ${mainFile}\n\n${log}`; + await fs.writeFile(logPath, logContent, 'utf8'); + } catch { + // Ignore errors saving log + } + await fs.rm(outDir, { recursive: true, force: true }); if (!pdfBase64) { return { ok: false, error: 'No PDF generated.', log, status: code ?? -1 }; diff --git a/apps/backend/src/services/mineruContentListCrop.js b/apps/backend/src/services/mineruContentListCrop.js new file mode 100644 index 0000000..d6c3681 --- /dev/null +++ b/apps/backend/src/services/mineruContentListCrop.js @@ -0,0 +1,239 @@ +import { promises as fs } from 'fs'; +import os from 'os'; +import path from 'path'; +import { execFile } from 'child_process'; +import { promisify } from 'util'; +import { PDFDocument } from 'pdf-lib'; +import sharp from 'sharp'; +import { safeJoin } from '../utils/pathUtils.js'; + +const execFileAsync = promisify(execFile); + +/** + * Normalize MinerU / zip JSON root to a flat list of block objects. 
+ */
+function flattenContentList(raw) {
+ if (!raw) return [];
+ if (Array.isArray(raw)) return raw;
+ if (Array.isArray(raw.content_list)) return raw.content_list;
+ if (Array.isArray(raw.pdf_info?.content_list)) return raw.pdf_info.content_list;
+ return [];
+}
+
+async function findContentListJson(searchDir) {
+ const names = await fs.readdir(searchDir, { withFileTypes: true });
+ for (const ent of names) {
+ if (!ent.isFile()) continue;
+ const low = ent.name.toLowerCase();
+ if (low.includes('content_list') && low.endsWith('.json')) {
+ return path.join(searchDir, ent.name);
+ }
+ }
+ return '';
+}
+
+/**
+ * @param {number[]} bbox [x0,y0,x1,y1]
+ * @param {number} pageWPts
+ * @param {number} pageHPts
+ * @param {number} scale px per PDF point (= dpi/72)
+ * @param {'pdf'|'top_left'} coords
+ */
+function bboxToSharpExtract(bbox, pageWPts, pageHPts, scale, coords) {
+ const [x0, y0, x1, y1] = bbox.map(Number);
+ if (![x0, y0, x1, y1].every(n => Number.isFinite(n))) return null;
+
+ let left;
+ let top;
+ let width;
+ let height;
+
+ if (coords === 'top_left') {
+ left = Math.max(0, Math.floor(x0 * scale));
+ top = Math.max(0, Math.floor(y0 * scale));
+ width = Math.max(1, Math.ceil((x1 - x0) * scale));
+ height = Math.max(1, Math.ceil((y1 - y0) * scale));
+ } else {
+ // PDF default: origin bottom-left, y increases upward
+ left = Math.max(0, Math.floor(x0 * scale));
+ width = Math.max(1, Math.ceil((x1 - x0) * scale));
+ height = Math.max(1, Math.ceil((y1 - y0) * scale));
+ top = Math.max(0, Math.floor((pageHPts - y1) * scale));
+ }
+
+ return { left, top, width, height };
+}
+
+/**
+ * Render one PDF page to PNG via poppler pdftoppm (must be on PATH).
+ * @returns {Promise<Buffer>}
+ */
+async function renderPdfPagePng(pdfPath, page1Based, dpi, tmpDir, prefix) {
+ const outBase = path.join(tmpDir, prefix);
+ await execFileAsync('pdftoppm', [
+ '-png',
+ '-r',
+ String(dpi),
+ '-f',
+ String(page1Based),
+ '-l',
+ String(page1Based),
+ pdfPath,
+ outBase,
+ ], { maxBuffer: 64 * 1024 * 1024 });
+
+ const outFile = `${outBase}-${page1Based}.png`;
+ return fs.readFile(outFile);
+}
+
+/**
+ * Replace MinerU-exported raster crops using source PDF + content_list.json bboxes.
+ * Requires: `pdftoppm` (poppler-utils). Enable with OPENPRISM_MINERU_BBOX_CROP=1 or mineruConfig.bboxCrop.
+ *
+ * @param {object} opts
+ * @param {string} opts.sourcePdfPath
+ * @param {string} opts.searchDir
+ * @param {Array<{name:string,localPath:string}>} opts.images
+ * @param {object} opts.mineruConfig
+ * @returns {Promise<{ images: typeof opts.images, cropped: number, diagnostics: string }>}
+ */
+export async function cropMineruImagesFromContentList(opts) {
+ const { sourcePdfPath, searchDir, images, mineruConfig } = opts;
+ if (!mineruConfig?.bboxCrop) {
+ return {
+ images,
+ cropped: 0,
+ diagnostics: 'bboxCrop disabled (set OPENPRISM_MINERU_BBOX_CROP=1 or mineruConfig.bboxCrop).',
+ };
+ }
+ if (!sourcePdfPath) {
+ return { images, cropped: 0, diagnostics: 'bboxCrop skipped: no sourcePdfPath.' };
+ }
+
+ const dpi = typeof mineruConfig.cropDpi === 'number' && mineruConfig.cropDpi > 0
+ ? mineruConfig.cropDpi
+ : 200;
+ const coords = mineruConfig.bboxCoords === 'top_left' ? 'top_left' : 'pdf';
+ const jsonPath = await findContentListJson(searchDir);
+ if (!jsonPath) {
+ return { images, cropped: 0, diagnostics: 'bboxCrop: no *content_list*.json next to Markdown.'
}; + } + + let raw; + try { + raw = JSON.parse(await fs.readFile(jsonPath, 'utf8')); + } catch (e) { + return { images, cropped: 0, diagnostics: `bboxCrop: failed to parse ${jsonPath}: ${e.message}` }; + } + + const items = flattenContentList(raw); + if (!items.length) { + return { images, cropped: 0, diagnostics: 'bboxCrop: content_list empty or unknown shape.' }; + } + + let pdfBuf; + try { + pdfBuf = await fs.readFile(sourcePdfPath); + } catch (e) { + return { images, cropped: 0, diagnostics: `bboxCrop: cannot read source PDF: ${e.message}` }; + } + + let pageSizes; + try { + const doc = await PDFDocument.load(pdfBuf, { ignoreEncryption: true }); + pageSizes = doc.getPages().map((p) => { + const { width, height } = p.getSize(); + return { width, height }; + }); + } catch (e) { + return { images, cropped: 0, diagnostics: `bboxCrop: pdf-lib load failed: ${e.message}` }; + } + + const byRelPath = new Map(); + for (const img of images || []) { + let rel; + try { + rel = path.relative(searchDir, img.localPath).replace(/\\/g, '/'); + } catch { + continue; + } + byRelPath.set(rel, img); + byRelPath.set(path.basename(img.localPath), img); + } + + const tmpRoot = await fs.mkdtemp(path.join(os.tmpdir(), 'openprism-mineru-crop-')); + const scale = dpi / 72; + let cropped = 0; + const pageCache = new Map(); + + try { + for (const entry of items) { + const imgRel = entry.img_path || entry.image_path || entry.image_file; + const bbox = entry.bbox; + if (!imgRel || !Array.isArray(bbox) || bbox.length < 4) continue; + + let page1Based; + if (entry.page_idx !== undefined && entry.page_idx !== null) { + page1Based = Number(entry.page_idx) + 1; + } else if (entry.page_number !== undefined) { + page1Based = Number(entry.page_number); + } else if (entry.page !== undefined) { + page1Based = Number(entry.page); + } else { + continue; + } + if (!Number.isFinite(page1Based) || page1Based < 1 || page1Based > pageSizes.length) continue; + + let targetAbs; + try { + targetAbs = safeJoin(searchDir, String(imgRel).replace(/\\/g, '/')); + } catch { + continue; + } + + const imgRecord = byRelPath.get(String(imgRel).replace(/\\/g, '/')) + || byRelPath.get(path.basename(imgRel)); + if (!imgRecord) continue; + if (!/\.(png|jpe?g|webp|gif|bmp)$/i.test(targetAbs)) continue; + + const { width: pw, height: ph } = pageSizes[page1Based - 1]; + const extract = bboxToSharpExtract(bbox, pw, ph, scale, coords); + if (!extract) continue; + + let pagePng; + if (pageCache.has(page1Based)) { + pagePng = pageCache.get(page1Based); + } else { + try { + pagePng = await renderPdfPagePng(sourcePdfPath, page1Based, dpi, tmpRoot, `pg`); + pageCache.set(page1Based, pagePng); + } catch (e) { + return { + images, + cropped, + diagnostics: `bboxCrop: pdftoppm failed (${e.message}). 
Install poppler-utils or disable bboxCrop.`,
+ };
+ }
+ }
+
+ try {
+ const outBuf = await sharp(pagePng)
+ .extract(extract)
+ .png({ compressionLevel: 6 })
+ .toBuffer();
+ await fs.writeFile(targetAbs, outBuf);
+ cropped++;
+ } catch (e) {
+ console.warn('[mineruContentListCrop] extract failed', targetAbs, e?.message || e);
+ }
+ }
+ } finally {
+ await fs.rm(tmpRoot, { recursive: true, force: true }).catch(() => {});
+ }
+
+ return {
+ images,
+ cropped,
+ diagnostics: `bboxCrop: rewrote ${cropped} image(s) from source PDF at ${dpi} dpi (coords=${coords}).`,
+ };
+}
diff --git a/apps/backend/src/services/mineruRasterToPdf.js b/apps/backend/src/services/mineruRasterToPdf.js
new file mode 100644
index 0000000..a784722
--- /dev/null
+++ b/apps/backend/src/services/mineruRasterToPdf.js
@@ -0,0 +1,173 @@
+import { promises as fs } from 'fs';
+import path from 'path';
+import { PDFDocument } from 'pdf-lib';
+import sharp from 'sharp';
+
+const RASTER_EXT = /\.(png|jpe?g|webp|gif|bmp)$/i;
+
+/**
+ * Rewrite Markdown / HTML image references after converting raster basenames to .pdf.
+ * @param {string} md
+ * @param {{ relOld: string, relNew: string, baseOld: string, baseNew: string }[]} conversions
+ */
+export function rewriteMarkdownImageRefs(md, conversions) {
+ if (!conversions.length) return md;
+
+ const sorted = [...conversions].sort((a, b) => b.relOld.length - a.relOld.length);
+ let out = md;
+ for (const c of sorted) {
+ out = out.split(c.relOld).join(c.relNew);
+ const withDotSlash = `./${c.relOld}`;
+ const withDotSlashNew = `./${c.relNew}`;
+ out = out.split(withDotSlash).join(withDotSlashNew);
+ }
+
+ out = out.replace(/!\[([^\]]*)\]\(([^)]+)\)/g, (full, alt, url) => {
+ const trimmed = url.trim().replace(/^<|>$/g, '');
+ for (const c of conversions) {
+ if (trimmed === c.baseOld || trimmed.endsWith(`/${c.baseOld}`)) {
+ const next =
+ trimmed === c.baseOld
+ ? c.baseNew
+ : `${trimmed.slice(0, -c.baseOld.length)}${c.baseNew}`;
+ return `![${alt}](${next})`;
+ }
+ }
+ return full;
+ });
+
+ out = out.replace(/<img([^>]*?)src\s*=\s*["']([^"']+)["']/gi, (full, pre, url) => {
+ const trimmed = url.trim();
+ for (const c of conversions) {
+ if (trimmed === c.baseOld || trimmed.endsWith(`/${c.baseOld}`)) {
+ const next =
+ trimmed === c.baseOld
+ ? c.baseNew
+ : `${trimmed.slice(0, -c.baseOld.length)}${c.baseNew}`;
+ return `<img${pre}src="${next}"`;
+ }
+ }
+ return full;
+ });
+
+ return out;
+}
+
+async function rasterToPdfBytes(localPath, imageScale) {
+ const raw = await fs.readFile(localPath);
+ const ext = path.extname(localPath).toLowerCase();
+ const scale = imageScale > 1 ? imageScale : 1;
+
+ let pngBytes;
+ if (ext === '.png' && scale <= 1) {
+ pngBytes = raw;
+ } else if ((ext === '.jpg' || ext === '.jpeg') && scale <= 1) {
+ const pdfDoc = await PDFDocument.create();
+ const image = await pdfDoc.embedJpg(raw);
+ const page = pdfDoc.addPage([image.width, image.height]);
+ page.drawImage(image, { x: 0, y: 0, width: image.width, height: image.height });
+ return pdfDoc.save();
+ } else {
+ let pipeline = sharp(raw);
+ if (scale > 1) {
+ const meta = await sharp(raw).metadata();
+ if (meta.width && meta.height) {
+ const w = Math.max(1, Math.round(meta.width * scale));
+ const h = Math.max(1, Math.round(meta.height * scale));
+ pipeline = sharp(raw).resize(w, h, { kernel: sharp.kernel.lanczos3 });
+ }
+ }
+ pngBytes = await pipeline.png({ compressionLevel: 6 }).toBuffer();
+ }
+
+ const pdfDoc = await PDFDocument.create();
+ const image = await pdfDoc.embedPng(pngBytes);
+ const page = pdfDoc.addPage([image.width, image.height]);
+ page.drawImage(image, { x: 0, y: 0, width: image.width, height: image.height });
+ return pdfDoc.save();
+}
+
+/**
+ * Convert MinerU raster images to single-page PDFs; sync Markdown and image manifest.
+ * @param {object} opts + * @param {string} opts.markdownContent + * @param {Array<{name:string,localPath:string}>} opts.images + * @param {string} opts.searchDir - directory containing the .md (MinerU extract root) + * @param {object} opts.mineruConfig - resolved config (rasterToPdf, deleteRasterAfterPdf, imageScale) + * @returns {Promise<{ markdownContent: string, images: Array<{name:string,localPath:string}>, converted: number, diagnostics: string }>} + */ +export async function applyMineruRasterToPdf(opts) { + const { markdownContent, images, searchDir, mineruConfig } = opts; + if (!mineruConfig?.rasterToPdf) { + return { + markdownContent, + images, + converted: 0, + diagnostics: 'rasterToPdf disabled (set OPENPRISM_MINERU_RASTER_TO_PDF=1 or mineruConfig.rasterToPdf).', + }; + } + + const imageScale = typeof mineruConfig.imageScale === 'number' && mineruConfig.imageScale > 0 + ? mineruConfig.imageScale + : 1; + const deleteAfter = !!mineruConfig.deleteRasterAfterPdf; + + const conversions = []; + let converted = 0; + const nextImages = []; + + for (const img of images || []) { + const localPath = img.localPath; + if (!localPath || !RASTER_EXT.test(localPath)) { + nextImages.push(img); + continue; + } + + const pdfPath = localPath.replace(RASTER_EXT, '.pdf'); + if (pdfPath === localPath) { + nextImages.push(img); + continue; + } + + try { + const pdfBytes = await rasterToPdfBytes(localPath, imageScale); + await fs.writeFile(pdfPath, pdfBytes); + converted++; + + const relOld = path.relative(searchDir, localPath).replace(/\\/g, '/'); + const relNew = path.relative(searchDir, pdfPath).replace(/\\/g, '/'); + conversions.push({ + relOld, + relNew, + baseOld: path.basename(localPath), + baseNew: path.basename(pdfPath), + }); + + nextImages.push({ + name: path.basename(pdfPath), + localPath: pdfPath, + }); + + if (deleteAfter) { + await fs.unlink(localPath).catch(() => {}); + } + } catch (e) { + nextImages.push(img); + console.warn('[mineruRasterToPdf] skip', localPath, e?.message || e); + } + } + + const newMd = rewriteMarkdownImageRefs(markdownContent, conversions); + const diagnostics = + `rasterToPdf: converted ${converted} file(s) to single-page PDF` + + (imageScale > 1 ? ` (imageScale=${imageScale})` : '') + + (deleteAfter ? ', deleted originals' : '') + + '. Compare compile output with OPENPRISM_MINERU_RASTER_TO_PDF=0 to isolate PNG-vs-PDF embedding differences.'; + + return { + markdownContent: newMd, + images: nextImages, + converted, + diagnostics, + }; +} diff --git a/apps/backend/src/services/mineruService.js b/apps/backend/src/services/mineruService.js index b7cc77f..f3b9530 100644 --- a/apps/backend/src/services/mineruService.js +++ b/apps/backend/src/services/mineruService.js @@ -6,9 +6,31 @@ import { safeJoin } from '../utils/pathUtils.js'; const MINERU_MAX_FILE_BYTES = 200 * 1024 * 1024; +function envBool(name, defaultVal = false) { + const v = process.env[name]; + if (v === undefined || v === '') return defaultVal; + return !['0', 'false', 'no', 'off'].includes(String(v).toLowerCase()); +} + +function envFloat(name, defaultVal) { + const v = process.env[name]; + if (v === undefined || v === '') return defaultVal; + const n = Number(v); + return Number.isFinite(n) ? n : defaultVal; +} + /** * Resolve MinerU configuration from request config or environment variables. * apiBase can be overridden by the frontend; falls back to MINERU_API_BASE constant. + * + * Tuning (for clearer PDFs / different MinerU outputs): + * - modelVersion: MinerU API `model_version`, e.g. 
`vlm` or `pipeline` — compare zip contents empirically. + * - extraFormats: optional `['docx','html','latex']` per API; may add auxiliary files inside the result zip. + * + * Post-processing (after zip extract): + * - rasterToPdf / OPENPRISM_MINERU_RASTER_TO_PDF — wrap PNG/JPEG/WebP as single-page PDFs and rewrite Markdown refs. + * - imageScale / OPENPRISM_MINERU_IMAGE_SCALE (>1) — Lanczos upscale before embedding (optional). + * - bboxCrop / OPENPRISM_MINERU_BBOX_CROP — replace images using source PDF + *content_list*.json (needs `pdftoppm`). */ export function resolveMineruConfig(mineruConfig) { const rawBase = (mineruConfig?.apiBase || process.env.OPENPRISM_MINERU_API_BASE || MINERU_API_BASE).trim(); @@ -30,6 +52,24 @@ export function resolveMineruConfig(mineruConfig) { callback: typeof mineruConfig?.callback === 'string' ? mineruConfig.callback.trim() : '', seed: typeof mineruConfig?.seed === 'string' ? mineruConfig.seed.trim() : '', extraFormats, + rasterToPdf: typeof mineruConfig?.rasterToPdf === 'boolean' + ? mineruConfig.rasterToPdf + : envBool('OPENPRISM_MINERU_RASTER_TO_PDF', false), + deleteRasterAfterPdf: typeof mineruConfig?.deleteRasterAfterPdf === 'boolean' + ? mineruConfig.deleteRasterAfterPdf + : envBool('OPENPRISM_MINERU_DELETE_RASTER_AFTER_PDF', false), + imageScale: typeof mineruConfig?.imageScale === 'number' && mineruConfig.imageScale > 0 + ? mineruConfig.imageScale + : envFloat('OPENPRISM_MINERU_IMAGE_SCALE', 1), + bboxCrop: typeof mineruConfig?.bboxCrop === 'boolean' + ? mineruConfig.bboxCrop + : envBool('OPENPRISM_MINERU_BBOX_CROP', false), + cropDpi: typeof mineruConfig?.cropDpi === 'number' && mineruConfig.cropDpi > 0 + ? mineruConfig.cropDpi + : envFloat('OPENPRISM_MINERU_CROP_DPI', 200), + bboxCoords: mineruConfig?.bboxCoords === 'top_left' || process.env.OPENPRISM_MINERU_BBOX_COORDS === 'top_left' + ? 
'top_left' + : 'pdf', }; } @@ -245,7 +285,7 @@ async function parseExtractedOutput(outputDir) { images.push(...fallback); } - return { markdownContent, images, searchDir }; + return { markdownContent, images, searchDir, markdownPath }; } async function findFirstFileRecursive(rootDir, predicate) { @@ -313,7 +353,7 @@ async function findFilesUnderDirNamedRecursive(rootDir, targetDirName, filePredi * @param {object} mineruConfig - { apiBase, token, modelVersion } * @param {string} outputDir - directory to extract results into * @param {function} onProgress - optional progress callback - * @returns {{ markdownContent: string, images: Array<{name,localPath}> }} + * @returns {{ markdownContent: string, images: Array<{name,localPath}>, searchDir: string, markdownPath: string }} */ export async function parsePdfWithMineru(pdfPath, mineruConfig, outputDir, onProgress) { const config = resolveMineruConfig(mineruConfig); diff --git a/apps/backend/src/services/projectService.js b/apps/backend/src/services/projectService.js index f8bb752..7af8600 100644 --- a/apps/backend/src/services/projectService.js +++ b/apps/backend/src/services/projectService.js @@ -3,8 +3,13 @@ import path from 'path'; import { DATA_DIR } from '../config/constants.js'; export async function getProjectRoot(id) { + if (!id) throw new Error('getProjectRoot: project id is required'); const projectRoot = path.join(DATA_DIR, id); const metaPath = path.join(projectRoot, 'project.json'); - await fs.access(metaPath); + try { + await fs.access(metaPath); + } catch { + throw new Error(`Project not found: ${id} (missing ${metaPath})`); + } return projectRoot; } diff --git a/apps/backend/src/services/transferAgent/fsTools.js b/apps/backend/src/services/transferAgent/fsTools.js new file mode 100644 index 0000000..9771919 --- /dev/null +++ b/apps/backend/src/services/transferAgent/fsTools.js @@ -0,0 +1,30 @@ +import { promises as fs } from 'fs'; +import { safeJoin } from '../../utils/pathUtils.js'; + +/** + * Read a file relative to workspace root (target project); throws on escape. + */ +export async function readWorkspaceFile(workspaceRoot, relPath) { + const abs = safeJoin(workspaceRoot, relPath); + return fs.readFile(abs, 'utf8'); +} + +/** + * Read a file relative to source read root; throws on escape. + */ +export async function readSourceFile(sourceReadRoot, relPath) { + const abs = safeJoin(sourceReadRoot, relPath); + return fs.readFile(abs, 'utf8'); +} + +/** + * True if path exists under workspace. + */ +export async function workspaceFileExists(workspaceRoot, relPath) { + try { + await fs.access(safeJoin(workspaceRoot, relPath)); + return true; + } catch { + return false; + } +} diff --git a/apps/backend/src/services/transferAgent/graphMineruAgent.js b/apps/backend/src/services/transferAgent/graphMineruAgent.js new file mode 100644 index 0000000..f41fb18 --- /dev/null +++ b/apps/backend/src/services/transferAgent/graphMineruAgent.js @@ -0,0 +1,74 @@ +/** + * graphMineruAgent.js — MinerU + Agentic Transfer Graph + * + * Hybrid graph: MinerU front-end (PDF → Markdown) + Agent back-end (migration). + * + * compileSource → parsePdfWithMineru → planner → generator → reviewer ──┐ + * ↑ │ + * └──── revise (iteration < max) ┘ + * │ + * (pass)│ + * ▼ + * finalize + * + * The planner/generator/reviewer nodes receive the parsed Markdown content + * via state.sourceMarkdown and state.sourceImages, and use venue skills + * to produce the target LaTeX. 
+ */ + +import { StateGraph, END, MemorySaver } from '@langchain/langgraph'; +import { TransferState } from './state.js'; +import { compileSource } from './nodes/compileSource.js'; +import { parsePdfWithMineru } from './nodes/parsePdfWithMineru.js'; +import { agentPlanner } from './nodes/agentPlanner.js'; +import { agentGenerator } from './nodes/agentGenerator.js'; +import { agentReviewer } from './nodes/agentReviewer.js'; +import { finalize } from './nodes/finalize.js'; + +/** + * Route after Reviewer: loop back to Planner or proceed to Finalize. + */ +function routeAfterReview(state) { + const review = state.reviewResult || {}; + const iteration = state.currentIteration || 0; + const maxIterations = state.maxIterations || 5; + + if (review.verdict === 'pass') return 'finalize'; + if (iteration >= maxIterations) return 'finalize'; + return 'planner'; +} + +/** + * Build the MinerU + Agent hybrid transfer graph. + */ +export function buildMineruAgentGraph() { + const graph = new StateGraph(TransferState); + + // MinerU front-end: PDF → Markdown + graph.addNode('compileSource', compileSource); + graph.addNode('parsePdfWithMineru', parsePdfWithMineru); + + // Agent back-end: Markdown → LaTeX (venue-aware) + graph.addNode('planner', agentPlanner); + graph.addNode('generator', agentGenerator); + graph.addNode('reviewer', agentReviewer); + graph.addNode('finalize', finalize); + + // Wire edges + graph.setEntryPoint('compileSource'); + graph.addEdge('compileSource', 'parsePdfWithMineru'); + graph.addEdge('parsePdfWithMineru', 'planner'); + graph.addEdge('planner', 'generator'); + graph.addEdge('generator', 'reviewer'); + + graph.addConditionalEdges('reviewer', routeAfterReview, { + planner: 'planner', + finalize: 'finalize', + }); + + graph.addEdge('finalize', END); + + return graph.compile({ + checkpointer: new MemorySaver(), + }); +} diff --git a/apps/backend/src/services/transferAgent/graphNeurips.js b/apps/backend/src/services/transferAgent/graphNeurips.js new file mode 100644 index 0000000..8d116c6 --- /dev/null +++ b/apps/backend/src/services/transferAgent/graphNeurips.js @@ -0,0 +1,87 @@ +import { StateGraph, END, MemorySaver } from '@langchain/langgraph'; +import { TransferState } from './state.js'; +import { intake } from './nodes/neurips/intake.js'; +import { analyzeSource } from './nodes/analyzeSource.js'; +import { analyzeTarget } from './nodes/analyzeTarget.js'; +import { draftPlan } from './nodes/draftPlan.js'; +import { prepareConfirmPlan } from './nodes/neurips/prepareConfirmPlan.js'; +import { consumeConfirmPlan } from './nodes/neurips/consumeConfirmPlan.js'; +import { applyPreamble } from './nodes/neurips/applyPreamble.js'; +import { applyBody } from './nodes/neurips/applyBody.js'; +import { normalizeFigures } from './nodes/neurips/normalizeFigures.js'; +import { copyAssets } from './nodes/copyAssets.js'; +import { applyBibliography } from './nodes/neurips/applyBibliography.js'; +import { prepareConfirmBlind } from './nodes/neurips/prepareConfirmBlind.js'; +import { consumeConfirmBlind } from './nodes/neurips/consumeConfirmBlind.js'; +import { blindConfirmBypass } from './nodes/neurips/blindConfirmBypass.js'; +import { sanitizeBlind } from './nodes/neurips/sanitizeBlind.js'; +import { policyCheck } from './nodes/neurips/policyCheck.js'; +import { finalize } from './nodes/finalize.js'; + +function routeBlind(state) { + if (state.pendingQA?.length) return 'consumeConfirmBlind'; + return 'blindConfirmBypass'; +} + +/** + * NeurIPS LaTeX→LaTeX transfer: stops after sanitizeBlind 
(no server pdflatex / fixCompile / layout). + * Authors compile locally. + */ +export function buildNeuripsLatexGraph() { + const graph = new StateGraph(TransferState); + + graph.addNode('intake', intake); + graph.addNode('analyzeSource', analyzeSource); + graph.addNode('analyzeTarget', analyzeTarget); + graph.addNode('draftPlan', draftPlan); + graph.addNode('prepareConfirmPlan', prepareConfirmPlan); + graph.addNode('consumeConfirmPlan', consumeConfirmPlan); + graph.addNode('applyPreamble', applyPreamble); + graph.addNode('applyBody', applyBody); + graph.addNode('normalizeFigures', normalizeFigures); + graph.addNode('copyAssets', copyAssets); + graph.addNode('applyBibliography', applyBibliography); + graph.addNode('prepareConfirmBlind', prepareConfirmBlind); + graph.addNode('consumeConfirmBlind', consumeConfirmBlind); + graph.addNode('blindConfirmBypass', blindConfirmBypass); + graph.addNode('sanitizeBlind', sanitizeBlind); + graph.addNode('policyCheck', policyCheck); + graph.addNode('finalize', finalize); + + graph.setEntryPoint('intake'); + + graph.addEdge('intake', 'analyzeSource'); + graph.addEdge('analyzeSource', 'analyzeTarget'); + graph.addEdge('analyzeTarget', 'draftPlan'); + graph.addEdge('draftPlan', 'prepareConfirmPlan'); + graph.addEdge('prepareConfirmPlan', 'consumeConfirmPlan'); + graph.addEdge('consumeConfirmPlan', 'applyPreamble'); + graph.addEdge('applyPreamble', 'applyBody'); + graph.addEdge('applyBody', 'normalizeFigures'); + graph.addEdge('normalizeFigures', 'copyAssets'); + graph.addEdge('copyAssets', 'applyBibliography'); + graph.addEdge('applyBibliography', 'prepareConfirmBlind'); + graph.addConditionalEdges('prepareConfirmBlind', routeBlind, { + consumeConfirmBlind: 'consumeConfirmBlind', + blindConfirmBypass: 'blindConfirmBypass', + }); + graph.addEdge('consumeConfirmBlind', 'sanitizeBlind'); + graph.addEdge('blindConfirmBypass', 'sanitizeBlind'); + graph.addEdge('sanitizeBlind', 'policyCheck'); + graph.addEdge('policyCheck', 'finalize'); + graph.addEdge('finalize', END); + + return graph.compile({ + checkpointer: new MemorySaver(), + interruptBefore: [ + 'consumeConfirmPlan', + 'applyPreamble', + 'applyBody', + 'normalizeFigures', + 'applyBibliography', + 'consumeConfirmBlind', + 'sanitizeBlind', + 'policyCheck', + ], + }); +} diff --git a/apps/backend/src/services/transferAgent/graphRuleBaseTransfer.js b/apps/backend/src/services/transferAgent/graphRuleBaseTransfer.js new file mode 100644 index 0000000..f988f9b --- /dev/null +++ b/apps/backend/src/services/transferAgent/graphRuleBaseTransfer.js @@ -0,0 +1,19 @@ +import { StateGraph, END, MemorySaver } from '@langchain/langgraph'; +import { TransferState } from './state.js'; +import { ruleBaseTransferConvert } from './nodes/ruleBaseTransferConvert.js'; +import { finalize } from './nodes/finalize.js'; + +export function buildRuleBaseTransferGraph() { + const graph = new StateGraph(TransferState); + + graph.addNode('ruleBaseTransferConvert', ruleBaseTransferConvert); + graph.addNode('finalize', finalize); + + graph.setEntryPoint('ruleBaseTransferConvert'); + graph.addEdge('ruleBaseTransferConvert', 'finalize'); + graph.addEdge('finalize', END); + + return graph.compile({ + checkpointer: new MemorySaver(), + }); +} diff --git a/apps/backend/src/services/transferAgent/graphVenueAgent.js b/apps/backend/src/services/transferAgent/graphVenueAgent.js new file mode 100644 index 0000000..82c1131 --- /dev/null +++ b/apps/backend/src/services/transferAgent/graphVenueAgent.js @@ -0,0 +1,88 @@ +/** + * graphVenueAgent.js — 
Agentic venue transfer graph (multi-template)
+ *
+ * Used for NeurIPS, ICML, CVPR, ACL, etc. when `useAgent` is true. Venue-specific
+ * prompts come from `skills/` + `rules/<venueId>.md` under `services/transferAgent/`;
+ * `transferGraphKind` holds the template id (e.g. `neurips`, `icml`).
+ *
+ * Replaces the 17-node NeurIPS-only pipeline (graphNeurips.js) with a 3-node agentic loop:
+ *
+ *        ┌──────────────────────────────────────┐
+ *        │                                      │
+ *        ▼                                      │
+ *    planner ──► generator ──► reviewer ────────┤
+ *                                 │             │
+ *                           (pass)│    (revise) │
+ *                                 ▼             │
+ *                             finalize          │
+ *                                 ▲             │
+ *                 (max_iterations)└─────────────┘
+ *
+ * Each node is a ReAct-style agent with tool-calling capabilities.
+ *
+ * Human-in-the-loop: the raiseQuestion tool triggers LangGraph interrupt(),
+ * pausing the graph until the user provides answers via the API.
+ */
+
+import { StateGraph, END, MemorySaver } from '@langchain/langgraph';
+import { TransferState } from './state.js';
+import { agentPlanner } from './nodes/agentPlanner.js';
+import { agentGenerator } from './nodes/agentGenerator.js';
+import { agentReviewer } from './nodes/agentReviewer.js';
+import { finalize } from './nodes/finalize.js';
+
+/**
+ * Route after Reviewer: loop back to Planner or proceed to Finalize.
+ */
+function routeAfterReview(state) {
+  const review = state.reviewResult || {};
+  const iteration = state.currentIteration || 0;
+  const maxIterations = state.maxIterations || 5;
+
+  // Pass → finalize
+  if (review.verdict === 'pass') {
+    return 'finalize';
+  }
+
+  // Max iterations exceeded → finalize anyway
+  if (iteration >= maxIterations) {
+    return 'finalize';
+  }
+
+  // Revise → loop back to planner
+  return 'planner';
+}
+
+/**
+ * Build the agentic transfer graph for supported venue templates.
+ *
+ * API surface (state shape, interrupt handling) matches legacy transfer routes.
+ */
+export function buildVenueAgentGraph() {
+  const graph = new StateGraph(TransferState);
+
+  // Register nodes
+  graph.addNode('planner', agentPlanner);
+  graph.addNode('generator', agentGenerator);
+  graph.addNode('reviewer', agentReviewer);
+  graph.addNode('finalize', finalize);
+
+  // Wire edges: linear planner → generator → reviewer
+  graph.setEntryPoint('planner');
+  graph.addEdge('planner', 'generator');
+  graph.addEdge('generator', 'reviewer');
+
+  // Conditional edge from reviewer: pass→finalize, revise→planner
+  graph.addConditionalEdges('reviewer', routeAfterReview, {
+    planner: 'planner',
+    finalize: 'finalize',
+  });
+
+  graph.addEdge('finalize', END);
+
+  return graph.compile({
+    checkpointer: new MemorySaver(),
+    // raiseQuestion tool triggers interrupt() internally;
+    // no need for interruptBefore on specific nodes.
+  });
+}
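+
+// Minimal invoke sketch (illustrative only; not wired into routes/transfer.js).
+// Because the graph compiles with a MemorySaver checkpointer, callers must pass a
+// thread id, and a recursionLimit bounds total steps across the planner/generator/
+// reviewer loop. The function name and the limit value are assumptions for this
+// example, not part of the module's API.
+export async function exampleInvokeVenueAgentGraph(initialState, jobId) {
+  const app = buildVenueAgentGraph();
+  return app.invoke(initialState, {
+    recursionLimit: 120,
+    configurable: { thread_id: jobId },
+  });
+}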
diff --git a/apps/backend/src/services/transferAgent/llmUnifiedDiff.js b/apps/backend/src/services/transferAgent/llmUnifiedDiff.js
new file mode 100644
index 0000000..259efd8
--- /dev/null
+++ b/apps/backend/src/services/transferAgent/llmUnifiedDiff.js
@@ -0,0 +1,281 @@
+import path from 'path';
+import { promises as fs } from 'fs';
+import { applyPatch } from 'diff';
+import { stripCodeFences, rejectCatastrophicFullTexRewrite } from './utils.js';
+import { TransferNodeError } from './transferNodeError.js';
+import { ensureDir } from '../../utils/fsUtils.js';
+
+/** Set OPENPRISM_TRANSFER_SAVE_LLM_DIFF=0 to skip writing raw/patch files under .agent_runs/…/llm_diff/ */
+function isLlmDiffArtifactSaveEnabled() {
+  const e = process.env.OPENPRISM_TRANSFER_SAVE_LLM_DIFF;
+  if (e === '0' || e === 'false' || e === 'no') return false;
+  return true;
+}
+
+/**
+ * @param {{ projectRoot: string, jobId: string }} debug
+ * @returns {{ absDir: string, relPosix: string } | null}
+ */
+function resolveDiffDebugDir(debug, nodeName, runId) {
+  if (!debug?.projectRoot || !debug?.jobId) return null;
+  const folder = `${nodeName}-${runId}`;
+  const relPosix = `.agent_runs/${debug.jobId}/llm_diff/${folder}`;
+  const absDir = path.join(debug.projectRoot, '.agent_runs', debug.jobId, 'llm_diff', folder);
+  return { absDir, relPosix };
+}
+
+async function persistDiffAttempt(absDir, attempt, payload) {
+  const p = (n) => path.join(absDir, n);
+  await fs.writeFile(p(`attempt_${attempt}_raw.txt`), payload.raw, 'utf8');
+  await fs.writeFile(p(`attempt_${attempt}_extracted.patch`), payload.patchText, 'utf8');
+  await fs.writeFile(
+    p(`attempt_${attempt}_meta.json`),
+    `${JSON.stringify(payload.meta, null, 2)}\n`,
+    'utf8',
+  );
+}
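+
+// Resulting on-disk bundle, per resolveDiffDebugDir/persistDiffAttempt above (sketch):
+//   .agent_runs/<jobId>/llm_diff/<nodeName>-<runId>/
+//     input_main.tex, README.txt,
+//     attempt_1_raw.txt, attempt_1_extracted.patch, attempt_1_meta.json, ...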
+
+/**
+ * Prompt appendix: require git unified diff for a single virtual path (matches applyPatch on full file text).
+ */
+export function mainTexDiffInstructions(virtualPath = 'main.tex') {
+  const v = virtualPath.replace(/\\/g, '/');
+  return `
+
+Output ONLY a unified diff in git format. Do NOT output the full .tex file or any explanation outside the patch.
+Patch headers MUST be exactly (use these paths):
+--- a/${v}
++++ b/${v}
+
+Then @@ ... @@ hunks with context lines (space prefix), removals (-), additions (+). Every context line (leading space) and every removed line (-) MUST be copied verbatim from CURRENT_FILE — same characters, trailing spaces, and line breaks. Do not paraphrase or re-wrap lines. If the change is small, use a single hunk with 3+ lines of real context from the file.
+
+Multi-hunk / structure (critical for applyPatch):
+- If edits are separated by any lines you are not changing (paragraphs, equations, \\subsection, blank lines, etc.), use SEPARATE @@ hunks. Do NOT end one hunk right after \\end{figure} and immediately continue with \\begin{figure*} unless those lines are truly adjacent in CURRENT_FILE with nothing between them.
+- If one hunk spans two distant regions, EVERY intervening line must appear unchanged as context lines (leading space) inside that same hunk. Safer: split into multiple hunks, each anchored at the real line numbers in CURRENT_FILE.
+- In each @@ -OLDSTART,OLDCOUNT +NEWSTART,NEWCOUNT @@ header: OLDCOUNT must equal the number of lines in this hunk that start with SPACE or MINUS (old-file side). NEWCOUNT must equal the number of lines that start with SPACE or PLUS (new-file side). Wrong counts cause patch rejection.
+
+Example of valid minimal patch:
+--- a/${v}
++++ b/${v}
+@@ -1,3 +1,3 @@
+ line1
+-old
++new
+ line3
+`;
+}
+
+/**
+ * Strip prose/fences and keep the first unified diff block.
+ */
+export function extractUnifiedDiff(raw) {
+  if (raw == null) return '';
+  let s = typeof raw === 'string' ? raw : String(raw);
+  // Handle ```diff ... ``` wrapped output
+  const diffFence = s.match(/```(?:diff|patch)?\s*\n([\s\S]*?)```/i);
+  if (diffFence) s = diffFence[1].trim();
+  else s = stripCodeFences(s);
+
+  const gitIdx = s.search(/^diff --git\s/m);
+  const minusIdx = s.search(/^---\s+/m);
+  const start =
+    gitIdx >= 0 ? gitIdx : minusIdx >= 0 ? minusIdx : -1;
+  if (start === -1) return '';
+  return s.slice(start).trimEnd();
+}
+
+/**
+ * @returns {{ ok: true, text: string } | { ok: false, reason: string }}
+ */
+export function applyUnifiedDiffToMainTex(baseTex, patchText) {
+  const patch = (patchText || '').trim();
+  if (!patch) return { ok: false, reason: 'empty_diff' };
+  try {
+    const result = applyPatch(baseTex, patch);
+    if (result === false) return { ok: false, reason: 'hunk_mismatch' };
+    return { ok: true, text: result };
+  } catch (e) {
+    return {
+      ok: false,
+      reason: `parse_or_apply: ${e?.message || String(e)}`,
+    };
+  }
+}
+
+/**
+ * After a successful apply: decide if we should retry the LLM.
+ * @returns {{ retry: boolean, reason?: string }}
+ */
+export function shouldRetryTexEdit(prevTex, nextTex) {
+  if (prevTex === nextTex) {
+    return { retry: true, reason: 'no_op_patch' };
+  }
+  const catastrophic = rejectCatastrophicFullTexRewrite(prevTex, nextTex);
+  if (catastrophic) {
+    return { retry: true, reason: catastrophic };
+  }
+  return { retry: false };
+}
+
+const DEFAULT_MAX = 3;
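+
+// Round-trip sketch: a fenced model reply → extracted patch → patched text.
+// Illustrative only (nothing calls this); extractUnifiedDiff and
+// applyUnifiedDiffToMainTex are the real exports defined above.
+function exampleExtractAndApply() {
+  const reply = [
+    '```diff',
+    '--- a/main.tex',
+    '+++ b/main.tex',
+    '@@ -1,3 +1,3 @@',
+    ' line1',
+    '-old',
+    '+new',
+    ' line3',
+    '```',
+  ].join('\n');
+  const patch = extractUnifiedDiff(reply); // strips the ```diff fence, keeps the patch
+  return applyUnifiedDiffToMainTex('line1\nold\nline3\n', patch);
+  // → { ok: true, text: 'line1\nnew\nline3\n' }
+}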
+
+/** Map machine reason → hint for the next LLM attempt */
+function retryHintForFailure(reason) {
+  const r = reason || '';
+  if (r === 'hunk_mismatch') {
+    return `${r}: patch could not be aligned — context/remove lines must match CURRENT_FILE exactly. If you merged two distant edits into one hunk, split into separate @@ hunks and include every line between them as context (or do not skip intervening paragraphs/equations). Fix @@ OLDCOUNT/NEWCOUNT to match space/-/+ line counts.`;
+  }
+  if (r === 'empty_diff') {
+    return `${r}: no valid unified diff found in your reply. Output only the patch starting with --- a/`;
+  }
+  if (r.startsWith('parse_or_apply')) {
+    return `${r}: malformed patch syntax. Use standard unified diff with ---/+++/@@ and lines starting with space, -, or +.`;
+  }
+  if (r === 'no_op_patch') {
+    return `${r}: patch applied but file unchanged; include real +/- edits for the requested normalization.`;
+  }
+  if (r === 'output too short') {
+    return `${r}: result was far shorter than the source; do not delete large regions — small targeted hunks only.`;
+  }
+  return r;
+}
+
+/**
+ * @param {object} opts
+ * @param {{ invoke: (messages: unknown[]) => Promise<{ content: unknown }> }} opts.llm
+ * @param {string} opts.baseTex - current file content
+ * @param {(failureNote: string) => string} opts.buildPrompt - full user prompt; failureNote is '' or PREVIOUS_ATTEMPT block
+ * @param {string} opts.nodeName
+ * @param {string} opts.phase
+ * @param {number} [opts.maxAttempts]
+ * @param {{ projectRoot: string, jobId: string }} [opts.debug] — saves each attempt under .agent_runs/<jobId>/llm_diff/<nodeName>-<runId>/
+ * @returns {Promise<string>} merged text after successful patch
+ */
+export async function runLlmUnifiedDiffWithRetries({
+  llm,
+  baseTex,
+  buildPrompt,
+  nodeName,
+  phase,
+  maxAttempts = DEFAULT_MAX,
+  debug,
+}) {
+  let lastFailure = '';
+  const runId = Date.now();
+  const debugResolved =
+    isLlmDiffArtifactSaveEnabled() ? resolveDiffDebugDir(debug, nodeName, runId) : null;
+  let absDebugDir = null;
+  if (debugResolved) {
+    absDebugDir = debugResolved.absDir;
+    await ensureDir(absDebugDir);
+    await fs.writeFile(path.join(absDebugDir, 'input_main.tex'), baseTex, 'utf8');
+    await fs.writeFile(
+      path.join(absDebugDir, 'README.txt'),
+      [
+        'OpenPrism unified-diff LLM debug bundle.',
+        'input_main.tex — file content before this node ran.',
+        'attempt_N_raw.txt — full model reply.',
+        'attempt_N_extracted.patch — text passed to applyPatch after extractUnifiedDiff.',
+        'attempt_N_meta.json — apply result and retry reasons.',
+        'summary.json — written if all attempts fail.',
+        '',
+        'Disable: OPENPRISM_TRANSFER_SAVE_LLM_DIFF=0',
+        '',
+      ].join('\n'),
+      'utf8',
+    );
+  }
+
+  const attemptSummaries = [];
+
+  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
+    const failureNote = lastFailure
+      ? `\n\nPREVIOUS_ATTEMPT_FAILED: ${retryHintForFailure(lastFailure)}\nReply with ONLY a corrected unified diff; headers --- a/ and +++ b/ must match the instructions.`
+      : '';
+    const prompt = buildPrompt(failureNote);
+    const response = await llm.invoke([{ role: 'user', content: prompt }]);
+    const raw =
+      typeof response.content === 'string'
+        ? response.content
+        : Array.isArray(response.content)
+          ? response.content.map((p) => (typeof p === 'string' ? p : p?.text || '')).join('')
+          : '';
+
+    const patchText = extractUnifiedDiff(raw);
+    const applied = applyUnifiedDiffToMainTex(baseTex, patchText);
+
+    let retryAfterApply = false;
+    let postApplyReason = '';
+    if (applied.ok) {
+      const { retry, reason } = shouldRetryTexEdit(baseTex, applied.text);
+      retryAfterApply = retry;
+      postApplyReason = reason || '';
+    }
+
+    if (absDebugDir) {
+      await persistDiffAttempt(absDebugDir, attempt, {
+        raw,
+        patchText,
+        meta: {
+          attempt,
+          ts: new Date().toISOString(),
+          baseTexLength: baseTex.length,
+          rawLength: raw.length,
+          patchLength: patchText.length,
+          applyOk: applied.ok,
+          applyReason: applied.ok ? undefined : applied.reason,
+          postApplyRetry: retryAfterApply,
+          postApplyReason: retryAfterApply ? postApplyReason : undefined,
+        },
+      });
+    }
+
+    attemptSummaries.push({
+      attempt,
+      applyOk: applied.ok,
+      applyReason: applied.ok ?
null : applied.reason, + postApplyRetry: retryAfterApply, + postApplyReason: retryAfterApply ? postApplyReason : null, + }); + + if (!applied.ok) { + lastFailure = applied.reason || 'apply_failed'; + continue; + } + + if (retryAfterApply) { + lastFailure = retryHintForFailure(postApplyReason || 'retry'); + continue; + } + + return applied.text; + } + + const relPath = debugResolved?.relPosix; + if (absDebugDir) { + await fs.writeFile( + path.join(absDebugDir, 'summary.json'), + `${JSON.stringify( + { + nodeName, + phase, + lastFailure: lastFailure || 'unknown', + maxAttempts, + inputTexChars: baseTex.length, + attempts: attemptSummaries, + }, + null, + 2, + )}\n`, + 'utf8', + ); + } + + const detail = lastFailure || 'unknown'; + const inputLen = baseTex.length; + const msg = relPath + ? `[${nodeName}] Unified diff failed after ${maxAttempts} attempt(s): ${detail} — input ${inputLen} chars — LLM outputs saved under ${relPath}/` + : `[${nodeName}] Unified diff failed after ${maxAttempts} attempt(s): ${detail} — input ${inputLen} chars`; + + throw new TransferNodeError(nodeName, phase, detail, msg, relPath, inputLen); +} diff --git a/apps/backend/src/services/transferAgent/masking/index.js b/apps/backend/src/services/transferAgent/masking/index.js new file mode 100644 index 0000000..afe7047 --- /dev/null +++ b/apps/backend/src/services/transferAgent/masking/index.js @@ -0,0 +1,284 @@ +import { promises as fs } from 'fs'; +import path from 'path'; +import { listFilesRecursive } from '../../../utils/fsUtils.js'; + +const TABLE_ENVS = new Set([ + 'table', + 'table*', + 'tabular', + 'tabular*', + 'tabularx', + 'longtable', + 'array', +]); + +const MATH_ENVS = new Set([ + 'equation', + 'equation*', + 'align', + 'align*', + 'aligned', + 'alignat', + 'alignat*', + 'gather', + 'gather*', + 'multline', + 'multline*', + 'flalign', + 'flalign*', + 'split', + 'math', + 'displaymath', + 'cases', +]); + +const TOKEN_RE = /__OP_MASK_[A-Z]+_\d{4,}__/g; + +function isEscaped(text, idx) { + let backslashes = 0; + for (let i = idx - 1; i >= 0 && text[i] === '\\'; i--) backslashes++; + return backslashes % 2 === 1; +} + +function isCommentStart(text, idx) { + return text[idx] === '%' && !isEscaped(text, idx); +} + +function readComment(text, idx) { + let end = idx; + while (end < text.length && text[end] !== '\n') end++; + return end; +} + +function matchBeginEnvironment(text, idx) { + const slice = text.slice(idx); + const match = slice.match(/^\\begin\s*\{\s*([A-Za-z*@]+)\s*\}/); + if (!match) return null; + return { + env: match[1], + open: match[0], + end: idx + match[0].length, + }; +} + +function matchEndEnvironment(text, idx) { + const slice = text.slice(idx); + const match = slice.match(/^\\end\s*\{\s*([A-Za-z*@]+)\s*\}/); + if (!match) return null; + return { + env: match[1], + close: match[0], + end: idx + match[0].length, + }; +} + +function findEnvironmentEnd(text, startIdx, env) { + let depth = 1; + let i = startIdx; + while (i < text.length) { + if (isCommentStart(text, i)) { + i = readComment(text, i); + continue; + } + const begin = matchBeginEnvironment(text, i); + if (begin && begin.env === env) { + depth++; + i = begin.end; + continue; + } + const end = matchEndEnvironment(text, i); + if (end && end.env === env) { + depth--; + if (depth === 0) return end.end; + i = end.end; + continue; + } + i++; + } + return -1; +} + +function findDelimitedEnd(text, startIdx, close) { + let i = startIdx; + while (i < text.length) { + if (isCommentStart(text, i)) { + i = readComment(text, i); + continue; + } + 
if (text.startsWith(close, i) && !isEscaped(text, i)) { + return i + close.length; + } + i++; + } + return -1; +} + +function findInlineDollarEnd(text, startIdx) { + let i = startIdx; + while (i < text.length) { + if (isCommentStart(text, i)) { + i = readComment(text, i); + continue; + } + if (text[i] === '$' && !isEscaped(text, i)) { + if (text[i + 1] === '$') { + i += 2; + continue; + } + return i + 1; + } + i++; + } + return -1; +} + +function nextToken(kind, state) { + state.counter += 1; + return `__OP_MASK_${kind}_${String(state.counter).padStart(4, '0')}__`; +} + +function pushMaskedSegment(buffer, state, kind, original, filePath) { + const token = nextToken(kind, state); + state.manifest.push({ token, kind, filePath, original }); + buffer.push(token); +} + +function maskTexLikeContent(content, filePath, state, opts = {}) { + const { allowTables = true } = opts; + const out = []; + let i = 0; + while (i < content.length) { + if (isCommentStart(content, i)) { + const end = readComment(content, i); + out.push(content.slice(i, end)); + i = end; + continue; + } + + const begin = matchBeginEnvironment(content, i); + if (begin) { + if ((allowTables && TABLE_ENVS.has(begin.env)) || MATH_ENVS.has(begin.env)) { + const end = findEnvironmentEnd(content, begin.end, begin.env); + if (end !== -1) { + const original = content.slice(i, end); + pushMaskedSegment(out, state, allowTables && TABLE_ENVS.has(begin.env) ? 'TBL' : 'EQ', original, filePath); + i = end; + continue; + } + state.warnings.push(`Unclosed environment \\begin{${begin.env}} in ${filePath}`); + } + } + + if (content.startsWith('\\[', i) && !isEscaped(content, i)) { + const end = findDelimitedEnd(content, i + 2, '\\]'); + if (end !== -1) { + pushMaskedSegment(out, state, 'EQ', content.slice(i, end), filePath); + i = end; + continue; + } + state.warnings.push(`Unclosed display math \\[ in ${filePath}`); + } + + if (content.startsWith('$$', i) && !isEscaped(content, i)) { + const end = findDelimitedEnd(content, i + 2, '$$'); + if (end !== -1) { + pushMaskedSegment(out, state, 'EQ', content.slice(i, end), filePath); + i = end; + continue; + } + state.warnings.push(`Unclosed $$ display math in ${filePath}`); + } + + if (content.startsWith('\\(', i) && !isEscaped(content, i)) { + const end = findDelimitedEnd(content, i + 2, '\\)'); + if (end !== -1) { + pushMaskedSegment(out, state, 'EQ', content.slice(i, end), filePath); + i = end; + continue; + } + state.warnings.push(`Unclosed inline math \\( in ${filePath}`); + } + + if (content[i] === '$' && !isEscaped(content, i) && content[i + 1] !== '$') { + const end = findInlineDollarEnd(content, i + 1); + if (end !== -1) { + pushMaskedSegment(out, state, 'EQ', content.slice(i, end), filePath); + i = end; + continue; + } + state.warnings.push(`Unclosed inline $ math in ${filePath}`); + } + + out.push(content[i]); + i++; + } + return out.join(''); +} + +export function countMaskTokens(content) { + if (!content) return 0; + const matches = content.match(TOKEN_RE); + return matches ? 
matches.length : 0; +} + +export function unmaskContent(content, manifest = []) { + if (!content || !Array.isArray(manifest) || manifest.length === 0) { + return { content: content || '', restored: 0, remaining: countMaskTokens(content) }; + } + let restored = 0; + let result = content; + for (const entry of manifest) { + if (!entry?.token) continue; + if (result.includes(entry.token)) { + restored++; + result = result.split(entry.token).join(entry.original || ''); + } + } + return { + content: result, + restored, + remaining: countMaskTokens(result), + }; +} + +export async function maskSourceProjectFiles(projectRoot) { + const files = await listFilesRecursive(projectRoot); + const candidates = files + .filter((file) => file.type === 'file') + .map((file) => file.path) + .filter((relPath) => ['.tex', '.bib'].includes(path.extname(relPath).toLowerCase())) + .sort(); + + const state = { + counter: 0, + manifest: [], + warnings: [], + }; + + const maskedContents = {}; + const maskedFiles = []; + + for (const relPath of candidates) { + const absPath = path.join(projectRoot, relPath); + const original = await fs.readFile(absPath, 'utf8'); + const masked = path.extname(relPath).toLowerCase() === '.bib' + ? maskTexLikeContent(original, relPath, state, { allowTables: false }) + : maskTexLikeContent(original, relPath, state, { allowTables: true }); + if (masked !== original) { + maskedFiles.push(relPath); + maskedContents[relPath] = masked; + } + } + + return { + manifest: state.manifest, + maskedFiles, + maskedContents, + warnings: state.warnings, + }; +} + +export function getMaskedSourceContent(sourceMaskedContents, relPath) { + if (!sourceMaskedContents || typeof sourceMaskedContents !== 'object') return null; + return typeof sourceMaskedContents[relPath] === 'string' ? 
sourceMaskedContents[relPath] : null; +} diff --git a/apps/backend/src/services/transferAgent/masking/index.test.js b/apps/backend/src/services/transferAgent/masking/index.test.js new file mode 100644 index 0000000..c98ecc3 --- /dev/null +++ b/apps/backend/src/services/transferAgent/masking/index.test.js @@ -0,0 +1,67 @@ +import test from 'node:test'; +import assert from 'node:assert/strict'; +import os from 'os'; +import path from 'path'; +import { promises as fs } from 'fs'; +import { maskSourceProjectFiles, unmaskContent } from './index.js'; + +test('maskSourceProjectFiles masks tables and inline/display math and restores them', async () => { + const tmpRoot = await fs.mkdtemp(path.join(os.tmpdir(), 'openprism-mask-')); + try { + await fs.writeFile( + path.join(tmpRoot, 'main.tex'), + [ + '\\section{Intro}', + 'Inline math $E=mc^2$ should be masked.', + '\\begin{equation}', + 'a^2+b^2=c^2', + '\\end{equation}', + '\\begin{table}', + '\\centering', + '\\begin{tabular}{cc}', + 'a & b\\\\', + '\\end{tabular}', + '\\end{table}', + 'Escaped dollar \\$100 should stay.', + '% $commented$ math should stay untouched', + '\\input{refs}', + '', + ].join('\n'), + 'utf8', + ); + await fs.writeFile( + path.join(tmpRoot, 'refs.tex'), + 'Display math: \\[x+y\\] and inline \\(z\\).\n', + 'utf8', + ); + await fs.writeFile( + path.join(tmpRoot, 'refs.bib'), + '@article{key,\n title={Energy $E=mc^2$}\n}\n', + 'utf8', + ); + + const masked = await maskSourceProjectFiles(tmpRoot); + + assert.equal(masked.warnings.length, 0); + assert.ok(masked.maskedFiles.includes('main.tex')); + assert.ok(masked.maskedFiles.includes('refs.tex')); + assert.ok(masked.maskedFiles.includes('refs.bib')); + assert.ok(masked.manifest.length >= 5); + + const maskedMain = masked.maskedContents['main.tex']; + assert.match(maskedMain, /__OP_MASK_EQ_\d{4}__/); + assert.match(maskedMain, /__OP_MASK_TBL_\d{4}__/); + assert.match(masked.maskedContents['refs.tex'], /__OP_MASK_EQ_\d{4}__/); + assert.match(masked.maskedContents['refs.bib'], /__OP_MASK_EQ_\d{4}__/); + assert.match(maskedMain, /\\\$100/); + assert.match(maskedMain, /% \$commented\$ math should stay untouched/); + + const restored = unmaskContent(maskedMain, masked.manifest); + assert.equal(restored.remaining, 0); + assert.match(restored.content, /Inline math \$E=mc\^2\$/); + assert.match(restored.content, /\\begin\{equation\}/); + assert.match(restored.content, /\\begin\{table\}/); + } finally { + await fs.rm(tmpRoot, { recursive: true, force: true }); + } +}); diff --git a/apps/backend/src/services/transferAgent/neuripsRules.js b/apps/backend/src/services/transferAgent/neuripsRules.js new file mode 100644 index 0000000..b80e346 --- /dev/null +++ b/apps/backend/src/services/transferAgent/neuripsRules.js @@ -0,0 +1,66 @@ +import { promises as fs } from 'fs'; +import path from 'path'; +import { RULES_DIR } from '../../config/constants.js'; + +/** + * Per-venue rules cache: venueId → { content, mtimeMs } + */ +const cache = new Map(); + +/** + * Load venue rules markdown from disk (cached in process memory). + * + * Convention: rules file lives at `${RULES_DIR}/${venueId}.md` + * e.g. apps/backend/src/services/transferAgent/rules/neurips.md + * apps/backend/src/services/transferAgent/rules/acl.md + * + * @param {string} venueId — e.g. 
'neurips', 'icml', 'cvpr'
+ * @returns {Promise<string>}
+ */
+export async function loadVenueRules(venueId) {
+  const filePath = path.join(RULES_DIR, `${venueId}.md`);
+  try {
+    const st = await fs.stat(filePath);
+    const cached = cache.get(venueId);
+    if (cached && cached.mtimeMs === st.mtimeMs) {
+      return cached.content;
+    }
+    const content = await fs.readFile(filePath, 'utf8');
+    cache.set(venueId, { content, mtimeMs: st.mtimeMs });
+    return content;
+  } catch {
+    return '';
+  }
+}
+
+/**
+ * Synchronous accessor (after warm-up via loadVenueRules).
+ */
+export function getVenueRulesSync(venueId) {
+  return cache.get(venueId)?.content || '';
+}
+
+/**
+ * Format rules as an LLM prompt block.
+ */
+export function formatVenueHandbookBlock(venueId, fullMd) {
+  const label = venueId.toUpperCase();
+  if (!fullMd?.trim()) {
+    return `\n\n[${label} handbook missing on disk — use template comments only.]\n`;
+  }
+  return `\n\n--- ${label}_FULL_HANDBOOK (Markdown, authoritative; follow strictly) ---\n${fullMd}\n--- END_${label}_FULL_HANDBOOK ---\n`;
+}
+
+// ────────── Backward-compatible NeurIPS aliases ──────────
+
+export async function loadNeuripsRulesFull() {
+  return loadVenueRules('neurips');
+}
+
+export function getNeuripsRulesSync() {
+  return getVenueRulesSync('neurips');
+}
+
+export function formatNeuripsHandbookBlock(fullMd) {
+  return formatVenueHandbookBlock('neurips', fullMd);
+}
diff --git a/apps/backend/src/services/transferAgent/nodes/agentGenerator.js b/apps/backend/src/services/transferAgent/nodes/agentGenerator.js
new file mode 100644
index 0000000..6e95ae5
--- /dev/null
+++ b/apps/backend/src/services/transferAgent/nodes/agentGenerator.js
@@ -0,0 +1,218 @@
+/**
+ * agentGenerator — Generator node for the multi-venue agentic transfer (graphVenueAgent).
+ *
+ * The Generator takes the migration plan from the Planner and executes it
+ * by reading source files, writing/patching target files, and copying assets.
+ * It operates autonomously through tool calls, deciding the order and strategy
+ * of modifications (preamble first, then body, then figures, then bibliography, etc.).
+ *
+ * Tools available: readFile, writeFile, applyDiff, grepFile, listProjectTree, copyAsset, measureFigures, compileProject
+ */
+
+import { ChatOpenAI } from '@langchain/openai';
+import { resolveLLMConfig, normalizeBaseURL } from '../../llmService.js';
+import { buildVenueSkillFromState } from '../skills/index.js';
+import { createGeneratorTools } from '../tools/index.js';
+import { NeuripsPhase, progressUpdate } from '../progressMeta.js';
+import { bumpLiveProgress, runAgentToolCall, recordUnknownToolTrace } from '../toolTrace.js';
+
+const MAX_TOOL_ROUNDS = 40;
+
+/**
+ * Run the Generator agent.
+ *
+ * Receives the migration plan and autonomously executes it through tool calls.
+ */ +export async function agentGenerator(state, config) { + const iteration = state.currentIteration || 0; + const plan = state.migrationPlan || state.transferPlan || {}; + const lp = config?.configurable?._liveProgress; + + // Build tools + const ctx = { + sourceReadRoot: state.sourceReadRoot || state.sourceProjectRoot, + workspaceRoot: state.workspaceRoot || state.targetProjectRoot, + jobId: state.jobId, + enableSensitiveMask: !!state.enableSensitiveMask, + sourceMaskManifest: state.sourceMaskManifest || [], + sourceMaskedContents: state.sourceMaskedContents || {}, + targetProjectId: state.targetProjectId, + targetMainFile: state.targetMainFile, + engine: state.engine || 'pdflatex', + llmConfig: state.llmConfig, + }; + const tools = createGeneratorTools(ctx); + + // Build LLM + const { endpoint, apiKey, model } = resolveLLMConfig(state.llmConfig); + const llm = new ChatOpenAI({ + modelName: model, + openAIApiKey: apiKey, + configuration: { baseURL: normalizeBaseURL(endpoint) }, + temperature: 0.2, + }); + const llmWithTools = llm.bindTools(tools); + + // Build system prompt + const skill = await buildVenueSkillFromState(state); + + // Build user message + const reviewContext = + iteration > 0 && state.reviewResult + ? `\n\nREVIEWER FEEDBACK FROM PREVIOUS ITERATION: +${JSON.stringify(state.reviewResult, null, 2)} + +Fix the issues identified by the Reviewer. Read the current state of files before making changes.` + : ''; + + const userConfirmations = state.userConfirmations || {}; + const hasConfirmations = Object.keys(userConfirmations).length > 0; + + const venue = (state.transferIntake?.venue || 'neurips').toUpperCase(); + const isMineruMode = state.transferMode === 'mineru'; + const sourceNote = isMineruMode + ? `\nSOURCE MODE: MinerU (PDF → Markdown → LaTeX) +The source content is in Markdown format under _mineru_output/ in the target project. +Read the Markdown files and convert the content to LaTeX for the ${venue} template. +Images from the PDF are also in _mineru_output/ — use copyAsset to move them to the project root if needed.\n` + : ''; + const userMessage = `You are the GENERATOR. Execute the migration plan by reading source files and writing/patching the target ${venue} project. +${sourceNote} +MIGRATION PLAN: +${JSON.stringify(plan, null, 2)} + +${hasConfirmations ? `USER CONFIRMATIONS:\n${JSON.stringify(userConfirmations, null, 2)}\n` : ''} + +Source main file: "${state.sourceMainFile}" +Target main file: "${state.targetMainFile}" +${reviewContext} + +EXECUTION INSTRUCTIONS: +1. First, use listProjectTree("source") and listProjectTree("target") to see what's available +2. Use readFile to read both source and target main .tex files +3. Execute the migration following the CRITICAL CONSTRAINTS in your system prompt. General order: + a. PREAMBLE: Read source preamble → generate venue-compliant preamble per your system prompt rules → writeFile or applyDiff + b. BODY: Read source body → migrate content following section mapping → writeFile or applyDiff + c. FIGURES/TABLES: Normalize figure environments per venue rules (single-column venues: figure*→figure; two-column venues: keep figure* for full-width) + d. ASSETS: Use copyAsset to copy all referenced .bib, .bbl, images, .sty/.cls/.bst files + e. BIBLIOGRAPHY: Align \\cite commands and bibliography mechanism per venue rules in your system prompt + f. BLIND COMPLIANCE (if doubleBlind): Sanitize \\hypersetup{pdfauthor={}}, anonymize identifying content + g. 
VENUE-SPECIFIC STRUCTURE: Follow any venue-specific structural requirements from your system prompt (e.g. checklist for NeurIPS, impact statement for ICML)
+4. After each major step, re-read the file to verify your changes; use compileProject() to check the target builds (uses the user-selected engine; tool returns a short LLM summary of errors/warnings)
+
+STRATEGY NOTES:
+- For the initial full migration (iteration 0), prefer writeFile for the complete .tex rewrite
+- For subsequent fix iterations, prefer applyDiff for surgical corrections
+- Always use applyDiff if you're only changing a few lines
+- Always readFile BEFORE writeFile or applyDiff to get the current file state
+
+When you are done with all modifications, output:
+<GENERATOR_DONE>
+Migration complete. Applied: [brief summary of what was done]
+</GENERATOR_DONE>`;
+
+  // Run tool-calling loop
+  const messages = [
+    { role: 'system', content: skill },
+    { role: 'user', content: userMessage },
+  ];
+
+  let summary = '';
+  let toolCallCount = 0;
+  const projectRoot = state.workspaceRoot || state.targetProjectRoot;
+  const jobId = state.jobId;
+
+  for (let round = 0; round < MAX_TOOL_ROUNDS; round++) {
+    if (lp) {
+      lp.activeRole = 'generator';
+      lp.toolName = 'llm';
+      lp.toolArgs = '';
+      lp.toolRound = round;
+      lp.maxToolRounds = MAX_TOOL_ROUNDS;
+      bumpLiveProgress(lp);
+    }
+    const response = await llmWithTools.invoke(messages);
+    messages.push(response);
+
+    // Check for tool calls
+    if (response.tool_calls && response.tool_calls.length > 0) {
+      for (const toolCall of response.tool_calls) {
+        const tool = tools.find((t) => t.name === toolCall.name);
+        if (!tool) {
+          await recordUnknownToolTrace({
+            config,
+            lp,
+            projectRoot,
+            jobId,
+            agent: 'generator',
+            iteration,
+            round,
+            toolName: toolCall.name,
+          });
+          messages.push({
+            role: 'tool',
+            content: `[ERROR] Unknown tool: ${toolCall.name}`,
+            tool_call_id: toolCall.id,
+          });
+          continue;
+        }
+        if (lp) lp.maxToolRounds = MAX_TOOL_ROUNDS;
+        const result = await runAgentToolCall({
+          config,
+          lp,
+          projectRoot,
+          jobId,
+          agent: 'generator',
+          iteration,
+          round,
+          toolCall,
+          invokeFn: () => tool.invoke(toolCall.args),
+        });
+        toolCallCount++;
+        messages.push({
+          role: 'tool',
+          content: typeof result === 'string' ? result : JSON.stringify(result),
+          tool_call_id: toolCall.id,
+        });
+      }
+      continue;
+    }
+
+    // No tool calls — check for completion signal
+    const content =
+      typeof response.content === 'string'
+        ? response.content
+        : Array.isArray(response.content)
+          ? response.content.map((p) => (typeof p === 'string' ? p : p?.text || '')).join('')
+          : '';
+
+    const doneMatch = content.match(
+      /<GENERATOR_DONE>([\s\S]*?)<\/GENERATOR_DONE>/,
+    );
+    if (doneMatch) {
+      summary = doneMatch[1].trim();
+      break;
+    }
+
+    // If no done signal and no tool calls, it might be reasoning — let it continue
+    // but ask it to either use tools or signal completion
+    if (round > MAX_TOOL_ROUNDS - 5) {
+      messages.push({
+        role: 'user',
+        content:
+          'Please complete your remaining work and output your <GENERATOR_DONE> summary when finished.',
+      });
+    }
+  }
+
+  if (!summary) {
+    summary = `Generator completed after ${toolCallCount} tool calls (max rounds reached).`;
+  }
+
+  return {
+    agentPhase: 'reviewing',
+    ...progressUpdate(
+      'agentGenerator',
+      NeuripsPhase.agent_generating,
+      `Iteration ${iteration}: ${summary} (${toolCallCount} tool calls).`,
+    ),
+  };
+}
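+
+// The per-toolCall bookkeeping above is repeated in agentPlanner and agentReviewer;
+// factored out, the core dispatch is roughly this (sketch only; nothing imports it,
+// and the trace/progress plumbing is deliberately omitted):
+async function exampleDispatchToolCall(tools, toolCall) {
+  const tool = tools.find((t) => t.name === toolCall.name);
+  if (!tool) return `[ERROR] Unknown tool: ${toolCall.name}`;
+  const result = await tool.invoke(toolCall.args);
+  return typeof result === 'string' ? result : JSON.stringify(result);
+}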
diff --git a/apps/backend/src/services/transferAgent/nodes/agentPlanner.js b/apps/backend/src/services/transferAgent/nodes/agentPlanner.js
new file mode 100644
index 0000000..8928be5
--- /dev/null
+++ b/apps/backend/src/services/transferAgent/nodes/agentPlanner.js
@@ -0,0 +1,262 @@
+/**
+ * agentPlanner — Planner node for the multi-venue agentic transfer (graphVenueAgent).
+ *
+ * The Planner autonomously explores source and target projects using tools,
+ * then produces a structured migration plan. On subsequent iterations
+ * (when Reviewer sends back issues), it revises the plan accordingly.
+ *
+ * Tools available: readFile, grepFile, listProjectTree, raiseQuestion, compileProject
+ */
+
+import { ChatOpenAI } from '@langchain/openai';
+import { resolveLLMConfig, normalizeBaseURL } from '../../llmService.js';
+import { buildVenueSkillFromState } from '../skills/index.js';
+import { createReadOnlyTools } from '../tools/index.js';
+import { NeuripsPhase, progressUpdate } from '../progressMeta.js';
+import { bumpLiveProgress, runAgentToolCall, recordUnknownToolTrace } from '../toolTrace.js';
+import { analyzeSource, buildSourceProfile } from './analyzeSource.js';
+import { analyzeTarget } from './analyzeTarget.js';
+
+const MAX_TOOL_ROUNDS = 20;
+
+/**
+ * Run the Planner agent.
+ *
+ * On iteration 0: performs source/target analysis, then calls LLM with tools
+ * to explore and produce a migration plan.
+ *
+ * On iteration N>0: receives reviewer feedback, revises the plan.
+ */
+export async function agentPlanner(state, config) {
+  const iteration = state.currentIteration || 0;
+  const lp = config?.configurable?._liveProgress;
+
+  // --- First iteration: run source + target analysis ---
+  let analysisState = {};
+  if (iteration === 0) {
+    // Reuse existing analysis logic (no LLM, pure heuristic)
+    const sourceResult = await analyzeSource(state);
+    const targetResult = await analyzeTarget({ ...state, ...sourceResult });
+    analysisState = { ...sourceResult, ...targetResult };
+  }
+
+  const mergedState = { ...state, ...analysisState };
+
+  // Build tools with project roots
+  const ctx = {
+    sourceReadRoot: mergedState.sourceReadRoot || mergedState.sourceProjectRoot,
+    workspaceRoot: mergedState.workspaceRoot || mergedState.targetProjectRoot,
+    jobId: mergedState.jobId,
+    enableSensitiveMask: !!mergedState.enableSensitiveMask,
+    sourceMaskManifest: mergedState.sourceMaskManifest || [],
+    sourceMaskedContents: mergedState.sourceMaskedContents || {},
+    targetProjectId: mergedState.targetProjectId,
+    targetMainFile: mergedState.targetMainFile,
+    engine: mergedState.engine || 'pdflatex',
+    llmConfig: mergedState.llmConfig,
+  };
+  const tools = createReadOnlyTools(ctx);
+
+  // Build LLM
+  const { endpoint, apiKey, model } = resolveLLMConfig(mergedState.llmConfig);
+  const llm = new ChatOpenAI({
+    modelName: model,
+    openAIApiKey: apiKey,
+    configuration: { baseURL: normalizeBaseURL(endpoint) },
+    temperature: 0.2,
+  });
+  const llmWithTools = llm.bindTools(tools);
+
+  // Build system prompt
+  const skill = await buildVenueSkillFromState(mergedState);
+
+  // Build user message for this iteration
+  const isMineruMode = mergedState.transferMode === 'mineru';
+  const sourceDesc = isMineruMode
+    ? `The source content has been parsed from PDF by MinerU into Markdown format.
+  - Markdown content is available in the target project under _mineru_output/
+  - Images extracted from the PDF are also in _mineru_output/
+  - You should read the Markdown content and convert it to LaTeX for the target template.
+  - The source project may also have the original .tex files for reference.`
+    : `Source main file: "${mergedState.sourceMainFile}"`;
+
+  let userMessage;
+  if (iteration === 0) {
+    userMessage = `You are the PLANNER. Your job is to explore the source and target projects and produce a detailed migration plan.
+
+${isMineruMode ? 'SOURCE MODE: MinerU (PDF → Markdown → LaTeX)\n' : ''}INSTRUCTIONS:
+1. Use listProjectTree to see what files exist in both projects
+2. Use readFile to examine key files (${isMineruMode ? 'look for Markdown files in _mineru_output/ and' : `source main file: "${mergedState.sourceMainFile}",`} target main file: "${mergedState.targetMainFile}")
+3. Analyze the source paper's structure, ${isMineruMode ? 'sections, figures, tables, equations, and references from the Markdown' : 'packages, bibliography mechanism, figures, and special formatting'}
+4. Study the target template structure (follow the venue-specific rules in your system prompt)
+5. If you need user input on ambiguous decisions (e.g., float strategy, content dropping), use raiseQuestion
+6. Optional: call compileProject() to verify the target template already builds (returns an LLM summary of the log, not raw TeX output)
+
+After exploring, output your migration plan as a JSON object wrapped in <MIGRATION_PLAN> tags:
+
+<MIGRATION_PLAN>
+{
+  "sectionMapping": [
+    { "sourceSection": "...", "targetSection": "...", "action": "map|merge|create|drop" }
+  ],
+  "assetStrategy": {
+    "bibFiles": ["files to copy"],
+    "images": ["image files to copy"],
+    "styles": ["style files to copy"],
+    "bibCommand": "bibliography|addbibresource|input_bbl"
+  },
+  "preambleStrategy": "description of how to handle preamble migration",
+  "bodyStrategy": "description of how to handle body migration",
+  "bibliographyStrategy": "description of bibliography handling",
+  "blindStrategy": "description of double-blind compliance steps (if applicable)",
+  "figureStrategy": "description of figure/table normalization",
+  "risks": ["potential issues to watch for"],
+  "notes": "any special instructions"
+}
+</MIGRATION_PLAN>`;
+  } else {
+    const review = mergedState.reviewResult || {};
+    const issues = (review.issues || [])
+      .map((iss, i) => `  ${i + 1}. [${iss.severity || 'medium'}] ${iss.description}`)
+      .join('\n');
+    const suggestions = (review.suggestions || []).join('\n  - ');
+
+    userMessage = `You are the PLANNER (revision iteration ${iteration}).
+
+The Reviewer found the following issues with the previous migration:
+
+ISSUES:
+${issues || '  (none)'}
+
+SUGGESTIONS:
+  - ${suggestions || '(none)'}
+
+PREVIOUS PLAN:
+${JSON.stringify(mergedState.migrationPlan || {}, null, 2)}
+
+Please revise the migration plan to address these issues. Use tools to inspect the current state of target files if needed.
+
+Output the revised plan in <MIGRATION_PLAN> tags (same JSON format as before).`;
+  }
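+
+  // A filled-in plan, for shape reference (values illustrative):
+  //   { "sectionMapping": [{ "sourceSection": "Method", "targetSection": "Method", "action": "map" }],
+  //     "assetStrategy": { "bibFiles": ["refs.bib"], "images": ["figs/arch.pdf"],
+  //                        "styles": [], "bibCommand": "bibliography" }, ... }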
+
+  // Run tool-calling loop
+  const messages = [
+    { role: 'system', content: skill },
+    { role: 'user', content: userMessage },
+  ];
+
+  const projectRoot = mergedState.workspaceRoot || mergedState.targetProjectRoot;
+  const jobId = mergedState.jobId;
+
+  let plan = null;
+  for (let round = 0; round < MAX_TOOL_ROUNDS; round++) {
+    if (lp) {
+      lp.activeRole = 'planner';
+      lp.toolName = 'llm';
+      lp.toolArgs = '';
+      lp.toolRound = round;
+      lp.maxToolRounds = MAX_TOOL_ROUNDS;
+      bumpLiveProgress(lp);
+    }
+    const response = await llmWithTools.invoke(messages);
+    messages.push(response);
+
+    // Check for tool calls
+    if (response.tool_calls && response.tool_calls.length > 0) {
+      for (const toolCall of response.tool_calls) {
+        const tool = tools.find((t) => t.name === toolCall.name);
+        if (!tool) {
+          await recordUnknownToolTrace({
+            config,
+            lp,
+            projectRoot,
+            jobId,
+            agent: 'planner',
+            iteration,
+            round,
+            toolName: toolCall.name,
+          });
+          messages.push({
+            role: 'tool',
+            content: `[ERROR] Unknown tool: ${toolCall.name}`,
+            tool_call_id: toolCall.id,
+          });
+          continue;
+        }
+        if (lp) lp.maxToolRounds = MAX_TOOL_ROUNDS;
+        const result = await runAgentToolCall({
+          config,
+          lp,
+          projectRoot,
+          jobId,
+          agent: 'planner',
+          iteration,
+          round,
+          toolCall,
+          invokeFn: () => tool.invoke(toolCall.args),
+        });
+        messages.push({
+          role: 'tool',
+          content: typeof result === 'string' ? result : JSON.stringify(result),
+          tool_call_id: toolCall.id,
+        });
+      }
+      continue;
+    }
+
+    // No tool calls — extract plan from response
+    const content =
+      typeof response.content === 'string'
+        ? response.content
+        : Array.isArray(response.content)
+          ? response.content.map((p) => (typeof p === 'string' ? p : p?.text || '')).join('')
+          : '';
+
+    const planMatch = content.match(
+      /<MIGRATION_PLAN>([\s\S]*?)<\/MIGRATION_PLAN>/,
+    );
+    if (planMatch) {
+      try {
+        plan = JSON.parse(planMatch[1].trim());
+      } catch {
+        // Try to extract JSON more aggressively
+        const { extractJSON } = await import('../utils.js');
+        plan = extractJSON(planMatch[1]);
+      }
+    }
+
+    if (!plan) {
+      // Ask the LLM to output the plan properly
+      messages.push({
+        role: 'user',
+        content:
+          'Please output your migration plan as a JSON object inside <MIGRATION_PLAN> tags.',
+      });
+      continue;
+    }
+
+    break;
+  }
+
+  // Fallback plan if LLM didn't produce one
+  if (!plan) {
+    plan = {
+      sectionMapping: [],
+      assetStrategy: {},
+      notes: 'Planner failed to produce a structured plan after max rounds.',
+      _plannerError: true,
+    };
+  }
+
+  return {
+    ...analysisState,
+    migrationPlan: plan,
+    transferPlan: plan, // backward compat
+    agentPhase: 'generating',
+    ...progressUpdate(
+      'agentPlanner',
+      NeuripsPhase.agent_planning,
+      `Iteration ${iteration}: migration plan ${plan._plannerError ? 'FAILED' : 'ready'} (${(plan.sectionMapping || []).length} section mappings).`,
+    ),
+  };
+}
diff --git a/apps/backend/src/services/transferAgent/nodes/agentReviewer.js b/apps/backend/src/services/transferAgent/nodes/agentReviewer.js
new file mode 100644
index 0000000..68c158f
--- /dev/null
+++ b/apps/backend/src/services/transferAgent/nodes/agentReviewer.js
@@ -0,0 +1,262 @@
+/**
+ * agentReviewer — Reviewer node for the agentic transfer.
+ *
+ * The Reviewer inspects the target project after the Generator has made changes,
+ * checking for venue compliance, correctness, and completeness.
+ * It produces a structured review result with verdict ('pass' or 'revise').
+ *
+ * All venue-specific constraints are loaded from reviewerChecklist skill —
+ * the prompt skeleton here is venue-agnostic.
+ *
+ * Tools available: readFile, grepFile, listProjectTree, raiseQuestion, compileProject
+ */
+
+import { ChatOpenAI } from '@langchain/openai';
+import { resolveLLMConfig, normalizeBaseURL } from '../../llmService.js';
+import { buildVenueSkillFromState } from '../skills/index.js';
+import { buildReviewChecklist } from '../skills/reviewerChecklist.js';
+import { createReviewerTools } from '../tools/index.js';
+import { NeuripsPhase, progressUpdate } from '../progressMeta.js';
+import { extractJSON } from '../utils.js';
+import { bumpLiveProgress, runAgentToolCall, recordUnknownToolTrace } from '../toolTrace.js';
+
+const MAX_TOOL_ROUNDS = 20;
+
+/**
+ * Run the Reviewer agent.
+ *
+ * Inspects the current state of target files and produces a review verdict.
+ */ +export async function agentReviewer(state, config) { + const iteration = state.currentIteration || 0; + const intake = state.transferIntake || {}; + const lp = config?.configurable?._liveProgress; + + // Build tools + const ctx = { + sourceReadRoot: state.sourceReadRoot || state.sourceProjectRoot, + workspaceRoot: state.workspaceRoot || state.targetProjectRoot, + jobId: state.jobId, + enableSensitiveMask: !!state.enableSensitiveMask, + sourceMaskManifest: state.sourceMaskManifest || [], + sourceMaskedContents: state.sourceMaskedContents || {}, + targetProjectId: state.targetProjectId, + targetMainFile: state.targetMainFile, + engine: state.engine || 'pdflatex', + llmConfig: state.llmConfig, + }; + const tools = createReviewerTools(ctx); + + // Build LLM + const { endpoint, apiKey, model } = resolveLLMConfig(state.llmConfig); + const llm = new ChatOpenAI({ + modelName: model, + openAIApiKey: apiKey, + configuration: { baseURL: normalizeBaseURL(endpoint) }, + temperature: 0.1, + }); + const llmWithTools = llm.bindTools(tools); + + // Build system prompt (venue-specific skill) + const skill = await buildVenueSkillFromState(state); + + // Load venue-specific review checklist + const venueId = (intake.venue || state.transferGraphKind || 'neurips').toLowerCase(); + const venueUpper = venueId.toUpperCase(); + const checklist = buildReviewChecklist(venueId, { intake }); + + // ── Build user message (venue-agnostic skeleton) ── + + const userMessage = `You are the REVIEWER (iteration ${iteration}). Inspect the target ${venueUpper} project and determine if the migration is complete and correct. + +Target main file: "${state.targetMainFile}" + +REVIEW CHECKLIST — check each item using tools: + +1. STRUCTURE & COMPILATION READINESS: + - \\documentclass{article} (not revtex, amsart, llncs, etc.) + ${checklist.structure} + - \\begin{document} ... \\end{document} present and well-formed + - Use compileProject() to compile the target with the user-selected engine; the tool returns an LLM-compressed log summary — treat FAIL summaries as high-severity issues + +2. CONTENT COMPLETENESS: + - All source sections mapped to target (compare with source) + - Mathematical content, equations preserved + - \\cite{}, \\ref{}, \\label{} references intact + - No placeholder text like "TODO", "INSERT HERE", "FIXME" in the body + +3. FIGURE/TABLE COMPLIANCE: + ${checklist.figures} + +4. BIBLIOGRAPHY: + - Bibliography mechanism is consistent (bibtex natbib, or \\input{.bbl}) + ${checklist.bibliography} + +5. ASSETS: + - All referenced images exist in target project + - Required .sty/.cls/.bst files present + +${checklist.policy} + +7. ${checklist.blind} + +INSTRUCTIONS: +1. Use readFile to read the target main .tex file; call compileProject() when you need an actual build check (returns an LLM summary of the compile log, not the raw log) +2. Use grepFile to check for specific patterns +3. Use listProjectTree to verify asset files exist +4. Compare key sections with the source if needed +5. 
If you discover an issue requiring user decision, use raiseQuestion
+
+After your review, output a JSON result in <REVIEW_RESULT> tags:
+
+<REVIEW_RESULT>
+{
+  "verdict": "pass" or "revise",
+  "issues": [
+    {
+      "category": "structure|content|figures|bibliography|assets|policy|blind",
+      "severity": "high|medium|low",
+      "description": "What's wrong",
+      "suggestion": "How to fix it"
+    }
+  ],
+  "suggestions": ["General improvement suggestions"],
+  "summary": "Brief overall assessment"
+}
+</REVIEW_RESULT>
+
+Rules for verdict:
+- "pass" = no high-severity issues, the file is submission-ready
+- "revise" = has high or multiple medium-severity issues that must be fixed`;
+
+  // Run tool-calling loop
+  const messages = [
+    { role: 'system', content: skill },
+    { role: 'user', content: userMessage },
+  ];
+
+  let reviewResult = null;
+  const projectRoot = state.workspaceRoot || state.targetProjectRoot;
+  const jobId = state.jobId;
+
+  for (let round = 0; round < MAX_TOOL_ROUNDS; round++) {
+    if (lp) {
+      lp.activeRole = 'reviewer';
+      lp.toolName = 'llm';
+      lp.toolArgs = '';
+      lp.toolRound = round;
+      lp.maxToolRounds = MAX_TOOL_ROUNDS;
+      bumpLiveProgress(lp);
+    }
+    const response = await llmWithTools.invoke(messages);
+    messages.push(response);
+
+    // Check for tool calls
+    if (response.tool_calls && response.tool_calls.length > 0) {
+      for (const toolCall of response.tool_calls) {
+        const tool = tools.find((t) => t.name === toolCall.name);
+        if (!tool) {
+          await recordUnknownToolTrace({
+            config,
+            lp,
+            projectRoot,
+            jobId,
+            agent: 'reviewer',
+            iteration,
+            round,
+            toolName: toolCall.name,
+          });
+          messages.push({
+            role: 'tool',
+            content: `[ERROR] Unknown tool: ${toolCall.name}`,
+            tool_call_id: toolCall.id,
+          });
+          continue;
+        }
+        if (lp) lp.maxToolRounds = MAX_TOOL_ROUNDS;
+        const result = await runAgentToolCall({
+          config,
+          lp,
+          projectRoot,
+          jobId,
+          agent: 'reviewer',
+          iteration,
+          round,
+          toolCall,
+          invokeFn: () => tool.invoke(toolCall.args),
+        });
+        messages.push({
+          role: 'tool',
+          content: typeof result === 'string' ? result : JSON.stringify(result),
+          tool_call_id: toolCall.id,
+        });
+      }
+      continue;
+    }
+
+    // No tool calls — extract review result
+    const content =
+      typeof response.content === 'string'
+        ? response.content
+        : Array.isArray(response.content)
+          ? response.content.map((p) => (typeof p === 'string' ? p : p?.text || '')).join('')
+          : '';
+
+    const reviewMatch = content.match(
+      /<REVIEW_RESULT>([\s\S]*?)<\/REVIEW_RESULT>/,
+    );
+    if (reviewMatch) {
+      try {
+        reviewResult = JSON.parse(reviewMatch[1].trim());
+      } catch {
+        reviewResult = extractJSON(reviewMatch[1]);
+      }
+    }
+
+    if (!reviewResult) {
+      messages.push({
+        role: 'user',
+        content:
+          'Please output your review result as a JSON object inside <REVIEW_RESULT> tags.',
+      });
+      continue;
+    }
+
+    break;
+  }
+
+  // Fallback
+  if (!reviewResult) {
+    reviewResult = {
+      verdict: 'pass',
+      issues: [],
+      suggestions: [],
+      summary: 'Reviewer could not complete structured review; passing by default.',
+      _reviewerError: true,
+    };
+  }
+
+  // Ensure verdict is valid
+  if (!['pass', 'revise'].includes(reviewResult.verdict)) {
+    reviewResult.verdict = reviewResult.issues?.some(
+      (i) => i.severity === 'high',
+    )
+      ? 'revise'
+      : 'pass';
+  }
+
+  const isPass = reviewResult.verdict === 'pass';
+  const nextIteration = isPass ? iteration : iteration + 1;
+
+  return {
+    reviewResult,
+    currentIteration: nextIteration,
+    agentPhase: isPass ? 'finalized' : 'planning',
+    ...progressUpdate(
+      'agentReviewer',
+      NeuripsPhase.agent_reviewing,
+      `Iteration ${iteration}: verdict=${reviewResult.verdict}, ${(reviewResult.issues || []).length} issues. ${reviewResult.summary || ''}`,
+      isPass ? 'info' : 'warn',
+    ),
+  };
+}
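+
+// How the verdict feeds routeAfterReview in graphVenueAgent.js (sketch):
+//   'pass'                           → finalize
+//   'revise'                         → planner (currentIteration was bumped above)
+//   currentIteration ≥ maxIterations → finalize anyway, even on 'revise'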
diff --git a/apps/backend/src/services/transferAgent/nodes/analyzeSource.js b/apps/backend/src/services/transferAgent/nodes/analyzeSource.js
index 50b31a4..a3a1ba1 100644
--- a/apps/backend/src/services/transferAgent/nodes/analyzeSource.js
+++ b/apps/backend/src/services/transferAgent/nodes/analyzeSource.js
@@ -3,22 +3,27 @@ import path from 'path';
 import { getProjectRoot } from '../../projectService.js';
 import { safeJoin } from '../../../utils/pathUtils.js';
 import { listFilesRecursive } from '../../../utils/fsUtils.js';
-import { isTextFile } from '../../../utils/texUtils.js';
+import { progressUpdate } from '../progressMeta.js';
+import { maskSourceProjectFiles } from '../masking/index.js';
 
 /**
  * Recursively resolve \input{} and \include{} references,
  * returning the concatenated full content.
  */
-async function resolveInputs(projectRoot, relPath, visited = new Set()) {
+async function resolveInputs(projectRoot, relPath, visited = new Set(), contentOverrides = {}) {
   if (visited.has(relPath)) return '';
   visited.add(relPath);
   const absPath = safeJoin(projectRoot, relPath);
   let content;
-  try {
-    content = await fs.readFile(absPath, 'utf8');
-  } catch {
-    return '';
+  if (typeof contentOverrides[relPath] === 'string') {
+    content = contentOverrides[relPath];
+  } else {
+    try {
+      content = await fs.readFile(absPath, 'utf8');
+    } catch {
+      return '';
+    }
   }
 
   // Match \input{...} and \include{...}
@@ -32,7 +37,7 @@ async function resolveInputs(projectRoot, relPath, visited = new Set()) {
     let ref = match[1].trim();
     // Add .tex extension if missing
     if (!path.extname(ref)) ref += '.tex';
-    const childContent = await resolveInputs(projectRoot, ref, visited);
+    const childContent = await resolveInputs(projectRoot, ref, visited, contentOverrides);
     result += childContent;
     lastIndex = pattern.lastIndex;
   }
@@ -53,6 +58,25 @@ function parseOutline(content) {
   return outline;
 }
 
+/** Map ATX headings to the same { level, title } shape as parseOutline. */
+const MD_HEADING_LEVEL = { 1: 'section', 2: 'subsection', 3: 'subsubsection' };
+
+function parseMarkdownOutline(md) {
+  const outline = [];
+  if (!md) return outline;
+  const lines = md.split(/\r?\n/);
+  for (const line of lines) {
+    const m = /^(#{1,6})\s+(.+?)\s*$/.exec(line);
+    if (!m) continue;
+    const depth = m[1].length;
+    const title = m[2].replace(/\s*#+\s*$/, '').trim();
+    if (!title) continue;
+    const level = MD_HEADING_LEVEL[Math.min(depth, 3)] || 'subsubsection';
+    outline.push({ level, title });
+  }
+  return outline;
+}
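+
+// e.g. (sketch): parseMarkdownOutline('# Intro\n## Setup\n#### Deep')
+//   → [ { level: 'section', title: 'Intro' },
+//       { level: 'subsection', title: 'Setup' },
+//       { level: 'subsubsection', title: 'Deep' } ]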
 /**
  * Collect asset references from LaTeX content.
  */
@@ -87,24 +111,134 @@ function collectAssets(content, allFiles) {
   return assets;
 }
 
+/**
+ * Heuristic profile of LaTeX source (no LLM).
+ */
+export function buildSourceProfile(content) {
+  const docMatch = content.match(/\\documentclass(?:\[[^\]]*\])?\{([^}]+)\}/);
+  const documentclass = docMatch ? docMatch[1].trim() : '';
+
+  const pkgRe = /\\usepackage(?:\[[^\]]*\])?\{([^}]+)\}/g;
+  const packages = new Set();
+  let m;
+  while ((m = pkgRe.exec(content)) !== null) {
+    m[1].split(',').forEach((p) => packages.add(p.trim()));
+  }
+
+  // The options bracket must be required here; an optional group would match
+  // every \documentclass and mark all sources as twocolumn.
+  const twocolumn = /\\documentclass\[[^\]]*twocolumn[^\]]*\]\{[^}]+\}/.test(content)
+    || /\\usepackage(?:\[[^\]]*\])?\{twocolumn\}/.test(content);
+
+  const hasBiblatex = packages.has('biblatex');
+  const hasNatbib = packages.has('natbib');
+  const hasInputBbl = /\\input\s*\{[^}]*\.bbl\}/i.test(content)
+    || /\\include\s*\{[^}]*\.bbl\}/i.test(content);
+  const hasBibtexCmd = /\\bibliography\s*\{/.test(content);
+
+  let bibMechanism = 'none';
+  if (hasBiblatex) bibMechanism = 'biblatex';
+  else if (hasInputBbl) bibMechanism = 'input_bbl';
+  else if (hasBibtexCmd || hasNatbib) bibMechanism = 'bibtex_natbib';
+
+  const figureStar = /\\begin\s*\{\s*figure\*\s*\}/i.test(content);
+  const tableStar = /\\begin\s*\{\s*table\*\s*\}/i.test(content);
+
+  const revtex = /revtex|revtex4/i.test(documentclass);
+
+  return {
+    documentclass,
+    packages: [...packages].sort(),
+    twocolumn,
+    figureStar,
+    tableStar,
+    revtex,
+    bibMechanism,
+    hasNatbib,
+    hasBiblatex,
+  };
+}
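+
+// e.g. (sketch):
+//   buildSourceProfile('\\documentclass[twocolumn]{revtex4-2}\\usepackage{natbib}\n\\bibliography{refs}')
+//   → { documentclass: 'revtex4-2', twocolumn: true, revtex: true,
+//       bibMechanism: 'bibtex_natbib', hasNatbib: true, hasBiblatex: false, ... }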
`; maskedFiles=${sourceMaskedFiles.length}; maskedSegments=${sourceMaskManifest.length}` + : ''; + const warningSummary = state.enableSensitiveMask && sourceMaskWarnings.length + ? `; maskWarnings=${sourceMaskWarnings.length}` + : ''; return { sourceProjectRoot: projectRoot, + sourceReadRoot: projectRoot, sourceOutline: outline, sourceFullContent: fullContent, sourceAssets: assets, - progressLog: `[analyzeSource] Parsed ${outline.length} sections, found ${assets.bib.length} bib files, ${assets.images.length} images, ${assets.styles.length} style files.`, + sourceProfile, + sourceMaskManifest, + sourceMaskedFiles, + sourceMaskedContents, + sourceMaskWarnings, + ...progressUpdate( + 'analyzeSource', + 'source_analysis', + `Parsed ${outline.length} sections; bibMechanism=${sourceProfile.bibMechanism}; class=${sourceProfile.documentclass || '?'}${maskSummary}${warningSummary}`, + ), }; } diff --git a/apps/backend/src/services/transferAgent/nodes/analyzeTarget.js b/apps/backend/src/services/transferAgent/nodes/analyzeTarget.js index e29bb89..2924fb7 100644 --- a/apps/backend/src/services/transferAgent/nodes/analyzeTarget.js +++ b/apps/backend/src/services/transferAgent/nodes/analyzeTarget.js @@ -3,6 +3,7 @@ import path from 'path'; import { getProjectRoot } from '../../projectService.js'; import { safeJoin } from '../../../utils/pathUtils.js'; import { listFilesRecursive } from '../../../utils/fsUtils.js'; +import { progressUpdate } from '../progressMeta.js'; /** * Recursively resolve \input{} and \include references. @@ -72,9 +73,14 @@ export async function analyzeTarget(state) { return { targetProjectRoot: projectRoot, + workspaceRoot: projectRoot, targetOutline: outline, targetPreamble: preamble, targetTemplateContent: fullContent, - progressLog: `[analyzeTarget] Template has ${outline.length} sections. Preamble length: ${preamble.length} chars.`, + ...progressUpdate( + 'analyzeTarget', + 'source_analysis', + `Template ${outline.length} sections; preamble ${preamble.length} chars.`, + ), }; } diff --git a/apps/backend/src/services/transferAgent/nodes/applyTransfer.js b/apps/backend/src/services/transferAgent/nodes/applyTransfer.js index 0c67c88..61b1d3f 100644 --- a/apps/backend/src/services/transferAgent/nodes/applyTransfer.js +++ b/apps/backend/src/services/transferAgent/nodes/applyTransfer.js @@ -3,6 +3,7 @@ import { ChatOpenAI } from '@langchain/openai'; import { resolveLLMConfig, normalizeBaseURL } from '../../llmService.js'; import { safeJoin } from '../../../utils/pathUtils.js'; import { writeFileWithSnapshot, stripCodeFences } from '../utils.js'; +import { unmaskContent } from '../masking/index.js'; /** * Build the LLM prompt for content migration. @@ -23,7 +24,7 @@ SOURCE CONTENT (full): ${state.sourceFullContent} RULES: -1. Keep the target preamble (everything before \\begin{document}) EXACTLY as-is +1. Keep the target preamble (everything before \\begin{document}) EXACTLY as-is — do NOT change \\documentclass, \\usepackage for the venue style, or any template-specific commands 2. Only modify content between \\begin{document} and \\end{document} 3. Follow the section mapping in the migration plan 4. Preserve ALL \\cite{}, \\ref{}, \\label{} commands from the source @@ -32,7 +33,8 @@ RULES: 7. Do NOT add any content that doesn't exist in the source 8. Do NOT remove any substantive content from the source 9. If the source uses \\bibliography{} but target uses \\addbibresource{}, adapt accordingly -10. Output the COMPLETE .tex file content, not just the body +10. 
Preserve placeholder markers like __OP_MASK_EQ_0001__ and __OP_MASK_TBL_0002__ exactly if they appear in the source +11. Output the COMPLETE .tex file content, not just the body Output ONLY the complete LaTeX file content. No explanations, no markdown fences.`; } @@ -59,12 +61,12 @@ ${state.targetTemplateContent} ${imageList || '(none)'} ## RULES: -1. Keep the target preamble (everything before \\begin{document}) EXACTLY as-is +1. Keep the target preamble (everything before \\begin{document}) EXACTLY as-is — do NOT change \\documentclass, \\usepackage for the venue style, or any template-specific commands 2. Only modify content between \\begin{document} and \\end{document} 3. Map Markdown headings to the corresponding \\section{}, \\subsection{} etc. in the template 4. Formulas in the Markdown are already in LaTeX format ($...$ or $$...$$) — preserve them as-is 5. Convert HTML tables in the Markdown to LaTeX \\begin{tabular} environments -6. For images referenced in the Markdown, use \\includegraphics{images/} wrapped in \\begin{figure}...\\end{figure} +6. For images referenced in the Markdown, use \\includegraphics[width=\\linewidth,keepaspectratio]{images/} wrapped in \\begin{figure}...\\end{figure} 7. Preserve ALL text content — do not omit any paragraphs or sections 8. Do NOT add content that doesn't exist in the Markdown 9. Output the COMPLETE .tex file content, not just the body @@ -88,16 +90,20 @@ async function applyTransferLegacy(state) { const prompt = buildTransferPrompt(state); const response = await llm.invoke([{ role: 'user', content: prompt }]); const newContent = stripCodeFences(response.content); + const unmasked = unmaskContent(newContent, state.sourceMaskManifest); await writeFileWithSnapshot( state.targetProjectRoot, state.targetMainFile, - newContent, + unmasked.content, state.jobId ); + const maskNote = state.enableSensitiveMask + ? ` Restored ${unmasked.restored} mask token(s); remaining=${unmasked.remaining}.` + : ''; return { - progressLog: `[applyTransfer] Wrote migrated content to ${state.targetMainFile} (${newContent.length} chars).`, + progressLog: `[applyTransfer] Wrote migrated content to ${state.targetMainFile} (${unmasked.content.length} chars).${maskNote}`, }; } diff --git a/apps/backend/src/services/transferAgent/nodes/compile.js b/apps/backend/src/services/transferAgent/nodes/compile.js index 1523818..e510031 100644 --- a/apps/backend/src/services/transferAgent/nodes/compile.js +++ b/apps/backend/src/services/transferAgent/nodes/compile.js @@ -1,4 +1,5 @@ import { runCompile } from '../../compileService.js'; +import { progressUpdate } from '../progressMeta.js'; /** * compile node — runs LaTeX compilation on the target project @@ -12,10 +13,19 @@ export async function compile(state) { }); const attempt = (state.compileAttempt || 0) + 1; + const msg = `Attempt ${attempt}: ${result.ok ? 'SUCCESS' : 'FAILED'} (exit ${result.status}).`; + + if (state.transferGraphKind === 'neurips') { + return { + compileResult: result, + compileAttempt: attempt, + ...progressUpdate('compile', 'compile', msg, result.ok ? 'info' : 'warn'), + }; + } return { compileResult: result, compileAttempt: attempt, - progressLog: `[compile] Attempt ${attempt}: ${result.ok ? 
'SUCCESS' : 'FAILED'} (exit ${result.status}).`, + progressLog: `[compile] ${msg}`, }; } diff --git a/apps/backend/src/services/transferAgent/nodes/compileSource.js b/apps/backend/src/services/transferAgent/nodes/compileSource.js index 0bfd592..e2ea8a4 100644 --- a/apps/backend/src/services/transferAgent/nodes/compileSource.js +++ b/apps/backend/src/services/transferAgent/nodes/compileSource.js @@ -10,7 +10,15 @@ import { ensureDir } from '../../../utils/fsUtils.js'; * (e.g. user uploaded a PDF directly). */ export async function compileSource(state) { - const sourceProjectRoot = state.sourceProjectId ? await getProjectRoot(state.sourceProjectId) : undefined; + let sourceProjectRoot; + if (state.sourceProjectId) { + try { + sourceProjectRoot = await getProjectRoot(state.sourceProjectId); + } catch { + // Source project not found — not fatal if we have a PDF + sourceProjectRoot = undefined; + } + } // If user uploaded a PDF directly, skip compilation if (state.sourcePdfPath) { diff --git a/apps/backend/src/services/transferAgent/nodes/copyAssets.js b/apps/backend/src/services/transferAgent/nodes/copyAssets.js index b0f9cd1..928e31c 100644 --- a/apps/backend/src/services/transferAgent/nodes/copyAssets.js +++ b/apps/backend/src/services/transferAgent/nodes/copyAssets.js @@ -38,7 +38,7 @@ async function copySingleAsset(srcRoot, destRoot, relPath) { } /** - * Legacy mode: copy bib files, images, and style files from source project. + * Legacy mode: copy bib/bbl files, images, and style files from source project. */ async function copyAssetsLegacy(state) { const assets = state.sourceAssets || {}; @@ -49,6 +49,19 @@ async function copyAssetsLegacy(state) { results.push(r); } + // Copy .bbl files + if (state.sourceProjectRoot) { + const allFiles = await listFilesRecursive(state.sourceProjectRoot); + const bblFiles = allFiles + .filter(f => f.type === 'file' && path.extname(f.path).toLowerCase() === '.bbl') + .map(f => f.path); + + for (const bbl of bblFiles) { + const r = await copySingleAsset(state.sourceProjectRoot, state.targetProjectRoot, bbl); + results.push(r); + } + } + for (const img of (assets.images || [])) { const r = await copySingleAsset(state.sourceProjectRoot, state.targetProjectRoot, img); results.push(r); @@ -73,7 +86,7 @@ async function copyAssetsLegacy(state) { /** * MinerU mode: copy MinerU-extracted images to target project images/ dir, - * and optionally copy bib files from source project if available. + * and optionally copy bib/bbl files from source project if available. 
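+ * A .bbl file is the BibTeX-generated reference list; carrying it over lets the + * target compile citations even when no .bib is present (e.g. an arXiv-style + * bundle that ships only main.tex + main.bbl).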
*/ async function copyAssetsMineru(state) { const images = state.sourceImages || []; @@ -110,8 +123,24 @@ async function copyAssetsMineru(state) { } } + // Copy .bbl files from source project if available + let bblCount = 0; + if (state.sourceProjectRoot) { + const allFiles = await listFilesRecursive(state.sourceProjectRoot); + const bblFiles = allFiles + .filter(f => f.type === 'file' && path.extname(f.path).toLowerCase() === '.bbl') + .map(f => f.path); + + for (const bbl of bblFiles) { + const r = await copySingleAsset( + state.sourceProjectRoot, state.targetProjectRoot, bbl + ); + if (r.status === 'copied') bblCount++; + } + } + return { - progressLog: `[copyAssets:mineru] Copied ${copiedCount} images, ${bibCount} bib files.`, + progressLog: `[copyAssets:mineru] Copied ${copiedCount} images, ${bibCount} bib files, ${bblCount} bbl files.`, }; } diff --git a/apps/backend/src/services/transferAgent/nodes/draftPlan.js b/apps/backend/src/services/transferAgent/nodes/draftPlan.js index b053141..5b4da47 100644 --- a/apps/backend/src/services/transferAgent/nodes/draftPlan.js +++ b/apps/backend/src/services/transferAgent/nodes/draftPlan.js @@ -1,6 +1,8 @@ import { ChatOpenAI } from '@langchain/openai'; import { resolveLLMConfig, normalizeBaseURL } from '../../llmService.js'; import { invokeLLMForJSON } from '../utils.js'; +import { loadNeuripsRulesFull, formatNeuripsHandbookBlock } from '../neuripsRules.js'; +import { progressUpdate } from '../progressMeta.js'; /** * draftPlan node — LLM generates a structured transfer plan @@ -16,6 +18,30 @@ export async function draftPlan(state) { temperature: 0.2, }); + const isNeurips = state.transferGraphKind === 'neurips'; + const handbook = isNeurips + ? formatNeuripsHandbookBlock(await loadNeuripsRulesFull()) + : ''; + + const extraNeurips = isNeurips + ? ` +SOURCE_PROFILE (heuristic JSON): +${JSON.stringify(state.sourceProfile || {}, null, 2)} + +TRANSFER_INTAKE: +${JSON.stringify(state.transferIntake || {}, null, 2)} +${handbook} +` + : ''; + + const neuripsStructure = isNeurips + ? `, + "dependencies": ["ordered strings, e.g. natbib before cite fixes"], + "humanReview": ["items needing author judgment"], + "preambleNotes": "short preamble migration notes", + "bodyNotes": "short body migration notes"` + : ''; + const prompt = `You are a LaTeX template migration planner. Given a SOURCE paper outline and a TARGET template outline, produce a JSON migration plan. @@ -31,6 +57,7 @@ ${JSON.stringify(state.sourceAssets, null, 2)} TARGET PREAMBLE (first 2000 chars): ${(state.targetPreamble || '').slice(0, 2000)} +${extraNeurips} Produce a JSON object with this structure: { @@ -42,7 +69,7 @@ Produce a JSON object with this structure: "images": ["copy list"], "bibCommand": "bibliography|addbibresource" }, - "notes": "any special instructions for the migration" + "notes": "any special instructions for the migration"${neuripsStructure} } Rules: @@ -50,13 +77,17 @@ Rules: - If target has no matching section, use action "create" - If source section has no place in target, use action "drop" (rare) - Preserve all citations, references, labels, and figure/table environments -- Keep the target preamble unchanged +${isNeurips ? 
'- Follow NeurIPS handbook above for anonymous mode, floats, bibliography, and page limits' : '- Keep the target preamble unchanged'} - Output ONLY valid JSON, no markdown fences`; const planSchema = { sectionMapping: { type: 'array', required: true }, - assetStrategy: { type: 'object', required: true }, - notes: { type: 'string', required: false }, + assetStrategy: { type: 'object', required: true }, + notes: { type: 'string', required: false }, + dependencies: { type: 'array', required: false }, + humanReview: { type: 'array', required: false }, + preambleNotes: { type: 'string', required: false }, + bodyNotes: { type: 'string', required: false }, }; const { parsed, raw, retries } = await invokeLLMForJSON( @@ -70,6 +101,10 @@ Rules: return { transferPlan: plan, - progressLog: `[draftPlan] Generated migration plan with ${plan.sectionMapping?.length || 0} section mappings${retryNote}.`, + ...progressUpdate( + 'draftPlan', + 'migration_plan', + `Generated migration plan with ${plan.sectionMapping?.length || 0} section mappings${retryNote}.`, + ), }; } diff --git a/apps/backend/src/services/transferAgent/nodes/finalize.js b/apps/backend/src/services/transferAgent/nodes/finalize.js index f0387be..5f323a4 100644 --- a/apps/backend/src/services/transferAgent/nodes/finalize.js +++ b/apps/backend/src/services/transferAgent/nodes/finalize.js @@ -1,19 +1,67 @@ +import { NeuripsPhase, progressUpdate } from '../progressMeta.js'; + +/** + * Venue-specific bundle notes for the user. + */ +const VENUE_BUNDLE_NOTES = { + neurips: [ + 'The NeurIPS flow ends before server-side compilation; build the PDF locally with pdflatex/bibtex or similar.', + 'Submission bundle: main.tex, neurips_2026.sty, checklist.tex, figures, and any .bib or .bbl files used.', + 'Do not upload: .aux, .log, .out, .synctex.gz, .compile/, .agent_runs/', + ].join(' '), + icml: [ + 'The ICML flow ends before server-side compilation; build the PDF locally with pdflatex/bibtex or similar.', + 'Submission bundle: main.tex, icml2026.sty, icml2026.bst, figures, and any .bib or .bbl files used.', + 'Do not upload: .aux, .log, .out, .synctex.gz, .compile/, .agent_runs/', + ].join(' '), +}; + /** * finalize node — sets final status and collects results. */ export async function finalize(state) { + const venue = (state.transferIntake?.venue || state.transferGraphKind || 'legacy').toLowerCase(); + const isAgentVenue = ['neurips', 'icml', 'cvpr', 'acl'].includes(venue); + const isRuleBasedTransfer = state.transferGraphKind === 'rulebasetransfer'; + const isNoCompilePath = isAgentVenue || isRuleBasedTransfer; const compileOk = state.compileResult?.ok || false; const hasPdf = !!state.compileResult?.pdf; + const hasUpstreamError = state.status === 'failed' || !!state.error; + + const finalStatus = hasUpstreamError + ? 'failed' + : isNoCompilePath + ? 'success' + : compileOk && hasPdf + ? 'success' + : 'failed'; + const error = hasUpstreamError + ? state.error + : isNoCompilePath + ? undefined + : !hasPdf + ? (state.compileResult?.error || 'No PDF generated after all attempts.') + : undefined; + + const bundleNotes = VENUE_BUNDLE_NOTES[venue] + || (isRuleBasedTransfer + ? 'Rule-based conversion finished (not compiled on the server). Build the PDF locally with pdflatex/xelatex/latexmk or similar.' + : ''); - const finalStatus = compileOk && hasPdf ? 'success' : 'failed'; - const error = !hasPdf - ? (state.compileResult?.error || 'No PDF generated after all attempts.') - : undefined; + const summaryMsg = isNoCompilePath + ? `Transfer ${finalStatus} (no server compile).${bundleNotes ? ` ${bundleNotes}` : ''}` + : `Transfer ${finalStatus}. Compile attempts: ${state.compileAttempt}, Layout attempts: ${state.layoutAttempt}.${bundleNotes ?
` ${bundleNotes}` : ''}`; return { status: finalStatus, finalPdf: state.compileResult?.pdf || '', error, - progressLog: `[finalize] Transfer ${finalStatus}. Compile attempts: ${state.compileAttempt}, Layout attempts: ${state.layoutAttempt}.`, + bundleNotes, + ...progressUpdate( + 'finalize', + NeuripsPhase.finalize, + summaryMsg, + finalStatus === 'success' ? 'info' : 'error', + ), }; } diff --git a/apps/backend/src/services/transferAgent/nodes/fixCompile.js b/apps/backend/src/services/transferAgent/nodes/fixCompile.js index 8d1e1da..45562f7 100644 --- a/apps/backend/src/services/transferAgent/nodes/fixCompile.js +++ b/apps/backend/src/services/transferAgent/nodes/fixCompile.js @@ -3,6 +3,8 @@ import { ChatOpenAI } from '@langchain/openai'; import { resolveLLMConfig, normalizeBaseURL } from '../../llmService.js'; import { safeJoin } from '../../../utils/pathUtils.js'; import { writeFileWithSnapshot, stripCodeFences } from '../utils.js'; +import { loadNeuripsRulesFull, formatNeuripsHandbookBlock } from '../neuripsRules.js'; +import { progressUpdate } from '../progressMeta.js'; const MAX_LOG_TAIL = 8000; @@ -24,15 +26,30 @@ export async function fixCompile(state) { temperature: 0.2, }); + // Determine venue context so the LLM doesn't switch templates + const venue = state.transferIntake?.venue || state.transferGraphKind || 'unknown'; + const venueConstraint = ` +CRITICAL: This paper targets the "${venue.toUpperCase()}" venue. +- Do NOT change the \\usepackage{} for the venue style (e.g. icml2026, neurips_2026). +- Do NOT switch from one venue template to another. +- If a .sty file is missing, do NOT replace it with a different venue's .sty. +- Only fix actual LaTeX errors; preserve the venue template structure. +`; + + const neuripsBlock = state.transferGraphKind === 'neurips' + ? formatNeuripsHandbookBlock(await loadNeuripsRulesFull()) + : ''; + const prompt = `You are a LaTeX compilation error fixer. The following LaTeX file failed to compile. Fix the errors and return the corrected COMPLETE file. - +${venueConstraint} COMPILE LOG (last ${MAX_LOG_TAIL} chars): ${log} CURRENT FILE (${state.targetMainFile}): ${currentTex} +${neuripsBlock} Common fixes: - Missing packages: add \\usepackage{...} in preamble @@ -53,6 +70,16 @@ Output ONLY the complete corrected LaTeX file. No explanations, no markdown fenc state.jobId ); + if (state.transferGraphKind === 'neurips') { + return { + ...progressUpdate( + 'fixCompile', + 'compile', + `Applied LLM fix for compile attempt ${state.compileAttempt}.`, + ), + }; + } + return { progressLog: `[fixCompile] Applied LLM fix for compile attempt ${state.compileAttempt}.`, }; diff --git a/apps/backend/src/services/transferAgent/nodes/fixLayout.js b/apps/backend/src/services/transferAgent/nodes/fixLayout.js index 05e6ff5..c6f6caf 100644 --- a/apps/backend/src/services/transferAgent/nodes/fixLayout.js +++ b/apps/backend/src/services/transferAgent/nodes/fixLayout.js @@ -3,6 +3,7 @@ import { ChatOpenAI } from '@langchain/openai'; import { resolveLLMConfig, normalizeBaseURL } from '../../llmService.js'; import { safeJoin } from '../../../utils/pathUtils.js'; import { writeFileWithSnapshot, stripCodeFences } from '../utils.js'; +import { loadNeuripsRulesFull, formatNeuripsHandbookBlock } from '../neuripsRules.js'; /** * fixLayout node — LLM reads current main.tex + VLM layout issues, @@ -23,6 +24,10 @@ export async function fixLayout(state) { temperature: 0.2, }); + const neuripsBlock = state.transferGraphKind === 'neurips' + ? 
formatNeuripsHandbookBlock(await loadNeuripsRulesFull()) + : ''; + const prompt = `You are a LaTeX layout fixer. The following LaTeX file has layout issues identified by visual inspection. @@ -33,6 +38,7 @@ ${issuesText} CURRENT FILE (${state.targetMainFile}): ${currentTex} +${neuripsBlock} Common layout fixes: - Overflow: adjust figure width, use \\resizebox, or \\adjustbox diff --git a/apps/backend/src/services/transferAgent/nodes/neurips/applyBibliography.js b/apps/backend/src/services/transferAgent/nodes/neurips/applyBibliography.js new file mode 100644 index 0000000..50250d6 --- /dev/null +++ b/apps/backend/src/services/transferAgent/nodes/neurips/applyBibliography.js @@ -0,0 +1,72 @@ +import { promises as fs } from 'fs'; +import { ChatOpenAI } from '@langchain/openai'; +import { resolveLLMConfig, normalizeBaseURL } from '../../../llmService.js'; +import { safeJoin } from '../../../../utils/pathUtils.js'; +import { writeFileWithSnapshot } from '../../utils.js'; +import { loadNeuripsRulesFull, formatNeuripsHandbookBlock } from '../../neuripsRules.js'; +import { NeuripsPhase, progressUpdate } from '../../progressMeta.js'; +import { + mainTexDiffInstructions, + runLlmUnifiedDiffWithRetries, +} from '../../llmUnifiedDiff.js'; + +export async function applyBibliography(state) { + const root = state.workspaceRoot || state.targetProjectRoot; + const rel = state.targetMainFile; + const abs = safeJoin(root, rel); + const currentTex = await fs.readFile(abs, 'utf8'); + const virtualPath = rel.replace(/\\/g, '/'); + + const { endpoint, apiKey, model } = resolveLLMConfig(state.llmConfig); + const llm = new ChatOpenAI({ + modelName: model, + openAIApiKey: apiKey, + configuration: { baseURL: normalizeBaseURL(endpoint) }, + temperature: 0.2, + }); + + const handbook = formatNeuripsHandbookBlock(await loadNeuripsRulesFull()); + const diffInstr = mainTexDiffInstructions(virtualPath); + const basePrompt = `Fix bibliography / citations block for NeurIPS 2026. + +CRITICAL RULES: +1. NeurIPS uses NUMERIC citations [1,2,3], NOT author-year (Author [2007]). +2. The file MUST contain "\\\\PassOptionsToPackage{numbers,compress,sort}{natbib}" BEFORE "\\\\documentclass". + If it is missing or commented out, ADD it before \\\\documentclass. +3. Use \\\\bibliographystyle{unsrtnat} (NOT plainnat, which defaults to author-year). +4. Keep \\\\bibliography{...} pointing to the correct .bib file name. +5. If no .bib file exists and a .bbl file is present, that is fine — LaTeX will use the .bbl directly. 
+ +USER_CONFIRMATIONS_JSON: +${JSON.stringify(state.userConfirmations || {})} + +SOURCE_PROFILE_JSON: +${JSON.stringify(state.sourceProfile || {}, null, 2)} + +CURRENT_FILE: +${currentTex} +${handbook} + +Align \\\\cite with the bibliography mechanism chosen; keep \\\\input{checklist.tex} and ack/references structure valid.${diffInstr}`; + + const merged = await runLlmUnifiedDiffWithRetries({ + llm, + baseTex: currentTex, + buildPrompt: (failureNote) => basePrompt + (failureNote || ''), + nodeName: 'applyBibliography', + phase: NeuripsPhase.bibliography, + maxAttempts: 3, + debug: { projectRoot: root, jobId: state.jobId }, + }); + + await writeFileWithSnapshot(root, rel, merged, state.jobId); + + return { + lastGoodPhase: 'bib', + ...progressUpdate( + 'applyBibliography', + NeuripsPhase.bibliography, + `Bibliography pass (unified diff applied, ${merged.length} chars).`, + ), + }; +} diff --git a/apps/backend/src/services/transferAgent/nodes/neurips/applyBody.js b/apps/backend/src/services/transferAgent/nodes/neurips/applyBody.js new file mode 100644 index 0000000..053d317 --- /dev/null +++ b/apps/backend/src/services/transferAgent/nodes/neurips/applyBody.js @@ -0,0 +1,74 @@ +import { promises as fs } from 'fs'; +import { ChatOpenAI } from '@langchain/openai'; +import { resolveLLMConfig, normalizeBaseURL } from '../../../llmService.js'; +import { safeJoin } from '../../../../utils/pathUtils.js'; +import { + writeFileWithSnapshot, + stripCodeFences, + splitTexDocument, + mergeTexDocument, +} from '../../utils.js'; +import { loadNeuripsRulesFull, formatNeuripsHandbookBlock } from '../../neuripsRules.js'; +import { NeuripsPhase, progressUpdate } from '../../progressMeta.js'; + +export async function applyBody(state) { + const root = state.workspaceRoot || state.targetProjectRoot; + const rel = state.targetMainFile; + const abs = safeJoin(root, rel); + const currentTex = await fs.readFile(abs, 'utf8'); + const srcParts = splitTexDocument(state.sourceFullContent || ''); + const tgtParts = splitTexDocument(currentTex); + + if (!tgtParts.hasDocument || !srcParts.hasDocument) { + return { + ...progressUpdate('applyBody', NeuripsPhase.body, 'Missing document environment; skipped.'), + }; + } + + const { endpoint, apiKey, model } = resolveLLMConfig(state.llmConfig); + const llm = new ChatOpenAI({ + modelName: model, + openAIApiKey: apiKey, + configuration: { baseURL: normalizeBaseURL(endpoint) }, + temperature: 0.2, + }); + + const handbook = formatNeuripsHandbookBlock(await loadNeuripsRulesFull()); + const prompt = `You migrate the DOCUMENT BODY to NeurIPS 2026 structure. + +USER_CONFIRMATIONS_JSON: +${JSON.stringify(state.userConfirmations || {})} + +MIGRATION_PLAN_JSON: +${JSON.stringify(state.transferPlan || {}, null, 2)} + +SOURCE_BODY (\\begin{document}...\\end{document}): +${srcParts.body} + +CURRENT_TARGET_FILE (full, for reference of checklist/ack placement): +${currentTex} +${handbook} + +Output ONLY the document body block: from \\begin{document} through \\end{document} inclusive. Map sections per plan. Preserve all \\\\cite{}, \\\\ref{}, \\\\label{} and substantive math/figures/tables. Follow NeurIPS abstract (one paragraph) and sectioning rules from the handbook. No markdown fences.`; + + const response = await llm.invoke([{ role: 'user', content: prompt }]); + let newBody = stripCodeFences( + typeof response.content === 'string' ? 
response.content : '', + ).trim(); + + if (!newBody.includes('\\begin{document}')) { + newBody = `\\begin{document}\n\n${newBody}\n\n\\end{document}`; + } + + const merged = mergeTexDocument(tgtParts.preamble, newBody, tgtParts.tail); + await writeFileWithSnapshot(root, rel, merged, state.jobId); + + return { + lastGoodPhase: 'body', + ...progressUpdate( + 'applyBody', + NeuripsPhase.body, + `Wrote document body (${newBody.length} chars).`, + ), + }; +} diff --git a/apps/backend/src/services/transferAgent/nodes/neurips/applyPreamble.js b/apps/backend/src/services/transferAgent/nodes/neurips/applyPreamble.js new file mode 100644 index 0000000..62b1c48 --- /dev/null +++ b/apps/backend/src/services/transferAgent/nodes/neurips/applyPreamble.js @@ -0,0 +1,107 @@ +import { promises as fs } from 'fs'; +import { ChatOpenAI } from '@langchain/openai'; +import { resolveLLMConfig, normalizeBaseURL } from '../../../llmService.js'; +import { safeJoin } from '../../../../utils/pathUtils.js'; +import { + writeFileWithSnapshot, + stripCodeFences, + splitTexDocument, + mergeTexDocument, +} from '../../utils.js'; +import { loadNeuripsRulesFull, formatNeuripsHandbookBlock } from '../../neuripsRules.js'; +import { NeuripsPhase, progressUpdate } from '../../progressMeta.js'; + +export async function applyPreamble(state) { + const root = state.workspaceRoot || state.targetProjectRoot; + const rel = state.targetMainFile; + const abs = safeJoin(root, rel); + const currentTex = await fs.readFile(abs, 'utf8'); + const srcParts = splitTexDocument(state.sourceFullContent || ''); + const tgtParts = splitTexDocument(currentTex); + + if (!tgtParts.hasDocument) { + return { + ...progressUpdate('applyPreamble', NeuripsPhase.preamble, 'Target missing \\begin{document}; skipped preamble merge.'), + }; + } + + const { endpoint, apiKey, model } = resolveLLMConfig(state.llmConfig); + const llm = new ChatOpenAI({ + modelName: model, + openAIApiKey: apiKey, + configuration: { baseURL: normalizeBaseURL(endpoint) }, + temperature: 0.2, + }); + + const handbook = formatNeuripsHandbookBlock(await loadNeuripsRulesFull()); + const intake = state.transferIntake || {}; + const isDoubleBlind = intake.doubleBlind !== false; + const isPreprint = !!intake.preprint; + const neuripsOption = isPreprint ? '[preprint]' : '[main]'; + const prompt = `You migrate a LaTeX preamble to NeurIPS 2026 (see handbook below). + +SUBMISSION MODE: +- doubleBlind: ${isDoubleBlind} +- preprint: ${isPreprint} +- THEREFORE: use \\\\usepackage${neuripsOption}{neurips_2026} +${isDoubleBlind ? '- MUST use [main] option (gives line numbers + anonymous mode). Do NOT use [preprint].' : '- Using [preprint] option (non-anonymous, no line numbers).'} +- MUST add \\\\PassOptionsToPackage{numbers,compress,sort}{natbib} BEFORE \\\\documentclass for numeric citations [1,2,3] + +USER_CONFIRMATIONS_JSON: +${JSON.stringify(state.userConfirmations || {})} + +MIGRATION_PLAN_JSON: +${JSON.stringify(state.transferPlan || {}, null, 2)} + +SOURCE_PROFILE_JSON: +${JSON.stringify(state.sourceProfile || {}, null, 2)} + +SOURCE_PREAMBLE_ONLY: +${srcParts.preamble || '(empty)'} + +CURRENT_TARGET_FILE: +${currentTex} +${handbook} + +Output ONLY the new preamble: from \\documentclass through the line immediately before \\begin{document}. Do NOT output \\begin{document} or anything after it. No markdown fences.`; + + const response = await llm.invoke([{ role: 'user', content: prompt }]); + let newPreamble = stripCodeFences( + typeof response.content === 'string' ? 
response.content : '', + ).trim(); + + if (newPreamble.includes('\\begin{document}')) { + newPreamble = newPreamble.split('\\begin{document}')[0].trimEnd(); + } + + // ---- Deterministic post-processing (do NOT rely on LLM for these) ---- + + // 1. Force correct neurips_2026 package option based on submission mode + const correctOption = isPreprint ? '[preprint]' : '[main]'; + // Match any \usepackage[...]{neurips_2026} or \usepackage{neurips_2026} + newPreamble = newPreamble.replace( + /\\usepackage(?:\s*\[[^\]]*\])?\s*\{neurips_2026\}/, + `\\usepackage${correctOption}{neurips_2026}`, + ); + + // 2. Ensure \PassOptionsToPackage{numbers,compress,sort}{natbib} exists before \documentclass + if (!/\\PassOptionsToPackage\s*\{[^}]*numbers[^}]*\}\s*\{natbib\}/.test(newPreamble)) { + // Insert before \documentclass + newPreamble = newPreamble.replace( + /(\\documentclass)/, + '\\PassOptionsToPackage{numbers,compress,sort}{natbib}\n$1', + ); + } + + const merged = mergeTexDocument(newPreamble, tgtParts.body, tgtParts.tail); + await writeFileWithSnapshot(root, rel, merged, state.jobId); + + return { + lastGoodPhase: 'preamble', + ...progressUpdate( + 'applyPreamble', + NeuripsPhase.preamble, + `Wrote preamble (${newPreamble.length} chars); body preserved for next step.`, + ), + }; +} diff --git a/apps/backend/src/services/transferAgent/nodes/neurips/blindConfirmBypass.js b/apps/backend/src/services/transferAgent/nodes/neurips/blindConfirmBypass.js new file mode 100644 index 0000000..4db383a --- /dev/null +++ b/apps/backend/src/services/transferAgent/nodes/neurips/blindConfirmBypass.js @@ -0,0 +1,10 @@ +import { NeuripsPhase, progressUpdate } from '../../progressMeta.js'; + +/** Used when blind QA is skipped — avoids interrupt-before consumeConfirmBlind. */ +export async function blindConfirmBypass() { + return progressUpdate( + 'blindConfirmBypass', + NeuripsPhase.compile, + 'Skipped consumeConfirmBlind (no blind QA).', + ); +} diff --git a/apps/backend/src/services/transferAgent/nodes/neurips/consumeConfirmBlind.js b/apps/backend/src/services/transferAgent/nodes/neurips/consumeConfirmBlind.js new file mode 100644 index 0000000..e2a9955 --- /dev/null +++ b/apps/backend/src/services/transferAgent/nodes/neurips/consumeConfirmBlind.js @@ -0,0 +1,13 @@ +import { NeuripsPhase, progressUpdate } from '../../progressMeta.js'; + +export async function consumeConfirmBlind(state) { + return { + pendingQA: null, + status: 'running', + ...progressUpdate( + 'consumeConfirmBlind', + NeuripsPhase.compile, + 'Blind confirmations recorded; proceeding to compile.', + ), + }; +} diff --git a/apps/backend/src/services/transferAgent/nodes/neurips/consumeConfirmPlan.js b/apps/backend/src/services/transferAgent/nodes/neurips/consumeConfirmPlan.js new file mode 100644 index 0000000..822b858 --- /dev/null +++ b/apps/backend/src/services/transferAgent/nodes/neurips/consumeConfirmPlan.js @@ -0,0 +1,18 @@ +import { NeuripsPhase, progressUpdate } from '../../progressMeta.js'; + +/** + * Runs after user submits answers via /api/transfer/submit-confirm + */ +export async function consumeConfirmPlan(state) { + const answers = state.userConfirmations || {}; + const keys = Object.keys(answers); + return { + pendingQA: null, + status: 'running', + ...progressUpdate( + 'consumeConfirmPlan', + NeuripsPhase.migration_plan, + `Applied ${keys.length} confirmation answer(s).`, + ), + }; +} diff --git a/apps/backend/src/services/transferAgent/nodes/neurips/intake.js b/apps/backend/src/services/transferAgent/nodes/neurips/intake.js new file 
mode 100644 index 0000000..32b840f --- /dev/null +++ b/apps/backend/src/services/transferAgent/nodes/neurips/intake.js @@ -0,0 +1,16 @@ +import { NeuripsPhase, progressUpdate } from '../../progressMeta.js'; +import { loadNeuripsRulesFull } from '../../neuripsRules.js'; + +export async function intake(state) { + if (state.transferGraphKind === 'neurips') { + await loadNeuripsRulesFull(); + } + const t = state.transferIntake || {}; + return { + ...progressUpdate( + 'intake', + NeuripsPhase.intake, + `venue=${t.venue || 'neurips'} preprint=${!!t.preprint} doubleBlind=${t.doubleBlind !== false}`, + ), + }; +} diff --git a/apps/backend/src/services/transferAgent/nodes/neurips/normalizeFigures.js b/apps/backend/src/services/transferAgent/nodes/neurips/normalizeFigures.js new file mode 100644 index 0000000..d79ebf5 --- /dev/null +++ b/apps/backend/src/services/transferAgent/nodes/neurips/normalizeFigures.js @@ -0,0 +1,313 @@ +import { promises as fs } from 'fs'; +import path from 'path'; +import { ChatOpenAI } from '@langchain/openai'; +import { resolveLLMConfig, normalizeBaseURL } from '../../../llmService.js'; +import { safeJoin } from '../../../../utils/pathUtils.js'; +import { writeFileWithSnapshot } from '../../utils.js'; +import { loadNeuripsRulesFull, formatNeuripsHandbookBlock } from '../../neuripsRules.js'; +import { NeuripsPhase, progressUpdate } from '../../progressMeta.js'; +import { + mainTexDiffInstructions, + runLlmUnifiedDiffWithRetries, +} from '../../llmUnifiedDiff.js'; + +/* ------------------------------------------------------------------ */ +/* Lightweight figure / layout measurement (no external binaries) */ +/* ------------------------------------------------------------------ */ + +/** Known column-width (pt) for common document classes. */ +const LAYOUT_DB = { + neurips: { textwidthPt: 396, columnwidthPt: 396, columns: 1 }, + article: { textwidthPt: 345, columnwidthPt: 345, columns: 1 }, + 'revtex4-1': { textwidthPt: 510, columnwidthPt: 246, columns: 2 }, + 'revtex4-2': { textwidthPt: 510, columnwidthPt: 246, columns: 2 }, + revtex: { textwidthPt: 510, columnwidthPt: 246, columns: 2 }, + IEEEtran: { textwidthPt: 516, columnwidthPt: 252, columns: 2 }, + llncs: { textwidthPt: 336, columnwidthPt: 336, columns: 1 }, + acmart: { textwidthPt: 506, columnwidthPt: 241, columns: 2 }, + cvpr: { textwidthPt: 496, columnwidthPt: 237, columns: 2 }, + icml: { textwidthPt: 487, columnwidthPt: 233, columns: 2 }, +}; + +/** Read PDF MediaBox from the first 8 KB of the file. */ +async function pdfPageSize(filePath) { + try { + const fd = await fs.open(filePath, 'r'); + const buf = Buffer.alloc(8192); + await fd.read(buf, 0, 8192, 0); + await fd.close(); + const str = buf.toString('latin1'); + const m = str.match(/\/MediaBox\s*\[\s*([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s*\]/); + if (m) { + const w = parseFloat(m[3]) - parseFloat(m[1]); + const h = parseFloat(m[4]) - parseFloat(m[2]); + if (w > 0 && h > 0) return { widthPt: Math.round(w * 10) / 10, heightPt: Math.round(h * 10) / 10 }; + } + } catch { /* ignore */ } + return null; +} + +/** Resolve source layout: handle twocolumn flag overriding a single-col class. 
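E.g. an article-class source (345 pt text width) with the twocolumn flag keeps textwidthPt=345 but becomes columns=2 with columnwidthPt=Math.round((345 - 20) / 2)=163, so per-figure scaling below compares against a realistic column width rather than the full text width.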
*/ +function resolveSourceLayout(sourceProfile) { + const cls = (sourceProfile?.documentclass || '').toLowerCase(); + let layout = LAYOUT_DB[cls] || null; + + // Check if twocolumn was set explicitly even though the class DB entry is single-column + if (layout && sourceProfile?.twocolumn && layout.columns === 1) { + layout = { + ...layout, + columnwidthPt: Math.round((layout.textwidthPt - 20) / 2), + columns: 2, + }; + } + // If no DB entry but twocolumn is true, fall back to a reasonable guess + if (!layout && sourceProfile?.twocolumn) { + layout = { textwidthPt: 500, columnwidthPt: 240, columns: 2 }; + } + return layout; +} + +/** + * Collect all \includegraphics from the tex, measure each image file, + * and compute recommended widths for the target layout. + */ +async function measureAllFigures(texContent, workspaceRoot, sourceProfile, venue) { + const srcLayout = resolveSourceLayout(sourceProfile); + const tgtLayout = LAYOUT_DB[(venue || 'neurips').toLowerCase()] || LAYOUT_DB.neurips; + + // Parse every \includegraphics[...]{file} + const figRe = /\\begin\s*\{\s*figure(\*?)\s*\}[\s\S]*?\\includegraphics(?:\[([^\]]*)\])?\{([^}]+)\}[\s\S]*?\\end\s*\{\s*figure\*?\s*\}/g; + const figures = []; + let m; + while ((m = figRe.exec(texContent)) !== null) { + const isStar = m[1] === '*'; + const opts = m[2] || ''; + const file = m[3].trim(); + figures.push({ file, opts, isStar }); + } + + if (figures.length === 0) return null; + + const measurements = []; + for (const fig of figures) { + const absPath = safeJoin(workspaceRoot, fig.file); + let naturalSize = null; + const ext = path.extname(fig.file).toLowerCase(); + if (ext === '.pdf') { + naturalSize = await pdfPageSize(absPath); + } + + // Source effective width = columnwidth for normal figure, textwidth for figure* + const srcEffective = srcLayout + ? (fig.isStar ? srcLayout.textwidthPt : srcLayout.columnwidthPt) + : null; + const tgtLinewidth = tgtLayout.columnwidthPt; + + let recommendedSpec = '\\linewidth'; + let reason = ''; + + if (srcEffective && tgtLinewidth) { + const ratio = srcEffective / tgtLinewidth; + + if (ratio < 0.75) { + // Source figure was narrower than NeurIPS \linewidth + const r = Math.round(ratio * 100) / 100; + recommendedSpec = `${r}\\linewidth`; + reason = `source colwidth ${Math.round(srcEffective)}pt < target ${Math.round(tgtLinewidth)}pt → scale to ${r}\\linewidth`; + } else if (ratio <= 1.05) { + recommendedSpec = '\\linewidth'; + reason = 'source and target widths similar → \\linewidth is fine'; + } else { + // Source was wider (figure* in twocolumn or wide class) + // Scale down to avoid overflow; cap at \linewidth + recommendedSpec = '\\linewidth'; + reason = `source was wider (${Math.round(srcEffective)}pt) but capped at \\linewidth (${Math.round(tgtLinewidth)}pt)`; + } + } + + // Height check: will the figure be taller than 60% of the page? + let heightWarning = ''; + if (naturalSize && tgtLinewidth) { + // If using recommended width, what is the resulting height? 
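+ // Worked example with hypothetical numbers: a 300 × 500 pt PDF at \linewidth (396 pt) + // scales to 500 * 396 / 300 = 660 pt tall, which exceeds 0.60 * 650 = 390 pt, so the + // code below recomputes recommendedSpec (here to 0.54\linewidth) to keep the figure + // height near 55% of the page.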
+ let usedWidth = tgtLinewidth; + const ratioMatch = recommendedSpec.match(/([\d.]+)\\linewidth/); + if (ratioMatch) usedWidth = parseFloat(ratioMatch[1]) * tgtLinewidth; + const scaledHeight = naturalSize.heightPt * (usedWidth / naturalSize.widthPt); + const pageTextHeight = 650; // NeurIPS ≈ 650 pt + if (scaledHeight > 0.60 * pageTextHeight) { + const safeRatio = Math.round((0.55 * pageTextHeight / naturalSize.heightPt) * (naturalSize.widthPt / tgtLinewidth) * 100) / 100; + const capped = Math.min(safeRatio, 1.0); + recommendedSpec = `${capped}\\linewidth`; + heightWarning = `at full width figure would be ${Math.round(scaledHeight)}pt tall (${Math.round(scaledHeight / pageTextHeight * 100)}% of page) → reduced to ${capped}\\linewidth`; + reason = heightWarning; + } + } + + measurements.push({ + file: fig.file, + isStar: fig.isStar, + currentOpts: fig.opts, + naturalSizePt: naturalSize ? `${naturalSize.widthPt} × ${naturalSize.heightPt}` : 'unknown', + recommendedWidth: recommendedSpec, + reason, + }); + } + + return { + sourceLayout: srcLayout + ? `${sourceProfile.documentclass}, ${srcLayout.columns}-col, colwidth=${srcLayout.columnwidthPt}pt, textwidth=${srcLayout.textwidthPt}pt` + : `${sourceProfile?.documentclass || 'unknown'} (layout not in DB)`, + targetLayout: `neurips, 1-col, linewidth=${tgtLayout.columnwidthPt}pt`, + figures: measurements, + }; +} + +/* ------------------------------------------------------------------ */ +/* Node entry point */ +/* ------------------------------------------------------------------ */ + +export async function normalizeFigures(state) { + const root = state.workspaceRoot || state.targetProjectRoot; + const rel = state.targetMainFile; + const abs = safeJoin(root, rel); + const currentTex = await fs.readFile(abs, 'utf8'); + const virtualPath = rel.replace(/\\/g, '/'); + + const { endpoint, apiKey, model } = resolveLLMConfig(state.llmConfig); + const llm = new ChatOpenAI({ + modelName: model, + openAIApiKey: apiKey, + configuration: { baseURL: normalizeBaseURL(endpoint) }, + temperature: 0.2, + }); + + const handbook = formatNeuripsHandbookBlock(await loadNeuripsRulesFull()); + const diffInstr = mainTexDiffInstructions(virtualPath); + const lineCount = currentTex.split(/\r\n|\r|\n/).length; + const hasFigure = /\\begin\s*\{\s*figure\*?\s*\}/.test(currentTex); + const hasTable = /\\begin\s*\{\s*table\*?\s*\}/.test(currentTex); + + // ---- Measure figures and compute scaling recommendations ---- + const venue = state.transferIntake?.venue || 'neurips'; + const measurement = hasFigure + ? await measureAllFigures(currentTex, root, state.sourceProfile, venue) + : null; + + let figureScalingBlock = ''; + if (measurement && measurement.figures.length > 0) { + figureScalingBlock = ` +FIGURE SCALING REPORT (computed from source and target layouts — follow these): +Source layout: ${measurement.sourceLayout} +Target layout: ${measurement.targetLayout} + +${measurement.figures.map((f, i) => + ` Figure ${i + 1}: ${f.file} + Natural size: ${f.naturalSizePt} + Current opts: ${f.currentOpts || '(none)'} + → Recommended width: ${f.recommendedWidth} + Reason: ${f.reason}` +).join('\n')} + +IMPORTANT: Apply the recommended widths above to each \\includegraphics. +`; + } + + const basePrompt = `Adjust figures/tables/paths in this NeurIPS-bound LaTeX file. + +FILE_FACTS (read before writing any @@ hunk): +- CURRENT_FILE has exactly ${lineCount} lines (including blanks). @@ line numbers must stay within this range. 
+- Contains \\begin{figure} or \\begin{figure*}: ${hasFigure ? 'yes' : 'NO — do not invent figure environments or PDF names that are not in FILE'}. +- Contains \\begin{table}: ${hasTable ? 'yes' : 'NO — do not invent table environments'}. +${figureScalingBlock} +FLOAT PLACEMENT RULES: +- Use \\begin{figure}[htbp] (NOT just [t]) so LaTeX can place figures near their first reference. +- NEVER use \\begin{figure}[H] (requires extra package and forces bad page breaks). +- Convert figure* to figure (NeurIPS is single-column; figure* is unnecessary). + +USER_CONFIRMATIONS_JSON: +${JSON.stringify(state.userConfirmations || {})} + +FILE: +${currentTex} +${handbook} + +Rules: prefer single-column figure/table; fix \\includegraphics widths per FIGURE SCALING REPORT above; add \\graphicspath if needed; respect float policy in handbook. +Do NOT add substantive caption prose, "explain the figure", or editorial instructions inside \\caption{...} — only layout/path/float-type fixes per handbook. +If there are no figure/table environments in FILE, only change preamble (e.g. \\graphicspath, packages) or make no structural edits; never hallucinate missing floats. + +Figure/table patches: each float environment you change should usually be its own @@ hunk (or a short group of adjacent lines). Two figures are often separated by paragraphs, \\beq/\\eeq, or \\subsection — those lines stay in the file; do not skip them in the diff. Open CURRENT_FILE, locate each \\begin{figure}…\\end{figure} (or figure*) block you touch, and emit a hunk whose context includes only lines that really appear consecutively there.${diffInstr}`; + + const merged = await runLlmUnifiedDiffWithRetries({ + llm, + baseTex: currentTex, + buildPrompt: (failureNote) => basePrompt + (failureNote || ''), + nodeName: 'normalizeFigures', + phase: NeuripsPhase.figures, + maxAttempts: 3, + debug: { projectRoot: root, jobId: state.jobId }, + }); + + // ---- Deterministic post-processing (do NOT rely on LLM for these) ---- + let postProcessed = merged; + let postFixLog = []; + + // 1. Fix float placement: [t], [b], [!t], [!b] → [htbp] + // This is critical: [t]-only often pushes figures to the end of the document. + postProcessed = postProcessed.replace( + /\\begin\s*\{(figure|table)\*?\}\s*\[([^\]]*)\]/g, + (match, env, opts) => { + // Already has h or htbp — leave alone + if (/h/.test(opts) && /[tbp]/.test(opts)) return match; + const star = match.includes('*') ? '*' : ''; + postFixLog.push(`\\begin{${env}${star}}[${opts}] → [htbp]`); + return `\\begin{${env}${star}}[htbp]`; + }, + ); + + // 2. Convert figure* → figure ONLY for single-column venues (e.g. NeurIPS) + // Two-column venues (ICML, CVPR, ACL) NEED figure* for full-width figures. + const tgtLayout = LAYOUT_DB[(state.transferIntake?.venue || 'neurips').toLowerCase()] || LAYOUT_DB.neurips; + if (tgtLayout.columns === 1) { + postProcessed = postProcessed.replace(/\\begin\s*\{\s*figure\*\s*\}/g, (m) => { + postFixLog.push('figure* → figure (single-column venue)'); + return '\\begin{figure}'; + }); + postProcessed = postProcessed.replace(/\\end\s*\{\s*figure\*\s*\}/g, () => '\\end{figure}'); + } + + // 3. 
Apply recommended widths from measurement (deterministic, not LLM) + if (measurement && measurement.figures.length > 0) { + for (const fig of measurement.figures) { + if (fig.recommendedWidth && fig.recommendedWidth !== '\\linewidth') { + // Replace width=\linewidth or width=\columnwidth for this specific file + const fileEscaped = fig.file.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); + const widthRe = new RegExp( + `(\\\\includegraphics\\s*\\[(?:[^\\]]*?)width\\s*=\\s*)(?:\\\\linewidth|\\\\columnwidth|1(?:\\.0)?\\\\(?:linewidth|columnwidth))(([^\\]]*?)\\]\\s*\\{${fileEscaped}\\})`, + 'g', + ); + const before = postProcessed; + postProcessed = postProcessed.replace(widthRe, `$1${fig.recommendedWidth}$2`); + if (postProcessed !== before) { + postFixLog.push(`${fig.file}: width → ${fig.recommendedWidth}`); + } + } + } + } + + if (postFixLog.length > 0) { + // Log what deterministic fixes were applied + const logMsg = `[normalizeFigures] Deterministic post-fixes: ${postFixLog.join('; ')}`; + // We don't have pushLog here, but the progressUpdate message will carry the info + } + + await writeFileWithSnapshot(root, rel, postProcessed, state.jobId); + + return { + figureMeasurement: measurement, + ...progressUpdate( + 'normalizeFigures', + NeuripsPhase.figures, + `Normalized floats and graphics (${measurement?.figures.length || 0} figures measured, ${postFixLog.length} deterministic fixes: ${postFixLog.join('; ') || 'none'}).`, + ), + }; +} diff --git a/apps/backend/src/services/transferAgent/nodes/neurips/policyCheck.js b/apps/backend/src/services/transferAgent/nodes/neurips/policyCheck.js new file mode 100644 index 0000000..96092a3 --- /dev/null +++ b/apps/backend/src/services/transferAgent/nodes/neurips/policyCheck.js @@ -0,0 +1,111 @@ +import { promises as fs } from 'fs'; +import { safeJoin } from '../../../../utils/pathUtils.js'; +import { NeuripsPhase, progressUpdate } from '../../progressMeta.js'; + +/** + * Lightweight policy check + deterministic structure fixes (no LLM). + * + * Ensures NeurIPS structural rules that the LLM agent often gets wrong: + * 1. \input{checklist.tex} must be the LAST thing before \end{document} + * 2. \appendix + appendix content must come BEFORE checklist, not after + */ +export async function policyCheck(state) { + const root = state.workspaceRoot || state.targetProjectRoot; + const rel = state.targetMainFile; + const abs = safeJoin(root, rel); + let tex = ''; + try { + tex = await fs.readFile(abs, 'utf8'); + } catch { + return { + ...progressUpdate( + 'policyCheck', + NeuripsPhase.policy, + 'Could not read main.tex for policy check.', + 'error', + ), + }; + } + + const issues = []; + const fixes = []; + const venue = (state.transferIntake?.venue || 'neurips').toLowerCase(); + const isNeurips = venue === 'neurips'; + + // NeurIPS-specific: checklist checks + if (isNeurips) { + if (/\\answerTODO/.test(tex)) { + issues.push('found \\\\answerTODO (fill checklist)'); + } + } + + const hasChecklist = /\\input\s*\{\s*checklist(?:\.tex)?\s*\}/.test(tex) + || /\\include\s*\{\s*checklist(?:\.tex)?\s*\}/.test(tex); + if (isNeurips && !hasChecklist) { + issues.push('checklist.tex not \\input/include'); + } + + // ---- Deterministic fix: ensure checklist is LAST before \end{document} ---- + // This is a NeurIPS-specific requirement; other venues don't have mandatory checklist. 
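+ // Sketch of the reorder this block performs (for a typical wrong layout): + // before: ...body... \input{checklist.tex} \appendix ...appendix... \end{document} + // after: ...body... \appendix ...appendix... \newpage \input{checklist.tex} \end{document}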
+ if (isNeurips && hasChecklist) { + // Find the checklist \input line and \end{document} + const checklistRe = /^[ \t]*\\(?:input|include)\s*\{\s*checklist(?:\.tex)?\s*\}[ \t]*$/m; + const endDocRe = /^[ \t]*\\end\s*\{\s*document\s*\}[ \t]*$/m; + const checklistMatch = checklistRe.exec(tex); + const endDocMatch = endDocRe.exec(tex); + + if (checklistMatch && endDocMatch) { + const checklistPos = checklistMatch.index; + const endDocPos = endDocMatch.index; + + // Get everything between checklist and \end{document} + const afterChecklist = tex.slice( + checklistPos + checklistMatch[0].length, + endDocPos, + ).trim(); + + // If there's substantive content after checklist (appendix, \input, \section, etc.) + // that is NOT just whitespace/newpage, we need to reorder + const hasContentAfterChecklist = afterChecklist.length > 0 + && !/^[\s]*(?:\\newpage[\s]*)*$/.test(afterChecklist); + + if (hasContentAfterChecklist) { + // Extract the content that's wrongly after checklist + const contentAfterChecklist = afterChecklist; + + // Also grab any \newpage before checklist + const beforeChecklist = tex.slice(0, checklistPos); + const afterEndDoc = tex.slice(endDocPos); + + // Rebuild: beforeChecklist + movedContent + \newpage + checklist + \end{document} + const checklistLine = checklistMatch[0]; + + tex = beforeChecklist.trimEnd() + + '\n\n' + contentAfterChecklist.trim() + + '\n\n\\newpage\n' + checklistLine + '\n\n' + + afterEndDoc; + + fixes.push('moved appendix/content before checklist (checklist must be last before \\end{document})'); + } + } + } + + // Write back if fixes were applied + if (fixes.length > 0) { + try { + await fs.writeFile(abs, tex, 'utf8'); + } catch { + issues.push('failed to write structure fix'); + } + } + + const allNotes = [...issues, ...fixes]; + const level = issues.length ? 'warn' : 'info'; + const msg = allNotes.length + ? 
`Policy: ${allNotes.join('; ')}` + : 'Policy check: checklist present and correctly positioned, no answerTODO.'; + + return { + ...progressUpdate('policyCheck', NeuripsPhase.policy, msg, level), + }; +} diff --git a/apps/backend/src/services/transferAgent/nodes/neurips/prepareConfirmBlind.js b/apps/backend/src/services/transferAgent/nodes/neurips/prepareConfirmBlind.js new file mode 100644 index 0000000..464d41e --- /dev/null +++ b/apps/backend/src/services/transferAgent/nodes/neurips/prepareConfirmBlind.js @@ -0,0 +1,47 @@ +import { NeuripsPhase, progressUpdate } from '../../progressMeta.js'; + +export async function prepareConfirmBlind(state) { + const intake = state.transferIntake || {}; + if (intake.doubleBlind === false || intake.preprint) { + return { + pendingQA: null, + status: 'running', + ...progressUpdate( + 'prepareConfirmBlind', + NeuripsPhase.blind, + 'Skipped blind QA (preprint or non-double-blind).', + ), + }; + } + + const pendingQA = [ + { + id: 'anon_citations', + prompt: 'Could the reference list contain entries that identify you or your group and need to be anonymized or rewritten as third-person citations?', + type: 'single', + options: [ + 'Yes, anonymize in-text citations and the reference list as far as possible per the double-blind rules in neurips.md', + 'No, the source is already anonymous', + ], + }, + { + id: 'self_referential', + prompt: 'Does the body contain identifying links or phrases such as "our previous work", a project homepage, or GitHub URLs?', + type: 'single', + options: [ + 'Yes, rewrite them as anonymous wording or remove the links', + 'None, or already handled', + ], + }, + ]; + + return { + pendingQA, + status: 'waiting_confirm', + ...progressUpdate( + 'prepareConfirmBlind', + NeuripsPhase.blind_qa, + 'Blind compliance questions ready.', + ), + }; +} diff --git a/apps/backend/src/services/transferAgent/nodes/neurips/prepareConfirmPlan.js b/apps/backend/src/services/transferAgent/nodes/neurips/prepareConfirmPlan.js new file mode 100644 index 0000000..ebca88c --- /dev/null +++ b/apps/backend/src/services/transferAgent/nodes/neurips/prepareConfirmPlan.js @@ -0,0 +1,58 @@ +import { NeuripsPhase, progressUpdate } from '../../progressMeta.js'; + +/** + * Sets pending QA before consumeConfirmPlan (graph interrupts before consume).
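+ * Each pendingQA entry has the shape { id, prompt, type: 'single', options: [...] }; + * the frontend renders the questions and the answers come back keyed by id as + * state.userConfirmations via /api/transfer/submit-confirm (see consumeConfirmPlan).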
+ */ +export async function prepareConfirmPlan(state) { + const plan = state.transferPlan || {}; + const profile = state.sourceProfile || {}; + + const pendingQA = [ + { + id: 'float_strategy', + prompt: 'The source uses full-width floats (figure*/table*) or a custom float strategy. How should these be handled during migration?', + type: 'single', + options: [ + 'Convert to single-column NeurIPS figure/table (recommended)', + 'Keep the structure where possible; I will fix it manually later', + ], + }, + { + id: 'bibliography_strategy', + prompt: `Detected bibliography mechanism: ${profile.bibMechanism || 'unknown'}. Converge on the NeurIPS template default thebibliography / BibTeX path?`, + type: 'single', + options: [ + 'Yes, converge per the template and neurips.md', + 'No, keep the existing .bbl / biblatex structure and only apply minimal patches', + ], + }, + { + id: 'content_drop', + prompt: 'When the migration plan drops or merges sections, may subsections that cannot be mapped be deleted from the source?', + type: 'single', + options: [ + 'Never delete body text; merge unmappable content into the nearest section', + 'Allow rare drops per the plan (I will check after QA)', + ], + }, + ]; + + if (plan.notes) { + pendingQA.push({ + id: 'plan_notes_ack', + prompt: `Planner notes (please confirm you have read them):\n${plan.notes.slice(0, 1200)}`, + type: 'single', + options: ['Understood, continue', 'Pause; I will revise the source project first'], + }); + } + + return { + pendingQA, + status: 'waiting_confirm', + ...progressUpdate( + 'prepareConfirmPlan', + NeuripsPhase.qa_plan, + `Prepared ${pendingQA.length} confirmation question(s).`, + ), + }; +} diff --git a/apps/backend/src/services/transferAgent/nodes/neurips/sanitizeBlind.js b/apps/backend/src/services/transferAgent/nodes/neurips/sanitizeBlind.js new file mode 100644 index 0000000..400923c --- /dev/null +++ b/apps/backend/src/services/transferAgent/nodes/neurips/sanitizeBlind.js @@ -0,0 +1,71 @@ +import { promises as fs } from 'fs'; +import { ChatOpenAI } from '@langchain/openai'; +import { resolveLLMConfig, normalizeBaseURL } from '../../../llmService.js'; +import { safeJoin } from '../../../../utils/pathUtils.js'; +import { writeFileWithSnapshot } from '../../utils.js'; +import { loadNeuripsRulesFull, formatNeuripsHandbookBlock } from '../../neuripsRules.js'; +import { NeuripsPhase, progressUpdate } from '../../progressMeta.js'; +import { + mainTexDiffInstructions, + runLlmUnifiedDiffWithRetries, +} from '../../llmUnifiedDiff.js'; + +export async function sanitizeBlind(state) { + const root = state.workspaceRoot || state.targetProjectRoot; + const rel = state.targetMainFile; + const abs = safeJoin(root, rel); + const currentTex = await fs.readFile(abs, 'utf8'); + const virtualPath = rel.replace(/\\/g, '/'); + + const intake = state.transferIntake || {}; + if (intake.doubleBlind === false || intake.preprint) { + return { + ...progressUpdate( + 'sanitizeBlind', + NeuripsPhase.blind, + 'Skipped anonymization (preprint or non-double-blind).', + ), + }; + } + + const { endpoint, apiKey, model } = resolveLLMConfig(state.llmConfig); + const llm = new ChatOpenAI({ + modelName: model, + openAIApiKey: apiKey, + configuration: { baseURL: normalizeBaseURL(endpoint) }, + temperature: 0.2, + }); + + const handbook = formatNeuripsHandbookBlock(await loadNeuripsRulesFull()); + const diffInstr = mainTexDiffInstructions(virtualPath); + const basePrompt = `Apply double-blind / PDF metadata sanitization for NeurIPS anonymous submission.
+ +BLIND_QA_ANSWERS_JSON: +${JSON.stringify(state.userConfirmations || {})} + +FILE: +${currentTex} +${handbook} + +Ensure \\\\hypersetup{pdfauthor={}} (or equivalent), remove identifying URLs in text if required by answers, anonymize self-citations per handbook.${diffInstr}`; + + const merged = await runLlmUnifiedDiffWithRetries({ + llm, + baseTex: currentTex, + buildPrompt: (failureNote) => basePrompt + (failureNote || ''), + nodeName: 'sanitizeBlind', + phase: NeuripsPhase.blind, + maxAttempts: 3, + debug: { projectRoot: root, jobId: state.jobId }, + }); + + await writeFileWithSnapshot(root, rel, merged, state.jobId); + + return { + ...progressUpdate( + 'sanitizeBlind', + NeuripsPhase.blind, + `Blind sanitization pass (unified diff applied, ${merged.length} chars).`, + ), + }; +} diff --git a/apps/backend/src/services/transferAgent/nodes/neurips/verifyBuild.js b/apps/backend/src/services/transferAgent/nodes/neurips/verifyBuild.js new file mode 100644 index 0000000..49d7651 --- /dev/null +++ b/apps/backend/src/services/transferAgent/nodes/neurips/verifyBuild.js @@ -0,0 +1,35 @@ +import { NeuripsPhase, progressUpdate } from '../../progressMeta.js'; + +const BAD_PATTERNS = [ + /undefined references/i, + /Citation.*undefined/i, + /There were undefined citations/i, + /^! LaTeX Error/m, + /Fatal error/i, +]; + +/** + * Post-compile log gate (compile may exit 0 with residual issues). + */ +export async function verifyBuild(state) { + const log = state.compileResult?.log || ''; + let hit = ''; + for (const re of BAD_PATTERNS) { + if (re.test(log)) { + hit = re.source; + break; + } + } + + const ok = !hit; + return { + verifyBuildResult: { ok, pattern: hit || null }, + buildFailureReason: ok ? '' : `verifyBuild: log matched ${hit}`, + ...progressUpdate( + 'verifyBuild', + NeuripsPhase.verify, + ok ? 'Log check passed (no fatal/undefined patterns).' : `Log check FAILED (${hit}).`, + ok ? 'info' : 'warn', + ), + }; +} diff --git a/apps/backend/src/services/transferAgent/nodes/parsePdfWithMineru.js b/apps/backend/src/services/transferAgent/nodes/parsePdfWithMineru.js index b5a67a4..d941623 100644 --- a/apps/backend/src/services/transferAgent/nodes/parsePdfWithMineru.js +++ b/apps/backend/src/services/transferAgent/nodes/parsePdfWithMineru.js @@ -1,31 +1,73 @@ +import { promises as fs } from 'fs'; import path from 'path'; -import { parsePdfWithMineru as callMineru } from '../../mineruService.js'; +import { parsePdfWithMineru as callMineru, resolveMineruConfig } from '../../mineruService.js'; import { ensureDir } from '../../../utils/fsUtils.js'; import { getProjectRoot } from '../../projectService.js'; +import { cropMineruImagesFromContentList } from '../../mineruContentListCrop.js'; +import { applyMineruRasterToPdf } from '../../mineruRasterToPdf.js'; /** * parsePdfWithMineru node — calls MinerU API to parse the source PDF - * into Markdown + images. + * into Markdown + images. Optional: bbox crop from source PDF, raster → single-page PDF. */ export async function parsePdfWithMineru(state) { const targetProjectRoot = state.targetProjectRoot || await getProjectRoot(state.targetProjectId); const outputDir = path.join(targetProjectRoot, '_mineru_output'); await ensureDir(outputDir); + // Merge env defaults (OPENPRISM_MINERU_RASTER_TO_PDF, etc.) — same as download step uses. 
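+ // Hypothetical illustration: state.mineruConfig = { bboxCrop: true } with + // OPENPRISM_MINERU_RASTER_TO_PDF=1 in the env would resolve to something like + // { bboxCrop: true, rasterToPdf: true, imageScale: 1, ... }; the exact defaults + // live in resolveMineruConfig.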
+  const config = resolveMineruConfig(state.mineruConfig || {});
+
   const result = await callMineru(
     state.sourcePdfPath,
-    state.mineruConfig,
+    config,
     outputDir,
   );
 
-  const mdLen = (result.markdownContent || '').length;
-  const imgCount = (result.images || []).length;
+  let { markdownContent, images, searchDir, markdownPath } = result;
+
+  const cropRes = await cropMineruImagesFromContentList({
+    sourcePdfPath: state.sourcePdfPath,
+    searchDir,
+    images,
+    mineruConfig: config,
+  });
+  images = cropRes.images;
+
+  const pdfRes = await applyMineruRasterToPdf({
+    markdownContent,
+    images,
+    searchDir,
+    mineruConfig: config,
+  });
+  markdownContent = pdfRes.markdownContent;
+  images = pdfRes.images;
+
+  await fs.writeFile(markdownPath, markdownContent, 'utf8');
+
+  const mdLen = markdownContent.length;
+  const imgCount = images.length;
+
+  const extras = [];
+  if (cropRes.diagnostics) extras.push(cropRes.diagnostics);
+  if (pdfRes.diagnostics) extras.push(pdfRes.diagnostics);
+  if (!config.rasterToPdf && config.imageScale > 1) {
+    extras.push(`imageScale=${config.imageScale} ignored until rasterToPdf is enabled.`);
+  }
+
+  let progressLog = `[parsePdfWithMineru] Parsed PDF: ${mdLen} chars markdown, ${imgCount} images.`;
+  if (extras.length) {
+    progressLog += ` ${extras.join(' | ')}`;
+  }
+  if (config.bboxCrop || config.rasterToPdf) {
+    progressLog += ' Toggle OPENPRISM_MINERU_BBOX_CROP / OPENPRISM_MINERU_RASTER_TO_PDF to compare output.';
+  }
 
   return {
-    sourceMarkdown: result.markdownContent,
-    sourceImages: result.images || [],
+    sourceMarkdown: markdownContent,
+    sourceImages: images,
     targetProjectRoot,
     mineruOutputDir: outputDir,
-    progressLog: `[parsePdfWithMineru] Parsed PDF: ${mdLen} chars markdown, ${imgCount} images.`,
+    progressLog,
   };
 }
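`applyMineruRasterToPdf` itself is not part of this diff, but the idea it names (wrap each raster figure in a single-page PDF sized to the image, so LaTeX embeds a fixed-size PDF asset) is easy to picture. A minimal sketch, assuming `pdf-lib` is available and inputs are PNGs; the helper name and call shape are illustrative, not the actual module API:

```js
import { promises as fs } from 'fs';
import { PDFDocument } from 'pdf-lib';

// Illustrative only: wrap one PNG into a single-page PDF whose page size
// matches the image, so \includegraphics gets a PDF instead of a raster file.
export async function rasterPngToSinglePagePdf(pngPath, pdfPath) {
  const doc = await PDFDocument.create();
  const png = await doc.embedPng(await fs.readFile(pngPath));
  const page = doc.addPage([png.width, png.height]);
  page.drawImage(png, { x: 0, y: 0, width: png.width, height: png.height });
  await fs.writeFile(pdfPath, await doc.save());
  return pdfPath;
}
```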
diff --git a/apps/backend/src/services/transferAgent/nodes/ruleBaseTransferConvert.js b/apps/backend/src/services/transferAgent/nodes/ruleBaseTransferConvert.js
new file mode 100644
index 0000000..86f7edd
--- /dev/null
+++ b/apps/backend/src/services/transferAgent/nodes/ruleBaseTransferConvert.js
@@ -0,0 +1,59 @@
+import path from 'path';
+import { NeuripsPhase, progressUpdate } from '../progressMeta.js';
+import { runConversion } from '../rulebasetransfer/pipeline.js';
+import { DATA_DIR, TEMPLATE_DIR } from '../../../config/constants.js';
+
+export async function ruleBaseTransferConvert(state) {
+  const sourceDir = path.join(DATA_DIR, state.sourceProjectId);
+  const targetTemplateDir = path.join(TEMPLATE_DIR, state.targetTemplateId);
+  const outputDir = path.join(DATA_DIR, state.targetProjectId);
+  const outputMainName = state.targetMainFile || 'main.tex';
+
+  let result;
+  try {
+    result = await runConversion({
+      sourceDir,
+      targetTemplateDir,
+      outputDir,
+      outputMainName,
+    });
+  } catch (err) {
+    const message = err?.message || String(err || 'Rule-based transfer conversion failed');
+    return {
+      error: message,
+      status: 'failed',
+      ...progressUpdate(
+        'ruleBaseTransferConvert',
+        NeuripsPhase.body,
+        `Rule-based transfer conversion failed: ${message}`,
+        'error',
+      ),
+    };
+  }
+
+  const relativeMainTex = path.relative(outputDir, result.mainTex);
+  const summaryParts = [
+    `target=${result.targetKind}`,
+    `main=${relativeMainTex}`,
+    result.warnings.length ? `warnings=${result.warnings.length}` : 'warnings=0',
+  ];
+
+  const extraEntries = result.warnings.map((message) => ({
+    node: 'ruleBaseTransferConvert',
+    level: 'warn',
+    message,
+    ts: Date.now(),
+  }));
+
+  const base = progressUpdate(
+    'ruleBaseTransferConvert',
+    NeuripsPhase.body,
+    `Rule-based transfer conversion completed (${summaryParts.join(', ')})`,
+    'info',
+  );
+
+  return {
+    ...base,
+    progressLogEntries: [...base.progressLogEntries, ...extraEntries],
+  };
+}
diff --git a/apps/backend/src/services/transferAgent/progressMeta.js b/apps/backend/src/services/transferAgent/progressMeta.js
new file mode 100644
index 0000000..e188d63
--- /dev/null
+++ b/apps/backend/src/services/transferAgent/progressMeta.js
@@ -0,0 +1,36 @@
+/**
+ * Stable phase ids for NeurIPS UI timeline (map nodes → phase).
+ */
+export const NeuripsPhase = {
+  intake: 'intake',
+  source_analysis: 'source_analysis',
+  migration_plan: 'migration_plan',
+  qa_plan: 'qa_plan',
+  preamble: 'preamble',
+  body: 'body',
+  figures: 'figures',
+  assets: 'assets',
+  bibliography: 'bibliography',
+  blind_qa: 'blind_qa',
+  blind: 'blind',
+  compile: 'compile',
+  verify: 'verify',
+  policy: 'policy',
+  finalize: 'finalize',
+  layout: 'layout',
+  // --- Agent loop phases ---
+  agent_planning: 'agent_planning',
+  agent_generating: 'agent_generating',
+  agent_reviewing: 'agent_reviewing',
+};
+
+export function progressUpdate(node, phase, message, level = 'info') {
+  return {
+    lastCompletedNode: node,
+    currentPhase: phase,
+    interruptedBeforeNode: '',
+    completedNodes: [node],
+    progressLog: `[${node}] ${message}`,
+    progressLogEntries: [{ node, level, message, ts: Date.now() }],
+  };
+}
diff --git a/apps/backend/src/services/transferAgent/rulebasetransfer/latex.js b/apps/backend/src/services/transferAgent/rulebasetransfer/latex.js
new file mode 100644
index 0000000..b860f18
--- /dev/null
+++ b/apps/backend/src/services/transferAgent/rulebasetransfer/latex.js
@@ -0,0 +1,1008 @@
+import { promises as fs } from 'fs';
+import path from 'path';
+import { getTemplateConfig } from './templateConfigs.js';
+function reEscape(s) {
+    return String(s).replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
+}
+function subn(text, regex, replacement) {
+    let count = 0;
+    const out = text.replace(regex, (...args) => {
+        count += 1;
+        if (typeof replacement === 'function') {
+            return replacement(...args);
+        }
+        return replacement;
+    });
+    return { text: out, count };
+}
+async function rglob(rootDir) {
+    const results = [];
+    async function walk(dir) {
+        let entries;
+        try {
+            entries = await fs.readdir(dir, { withFileTypes: true });
+        }
+        catch {
+            return;
+        }
+        for (const entry of entries) {
+            const full = path.join(dir, entry.name);
+            if (entry.isDirectory()) {
+                results.push({ path: full, isDir: true, isFile: false });
+                await walk(full);
+            }
+            else if (entry.isFile()) {
+                results.push({ path: full, isDir: false, isFile: true });
+            }
+        }
+    }
+    await walk(rootDir);
+    return results;
+}
+const DOCUMENT_REGEX = /(?<prefix>[\s\S]*?)(?<begin>\\begin\{document\})(?<document>[\s\S]*?)(?<end>\\end\{document\})/;
+const LAYOUT_LENGTH_NAMES = [
+    'pdfpagewidth',
+    'pdfpageheight',
+    'paperwidth',
+    'paperheight',
+    'textwidth',
+    'textheight',
+    'columnsep',
+    'oddsidemargin',
+    'evensidemargin',
+    'topmargin',
+    'headheight',
+    'headsep',
+    'footskip',
+    'parindent',
+    'parskip',
+    'floatsep',
+    'textfloatsep',
+    'intextsep',
+];
+export async function discoverMainTex(rootDir, preferredNames = []) {
+    for (const preferred of preferredNames) {
+        const matches = [];
+        for (const entry of await
rglob(rootDir)) { + if (entry.isFile && path.basename(entry.path) === preferred) { + matches.push(entry.path); + } + } + matches.sort(); + if (matches.length > 0) + return matches[0]; + } + const texFiles = []; + for (const entry of await rglob(rootDir)) { + if (entry.isFile && entry.path.toLowerCase().endsWith('.tex')) { + texFiles.push(entry.path); + } + } + texFiles.sort(); + if (texFiles.length === 0) { + throw new Error(`No .tex files found under ${rootDir}`); + } + let bestFile = null; + let bestScore = null; + for (const candidate of texFiles) { + const text = await loadText(candidate); + if (!text.includes('\\documentclass') || !text.includes('\\begin{document}')) { + continue; + } + let score = 0; + if (text.includes('\\title')) + score += 50; + if (text.includes('\\begin{abstract}')) + score += 30; + if (text.includes('\\bibliography') || text.includes('\\printbibliography')) + score += 20; + score += Math.min(Math.floor(text.length / 500), 100); + if (bestScore === null || score > bestScore) { + bestScore = score; + bestFile = candidate; + } + } + if (!bestFile) { + throw new Error(`Unable to identify main LaTeX file under ${rootDir}`); + } + return bestFile; +} +export async function loadText(filePath) { + return fs.readFile(filePath, 'utf8'); +} +export function maskComments(text) { + const out = []; + const lines = text.split(/(\r\n|\n|\r)/); + for (let i = 0; i < lines.length; i += 2) { + const content = lines[i]; + const sep = lines[i + 1] || ''; + if (content === undefined) + continue; + const chars = content.split(''); + let escaped = false; + for (let j = 0; j < chars.length; j += 1) { + if (escaped) { + escaped = false; + continue; + } + if (chars[j] === '\\') { + escaped = true; + continue; + } + if (chars[j] === '%') { + for (let k = j; k < chars.length; k += 1) { + if (chars[k] !== '\n' && chars[k] !== '\r') + chars[k] = ' '; + } + break; + } + } + out.push(chars.join('')); + out.push(sep); + } + return out.join(''); +} +export function extractDocumentclass(text) { + const match = text.match(/^\s*\\documentclass(?:\[[^\]]*\])?\{[^}]+\}/m); + if (!match) { + throw new Error('Missing \\documentclass in target template'); + } + return match[0].trim(); +} +export async function detectProjectKindFromText(text, rootDir = null) { + const masked = maskComments(text); + const packageMarkers = [ + ['acl', /\\usepackage(?:\[[^\]]*\])?\{acl\}/], + ['neurips', /\\usepackage(?:\[[^\]]*\])?\{neurips(?:_[0-9]{4})?\}/], + ['icml', /\\usepackage(?:\[[^\]]*\])?\{icml2026\}/], + ['iclr', /\\usepackage(?:\[[^\]]*\])?\{iclr2026_conference\}/], + ['cvpr', /\\usepackage(?:\[[^\]]*\])?\{cvpr\}/], + ['aaai', /\\usepackage(?:\[[^\]]*\])?\{aaai2026\}/], + ]; + for (const [kind, pattern] of packageMarkers) { + if (pattern.test(masked)) + return kind; + } + return rootDir !== null ? 
detectTargetKind(rootDir) : 'generic'; +} +export function stripDocumentclass(preamble) { + return preamble.replace(/^\s*\\documentclass(?:\[[^\]]*\])?\{[^}]+\}\s*/m, '').trim(); +} +export function stripTitleAuthorBlocks(preamble) { + let cleaned = stripDocumentclass(preamble); + for (const macro of getTemplateConfig('generic').stripMacros) { + for (;;) { + const block = extractMacroBlock(cleaned, macro); + if (!block) + break; + cleaned = cleaned.replace(block, ''); + } + } + return cleaned.trim(); +} +export function stripTemplateMacros(preamble, targetKind) { + let cleaned = stripDocumentclass(preamble); + const macroNames = getTemplateConfig(targetKind).stripMacros; + for (const macro of macroNames) { + for (;;) { + const block = extractMacroBlock(cleaned, macro); + if (!block) + break; + cleaned = cleaned.replace(block, ''); + } + } + return cleaned.trim(); +} +export function splitDocument(text) { + const match = text.match(DOCUMENT_REGEX); + if (!match) { + throw new Error('Invalid LaTeX file: missing document environment'); + } + return [match.groups.prefix, match.groups.document]; +} +export function extractMacroBlock(text, macroName) { + const masked = maskComments(text); + const pattern = new RegExp(`\\\\${reEscape(macroName)}(?:\\s*\\[[^\\]]*\\])?\\s*\\{`, 'gm'); + const match = pattern.exec(masked); + if (!match) + return null; + const openBrace = masked.indexOf('{', match.index); + if (openBrace === -1) + return null; + const endIndex = findMatchingBrace(masked, openBrace); + return text.slice(match.index, endIndex + 1); +} +export function extractEnvironment(text, envName) { + const masked = maskComments(text); + const beginRe = new RegExp(`\\\\begin\\{${reEscape(envName)}\\}`); + const beginMatch = masked.match(beginRe); + if (!beginMatch) + return null; + const beginStart = beginMatch.index; + const beginEnd = beginStart + beginMatch[0].length; + const endRe = new RegExp(`\\\\end\\{${reEscape(envName)}\\}`); + const tail = masked.slice(beginEnd); + const endMatch = tail.match(endRe); + if (!endMatch) + return null; + const blockEnd = beginEnd + endMatch.index + endMatch[0].length; + const block = text.slice(beginStart, blockEnd); + const inner = text.slice(beginEnd, beginEnd + endMatch.index).trim(); + return [block, inner]; +} +export function findMatchingBrace(text, openBrace) { + let depth = 0; + let escaped = false; + for (let i = openBrace; i < text.length; i += 1) { + const ch = text[i]; + if (escaped) { + escaped = false; + continue; + } + if (ch === '\\') { + escaped = true; + continue; + } + if (ch === '{') + depth += 1; + else if (ch === '}') { + depth -= 1; + if (depth === 0) + return i; + } + } + throw new Error('Unbalanced braces in LaTeX content'); +} +export function findMatchingBracket(text, openBracket) { + let depth = 0; + let escaped = false; + for (let i = openBracket; i < text.length; i += 1) { + const ch = text[i]; + if (escaped) { + escaped = false; + continue; + } + if (ch === '\\') { + escaped = true; + continue; + } + if (ch === '[') + depth += 1; + else if (ch === ']') { + depth -= 1; + if (depth === 0) + return i; + } + } + throw new Error('Unbalanced brackets in LaTeX content'); +} +export function parseUsepackageLines(preamble) { + const packages = []; + const remainingLines = []; + const packageRe = /^\s*\\usepackage(?:\[([^\]]*)\])?\{([^}]*)\}/; + for (const line of preamble.split('\n')) { + const match = line.match(packageRe); + if (!match) { + remainingLines.push(line); + continue; + } + const options = (match[1] || '') + .split(',') + 
.map((p) => p.trim()) + .filter((p) => p.length > 0); + const names = match[2] + .split(',') + .map((n) => n.trim()) + .filter((n) => n.length > 0); + for (const name of names) { + packages.push({ name, options, raw: line.replace(/\s+$/, '') }); + } + } + return { packages, remaining: remainingLines.join('\n').trim() }; +} +export function mergePreambles(targetPreamble, sourcePreamble, targetKind = 'generic') { + const sourcePreambleNoComments = maskComments(sourcePreamble); + const cleanedSource = stripTitleAuthorBlocks(sourcePreambleNoComments); + const { packages: sourcePackages, remaining: sourceOther } = parseUsepackageLines(cleanedSource); + const { packages: targetPackages } = parseUsepackageLines(stripTemplateMacros(targetPreamble, targetKind)); + const skipPackages = new Set([ + 'acl', + 'neurips_2026', + 'icml2026', + 'iclr2026_conference', + 'cvpr', + 'aaai2026', + ]); + for (const p of getTemplateConfig(targetKind).skipPackages) + skipPackages.add(p); + const targetPackageOptions = new Map(); + for (const pkg of targetPackages) { + targetPackageOptions.set(pkg.name, new Set(pkg.options)); + } + const passOptions = {}; + const addedPackages = []; + const addedSeen = new Set(); + for (const pkg of sourcePackages) { + if (skipPackages.has(pkg.name)) + continue; + if (targetKind === 'neurips' && isNeuripsStylePackage(pkg.name)) + continue; + if (targetPackageOptions.has(pkg.name)) { + const already = targetPackageOptions.get(pkg.name); + const missing = pkg.options.filter((opt) => !already.has(opt)); + if (missing.length > 0) { + if (!passOptions[pkg.name]) + passOptions[pkg.name] = new Set(); + for (const opt of missing) + passOptions[pkg.name].add(opt); + } + continue; + } + if (addedSeen.has(pkg.name)) + continue; + const optionsPrefix = pkg.options.length ? `[${pkg.options.join(',')}]` : ''; + addedPackages.push(`\\usepackage${optionsPrefix}{${pkg.name}}`); + addedSeen.add(pkg.name); + } + const additions = []; + if (addedPackages.length) { + additions.push('% Added from the source project to preserve paper content'); + additions.push(...addedPackages); + } + const keepTemplateLineNumbers = templateWantsLineNumbers(targetPreamble, targetKind); + const normalizedSourceOther = keepTemplateLineNumbers + ? 
stripSourceLineNumberDisablers(sourceOther) + : sourceOther; + if (normalizedSourceOther.trim()) { + additions.push('% Source project macros and local configuration'); + additions.push(normalizedSourceOther.trim()); + } + let merged = targetPreamble.trim(); + if (additions.length) { + merged = `${merged}\n\n${additions.join('\n').trim()}`; + } + return [merged.trim(), passOptions]; +} +export function splitAppendix(documentBody) { + const masked = maskComments(documentBody); + const marker = masked.match(/(^|\n)\s*\\appendix\b/); + if (!marker) + return [documentBody.trim(), '']; + const start = marker.index; + return [documentBody.slice(0, start).trim(), documentBody.slice(start).trim()]; +} +export function splitBibliography(documentBody) { + const masked = maskComments(documentBody); + const patterns = [ + /\\bibliographystyle(?:\s*\[[^\]]*\])?\s*\{/, + /\\bibliography(?:\s*\[[^\]]*\])?\s*\{/, + /\\printbibliography\b/, + /\\begin\{thebibliography\}/, + ]; + const locations = []; + for (const p of patterns) { + const m = masked.match(p); + if (m) + locations.push(m.index); + } + if (locations.length === 0) + return [documentBody.trim(), '']; + const start = Math.min(...locations); + let prefix = documentBody.slice(0, start); + let bibliographyBody = documentBody.slice(start).trim(); + const groupMatch = prefix.match(/\{\s*\\small\s*$/s); + if (groupMatch) { + prefix = prefix.slice(0, groupMatch.index); + if (bibliographyBody.endsWith('}')) { + bibliographyBody = bibliographyBody.slice(0, -1).trim(); + } + } + return [prefix.replace(/\s+$/, ''), bibliographyBody]; +} +export function extractBibliographyStyle(text) { + const block = extractMacroBlock(text, 'bibliographystyle'); + if (!block) + return [text.trim(), '']; + return [text.replace(block, '').trim(), block.trim()]; +} +export async function stripFrontmatter(documentBody, sourceKind, mainTex) { + let body = documentBody; + let abstractText = ''; + if (sourceKind === 'icml') + body = stripIcmlFrontmatter(body); + if (sourceKind === 'neurips') + body = stripNeuripsArtifacts(body); + if (sourceKind === 'cvpr') + body = stripCvprFrontmatter(body); + const abstractEnv = extractEnvironment(body, 'abstract'); + if (abstractEnv) { + const [block, inner] = abstractEnv; + abstractText = inner; + body = body.replace(block, ''); + } + else if (sourceKind === 'cvpr') { + const [extracted, newBody] = await extractCvprAbstractFromInputs(body, mainTex); + abstractText = extracted; + body = newBody; + } + body = body.replace(/^\s*\\maketitle\s*/, ''); + body = body.trim(); + return [abstractText.trim(), body]; +} +export function stripIcmlFrontmatter(body) { + const masked = maskComments(body); + const start = masked.indexOf('\\twocolumn['); + let out = body; + if (start !== -1) { + const openBracket = masked.indexOf('[', start); + if (openBracket !== -1) { + const closeBracket = findMatchingBracket(masked, openBracket); + out = body.slice(0, start) + body.slice(closeBracket + 1); + } + } + out = out.replace(/^\s*\\printAffiliationsAndNotice\s*\{[\s\S]*?\}\s*/m, ''); + return out; +} +export function stripNeuripsArtifacts(body) { + let out = body.replace(/^\s*\\newpage\s*\n\s*\\input\{checklist\.tex\}\s*$/gm, ''); + out = out.replace(/^\s*\\input\{checklist\.tex\}\s*$/gm, ''); + return out; +} +export function stripCvprFrontmatter(body) { + let out = unwrapCvprTeaserBlock(body); + out = out.replace(/^\s*\\maketitle\s*$/gm, ''); + out = out.replace(/^\s*\\renewcommand\\twocolumn\[1\]\[\]\{#1\}\s*%?\s*$/gm, ''); + for (;;) { + const footnoteBlock = 
extractMacroBlock(out, 'blfootnote');
+        if (!footnoteBlock)
+            break;
+        out = out.replace(footnoteBlock, '');
+    }
+    return out.trim();
+}
+export function unwrapCvprTeaserBlock(body) {
+    const masked = maskComments(body);
+    const start = masked.indexOf('\\twocolumn[');
+    if (start === -1)
+        return body;
+    const openBracket = masked.indexOf('[', start);
+    if (openBracket === -1)
+        return body;
+    const closeBracket = findMatchingBracket(masked, openBracket);
+    let inner = body.slice(openBracket + 1, closeBracket);
+    inner = inner.replace(/^\s*\{?%\s*/, '');
+    inner = inner.replace(/^\s*\\renewcommand\\twocolumn\[1\]\[\]\{#1\}\s*/, '');
+    inner = inner.replace(/^\s*\\maketitle\s*/, '');
+    inner = inner.trim().replace(/\}\s*$/, '');
+    inner = inner.trim();
+    const rebuilt = `${inner}\n\n${body.slice(closeBracket + 1)}`;
+    return rebuilt.trim();
+}
+export async function extractCvprAbstractFromInputs(body, mainTex) {
+    const masked = maskComments(body);
+    const patterns = [
+        /^\s*\\input\{(?<path>[^}]*abstract[^}]*)\}\s*$/m,
+        /^\s*\\input\{(?<path>sec\/0_abstract)\}\s*$/m,
+    ];
+    for (const pattern of patterns) {
+        const match = masked.match(pattern);
+        if (!match)
+            continue;
+        const inputPath = await resolveTexInputPath(path.dirname(mainTex), match.groups.path);
+        let abstractText = '';
+        if (inputPath) {
+            abstractText = (await loadText(inputPath)).trim();
+            const abstractEnv = extractEnvironment(abstractText, 'abstract');
+            if (abstractEnv) {
+                abstractText = abstractEnv[1];
+            }
+        }
+        const newBody = body.slice(0, match.index) + body.slice(match.index + match[0].length);
+        return [abstractText.trim(), newBody];
+    }
+    return ['', body];
+}
+export async function extractCvprAppendixFromInputs(body, mainTex) {
+    const masked = maskComments(body);
+    const patterns = [
+        /^\s*\\input\{(?<path>[^}]*suppl[^}]*)\}\s*$/m,
+        /^\s*\\input\{(?<path>[^}]*appendix[^}]*)\}\s*$/m,
+        /^\s*\\input\{(?<path>sec\/X_suppl)\}\s*$/m,
+    ];
+    for (const pattern of patterns) {
+        const match = masked.match(pattern);
+        if (!match)
+            continue;
+        const inputPath = await resolveTexInputPath(path.dirname(mainTex), match.groups.path);
+        let appendixText = '';
+        if (inputPath) {
+            appendixText = (await loadText(inputPath)).trim();
+        }
+        appendixText = stripCvprAppendixFrontmatter(appendixText);
+        const newBody = body.slice(0, match.index) + body.slice(match.index + match[0].length);
+        return [appendixText, newBody];
+    }
+    return ['', body];
+}
+export function stripCvprAppendixFrontmatter(text) {
+    let cleaned = text.trim();
+    cleaned = cleaned.replace(/^\s*\\clearpage\s*/gm, '');
+    cleaned = cleaned.replace(/^\s*\\setcounter\{page\}\{[^}]*\}\s*/gm, '');
+    cleaned = cleaned.replace(/^\s*\\maketitlesupplementary\s*/gm, '');
+    return cleaned.trim();
+}
+export async function resolveTexInputPath(baseDir, relativePath) {
+    const candidates = [path.join(baseDir, relativePath)];
+    if (!relativePath.endsWith('.tex')) {
+        candidates.push(path.join(baseDir, `${relativePath}.tex`));
+    }
+    for (const candidate of candidates) {
+        try {
+            await fs.access(candidate);
+            return candidate;
+        }
+        catch { }
+    }
+    return null;
+}
+export async function buildSourceRepresentation(rootDir) {
+    const mainTex = await discoverMainTex(rootDir, ['resubmitted.tex', 'main.tex']);
+    const text = await loadText(mainTex);
+    const [preamble, documentBody] = splitDocument(text);
+    const sourceKind = await detectProjectKindFromText(preamble, rootDir);
+    let sourcePreamble = extractPreservedSourcePreamble(maskComments(preamble));
+    sourcePreamble =
stripAlgorithmCompatibilityLines(stripTemplateMacros(sourcePreamble, sourceKind)); + sourcePreamble = stripSourceTemplatePackages(sourcePreamble, sourceKind); + sourcePreamble = stripSourceTemplateCommands(sourcePreamble, sourceKind); + sourcePreamble = stripAutoresubmitInjectedBlocks(sourcePreamble); + const [sourcePreambleCleaned, layoutWarnings] = stripLayoutModifyingCommands(sourcePreamble); + sourcePreamble = sourcePreambleCleaned; + const warnings = [...layoutWarnings]; + const titleBlock = extractMacroBlock(preamble, 'title') || '\\title{Untitled Submission}'; + const authorBlock = extractMacroBlock(preamble, 'author') || '\\author{}'; + const dateBlock = extractMacroBlock(preamble, 'date') || '\\date{}'; + if (authorBlock === '\\author{}') { + warnings.push('Source paper does not define \\author; generated output keeps authors blank.'); + } + const [abstract, bodyWithoutFrontmatter] = await stripFrontmatter(documentBody, sourceKind, mainTex); + if (!abstract) { + warnings.push('Source paper does not contain an abstract environment.'); + } + let mainPlusBib; + let appendix; + [mainPlusBib, appendix] = splitAppendix(bodyWithoutFrontmatter); + if (sourceKind === 'cvpr' && !appendix) { + const [cvprAppendix, newMainPlusBib] = await extractCvprAppendixFromInputs(mainPlusBib, mainTex); + appendix = cvprAppendix; + mainPlusBib = newMainPlusBib; + } + let [mainBody, bibliographyBlock] = splitBibliography(mainPlusBib); + const [bibliographyBlockCleaned, bibliographyStyle] = extractBibliographyStyle(bibliographyBlock); + bibliographyBlock = bibliographyBlockCleaned; + const combinedText = [abstract, mainBody, appendix].filter((s) => s.trim()).join('\n'); + sourcePreamble = addInferredSourcePackages(sourcePreamble, combinedText); + return { + mainTex, + sourcePreamble: sourcePreamble.trim(), + titleBlock: titleBlock.trim(), + authorBlock: authorBlock.trim(), + dateBlock: dateBlock.trim(), + abstract: abstract.trim(), + mainBody: mainBody.trim(), + bibliographyStyle: bibliographyStyle.trim(), + bibliographyBlock: bibliographyBlock.trim(), + appendixBody: appendix.trim(), + warnings, + }; +} +export async function buildTargetRepresentation(rootDir) { + const targetKind = await detectTargetKind(rootDir); + const preferredNames = getTemplateConfig(targetKind).preferredMainNames; + const mainTex = await discoverMainTex(rootDir, preferredNames); + const text = await loadText(mainTex); + const [preamble] = splitDocument(text); + const documentclass = extractDocumentclass(text); + const normalizedPreamble = normalizeTargetSubmissionMode(stripTemplateMacros(preamble, targetKind), targetKind); + return { mainTex, documentclass, targetPreamble: normalizedPreamble, targetKind }; +} +export function renderMergedTex(project, includeChecklist) { + if (project.targetKind === 'icml') + return renderIcmlMergedTex(project); + if (project.targetKind === 'aaai') + return renderAaaiMergedTex(project); + const passOptionLines = []; + for (const packageName of Object.keys(project.passOptions).sort()) { + const options = [...project.passOptions[packageName]].sort().join(','); + passOptionLines.push(`\\PassOptionsToPackage{${options}}{${packageName}}`); + } + const parts = [project.documentclass]; + if (passOptionLines.length) + parts.push(passOptionLines.join('\n')); + parts.push(project.targetPreamble.trim()); + const submissionSafeguards = renderSubmissionSafeguards(project); + if (submissionSafeguards) + parts.push(submissionSafeguards); + parts.push(project.titleBlock.trim()); + 
parts.push(project.authorBlock.trim()); + if (project.dateBlock.trim()) + parts.push(project.dateBlock.trim()); + parts.push('\\begin{document}'); + parts.push('\\maketitle'); + const postMaketitleHook = renderPostMaketitleSubmissionSafeguards(project); + if (postMaketitleHook) + parts.push(postMaketitleHook); + if (project.abstract) { + parts.push(`\\begin{abstract}\n${project.abstract.trim()}\n\\end{abstract}`); + } + if (project.mainBody) + parts.push(project.mainBody.trim()); + let bibliographyStyle = project.bibliographyStyle.trim(); + const bibliographyBlock = project.bibliographyBlock.trim(); + if (bibliographyBlock) { + if (!bibliographyStyle) + bibliographyStyle = defaultBibliographystyle(project.targetKind); + if (bibliographyStyle) + parts.push(bibliographyStyle); + parts.push(bibliographyBlock); + } + if (project.appendixBody.trim()) + parts.push(project.appendixBody.trim()); + if (includeChecklist) { + parts.push('\\newpage'); + parts.push('\\input{checklist.tex}'); + } + parts.push('\\end{document}'); + return parts.filter((p) => p && p.trim()).join('\n\n') + '\n'; +} +export function renderAaaiMergedTex(project) { + const passOptionLines = []; + for (const packageName of Object.keys(project.passOptions).sort()) { + const options = [...project.passOptions[packageName]].sort().join(','); + passOptionLines.push(`\\PassOptionsToPackage{${options}}{${packageName}}`); + } + const titleContent = extractMacroContent(project.titleBlock) || 'Untitled Submission'; + const authorContent = 'Anonymous Submission'; + const compatibilityMacros = [ + '\\usepackage{iftex}', + '\\ifPDFTeX\\else', + "% AAAI's PSNFSS times/helvet/courier stack falls back to Latin Modern under", + '% TU/XeTeX. Re-select the T1 text encoding so tectonic/XeTeX uses the intended', + '% Times-compatible Type1 fonts without requiring extra packages.', + '\\renewcommand{\\encodingdefault}{T1}', + '\\AtBeginDocument{\\normalfont\\selectfont}', + '\\fi', + '\\providecommand{\\texorpdfstring}[2]{#1}', + '\\providecommand{\\State}{\\STATE}', + '\\providecommand{\\Statex}{\\item[]}', + '\\providecommand{\\Require}{\\REQUIRE}', + '\\providecommand{\\Ensure}{\\ENSURE}', + '\\providecommand{\\Return}{\\textbf{return}}', + '\\providecommand{\\Comment}[1]{\\COMMENT{#1}}', + ].join('\n'); + const parts = [project.documentclass]; + if (passOptionLines.length) + parts.push(passOptionLines.join('\n')); + parts.push(project.targetPreamble.trim()); + parts.push(compatibilityMacros); + parts.push(`\\title{${titleContent}}`); + parts.push(`\\author{${authorContent}}`); + parts.push('\\affiliations{}'); + parts.push('\\begin{document}'); + parts.push('\\maketitle'); + if (project.abstract) { + parts.push(`\\begin{abstract}\n${project.abstract.trim()}\n\\end{abstract}`); + } + if (project.mainBody) + parts.push(project.mainBody.trim()); + const bibliographyBlock = project.bibliographyBlock.trim(); + if (bibliographyBlock) + parts.push(bibliographyBlock); + if (project.appendixBody.trim()) + parts.push(project.appendixBody.trim()); + parts.push('\\end{document}'); + return parts.filter((p) => p && p.trim()).join('\n\n') + '\n'; +} +export function renderIcmlMergedTex(project) { + const passOptionLines = []; + for (const packageName of Object.keys(project.passOptions).sort()) { + const options = [...project.passOptions[packageName]].sort().join(','); + passOptionLines.push(`\\PassOptionsToPackage{${options}}{${packageName}}`); + } + const titleContent = extractMacroContent(project.titleBlock) || 'Untitled Submission'; + const runningTitle = 
collapseTitleForRunningHead(titleContent); + const compatibilityMacros = [ + '% Compatibility layer for source projects that use algpseudocode-style commands.', + '\\providecommand{\\State}{\\STATE}', + '\\providecommand{\\Statex}{\\item[]}', + '\\providecommand{\\Require}{\\REQUIRE}', + '\\providecommand{\\Ensure}{\\ENSURE}', + '\\providecommand{\\Return}{\\textbf{return}}', + ].join('\n'); + const parts = [project.documentclass]; + if (passOptionLines.length) + parts.push(passOptionLines.join('\n')); + parts.push(project.targetPreamble.trim()); + parts.push(`\\icmltitlerunning{${runningTitle}}`); + parts.push(compatibilityMacros); + parts.push('\\begin{document}'); + parts.push([ + '\\twocolumn[', + ` \\icmltitle{${titleContent}}`, + ' \\vskip 0.3in', + ']', + '\\printAffiliationsAndNotice{}', + ].join('\n')); + if (project.abstract) { + parts.push(`\\begin{abstract}\n${project.abstract.trim()}\n\\end{abstract}`); + } + if (project.mainBody) + parts.push(project.mainBody.trim()); + const bibliographyStyle = project.bibliographyStyle.trim() || defaultBibliographystyle(project.targetKind); + const bibliographyBlock = project.bibliographyBlock.trim(); + if (bibliographyBlock) { + parts.push(bibliographyStyle); + parts.push(bibliographyBlock); + } + if (project.appendixBody.trim()) + parts.push(project.appendixBody.trim()); + parts.push('\\end{document}'); + return parts.filter((p) => p && p.trim()).join('\n\n') + '\n'; +} +export function renderSubmissionSafeguards(project) { + void project; + return ''; +} +export function renderPostMaketitleSubmissionSafeguards(_project) { + return ''; +} +function templateWantsLineNumbers(targetPreamble, targetKind) { + const maskedPreamble = maskComments(targetPreamble); + const optsFor = (packageName) => { + const re = new RegExp(`\\\\usepackage(?:\\[([^\\]]*)\\])?\\{${reEscape(packageName)}\\}`); + const match = maskedPreamble.match(re); + if (!match) + return new Set(); + return new Set((match[1] || '').split(',').map((s) => s.trim()).filter(Boolean)); + }; + if (targetKind === 'acl') { + return optsFor('acl').has('review'); + } + if (targetKind === 'cvpr') { + const options = optsFor('cvpr'); + return options.has('review') || options.has('pagenumbers'); + } + if (targetKind === 'neurips') { + const options = optsFor('neurips_2026'); + return !options.has('preprint') && !options.has('final') && !options.has('nonanonymous'); + } + return false; +} +function stripSourceLineNumberDisablers(text) { + let cleaned = text; + const patterns = [ + /^\s*\\nolinenumbers\s*$/gm, + /^\s*\\AtBeginDocument\s*\{\s*\\nolinenumbers\s*\}\s*$/gm, + /^\s*\\internallinenumbers\s*$/gm, + ]; + for (const pattern of patterns) { + cleaned = cleaned.replace(pattern, ''); + } + return cleaned.trim(); +} +export function stripLayoutModifyingCommands(sourcePreamble) { + let cleaned = sourcePreamble; + let removed = 0; + const lengthPattern = LAYOUT_LENGTH_NAMES.join('|'); + const patterns = [ + new RegExp(`^\\s*\\\\(?:setlength|addtolength)\\s*\\{\\\\(?:${lengthPattern})\\}\\s*\\{[^}]*\\}\\s*$`, 'gm'), + new RegExp(`^\\s*\\\\(?:setlength|addtolength)\\s*\\\\(?:${lengthPattern})\\s*\\{[^}]*\\}\\s*$`, 'gm'), + /^\s*\\(?:geometry|newgeometry)\s*\{[^}]*\}\s*$/gm, + /^\s*\\restoregeometry\s*$/gm, + /^\s*\\(?:pagestyle|thispagestyle|pagenumbering)\s*\{[^}]*\}\s*$/gm, + ]; + for (const pattern of patterns) { + const r = subn(cleaned, pattern, ''); + cleaned = r.text; + removed += r.count; + } + const warnings = []; + if (removed) { + warnings.push(`Removed ${removed} source preamble 
command(s) that alter page layout or pagination so the output stays within the target template.`);
+    }
+    return [cleaned.trim(), warnings];
+}
+export function stripAutoresubmitInjectedBlocks(sourcePreamble) {
+    let cleaned = sourcePreamble.replace(/% Keep review-mode line numbers readable around wide floats in direct submission PDFs\.[\s\S]*?\\makeatother\s*/, '');
+    cleaned = cleaned.replace(/\\makeatletter\s*% lineno's built-in switching is page-based, so in two-column ACL review mode it leaves[\s\S]*?\\makeatother\s*/, '');
+    return cleaned.trim();
+}
+export function normalizeTargetSubmissionMode(targetPreamble, targetKind) {
+    let normalized = targetPreamble;
+    if (targetKind === 'acl') {
+        return normalizeUsepackageOptions(normalized, 'acl', new Set(['review']), new Set(['final', 'preprint']));
+    }
+    if (targetKind === 'cvpr') {
+        return normalizeUsepackageOptions(normalized, 'cvpr', new Set(['review']), new Set(['pagenumbers']));
+    }
+    if (targetKind === 'neurips') {
+        return normalizeUsepackageOptions(normalized, 'neurips_2026', new Set(), new Set(['final', 'preprint', 'nonanonymous']));
+    }
+    if (targetKind === 'icml') {
+        return normalizeUsepackageOptions(normalized, 'icml2026', new Set(), new Set(['accepted', 'preprint']));
+    }
+    if (targetKind === 'iclr') {
+        return normalized.replace(/^\s*\\iclrfinalcopy\s*$/gm, '').trim();
+    }
+    if (targetKind === 'aaai') {
+        return normalizeUsepackageOptions(normalized, 'aaai2026', new Set(['submission']), new Set());
+    }
+    return normalized.trim();
+}
+export function normalizeUsepackageOptions(preamble, packageName, add, remove) {
+    const pattern = new RegExp(`^(?<indent>\\s*)\\\\usepackage(?:\\[(?<options>[^\\]]*)\\])?\\{${reEscape(packageName)}\\}(?<suffix>\\s*(?:%.*)?)$`, 'm');
+    let replaced = false;
+    const out = preamble.replace(pattern, (...args) => {
+        replaced = true;
+        const groups = args[args.length - 1];
+        const existing = (groups.options || '')
+            .split(',')
+            .map((o) => o.trim())
+            .filter((o) => o.length > 0);
+        let filtered = existing.filter((o) => !remove.has(o));
+        for (const opt of [...add].sort()) {
+            if (!filtered.includes(opt))
+                filtered.push(opt);
+        }
+        const optionBlock = filtered.length ? `[${filtered.join(',')}]` : '';
+        const suffix = groups.suffix || '';
+        return `${groups.indent}\\usepackage${optionBlock}{${packageName}}${suffix}`;
+    });
+    return replaced ? out.trim() : preamble.trim();
+}
+export async function detectTargetKind(rootDir) {
+    const filenames = new Set();
+    for (const entry of await rglob(rootDir)) {
+        if (entry.isFile)
+            filenames.add(path.basename(entry.path).toLowerCase());
+    }
+    if (filenames.has('aaai2026.sty'))
+        return 'aaai';
+    if (filenames.has('cvpr.sty'))
+        return 'cvpr';
+    if (filenames.has('iclr2026_conference.sty'))
+        return 'iclr';
+    if (filenames.has('icml2026.sty'))
+        return 'icml';
+    if (filenames.has('neurips_2026.sty'))
+        return 'neurips';
+    if (filenames.has('acl.sty'))
+        return 'acl';
+    return 'generic';
+}
+export function defaultBibliographystyle(targetKind) {
+    return String(getTemplateConfig(targetKind).defaultBibliographystyle || '');
+}
+export function extractMacroContent(block) {
+    if (!block)
+        return '';
+    const openBrace = block.indexOf('{');
+    if (openBrace === -1)
+        return '';
+    const closeBrace = findMatchingBrace(block, openBrace);
+    return block.slice(openBrace + 1, closeBrace).trim();
+}
+export function collapseTitleForRunningHead(title) {
+    let collapsed = title.replace(/(?<!\\)%.*$/gm, '');
+    collapsed = collapsed.replace(/\\\\/g, ' ');
+    collapsed = collapsed.replace(/\s+/g, ' ');
+    return collapsed.trim();
+}
+export function addInferredSourcePackages(sourcePreamble, bodyText) {
+    const { packages } = parseUsepackageLines(sourcePreamble);
+    const existingPackageNames = new Set(packages.map((p) =>
p.name)); + const inferredPackages = []; + const inferredMacros = []; + const combinedText = `${sourcePreamble}\n${bodyText}`; + const inferenceRules = [ + [['\\begin{algorithm}', '\\begin{algorithm*}'], 'algorithm'], + [ + [ + '\\begin{algorithmic}', + '\\State', + '\\Statex', + '\\Require', + '\\Ensure', + '\\Return', + '\\Comment', + ], + 'algorithmic', + ], + [['\\resizebox', '\\scalebox', '\\rotatebox', '\\includegraphics'], 'graphicx'], + [['\\multirow'], 'multirow'], + [['\\toprule', '\\midrule', '\\bottomrule', '\\cmidrule'], 'booktabs'], + [['\\rowcolor', '\\cellcolor', '\\columncolor'], 'colortbl'], + [['\\text{', '\\eqref{', '\\dfrac', '\\overset', '\\underset'], 'amsmath'], + [['\\triangleq', '\\mathbb', '\\mathfrak', '\\leqslant', '\\geqslant'], 'amssymb'], + [['\\mathscr'], 'mathrsfs'], + [['\\xspace'], 'xspace'], + [['\\begin{enumerate}[', '\\begin{itemize}['], 'enumitem'], + [['\\DeclareCaptionStyle', '\\captionsetup', '\\captionof'], 'caption'], + ]; + const tokenMatched = (token) => { + if (/^\\[A-Za-z@]+$/.test(token)) { + const pattern = new RegExp(`${reEscape(token)}(?![A-Za-z@])`); + return pattern.test(combinedText); + } + return combinedText.includes(token); + }; + for (const [tokens, packageName] of inferenceRules) { + if (existingPackageNames.has(packageName)) + continue; + if (packageName === 'algorithmic' && (existingPackageNames.has('algorithmic') + || existingPackageNames.has('algorithmicx') + || existingPackageNames.has('algpseudocode'))) + continue; + if (tokens.some((t) => tokenMatched(t))) { + inferredPackages.push(`\\usepackage{${packageName}}`); + existingPackageNames.add(packageName); + } + } + if (combinedText.includes('\\begin{links}') || combinedText.includes('\\link{')) { + inferredMacros.push('\\providecommand{\\link}[2]{\\item \\textbf{#1}: \\url{#2}}'); + inferredMacros.push('\\newenvironment{links}{\\begin{itemize}}{\\end{itemize}}'); + } + if (inferredPackages.length === 0 && inferredMacros.length === 0) { + return sourcePreamble; + } + const additions = [...inferredPackages, ...inferredMacros]; + return `${sourcePreamble.trim()}\n\n% Inferred from source body during conversion\n${additions.join('\n')}`; +} +export function stripAlgorithmCompatibilityLines(sourcePreamble) { + return sourcePreamble.replace(/^\s*\\newcommand\{\\theHalgorithm\}.*$/gm, '').trim(); +} +export function extractPreservedSourcePreamble(preamble) { + const markers = [ + '% Added from the source project to preserve paper content', + '% Source project macros and local configuration', + ]; + const starts = markers + .map((m) => preamble.indexOf(m)) + .filter((i) => i !== -1); + if (starts.length === 0) + return preamble; + return preamble.slice(Math.min(...starts)).trim(); +} +export function stripSourceTemplatePackages(sourcePreamble, sourceKind) { + const stripPackages = getTemplateConfig(sourceKind).sourceStripPackages; + if (!stripPackages || stripPackages.size === 0) + return sourcePreamble; + const { packages, remaining } = parseUsepackageLines(sourcePreamble); + const keptLines = []; + for (const pkg of packages) { + if (stripPackages.has(pkg.name)) + continue; + if (sourceKind === 'neurips' && isNeuripsStylePackage(pkg.name)) + continue; + const optionsPrefix = pkg.options.length ? 
`[${pkg.options.join(',')}]` : ''; + keptLines.push(`\\usepackage${optionsPrefix}{${pkg.name}}`); + } + const parts = []; + if (keptLines.length) + parts.push(keptLines.join('\n')); + if (remaining.trim()) + parts.push(remaining.trim()); + return parts.join('\n\n').trim(); +} +function isNeuripsStylePackage(packageName) { + return /^neurips(?:_[0-9]{4})?$/.test(String(packageName || '').trim()); +} +export function stripSourceTemplateCommands(sourcePreamble, sourceKind) { + const commands = getTemplateConfig(sourceKind).sourceStripCommands || []; + let cleaned = sourcePreamble; + for (const command of commands) { + cleaned = cleaned.replace(new RegExp(`^\\s*\\\\${reEscape(command)}\\s*$`, 'gm'), ''); + } + return cleaned.trim(); +} diff --git a/apps/backend/src/services/transferAgent/rulebasetransfer/pipeline.js b/apps/backend/src/services/transferAgent/rulebasetransfer/pipeline.js new file mode 100644 index 0000000..df89db6 --- /dev/null +++ b/apps/backend/src/services/transferAgent/rulebasetransfer/pipeline.js @@ -0,0 +1,316 @@ +import { promises as fs } from 'fs'; +import path from 'path'; +import crypto from 'crypto'; +import { buildSourceRepresentation, buildTargetRepresentation, mergePreambles, renderMergedTex, } from './latex.js'; +const SOURCE_ASSET_SUFFIXES = new Set([ + '.bib', + '.bmp', + '.csv', + '.eps', + '.jpeg', + '.jpg', + '.json', + '.pdf', + '.png', + '.svg', + '.tsv', +]); +const REMOVABLE_SUFFIXES = new Set([ + '.aux', + '.bbl', + '.bcf', + '.blg', + '.fdb_latexmk', + '.fls', + '.log', + '.nav', + '.out', + '.run.xml', + '.snm', + '.toc', + '.vrb', + '.xdv', +]); +const REMOVABLE_NAMES = new Set(['tectonic.log']); +export async function runConversion({ sourceDir, targetTemplateDir, outputDir, outputMainName = 'main.tex', }) { + if (!(await pathExists(sourceDir))) { + throw new Error(`Source directory not found: ${sourceDir}`); + } + if (!(await pathExists(targetTemplateDir))) { + throw new Error(`Target template directory not found: ${targetTemplateDir}`); + } + await resetOutputDirectory(outputDir); + const source = await buildSourceRepresentation(sourceDir); + const target = await buildTargetRepresentation(targetTemplateDir); + const [mergedPreamble, passOptions] = mergePreambles(target.targetPreamble, source.sourcePreamble, target.targetKind); + const sourceMainRel = path.relative(sourceDir, source.mainTex); + const generatedDir = path.join(outputDir, path.dirname(sourceMainRel)); + await fs.mkdir(generatedDir, { recursive: true }); + const targetTemplateBaseDir = path.dirname(target.mainTex); + for (const templateFile of await walkFiles(targetTemplateBaseDir)) { + const rel = path.relative(targetTemplateBaseDir, templateFile); + const destination = path.join(generatedDir, rel); + await fs.mkdir(path.dirname(destination), { recursive: true }); + await fs.copyFile(templateFile, destination); + } + await copySourceSupportFiles({ + sourceDir, + sourceMainTex: source.mainTex, + sourceMainDir: path.dirname(source.mainTex), + generatedDir, + mainBody: source.mainBody, + bibliographyBlock: source.bibliographyBlock, + appendixBody: source.appendixBody, + }); + const includeChecklist = target.targetKind === 'neurips' + && (await pathExists(path.join(generatedDir, 'checklist.tex'))); + const project = { + rootDir: outputDir, + targetKind: target.targetKind, + documentclass: target.documentclass, + titleBlock: source.titleBlock, + authorBlock: source.authorBlock, + dateBlock: source.dateBlock, + targetPreamble: mergedPreamble, + sourceMacroPreamble: source.sourcePreamble, + 
passOptions, + abstract: source.abstract, + mainBody: source.mainBody, + bibliographyStyle: source.bibliographyStyle, + bibliographyBlock: source.bibliographyBlock, + appendixBody: source.appendixBody, + }; + let renderedTex = renderMergedTex(project, includeChecklist); + const fixResult = applySafeTexFixes(renderedTex, target.targetKind); + renderedTex = fixResult.text; + const warnings = [...source.warnings, ...fixResult.warnings]; + const generatedMainTexInOutput = path.join(generatedDir, outputMainName); + await fs.writeFile(generatedMainTexInOutput, renderedTex, 'utf8'); + const compileEntry = generatedMainTexInOutput; + const copiedAssetCount = await countCopiedAssets(outputDir); + const audit = buildContentAudit({ + sourceMainTex: source.mainTex, + generatedMainTex: generatedMainTexInOutput, + abstract: source.abstract, + mainBody: source.mainBody, + bibliographyStyle: source.bibliographyStyle, + bibliographyBlock: source.bibliographyBlock, + appendixBody: source.appendixBody, + copiedAssetCount, + }); + const auditPath = path.join(outputDir, 'content_audit.json'); + await fs.writeFile(auditPath, `${JSON.stringify(audit, null, 2)}\n`, 'utf8'); + const manifestPath = path.join(outputDir, 'conversion_manifest.json'); + const manifest = { + source_dir: sourceDir, + target_template_dir: targetTemplateDir, + source_main_tex: path.relative(sourceDir, source.mainTex), + target_main_tex: path.relative(targetTemplateDir, target.mainTex), + target_kind: target.targetKind, + generated_main_tex: path.relative(outputDir, generatedMainTexInOutput), + compile_entry: path.relative(outputDir, compileEntry), + content_audit: path.relative(outputDir, auditPath), + warnings, + }; + await fs.writeFile(manifestPath, `${JSON.stringify(manifest, null, 2)}\n`, 'utf8'); + return { + projectDir: outputDir, + mainTex: generatedMainTexInOutput, + manifestPath, + auditPath, + targetKind: target.targetKind, + warnings, + }; +} +async function pathExists(p) { + try { + await fs.access(p); + return true; + } + catch { + return false; + } +} +async function resetOutputDirectory(dir) { + await fs.mkdir(dir, { recursive: true }); + const entries = await fs.readdir(dir, { withFileTypes: true }); + for (const entry of entries) { + if (entry.isFile() && entry.name === 'project.json') { + continue; + } + await fs.rm(path.join(dir, entry.name), { recursive: true, force: true }); + } +} +async function copyTreeInto(src, dest) { + await fs.mkdir(dest, { recursive: true }); + const entries = await fs.readdir(src, { withFileTypes: true }); + for (const entry of entries) { + if (entry.name === 'project.json' || entry.name === '.compile') { + continue; + } + const srcPath = path.join(src, entry.name); + const destPath = path.join(dest, entry.name); + if (entry.isDirectory()) { + await copyTreeInto(srcPath, destPath); + } + else if (entry.isFile()) { + await fs.copyFile(srcPath, destPath); + } + } +} +async function walkFiles(rootDir) { + const results = []; + async function walk(dir) { + let entries; + try { + entries = await fs.readdir(dir, { withFileTypes: true }); + } + catch { + return; + } + for (const entry of entries) { + const full = path.join(dir, entry.name); + if (entry.isDirectory()) { + await walk(full); + } + else if (entry.isFile()) { + results.push(full); + } + } + } + await walk(rootDir); + results.sort(); + return results; +} +async function pruneLatexBuildArtifacts(projectDir) { + for (const file of await walkFiles(projectDir)) { + const base = path.basename(file); + const ext = 
path.extname(file).toLowerCase();
+        if (REMOVABLE_NAMES.has(base) || REMOVABLE_SUFFIXES.has(ext)) {
+            try {
+                await fs.unlink(file);
+            }
+            catch { }
+        }
+    }
+}
+async function countCopiedAssets(projectDir) {
+    let count = 0;
+    for (const file of await walkFiles(projectDir)) {
+        if (path.extname(file) !== '.tex')
+            count += 1;
+    }
+    return count;
+}
+async function copySourceSupportFiles({ sourceDir, sourceMainTex, sourceMainDir, generatedDir, mainBody, bibliographyBlock, appendixBody, }) {
+    const referencedTexRel = collectReferencedTexRelPaths(`${mainBody}\n${appendixBody}`);
+    const referencedBibRel = collectReferencedBibRelPaths(bibliographyBlock);
+    for (const file of await walkFiles(sourceMainDir)) {
+        const base = path.basename(file);
+        const ext = path.extname(file).toLowerCase();
+        if (base === 'project.json' || base === '.compile')
+            continue;
+        if (REMOVABLE_NAMES.has(base) || REMOVABLE_SUFFIXES.has(ext))
+            continue;
+        if (path.resolve(file) === path.resolve(sourceMainTex))
+            continue;
+        const relToMainDir = path.relative(sourceMainDir, file);
+        const relNoExt = ext ? relToMainDir.slice(0, -ext.length) : relToMainDir;
+        if (ext === '.tex') {
+            if (!referencedTexRel.has(relToMainDir) && !referencedTexRel.has(relNoExt)) {
+                continue;
+            }
+        }
+        if (ext === '.bib') {
+            if (referencedBibRel.size > 0 && !referencedBibRel.has(relToMainDir) && !referencedBibRel.has(relNoExt)) {
+                continue;
+            }
+        }
+        const destination = path.join(generatedDir, relToMainDir);
+        await fs.mkdir(path.dirname(destination), { recursive: true });
+        await fs.copyFile(file, destination);
+    }
+}
+function collectReferencedTexRelPaths(text) {
+    const refs = new Set();
+    const pattern = /\\(?:input|include)\{([^}]+)\}/g;
+    for (const match of text.matchAll(pattern)) {
+        const raw = match[1].trim();
+        if (!raw)
+            continue;
+        refs.add(raw);
+        if (!raw.endsWith('.tex'))
+            refs.add(`${raw}.tex`);
+    }
+    return refs;
+}
+function collectReferencedBibRelPaths(text) {
+    const refs = new Set();
+    const pattern = /\\bibliography(?:\[[^\]]*\])?\{([^}]+)\}/g;
+    for (const match of text.matchAll(pattern)) {
+        const entries = match[1].split(',').map((s) => s.trim()).filter(Boolean);
+        for (const raw of entries) {
+            refs.add(raw);
+            if (!raw.endsWith('.bib'))
+                refs.add(`${raw}.bib`);
+        }
+    }
+    return refs;
+}
+function buildContentAudit({ sourceMainTex, generatedMainTex, abstract, mainBody, bibliographyStyle, bibliographyBlock, appendixBody, copiedAssetCount, }) {
+    return {
+        source_main_tex: sourceMainTex,
+        generated_main_tex: generatedMainTex,
+        segments: {
+            abstract: segmentFingerprint(abstract),
+            main_body: segmentFingerprint(mainBody),
+            bibliography_style: segmentFingerprint(bibliographyStyle),
+            bibliography_block: segmentFingerprint(bibliographyBlock),
+            appendix_body: segmentFingerprint(appendixBody),
+        },
+        copied_asset_count: copiedAssetCount,
+    };
+}
+function segmentFingerprint(text) {
+    const value = text || '';
+    const raw = Buffer.from(value, 'utf8');
+    return {
+        chars: value.length,
+        lines: value.split('\n').length - (value.length === 0 ? 1 : (value.endsWith('\n') ? 1 : 0)),
+        sha256: crypto.createHash('sha256').update(raw).digest('hex'),
+    };
+}
+function applySafeTexFixes(text, targetKind) {
+    void targetKind;
+    const warnings = [];
+    const out = text;
+    return { text: out, warnings };
+}
+function subnAll(text, regex, replacement) {
+    let count = 0;
+    const out = text.replace(regex, (...args) =>
{ + count += 1; + if (typeof replacement === 'function') + return replacement(...args); + return replacement; + }); + return { out, count }; +} diff --git a/apps/backend/src/services/transferAgent/rulebasetransfer/templateConfigs.js b/apps/backend/src/services/transferAgent/rulebasetransfer/templateConfigs.js new file mode 100644 index 0000000..11decfe --- /dev/null +++ b/apps/backend/src/services/transferAgent/rulebasetransfer/templateConfigs.js @@ -0,0 +1,107 @@ +export const TEMPLATE_CONFIGS = { + acl: { + preferredMainNames: ['acl_latex.tex', 'acl_lualatex.tex'], + defaultBibliographystyle: '\\bibliographystyle{acl_natbib}', + skipPackages: new Set(['acl', 'times']), + sourceStripPackages: new Set(['acl', 'times']), + sourceStripCommands: [], + stripMacros: ['title', 'author', 'date'], + }, + neurips: { + preferredMainNames: ['neurips_2026.tex'], + defaultBibliographystyle: '\\bibliographystyle{plainnat}', + skipPackages: new Set(['neurips_2026', 'times']), + sourceStripPackages: new Set(['neurips_2026', 'times']), + sourceStripCommands: [], + stripMacros: ['title', 'author', 'date', 'pdfinfo'], + }, + icml: { + preferredMainNames: ['example_paper.tex'], + defaultBibliographystyle: '\\bibliographystyle{icml2026}', + skipPackages: new Set(['icml2026', 'times', 'algorithm', 'algorithmicx', 'algpseudocode']), + sourceStripPackages: new Set(['icml2026', 'times']), + sourceStripCommands: [], + stripMacros: ['title', 'author', 'date', 'icmltitlerunning'], + }, + iclr: { + preferredMainNames: ['iclr2026_conference.tex'], + defaultBibliographystyle: '\\bibliographystyle{iclr2026_conference}', + skipPackages: new Set(['iclr2026_conference', 'times']), + sourceStripPackages: new Set(['iclr2026_conference', 'times']), + sourceStripCommands: [], + stripMacros: ['title', 'author', 'date'], + }, + cvpr: { + preferredMainNames: ['main.tex'], + defaultBibliographystyle: '\\bibliographystyle{ieeenat_fullname}', + skipPackages: new Set(['cvpr']), + sourceStripPackages: new Set(['cvpr', 'axessibility']), + sourceStripCommands: [], + stripMacros: ['title', 'author', 'date'], + }, + aaai: { + preferredMainNames: [ + 'anonymous-submission-latex-2026.tex', + 'Formatting-Instructions-LaTeX-2026.tex', + ], + defaultBibliographystyle: '', + skipPackages: new Set([ + 'aaai2026', + 'times', + 'helvet', + 'courier', + 'url', + 'graphicx', + 'natbib', + 'caption', + 'algorithm', + 'algorithmicx', + 'algpseudocode', + 'hyperref', + 'fontenc', + ]), + sourceStripPackages: new Set([ + 'aaai2026', + 'times', + 'helvet', + 'courier', + 'url', + 'natbib', + 'caption', + ]), + sourceStripCommands: ['nocopyright'], + stripMacros: ['title', 'author', 'date', 'affiliations', 'pdfinfo'], + }, + generic: { + preferredMainNames: [], + defaultBibliographystyle: '\\bibliographystyle{plainnat}', + skipPackages: new Set(['times']), + sourceStripPackages: new Set(), + sourceStripCommands: [], + stripMacros: ['title', 'author', 'date'], + }, +}; +export const CONFERENCE_TO_FAMILY = { + acl: 'acl', + emnlp: 'acl', + neurips: 'neurips', + nips: 'neurips', + icml: 'icml', + iclr: 'iclr', + cvpr: 'cvpr', + iccv: 'cvpr', + aaai: 'aaai', +}; +export function getTemplateConfig(kind) { + return TEMPLATE_CONFIGS[kind] || TEMPLATE_CONFIGS.generic; +} +export function normalizeConferenceName(name) { + const lowered = String(name || '').trim().toLowerCase(); + if (!(lowered in CONFERENCE_TO_FAMILY)) { + throw new Error(`Unsupported conference alias: ${name}`); + } + return lowered; +} +export function conferenceFamily(name) { + return 
CONFERENCE_TO_FAMILY[normalizeConferenceName(name)];
+}
diff --git a/apps/backend/src/services/transferAgent/rules/README.md b/apps/backend/src/services/transferAgent/rules/README.md
new file mode 100644
index 0000000..aa2baac
--- /dev/null
+++ b/apps/backend/src/services/transferAgent/rules/README.md
@@ -0,0 +1,39 @@
+# Venue rules (Transfer Agent)
+
+Per-venue handbooks. One Markdown file per venue, named `${venueId}.md`.
+Loaded at runtime by `loadVenueRules(venueId)` in
+[`apps/backend/src/services/transferAgent/neuripsRules.js`](../neuripsRules.js)
+and injected verbatim as the `{VENUE}_FULL_HANDBOOK` block inside the
+venue-skill system prompt (see `apps/backend/src/services/transferAgent/skills/`).
+
+## Contents
+
+| File | Venue | Loaded by |
+|------|-------|-----------|
+| `neurips.md` | NeurIPS 2026 | `buildNeuripsSkillFromState` |
+| `icml.md` | ICML 2026 | `buildIcmlSkillFromState` |
+| `cvpr.md` | CVPR 2026 | `buildCvprSkillFromState` |
+| `acl.md` | *ACL family (ACL/NAACL/EACL/EMNLP) | `buildAclSkillFromState` |
+
+## Authoring conventions
+
+- **Language**: English only. The runtime LLM prompt is English; mixed-language rules have caused prompt-injection issues in the past.
+- **Ground truth**: every rule should tie back to a real artefact in `templates/<venueId>/` (a line in `main.tex`, an option in `.sty`, a filename, …). Do not cite the conference website inline — site text changes; local templates are the authoritative baseline.
+- **Tool-awareness**: rules should name the actual agent tools (`readFile`, `writeFile`, `applyDiff`, `grepFile`, `listProjectTree`, `copyAsset`, `raiseQuestion`, `measureFigures`) when prescribing a migration step, since the LLM sees the tool list and the handbook in the same prompt.
+- **Shape**: sections covered (at minimum):
+  1. Template layout on disk
+  2. Submission modes (`\usepackage` options, based on `transferIntake.doubleBlind` / `preprint`)
+  3. Document class & preamble
+  4. Title / authors / anonymity
+  5. Document structure & page budget
+  6. Citations & bibliography
+  7. Figures & tables
+  8. Math / typography
+  9. Critical DON'Ts (desk-rejection risks)
+  10. Migration playbook (tool-call recipes)
+  11. Pre-submission checklist
+
+## Related paths
+
+- Templates: `templates/<venueId>/` (with `templates/manifest.json` listing `id`, `label`, `mainFile`, …).
+- Caching: `loadVenueRules` caches per venue by `mtimeMs`, so editing a file here is picked up on next load without a process restart.
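The caching note above describes the whole mechanism: stat the handbook file, compare `mtimeMs`, and reload only when it changed. A minimal sketch of that pattern, assuming a module-level Map keyed by venue id; the real `loadVenueRules` in `neuripsRules.js` may differ in detail:

```js
import { promises as fs } from 'fs';
import path from 'path';

const cache = new Map(); // venueId -> { mtimeMs, text }

// Illustrative only: reload a venue handbook when its mtime changes, so an
// edited rules file is picked up on the next call without a process restart.
export async function loadVenueRulesSketch(rulesDir, venueId) {
  const file = path.join(rulesDir, `${venueId}.md`);
  const { mtimeMs } = await fs.stat(file);
  const hit = cache.get(venueId);
  if (hit && hit.mtimeMs === mtimeMs) return hit.text;
  const text = await fs.readFile(file, 'utf8');
  cache.set(venueId, { mtimeMs, text });
  return text;
}
```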
| +| `templates/acl/acl.sty` | Official style with `review` / `final` / `preprint` options. **Never modify.** | +| `templates/acl/acl_natbib.bst` | Official bibliography style (APA-like author-year). | +| `templates/acl/custom.bib` | Sample bibliography. | + +Tool usage: + +- `listProjectTree({project:"target"})` to confirm the shell, `.sty`, `.bst`, and `.bib` are present. +- `readFile({project:"target", path:"acl_latex.tex"})` before any edit. + +--- + +## 2. Submission modes (`\usepackage` options) + +`acl.sty` defines three top-level options: + +| Intake flags | Required preamble | +|---|---| +| `doubleBlind=true` (default review submission) | `\usepackage[review]{acl}` — anonymisation on, line numbers on, page numbers on | +| Camera-ready (accepted paper, final version) | `\usepackage{acl}` — no anonymisation, no line numbers, no page numbers | +| `preprint=true` (non-anonymous, arXiv) | `\usepackage[preprint]{acl}` — authors visible, page numbers on, line numbers off | + +Facts: + +- `review` mode inserts left-margin line numbers (`lineno`) — never quote these in the body; they vanish in `final`. +- `review` mode also suppresses acknowledgements (per ACL policy — do not render them during blind review). +- `final` mode reveals authors and drops line numbers. +- The default engine is `pdflatex`. `acl.sty` is also compatible with `lualatex` / `xelatex`; the alternate shell `acl_lualatex.tex` demonstrates the setup for non-Latin scripts. + +--- + +## 3. Document class & preamble + +Required first line (matches `templates/acl/acl_latex.tex`): + +```latex +\documentclass[11pt]{article} +``` + +Expected package order: + +```latex +\usepackage[review]{acl} % or \usepackage{acl} / [preprint] +\usepackage{times} % font — alternatives: txfonts, newtx +\usepackage{latexsym} +\usepackage[T1]{fontenc} +\usepackage[utf8]{inputenc} +\usepackage{microtype} +\usepackage{inconsolata} +\usepackage{graphicx} +% \setlength\titlebox{} % only if the title/authors overflow; never below 5cm +``` + +Hard rules: + +1. `\documentclass[11pt]{article}` is mandatory — do not add extra class options, do not switch to `IEEEtran`, `revtex*`, `acmart`, `llncs`, etc. +2. **Never modify `acl.sty`** or `acl_natbib.bst`. The sty enforces column width, margins, and anonymisation logic. +3. Never load `geometry` or set `\textwidth` / `\columnsep` / `\oddsidemargin` manually. +4. `\setlength\titlebox{}` is the **only** sanctioned layout tweak, and only when the title/author block genuinely overflows. `` must be ≥ 5 cm. +5. For non-Latin scripts, compile with `lualatex` or `xelatex` and start from `acl_lualatex.tex`; do not add `\usepackage[T5]{fontenc}` on top of a `pdflatex` build — migrate the engine instead. + +--- + +## 4. Title, authors, anonymity + +- Use standard LaTeX `\title{...}` and `\author{...}`. Author separators: + - `\and` / `\And` — same author block, no forced row break. + - `\AND` — force a new row of author blocks. +- `review` mode hides the author block automatically; still keep real `\author{...}` content so `\usepackage{acl}` produces a correct camera-ready without re-editing. +- Double-blind hygiene in review mode: + - Remove names, affiliations, identifying emails, public URLs, GitHub handles, grant numbers, dataset slugs from body, captions, and comments. + - `acl.sty` already hides the acknowledgements section in review — ensure acknowledgements live in their own `\section*{Acknowledgments}` (or `\section*{Acknowledgements}`) so the suppression works. 
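    A minimal sketch of the shape this expects (the section name matches the template; the body text is a placeholder, not template content):

    ```latex
    % Rendered in final/preprint mode; suppressed automatically under [review].
    \section*{Acknowledgments}
    We thank our colleagues for feedback on early drafts. % placeholder
    ```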
+ - Self-cite in the third person (“Smith et al. (2023) …”). + - Clear `hyperref` metadata (`pdfauthor`, `pdftitle`, `pdfkeywords`) if the source set it. + +--- + +## 5. Document structure & ordering + +Required ordering inside `\begin{document} … \end{document}`: + +1. `\maketitle` +2. `\begin{abstract} … \end{abstract}` — typically a single paragraph. +3. Main body sections. +4. `\section*{Limitations}` — **required by most *ACL venues** (e.g. ACL/NAACL/EMNLP); check the current CFP. Unnumbered (starred). Does not count toward the page limit. +5. `\section*{Acknowledgments}` — hidden in `review` mode; rendered in `final`. +6. **References** — `\bibliography{custom}` (or your actual `.bib` name). Must come **before** appendices. +7. `\appendix` followed by appendix sections. + +Notes: + +- Some *ACL venues also require `\section*{Ethics Statement}` or an analogous section. Consult intake notes; if present in the source, preserve it and place it near the Limitations section. +- Do not place appendices before References — that order is wrong for *ACL submissions. + +--- + +## 6. Page budget + +- *ACL page limits are venue-specific and change per call for papers. Common defaults: + - Long papers: 8 pages main body; camera-ready adds 1 page. + - Short papers: 4 pages main body; camera-ready adds 1 page. +- References, Limitations, Ethics Statement, Acknowledgments, and Appendices do **not** count toward the page limit. +- If `transferIntake.outputNotes` names a specific *ACL venue, use its current limit. When unsure, `raiseQuestion` once with the two or three plausible choices rather than guessing silently. + +--- + +## 7. Citations & bibliography + +- `acl.sty` loads `natbib`. Use the *ACL-natbib macros: + - `\citep{key}` → `(Author, Year)` + - `\citet{key}` → `Author (Year)` (narrative) + - `\citealp{key}` → `Author, Year` (no parentheses) + - `\citeyearpar{key}` → `(Year)` + - `\citeposs{key}` → possessive (“Author's (Year)”) — *ACL-only convenience, skip it for cross-venue portability. +- Bibliography style is `acl_natbib.bst`; the template does not require `\bibliographystyle{...}` explicitly because the sty already sets it. Use only: + - `\bibliography{custom}` for your own `.bib`, or + - `\bibliography{anthology,custom}` to merge the ACL Anthology bib with your own. +- Do not anonymise the reference list — self-citations keep real authorship; anonymity is enforced via third-person narration. +- Prefer BibTeX. If the source uses `biblatex`, migrate to BibTeX + ACL's natbib style. `copyAsset` the `.bib` file(s) into the target and update the `\bibliography{...}` argument accordingly. +- `.bib` entries should include DOI or URL fields when possible — `acl.sty` renders the paper title as a hyperlink when one is present. +- Bib entries must avoid raw Unicode (BibTeX does not handle it reliably); use the `\"a`, `\^e`, `\'u`, … forms for accents. + +--- + +## 8. Figures & tables (two-column layout) + +- *ACL uses **two-column** layout (inherited from `acl.sty`). +- Keep `figure` for single-column graphics and `figure*` for full-width art (`width=\linewidth` inside `figure*` spans both columns). **Do not** flatten `figure*` / `table*` to single-column. +- Inside a single-column `figure`, use `width=\columnwidth` (or equivalently `\linewidth`) — this is what the template does in its `figure[t]` example. +- Caption below figures, above tables. **Do not override default caption sizes** (the ACL author guide is explicit). 
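  As a sketch of both float shapes under these conventions (paths, captions, and labels are placeholders):

  ```latex
  % Single-column float: caption goes below, graphics at \columnwidth.
  \begin{figure}[t]
    \centering
    \includegraphics[width=\columnwidth]{figs/model.pdf} % placeholder path
    \caption{Caption text sits below the figure.}
    \label{fig:model}
  \end{figure}

  % Full-width float: keep wide art in figure*; \linewidth spans both columns here.
  \begin{figure*}[t]
    \centering
    \includegraphics[width=\linewidth]{figs/pipeline.pdf} % placeholder path
    \caption{Wide art stays in a figure* environment.}
    \label{fig:pipeline}
  \end{figure*}
  ```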
+- Reference figures and tables with `\ref{...}` — the template does not require `\cref`. If the source uses `cleveref`, feel free to keep it but configure the labels consistently. +- When migrating from a single-column source (`article`, `neurips`, `llncs`, `revtex` single), call `measureFigures({sourceClass:, sourceTwocolumn:false, targetClass:"acmart", figures:[...]})` as an approximation for two-column width advice — the ACL column width (~240 pt) is close to `acmart` sigconf — and apply the recommended ratios via `applyDiff`. (`acl` is not a first-class entry in the layout DB, so treat the tool output as guidance, not ground truth.) + +Tool usage: + +- `copyAsset(srcPath, destPath)` every image referenced by `\includegraphics` and every `.bib` the source needs. +- After copying, `listProjectTree({project:"target"})` and `grepFile({project:"target", pattern:"\\\\includegraphics"})` to confirm paths resolve from the target root. + +--- + +## 9. Math, typography, hyperlinks + +- Use LaTeX/AMS math environments (`equation`, `align`, `gather`). Never use `$$ … $$` — interacts badly with `lineno` in review mode. +- Do not override body font sizes or section heading styles; `acl.sty` controls them. +- `hyperref` is not loaded by the template by default — add it late in the preamble if needed, after all other packages. The *ACL template warns about `\pdfendlink`/`\pdfstartlink` nesting errors with older TeX Live versions; ensure the build environment is modern (TeX Live 2018-12-01 or newer). +- Footnotes use `\footnote{...}`; keep them rare. +- For Bib\TeX accents and special characters, follow the accent-command table in the reference shell (e.g. `\"a`, `\^e`, `\'u`, `\aa`). + +--- + +## 10. Critical DON'Ts (desk-rejection / format-check risks) + +- Do **not** modify `acl.sty` or `acl_natbib.bst`. +- Do **not** add `\usepackage{geometry}`, or change `\textwidth`, `\columnsep`, `\oddsidemargin`, `\topmargin`. +- Do **not** set `\setlength\titlebox{}` below 5 cm. +- Do **not** convert `figure*` / `table*` to single-column wholesale; keep wide art wide. +- Do **not** leave author names, affiliations, identifying URLs, or acknowledgement content visible in `review` mode. +- Do **not** place appendices before References. +- Do **not** drop the Limitations section when the target venue requires it. +- Do **not** use `$$ … $$` for display math. +- Do **not** override default caption sizes. + +--- + +## 11. Migration playbook (agent actions) + +1. **Reconnaissance** + - `listProjectTree({project:"source"})` + `listProjectTree({project:"target"})`. + - `readFile({project:"target", path:"acl_latex.tex"})` to read the *ACL shell as-is. + - `grepFile({project:"source", pattern:"\\\\documentclass|\\\\usepackage|\\\\author|\\\\title|\\\\bibliography(style)?|biblatex|figure\\*|table\\*|\\$\\$", glob:"*.tex"})`. +2. **Engine decision** + - Default to `pdflatex` + `acl_latex.tex`. + - If the source requires non-Latin scripts (Chinese, Arabic, Devanagari, etc.), migrate to `acl_lualatex.tex` or `acl_xelatex`-style preamble. Decide up front; mixing engines mid-migration is error-prone. +3. **Preamble transplant** + - Replace the source `\documentclass{...}` with `\documentclass[11pt]{article}` via `applyDiff`. + - Swap the style package to `\usepackage[review]{acl}` / `\usepackage{acl}` / `\usepackage[preprint]{acl}` based on intake. + - Remove any source `\usepackage{geometry}`, `\setlength{\textwidth}{...}`, `\oddsidemargin`, column overrides. 
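     An illustrative before/after for this sub-step (the removed lines are examples of what a source preamble might contain, not required content):

     ```latex
     % Delete from the source preamble (illustrative overrides):
     % \usepackage[margin=1in]{geometry}
     % \setlength{\textwidth}{7in}
     % \setlength{\columnsep}{0.25in}

     % Target keeps only the sanctioned header (see section 3 and the next sub-step):
     \documentclass[11pt]{article}
     \usepackage[review]{acl}
     ```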
+ - Keep/add `\usepackage{times}`, `\usepackage{microtype}`, `\usepackage{graphicx}`, `\usepackage{inconsolata}` to match the template's defaults. +4. **Body** + - Copy section content verbatim. Preserve `figure*` / `table*`. + - Ensure the tail order is: body → `\section*{Limitations}` → `\section*{Acknowledgments}` → `\bibliography{...}` → `\appendix`. +5. **Citations & bibliography** + - If source is `biblatex`: switch to `\bibliography{}`, replace `\autocite` / `\parencite` / `\textcite` with `\citep` / `\citet`, remove `\addbibresource{...}`. + - `grepFile({project:"target", pattern:"\\\\cite(alp|p|t|poss|yearpar)?\\{", glob:"*.tex"})` to audit coverage. + - `copyAsset` every `.bib` needed (and `anthology.bib` if the source merges it). +6. **Figures** + - `copyAsset` every image. Use `measureFigures` as guidance (see §8) and apply widths via `applyDiff`. +7. **Anonymisation sweep** (review mode) + - `grepFile` for names, affiliations, emails, grant IDs, public URLs, GitHub handles, dataset slugs. Redact via `applyDiff` or `writeFile`. + - Clear hyperref metadata if present. + - Verify the acknowledgements live in `\section*{Acknowledg(e)?ments}` so `acl.sty` can suppress them. +8. **Limitations / Ethics** + - Ensure `\section*{Limitations}` is present. If the source lacks one, insert a placeholder (`raiseQuestion` only if the intake explicitly leaves this ambiguous). + - Preserve any `\section*{Ethics Statement}` the source carries. +9. **Verification** + - `grepFile(pattern:"\\$\\$")` → zero hits. + - `grepFile(pattern:"\\\\usepackage\\{geometry\\}|\\\\setlength\\\\textwidth|\\\\setlength\\\\columnsep")` → zero hits. + - `grepFile(pattern:"\\\\appendix")` appears after `\\bibliography{...}`, not before. + - `grepFile(pattern:"\\\\section\\*\\{Limitations\\}")` → exactly one hit (when required). + - `grepFile(pattern:"\\\\bibliography\\{")` → exactly one hit; argument matches a `.bib` file that exists in the target. + +Prefer `applyDiff` for small surgical changes (style swap, `\documentclass` change, inserting Limitations). Use `writeFile` only for wholesale rewrites, e.g. when moving from a single-column `revtex` paper with a fundamentally different tail layout. + +Use `raiseQuestion` sparingly: only when the choice changes the output and cannot be derived from `transferIntake` / file contents (e.g. which *ACL venue the submission targets when the intake says only “acl”, whether to keep a `figure*` span, whether to keep or remove an Ethics Statement inherited from another venue). + +--- + +## 12. Pre-submission checklist + +- [ ] `\documentclass[11pt]{article}` and the correct `acl` option (`[review]` / none / `[preprint]`) per intake. +- [ ] `acl.sty`, `acl_natbib.bst` unmodified. +- [ ] No `\usepackage{geometry}`, no manual `\textwidth` / `\columnsep` / margins; `\setlength\titlebox{...}` (if used) is ≥ 5 cm. +- [ ] Two-column layout preserved; `figure*` / `table*` kept for full-width art; default caption sizes intact. +- [ ] Tail order: body → Limitations (unnumbered) → Acknowledgments (unnumbered) → References → `\appendix` → appendix sections. +- [ ] Citations use `\citep` / `\citet` / `\citealp` / `\citeyearpar`; `\bibliography{...}` points to an existing `.bib`; DOIs / URLs populated where possible. +- [ ] No `$$ … $$` in the body. +- [ ] Double-blind sweep (review): no names, affiliations, identifying URLs, GitHub handles, acknowledgements in body; `hyperref` metadata clean; line numbers untouched in source. +- [ ] Limitations section present (when required by the venue). 
+- [ ] Engine matches the target shell (`pdflatex` ↔ `acl_latex.tex`; `lualatex`/`xelatex` ↔ `acl_lualatex.tex`). +- [ ] `pdflatex` (or chosen engine) compiles cleanly (at least two passes). diff --git a/apps/backend/src/services/transferAgent/rules/cvpr.md b/apps/backend/src/services/transferAgent/rules/cvpr.md new file mode 100644 index 0000000..1e54790 --- /dev/null +++ b/apps/backend/src/services/transferAgent/rules/cvpr.md @@ -0,0 +1,223 @@ +# CVPR 2026 — Venue Handbook (OpenPrism) + +Authoritative rules for `venueId = cvpr`. Loaded verbatim by +`loadVenueRules('cvpr')` and injected as `CVPR_FULL_HANDBOOK` into the +skill system prompt. Ground every decision in this file plus the actual +CVPR template under `templates/cvpr/`. + +--- + +## 1. Template layout on disk + +| Path | Purpose | +|------|---------| +| `templates/cvpr/main.tex` | Paper shell; `pdflatex` entry point. | +| `templates/cvpr/cvpr.sty` | Official style (two-column, letterpaper, `\textwidth=6.875in`, `\textheight=8.875in`). Offers `review`, `pagenumbers` options. **Never modify.** | +| `templates/cvpr/preamble.tex` | Optional tweaks (TODO macros, `microtype`, spacing). Loaded via `\input{preamble}`. | +| `templates/cvpr/ieeenat_fullname.bst` | CVPR bibliography style (numeric, full author names). | +| `templates/cvpr/main.bib` | Sample bibliography. | +| `templates/cvpr/rebuttal.tex` | Rebuttal template — only relevant after reviewer feedback. | +| `templates/cvpr/sec/0_abstract.tex` etc. | Section files already split by `\input{sec/...}`. | +| `templates/cvpr/sec/X_suppl.tex` | Supplementary material — **do not** include it in the main submission by default. | + +Tool usage: + +- `listProjectTree({project:"target"})` to confirm the `sec/` directory and `.sty/.bst` files exist. +- `readFile({project:"target", path:"main.tex"})` and, as needed, each `sec/*.tex`. + +--- + +## 2. Submission modes (`\usepackage` options) + +| Intake flags | Required preamble | +|---|---| +| `doubleBlind=true` (review submission) | `\usepackage[review]{cvpr}` | +| Camera-ready (accepted paper) | `\usepackage{cvpr}` | +| Preprint / arXiv (non-anonymous, numbered pages) | `\usepackage[pagenumbers]{cvpr}` | + +Facts: + +- `review` mode enables anonymisation + page numbers + the CVPR ruler/line numbers on the left margin. +- Camera-ready mode hides page numbers automatically. +- `pagenumbers` forces page numbers without enabling review annotations. + +--- + +## 3. Document class & preamble + +Required first line (matches `templates/cvpr/main.tex`): + +```latex +\documentclass[10pt,twocolumn,letterpaper]{article} +``` + +Required package/preamble order: + +```latex +\usepackage[review]{cvpr} % or \usepackage{cvpr} for camera-ready +\input{preamble} % optional per-paper tweaks +\definecolor{cvprblue}{rgb}{0.21,0.49,0.74} +\usepackage[pagebackref,breaklinks,colorlinks,allcolors=cvprblue]{hyperref} + +\def\paperID{*****} % reviewer-supplied paper ID +\def\confName{CVPR} +\def\confYear{2026} +``` + +Hard rules: + +1. `\documentclass` **must** be `\documentclass[10pt,twocolumn,letterpaper]{article}`. Reject `IEEEtran`, `revtex*`, `acmart`, `llncs`, etc. +2. Load `hyperref` **after** `cvpr.sty` (the template comment warns against disabling it). `pagebackref,breaklinks,colorlinks,allcolors=cvprblue` are the expected options for review/final. +3. **Never modify `cvpr.sty`** or `ieeenat_fullname.bst`. The sty enforces `\textwidth=6.875in` and `\textheight=8.875in`. +4. 
Never load `geometry` or set `\textwidth`, `\columnsep`, `\oddsidemargin`, `\topmargin` by hand. +5. Keep `\paperID`, `\confName`, `\confYear` present; `\paperID` must be populated with the reviewer-supplied ID before submission. + +--- + +## 4. Title, authors, anonymity + +- Review mode shows a generated anonymous author block; the real `\author{...}` is only rendered in camera-ready. +- In review mode, remove every identifying string: names, affiliations, acknowledgements, public URLs, GitHub handles, dataset slugs, grant numbers. +- Self-cite in the third person (“Smith et al. [12] …”), never “our prior work [12]”. +- Hyperref metadata: if the source sets `pdfauthor=` / `pdftitle=` / `pdfkeywords=`, strip or blank them for review. + +--- + +## 5. Document structure (follows the template) + +The shell already splits content into `sec/*.tex`: + +```latex +\begin{document} +\maketitle +\input{sec/0_abstract} +\input{sec/1_intro} +\input{sec/2_formatting} +\input{sec/3_finalcopy} +{ + \small + \bibliographystyle{ieeenat_fullname} + \bibliography{main} +} +% \input{sec/X_suppl} % ← DO NOT ship in the main submission +\end{document} +``` + +When migrating: + +- Either keep this split and populate the section files from the source, or inline everything into `main.tex` — match whatever the source uses. Fewer files is fine if the source has ≤ 3 logical sections. +- **Never** include `sec/X_suppl.tex` or any other supplementary file inside the main submission unless the user explicitly asks. Supplementary PDF goes as a separate upload. + +Content order rules: + +1. `\maketitle`. +2. Abstract (`\begin{abstract} … \end{abstract}`), single paragraph. +3. Main body sections. +4. References (inside `{\small \bibliographystyle{ieeenat_fullname} \bibliography{main}}`). +5. Optional appendix / supplementary — again, do **not** inline in main for submission. + +--- + +## 6. Page budget + +- Main body: typically **8 pages excluding references**. Camera-ready usually extends by 1 page for references. Always check the current call for papers; if intake notes disagree with the template, call `raiseQuestion`. +- References do not count toward the main-body limit. +- Supplementary material is a **separate PDF**, not appended to the main submission. + +--- + +## 7. Citations & bibliography + +- Bibliography style is `ieeenat_fullname` (numeric `[1, 2, 3]` style with full author names). Use `\bibliographystyle{ieeenat_fullname}` + `\bibliography{main}` (or whatever `.bib` the source uses — rename the argument, not the style). +- Wrap the bibliography in `{\small … }` as in the template. +- Do not anonymise the reference list. Self-citations retain their real author names; anonymisation is satisfied by third-person narration in the body. +- If the source uses `biblatex`, migrate to BibTeX. `copyAsset` the `.bib` file(s) into the target, then replace `\printbibliography` + `\addbibresource{...}` with the CVPR pattern. + +--- + +## 8. Figures & tables (two-column layout) + +- CVPR is **two-column**. `\textwidth = 6.875in` ≈ 496 pt; each column ≈ 237 pt. +- Keep `figure` for single-column graphics and `figure*` for full-width art that must span both columns. **Do not** flatten `figure*`/`table*` to single-column. +- Caption below figures, above tables. Floats (especially `figure*`/`table*`) may only appear at the top or bottom of a page. +- Use `\includegraphics[width=\linewidth]{...}` inside a one-column `figure`; in `figure*`, `\linewidth = \textwidth` so the same command spans both columns. 
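  A minimal sketch of a full-width CVPR float under these rules (path and label are placeholders):

  ```latex
  % figure* floats may only land at the top or bottom of a page.
  \begin{figure*}[t]
    \centering
    % inside figure*, \linewidth equals \textwidth (6.875in)
    \includegraphics[width=\linewidth]{figures/overview.pdf} % placeholder path
    \caption{Full-width figure spanning both columns.}
    \label{fig:overview}
  \end{figure*}
  ```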
+- `\cref`-style references are encouraged (the template supports them via `cleveref` when loaded in `preamble.tex`). +- When migrating from a single-column source (`article`, `revtex` single, `neurips`, `llncs`): call `measureFigures({sourceClass:, sourceTwocolumn:false, targetClass:"cvpr", figures:[...]})` and apply the recommended widths via `applyDiff`. + +Tool usage: + +- `copyAsset(srcPath, destPath)` every image file referenced by `\includegraphics` and every `.bib` that the source relied on. +- After copies, `listProjectTree({project:"target"})` and `grepFile({project:"target", pattern:"\\\\includegraphics"})` to confirm paths resolve. + +--- + +## 9. Math, typography, hyperlinks + +- Use LaTeX/AMS math environments (`equation`, `align`, `gather`). Never use `$$ … $$`. +- Do not redefine section fonts or spacing — the sty controls them. +- Keep `hyperref` enabled with the CVPR preset (`pagebackref,breaklinks,colorlinks,allcolors=cvprblue`). Disable only as a last resort when a genuine compile blocker appears. If you do disable it, delete the project's `*.aux` files before the next compile. +- Keep references in the bibliography and `\label`/`\ref` targets intact; migration must not break cross-references. + +--- + +## 10. Critical DON'Ts (desk-rejection risks) + +- Do **not** modify `cvpr.sty` or `ieeenat_fullname.bst`. +- Do **not** add `\usepackage{geometry}` or change `\textwidth`, `\columnsep`, `\oddsidemargin`, `\topmargin`. +- Do **not** drop two-column layout or convert `figure*`/`table*` to single-column wholesale. +- Do **not** include `sec/X_suppl.tex` (or any supplementary content) in the main review PDF by default. +- Do **not** leave `\paperID{*****}` in the final review submission — replace with the real ID (if intake notes provide one; otherwise `raiseQuestion`). +- Do **not** use `$$ … $$` for display math. +- Do **not** expose author names, affiliations, acknowledgements, or identifying URLs in review mode. + +--- + +## 11. Migration playbook (agent actions) + +1. **Reconnaissance** + - `listProjectTree({project:"source"})` + `listProjectTree({project:"target"})`. + - `readFile({project:"target", path:"main.tex"})`; `readFile({project:"target", path:"preamble.tex"})`. + - `grepFile({project:"source", pattern:"\\\\documentclass|\\\\usepackage|\\\\bibliographystyle|biblatex|\\$\\$|\\\\author|\\\\affiliation", glob:"*.tex"})`. +2. **Preamble transplant** + - Replace the source `\documentclass{...}` with `\documentclass[10pt,twocolumn,letterpaper]{article}` via `applyDiff`. + - Swap the style package to `\usepackage[review]{cvpr}` (or `\usepackage{cvpr}` / `\usepackage[pagenumbers]{cvpr}` based on intake). + - Keep the hyperref line as in the template. Remove any source `\usepackage{geometry}` / custom margin commands. +3. **Body** + - Move section content into the matching `sec/*.tex` files or inline into `main.tex`, whichever matches the source shape. + - Preserve `figure*` and `table*` — do not downgrade them. +4. **Citations & bibliography** + - If source is `biblatex`: switch to `\bibliographystyle{ieeenat_fullname}` + `\bibliography{}`. Replace `\autocite` / `\parencite` / `\textcite` with `\cite` / `\citep` / `\citet` as appropriate. Citations are numeric under CVPR, so `\cite{...}` prints as `[N]`. + - Confirm the `.bib` file is present in target and that its name matches the `\bibliography{...}` argument. +5. **Figures** + - `copyAsset` every image. Then `measureFigures` with `targetClass:"cvpr"` and apply recommended widths via `applyDiff`. +6. 
**Double-blind sweep** (review mode) + - `grepFile` for names, affiliations, acknowledgements, grant numbers, GitHub URLs, dataset paths that expose identity. + - Also clear hyperref metadata (`pdfauthor=`, `pdftitle=`, `pdfkeywords=`) or set `pdfauthor={}` — `\hypersetup{pdfauthor={}}` is safe to add. +7. **Paper ID** + - Replace `\def\paperID{*****}` with the real ID if intake notes provide one. Otherwise leave the placeholder for the user and flag it via `raiseQuestion` **only if the intake notes are ambiguous**. +8. **Verification** + - `grepFile(pattern:"\\$\\$")` → zero hits. + - `grepFile(pattern:"\\\\usepackage\\{geometry\\}|\\\\setlength\\\\textwidth|\\\\setlength\\\\columnsep")` → zero hits. + - `grepFile(pattern:"sec/X_suppl")` → must not be `\input`'d uncommented. + - `grepFile(pattern:"\\\\bibliographystyle\\{ieeenat_fullname\\}")` → exactly one hit. + +Prefer `applyDiff` for small surgical changes (style swap, `\documentclass` change, single `\bibliographystyle` update). Fall back to `writeFile` only for a full rewrite — e.g. converting from a single-column layout to CVPR's two-column shell. + +Use `raiseQuestion` sparingly: only when intake notes cannot resolve a genuine fork (e.g. whether a figure should stay as `figure*` because the source had a custom two-column override, or when the user forgot to provide a paper ID). + +--- + +## 12. Pre-submission checklist + +- [ ] `\documentclass[10pt,twocolumn,letterpaper]{article}` present. +- [ ] Correct `cvpr` option: `[review]` / none / `[pagenumbers]` per intake. +- [ ] `cvpr.sty` and `ieeenat_fullname.bst` unmodified. +- [ ] `\paperID` populated (for review submissions). +- [ ] `hyperref` loaded with the CVPR preset (or deliberately disabled with `*.aux` cleaned). +- [ ] Two-column layout preserved; `figure*` / `table*` kept for full-width art. +- [ ] Captions: below figures, above tables; `\cref`/`\ref` targets intact. +- [ ] `{\small \bibliographystyle{ieeenat_fullname} \bibliography{}}` present; numeric citations. +- [ ] No `$$ … $$`; no `\usepackage{geometry}`; no manual `\textwidth`/`\columnsep`. +- [ ] Double-blind sweep done (review): names, affiliations, ack, URLs, GitHub handles, hyperref metadata all clean. +- [ ] `sec/X_suppl.tex` (or any supplementary) **not** `\input`'d in the main submission. +- [ ] `pdflatex` compiles cleanly (at least two passes) on US Letter. diff --git a/apps/backend/src/services/transferAgent/rules/icml.md b/apps/backend/src/services/transferAgent/rules/icml.md new file mode 100644 index 0000000..15f3f7f --- /dev/null +++ b/apps/backend/src/services/transferAgent/rules/icml.md @@ -0,0 +1,217 @@ +# ICML 2026 — Venue Handbook (OpenPrism) + +Authoritative rules for `venueId = icml`. Loaded verbatim by +`loadVenueRules('icml')` and injected as `ICML_FULL_HANDBOOK` into the +skill system prompt. Ground all decisions in this file plus the actual +template under `templates/icml/`. + +--- + +## 1. Template layout on disk + +| File | Purpose | +|------|---------| +| `templates/icml/main.tex` | Paper shell and `pdflatex` entry. | +| `templates/icml/icml2026.sty` | Official two-column style. `textwidth=487.8225pt` is enforced; the sty warns if it is altered. **Never modify.** | +| `templates/icml/icml2026.bst` | Official bibliography style (APA author-year). | +| `templates/icml/references.bib` | Sample bibliography. | +| `templates/icml/algorithm.sty`, `algorithmic.sty`, `fancyhdr.sty` | Bundled dependencies; copy as-is if the target project needs them. 
|

Tool usage:

- `listProjectTree({project:"target"})` to confirm the five style/bst files are present.
- `readFile({project:"target", path:"main.tex"})` before any edit.

---

## 2. Submission modes (`\usepackage` options)

`icml2026.sty` defines the following options:

| Intake flags | Required preamble |
|---|---|
| `doubleBlind=true` (default anonymous submission) | `\usepackage{icml2026}` |
| `preprint=true` (non-anonymous, page numbers) | `\usepackage[preprint]{icml2026}` |
| Camera-ready (accepted paper) | `\usepackage[accepted]{icml2026}` |
| Package clash with `hyperref` | append `,nohyperref` |

Facts:

- Anonymous mode shows `Anonymous Authors` regardless of the `\icmlauthor` metadata and hides acknowledgements.
- The style auto-loads `natbib` and `newtx`/`times`, plus line numbers in review mode. Do not add them again.

---

## 3. Document class & preamble

Required structure (matches `templates/icml/main.tex`):

```latex
\documentclass{article}
\usepackage{icml2026}

\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{url}
\usepackage{booktabs}
\usepackage{amsfonts}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{microtype}
\usepackage{graphicx}
\usepackage{xcolor}
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{hyperref}

\icmltitlerunning{}
```

Hard rules:

1. `\documentclass` **must be `article`**. Reject `revtex*`, `IEEEtran`, `acmart`, `llncs`, etc.
2. Never load `geometry`, never set `\textwidth`, `\columnsep`, `\oddsidemargin` by hand. The sty enforces `\textwidth=487.8225pt` and flushbottom+twocolumn layout; any alteration triggers a warning and may be grounds for desk rejection.
3. Never modify `icml2026.sty` (or its bundled `algorithm.sty`, `algorithmic.sty`, `fancyhdr.sty`).

---

## 4. Title, authors, anonymity

Use the ICML macros, not `\author{}`:

```latex
\twocolumn[
  \icmltitle{Paper Title}
  \icmlsetsymbol{equal}{*}
  \begin{icmlauthorlist}
    \icmlauthor{First Author}{inst1}
    \icmlauthor{Second Author}{inst2}
  \end{icmlauthorlist}
  \icmlaffiliation{inst1}{Department, University, City, Country}
  \icmlaffiliation{inst2}{Department, University, City, Country}
  \icmlcorrespondingauthor{First Author}{email@domain}
  \icmlkeywords{Machine Learning, ICML}
  \vskip 0.3in
]
\printAffiliationsAndNotice{}
```

- Always call `\printAffiliationsAndNotice{}` somewhere in the body, otherwise the style emits an end-of-document warning.
- Double-blind hygiene: remove author names, affiliations, identifying URLs, GitHub handles, dataset slugs, grant numbers. Self-cite in the third person (“Jones et al. (2022) showed …”), consistent with the venue's author-year citation style (§6). Keep `\icmlauthor` / `\icmlaffiliation` populated in source — they are hidden by the style in anonymous mode and surface when `[accepted]` is set.

---

## 5. Document structure & page budget

Required ordering (does **not** all count toward the page limit):

1. Title, `\icmlauthorlist`, `\icmlaffiliation`, `\icmlcorrespondingauthor`, `\icmlkeywords`.
2. `\begin{abstract} … \end{abstract}` — single paragraph, 4–6 sentences.
3. Main body sections. **Max 8 pages.** Camera-ready gets one extra page (9 total).
4. `\section{Impact Statement}` — mandatory, effectively unnumbered, placed before References. If the paper does not discuss impacts, state so in a sentence; do not omit the section.
5. `\section*{Acknowledgements}` — hidden in anonymous mode; visible with `[accepted]`.
6.
`\bibliographystyle{icml2026}` then `\bibliography{}` (no `natbib` style swap). +7. Appendix (optional) via `\appendix`. Submitted in the **same PDF**, never a separate file. `\onecolumn` after `\appendix` is allowed if the appendix would otherwise be cramped. + +What does **not** count toward the 8-page limit: Impact Statement, Acknowledgements, References, Appendix. + +--- + +## 6. Citations & bibliography + +- Citation style is **APA author-year**, NOT numeric. +- `icml2026.sty` auto-loads `natbib`. Use: + - `\citet{key}` for narrative: `Jones et al. (2022) showed …`. + - `\citep{key}` for parenthetical: `… (Jones et al., 2022)`. + - `\citeauthor`, `\citeyear`, `\citealp` for variants. +- `\bibliographystyle{icml2026}` + `\bibliography{}` — do **not** swap to `plainnat`, `unsrtnat`, `IEEEtran`, etc. +- If the source uses `biblatex`, migrate to BibTeX + `icml2026.bst`. Copy `.bib` files with `copyAsset`. +- Do not anonymise the reference list. Self-citations stay under their real authorship; the anonymisation requirement is satisfied by third-person narration in the body. +- Alphabetise the `.bib` by first-author surname for APA-like output. + +--- + +## 7. Figures & tables (two-column layout) + +- ICML is **two-column**. `\textwidth ≈ 487.8 pt` (column width ≈ 233 pt). +- Keep `figure` for single-column art and `figure*` for full-width art that must span both columns. **Do not** convert `figure*`/`table*` to single-column; they are needed for wide panels/tables. +- Caption below figures, above tables; captions are 9 pt (set by the style). +- Two-column floats (`figure*`, `table*`) may only be placed at the top or bottom of a page. +- Do not put titles inside the figure graphics — rely on captions. +- Use `\includegraphics[width=\linewidth]{...}` inside a one-column `figure`; in `figure*`, `\linewidth = \textwidth` so the same command spans both columns. +- Call `measureFigures({sourceClass:, sourceTwocolumn:, targetClass:"icml", figures:[...]})` after migrating from another venue — the tool returns a recommended width ratio based on ICML's geometry. + +Tool usage: + +- `copyAsset(srcPath, destPath)` for every `\includegraphics` target (`.pdf/.png/.jpg`) and every `.bib` / `.bst` file the source relied on. + +--- + +## 8. Math, algorithms, typography + +- Use LaTeX/AMS math environments (`equation`, `align`, `gather`, …). Never use `$$ … $$` — it breaks `lineno` in review mode. +- Pseudocode: `algorithm` + `algorithmic` (already loaded by the sample preamble). Keep algorithms inside floats (`\begin{algorithm}[tb]`). +- 10 pt Times (newtx) body is enforced by the sty. Do not override fonts. +- Headings: `\section` 11 pt bold, content words capitalised; `\subsection` 10 pt bold; `\subsubsection` 10 pt small caps. Do not go deeper than three levels. +- Footnotes are 9 pt at column bottom — keep them rare. + +--- + +## 9. Critical DON'Ts (any of these is a desk-rejection risk) + +- Do **not** include author info in the anonymous submission (names, affiliations, emails, identifying URLs, GitHub profiles, acknowledgements, grant IDs). +- Do **not** modify `icml2026.sty`, `icml2026.bst`, `algorithm.sty`, `algorithmic.sty`, or `fancyhdr.sty`. +- Do **not** add `\usepackage{geometry}` or hand-set text width / margins — the sty refuses. +- Do **not** use Type-3 fonts. `pdflatex` with the provided sty is correct. +- Do **not** use `$$ … $$` for display math. +- Do **not** split the appendix into a separate PDF; append it with `\appendix` in the same file. +- Do **not** drop the Impact Statement. 
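As one positive counterpart to these DON'Ts, §8's pseudocode rule in a minimal float-wrapped sketch (the algorithm body is a placeholder, written in the bundled `algorithmic` syntax):

```latex
\begin{algorithm}[tb]
  \caption{Illustrative training loop}
  \label{alg:train}
  \begin{algorithmic}
    \STATE {\bfseries Input:} data $x$, step size $\eta$
    \REPEAT
      \STATE $\theta \leftarrow \theta - \eta \nabla_\theta L(\theta; x)$
    \UNTIL{converged}
  \end{algorithmic}
\end{algorithm}
```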
+ +--- + +## 10. Migration playbook (agent actions) + +1. **Reconnaissance** + - `listProjectTree({project:"source"})` + `listProjectTree({project:"target"})`. + - `readFile({project:"target", path:"main.tex"})` to read the ICML shell as-is. + - `grepFile({project:"source", pattern:"\\\\documentclass|\\\\usepackage|\\\\title|\\\\author|\\\\bibliographystyle|biblatex|figure\\*|table\\*|\\$\\$", glob:"*.tex"})` to spot trouble areas at once. +2. **Preamble / title block** + - Replace source `\documentclass{...}` with `\documentclass{article}` + `\usepackage{icml2026}` via `applyDiff`. + - Convert `\author{...}` (and any `\affiliation{...}`, `\email{...}`) into `\icmlauthorlist` / `\icmlaffiliation` / `\icmlcorrespondingauthor`. For anonymous submissions, keep the real data inside these macros — the style hides them. +3. **Body** + - Copy content verbatim. **Preserve `figure*` / `table*`.** Remove custom `\geometry{...}`, `\oddsidemargin`, column overrides. + - Ensure `\printAffiliationsAndNotice{}` is called after the title block (the shell already does this). +4. **Impact Statement** + - Add or keep `\section{Impact Statement}` before References. If the source has no equivalent, insert a short placeholder the user can expand. Consider `raiseQuestion` only if intake notes are ambiguous about societal impact posture. +5. **Citations & bibliography** + - If the source uses `biblatex`, switch to `\bibliographystyle{icml2026}` + `\bibliography{...}`. Replace every `\autocite`, `\parencite`, `\textcite` with the natbib equivalents. + - Enforce author-year via `\citep`/`\citet`. `grepFile(pattern:"\\\\cite(alp|p|t|author|year)?\\{")` to audit. +6. **Assets** + - `copyAsset` every image and every `.bib` referenced by the source. After copying, rerun `listProjectTree({project:"target"})`. +7. **Figures** + - Call `measureFigures` with `targetClass:"icml"` and the source class. Update widths via `applyDiff`. +8. **Double-blind sweep** + - `grepFile` for real names, affiliations, emails, public URLs, GitHub handles. Redact anything that survives in body text, captions, comments, or hyperref metadata. +9. **Verification** + - `grepFile(pattern:"\\$\\$")` → zero hits expected. + - `grepFile(pattern:"\\\\usepackage\\{geometry\\}|\\\\setlength\\\\textwidth|\\\\setlength\\\\columnsep")` → zero hits. + - `grepFile(pattern:"\\\\section\\*?\\{Impact Statement\\}")` → exactly one hit. + - `grepFile(pattern:"\\\\bibliographystyle\\{icml2026\\}")` → one hit. + +Use `raiseQuestion` only when a choice materially changes output and cannot be inferred from `transferIntake` / file contents (e.g., whether to keep a figure as `figure*` because the user resized it). + +--- + +## 11. Pre-submission checklist + +- [ ] `\documentclass{article}` and `\usepackage{icml2026}` (anonymous) or `\usepackage[accepted]{icml2026}` (camera-ready) or `\usepackage[preprint]{icml2026}`. +- [ ] Two-column layout preserved; `figure*` / `table*` used for full-width art. +- [ ] `\icmlauthorlist` / `\icmlaffiliation` / `\icmlcorrespondingauthor` populated; `\printAffiliationsAndNotice{}` present. +- [ ] Abstract is a single paragraph, 4–6 sentences. +- [ ] Main body ≤ 8 pages (camera-ready may use 9). +- [ ] `\section{Impact Statement}` present before References. +- [ ] `\bibliographystyle{icml2026}` + `\bibliography{}`; citations are author-year (`\citet` / `\citep`). +- [ ] No `$$ … $$`, no `\usepackage{geometry}`, no `\textwidth` overrides. 
+- [ ] Double-blind sweep done: no identifying data, no acknowledgements in anonymous mode, no identifying URLs. +- [ ] Appendix (if any) in the same PDF via `\appendix`. +- [ ] `pdflatex` compiles cleanly (at least two passes); no Type-3 fonts. diff --git a/apps/backend/src/services/transferAgent/rules/neurips.md b/apps/backend/src/services/transferAgent/rules/neurips.md new file mode 100644 index 0000000..5136113 --- /dev/null +++ b/apps/backend/src/services/transferAgent/rules/neurips.md @@ -0,0 +1,304 @@ +# NeurIPS 2026 — Venue Handbook (OpenPrism) + +This file is the authoritative handbook for any `venueId = neurips` transfer. +It is loaded verbatim by `loadVenueRules('neurips')` and injected as +`NEURIPS_FULL_HANDBOOK` into the venue-skill system prompt. Treat conflicts +with the live NeurIPS site as cues to ask the user (`raiseQuestion`) — do +**not** try to fetch external pages from within a transfer run. + +> Line numbers ≠ preprint. The default anonymous submission *does* ship with +> line numbers (`lineno`). The `preprint` option *disables* them and reveals +> authors. See §2. + +--- + +## 1. Template layout on disk + +The reference NeurIPS workspace lives in `templates/neurips/` and contains +exactly four files. When migrating, copy/adapt these into the target project +and never edit the `.sty`. + +| File | Purpose | +|------|---------| +| `main.tex` | Paper shell. `pdflatex` entry point. | +| `neurips_2026.sty` | Official style. Sets `letterpaper`, `textwidth=5.5in`, `textheight=9in`, `\normalsize = 10pt/11pt leading`. **Never modify.** | +| `checklist.tex` | NeurIPS Paper Checklist; `\input{checklist.tex}` at end of `main.tex`. Required for conference submissions — removing it causes **desk rejection**. | +| `references.bib` | Sample bibliography. | + +Tool usage: + +- Use `listProjectTree({project:"target"})` first to confirm the four files landed in the target workspace. +- Use `readFile({project:"target", path:"main.tex"})` (or with `startLine/endLine` for partial reads) before every edit. + +--- + +## 2. Submission modes (`\usepackage` options) + +NeurIPS 2026 uses **one** style package with several mutually exclusive options declared by `neurips_2026.sty`: `main` (default), `position`, `eandd` (with optional `nonanonymous`), `creativeai`, `sglblindworkshop`, `dblblindworkshop`, plus the modifiers `final`, `preprint`, `nonatbib`. + +Pick the option from `transferIntake`: + +| Intake flags | Required line in `main.tex` | +|---|---| +| `doubleBlind=true`, `preprint=false` (default conference submission) | `\usepackage[main]{neurips_2026}` (or `\usepackage{neurips_2026}`, equivalent) | +| `doubleBlind=false`, `preprint=true` (arXiv / non-anonymous) | `\usepackage[preprint]{neurips_2026}` | +| Camera-ready (accepted paper) | `\usepackage[main,final]{neurips_2026}` | +| Workshop track | `\usepackage[sglblindworkshop]{neurips_2026}` **and** `\workshoptitle{...}` | +| Package clash with natbib | append `,nonatbib`, e.g. `\usepackage[preprint,nonatbib]{neurips_2026}` | + +Anonymous/main mode facts: + +- Line numbers (`lineno`) are auto-loaded. Never quote them in the body — they disappear at acceptance. +- The author block renders as **Anonymous Author(s)** regardless of what is inside `\author{...}` (so keep the real author metadata in source for camera-ready). +- The `ack` environment is present in the source but **hidden** in the anonymous PDF; write the acknowledgements now, they will surface with `final`. +- First-page footer reads “Submitted to … NeurIPS …. 
Do not distribute.” + +Preprint mode facts: + +- No line numbers. Real authors visible. +- Footer reads “Preprint.”; `ack` renders normally. +- Never use `[final]` on an un-accepted paper. Do not declare the target conference in a preprint. + +--- + +## 3. Document class & preamble + +Required preamble header (matches `templates/neurips/main.tex`): + +```latex +\PassOptionsToPackage{numbers,compress,sort}{natbib} +\documentclass{article} +\usepackage[main]{neurips_2026} +\usepackage[utf8]{inputenc} +\usepackage[T1]{fontenc} +\usepackage{url} +\usepackage{booktabs} +\usepackage{amsfonts} +\usepackage{amsmath} +\usepackage{amssymb} +\usepackage{nicefrac} +\usepackage{microtype} +\usepackage{xcolor} +\usepackage{graphicx} +\usepackage[hidelinks]{hyperref} +\hypersetup{pdfauthor={}} % anonymity +``` + +Hard rules: + +1. `\documentclass` **must be `article`**. Reject `revtex*`, `amsart`, `IEEEtran`, `acmart`, `llncs`, etc. +2. `\PassOptionsToPackage{numbers,compress,sort}{natbib}` must appear **before** `\documentclass` whenever numeric `[1, 2, 4–7]` citations are wanted. +3. Load `hyperref` late in the preamble (after `amsmath`, `graphicx`). For anonymous submissions keep `\hypersetup{pdfauthor={}}` to avoid PDF metadata leaks. +4. Never load `geometry` with A4 or custom margins. Never edit geometry inside `neurips_2026.sty` (altering textwidth, textheight, font sizes → **desk rejection**). + +--- + +## 4. Page budget + +- Main text: **≤ 9 pages including figures**. Papers exceeding 9 pages are not reviewed. +- **Not counted** toward the 9-page limit: acknowledgements, references, NeurIPS Paper Checklist, optional technical appendix. +- Recommended tail order in `main.tex` (matches the template): + `\clearpage → \begin{ack}...\end{ack} → \section*{References} → \newpage → \input{checklist.tex}`. + Appendices, if any, go before `References` or after `checklist.tex` per the current submission-system instructions; they do not count toward 9 pages but the main body must stand alone. +- Use `\input{sections/...}` to split long papers; do **not** duplicate `\documentclass` or `\usepackage{neurips_2026}` inside sub-files. + +--- + +## 5. Typography & layout (for sanity checks — do not re-implement) + +`neurips_2026.sty` sets: + +- `letterpaper`, `\textwidth = 5.5in` (33 pc), `\textheight = 9in` (54 pc), left margin 1.5 in. +- Body text 10 pt, leading 11 pt. Half-line paragraph spacing, no first-line indent. +- Title: ~17 pt bold, centered, initial caps + lower case, with top rule 4 pt and bottom rule 1 pt. +- Section heading sizes: `\section` 12 pt, `\subsection` 10 pt, `\subsubsection` 10 pt (all lower-case except sentence start & proper nouns, left-aligned, bold). +- `\paragraph`: bold, inline, 1 em space after the heading word. + +The agent should never emit commands that override these (`\setlength{\textwidth}...`, `\geometry{...}`, custom `\renewcommand\normalsize`, etc.). Delete them when migrating. + +--- + +## 6. Title, authors, anonymity + +- Keep `\title{...}` and a real `\author{...}` in source. In `[main]` mode the PDF shows the anonymous placeholder; in `[preprint]` or `[main,final]` your metadata renders. +- Author separators: `\and` (LaTeX decides line break); `\And` (no hard break); `\AND` (force new author row). +- `\thanks{...}` is for extra author info (homepages, primary-contact notes), **not** funding/acknowledgements — those belong in `ack`. +- Double-blind text hygiene (§9.2): refer to your own prior work in the third person (“Jones et al. [4] showed …”). 
Never write “our previous work [4]”. Do not expose identifying URLs, dataset slugs, institution references, or self-revealing GitHub handles. + +--- + +## 7. Abstract + +- Single paragraph only. +- Heading `Abstract` is centered, bold, ≈ 12 pt (handled by the style). +- Left/right indent ≈ 0.5 in (3 pc). Body text 10 pt / 11 pt leading. +- Approx 2-line vertical gap before the abstract body. + +--- + +## 8. Citations & bibliography + +- `neurips_2026` auto-loads `natbib`. Pick one citation style — numeric **or** author-year — and stay consistent throughout. +- For numeric, compressed and sorted ranges (`[1, 3–6]`), the preamble must contain: + `\PassOptionsToPackage{numbers,compress,sort}{natbib}` **before** `\documentclass`. +- For author-year, omit `numbers` and use `\citet{…}` for narrative, `\citep{…}` for parenthetical. +- If the source project uses `biblatex`, either: + - Migrate to BibTeX + a natbib-compatible `.bst` (preferred), or + - Use `\usepackage[nonatbib]{neurips_2026}` and bring back whatever the source used. + In either case copy `.bib` files via `copyAsset`, not `writeFile`. +- Handwritten `thebibliography` items need the optional label for author-year, e.g. + `\bibitem[Hasselmo et al.(1995)]{hasselmo}`. +- Place the bibliography after `ack` and before the checklist. Use `\section*{References}` and wrap the list in `{\small …}` (≈ 9 pt) to save space. + +--- + +## 9. Figures & tables + +- NeurIPS is **single column**. Convert every `figure*`/`table*` from the source to `figure`/`table`. +- Caption convention: label below figures, above tables. Use sentence case. Every caption must state one key take-away in addition to describing the panel. +- Recommended graphics call: + `\includegraphics[width=\linewidth]{path}` or a fraction thereof (`0.8\linewidth`). +- **Never** use `\special` for positioning. Use `graphicx`. +- Use `measureFigures` with `targetClass:"neurips"` to pick a sane width when migrating from a two-column source (CVPR/ICML/acmart/IEEEtran/revtex). The tool returns a recommended `\linewidth` ratio based on textwidth deltas. +- `copyAsset(srcPath, destPath)` any image file (`.pdf/.png/.jpg/.eps`) that the source `\includegraphics` references — then verify the path resolves from the target root. + +Float overflow (“all figures end up at the end”): + +1. Prefer placement specifiers `[!htbp]`, not bare `[t]`. +2. Place the `figure` environment near the first `\ref{fig:…}` in the source, not in its own later block. +3. In the main file (never in the `.sty`), tune float fractions: + ```latex + \renewcommand{\topfraction}{0.9} + \renewcommand{\bottomfraction}{0.8} + \renewcommand{\textfraction}{0.1} + ``` +4. Use `\FloatBarrier` (from `placeins`) between sections to stop drift. +5. Reserve `[H]` (from `float`) for desperate cases only — it often leaves whitespace. + +--- + +## 10. Math + +- Line numbers interact badly with TeX `$$ … $$`. Always use `equation`, `align`, `gather`, `equation*`, etc. +- Even in preprint mode, keep LaTeX environments so behaviour is the same when you later switch to `[main]`. +- `amsmath` and `amsfonts` are part of the template preamble; use `\mathbb{…}` instead of `bbold`. + +--- + +## 11. Fonts & PDF requirements + +- `pdflatex` only. Two consecutive runs minimum so cross-references stabilise. +- PDF must contain Type 1 or embedded TrueType. No Type 3. Check with `pdffonts`. +- Stay on US Letter output. + +--- + +## 12. 
Acknowledgements / funding disclosure + +Use the `ack` environment defined by the style — **do not** hand-roll `\section*{Acknowledgments...}`. + +```latex +\begin{ack} + This work was supported by ... +\end{ack} +``` + +- Anonymous mode: hidden in PDF. Keep real content in source; it surfaces with `[main,final]`. +- Preprint: rendered. Still follow the NeurIPS Funding Disclosure rules. + +--- + +## 13. NeurIPS Paper Checklist (conference submissions only) + +Located in `templates/neurips/checklist.tex`. + +- The checklist is **mandatory** for any conference submission. Removing it causes desk rejection. +- Must appear after `References` (and after the appendix if present). The template already ends with `\newpage\input{checklist.tex}`. +- The checklist does **not** count toward the 9-page limit. + +Editing discipline (enforced by the reviewers): + +- **Delete** the `%%% BEGIN INSTRUCTIONS %%%` … `%%% END INSTRUCTIONS %%%` block entirely. +- **Keep** the `\section*{NeurIPS Paper Checklist}` heading, every subsection heading, and every question verbatim. +- Answers may only use `\answerYes{}`, `\answerNo{}`, `\answerNA{}`. Replace every `\answerTODO{}` and `\justificationTODO{}` before submission. +- Provide a 1–2 sentence justification after every answer (even for `\answerNA`). +- Answering `No` or `N/A` with a valid justification is acceptable. Reviewers do not reject solely on answer value. +- Typical question→paper mapping the reviewer will spot-check: + +| Question topic | Expected location in the paper | +|---|---| +| Claims | Abstract + end of Introduction | +| Limitations | A dedicated `\section*{Limitations}` or a paragraph in the discussion | +| Theory assumptions & proofs | Main text statements; full proofs in appendix if needed | +| Reproducibility | Experimental setup / Methods | +| Open data & code | Methods + appendix pointer to supplementary | +| Experimental details | Methods + appendix tables (hyperparameters, seeds) | +| Statistical significance | Results (error bars / CIs / repeats) | +| Compute resources | Methods or a short dedicated paragraph | +| Ethics, broader impacts, safeguards | Discussion or a dedicated section | +| Licenses / new assets | Methods or appendix | +| Human subjects / IRB | Only if applicable | +| LLM usage declaration | Methods or acknowledgements | + +Each `\answerYes{}` justification should cite the section or appendix it refers to (“see §4.2” / “see Appendix B”). + +--- + +## 14. Appendices & supplementary material + +- Appendices are allowed, unlimited in length, and are part of the same PDF. Do **not** submit a separate appendix PDF. +- Additional videos/code/data go into the supplementary ZIP. +- Reviewers may skip appendices — keep the main text self-contained. Do not hide the key experiment supporting a main claim in the appendix only. + +--- + +## 15. Migration playbook (agent actions) + +When migrating a source project into the NeurIPS workspace, prefer this order. Each step maps directly to the tools available in the skill prompt. + +1. **Reconnaissance** + - `listProjectTree({project:"source"})` and `listProjectTree({project:"target"})`. + - `readFile({project:"target", path:"main.tex"})` — current NeurIPS shell (never modify the `.sty`). + - `readFile({project:"source", path:""})` — full file. If very long, use `startLine/endLine` or `grepFile` for hotspots (`\\documentclass`, `\\usepackage`, `\\title`, `\\author`, `\\begin{abstract}`, `figure\\*?`, `\\bibliography`, `\\bibliographystyle`, `biblatex`, `hyperref`). +2. 
**Preamble transplant**
   - Use `applyDiff` when replacing `\\documentclass`, swapping the `neurips_2026` option, or inserting the numbered-citation `\PassOptionsToPackage` line. Prefer a single minimal hunk per logical change.
   - Use `writeFile` only when the source preamble and body have to be rewritten wholesale (e.g., moving away from a revtex two-column layout).
3. **Body ingestion**
   - Copy section content verbatim into `main.tex`, converting `figure*`/`table*` → `figure`/`table`.
   - If the source uses `\input{sections/...}`, either copy those files with `copyAsset` and keep the `\input`, or inline them — decide based on file count (> 3 sub-files → keep split; ≤ 3 → inline).
4. **Assets**
   - For every image and bibliography file referenced by the source, call `copyAsset(src, dest)`. Never dump binary content through `writeFile`.
   - After copies, `listProjectTree({project:"target"})` to confirm.
5. **Bibliography**
   - Preserve or rebuild. Enforce consistency (numeric vs author-year) across the whole paper. Run `grepFile({project:"target", pattern:"\\\\cite[tp]?\\{", glob:"*.tex"})` to audit.
6. **Figures**
   - After paths are fixed, call `measureFigures({sourceClass:, sourceTwocolumn:, targetClass:"neurips", figures:[...]})` and apply the recommended widths with `applyDiff`.
7. **Double-blind sweep**
   - `grepFile` for author names, affiliations, grant numbers, public URLs (`github.com/`, `https?://.+`). Redact as needed with `applyDiff`.
   - Also check `\hypersetup{pdfauthor=...}` and anything the source may have set in `pdftitle`, `pdfkeywords`, etc.
8. **Checklist**
   - `readFile({project:"target", path:"checklist.tex"})` to confirm the template version is present.
   - Remove the instruction block, convert every `\answerTODO` into a real answer, and write 1–2 sentence justifications. Cross-link to `\ref{sec:…}` / `\ref{app:…}`.
9. **Verification**
   - `grepFile(pattern:"biblatex")`, `grepFile(pattern:"\\$\\$")`, and `grepFile(pattern:"figure\\*|table\\*")` — no hits expected at the end of a clean migration.
   - `grepFile({project:"target", pattern:"\\\\bibliographystyle|\\\\bibliography"})` should hit if the BibTeX route from §8 was taken: confirm the call survived the migration and points at a `.bib` that exists in the target (a handwritten `thebibliography` is the one legitimate zero-hit case).
   - If anything is ambiguous (e.g., workshop vs main track, whether to preserve a tabular float as full-width) call `raiseQuestion` **once** with the minimal number of options.

`raiseQuestion` is expensive (it pauses the graph). Do not raise questions for information you can recover by reading files or by consulting `transferIntake` (`venue`, `doubleBlind`, `preprint`, `outputNotes`).

---

## 16. Pre-submission checklist (run before `finalize`)

- [ ] `\documentclass{article}` and `\usepackage[...]{neurips_2026}` options match the intake.
- [ ] `\PassOptionsToPackage{numbers,compress,sort}{natbib}` above `\documentclass` if numeric citations are used.
- [ ] `neurips_2026.sty` is unmodified (compare against `templates/neurips/neurips_2026.sty`).
- [ ] Compiles with `pdflatex` twice without errors; US Letter output; fonts are Type 1 / embedded TrueType.
- [ ] Main body ≤ 9 pages including figures (excluding ack, references, checklist, appendix).
- [ ] All `figure*` / `table*` converted to single-column.
- [ ] No `$$ … $$` anywhere in body (`grepFile` the source and target).
- [ ] All `\cite{}`, `\ref{}`, `\label{}` still resolve; consistent citation style across the paper.
- [ ] Double-blind: no identifying names, affiliations, URLs, grant numbers, GitHub handles; `\hypersetup{pdfauthor={}}` present.
- [ ] `ack` content written in source (even if hidden).
+- [ ] `\input{checklist.tex}` present; instruction block removed; no `\answerTODO` / `\justificationTODO` remains; every answer has a justification; justifications cite back to section/appendix. +- [ ] Appendix (if any) is inside the same PDF; no separate appendix PDF. +- [ ] For preprint: `[preprint]` option; no mention of the target conference in the body; `[final]` is not used. diff --git a/apps/backend/src/services/transferAgent/skills/acl.js b/apps/backend/src/services/transferAgent/skills/acl.js new file mode 100644 index 0000000..e0d0997 --- /dev/null +++ b/apps/backend/src/services/transferAgent/skills/acl.js @@ -0,0 +1,106 @@ +/** + * ACL Skill — system prompt builder for ACL submissions. + */ + +import { loadVenueRules } from '../neuripsRules.js'; + +export function buildAclSkill({ + aclHandbook, + sourceProfile, + transferIntake, + sourceOutline, + targetOutline, + sourceAssets, +}) { + const intake = transferIntake || {}; + const profile = sourceProfile || {}; + + return `You are an expert LaTeX paper template migration agent specializing in ACL-style submissions. + +Your mission: migrate a user's source paper into the target ACL template, producing a submission-ready .tex file that compiles cleanly and follows ACL formatting constraints. + +═══════════════════════════════════════════════════ +AVAILABLE TOOLS +═══════════════════════════════════════════════════ + +You have the following tools at your disposal. Call them as needed: + +• readFile(project, path, startLine?, endLine?) — Read a file; omit line args for full file, or 1-based inclusive range (partial reads show line numbers) +• writeFile(path, content) — Write/overwrite a file in the target project (auto-snapshots) +• applyDiff(path, diff) — Apply a unified diff to a target file (surgical edits) +• grepFile(project, pattern, glob) — Regex search across project files +• listProjectTree(project) — List all files in a project +• copyAsset(srcPath, destPath?) — Copy a resource file from source to target +• compileProject() — Compile the target with the user-selected engine; returns an LLM-compressed log summary +• raiseQuestion(questions) — Ask the user a question (ONLY when truly needed) + +═══════════════════════════════════════════════════ +ACL HANDBOOK +═══════════════════════════════════════════════════ + +${aclHandbook || '[ACL handbook not available — use template comments and standard ACL conventions.]'} + +═══════════════════════════════════════════════════ +SOURCE PAPER PROFILE +═══════════════════════════════════════════════════ + +documentclass: ${profile.documentclass || 'unknown'} +packages: ${(profile.packages || []).join(', ') || 'unknown'} +bibMechanism: ${profile.bibMechanism || 'unknown'} +twocolumn: ${profile.twocolumn ?? 'unknown'} +figureStar: ${profile.figureStar ?? false} +tableStar: ${profile.tableStar ?? false} +revtex: ${profile.revtex ?? false} +natbib: ${profile.hasNatbib ?? false} +biblatex: ${profile.hasBiblatex ?? false} + +${sourceOutline ? `SOURCE OUTLINE:\n${JSON.stringify(sourceOutline, null, 2)}` : ''} +${targetOutline ? `TARGET TEMPLATE OUTLINE:\n${JSON.stringify(targetOutline, null, 2)}` : ''} +${sourceAssets ? `SOURCE ASSETS:\n${JSON.stringify(sourceAssets, null, 2)}` : ''} + +═══════════════════════════════════════════════════ +MIGRATION PARAMETERS +═══════════════════════════════════════════════════ + +venue: ${intake.venue || 'acl'} +doubleBlind: ${intake.doubleBlind !== false} +preprint: ${!!intake.preprint} +${intake.outputNotes ? 
`notes: ${intake.outputNotes}` : ''} + +═══════════════════════════════════════════════════ +CRITICAL CONSTRAINTS (MUST FOLLOW) +═══════════════════════════════════════════════════ + +1. Use ACL style package correctly: + - review mode: \\usepackage[review]{acl} + - final mode: \\usepackage{acl} +2. Do NOT modify acl.sty. +3. Keep two-column layout and preserve figure*/table* where full-width layout is required. +4. Preserve all citations, labels, refs, equations, and source scientific meaning. +5. ACL citations should remain author-year natbib style. +6. In double-blind mode, remove author-identifying information and acknowledgements. +7. Keep appendices/supplementary references consistent with ACL ordering (references before appendices). + +═══════════════════════════════════════════════════ +BEST PRACTICES +═══════════════════════════════════════════════════ + +• Use applyDiff for surgical edits (small targeted changes) +• Use writeFile for initial full-file generation or very large rewrites +• Always readFile before modifying an existing file +• Copy all referenced assets (images, bibliography files, style dependencies) +• Ask the user only when ambiguity blocks a correct migration +`; +} + +export async function buildAclSkillFromState(state) { + const handbook = await loadVenueRules('acl'); + return buildAclSkill({ + aclHandbook: handbook, + sourceProfile: state.sourceProfile, + transferIntake: state.transferIntake, + sourceOutline: state.sourceOutline, + targetOutline: state.targetOutline, + sourceAssets: state.sourceAssets, + }); +} diff --git a/apps/backend/src/services/transferAgent/skills/cvpr.js b/apps/backend/src/services/transferAgent/skills/cvpr.js new file mode 100644 index 0000000..f25a756 --- /dev/null +++ b/apps/backend/src/services/transferAgent/skills/cvpr.js @@ -0,0 +1,120 @@ +/** + * CVPR Skill — system prompt builder for CVPR 2026. + */ + +import { loadVenueRules } from '../neuripsRules.js'; + +/** + * Build the CVPR skill system prompt for the agentic transfer. + */ +export function buildCvprSkill({ + cvprHandbook, + sourceProfile, + transferIntake, + sourceOutline, + targetOutline, + sourceAssets, +}) { + const intake = transferIntake || {}; + const profile = sourceProfile || {}; + + return `You are an expert LaTeX paper template migration agent specializing in CVPR 2026. + +Your mission: migrate a user's source paper into the CVPR 2026 template, producing a submission-ready .tex file that compiles cleanly and passes all CVPR formatting requirements. + +═══════════════════════════════════════════════════ +AVAILABLE TOOLS +═══════════════════════════════════════════════════ + +You have the following tools at your disposal. Call them as needed: + +• readFile(project, path, startLine?, endLine?) — Read a file; omit line args for full file, or 1-based inclusive range (partial reads show line numbers) +• writeFile(path, content) — Write/overwrite a file in the target project (auto-snapshots) +• applyDiff(path, diff) — Apply a unified diff to a target file (surgical edits) +• grepFile(project, pattern, glob) — Regex search across project files +• listProjectTree(project) — List all files in a project +• copyAsset(srcPath, destPath?) — Copy a resource file from source to target +• measureFigures(...) 
— Suggest \\includegraphics widths when layout changes +• compileProject() — Compile the target with the user-selected engine; returns an LLM-compressed log summary +• raiseQuestion(questions) — Ask the user a question (ONLY when truly needed) + +═══════════════════════════════════════════════════ +CVPR 2026 COMPLETE HANDBOOK +═══════════════════════════════════════════════════ + +${cvprHandbook || '[CVPR handbook not available — use template comments and standard CVPR conventions.]'} + +═══════════════════════════════════════════════════ +SOURCE PAPER PROFILE +═══════════════════════════════════════════════════ + +documentclass: ${profile.documentclass || 'unknown'} +packages: ${(profile.packages || []).join(', ') || 'unknown'} +bibMechanism: ${profile.bibMechanism || 'unknown'} +twocolumn: ${profile.twocolumn ?? 'unknown'} +figureStar: ${profile.figureStar ?? false} +tableStar: ${profile.tableStar ?? false} +revtex: ${profile.revtex ?? false} +natbib: ${profile.hasNatbib ?? false} +biblatex: ${profile.hasBiblatex ?? false} + +${sourceOutline ? `SOURCE OUTLINE:\n${JSON.stringify(sourceOutline, null, 2)}` : ''} +${targetOutline ? `TARGET TEMPLATE OUTLINE:\n${JSON.stringify(targetOutline, null, 2)}` : ''} +${sourceAssets ? `SOURCE ASSETS:\n${JSON.stringify(sourceAssets, null, 2)}` : ''} + +═══════════════════════════════════════════════════ +MIGRATION PARAMETERS +═══════════════════════════════════════════════════ + +venue: ${intake.venue || 'cvpr'} +doubleBlind: ${intake.doubleBlind !== false} +preprint: ${!!intake.preprint} +${intake.outputNotes ? `notes: ${intake.outputNotes}` : ''} + +═══════════════════════════════════════════════════ +CRITICAL CONSTRAINTS (MUST FOLLOW) +═══════════════════════════════════════════════════ + +1. Use CVPR article setup: \\documentclass[10pt,twocolumn,letterpaper]{article} +2. Use CVPR style package options correctly: + - review submission: \\usepackage[review]{cvpr} + - camera-ready: \\usepackage{cvpr} + - preprint with page numbers: \\usepackage[pagenumbers]{cvpr} +3. NEVER modify cvpr.sty — style-file edits are disallowed +4. Keep two-column layout; do NOT convert all figure* / table* to single-column forms +5. Preserve ALL \\cite{}, \\ref{}, \\label{}, equations, figures, and tables +6. Bibliography should use CVPR style (typically \\bibliographystyle{ieeenat_fullname}) with numeric citations +7. Keep paper on US Letter format (letterpaper), no custom geometry overrides +8. Keep hyperref enabled unless there is a severe compile blocker +9. In review mode, enforce anonymity (remove identifying author metadata/URLs unless user explicitly requests otherwise) +10. Do NOT include supplementary pages inline in main submission unless user explicitly requests it +11. If source uses biblatex, migrate to CVPR natbib-compatible bibliography flow +12. 
Keep display math in robust LaTeX environments (equation/align), avoid fragile formatting hacks + +═══════════════════════════════════════════════════ +BEST PRACTICES +═══════════════════════════════════════════════════ + +• Use applyDiff for surgical edits (small targeted changes) — safer than full rewrites +• Use writeFile for initial full-file generation or when the diff would be larger than the file +• Always readFile the current state of a file before modifying it +• Copy ALL referenced assets (images, .bib, .bbl, .sty, .cls, .bst) from source +• When uncertain about user intent, prefer conservative choices over raising questions +• ONLY call raiseQuestion for genuinely ambiguous decisions that affect the final output +`; +} + +/** + * Convenience: load handbook and build the skill. + */ +export async function buildCvprSkillFromState(state) { + const handbook = await loadVenueRules('cvpr'); + return buildCvprSkill({ + cvprHandbook: handbook, + sourceProfile: state.sourceProfile, + transferIntake: state.transferIntake, + sourceOutline: state.sourceOutline, + targetOutline: state.targetOutline, + sourceAssets: state.sourceAssets, + }); +} diff --git a/apps/backend/src/services/transferAgent/skills/icml.js b/apps/backend/src/services/transferAgent/skills/icml.js new file mode 100644 index 0000000..046a4ef --- /dev/null +++ b/apps/backend/src/services/transferAgent/skills/icml.js @@ -0,0 +1,122 @@ +/** + * ICML Skill — system prompt builder for ICML 2026. + */ + +import { loadVenueRules } from '../neuripsRules.js'; + +/** + * Build the ICML skill system prompt for the agentic transfer. + */ +export function buildIcmlSkill({ + icmlHandbook, + sourceProfile, + transferIntake, + sourceOutline, + targetOutline, + sourceAssets, +}) { + const intake = transferIntake || {}; + const profile = sourceProfile || {}; + + return `You are an expert LaTeX paper template migration agent specializing in ICML 2026. + +Your mission: migrate a user's source paper into the ICML 2026 template, producing a submission-ready .tex file that compiles cleanly and passes all ICML formatting requirements. + +═══════════════════════════════════════════════════ +AVAILABLE TOOLS +═══════════════════════════════════════════════════ + +You have the following tools at your disposal. Call them as needed: + +• readFile(project, path, startLine?, endLine?) — Read a file; omit line args for full file, or 1-based inclusive range (partial reads show line numbers) +• writeFile(path, content) — Write/overwrite a file in the target project (auto-snapshots) +• applyDiff(path, diff) — Apply a unified diff to a target file (surgical edits) +• grepFile(project, pattern, glob) — Regex search across project files +• listProjectTree(project) — List all files in a project +• copyAsset(srcPath, destPath?) 
— Copy a resource file from source to target +• compileProject() — Compile the target with the user-selected engine; returns an LLM-compressed log summary +• raiseQuestion(questions) — Ask the user a question (ONLY when truly needed) + +═══════════════════════════════════════════════════ +ICML 2026 COMPLETE HANDBOOK +═══════════════════════════════════════════════════ + +${icmlHandbook || '[ICML handbook not available — use template comments and standard ICML conventions.]'} + +═══════════════════════════════════════════════════ +SOURCE PAPER PROFILE +═══════════════════════════════════════════════════ + +documentclass: ${profile.documentclass || 'unknown'} +packages: ${(profile.packages || []).join(', ') || 'unknown'} +bibMechanism: ${profile.bibMechanism || 'unknown'} +twocolumn: ${profile.twocolumn ?? 'unknown'} +figureStar: ${profile.figureStar ?? false} +tableStar: ${profile.tableStar ?? false} +revtex: ${profile.revtex ?? false} +natbib: ${profile.hasNatbib ?? false} +biblatex: ${profile.hasBiblatex ?? false} + +${sourceOutline ? `SOURCE OUTLINE:\n${JSON.stringify(sourceOutline, null, 2)}` : ''} +${targetOutline ? `TARGET TEMPLATE OUTLINE:\n${JSON.stringify(targetOutline, null, 2)}` : ''} +${sourceAssets ? `SOURCE ASSETS:\n${JSON.stringify(sourceAssets, null, 2)}` : ''} + +═══════════════════════════════════════════════════ +MIGRATION PARAMETERS +═══════════════════════════════════════════════════ + +venue: ${intake.venue || 'icml'} +doubleBlind: ${intake.doubleBlind !== false} +preprint: ${!!intake.preprint} +${intake.outputNotes ? `notes: ${intake.outputNotes}` : ''} + +═══════════════════════════════════════════════════ +CRITICAL CONSTRAINTS (MUST FOLLOW) +═══════════════════════════════════════════════════ + +1. \\documentclass MUST be {article} — never revtex, amsart, llncs, etc. +2. ICML package option depends on submission mode: + - doubleBlind=true → \\usepackage{icml2026} (anonymous, with line numbers) + - camera-ready → \\usepackage[accepted]{icml2026} (non-anonymous) + Check doubleBlind flag above and pick the correct option. +3. NEVER modify icml2026.sty — any geometry/font changes inside .sty → desk rejection +4. Paper size: US Letter. Do NOT load geometry. +5. Preserve ALL \\cite{}, \\ref{}, \\label{}, mathematical content, figures, tables +6. ICML is TWO-COLUMN: keep figure* for full-width figures, figure for single-column. Do NOT convert figure* to figure. +7. Use \\icmlauthor{Name}{affiliation} and \\icmlaffiliation{label}{...} for authors (NOT \\author{}) +8. Double-blind: NO author info, NO identifying URLs, self-cite in third person +9. No $$ ... $$ for display math — use equation/align environments (lineno compat) +10. Use \\bibliographystyle{icml2026} and \\bibliography{references} — APA author-year citations (NOT numeric) +11. If source uses biblatex: switch to natbib (loaded by icml2026.sty) +12. Acknowledgements: hidden in anonymous mode — keep section but content won't show +13. Impact Statement: required unnumbered section before References +14. Appendix goes AFTER references, submitted in same PDF (NOT separate file) +15. 
Main body max 8 pages (excluding references and appendices) + +═══════════════════════════════════════════════════ +BEST PRACTICES +═══════════════════════════════════════════════════ + +• Use applyDiff for surgical edits (small targeted changes) — safer than full rewrites +• Use writeFile for initial full-file generation or when the diff would be larger than the file +• Always readFile the current state of a file before modifying it +• Copy ALL referenced assets (images, .bib, .bbl, .sty, .cls, .bst) from source +• When uncertain about user intent, prefer conservative choices over raising questions +• ONLY call raiseQuestion for genuinely ambiguous decisions that affect the final output +`; +} + +/** + * Convenience: load handbook and build the skill. + */ +export async function buildIcmlSkillFromState(state) { + const handbook = await loadVenueRules('icml'); + return buildIcmlSkill({ + icmlHandbook: handbook, + sourceProfile: state.sourceProfile, + transferIntake: state.transferIntake, + sourceOutline: state.sourceOutline, + targetOutline: state.targetOutline, + sourceAssets: state.sourceAssets, + }); +} diff --git a/apps/backend/src/services/transferAgent/skills/index.js b/apps/backend/src/services/transferAgent/skills/index.js new file mode 100644 index 0000000..314dbf6 --- /dev/null +++ b/apps/backend/src/services/transferAgent/skills/index.js @@ -0,0 +1,40 @@ +/** + * Venue skill dispatcher — selects the correct skill builder based on venue. + * + * This is the single entry point all agent nodes should use instead of + * importing venue-specific skill builders directly. + */ + +import { buildNeuripsSkillFromState } from './neurips.js'; +import { buildIcmlSkillFromState } from './icml.js'; +import { buildCvprSkillFromState } from './cvpr.js'; +import { buildAclSkillFromState } from './acl.js'; + +/** + * Resolve the venue from state (checks transferIntake.venue and transferGraphKind). + */ +function resolveVenue(state) { + const intake = state.transferIntake || {}; + return intake.venue || state.transferGraphKind || ''; +} + +/** + * Build the venue-specific skill system prompt from state. + * + * @param {object} state — LangGraph TransferState + * @returns {Promise} — The system prompt string + */ +export async function buildVenueSkillFromState(state) { + const venue = resolveVenue(state); + switch (venue) { + case 'cvpr': + return buildCvprSkillFromState(state); + case 'acl': + return buildAclSkillFromState(state); + case 'icml': + return buildIcmlSkillFromState(state); + case 'neurips': + default: + return buildNeuripsSkillFromState(state); + } +} diff --git a/apps/backend/src/services/transferAgent/skills/neurips.js b/apps/backend/src/services/transferAgent/skills/neurips.js new file mode 100644 index 0000000..3debd57 --- /dev/null +++ b/apps/backend/src/services/transferAgent/skills/neurips.js @@ -0,0 +1,136 @@ +/** + * NeurIPS Skill — system prompt builder. + * + * Encapsulates the entire NeurIPS 2026 specification as an agent "skill". + * Instead of hardcoding rules in each pipeline node, the agent receives + * the full handbook + migration context as its system prompt and makes + * autonomous decisions through tool calls. + */ + +import { loadNeuripsRulesFull } from '../neuripsRules.js'; + +/** + * Build the NeurIPS skill system prompt for the agentic transfer. 
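+ *
+ * A minimal usage sketch; the argument values below are illustrative
+ * assumptions, while real values come from the transfer job state
+ * (see buildNeuripsSkillFromState at the bottom of this file):
+ * @example
+ * const prompt = buildNeuripsSkill({
+ *   neuripsHandbook: '…full neurips.md text…',
+ *   sourceProfile: { documentclass: 'revtex4-2', twocolumn: true },
+ *   transferIntake: { venue: 'neurips', doubleBlind: true, preprint: false },
+ * });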
+ * + * @param {object} opts + * @param {string} opts.neuripsHandbook — full neurips.md content + * @param {object} opts.sourceProfile — heuristic source analysis (documentclass, packages, bibMechanism, …) + * @param {object} opts.transferIntake — { venue, doubleBlind, preprint, outputNotes } + * @param {object} [opts.sourceOutline] — parsed section outline of source + * @param {object} [opts.targetOutline] — parsed section outline of target template + * @param {object} [opts.sourceAssets] — { bib, images, styles } from source analysis + * @returns {string} + */ +export function buildNeuripsSkill({ + neuripsHandbook, + sourceProfile, + transferIntake, + sourceOutline, + targetOutline, + sourceAssets, +}) { + const intake = transferIntake || {}; + const profile = sourceProfile || {}; + + return `You are an expert LaTeX paper template migration agent specializing in NeurIPS 2026. + +Your mission: migrate a user's source paper into the NeurIPS 2026 template, producing a submission-ready .tex file that compiles cleanly and passes all NeurIPS formatting requirements. + +═══════════════════════════════════════════════════ +AVAILABLE TOOLS +═══════════════════════════════════════════════════ + +You have the following tools at your disposal. Call them as needed: + +• readFile(project, path, startLine?, endLine?) — Read a file; omit line args for full file, or 1-based inclusive range (partial reads show line numbers) +• writeFile(path, content) — Write/overwrite a file in the target project (auto-snapshots) +• applyDiff(path, diff) — Apply a unified diff to a target file (surgical edits) +• grepFile(project, pattern, glob) — Regex search across project files +• listProjectTree(project) — List all files in a project +• copyAsset(srcPath, destPath?) — Copy a resource file from source to target +• measureFigures(...) — Suggest \\includegraphics widths when layout changes +• compileProject() — Compile the target with the user-selected engine; returns an LLM-compressed log summary (not full raw log) +• raiseQuestion(questions) — Ask the user a question (ONLY when truly needed) + +═══════════════════════════════════════════════════ +NEURIPS 2026 COMPLETE HANDBOOK +═══════════════════════════════════════════════════ + +${neuripsHandbook || '[NeurIPS handbook not available — use template comments and standard NeurIPS conventions.]'} + +═══════════════════════════════════════════════════ +SOURCE PAPER PROFILE +═══════════════════════════════════════════════════ + +documentclass: ${profile.documentclass || 'unknown'} +packages: ${(profile.packages || []).join(', ') || 'unknown'} +bibMechanism: ${profile.bibMechanism || 'unknown'} +twocolumn: ${profile.twocolumn ?? 'unknown'} +figureStar: ${profile.figureStar ?? false} +tableStar: ${profile.tableStar ?? false} +revtex: ${profile.revtex ?? false} +natbib: ${profile.hasNatbib ?? false} +biblatex: ${profile.hasBiblatex ?? false} + +${sourceOutline ? `SOURCE OUTLINE:\n${JSON.stringify(sourceOutline, null, 2)}` : ''} +${targetOutline ? `TARGET TEMPLATE OUTLINE:\n${JSON.stringify(targetOutline, null, 2)}` : ''} +${sourceAssets ? `SOURCE ASSETS:\n${JSON.stringify(sourceAssets, null, 2)}` : ''} + +═══════════════════════════════════════════════════ +MIGRATION PARAMETERS +═══════════════════════════════════════════════════ + +venue: ${intake.venue || 'neurips'} +doubleBlind: ${intake.doubleBlind !== false} +preprint: ${!!intake.preprint} +${intake.outputNotes ? 
`notes: ${intake.outputNotes}` : ''} + +═══════════════════════════════════════════════════ +CRITICAL CONSTRAINTS (MUST FOLLOW) +═══════════════════════════════════════════════════ + +1. \\documentclass MUST be {article} — never revtex, amsart, llncs, etc. +2. neurips_2026 package option MUST match the submission mode: + - doubleBlind=true → \\usepackage[main]{neurips_2026} (anonymous + line numbers) + - preprint=true → \\usepackage[preprint]{neurips_2026} (non-anonymous, no line numbers) + - camera-ready → \\usepackage[main,final]{neurips_2026} + Check doubleBlind/preprint flags above and pick the correct option. +3. NEVER modify neurips_2026.sty — any geometry/font changes inside .sty → desk rejection +4. Paper size: US Letter. Do NOT load geometry with A4. +5. Preserve ALL \\cite{}, \\ref{}, \\label{}, mathematical content, figures, tables +6. figure* → figure, table* → table (NeurIPS is single-column); use \\begin{figure}[htbp] for flexible float placement +7. MUST \\input{checklist.tex} — missing checklist → desk rejection +8. Double-blind: \\hypersetup{pdfauthor={}} and author block shows "Anonymous Author(s)" +9. No $$ ... $$ for display math — use equation/align environments (lineno compat) +10. \\bibliography{} or \\input{*.bbl} for references; natbib loaded by default with numeric citations +11. If source uses biblatex: switch to natbib or use [nonatbib]{neurips_2026} +12. ack environment is hidden in anonymous mode — keep it but content won't show +13. MUST add \\PassOptionsToPackage{numbers,compress,sort}{natbib} BEFORE \\documentclass for numeric [1,2,3] citations + +═══════════════════════════════════════════════════ +BEST PRACTICES +═══════════════════════════════════════════════════ + +• Use applyDiff for surgical edits (small targeted changes) — safer than full rewrites +• Use writeFile for initial full-file generation or when the diff would be larger than the file +• Always readFile the current state of a file before modifying it +• Copy ALL referenced assets (images, .bib, .bbl, .sty, .cls, .bst) from source +• When uncertain about user intent, prefer conservative choices over raising questions +• ONLY call raiseQuestion for genuinely ambiguous decisions that affect the final output +`; +} + +/** + * Convenience: load handbook and build the skill. + */ +export async function buildNeuripsSkillFromState(state) { + const handbook = await loadNeuripsRulesFull(); + return buildNeuripsSkill({ + neuripsHandbook: handbook, + sourceProfile: state.sourceProfile, + transferIntake: state.transferIntake, + sourceOutline: state.sourceOutline, + targetOutline: state.targetOutline, + sourceAssets: state.sourceAssets, + }); +} diff --git a/apps/backend/src/services/transferAgent/skills/reviewerChecklist.js b/apps/backend/src/services/transferAgent/skills/reviewerChecklist.js new file mode 100644 index 0000000..6b35899 --- /dev/null +++ b/apps/backend/src/services/transferAgent/skills/reviewerChecklist.js @@ -0,0 +1,198 @@ +/** + * Reviewer skills — venue-specific review checklists. + * + * Each venue exports a function that returns the review checklist string + * to be injected into the reviewer's user message. + * + * The reviewer prompt skeleton is venue-agnostic; all venue-specific + * constraints live here for progressive disclosure and extensibility. 
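+ *
+ * A hedged usage sketch; the venue id and intake shape match the
+ * buildReviewChecklist dispatcher at the bottom of this file:
+ * @example
+ * const sections = buildReviewChecklist('neurips', {
+ *   intake: { doubleBlind: true, preprint: false },
+ * });
+ * // sections.structure / .figures / .bibliography / .policy / .blind are
+ * // text fragments injected into the reviewer's user message.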
+ */ + +// ───────────────────────────────────────────── +// NeurIPS +// ───────────────────────────────────────────── + +function neuripsReviewChecklist({ intake }) { + const isBlind = intake.doubleBlind !== false && !intake.preprint; + return { + structure: `- \\usepackage[main]{neurips_2026} (anonymous + line numbers) or \\usepackage[preprint]{neurips_2026} + - \\PassOptionsToPackage{numbers,compress,sort}{natbib} BEFORE \\documentclass + - No \\usepackage{geometry} (neurips_2026 handles layout)`, + + figures: `- No figure* or table* environments (NeurIPS is single-column) + - \\includegraphics paths point to files that exist + - Reasonable \\includegraphics widths (\\linewidth or fraction)`, + + bibliography: `- \\bibliographystyle is NeurIPS-compatible (unsrtnat, plainnat, abbrvnat) + - Numeric citations [1,2,3] (NOT author-year) + - .bib or .bbl files present in target project`, + + policy: `6. NEURIPS POLICY: + - \\input{checklist.tex} or \\include{checklist.tex} present + - No \\answerTODO remaining (checklist should be filled or template default) + - $$ ... $$ display math → equation/align (lineno compatibility)`, + + blind: isBlind + ? `BLIND COMPLIANCE: + - \\hypersetup{pdfauthor={}} or equivalent + - No identifying URLs (GitHub repos, project pages) unless user confirmed + - Self-citations in third-person form + - Author block shows "Anonymous Author(s)"` + : '(Single-blind or preprint — no anonymization needed)', + }; +} + +// ───────────────────────────────────────────── +// ICML +// ───────────────────────────────────────────── + +function icmlReviewChecklist({ intake }) { + const isBlind = intake.doubleBlind !== false; + return { + structure: `- \\usepackage{icml2026} (anonymous) or \\usepackage[accepted]{icml2026} (camera-ready) + - Do NOT add \\PassOptionsToPackage{numbers}{natbib} (ICML uses author-year) + - No \\usepackage{geometry} (icml2026.sty handles layout) + - Use \\icmlauthor / \\icmlaffiliation for author info (NOT \\author{})`, + + figures: `- figure* for full-width figures, figure for single-column (ICML is two-column) + - Do NOT convert figure* to figure + - \\includegraphics paths point to files that exist + - Reasonable \\includegraphics widths`, + + bibliography: `- \\bibliographystyle{icml2026} (APA author-year, NOT numeric) + - natbib loaded by icml2026.sty automatically + - .bib or .bbl files present in target project`, + + policy: `6. ICML POLICY: + - Impact Statement section present (unnumbered, before References) + - No checklist required (this is NOT NeurIPS — do NOT create checklist.tex) + - Do NOT create or reference neurips_2026.sty + - $$ ... $$ display math → equation/align (lineno compatibility) + - Appendix (if any) goes AFTER references in same PDF + - Main body max 8 pages (excluding references and appendices)`, + + blind: isBlind + ? 
`BLIND COMPLIANCE: + - \\hypersetup{pdfauthor={}} or equivalent + - No identifying URLs unless user confirmed + - Self-citations in third-person form + - Author info hidden (only visible with [accepted] option)` + : '(Camera-ready — author info should be visible)', + }; +} + +// ───────────────────────────────────────────── +// CVPR +// ───────────────────────────────────────────── + +function cvprReviewChecklist({ intake }) { + const isBlind = intake.doubleBlind !== false; + return { + structure: `- Use official CVPR style setup and keep two-column layout + - Do NOT modify cvpr.sty + - No conflicting geometry/font packages`, + + figures: `- Preserve figure* / table* when full-width layout is needed + - \\includegraphics paths point to files that exist + - Reasonable \\includegraphics widths`, + + bibliography: `- Keep natbib-compatible CVPR bibliography flow + - Citation style should match venue defaults + - .bib or .bbl files present in target project`, + + policy: `6. CVPR POLICY: + - Follow official template defaults (no ad-hoc style hacks) + - Preserve scientific content and ordering + - $$ ... $$ display math → equation/align where needed`, + + blind: isBlind + ? `BLIND COMPLIANCE: + - No identifying author/affiliation details + - No identifying project/repo URLs unless user confirmed + - Self-citations in third-person form` + : '(Camera-ready or non-blind mode — author info can be visible)', + }; +} + +// ───────────────────────────────────────────── +// ACL +// ───────────────────────────────────────────── + +function aclReviewChecklist({ intake }) { + const isBlind = intake.doubleBlind !== false; + return { + structure: `- Use \\usepackage[review]{acl} for review; \\usepackage{acl} for final + - Do NOT modify acl.sty + - Keep two-column ACL layout and avoid conflicting geometry settings`, + + figures: `- Preserve figure* / table* when full-width layout is needed + - \\includegraphics paths point to files that exist + - Reasonable \\includegraphics widths`, + + bibliography: `- ACL references should follow natbib author-year behavior + - References section before appendices + - .bib or .bbl files present in target project`, + + policy: `6. ACL POLICY: + - Review mode should include line numbers via [review] option + - Final mode should remove review-only markers + - Keep citations/references consistent with ACL guidance`, + + blind: isBlind + ? `BLIND COMPLIANCE: + - Remove identifying author/affiliation details + - Remove acknowledgements in review version + - Self-citations in third-person form` + : '(Final mode — author and acknowledgement blocks may be present)', + }; +} + +// ───────────────────────────────────────────── +// Fallback (generic) +// ───────────────────────────────────────────── + +function genericReviewChecklist({ intake }) { + return { + structure: `- Correct \\documentclass and style package for the target venue + - No conflicting geometry/font packages`, + + figures: `- Figure environments appropriate for the venue's column layout + - \\includegraphics paths point to files that exist + - Reasonable \\includegraphics widths`, + + bibliography: `- Bibliography mechanism consistent with venue requirements + - .bib or .bbl files present in target project`, + + policy: `6. VENUE POLICY: + - Follow venue-specific rules from your system prompt`, + + blind: intake.doubleBlind + ? 
`BLIND COMPLIANCE: + - No author-identifying information visible + - Self-citations in third-person form` + : '(No anonymization needed)', + }; +} + +// ───────────────────────────────────────────── +// Dispatcher +// ───────────────────────────────────────────── + +const VENUE_CHECKLIST_BUILDERS = { + neurips: neuripsReviewChecklist, + icml: icmlReviewChecklist, + cvpr: cvprReviewChecklist, + acl: aclReviewChecklist, +}; + +/** + * Build the venue-specific review checklist sections. + * + * @param {string} venueId — e.g. 'neurips', 'icml' + * @param {{ intake: object }} ctx — context with transferIntake + * @returns {{ structure, figures, bibliography, policy, blind }} + */ +export function buildReviewChecklist(venueId, ctx) { + const builder = VENUE_CHECKLIST_BUILDERS[venueId] || genericReviewChecklist; + return builder(ctx); +} diff --git a/apps/backend/src/services/transferAgent/state.js b/apps/backend/src/services/transferAgent/state.js index 367e15f..8c46723 100644 --- a/apps/backend/src/services/transferAgent/state.js +++ b/apps/backend/src/services/transferAgent/state.js @@ -9,18 +9,45 @@ export const TransferState = Annotation.Root({ sourceMainFile: Annotation({ reducer: replace }), targetProjectId: Annotation({ reducer: replace }), targetMainFile: Annotation({ reducer: replace }), + targetTemplateId: Annotation({ reducer: replace }), engine: Annotation({ reducer: replace, default: () => 'pdflatex' }), maxCompileLoops: Annotation({ reducer: replace, default: () => 5 }), maxLayoutLoops: Annotation({ reducer: replace, default: () => 3 }), layoutCheck: Annotation({ reducer: replace, default: () => false }), + enableSensitiveMask: Annotation({ reducer: replace, default: () => false }), + /** When false, POST /transfer/start runs the rule-based transfer graph instead of the LLM-driven ones. 
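(Agent mode then selects a venue-specific graph; see transferGraphKind below for which topology actually ran.)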
*/ + useAgent: Annotation({ reducer: replace, default: () => false }), llmConfig: Annotation({ reducer: replace }), jobId: Annotation({ reducer: replace }), + /** Graph topology id: 'legacy' | 'rulebasetransfer' | target template id when using venue agent (neurips, icml, …) */ + transferGraphKind: Annotation({ reducer: replace, default: () => 'legacy' }), + + // --- Workspace roots (explicit tool boundary) --- + workspaceRoot: Annotation({ reducer: replace }), + sourceReadRoot: Annotation({ reducer: replace }), + + // --- Intake (POST /transfer/start); no network tools --- + transferIntake: Annotation({ + reducer: replace, + default: () => ({ + venue: '', + doubleBlind: true, + preprint: false, + outputNotes: '', + }), + }), + // --- Source analysis --- sourceProjectRoot: Annotation({ reducer: replace }), sourceOutline: Annotation({ reducer: replace }), sourceFullContent: Annotation({ reducer: replace }), sourceAssets: Annotation({ reducer: replace }), + sourceProfile: Annotation({ reducer: replace }), + sourceMaskManifest: Annotation({ reducer: replace }), + sourceMaskedFiles: Annotation({ reducer: replace }), + sourceMaskedContents: Annotation({ reducer: replace }), + sourceMaskWarnings: Annotation({ reducer: replace }), // --- Target analysis --- targetProjectRoot: Annotation({ reducer: replace }), @@ -31,15 +58,33 @@ export const TransferState = Annotation.Root({ // --- Transfer plan --- transferPlan: Annotation({ reducer: replace }), + // --- Human QA --- + pendingQA: Annotation({ reducer: replace }), + userConfirmations: Annotation({ reducer: replace, default: () => ({}) }), + + // --- UI / progress (API surfaces these) --- + lastCompletedNode: Annotation({ reducer: replace, default: () => '' }), + currentPhase: Annotation({ reducer: replace, default: () => '' }), + /** Next node name when graph paused (interrupt-before); filled by route on GraphInterrupt */ + interruptedBeforeNode: Annotation({ reducer: replace, default: () => '' }), + completedNodes: Annotation({ reducer: appendList, default: () => [] }), + progressLogEntries: Annotation({ reducer: appendList, default: () => [] }), + // --- Compile loop --- compileResult: Annotation({ reducer: replace }), compileAttempt: Annotation({ reducer: replace, default: () => 0 }), + verifyBuildResult: Annotation({ reducer: replace }), + lastGoodPhase: Annotation({ reducer: replace }), + buildFailureReason: Annotation({ reducer: replace }), // --- Layout check --- pageImages: Annotation({ reducer: replace }), layoutCheckResult: Annotation({ reducer: replace }), layoutAttempt: Annotation({ reducer: replace, default: () => 0 }), + // --- Figure measurement (normalizeFigures) --- + figureMeasurement: Annotation({ reducer: replace }), + // --- MinerU pipeline --- transferMode: Annotation({ reducer: replace, default: () => 'legacy' }), mineruConfig: Annotation({ reducer: replace }), @@ -48,9 +93,24 @@ export const TransferState = Annotation.Root({ sourceImages: Annotation({ reducer: replace }), mineruOutputDir: Annotation({ reducer: replace }), + // --- Agentic loop (graphVenueAgent: planner → generator → reviewer) --- + /** LLM message history for the agentic loop (accumulated across iterations) */ + agentMessages: Annotation({ reducer: appendList, default: () => [] }), + /** Current Planner→Generator→Reviewer iteration (0-based) */ + currentIteration: Annotation({ reducer: replace, default: () => 0 }), + /** Maximum allowed iterations before forced finalize */ + maxIterations: Annotation({ reducer: replace, default: () => 5 }), + /** Structured 
migration plan produced by Planner */ + migrationPlan: Annotation({ reducer: replace }), + /** Review result from Reviewer: { verdict: 'pass'|'revise', issues: [], suggestions: [] } */ + reviewResult: Annotation({ reducer: replace }), + /** Current agent phase: 'planning' | 'generating' | 'reviewing' | 'finalized' */ + agentPhase: Annotation({ reducer: replace, default: () => 'planning' }), + // --- Final output --- finalPdf: Annotation({ reducer: replace }), status: Annotation({ reducer: replace, default: () => 'pending' }), error: Annotation({ reducer: replace }), progressLog: Annotation({ reducer: appendList, default: () => [] }), + bundleNotes: Annotation({ reducer: replace }), }); diff --git a/apps/backend/src/services/transferAgent/toolTrace.js b/apps/backend/src/services/transferAgent/toolTrace.js new file mode 100644 index 0000000..e281576 --- /dev/null +++ b/apps/backend/src/services/transferAgent/toolTrace.js @@ -0,0 +1,171 @@ +import { promises as fs } from 'fs'; +import path from 'path'; +import { ensureDir } from '../../utils/fsUtils.js'; +import { briefToolArgs } from './utils.js'; + +export const MAX_TOOL_TRACE_RECENT = 200; + +/** + * Monotonic counter on liveProgress so SSE clients detect updates when the same tool runs twice in a row. + */ +export function bumpLiveProgress(lp) { + if (!lp) return; + lp.seq = (lp.seq ?? 0) + 1; + lp.lastUpdate = Date.now(); +} + +function jsonlEnabled() { + const v = process.env.OPENPRISM_TOOL_TRACE_JSONL; + if (v === '0' || v === 'false' || v === 'no') return false; + return true; +} + +export function pushToolTraceRecent(job, entry) { + if (!job?.toolTraceRecent) return; + job.toolTraceRecent.push(entry); + while (job.toolTraceRecent.length > MAX_TOOL_TRACE_RECENT) { + job.toolTraceRecent.shift(); + } +} + +async function appendToolTraceJsonl(projectRoot, jobId, record) { + if (!jsonlEnabled() || !projectRoot || !jobId) return; + try { + const dir = path.join(projectRoot, '.agent_runs', jobId); + await ensureDir(dir); + const file = path.join(dir, 'tool_trace.jsonl'); + await fs.appendFile(file, `${JSON.stringify(record)}\n`, 'utf8'); + } catch (e) { + console.warn('[toolTrace] append failed', e?.message || e); + } +} + +/** + * Execute a bound agent tool with JSONL + in-memory trace and liveProgress bumps. + * @param {object} opts + * @param {object} [opts.config] - LangGraph runnable config + * @param {object} [opts.lp] - job.liveProgress + * @param {string} [opts.projectRoot] - target workspace root (for .agent_runs) + * @param {string} opts.jobId + * @param {'planner'|'generator'|'reviewer'} opts.agent + * @param {number} opts.iteration + * @param {number} opts.round + * @param {{ id: string, name: string, args: object }} opts.toolCall + * @param {() => Promise} opts.invokeFn + */ +export async function runAgentToolCall(opts) { + const { + config, + lp, + projectRoot, + jobId, + agent, + iteration, + round, + toolCall, + invokeFn, + } = opts; + + const recordTool = typeof config?.configurable?._recordToolTrace === 'function' + ? config.configurable._recordToolTrace + : null; + + const toolName = toolCall.name; + const argsBrief = briefToolArgs(toolName, toolCall.args); + const t0 = Date.now(); + const activeRole = agent === 'planner' ? 'planner' : agent === 'generator' ? 
'generator' : 'reviewer'; + + const base = { + ts: t0, + agent, + iteration, + round, + tool: toolName, + argsBrief, + toolCallId: toolCall.id, + }; + + if (lp) { + lp.activeRole = activeRole; + lp.toolName = toolName; + lp.toolArgs = argsBrief; + lp.toolRound = round; + bumpLiveProgress(lp); + } + + await appendToolTraceJsonl(projectRoot, jobId, { ...base, phase: 'start' }); + + try { + const result = await invokeFn(); + const durationMs = Date.now() - t0; + const endEntry = { + ...base, + ts: Date.now(), + phase: 'end', + durationMs, + ok: true, + }; + await appendToolTraceJsonl(projectRoot, jobId, endEntry); + if (recordTool) recordTool(endEntry); + if (lp) bumpLiveProgress(lp); + return result; + } catch (err) { + const durationMs = Date.now() - t0; + const endEntry = { + ...base, + ts: Date.now(), + phase: 'end', + durationMs, + ok: false, + error: err?.message || String(err), + }; + await appendToolTraceJsonl(projectRoot, jobId, endEntry); + if (recordTool) recordTool(endEntry); + if (lp) bumpLiveProgress(lp); + throw err; + } +} + +/** + * Record a failed tool resolution (unknown tool name) without invoking. + */ +export async function recordUnknownToolTrace(opts) { + const { + config, + lp, + projectRoot, + jobId, + agent, + iteration, + round, + toolName, + } = opts; + + const recordTool = typeof config?.configurable?._recordToolTrace === 'function' + ? config.configurable._recordToolTrace + : null; + + const base = { + ts: Date.now(), + agent, + iteration, + round, + tool: toolName, + argsBrief: '', + phase: 'end', + durationMs: 0, + ok: false, + error: 'Unknown tool', + }; + + if (lp) { + lp.activeRole = agent === 'planner' ? 'planner' : agent === 'generator' ? 'generator' : 'reviewer'; + lp.toolName = toolName; + lp.toolArgs = '(unknown)'; + lp.toolRound = round; + bumpLiveProgress(lp); + } + + await appendToolTraceJsonl(projectRoot, jobId, base); + if (recordTool) recordTool(base); +} diff --git a/apps/backend/src/services/transferAgent/tools/applyDiff.js b/apps/backend/src/services/transferAgent/tools/applyDiff.js new file mode 100644 index 0000000..4e06150 --- /dev/null +++ b/apps/backend/src/services/transferAgent/tools/applyDiff.js @@ -0,0 +1,59 @@ +import { z } from 'zod'; +import { DynamicStructuredTool } from '@langchain/core/tools'; +import { promises as fs } from 'fs'; +import { safeJoin } from '../../../utils/pathUtils.js'; +import { writeFileWithSnapshot } from '../utils.js'; +import { + extractUnifiedDiff, + applyUnifiedDiffToMainTex, +} from '../llmUnifiedDiff.js'; +import { unmaskContent } from '../masking/index.js'; + +/** + * Creates the applyDiff tool — applies a unified diff patch to a target file. + * Wraps the existing llmUnifiedDiff infrastructure. + * + * @param {{ workspaceRoot: string, jobId: string }} ctx + */ +export function createApplyDiffTool(ctx) { + return new DynamicStructuredTool({ + name: 'applyDiff', + description: + 'Apply a unified diff (git format) to a file in the target project. ' + + 'The diff must include proper --- a/ and +++ b/ headers and @@ hunk headers. ' + + 'Context lines (space prefix) and removed lines (-) must match the file exactly. ' + + 'Returns OK with the new file length, or an error reason if the patch cannot be applied.', + schema: z.object({ + path: z + .string() + .describe('Relative file path in target project, e.g. 
"main.tex"'), + diff: z + .string() + .describe('Unified diff in git format (--- a/path, +++ b/path, @@ hunks)'), + }), + func: async ({ path, diff }) => { + try { + const abs = safeJoin(ctx.workspaceRoot, path); + const baseTex = await fs.readFile(abs, 'utf8'); + const patchText = extractUnifiedDiff(diff); + if (!patchText) { + return '[ERROR] No valid unified diff found in the provided text. Ensure --- a/ and +++ b/ headers are present.'; + } + const result = applyUnifiedDiffToMainTex(baseTex, patchText); + if (!result.ok) { + return `[ERROR] Patch failed: ${result.reason}. Context/remove lines must match the file exactly.`; + } + const unmasked = ctx.enableSensitiveMask + ? unmaskContent(result.text, ctx.sourceMaskManifest) + : { content: result.text, restored: 0, remaining: 0 }; + await writeFileWithSnapshot(ctx.workspaceRoot, path, unmasked.content, ctx.jobId); + const maskNote = ctx.enableSensitiveMask + ? ` Restored ${unmasked.restored} token(s); remaining=${unmasked.remaining}.` + : ''; + return `[OK] Patch applied successfully. File is now ${unmasked.content.length} chars.${maskNote}`; + } catch (err) { + return `[ERROR] applyDiff failed on target:${path} — ${err.message}`; + } + }, + }); +} diff --git a/apps/backend/src/services/transferAgent/tools/compileProject.js b/apps/backend/src/services/transferAgent/tools/compileProject.js new file mode 100644 index 0000000..ac54dd7 --- /dev/null +++ b/apps/backend/src/services/transferAgent/tools/compileProject.js @@ -0,0 +1,149 @@ +import { z } from 'zod'; +import { DynamicStructuredTool } from '@langchain/core/tools'; +import { runCompile, SUPPORTED_ENGINES } from '../../compileService.js'; +import { ChatOpenAI } from '@langchain/openai'; +import { resolveLLMConfig, normalizeBaseURL } from '../../llmService.js'; + +const MAX_LOG_FOR_LLM = 14_000; +const MAX_SUMMARY_OUTPUT = 3_500; + +/** + * Heuristic summary when LLM is unavailable or fails. + * @param {string} log + * @param {boolean} ok + * @param {number} [status] + */ +function heuristicCompileSummary(log, ok, status) { + const lines = (log || '').split('\n'); + const interesting = lines.filter( + (l) => + /^! /.test(l) || + /LaTeX Error|Emergency stop|Fatal error|Undefined control sequence|Missing .* inserted|not found|error:/i.test(l), + ); + const tail = interesting.slice(-25); + const head = `compile_ok=${ok} exit=${status ?? 'n/a'}`; + if (!tail.length) { + return `${head}\n(no obvious error lines in log tail; raw log length=${(log || '').length})`; + } + return `${head}\n--- error-ish lines (last ${tail.length}) ---\n${tail.join('\n')}`; +} + +/** + * Ask a small LLM pass to compress the raw TeX log for the agent loop. + */ +async function summarizeCompileLogWithLlm(fullLog, meta, llmConfig) { + const log = (fullLog || '').trim(); + const slice = + log.length <= MAX_LOG_FOR_LLM + ? log + : `…[truncated ${log.length - MAX_LOG_FOR_LLM} chars from start]…\n${log.slice(-MAX_LOG_FOR_LLM)}`; + + const { endpoint, apiKey, model } = resolveLLMConfig(llmConfig); + if (!apiKey) { + return heuristicCompileSummary(fullLog, meta.ok, meta.status); + } + + const llm = new ChatOpenAI({ + modelName: model, + openAIApiKey: apiKey, + configuration: { baseURL: normalizeBaseURL(endpoint) }, + temperature: 0.1, + maxTokens: 900, + }); + + const user = `Compile metadata: +- success (PDF produced): ${meta.ok} +- process exit code: ${meta.status ?? 'unknown'} +- engine: ${meta.engine} +- main file: ${meta.mainFile} +${meta.error ? 
`- early failure message: ${meta.error}` : ''} + +Raw compiler log (may be truncated): +--- +${slice} +--- + +Reply with a concise summary for another LLM agent (plain text, no JSON): +1. One-line outcome (PASS / FAIL). +2. If FAIL: list each distinct error with file:line when visible, and the underlying cause in one short phrase each. +3. If FAIL: 1–3 concrete fix hints (what to change in .tex / missing files / packages). +4. If PASS: note any non-fatal warnings worth fixing (optional, brief). +Keep under ${MAX_SUMMARY_OUTPUT} characters.`; + + try { + const res = await llm.invoke([ + { + role: 'system', + content: + 'You summarize LaTeX compile logs for automated migration agents. Be precise and actionable; do not invent file names or line numbers that are not in the log.', + }, + { role: 'user', content: user }, + ]); + const text = + typeof res.content === 'string' + ? res.content + : Array.isArray(res.content) + ? res.content.map((p) => (typeof p === 'string' ? p : p?.text || '')).join('') + : ''; + const out = (text || '').trim(); + if (!out) return heuristicCompileSummary(fullLog, meta.ok, meta.status); + return out.length > MAX_SUMMARY_OUTPUT ? `${out.slice(0, MAX_SUMMARY_OUTPUT)}…` : out; + } catch (e) { + return `${heuristicCompileSummary(fullLog, meta.ok, meta.status)}\n[LLM summary failed: ${e?.message || e}]`; + } +} + +/** + * Run LaTeX on the **target** project using the engine the user chose at transfer start, + * then return an LLM-compressed log summary for the agent. + * + * @param {object} ctx + * @param {string} ctx.targetProjectId + * @param {string} ctx.targetMainFile + * @param {string} [ctx.engine='pdflatex'] + * @param {object} [ctx.llmConfig] + */ +export function createCompileProjectTool(ctx) { + return new DynamicStructuredTool({ + name: 'compileProject', + description: + 'Compile the target LaTeX project using the engine the user selected when starting the transfer ' + + '(same as the pipeline compile step: pdflatex, xelatex, lualatex, latexmk, or tectonic). ' + + 'Returns a short LLM-produced summary of the compile log (errors, file:line hints, fix suggestions) — not the full raw log. ' + + 'Use after substantive .tex edits to verify the project builds.', + schema: z.object({}).describe('No arguments; engine and main file come from the transfer job.'), + func: async () => { + const projectId = ctx.targetProjectId; + const mainFile = ctx.targetMainFile; + const engine = ctx.engine && SUPPORTED_ENGINES.includes(ctx.engine) ? ctx.engine : 'pdflatex'; + + if (!projectId || !mainFile) { + return '[ERROR] compileProject: missing targetProjectId or targetMainFile in job context.'; + } + + let result; + try { + result = await runCompile({ projectId, mainFile, engine }); + } catch (err) { + return `[ERROR] compileProject: ${err?.message || err}`; + } + + const log = result.log || ''; + const meta = { + ok: !!result.ok, + status: result.status, + engine, + mainFile, + error: result.error || '', + }; + + const summary = await summarizeCompileLogWithLlm(log, meta, ctx.llmConfig); + + const header = `[compileProject] engine=${engine} main=${mainFile} ok=${meta.ok} exit=${meta.status ?? 
'n/a'}`; + if (result.error && !log) { + return `${header}\n${summary}\n(raw error: ${result.error})`; + } + return `${header}\n\n--- log summary ---\n${summary}`; + }, + }); +} diff --git a/apps/backend/src/services/transferAgent/tools/copyAsset.js b/apps/backend/src/services/transferAgent/tools/copyAsset.js new file mode 100644 index 0000000..4acc67b --- /dev/null +++ b/apps/backend/src/services/transferAgent/tools/copyAsset.js @@ -0,0 +1,55 @@ +import { z } from 'zod'; +import { DynamicStructuredTool } from '@langchain/core/tools'; +import { promises as fs } from 'fs'; +import path from 'path'; +import { safeJoin } from '../../../utils/pathUtils.js'; +import { ensureDir } from '../../../utils/fsUtils.js'; + +/** + * Creates the copyAsset tool — copies a file from source project to target project. + * + * @param {{ sourceReadRoot: string, workspaceRoot: string }} ctx + */ +export function createCopyAssetTool(ctx) { + return new DynamicStructuredTool({ + name: 'copyAsset', + description: + 'Copy a file from the source project to the target (workspace) project. ' + + 'Use this for .bib, .bbl, images (.png, .jpg, .pdf, .eps), ' + + 'and style files (.sty, .cls, .bst). ' + + 'If destPath is omitted, the file is placed at the same relative path.', + schema: z.object({ + srcPath: z + .string() + .describe('Relative file path in the source project, e.g. "refs.bib" or "figures/fig1.png"'), + destPath: z + .string() + .optional() + .describe('Destination path in target project. Defaults to same as srcPath.'), + }), + func: async ({ srcPath, destPath }) => { + try { + const dest = destPath || srcPath; + const srcAbs = safeJoin(ctx.sourceReadRoot, srcPath); + const destAbs = safeJoin(ctx.workspaceRoot, dest); + + // Check source exists + try { + await fs.access(srcAbs); + } catch { + return `[ERROR] Source file not found: source:${srcPath}`; + } + + // Ensure destination directory exists + await ensureDir(path.dirname(destAbs)); + + // Copy + await fs.copyFile(srcAbs, destAbs); + const stat = await fs.stat(destAbs); + return `[OK] Copied source:${srcPath} → target:${dest} (${stat.size} bytes)`; + } catch (err) { + return `[ERROR] Copy failed: ${err.message}`; + } + }, + }); +} diff --git a/apps/backend/src/services/transferAgent/tools/grepFile.js b/apps/backend/src/services/transferAgent/tools/grepFile.js new file mode 100644 index 0000000..887bdf6 --- /dev/null +++ b/apps/backend/src/services/transferAgent/tools/grepFile.js @@ -0,0 +1,99 @@ +import { z } from 'zod'; +import { DynamicStructuredTool } from '@langchain/core/tools'; +import { promises as fs } from 'fs'; +import path from 'path'; +import { safeJoin } from '../../../utils/pathUtils.js'; +import { listFilesRecursive } from '../../../utils/fsUtils.js'; + +/** + * Creates the grepFile tool — searches file contents with a regex pattern. + * + * @param {{ sourceReadRoot: string, workspaceRoot: string }} ctx + */ +export function createGrepFileTool(ctx) { + return new DynamicStructuredTool({ + name: 'grepFile', + description: + 'Search file contents in the source or target project using a regular expression. ' + + 'Returns matching lines with line numbers and surrounding context. ' + + 'Use glob to filter by file extension (e.g. "*.tex", "*.bib").', + schema: z.object({ + project: z + .enum(['source', 'target']) + .describe('Which project to search'), + pattern: z + .string() + .describe('Regular expression pattern to search for'), + glob: z + .string() + .optional() + .describe('File glob pattern to filter, e.g. 
"*.tex" or "*.bib"'), + }), + func: async ({ project, pattern, glob }) => { + try { + const root = + project === 'source' ? ctx.sourceReadRoot : ctx.workspaceRoot; + const allFiles = await listFilesRecursive(root); + const files = allFiles + .filter((f) => f.type === 'file') + .filter((f) => { + if (!glob) return true; + // Simple glob: *.ext matching + if (glob.startsWith('*.')) { + const ext = glob.slice(1); // e.g. ".tex" + return f.path.endsWith(ext); + } + return f.path.includes(glob); + }); + + let re; + try { + re = new RegExp(pattern, 'gim'); + } catch { + return `[ERROR] Invalid regex pattern: ${pattern}`; + } + + const results = []; + let totalMatches = 0; + const MAX_MATCHES = 100; + + for (const file of files) { + if (totalMatches >= MAX_MATCHES) break; + let content; + try { + content = await fs.readFile(safeJoin(root, file.path), 'utf8'); + } catch { + continue; + } + const lines = content.split('\n'); + for (let i = 0; i < lines.length; i++) { + if (totalMatches >= MAX_MATCHES) break; + if (re.test(lines[i])) { + re.lastIndex = 0; // reset for global regex + const ctxStart = Math.max(0, i - 1); + const ctxEnd = Math.min(lines.length - 1, i + 1); + const snippet = []; + for (let j = ctxStart; j <= ctxEnd; j++) { + const prefix = j === i ? '>>>' : ' '; + snippet.push(`${prefix} ${j + 1}: ${lines[j]}`); + } + results.push(`--- ${file.path} ---\n${snippet.join('\n')}`); + totalMatches++; + } + } + } + + if (!results.length) { + return `No matches found for /${pattern}/ in ${project} project${glob ? ` (glob: ${glob})` : ''}.`; + } + const truncNote = + totalMatches >= MAX_MATCHES + ? `\n\n[TRUNCATED — showing first ${MAX_MATCHES} matches]` + : ''; + return results.join('\n\n') + truncNote; + } catch (err) { + return `[ERROR] grep failed: ${err.message}`; + } + }, + }); +} diff --git a/apps/backend/src/services/transferAgent/tools/index.js b/apps/backend/src/services/transferAgent/tools/index.js new file mode 100644 index 0000000..b86990e --- /dev/null +++ b/apps/backend/src/services/transferAgent/tools/index.js @@ -0,0 +1,86 @@ +/** + * Agent tools registry. + * + * Creates all tools bound to a specific job context (workspace roots, jobId). + * Returns an array of DynamicStructuredTool instances ready for bind_tools(). 
+ */ + +import { createReadFileTool } from './readFile.js'; +import { createWriteFileTool } from './writeFile.js'; +import { createApplyDiffTool } from './applyDiff.js'; +import { createGrepFileTool } from './grepFile.js'; +import { createListProjectTreeTool } from './listProjectTree.js'; +import { createCopyAssetTool } from './copyAsset.js'; +import { createRaiseQuestionTool } from './raiseQuestion.js'; +import { createMeasureFiguresTool } from './measureFigures.js'; +import { createCompileProjectTool } from './compileProject.js'; + +/** + * @param {object} ctx + * @param {string} ctx.sourceReadRoot — absolute path to source project + * @param {string} ctx.workspaceRoot — absolute path to target workspace + * @param {string} ctx.jobId — transfer job ID (for snapshots) + * @param {string} [ctx.targetProjectId] — for compileProject + * @param {string} [ctx.targetMainFile] — for compileProject + * @param {string} [ctx.engine] — user-selected LaTeX engine + * @param {object} [ctx.llmConfig] — for compile log summarization + * @returns {import('@langchain/core/tools').DynamicStructuredTool[]} + */ +export function createAllTools(ctx) { + return [ + createReadFileTool(ctx), + createWriteFileTool(ctx), + createApplyDiffTool(ctx), + createGrepFileTool(ctx), + createListProjectTreeTool(ctx), + createCopyAssetTool(ctx), + createRaiseQuestionTool(ctx), + createMeasureFiguresTool(ctx), + createCompileProjectTool(ctx), + ]; +} + +/** + * Create a subset of tools (read-only) for Planner and Reviewer nodes. + * These nodes should NOT write files or apply diffs. + */ +export function createReadOnlyTools(ctx) { + return [ + createReadFileTool(ctx), + createGrepFileTool(ctx), + createListProjectTreeTool(ctx), + createRaiseQuestionTool(ctx), + createCompileProjectTool(ctx), + ]; +} + +/** + * Create the full tool set for the Generator node. + * Generator can read, write, diff, copy, and grep. + */ +export function createGeneratorTools(ctx) { + return [ + createReadFileTool(ctx), + createWriteFileTool(ctx), + createApplyDiffTool(ctx), + createGrepFileTool(ctx), + createListProjectTreeTool(ctx), + createCopyAssetTool(ctx), + createMeasureFiguresTool(ctx), + createCompileProjectTool(ctx), + ]; +} + +/** + * Create tools for the Reviewer node. + * Reviewer can read, grep, list, but also raiseQuestion for user confirmations. + */ +export function createReviewerTools(ctx) { + return [ + createReadFileTool(ctx), + createGrepFileTool(ctx), + createListProjectTreeTool(ctx), + createRaiseQuestionTool(ctx), + createCompileProjectTool(ctx), + ]; +} diff --git a/apps/backend/src/services/transferAgent/tools/listProjectTree.js b/apps/backend/src/services/transferAgent/tools/listProjectTree.js new file mode 100644 index 0000000..25aafd6 --- /dev/null +++ b/apps/backend/src/services/transferAgent/tools/listProjectTree.js @@ -0,0 +1,42 @@ +import { z } from 'zod'; +import { DynamicStructuredTool } from '@langchain/core/tools'; +import { listFilesRecursive } from '../../../utils/fsUtils.js'; + +/** + * Creates the listProjectTree tool — lists files in a project directory. + * + * @param {{ sourceReadRoot: string, workspaceRoot: string }} ctx + */ +export function createListProjectTreeTool(ctx) { + return new DynamicStructuredTool({ + name: 'listProjectTree', + description: + 'List all files in the source or target project directory tree. ' + + 'Returns file paths with their types (file/directory). 
' +
+      'Useful for understanding project structure before reading specific files.',
+    schema: z.object({
+      project: z
+        .enum(['source', 'target'])
+        .describe('Which project to list files from'),
+    }),
+    func: async ({ project }) => {
+      try {
+        const root =
+          project === 'source' ? ctx.sourceReadRoot : ctx.workspaceRoot;
+        const entries = await listFilesRecursive(root);
+        if (!entries.length) {
+          return `(empty — no files found in ${project} project)`;
+        }
+        const tree = entries
+          .map((e) => {
+            const icon = e.type === 'file' ? ' ' : ' [dir]';
+            return `${icon} ${e.path}`;
+          })
+          .join('\n');
+        return `${project} project files:\n${tree}`;
+      } catch (err) {
+        return `[ERROR] Failed to list ${project} project tree: ${err.message}`;
+      }
+    },
+  });
+}
diff --git a/apps/backend/src/services/transferAgent/tools/measureFigures.js b/apps/backend/src/services/transferAgent/tools/measureFigures.js
new file mode 100644
index 0000000..044de22
--- /dev/null
+++ b/apps/backend/src/services/transferAgent/tools/measureFigures.js
@@ -0,0 +1,241 @@
+import { z } from 'zod';
+import { DynamicStructuredTool } from '@langchain/core/tools';
+import { promises as fs } from 'fs';
+import path from 'path';
+import { safeJoin } from '../../../utils/pathUtils.js';
+
+/**
+ * Known text-widths (in pt) for common document classes / layouts.
+ * These are the widths of the text body (single column) at default settings.
+ *
+ * For twocolumn documents the *column* width is roughly half of
+ * textwidth minus columnsep, which is what \linewidth resolves to
+ * inside a column.
+ */
+const LAYOUT_DB = {
+  // NeurIPS: 5.5 in text width => 396 pt
+  neurips: { textwidthPt: 396, columnwidthPt: 396, columns: 1 },
+  // Standard article 10 pt, letterpaper: ~345 pt
+  article: { textwidthPt: 345, columnwidthPt: 345, columns: 1 },
+  // revtex4-1 / revtex4-2 twocolumn (APS default): textwidth ≈ 510 pt, colwidth ≈ 246 pt
+  'revtex4-1': { textwidthPt: 510, columnwidthPt: 246, columns: 2 },
+  'revtex4-2': { textwidthPt: 510, columnwidthPt: 246, columns: 2 },
+  revtex: { textwidthPt: 510, columnwidthPt: 246, columns: 2 },
+  // IEEEtran twocolumn: textwidth ≈ 516 pt, colwidth ≈ 252 pt
+  IEEEtran: { textwidthPt: 516, columnwidthPt: 252, columns: 2 },
+  // LNCS (Springer): textwidth ≈ 336 pt
+  llncs: { textwidthPt: 336, columnwidthPt: 336, columns: 1 },
+  // ACM acmart sigconf twocolumn: textwidth ≈ 506 pt, colwidth ≈ 241 pt
+  acmart: { textwidthPt: 506, columnwidthPt: 241, columns: 2 },
+  // CVPR / ICCV twocolumn: textwidth ≈ 496 pt, colwidth ≈ 237 pt
+  cvpr: { textwidthPt: 496, columnwidthPt: 237, columns: 2 },
+  // ICML: textwidth ≈ 487 pt, colwidth ≈ 233 pt
+  icml: { textwidthPt: 487, columnwidthPt: 233, columns: 2 },
+};
+
+/**
+ * Parse the MediaBox / page size from a PDF file header (first 8 KB).
+ * Returns { widthPt, heightPt } or null.
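+ *
+ * @example
+ * // A US Letter page typically declares `/MediaBox [0 0 612 792]`,
+ * // which this helper reports as { widthPt: 612, heightPt: 792 }.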
diff --git a/apps/backend/src/services/transferAgent/tools/measureFigures.js b/apps/backend/src/services/transferAgent/tools/measureFigures.js
new file mode 100644
index 0000000..044de22
--- /dev/null
+++ b/apps/backend/src/services/transferAgent/tools/measureFigures.js
@@ -0,0 +1,241 @@
+import { z } from 'zod';
+import { DynamicStructuredTool } from '@langchain/core/tools';
+import { promises as fs } from 'fs';
+import path from 'path';
+import { safeJoin } from '../../../utils/pathUtils.js';
+
+/**
+ * Known text-width (in pt) for common document classes / layouts.
+ * These are the width of the text body (single column) at default settings.
+ *
+ * For twocolumn documents the *column* width is roughly half of
+ * textwidth minus columnsep, which is what \linewidth resolves to
+ * inside a column.
+ */
+const LAYOUT_DB = {
+  // NeurIPS: 5.5 in text width => 396 pt
+  neurips: { textwidthPt: 396, columnwidthPt: 396, columns: 1 },
+  // Standard article 10 pt, letterpaper: ~345 pt
+  article: { textwidthPt: 345, columnwidthPt: 345, columns: 1 },
+  // revtex4-1 / revtex4-2 twocolumn (APS default): textwidth ≈ 510 pt, colwidth ≈ 246 pt
+  'revtex4-1': { textwidthPt: 510, columnwidthPt: 246, columns: 2 },
+  'revtex4-2': { textwidthPt: 510, columnwidthPt: 246, columns: 2 },
+  revtex: { textwidthPt: 510, columnwidthPt: 246, columns: 2 },
+  // IEEEtran twocolumn: textwidth ≈ 516 pt, colwidth ≈ 252 pt
+  IEEEtran: { textwidthPt: 516, columnwidthPt: 252, columns: 2 },
+  // LNCS (Springer): textwidth ≈ 336 pt
+  llncs: { textwidthPt: 336, columnwidthPt: 336, columns: 1 },
+  // ACM acmart sigconf twocolumn: textwidth ≈ 506 pt, colwidth ≈ 241 pt
+  acmart: { textwidthPt: 506, columnwidthPt: 241, columns: 2 },
+  // CVPR / ICCV twocolumn: textwidth ≈ 496 pt, colwidth ≈ 237 pt
+  cvpr: { textwidthPt: 496, columnwidthPt: 237, columns: 2 },
+  // ICML: textwidth ≈ 487 pt, colwidth ≈ 233 pt
+  icml: { textwidthPt: 487, columnwidthPt: 233, columns: 2 },
+};
+
+/**
+ * Parse the MediaBox / page size from a PDF file header (first 8 KB).
+ * Returns { widthPt, heightPt } or null.
+ */
+async function pdfPageSize(filePath) {
+  let buf;
+  try {
+    const fd = await fs.open(filePath, 'r');
+    buf = Buffer.alloc(8192);
+    await fd.read(buf, 0, 8192, 0);
+    await fd.close();
+  } catch {
+    return null;
+  }
+
+  const str = buf.toString('latin1');
+
+  // Try /MediaBox [x0 y0 x1 y1]
+  const mediaMatch = str.match(/\/MediaBox\s*\[\s*([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s*\]/);
+  if (mediaMatch) {
+    const w = parseFloat(mediaMatch[3]) - parseFloat(mediaMatch[1]);
+    const h = parseFloat(mediaMatch[4]) - parseFloat(mediaMatch[2]);
+    if (w > 0 && h > 0) return { widthPt: Math.round(w * 100) / 100, heightPt: Math.round(h * 100) / 100 };
+  }
+
+  return null;
+}
+
+/**
+ * Measure the natural dimensions of a raster image (PNG/JPG) in pt.
+ * Assumes 150 dpi when converting pixel dimensions to pt (72 pt per inch).
+ */
+async function rasterSize(filePath) {
+  try {
+    const buf = Buffer.alloc(32);
+    const fd = await fs.open(filePath, 'r');
+    await fd.read(buf, 0, 32, 0);
+    await fd.close();
+
+    // PNG: width at byte 16-19, height at byte 20-23 (big-endian)
+    if (buf[0] === 0x89 && buf[1] === 0x50) {
+      const w = buf.readUInt32BE(16);
+      const h = buf.readUInt32BE(20);
+      return { widthPt: w * 72 / 150, heightPt: h * 72 / 150 }; // assume 150 dpi
+    }
+
+    // JPEG: need to find SOF marker — simpler: just return null and let the tool skip
+    return null;
+  } catch {
+    return null;
+  }
+}
+
+/**
+ * Compute the effective \linewidth (in pt) a figure sees.
+ * For twocolumn documents, \linewidth inside a normal figure = columnwidth.
+ * For figure*, \linewidth = textwidth.
+ */
+function effectiveLinewidth(layout, isStar) {
+  if (!layout) return null;
+  return isStar ? layout.textwidthPt : layout.columnwidthPt;
+}
+
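Reviewer note: the MediaBox scan above is a plain regex over the first 8 KB of the file; a quick self-contained check of the arithmetic (the sample string is hypothetical; US Letter is 612 x 792 pt):

const sample = '%PDF-1.7 ... /Type /Page /MediaBox [ 0 0 612 792 ] ...';
const m = sample.match(/\/MediaBox\s*\[\s*([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s*\]/);
if (m) {
  const widthPt = parseFloat(m[3]) - parseFloat(m[1]);   // 612
  const heightPt = parseFloat(m[4]) - parseFloat(m[2]);  // 792
  console.log(widthPt, heightPt);
}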
"\\linewidth", "0.48\\textwidth", "3in"'), + isStar: z + .boolean() + .default(false) + .describe('Whether this figure is in a figure* (full-width) environment'), + }), + ) + .describe('List of figures to measure'), + }), + func: async ({ sourceClass, sourceTwocolumn, targetClass, figures }) => { + try { + // Resolve layouts + let srcLayout = LAYOUT_DB[sourceClass] || null; + // If the class itself is single-column but twocolumn flag is set, + // approximate column width as (textwidth - 20 pt columnsep) / 2 + if (srcLayout && sourceTwocolumn && srcLayout.columns === 1) { + srcLayout = { + ...srcLayout, + columnwidthPt: Math.round((srcLayout.textwidthPt - 20) / 2), + columns: 2, + }; + } + const tgtLayout = LAYOUT_DB[targetClass] || LAYOUT_DB.neurips; + + const results = []; + + for (const fig of figures) { + const absPath = safeJoin(ctx.workspaceRoot, fig.file); + let naturalSize = null; + + // Try to measure the image + const ext = path.extname(fig.file).toLowerCase(); + if (ext === '.pdf') { + naturalSize = await pdfPageSize(absPath); + } else if (['.png', '.jpg', '.jpeg'].includes(ext)) { + naturalSize = await rasterSize(absPath); + } + + // Compute the effective width the figure occupied in the source + const srcLinewidth = effectiveLinewidth(srcLayout, fig.isStar); + const tgtLinewidth = effectiveLinewidth(tgtLayout, false); // NeurIPS is always single-col + + // Determine recommended width + let recommendation = ''; + let recommendedSpec = ''; + + if (srcLinewidth && tgtLinewidth) { + // The ratio: how much of \linewidth in the source did the figure use? + // If currentWidth is "\linewidth" or "1\linewidth", ratio = 1.0 + // If "0.48\textwidth" in twocolumn source, actual = 0.48 * textwidth + const srcEffectivePt = srcLinewidth; // assume width=\linewidth by default + const ratio = srcEffectivePt / tgtLinewidth; + + if (ratio < 0.75) { + // Source figure was narrower than target \linewidth — keep as-is or minor adjust + recommendedSpec = `${Math.round(ratio * 100) / 100}\\linewidth`; + recommendation = `Source figure occupied ${Math.round(srcLinewidth)}pt; target \\linewidth is ${Math.round(tgtLinewidth)}pt. Scale to ${recommendedSpec} to preserve visual proportion.`; + } else if (ratio >= 0.75 && ratio <= 1.05) { + // Close to full width — use \linewidth + recommendedSpec = '\\linewidth'; + recommendation = `Source and target widths are similar — \\linewidth is fine.`; + } else { + // Source column was wider than target (unusual) or figure* in twocolumn → very wide + // Scale down to fit + const scaledRatio = Math.min(ratio, 1.0); + recommendedSpec = `${Math.round(scaledRatio * 100) / 100}\\linewidth`; + recommendation = `Source figure was ${Math.round(srcLinewidth)}pt wide (${srcLayout?.columns === 2 ? 'figure* spanning full textwidth' : 'single column'}); target is ${Math.round(tgtLinewidth)}pt. Use ${recommendedSpec}.`; + } + } + + // If the figure is very tall relative to the target page, also warn + let heightWarning = ''; + if (naturalSize && tgtLinewidth) { + const scaledWidth = tgtLinewidth; // if using \linewidth + const scaledHeight = naturalSize.heightPt * (scaledWidth / naturalSize.widthPt); + const pageHeight = 650; // NeurIPS text height ≈ 650 pt + const heightRatio = scaledHeight / pageHeight; + if (heightRatio > 0.65) { + heightWarning = `At \\linewidth, figure height would be ${Math.round(scaledHeight)}pt (${Math.round(heightRatio * 100)}% of page). 
diff --git a/apps/backend/src/services/transferAgent/tools/raiseQuestion.js b/apps/backend/src/services/transferAgent/tools/raiseQuestion.js
new file mode 100644
index 0000000..4ba7f2d
--- /dev/null
+++ b/apps/backend/src/services/transferAgent/tools/raiseQuestion.js
@@ -0,0 +1,58 @@
+import { z } from 'zod';
+import { DynamicStructuredTool } from '@langchain/core/tools';
+import { interrupt } from '@langchain/langgraph';
+
+/**
+ * Creates the raiseQuestion tool — pauses the graph and asks the user a question.
+ * The frontend will display the question(s) and resume with answers via POST /submit-confirm.
+ *
+ * When this tool is called, it triggers a LangGraph interrupt. The graph will
+ * be suspended until the user provides answers via the API.
+ *
+ * @param {{ getState: () => object }} ctx — accessor for current graph state
+ */
+export function createRaiseQuestionTool(ctx) {
+  return new DynamicStructuredTool({
+    name: 'raiseQuestion',
+    description:
+      'Ask the user one or more questions and pause execution until they respond. ' +
+      'Use this ONLY when you genuinely need user input to proceed ' +
+      '(e.g. ambiguous migration choices, blind-review decisions). ' +
+      'Do NOT use this for information you can determine from the files.',
+    schema: z.object({
+      questions: z
+        .array(
+          z.object({
+            id: z.string().describe('Unique question identifier, e.g. "float_strategy"'),
+            prompt: z.string().describe('The question text to show the user'),
+            options: z
+              .array(z.string())
+              .describe('Available answer choices'),
+          }),
+        )
+        .min(1)
+        .max(5)
+        .describe('Array of questions to ask'),
+    }),
+    func: async ({ questions }) => {
+      // Format questions for the pendingQA state field
+      const pendingQA = questions.map((q) => ({
+        id: q.id,
+        prompt: q.prompt,
+        type: 'single',
+        options: q.options,
+      }));
+
+      // Trigger a LangGraph interrupt: on the first pass this throws and
+      // suspends the graph; the payload below is surfaced by the graph runner.
+      interrupt({
+        type: 'raiseQuestion',
+        pendingQA,
+      });
+
+      // This return value is only reached after the graph resumes;
+      // the actual answers will be in state.userConfirmations.
+      return '[PAUSED] Questions sent to user. Awaiting response. When resumed, check state.userConfirmations for answers.';
+    },
+  });
+}
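Reviewer note: on the runner side, resuming after a raiseQuestion interrupt uses a LangGraph Command; a sketch, assuming `graph` is a compiled graph with a checkpointer and `answers` is the map posted to /submit-confirm:

import { Command } from '@langchain/langgraph';

// Hypothetical resume helper: re-invokes the same thread with the answers.
export async function resumeAfterQuestions(graph, jobId, answers) {
  return graph.invoke(
    new Command({ resume: answers }),
    { configurable: { thread_id: jobId } },
  );
}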
diff --git a/apps/backend/src/services/transferAgent/tools/readFile.js b/apps/backend/src/services/transferAgent/tools/readFile.js
new file mode 100644
index 0000000..93cafee
--- /dev/null
+++ b/apps/backend/src/services/transferAgent/tools/readFile.js
@@ -0,0 +1,90 @@
+import { z } from 'zod';
+import { DynamicStructuredTool } from '@langchain/core/tools';
+import { readSourceFile, readWorkspaceFile } from '../fsTools.js';
+import { getMaskedSourceContent } from '../masking/index.js';
+
+/**
+ * Creates the readFile tool bound to a specific job's workspace roots.
+ *
+ * The agent uses this to read any file from the source or target project,
+ * with automatic \input{} resolution for .tex files if needed.
+ *
+ * @param {{ sourceReadRoot: string, workspaceRoot: string, enableSensitiveMask?: boolean, sourceMaskedContents?: object }} ctx
+ */
+export function createReadFileTool(ctx) {
+  return new DynamicStructuredTool({
+    name: 'readFile',
+    description:
+      'Read a file from the source or target (workspace) project. ' +
+      'Use project="source" to read the original paper, project="target" to read the NeurIPS workspace being built. ' +
+      'Supports optional 1-based startLine/endLine for partial reads. ' +
+      'Returns the file content as a string (truncated to 60 000 chars).',
+    schema: z.object({
+      project: z
+        .enum(['source', 'target'])
+        .describe('Which project to read from'),
+      path: z
+        .string()
+        .describe('Relative file path, e.g. "main.tex" or "sections/intro.tex"'),
+      startLine: z
+        .number()
+        .int()
+        .positive()
+        .optional()
+        .describe('Optional 1-based start line (inclusive)'),
+      endLine: z
+        .number()
+        .int()
+        .positive()
+        .optional()
+        .describe('Optional 1-based end line (inclusive)'),
+    }),
+    func: async ({ project, path, startLine, endLine }) => {
+      try {
+        const root =
+          project === 'source' ? ctx.sourceReadRoot : ctx.workspaceRoot;
+        const reader =
+          project === 'source' ? readSourceFile : readWorkspaceFile;
+        let content = null; // null = not loaded; '' is a valid masked read
+        if (project === 'source' && ctx.enableSensitiveMask) {
+          const masked = getMaskedSourceContent(ctx.sourceMaskedContents, path);
+          if (typeof masked === 'string') content = masked;
+        }
+        if (content === null) {
+          content = await reader(root, path);
+        }
+        const lines = content.split('\n');
+
+        const hasRange = startLine !== undefined || endLine !== undefined;
+        if (hasRange) {
+          const resolvedStart = Math.max(1, startLine ?? 1);
+          const resolvedEnd = Math.min(lines.length, endLine ?? lines.length);
+          if (resolvedStart > resolvedEnd) {
+            return `[ERROR] Invalid line range: startLine (${resolvedStart}) > endLine (${resolvedEnd})`;
+          }
+          const numbered = lines
+            .slice(resolvedStart - 1, resolvedEnd)
+            .map((line, idx) => `L${resolvedStart + idx}:${line}`)
+            .join('\n');
+          const rangeMeta = `[RANGE] ${path} lines ${resolvedStart}-${resolvedEnd}\n`;
+          const result = `${rangeMeta}${numbered}`;
+          if (result.length > 60_000) {
+            return (
+              result.slice(0, 60_000) +
+              `\n\n[TRUNCATED — range output is ${result.length} chars total]`
+            );
+          }
+          return result;
+        }
+
+        if (content.length <= 60_000) return content;
+        return (
+          content.slice(0, 60_000) +
+          `\n\n[TRUNCATED — file is ${content.length} chars total]`
+        );
+      } catch (err) {
+        return `[ERROR] Could not read ${project}:${path} — ${err.message}`;
+      }
+    },
+  });
+}
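Reviewer note: ranged reads come back with L<n>: prefixes the agent can quote in diffs; a sketch with hypothetical paths:

import { createReadFileTool } from './readFile.js';

const readFile = createReadFileTool({
  sourceReadRoot: '/tmp/src', // hypothetical
  workspaceRoot: '/tmp/ws',   // hypothetical
});
const out = await readFile.invoke({
  project: 'source',
  path: 'main.tex',
  startLine: 10,
  endLine: 12,
});
// "[RANGE] main.tex lines 10-12\nL10:...\nL11:...\nL12:..."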
diff --git a/apps/backend/src/services/transferAgent/tools/writeFile.js b/apps/backend/src/services/transferAgent/tools/writeFile.js
new file mode 100644
index 0000000..4c2e66f
--- /dev/null
+++ b/apps/backend/src/services/transferAgent/tools/writeFile.js
@@ -0,0 +1,42 @@
+import { z } from 'zod';
+import { DynamicStructuredTool } from '@langchain/core/tools';
+import { writeFileWithSnapshot } from '../utils.js';
+import { unmaskContent } from '../masking/index.js';
+
+/**
+ * Creates the writeFile tool — writes (or overwrites) a file in the target
+ * workspace with automatic snapshot backup.
+ *
+ * @param {{ workspaceRoot: string, jobId: string, enableSensitiveMask?: boolean, sourceMaskManifest?: object }} ctx
+ */
+export function createWriteFileTool(ctx) {
+  return new DynamicStructuredTool({
+    name: 'writeFile',
+    description:
+      'Write content to a file in the target (workspace) project. ' +
+      'A snapshot of the previous version is saved automatically. ' +
+      'Use this for creating or replacing .tex files, preambles, bib files, etc.',
+    schema: z.object({
+      path: z
+        .string()
+        .describe('Relative file path inside target project, e.g. "main.tex"'),
+      content: z
+        .string()
+        .describe('The full file content to write'),
+    }),
+    func: async ({ path, content }) => {
+      try {
+        const unmasked = ctx.enableSensitiveMask
+          ? unmaskContent(content, ctx.sourceMaskManifest)
+          : { content, restored: 0, remaining: 0 };
+        await writeFileWithSnapshot(ctx.workspaceRoot, path, unmasked.content, ctx.jobId);
+        const maskNote = ctx.enableSensitiveMask
+          ? ` Restored ${unmasked.restored} token(s); remaining=${unmasked.remaining}.`
+          : '';
+        return `[OK] Wrote ${unmasked.content.length} chars to target:${path}.${maskNote}`;
+      } catch (err) {
+        return `[ERROR] Failed to write target:${path} — ${err.message}`;
+      }
+    },
+  });
+}
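Reviewer note: a minimal write round-trip sketch (hypothetical workspace and jobId; masking disabled, so content is written verbatim):

import { createWriteFileTool } from './writeFile.js';

const writeFile = createWriteFileTool({
  workspaceRoot: '/tmp/ws', // hypothetical
  jobId: 'job-demo',
});
console.log(await writeFile.invoke({ path: 'sections/intro.tex', content: '\\section{Introduction}\n' }));
// "[OK] Wrote 23 chars to target:sections/intro.tex."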
diff --git a/apps/backend/src/services/transferAgent/transferDebugLog.js b/apps/backend/src/services/transferAgent/transferDebugLog.js
new file mode 100644
index 0000000..810b207
--- /dev/null
+++ b/apps/backend/src/services/transferAgent/transferDebugLog.js
@@ -0,0 +1,81 @@
+/**
+ * Console logging for transfer agent debugging.
+ *
+ * - Default ON when NODE_ENV !== 'production'
+ * - OPENPRISM_TRANSFER_DEBUG=1|true|yes — force ON
+ * - OPENPRISM_TRANSFER_DEBUG=0|false|no — force OFF
+ */
+
+export function isTransferDebugEnabled() {
+  // Read at call time so toggling OPENPRISM_TRANSFER_DEBUG needs no restart.
+  const env = process.env.OPENPRISM_TRANSFER_DEBUG;
+  if (env === '0' || env === 'false' || env === 'no') return false;
+  if (env === '1' || env === 'true' || env === 'yes') return true;
+  return process.env.NODE_ENV !== 'production';
+}
+
+function shortJobId(jobId) {
+  if (!jobId) return '—';
+  const s = String(jobId);
+  return s.length > 8 ? `${s.slice(0, 8)}…` : s;
+}
+
+/**
+ * @param {string} [jobId]
+ * @param {'log'|'info'|'warn'|'error'} level
+ * @param {string} message
+ * @param {unknown} [detail] — object/array printed on next line(s)
+ */
+export function transferDebugLog(jobId, level, message, detail) {
+  if (!isTransferDebugEnabled()) return;
+  const prefix = `[OpenPrism:transfer:${shortJobId(jobId)}]`;
+  const fn = level === 'error' ? console.error : level === 'warn' ? console.warn : console.log;
+  const ts = new Date().toISOString();
+  fn(`${ts} ${prefix} ${message}`);
+  if (detail !== undefined && detail !== null) {
+    fn(detail);
+  }
+}
+
+/**
+ * Emit new progress lines since last call (by array length).
+ */
+export function transferDebugProgressDelta(jobId, job, nextLogArray) {
+  if (!isTransferDebugEnabled()) return;
+  const log = Array.isArray(nextLogArray) ? nextLogArray : [];
+  const prevLen = job._transferDebugLogLen ?? 0;
+  if (log.length <= prevLen) {
+    job._transferDebugLogLen = log.length;
+    return;
+  }
+  const added = log.slice(prevLen);
+  job._transferDebugLogLen = log.length;
+  transferDebugLog(jobId, 'log', `progress +${added.length} line(s):`);
+  added.forEach((line) => transferDebugLog(jobId, 'log', `  | ${line}`));
+}
+
+/**
+ * Structured entries (NeurIPS / progressMeta).
+ */
+export function transferDebugEntriesDelta(jobId, job, entries) {
+  if (!isTransferDebugEnabled()) return;
+  const arr = Array.isArray(entries) ? entries : [];
+  const prevLen = job._transferDebugEntriesLen ?? 0;
+  if (arr.length <= prevLen) {
+    job._transferDebugEntriesLen = arr.length;
+    return;
+  }
+  const added = arr.slice(prevLen);
+  job._transferDebugEntriesLen = arr.length;
+  transferDebugLog(jobId, 'log', `progressLogEntries +${added.length}:`, added);
+}
+
+let _announced;
+export function announceTransferDebugOnce() {
+  if (_announced || !isTransferDebugEnabled()) return;
+  _announced = true;
+  console.log(
+    '[OpenPrism:transfer] Console debugging is ON: each /transfer/step prints progress deltas and a state snapshot; ' +
+      'set OPENPRISM_TRANSFER_DEBUG=0 to turn it off.',
+  );
+}
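Reviewer note: the helpers are console-only, so a quick REPL check shows the output shape directly (timestamp, short-job prefix, then the detail object on its own line):

import { transferDebugLog } from './transferDebugLog.js';

transferDebugLog('job-1234abcd', 'warn', 'compile retry', { attempt: 2 });
// 2025-01-01T12:00:00.000Z [OpenPrism:transfer:job-1234…] compile retry
// { attempt: 2 }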
diff --git a/apps/backend/src/services/transferAgent/transferNodeError.js b/apps/backend/src/services/transferAgent/transferNodeError.js
new file mode 100644
index 0000000..be2eaaf
--- /dev/null
+++ b/apps/backend/src/services/transferAgent/transferNodeError.js
@@ -0,0 +1,25 @@
+/**
+ * Thrown when a NeurIPS transfer node exhausts LLM retries; surfaced to API as failedNode/failedPhase.
+ */
+export class TransferNodeError extends Error {
+  /**
+   * @param {string} node - Graph node name (e.g. applyBibliography)
+   * @param {string} phase - NeuripsPhase value
+   * @param {string} detail - Last failure reason summary
+   * @param {string} [message] - Full error message for logs/UI
+   * @param {string} [debugRelPath] - Project-relative dir with saved LLM raw/patch (e.g. .agent_runs/…/llm_diff/…)
+   * @param {number} [inputChars] - Length of main.tex (or target file) fed to the diff step when it failed
+   */
+  constructor(node, phase, detail, message, debugRelPath, inputChars) {
+    const msg = message || `[${node}] ${detail}`;
+    super(msg);
+    this.name = 'TransferNodeError';
+    this.node = node;
+    this.phase = phase;
+    this.detail = detail;
+    /** @type {string | undefined} */
+    this.debugRelPath = debugRelPath;
+    /** @type {number | undefined} */
+    this.inputChars = inputChars;
+  }
+}
diff --git a/apps/backend/src/services/transferAgent/utils.js b/apps/backend/src/services/transferAgent/utils.js
index 5d3d335..077d6d3 100644
--- a/apps/backend/src/services/transferAgent/utils.js
+++ b/apps/backend/src/services/transferAgent/utils.js
@@ -7,6 +7,43 @@ import { safeJoin } from '../../utils/pathUtils.js';
 // Shared text helpers
 // ---------------------------------------------------------------------------
 
+/**
+ * Produce a brief human-readable summary of tool call arguments (max ~100 chars).
+ * Used by agent nodes to populate liveProgress.toolArgs.
+ */
+export function briefToolArgs(toolName, args) {
+  if (!args || typeof args !== 'object') return '';
+  try {
+    switch (toolName) {
+      case 'readFile':
+        if (typeof args.startLine === 'number' || typeof args.endLine === 'number') {
+          const start = typeof args.startLine === 'number' ? args.startLine : 1;
+          const end = typeof args.endLine === 'number' ? args.endLine : '?';
+          return `${args.project || 'target'}:${args.path || ''}#L${start}-L${end}`.slice(0, 100);
+        }
+        return `${args.project || 'target'}:${args.path || ''}`.slice(0, 100);
+      case 'writeFile':
+        return `${args.path || ''} (${(args.content || '').length} chars)`.slice(0, 100);
+      case 'applyDiff':
+        return `${args.path || ''} (diff ${(args.diff || '').length} chars)`.slice(0, 100);
+      case 'grepFile':
+        return `pattern="${(args.pattern || '').slice(0, 40)}"${args.path ? ` in ${args.path}` : ''}`.slice(0, 100);
+      case 'listProjectTree':
+        return args.project || 'target';
+      case 'copyAsset':
+        return `${args.srcPath || ''} → ${args.destPath || ''}`.slice(0, 100);
+      case 'compileProject':
+        return 'target compile';
+      case 'raiseQuestion':
+        return `${(args.questions || []).length} question(s)`;
+      default:
+        return JSON.stringify(args).slice(0, 100);
+    }
+  } catch {
+    return '';
+  }
+}
+
 /**
  * Strip markdown code fences (```json, ```latex, ```tex, etc.) from LLM output.
  */
@@ -17,6 +54,48 @@ export function stripCodeFences(text) {
     .trim();
 }
 
+/**
+ * Reject LLM "full .tex file" output that would wipe the project (empty or far shorter than input).
+ * @returns {string|null} rejection reason, or null if OK to write
+ */
+export function rejectCatastrophicFullTexRewrite(previousContent, candidateContent) {
+  const prevLen = (previousContent || '').length;
+  const outLen = (candidateContent || '').trim().length;
+  if (!outLen) return 'empty output';
+  if (prevLen > 2000 && outLen < Math.floor(prevLen * 0.2)) return 'output too short';
+  return null;
+}
+
+/**
+ * Split LaTeX into preamble (before \\begin{document}), body block (inclusive), and trailing tail.
+ */ +export function splitTexDocument(tex) { + const beginMark = '\\begin{document}'; + const endMark = '\\end{document}'; + const beginIdx = tex.indexOf(beginMark); + const endIdx = tex.lastIndexOf(endMark); + if (beginIdx === -1 || endIdx === -1 || endIdx < beginIdx) { + return { preamble: tex.trimEnd(), body: '', tail: '', hasDocument: false }; + } + const preamble = tex.slice(0, beginIdx).trimEnd(); + const bodyEnd = endIdx + endMark.length; + const body = tex.slice(beginIdx, bodyEnd); + const tail = tex.slice(bodyEnd); + return { preamble, body, tail, hasDocument: true }; +} + +/** + * Merge preamble + body + tail (body must include begin/end document). + */ +export function mergeTexDocument(preamble, body, tail = '') { + const p = (preamble || '').trimEnd(); + const b = body || ''; + const t = tail || ''; + if (!p && !b) return t; + if (!b) return `${p}${t}`; + return `${p}\n\n${b}${t}`; +} + /** * Extract the first JSON object or array from a string that may contain * surrounding prose. Handles cases where the LLM outputs explanatory text diff --git a/apps/frontend/src/api/client.ts b/apps/frontend/src/api/client.ts index 8e77535..34699c6 100644 --- a/apps/frontend/src/api/client.ts +++ b/apps/frontend/src/api/client.ts @@ -50,7 +50,15 @@ export interface ArxivPaper { arxivId: string; } -const API_BASE = ''; +const rawApiBase = import.meta.env.VITE_API_BASE; +const API_BASE = + typeof rawApiBase === 'string' && rawApiBase.trim() !== '' ? rawApiBase.trim().replace(/\/$/, '') : ''; + +function apiUrl(path: string): string { + const p = path.startsWith('/') ? path : `/${path}`; + return `${API_BASE}${p}`; +} + const LANG_KEY = 'openprism-lang'; const COLLAB_TOKEN_KEY = 'openprism-collab-token'; const COLLAB_SERVER_KEY = 'openprism-collab-server'; @@ -104,7 +112,7 @@ async function request(url: string, options?: RequestInit): Promise { if (options?.body) { mergedHeaders['Content-Type'] = 'application/json'; } - const res = await fetch(`${API_BASE}${url}`, { + const res = await fetch(apiUrl(url), { ...options, headers: mergedHeaders }); @@ -210,7 +218,7 @@ export function renamePath(id: string, from: string, to: string) { export async function deleteFile(id: string, filePath: string) { const qs = new URLSearchParams({ path: filePath }).toString(); - const res = await fetch(`/api/projects/${id}/file?${qs}`, { + const res = await fetch(apiUrl(`/api/projects/${id}/file?${qs}`), { method: 'DELETE', headers: { 'x-lang': getLangHeader() @@ -236,7 +244,7 @@ export async function uploadFiles(projectId: string, files: File[], basePath?: s const finalPath = basePath ? 
`${basePath}/${rel}` : rel; form.append('files', file, finalPath); }); - const res = await fetch(`/api/projects/${projectId}/upload`, { + const res = await fetch(apiUrl(`/api/projects/${projectId}/upload`), { method: 'POST', body: form, headers: { @@ -319,7 +327,7 @@ export async function uploadTemplate(templateId: string, templateLabel: string, form.append('templateLabel', templateLabel); form.append('file', file); const lang = getLangHeader(); - const res = await fetch(`${API_BASE}/api/templates/upload`, { + const res = await fetch(apiUrl('/api/templates/upload'), { method: 'POST', headers: { 'x-lang': lang, ...getAuthHeader() }, body: form, @@ -384,7 +392,7 @@ export async function importZip(payload: { file: File; projectName?: string }) { if (payload.projectName) { form.append('projectName', payload.projectName); } - const res = await fetch('/api/projects/import-zip', { + const res = await fetch(apiUrl('/api/projects/import-zip'), { method: 'POST', body: form, headers: { @@ -407,7 +415,7 @@ export function importArxivSSE( if (payload.projectName) params.set('projectName', payload.projectName); const token = getCollabToken(); if (token) params.set('token', token); - const es = new EventSource(`/api/projects/import-arxiv-sse?${params.toString()}`); + const es = new EventSource(apiUrl(`/api/projects/import-arxiv-sse?${params.toString()}`)); es.addEventListener('progress', (e) => { if (onProgress) { @@ -450,7 +458,7 @@ export async function visionToLatex(payload: { if (payload.llmConfig) { form.append('llmConfig', JSON.stringify(payload.llmConfig)); } - const res = await fetch('/api/vision/latex', { + const res = await fetch(apiUrl('/api/vision/latex'), { method: 'POST', body: form, headers: { @@ -473,13 +481,80 @@ export interface TransferStartPayload { targetMainFile: string; engine?: string; layoutCheck?: boolean; + enableSensitiveMask?: boolean; + /** When true, run the LLM-driven agent pipeline; when false (default), run the rule-based transfer converter. */ + useAgent?: boolean; llmConfig?: Partial; + venue?: string; + doubleBlind?: boolean; + preprint?: boolean; + outputNotes?: string; +} + +export interface TransferQaItem { + id: string; + prompt: string; + type: 'single' | 'multi' | 'text'; + options?: string[]; +} + +export interface TransferProgressEntry { + node?: string; + level?: 'info' | 'warn' | 'error'; + message?: string; + ts?: number; +} + +export interface LiveProgress { + activeRole: string; + toolName: string; + toolArgs: string; + toolRound: number; + maxToolRounds: number; + /** Monotonic; SSE uses this to detect back-to-back same toolName updates */ + seq?: number; + lastUpdate: number; +} + +/** One completed agent tool invocation (from job.toolTraceRecent / SSE). */ +export interface ToolTraceEntry { + ts: number; + agent: string; + iteration: number; + round: number; + tool: string; + argsBrief: string; + toolCallId?: string; + phase?: string; + durationMs?: number; + ok: boolean; + error?: string; } export interface TransferStepResult { status: string; progressLog: string[]; + progressLogEntries?: TransferProgressEntry[]; + currentNode?: string; + phase?: string; + agentPhase?: string | null; + currentIteration?: number | null; + interruptedBeforeNode?: string; + completedNodes?: string[]; + pendingQA?: TransferQaItem[] | null; error?: string; + bundleNotes?: string | null; + transferGraphKind?: string; + liveProgress?: LiveProgress | null; + toolTraceRecent?: ToolTraceEntry[]; + /** Present on 500 from /transfer/step when a graph node fails (e.g. 
diff retries exhausted) */ + failedNode?: string; + failedPhase?: string; + failedDetail?: string; + /** Project-relative path to saved LLM raw/patch (unified-diff nodes) */ + failedDebugPath?: string; + /** Length of target .tex input to the failed diff step */ + failedInputChars?: number; } export interface PageImage { @@ -495,10 +570,45 @@ export function transferStart(payload: TransferStartPayload) { }); } -export function transferStep(jobId: string) { - return request('/api/transfer/step', { +/** + * Parses JSON error bodies so TransferNodeError fields (failedNode, etc.) are available on thrown Error. + */ +export function transferStep(jobId: string): Promise { + const lang = getLangHeader(); + const mergedHeaders: Record = { + 'x-lang': lang, + ...getAuthHeader(), + 'Content-Type': 'application/json', + }; + return fetch(apiUrl('/api/transfer/step'), { method: 'POST', + headers: mergedHeaders, body: JSON.stringify({ jobId }), + }).then(async (res) => { + const text = await res.text(); + let body: Record = {}; + try { + body = text ? (JSON.parse(text) as Record) : {}; + } catch { + body = { error: text || 'Step failed' }; + } + if (!res.ok) { + const baseMsg = String(body.error ?? text ?? 'Step failed'); + const err = new Error(baseMsg) as Error & { + failedNode?: string; + failedPhase?: string; + failedDetail?: string; + failedDebugPath?: string; + failedInputChars?: number; + }; + if (typeof body.failedNode === 'string') err.failedNode = body.failedNode; + if (typeof body.failedPhase === 'string') err.failedPhase = body.failedPhase; + if (typeof body.failedDetail === 'string') err.failedDetail = body.failedDetail; + if (typeof body.failedDebugPath === 'string') err.failedDebugPath = body.failedDebugPath; + if (typeof body.failedInputChars === 'number') err.failedInputChars = body.failedInputChars; + throw err; + } + return body as unknown as TransferStepResult; }); } @@ -513,12 +623,68 @@ export function transferStatus(jobId: string) { return request(`/api/transfer/status/${jobId}`); } +/** + * Connect to the SSE progress stream for a transfer job. + * Returns an EventSource instance. Call .close() to disconnect. + */ +export function transferStream( + jobId: string, + onProgress: (data: TransferStepResult) => void, + onDone?: (data: TransferStepResult) => void, + onError?: (err: Event) => void, +): EventSource { + const es = new EventSource(apiUrl(`/api/transfer/stream/${jobId}`)); + + es.addEventListener('progress', (e: MessageEvent) => { + try { + const data = JSON.parse(e.data) as TransferStepResult; + onProgress(data); + } catch { /* ignore parse errors */ } + }); + + es.addEventListener('done', (e: MessageEvent) => { + try { + const data = JSON.parse(e.data) as TransferStepResult; + (onDone || onProgress)(data); + } catch { /* ignore */ } + es.close(); + }); + + es.onerror = (e) => { + if (onError) onError(e); + // EventSource auto-reconnects on transient errors; + // only close on permanent failure (readyState === CLOSED) + if (es.readyState === EventSource.CLOSED) { + es.close(); + } + }; + + return es; +} + +export function transferSubmitConfirm(jobId: string, answers: Record) { + return request<{ ok: boolean }>('/api/transfer/submit-confirm', { + method: 'POST', + body: JSON.stringify({ jobId, answers }), + }); +} + // ─── MinerU Transfer API ─── export interface MineruConfig { apiBase?: string; token?: string; modelVersion?: string; + /** Wrap PNG/JPEG/WebP as single-page PDF and rewrite Markdown (or set env OPENPRISM_MINERU_RASTER_TO_PDF=1). 
*/ + rasterToPdf?: boolean; + deleteRasterAfterPdf?: boolean; + /** Upscale factor before PDF embed when rasterToPdf is true (or env OPENPRISM_MINERU_IMAGE_SCALE). */ + imageScale?: number; + /** Replace images from source PDF using *content_list*.json (needs pdftoppm; or env OPENPRISM_MINERU_BBOX_CROP=1). */ + bboxCrop?: boolean; + cropDpi?: number; + /** MinerU bbox coords: default PDF bottom-left; use top_left if crops misaligned. */ + bboxCoords?: 'pdf' | 'top_left'; } export interface MineruTransferStartPayload { @@ -528,6 +694,7 @@ export interface MineruTransferStartPayload { targetMainFile: string; engine?: string; layoutCheck?: boolean; + enableSensitiveMask?: boolean; llmConfig?: Partial; mineruConfig?: MineruConfig; } @@ -543,7 +710,7 @@ export async function mineruTransferUploadPdf(jobId: string, pdfFile: File) { const form = new FormData(); form.append('jobId', jobId); form.append('pdf', pdfFile); - const res = await fetch('/api/transfer/upload-pdf', { + const res = await fetch(apiUrl('/api/transfer/upload-pdf'), { method: 'POST', body: form, headers: { diff --git a/apps/frontend/src/app/App.css b/apps/frontend/src/app/App.css index ec09d64..bc8e485 100644 --- a/apps/frontend/src/app/App.css +++ b/apps/frontend/src/app/App.css @@ -3680,62 +3680,252 @@ textarea.input { border-radius: 999px; } -/* ── Transfer Progress Widget ── */ -.transfer-widget { +/* ── Transfer progress: FAB + right drawer ── */ +.transfer-progress-fab { position: fixed; - bottom: 20px; - right: 20px; - width: 360px; - max-height: 400px; - background: var(--panel); + bottom: 22px; + right: 22px; + z-index: 10050; + display: flex; + flex-direction: column; + align-items: center; + justify-content: center; + gap: 2px; + min-width: 56px; + min-height: 56px; + padding: 8px 10px; border: 1px solid var(--border); - border-radius: 12px; + border-radius: 14px; + background: var(--panel); box-shadow: var(--shadow); - z-index: 1000; + color: var(--text); + cursor: pointer; + transition: transform 0.15s ease, box-shadow 0.15s ease; +} + +.transfer-progress-fab:hover { + transform: translateY(-2px); + box-shadow: 0 8px 24px rgba(0, 0, 0, 0.12); +} + +.transfer-progress-fab--pulse { + animation: transfer-fab-pulse 2s ease-in-out infinite; +} + +@keyframes transfer-fab-pulse { + 0%, 100% { box-shadow: var(--shadow), 0 0 0 0 rgba(21, 101, 192, 0.35); } + 50% { box-shadow: var(--shadow), 0 0 0 8px rgba(21, 101, 192, 0); } +} + +.transfer-progress-fab-icon { + display: flex; + line-height: 0; + opacity: 0.9; +} + +.transfer-progress-fab-label { + font-size: 10px; + font-weight: 600; + letter-spacing: 0.02em; +} + +.transfer-progress-drawer-root { + position: fixed; + inset: 0; + z-index: 10040; + pointer-events: none; +} + +.transfer-progress-drawer-root.is-open { + pointer-events: auto; +} + +.transfer-progress-drawer-backdrop { + position: absolute; + inset: 0; + background: rgba(0, 0, 0, 0.35); + opacity: 0; + transition: opacity 0.25s ease; +} + +.transfer-progress-drawer-root.is-open .transfer-progress-drawer-backdrop { + opacity: 1; +} + +.transfer-progress-drawer { + position: absolute; + top: 0; + right: 0; + bottom: 0; + width: min(440px, 92vw); + max-width: 100%; + background: var(--panel); + border-left: 1px solid var(--border); + box-shadow: -12px 0 40px rgba(0, 0, 0, 0.12); display: flex; flex-direction: column; + transform: translateX(100%); + transition: transform 0.28s cubic-bezier(0.22, 1, 0.36, 1); overflow: hidden; } -.transfer-widget-header { +.transfer-progress-drawer-root.is-open 
.transfer-progress-drawer { + transform: translateX(0); +} + +.transfer-progress-drawer-header { display: flex; - align-items: center; + align-items: flex-start; justify-content: space-between; - padding: 10px 14px; + gap: 10px; + padding: 16px 16px 12px; border-bottom: 1px solid var(--border); - font-size: 13px; - font-weight: 500; background: var(--panel-muted); + flex-shrink: 0; } -.transfer-widget-header .icon-btn { - width: 22px; - height: 22px; - font-size: 13px; +.transfer-progress-drawer-title { + margin: 0; + font-size: 15px; + font-weight: 600; + line-height: 1.3; +} + +.transfer-progress-drawer-actions { + display: flex; + align-items: center; + gap: 6px; + flex-shrink: 0; } -.transfer-widget-status { - padding: 8px 14px; +.transfer-progress-drawer-meta { + padding: 12px 16px; font-size: 12px; + line-height: 1.55; color: var(--text); + border-bottom: 1px solid var(--border); + flex-shrink: 0; +} + +.transfer-progress-drawer-meta strong { + margin-right: 6px; + color: var(--muted); + font-weight: 500; +} + +.transfer-progress-drawer-node { + font-weight: 400; + opacity: 0.88; +} + +.transfer-progress-drawer-badge { + display: inline-block; + margin-left: 8px; + padding: 2px 8px; + border-radius: 6px; + font-size: 11px; + font-weight: 500; +} + +.transfer-progress-drawer-badge--images { + background: rgba(184, 134, 11, 0.15); + color: #b8860b; +} + +.transfer-progress-drawer-badge--qa { + background: rgba(21, 101, 192, 0.12); + color: #1565c0; +} + +.transfer-progress-drawer-nodes { + padding: 10px 16px; + border-bottom: 1px solid var(--border); + flex-shrink: 0; +} + +.transfer-progress-drawer-nodes-label { + font-size: 11px; + font-weight: 600; + color: var(--muted); + margin-bottom: 6px; + text-transform: uppercase; + letter-spacing: 0.04em; +} + +.transfer-progress-drawer-nodes-list { + display: flex; + flex-wrap: wrap; + gap: 6px; +} + +.transfer-progress-drawer-node-chip { + font-size: 10px; + padding: 3px 8px; + border-radius: 6px; + background: rgba(120, 98, 83, 0.1); + font-family: 'JetBrains Mono', ui-monospace, monospace; } -.transfer-widget-error { - padding: 4px 14px 8px; +.transfer-progress-drawer-error { + margin: 0 16px 8px; + padding: 10px 12px; font-size: 12px; - color: #d32f2f; + color: #c62828; + background: rgba(198, 40, 40, 0.08); + border-radius: 8px; + flex-shrink: 0; +} + +.transfer-progress-drawer-log-wrap { + flex: 1; + display: flex; + flex-direction: column; + min-height: 0; + padding: 12px 16px 20px; } -.transfer-widget-log { - padding: 8px 14px 12px; +.transfer-progress-drawer-log-title { font-size: 11px; - font-family: 'JetBrains Mono', monospace; - background: rgba(120, 98, 83, 0.06); - max-height: 220px; + font-weight: 600; + color: var(--muted); + margin-bottom: 8px; + text-transform: uppercase; + letter-spacing: 0.04em; +} + +.transfer-progress-drawer-log { + flex: 1; + min-height: 120px; overflow-y: auto; - line-height: 1.5; + font-size: 11px; + font-family: 'JetBrains Mono', ui-monospace, monospace; + line-height: 1.55; + padding: 12px; + border-radius: 10px; + background: rgba(120, 98, 83, 0.06); + border: 1px solid var(--border); } -.transfer-widget-log > div { - margin-bottom: 2px; +.transfer-progress-drawer-log-empty { + color: var(--muted); + font-style: italic; +} + +.transfer-progress-drawer-log-line { + margin-bottom: 4px; + word-break: break-word; +} + +.transfer-progress-drawer-log-line--warn { + color: #b8860b; +} + +.transfer-progress-drawer-log-line--error { + color: #c62828; +} + +.transfer-progress-drawer-log-node { + 
margin-right: 6px; + opacity: 0.75; + font-weight: 500; } diff --git a/apps/frontend/src/app/ProjectPage.tsx b/apps/frontend/src/app/ProjectPage.tsx index acb88ab..2681783 100644 --- a/apps/frontend/src/app/ProjectPage.tsx +++ b/apps/frontend/src/app/ProjectPage.tsx @@ -1,4 +1,5 @@ import { useCallback, useEffect, useMemo, useRef, useState } from 'react'; +import { createPortal } from 'react-dom'; import { useNavigate } from 'react-router-dom'; import { useTranslation } from 'react-i18next'; import { @@ -14,9 +15,11 @@ import { trashProject, updateProjectTags, permanentDeleteProject, - uploadTemplate + uploadTemplate, + transferStatus, + transferStream, } from '../api/client'; -import type { ProjectMeta, TemplateMeta, TemplateCategory } from '../api/client'; +import type { ProjectMeta, TemplateMeta, TemplateCategory, TransferProgressEntry, TransferStepResult } from '../api/client'; import TransferPanel from './TransferPanel'; type ViewFilter = 'all' | 'mine' | 'archived' | 'trash'; @@ -119,8 +122,93 @@ export default function ProjectPage() { const [activeJob, setActiveJob] = useState<{ jobId: string; status: string; progressLog: string[]; error?: string; sourceName?: string; + phase?: string; + currentNode?: string; + completedNodes?: string[]; + pendingQA?: unknown; + progressLogEntries?: TransferProgressEntry[]; } | null>(null); - const [jobWidgetOpen, setJobWidgetOpen] = useState(true); + /** 右侧滑出进度面板 */ + const [transferProgressDrawerOpen, setTransferProgressDrawerOpen] = useState(false); + + // SSE ref for progress streaming recovery + const recoverySSERef = useRef(null); + + // Recover active transfer job from sessionStorage on mount + useEffect(() => { + try { + const saved = sessionStorage.getItem('openprism-active-job'); + if (!saved) return; + const { jobId: savedJobId } = JSON.parse(saved); + if (!savedJobId) return; + + transferStatus(savedJobId).then((res: TransferStepResult) => { + if (!res || !res.status) { + sessionStorage.removeItem('openprism-active-job'); + return; + } + + // Restore the floating progress window + setActiveJob({ + jobId: savedJobId, + status: res.status, + progressLog: res.progressLog || [], + error: res.error, + phase: res.phase, + currentNode: res.currentNode, + completedNodes: res.completedNodes, + progressLogEntries: res.progressLogEntries, + }); + + const isTerminal = ['success', 'failed', 'error'].includes(res.status); + if (!isTerminal) { + // Job still active — connect SSE for real-time updates to the floating window + if (recoverySSERef.current) recoverySSERef.current.close(); + recoverySSERef.current = transferStream( + savedJobId, + (data) => { + setActiveJob((prev) => ({ + ...prev, + jobId: savedJobId, + status: data.status, + progressLog: data.progressLog || [], + error: data.error, + phase: data.phase, + currentNode: data.currentNode, + completedNodes: data.completedNodes, + progressLogEntries: data.progressLogEntries, + })); + }, + (data) => { + setActiveJob((prev) => ({ + ...prev, + jobId: savedJobId, + status: data.status, + progressLog: data.progressLog || [], + error: data.error, + phase: data.phase, + currentNode: data.currentNode, + completedNodes: data.completedNodes, + progressLogEntries: data.progressLogEntries, + })); + recoverySSERef.current = null; + if (['success', 'failed'].includes(data.status)) { + sessionStorage.removeItem('openprism-active-job'); + loadProjects(); + } + }, + ); + } + }).catch(() => { + sessionStorage.removeItem('openprism-active-job'); + }); + } catch { /* ignore */ } + + return () => { + if 
(recoverySSERef.current) { recoverySSERef.current.close(); recoverySSERef.current = null; } + }; + // eslint-disable-next-line react-hooks/exhaustive-deps + }, []); // Template upload state const templateZipRef = useRef(null); @@ -902,7 +990,6 @@ export default function ProjectPage() { projectId={transferSource.id} onJobUpdate={(job) => { setActiveJob({ ...job, sourceName: transferSource.name }); - setJobWidgetOpen(true); if (job.status === 'success') loadProjects(); }} /> @@ -911,33 +998,119 @@ export default function ProjectPage() { )} - {/* Floating transfer progress widget */} - {activeJob && !transferOpen && jobWidgetOpen && ( -
-
- {t('模板转换')} — {activeJob.sourceName || ''} -
- - -
-
-
- {t('状态')}: {activeJob.status} + {/* 转换进度:悬浮按钮 + 右侧抽屉(Portal 到 body,避免被 overflow 裁剪;弹窗打开时也显示) */} + {typeof document !== 'undefined' && activeJob && createPortal( + <> + + +
+
setTransferProgressDrawerOpen(false)} + /> +
- {activeJob.error && ( -
{activeJob.error}
- )} - {activeJob.progressLog.length > 0 && ( -
- {activeJob.progressLog.map((line, i) => ( -
{line}
- ))} -
- )} -
+ , + document.body, )} {/* Settings Modal */} diff --git a/apps/frontend/src/app/TransferPanel.tsx b/apps/frontend/src/app/TransferPanel.tsx index 9144590..0075386 100644 --- a/apps/frontend/src/app/TransferPanel.tsx +++ b/apps/frontend/src/app/TransferPanel.tsx @@ -4,6 +4,9 @@ import { transferStart, transferStep, transferSubmitImages, + transferSubmitConfirm, + transferStream, + transferStatus, mineruTransferStart, mineruTransferUploadPdf, listTemplates, @@ -13,11 +16,26 @@ import type { LLMConfig, TemplateMeta, FileItem, + TransferQaItem, + TransferProgressEntry, + TransferStepResult, + LiveProgress, + ToolTraceEntry, } from '../api/client'; interface TransferPanelProps { projectId: string; - onJobUpdate?: (job: { jobId: string; status: string; progressLog: string[]; error?: string }) => void; + onJobUpdate?: (job: { + jobId: string; + status: string; + progressLog: string[]; + error?: string; + phase?: string; + currentNode?: string; + completedNodes?: string[]; + pendingQA?: TransferQaItem[] | null; + progressLogEntries?: TransferProgressEntry[]; + }) => void; } type TransferMode = 'legacy' | 'mineru'; @@ -25,6 +43,49 @@ type MineruSource = 'project' | 'upload'; const ENGINES = ['pdflatex', 'xelatex', 'lualatex', 'latexmk'] as const; +function formatTransferStepFailure(err: unknown): string { + const e = err as Error & { + failedNode?: string; + failedPhase?: string; + failedDetail?: string; + failedDebugPath?: string; + failedInputChars?: number; + }; + const msg = e?.message || String(err || 'Step failed'); + const bits: string[] = []; + if (e.failedNode) bits.push(`节点 ${e.failedNode}`); + if (e.failedPhase) bits.push(`阶段 ${e.failedPhase}`); + if (e.failedDetail) bits.push(`原因 ${e.failedDetail}`); + if (typeof e.failedInputChars === 'number') bits.push(`输入 ${e.failedInputChars} 字符`); + if (e.failedDebugPath) bits.push(`调试文件 ${e.failedDebugPath}`); + return bits.length ? 
`${msg}\n${bits.join(' · ')}` : msg; +} + +/** NeurIPS 图阶段时间线(与后端 currentPhase 对齐) */ +const NEURIPS_PHASE_STEPS: { id: string; label: string }[] = [ + { id: 'intake', label: '摄入' }, + { id: 'source_analysis', label: '源稿/模板分析' }, + { id: 'migration_plan', label: '迁移计划' }, + { id: 'qa_plan', label: '计划确认 QA' }, + { id: 'preamble', label: '导言' }, + { id: 'body', label: '正文' }, + { id: 'figures', label: '图表' }, + { id: 'assets', label: '资源复制' }, + { id: 'bibliography', label: '参考文献' }, + { id: 'blind_qa', label: '双盲 QA' }, + { id: 'blind', label: '匿名处理' }, + { id: 'policy', label: '政策核对' }, + { id: 'finalize', label: '完成(本地编译)' }, +]; + +/** 会场 Agent 模式时间线(neurips / icml / cvpr / acl 共用) */ +const VENUE_AGENT_STEPS: { id: string; label: string }[] = [ + { id: 'agent_planning', label: '🧠 规划' }, + { id: 'agent_generating', label: '⚡ 执行' }, + { id: 'agent_reviewing', label: '🔍 审查' }, + { id: 'finalize', label: '✅ 完成' }, +]; + export default function TransferPanel({ projectId, onJobUpdate }: TransferPanelProps) { const { t } = useTranslation(); @@ -42,6 +103,13 @@ export default function TransferPanel({ projectId, onJobUpdate }: TransferPanelP const [targetTemplateId, setTargetTemplateId] = useState(''); const [engine, setEngine] = useState('pdflatex'); const [layoutCheck, setLayoutCheck] = useState(false); + const [enableSensitiveMask, setEnableSensitiveMask] = useState(false); + const [neuripsDoubleBlind, setNeuripsDoubleBlind] = useState(true); + const [neuripsPreprint, setNeuripsPreprint] = useState(false); + const [neuripsOutputNotes, setNeuripsOutputNotes] = useState(''); + // Classic-mode-only: when false (default), use the rule-based transfer + // converter (no content changes). When true, use the LLM agent pipeline. + const [useAgent, setUseAgent] = useState(false); // LLM config — read from shared localStorage (set via ProjectPage / EditorPage settings) const SETTINGS_KEY = 'openprism-settings-v1'; @@ -54,24 +122,27 @@ export default function TransferPanel({ projectId, onJobUpdate }: TransferPanelP } catch { return { llmEndpoint: '', llmApiKey: '', llmModel: '' }; } }; - const readMineruConfigFromStorage = (): { mineruApiBase: string; mineruToken: string } => { + const readMineruConfigFromStorage = (): { mineruApiBase: string; mineruToken: string; mineruRasterToPdf: boolean } => { try { const raw = window.localStorage.getItem(SETTINGS_KEY); - if (!raw) return { mineruApiBase: '', mineruToken: '' }; + if (!raw) return { mineruApiBase: '', mineruToken: '', mineruRasterToPdf: true }; const p = JSON.parse(raw); return { mineruApiBase: p.mineruApiBase || '', mineruToken: p.mineruToken || '', + // 默认开启:与 LaTeX 中优先使用 PDF 矢量/嵌入图一致;可在界面关闭 + mineruRasterToPdf: p.mineruRasterToPdf !== false, }; - } catch { return { mineruApiBase: '', mineruToken: '' }; } + } catch { return { mineruApiBase: '', mineruToken: '', mineruRasterToPdf: true }; } }; - const saveMineruConfigToStorage = (apiBase: string, token: string) => { + const saveMineruConfigToStorage = (apiBase: string, token: string, rasterToPdf?: boolean) => { try { const raw = window.localStorage.getItem(SETTINGS_KEY); const p = raw ? 
JSON.parse(raw) : {}; p.mineruApiBase = apiBase; p.mineruToken = token; + if (typeof rasterToPdf === 'boolean') p.mineruRasterToPdf = rasterToPdf; window.localStorage.setItem(SETTINGS_KEY, JSON.stringify(p)); } catch { /* ignore */ } }; @@ -79,6 +150,7 @@ export default function TransferPanel({ projectId, onJobUpdate }: TransferPanelP // MinerU API config — initialized from localStorage const [mineruApiBase, setMineruApiBase] = useState(() => readMineruConfigFromStorage().mineruApiBase); const [mineruToken, setMineruToken] = useState(() => readMineruConfigFromStorage().mineruToken); + const [mineruRasterToPdf, setMineruRasterToPdf] = useState(() => readMineruConfigFromStorage().mineruRasterToPdf); // Dropdown open states const [templateDropdownOpen, setTemplateDropdownOpen] = useState(false); @@ -89,8 +161,26 @@ export default function TransferPanel({ projectId, onJobUpdate }: TransferPanelP const [jobId, setJobId] = useState(''); const [status, setStatus] = useState('idle'); const [progressLog, setProgressLog] = useState([]); + const [progressLogEntries, setProgressLogEntries] = useState([]); + const [currentNode, setCurrentNode] = useState(''); + const [currentPhase, setCurrentPhase] = useState(''); + const [agentPhase, setAgentPhase] = useState(null); + const [currentIteration, setCurrentIteration] = useState(null); + const [completedNodes, setCompletedNodes] = useState([]); + const [pendingQA, setPendingQA] = useState(null); + const [qaAnswers, setQaAnswers] = useState>({}); + const [qaSubmitting, setQaSubmitting] = useState(false); + const [logFilterNode, setLogFilterNode] = useState(''); const [error, setError] = useState(''); const [running, setRunning] = useState(false); + const [transferGraphKind, setTransferGraphKind] = useState(''); + const [liveProgress, setLiveProgress] = useState(null); + const [toolTraceRecent, setToolTraceRecent] = useState([]); + const [toolTraceOpen, setToolTraceOpen] = useState(false); + + // SSE stream ref + const sseRef = useRef(null); + const JOB_STORAGE_KEY = 'openprism-active-job'; // Template list for target selection const [templates, setTemplates] = useState([]); @@ -161,19 +251,27 @@ export default function TransferPanel({ projectId, onJobUpdate }: TransferPanelP const targetMainFile = selectedTemplate?.mainFile || 'main.tex'; setError(''); setProgressLog([]); + setProgressLogEntries([]); + setCurrentNode(''); + setCurrentPhase(''); + setCompletedNodes([]); + setPendingQA(null); + setQaAnswers({}); + setToolTraceRecent([]); + setToolTraceOpen(false); + setLiveProgress(null); setRunning(true); setStatus('starting'); try { if (transferMode === 'mineru') { // MinerU mode — persist config to localStorage - saveMineruConfigToStorage(mineruApiBase, mineruToken); - const mineruConfig = (mineruApiBase || mineruToken) - ? { - ...(mineruApiBase ? { apiBase: mineruApiBase } : {}), - ...(mineruToken ? { token: mineruToken } : {}), - } - : undefined; + saveMineruConfigToStorage(mineruApiBase, mineruToken, mineruRasterToPdf); + const mineruConfig = { + ...(mineruApiBase ? { apiBase: mineruApiBase } : {}), + ...(mineruToken ? { token: mineruToken } : {}), + rasterToPdf: mineruRasterToPdf, + }; const res = await mineruTransferStart({ sourceProjectId: mineruSource === 'project' ? projectId : undefined, @@ -182,10 +280,22 @@ export default function TransferPanel({ projectId, onJobUpdate }: TransferPanelP targetMainFile, engine, layoutCheck, + ...(mineruSource === 'project' ? 
{ enableSensitiveMask } : {}), llmConfig: buildLlmConfig(), mineruConfig, }); setJobId(res.jobId); + try { sessionStorage.setItem(JOB_STORAGE_KEY, JSON.stringify({ jobId: res.jobId })); } catch { /* ignore */ } + onJobUpdate?.({ + jobId: res.jobId, + status: 'starting', + progressLog: [], + progressLogEntries: [], + currentNode: '', + phase: '', + completedNodes: [], + pendingQA: null, + }); // If uploading PDF, upload it before running graph if (mineruSource === 'upload' && uploadedPdf) { @@ -205,9 +315,30 @@ export default function TransferPanel({ projectId, onJobUpdate }: TransferPanelP targetMainFile, engine, layoutCheck, + enableSensitiveMask, + useAgent, llmConfig: buildLlmConfig(), + ...(targetTemplateId === 'neurips' + ? { + venue: 'neurips', + doubleBlind: neuripsDoubleBlind, + preprint: neuripsPreprint, + outputNotes: neuripsOutputNotes, + } + : {}), }); setJobId(res.jobId); + try { sessionStorage.setItem(JOB_STORAGE_KEY, JSON.stringify({ jobId: res.jobId })); } catch { /* ignore */ } + onJobUpdate?.({ + jobId: res.jobId, + status: 'starting', + progressLog: [], + progressLogEntries: [], + currentNode: '', + phase: '', + completedNodes: [], + pendingQA: null, + }); setStatus('started'); await runGraph(res.jobId); } @@ -216,32 +347,175 @@ export default function TransferPanel({ projectId, onJobUpdate }: TransferPanelP setRunning(false); setStatus('error'); } - }, [transferMode, mineruSource, uploadedPdf, targetTemplateId, sourceMainFile, projectId, engine, layoutCheck, selectedTemplate, mineruApiBase, mineruToken]); + }, [transferMode, mineruSource, uploadedPdf, targetTemplateId, sourceMainFile, projectId, engine, layoutCheck, enableSensitiveMask, useAgent, selectedTemplate, mineruApiBase, mineruToken, mineruRasterToPdf, neuripsDoubleBlind, neuripsPreprint, neuripsOutputNotes, onJobUpdate]); + + const pushJobUpdate = useCallback((jid: string, res: TransferStepResult) => { + setProgressLog(res.progressLog || []); + setProgressLogEntries(res.progressLogEntries || []); + setCurrentNode(res.currentNode || ''); + setCurrentPhase(res.phase || ''); + setAgentPhase(res.agentPhase ?? null); + setCurrentIteration(res.currentIteration ?? null); + setCompletedNodes(res.completedNodes || []); + setPendingQA(res.pendingQA ?? null); + setStatus(res.status); + if (res.transferGraphKind) setTransferGraphKind(res.transferGraphKind); + setLiveProgress(res.liveProgress ?? null); + setToolTraceRecent(res.toolTraceRecent || []); + onJobUpdate?.({ + jobId: jid, + status: res.status, + progressLog: res.progressLog || [], + error: res.error, + phase: res.phase, + currentNode: res.currentNode, + completedNodes: res.completedNodes, + pendingQA: res.pendingQA ?? 
null, + progressLogEntries: res.progressLogEntries, + }); + }, [onJobUpdate]); + /** Connect SSE stream for real-time progress updates */ + const connectSSE = useCallback((jid: string) => { + // Close any existing SSE connection + if (sseRef.current) { sseRef.current.close(); sseRef.current = null; } + + const es = transferStream( + jid, + // onProgress + (data) => { + pushJobUpdate(jid, data); + // Handle terminal-like states from SSE + if (data.status === 'waiting_images' || data.status === 'waiting_confirm') { + setRunning(false); + } + }, + // onDone + (data) => { + pushJobUpdate(jid, data); + setRunning(false); + sseRef.current = null; + if (data.status === 'success' || data.status === 'failed') { + try { sessionStorage.removeItem(JOB_STORAGE_KEY); } catch { /* ignore */ } + } + }, + // onError + () => { + // SSE reconnects automatically; only log + }, + ); + sseRef.current = es; + }, [pushJobUpdate]); + + /** Drive the graph forward step by step, with SSE providing real-time updates */ const runGraph = useCallback(async (jid: string) => { + // Connect SSE for real-time progress display + connectSSE(jid); + // eslint-disable-next-line no-constant-condition while (true) { try { const res = await transferStep(jid); - setProgressLog(res.progressLog || []); - setStatus(res.status); - onJobUpdate?.({ jobId: jid, status: res.status, progressLog: res.progressLog || [], error: res.error }); + pushJobUpdate(jid, res); if (res.status === 'waiting_images') { setRunning(false); return; } - if (res.status === 'success' || res.status === 'failed') { setRunning(false); return; } - if (res.error) { setError(res.error); setRunning(false); return; } + if (res.status === 'waiting_confirm') { + setRunning(false); + return; + } + if (res.status === 'success' || res.status === 'failed') { + setRunning(false); + try { sessionStorage.removeItem(JOB_STORAGE_KEY); } catch { /* ignore */ } + return; + } + if (res.error) { + const bits: string[] = []; + if (res.failedNode) bits.push(`节点 ${res.failedNode}`); + if (res.failedPhase) bits.push(`阶段 ${res.failedPhase}`); + if (res.failedDetail) bits.push(`原因 ${res.failedDetail}`); + if (typeof res.failedInputChars === 'number') bits.push(`输入 ${res.failedInputChars} 字符`); + if (res.failedDebugPath) bits.push(`调试文件 ${res.failedDebugPath}`); + setError(bits.length ? 
`${res.error}\n${bits.join(' · ')}` : res.error); + setRunning(false); + return; + } - // Brief pause before next poll - await new Promise(r => setTimeout(r, 1000)); - } catch (err: any) { - setError(err.message || 'Step failed'); + await new Promise(r => setTimeout(r, 400)); + } catch (err: unknown) { + const display = formatTransferStepFailure(err); + setError(display); setRunning(false); setStatus('error'); - onJobUpdate?.({ jobId: jid, status: 'error', progressLog: [], error: err.message }); + onJobUpdate?.({ + jobId: jid, + status: 'error', + progressLog: [], + error: display, + }); return; } } - }, [onJobUpdate]); + }, [onJobUpdate, pushJobUpdate, connectSSE]); + + // Cleanup SSE on unmount + useEffect(() => { + return () => { + if (sseRef.current) { sseRef.current.close(); sseRef.current = null; } + }; + }, []); + + // Recover active job from sessionStorage on mount + useEffect(() => { + try { + const saved = sessionStorage.getItem(JOB_STORAGE_KEY); + if (!saved) return; + const { jobId: savedJobId } = JSON.parse(saved); + if (!savedJobId) return; + + // Try to recover state from backend + transferStatus(savedJobId).then((res) => { + if (!res || res.status === 'not_found') { + sessionStorage.removeItem(JOB_STORAGE_KEY); + return; + } + setJobId(savedJobId); + pushJobUpdate(savedJobId, res); + + const isTerminal = ['success', 'failed', 'error'].includes(res.status); + if (!isTerminal) { + // Job still running — reconnect SSE and resume driving + setRunning(true); + connectSSE(savedJobId); + // If waiting for user input, don't drive + if (res.status !== 'waiting_images' && res.status !== 'waiting_confirm') { + runGraph(savedJobId); + } else { + setRunning(false); + } + } + }).catch(() => { + sessionStorage.removeItem(JOB_STORAGE_KEY); + }); + } catch { /* ignore */ } + // eslint-disable-next-line react-hooks/exhaustive-deps + }, []); + + const handleSubmitQa = useCallback(async () => { + if (!jobId || !pendingQA?.length) return; + setQaSubmitting(true); + setError(''); + try { + await transferSubmitConfirm(jobId, qaAnswers); + setPendingQA(null); + setRunning(true); + setStatus('running'); + await runGraph(jobId); + } catch (err: any) { + setError(err.message || 'Confirm submit failed'); + } finally { + setQaSubmitting(false); + } + }, [jobId, pendingQA, qaAnswers, runGraph]); const chevronSvg = (open: boolean) => ( @@ -437,6 +711,61 @@ export default function TransferPanel({ projectId, onJobUpdate }: TransferPanelP {t('启用排版检查 (VLM)')} + + {transferMode === 'mineru' && mineruSource === 'upload' && ( +
+ {t('上传 PDF 模式没有源 .tex/.bib 文件,此开关不会生效。')} +
+ )} + + {transferMode === 'legacy' && ( + <> + +
+ {useAgent + ? t('开启:使用大模型按规划-生成-审查循环重写论文以贴合目标模板。') + : t('关闭:走 Rule-Based Transfer 规则式无损转换,保留原文内容不变。')} +
+ + )} + + {transferMode === 'legacy' && targetTemplateId === 'neurips' && ( +
+
NeurIPS 投稿选项
+ + + +