diff --git a/.changeset/add-batch-size.md b/.changeset/add-batch-size.md
new file mode 100644
index 000000000..e07ad8344
--- /dev/null
+++ b/.changeset/add-batch-size.md
@@ -0,0 +1,5 @@
+---
+"lingo.dev": minor
+---
+
+feat: add `--batch-size` parameter to `run` and `i18n` commands to prevent context leaking
diff --git a/packages/cli/src/cli/cmd/i18n.ts b/packages/cli/src/cli/cmd/i18n.ts
index 3f828ad54..a83aeb568 100644
--- a/packages/cli/src/cli/cmd/i18n.ts
+++ b/packages/cli/src/cli/cmd/i18n.ts
@@ -90,6 +90,11 @@ export default new Command()
     "--strict",
     "Stop immediately on first error instead of continuing to process remaining buckets and locales (fail-fast mode)",
   )
+  .option(
+    "--batch-size <number>",
+    "Number of translations to process in a single batch",
+    parseInt,
+  )
   .action(async function (options) {
     updateGitignore();
@@ -440,6 +445,7 @@ export default new Command()
         apiKey: settings.auth.apiKey,
         apiUrl: settings.auth.apiUrl,
         engineId: i18nConfig!.engineId,
+        batchSize: flags.batchSize,
       });
       processPayload = withExponentialBackoff(
         processPayload,
@@ -662,6 +668,7 @@ function parseFlags(options: any) {
     file: Z.array(Z.string()).optional(),
     interactive: Z.boolean().prefault(false),
     debug: Z.boolean().prefault(false),
+    batchSize: Z.number().min(1).max(250).optional(),
   }).parse(options);
 }
diff --git a/packages/cli/src/cli/cmd/run/_types.ts b/packages/cli/src/cli/cmd/run/_types.ts
index 967d05deb..fab0c274a 100644
--- a/packages/cli/src/cli/cmd/run/_types.ts
+++ b/packages/cli/src/cli/cmd/run/_types.ts
@@ -56,5 +56,6 @@ export const flagsSchema = z.object({
   debounce: z.number().positive().prefault(5000), // 5 seconds default
   sound: z.boolean().optional(),
   pseudo: z.boolean().optional(),
+  batchSize: z.number().min(1).max(250).optional(),
 });
 export type CmdRunFlags = z.infer<typeof flagsSchema>;
diff --git a/packages/cli/src/cli/cmd/run/index.ts b/packages/cli/src/cli/cmd/run/index.ts
index 7b28ecd9f..ed0b49d8e 100644
--- a/packages/cli/src/cli/cmd/run/index.ts
+++ b/packages/cli/src/cli/cmd/run/index.ts
@@ -123,6 +123,11 @@ export default new Command()
     "--pseudo",
     "Enable pseudo-localization mode: automatically pseudo-translates all extracted strings with accented characters and visual markers without calling any external API. Useful for testing UI internationalization readiness",
   )
+  .option(
+    "--batch-size <number>",
+    "Number of translations to process in a single batch (not applicable when using lingo.dev provider)",
+    (val: string) => parseInt(val),
+  )
   .action(async (args) => {
     let userIdentity: UserIdentity = null;
     try {
diff --git a/packages/cli/src/cli/cmd/run/setup.ts b/packages/cli/src/cli/cmd/run/setup.ts
index 4277bd6b9..19bfe83de 100644
--- a/packages/cli/src/cli/cmd/run/setup.ts
+++ b/packages/cli/src/cli/cmd/run/setup.ts
@@ -54,7 +54,12 @@ export default async function setup(input: CmdRunContext) {
     ctx.flags.pseudo || ctx.config?.dev?.usePseudotranslator;
   const provider = isPseudo ? "pseudo" : ctx.config?.provider;
   const engineId = ctx.config?.engineId;
-  ctx.localizer = createLocalizer(provider, engineId, ctx.flags.apiKey);
+  ctx.localizer = createLocalizer(
+    provider,
+    engineId,
+    ctx.flags.apiKey,
+    ctx.flags.batchSize,
+  );
   if (!ctx.localizer) {
     throw new Error(
       "Could not create localization provider. Please check your i18n.json configuration.",
diff --git a/packages/cli/src/cli/localizer/explicit.ts b/packages/cli/src/cli/localizer/explicit.ts
index ec212d035..e6a5746a4 100644
--- a/packages/cli/src/cli/localizer/explicit.ts
+++ b/packages/cli/src/cli/localizer/explicit.ts
@@ -6,14 +6,16 @@ import { createMistral } from "@ai-sdk/mistral";
 import { I18nConfig } from "@lingo.dev/_spec";
 import chalk from "chalk";
 import dedent from "dedent";
-import { ILocalizer, LocalizerData } from "./_types";
+import { ILocalizer, LocalizerData, LocalizerProgressFn } from "./_types";
 import { LanguageModel, ModelMessage, generateText } from "ai";
 import { colors } from "../constants";
 import { jsonrepair } from "jsonrepair";
 import { createOllama } from "ollama-ai-provider-v2";
-
+import _ from "lodash";
+import { extractPayloadChunks } from "../utils/chunk";
 export default function createExplicitLocalizer(
   provider: NonNullable<I18nConfig["provider"]>,
+  batchSize?: number,
 ): ILocalizer {
   const settings = provider.settings || {};
@@ -26,10 +28,10 @@ export default function createExplicitLocalizer(
       To fix this issue:
       1. Switch to one of the supported providers, or
       2. Remove the ${chalk.italic(
-      "provider",
-    )} node from your i18n.json configuration to switch to ${chalk.hex(
-      colors.green,
-    )("Lingo.dev")}
+        "provider",
+      )} node from your i18n.json configuration to switch to ${chalk.hex(
+        colors.green,
+      )("Lingo.dev")}
 
       ${chalk.hex(colors.blue)("Docs: https://lingo.dev/go/docs")}
     `,
@@ -42,6 +44,7 @@ export default function createExplicitLocalizer(
         apiKeyName: "OPENAI_API_KEY",
         baseUrl: provider.baseUrl,
         settings,
+        batchSize,
       });
     case "anthropic":
       return createAiSdkLocalizer({
@@ -52,6 +55,7 @@ export default function createExplicitLocalizer(
         apiKeyName: "ANTHROPIC_API_KEY",
         baseUrl: provider.baseUrl,
         settings,
+        batchSize,
       });
     case "google":
       return createAiSdkLocalizer({
@@ -62,6 +66,7 @@ export default function createExplicitLocalizer(
         apiKeyName: "GOOGLE_API_KEY",
         baseUrl: provider.baseUrl,
         settings,
+        batchSize,
       });
     case "openrouter":
       return createAiSdkLocalizer({
@@ -72,6 +77,7 @@ export default function createExplicitLocalizer(
         apiKeyName: "OPENROUTER_API_KEY",
         baseUrl: provider.baseUrl,
         settings,
+        batchSize,
      });
     case "ollama":
       return createAiSdkLocalizer({
@@ -80,6 +86,7 @@ export default function createExplicitLocalizer(
         prompt: provider.prompt,
         skipAuth: true,
         settings,
+        batchSize,
       });
     case "mistral":
       return createAiSdkLocalizer({
@@ -90,6 +97,7 @@ export default function createExplicitLocalizer(
         apiKeyName: "MISTRAL_API_KEY",
         baseUrl: provider.baseUrl,
         settings,
+        batchSize,
       });
   }
 }
@@ -120,6 +128,7 @@ function createAiSdkLocalizer(params: {
   baseUrl?: string;
   skipAuth?: boolean;
   settings?: { temperature?: number };
+  batchSize?: number;
 }): ILocalizer {
   const skipAuth = params.skipAuth === true;
@@ -127,19 +136,21 @@ function createAiSdkLocalizer(params: {
   if (!skipAuth && (!apiKey || !params.apiKeyName)) {
     throw new Error(
       dedent`
-      You're trying to use raw ${chalk.dim(params.id)} API for translation. ${params.apiKeyName
-        ? `However, ${chalk.dim(
-          params.apiKeyName,
-        )} environment variable is not set.`
-        : "However, that provider is unavailable."
-      }
+      You're trying to use raw ${chalk.dim(params.id)} API for translation. ${
+        params.apiKeyName
+          ? `However, ${chalk.dim(
+              params.apiKeyName,
+            )} environment variable is not set.`
+          : "However, that provider is unavailable."
+      }
 
       To fix this issue:
-      1. ${params.apiKeyName
-        ? `Set ${chalk.dim(
-          params.apiKeyName,
-        )} in your environment variables`
-        : "Set the environment variable for your provider (if required)"
+      1. ${
+        params.apiKeyName
+          ? `Set ${chalk.dim(
+              params.apiKeyName,
+            )} in your environment variables`
+          : "Set the environment variable for your provider (if required)"
       }, or
       2. Remove the ${chalk.italic(
         "provider",
       )} node from your i18n.json configuration to switch to ${chalk.hex(
         colors.green,
       )("Lingo.dev")}
@@ -183,85 +194,132 @@ function createAiSdkLocalizer(params: {
       return { valid: false, error: errorMessage };
     }
   },
-    localize: async (input: LocalizerData) => {
-      const systemPrompt = params.prompt
-        .replaceAll("{source}", input.sourceLocale)
-        .replaceAll("{target}", input.targetLocale);
-      const shots = [
-        [
-          {
-            sourceLocale: "en",
-            targetLocale: "es",
-            data: {
-              message: "Hello, world!",
-            },
-          },
-          {
-            sourceLocale: "en",
-            targetLocale: "es",
-            data: {
-              message: "Hola, mundo!",
-            },
-          },
-        ],
-        [
-          {
-            sourceLocale: "en",
-            targetLocale: "es",
-            data: {
-              spring: "Spring",
-            },
-            hints: {
-              spring: ["A source of water"],
-            },
-          },
-          {
-            sourceLocale: "en",
-            targetLocale: "es",
-            data: {
-              spring: "Manantial",
-            },
-          },
-        ],
-      ];
-
-      const hasHints = input.hints && Object.keys(input.hints).length > 0;
-
-      const payload = {
-        sourceLocale: input.sourceLocale,
-        targetLocale: input.targetLocale,
-        data: input.processableData,
-        ...(hasHints && { hints: input.hints }),
-      };
-
-      const response = await generateText({
-        model,
-        ...params.settings,
-        messages: [
-          { role: "system", content: systemPrompt },
-          ...shots.flatMap(
-            ([userShot, assistantShot]) =>
-              [
-                { role: "user", content: JSON.stringify(userShot) },
-                { role: "assistant", content: JSON.stringify(assistantShot) },
-              ] as ModelMessage[],
-          ),
-          { role: "user", content: JSON.stringify(payload) },
-        ],
-      });
-
-      const result = parseModelResponse(response.text);
-
-      // Handle both object and string responses
-      if (typeof result.data === "object" && result.data !== null) {
-        return result.data;
-      }
-
-      // Handle string responses - extract and repair JSON
-      const index = result.data.indexOf("{");
-      const lastIndex = result.data.lastIndexOf("}");
-      const trimmed = result.data.slice(index, lastIndex + 1);
-      return JSON.parse(jsonrepair(trimmed)).data;
+    localize: async (
+      input: LocalizerData,
+      onProgress?: LocalizerProgressFn,
+    ) => {
+      const chunks = extractPayloadChunks(
+        input.processableData,
+        params.batchSize,
+      );
+      const subResults: Record<string, string>[] = [];
+
+      for (let i = 0; i < chunks.length; i++) {
+        const chunk = chunks[i];
+
+        const systemPrompt = params.prompt
+          .replaceAll("{source}", input.sourceLocale)
+          .replaceAll("{target}", input.targetLocale);
+
+        const shots = [
+          [
+            {
+              sourceLocale: "en",
+              targetLocale: "es",
+              data: {
+                message: "Hello, world!",
+              },
+            },
+            {
+              sourceLocale: "en",
+              targetLocale: "es",
+              data: {
+                message: "Hola, mundo!",
+              },
+            },
+          ],
+          [
+            {
+              sourceLocale: "en",
+              targetLocale: "es",
+              data: {
+                spring: "Spring",
+              },
+              hints: {
+                spring: ["A source of water"],
+              },
+            },
+            {
+              sourceLocale: "en",
+              targetLocale: "es",
+              data: {
+                spring: "Manantial",
+              },
+            },
+          ],
+        ];
+
+        const chunkHints = input.hints
+          ? _.pick(input.hints, Object.keys(chunk))
+          : undefined;
+        const hasHints = chunkHints && Object.keys(chunkHints).length > 0;
+
+        const payload = {
+          sourceLocale: input.sourceLocale,
+          targetLocale: input.targetLocale,
+          data: chunk,
+          ...(hasHints && { hints: chunkHints }),
+        };
+
+        const response = await generateText({
+          model,
+          ...params.settings,
+          messages: [
+            { role: "system", content: systemPrompt },
+            ...shots.flatMap(
+              ([userShot, assistantShot]) =>
+                [
+                  { role: "user", content: JSON.stringify(userShot) },
+                  { role: "assistant", content: JSON.stringify(assistantShot) },
+                ] as ModelMessage[],
+            ),
+            { role: "user", content: JSON.stringify(payload) },
+          ],
+        });
+
+        let result: any;
+        try {
+          result = parseModelResponse(response.text);
+        } catch (e2) {
+          const snippet =
+            response.text.length > 500
+              ? `${response.text.slice(0, 500)}…`
+              : response.text;
+          console.error(
+            `Failed to parse response from ${params.id}. Response snippet: ${snippet}`,
+          );
+          throw new Error(
+            `Failed to parse response from ${params.id}: ${e2} (Snippet: ${snippet})`,
+          );
+        }
+        let finalResult: Record<string, string> = {};
+
+        // Handle both object and string responses
+        if (typeof result?.data === "object" && result.data !== null) {
+          finalResult = result.data;
+        } else if (typeof result?.data === "string") {
+          // Handle string responses where the model double-stringified the JSON
+          try {
+            const parsed = parseModelResponse(result.data);
+            finalResult = parsed.data || parsed || {};
+          } catch (e) {
+            console.error(
+              `Failed to parse nested JSON response. Snippet: ${result.data.slice(0, 100)}...`,
+            );
+            throw new Error(
+              `Failed to parse nested JSON response: ${e} (Snippet: ${result.data.slice(0, 100)}...)`,
+            );
+          }
+        }
+
+        subResults.push(finalResult);
+        if (onProgress) {
+          onProgress(((i + 1) / chunks.length) * 100, chunk, finalResult);
+        }
+      }
+
+      const finalMergedResult = _.merge({}, ...subResults);
+      return finalMergedResult;
     },
   };
 }
diff --git a/packages/cli/src/cli/localizer/index.ts b/packages/cli/src/cli/localizer/index.ts
index 6d20b192b..ea5f3b18d 100644
--- a/packages/cli/src/cli/localizer/index.ts
+++ b/packages/cli/src/cli/localizer/index.ts
@@ -9,6 +9,7 @@ export default function createLocalizer(
   provider: I18nConfig["provider"] | "pseudo" | null | undefined,
   engineId?: string,
   apiKey?: string,
+  batchSize?: number,
 ): ILocalizer {
   if (provider === "pseudo") {
     return createPseudoLocalizer();
@@ -17,6 +18,6 @@ export default function createLocalizer(
   if (!provider) {
     return createLingoDotDevLocalizer(apiKey, engineId);
   } else {
-    return createExplicitLocalizer(provider);
+    return createExplicitLocalizer(provider, batchSize);
   }
 }
diff --git a/packages/cli/src/cli/processor/basic.spec.ts b/packages/cli/src/cli/processor/basic.spec.ts
new file mode 100644
index 000000000..53e1d425f
--- /dev/null
+++ b/packages/cli/src/cli/processor/basic.spec.ts
@@ -0,0 +1,160 @@
+import { describe, it, expect, vi, beforeEach } from "vitest";
+import { createBasicTranslator } from "./basic";
+import { LanguageModel, generateText } from "ai";
+
+// Mock the ai module
+vi.mock("ai", async () => {
+  const actual = await vi.importActual("ai");
+  return {
+    ...actual,
+    generateText: vi.fn(),
+  };
+});
+
+describe("createBasicTranslator", () => {
+  const mockModel = {} as LanguageModel;
+  const mockSystemPrompt = "Translate from {source} to {target}";
+
+  beforeEach(() => {
+    vi.clearAllMocks();
+  });
+
+  it("should process all keys in a single batch by default", async () => {
+    const input = {
+      sourceLocale: "en",
+      targetLocale: "fr",
+      processableData: {
+        key1: "value1",
+        key2: "value2",
+        key3: "value3",
+      },
+    };
+
+    // Mock response
+    (generateText as any).mockResolvedValue({
+      text: JSON.stringify({
+        data: {
+          key1: "valeur1",
+          key2: "valeur2",
+          key3: "valeur3",
+        },
+      }),
+    });
+
+    const onProgress = vi.fn();
+    const translator = createBasicTranslator(mockModel, mockSystemPrompt);
+
+    await translator(input, onProgress);
+
+    expect(generateText).toHaveBeenCalledTimes(1);
+    expect(generateText).toHaveBeenCalledWith(
+      expect.objectContaining({
+        messages: expect.arrayContaining([
+          expect.objectContaining({
+            role: "user",
+            content: expect.stringContaining("key1"),
+          }),
+        ]),
+      }),
+    );
+  });
+
+  it("should process >25 keys in multiple batches by default (fallback batch size 25)", async () => {
+    const inputData: Record<string, string> = {};
+    for (let i = 0; i < 30; i++) {
+      inputData[`key${i}`] = `value${i}`;
+    }
+
+    const input = {
+      sourceLocale: "en",
+      targetLocale: "fr",
+      processableData: inputData,
+    };
+
+    (generateText as any).mockResolvedValue({
+      text: JSON.stringify({ data: {} }),
+    });
+
+    const onProgress = vi.fn();
+    const translator = createBasicTranslator(mockModel, mockSystemPrompt);
+
+    await translator(input, onProgress);
+
+    // Should be 2 calls, since default fallback is 25 and we have 30 keys
+    expect(generateText).toHaveBeenCalledTimes(2);
+  });
+
+  it("should respect batchSize parameter", async () => {
+    const input = {
+      sourceLocale: "en",
+      targetLocale: "fr",
+      processableData: {
+        key1: "value1",
+        key2: "value2",
+        key3: "value3",
+      },
+    };
+
+    // Mock response
+    (generateText as any).mockResolvedValue({
+      text: JSON.stringify({
+        data: {},
+      }),
+    });
+
+    const onProgress = vi.fn();
+    // Set batchSize to 1 to force individual requests
+    const translator = createBasicTranslator(mockModel, mockSystemPrompt, {
+      batchSize: 1,
+    });
+
+    await translator(input, onProgress);
+
+    expect(generateText).toHaveBeenCalledTimes(3);
+
+    // allow calls to be in any order, but each should contain exactly one key
+    const calls = (generateText as any).mock.calls;
+    const keysProcessed = new Set<string>();
+
+    calls.forEach((call: any) => {
+      const messages = call[0].messages;
+      const userMessage = messages[messages.length - 1];
+      const content = JSON.parse(userMessage.content);
+      const keys = Object.keys(content.data);
+      expect(keys.length).toBe(1);
+      keysProcessed.add(keys[0]);
+    });
+
+    expect(keysProcessed.has("key1")).toBe(true);
+    expect(keysProcessed.has("key2")).toBe(true);
+    expect(keysProcessed.has("key3")).toBe(true);
+  });
+
+  it("should chunk requests correctly with batchSize > 1", async () => {
+    const input = {
+      sourceLocale: "en",
+      targetLocale: "fr",
+      processableData: {
+        key1: "value1",
+        key2: "value2",
+        key3: "value3",
+        key4: "value4",
+        key5: "value5",
+      },
+    };
+
+    (generateText as any).mockResolvedValue({
+      text: JSON.stringify({ data: {} }),
+    });
+
+    const onProgress = vi.fn();
+    const translator = createBasicTranslator(mockModel, mockSystemPrompt, {
+      batchSize: 2,
+    });
+
+    await translator(input, onProgress);
+
+    // 5 items with batchSize 2 -> 3 chunks (2, 2, 1)
+    expect(generateText).toHaveBeenCalledTimes(3);
+  });
+});
diff --git a/packages/cli/src/cli/processor/basic.ts b/packages/cli/src/cli/processor/basic.ts
index ed962adf6..5a3596833 100644
--- a/packages/cli/src/cli/processor/basic.ts
+++ b/packages/cli/src/cli/processor/basic.ts
@@ -2,8 +2,10 @@ import { generateText, LanguageModel } from "ai";
 import { LocalizerInput, LocalizerProgressFn } from "./_base";
 import _ from "lodash";
+import { extractPayloadChunks } from "../utils/chunk";
 
 type ModelSettings = {
   temperature?: number;
+  batchSize?: number;
 };
 
 export function createBasicTranslator(
@@ -12,7 +14,10 @@ export function createBasicTranslator(
   settings: ModelSettings = {},
 ) {
   return async (input: LocalizerInput, onProgress: LocalizerProgressFn) => {
-    const chunks = extractPayloadChunks(input.processableData);
+    const chunks = extractPayloadChunks(
+      input.processableData,
+      settings.batchSize,
+    );
     const subResults: Record<string, string>[] = [];
 
     for (let i = 0; i < chunks.length; i++) {
@@ -22,7 +27,7 @@ export function createBasicTranslator(
         processableData: chunk,
       });
       subResults.push(result);
-      onProgress((i / chunks.length) * 100, chunk, result);
+      onProgress(((i + 1) / chunks.length) * 100, chunk, result);
     }
 
     const result = _.merge({}, ...subResults);
@@ -84,60 +89,3 @@ export function createBasicTranslator(
     return result?.data || {};
   }
 }
-
-/**
- * Extract payload chunks based on the ideal chunk size
- * @param payload - The payload to be chunked
- * @returns An array of payload chunks
- */
-function extractPayloadChunks(
-  payload: Record<string, string>,
-): Record<string, string>[] {
-  const idealBatchItemSize = 250;
-  const batchSize = 25;
-  const result: Record<string, string>[] = [];
-  let currentChunk: Record<string, string> = {};
-  let currentChunkItemCount = 0;
-
-  const payloadEntries = Object.entries(payload);
-  for (let i = 0; i < payloadEntries.length; i++) {
-    const [key, value] = payloadEntries[i];
-    currentChunk[key] = value;
-    currentChunkItemCount++;
-
-    const currentChunkSize = countWordsInRecord(currentChunk);
-    if (
-      currentChunkSize > idealBatchItemSize ||
-      currentChunkItemCount >= batchSize ||
-      i === payloadEntries.length - 1
-    ) {
-      result.push(currentChunk);
-      currentChunk = {};
-      currentChunkItemCount = 0;
-    }
-  }
-
-  return result;
-}
-
-/**
- * Count words in a record or array
- * @param payload - The payload to count words in
- * @returns The total number of words
- */
-function countWordsInRecord(
-  payload: any | Record<string, any> | Array<any>,
-): number {
-  if (Array.isArray(payload)) {
-    return payload.reduce((acc, item) => acc + countWordsInRecord(item), 0);
-  } else if (typeof payload === "object" && payload !== null) {
-    return Object.values(payload).reduce(
-      (acc: number, item) => acc + countWordsInRecord(item),
-      0,
-    );
-  } else if (typeof payload === "string") {
-    return payload.trim().split(/\s+/).filter(Boolean).length;
-  } else {
-    return 0;
-  }
-}
diff --git a/packages/cli/src/cli/processor/index.ts b/packages/cli/src/cli/processor/index.ts
index 6b7db441a..92845fde6 100644
--- a/packages/cli/src/cli/processor/index.ts
+++ b/packages/cli/src/cli/processor/index.ts
@@ -14,7 +14,7 @@ import { createOllama } from "ollama-ai-provider-v2";
 
 export default function createProcessor(
   provider: I18nConfig["provider"],
-  params: { apiKey?: string; apiUrl: string; engineId?: string },
+  params: { apiKey?: string; apiUrl: string; engineId?: string; batchSize?: number },
 ): LocalizerFn {
   if (!provider) {
     const result = createLingoLocalizer(params);
@@ -22,7 +22,10 @@ export default function createProcessor(
   } else {
     const model = getPureModelProvider(provider);
     const settings = provider.settings || {};
-    const result = createBasicTranslator(model, provider.prompt, settings);
+    const result = createBasicTranslator(model, provider.prompt, {
+      ...settings,
+      batchSize: params.batchSize,
+    });
     return result;
   }
 }
@@ -32,23 +35,21 @@ function getPureModelProvider(provider: I18nConfig["provider"]) {
     providerId: string,
     envVar?: string,
   ) => dedent`
-    You're trying to use raw ${chalk.dim(providerId)} API for translation. ${
-      envVar
+    You're trying to use raw ${chalk.dim(providerId)} API for translation. ${envVar
       ? `However, ${chalk.dim(envVar)} environment variable is not set.`
       : "However, that provider is unavailable."
-    }
+      }
 
     To fix this issue:
-    1. ${
-      envVar
+    1. ${envVar
       ? `Set ${chalk.dim(envVar)} in your environment variables`
       : "Set the environment variable for your provider (if required)"
-    }, or
+      }, or
     2. Remove the ${chalk.italic(
-    "provider",
-  )} node from your i18n.json configuration to switch to ${chalk.hex(
-    colors.green,
-  )("Lingo.dev")}
+      "provider",
+    )} node from your i18n.json configuration to switch to ${chalk.hex(
+      colors.green,
+    )("Lingo.dev")}
 
     ${chalk.hex(colors.blue)("Docs: https://lingo.dev/go/docs")}
   `;
@@ -60,10 +61,10 @@ function getPureModelProvider(provider: I18nConfig["provider"]) {
 
     To fix this issue:
     1. Switch to one of the supported providers, or
     2. Remove the ${chalk.italic(
-    "provider",
-  )} node from your i18n.json configuration to switch to ${chalk.hex(
-    colors.green,
-  )("Lingo.dev")}
+      "provider",
+    )} node from your i18n.json configuration to switch to ${chalk.hex(
+      colors.green,
+    )("Lingo.dev")}
 
     ${chalk.hex(colors.blue)("Docs: https://lingo.dev/go/docs")}
   `;
diff --git a/packages/cli/src/cli/utils/chunk.ts b/packages/cli/src/cli/utils/chunk.ts
new file mode 100644
index 000000000..dfe0d959d
--- /dev/null
+++ b/packages/cli/src/cli/utils/chunk.ts
@@ -0,0 +1,58 @@
+/**
+ * Extract payload chunks based on the ideal chunk size
+ * @param payload - The payload to be chunked
+ * @param batchSize - Max number of keys per chunk
+ * @returns An array of payload chunks
+ */
+export function extractPayloadChunks(
+  payload: Record<string, string>,
+  batchSize?: number,
+): Record<string, string>[] {
+  const idealBatchItemSize = 250;
+  const result: Record<string, string>[] = [];
+  let currentChunk: Record<string, string> = {};
+  let currentChunkItemCount = 0;
+
+  const payloadEntries = Object.entries(payload);
+  for (let i = 0; i < payloadEntries.length; i++) {
+    const [key, value] = payloadEntries[i];
+    currentChunk[key] = value;
+    currentChunkItemCount++;
+
+    const currentChunkSize = countWordsInRecord(currentChunk);
+    const effectiveBatchSize = batchSize && batchSize > 0 ? batchSize : 25;
+    if (
+      currentChunkSize > idealBatchItemSize ||
+      currentChunkItemCount >= effectiveBatchSize ||
+      i === payloadEntries.length - 1
+    ) {
+      result.push(currentChunk);
+      currentChunk = {};
+      currentChunkItemCount = 0;
+    }
+  }
+
+  return result;
+}
+
+/**
+ * Count words in a record or array
+ * @param payload - The payload to count words in
+ * @returns The total number of words
+ */
+export function countWordsInRecord(
+  payload: any | Record<string, any> | Array<any>,
+): number {
+  if (Array.isArray(payload)) {
+    return payload.reduce((acc, item) => acc + countWordsInRecord(item), 0);
+  } else if (typeof payload === "object" && payload !== null) {
+    return Object.values(payload).reduce(
+      (acc: number, item) => acc + countWordsInRecord(item),
+      0,
+    );
+  } else if (typeof payload === "string") {
+    return payload.trim().split(/\s+/).filter(Boolean).length;
+  } else {
+    return 0;
+  }
+}
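Reviewer note (not part of the patch): a minimal sketch of how the new `batchSize` cap interacts with the pre-existing 250-word cap in `extractPayloadChunks`, based only on the `chunk.ts` code above. The relative import path is assumed for illustration.

```ts
import { extractPayloadChunks } from "./packages/cli/src/cli/utils/chunk";

// 60 short entries: each chunk stays well under the 250-word cap,
// so the key-count cap is what decides the chunk boundaries.
const payload: Record<string, string> = Object.fromEntries(
  Array.from({ length: 60 }, (_, i) => [`key${i}`, `value ${i}`]),
);

// No batchSize -> fallback cap of 25 keys per chunk: 25 + 25 + 10.
console.log(extractPayloadChunks(payload).map((c) => Object.keys(c).length)); // [25, 25, 10]

// Explicit cap, e.g. from `--batch-size 10`: six chunks of 10 keys each.
console.log(extractPayloadChunks(payload, 10).length); // 6
```

Each chunk becomes one `generateText` call, so a smaller `--batch-size` trades more requests for less cross-key context in any single prompt, which is the "context leaking" the changeset refers to.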