Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions .github/workflows/cd.yml
Original file line number Diff line number Diff line change
Expand Up @@ -196,8 +196,9 @@ jobs:

# ── Sync compliance knowledge base (smart sync, detached) ─────
# Compares Qdrant point count to JSON article count.
# No-op if already in sync (~1s); first-run seeding embeds 58
# articles via Ollama Cloud and can take 10–15 min.
# No-op if already in sync (~1s); seeding embeds 116 entries
# (58 EN + 58 FR working translation, #424) via the embedding
# endpoint and can take ~20 min on a cold collection.
# Run detached (-d) so the deploy job is NOT blocked on the
# seed — the operator inspects /var/log/retrieva-seed-kb.log
# afterwards if compliance_kb is unexpectedly empty.
Expand Down
1,014 changes: 1,014 additions & 0 deletions backend/data/compliance/dora-articles.fr.json

Large diffs are not rendered by default.

42 changes: 30 additions & 12 deletions backend/scripts/seedComplianceKb.js
Original file line number Diff line number Diff line change
Expand Up @@ -70,20 +70,36 @@ function getEmbeddings() {
}

/**
* Load knowledge base from JSON.
* Supports both old format (plain array) and new format ({ version, articles }).
* Returns { articles, meta }.
* Load one knowledge-base file, tagging every article with its language and
* whether it's the official text. Returns null if the file is absent (e.g. the
* French translation hasn't been generated yet).
*/
function loadData() {
const filePath = path.join(__dirname, '../data/compliance/dora-articles.json');
const raw = JSON.parse(readFileSync(filePath, 'utf-8'));
if (Array.isArray(raw)) {
return { articles: raw, meta: { version: '1.0', lastVerified: null, sources: [] } };
function loadOne(fileName, defaultLang) {
const filePath = path.join(__dirname, '../data/compliance/', fileName);
let raw;
try {
raw = JSON.parse(readFileSync(filePath, 'utf-8'));
} catch {
return null;
}
return {
articles: raw.articles,
meta: { version: raw.version, lastVerified: raw.lastVerified, sources: raw.sources || [] },
};
const articles = Array.isArray(raw) ? raw : raw.articles;
const lang = (!Array.isArray(raw) && raw.lang) || defaultLang;
const official = Array.isArray(raw) ? true : raw.official !== false;
const meta = Array.isArray(raw)
? { version: '1.0', lastVerified: null, sources: [] }
: { version: raw.version, lastVerified: raw.lastVerified, sources: raw.sources || [] };
return { articles: articles.map((a) => ({ ...a, lang, official })), meta };
}

/**
 * Assemble the combined knowledge base from every language file that loads.
 *
 * English is read from `dora-articles.json` and the French working translation
 * from `dora-articles.fr.json`; a file for which `loadOne` returns a nullish
 * value simply contributes no articles.
 *
 * @returns {{ articles: Array, meta: Object }} English articles followed by
 *   French ones, plus the English file's metadata (or a default metadata
 *   object when the English file did not load).
 */
function loadData() {
  const english = loadOne('dora-articles.json', 'en');
  const french = loadOne('dora-articles.fr.json', 'fr');

  const englishArticles = english?.articles || [];
  const frenchArticles = french?.articles || [];
  const fallbackMeta = { version: '1.0', lastVerified: null, sources: [] };

  return {
    articles: [...englishArticles, ...frenchArticles],
    meta: english?.meta || fallbackMeta,
  };
}

/**
Expand Down Expand Up @@ -152,6 +168,8 @@ async function embedAndUpsert(client, articles) {
domain: article.domain,
obligations: article.obligations || [],
fullText: article.text,
lang: article.lang || 'en',
official: article.official !== false,
},
},
}));
Expand Down
119 changes: 119 additions & 0 deletions backend/scripts/translateComplianceKb.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
/**
* translateComplianceKb.js
*
* Produces a WORKING (unofficial) French translation of the English DORA
* knowledge base (`data/compliance/dora-articles.json`) →
* `data/compliance/dora-articles.fr.json`, via the configured LLM.
*
* ⚠️ The output is a WORKING translation, NOT the official EUR-Lex text. Each
* regulation citation keeps a link to the official text on EUR-Lex (set at
* retrieval time). This file is explicitly flagged `official: false` so the seed
* can tag chunks `metadata.official = false` and the UI can label them.
*
* Usage (from backend/):
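* node scripts/translateComplianceKb.js # translate all articles
* node scripts/translateComplianceKb.js --limit 2 # translate first N (smoke test)
*
* NOTE: every run rewrites dora-articles.fr.json, so a --limit run replaces the
* full translation with only the first N articles.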
*/
import 'dotenv/config';
import { readFileSync, writeFileSync } from 'fs';
import { fileURLToPath } from 'url';
import path from 'path';
import { createLLM } from '../config/llmProvider.js';

// Paths are resolved relative to this script's own location rather than the
// process working directory.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
const SRC = path.join(__dirname, '../data/compliance/dora-articles.json');
const OUT = path.join(__dirname, '../data/compliance/dora-articles.fr.json');

/**
 * Parse the optional `--limit N` command-line flag.
 *
 * A missing or non-numeric value used to become `NaN`, which makes
 * `Array.prototype.slice(0, NaN)` return an empty array — the script would then
 * write an output file containing zero articles. Invalid values are rejected
 * up front instead.
 *
 * @param {string[]} argv - Argument vector to inspect (e.g. `process.argv`).
 * @returns {number} The positive integer limit, or `Infinity` when the flag is absent.
 * @throws {Error} If `--limit` is present without a positive integer value.
 */
function parseLimit(argv) {
  const flagIndex = argv.indexOf('--limit');
  if (flagIndex === -1) return Infinity;

  const rawValue = argv[flagIndex + 1];
  const value = Number.parseInt(rawValue, 10);
  if (!Number.isInteger(value) || value < 1) {
    throw new Error(`--limit expects a positive integer, got: ${rawValue}`);
  }
  return value;
}

const limitArg = process.argv.indexOf('--limit');
const LIMIT = parseLimit(process.argv);

/**
 * Parse the JSON object embedded in a model reply, ignoring surrounding noise.
 *
 * Only the span from the first `{` to the last `}` is parsed, so markdown
 * fences or commentary around the object are tolerated.
 *
 * @param {string} raw - Raw text returned by the model.
 * @returns {*} The value produced by `JSON.parse` for the extracted span.
 * @throws {Error} If `raw` is not a string or contains no `{ … }` span.
 * @throws {SyntaxError} If the extracted span is not valid JSON.
 */
function parseJsonLoose(raw) {
  if (typeof raw !== 'string') throw new Error('no JSON object in response');

  const start = raw.indexOf('{');
  const end = raw.lastIndexOf('}');
  // `end < start` also covers a missing `}` (end === -1) and a stray `}` that
  // appears before the first `{`, which would otherwise slice to an empty
  // string and surface as a confusing "Unexpected end of JSON input".
  if (start === -1 || end < start) throw new Error('no JSON object in response');

  return JSON.parse(raw.slice(start, end + 1));
}

/**
 * Ask the LLM for a French rendering of one knowledge-base article.
 *
 * Only `title`, `domain`, `text` and `obligations` are sent for translation;
 * `regulation` and `article` are copied from the input unchanged. Any field the
 * model leaves empty falls back to the English value.
 *
 * @param {{ invoke: Function }} llm - Model client; `invoke(prompt)` may resolve
 *   to a string or to an object exposing `content`.
 * @param {Object} article - English article record.
 * @returns {Promise<Object>} Article with keys regulation, article, title,
 *   domain, text, obligations.
 * @throws Whatever `llm.invoke` or `parseJsonLoose` throws.
 */
async function translateArticle(llm, article) {
  const { regulation, article: articleRef, title, domain, text, obligations } = article;

  const englishPayload = {
    title,
    domain,
    text,
    obligations: obligations || [],
  };

  const instructionLines = [
    'You are a professional French legal translator. Translate the following DORA',
    '(Regulation (EU) 2022/2554) excerpt from English to French.',
    'Rules:',
    '- Translate meaning faithfully, in formal regulatory French.',
    '- Keep legal/article references intact (e.g. "Article 28", "ICT" -> "TIC",',
    ' "RTS", roman numerals, sub-paragraph letters like (a), (i)).',
    '- Return ONLY valid JSON with EXACTLY these keys: title, domain, text, obligations',
    ' (obligations is an array of strings). No commentary, no markdown fences.',
    '',
    'English JSON to translate:',
  ];
  const prompt = [...instructionLines, JSON.stringify(englishPayload)].join('\n');

  const reply = await llm.invoke(prompt);
  let replyText;
  if (typeof reply === 'string') {
    replyText = reply;
  } else {
    replyText = reply.content;
  }
  const french = parseJsonLoose(replyText);

  // Keep the model's obligations only when it actually returned an array.
  let frenchObligations;
  if (Array.isArray(french.obligations)) {
    frenchObligations = french.obligations;
  } else {
    frenchObligations = obligations || [];
  }

  return {
    regulation,
    article: articleRef,
    title: french.title || title,
    domain: french.domain || domain,
    text: french.text || text,
    obligations: frenchObligations,
  };
}

/**
 * Script entry point: read the English KB, translate its articles one at a
 * time, and write the French working-translation file.
 *
 * Articles are translated sequentially. When translating an article throws,
 * the error is logged and the untranslated English article is kept in its
 * place, so the output always has one entry per input article. The output file
 * is flagged `official: false`.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const source = JSON.parse(readFileSync(SRC, 'utf-8'));
  const batch = source.articles.slice(0, LIMIT);
  const llm = await createLLM({ purpose: 'chat', temperature: 0, maxTokens: 2000 });

  console.log(`Translating ${batch.length} articles to French…`);

  const translated = [];
  for (const [index, current] of batch.entries()) {
    const progress = `[${index + 1}/${batch.length}]`;
    try {
      const frenchArticle = await translateArticle(llm, current);
      translated.push(frenchArticle);

      console.log(` ${progress} ${current.regulation} ${current.article} ✓`);
    } catch (err) {
      console.error(` ${progress} ${current.article} ✗ ${err.message} — keeping English`);
      translated.push(current);
    }
  }

  const disclaimer =
    'Traduction de travail non-officielle générée automatiquement. Le texte officiel et ' +
    'faisant foi est la version française publiée sur EUR-Lex.';

  const output = {
    version: source.version,
    lang: 'fr',
    official: false,
    translatedFrom: 'en',
    translatedWith: 'llm-working-translation',
    disclaimer,
    sourceUrl: 'https://eur-lex.europa.eu/legal-content/FR/TXT/?uri=CELEX:32022R2554',
    lastVerified: source.lastVerified,
    sources: source.sources || [],
    articles: translated,
  };

  const serialized = JSON.stringify(output, null, 2) + '\n';
  writeFileSync(OUT, serialized, 'utf-8');

  console.log(`Wrote ${translated.length} articles → ${OUT}`);
}

// Run the script; any rejection from main() is logged and the process exits
// with status 1.
main().catch((failure) => {
  console.error('Translation failed:', failure);
  process.exit(1);
});
27 changes: 25 additions & 2 deletions backend/services/rag/complianceKbRetriever.js
Original file line number Diff line number Diff line change
Expand Up @@ -104,11 +104,20 @@ function adaptRegulationDoc(doc, lang) {
documentTitle: documentTitle || regulation,
heading_path: [regulation, article].filter(Boolean),
documentType: 'regulation',
// false for the working French translation → the UI labels it unofficial.
official: meta.official !== false,
...(url ? { url } : {}),
},
};
}

/**
 * Similarity search limited to chunks whose `metadata.lang` equals `lang`.
 *
 * @param {{ similaritySearch: Function }} store - Vector store to query.
 * @param {string} query - Search text.
 * @param {number} k - Maximum number of results requested.
 * @param {string} lang - Language code to match against `metadata.lang`.
 * @returns {*} Whatever `store.similaritySearch` returns for the filtered query.
 */
function searchByLang(store, query, k, lang) {
  const languageCondition = { key: 'metadata.lang', match: { value: lang } };
  const filter = { must: [languageCondition] };
  return store.similaritySearch(query, k, filter);
}

/**
* Retrieve top-k regulation chunks for a query.
* Returns an empty array if the collection is missing or any error occurs —
Expand All @@ -124,9 +133,23 @@ export async function retrieveRegulationDocs(query, k = 5, lang = 'en') {
const store = await getComplianceKbStore();
if (!store) return [];

const primaryLang = String(lang || 'en')
.toLowerCase()
.startsWith('fr')
? 'fr'
: 'en';

try {
const docs = await store.similaritySearch(query, k);
return docs.map((doc) => adaptRegulationDoc(doc, lang));
// Prefer the user's language; fall back to English if it has no content
// (French not yet seeded); last-resort unfiltered for pre-language-tag data.
let docs = await searchByLang(store, query, k, primaryLang);
if (docs.length === 0 && primaryLang !== 'en') {
docs = await searchByLang(store, query, k, 'en');
}
if (docs.length === 0) {
docs = await store.similaritySearch(query, k);
}
return docs.map((doc) => adaptRegulationDoc(doc, primaryLang));
} catch (error) {
logger.warn('compliance_kb similarity search failed', {
service: 'compliance-kb-retriever',
Expand Down
57 changes: 55 additions & 2 deletions backend/tests/unittest/complianceKbRetriever.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -56,12 +56,63 @@ describe('retrieveRegulationDocs', () => {

const docs = await retrieveRegulationDocs('what does Article 28 require?', 5);

expect(similaritySearch).toHaveBeenCalledWith('what does Article 28 require?', 5);
// Defaults to English, restricted to the English language partition (#424).
expect(similaritySearch).toHaveBeenCalledWith('what does Article 28 require?', 5, {
must: [{ key: 'metadata.lang', match: { value: 'en' } }],
});
expect(docs).toHaveLength(1);
expect(docs[0].metadata.source).toBe('regulation');
expect(docs[0].metadata.documentTitle).toBe('DORA Article 28: ICT third-party risk');
expect(docs[0].metadata.heading_path).toEqual(['DORA', 'Article 28']);
expect(docs[0].metadata.documentType).toBe('regulation');
// Official EN text → flagged official + linked to EUR-Lex EN.
expect(docs[0].metadata.official).toBe(true);
expect(docs[0].metadata.url).toContain('/EN/TXT/');
});

it('filters by French and links to EUR-Lex FR when lang=fr', async () => {
const similaritySearch = vi.fn().mockResolvedValue([
{
pageContent: 'Les entités financières...',
metadata: {
regulation: 'DORA',
article: 'Article 28',
title: 'Risque lié aux tiers',
lang: 'fr',
official: false,
},
},
]);
fromExistingCollection.mockResolvedValue({ similaritySearch });

const docs = await retrieveRegulationDocs('que dit l’article 28 ?', 5, 'fr-FR');

expect(similaritySearch).toHaveBeenCalledWith('que dit l’article 28 ?', 5, {
must: [{ key: 'metadata.lang', match: { value: 'fr' } }],
});
// Working translation → flagged unofficial, but still linked to official FR text.
expect(docs[0].metadata.official).toBe(false);
expect(docs[0].metadata.url).toContain('/FR/TXT/');
});

it('falls back to English when the French partition is empty', async () => {
const similaritySearch = vi
.fn()
.mockResolvedValueOnce([]) // fr partition empty
.mockResolvedValueOnce([
{ pageContent: 'EN text', metadata: { regulation: 'DORA', article: 'Article 1' } },
]); // en fallback
fromExistingCollection.mockResolvedValue({ similaritySearch });

const docs = await retrieveRegulationDocs('question', 5, 'fr');

expect(similaritySearch).toHaveBeenNthCalledWith(1, 'question', 5, {
must: [{ key: 'metadata.lang', match: { value: 'fr' } }],
});
expect(similaritySearch).toHaveBeenNthCalledWith(2, 'question', 5, {
must: [{ key: 'metadata.lang', match: { value: 'en' } }],
});
expect(docs).toHaveLength(1);
});

it('returns [] when the collection is unavailable', async () => {
Expand Down Expand Up @@ -92,7 +143,9 @@ describe('retrieveRegulationDocs', () => {
await retrieveRegulationDocs('q2');
await retrieveRegulationDocs('q3');

// Store is built once and reused across calls.
expect(fromExistingCollection).toHaveBeenCalledTimes(1);
expect(similaritySearch).toHaveBeenCalledTimes(3);
// Each call: empty lang-filtered search → one unfiltered last-resort search = 2.
expect(similaritySearch).toHaveBeenCalledTimes(6);
});
});
37 changes: 37 additions & 0 deletions backend/tests/unittest/contextFormatter.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,7 @@ describe('Context Formatter', () => {
title: 'Test Doc',
content: 'Content',
url: 'https://example.com/doc',
official: true,
pageId: 'doc-123',
score: 0.8567,
// Extended metadata
Expand Down Expand Up @@ -185,6 +186,7 @@ describe('Context Formatter', () => {
title: 'Untitled',
content: 'Content',
url: '',
official: true,
pageId: null,
score: null,
// Extended metadata
Expand Down Expand Up @@ -232,6 +234,41 @@ describe('Context Formatter', () => {
const result = formatSources([]);
expect(result).toEqual([]);
});

// #424 — regulation chunks carry an explicit EUR-Lex `metadata.url` plus a
// category tag `metadata.source = 'regulation'`. The real link must win.
it('prefers metadata.url over the source category tag', () => {
const docs = [
{
pageContent: 'Regulation text',
metadata: {
documentTitle: 'DORA Article 28',
source: 'regulation',
url: 'https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX:32022R2554',
},
},
];

const result = formatSources(docs);

expect(result[0].url).toBe(
'https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX:32022R2554'
);
});

// #424 — working French translation chunks are flagged official: false so
// the UI can label them; everything else stays official by default.
it('passes through official: false and defaults to true', () => {
const docs = [
{ pageContent: 'fr', metadata: { documentTitle: 'DORA Art. 28', official: false } },
{ pageContent: 'en', metadata: { documentTitle: 'DORA Art. 28' } },
];

const result = formatSources(docs);

expect(result[0].official).toBe(false);
expect(result[1].official).toBe(true);
});
});

// ============================================================================
Expand Down
Loading
Loading