BhuvanArn · BhuvanArn · Jun 24, 2026 · Mar 13, 2026 · Mar 13, 2026 · Mar 13, 2026
diff --git a/package-lock.json b/package-lock.json
diff --git a/server/.env.example b/server/.env.example
@@ -15,3 +15,7 @@ FRONTEND_URL=http://localhost:5173
 # AI C++ server (ai/docker-compose, port 8088). In Docker on Linux, use host.docker.internal
 # with extra_hosts in docker-compose.yml, or set e.g. http://172.17.0.1:8088
 # AI_SERVER_URL=http://host.docker.internal:8088
+
+# Required — Groq API key for CV / job-offer extraction (users uploadCV & uploadJobOffer).
+# Without it those routes fail at runtime. Get one at https://console.groq.com/keys
+GROQ_API_KEY=
diff --git a/server/package.json b/server/package.json
@@ -37,14 +37,17 @@
     "@types/bcrypt": "^5.0.2",
     "@types/cookie-parser": "^1.4.10",
     "@types/express": "^4.17.17",
-    "axios": "^1.9.0",
+    "axios": "^1.16.1",
     "bcrypt": "^6.0.0",
+    "cheerio": "^1.2.0",
     "class-transformer": "^0.5.1",
     "class-validator": "^0.14.1",
     "cookie-parser": "^1.4.7",
     "dotenv": "^16.5.0",
+    "groq-sdk": "^1.2.0",
     "ioredis": "^5.10.1",
     "nodemailer": "^8.0.2",
+    "pdf-parse-debugging-disabled": "^1.1.1",
     "pg": "^8.15.6",
     "reflect-metadata": "^0.2.0",
     "rxjs": "^7.8.1",

diff --git a/server/src/common/utils/JobOfferExtraction.spec.ts b/server/src/common/utils/JobOfferExtraction.spec.ts
@@ -0,0 +1,107 @@
+import axios from "axios";
+
+import { scrapeLinkedin, scrapeAxios } from "./JobOfferExtraction";
+
+jest.mock("axios");
+
+const mockedAxios = axios as jest.Mocked<typeof axios>;
+
+// scrapeLinkedin waits `attempt * 2000` ms between retries. Installing fake
+// timers lets a test flush those waits immediately instead of really
+// sleeping.
+jest.useFakeTimers();
+
+/**
+ * Kicks off a scraper, drains every queued (fake) timer while its async work
+ * settles, then resolves with whatever the scraper returned.
+ */
+async function runScraper(start: () => Promise<string>): Promise<string> {
+  const pending = start();
+  await jest.runAllTimersAsync();
+  return pending;
+}
+
+const LINKEDIN_HTML = `
+  <h1 class="top-card-layout__title">Senior Backend Engineer</h1>
+  <a class="topcard__org-name-link">TechCorp</a>
+  <span class="topcard__flavor--bullet">Paris, France</span>
+  <div class="show-more-less-html__markup">We build distributed systems and we are hiring engineers to grow the platform team across Europe.</div>
+  <ul>
+    <li class="description__job-criteria-item"><h3>Employment type</h3><span>Full-time</span></li>
+    <li class="description__job-criteria-item"><h3>Seniority level</h3><span>Senior</span></li>
+  </ul>
+`;
+
+const longBody = "Job description ".repeat(40); // > 300 chars
+const GENERIC_HTML = `<html><body><nav>nav</nav><p>${longBody}</p></body></html>`;
+
+// Reset (not just clear) the mocks between tests: clearAllMocks only wipes
+// call history, so a persistent stub such as `mockResolvedValue(...)` would
+// stay installed on axios.get and leak into the following tests.
+afterEach(() => {
+  jest.resetAllMocks();
+});
+
+describe("scrapeLinkedin", () => {
+  // View URL whose numeric id sits directly after /jobs/view/.
+  const JOB_URL = "https://www.linkedin.com/jobs/view/1234567890";
+
+  it("extracts structured text from the guest API on first try", async () => {
+    mockedAxios.get.mockResolvedValueOnce({ data: LINKEDIN_HTML });
+
+    const text = await runScraper(() => scrapeLinkedin(JOB_URL));
+
+    expect(text).toContain("Senior Backend Engineer");
+    expect(text).toContain("TechCorp");
+    expect(mockedAxios.get).toHaveBeenCalledTimes(1);
+  });
+
+  it("returns empty when the URL has no extractable job id", async () => {
+    // The URL contains no numeric job id at all, so there is nothing to
+    // fetch: the result is empty and no HTTP request is ever issued.
+    const text = await runScraper(() =>
+      scrapeLinkedin("https://www.linkedin.com/jobs/view/abc"),
+    );
+
+    expect(text).toBe("");
+    expect(mockedAxios.get).not.toHaveBeenCalled();
+  });
+
+  it("retries on failure then succeeds", async () => {
+    mockedAxios.get
+      .mockRejectedValueOnce({ code: "ETIMEDOUT" })
+      .mockResolvedValueOnce({ data: LINKEDIN_HTML });
+
+    const text = await runScraper(() => scrapeLinkedin(JOB_URL));
+
+    expect(text).toContain("Senior Backend Engineer");
+    expect(mockedAxios.get).toHaveBeenCalledTimes(2);
+  });
+
+  it("returns empty when content is too short", async () => {
+    // One near-empty page per attempt. Using *Once stubs keeps this test
+    // self-contained: nothing stays installed on axios.get afterwards.
+    const emptyPage = { data: "<h1></h1>" };
+    mockedAxios.get
+      .mockResolvedValueOnce(emptyPage)
+      .mockResolvedValueOnce(emptyPage)
+      .mockResolvedValueOnce(emptyPage);
+
+    const text = await runScraper(() => scrapeLinkedin(JOB_URL));
+
+    expect(text).toBe("");
+    // Every one of the three attempts must have been made before giving up.
+    expect(mockedAxios.get).toHaveBeenCalledTimes(3);
+  });
+});
+
+describe("scrapeAxios", () => {
+  // Any non-LinkedIn URL will do; the HTTP layer is mocked.
+  const PAGE_URL = "https://example.com/job/1";
+
+  it("extracts body text from a simple page", async () => {
+    mockedAxios.get.mockResolvedValueOnce({ data: GENERIC_HTML });
+
+    const scraped = await scrapeAxios(PAGE_URL);
+
+    // The <p> content survives while the <nav> element is stripped.
+    expect(scraped).toContain("Job description");
+    expect(scraped).not.toContain("nav");
+  });
+
+  it("returns empty when the page text is too short", async () => {
+    mockedAxios.get.mockResolvedValueOnce({ data: "<body>hi</body>" });
+
+    const scraped = await scrapeAxios(PAGE_URL);
+
+    expect(scraped).toBe("");
+  });
+
+  it("returns empty on request error", async () => {
+    mockedAxios.get.mockRejectedValueOnce({ code: "ECONNREFUSED" });
+
+    const scraped = await scrapeAxios(PAGE_URL);
+
+    expect(scraped).toBe("");
+  });
+});
diff --git a/server/src/common/utils/JobOfferExtraction.ts b/server/src/common/utils/JobOfferExtraction.ts
@@ -0,0 +1,159 @@
+import * as cheerio from "cheerio";
+import { Logger } from "@nestjs/common";
+
+import { safeAxiosGet } from "./urlGuard";
+
+const logger = new Logger("JobOfferExtraction");
+
+/**
+ * Fetches a LinkedIn job posting through the guest job-posting endpoint and
+ * flattens it into one line of labelled text (title, company, location,
+ * criteria, description).
+ *
+ * The job id is parsed from `url` once, before any request. A URL without a
+ * recognisable id can never succeed, so "" is returned immediately instead of
+ * sleeping through the retry backoff. Request/parsing failures are retried:
+ * up to three attempts, waiting 0 ms, 2000 ms, then 4000 ms, with a different
+ * User-Agent on each attempt.
+ *
+ * @param url LinkedIn job URL: `/jobs/view/<id>`, `?currentJobId=<id>`, or a
+ *   view slug ending in `-<id>`.
+ * @returns The extracted text, or "" when no id can be parsed or every attempt
+ *   fails. Errors are logged at debug level and never propagated.
+ */
+export const scrapeLinkedin = async (url: string): Promise<string> => {
+  const maxRetries = 3;
+
+  const userAgents = [
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
+    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
+  ];
+
+  // Anchor to where LinkedIn actually puts the job id: /jobs/view/<id>,
+  // ?currentJobId=<id>, or the trailing -<id> of a view slug. A bare
+  // /(\d{8,})/ would grab the first long digit run anywhere — a tracking
+  // param or timestamp could win over the real id.
+  // The typeof guard keeps the no-throw contract for non-string input.
+  const jobIdMatch =
+    typeof url === "string"
+      ? url.match(/(?:jobs\/view\/|currentJobId=)(\d+)|-(\d{8,})(?:[/?#]|$)/)
+      : null;
+  const jobId = jobIdMatch?.[1] ?? jobIdMatch?.[2];
+  if (!jobId) {
+    // Deterministic failure: retrying cannot help, so skip the backoff.
+    logger.debug(
+      "Could not extract LinkedIn job ID from URL, falling through to next strategy...",
+    );
+    return "";
+  }
+
+  const guestApiUrl = `https://www.linkedin.com/jobs-guest/jobs/api/jobPosting/${jobId}`;
+
+  for (let attempt = 0; attempt < maxRetries; attempt++) {
+    try {
+      // Growing delay before each attempt: 0 ms, 2000 ms, 4000 ms.
+      if (attempt > 0) {
+        const delay = attempt * 2000;
+        logger.debug(
+          `LinkedIn retry ${attempt}/${maxRetries - 1} - waiting ${delay}ms...`,
+        );
+        await new Promise((resolve) => setTimeout(resolve, delay));
+      }
+
+      const response = await safeAxiosGet(guestApiUrl, {
+        headers: {
+          // Rotate the User-Agent so each retry looks like a different browser.
+          "User-Agent": userAgents[attempt % userAgents.length],
+          Accept:
+            "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+          "Accept-Language": "fr-FR,fr;q=0.9,en-US;q=0.8",
+          Referer: "https://www.linkedin.com/",
+        },
+        timeout: 15000,
+      });
+
+      const temp = cheerio.load(response.data);
+
+      const jobTitle = temp(
+        "h2.top-card-layout__title, h1.top-card-layout__title",
+      )
+        .text()
+        .trim();
+      const companyName = temp(
+        "a.topcard__org-name-link, span.topcard__org-name-link",
+      )
+        .text()
+        .trim();
+      const location = temp("span.topcard__flavor--bullet")
+        .first()
+        .text()
+        .trim();
+      const description = temp("div.show-more-less-html__markup")
+        .text()
+        .replace(/\s+/g, " ")
+        .trim();
+
+      // Criteria list items are label (<h3>) / value (<span>) pairs; labels are
+      // looked up below under both their French and English spellings.
+      const criteria: Record<string, string> = {};
+      temp("li.description__job-criteria-item").each((_, el) => {
+        const label = temp(el).find("h3").text().trim();
+        const value = temp(el).find("span").text().trim();
+        if (label && value) criteria[label] = value;
+      });
+
+      const extracted = [
+        `Job Title: ${jobTitle}`,
+        `Company: ${companyName}`,
+        `Location: ${location}`,
+        `Contract Type: ${criteria["Type de poste"] || criteria["Employment type"] || ""}`,
+        `Seniority Level: ${criteria["Niveau hiérarchique"] || criteria["Seniority level"] || ""}`,
+        `Industry: ${criteria["Secteur"] || criteria["Industries"] || ""}`,
+        `Job Function: ${criteria["Fonction"] || criteria["Job function"] || ""}`,
+        `Description: ${description}`,
+      ]
+        .join(" ")
+        .replace(/\s+/g, " ")
+        .trim();
+
+      // NOTE(review): the fixed labels alone are 98 characters, so this
+      // threshold only rejects pages where almost nothing matched — confirm
+      // that is the intended strictness.
+      if (extracted.length < 100) {
+        throw new Error("Extracted content too short");
+      }
+
+      logger.debug(`Strategy LinkedIn succeeded on attempt ${attempt + 1}`);
+      return extracted;
+    } catch (err) {
+      logger.debug(
+        `LinkedIn attempt ${attempt + 1} failed: ${(err as { code?: string })?.code || err}`,
+      );
+    }
+  }
+
+  logger.debug(
+    "All LinkedIn attempts failed, falling through to next strategy...",
+  );
+  return "";
+};
+
+// ─── AXIOS (simple sites) ────────────────────────────────────────────────────
+/**
+ * Generic fallback scraper: downloads `url` with browser-like headers, drops
+ * non-content elements (scripts, styles, nav, header, footer, iframes,
+ * noscript, aria-hidden nodes) and returns the whitespace-collapsed body text.
+ *
+ * @param url Page to fetch.
+ * @returns The body text when it is at least 300 characters long, otherwise
+ *   "". Request and parsing errors are logged at debug level and yield "".
+ */
+export const scrapeAxios = async (url: string): Promise<string> => {
+  const browserHeaders = {
+    "User-Agent":
+      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+    Accept:
+      "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
+    "Accept-Language": "fr-FR,fr;q=0.9,en-US;q=0.8,en;q=0.7",
+    "Accept-Encoding": "gzip, deflate, br",
+    Connection: "keep-alive",
+    "Upgrade-Insecure-Requests": "1",
+    "Sec-Fetch-Dest": "document",
+    "Sec-Fetch-Mode": "navigate",
+    "Sec-Fetch-Site": "none",
+    "Cache-Control": "max-age=0",
+  };
+
+  try {
+    // NOTE(review): the trailing 5 is presumably a redirect limit — confirm
+    // against safeAxiosGet in ./urlGuard.
+    const { data } = await safeAxiosGet(
+      url,
+      { headers: browserHeaders, timeout: 10000 },
+      5,
+    );
+
+    const $ = cheerio.load(data);
+    $(
+      "script, style, nav, footer, header, iframe, noscript, [aria-hidden='true']",
+    ).remove();
+    const bodyText = $("body").text().replace(/\s+/g, " ").trim();
+
+    // Pages yielding under 300 characters are treated as having no usable text.
+    if (bodyText.length < 300) {
+      return "";
+    }
+
+    logger.debug("Strategy Axios succeeded");
+    return bodyText;
+  } catch (err) {
+    logger.debug(
+      `Strategy Axios failed: ${(err as { code?: string })?.code || err}`,
+    );
+    return "";
+  }
+};