Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
6a7ad05
Add the data base for the user_cv by creating a new entity
eregine Mar 13, 2026
3edd889
Modify some files to respect prettier style
eregine Mar 13, 2026
aa5925d
Add the route to upload the user's cv from the frontend
eregine Mar 13, 2026
1a7c960
Add the extraction of the user's information on the cv and send it to…
eregine Apr 2, 2026
3f660e1
Merge the stagging branch
eregine Apr 2, 2026
049c630
Add the start of changing the IA for sorting the user CV
eregine Apr 11, 2026
9f4639a
add the package-lock of all the project
eregine Apr 11, 2026
7bdb061
Add the end of the upload user's cv feature
eregine May 15, 2026
66ccf9e
Merge the stagging branch
eregine May 15, 2026
5d51a34
Add some change for the coding style
eregine May 15, 2026
b797ea0
Add the extraction of the job offer from linkedin, using axios and puppeteer
eregine May 15, 2026
921da14
Add more structure to the code for the job offer scrapping
eregine May 16, 2026
b56d8ae
Cleaning the code with prettier
eregine May 16, 2026
585f60c
Merge branch 'staging' of github.com:Tugduoff/TalkUp.AI into 124-extr…
eregine May 16, 2026
b76af56
Add new unity test in the server for the job offer
eregine May 31, 2026
618669a
Add some unity test for the upload cv part
eregine May 31, 2026
bdfd100
fix: guard SSRF, drop dead multer, restore coverage gate on cv/job-of…
BhuvanArn Jun 21, 2026
926afe8
refactor: align cv/job-offer upload with controller conventions
BhuvanArn Jun 21, 2026
aaa0dcc
Merge staging into 122-extract-user-info-from-cv; fix Groq boot crash
BhuvanArn Jun 21, 2026
122c14d
fix: close SSRF redirect bypass and drop dead deps on cv/job-offer up…
BhuvanArn Jun 23, 2026
0efbdc7
refactor: drop puppeteer from job-offer scraping
BhuvanArn Jun 23, 2026
1a3f1b3
fix: close IPv6 SSRF bypass and enforce one-per-user cv/job-offer rows
BhuvanArn Jun 23, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
725 changes: 461 additions & 264 deletions package-lock.json

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions server/.env.example
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,7 @@ FRONTEND_URL=http://localhost:5173
# AI C++ server (ai/docker-compose, port 8088). In Docker on Linux, use host.docker.internal
# with extra_hosts in docker-compose.yml, or set e.g. http://172.17.0.1:8088
# AI_SERVER_URL=http://host.docker.internal:8088

# Required — Groq API key for CV / job-offer extraction (users uploadCV & uploadJobOffer).
# Without it those routes fail at runtime. Get one at https://console.groq.com/keys
GROQ_API_KEY=
5 changes: 4 additions & 1 deletion server/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -37,14 +37,17 @@
"@types/bcrypt": "^5.0.2",
"@types/cookie-parser": "^1.4.10",
"@types/express": "^4.17.17",
"axios": "^1.9.0",
"axios": "^1.16.1",
"bcrypt": "^6.0.0",
"cheerio": "^1.2.0",
"class-transformer": "^0.5.1",
"class-validator": "^0.14.1",
"cookie-parser": "^1.4.7",
"dotenv": "^16.5.0",
"groq-sdk": "^1.2.0",
"ioredis": "^5.10.1",
"nodemailer": "^8.0.2",
"pdf-parse-debugging-disabled": "^1.1.1",
"pg": "^8.15.6",
"reflect-metadata": "^0.2.0",
"rxjs": "^7.8.1",
Expand Down
107 changes: 107 additions & 0 deletions server/src/common/utils/JobOfferExtraction.spec.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
import axios from "axios";

import { scrapeLinkedin, scrapeAxios } from "./JobOfferExtraction";

// Swap the real axios module for Jest's automock so no test performs a real
// HTTP request.
jest.mock("axios");

// Typed handle on the mocked module: lets the tests call the jest mock helpers
// (mockResolvedValueOnce, mockRejectedValueOnce, ...) on `axios.get`.
const mockedAxios = axios as jest.Mocked<typeof axios>;

// scrapeLinkedin waits `attempt * 2000ms` between retries. The whole suite
// runs on fake timers so that backoff never costs real wall-clock time.
jest.useFakeTimers();

/**
 * Runs a scraper to completion under fake timers.
 *
 * The scraper is started first, then every queued timer is drained while its
 * asynchronous work settles, and finally the scraper's own promise is handed
 * back to the caller.
 *
 * @param start Factory that kicks off the scraper and returns its promise.
 * @returns The text the scraper resolves with.
 */
async function runScraper(start: () => Promise<string>): Promise<string> {
  const pending = start();
  await jest.runAllTimersAsync();
  return pending;
}

// Minimal LinkedIn guest-API fixture: it carries the CSS classes that
// scrapeLinkedin queries (title, company, location, description and two
// job-criteria items).
const LINKEDIN_HTML = `
<h1 class="top-card-layout__title">Senior Backend Engineer</h1>
<a class="topcard__org-name-link">TechCorp</a>
<span class="topcard__flavor--bullet">Paris, France</span>
<div class="show-more-less-html__markup">We build distributed systems and we are hiring engineers to grow the platform team across Europe.</div>
<ul>
<li class="description__job-criteria-item"><h3>Employment type</h3><span>Full-time</span></li>
<li class="description__job-criteria-item"><h3>Seniority level</h3><span>Senior</span></li>
</ul>
`;

// Body text long enough to clear scrapeAxios' 300-character minimum
// (16 characters repeated 40 times = 640 characters).
const longBody = "Job description ".repeat(40); // > 300 chars
// Generic page fixture: the <nav> element is expected to be stripped, the <p>
// content is expected to survive.
const GENERIC_HTML = `<html><body><nav>nav</nav><p>${longBody}</p></body></html>`;

// Wipe recorded calls/results on every mock between tests. Note that
// clearAllMocks does not remove implementations, so a non-"Once" stub such as
// mockResolvedValue stays installed for later tests.
afterEach(function clearMockHistory(): void {
  jest.clearAllMocks();
});

describe("scrapeLinkedin", () => {
  // A job URL whose numeric id the scraper can extract.
  const VIEW_URL = "https://www.linkedin.com/jobs/view/1234567890";

  // Runs scrapeLinkedin for `url` with the fake-timer backoff drained.
  const scrape = (url: string): Promise<string> =>
    runScraper(() => scrapeLinkedin(url));

  it("extracts structured text from the guest API on first try", async () => {
    mockedAxios.get.mockResolvedValueOnce({ data: LINKEDIN_HTML });

    const result = await scrape(VIEW_URL);

    expect(result).toContain("Senior Backend Engineer");
    expect(result).toContain("TechCorp");
    expect(mockedAxios.get).toHaveBeenCalledTimes(1);
  });

  it("returns empty when the URL has no extractable job id", async () => {
    // "abc" is not a numeric id → the scraper cannot build the guest API URL
    // and ends with an empty result.
    const result = await scrape("https://www.linkedin.com/jobs/view/abc");

    expect(result).toBe("");
  });

  it("retries on failure then succeeds", async () => {
    // First request times out, second one returns the fixture.
    mockedAxios.get
      .mockRejectedValueOnce({ code: "ETIMEDOUT" })
      .mockResolvedValueOnce({ data: LINKEDIN_HTML });

    const result = await scrape(VIEW_URL);

    expect(result).toContain("Senior Backend Engineer");
    expect(mockedAxios.get).toHaveBeenCalledTimes(2);
  });

  it("returns empty when content is too short", async () => {
    // Every attempt gets the same near-empty page, so all retries fail.
    mockedAxios.get.mockResolvedValue({ data: "<h1></h1>" });

    const result = await scrape(VIEW_URL);

    expect(result).toBe("");
  });
});

describe("scrapeAxios", () => {
  // Any non-LinkedIn job page URL; the response itself is always mocked.
  const PAGE_URL = "https://example.com/job/1";

  it("extracts body text from a simple page", async () => {
    mockedAxios.get.mockResolvedValueOnce({ data: GENERIC_HTML });

    const result = await scrapeAxios(PAGE_URL);

    // The paragraph survives, the <nav> element's text does not.
    expect(result).toContain("Job description");
    expect(result).not.toContain("nav");
  });

  it("returns empty when the page text is too short", async () => {
    mockedAxios.get.mockResolvedValueOnce({ data: "<body>hi</body>" });

    const result = await scrapeAxios(PAGE_URL);

    expect(result).toBe("");
  });

  it("returns empty on request error", async () => {
    mockedAxios.get.mockRejectedValueOnce({ code: "ECONNREFUSED" });

    const result = await scrapeAxios(PAGE_URL);

    expect(result).toBe("");
  });
});
159 changes: 159 additions & 0 deletions server/src/common/utils/JobOfferExtraction.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
import * as cheerio from "cheerio";
import { Logger } from "@nestjs/common";

import { safeAxiosGet } from "./urlGuard";

// Module-level Nest logger; "JobOfferExtraction" is the context name passed to it.
const logger = new Logger("JobOfferExtraction");

// Anchored to where LinkedIn actually puts the job id: /jobs/view/<id>,
// ?currentJobId=<id>, or the trailing -<id> of a view slug. A bare /(\d{8,})/
// would grab the first long digit run anywhere, so a tracking param or a
// timestamp could win over the real id.
const LINKEDIN_JOB_ID_PATTERN =
  /(?:jobs\/view\/|currentJobId=)(\d+)|-(\d{8,})(?:[/?#]|$)/;

// Browser user agents rotated across attempts (one per attempt index).
const LINKEDIN_USER_AGENTS = [
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
  "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
];

/** Returns the LinkedIn job id found in `url`, or `undefined` when there is none. */
const extractLinkedinJobId = (url: string): string | undefined => {
  const match = LINKEDIN_JOB_ID_PATTERN.exec(url);
  return match?.[1] ?? match?.[2];
};

/**
 * Turns the guest-API HTML of a job posting into one whitespace-normalised
 * line of labelled fields ("Job Title: ... Company: ... Description: ...").
 * The parameter type mirrors whatever `cheerio.load` accepts.
 */
const formatLinkedinJobPosting = (
  html: Parameters<typeof cheerio.load>[0],
): string => {
  const $ = cheerio.load(html);

  const jobTitle = $("h2.top-card-layout__title, h1.top-card-layout__title")
    .text()
    .trim();
  const companyName = $("a.topcard__org-name-link, span.topcard__org-name-link")
    .text()
    .trim();
  const location = $("span.topcard__flavor--bullet").first().text().trim();
  const description = $("div.show-more-less-html__markup")
    .text()
    .replace(/\s+/g, " ")
    .trim();

  // Criteria items are label (<h3>) / value (<span>) pairs.
  const criteria: Record<string, string> = {};
  $("li.description__job-criteria-item").each((_, el) => {
    const label = $(el).find("h3").text().trim();
    const value = $(el).find("span").text().trim();
    if (label && value) criteria[label] = value;
  });

  // Labels are looked up in French first, then in English.
  return [
    `Job Title: ${jobTitle}`,
    `Company: ${companyName}`,
    `Location: ${location}`,
    `Contract Type: ${criteria["Type de poste"] || criteria["Employment type"] || ""}`,
    `Seniority Level: ${criteria["Niveau hiérarchique"] || criteria["Seniority level"] || ""}`,
    `Industry: ${criteria["Secteur"] || criteria["Industries"] || ""}`,
    `Job Function: ${criteria["Fonction"] || criteria["Job function"] || ""}`,
    `Description: ${description}`,
  ]
    .join(" ")
    .replace(/\s+/g, " ")
    .trim();
};

/**
 * Scrapes a LinkedIn job posting through the public guest API.
 *
 * The job id is extracted from `url` once; the guest endpoint is then requested
 * up to three times, rotating the user agent and waiting `attempt * 2000ms`
 * before each retry. Failures are logged at debug level and never thrown.
 *
 * @param url LinkedIn job URL (`/jobs/view/<id>`, `?currentJobId=<id>` or a
 *   view slug ending in `-<id>`).
 * @returns The labelled, single-line job text, or `""` when the URL carries no
 *   job id or every attempt failed.
 */
export const scrapeLinkedin = async (url: string): Promise<string> => {
  const maxRetries = 3;

  // A URL without a job id can never succeed: bail out immediately instead of
  // burning the full retry backoff on a deterministic failure.
  const jobId = extractLinkedinJobId(url);
  if (!jobId) {
    logger.debug(
      "Could not extract LinkedIn job ID from URL, falling through to next strategy...",
    );
    return "";
  }

  const guestApiUrl = `https://www.linkedin.com/jobs-guest/jobs/api/jobPosting/${jobId}`;

  for (let attempt = 0; attempt < maxRetries; attempt++) {
    try {
      // Growing delay before each attempt: 0ms, 2000ms, 4000ms.
      if (attempt > 0) {
        const delay = attempt * 2000;
        logger.debug(
          `LinkedIn retry ${attempt}/${maxRetries - 1} - waiting ${delay}ms...`,
        );
        await new Promise((resolve) => setTimeout(resolve, delay));
      }

      const response = await safeAxiosGet(guestApiUrl, {
        headers: {
          "User-Agent":
            LINKEDIN_USER_AGENTS[attempt % LINKEDIN_USER_AGENTS.length],
          Accept:
            "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
          "Accept-Language": "fr-FR,fr;q=0.9,en-US;q=0.8",
          Referer: "https://www.linkedin.com/",
        },
        timeout: 15000,
      });

      const extracted = formatLinkedinJobPosting(response.data);

      // NOTE(review): the field labels alone account for about 98 characters,
      // so this threshold only rejects pages where almost nothing matched.
      if (extracted.length < 100) {
        throw new Error("Extracted content too short");
      }

      logger.debug(`Strategy LinkedIn succeeded on attempt ${attempt + 1}`);
      return extracted;
    } catch (err) {
      logger.debug(
        `LinkedIn attempt ${attempt + 1} failed: ${(err as { code?: string })?.code || err}`,
      );
    }
  }

  logger.debug(
    "All LinkedIn attempts failed, falling through to next strategy...",
  );
  return "";
};

// ─── AXIOS (sites simples) ───────────────────────────────────────────────────
/**
 * Generic scraper for simple pages: one GET with browser-like headers, then the
 * visible text of `<body>` with whitespace collapsed.
 *
 * Request errors are logged at debug level and never thrown.
 *
 * @param url Page to fetch.
 * @returns The page text when it is at least 300 characters long, otherwise `""`
 *   (also `""` when the request fails).
 */
export const scrapeAxios = async (url: string): Promise<string> => {
  try {
    const { data } = await safeAxiosGet(
      url,
      {
        headers: {
          "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
          Accept:
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
          "Accept-Language": "fr-FR,fr;q=0.9,en-US;q=0.8,en;q=0.7",
          "Accept-Encoding": "gzip, deflate, br",
          Connection: "keep-alive",
          "Upgrade-Insecure-Requests": "1",
          "Sec-Fetch-Dest": "document",
          "Sec-Fetch-Mode": "navigate",
          "Sec-Fetch-Site": "none",
          "Cache-Control": "max-age=0",
        },
        timeout: 10000,
      },
      // NOTE(review): presumably the redirect limit — confirm against urlGuard.
      5,
    );

    const $ = cheerio.load(data);

    // Drop markup that never carries the offer itself before reading the text.
    $(
      "script, style, nav, footer, header, iframe, noscript, [aria-hidden='true']",
    ).remove();

    const bodyText = $("body").text().replace(/\s+/g, " ").trim();

    // Too little text to be a usable job offer: report nothing.
    if (bodyText.length < 300) {
      return "";
    }

    logger.debug("Strategy Axios succeeded");
    return bodyText;
  } catch (err) {
    logger.debug(
      `Strategy Axios failed: ${(err as { code?: string })?.code || err}`,
    );
    return "";
  }
};
Loading
Loading