From 822a618a75071f3cb0befe20167e19e98a6b13d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B4=AA=E6=B3=BD=E9=91=AB?= Date: Tue, 24 Feb 2026 19:17:53 +0800 Subject: [PATCH] fix(crawler,content): harden XHS search + utf8mb4 Prevent unhandled waitForResponse rejections in XHS search flow and migrate content text tables to utf8mb4 to accept emojis. Co-authored-by: Cursor --- .../resources/db/migration/V7__utf8mb4.sql | 7 ++++ backend-CHEK-crawler/src/platform/xhs.mjs | 42 +++++++++++++------ 2 files changed, 37 insertions(+), 12 deletions(-) create mode 100644 backend-CHEK-content/src/main/resources/db/migration/V7__utf8mb4.sql diff --git a/backend-CHEK-content/src/main/resources/db/migration/V7__utf8mb4.sql b/backend-CHEK-content/src/main/resources/db/migration/V7__utf8mb4.sql new file mode 100644 index 0000000..5b9756c --- /dev/null +++ b/backend-CHEK-content/src/main/resources/db/migration/V7__utf8mb4.sql @@ -0,0 +1,7 @@ +-- CHEK Content Service: ensure emoji-safe utf8mb4 for text columns + +ALTER TABLE chek_content_tag CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci; +ALTER TABLE chek_content_wiki_entry CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci; +ALTER TABLE chek_content_post CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci; +ALTER TABLE chek_content_comment CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci; + diff --git a/backend-CHEK-crawler/src/platform/xhs.mjs b/backend-CHEK-crawler/src/platform/xhs.mjs index 5233644..b9c5f47 100644 --- a/backend-CHEK-crawler/src/platform/xhs.mjs +++ b/backend-CHEK-crawler/src/platform/xhs.mjs @@ -21,22 +21,40 @@ function extractNoteIdFromUrl(url) { async function collectSearchResults(page, keyword, maxLinks, log) { const url = buildSearchUrl(keyword); - const respPromise = page.waitForResponse( - (r) => r.url().includes('/api/sns/web/v1/search/notes') && r.request().method() === 'POST', - { timeout: 20_000 } - ); - - await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 60_000 }); - await page.waitForTimeout(1200); + const respPromise = page + .waitForResponse( + (r) => r.url().includes('/api/sns/web/v1/search/notes') && r.request().method() === 'POST', + { timeout: 20_000 } + ) + .catch((e) => { + log({ + level: 'warn', + msg: 'xhs_search_api_wait_failed', + keyword, + url, + error: String(e?.message || e || ''), + }); + return null; + }); - let j = null; try { - const resp = await respPromise; - j = await resp.json(); - } catch { - j = null; + await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 60_000 }); + await page.waitForTimeout(1200); + } catch (e) { + log({ + level: 'warn', + msg: 'xhs_search_nav_failed', + keyword, + url, + error: String(e?.message || e || ''), + }); + return []; } + let j = null; + const resp = await respPromise; + if (resp) j = await resp.json().catch(() => null); + const items = Array.isArray(j?.data?.items) ? j.data.items : []; const normalized = items .map((it) => {