From 5aa276402cfcb514828bf1d86cc9be6755aa15fd Mon Sep 17 00:00:00 2001 From: lihaidong Date: Wed, 20 May 2026 19:06:03 +0800 Subject: [PATCH] fix(zhihu): decode numeric entities in text output --- clis/zhihu/answer-comments.js | 23 ++--------------------- clis/zhihu/answer-detail.js | 33 ++------------------------------- clis/zhihu/collection.js | 14 +------------- clis/zhihu/collection.test.js | 3 ++- clis/zhihu/question.js | 10 +--------- clis/zhihu/question.test.js | 4 ++-- clis/zhihu/search.js | 13 +------------ clis/zhihu/search.test.js | 4 ++-- clis/zhihu/text.js | 29 +++++++++++++++++++++++++++++ clis/zhihu/text.test.js | 24 ++++++++++++++++++++++++ 10 files changed, 66 insertions(+), 91 deletions(-) create mode 100644 clis/zhihu/text.js create mode 100644 clis/zhihu/text.test.js diff --git a/clis/zhihu/answer-comments.js b/clis/zhihu/answer-comments.js index b6028f99f..05864a5aa 100644 --- a/clis/zhihu/answer-comments.js +++ b/clis/zhihu/answer-comments.js @@ -1,28 +1,9 @@ import { cli, Strategy } from '@jackwener/opencli/registry'; import { ArgumentError, AuthRequiredError, CommandExecutionError, EmptyResultError } from '@jackwener/opencli/errors'; - -function decodeEntity(codePoint) { - return Number.isInteger(codePoint) && codePoint >= 0 && codePoint <= 0x10FFFF - ? String.fromCodePoint(codePoint) - : null; -} +import { stripHtml as stripHtmlText } from './text.js'; function stripHtml(html) { - if (!html) return ''; - return html - .replace(//gi, '\n') - .replace(/<\/(?:p|div|h[1-6]|li|blockquote)>/gi, '\n\n') - .replace(/<[^>]+>/g, '') - .replace(/ /g, ' ') - .replace(/</g, '<') - .replace(/>/g, '>') - .replace(/&/g, '&') - .replace(/"/g, '"') - .replace(/'/g, "'") - .replace(/&#(\d+);/g, (entity, value) => decodeEntity(Number(value)) ?? entity) - .replace(/&#x([0-9a-f]+);/gi, (entity, value) => decodeEntity(Number.parseInt(value, 16)) ?? entity) - .replace(/\n{3,}/g, '\n\n') - .trim(); + return stripHtmlText(html, { preserveBlocks: true }); } const ANSWER_ID_RE = /^\d+$/; diff --git a/clis/zhihu/answer-detail.js b/clis/zhihu/answer-detail.js index 02dd67881..4d968290c 100644 --- a/clis/zhihu/answer-detail.js +++ b/clis/zhihu/answer-detail.js @@ -1,38 +1,9 @@ import { cli, Strategy } from '@jackwener/opencli/registry'; import { ArgumentError, AuthRequiredError, CommandExecutionError, EmptyResultError } from '@jackwener/opencli/errors'; +import { stripHtml as stripHtmlText } from './text.js'; -// Light-weight HTML → text, preserving paragraph / heading / list-item -// line breaks. Zhihu answer `content` is HTML, so we map block-level -// closing tags + `
` to newlines before stripping the rest. function stripHtml(html) { - if (!html) return ''; - return html - .replace(//gi, '\n') - // Block-level closing tags become paragraph breaks (double - // newline) so the stripped text stays readable. The trailing - // `\n{3,}` collapse pass below normalizes accidental triples. - .replace(/<\/(?:p|div|h[1-6]|li|blockquote)>/gi, '\n\n') - .replace(/<[^>]+>/g, '') - .replace(/ /g, ' ') - .replace(/</g, '<') - .replace(/>/g, '>') - .replace(/&/g, '&') - .replace(/"/g, '"') - .replace(/'/g, "'") - .replace(/&#(\d+);/g, (_, value) => { - const codePoint = Number(value); - return Number.isInteger(codePoint) && codePoint >= 0 && codePoint <= 0x10FFFF - ? String.fromCodePoint(codePoint) - : _; - }) - .replace(/&#x([0-9a-f]+);/gi, (_, value) => { - const codePoint = Number.parseInt(value, 16); - return Number.isInteger(codePoint) && codePoint >= 0 && codePoint <= 0x10FFFF - ? String.fromCodePoint(codePoint) - : _; - }) - .replace(/\n{3,}/g, '\n\n') - .trim(); + return stripHtmlText(html, { preserveBlocks: true }); } const ANSWER_ID_RE = /^\d+$/; diff --git a/clis/zhihu/collection.js b/clis/zhihu/collection.js index be9dee722..bed24b5b4 100644 --- a/clis/zhihu/collection.js +++ b/clis/zhihu/collection.js @@ -1,19 +1,7 @@ import { cli, Strategy } from '@jackwener/opencli/registry'; import { ArgumentError, AuthRequiredError, CommandExecutionError, EmptyResultError } from '@jackwener/opencli/errors'; import { log } from '@jackwener/opencli/logger'; - -function stripHtml(html) { - return html - .replace(/<[^>]+>/g, '') - .replace(/ /g, ' ') - .replace(/</g, '<') - .replace(/>/g, '>') - .replace(/&/g, '&') - .replace(/"/g, '"') - .replace(//g, '') - .replace(/<\/em>/g, '') - .trim(); -} +import { stripHtml } from './text.js'; function validatePositiveInt(value, name) { const n = Number(value); diff --git a/clis/zhihu/collection.test.js b/clis/zhihu/collection.test.js index c0b100cdd..75cccb990 100644 --- a/clis/zhihu/collection.test.js +++ b/clis/zhihu/collection.test.js @@ -37,7 +37,7 @@ describe('zhihu collection', () => { question: { id: 789012, title: 'Test Question' }, author: { name: 'test_author' }, voteup_count: 42, - content: '

Test answer content

', + content: '

"Test" & answer content

', url: 'https://www.zhihu.com/question/789012/answer/123456', }, }, @@ -57,6 +57,7 @@ describe('zhihu collection', () => { title: 'Test Question', author: 'test_author', votes: 42, + excerpt: '"Test" & answer content', url: 'https://www.zhihu.com/question/789012/answer/123456', }); diff --git a/clis/zhihu/question.js b/clis/zhihu/question.js index 122a3efa4..fe78c99ee 100644 --- a/clis/zhihu/question.js +++ b/clis/zhihu/question.js @@ -1,14 +1,6 @@ import { cli, Strategy } from '@jackwener/opencli/registry'; import { AuthRequiredError, CliError } from '@jackwener/opencli/errors'; -function stripHtml(html) { - return html - .replace(/<[^>]+>/g, '') - .replace(/ /g, ' ') - .replace(/</g, '<') - .replace(/>/g, '>') - .replace(/&/g, '&') - .trim(); -} +import { stripHtml } from './text.js'; function answerIdFromUrl(url) { if (typeof url !== 'string') return ''; diff --git a/clis/zhihu/question.test.js b/clis/zhihu/question.test.js index 57c8f7401..f42feb1cb 100644 --- a/clis/zhihu/question.test.js +++ b/clis/zhihu/question.test.js @@ -20,7 +20,7 @@ describe('zhihu question', () => { id: '2036567240334653053', author: { name: 'alice' }, voteup_count: 12, - content: 'Hello Zhihu', + content: '

"Hello" & Zhihu

', }, ], }; @@ -33,7 +33,7 @@ describe('zhihu question', () => { author: 'alice', votes: 12, url: 'https://www.zhihu.com/question/2021881398772981878/answer/2036567240334653053', - content: 'Hello Zhihu', + content: '"Hello" & Zhihu', }, ]); expect(goto).toHaveBeenCalledWith('https://www.zhihu.com/question/2021881398772981878'); diff --git a/clis/zhihu/search.js b/clis/zhihu/search.js index 56ac187d4..a58a5e4d0 100644 --- a/clis/zhihu/search.js +++ b/clis/zhihu/search.js @@ -1,17 +1,6 @@ import { cli, Strategy } from '@jackwener/opencli/registry'; import { ArgumentError, AuthRequiredError, CommandExecutionError, EmptyResultError } from '@jackwener/opencli/errors'; - -function stripHtml(html) { - return (html || '') - .replace(/<[^>]+>/g, '') - .replace(/ /g, ' ') - .replace(/</g, '<') - .replace(/>/g, '>') - .replace(/&/g, '&') - .replace(//g, '') - .replace(/<\/em>/g, '') - .trim(); -} +import { stripHtml } from './text.js'; function itemKey(item) { const obj = item.object || {}; diff --git a/clis/zhihu/search.test.js b/clis/zhihu/search.test.js index 6e8c85705..0ee594cfb 100644 --- a/clis/zhihu/search.test.js +++ b/clis/zhihu/search.test.js @@ -36,7 +36,7 @@ describe('zhihu search', () => { type: 'answer', author: { name: 'alice' }, voteup_count: 12, - question: { id: 'q1', name: 'Codex question' }, + question: { id: 'q1', name: 'Codex "question"' }, }, }, { @@ -57,7 +57,7 @@ describe('zhihu search', () => { await expect(cmd.func(page, { query: 'codex', limit: 2 })).resolves.toEqual([ { rank: 1, - title: 'Codex question', + title: 'Codex "question"', type: 'answer', author: 'alice', votes: 12, diff --git a/clis/zhihu/text.js b/clis/zhihu/text.js new file mode 100644 index 000000000..7b8f19d97 --- /dev/null +++ b/clis/zhihu/text.js @@ -0,0 +1,29 @@ +function decodeEntity(codePoint) { + return Number.isInteger(codePoint) && codePoint >= 0 && codePoint <= 0x10FFFF + ? String.fromCodePoint(codePoint) + : null; +} + +export function stripHtml(html, { preserveBlocks = false } = {}) { + if (!html) return ''; + let text = String(html); + if (preserveBlocks) { + text = text + .replace(//gi, '\n') + .replace(/<\/(?:p|div|h[1-6]|li|blockquote)>/gi, '\n\n'); + } + return text + .replace(/<[^>]+>/g, '') + .replace(/ /g, ' ') + .replace(/</g, '<') + .replace(/>/g, '>') + .replace(/&/g, '&') + .replace(/"/g, '"') + .replace(/'/g, "'") + .replace(/&#(\d+);/g, (entity, value) => decodeEntity(Number(value)) ?? entity) + .replace(/&#x([0-9a-f]+);/gi, (entity, value) => decodeEntity(Number.parseInt(value, 16)) ?? entity) + .replace(/\n{3,}/g, '\n\n') + .trim(); +} + +export const __test__ = { decodeEntity }; diff --git a/clis/zhihu/text.test.js b/clis/zhihu/text.test.js new file mode 100644 index 000000000..5f238e836 --- /dev/null +++ b/clis/zhihu/text.test.js @@ -0,0 +1,24 @@ +import { describe, expect, it } from 'vitest'; +import { stripHtml } from './text.js'; + +describe('zhihu text helpers', () => { + it('strips tags and decodes named entities in flat mode', () => { + expect(stripHtml('Codex & <CLI>')).toBe('Codex & '); + }); + + it('decodes decimal and hexadecimal numeric entities', () => { + expect(stripHtml('"中文" & 'test'')).toBe('"中文" & \'test\''); + }); + + it('keeps invalid numeric entities unchanged', () => { + expect(stripHtml('bad � entity')).toBe('bad � entity'); + }); + + it('keeps list excerpts flat by default', () => { + expect(stripHtml('

first


second

')).toBe('firstsecond'); + }); + + it('preserves block breaks when requested', () => { + expect(stripHtml('

first


second

', { preserveBlocks: true })).toBe('first\n\nsecond'); + }); +});