Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 2 additions & 21 deletions clis/zhihu/answer-comments.js
Original file line number Diff line number Diff line change
@@ -1,28 +1,9 @@
import { cli, Strategy } from '@jackwener/opencli/registry';
import { ArgumentError, AuthRequiredError, CommandExecutionError, EmptyResultError } from '@jackwener/opencli/errors';

function decodeEntity(codePoint) {
return Number.isInteger(codePoint) && codePoint >= 0 && codePoint <= 0x10FFFF
? String.fromCodePoint(codePoint)
: null;
}
import { stripHtml as stripHtmlText } from './text.js';

function stripHtml(html) {
if (!html) return '';
return html
.replace(/<br\s*\/?\s*>/gi, '\n')
.replace(/<\/(?:p|div|h[1-6]|li|blockquote)>/gi, '\n\n')
.replace(/<[^>]+>/g, '')
.replace(/&nbsp;/g, ' ')
.replace(/&lt;/g, '<')
.replace(/&gt;/g, '>')
.replace(/&amp;/g, '&')
.replace(/&quot;/g, '"')
.replace(/&#39;/g, "'")
.replace(/&#(\d+);/g, (entity, value) => decodeEntity(Number(value)) ?? entity)
.replace(/&#x([0-9a-f]+);/gi, (entity, value) => decodeEntity(Number.parseInt(value, 16)) ?? entity)
.replace(/\n{3,}/g, '\n\n')
.trim();
return stripHtmlText(html, { preserveBlocks: true });
}

const ANSWER_ID_RE = /^\d+$/;
Expand Down
33 changes: 2 additions & 31 deletions clis/zhihu/answer-detail.js
Original file line number Diff line number Diff line change
@@ -1,38 +1,9 @@
import { cli, Strategy } from '@jackwener/opencli/registry';
import { ArgumentError, AuthRequiredError, CommandExecutionError, EmptyResultError } from '@jackwener/opencli/errors';
import { stripHtml as stripHtmlText } from './text.js';

// Light-weight HTML → text, preserving paragraph / heading / list-item
// line breaks. Zhihu answer `content` is HTML, so we map block-level
// closing tags + `<br>` to newlines before stripping the rest.
function stripHtml(html) {
if (!html) return '';
return html
.replace(/<br\s*\/?\s*>/gi, '\n')
// Block-level closing tags become paragraph breaks (double
// newline) so the stripped text stays readable. The trailing
// `\n{3,}` collapse pass below normalizes accidental triples.
.replace(/<\/(?:p|div|h[1-6]|li|blockquote)>/gi, '\n\n')
.replace(/<[^>]+>/g, '')
.replace(/&nbsp;/g, ' ')
.replace(/&lt;/g, '<')
.replace(/&gt;/g, '>')
.replace(/&amp;/g, '&')
.replace(/&quot;/g, '"')
.replace(/&#39;/g, "'")
.replace(/&#(\d+);/g, (_, value) => {
const codePoint = Number(value);
return Number.isInteger(codePoint) && codePoint >= 0 && codePoint <= 0x10FFFF
? String.fromCodePoint(codePoint)
: _;
})
.replace(/&#x([0-9a-f]+);/gi, (_, value) => {
const codePoint = Number.parseInt(value, 16);
return Number.isInteger(codePoint) && codePoint >= 0 && codePoint <= 0x10FFFF
? String.fromCodePoint(codePoint)
: _;
})
.replace(/\n{3,}/g, '\n\n')
.trim();
return stripHtmlText(html, { preserveBlocks: true });
}

const ANSWER_ID_RE = /^\d+$/;
Expand Down
14 changes: 1 addition & 13 deletions clis/zhihu/collection.js
Original file line number Diff line number Diff line change
@@ -1,19 +1,7 @@
import { cli, Strategy } from '@jackwener/opencli/registry';
import { ArgumentError, AuthRequiredError, CommandExecutionError, EmptyResultError } from '@jackwener/opencli/errors';
import { log } from '@jackwener/opencli/logger';

function stripHtml(html) {
return html
.replace(/<[^>]+>/g, '')
.replace(/&nbsp;/g, ' ')
.replace(/&lt;/g, '<')
.replace(/&gt;/g, '>')
.replace(/&amp;/g, '&')
.replace(/&quot;/g, '"')
.replace(/<em>/g, '')
.replace(/<\/em>/g, '')
.trim();
}
import { stripHtml } from './text.js';

function validatePositiveInt(value, name) {
const n = Number(value);
Expand Down
3 changes: 2 additions & 1 deletion clis/zhihu/collection.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ describe('zhihu collection', () => {
question: { id: 789012, title: 'Test Question' },
author: { name: 'test_author' },
voteup_count: 42,
content: '<p>Test answer content</p>',
content: '<p>&#34;Test&#34; &#x26; answer content</p>',
url: 'https://www.zhihu.com/question/789012/answer/123456',
},
},
Expand All @@ -57,6 +57,7 @@ describe('zhihu collection', () => {
title: 'Test Question',
author: 'test_author',
votes: 42,
excerpt: '"Test" & answer content',
url: 'https://www.zhihu.com/question/789012/answer/123456',
});

Expand Down
10 changes: 1 addition & 9 deletions clis/zhihu/question.js
Original file line number Diff line number Diff line change
@@ -1,14 +1,6 @@
import { cli, Strategy } from '@jackwener/opencli/registry';
import { AuthRequiredError, CliError } from '@jackwener/opencli/errors';
function stripHtml(html) {
return html
.replace(/<[^>]+>/g, '')
.replace(/&nbsp;/g, ' ')
.replace(/&lt;/g, '<')
.replace(/&gt;/g, '>')
.replace(/&amp;/g, '&')
.trim();
}
import { stripHtml } from './text.js';

function answerIdFromUrl(url) {
if (typeof url !== 'string') return '';
Expand Down
4 changes: 2 additions & 2 deletions clis/zhihu/question.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ describe('zhihu question', () => {
id: '2036567240334653053',
author: { name: 'alice' },
voteup_count: 12,
content: 'Hello Zhihu',
content: '<p>&#34;Hello&#34; &#x26; Zhihu</p>',
},
],
};
Expand All @@ -33,7 +33,7 @@ describe('zhihu question', () => {
author: 'alice',
votes: 12,
url: 'https://www.zhihu.com/question/2021881398772981878/answer/2036567240334653053',
content: 'Hello Zhihu',
content: '"Hello" & Zhihu',
},
]);
expect(goto).toHaveBeenCalledWith('https://www.zhihu.com/question/2021881398772981878');
Expand Down
13 changes: 1 addition & 12 deletions clis/zhihu/search.js
Original file line number Diff line number Diff line change
@@ -1,17 +1,6 @@
import { cli, Strategy } from '@jackwener/opencli/registry';
import { ArgumentError, AuthRequiredError, CommandExecutionError, EmptyResultError } from '@jackwener/opencli/errors';

function stripHtml(html) {
return (html || '')
.replace(/<[^>]+>/g, '')
.replace(/&nbsp;/g, ' ')
.replace(/&lt;/g, '<')
.replace(/&gt;/g, '>')
.replace(/&amp;/g, '&')
.replace(/<em>/g, '')
.replace(/<\/em>/g, '')
.trim();
}
import { stripHtml } from './text.js';

function itemKey(item) {
const obj = item.object || {};
Expand Down
4 changes: 2 additions & 2 deletions clis/zhihu/search.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ describe('zhihu search', () => {
type: 'answer',
author: { name: 'alice' },
voteup_count: 12,
question: { id: 'q1', name: '<em>Codex</em> question' },
question: { id: 'q1', name: '<em>Codex</em> &#34;question&#34;' },
},
},
{
Expand All @@ -57,7 +57,7 @@ describe('zhihu search', () => {
await expect(cmd.func(page, { query: 'codex', limit: 2 })).resolves.toEqual([
{
rank: 1,
title: 'Codex question',
title: 'Codex "question"',
type: 'answer',
author: 'alice',
votes: 12,
Expand Down
29 changes: 29 additions & 0 deletions clis/zhihu/text.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
// Named character references that stripHtml decodes. Any other named
// reference (e.g. `&copy;`) is left in the text untouched.
const NAMED_ENTITIES = new Map([
  ['nbsp', ' '],
  ['lt', '<'],
  ['gt', '>'],
  ['amp', '&'],
  ['quot', '"'],
]);

// One decimal (`&#34;`), hexadecimal (`&#x22;` / `&#X22;`), or supported
// named (`&quot;`) character reference. Named references are case-sensitive.
const ENTITY_RE = /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|(nbsp|lt|gt|amp|quot));/g;

/**
 * Convert a numeric code point into its string form.
 *
 * @param {number} codePoint - Candidate Unicode code point.
 * @returns {string|null} The character, or `null` when the value is not an
 *   integer in the range 0..0x10FFFF.
 */
function decodeEntity(codePoint) {
  return Number.isInteger(codePoint) && codePoint >= 0 && codePoint <= 0x10FFFF
    ? String.fromCodePoint(codePoint)
    : null;
}

/**
 * Strip HTML tags from `html` and decode basic character references.
 *
 * Entities are decoded in a single pass so that every reference in the input
 * is decoded exactly once. Decoding them in sequential passes would decode
 * escaped entity text twice (`&amp;quot;` must yield `&quot;`, not `"`).
 *
 * @param {*} html - HTML fragment; falsy values yield `''`, other values are
 *   coerced with `String()`.
 * @param {object} [options]
 * @param {boolean} [options.preserveBlocks=false] - When true, `<br>` becomes
 *   a newline and closing `p`/`div`/`h1`-`h6`/`li`/`blockquote` tags become a
 *   blank line before the remaining tags are removed.
 * @returns {string} Plain text with runs of 3+ newlines collapsed to two and
 *   surrounding whitespace trimmed. Numeric references outside the valid
 *   code point range are kept verbatim.
 */
export function stripHtml(html, { preserveBlocks = false } = {}) {
  if (!html) return '';
  let text = String(html);
  if (preserveBlocks) {
    text = text
      .replace(/<br\s*\/?\s*>/gi, '\n')
      .replace(/<\/(?:p|div|h[1-6]|li|blockquote)>/gi, '\n\n');
  }
  return text
    // Tags are removed before decoding, so an escaped `&lt;b&gt;` survives
    // as the literal text `<b>` instead of being stripped.
    .replace(/<[^>]+>/g, '')
    .replace(ENTITY_RE, (entity, decimal, hex, name) => {
      if (name !== undefined) return NAMED_ENTITIES.get(name);
      const codePoint = decimal !== undefined
        ? Number(decimal)
        : Number.parseInt(hex, 16);
      return decodeEntity(codePoint) ?? entity;
    })
    .replace(/\n{3,}/g, '\n\n')
    .trim();
}

export const __test__ = { decodeEntity };
24 changes: 24 additions & 0 deletions clis/zhihu/text.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import { describe, expect, it } from 'vitest';
import { stripHtml } from './text.js';

describe('zhihu text helpers', () => {
  // Shared fixture: two paragraphs separated by an explicit line break.
  const twoParagraphs = '<p>first</p><br><p>second</p>';

  // Default (flat) mode removes tags and decodes the basic named entities.
  it('strips tags and decodes named entities in flat mode', () => {
    const input = '<em>Codex</em>&nbsp;&amp;&nbsp;&lt;CLI&gt;';
    const output = stripHtml(input);
    expect(output).toBe('Codex & <CLI>');
  });

  // Both `&#NN;` and `&#xHH;` forms are turned into their characters.
  it('decodes decimal and hexadecimal numeric entities', () => {
    const input = '&#34;中文&#34; &#x26; &#39;test&#39;';
    const output = stripHtml(input);
    expect(output).toBe('"中文" & \'test\'');
  });

  // A numeric reference beyond the code point range passes through verbatim.
  it('keeps invalid numeric entities unchanged', () => {
    const input = 'bad &#9999999999; entity';
    const output = stripHtml(input);
    expect(output).toBe(input);
  });

  // Without options, block tags contribute no whitespace at all.
  it('keeps list excerpts flat by default', () => {
    const output = stripHtml(twoParagraphs);
    expect(output).toBe('firstsecond');
  });

  // With preserveBlocks, paragraphs end up separated by one blank line.
  it('preserves block breaks when requested', () => {
    const output = stripHtml(twoParagraphs, { preserveBlocks: true });
    expect(output).toBe('first\n\nsecond');
  });
});
Loading