From 7ed7de10d218adf5839ae2c26dd66760083c4371 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 5 Mar 2026 05:53:00 +0000
Subject: [PATCH 1/2] Initial plan

From d7c97aafba437739de0d8c61af3b4f06bdebbccc Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 5 Mar 2026 06:03:37 +0000
Subject: [PATCH 2/2] Add adaptive topN selection for different document sizes

Co-authored-by: yanyiwu <2162645+yanyiwu@users.noreply.github.com>
---
 README.md                     | 31 ++++++++++++++++++++++++++++
 include/simhash/Simhasher.hpp | 33 ++++++++++++++++++++++++++++++
 test/unittest/TSimhash.cpp    | 38 +++++++++++++++++++++++++++++++++++
 3 files changed, 102 insertions(+)

diff --git a/README.md b/README.md
index 2c09b3a..cf89762 100644
--- a/README.md
+++ b/README.md
@@ -54,6 +54,37 @@ simhash值是: 17831459094038722629
 
 详情请看 [demo](https://github.com/yanyiwu/simhash-demo)
 
+### 关键词数量(topN)的设置
+
+`make()` 函数的 `topN` 参数控制从文本中抽取多少个关键词来参与 simhash 计算。关键词越多,指纹对文本内容的覆盖越全面,但计算开销也随之增加。
+
+**不同大小的文件应该使用不同的 topN:**
+
+| 文本长度(字节) | 建议 topN |
+|:---:|:---:|
+| ≤ 600 | 5 |
+| ~1200 | 10 |
+| ~7800 | 65 |
+| ≥ 24000 | 200 |
+
+**自动选择 topN(推荐):**
+
+可以使用 `Simhasher::getTopN(text)` 获取自动推荐的 topN 值(规则:`max(5, min(200, text.size() / 120))`),
+或直接调用无需手动指定 topN 的 `make(text, v64)` 重载,它会自动调用 `getTopN()`:
+
+```cpp
+Simhasher shash(DICT_PATH, HMM_PATH, IDF_PATH, STOP_WORDS_PATH);
+string text = /* 读入文本 */;
+
+// 方法一:自动确定 topN
+uint64_t simhashValue;
+shash.make(text, simhashValue);
+
+// 方法二:手动查询推荐值后再调用
+size_t topN = Simhasher::getTopN(text);
+shash.make(text, topN, simhashValue);
+```
+
 ### Benchmark
 ```sh
 ./benchmark/benchmarking
diff --git a/include/simhash/Simhasher.hpp b/include/simhash/Simhasher.hpp
index 238801f..602c446 100644
--- a/include/simhash/Simhasher.hpp
+++ b/include/simhash/Simhasher.hpp
@@ -106,6 +106,39 @@ namespace simhash
             return v64;
         }
 
+        /**
+         * @brief
+         * Compute a recommended topN (number of keywords) for the given text.
+         *
+         * For Chinese UTF-8 text, each character is roughly 3 bytes and each word
+         * is roughly 2 characters (~6 bytes). A good rule of thumb is to extract
+         * approximately one keyword per 120 bytes of input, clamped to [5, 200].
+         *
+         * Example values (text.size() / 120 is integer division, then clamped):
+         *   text.size() <= 600 bytes   → topN = 5  (minimum; holds up to 719 bytes)
+         *   text.size() ~ 1200 bytes   → topN = 10
+         *   text.size() ~ 7800 bytes   → topN = 65
+         *   text.size() >= 24000 bytes → topN = 200 (cap)
+         *
+         * Users who need tighter control can still pass an explicit topN to make().
+         */
+        static size_t getTopN(const string& text)
+        {
+            const size_t topNMin = 5;
+            const size_t topNMax = 200;
+            return std::max(topNMin, std::min(topNMax, text.size() / 120));
+        }
+
+        /**
+         * @brief
+         * Adaptive overload: picks topN via getTopN(text) so callers need not tune it,
+         * then forwards to make(text, topN, v64) and returns that call's result.
+         */
+        bool make(const string& text, uint64_t& v64) const
+        {
+            return make(text, getTopN(text), v64);
+        }
+
         static bool isEqual(uint64_t lhs, uint64_t rhs, unsigned short n = 3)
         {
             unsigned short cnt = 0;
diff --git a/test/unittest/TSimhash.cpp b/test/unittest/TSimhash.cpp
index 2964307..e1fdb7d 100644
--- a/test/unittest/TSimhash.cpp
+++ b/test/unittest/TSimhash.cpp
@@ -87,3 +87,41 @@ TEST(SimhasherTest, Test2)
 }
 
 
+TEST(SimhasherTest, TestGetTopN)
+{
+    // 100 bytes: 100/120 = 0, clamped up to the minimum topN of 5.
+    string shortText(100, 'a');
+    ASSERT_EQ(Simhasher::getTopN(shortText), (size_t)5);
+
+    // Text of exactly 600 bytes: 600/120 = 5 → still 5 (minimum).
+    string text600(600, 'a');
+    ASSERT_EQ(Simhasher::getTopN(text600), (size_t)5);
+
+    // Text of 1200 bytes: 1200/120 = 10.
+    string text1200(1200, 'a');
+    ASSERT_EQ(Simhasher::getTopN(text1200), (size_t)10);
+
+    // Text of 7800 bytes: 7800/120 = 65.
+    string text7800(7800, 'a');
+    ASSERT_EQ(Simhasher::getTopN(text7800), (size_t)65);
+
+    // 30000 bytes: 30000/120 = 250, capped at the maximum topN of 200.
+    string longText(30000, 'a');
+    ASSERT_EQ(Simhasher::getTopN(longText), (size_t)200);
+}
+
+TEST(SimhasherTest, TestAdaptiveMake)
+{
+    Simhasher shash("../submodules/cppjieba/dict/jieba.dict.utf8", "../submodules/cppjieba/dict/hmm_model.utf8", "../submodules/cppjieba/dict/idf.utf8", "../submodules/cppjieba/dict/stop_words.utf8");
+
+    // Verify that the adaptive make() overload (no explicit topN) produces the same
+    // result as calling make() with getTopN() explicitly.
+    string s;
+    ASSERT_TRUE(loadFile2Str("../test/testdata/news_content", s));
+
+    uint64_t u_adaptive, u_explicit;
+    ASSERT_TRUE(shash.make(s, u_adaptive));
+    ASSERT_TRUE(shash.make(s, Simhasher::getTopN(s), u_explicit));
+    ASSERT_EQ(u_adaptive, u_explicit);
+}
+