yanyiwu · Copilot · Mar 5, 2026 · Mar 5, 2026
diff --git a/README.md b/README.md
@@ -54,6 +54,37 @@ simhash值是: 17831459094038722629
 
 详情请看 [demo](https://github.com/yanyiwu/simhash-demo)
 
+### 关键词数量（topN）的设置
+
+`make()` 函数的 `topN` 参数控制从文本中抽取多少个关键词来参与 simhash 计算。关键词越多，指纹对文本内容的覆盖越全面，但计算开销也随之增加。
+
+**不同长度的文本应该使用不同的 topN：**
+
+| 文本长度（字节） | 建议 topN |
+|:---:|:---:|
+| ≤ 600   | 5   |
+| ~1200   | 10  |
+| ~7800   | 65  |
+| ≥ 24000 | 200 |
+
+**自动选择 topN（推荐）：**
+
+可以使用 `Simhasher::getTopN(text)` 获取自动推荐的 topN 值（规则：`max(5, min(200, text.size() / 120))`，其中 `text.size()` 为字节数，除法为整数除法），
+或直接调用无需手动指定 topN 的 `make(text, v64)` 重载，它会在内部自动调用 `getTopN()`：
+
+```cpp
+Simhasher shash(DICT_PATH, HMM_PATH, IDF_PATH, STOP_WORDS_PATH);
+string text = /* 读入文本 */;
+
+// 方法一：自动确定 topN
+uint64_t simhashValue;
+shash.make(text, simhashValue);
+
+// 方法二：手动查询推荐值后再调用
+size_t topN = Simhasher::getTopN(text);
+shash.make(text, topN, simhashValue);
+```
+
 ### Benchmark
 ```sh
 ./benchmark/benchmarking

diff --git a/include/simhash/Simhasher.hpp b/include/simhash/Simhasher.hpp
@@ -106,6 +106,39 @@ namespace simhash
                 return v64;
             }
 
+            /**
+             * @brief
+             * Recommend a topN (keyword count) for make() from the byte length of text.
+             *
+             * Rule: text.size() / 120 (integer division), clamped to [5, 200]. The
+             * divisor follows the rule of thumb of one keyword per ~120 bytes, given
+             * that a Chinese UTF-8 character is roughly 3 bytes and a word roughly 2
+             * characters (~6 bytes), i.e. about one keyword per 20 words.
+             *
+             * @param text input text; only its size in bytes is used.
+             * @return 5 for sizes below 720, 200 for sizes of 24000 or more, and
+             *         text.size() / 120 otherwise (e.g. 1200 -> 10, 7800 -> 65).
+             *
+             * Callers who need tighter control can still pass an explicit topN to
+             * make().
+             */
+            static size_t getTopN(const string& text)
+            {
+                const size_t topNMin = 5;
+                const size_t topNMax = 200;
+                return std::max(topNMin, std::min(topNMax, text.size() / 120)); // clamp size/120 into [topNMin, topNMax]
+            }
+
+            /**
+             * @brief
+             * Adaptive overload of make(): forwards to make(text, getTopN(text), v64)
+             * and returns that call's result, so callers need not choose topN.
+             */
+            bool make(const string& text, uint64_t& v64) const
+            {
+                return make(text, getTopN(text), v64);
+            }
+
             static bool isEqual(uint64_t lhs, uint64_t rhs, unsigned short n = 3)
             {
                 unsigned short cnt = 0;

diff --git a/test/unittest/TSimhash.cpp b/test/unittest/TSimhash.cpp
@@ -87,3 +87,41 @@ TEST(SimhasherTest, Test2)
 
 }
 
+TEST(SimhasherTest, TestGetTopN)
+{
+    // getTopN() depends only on text.size(): 100 / 120 = 0, raised to the minimum 5.
+    string shortText(100, 'a');
+    ASSERT_EQ(Simhasher::getTopN(shortText), (size_t)5);
+
+    // 600 / 120 = 5, which already equals the minimum.
+    string text600(600, 'a');
+    ASSERT_EQ(Simhasher::getTopN(text600), (size_t)5);
+
+    // 1200 / 120 = 10, inside the [5, 200] range, so returned unchanged.
+    string text1200(1200, 'a');
+    ASSERT_EQ(Simhasher::getTopN(text1200), (size_t)10);
+
+    // 7800 / 120 = 65, also inside the range.
+    string text7800(7800, 'a');
+    ASSERT_EQ(Simhasher::getTopN(text7800), (size_t)65);
+
+    // 30000 / 120 = 250, capped at the maximum 200.
+    string longText(30000, 'a');
+    ASSERT_EQ(Simhasher::getTopN(longText), (size_t)200);
+}
+
+TEST(SimhasherTest, TestAdaptiveMake)
+{
+    Simhasher shash("../submodules/cppjieba/dict/jieba.dict.utf8", "../submodules/cppjieba/dict/hmm_model.utf8", "../submodules/cppjieba/dict/idf.utf8", "../submodules/cppjieba/dict/stop_words.utf8");
+
+    // The two-argument make() (no explicit topN) must yield the same hash as the
+    // three-argument make() called with Simhasher::getTopN(s) on the same input.
+    string s;
+    ASSERT_TRUE(loadFile2Str("../test/testdata/news_content", s));
+
+    uint64_t u_adaptive, u_explicit;
+    ASSERT_TRUE(shash.make(s, u_adaptive));
+    ASSERT_TRUE(shash.make(s, Simhasher::getTopN(s), u_explicit));
+    ASSERT_EQ(u_adaptive, u_explicit);
+}
+