xiaozhejiya · xiaozhejiya · Jun 8, 2026
diff --git a/backend/agents/error_correction/prompts.py b/backend/agents/error_correction/prompts.py
@@ -18,7 +18,8 @@
    - 页首出现 A/B/C/D 选项但无题号 → 是上一题的选项
    - 页首出现（1）（2）小问但无大题号 → 是上一题的子问内容
    - **页首出现 image block 且其后才出现新题号 → 该图片属于上一题，加入上一题 content_blocks 末尾，不归属本页第一道题**
-   - **绝对不要把这类无题号内容创建为新题目**
+   - **绝对不要把这类跨页续写的无题号内容创建为新题目**
+   - 例外：如果 `is_primary: true` 页内开头或同页中部出现独立公式、图片、计算表达式，且前后没有可归属的题目题干，则不要丢弃；应作为独立题输出，`question_id` 使用 `"未编号-1"`、`"未编号-2"` 等稳定编号，`section_title` 使用当前大题标题或 `null`
 3. **content_blocks 的 block_type 只能填 text 或 image**，不能填 paragraph_title 等其他值
 4. **question_type 只能填**：选择题、填空题、解答题、判断题
 5. **选项放 options 数组**，不要放在 content_blocks 里
@@ -28,7 +29,8 @@
 1. 按 block_order 顺序扫描
 2. 遇到题号（1. 2. 3. 或 一、二、三、）标记为新题目起点
 3. 收集该题所有内容直到下一题号
-4. 忽略页眉、页脚、页码
+4. 对 `is_primary: true` 页内没有题号但有实质解题内容的公式/图片/计算表达式，不要忽略；无法归入前后题时作为 `"未编号-N"` 独立题输出
+5. 忽略页眉、页脚、页码
 
 ## 各字段填写
 
@@ -80,7 +82,8 @@
 2. 遇到 `paragraph_title` block（如"四、我会计算。"）时，更新当前大题标题（`section_title`）；该 block **不输出为题目**
 3. 遇到阿拉伯数字题号（1. 2. 3.）时，标记为新题目起点，并将当前 `section_title` 写入该题
 4. 收集题目内容直到下一个题号：`content_blocks` 只允许 `text`/`image`；公式用 LaTeX 嵌入 text；图片必须作为 image block 加入 content_blocks
-5. 按 schema 字段组织结构化输出
+5. 若 `is_primary: true` 页内出现没有题号但有实质解题内容的公式、图片、计算表达式，且无法明确归入前后题，则不要丢弃；作为独立题输出，`question_id` 使用 `"未编号-1"`、`"未编号-2"` 等稳定编号
+6. 按 schema 字段组织结构化输出
 
 ## 知识点标注
 
@@ -136,7 +139,8 @@
 5. **跨页连续性**
    - 一道题的内容可能跨越两页。如果某页开头的内容块没有新题号，它属于上一页最后一道题
    - 典型情况：页首出现 A/B/C/D 选项但无题号 → 是上一题的选项；页首出现（1）（2）小问但无大题号 → 是上一题的子问
-   - 不要把这类无题号的跨页内容创建为新题目
+   - 不要把这类无题号的跨页续写内容创建为新题目
+   - 但如果无题号内容出现在 `is_primary: true` 页内，且是独立的公式题、图片题、计算表达式或完整题干，不要忽略；无法归入前后题时应输出为 `"未编号-N"` 独立题
    - **页首图片的归属（极其重要）**：若某页最先出现的 block 是 image，且该图片**之前没有出现任何阿拉伯数字题号**，则该图片**属于上一道题**，必须将其加入上一题的 content_blocks 末尾，而不是归入本页第一道题。
      - 判断依据：图片归属看的是"该图片出现在哪道题的题号之后"。图片在题号之前 → 归上一题；图片在题号之后 → 归该题。
      - 即使本页第一道题的题干中含"如图所示"，也不能把题号之前出现的图片归给它。

diff --git a/backend/src/workflow.py b/backend/src/workflow.py
@@ -428,6 +428,13 @@ def _fix_leading_images(questions: List[Dict[str, Any]]) -> None:
 
         # 移到前一道题末尾
         prev_q = questions[i - 1]
+        if q.get("section_title") != prev_q.get("section_title"):
+            logger.info(
+                f"跳过 leading image 跨 section 移动: 题目 {q.get('question_id')} "
+                f"section={repr(q.get('section_title'))} 前题 {prev_q.get('question_id')} "
+                f"section={repr(prev_q.get('section_title'))}"
+            )
+            continue
         prev_blocks = prev_q.get("content_blocks") or []
         prev_q["content_blocks"] = prev_blocks + leading_images
         q["content_blocks"] = rest
@@ -539,6 +546,7 @@ def _dedup_questions(questions: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
         qid = q.get("question_id")
         if qid is not None:
             q["question_id"] = str(qid).strip()
+    original_order = {id(q): idx for idx, q in enumerate(questions)}
 
     # ── 第一轮：按 (section, qid) 复合键去重 ──────────────────
     groups: Dict[tuple, List[Dict[str, Any]]] = defaultdict(list)
@@ -558,10 +566,8 @@ def _dedup_questions(questions: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
 
     # ── 第二轮：结构优先 + 内容相似度去重 ───────────────────────
     # 策略：
-    #   1. 同一 qid 下同时存在有 section 和无 section 版本 → 直接丢弃所有 section=None 版本
-    #      （section=None 说明该批次只捕获到选项/部分内容，有 section 的是完整版本）
-    #   2. 同一 qid 下均有 section（不同 section）→ difflib 相似度 ≥ 0.75 视为重复，保留最优
-    #   3. 同一 qid 下均无 section → 保留内容最丰富的一份
+    #   同一 qid 下不再因 section_title 缺失而直接丢弃候选。
+    #   只有内容相似度 ≥ SIMILARITY_THRESHOLD 才视为重复并保留最优的一份；否则视为不同大题/不同区域下的同号题，全部保留。
     by_qid: Dict[str, List[Dict[str, Any]]] = defaultdict(list)
     for q in after_round1:
         by_qid[q.get("question_id", "")].append(q)
@@ -577,24 +583,11 @@ def _dedup_questions(questions: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
             final.extend(entries)
             continue
 
-        sectioned = [q for q in entries if q.get("section_title")]
-        unsectioned = [q for q in entries if not q.get("section_title")]
-
-        # 策略1：有 section 版本存在时，丢弃全部 section=None 版本
-        if sectioned and unsectioned:
-            round2_removed += len(unsectioned)
-            for q in unsectioned:
-                logger.debug(
-                    f"二次去重剔除(结构): qid={qid} "
-                    f"section=None，保留有大题标题的版本，richness={_question_richness(q)}"
-                )
-            entries = sectioned
-
         if len(entries) == 1:
             final.extend(entries)
             continue
 
-        # 策略2/3：全部有 section 或全部无 section → difflib 去重
+        # 同题号候选统一进入相似度去重；不同内容即使 section_title 缺失也保留。
         entries.sort(key=lambda q: (
             0 if q.get("section_title") else 1,
             -_question_richness(q)
@@ -618,20 +611,21 @@ def _dedup_questions(questions: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
         final.extend(kept)
 
     if round2_removed:
-        logger.info(f"二次去重: 剔除 {round2_removed} 道重复题目（结构优先 + 相似度阈值={SIMILARITY_THRESHOLD}）")
+        logger.info(f"二次去重: 剔除 {round2_removed} 道重复题目（相似度阈值={SIMILARITY_THRESHOLD}）")
         console.print(f"[yellow]二次去重: 剔除 {round2_removed} 道重复题目[/yellow]")
 
-    # ── 排序：有 section 的题按首次出现顺序 + 题号，section=None 的题排最后 ──
+    # ── 排序：有 section 的题以该 section 首次出现的位置为锚点、组内按题号排序；无 section 的题按自身原始出现位置插入 ──
     section_order: Dict[str, int] = {}
-    for q in questions:
+    for idx, q in enumerate(questions):
         s = q.get("section_title")
         if s and s not in section_order:
-            section_order[s] = len(section_order)
+            section_order[s] = idx
 
     final.sort(key=lambda q: (
-        0 if q.get("section_title") else 1,          # 有 section 的排前，None 排后
-        section_order.get(q.get("section_title") or "", 999),
-        _sort_key(q.get("question_id", ""))
+        section_order.get(q.get("section_title") or "", original_order.get(id(q), 999999)),
+        0 if q.get("section_title") else 1,
+        _sort_key(q.get("question_id", "")),
+        original_order.get(id(q), 999999),
     ))
     return final
 
@@ -815,13 +809,23 @@ def _invoke_split(batch_idx: int, batch_data: list) -> None:
     split_elapsed = time.time() - split_start
     logger.info(f"并行分割完成, 耗时 {split_elapsed:.2f}s")
 
+    raw_agent_output_path = os.path.join(results_dir, "questions_agent_raw.json")
+    with open(raw_agent_output_path, 'w', encoding='utf-8') as f:
+        json.dump(batch_results, f, ensure_ascii=False, indent=2)
+    logger.info(f"已保存 split agent 原始输出: {raw_agent_output_path}")
+
     # ── Step 6: 跨批次 section 传播 + 合并 + 去重 ──
     _propagate_section_between_batches(batch_results)
 
     all_questions = []
     for questions in batch_results:
         all_questions.extend(questions)
 
+    before_dedup_path = os.path.join(results_dir, "questions_before_dedup.json")
+    with open(before_dedup_path, 'w', encoding='utf-8') as f:
+        json.dump(all_questions, f, ensure_ascii=False, indent=2)
+    logger.info(f"已保存去重前题目列表: {before_dedup_path}")
+
     before_dedup = len(all_questions)
     deduped = _dedup_questions(all_questions)
     after_dedup = len(deduped)

diff --git a/backend/tests/test_workflow_helpers.py b/backend/tests/test_workflow_helpers.py
@@ -19,6 +19,7 @@
     _build_overlapping_batches,
     _run_ocr_and_simplify,
     _dedup_questions,
+    _fix_leading_images,
     _question_richness,
     _sort_key,
     _extract_text_sample,
@@ -407,25 +408,25 @@ def test_skip_missing_id(self):
         assert len(result) == 1
 
     def test_sorted_output(self):
-        """输出应按题号排序"""
+        """输出应保留原始出现顺序，避免无 section 题被排到错误位置"""
         qs = [
             {"question_id": "3", "content_blocks": []},
             {"question_id": "1", "content_blocks": []},
             {"question_id": "2", "content_blocks": []},
         ]
         result = _dedup_questions(qs)
-        assert [q["question_id"] for q in result] == ["1", "2", "3"]
+        assert [q["question_id"] for q in result] == ["3", "1", "2"]
 
     def test_mixed_id_types_sorted(self):
-        """数字和字母 id 混合排序"""
+        """数字和字母 id 混合时也保留原始出现顺序"""
         qs = [
             {"question_id": "B", "content_blocks": []},
             {"question_id": "2", "content_blocks": []},
             {"question_id": "A", "content_blocks": []},
             {"question_id": "1", "content_blocks": []},
         ]
         result = _dedup_questions(qs)
-        assert [q["question_id"] for q in result] == ["1", "2", "A", "B"]
+        assert [q["question_id"] for q in result] == ["B", "2", "A", "1"]
 
     def test_richness_with_options(self):
         """options 也计入 richness"""
@@ -440,6 +441,77 @@ def test_richness_with_options(self):
         # q_with_opt: 2 + 19 = 21 > q_no_opt: 3
         assert result[0].get("options") is not None
 
+    def test_same_id_unsectioned_different_content_is_kept(self):
+        """同题号但内容明显不同，即使一个没有 section_title 也不能直接删除"""
+        qs = [
+            {
+                "question_id": "3",
+                "section_title": None,
+                "content_blocks": [
+                    {"block_type": "text", "content": "3. 解方程。(6分)"},
+                    {"block_type": "text", "content": r"$$\frac{3}{4}x-40\%x=1.4$$"},
+                ],
+            },
+            {
+                "question_id": "3",
+                "section_title": "六、解决问题。(共27分)",
+                "content_blocks": [
+                    {"block_type": "text", "content": "3. 建一座污水处理池用了 48 万元，原计划投资多少万元？"},
+                ],
+            },
+        ]
+        result = _dedup_questions(qs)
+        assert len(result) == 2
+        assert any(q.get("section_title") is None for q in result)
+
+
+class TestFixLeadingImages:
+    """_fix_leading_images 测试"""
+
+    def test_does_not_move_image_across_sections(self):
+        questions = [
+            {
+                "question_id": "3",
+                "section_title": None,
+                "content_blocks": [{"block_type": "text", "content": "3. 解方程"}],
+            },
+            {
+                "question_id": "1",
+                "section_title": "五、看图列式",
+                "content_blocks": [
+                    {"block_type": "image", "content": "/images/q1.jpg"},
+                    {"block_type": "text", "content": "2."},
+                ],
+            },
+        ]
+
+        _fix_leading_images(questions)
+
+        assert questions[0]["content_blocks"] == [{"block_type": "text", "content": "3. 解方程"}]
+        assert questions[1]["content_blocks"][0] == {"block_type": "image", "content": "/images/q1.jpg"}
+
+    def test_moves_image_within_same_section(self):
+        questions = [
+            {
+                "question_id": "1",
+                "section_title": "六、解决问题",
+                "content_blocks": [{"block_type": "text", "content": "1. 上一题"}],
+            },
+            {
+                "question_id": "2",
+                "section_title": "六、解决问题",
+                "content_blocks": [
+                    {"block_type": "image", "content": "/images/prev.jpg"},
+                    {"block_type": "text", "content": "2. 下一题"},
+                ],
+            },
+        ]
+
+        _fix_leading_images(questions)
+
+        assert questions[0]["content_blocks"][-1] == {"block_type": "image", "content": "/images/prev.jpg"}
+        assert questions[1]["content_blocks"] == [{"block_type": "text", "content": "2. 下一题"}]
+
 
 # ═══════════════════════════════════════════════════════════
 # _extract_text_sample