diff --git a/backend/agents/error_correction/prompts.py b/backend/agents/error_correction/prompts.py index c17ae3f4..d4647010 100644 --- a/backend/agents/error_correction/prompts.py +++ b/backend/agents/error_correction/prompts.py @@ -18,7 +18,8 @@ - 页首出现 A/B/C/D 选项但无题号 → 是上一题的选项 - 页首出现(1)(2)小问但无大题号 → 是上一题的子问内容 - **页首出现 image block 且其后才出现新题号 → 该图片属于上一题,加入上一题 content_blocks 末尾,不归属本页第一道题** - - **绝对不要把这类无题号内容创建为新题目** + - **绝对不要把这类跨页续写的无题号内容创建为新题目** + - 例外:如果 `is_primary: true` 页内开头或同页中部出现独立公式、图片、计算表达式,且前后没有可归属的题目题干,则不要丢弃;应作为独立题输出,`question_id` 使用 `"未编号-1"`、`"未编号-2"` 等稳定编号,`section_title` 使用当前大题标题或 `null` 3. **content_blocks 的 block_type 只能填 text 或 image**,不能填 paragraph_title 等其他值 4. **question_type 只能填**:选择题、填空题、解答题、判断题 5. **选项放 options 数组**,不要放在 content_blocks 里 @@ -28,7 +29,8 @@ 1. 按 block_order 顺序扫描 2. 遇到题号(1. 2. 3. 或 一、二、三、)标记为新题目起点 3. 收集该题所有内容直到下一题号 -4. 忽略页眉、页脚、页码 +4. 对 `is_primary: true` 页内没有题号但有实质解题内容的公式/图片/计算表达式,不要忽略;无法归入前后题时作为 `"未编号-N"` 独立题输出 +5. 忽略页眉、页脚、页码 ## 各字段填写 @@ -80,7 +82,8 @@ 2. 遇到 `paragraph_title` block(如"四、我会计算。")时,更新当前大题标题(`section_title`);该 block **不输出为题目** 3. 遇到阿拉伯数字题号(1. 2. 3.)时,标记为新题目起点,并将当前 `section_title` 写入该题 4. 收集题目内容直到下一个题号:`content_blocks` 只允许 `text`/`image`;公式用 LaTeX 嵌入 text;图片必须作为 image block 加入 content_blocks -5. 按 schema 字段组织结构化输出 +5. 若 `is_primary: true` 页内出现没有题号但有实质解题内容的公式、图片、计算表达式,且无法明确归入前后题,则不要丢弃;作为独立题输出,`question_id` 使用 `"未编号-1"`、`"未编号-2"` 等稳定编号 +6. 按 schema 字段组织结构化输出 ## 知识点标注 @@ -136,7 +139,8 @@ 5. **跨页连续性** - 一道题的内容可能跨越两页。如果某页开头的内容块没有新题号,它属于上一页最后一道题 - 典型情况:页首出现 A/B/C/D 选项但无题号 → 是上一题的选项;页首出现(1)(2)小问但无大题号 → 是上一题的子问 - - 不要把这类无题号的跨页内容创建为新题目 + - 不要把这类无题号的跨页续写内容创建为新题目 + - 但如果无题号内容出现在 `is_primary: true` 页内,且是独立的公式题、图片题、计算表达式或完整题干,不要忽略;无法归入前后题时应输出为 `"未编号-N"` 独立题 - **页首图片的归属(极其重要)**:若某页最先出现的 block 是 image,且该图片**之前没有出现任何阿拉伯数字题号**,则该图片**属于上一道题**,必须将其加入上一题的 content_blocks 末尾,而不是归入本页第一道题。 - 判断依据:图片归属看的是"该图片出现在哪道题的题号之后"。图片在题号之前 → 归上一题;图片在题号之后 → 归该题。 - 即使本页第一道题的题干中含"如图所示",也不能把题号之前出现的图片归给它。 diff --git a/backend/src/workflow.py b/backend/src/workflow.py index 0f24d7e2..b6c7c132 100644 --- a/backend/src/workflow.py +++ b/backend/src/workflow.py @@ -428,6 +428,13 @@ def _fix_leading_images(questions: List[Dict[str, Any]]) -> None: # 移到前一道题末尾 prev_q = questions[i - 1] + if q.get("section_title") != prev_q.get("section_title"): + logger.info( + f"跳过 leading image 跨 section 移动: 题目 {q.get('question_id')} " + f"section={repr(q.get('section_title'))} 前题 {prev_q.get('question_id')} " + f"section={repr(prev_q.get('section_title'))}" + ) + continue prev_blocks = prev_q.get("content_blocks") or [] prev_q["content_blocks"] = prev_blocks + leading_images q["content_blocks"] = rest @@ -539,6 +546,7 @@ def _dedup_questions(questions: List[Dict[str, Any]]) -> List[Dict[str, Any]]: qid = q.get("question_id") if qid is not None: q["question_id"] = str(qid).strip() + original_order = {id(q): idx for idx, q in enumerate(questions)} # ── 第一轮:按 (section, qid) 复合键去重 ────────────────── groups: Dict[tuple, List[Dict[str, Any]]] = defaultdict(list) @@ -558,10 +566,8 @@ def _dedup_questions(questions: List[Dict[str, Any]]) -> List[Dict[str, Any]]: # ── 第二轮:结构优先 + 内容相似度去重 ─────────────────────── # 策略: - # 1. 同一 qid 下同时存在有 section 和无 section 版本 → 直接丢弃所有 section=None 版本 - # (section=None 说明该批次只捕获到选项/部分内容,有 section 的是完整版本) - # 2. 同一 qid 下均有 section(不同 section)→ difflib 相似度 ≥ 0.75 视为重复,保留最优 - # 3. 同一 qid 下均无 section → 保留内容最丰富的一份 + # 同一 qid 下不再因 section_title 是否存在直接丢弃候选。 + # 只有内容相似度 ≥ 0.75 才视为重复;否则视为不同大题/不同区域下的同号题,全部保留。 by_qid: Dict[str, List[Dict[str, Any]]] = defaultdict(list) for q in after_round1: by_qid[q.get("question_id", "")].append(q) @@ -577,24 +583,11 @@ def _dedup_questions(questions: List[Dict[str, Any]]) -> List[Dict[str, Any]]: final.extend(entries) continue - sectioned = [q for q in entries if q.get("section_title")] - unsectioned = [q for q in entries if not q.get("section_title")] - - # 策略1:有 section 版本存在时,丢弃全部 section=None 版本 - if sectioned and unsectioned: - round2_removed += len(unsectioned) - for q in unsectioned: - logger.debug( - f"二次去重剔除(结构): qid={qid} " - f"section=None,保留有大题标题的版本,richness={_question_richness(q)}" - ) - entries = sectioned - if len(entries) == 1: final.extend(entries) continue - # 策略2/3:全部有 section 或全部无 section → difflib 去重 + # 同题号候选统一进入相似度去重;不同内容即使 section_title 缺失也保留。 entries.sort(key=lambda q: ( 0 if q.get("section_title") else 1, -_question_richness(q) @@ -618,20 +611,21 @@ def _dedup_questions(questions: List[Dict[str, Any]]) -> List[Dict[str, Any]]: final.extend(kept) if round2_removed: - logger.info(f"二次去重: 剔除 {round2_removed} 道重复题目(结构优先 + 相似度阈值={SIMILARITY_THRESHOLD})") + logger.info(f"二次去重: 剔除 {round2_removed} 道重复题目(相似度阈值={SIMILARITY_THRESHOLD})") console.print(f"[yellow]二次去重: 剔除 {round2_removed} 道重复题目[/yellow]") - # ── 排序:有 section 的题按首次出现顺序 + 题号,section=None 的题排最后 ── + # ── 排序:按原始出现位置排列;同一 section 内按题号稳定排序 ── section_order: Dict[str, int] = {} - for q in questions: + for idx, q in enumerate(questions): s = q.get("section_title") if s and s not in section_order: - section_order[s] = len(section_order) + section_order[s] = idx final.sort(key=lambda q: ( - 0 if q.get("section_title") else 1, # 有 section 的排前,None 排后 - section_order.get(q.get("section_title") or "", 999), - _sort_key(q.get("question_id", "")) + section_order.get(q.get("section_title") or "", original_order.get(id(q), 999999)), + 0 if q.get("section_title") else 1, + _sort_key(q.get("question_id", "")), + original_order.get(id(q), 999999), )) return final @@ -815,6 +809,11 @@ def _invoke_split(batch_idx: int, batch_data: list) -> None: split_elapsed = time.time() - split_start logger.info(f"并行分割完成, 耗时 {split_elapsed:.2f}s") + raw_agent_output_path = os.path.join(results_dir, "questions_agent_raw.json") + with open(raw_agent_output_path, 'w', encoding='utf-8') as f: + json.dump(batch_results, f, ensure_ascii=False, indent=2) + logger.info(f"已保存 split agent 原始输出: {raw_agent_output_path}") + # ── Step 6: 跨批次 section 传播 + 合并 + 去重 ── _propagate_section_between_batches(batch_results) @@ -822,6 +821,11 @@ def _invoke_split(batch_idx: int, batch_data: list) -> None: for questions in batch_results: all_questions.extend(questions) + before_dedup_path = os.path.join(results_dir, "questions_before_dedup.json") + with open(before_dedup_path, 'w', encoding='utf-8') as f: + json.dump(all_questions, f, ensure_ascii=False, indent=2) + logger.info(f"已保存去重前题目列表: {before_dedup_path}") + before_dedup = len(all_questions) deduped = _dedup_questions(all_questions) after_dedup = len(deduped) diff --git a/backend/tests/test_workflow_helpers.py b/backend/tests/test_workflow_helpers.py index 2a836167..aa615828 100644 --- a/backend/tests/test_workflow_helpers.py +++ b/backend/tests/test_workflow_helpers.py @@ -19,6 +19,7 @@ _build_overlapping_batches, _run_ocr_and_simplify, _dedup_questions, + _fix_leading_images, _question_richness, _sort_key, _extract_text_sample, @@ -407,17 +408,17 @@ def test_skip_missing_id(self): assert len(result) == 1 def test_sorted_output(self): - """输出应按题号排序""" + """输出应保留原始出现顺序,避免无 section 题被排到错误位置""" qs = [ {"question_id": "3", "content_blocks": []}, {"question_id": "1", "content_blocks": []}, {"question_id": "2", "content_blocks": []}, ] result = _dedup_questions(qs) - assert [q["question_id"] for q in result] == ["1", "2", "3"] + assert [q["question_id"] for q in result] == ["3", "1", "2"] def test_mixed_id_types_sorted(self): - """数字和字母 id 混合排序""" + """数字和字母 id 混合时也保留原始出现顺序""" qs = [ {"question_id": "B", "content_blocks": []}, {"question_id": "2", "content_blocks": []}, @@ -425,7 +426,7 @@ def test_mixed_id_types_sorted(self): {"question_id": "1", "content_blocks": []}, ] result = _dedup_questions(qs) - assert [q["question_id"] for q in result] == ["1", "2", "A", "B"] + assert [q["question_id"] for q in result] == ["B", "2", "A", "1"] def test_richness_with_options(self): """options 也计入 richness""" @@ -440,6 +441,77 @@ def test_richness_with_options(self): # q_with_opt: 2 + 19 = 21 > q_no_opt: 3 assert result[0].get("options") is not None + def test_same_id_unsectioned_different_content_is_kept(self): + """同题号但内容明显不同,即使一个没有 section_title 也不能直接删除""" + qs = [ + { + "question_id": "3", + "section_title": None, + "content_blocks": [ + {"block_type": "text", "content": "3. 解方程。(6分)"}, + {"block_type": "text", "content": r"$$\frac{3}{4}x-40\%x=1.4$$"}, + ], + }, + { + "question_id": "3", + "section_title": "六、解决问题。(共27分)", + "content_blocks": [ + {"block_type": "text", "content": "3. 建一座污水处理池用了 48 万元,原计划投资多少万元?"}, + ], + }, + ] + result = _dedup_questions(qs) + assert len(result) == 2 + assert any(q.get("section_title") is None for q in result) + + +class TestFixLeadingImages: + """_fix_leading_images 测试""" + + def test_does_not_move_image_across_sections(self): + questions = [ + { + "question_id": "3", + "section_title": None, + "content_blocks": [{"block_type": "text", "content": "3. 解方程"}], + }, + { + "question_id": "1", + "section_title": "五、看图列式", + "content_blocks": [ + {"block_type": "image", "content": "/images/q1.jpg"}, + {"block_type": "text", "content": "2."}, + ], + }, + ] + + _fix_leading_images(questions) + + assert questions[0]["content_blocks"] == [{"block_type": "text", "content": "3. 解方程"}] + assert questions[1]["content_blocks"][0] == {"block_type": "image", "content": "/images/q1.jpg"} + + def test_moves_image_within_same_section(self): + questions = [ + { + "question_id": "1", + "section_title": "六、解决问题", + "content_blocks": [{"block_type": "text", "content": "1. 上一题"}], + }, + { + "question_id": "2", + "section_title": "六、解决问题", + "content_blocks": [ + {"block_type": "image", "content": "/images/prev.jpg"}, + {"block_type": "text", "content": "2. 下一题"}, + ], + }, + ] + + _fix_leading_images(questions) + + assert questions[0]["content_blocks"][-1] == {"block_type": "image", "content": "/images/prev.jpg"} + assert questions[1]["content_blocks"] == [{"block_type": "text", "content": "2. 下一题"}] + # ═══════════════════════════════════════════════════════════ # _extract_text_sample