Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions backend/agents/error_correction/prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@
- 页首出现 A/B/C/D 选项但无题号 → 是上一题的选项
- 页首出现(1)(2)小问但无大题号 → 是上一题的子问内容
- **页首出现 image block 且其后才出现新题号 → 该图片属于上一题,加入上一题 content_blocks 末尾,不归属本页第一道题**
- **绝对不要把这类无题号内容创建为新题目**
- **绝对不要把这类跨页续写的无题号内容创建为新题目**
- 例外:如果 `is_primary: true` 页内开头或同页中部出现独立公式、图片、计算表达式,且前后没有可归属的题目题干,则不要丢弃;应作为独立题输出,`question_id` 使用 `"未编号-1"`、`"未编号-2"` 等稳定编号,`section_title` 使用当前大题标题或 `null`
3. **content_blocks 的 block_type 只能填 text 或 image**,不能填 paragraph_title 等其他值
4. **question_type 只能填**:选择题、填空题、解答题、判断题
5. **选项放 options 数组**,不要放在 content_blocks 里
Expand All @@ -28,7 +29,8 @@
1. 按 block_order 顺序扫描
2. 遇到题号(1. 2. 3. 或 一、二、三、)标记为新题目起点
3. 收集该题所有内容直到下一题号
4. 忽略页眉、页脚、页码
4. 对 `is_primary: true` 页内没有题号但有实质解题内容的公式/图片/计算表达式,不要忽略;无法归入前后题时作为 `"未编号-N"` 独立题输出
5. 忽略页眉、页脚、页码

## 各字段填写

Expand Down Expand Up @@ -80,7 +82,8 @@
2. 遇到 `paragraph_title` block(如"四、我会计算。")时,更新当前大题标题(`section_title`);该 block **不输出为题目**
3. 遇到阿拉伯数字题号(1. 2. 3.)时,标记为新题目起点,并将当前 `section_title` 写入该题
4. 收集题目内容直到下一个题号:`content_blocks` 只允许 `text`/`image`;公式用 LaTeX 嵌入 text;图片必须作为 image block 加入 content_blocks
5. 按 schema 字段组织结构化输出
5. 若 `is_primary: true` 页内出现没有题号但有实质解题内容的公式、图片、计算表达式,且无法明确归入前后题,则不要丢弃;作为独立题输出,`question_id` 使用 `"未编号-1"`、`"未编号-2"` 等稳定编号
6. 按 schema 字段组织结构化输出

## 知识点标注

Expand Down Expand Up @@ -136,7 +139,8 @@
5. **跨页连续性**
- 一道题的内容可能跨越两页。如果某页开头的内容块没有新题号,它属于上一页最后一道题
- 典型情况:页首出现 A/B/C/D 选项但无题号 → 是上一题的选项;页首出现(1)(2)小问但无大题号 → 是上一题的子问
- 不要把这类无题号的跨页内容创建为新题目
- 不要把这类无题号的跨页续写内容创建为新题目
- 但如果无题号内容出现在 `is_primary: true` 页内,且是独立的公式题、图片题、计算表达式或完整题干,不要忽略;无法归入前后题时应输出为 `"未编号-N"` 独立题
- **页首图片的归属(极其重要)**:若某页最先出现的 block 是 image,且该图片**之前没有出现任何阿拉伯数字题号**,则该图片**属于上一道题**,必须将其加入上一题的 content_blocks 末尾,而不是归入本页第一道题。
- 判断依据:图片归属看的是"该图片出现在哪道题的题号之后"。图片在题号之前 → 归上一题;图片在题号之后 → 归该题。
- 即使本页第一道题的题干中含"如图所示",也不能把题号之前出现的图片归给它。
Expand Down
54 changes: 29 additions & 25 deletions backend/src/workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -428,6 +428,13 @@ def _fix_leading_images(questions: List[Dict[str, Any]]) -> None:

# 移到前一道题末尾
prev_q = questions[i - 1]
if q.get("section_title") != prev_q.get("section_title"):
logger.info(
f"跳过 leading image 跨 section 移动: 题目 {q.get('question_id')} "
f"section={repr(q.get('section_title'))} 前题 {prev_q.get('question_id')} "
f"section={repr(prev_q.get('section_title'))}"
)
continue
prev_blocks = prev_q.get("content_blocks") or []
prev_q["content_blocks"] = prev_blocks + leading_images
q["content_blocks"] = rest
Expand Down Expand Up @@ -539,6 +546,7 @@ def _dedup_questions(questions: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
qid = q.get("question_id")
if qid is not None:
q["question_id"] = str(qid).strip()
original_order = {id(q): idx for idx, q in enumerate(questions)}

# ── 第一轮:按 (section, qid) 复合键去重 ──────────────────
groups: Dict[tuple, List[Dict[str, Any]]] = defaultdict(list)
Expand All @@ -558,10 +566,8 @@ def _dedup_questions(questions: List[Dict[str, Any]]) -> List[Dict[str, Any]]:

# ── 第二轮:结构优先 + 内容相似度去重 ───────────────────────
# 策略:
# 1. 同一 qid 下同时存在有 section 和无 section 版本 → 直接丢弃所有 section=None 版本
# (section=None 说明该批次只捕获到选项/部分内容,有 section 的是完整版本)
# 2. 同一 qid 下均有 section(不同 section)→ difflib 相似度 ≥ 0.75 视为重复,保留最优
# 3. 同一 qid 下均无 section → 保留内容最丰富的一份
# 同一 qid 下不再因 section_title 是否存在直接丢弃候选。
# 只有内容相似度 ≥ 0.75 才视为重复;否则视为不同大题/不同区域下的同号题,全部保留。
by_qid: Dict[str, List[Dict[str, Any]]] = defaultdict(list)
for q in after_round1:
by_qid[q.get("question_id", "")].append(q)
Expand All @@ -577,24 +583,11 @@ def _dedup_questions(questions: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
final.extend(entries)
continue

sectioned = [q for q in entries if q.get("section_title")]
unsectioned = [q for q in entries if not q.get("section_title")]

# 策略1:有 section 版本存在时,丢弃全部 section=None 版本
if sectioned and unsectioned:
round2_removed += len(unsectioned)
for q in unsectioned:
logger.debug(
f"二次去重剔除(结构): qid={qid} "
f"section=None,保留有大题标题的版本,richness={_question_richness(q)}"
)
entries = sectioned

if len(entries) == 1:
final.extend(entries)
continue

# 策略2/3:全部有 section 或全部无 section → difflib 去重
# 同题号候选统一进入相似度去重;不同内容即使 section_title 缺失也保留。
entries.sort(key=lambda q: (
0 if q.get("section_title") else 1,
-_question_richness(q)
Expand All @@ -618,20 +611,21 @@ def _dedup_questions(questions: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
final.extend(kept)

if round2_removed:
logger.info(f"二次去重: 剔除 {round2_removed} 道重复题目(结构优先 + 相似度阈值={SIMILARITY_THRESHOLD})")
logger.info(f"二次去重: 剔除 {round2_removed} 道重复题目(相似度阈值={SIMILARITY_THRESHOLD})")
console.print(f"[yellow]二次去重: 剔除 {round2_removed} 道重复题目[/yellow]")

# ── 排序: section 的题按首次出现顺序 + 题号,section=None 的题排最后 ──
# ── 排序:按原始出现位置排列;同一 section 内按题号稳定排序 ──
section_order: Dict[str, int] = {}
for q in questions:
for idx, q in enumerate(questions):
s = q.get("section_title")
if s and s not in section_order:
section_order[s] = len(section_order)
section_order[s] = idx

final.sort(key=lambda q: (
0 if q.get("section_title") else 1, # 有 section 的排前,None 排后
section_order.get(q.get("section_title") or "", 999),
_sort_key(q.get("question_id", ""))
section_order.get(q.get("section_title") or "", original_order.get(id(q), 999999)),
0 if q.get("section_title") else 1,
_sort_key(q.get("question_id", "")),
original_order.get(id(q), 999999),
))
return final

Expand Down Expand Up @@ -815,13 +809,23 @@ def _invoke_split(batch_idx: int, batch_data: list) -> None:
split_elapsed = time.time() - split_start
logger.info(f"并行分割完成, 耗时 {split_elapsed:.2f}s")

raw_agent_output_path = os.path.join(results_dir, "questions_agent_raw.json")
with open(raw_agent_output_path, 'w', encoding='utf-8') as f:
json.dump(batch_results, f, ensure_ascii=False, indent=2)
logger.info(f"已保存 split agent 原始输出: {raw_agent_output_path}")

# ── Step 6: 跨批次 section 传播 + 合并 + 去重 ──
_propagate_section_between_batches(batch_results)

all_questions = []
for questions in batch_results:
all_questions.extend(questions)

before_dedup_path = os.path.join(results_dir, "questions_before_dedup.json")
with open(before_dedup_path, 'w', encoding='utf-8') as f:
json.dump(all_questions, f, ensure_ascii=False, indent=2)
logger.info(f"已保存去重前题目列表: {before_dedup_path}")

before_dedup = len(all_questions)
deduped = _dedup_questions(all_questions)
after_dedup = len(deduped)
Expand Down
80 changes: 76 additions & 4 deletions backend/tests/test_workflow_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
_build_overlapping_batches,
_run_ocr_and_simplify,
_dedup_questions,
_fix_leading_images,
_question_richness,
_sort_key,
_extract_text_sample,
Expand Down Expand Up @@ -407,25 +408,25 @@ def test_skip_missing_id(self):
assert len(result) == 1

def test_sorted_output(self):
    """Output keeps the original order of appearance.

    Questions without a section title must not be re-sorted by id,
    otherwise they end up in the wrong position in the result.
    """
    qs = [
        {"question_id": "3", "content_blocks": []},
        {"question_id": "1", "content_blocks": []},
        {"question_id": "2", "content_blocks": []},
    ]
    result = _dedup_questions(qs)
    # Input order, not numeric id order, is the expected contract here.
    assert [q["question_id"] for q in result] == ["3", "1", "2"]

def test_mixed_id_types_sorted(self):
    """Mixed numeric and alphabetic ids also keep their original order."""
    qs = [
        {"question_id": "B", "content_blocks": []},
        {"question_id": "2", "content_blocks": []},
        {"question_id": "A", "content_blocks": []},
        {"question_id": "1", "content_blocks": []},
    ]
    result = _dedup_questions(qs)
    # Input order is preserved regardless of whether the id is a digit or a letter.
    assert [q["question_id"] for q in result] == ["B", "2", "A", "1"]

def test_richness_with_options(self):
"""options 也计入 richness"""
Expand All @@ -440,6 +441,77 @@ def test_richness_with_options(self):
# q_with_opt: 2 + 19 = 21 > q_no_opt: 3
assert result[0].get("options") is not None

def test_same_id_unsectioned_different_content_is_kept(self):
    """Same question id with clearly different content keeps both entries.

    One candidate has no section_title; that alone must not cause it to
    be dropped during deduplication.
    """
    equation_question = {
        "question_id": "3",
        "section_title": None,
        "content_blocks": [
            {"block_type": "text", "content": "3. 解方程。(6分)"},
            {"block_type": "text", "content": r"$$\frac{3}{4}x-40\%x=1.4$$"},
        ],
    }
    word_problem = {
        "question_id": "3",
        "section_title": "六、解决问题。(共27分)",
        "content_blocks": [
            {"block_type": "text", "content": "3. 建一座污水处理池用了 48 万元,原计划投资多少万元?"},
        ],
    }

    result = _dedup_questions([equation_question, word_problem])

    titles = [q.get("section_title") for q in result]
    assert len(result) == 2
    # The section-less candidate must still be present in the output.
    assert any(title is None for title in titles)


class TestFixLeadingImages:
    """Tests for _fix_leading_images."""

    @staticmethod
    def _text(content):
        """Build a text content block."""
        return {"block_type": "text", "content": content}

    @staticmethod
    def _image(path):
        """Build an image content block."""
        return {"block_type": "image", "content": path}

    def test_does_not_move_image_across_sections(self):
        """A leading image stays put when the previous question has a different section."""
        image = self._image("/images/q1.jpg")
        previous = {
            "question_id": "3",
            "section_title": None,
            "content_blocks": [self._text("3. 解方程")],
        }
        current = {
            "question_id": "1",
            "section_title": "五、看图列式",
            "content_blocks": [image, self._text("2.")],
        }
        questions = [previous, current]

        _fix_leading_images(questions)

        # Previous question is untouched; the image is still first in the current one.
        assert questions[0]["content_blocks"] == [{"block_type": "text", "content": "3. 解方程"}]
        assert questions[1]["content_blocks"][0] == {"block_type": "image", "content": "/images/q1.jpg"}

    def test_moves_image_within_same_section(self):
        """A leading image is appended to the previous question of the same section."""
        section = "六、解决问题"
        previous = {
            "question_id": "1",
            "section_title": section,
            "content_blocks": [self._text("1. 上一题")],
        }
        current = {
            "question_id": "2",
            "section_title": section,
            "content_blocks": [
                self._image("/images/prev.jpg"),
                self._text("2. 下一题"),
            ],
        }
        questions = [previous, current]

        _fix_leading_images(questions)

        # The image moved to the tail of the previous question; only text remains here.
        assert questions[0]["content_blocks"][-1] == {"block_type": "image", "content": "/images/prev.jpg"}
        assert questions[1]["content_blocks"] == [{"block_type": "text", "content": "2. 下一题"}]


# ═══════════════════════════════════════════════════════════
# _extract_text_sample
Expand Down