22from __future__ import annotations
33
44import logging
5+ import re
56import shutil
6- from dataclasses import dataclass , field
7+ import unicodedata
8+ from dataclasses import dataclass
79from pathlib import Path
810
911import pymupdf
@@ -25,6 +27,20 @@ class ConvertResult:
2527 is_long_doc : bool = False
2628 skipped : bool = False
2729 file_hash : str | None = None # For deferred hash registration
30+ doc_name : str | None = None
31+
32+
33+ _SAFE_STEM_RE = re .compile (r"[^\w\-]+" )
34+ _DOC_HASH_LEN = 10
35+
36+
37+ def _make_doc_name (src : Path , file_hash : str ) -> str :
38+ """Return the stable internal document name for a source file."""
39+ stem = unicodedata .normalize ("NFKC" , src .stem )
40+ safe_stem = _SAFE_STEM_RE .sub ("-" , stem ).strip ("-" )
41+ if not safe_stem :
42+ safe_stem = "document"
43+ return f"{ safe_stem } -{ file_hash [:_DOC_HASH_LEN ]} "
2844
2945
3046def get_pdf_page_count (path : Path ) -> int :
@@ -56,17 +72,25 @@ def convert_document(src: Path, kb_dir: Path) -> ConvertResult:
5672 # 1. Hash check
5773 # ------------------------------------------------------------------
5874 file_hash = HashRegistry .hash_file (src )
75+ doc_name = _make_doc_name (src , file_hash )
5976 if registry .is_known (file_hash ):
6077 logger .info ("Skipping already-known file: %s" , src .name )
61- return ConvertResult (skipped = True )
78+ metadata = registry .get (file_hash ) or {}
79+ return ConvertResult (
80+ skipped = True ,
81+ file_hash = file_hash ,
82+ doc_name = metadata .get ("doc_name" , doc_name ),
83+ )
6284
6385 # ------------------------------------------------------------------
6486 # 2. Copy to raw/
6587 # ------------------------------------------------------------------
6688 raw_dir = kb_dir / "raw"
6789 raw_dir .mkdir (parents = True , exist_ok = True )
68- raw_dest = raw_dir / src .name
69- if raw_dest .resolve () != src .resolve ():
90+ if src .resolve ().is_relative_to (raw_dir .resolve ()):
91+ raw_dest = src
92+ else :
93+ raw_dest = raw_dir / f"{ doc_name } { src .suffix .lower ()} "
7094 shutil .copy2 (src , raw_dest )
7195
7296 # ------------------------------------------------------------------
@@ -81,18 +105,21 @@ def convert_document(src: Path, kb_dir: Path) -> ConvertResult:
81105 threshold ,
82106 src .name ,
83107 )
84- return ConvertResult (raw_path = raw_dest , is_long_doc = True , file_hash = file_hash )
108+ return ConvertResult (
109+ raw_path = raw_dest ,
110+ doc_name = doc_name ,
111+ is_long_doc = True ,
112+ file_hash = file_hash ,
113+ )
85114
86115 # ------------------------------------------------------------------
87116 # 4/5. Convert to Markdown
88117 # ------------------------------------------------------------------
89118 sources_dir = kb_dir / "wiki" / "sources"
90119 sources_dir .mkdir (parents = True , exist_ok = True )
91- images_dir = kb_dir / "wiki" / "sources" / "images" / src . stem
120+ images_dir = kb_dir / "wiki" / "sources" / "images" / doc_name
92121 images_dir .mkdir (parents = True , exist_ok = True )
93122
94- doc_name = src .stem
95-
96123 if src .suffix .lower () == ".md" :
97124 markdown = src .read_text (encoding = "utf-8" )
98125 markdown = copy_relative_images (markdown , src .parent , doc_name , images_dir )
@@ -109,4 +136,9 @@ def convert_document(src: Path, kb_dir: Path) -> ConvertResult:
109136 dest_md = sources_dir / f"{ doc_name } .md"
110137 dest_md .write_text (markdown , encoding = "utf-8" )
111138
112- return ConvertResult (raw_path = raw_dest , source_path = dest_md , file_hash = file_hash )
139+ return ConvertResult (
140+ raw_path = raw_dest ,
141+ source_path = dest_md ,
142+ doc_name = doc_name ,
143+ file_hash = file_hash ,
144+ )
0 commit comments