11"""Document conversion pipeline for OpenKB."""
22from __future__ import annotations
33
4+ import hashlib
45import logging
56import re
67import shutil
@@ -31,16 +32,31 @@ class ConvertResult:
3132
3233
# Matches each run of characters that are neither word characters nor hyphens;
# _make_doc_name replaces every such run in a file stem with a single "-".
3334_SAFE_STEM_RE = re .compile (r"[^\w\-]+" )
# Number of leading hex digits of the hash kept as the document-name suffix.
34- _DOC_HASH_LEN = 10
35+ _DOC_HASH_LEN = 12
3536
3637
37- def _make_doc_name (src : Path , file_hash : str ) -> str :
38+ def _registry_path (path : Path , kb_dir : Path ) -> str :
39+ """Return the portable path key stored in the hash registry."""
40+ resolved_path = path .resolve ()
41+ resolved_kb = kb_dir .resolve ()
42+ if resolved_path .is_relative_to (resolved_kb ):
43+ return resolved_path .relative_to (resolved_kb ).as_posix ()
44+ return resolved_path .as_posix ()
45+
46+
def _path_hash(src: Path, kb_dir: Path) -> str:
    """Hash the registry path of *src*; the file's content is never read.

    Returns the SHA-256 hex digest of the UTF-8 encoded key produced by
    ``_registry_path(src, kb_dir)``.
    """
    encoded_key = _registry_path(src, kb_dir).encode("utf-8")
    digest = hashlib.sha256(encoded_key)
    return digest.hexdigest()
51+
52+
def _make_doc_name(src: Path, kb_dir: Path) -> str:
    """Derive the stable internal document name for a source file.

    The name is ``<slug>-<suffix>``: the slug is the NFKC-normalized file stem
    with every run of characters other than word characters and hyphens
    replaced by ``-`` and leading/trailing hyphens stripped (``"document"``
    when nothing remains); the suffix is the first ``_DOC_HASH_LEN``
    characters of ``_path_hash(src, kb_dir)``.
    """
    normalized = unicodedata.normalize("NFKC", src.stem)
    # An empty slug (stem made only of unsafe characters) gets a placeholder.
    slug = _SAFE_STEM_RE.sub("-", normalized).strip("-") or "document"
    suffix = _path_hash(src, kb_dir)[:_DOC_HASH_LEN]
    return f"{slug}-{suffix}"
4460
4561
4662def get_pdf_page_count (path : Path ) -> int :
@@ -53,7 +69,7 @@ def convert_document(src: Path, kb_dir: Path) -> ConvertResult:
5369 """Convert a document and integrate it into the knowledge base.
5470
5571 Steps:
56- 1. Hash-check — skip if already known.
72+ 1. Hash-check — skip if this exact content is already known.
5773 2. Copy source to ``raw/``.
5874 3. If PDF and page count >= threshold → return :attr:`ConvertResult.is_long_doc`.
5975 4. If ``.md`` — read, process relative images, save to ``wiki/sources/``.
@@ -72,14 +88,16 @@ def convert_document(src: Path, kb_dir: Path) -> ConvertResult:
7288 # 1. Hash check
7389 # ------------------------------------------------------------------
7490 file_hash = HashRegistry .hash_file (src )
75- doc_name = _make_doc_name (src , file_hash )
91+ path_key = _registry_path (src , kb_dir )
92+ path_metadata = registry .get_by_path (path_key ) or {}
93+ doc_name = path_metadata .get ("doc_name" ) or _make_doc_name (src , kb_dir )
7694 if registry .is_known (file_hash ):
7795 logger .info ("Skipping already-known file: %s" , src .name )
7896 metadata = registry .get (file_hash ) or {}
7997 return ConvertResult (
8098 skipped = True ,
8199 file_hash = file_hash ,
82- doc_name = metadata .get ("doc_name" , doc_name ) ,
100+ doc_name = metadata .get ("doc_name" ) or doc_name ,
83101 )
84102
85103 # ------------------------------------------------------------------
0 commit comments