Skip to content

Commit 7139da7

Browse files
author
saccharin98
committed
fix: use hashing to avoid duplicate doc-name collisions
1 parent 2a15587 commit 7139da7

6 files changed

Lines changed: 106 additions & 20 deletions

File tree

openkb/cli.py

Lines changed: 7 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -154,14 +154,14 @@ def _add_single_file(file_path: Path, kb_dir: Path) -> None:
154154
click.echo(f" [SKIP] Already in knowledge base: {file_path.name}")
155155
return
156156

157-
doc_name = file_path.stem
157+
doc_name = result.doc_name or file_path.stem
158158

159159
# 3/4. Index and compile
160160
if result.is_long_doc:
161161
click.echo(f" Long document detected — indexing with PageIndex...")
162162
try:
163163
from openkb.indexer import index_long_document
164-
index_result = index_long_document(result.raw_path, kb_dir)
164+
index_result = index_long_document(result.raw_path, kb_dir, doc_name=doc_name)
165165
except Exception as exc:
166166
click.echo(f" [ERROR] Indexing failed: {exc}")
167167
logger.debug("Indexing traceback:", exc_info=True)
@@ -202,7 +202,11 @@ def _add_single_file(file_path: Path, kb_dir: Path) -> None:
202202
# Register hash only after successful compilation
203203
if result.file_hash:
204204
doc_type = "long_pdf" if result.is_long_doc else file_path.suffix.lstrip(".")
205-
registry.add(result.file_hash, {"name": file_path.name, "type": doc_type})
205+
registry.add(result.file_hash, {
206+
"name": file_path.name,
207+
"doc_name": doc_name,
208+
"type": doc_type,
209+
})
206210

207211
append_log(kb_dir / "wiki", "ingest", file_path.name)
208212
click.echo(f" [OK] {file_path.name} added to knowledge base.")

openkb/converter.py

Lines changed: 41 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -2,8 +2,10 @@
22
from __future__ import annotations
33

44
import logging
5+
import re
56
import shutil
6-
from dataclasses import dataclass, field
7+
import unicodedata
8+
from dataclasses import dataclass
79
from pathlib import Path
810

911
import pymupdf
@@ -25,6 +27,20 @@ class ConvertResult:
2527
is_long_doc: bool = False
2628
skipped: bool = False
2729
file_hash: str | None = None # For deferred hash registration
30+
doc_name: str | None = None
31+
32+
33+
_SAFE_STEM_RE = re.compile(r"[^\w\-]+")
34+
_DOC_HASH_LEN = 10
35+
36+
37+
def _make_doc_name(src: Path, file_hash: str) -> str:
38+
"""Return the stable internal document name for a source file."""
39+
stem = unicodedata.normalize("NFKC", src.stem)
40+
safe_stem = _SAFE_STEM_RE.sub("-", stem).strip("-")
41+
if not safe_stem:
42+
safe_stem = "document"
43+
return f"{safe_stem}-{file_hash[:_DOC_HASH_LEN]}"
2844

2945

3046
def get_pdf_page_count(path: Path) -> int:
@@ -56,17 +72,25 @@ def convert_document(src: Path, kb_dir: Path) -> ConvertResult:
5672
# 1. Hash check
5773
# ------------------------------------------------------------------
5874
file_hash = HashRegistry.hash_file(src)
75+
doc_name = _make_doc_name(src, file_hash)
5976
if registry.is_known(file_hash):
6077
logger.info("Skipping already-known file: %s", src.name)
61-
return ConvertResult(skipped=True)
78+
metadata = registry.get(file_hash) or {}
79+
return ConvertResult(
80+
skipped=True,
81+
file_hash=file_hash,
82+
doc_name=metadata.get("doc_name", doc_name),
83+
)
6284

6385
# ------------------------------------------------------------------
6486
# 2. Copy to raw/
6587
# ------------------------------------------------------------------
6688
raw_dir = kb_dir / "raw"
6789
raw_dir.mkdir(parents=True, exist_ok=True)
68-
raw_dest = raw_dir / src.name
69-
if raw_dest.resolve() != src.resolve():
90+
if src.resolve().is_relative_to(raw_dir.resolve()):
91+
raw_dest = src
92+
else:
93+
raw_dest = raw_dir / f"{doc_name}{src.suffix.lower()}"
7094
shutil.copy2(src, raw_dest)
7195

7296
# ------------------------------------------------------------------
@@ -81,18 +105,21 @@ def convert_document(src: Path, kb_dir: Path) -> ConvertResult:
81105
threshold,
82106
src.name,
83107
)
84-
return ConvertResult(raw_path=raw_dest, is_long_doc=True, file_hash=file_hash)
108+
return ConvertResult(
109+
raw_path=raw_dest,
110+
doc_name=doc_name,
111+
is_long_doc=True,
112+
file_hash=file_hash,
113+
)
85114

86115
# ------------------------------------------------------------------
87116
# 4/5. Convert to Markdown
88117
# ------------------------------------------------------------------
89118
sources_dir = kb_dir / "wiki" / "sources"
90119
sources_dir.mkdir(parents=True, exist_ok=True)
91-
images_dir = kb_dir / "wiki" / "sources" / "images" / src.stem
120+
images_dir = kb_dir / "wiki" / "sources" / "images" / doc_name
92121
images_dir.mkdir(parents=True, exist_ok=True)
93122

94-
doc_name = src.stem
95-
96123
if src.suffix.lower() == ".md":
97124
markdown = src.read_text(encoding="utf-8")
98125
markdown = copy_relative_images(markdown, src.parent, doc_name, images_dir)
@@ -109,4 +136,9 @@ def convert_document(src: Path, kb_dir: Path) -> ConvertResult:
109136
dest_md = sources_dir / f"{doc_name}.md"
110137
dest_md.write_text(markdown, encoding="utf-8")
111138

112-
return ConvertResult(raw_path=raw_dest, source_path=dest_md, file_hash=file_hash)
139+
return ConvertResult(
140+
raw_path=raw_dest,
141+
source_path=dest_md,
142+
doc_name=doc_name,
143+
file_hash=file_hash,
144+
)

openkb/indexer.py

Lines changed: 9 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -26,7 +26,7 @@ class IndexResult:
2626
tree: dict
2727

2828

29-
def index_long_document(pdf_path: Path, kb_dir: Path) -> IndexResult:
29+
def index_long_document(pdf_path: Path, kb_dir: Path, doc_name: str | None = None) -> IndexResult:
3030
"""Index a long PDF document using PageIndex and write wiki pages."""
3131
openkb_dir = kb_dir / ".openkb"
3232
config = load_config(openkb_dir / "config.yaml")
@@ -63,36 +63,37 @@ def index_long_document(pdf_path: Path, kb_dir: Path) -> IndexResult:
6363

6464
# Fetch complete document (metadata + structure + text)
6565
doc = col.get_document(doc_id, include_text=True)
66-
doc_name: str = doc.get("doc_name", pdf_path.stem)
66+
indexed_doc_name: str = doc.get("doc_name", pdf_path.stem)
6767
description: str = doc.get("doc_description", "")
6868
structure: list = doc.get("structure", [])
69+
source_name = doc_name or pdf_path.stem
6970

7071
# Debug: print doc keys and page_count to diagnose get_page_content range
7172
logger.info("Doc keys: %s", list(doc.keys()))
7273
logger.info("page_count from doc: %s", doc.get("page_count", "NOT PRESENT"))
7374

7475
tree = {
75-
"doc_name": doc_name,
76+
"doc_name": indexed_doc_name,
7677
"doc_description": description,
7778
"structure": structure,
7879
}
7980

8081
# Write wiki/sources/ — extract per-page content with pymupdf (not PageIndex)
8182
sources_dir = kb_dir / "wiki" / "sources"
8283
sources_dir.mkdir(parents=True, exist_ok=True)
83-
images_dir = sources_dir / "images" / pdf_path.stem
84+
images_dir = sources_dir / "images" / source_name
8485

8586
from openkb.images import convert_pdf_to_pages
86-
all_pages = convert_pdf_to_pages(pdf_path, pdf_path.stem, images_dir)
87+
all_pages = convert_pdf_to_pages(pdf_path, source_name, images_dir)
8788

88-
(sources_dir / f"{pdf_path.stem}.json").write_text(
89+
(sources_dir / f"{source_name}.json").write_text(
8990
json_mod.dumps(all_pages, ensure_ascii=False, indent=2), encoding="utf-8",
9091
)
9192

9293
# Write wiki/summaries/ (no images, just summaries)
9394
summaries_dir = kb_dir / "wiki" / "summaries"
9495
summaries_dir.mkdir(parents=True, exist_ok=True)
95-
summary_md = render_summary_md(tree, pdf_path.stem, doc_id)
96-
(summaries_dir / f"{pdf_path.stem}.md").write_text(summary_md, encoding="utf-8")
96+
summary_md = render_summary_md(tree, source_name, doc_id)
97+
(summaries_dir / f"{source_name}.md").write_text(summary_md, encoding="utf-8")
9798

9899
return IndexResult(doc_id=doc_id, description=description, tree=tree)

tests/test_add_command.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -139,7 +139,9 @@ def test_add_short_doc_runs_compiler(self, tmp_path):
139139
mock_result = ConvertResult(
140140
raw_path=kb_dir / "raw" / "test.md",
141141
source_path=source_path,
142+
doc_name="test-deadbeef00",
142143
is_long_doc=False,
144+
file_hash="deadbeef00" * 8,
143145
)
144146

145147
runner = CliRunner()
@@ -149,3 +151,6 @@ def test_add_short_doc_runs_compiler(self, tmp_path):
149151
result = runner.invoke(cli, ["add", str(doc)])
150152
mock_arun.assert_called_once()
151153
assert "OK" in result.output
154+
155+
hashes = json.loads((kb_dir / ".openkb" / "hashes.json").read_text())
156+
assert hashes[mock_result.file_hash]["doc_name"] == "test-deadbeef00"

tests/test_converter.py

Lines changed: 25 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -42,7 +42,9 @@ def test_md_file_copied_to_wiki_sources(self, kb_dir):
4242

4343
assert result.skipped is False
4444
assert result.is_long_doc is False
45+
assert result.doc_name == f"notes-{result.file_hash[:10]}"
4546
assert result.source_path is not None
47+
assert result.source_path.name == f"{result.doc_name}.md"
4648
assert result.source_path.exists()
4749
assert result.source_path.read_text(encoding="utf-8").startswith("# Notes")
4850

@@ -72,8 +74,31 @@ def test_md_raw_file_copied(self, kb_dir):
7274
result = convert_document(src, kb_dir)
7375

7476
assert result.raw_path is not None
77+
assert result.raw_path.name == f"{result.doc_name}.md"
7578
assert result.raw_path.exists()
7679

80+
def test_same_filename_different_content_gets_distinct_outputs(self, kb_dir):
81+
"""Files with the same basename must not overwrite wiki artifacts."""
82+
first_dir = kb_dir / "inputs" / "first"
83+
second_dir = kb_dir / "inputs" / "second"
84+
first_dir.mkdir(parents=True)
85+
second_dir.mkdir(parents=True)
86+
first = first_dir / "report.md"
87+
second = second_dir / "report.md"
88+
first.write_text("# First\n\nAlpha content.", encoding="utf-8")
89+
second.write_text("# Second\n\nBeta content.", encoding="utf-8")
90+
91+
first_result = convert_document(first, kb_dir)
92+
second_result = convert_document(second, kb_dir)
93+
94+
assert first_result.doc_name != second_result.doc_name
95+
assert first_result.source_path != second_result.source_path
96+
assert first_result.raw_path != second_result.raw_path
97+
assert first_result.source_path.read_text(encoding="utf-8").startswith("# First")
98+
assert second_result.source_path.read_text(encoding="utf-8").startswith("# Second")
99+
assert first_result.raw_path.read_text(encoding="utf-8").startswith("# First")
100+
assert second_result.raw_path.read_text(encoding="utf-8").startswith("# Second")
101+
77102

78103
# ---------------------------------------------------------------------------
79104
# convert_document — PDF short doc

tests/test_indexer.py

Lines changed: 19 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -102,6 +102,25 @@ def test_summary_page_written(self, kb_dir, sample_tree, tmp_path):
102102
assert "doc_type: pageindex" in content
103103
assert "Summary:" in content
104104

105+
def test_explicit_doc_name_controls_output_paths(self, kb_dir, sample_tree, tmp_path):
106+
"""Long doc outputs should use the converter's hash-suffixed doc name."""
107+
doc_id = "abc-123"
108+
fake_col = self._make_fake_collection(doc_id, sample_tree)
109+
110+
fake_client = MagicMock()
111+
fake_client.collection.return_value = fake_col
112+
113+
pdf_path = tmp_path / "sample.pdf"
114+
pdf_path.write_bytes(b"%PDF-1.4 fake")
115+
116+
with patch("openkb.indexer.PageIndexClient", return_value=fake_client), \
117+
patch("openkb.images.convert_pdf_to_pages", return_value=self._fake_pages()):
118+
index_long_document(pdf_path, kb_dir, doc_name="sample-deadbeef00")
119+
120+
assert (kb_dir / "wiki" / "sources" / "sample-deadbeef00.json").exists()
121+
assert (kb_dir / "wiki" / "summaries" / "sample-deadbeef00.md").exists()
122+
assert not (kb_dir / "wiki" / "sources" / "sample.json").exists()
123+
105124
def test_localclient_called_with_index_config(self, kb_dir, sample_tree, tmp_path):
106125
"""LocalClient must be created with the correct IndexConfig flags."""
107126
doc_id = "xyz-456"

Comments (0 commit comments)