Skip to content

Commit 7139da7

Browse files
author
saccharin98
committed
fix: use hashing to avoid duplicate doc-name collisions
1 parent 2a15587 commit 7139da7

6 files changed

Lines changed: 106 additions & 20 deletions

File tree

openkb/cli.py

Lines changed: 7 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -154,14 +154,14 @@ def _add_single_file(file_path: Path, kb_dir: Path) -> None:
154154
click.echo(f" [SKIP] Already in knowledge base: {file_path.name}")
155155
return
156156

157-
doc_name = file_path.stem
157+
doc_name = result.doc_name or file_path.stem
158158

159159
# 3/4. Index and compile
160160
if result.is_long_doc:
161161
click.echo(f" Long document detected — indexing with PageIndex...")
162162
try:
163163
from openkb.indexer import index_long_document
164-
index_result = index_long_document(result.raw_path, kb_dir)
164+
index_result = index_long_document(result.raw_path, kb_dir, doc_name=doc_name)
165165
except Exception as exc:
166166
click.echo(f" [ERROR] Indexing failed: {exc}")
167167
logger.debug("Indexing traceback:", exc_info=True)
@@ -202,7 +202,11 @@ def _add_single_file(file_path: Path, kb_dir: Path) -> None:
202202
# Register hash only after successful compilation
203203
if result.file_hash:
204204
doc_type = "long_pdf" if result.is_long_doc else file_path.suffix.lstrip(".")
205-
registry.add(result.file_hash, {"name": file_path.name, "type": doc_type})
205+
registry.add(result.file_hash, {
206+
"name": file_path.name,
207+
"doc_name": doc_name,
208+
"type": doc_type,
209+
})
206210

207211
append_log(kb_dir / "wiki", "ingest", file_path.name)
208212
click.echo(f" [OK] {file_path.name} added to knowledge base.")

openkb/converter.py

Lines changed: 41 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -2,8 +2,10 @@
22
from __future__ import annotations
33

44
import logging
5+
import re
56
import shutil
6-
from dataclasses import dataclass, field
7+
import unicodedata
8+
from dataclasses import dataclass
79
from pathlib import Path
810

911
import pymupdf
@@ -25,6 +27,20 @@ class ConvertResult:
2527
is_long_doc: bool = False
2628
skipped: bool = False
2729
file_hash: str | None = None # For deferred hash registration
30+
doc_name: str | None = None
31+
32+
33+
_SAFE_STEM_RE = re.compile(r"[^\w\-]+")
34+
_DOC_HASH_LEN = 10
35+
36+
37+
def _make_doc_name(src: Path, file_hash: str) -> str:
38+
"""Return the stable internal document name for a source file."""
39+
stem = unicodedata.normalize("NFKC", src.stem)
40+
safe_stem = _SAFE_STEM_RE.sub("-", stem).strip("-")
41+
if not safe_stem:
42+
safe_stem = "document"
43+
return f"{safe_stem}-{file_hash[:_DOC_HASH_LEN]}"
2844

2945

3046
def get_pdf_page_count(path: Path) -> int:
@@ -56,17 +72,25 @@ def convert_document(src: Path, kb_dir: Path) -> ConvertResult:
5672
# 1. Hash check
5773
# ------------------------------------------------------------------
5874
file_hash = HashRegistry.hash_file(src)
75+
doc_name = _make_doc_name(src, file_hash)
5976
if registry.is_known(file_hash):
6077
logger.info("Skipping already-known file: %s", src.name)
61-
return ConvertResult(skipped=True)
78+
metadata = registry.get(file_hash) or {}
79+
return ConvertResult(
80+
skipped=True,
81+
file_hash=file_hash,
82+
doc_name=metadata.get("doc_name", doc_name),
83+
)
6284

6385
# ------------------------------------------------------------------
6486
# 2. Copy to raw/
6587
# ------------------------------------------------------------------
6688
raw_dir = kb_dir / "raw"
6789
raw_dir.mkdir(parents=True, exist_ok=True)
68-
raw_dest = raw_dir / src.name
69-
if raw_dest.resolve() != src.resolve():
90+
if src.resolve().is_relative_to(raw_dir.resolve()):
91+
raw_dest = src
92+
else:
93+
raw_dest = raw_dir / f"{doc_name}{src.suffix.lower()}"
7094
shutil.copy2(src, raw_dest)
7195

7296
# ------------------------------------------------------------------
@@ -81,18 +105,21 @@ def convert_document(src: Path, kb_dir: Path) -> ConvertResult:
81105
threshold,
82106
src.name,
83107
)
84-
return ConvertResult(raw_path=raw_dest, is_long_doc=True, file_hash=file_hash)
108+
return ConvertResult(
109+
raw_path=raw_dest,
110+
doc_name=doc_name,
111+
is_long_doc=True,
112+
file_hash=file_hash,
113+
)
85114

86115
# ------------------------------------------------------------------
87116
# 4/5. Convert to Markdown
88117
# ------------------------------------------------------------------
89118
sources_dir = kb_dir / "wiki" / "sources"
90119
sources_dir.mkdir(parents=True, exist_ok=True)
91-
images_dir = kb_dir / "wiki" / "sources" / "images" / src.stem
120+
images_dir = kb_dir / "wiki" / "sources" / "images" / doc_name
92121
images_dir.mkdir(parents=True, exist_ok=True)
93122

94-
doc_name = src.stem
95-
96123
if src.suffix.lower() == ".md":
97124
markdown = src.read_text(encoding="utf-8")
98125
markdown = copy_relative_images(markdown, src.parent, doc_name, images_dir)
@@ -109,4 +136,9 @@ def convert_document(src: Path, kb_dir: Path) -> ConvertResult:
109136
dest_md = sources_dir / f"{doc_name}.md"
110137
dest_md.write_text(markdown, encoding="utf-8")
111138

112-
return ConvertResult(raw_path=raw_dest, source_path=dest_md, file_hash=file_hash)
139+
return ConvertResult(
140+
raw_path=raw_dest,
141+
source_path=dest_md,
142+
doc_name=doc_name,
143+
file_hash=file_hash,
144+
)

openkb/indexer.py

Lines changed: 9 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -26,7 +26,7 @@ class IndexResult:
2626
tree: dict
2727

2828

29-
def index_long_document(pdf_path: Path, kb_dir: Path) -> IndexResult:
29+
def index_long_document(pdf_path: Path, kb_dir: Path, doc_name: str | None = None) -> IndexResult:
3030
"""Index a long PDF document using PageIndex and write wiki pages."""
3131
openkb_dir = kb_dir / ".openkb"
3232
config = load_config(openkb_dir / "config.yaml")
@@ -63,36 +63,37 @@ def index_long_document(pdf_path: Path, kb_dir: Path) -> IndexResult:
6363

6464
# Fetch complete document (metadata + structure + text)
6565
doc = col.get_document(doc_id, include_text=True)
66-
doc_name: str = doc.get("doc_name", pdf_path.stem)
66+
indexed_doc_name: str = doc.get("doc_name", pdf_path.stem)
6767
description: str = doc.get("doc_description", "")
6868
structure: list = doc.get("structure", [])
69+
source_name = doc_name or pdf_path.stem
6970

7071
# Debug: print doc keys and page_count to diagnose get_page_content range
7172
logger.info("Doc keys: %s", list(doc.keys()))
7273
logger.info("page_count from doc: %s", doc.get("page_count", "NOT PRESENT"))
7374

7475
tree = {
75-
"doc_name": doc_name,
76+
"doc_name": indexed_doc_name,
7677
"doc_description": description,
7778
"structure": structure,
7879
}
7980

8081
# Write wiki/sources/ — extract per-page content with pymupdf (not PageIndex)
8182
sources_dir = kb_dir / "wiki" / "sources"
8283
sources_dir.mkdir(parents=True, exist_ok=True)
83-
images_dir = sources_dir / "images" / pdf_path.stem
84+
images_dir = sources_dir / "images" / source_name
8485

8586
from openkb.images import convert_pdf_to_pages
86-
all_pages = convert_pdf_to_pages(pdf_path, pdf_path.stem, images_dir)
87+
all_pages = convert_pdf_to_pages(pdf_path, source_name, images_dir)
8788

88-
(sources_dir / f"{pdf_path.stem}.json").write_text(
89+
(sources_dir / f"{source_name}.json").write_text(
8990
json_mod.dumps(all_pages, ensure_ascii=False, indent=2), encoding="utf-8",
9091
)
9192

9293
# Write wiki/summaries/ (no images, just summaries)
9394
summaries_dir = kb_dir / "wiki" / "summaries"
9495
summaries_dir.mkdir(parents=True, exist_ok=True)
95-
summary_md = render_summary_md(tree, pdf_path.stem, doc_id)
96-
(summaries_dir / f"{pdf_path.stem}.md").write_text(summary_md, encoding="utf-8")
96+
summary_md = render_summary_md(tree, source_name, doc_id)
97+
(summaries_dir / f"{source_name}.md").write_text(summary_md, encoding="utf-8")
9798

9899
return IndexResult(doc_id=doc_id, description=description, tree=tree)

tests/test_add_command.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -139,7 +139,9 @@ def test_add_short_doc_runs_compiler(self, tmp_path):
139139
mock_result = ConvertResult(
140140
raw_path=kb_dir / "raw" / "test.md",
141141
source_path=source_path,
142+
doc_name="test-deadbeef00",
142143
is_long_doc=False,
144+
file_hash="deadbeef00" * 8,
143145
)
144146

145147
runner = CliRunner()
@@ -149,3 +151,6 @@ def test_add_short_doc_runs_compiler(self, tmp_path):
149151
result = runner.invoke(cli, ["add", str(doc)])
150152
mock_arun.assert_called_once()
151153
assert "OK" in result.output
154+
155+
hashes = json.loads((kb_dir / ".openkb" / "hashes.json").read_text())
156+
assert hashes[mock_result.file_hash]["doc_name"] == "test-deadbeef00"

tests/test_converter.py

Lines changed: 25 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -42,7 +42,9 @@ def test_md_file_copied_to_wiki_sources(self, kb_dir):
4242

4343
assert result.skipped is False
4444
assert result.is_long_doc is False
45+
assert result.doc_name == f"notes-{result.file_hash[:10]}"
4546
assert result.source_path is not None
47+
assert result.source_path.name == f"{result.doc_name}.md"
4648
assert result.source_path.exists()
4749
assert result.source_path.read_text(encoding="utf-8").startswith("# Notes")
4850

@@ -72,8 +74,31 @@ def test_md_raw_file_copied(self, kb_dir):
7274
result = convert_document(src, kb_dir)
7375

7476
assert result.raw_path is not None
77+
assert result.raw_path.name == f"{result.doc_name}.md"
7578
assert result.raw_path.exists()
7679

80+
def test_same_filename_different_content_gets_distinct_outputs(self, kb_dir):
81+
"""Files with the same basename must not overwrite wiki artifacts."""
82+
first_dir = kb_dir / "inputs" / "first"
83+
second_dir = kb_dir / "inputs" / "second"
84+
first_dir.mkdir(parents=True)
85+
second_dir.mkdir(parents=True)
86+
first = first_dir / "report.md"
87+
second = second_dir / "report.md"
88+
first.write_text("# First\n\nAlpha content.", encoding="utf-8")
89+
second.write_text("# Second\n\nBeta content.", encoding="utf-8")
90+
91+
first_result = convert_document(first, kb_dir)
92+
second_result = convert_document(second, kb_dir)
93+
94+
assert first_result.doc_name != second_result.doc_name
95+
assert first_result.source_path != second_result.source_path
96+
assert first_result.raw_path != second_result.raw_path
97+
assert first_result.source_path.read_text(encoding="utf-8").startswith("# First")
98+
assert second_result.source_path.read_text(encoding="utf-8").startswith("# Second")
99+
assert first_result.raw_path.read_text(encoding="utf-8").startswith("# First")
100+
assert second_result.raw_path.read_text(encoding="utf-8").startswith("# Second")
101+
77102

78103
# ---------------------------------------------------------------------------
79104
# convert_document — PDF short doc

tests/test_indexer.py

Lines changed: 19 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -102,6 +102,25 @@ def test_summary_page_written(self, kb_dir, sample_tree, tmp_path):
102102
assert "doc_type: pageindex" in content
103103
assert "Summary:" in content
104104

105+
def test_explicit_doc_name_controls_output_paths(self, kb_dir, sample_tree, tmp_path):
106+
"""Long doc outputs should use the converter's hash-suffixed doc name."""
107+
doc_id = "abc-123"
108+
fake_col = self._make_fake_collection(doc_id, sample_tree)
109+
110+
fake_client = MagicMock()
111+
fake_client.collection.return_value = fake_col
112+
113+
pdf_path = tmp_path / "sample.pdf"
114+
pdf_path.write_bytes(b"%PDF-1.4 fake")
115+
116+
with patch("openkb.indexer.PageIndexClient", return_value=fake_client), \
117+
patch("openkb.images.convert_pdf_to_pages", return_value=self._fake_pages()):
118+
index_long_document(pdf_path, kb_dir, doc_name="sample-deadbeef00")
119+
120+
assert (kb_dir / "wiki" / "sources" / "sample-deadbeef00.json").exists()
121+
assert (kb_dir / "wiki" / "summaries" / "sample-deadbeef00.md").exists()
122+
assert not (kb_dir / "wiki" / "sources" / "sample.json").exists()
123+
105124
def test_localclient_called_with_index_config(self, kb_dir, sample_tree, tmp_path):
106125
"""LocalClient must be created with the correct IndexConfig flags."""
107126
doc_id = "xyz-456"

Comments (0 commit comments)