Skip to content

Commit b8731a9

Browse files
author
saccharin98
committed
fix: change from doc hash to path hash
1 parent f19f434 commit b8731a9

10 files changed

Lines changed: 136 additions & 17 deletions

File tree

openkb/agent/compiler.py

Lines changed: 6 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -538,10 +538,13 @@ def _update_index(
538538
lines = index_path.read_text(encoding="utf-8").split("\n")
539539

540540
doc_link = f"[[summaries/{doc_name}]]"
541-
if not _section_contains_link(lines, "## Documents", doc_link):
542-
doc_entry = f"- {doc_link} ({doc_type})"
541+
doc_entry = f"- {doc_link} ({doc_type})"
542+
if doc_brief:
543+
doc_entry += f" — {doc_brief}"
544+
if _section_contains_link(lines, "## Documents", doc_link):
543545
if doc_brief:
544-
doc_entry += f" — {doc_brief}"
546+
_replace_section_entry(lines, "## Documents", doc_link, doc_entry)
547+
else:
545548
_insert_section_entry(lines, "## Documents", doc_entry)
546549

547550
for name in concept_names:

openkb/cli.py

Lines changed: 10 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -26,7 +26,7 @@
2626
from dotenv import load_dotenv
2727

2828
from openkb.config import DEFAULT_CONFIG, load_config, save_config, load_global_config, register_kb
29-
from openkb.converter import convert_document
29+
from openkb.converter import _registry_path, convert_document
3030
from openkb.log import append_log
3131
from openkb.schema import AGENTS_MD
3232

@@ -208,11 +208,18 @@ def add_single_file(file_path: Path, kb_dir: Path) -> None:
208208
# Register hash only after successful compilation
209209
if result.file_hash:
210210
doc_type = "long_pdf" if result.is_long_doc else file_path.suffix.lstrip(".")
211-
registry.add(result.file_hash, {
211+
metadata = {
212212
"name": file_path.name,
213213
"doc_name": doc_name,
214214
"type": doc_type,
215-
})
215+
"path": _registry_path(file_path, kb_dir),
216+
}
217+
if result.raw_path is not None:
218+
metadata["raw_path"] = _registry_path(result.raw_path, kb_dir)
219+
if result.source_path is not None:
220+
metadata["source_path"] = _registry_path(result.source_path, kb_dir)
221+
registry.remove_by_doc_name(doc_name)
222+
registry.add(result.file_hash, metadata)
216223

217224
append_log(kb_dir / "wiki", "ingest", file_path.name)
218225
click.echo(f" [OK] {file_path.name} added to knowledge base.")

openkb/converter.py

Lines changed: 24 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
"""Document conversion pipeline for OpenKB."""
22
from __future__ import annotations
33

4+
import hashlib
45
import logging
56
import re
67
import shutil
@@ -31,16 +32,31 @@ class ConvertResult:
3132

3233

3334
_SAFE_STEM_RE = re.compile(r"[^\w\-]+")
34-
_DOC_HASH_LEN = 10
35+
_DOC_HASH_LEN = 12
3536

3637

37-
def _make_doc_name(src: Path, file_hash: str) -> str:
38+
def _registry_path(path: Path, kb_dir: Path) -> str:
39+
"""Return the portable path key stored in the hash registry."""
40+
resolved_path = path.resolve()
41+
resolved_kb = kb_dir.resolve()
42+
if resolved_path.is_relative_to(resolved_kb):
43+
return resolved_path.relative_to(resolved_kb).as_posix()
44+
return resolved_path.as_posix()
45+
46+
47+
def _path_hash(src: Path, kb_dir: Path) -> str:
48+
"""Return a stable hash for a source path, independent of file content."""
49+
identity = _registry_path(src, kb_dir)
50+
return hashlib.sha256(identity.encode("utf-8")).hexdigest()
51+
52+
53+
def _make_doc_name(src: Path, kb_dir: Path) -> str:
3854
"""Return the stable internal document name for a source file."""
3955
stem = unicodedata.normalize("NFKC", src.stem)
4056
safe_stem = _SAFE_STEM_RE.sub("-", stem).strip("-")
4157
if not safe_stem:
4258
safe_stem = "document"
43-
return f"{safe_stem}-{file_hash[:_DOC_HASH_LEN]}"
59+
return f"{safe_stem}-{_path_hash(src, kb_dir)[:_DOC_HASH_LEN]}"
4460

4561

4662
def get_pdf_page_count(path: Path) -> int:
@@ -53,7 +69,7 @@ def convert_document(src: Path, kb_dir: Path) -> ConvertResult:
5369
"""Convert a document and integrate it into the knowledge base.
5470
5571
Steps:
56-
1. Hash-check — skip if already known.
72+
1. Hash-check — skip if this exact content is already known.
5773
2. Copy source to ``raw/``.
5874
3. If PDF and page count >= threshold → return :attr:`ConvertResult.is_long_doc`.
5975
4. If ``.md`` — read, process relative images, save to ``wiki/sources/``.
@@ -72,14 +88,16 @@ def convert_document(src: Path, kb_dir: Path) -> ConvertResult:
7288
# 1. Hash check
7389
# ------------------------------------------------------------------
7490
file_hash = HashRegistry.hash_file(src)
75-
doc_name = _make_doc_name(src, file_hash)
91+
path_key = _registry_path(src, kb_dir)
92+
path_metadata = registry.get_by_path(path_key) or {}
93+
doc_name = path_metadata.get("doc_name") or _make_doc_name(src, kb_dir)
7694
if registry.is_known(file_hash):
7795
logger.info("Skipping already-known file: %s", src.name)
7896
metadata = registry.get(file_hash) or {}
7997
return ConvertResult(
8098
skipped=True,
8199
file_hash=file_hash,
82-
doc_name=metadata.get("doc_name", doc_name),
100+
doc_name=metadata.get("doc_name") or doc_name,
83101
)
84102

85103
# ------------------------------------------------------------------

openkb/state.py

Lines changed: 20 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -32,6 +32,13 @@ def all_entries(self) -> dict[str, dict]:
3232
"""Return a shallow copy of all hash -> metadata entries."""
3333
return dict(self._data)
3434

35+
def get_by_path(self, path: str) -> dict | None:
36+
"""Return metadata for a registered raw/source path, if present."""
37+
for metadata in self._data.values():
38+
if metadata.get("raw_path") == path or metadata.get("source_path") == path:
39+
return metadata
40+
return None
41+
3542
# ------------------------------------------------------------------
3643
# Mutation
3744
# ------------------------------------------------------------------
@@ -41,6 +48,19 @@ def add(self, file_hash: str, metadata: dict) -> None:
4148
self._data[file_hash] = metadata
4249
self._persist()
4350

51+
def remove_by_doc_name(self, doc_name: str) -> None:
52+
"""Remove stale content-hash entries for a document identity."""
53+
stale_hashes = [
54+
file_hash
55+
for file_hash, metadata in self._data.items()
56+
if metadata.get("doc_name") == doc_name
57+
]
58+
if not stale_hashes:
59+
return
60+
for file_hash in stale_hashes:
61+
del self._data[file_hash]
62+
self._persist()
63+
4464
# ------------------------------------------------------------------
4565
# Internal
4666
# ------------------------------------------------------------------

openkb/watcher.py

Lines changed: 7 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -53,11 +53,11 @@ def _flush(self) -> None:
5353
if paths:
5454
self._callback(paths)
5555

56-
def _handle_event(self, event) -> None:
57-
"""Add the event's source path to pending if it's a supported file."""
56+
def _handle_event(self, event, path_attr: str = "src_path") -> None:
57+
"""Add the event path to pending if it's a supported file."""
5858
if event.is_directory:
5959
return
60-
path = Path(event.src_path)
60+
path = Path(getattr(event, path_attr))
6161
# Ignore hidden/dotfiles
6262
if path.name.startswith("."):
6363
return
@@ -73,6 +73,10 @@ def on_modified(self, event) -> None:
7373
"""Handle file modification events."""
7474
self._handle_event(event)
7575

76+
def on_moved(self, event) -> None:
77+
"""Handle atomic-save and rename events."""
78+
self._handle_event(event, "dest_path")
79+
7680

7781
def watch_directory(
7882
raw_dir: Path,

tests/test_add_command.py

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -143,6 +143,11 @@ def test_add_short_doc_runs_compiler(self, tmp_path):
143143
is_long_doc=False,
144144
file_hash="deadbeef00" * 8,
145145
)
146+
stale_hash = "oldhash"
147+
hashes_path = kb_dir / ".openkb" / "hashes.json"
148+
hashes_path.write_text(json.dumps({
149+
stale_hash: {"name": "test.md", "doc_name": mock_result.doc_name, "type": "md"}
150+
}))
146151

147152
runner = CliRunner()
148153
with patch("openkb.cli._find_kb_dir", return_value=kb_dir), \
@@ -153,4 +158,7 @@ def test_add_short_doc_runs_compiler(self, tmp_path):
153158
assert "OK" in result.output
154159

155160
hashes = json.loads((kb_dir / ".openkb" / "hashes.json").read_text())
161+
assert stale_hash not in hashes
156162
assert hashes[mock_result.file_hash]["doc_name"] == "test-deadbeef00"
163+
assert hashes[mock_result.file_hash]["raw_path"] == "raw/test.md"
164+
assert hashes[mock_result.file_hash]["source_path"] == "wiki/sources/test.md"

tests/test_compiler.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -228,6 +228,8 @@ def test_no_duplicates(self, tmp_path):
228228
_update_index(wiki, "my-doc", [], doc_brief="New brief")
229229
text = (wiki / "index.md").read_text()
230230
assert text.count("[[summaries/my-doc]]") == 1
231+
assert "- [[summaries/my-doc]] (short) — New brief" in text
232+
assert "Old brief" not in text
231233

232234
def test_backwards_compat_no_briefs(self, tmp_path):
233235
wiki = tmp_path / "wiki"

tests/test_converter.py

Lines changed: 17 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,7 @@
77

88
import pytest
99

10-
from openkb.converter import ConvertResult, convert_document, get_pdf_page_count
10+
from openkb.converter import ConvertResult, _make_doc_name, convert_document, get_pdf_page_count
1111

1212

1313
# ---------------------------------------------------------------------------
@@ -42,12 +42,27 @@ def test_md_file_copied_to_wiki_sources(self, kb_dir):
4242

4343
assert result.skipped is False
4444
assert result.is_long_doc is False
45-
assert result.doc_name == f"notes-{result.file_hash[:10]}"
45+
assert result.doc_name == _make_doc_name(src, kb_dir)
4646
assert result.source_path is not None
4747
assert result.source_path.name == f"{result.doc_name}.md"
4848
assert result.source_path.exists()
4949
assert result.source_path.read_text(encoding="utf-8").startswith("# Notes")
5050

51+
def test_md_file_keeps_doc_name_when_content_changes(self, kb_dir):
52+
"""A raw file keeps the same document identity across edits."""
53+
src = kb_dir / "raw" / "notes.md"
54+
src.write_text("# Notes\n\nOld content.", encoding="utf-8")
55+
56+
first_result = convert_document(src, kb_dir)
57+
src.write_text("# Notes\n\nNew content.", encoding="utf-8")
58+
second_result = convert_document(src, kb_dir)
59+
60+
assert second_result.skipped is False
61+
assert second_result.file_hash != first_result.file_hash
62+
assert second_result.doc_name == first_result.doc_name
63+
assert second_result.source_path == first_result.source_path
64+
assert second_result.source_path.read_text(encoding="utf-8").startswith("# Notes\n\nNew content.")
65+
5166
def test_md_duplicate_skipped(self, kb_dir):
5267
"""Second call with same file returns skipped=True when hash is registered."""
5368
from openkb.state import HashRegistry

tests/test_state.py

Lines changed: 24 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -53,6 +53,30 @@ def test_all_entries_empty(tmp_path):
5353
assert registry.all_entries() == {}
5454

5555

56+
def test_get_by_path_matches_raw_or_source_path(tmp_path):
57+
registry = HashRegistry(tmp_path / "hashes.json")
58+
metadata = {
59+
"doc_name": "paper-abc123",
60+
"raw_path": "raw/paper.pdf",
61+
"source_path": "wiki/sources/paper.md",
62+
}
63+
registry.add("hash1", metadata)
64+
65+
assert registry.get_by_path("raw/paper.pdf") == metadata
66+
assert registry.get_by_path("wiki/sources/paper.md") == metadata
67+
68+
69+
def test_remove_by_doc_name_deletes_stale_hash_entries(tmp_path):
70+
registry = HashRegistry(tmp_path / "hashes.json")
71+
registry.add("old", {"doc_name": "paper-abc123"})
72+
registry.add("other", {"doc_name": "other-def456"})
73+
74+
registry.remove_by_doc_name("paper-abc123")
75+
76+
assert registry.is_known("old") is False
77+
assert registry.is_known("other") is True
78+
79+
5680
def test_hash_file_produces_64_char_hex(tmp_path):
5781
f = tmp_path / "sample.txt"
5882
f.write_text("hello world")

tests/test_watcher.py

Lines changed: 18 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -16,6 +16,13 @@ def _make_file_event(src_path: str, is_directory: bool = False):
1616
return event
1717

1818

19+
def _make_moved_event(src_path: str, dest_path: str, is_directory: bool = False):
20+
"""Create a mock watchdog move event."""
21+
event = _make_file_event(src_path, is_directory=is_directory)
22+
event.dest_path = dest_path
23+
return event
24+
25+
1926
class TestDebouncedHandler:
2027
def test_collects_created_files(self):
2128
callback = MagicMock()
@@ -42,6 +49,17 @@ def test_collects_modified_files(self):
4249

4350
assert "/raw/paper.txt" in handler._pending
4451

52+
def test_collects_moved_destination_file(self):
53+
callback = MagicMock()
54+
handler = DebouncedHandler(callback, debounce_seconds=100)
55+
56+
handler.on_moved(_make_moved_event("/raw/.notes.md.tmp", "/raw/notes.md"))
57+
58+
if handler._timer:
59+
handler._timer.cancel()
60+
61+
assert handler._pending == {"/raw/notes.md"}
62+
4563
def test_ignores_directories(self):
4664
callback = MagicMock()
4765
handler = DebouncedHandler(callback, debounce_seconds=100)

0 commit comments

Comments
 (0)