diff --git a/.gitignore b/.gitignore index f4b2283..ac3ad88 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,5 @@ Gemfile.lock +*.gem +.rspec_status +tmp/ .worktrees/ diff --git a/CHANGELOG.md b/CHANGELOG.md index 6f4feba..9154b3f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,12 +1,16 @@ # Changelog +## [0.6.10] - 2026-04-28 + +### Fixed +- Chunker `content_hash` now uses MD5 + whitespace normalization (matching `Legion::Extensions::Apollo::Helpers::Writeback.content_hash`) instead of raw SHA-256. This keeps knowledge chunk deduplication aligned with Apollo writeback and avoids insert truncation on deployments whose `apollo_entries.content_hash` column is still fixed at 32 characters. +- `upsert_chunk_with_embedding` now requires an explicit `{success: true}` from `handle_ingest` before reporting `:created`/`:updated`. Failure hashes, missing success keys, and non-Hash returns are reported as `:skipped` with a warn log instead of false-positive success counts. + ## [0.6.9] - 2026-04-27 ### Fixed - `Manifest.scan` no longer crashes on `Errno::EPERM`/`EACCES` encountered during corpus walk (common on macOS for TCC-protected paths like `~/Library/Accounts`). Unreadable subdirs are pruned with a debug log; scan continues. Replaced `Find.find` with a recursive walker that rescues per-dir; also tolerates `Errno::ELOOP` and `Errno::ENOENT` for files that disappear mid-scan. -> **Version note**: `0.6.8` is reserved for the companion `fix/content-hash-md5-match-apollo-schema` PR (chunker SHA-256 → MD5 hash fix). Both branches target `0.6.7` as their merge base; this PR claims `0.6.9` to avoid intra-batch collision. - ## [0.6.7] - 2026-04-15 ### Fixed diff --git a/lib/legion/extensions/knowledge/helpers/chunker.rb b/lib/legion/extensions/knowledge/helpers/chunker.rb index 89d4cd3..435e669 100644 --- a/lib/legion/extensions/knowledge/helpers/chunker.rb +++ b/lib/legion/extensions/knowledge/helpers/chunker.rb @@ -62,6 +62,9 @@ def split_section(section, max_chars, overlap_chars) end private_class_method :split_section + # Hash must match Legion::Extensions::Apollo::Helpers::Writeback.content_hash + # so knowledge chunks deduplicate consistently with Apollo writeback and still + # fit older apollo_entries.content_hash columns fixed at MD5 length. def build_chunk(section, content, index) { content: content, @@ -70,11 +73,23 @@ def build_chunk(section, content, index) source_file: section[:source_file], token_count: (content.length.to_f / CHARS_PER_TOKEN).ceil, chunk_index: index, - content_hash: ::Digest::SHA256.hexdigest(content) + content_hash: apollo_compatible_content_hash(content) } end private_class_method :build_chunk + def apollo_compatible_content_hash(content) + if defined?(Legion::Extensions::Apollo::Helpers::Writeback) + Legion::Extensions::Apollo::Helpers::Writeback.content_hash(content) + else + # Fallback when apollo isn't loaded - match its MD5+normalize semantics + # so future apollo-backed lookups still work. + normalized = content.to_s.strip.downcase.gsub(/\s+/, ' ') + ::Digest::MD5.hexdigest(normalized) + end + end + private_class_method :apollo_compatible_content_hash + def settings_max_tokens return nil unless defined?(Legion::Settings) diff --git a/lib/legion/extensions/knowledge/runners/ingest.rb b/lib/legion/extensions/knowledge/runners/ingest.rb index e07379c..c50f12e 100644 --- a/lib/legion/extensions/knowledge/runners/ingest.rb +++ b/lib/legion/extensions/knowledge/runners/ingest.rb @@ -211,10 +211,24 @@ def upsert_chunk_with_embedding(chunk, embedding, dry_run: false, force: false, return :created unless defined?(Legion::Extensions::Apollo) return :skipped if !force && exists - ingest_to_apollo(chunk, embedding) + result = ingest_to_apollo(chunk, embedding) + # handle_ingest returns a Hash on both success and failure paths; the upsert + # status must reflect the actual persistence outcome, not just the `force` flag. + # Previously any non-raising return was treated as success, producing + # false-positive :created/:updated responses to callers. + unless result.is_a?(Hash) && result[:success] == true + hash_prefix = chunk[:content_hash]&.slice(0, 12) + content_len = chunk[:content]&.length + error = result.is_a?(Hash) ? result[:error].inspect : "non-hash result class=#{result.class}" + log.warn( + '[knowledge][upsert_chunk] apollo persistence not confirmed ' \ + "error=#{error} chunk_hash=#{hash_prefix} chunk_len=#{content_len}" + ) + return :skipped + end force ? :updated : :created rescue StandardError => e - log.warn(e.message) + log.warn("[knowledge][upsert_chunk] unexpected error class=#{e.class} message=#{e.message} chunk_hash=#{chunk[:content_hash]&.slice(0, 12)}") :skipped end private_class_method :upsert_chunk_with_embedding diff --git a/lib/legion/extensions/knowledge/version.rb b/lib/legion/extensions/knowledge/version.rb index 1dbc478..f6663c1 100644 --- a/lib/legion/extensions/knowledge/version.rb +++ b/lib/legion/extensions/knowledge/version.rb @@ -3,7 +3,7 @@ module Legion module Extensions module Knowledge - VERSION = '0.6.9' + VERSION = '0.6.10' end end end diff --git a/spec/legion/extensions/knowledge/helpers/chunker_spec.rb b/spec/legion/extensions/knowledge/helpers/chunker_spec.rb index 3e4b9e8..dcd4c7e 100644 --- a/spec/legion/extensions/knowledge/helpers/chunker_spec.rb +++ b/spec/legion/extensions/knowledge/helpers/chunker_spec.rb @@ -34,13 +34,37 @@ expect(result.first[:source_file]).to eq('/docs/guide.md') end - it 'computes a sha256 content_hash for each chunk' do + it 'computes a 32-char (MD5-length) content_hash for each chunk' do + # Keep chunk hashes aligned with Apollo Writeback and compatible with + # older apollo_entries.content_hash columns fixed at MD5 length. result = chunker.chunk(sections: [section]) result.each do |chunk| - expect(chunk[:content_hash]).to match(/\A[0-9a-f]{64}\z/) + expect(chunk[:content_hash]).to match(/\A[0-9a-f]{32}\z/) end end + it 'content_hash matches MD5 of whitespace-normalized content' do + # Mirrors Apollo::Helpers::Writeback.content_hash semantics: strip, downcase, + # collapse whitespace, then MD5. This keeps dedup consistent with apollo. + result = chunker.chunk(sections: [section]) + result.each do |chunk| + normalized = chunk[:content].to_s.strip.downcase.gsub(/\s+/, ' ') + expect(chunk[:content_hash]).to eq(Digest::MD5.hexdigest(normalized)) + end + end + + it 'delegates to Apollo::Helpers::Writeback.content_hash when defined' do + writeback = Module.new do + def self.content_hash(content) + "apollo-#{Digest::MD5.hexdigest(content.to_s)}"[0, 32] + end + end + stub_const('Legion::Extensions::Apollo::Helpers::Writeback', writeback) + result = chunker.chunk(sections: [section]) + expected = writeback.content_hash(result.first[:content]) + expect(result.first[:content_hash]).to eq(expected) + end + it 'assigns sequential chunk_index values starting at 0' do long_content = Array.new(20) { |i| "Paragraph number #{i} with some filler text to exceed the token limit." }.join("\n\n") big_section = section.merge(content: long_content) diff --git a/spec/legion/extensions/knowledge/runners/ingest_spec.rb b/spec/legion/extensions/knowledge/runners/ingest_spec.rb index 9f2c2f2..b55f165 100644 --- a/spec/legion/extensions/knowledge/runners/ingest_spec.rb +++ b/spec/legion/extensions/knowledge/runners/ingest_spec.rb @@ -163,6 +163,89 @@ end end + describe '.upsert_chunk_with_embedding — persistence outcome propagation' do + let(:chunk) do + { + content: 'Chunk contents for persistence test.', + content_hash: 'abcdef0123456789abcdef0123456789', + source_file: '/tmp/test.md', + heading: 'H1', + section_path: ['H1'], + chunk_index: 0, + token_count: 9 + } + end + let(:embedding) { [0.1, 0.2, 0.3] } + + context 'when dry_run: true' do + it 'returns :created without contacting apollo' do + expect(described_class).not_to receive(:ingest_to_apollo) + outcome = described_class.send(:upsert_chunk_with_embedding, chunk, embedding, dry_run: true) + expect(outcome).to eq(:created) + end + end + + context 'when Legion::Extensions::Apollo is not defined' do + before { hide_const('Legion::Extensions::Apollo') if defined?(Legion::Extensions::Apollo) } + + it 'returns :created' do + outcome = described_class.send(:upsert_chunk_with_embedding, chunk, embedding) + expect(outcome).to eq(:created) + end + end + + context 'when apollo is defined' do + before { stub_const('Legion::Extensions::Apollo', Module.new) } + + it 'returns :skipped when exists is true and force is false' do + expect(described_class).not_to receive(:ingest_to_apollo) + outcome = described_class.send(:upsert_chunk_with_embedding, chunk, embedding, exists: true) + expect(outcome).to eq(:skipped) + end + + it 'returns :created when handle_ingest returns a success hash' do + allow(described_class).to receive(:ingest_to_apollo).and_return({ success: true, entry_id: 42 }) + outcome = described_class.send(:upsert_chunk_with_embedding, chunk, embedding) + expect(outcome).to eq(:created) + end + + it 'returns :updated when force: true and handle_ingest succeeds' do + allow(described_class).to receive(:ingest_to_apollo).and_return({ success: true, entry_id: 42 }) + outcome = described_class.send(:upsert_chunk_with_embedding, chunk, embedding, force: true, exists: true) + expect(outcome).to eq(:updated) + end + + it 'returns :skipped and emits a warn log when handle_ingest returns success: false' do + allow(described_class).to receive(:ingest_to_apollo) + .and_return({ success: false, error: 'PG::StringDataRightTruncation' }) + expect(Legion::Logging).to receive(:warn).with(/apollo persistence not confirmed/) + outcome = described_class.send(:upsert_chunk_with_embedding, chunk, embedding) + expect(outcome).to eq(:skipped) + end + + it 'returns :skipped when handle_ingest returns a non-Hash result' do + allow(described_class).to receive(:ingest_to_apollo).and_return(nil) + expect(Legion::Logging).to receive(:warn).with(/non-hash result class=NilClass/) + outcome = described_class.send(:upsert_chunk_with_embedding, chunk, embedding) + expect(outcome).to eq(:skipped) + end + + it 'returns :skipped when handle_ingest returns a hash without success' do + allow(described_class).to receive(:ingest_to_apollo).and_return({ entry_id: 42 }) + expect(Legion::Logging).to receive(:warn).with(/apollo persistence not confirmed/) + outcome = described_class.send(:upsert_chunk_with_embedding, chunk, embedding) + expect(outcome).to eq(:skipped) + end + + it 'returns :skipped and logs when ingest_to_apollo raises' do + allow(described_class).to receive(:ingest_to_apollo).and_raise(StandardError, 'boom') + expect(Legion::Logging).to receive(:warn).with(/unexpected error/) + outcome = described_class.send(:upsert_chunk_with_embedding, chunk, embedding) + expect(outcome).to eq(:skipped) + end + end + end + describe '.ingest_corpus — delta behavior' do let(:tmp_dir) { Dir.mktmpdir } let(:store_dir) { Dir.mktmpdir }