From 7e584d7aaa54c658587bf6c505cf277b76f1d2d3 Mon Sep 17 00:00:00 2001 From: Sam Armstrong Date: Fri, 24 Apr 2026 16:15:29 -0500 Subject: [PATCH 1/2] fix(chunker): match apollo MD5 content_hash schema; surface persistence failures MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Chunker.build_chunk used Digest::SHA256.hexdigest(content) — 64 hex chars. Apollo's apollo_entries.content_hash column is CHARACTER(32) (MD5 length). PG silently rejected every INSERT with PG::StringDataRightTruncation, which handle_ingest caught via rescue Sequel::Error and returned as {success: false, error: ...}. lex-knowledge's upsert_chunk_with_embedding ignored that return value and reported :created/:updated based purely on the `force` flag — producing false-positive responses to every /api/knowledge/ingest call. No chunks actually persisted to apollo_entries. Fix: chunker now calls Legion::Extensions::Apollo::Helpers::Writeback .content_hash when available (normalize + MD5 → 32 chars), with an inline fallback that preserves the same semantics when apollo isn't loaded. upsert_chunk_with_embedding now returns :skipped when handle_ingest reports failure, so callers see truthful status. Live validation: fresh ingest of a 1490-byte markdown now produces chunks_created:1 and an apollo query by the content's distinctive tokens returns the row with distance 0.27 (strong semantic match). Co-Authored-By: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 6 ++ .../extensions/knowledge/helpers/chunker.rb | 18 ++++- .../extensions/knowledge/runners/ingest.rb | 17 ++++- lib/legion/extensions/knowledge/version.rb | 2 +- .../knowledge/helpers/chunker_spec.rb | 28 ++++++- .../knowledge/runners/ingest_spec.rb | 75 +++++++++++++++++++ 6 files changed, 140 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0ac2e58..90c898d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ # Changelog +## [0.6.8] - 2026-04-24 + +### Fixed +- Chunker `content_hash` now uses MD5 + whitespace normalization (matching `Legion::Extensions::Apollo::Helpers::Writeback.content_hash`) instead of raw SHA-256. Previously, 64-char SHA-256 hashes were rejected by Postgres when writing to `apollo_entries.content_hash` (CHARACTER(32)), and the failure was silently swallowed inside `handle_ingest`. As a result, corpus ingests via `/api/knowledge/ingest` appeared to succeed but persisted nothing. Chunks now round-trip into `apollo_entries` and are retrievable via `/api/apollo/query`. +- `upsert_chunk_with_embedding` now propagates `{success: false, error: ...}` returns from `handle_ingest` as `:skipped` with a warn log, instead of masking them as false-positive `:created`/`:updated`. + ## [0.6.7] - 2026-04-15 ### Fixed diff --git a/lib/legion/extensions/knowledge/helpers/chunker.rb b/lib/legion/extensions/knowledge/helpers/chunker.rb index 89d4cd3..6d5fbfd 100644 --- a/lib/legion/extensions/knowledge/helpers/chunker.rb +++ b/lib/legion/extensions/knowledge/helpers/chunker.rb @@ -62,6 +62,10 @@ def split_section(section, max_chars, overlap_chars) end private_class_method :split_section + # Hash must match Legion::Extensions::Apollo::Helpers::Writeback.content_hash so + # chunks land in apollo_entries.content_hash (CHARACTER(32) — MD5 length) without + # PG::StringDataRightTruncation. A bare SHA-256 (64 chars) silently fails the INSERT + # and the chain of rescues returns the chunk as "persisted" without actually writing. def build_chunk(section, content, index) { content: content, @@ -70,11 +74,23 @@ def build_chunk(section, content, index) source_file: section[:source_file], token_count: (content.length.to_f / CHARS_PER_TOKEN).ceil, chunk_index: index, - content_hash: ::Digest::SHA256.hexdigest(content) + content_hash: apollo_compatible_content_hash(content) } end private_class_method :build_chunk + def apollo_compatible_content_hash(content) + if defined?(Legion::Extensions::Apollo::Helpers::Writeback) + Legion::Extensions::Apollo::Helpers::Writeback.content_hash(content) + else + # Fallback when apollo isn't loaded — match its MD5+normalize semantics + # so future apollo-backed lookups still work. + normalized = content.to_s.strip.downcase.gsub(/\s+/, ' ') + ::Digest::MD5.hexdigest(normalized) + end + end + private_class_method :apollo_compatible_content_hash + def settings_max_tokens return nil unless defined?(Legion::Settings) diff --git a/lib/legion/extensions/knowledge/runners/ingest.rb b/lib/legion/extensions/knowledge/runners/ingest.rb index e07379c..93ce8f7 100644 --- a/lib/legion/extensions/knowledge/runners/ingest.rb +++ b/lib/legion/extensions/knowledge/runners/ingest.rb @@ -211,10 +211,23 @@ def upsert_chunk_with_embedding(chunk, embedding, dry_run: false, force: false, return :created unless defined?(Legion::Extensions::Apollo) return :skipped if !force && exists - ingest_to_apollo(chunk, embedding) + result = ingest_to_apollo(chunk, embedding) + # handle_ingest returns a Hash on both success and failure paths; the upsert + # status must reflect the actual persistence outcome, not just the `force` flag. + # Previously any {success: false, error: ...} return was ignored, producing + # false-positive :created/:updated responses to callers. + if result.is_a?(Hash) && result[:success] == false + hash_prefix = chunk[:content_hash]&.slice(0, 12) + content_len = chunk[:content]&.length + log.warn( + '[knowledge][upsert_chunk] apollo persistence failed ' \ + "error=#{result[:error].inspect} chunk_hash=#{hash_prefix} chunk_len=#{content_len}" + ) + return :skipped + end force ? :updated : :created rescue StandardError => e - log.warn(e.message) + log.warn("[knowledge][upsert_chunk] unexpected error class=#{e.class} message=#{e.message} chunk_hash=#{chunk[:content_hash]&.slice(0, 12)}") :skipped end private_class_method :upsert_chunk_with_embedding diff --git a/lib/legion/extensions/knowledge/version.rb b/lib/legion/extensions/knowledge/version.rb index 745579b..aa83b4e 100644 --- a/lib/legion/extensions/knowledge/version.rb +++ b/lib/legion/extensions/knowledge/version.rb @@ -3,7 +3,7 @@ module Legion module Extensions module Knowledge - VERSION = '0.6.7' + VERSION = '0.6.8' end end end diff --git a/spec/legion/extensions/knowledge/helpers/chunker_spec.rb b/spec/legion/extensions/knowledge/helpers/chunker_spec.rb index 3e4b9e8..67ade72 100644 --- a/spec/legion/extensions/knowledge/helpers/chunker_spec.rb +++ b/spec/legion/extensions/knowledge/helpers/chunker_spec.rb @@ -34,13 +34,37 @@ expect(result.first[:source_file]).to eq('/docs/guide.md') end - it 'computes a sha256 content_hash for each chunk' do + it 'computes a 32-char (MD5-length) content_hash for each chunk' do + # Must be MD5 length to fit apollo_entries.content_hash CHARACTER(32). + # Raw SHA-256 (64 chars) is silently rejected by Postgres. result = chunker.chunk(sections: [section]) result.each do |chunk| - expect(chunk[:content_hash]).to match(/\A[0-9a-f]{64}\z/) + expect(chunk[:content_hash]).to match(/\A[0-9a-f]{32}\z/) end end + it 'content_hash matches MD5 of whitespace-normalized content' do + # Mirrors Apollo::Helpers::Writeback.content_hash semantics: strip, downcase, + # collapse whitespace, then MD5. This keeps dedup consistent with apollo. + result = chunker.chunk(sections: [section]) + result.each do |chunk| + normalized = chunk[:content].to_s.strip.downcase.gsub(/\s+/, ' ') + expect(chunk[:content_hash]).to eq(Digest::MD5.hexdigest(normalized)) + end + end + + it 'delegates to Apollo::Helpers::Writeback.content_hash when defined' do + writeback = Module.new do + def self.content_hash(content) + "apollo-#{Digest::MD5.hexdigest(content.to_s)}"[0, 32] + end + end + stub_const('Legion::Extensions::Apollo::Helpers::Writeback', writeback) + result = chunker.chunk(sections: [section]) + expected = writeback.content_hash(result.first[:content]) + expect(result.first[:content_hash]).to eq(expected) + end + it 'assigns sequential chunk_index values starting at 0' do long_content = Array.new(20) { |i| "Paragraph number #{i} with some filler text to exceed the token limit." }.join("\n\n") big_section = section.merge(content: long_content) diff --git a/spec/legion/extensions/knowledge/runners/ingest_spec.rb b/spec/legion/extensions/knowledge/runners/ingest_spec.rb index 9f2c2f2..20edd70 100644 --- a/spec/legion/extensions/knowledge/runners/ingest_spec.rb +++ b/spec/legion/extensions/knowledge/runners/ingest_spec.rb @@ -163,6 +163,81 @@ end end + describe '.upsert_chunk_with_embedding — persistence outcome propagation' do + let(:chunk) do + { + content: 'Chunk contents for persistence test.', + content_hash: 'abcdef0123456789abcdef0123456789', + source_file: '/tmp/test.md', + heading: 'H1', + section_path: ['H1'], + chunk_index: 0, + token_count: 9 + } + end + let(:embedding) { [0.1, 0.2, 0.3] } + + context 'when dry_run: true' do + it 'returns :created without contacting apollo' do + expect(described_class).not_to receive(:ingest_to_apollo) + outcome = described_class.send(:upsert_chunk_with_embedding, chunk, embedding, dry_run: true) + expect(outcome).to eq(:created) + end + end + + context 'when Legion::Extensions::Apollo is not defined' do + before { hide_const('Legion::Extensions::Apollo') if defined?(Legion::Extensions::Apollo) } + + it 'returns :created' do + outcome = described_class.send(:upsert_chunk_with_embedding, chunk, embedding) + expect(outcome).to eq(:created) + end + end + + context 'when apollo is defined' do + before { stub_const('Legion::Extensions::Apollo', Module.new) } + + it 'returns :skipped when exists is true and force is false' do + expect(described_class).not_to receive(:ingest_to_apollo) + outcome = described_class.send(:upsert_chunk_with_embedding, chunk, embedding, exists: true) + expect(outcome).to eq(:skipped) + end + + it 'returns :created when handle_ingest returns a success hash' do + allow(described_class).to receive(:ingest_to_apollo).and_return({ success: true, entry_id: 42 }) + outcome = described_class.send(:upsert_chunk_with_embedding, chunk, embedding) + expect(outcome).to eq(:created) + end + + it 'returns :updated when force: true and handle_ingest succeeds' do + allow(described_class).to receive(:ingest_to_apollo).and_return({ success: true, entry_id: 42 }) + outcome = described_class.send(:upsert_chunk_with_embedding, chunk, embedding, force: true, exists: true) + expect(outcome).to eq(:updated) + end + + it 'returns :skipped and emits a warn log when handle_ingest returns success: false' do + allow(described_class).to receive(:ingest_to_apollo) + .and_return({ success: false, error: 'PG::StringDataRightTruncation' }) + expect(Legion::Logging).to receive(:warn).with(/apollo persistence failed/) + outcome = described_class.send(:upsert_chunk_with_embedding, chunk, embedding) + expect(outcome).to eq(:skipped) + end + + it 'still returns :created when handle_ingest returns a non-Hash result' do + allow(described_class).to receive(:ingest_to_apollo).and_return(nil) + outcome = described_class.send(:upsert_chunk_with_embedding, chunk, embedding) + expect(outcome).to eq(:created) + end + + it 'returns :skipped and logs when ingest_to_apollo raises' do + allow(described_class).to receive(:ingest_to_apollo).and_raise(StandardError, 'boom') + expect(Legion::Logging).to receive(:warn).with(/unexpected error/) + outcome = described_class.send(:upsert_chunk_with_embedding, chunk, embedding) + expect(outcome).to eq(:skipped) + end + end + end + describe '.ingest_corpus — delta behavior' do let(:tmp_dir) { Dir.mktmpdir } let(:store_dir) { Dir.mktmpdir } From be6ef5e888d16429dabf51f759e5703fb7855db7 Mon Sep 17 00:00:00 2001 From: Esity Date: Tue, 28 Apr 2026 11:14:40 -0500 Subject: [PATCH 2/2] Clarify Apollo content hash compatibility --- CHANGELOG.md | 2 +- lib/legion/extensions/knowledge/helpers/chunker.rb | 9 ++++----- spec/legion/extensions/knowledge/helpers/chunker_spec.rb | 4 ++-- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d803548..9154b3f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,7 @@ ## [0.6.10] - 2026-04-28 ### Fixed -- Chunker `content_hash` now uses MD5 + whitespace normalization (matching `Legion::Extensions::Apollo::Helpers::Writeback.content_hash`) instead of raw SHA-256. Previously, 64-char SHA-256 hashes were rejected by Postgres when writing to `apollo_entries.content_hash` (CHARACTER(32)), and the failure was silently swallowed inside `handle_ingest`. As a result, corpus ingests via `/api/knowledge/ingest` appeared to succeed but persisted nothing. Chunks now round-trip into `apollo_entries` and are retrievable via `/api/apollo/query`. +- Chunker `content_hash` now uses MD5 + whitespace normalization (matching `Legion::Extensions::Apollo::Helpers::Writeback.content_hash`) instead of raw SHA-256. This keeps knowledge chunk deduplication aligned with Apollo writeback and avoids insert truncation on deployments whose `apollo_entries.content_hash` column is still fixed at 32 characters. - `upsert_chunk_with_embedding` now requires an explicit `{success: true}` from `handle_ingest` before reporting `:created`/`:updated`. Failure hashes, missing success keys, and non-Hash returns are reported as `:skipped` with a warn log instead of false-positive success counts. ## [0.6.9] - 2026-04-27 diff --git a/lib/legion/extensions/knowledge/helpers/chunker.rb b/lib/legion/extensions/knowledge/helpers/chunker.rb index 6d5fbfd..435e669 100644 --- a/lib/legion/extensions/knowledge/helpers/chunker.rb +++ b/lib/legion/extensions/knowledge/helpers/chunker.rb @@ -62,10 +62,9 @@ def split_section(section, max_chars, overlap_chars) end private_class_method :split_section - # Hash must match Legion::Extensions::Apollo::Helpers::Writeback.content_hash so - # chunks land in apollo_entries.content_hash (CHARACTER(32) — MD5 length) without - # PG::StringDataRightTruncation. A bare SHA-256 (64 chars) silently fails the INSERT - # and the chain of rescues returns the chunk as "persisted" without actually writing. + # Hash must match Legion::Extensions::Apollo::Helpers::Writeback.content_hash + # so knowledge chunks deduplicate consistently with Apollo writeback and still + # fit older apollo_entries.content_hash columns fixed at MD5 length. def build_chunk(section, content, index) { content: content, @@ -83,7 +82,7 @@ def apollo_compatible_content_hash(content) if defined?(Legion::Extensions::Apollo::Helpers::Writeback) Legion::Extensions::Apollo::Helpers::Writeback.content_hash(content) else - # Fallback when apollo isn't loaded — match its MD5+normalize semantics + # Fallback when apollo isn't loaded - match its MD5+normalize semantics # so future apollo-backed lookups still work. normalized = content.to_s.strip.downcase.gsub(/\s+/, ' ') ::Digest::MD5.hexdigest(normalized) diff --git a/spec/legion/extensions/knowledge/helpers/chunker_spec.rb b/spec/legion/extensions/knowledge/helpers/chunker_spec.rb index 67ade72..dcd4c7e 100644 --- a/spec/legion/extensions/knowledge/helpers/chunker_spec.rb +++ b/spec/legion/extensions/knowledge/helpers/chunker_spec.rb @@ -35,8 +35,8 @@ end it 'computes a 32-char (MD5-length) content_hash for each chunk' do - # Must be MD5 length to fit apollo_entries.content_hash CHARACTER(32). - # Raw SHA-256 (64 chars) is silently rejected by Postgres. + # Keep chunk hashes aligned with Apollo Writeback and compatible with + # older apollo_entries.content_hash columns fixed at MD5 length. result = chunker.chunk(sections: [section]) result.each do |chunk| expect(chunk[:content_hash]).to match(/\A[0-9a-f]{32}\z/)