Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,5 @@
Gemfile.lock
*.gem
.rspec_status
tmp/
.worktrees/
8 changes: 6 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
# Changelog

## [0.6.10] - 2026-04-28

### Fixed
- Chunker `content_hash` now uses MD5 + whitespace normalization (matching `Legion::Extensions::Apollo::Helpers::Writeback.content_hash`) instead of raw SHA-256. This keeps knowledge chunk deduplication aligned with Apollo writeback and avoids insert truncation on deployments whose `apollo_entries.content_hash` column is still fixed at 32 characters.
- `upsert_chunk_with_embedding` now requires an explicit `{success: true}` from `handle_ingest` before reporting `:created`/`:updated`. Failure hashes, missing success keys, and non-Hash returns are reported as `:skipped` with a warn log instead of false-positive success counts.

## [0.6.9] - 2026-04-27

### Fixed
- `Manifest.scan` no longer crashes on `Errno::EPERM`/`EACCES` encountered during corpus walk (common on macOS for TCC-protected paths like `~/Library/Accounts`). Unreadable subdirs are pruned with a debug log; scan continues. Replaced `Find.find` with a recursive walker that rescues per-dir; also tolerates `Errno::ELOOP` and `Errno::ENOENT` for files that disappear mid-scan.

> **Version note**: `0.6.8` is reserved for the companion `fix/content-hash-md5-match-apollo-schema` PR (chunker SHA-256 → MD5 hash fix). Both branches target `0.6.7` as their merge base; this PR claims `0.6.9` to avoid intra-batch collision.

## [0.6.7] - 2026-04-15

### Fixed
Expand Down
17 changes: 16 additions & 1 deletion lib/legion/extensions/knowledge/helpers/chunker.rb
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,9 @@ def split_section(section, max_chars, overlap_chars)
end
private_class_method :split_section

# Hash must match Legion::Extensions::Apollo::Helpers::Writeback.content_hash
# so knowledge chunks deduplicate consistently with Apollo writeback and still
# fit older apollo_entries.content_hash columns fixed at MD5 length.
def build_chunk(section, content, index)
{
content: content,
Expand All @@ -70,11 +73,23 @@ def build_chunk(section, content, index)
source_file: section[:source_file],
token_count: (content.length.to_f / CHARS_PER_TOKEN).ceil,
chunk_index: index,
content_hash: ::Digest::SHA256.hexdigest(content)
content_hash: apollo_compatible_content_hash(content)
}
end
private_class_method :build_chunk

def apollo_compatible_content_hash(content)
if defined?(Legion::Extensions::Apollo::Helpers::Writeback)
Legion::Extensions::Apollo::Helpers::Writeback.content_hash(content)
else
# Fallback when apollo isn't loaded - match its MD5+normalize semantics
# so future apollo-backed lookups still work.
normalized = content.to_s.strip.downcase.gsub(/\s+/, ' ')
::Digest::MD5.hexdigest(normalized)
end
end
private_class_method :apollo_compatible_content_hash

def settings_max_tokens
return nil unless defined?(Legion::Settings)

Expand Down
18 changes: 16 additions & 2 deletions lib/legion/extensions/knowledge/runners/ingest.rb
Original file line number Diff line number Diff line change
Expand Up @@ -211,10 +211,24 @@ def upsert_chunk_with_embedding(chunk, embedding, dry_run: false, force: false,
return :created unless defined?(Legion::Extensions::Apollo)
return :skipped if !force && exists

ingest_to_apollo(chunk, embedding)
result = ingest_to_apollo(chunk, embedding)
# handle_ingest returns a Hash on both success and failure paths; the upsert
# status must reflect the actual persistence outcome, not just the `force` flag.
# Previously any non-raising return was treated as success, producing
# false-positive :created/:updated responses to callers.
unless result.is_a?(Hash) && result[:success] == true
hash_prefix = chunk[:content_hash]&.slice(0, 12)
content_len = chunk[:content]&.length
error = result.is_a?(Hash) ? result[:error].inspect : "non-hash result class=#{result.class}"
log.warn(
'[knowledge][upsert_chunk] apollo persistence not confirmed ' \
"error=#{error} chunk_hash=#{hash_prefix} chunk_len=#{content_len}"
)
return :skipped
end
force ? :updated : :created
rescue StandardError => e
log.warn(e.message)
log.warn("[knowledge][upsert_chunk] unexpected error class=#{e.class} message=#{e.message} chunk_hash=#{chunk[:content_hash]&.slice(0, 12)}")
:skipped
end
private_class_method :upsert_chunk_with_embedding
Expand Down
2 changes: 1 addition & 1 deletion lib/legion/extensions/knowledge/version.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
module Legion
module Extensions
module Knowledge
VERSION = '0.6.9'
VERSION = '0.6.10'
end
end
end
28 changes: 26 additions & 2 deletions spec/legion/extensions/knowledge/helpers/chunker_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -34,13 +34,37 @@
expect(result.first[:source_file]).to eq('/docs/guide.md')
end

it 'computes a sha256 content_hash for each chunk' do
it 'computes a 32-char (MD5-length) content_hash for each chunk' do
# Keep chunk hashes aligned with Apollo Writeback and compatible with
# older apollo_entries.content_hash columns fixed at MD5 length.
result = chunker.chunk(sections: [section])
result.each do |chunk|
expect(chunk[:content_hash]).to match(/\A[0-9a-f]{64}\z/)
expect(chunk[:content_hash]).to match(/\A[0-9a-f]{32}\z/)
end
end

it 'content_hash matches MD5 of whitespace-normalized content' do
# Mirrors Apollo::Helpers::Writeback.content_hash semantics: strip, downcase,
# collapse whitespace, then MD5. This keeps dedup consistent with apollo.
result = chunker.chunk(sections: [section])
result.each do |chunk|
normalized = chunk[:content].to_s.strip.downcase.gsub(/\s+/, ' ')
expect(chunk[:content_hash]).to eq(Digest::MD5.hexdigest(normalized))
end
end

it 'delegates to Apollo::Helpers::Writeback.content_hash when defined' do
writeback = Module.new do
def self.content_hash(content)
"apollo-#{Digest::MD5.hexdigest(content.to_s)}"[0, 32]
end
end
stub_const('Legion::Extensions::Apollo::Helpers::Writeback', writeback)
result = chunker.chunk(sections: [section])
expected = writeback.content_hash(result.first[:content])
expect(result.first[:content_hash]).to eq(expected)
end

it 'assigns sequential chunk_index values starting at 0' do
long_content = Array.new(20) { |i| "Paragraph number #{i} with some filler text to exceed the token limit." }.join("\n\n")
big_section = section.merge(content: long_content)
Expand Down
83 changes: 83 additions & 0 deletions spec/legion/extensions/knowledge/runners/ingest_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,89 @@
end
end

describe '.upsert_chunk_with_embedding — persistence outcome propagation' do
let(:chunk) do
{
content: 'Chunk contents for persistence test.',
content_hash: 'abcdef0123456789abcdef0123456789',
source_file: '/tmp/test.md',
heading: 'H1',
section_path: ['H1'],
chunk_index: 0,
token_count: 9
}
end
let(:embedding) { [0.1, 0.2, 0.3] }

context 'when dry_run: true' do
it 'returns :created without contacting apollo' do
expect(described_class).not_to receive(:ingest_to_apollo)
outcome = described_class.send(:upsert_chunk_with_embedding, chunk, embedding, dry_run: true)
expect(outcome).to eq(:created)
end
end

context 'when Legion::Extensions::Apollo is not defined' do
before { hide_const('Legion::Extensions::Apollo') if defined?(Legion::Extensions::Apollo) }

it 'returns :created' do
outcome = described_class.send(:upsert_chunk_with_embedding, chunk, embedding)
expect(outcome).to eq(:created)
end
end

context 'when apollo is defined' do
before { stub_const('Legion::Extensions::Apollo', Module.new) }

it 'returns :skipped when exists is true and force is false' do
expect(described_class).not_to receive(:ingest_to_apollo)
outcome = described_class.send(:upsert_chunk_with_embedding, chunk, embedding, exists: true)
expect(outcome).to eq(:skipped)
end

it 'returns :created when handle_ingest returns a success hash' do
allow(described_class).to receive(:ingest_to_apollo).and_return({ success: true, entry_id: 42 })
outcome = described_class.send(:upsert_chunk_with_embedding, chunk, embedding)
expect(outcome).to eq(:created)
end

it 'returns :updated when force: true and handle_ingest succeeds' do
allow(described_class).to receive(:ingest_to_apollo).and_return({ success: true, entry_id: 42 })
outcome = described_class.send(:upsert_chunk_with_embedding, chunk, embedding, force: true, exists: true)
expect(outcome).to eq(:updated)
end

it 'returns :skipped and emits a warn log when handle_ingest returns success: false' do
allow(described_class).to receive(:ingest_to_apollo)
.and_return({ success: false, error: 'PG::StringDataRightTruncation' })
expect(Legion::Logging).to receive(:warn).with(/apollo persistence not confirmed/)
outcome = described_class.send(:upsert_chunk_with_embedding, chunk, embedding)
expect(outcome).to eq(:skipped)
end

it 'returns :skipped when handle_ingest returns a non-Hash result' do
allow(described_class).to receive(:ingest_to_apollo).and_return(nil)
expect(Legion::Logging).to receive(:warn).with(/non-hash result class=NilClass/)
outcome = described_class.send(:upsert_chunk_with_embedding, chunk, embedding)
expect(outcome).to eq(:skipped)
end

it 'returns :skipped when handle_ingest returns a hash without success' do
allow(described_class).to receive(:ingest_to_apollo).and_return({ entry_id: 42 })
expect(Legion::Logging).to receive(:warn).with(/apollo persistence not confirmed/)
outcome = described_class.send(:upsert_chunk_with_embedding, chunk, embedding)
expect(outcome).to eq(:skipped)
end

it 'returns :skipped and logs when ingest_to_apollo raises' do
allow(described_class).to receive(:ingest_to_apollo).and_raise(StandardError, 'boom')
expect(Legion::Logging).to receive(:warn).with(/unexpected error/)
outcome = described_class.send(:upsert_chunk_with_embedding, chunk, embedding)
expect(outcome).to eq(:skipped)
end
end
end

describe '.ingest_corpus — delta behavior' do
let(:tmp_dir) { Dir.mktmpdir }
let(:store_dir) { Dir.mktmpdir }
Expand Down
Loading