From 5c73befdb07792ca694255fece23a943cdcba76f Mon Sep 17 00:00:00 2001
From: Simon Chow <simon.chow@stellar.org>
Date: Tue, 23 Jun 2026 13:23:48 -0400
Subject: [PATCH 1/4] =?UTF-8?q?feat(fullhistory):=20streaming=20daemon=20s?=
 =?UTF-8?q?lice=201=20=E2=80=94=20layer=202=20(storage)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../internal/fullhistory/ingest/driver.go     | 219 ++--
 .../fullhistory/ingest/ingest_test.go         | 935 ++----------------
 .../internal/fullhistory/ingest/service.go    |  81 +-
 .../pkg/stores/hotchunk/hotchunk.go           | 141 +++
 .../pkg/stores/hotchunk/hotchunk_test.go      | 202 ++++
 .../pkg/stores/ledger/hot_store.go            |  82 +-
 .../fullhistory/streaming/hotsource.go        | 164 +++
 .../internal/fullhistory/streaming/process.go | 364 +++++++
 .../fullhistory/streaming/process_test.go     | 537 ++++++++++
 9 files changed, 1758 insertions(+), 967 deletions(-)
 create mode 100644 cmd/stellar-rpc/internal/fullhistory/pkg/stores/hotchunk/hotchunk.go
 create mode 100644 cmd/stellar-rpc/internal/fullhistory/pkg/stores/hotchunk/hotchunk_test.go
 create mode 100644 cmd/stellar-rpc/internal/fullhistory/streaming/hotsource.go
 create mode 100644 cmd/stellar-rpc/internal/fullhistory/streaming/process.go
 create mode 100644 cmd/stellar-rpc/internal/fullhistory/streaming/process_test.go

diff --git a/cmd/stellar-rpc/internal/fullhistory/ingest/driver.go b/cmd/stellar-rpc/internal/fullhistory/ingest/driver.go
index 464a29aff..5bb323c74 100644
--- a/cmd/stellar-rpc/internal/fullhistory/ingest/driver.go
+++ b/cmd/stellar-rpc/internal/fullhistory/ingest/driver.go
@@ -14,58 +14,33 @@ import (
 	"github.com/stellar/go-stellar-sdk/xdr"
 
 	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/pkg/chunk"
-	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/pkg/stores/eventstore"
-	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/pkg/stores/ledger"
-	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/pkg/stores/txhash"
+	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/pkg/stores/hotchunk"
 )
 
-// HotStores holds the long-lived, caller-owned hot stores injected into RunHot.
-// The caller (the daemon) opens and closes these; RunHot only borrows them to
-// build the per-type hot ingesters. A field left nil for an enabled data type is
-// a configuration error caught by RunHot. Every hot store is chunk-bound (each
-// instance accumulates exactly one chunk before being frozen into cold
-// artifacts), so each injected store must already be bound to the chunk being
-// ingested — RunHot rejects a mismatch up front.
+// HotStores holds the long-lived, caller-owned per-chunk hot DB injected into
+// RunHot. The caller (the daemon) opens and closes it; RunHot only borrows it
+// to drive the per-ledger atomic ingest. The DB is chunk-bound (it accumulates
+// exactly one chunk before being frozen into cold artifacts), so the injected
+// DB must already be bound to the chunk being ingested — RunHot rejects a
+// mismatch up front. A nil DB with any data type enabled in cfg is a
+// configuration error caught by RunHot.
 type HotStores struct {
-	Ledgers *ledger.HotStore
-	Txhash  *txhash.HotStore
-	Events  *eventstore.HotStore
+	// HotDB is the per-chunk hot DB. Required when any hot data type is enabled.
+	HotDB *hotchunk.DB
 }
 
-// buildHotIngesters constructs one HotIngester per data type enabled in cfg, in
-// canonical ledgers→txhash→events order, from the injected stores. It errors if
-// an enabled type's store is nil.
-func buildHotIngesters(stores HotStores, sink MetricSink, cfg Config) ([]HotIngester, error) {
-	var ings []HotIngester
-	if cfg.Ledgers {
-		if stores.Ledgers == nil {
-			return nil, errors.New("ingest: Ledgers enabled but HotStores.Ledgers is nil")
-		}
-		ings = append(ings, NewLedgerHotIngester(stores.Ledgers, sink))
-	}
-	if cfg.Txhash {
-		if stores.Txhash == nil {
-			return nil, errors.New("ingest: Txhash enabled but HotStores.Txhash is nil")
-		}
-		ings = append(ings, NewTxhashHotIngester(stores.Txhash, sink))
-	}
-	if cfg.Events {
-		if stores.Events == nil {
-			return nil, errors.New("ingest: Events enabled but HotStores.Events is nil")
-		}
-		ings = append(ings, NewEventsHotIngester(stores.Events, sink))
-	}
-	return ings, nil
+// ingestContributions maps cfg's enabled data types onto the hotchunk.Ingest
+// toggles that select which CFs the single per-ledger batch writes. Only
+// cfg.Ledgers is forwarded; any other Ingest field keeps its zero value.
+func ingestContributions(cfg Config) hotchunk.Ingest {
+	return hotchunk.Ingest{Ledgers: cfg.Ledgers}
+}
 
 // buildColdIngesters opens one ColdIngester per data type enabled in cfg,
 // each opening its own per-chunk writer under coldDir/<type> (constructors
 // create their own directories and freely overwrite any prior attempt's
-// files — see the package doc's artifact model). The constructor table below
-// is the single definition site of the canonical ledgers→txhash→events order
-// (buildHotIngesters keeps its explicit if-ladder because its three injected
-// store types differ). On any constructor error it closes the ingesters built
-// so far and returns.
+// files — see the package doc's artifact model). On any constructor error it
+// closes the ingesters built so far and returns.
 func buildColdIngesters(coldDir string, chunkID chunk.ID, sink MetricSink, cfg Config) ([]ColdIngester, error) {
 	ctors := []struct {
 		enabled  bool
@@ -73,8 +48,6 @@ func buildColdIngesters(coldDir string, chunkID chunk.ID, sink MetricSink, cfg C
 		open     func(string, chunk.ID, MetricSink) (ColdIngester, error)
 	}{
 		{cfg.Ledgers, dataTypeLedgers, NewLedgerColdIngester},
-		{cfg.Txhash, dataTypeTxhash, NewTxhashColdIngester},
-		{cfg.Events, dataTypeEvents, NewEventsColdIngester},
 	}
 	var ings []ColdIngester
 	for _, c := range ctors {
@@ -123,11 +96,12 @@ func closeColdAll(ings []ColdIngester, err error) error {
 }
 
 // RunHot opens one stream for chunkID from source and feeds each ledger (as a
-// view) to a HotService over the enabled hot ingesters, built from the INJECTED,
-// caller-owned stores in hotStores. Ingest errors abort fast; HotService.Ingest
-// waits for all ingesters before the loop pulls again so the borrowed view is
-// never read past its lifetime. The hot stores are NOT closed here — the caller
-// owns their lifecycle.
+// view) to a HotService backed by the INJECTED, caller-owned shared per-chunk
+// hot DB in hotStores. Each ledger commits as ONE atomic synced WriteBatch
+// across all enabled CFs (decision (a)); Ingest errors abort fast, and
+// HotService.Ingest consumes the borrowed view synchronously before the loop
+// pulls the next ledger. The hot DB is NOT closed here — the caller owns its
+// lifecycle.
 func RunHot(
 	ctx context.Context,
 	logger *supportlog.Entry,
@@ -140,47 +114,25 @@ func RunHot(
 	if verr := cfg.validate(); verr != nil {
 		return verr
 	}
-	// Every hot store is chunk-bound — each instance accumulates exactly one
-	// chunk's data before being frozen into the chunk's cold artifacts — and
-	// records its chunk at open time. An injected store bound to a different
-	// chunk than we're ingesting would silently interleave two chunks' data
-	// (ledgers, txhash) or fail every per-ledger write with an out-of-range
-	// offset (events, whose LedgerOffsets are chunk-relative), so catch the
-	// mismatch up front with a clear message. Nil stores are skipped here:
-	// buildHotIngesters rejects a nil store for an enabled type with a more
-	// specific error.
-	checkBinding := func(name string, got chunk.ID) error {
-		if got != chunkID {
-			return fmt.Errorf("ingest: RunHot chunk %d but injected %s store is bound to chunk %d",
-				uint32(chunkID), name, uint32(got))
-		}
-		return nil
-	}
-	if cfg.Ledgers && hotStores.Ledgers != nil {
-		if err := checkBinding("Ledgers", hotStores.Ledgers.ChunkID()); err != nil {
-			return err
-		}
-	}
-	if cfg.Txhash && hotStores.Txhash != nil {
-		if err := checkBinding("Txhash", hotStores.Txhash.ChunkID()); err != nil {
-			return err
-		}
-	}
-	if cfg.Events && hotStores.Events != nil {
-		if err := checkBinding("Events", hotStores.Events.ChunkID()); err != nil {
-			return err
-		}
+	anyEnabled := cfg.Ledgers
+	if anyEnabled && hotStores.HotDB == nil {
+		return errors.New("ingest: a hot data type is enabled but HotStores.HotDB is nil")
 	}
-	ings, berr := buildHotIngesters(hotStores, sink, cfg)
-	if berr != nil {
-		return berr
+	// The hot DB is chunk-bound — it accumulates exactly one chunk's data
+	// before being frozen into the chunk's cold artifacts — and records its
+	// chunk at open time. An injected DB bound to a different chunk than we're
+	// ingesting would silently interleave two chunks' data, so catch the
+	// mismatch up front with a clear message.
+	if hotStores.HotDB != nil && hotStores.HotDB.ChunkID() != chunkID {
+		return fmt.Errorf("ingest: RunHot chunk %d but injected hot DB is bound to chunk %d",
+			uint32(chunkID), uint32(hotStores.HotDB.ChunkID()))
 	}
 	stream, oerr := source.OpenStream(chunkID)
 	if oerr != nil {
 		return fmt.Errorf("open stream for chunk %d: %w", uint32(chunkID), oerr)
 	}
 	logger.Debugf("RunHot: ingesting chunk %d [%d, %d]", uint32(chunkID), chunkID.FirstLedger(), chunkID.LastLedger())
-	service := NewHotService(ings, sink)
+	service := NewHotService(hotStores.HotDB, ingestContributions(cfg), sink)
 	return drain(ctx, stream, chunkID, service)
 }
 
@@ -235,6 +187,107 @@ func drain(ctx context.Context, stream ledgerbackend.LedgerStream, chunkID chunk
 	return nil
 }
 
+// ColdDirs names the per-data-type output root for one chunk's cold artifacts.
+// Each field is the directory UNDER WHICH the matching cold ingester composes
+// its {bucketID:05d}/ subdirectory — i.e. the same `coldDir` the per-type
+// constructor (NewLedgerColdIngester) takes. A field left "" for a data type
+// enabled in cfg is rejected by buildColdIngestersIn, so RunColdChunk errors.
+//
+// RunCold derives this root from a single coldDir by appending the fixed
+// dataType subdirectory (coldDir/ledgers). ColdDirs exists so a caller with a
+// DIFFERENT on-disk layout can place each artifact at its own canonical path
+// while reusing the very same cold ingesters, ColdService, and drain loop.
+type ColdDirs struct {
+	Ledgers string // root for NewLedgerColdIngester; required when cfg.Ledgers is set
+}
+
+// buildColdIngestersIn opens one ColdIngester per data type enabled in cfg,
+// each under its OWN root from dirs (rather than coldDir/<dataType>). It is the
+// ColdDirs counterpart of buildColdIngesters: same constructors, same
+// rollback-on-constructor-error semantics. An enabled type whose root is ""
+// is an error that names the offending ColdDirs field as declared in Go.
+func buildColdIngestersIn(dirs ColdDirs, chunkID chunk.ID, sink MetricSink, cfg Config) ([]ColdIngester, error) {
+	ctors := []struct {
+		enabled  bool
+		dataType string
+		field    string // exported ColdDirs field name, used only in error text
+		dir      string
+		open     func(string, chunk.ID, MetricSink) (ColdIngester, error)
+	}{
+		{cfg.Ledgers, dataTypeLedgers, "Ledgers", dirs.Ledgers, NewLedgerColdIngester},
+	}
+	var ings []ColdIngester
+	for _, c := range ctors {
+		if !c.enabled {
+			continue
+		}
+		if c.dir == "" {
+			return nil, closeColdAll(ings, fmt.Errorf("ingest: %s enabled but ColdDirs.%s is empty", c.dataType, c.field))
+		}
+		ing, err := c.open(c.dir, chunkID, sink)
+		if err != nil {
+			return nil, closeColdAll(ings, fmt.Errorf("open %s cold ingester: %w", c.dataType, err))
+		}
+		ings = append(ings, ing)
+	}
+	return ings, nil
+}
+
+// RunColdChunk ingests EXACTLY ONE chunk's cold artifacts from source into the
+// per-data-type roots named by dirs, in a single streaming pass over the
+// chunk's ledgers. It is the single-chunk, explicit-layout sibling of RunCold:
+// same cold ingester constructors, same ColdService, same drain loop
+// (sequence/overrun validation, full-range completeness check before
+// Finalize). It differs only in (1) producing one chunk rather than N
+// concurrent chunks and (2) taking explicit per-type output roots, for
+// callers whose layout is not coldDir/<dataType>.
+//
+// The cold ingesters overwrite any prior attempt's files at their canonical
+// paths (see the package doc's artifact model), so RunColdChunk is the
+// re-materialization primitive the streaming freeze protocol drives: a partial
+// file from a crashed attempt is inert scratch the next call overwrites.
+func RunColdChunk(
+	ctx context.Context,
+	logger *supportlog.Entry,
+	source ChunkSource,
+	dirs ColdDirs,
+	chunkID chunk.ID,
+	sink MetricSink,
+	cfg Config,
+) (err error) {
+	if verr := cfg.validate(); verr != nil {
+		return verr
+	}
+	sink = orNop(sink)
+	start := time.Now() // basis for the ColdChunkTotal reported on each early exit below
+	if cerr := ctx.Err(); cerr != nil {
+		sink.ColdChunkTotal(time.Since(start))
+		return cerr
+	}
+	stream, oerr := source.OpenStream(chunkID)
+	if oerr != nil {
+		sink.ColdChunkTotal(time.Since(start))
+		return fmt.Errorf("open stream for chunk %d: %w", uint32(chunkID), oerr)
+	}
+	ings, berr := buildColdIngestersIn(dirs, chunkID, sink, cfg)
+	if berr != nil {
+		sink.ColdChunkTotal(time.Since(start))
+		return berr
+	}
+	logger.Debugf("RunColdChunk: ingesting chunk %d [%d, %d]", uint32(chunkID), chunkID.FirstLedger(), chunkID.LastLedger())
+	service := NewColdService(ings, sink)
+	// Close runs on every path from here on; its error is joined into the named result.
+	defer func() {
+		if cerr := service.Close(); cerr != nil {
+			err = errors.Join(err, fmt.Errorf("close: %w", cerr))
+		}
+	}()
+	if derr := drain(ctx, stream, chunkID, service); derr != nil {
+		return derr
+	}
+	return service.Finalize(ctx)
+}
+
 // RunCold ingests numChunks consecutive chunks starting at startChunk into the
 // cold stores under coldDir, processing up to chunkWorkers chunks concurrently.
 // Each chunk worker opens its own stream via source.OpenStream(chunkID), builds
diff --git a/cmd/stellar-rpc/internal/fullhistory/ingest/ingest_test.go b/cmd/stellar-rpc/internal/fullhistory/ingest/ingest_test.go
index 75d6ec825..dcd040df1 100644
--- a/cmd/stellar-rpc/internal/fullhistory/ingest/ingest_test.go
+++ b/cmd/stellar-rpc/internal/fullhistory/ingest/ingest_test.go
@@ -1,7 +1,6 @@
 package ingest
 
 import (
-	"bytes"
 	"context"
 	"errors"
 	"iter"
@@ -22,11 +21,9 @@ import (
 	supportlog "github.com/stellar/go-stellar-sdk/support/log"
 	"github.com/stellar/go-stellar-sdk/xdr"
 
-	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/events"
 	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/pkg/chunk"
-	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/pkg/stores/eventstore"
+	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/pkg/stores/hotchunk"
 	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/pkg/stores/ledger"
-	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/pkg/stores/txhash"
 )
 
 // testPassphrase is a network passphrase literal used only by the test fixtures
@@ -199,53 +196,6 @@ func marshalLCM(t *testing.T, seq uint32) []byte {
 	return raw
 }
 
-// eventTopic is the symbol topic embedded by event-bearing fixtures; the same
-// term key derives from it, so tests can look the event up in the index.
-const eventTopic = "ingest_test"
-
-// marshalLCMWithEvent builds a V2 LCM carrying one transaction with one
-// operation-level contract event (topic=eventTopic). It returns the wire bytes,
-// the transaction hash (for txhash lookups), and the event's term key (for event
-// index lookups).
-func marshalLCMWithEvent(t *testing.T, seq uint32) ([]byte, [32]byte, events.TermKey) {
-	t.Helper()
-	ev := buildContractEvent(eventTopic)
-	meta := xdr.TransactionMeta{
-		V:  4,
-		V4: &xdr.TransactionMetaV4{Operations: []xdr.OperationMetaV2{{Events: []xdr.ContractEvent{ev}}}},
-	}
-	lcm, hash := buildLCMWithTx(t, seq, meta)
-	rawBytes, err := lcm.MarshalBinary()
-	require.NoError(t, err)
-
-	evBytes, err := ev.MarshalBinary()
-	require.NoError(t, err)
-	keys, err := events.TermsForBytes(evBytes)
-	require.NoError(t, err)
-	require.NotEmpty(t, keys)
-	return rawBytes, hash, keys[0]
-}
-
-// buildContractEvent returns a contract ContractEvent with a single symbol
-// topic, mirroring the events-package test fixture.
-func buildContractEvent(topic string) xdr.ContractEvent {
-	var contractID xdr.ContractId
-	contractID[0] = 0xab
-	contractID[1] = 0xcd
-	sym := xdr.ScSymbol(topic)
-	return xdr.ContractEvent{
-		ContractId: &contractID,
-		Type:       xdr.ContractEventTypeContract,
-		Body: xdr.ContractEventBody{
-			V: 0,
-			V0: &xdr.ContractEventV0{
-				Topics: []xdr.ScVal{{Type: xdr.ScValTypeScvSymbol, Sym: &sym}},
-				Data:   xdr.ScVal{Type: xdr.ScValTypeScvSymbol, Sym: &sym},
-			},
-		},
-	}
-}
-
 func successResult() xdr.TransactionResult {
 	opResults := []xdr.OperationResult{}
 	return xdr.TransactionResult{
@@ -265,14 +215,6 @@ func buildLCM(t *testing.T, seq uint32, txMetas []xdr.TransactionMeta) xdr.Ledge
 	return lcm
 }
 
-// buildLCMWithTx builds a single-transaction V2 LCM and returns the tx hash.
-func buildLCMWithTx(t *testing.T, seq uint32, meta xdr.TransactionMeta) (xdr.LedgerCloseMeta, [32]byte) {
-	t.Helper()
-	lcm, hashes := buildLCMReturningHashes(t, seq, []xdr.TransactionMeta{meta})
-	require.Len(t, hashes, 1)
-	return lcm, hashes[0]
-}
-
 // buildLCMReturningHashes assembles a V2 LedgerCloseMeta with one envelope per tx
 // meta and returns the per-tx transaction hashes in order.
 func buildLCMReturningHashes(
@@ -347,21 +289,6 @@ func viewOf(t *testing.T, seq uint32) xdr.LedgerCloseMetaView {
 	return xdr.LedgerCloseMetaView(marshalLCM(t, seq))
 }
 
-// marshalV0LCM builds a minimal V0 (pre-Soroban) LedgerCloseMeta with no
-// transactions and returns its wire bytes. V0 LCMs carry no contract events,
-// so the events ingesters record them as a zero-payload ledger.
-func marshalV0LCM(t *testing.T, seq uint32) []byte {
-	t.Helper()
-	lcm := xdr.LedgerCloseMeta{V: 0, V0: &xdr.LedgerCloseMetaV0{
-		LedgerHeader: xdr.LedgerHeaderHistoryEntry{
-			Header: xdr.LedgerHeader{LedgerSeq: xdr.Uint32(seq)},
-		},
-	}}
-	raw, err := lcm.MarshalBinary()
-	require.NoError(t, err)
-	return raw
-}
-
 // seqStream is a ledgerbackend.LedgerStream that yields LCMs for an explicit
 // list of ledger sequences (in order), regardless of the requested range. It
 // models a backend that hands back a duplicate / out-of-order / wrong-but-
@@ -434,48 +361,6 @@ func TestLedgerHotIngester_Readback(t *testing.T) {
 	require.Equal(t, raw, got)
 }
 
-// TestTxhashHotIngester_Lookup ingests an event/tx-bearing ledger via the hot
-// txhash ingester and looks the hash up.
-func TestTxhashHotIngester_Lookup(t *testing.T) {
-	seq := chunk.ID(0).FirstLedger()
-	raw, hash, _ := marshalLCMWithEvent(t, seq)
-	dir := t.TempDir()
-	logger := testLogger()
-
-	store, err := txhash.NewHotStore(dir, chunk.ID(0), logger)
-	require.NoError(t, err)
-	defer func() { require.NoError(t, store.Close()) }()
-
-	ing := NewTxhashHotIngester(store, nil)
-	require.NoError(t, ing.Ingest(context.Background(), seq, xdr.LedgerCloseMetaView(raw)))
-
-	got, err := store.Get(hash)
-	require.NoError(t, err)
-	require.Equal(t, seq, got)
-}
-
-// TestEventsHotIngester_Query ingests an event-bearing ledger via the hot events
-// ingester and resolves the term.
-func TestEventsHotIngester_Query(t *testing.T) {
-	chunkID := chunk.ID(0)
-	seq := chunkID.FirstLedger()
-	raw, _, term := marshalLCMWithEvent(t, seq)
-	dir := t.TempDir()
-	logger := testLogger()
-
-	store, err := eventstore.OpenHotStore(dir, chunkID, logger)
-	require.NoError(t, err)
-	defer func() { require.NoError(t, store.Close()) }()
-
-	ing := NewEventsHotIngester(store, nil)
-	require.NoError(t, ing.Ingest(context.Background(), seq, xdr.LedgerCloseMetaView(raw)))
-
-	bm, err := store.Lookup(context.Background(), term)
-	require.NoError(t, err)
-	require.NotNil(t, bm)
-	require.Equal(t, uint64(1), bm.GetCardinality())
-}
-
 // TestLedgerColdIngester_Readback ingests one ledger via the cold ledger
 // ingester, finalizes, and reads back through the cold reader.
 func TestLedgerColdIngester_Readback(t *testing.T) {
@@ -499,355 +384,119 @@ func TestLedgerColdIngester_Readback(t *testing.T) {
 	require.Equal(t, raw, got)
 }
 
-// txhashBinPath composes the documented raw-txhash chunk path under root for
-// the tests' fixed chunk 0: {root}/{bucketID:05d}/{chunkID:08d}.bin.
-func txhashBinPath(root string) string {
-	c := chunk.ID(0)
-	return filepath.Join(root, c.BucketID(), txhash.ColdBinName(c))
-}
-
-// TestTxhashColdIngester_Bin ingests two tx-bearing ledgers via the cold txhash
-// ingester, finalizes, and reads the .bin back through the store codec.
-func TestTxhashColdIngester_Bin(t *testing.T) {
-	chunkID := chunk.ID(0)
-	first := chunkID.FirstLedger()
-	coldDir := t.TempDir()
-
-	ing, err := NewTxhashColdIngester(coldDir, chunkID, nil)
-	require.NoError(t, err)
-	defer func() { require.NoError(t, ing.Close()) }()
-
-	for _, seq := range []uint32{first, first + 1} {
-		raw, _, _ := marshalLCMWithEvent(t, seq)
-		require.NoError(t, ing.Ingest(context.Background(), seq, xdr.LedgerCloseMetaView(raw)))
-	}
-	require.NoError(t, ing.Finalize(context.Background()))
-
-	entries, err := txhash.ReadColdBin(txhashBinPath(coldDir))
-	require.NoError(t, err)
-	require.Len(t, entries, 2)
-}
-
-// TestEventsColdIngester_Readback ingests two event-bearing ledgers via the cold
-// events ingester, finalizes, and resolves the term through the cold reader.
-func TestEventsColdIngester_Readback(t *testing.T) {
-	chunkID := chunk.ID(0)
-	first := chunkID.FirstLedger()
-	coldDir := t.TempDir()
-
-	ing, err := NewEventsColdIngester(coldDir, chunkID, nil)
-	require.NoError(t, err)
-	defer func() { require.NoError(t, ing.Close()) }()
-
-	var term events.TermKey
-	for _, seq := range []uint32{first, first + 1} {
-		raw, _, tk := marshalLCMWithEvent(t, seq)
-		term = tk
-		require.NoError(t, ing.Ingest(context.Background(), seq, xdr.LedgerCloseMetaView(raw)))
-	}
-	require.NoError(t, ing.Finalize(context.Background()))
-
-	bucketDir := filepath.Join(coldDir, chunkID.BucketID())
-	cr, err := eventstore.OpenColdReader(chunkID, bucketDir, eventstore.ColdReaderOptions{})
-	require.NoError(t, err)
-	defer func() { require.NoError(t, cr.Close()) }()
-	cnt, err := cr.EventCount()
-	require.NoError(t, err)
-	require.Equal(t, uint32(2), cnt)
-	bm, err := cr.Lookup(context.Background(), term)
-	require.NoError(t, err)
-	require.NotNil(t, bm)
-	require.Equal(t, uint64(2), bm.GetCardinality())
-}
-
-// ───────────────────────── V0 (pre-Soroban) events handling ─────────────────────────
-
-// TestEventsHotIngester_V0AsEmpty asserts the hot events ingester treats a V0
-// LCM as a zero-event ledger (no error) rather than failing the range, and that
-// the store records the empty ledger (its event count is unchanged).
-func TestEventsHotIngester_V0AsEmpty(t *testing.T) {
-	chunkID := chunk.ID(0)
-	seq := chunkID.FirstLedger()
-	dir := t.TempDir()
-	logger := testLogger()
-
-	store, err := eventstore.OpenHotStore(dir, chunkID, logger)
-	require.NoError(t, err)
-	defer func() { require.NoError(t, store.Close()) }()
-
-	ing := NewEventsHotIngester(store, nil)
-	require.NoError(t, ing.Ingest(context.Background(), seq, xdr.LedgerCloseMetaView(marshalV0LCM(t, seq))),
-		"V0 ledger must ingest as zero events, not error")
-
-	cnt, err := store.EventCount()
-	require.NoError(t, err)
-	require.Equal(t, uint32(0), cnt, "V0 ledger contributes no events")
-}
-
-// TestEventsColdIngester_V0KeepsOffsetsContiguous ingests a V0 ledger followed by
-// an event-bearing V2 ledger and asserts: the V0 ledger does not error, and the
-// LedgerOffsets stay contiguous (both ledgers present, the event-bearing one's
-// single event ID immediately follows the empty V0 ledger).
-func TestEventsColdIngester_V0KeepsOffsetsContiguous(t *testing.T) {
-	chunkID := chunk.ID(0)
-	first := chunkID.FirstLedger()
-	coldDir := t.TempDir()
-
-	ing, err := NewEventsColdIngester(coldDir, chunkID, nil)
-	require.NoError(t, err)
-	defer func() { require.NoError(t, ing.Close()) }()
-
-	// Ledger `first`: V0 → zero events, no error.
-	require.NoError(t, ing.Ingest(context.Background(), first, xdr.LedgerCloseMetaView(marshalV0LCM(t, first))))
-	// Ledger `first+1`: one contract event.
-	rawEv, _, term := marshalLCMWithEvent(t, first+1)
-	require.NoError(t, ing.Ingest(context.Background(), first+1, xdr.LedgerCloseMetaView(rawEv)))
-	require.NoError(t, ing.Finalize(context.Background()))
-
-	bucketDir := filepath.Join(coldDir, chunkID.BucketID())
-	cr, err := eventstore.OpenColdReader(chunkID, bucketDir, eventstore.ColdReaderOptions{})
-	require.NoError(t, err)
-	defer func() { require.NoError(t, cr.Close()) }()
-
-	// One event total, from the V2 ledger.
-	cnt, err := cr.EventCount()
-	require.NoError(t, err)
-	require.Equal(t, uint32(1), cnt)
-
-	// Offsets are contiguous: both ledgers recorded, V0 contributes [0,0), the
-	// event-bearing ledger contributes exactly event ID 0.
-	offsets, err := cr.Offsets()
-	require.NoError(t, err)
-	require.Equal(t, 2, offsets.LedgerCount(), "both V0 and V2 ledgers recorded")
-	require.Equal(t, first, offsets.StartLedger())
-	v0Start, v0End, err := offsets.EventIDs(first)
-	require.NoError(t, err)
-	require.Equal(t, uint32(0), v0Start)
-	require.Equal(t, uint32(0), v0End, "V0 ledger has an empty event range")
-	evStart, evEnd, err := offsets.EventIDs(first + 1)
-	require.NoError(t, err)
-	require.Equal(t, uint32(0), evStart, "event ID follows the empty V0 ledger contiguously")
-	require.Equal(t, uint32(1), evEnd)
-
-	// And the event is queryable by its term.
-	bm, err := cr.Lookup(context.Background(), term)
-	require.NoError(t, err)
-	require.NotNil(t, bm)
-	require.Equal(t, uint64(1), bm.GetCardinality())
-}
-
-// TestRunCold_EventlessChunk_FullyReadable drives a full cold chunk of V0
-// (pre-Soroban, eventless) ledgers with Events enabled — the common backfill
-// case for early history. The whole chunk has zero contract events;
-// eventstore.WriteColdIndex publishes a valid EMPTY index for it, so all
-// three cold artifacts exist and the chunk is fully readable: a term-filtered
-// Lookup resolves to "no matches" through the ordinary path instead of a
-// missing-file error.
-func TestRunCold_EventlessChunk_FullyReadable(t *testing.T) {
-	chunkID := chunk.ID(0)
-	coldDir := t.TempDir()
-	logger := testLogger()
-	sink := &testSink{}
-
-	// Every ledger in the chunk is a V0 (pre-Soroban) ledger → zero events.
-	require.NoError(t, RunCold(
-		context.Background(), logger, sourceOf(fullStream(t, chunkID, marshalV0LCM)),
-		coldDir, chunkID, 1, 1, sink, Config{Events: true},
-	))
-
-	bucketDir := filepath.Join(coldDir, dataTypeEvents, chunkID.BucketID())
-
-	// All three cold artifacts exist (events.pack + the empty index pair).
-	for _, name := range []string{
-		eventstore.EventsPackName(chunkID),
-		eventstore.IndexPackName(chunkID),
-		eventstore.IndexHashName(chunkID),
-	} {
-		_, statErr := os.Stat(filepath.Join(bucketDir, name))
-		require.NoError(t, statErr, "eventless chunk must publish %s", name)
-	}
-
-	// The chunk is readable end to end: zero events, and a filtered lookup
-	// misses cleanly rather than erroring on a missing index.
-	cr, err := eventstore.OpenColdReader(chunkID, bucketDir, eventstore.ColdReaderOptions{})
-	require.NoError(t, err)
-	defer func() { require.NoError(t, cr.Close()) }()
-	cnt, err := cr.EventCount()
-	require.NoError(t, err)
-	require.Zero(t, cnt)
-	_, lerr := cr.Lookup(context.Background(), events.ComputeTermKey([]byte("any"), events.FieldContractID))
-	require.ErrorIs(t, lerr, eventstore.ErrTermNotFound)
-
-	// Metrics still fired: one aggregate per-chunk, one (clean) per-ingester.
-	require.Equal(t, 1, sink.coldChunkTotals, "ColdChunkTotal must fire for an eventless chunk")
-	require.Equal(t, 1, sink.coldDataTypes()[dataTypeEvents], "one ColdIngest for events")
-	require.Zero(t, sink.coldErrorTypes()[dataTypeEvents], "eventless chunk is not an error")
-}
-
 // ───────────────────────── HotService tests ─────────────────────────
 
-// TestHotService_AllTypes_FanOut runs HotService with all three hot ingesters
-// over event/tx-bearing ledgers and reads each store back, asserting the
-// aggregate HotLedgerTotal and per-ingester signals fired.
-func TestHotService_AllTypes_FanOut(t *testing.T) {
+// TestHotService_Ledgers_OneAtomicBatch runs HotService over the SHARED multi-CF
+// hot DB (decision (a)) and reads the ledger CF back through the DB's facade,
+// asserting the aggregate HotLedgerTotal and the per-type HotIngest signals
+// fired. Each ledger committed as ONE atomic synced WriteBatch.
+func TestHotService_Ledgers_OneAtomicBatch(t *testing.T) {
 	chunkID := chunk.ID(0)
 	first := chunkID.FirstLedger()
 	logger := testLogger()
-	dir := t.TempDir()
 
-	ls, err := ledger.OpenHotStore(filepath.Join(dir, "ledgers"), chunkID, logger)
-	require.NoError(t, err)
-	defer func() { require.NoError(t, ls.Close()) }()
-	ts, err := txhash.NewHotStore(filepath.Join(dir, "txhash"), chunkID, logger)
+	db, err := hotchunk.Open(t.TempDir(), chunkID, logger)
 	require.NoError(t, err)
-	defer func() { require.NoError(t, ts.Close()) }()
-	es, err := eventstore.OpenHotStore(filepath.Join(dir, "events"), chunkID, logger)
-	require.NoError(t, err)
-	defer func() { require.NoError(t, es.Close()) }()
+	defer func() { require.NoError(t, db.Close()) }()
 
 	sink := &testSink{}
-	service := NewHotService([]HotIngester{
-		NewLedgerHotIngester(ls, sink),
-		NewTxhashHotIngester(ts, sink),
-		NewEventsHotIngester(es, sink),
-	}, sink)
-
-	rawA, hashA, termA := marshalLCMWithEvent(t, first)
-	rawB, hashB, _ := marshalLCMWithEvent(t, first+1)
+	service := NewHotService(db, hotchunk.Ingest{Ledgers: true}, sink)
+
+	rawA := marshalLCM(t, first)
+	rawB := marshalLCM(t, first+1)
 	require.NoError(t, service.Ingest(context.Background(), first, xdr.LedgerCloseMetaView(rawA)))
 	require.NoError(t, service.Ingest(context.Background(), first+1, xdr.LedgerCloseMetaView(rawB)))
 
-	// All three stores retained the data.
-	gotRawA, err := ls.GetLedgerRaw(first)
+	// The ledger CF retained the data (read through the shared DB's facade).
+	gotRawA, err := db.Ledgers().GetLedgerRaw(first)
 	require.NoError(t, err)
 	require.Equal(t, rawA, gotRawA)
-	gotA, err := ts.Get(hashA)
+	gotRawB, err := db.Ledgers().GetLedgerRaw(first + 1)
 	require.NoError(t, err)
-	require.Equal(t, first, gotA)
-	gotB, err := ts.Get(hashB)
-	require.NoError(t, err)
-	require.Equal(t, first+1, gotB)
-	bm, err := es.Lookup(context.Background(), termA)
+	require.Equal(t, rawB, gotRawB)
+
+	// The single watermark advanced to the last committed ledger (decision (a)).
+	maxSeq, ok, err := db.MaxCommittedSeq()
 	require.NoError(t, err)
-	require.Equal(t, uint64(2), bm.GetCardinality())
+	require.True(t, ok)
+	require.Equal(t, first+1, maxSeq)
 
-	// Aggregate + per-ingester signals.
+	// Aggregate + per-type signals.
 	require.Equal(t, 2, sink.hotLedgerTotals, "one HotLedgerTotal per ledger")
 	dt := sink.hotDataTypes()
 	require.Equal(t, 2, dt[dataTypeLedgers])
-	require.Equal(t, 2, dt[dataTypeTxhash])
-	require.Equal(t, 2, dt[dataTypeEvents])
-
-	// Per-stage signals: each ledger fired the hot extract/write stages its
-	// data type defines (ledgers has no extract — it writes the view verbatim).
-	st := sink.stageCounts()
-	require.Equal(t, 2, st[dataTypeLedgers+"/"+tierHot+"/"+stageWrite])
-	require.Equal(t, 2, st[dataTypeTxhash+"/"+tierHot+"/"+stageExtract])
-	require.Equal(t, 2, st[dataTypeTxhash+"/"+tierHot+"/"+stageWrite])
-	require.Equal(t, 2, st[dataTypeEvents+"/"+tierHot+"/"+stageExtract])
-	require.Equal(t, 2, st[dataTypeEvents+"/"+tierHot+"/"+stageWrite])
 }
 
-// TestHotService_EnabledSubset runs HotService with only the ledger ingester and
-// asserts only that type's signals fire.
+// TestHotService_EnabledSubset ingests one ledger with only ledgers enabled and
+// asserts exactly one HotLedgerTotal and one ledgers per-type signal fire.
 func TestHotService_EnabledSubset(t *testing.T) {
 	seq := chunk.ID(0).FirstLedger()
 	logger := testLogger()
-	dir := t.TempDir()
 
-	ls, err := ledger.OpenHotStore(dir, chunk.ID(0), logger)
+	db, err := hotchunk.Open(t.TempDir(), chunk.ID(0), logger)
 	require.NoError(t, err)
-	defer func() { require.NoError(t, ls.Close()) }()
+	defer func() { require.NoError(t, db.Close()) }()
 
 	sink := &testSink{}
-	service := NewHotService([]HotIngester{NewLedgerHotIngester(ls, sink)}, sink)
+	service := NewHotService(db, hotchunk.Ingest{Ledgers: true}, sink)
 	require.NoError(t, service.Ingest(context.Background(), seq, viewOf(t, seq)))
 
 	require.Equal(t, 1, sink.hotLedgerTotals)
 	dt := sink.hotDataTypes()
 	require.Equal(t, 1, dt[dataTypeLedgers])
-	require.Zero(t, dt[dataTypeTxhash])
-	require.Zero(t, dt[dataTypeEvents])
 }
 
 // ───────────────────────── ColdService tests ─────────────────────────
 
-// TestColdService_Success drives ledger+txhash+events cold ingesters through a
-// ColdService and asserts readback plus the metrics signals.
+// TestColdService_Success drives the ledger cold ingester through a ColdService
+// and asserts readback plus the metrics signals.
 func TestColdService_Success(t *testing.T) {
 	chunkID := chunk.ID(0)
 	first := chunkID.FirstLedger()
 	coldDir := t.TempDir()
 	sink := &testSink{}
 
-	ings, err := buildColdIngesters(coldDir, chunkID, sink, Config{Ledgers: true, Txhash: true, Events: true})
+	ings, err := buildColdIngesters(coldDir, chunkID, sink, Config{Ledgers: true})
 	require.NoError(t, err)
 	service := NewColdService(ings, sink)
 	defer func() { require.NoError(t, service.Close()) }()
 
-	var term events.TermKey
 	for _, seq := range []uint32{first, first + 1} {
-		raw, _, tk := marshalLCMWithEvent(t, seq)
-		term = tk
-		require.NoError(t, service.Ingest(context.Background(), seq, xdr.LedgerCloseMetaView(raw)))
+		require.NoError(t, service.Ingest(context.Background(), seq, viewOf(t, seq)))
 	}
 	require.NoError(t, service.Finalize(context.Background()))
 
-	// Ledger cold readback: tx hashes use random keypairs, so bytes can't be
-	// regenerated for comparison — assert the boundary ledger reads back and
-	// decodes to the right sequence.
+	// Ledger cold readback: the boundary ledger reads back and decodes to the
+	// right sequence.
 	lcr, err := ledger.OpenColdReader(packPath(filepath.Join(coldDir, dataTypeLedgers), chunkID))
 	require.NoError(t, err)
 	defer func() { require.NoError(t, lcr.Close()) }()
 	gotFirst, err := lcr.GetLedgerRaw(first)
 	require.NoError(t, err)
+	require.Equal(t, marshalLCM(t, first), gotFirst)
 	var decoded xdr.LedgerCloseMeta
 	require.NoError(t, decoded.UnmarshalBinary(gotFirst))
 	require.Equal(t, first, decoded.LedgerSequence())
 
-	// Events cold readback.
-	ecr, err := eventstore.OpenColdReader(
-		chunkID, filepath.Join(coldDir, dataTypeEvents, chunkID.BucketID()), eventstore.ColdReaderOptions{})
-	require.NoError(t, err)
-	defer func() { require.NoError(t, ecr.Close()) }()
-	bm, err := ecr.Lookup(context.Background(), term)
-	require.NoError(t, err)
-	require.Equal(t, uint64(2), bm.GetCardinality())
-
-	// Txhash .bin count.
-	binEntries, err := txhash.ReadColdBin(txhashBinPath(filepath.Join(coldDir, dataTypeTxhash)))
-	require.NoError(t, err)
-	require.Len(t, binEntries, 2)
-
-	// Metrics: one ColdChunkTotal, one ColdIngest per data type, no errors.
+	// Metrics: one ColdChunkTotal, one ColdIngest for ledgers, no errors.
 	require.Equal(t, 1, sink.coldChunkTotals)
 	cdt := sink.coldDataTypes()
 	require.Equal(t, 1, cdt[dataTypeLedgers])
-	require.Equal(t, 1, cdt[dataTypeTxhash])
-	require.Equal(t, 1, cdt[dataTypeEvents])
 	require.Empty(t, sink.coldErrorTypes(), "success path records no ingester errors")
 
-	// Per-stage signals: per-ledger cold stages fired once per (non-empty)
-	// ledger, the per-chunk finalize stage once per ingester. The exact map is
-	// asserted so an unexpected stage emission (or a missing one) also fails —
-	// events now emits term_index/write for every ledger, and txhash's extract
-	// spans its whole per-ledger Ingest.
+	// Per-stage signals: per-ledger cold write fired once per ledger, the
+	// per-chunk finalize stage once. The exact map is asserted so an unexpected
+	// stage emission (or a missing one) also fails.
 	require.Equal(t, map[string]int{
 		dataTypeLedgers + "/" + tierCold + "/" + stageWrite:    2,
 		dataTypeLedgers + "/" + tierCold + "/" + stageFinalize: 1,
-		dataTypeTxhash + "/" + tierCold + "/" + stageExtract:   2,
-		dataTypeTxhash + "/" + tierCold + "/" + stageFinalize:  1,
-		dataTypeEvents + "/" + tierCold + "/" + stageExtract:   2,
-		dataTypeEvents + "/" + tierCold + "/" + stageTermIndex: 2,
-		dataTypeEvents + "/" + tierCold + "/" + stageWrite:     2,
-		dataTypeEvents + "/" + tierCold + "/" + stageFinalize:  1,
 	}, sink.stageCounts())
 
 	// No double-emit: the deferred Close (after this body) must not add a second
 	// ColdIngest or ColdChunkTotal, since Finalize already emitted.
 	require.NoError(t, service.Close())
 	require.Equal(t, 1, sink.coldChunkTotals, "Close after Finalize must not re-emit the aggregate")
-	require.Len(t, sink.coldIngests, 3, "Close after Finalize must not re-emit per-ingester signals")
+	require.Len(t, sink.coldIngests, 1, "Close after Finalize must not re-emit per-ingester signals")
 }
 
 // failingCold is a ColdIngester whose Ingest always fails, modeling a mid-chunk
@@ -869,23 +518,21 @@ func (f *failingCold) Close() error                   { f.closed = true; return
 // failing sibling: ColdService.Ingest returns the sibling's error, Finalize is
 // not called, the deferred Close drops the partial ledger pack, and no finalized
 // artifact remains. It also asserts the cold metrics still fire on this failure
-// path: each real ingester emits exactly one ColdIngest and the service emits one
+// path: the real ingester emits exactly one ColdIngest and the service emits one
 // aggregate ColdChunkTotal — driven from Close, since Finalize never ran.
 func TestColdService_FailurePath_NoArtifact(t *testing.T) {
 	chunkID := chunk.ID(0)
 	coldDir := t.TempDir()
 	sink := &testSink{}
 
-	// Two real cold ingesters (ledger + events) plus a failing sibling, so we can
-	// assert each real ingester emits its per-chunk ColdIngest from Close.
+	// A real cold ledger ingester plus a failing sibling, so we can assert the
+	// real ingester emits its per-chunk ColdIngest from Close.
 	realLedger, err := NewLedgerColdIngester(filepath.Join(coldDir, dataTypeLedgers), chunkID, sink)
 	require.NoError(t, err)
-	realEvents, err := NewEventsColdIngester(filepath.Join(coldDir, dataTypeEvents), chunkID, sink)
-	require.NoError(t, err)
 	failing := &failingCold{}
-	service := NewColdService([]ColdIngester{realLedger, realEvents, failing}, sink)
+	service := NewColdService([]ColdIngester{realLedger, failing}, sink)
 
-	// First ledger: the real ingesters succeed, failing returns an error → the
+	// First ledger: the real ingester succeeds, failing returns an error → the
 	// sequential Ingest aborts the ledger with the sibling's error.
 	err = service.Ingest(context.Background(), chunkID.FirstLedger(), viewOf(t, chunkID.FirstLedger()))
 	require.ErrorIs(t, err, errFailingCold)
@@ -900,10 +547,9 @@ func TestColdService_FailurePath_NoArtifact(t *testing.T) {
 	require.NoError(t, service.Close())
 	require.True(t, failing.closed)
 
-	// Each real ingester emitted exactly one ColdIngest; the aggregate fired once.
+	// The real ingester emitted exactly one ColdIngest; the aggregate fired once.
 	cdt := sink.coldDataTypes()
 	require.Equal(t, 1, cdt[dataTypeLedgers], "ledger cold ingester emits once on failure path")
-	require.Equal(t, 1, cdt[dataTypeEvents], "events cold ingester emits once on failure path")
 	require.Equal(t, 1, sink.coldChunkTotals, "exactly one aggregate ColdChunkTotal")
 
 	// No finalized ledger pack must exist.
@@ -951,12 +597,12 @@ func TestPrometheusSink_Smoke(t *testing.T) {
 	require.NotPanics(t, func() {
 		sink := NewPrometheusSink(reg, "test")
 		sink.HotIngest(dataTypeLedgers, time.Millisecond, 1, nil)
-		sink.HotIngest(dataTypeEvents, time.Millisecond, 3, errFailingCold)
-		sink.ColdIngest(dataTypeTxhash, time.Second, 100, nil)
+		sink.HotIngest(dataTypeLedgers, time.Millisecond, 3, errFailingCold)
+		sink.ColdIngest(dataTypeLedgers, time.Second, 100, nil)
 		sink.HotLedgerTotal(time.Millisecond)
 		sink.ColdChunkTotal(time.Second)
-		sink.IngestStage(dataTypeEvents, tierHot, stageExtract, time.Millisecond, 3)
-		sink.IngestStage(dataTypeEvents, tierCold, stageFinalize, time.Second, 0)
+		sink.IngestStage(dataTypeLedgers, tierHot, stageWrite, time.Millisecond, 1)
+		sink.IngestStage(dataTypeLedgers, tierCold, stageFinalize, time.Second, 0)
 	})
 
 	mfs, err := reg.Gather()
@@ -966,34 +612,27 @@ func TestPrometheusSink_Smoke(t *testing.T) {
 
 // ───────────────────────── hot driver tests ─────────────────────────
 
-// TestRunHot_AllTypes_Readback runs the RunHot driver with injected hot stores
-// over event/tx-bearing ledgers and asserts each hot store reads back. The short
-// stream ends early so RunHot returns the completeness error after both ledgers
-// are fully ingested.
-func TestRunHot_AllTypes_Readback(t *testing.T) {
+// TestRunHot_Ledgers_Readback runs the RunHot driver with the injected SHARED
+// hot DB (decision (a)) and asserts the ledger CF reads back. The short stream
+// ends early so RunHot returns the completeness error after both ledgers are
+// fully ingested.
+func TestRunHot_Ledgers_Readback(t *testing.T) {
 	chunkID := chunk.ID(0)
 	first := chunkID.FirstLedger()
 	logger := testLogger()
-	dir := t.TempDir()
 
-	ls, err := ledger.OpenHotStore(filepath.Join(dir, "ledgers"), chunkID, logger)
-	require.NoError(t, err)
-	defer func() { require.NoError(t, ls.Close()) }()
-	ts, err := txhash.NewHotStore(filepath.Join(dir, "txhash"), chunkID, logger)
-	require.NoError(t, err)
-	defer func() { require.NoError(t, ts.Close()) }()
-	es, err := eventstore.OpenHotStore(filepath.Join(dir, "events"), chunkID, logger)
+	db, err := hotchunk.Open(t.TempDir(), chunkID, logger)
 	require.NoError(t, err)
-	defer func() { require.NoError(t, es.Close()) }()
+	defer func() { require.NoError(t, db.Close()) }()
 
-	evSeqA, evSeqB := first, first+1
-	rawA, hashA, termA := marshalLCMWithEvent(t, evSeqA)
-	rawB, hashB, _ := marshalLCMWithEvent(t, evSeqB)
+	seqA, seqB := first, first+1
+	rawA := marshalLCM(t, seqA)
+	rawB := marshalLCM(t, seqB)
 	gen := func(tt *testing.T, seq uint32) []byte {
 		switch seq {
-		case evSeqA:
+		case seqA:
 			return rawA
-		case evSeqB:
+		case seqB:
 			return rawB
 		default:
 			return marshalLCM(tt, seq)
@@ -1001,39 +640,30 @@ func TestRunHot_AllTypes_Readback(t *testing.T) {
 	}
 	stream := &fakeStream{t: t, count: 2, gen: gen}
 
-	stores := HotStores{Ledgers: ls, Txhash: ts, Events: es}
-	cfg := Config{Ledgers: true, Txhash: true, Events: true}
+	stores := HotStores{HotDB: db}
+	cfg := Config{Ledgers: true}
 
 	err = RunHot(context.Background(), logger, sourceOf(stream), chunkID, stores, nil, cfg)
 	require.Error(t, err)
 	require.Contains(t, err.Error(), "ended at")
 
-	gotRawA, err := ls.GetLedgerRaw(evSeqA)
+	gotRawA, err := db.Ledgers().GetLedgerRaw(seqA)
 	require.NoError(t, err)
 	require.Equal(t, rawA, gotRawA)
-
-	gotA, err := ts.Get(hashA)
-	require.NoError(t, err)
-	require.Equal(t, evSeqA, gotA)
-	gotB, err := ts.Get(hashB)
-	require.NoError(t, err)
-	require.Equal(t, evSeqB, gotB)
-
-	bm, err := es.Lookup(context.Background(), termA)
+	gotRawB, err := db.Ledgers().GetLedgerRaw(seqB)
 	require.NoError(t, err)
-	require.NotNil(t, bm)
-	require.Equal(t, uint64(2), bm.GetCardinality(), "both sentinel events share the term")
+	require.Equal(t, rawB, gotRawB)
 }
 
 // TestRunHot_MissingStore asserts RunHot rejects an enabled type with a nil
-// injected store.
+// injected shared hot DB.
 func TestRunHot_MissingStore(t *testing.T) {
 	chunkID := chunk.ID(0)
 	logger := testLogger()
 	err := RunHot(context.Background(), logger, sourceOf(&fakeStream{t: t, count: 1}), chunkID,
 		HotStores{}, nil, Config{Ledgers: true})
 	require.Error(t, err)
-	require.Contains(t, err.Error(), "HotStores.Ledgers is nil")
+	require.Contains(t, err.Error(), "HotStores.HotDB is nil")
 }
 
 // TestPackSource_RoundTrip exercises the production PackSource + packStream path
@@ -1197,69 +827,6 @@ func TestRunCold_CustomSource_Extensibility(t *testing.T) {
 	require.Equal(t, marshalLCM(t, first), raw)
 }
 
-// TestRunCold_TxhashCold_Bin runs the cold txhash driver over a chunk whose
-// sentinel ledgers carry one tx each and asserts the .bin entry count.
-func TestRunCold_TxhashCold_Bin(t *testing.T) {
-	chunkID := chunk.ID(0)
-	first := chunkID.FirstLedger()
-	coldDir := t.TempDir()
-	logger := testLogger()
-
-	txSeqs := map[uint32]bool{first: true, first + 1: true}
-	gen := func(tt *testing.T, seq uint32) []byte {
-		if txSeqs[seq] {
-			raw, _, _ := marshalLCMWithEvent(tt, seq)
-			return raw
-		}
-		return marshalLCM(tt, seq)
-	}
-
-	require.NoError(t, RunCold(
-		context.Background(), logger, customSource{t: t, gen: gen}, coldDir, chunkID, 1, 1, nil, Config{Txhash: true},
-	))
-
-	entries, err := txhash.ReadColdBin(txhashBinPath(filepath.Join(coldDir, dataTypeTxhash)))
-	require.NoError(t, err)
-	require.Len(t, entries, len(txSeqs))
-}
-
-// TestRunCold_EventsCold_Readback runs the cold events driver over a chunk whose
-// sentinel ledgers carry one event each and resolves the term post-Finalize.
-func TestRunCold_EventsCold_Readback(t *testing.T) {
-	chunkID := chunk.ID(0)
-	first := chunkID.FirstLedger()
-	coldDir := t.TempDir()
-	logger := testLogger()
-
-	evSeqs := map[uint32]bool{first: true, first + 1: true}
-	var term events.TermKey
-	gen := func(tt *testing.T, seq uint32) []byte {
-		if evSeqs[seq] {
-			raw, _, tk := marshalLCMWithEvent(tt, seq)
-			term = tk
-			return raw
-		}
-		return marshalLCM(tt, seq)
-	}
-
-	require.NoError(t, RunCold(
-		context.Background(), logger, customSource{t: t, gen: gen}, coldDir, chunkID, 1, 1, nil, Config{Events: true},
-	))
-
-	bucketDir := filepath.Join(coldDir, "events", chunkID.BucketID())
-	cr, err := eventstore.OpenColdReader(chunkID, bucketDir, eventstore.ColdReaderOptions{})
-	require.NoError(t, err)
-	defer func() { require.NoError(t, cr.Close()) }()
-
-	cnt, err := cr.EventCount()
-	require.NoError(t, err)
-	require.Equal(t, uint32(len(evSeqs)), cnt)
-	bm, err := cr.Lookup(context.Background(), term)
-	require.NoError(t, err)
-	require.NotNil(t, bm)
-	require.Equal(t, uint64(len(evSeqs)), bm.GetCardinality())
-}
-
 // ───────────────────────── drain seq guard (P0-1) ─────────────────────────
 
 // TestRunCold_OutOfOrderSeq_NoArtifact feeds a stream that yields a ledger out
@@ -1296,37 +863,6 @@ func TestRunCold_OutOfOrderSeq_NoArtifact(t *testing.T) {
 	require.True(t, os.IsNotExist(statErr), "expected no cold artifact at %s, stat err: %v", path, statErr)
 }
 
-// TestDrain_TxhashSeqGuard asserts the guard also fires on the txhash path,
-// where a wrong-but-right-count sequence would otherwise be silently absorbed
-// (each ledger keys on its own LCM seq).
-func TestDrain_TxhashSeqGuard(t *testing.T) {
-	chunkID := chunk.ID(0)
-	first := chunkID.FirstLedger()
-	last := chunkID.LastLedger()
-	coldDir := t.TempDir()
-	logger := testLogger()
-
-	seqs := make([]uint32, 0, last-first+1)
-	for s := first; s <= last; s++ {
-		seqs = append(seqs, s)
-	}
-	require.GreaterOrEqual(t, len(seqs), 2)
-	// Corrupt the SECOND ledger so at least one valid ledger is ingested
-	// before the guard fires.
-	seqs[1] += 100
-
-	err := RunCold(
-		context.Background(), logger, sourceOf(&seqStream{t: t, seqs: seqs}), coldDir, chunkID, 1, 1, nil,
-		Config{Txhash: true},
-	)
-	require.Error(t, err)
-	require.Contains(t, err.Error(), "yielded ledger")
-
-	binPath := txhashBinPath(filepath.Join(coldDir, dataTypeTxhash))
-	_, statErr := os.Stat(binPath)
-	require.True(t, os.IsNotExist(statErr), "expected no .bin at %s, stat err: %v", binPath, statErr)
-}
-
 // TestRunCold_DrainStreamError_NoArtifact exercises the drain mid-stream error
 // path: the backend yields valid ledgers, then hands back (nil, err) at a seq in
 // the middle of the chunk. drain must wrap the error with RawLedgers + the seq,
@@ -1356,154 +892,25 @@ func TestRunCold_DrainStreamError_NoArtifact(t *testing.T) {
 	require.True(t, os.IsNotExist(statErr), "expected no cold artifact at %s, stat err: %v", path, statErr)
 }
 
-// The txhash .bin codec itself — atomic publish, create/rename failure
-// cleanup, layout, and the reader round-trip — is owned and tested by
-// pkg/stores/txhash (cold_bin_test.go); these tests only cover the
-// ingester-level behavior on top of it.
-
 // ───────────────────────── HotService failure path (P1-c) ─────────────────────────
 
-// failingHot is a HotIngester whose Ingest always fails. ctxObserved records
-// whether the ingester's context was already canceled when it ran (used to
-// show errgroup sibling cancellation in the multi-ingester path).
-type failingHot struct {
-	mu          sync.Mutex
-	ran         int
-	ctxObserved error
-}
-
-var errFailingHot = errors.New("failingHot: induced ingest failure")
-
-func (f *failingHot) Ingest(ctx context.Context, _ uint32, _ xdr.LedgerCloseMetaView) error {
-	f.mu.Lock()
-	f.ran++
-	f.ctxObserved = ctx.Err()
-	f.mu.Unlock()
-	return errFailingHot
-}
-
-// blockingHot blocks until its context is canceled, then reports the cancel
-// error. Pairs with failingHot in the multi-ingester test to prove the first
-// error cancels the siblings via the errgroup context.
-type blockingHot struct {
-	canceled chan struct{}
-	once     sync.Once
-}
-
-func (b *blockingHot) Ingest(ctx context.Context, _ uint32, _ xdr.LedgerCloseMetaView) error {
-	<-ctx.Done()
-	b.once.Do(func() { close(b.canceled) })
-	return ctx.Err()
-}
-
-// TestHotService_SingleIngesterFailure asserts the len==1 fast path returns the
-// ingester error and still emits exactly one HotLedgerTotal.
-func TestHotService_SingleIngesterFailure(t *testing.T) {
-	sink := &testSink{}
-	fail := &failingHot{}
-	service := NewHotService([]HotIngester{fail}, sink)
-
-	err := service.Ingest(context.Background(), chunk.ID(0).FirstLedger(), viewOf(t, chunk.ID(0).FirstLedger()))
-	require.ErrorIs(t, err, errFailingHot)
-	require.Equal(t, 1, sink.hotLedgerTotals, "HotLedgerTotal fires exactly once even on failure")
-}
-
-// TestHotService_MultiIngesterFailureCancelsSiblings asserts the errgroup path
-// propagates the failing ingester's error, cancels the sibling via the group
-// context, and still emits exactly one HotLedgerTotal.
-func TestHotService_MultiIngesterFailureCancelsSiblings(t *testing.T) {
-	sink := &testSink{}
-	fail := &failingHot{}
-	block := &blockingHot{canceled: make(chan struct{})}
-	service := NewHotService([]HotIngester{fail, block}, sink)
-
-	err := service.Ingest(context.Background(), chunk.ID(0).FirstLedger(), viewOf(t, chunk.ID(0).FirstLedger()))
-	require.ErrorIs(t, err, errFailingHot)
-
-	// The blocking sibling only returns once its context is canceled, so a
-	// non-blocking Ingest return already proves cancellation propagated.
-	select {
-	case <-block.canceled:
-	case <-time.After(2 * time.Second):
-		t.Fatal("sibling ingester was not canceled by the failing ingester")
-	}
-	require.Equal(t, 1, sink.hotLedgerTotals, "HotLedgerTotal fires exactly once even on failure")
-}
-
-// TestHotIngester_Failure_RecordsErrorMetric drives a REAL hot ingester
-// (eventsHot, built via NewEventsHotIngester) with a malformed view so its own
-// Ingest fails through the production hotMetrics emit path — unlike the
-// failingHot/blockingHot stubs, which bypass hotMetrics entirely. Per #765 a
-// failed hot Ingest must record exactly one HotIngest carrying a non-nil error
-// for that data type. Mirrors the cold-side TestColdIngester_Failure_RecordsErrorMetric.
-func TestHotIngester_Failure_RecordsErrorMetric(t *testing.T) {
-	chunkID := chunk.ID(0)
+// TestHotService_IngestFailureStillEmitsTotal asserts a failed shared-DB ingest
+// (here: a closed DB) returns the error and still emits exactly one
+// HotLedgerTotal. Under decision (a) there is no fan-out to cancel — one atomic
+// batch either commits or returns its error — so a single failure path replaces
+// the old errgroup sibling-cancellation behavior.
+func TestHotService_IngestFailureStillEmitsTotal(t *testing.T) {
 	logger := testLogger()
-	dir := t.TempDir()
-	sink := &testSink{}
-
-	store, err := eventstore.OpenHotStore(dir, chunkID, logger)
-	require.NoError(t, err)
-	defer func() { require.NoError(t, store.Close()) }()
-
-	ing := NewEventsHotIngester(store, sink)
-
-	// A truncated/garbage view makes the event extraction fail inside the real
-	// Ingest, so the deferred hotMetrics.emit reports the wrapped error.
-	bad := xdr.LedgerCloseMetaView([]byte{0x00, 0x01, 0x02})
-	require.Error(t, ing.Ingest(context.Background(), chunkID.FirstLedger(), bad))
-
-	sink.mu.Lock()
-	defer sink.mu.Unlock()
-	require.Len(t, sink.hotIngests, 1, "exactly one HotIngest recorded")
-	require.Equal(t, dataTypeEvents, sink.hotIngests[0].dataType)
-	require.Error(t, sink.hotIngests[0].err, "the recorded HotIngest carries the ingest error")
-}
-
-// ───────────────────────── cold txhash .bin content (P1-d) ─────────────────────────
-
-// TestTxhashColdIngester_BinContent ingests two tx-bearing ledgers, finalizes,
-// then reads the .bin back through the store codec and asserts the contract
-// the deferred streamhash builder relies on: each key == the fixture tx hash
-// truncated to txhash.ColdKeySize (pinned to streamhash.MinKeySize by the
-// codec), each seq == the ledger it was ingested in, and entries are in
-// non-decreasing key order.
-func TestTxhashColdIngester_BinContent(t *testing.T) {
-	chunkID := chunk.ID(0)
-	first := chunkID.FirstLedger()
-	coldDir := t.TempDir()
-
-	ing, err := NewTxhashColdIngester(coldDir, chunkID, nil)
-	require.NoError(t, err)
-	defer func() { require.NoError(t, ing.Close()) }()
-
-	// Capture each fixture hash + the seq it was ingested in.
-	wantSeqByKey := map[[txhash.ColdKeySize]byte]uint32{}
-	for _, seq := range []uint32{first, first + 1} {
-		raw, hash, _ := marshalLCMWithEvent(t, seq)
-		var key [txhash.ColdKeySize]byte
-		copy(key[:], hash[:txhash.ColdKeySize])
-		wantSeqByKey[key] = seq
-		require.NoError(t, ing.Ingest(context.Background(), seq, xdr.LedgerCloseMetaView(raw)))
-	}
-	require.NoError(t, ing.Finalize(context.Background()))
-
-	entries, err := txhash.ReadColdBin(txhashBinPath(coldDir))
+	db, err := hotchunk.Open(t.TempDir(), chunk.ID(0), logger)
 	require.NoError(t, err)
-	require.Len(t, entries, 2)
+	require.NoError(t, db.Close()) // closed DB makes IngestLedger fail
 
-	var prevKey [txhash.ColdKeySize]byte
-	for i, e := range entries {
-		wantSeq, known := wantSeqByKey[e.Key]
-		require.True(t, known, "entry %d key %x is not one of the ingested fixture hashes", i, e.Key)
-		require.Equal(t, wantSeq, e.Seq, "entry %d seq must equal the ledger it was ingested in", i)
+	sink := &testSink{}
+	service := NewHotService(db, hotchunk.Ingest{Ledgers: true}, sink)
 
-		if i > 0 {
-			require.LessOrEqual(t, bytes.Compare(prevKey[:], e.Key[:]), 0,
-				"entries must be in non-decreasing key order")
-		}
-		prevKey = e.Key
-	}
+	err = service.Ingest(context.Background(), chunk.ID(0).FirstLedger(), viewOf(t, chunk.ID(0).FirstLedger()))
+	require.Error(t, err)
+	require.Equal(t, 1, sink.hotLedgerTotals, "HotLedgerTotal fires exactly once even on failure")
 }
 
 // ───────────────────────── OpenStream failure through the driver (P1-e) ─────────────────────────
@@ -1564,57 +971,37 @@ func TestRunCold_CanceledContext(t *testing.T) {
 func TestRunHot_OpenStreamError(t *testing.T) {
 	chunkID := chunk.ID(0)
 	logger := testLogger()
-	dir := t.TempDir()
 
-	ls, err := ledger.OpenHotStore(dir, chunkID, logger)
+	db, err := hotchunk.Open(t.TempDir(), chunkID, logger)
 	require.NoError(t, err)
-	defer func() { require.NoError(t, ls.Close()) }()
+	defer func() { require.NoError(t, db.Close()) }()
 
 	err = RunHot(context.Background(), logger, erroringSource{}, chunkID,
-		HotStores{Ledgers: ls}, nil, Config{Ledgers: true})
+		HotStores{HotDB: db}, nil, Config{Ledgers: true})
 	require.ErrorIs(t, err, errOpenStream)
 	require.Contains(t, err.Error(), "open stream for chunk 0")
 }
 
 // ───────────────────────── RunHot chunkID cross-check (P2-e) ─────────────────────────
 
-// TestRunHot_ChunkIDMismatch asserts RunHot rejects ANY injected hot store
-// bound to a different chunk than the one being ingested, with a clear
-// up-front error (rather than silently interleaving chunks on the ledger and
-// txhash paths, or a later per-ledger out-of-range on the events path). All
-// three hot stores are chunk-bound.
+// TestRunHot_ChunkIDMismatch asserts RunHot rejects an injected shared hot DB
+// bound to a different chunk than the one being ingested, with a clear up-front
+// error (rather than silently interleaving two chunks' data into one DB). The
+// shared DB is chunk-bound (decision (a)).
 func TestRunHot_ChunkIDMismatch(t *testing.T) {
 	ingestChunk := chunk.ID(1)
 	storeChunk := chunk.ID(0)
 	logger := testLogger()
 
-	run := func(t *testing.T, stores HotStores, cfg Config) {
-		t.Helper()
-		err := RunHot(context.Background(), logger, sourceOf(&fakeStream{t: t, count: 1}), ingestChunk,
-			stores, nil, cfg)
-		require.Error(t, err)
-		require.Contains(t, err.Error(), "bound to chunk 0")
-		require.Contains(t, err.Error(), "RunHot chunk 1")
-	}
+	db, err := hotchunk.Open(t.TempDir(), storeChunk, logger)
+	require.NoError(t, err)
+	defer func() { require.NoError(t, db.Close()) }()
 
-	t.Run("ledgers", func(t *testing.T) {
-		ls, err := ledger.OpenHotStore(t.TempDir(), storeChunk, logger)
-		require.NoError(t, err)
-		defer func() { require.NoError(t, ls.Close()) }()
-		run(t, HotStores{Ledgers: ls}, Config{Ledgers: true})
-	})
-	t.Run("txhash", func(t *testing.T) {
-		ts, err := txhash.NewHotStore(t.TempDir(), storeChunk, logger)
-		require.NoError(t, err)
-		defer func() { require.NoError(t, ts.Close()) }()
-		run(t, HotStores{Txhash: ts}, Config{Txhash: true})
-	})
-	t.Run("events", func(t *testing.T) {
-		es, err := eventstore.OpenHotStore(t.TempDir(), storeChunk, logger)
-		require.NoError(t, err)
-		defer func() { require.NoError(t, es.Close()) }()
-		run(t, HotStores{Events: es}, Config{Events: true})
-	})
+	err = RunHot(context.Background(), logger, sourceOf(&fakeStream{t: t, count: 1}), ingestChunk,
+		HotStores{HotDB: db}, nil, Config{Ledgers: true})
+	require.Error(t, err)
+	require.Contains(t, err.Error(), "bound to chunk 0")
+	require.Contains(t, err.Error(), "RunHot chunk 1")
 }
 
 // ───────────────────────── Config validate / guard negatives (P2-g) ─────────────────────────
@@ -1738,79 +1125,19 @@ func countCleanColdIngests(s *testSink) int {
 	return n
 }
 
-// TestBuildColdIngesters_RollbackNoPhantomMetric makes a LATER constructor
-// (txhash) fail by planting a regular file at the txhash per-type directory,
-// so the constructor's own MkdirAll fails. The earlier-built ledger ingester
-// is rolled back via closeColdAll, which must NOT emit a phantom success
-// ColdIngest — the recorded ledger metric (if any) must carry the abort
-// error, never a clean (nil-err, 0-items) success.
-func TestBuildColdIngesters_RollbackNoPhantomMetric(t *testing.T) {
-	chunkID := chunk.ID(0)
-	coldDir := t.TempDir()
-	sink := &testSink{}
-
-	// Plant a regular FILE where the txhash per-type directory must be
-	// created: the ledger ingester builds first, then NewTxhashColdIngester
-	// fails its bucket-dir MkdirAll.
-	require.NoError(t, os.WriteFile(filepath.Join(coldDir, dataTypeTxhash), []byte("not a dir"), 0o644))
-
-	_, err := buildColdIngesters(coldDir, chunkID, sink, Config{Ledgers: true, Txhash: true})
-	require.Error(t, err, "txhash constructor must fail on the planted file")
-
-	// The ledger ingester was built then rolled back. No phantom SUCCESS metric:
-	// any recorded ledger ColdIngest must carry an error.
-	cdt := sink.coldDataTypes()
-	if cdt[dataTypeLedgers] > 0 {
-		require.Equal(t, cdt[dataTypeLedgers], sink.coldErrorTypes()[dataTypeLedgers],
-			"rolled-back ledger ingester must not emit a phantom success ColdIngest")
-	}
-	// And the success-only assertion: there must be zero clean (nil-err) cold
-	// ingest signals recorded.
-	require.Zero(t, countCleanColdIngests(sink), "no clean ColdIngest on the rollback path")
-}
-
-// TestBuildColdIngesters_RollbackLaterFailure_TxhashAborts makes the LAST
-// constructor (events) fail AFTER both the ledger AND txhash ingesters were
-// already built, so closeColdAll rolls back two ingesters. It asserts the txhash
-// ingester (which DOES implement abortMetric) emits an error-carrying — not a
-// clean-success — ColdIngest, complementing the ledger-only abort coverage above.
-func TestBuildColdIngesters_RollbackLaterFailure_TxhashAborts(t *testing.T) {
-	chunkID := chunk.ID(0)
-	coldDir := t.TempDir()
-	sink := &testSink{}
-
-	// Plant a directory at the events.pack path: the ledger and txhash
-	// ingesters build first, then NewEventsColdIngester fails opening the
-	// pack over the directory.
-	packPath := filepath.Join(coldDir, dataTypeEvents, chunkID.BucketID(), eventstore.EventsPackName(chunkID))
-	require.NoError(t, os.MkdirAll(packPath, 0o755))
-
-	_, err := buildColdIngesters(coldDir, chunkID, sink,
-		Config{Ledgers: true, Txhash: true, Events: true})
-	require.Error(t, err, "events constructor must fail on the planted directory")
-
-	// The txhash ingester was built then rolled back: its recorded ColdIngest must
-	// carry the abort error, never a clean success.
-	cdt := sink.coldDataTypes()
-	require.Equal(t, 1, cdt[dataTypeTxhash], "rolled-back txhash ingester emits one ColdIngest")
-	require.Equal(t, 1, sink.coldErrorTypes()[dataTypeTxhash],
-		"the rolled-back txhash ColdIngest must carry the abort error")
-
-	// No phantom clean success on the rollback path for any ingester.
-	require.Zero(t, countCleanColdIngests(sink), "no clean ColdIngest on the rollback path")
-}
-
 // TestRunCold_ConstructorFailure_EmitsAggregate drives a constructor failure
 // through RunCold (not buildColdIngesters directly) and asserts the chunk
 // attempt still produces its single aggregate ColdChunkTotal — the invariant
-// is one aggregate per chunk attempt, including pre-service failures.
+// is one aggregate per chunk attempt, including pre-service failures. The
+// failed ledger constructor must not leave a phantom clean-success ColdIngest.
 func TestRunCold_ConstructorFailure_EmitsAggregate(t *testing.T) {
 	chunkID := chunk.ID(0)
 	coldDir := t.TempDir()
 	logger := testLogger()
 	sink := &testSink{}
 
-	// Plant a regular file where the ledgers per-type subdir must be created.
+	// Plant a regular file where the ledgers per-type subdir must be created so
+	// the ledger cold constructor's MkdirAll fails.
 	require.NoError(t, os.WriteFile(filepath.Join(coldDir, dataTypeLedgers), []byte("not a dir"), 0o644))
 
 	err := RunCold(
@@ -1823,68 +1150,6 @@ func TestRunCold_ConstructorFailure_EmitsAggregate(t *testing.T) {
 	require.Zero(t, countCleanColdIngests(sink), "no clean ColdIngest on the rollback path")
 }
 
-// ───────────────────────── events Finish-then-WriteColdIndex failure ─────────────────────────
-
-// TestEventsCold_FinishThenIndexFails_LeavesInertPack forces WriteColdIndex to
-// fail AFTER writer.Finish has committed events.pack, by planting a directory
-// where the index.hash file must be written (buildMPHF then hits EISDIR).
-// Finalize must surface the error; the index-less events.pack stays on disk —
-// without the orchestrator's completion record it is inert scratch (see the
-// package doc's artifact model), and a retry's overwrite is the cleanup.
-func TestEventsCold_FinishThenIndexFails_LeavesInertPack(t *testing.T) {
-	chunkID := chunk.ID(0)
-	first := chunkID.FirstLedger()
-	coldDir := t.TempDir()
-
-	ing, err := NewEventsColdIngester(coldDir, chunkID, nil)
-	require.NoError(t, err)
-
-	// Ingest one event-bearing ledger so the mirror is non-empty (an empty
-	// build set would take the valid empty-index path instead of buildMPHF).
-	rawEv, _, _ := marshalLCMWithEvent(t, first)
-	require.NoError(t, ing.Ingest(context.Background(), first, xdr.LedgerCloseMetaView(rawEv)))
-
-	// Plant a DIRECTORY where index.hash must be written → buildMPHF fails.
-	bucketDir := filepath.Join(coldDir, chunkID.BucketID())
-	indexHashPath := filepath.Join(bucketDir, eventstore.IndexHashName(chunkID))
-	require.NoError(t, os.Mkdir(indexHashPath, 0o755))
-
-	ferr := ing.Finalize(context.Background())
-	require.Error(t, ferr, "Finalize must fail when WriteColdIndex fails")
-	require.Contains(t, ferr.Error(), "WriteColdIndex")
-
-	// The committed events.pack stays in place as inert scratch (Finish ran,
-	// so the later Close does not drop it either).
-	packPath := filepath.Join(bucketDir, eventstore.EventsPackName(chunkID))
-	_, statErr := os.Stat(packPath)
-	require.NoError(t, statErr, "the index-less events.pack stays on disk after WriteColdIndex failure")
-
-	// Close is still safe/idempotent afterwards and does not remove the pack.
-	require.NoError(t, ing.Close())
-	_, statErr = os.Stat(packPath)
-	require.NoError(t, statErr, "Close after a committed Finish must not drop the pack")
-}
-
-// TestEventsCold_FinalizeAfterFailedIngest_Refuses asserts the failed-Ingest
-// latch: once an Ingest errors (here via a malformed view), Finalize must
-// refuse rather than commit a pack+index whose mirror may be ahead of the
-// offsets commit point.
-func TestEventsCold_FinalizeAfterFailedIngest_Refuses(t *testing.T) {
-	chunkID := chunk.ID(0)
-	coldDir := t.TempDir()
-
-	ing, err := NewEventsColdIngester(coldDir, chunkID, nil)
-	require.NoError(t, err)
-	defer func() { require.NoError(t, ing.Close()) }()
-
-	bad := xdr.LedgerCloseMetaView([]byte{0x00, 0x01, 0x02})
-	require.Error(t, ing.Ingest(context.Background(), chunkID.FirstLedger(), bad))
-
-	ferr := ing.Finalize(context.Background())
-	require.Error(t, ferr)
-	require.Contains(t, ferr.Error(), "Finalize after failed Ingest")
-}
-
 // ───────────────────────── ColdService.Finalize first-error ─────────────────────────
 
 // finalizeErrCold is a ColdIngester whose Finalize errors; it records whether
diff --git a/cmd/stellar-rpc/internal/fullhistory/ingest/service.go b/cmd/stellar-rpc/internal/fullhistory/ingest/service.go
index 561ac3e0e..a6fb16722 100644
--- a/cmd/stellar-rpc/internal/fullhistory/ingest/service.go
+++ b/cmd/stellar-rpc/internal/fullhistory/ingest/service.go
@@ -6,9 +6,9 @@ import (
 	"fmt"
 	"time"
 
-	"golang.org/x/sync/errgroup"
-
 	"github.com/stellar/go-stellar-sdk/xdr"
+
+	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/pkg/stores/hotchunk"
 )
 
 // errOrFirst returns prev if it is non-nil, else cur. Used to retain the FIRST
@@ -21,49 +21,58 @@ func errOrFirst(prev, cur error) error {
 	return cur
 }
 
-// HotService fans one ledger out to a set of HotIngesters concurrently, waiting
-// for all to finish before returning (so the borrowed view is safe to release),
-// and emits the aggregate per-ledger wall-clock via the sink.
+// HotService commits one ledger to the per-chunk hot DB as ONE atomic, synced
+// WriteBatch (decision (a)) and emits the per-ledger wall-clock plus per-type
+// volume signals via the sink.
+//
+// A ledger is fully present or fully absent because it commits in a single
+// WriteBatch (hotchunk.DB.IngestLedger).
 type HotService struct {
-	ingesters []HotIngester
-	sink      MetricSink
+	db   *hotchunk.DB
+	cfg  hotchunk.Ingest
+	sink MetricSink
 }
 
-// NewHotService builds a HotService over the enabled hot ingesters. A nil sink
-// defaults to NopSink.
-func NewHotService(ingesters []HotIngester, sink MetricSink) *HotService {
-	return &HotService{ingesters: ingesters, sink: orNop(sink)}
+// NewHotService builds a HotService that writes the data types enabled in cfg
+// into the shared per-chunk DB. A nil sink defaults to NopSink.
+func NewHotService(db *hotchunk.DB, cfg hotchunk.Ingest, sink MetricSink) *HotService {
+	return &HotService{db: db, cfg: cfg, sink: orNop(sink)}
 }
 
-// Ingest runs every hot ingester on lcm concurrently and waits for all of them.
-// seq is the driver-validated sequence of lcm, passed through unchanged. The
-// first ingester error is returned; the production HotIngester.Ingest
-// implementations do not check ctx.Err(), so the siblings run to completion
-// regardless (g.Wait still returns the first error). The single-ingester config
-// skips the errgroup entirely. HotLedgerTotal is emitted with the fan-out
-// wall-clock regardless of success.
-func (s *HotService) Ingest(ctx context.Context, seq uint32, lcm xdr.LedgerCloseMetaView) error {
+// Ingest commits lcm to the shared hot DB in one atomic synced WriteBatch
+// (decision (a)). seq is the driver-validated sequence of lcm, passed through
+// unchanged. HotLedgerTotal is emitted with the per-ledger wall-clock
+// regardless of success; on success, one HotIngest signal per enabled data type
+// reports that type's item count. A nil DB (no hot tier enabled for this
+// deployment) is a no-op other than the aggregate timing.
+func (s *HotService) Ingest(_ context.Context, seq uint32, lcm xdr.LedgerCloseMetaView) error {
 	start := time.Now()
-	switch len(s.ingesters) {
-	case 0:
-		// No hot ingesters enabled for this tier: nothing to do.
+	if s.db == nil {
 		s.sink.HotLedgerTotal(time.Since(start))
 		return nil
-	case 1:
-		// Single ingester: call directly, skipping the errgroup overhead.
-		err := s.ingesters[0].Ingest(ctx, seq, lcm)
-		s.sink.HotLedgerTotal(time.Since(start))
-		return err
-	default:
-		// Two or more: concurrent fan-out, waiting for all.
-		g, gctx := errgroup.WithContext(ctx)
-		for _, ing := range s.ingesters {
-			g.Go(func() error { return ing.Ingest(gctx, seq, lcm) })
-		}
-		err := g.Wait()
-		s.sink.HotLedgerTotal(time.Since(start))
-		return err
 	}
+	counts, err := s.db.IngestLedger(seq, lcm, s.cfg)
+	s.emit(counts, time.Since(start), err)
+	s.sink.HotLedgerTotal(time.Since(start))
+	return err
+}
+
+// emit reports one HotIngest signal per enabled data type. On error the counts
+// are reported as 0 items with the error attached (matching the per-type "items
+// written" contract: a failed commit wrote nothing durably).
+func (s *HotService) emit(counts hotchunk.LedgerCounts, d time.Duration, err error) {
+	if s.cfg.Ledgers {
+		s.sink.HotIngest(dataTypeLedgers, d, itemsOnSuccess(counts.Ledgers, err), err)
+	}
+}
+
+// itemsOnSuccess returns n on success and 0 on error — a failed atomic batch
+// commits nothing, so no items were written.
+func itemsOnSuccess(n int, err error) int {
+	if err != nil {
+		return 0
+	}
+	return n
 }
 
 // ColdService drives a set of ColdIngesters for one chunk: sequential per-ledger
diff --git a/cmd/stellar-rpc/internal/fullhistory/pkg/stores/hotchunk/hotchunk.go b/cmd/stellar-rpc/internal/fullhistory/pkg/stores/hotchunk/hotchunk.go
new file mode 100644
index 000000000..29c0f4ab2
--- /dev/null
+++ b/cmd/stellar-rpc/internal/fullhistory/pkg/stores/hotchunk/hotchunk.go
@@ -0,0 +1,141 @@
+// Package hotchunk implements decision (a): the per-chunk hot tier is
+// ONE RocksDB instance holding the ledger column family, and each ledger
+// commits as ONE atomic, synced WriteBatch. There is a SINGLE per-chunk
+// watermark (the max committed ledger seq, authoritative from the
+// ledgers CF's last key), with no per-store frontier markers.
+//
+// The typed ledger facade (ledger.HotStore) is composed over the shared
+// store via its NewWithStore constructor and keeps its existing read API
+// for downstream (#770). Its write path is expressed as Puts queued into
+// the shared batch, which commits once.
+package hotchunk
+
+import (
+	"fmt"
+
+	supportlog "github.com/stellar/go-stellar-sdk/support/log"
+	"github.com/stellar/go-stellar-sdk/xdr"
+
+	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/pkg/chunk"
+	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/pkg/rocksdb"
+	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/pkg/stores"
+	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/pkg/stores/ledger"
+)
+
+// DB is one chunk's hot tier: a single rocksdb.Store plus the typed
+// ledger facade composed over it. It owns the store's lifecycle (Close
+// closes it exactly once); the facade wraps it without owning it.
+//
+// Concurrency: ingestion is single-writer (the daemon's per-chunk
+// ingestion loop). IngestLedger is not safe to call concurrently with
+// itself. Reads via the facade follow its own concurrency contract and
+// are safe alongside the single writer.
+type DB struct {
+	store   *rocksdb.Store
+	chunkID chunk.ID
+
+	ledger *ledger.HotStore
+}
+
+// columnFamilies returns the CF list for the per-chunk DB: the ledger CF.
+func columnFamilies() []string {
+	return []string{ledger.LedgersCF}
+}
+
+// config builds the per-chunk store's rocksdb.Config. It rides on
+// RocksDB's defaults (zero Tuning) — the same choice ledger.OpenHotStore
+// makes for the standalone ledger store: no explicit block cache, bloom
+// filter, or WAL cap. Re-tune only with a workload measurement.
+func config(path string, logger *supportlog.Entry) rocksdb.Config {
+	return rocksdb.Config{
+		Path:           path,
+		ColumnFamilies: columnFamilies(),
+		Logger:         logger,
+	}
+}
+
+// Open opens (or creates) the chunk's hot DB at path and composes the
+// ledger facade over it. path and logger are required.
+func Open(path string, chunkID chunk.ID, logger *supportlog.Entry) (*DB, error) {
+	if path == "" {
+		return nil, stores.ErrInvalidConfig
+	}
+	if logger == nil {
+		return nil, stores.ErrInvalidConfig
+	}
+	store, err := rocksdb.New(config(path, logger))
+	if err != nil {
+		return nil, fmt.Errorf("hotchunk: open chunk %s: %w", chunkID, err)
+	}
+
+	return &DB{
+		store:   store,
+		chunkID: chunkID,
+		ledger:  ledger.NewWithStore(store, chunkID),
+	}, nil
+}
+
+// ChunkID returns the chunk this DB is bound to.
+func (d *DB) ChunkID() chunk.ID { return d.chunkID }
+
+// Ledgers returns the ledger read/write facade over the shared store.
+func (d *DB) Ledgers() *ledger.HotStore { return d.ledger }
+
+// Close releases the shared store exactly once. Idempotent (delegates
+// to rocksdb.Store.Close, which is itself idempotent). Must not be
+// called concurrently with in-flight reads/writes.
+func (d *DB) Close() error { return d.store.Close() }
+
+// MaxCommittedSeq returns the single authoritative per-chunk watermark:
+// the highest ledger seq durably committed, read from the ledgers CF's
+// last key. ok=false on an empty DB (no ledger committed yet).
+func (d *DB) MaxCommittedSeq() (seq uint32, ok bool, err error) {
+	return d.ledger.LastSeq()
+}
+
+// Ingest toggles which data types contribute to the single per-ledger
+// batch. It mirrors ingest.Config but is kept local so hotchunk has no
+// dependency on the ingest package (which depends on the stores).
+type Ingest struct {
+	Ledgers bool
+}
+
+// LedgerCounts reports how many items each data type contributed to one
+// IngestLedger call: 1 ledger (when Ledgers enabled). Lets the caller
+// (HotService) emit per-type volume metrics without re-deriving them.
+type LedgerCounts struct {
+	Ledgers int
+}
+
+// IngestLedger commits ONE ledger to the hot DB as a SINGLE atomic,
+// synced WriteBatch (decision (a)). It queues the ledger row into one
+// rocksdb.BatchWriter and commits once (sync=true via the store's pinned
+// WriteOptions). The single watermark advances atomically.
+//
+// seq is the driver-validated sequence of lcm. lcm is a borrowed,
+// zero-copy view: the ledger bytes are copied into the batch
+// synchronously, so the view need not outlive this call. On any error
+// the returned counts are zero: a failed batch contributed nothing.
+func (d *DB) IngestLedger(seq uint32, lcm xdr.LedgerCloseMetaView, cfg Ingest) (LedgerCounts, error) {
+	if d.store.IsClosed() {
+		return LedgerCounts{}, stores.ErrStoreClosed
+	}
+
+	var counts LedgerCounts
+	if cfg.Ledgers {
+		counts.Ledgers = 1
+	}
+
+	cerr := d.store.Batch(func(b *rocksdb.BatchWriter) error {
+		if cfg.Ledgers {
+			if err := d.ledger.AddLedgerToBatch(b, ledger.Entry{Seq: seq, Bytes: []byte(lcm)}); err != nil {
+				return fmt.Errorf("hotchunk: queue ledger seq %d: %w", seq, err)
+			}
+		}
+		return nil
+	})
+	if cerr != nil {
+		return LedgerCounts{}, fmt.Errorf("hotchunk: commit ledger %d to chunk %s: %w", seq, d.chunkID, cerr)
+	}
+	return counts, nil
+}
diff --git a/cmd/stellar-rpc/internal/fullhistory/pkg/stores/hotchunk/hotchunk_test.go b/cmd/stellar-rpc/internal/fullhistory/pkg/stores/hotchunk/hotchunk_test.go
new file mode 100644
index 000000000..4afa38c5f
--- /dev/null
+++ b/cmd/stellar-rpc/internal/fullhistory/pkg/stores/hotchunk/hotchunk_test.go
@@ -0,0 +1,202 @@
+package hotchunk
+
+import (
+	"testing"
+
+	"github.com/sirupsen/logrus"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+
+	supportlog "github.com/stellar/go-stellar-sdk/support/log"
+	"github.com/stellar/go-stellar-sdk/xdr"
+
+	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/pkg/chunk"
+	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/pkg/rocksdb"
+	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/pkg/stores"
+	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/pkg/stores/ledger"
+)
+
+func silentLogger() *supportlog.Entry {
+	log := supportlog.New()
+	log.SetLevel(logrus.ErrorLevel)
+	return log
+}
+
+func openTestDB(t *testing.T, chunkID chunk.ID) *DB {
+	t.Helper()
+	db, err := Open(t.TempDir(), chunkID, silentLogger())
+	require.NoError(t, err)
+	t.Cleanup(func() { _ = db.Close() })
+	return db
+}
+
+func allTypes() Ingest { return Ingest{Ledgers: true} }
+
+func TestOpen_ValidatesInputs(t *testing.T) {
+	_, err := Open("", chunk.ID(0), silentLogger())
+	require.ErrorIs(t, err, stores.ErrInvalidConfig)
+
+	_, err = Open(t.TempDir(), chunk.ID(0), nil)
+	require.ErrorIs(t, err, stores.ErrInvalidConfig)
+}
+
+func TestColumnFamilies_IsLedgerCF(t *testing.T) {
+	cfs := columnFamilies()
+	require.Len(t, cfs, 1)
+	require.Equal(t, ledger.LedgersCF, cfs[0])
+}
+
+// TestIngestLedger_LedgerCommittedAndWatermarkAdvances is the core decision-(a)
+// happy path: one IngestLedger call writes the ledger into the hot DB, and the
+// single watermark reaches exactly the committed seq.
+func TestIngestLedger_LedgerCommittedAndWatermarkAdvances(t *testing.T) {
+	chunkID := chunk.ID(0)
+	first := chunkID.FirstLedger()
+	db := openTestDB(t, chunkID)
+
+	// Empty DB: no watermark.
+	_, ok, err := db.MaxCommittedSeq()
+	require.NoError(t, err)
+	require.False(t, ok)
+
+	rawA := zeroTxLCM(t, first)
+	rawB := zeroTxLCM(t, first+1)
+
+	counts, err := db.IngestLedger(first, xdr.LedgerCloseMetaView(rawA), allTypes())
+	require.NoError(t, err)
+	assert.Equal(t, LedgerCounts{Ledgers: 1}, counts)
+
+	counts, err = db.IngestLedger(first+1, xdr.LedgerCloseMetaView(rawB), allTypes())
+	require.NoError(t, err)
+	assert.Equal(t, LedgerCounts{Ledgers: 1}, counts)
+
+	// ledgers CF.
+	gotA, err := db.Ledgers().GetLedgerRaw(first)
+	require.NoError(t, err)
+	assert.Equal(t, rawA, gotA)
+
+	// The single authoritative watermark equals the last committed seq.
+	maxSeq, ok, err := db.MaxCommittedSeq()
+	require.NoError(t, err)
+	require.True(t, ok)
+	assert.Equal(t, first+1, maxSeq)
+}
+
+// TestIngestLedger_DurableAcrossReopen confirms a committed ledger survives a
+// close/reopen (sync=true durability), and that a commit into a CLOSED store
+// fails and leaves nothing behind — the single synced WriteBatch is
+// all-or-nothing.
+func TestIngestLedger_DurableAcrossReopen(t *testing.T) {
+	chunkID := chunk.ID(0)
+	first := chunkID.FirstLedger()
+	dir := t.TempDir()
+
+	db, err := Open(dir, chunkID, silentLogger())
+	require.NoError(t, err)
+
+	// Commit one good ledger so there is a known watermark, then close the DB.
+	rawGood := zeroTxLCM(t, first)
+	_, err = db.IngestLedger(first, xdr.LedgerCloseMetaView(rawGood), allTypes())
+	require.NoError(t, err)
+	require.NoError(t, db.Close())
+
+	// Reopen and confirm the watermark survived (sync=true durability).
+	db2, err := Open(dir, chunkID, silentLogger())
+	require.NoError(t, err)
+
+	maxSeq, ok, err := db2.MaxCommittedSeq()
+	require.NoError(t, err)
+	require.True(t, ok)
+	require.Equal(t, first, maxSeq, "the committed ledger is durable across reopen")
+
+	// Now close the DB and attempt to ingest the NEXT ledger into the closed
+	// store: the commit fails, and nothing for that ledger persists anywhere.
+	require.NoError(t, db2.Close())
+	rawNext := zeroTxLCM(t, first+1)
+	_, err = db2.IngestLedger(first+1, xdr.LedgerCloseMetaView(rawNext), allTypes())
+	require.Error(t, err)
+
+	// Reopen a third time: the failed ledger left NO trace, and the watermark is
+	// still the last good seq.
+	db3, err := Open(dir, chunkID, silentLogger())
+	require.NoError(t, err)
+	t.Cleanup(func() { _ = db3.Close() })
+
+	maxSeq, ok, err = db3.MaxCommittedSeq()
+	require.NoError(t, err)
+	require.True(t, ok)
+	assert.Equal(t, first, maxSeq, "the failed ledger did not advance the watermark")
+
+	// The good ledger's data is intact; the failed ledger's is wholly absent.
+	_, gerr := db3.Ledgers().GetLedgerRaw(first + 1)
+	require.ErrorIs(t, gerr, stores.ErrNotFound)
+
+	gotGood, err := db3.Ledgers().GetLedgerRaw(first)
+	require.NoError(t, err)
+	assert.Equal(t, rawGood, gotGood)
+}
+
+// TestSharedBatch_DirectRocksAbort is the lower-level atomicity proof: queue a
+// Put into the ledger CF of the store, then return an error from the batch
+// callback — RocksDB applies NONE of it. Pins the property the IngestLedger
+// path relies on (atomicity of one WriteBatch).
+func TestSharedBatch_DirectRocksAbort(t *testing.T) {
+	db := openTestDB(t, chunk.ID(0))
+
+	sentinelErr := assert.AnError
+
+	err := storeOf(db).Batch(func(b *rocksdb.BatchWriter) error {
+		b.Put(ledger.LedgersCF, rocksdb.EncodeUint32(2), []byte("ledger-row"))
+		return sentinelErr // abort: nothing should commit
+	})
+	require.ErrorIs(t, err, sentinelErr)
+
+	// The CF did not receive the aborted write.
+	_, gerr := db.Ledgers().GetLedgerRaw(2)
+	require.ErrorIs(t, gerr, stores.ErrNotFound)
+	_, ok, derr := db.MaxCommittedSeq()
+	require.NoError(t, derr)
+	require.False(t, ok)
+}
+
+// storeOf exposes the store for the direct-batch atomicity test (same package,
+// so no production accessor is needed).
+func storeOf(db *DB) *rocksdb.Store { return db.store }
+
+// TestIngestLedger_ClosedDBFails confirms a closed DB rejects ingest.
+func TestIngestLedger_ClosedDBFails(t *testing.T) {
+	chunkID := chunk.ID(0)
+	db, err := Open(t.TempDir(), chunkID, silentLogger())
+	require.NoError(t, err)
+	require.NoError(t, db.Close())
+
+	raw := zeroTxLCM(t, chunkID.FirstLedger())
+	_, err = db.IngestLedger(chunkID.FirstLedger(), xdr.LedgerCloseMetaView(raw), allTypes())
+	require.ErrorIs(t, err, stores.ErrStoreClosed)
+}
+
+// ──────────────────────────── LCM fixtures ────────────────────────────
+
+// zeroTxLCM builds a minimal V2 LCM with no transactions at the given sequence.
+func zeroTxLCM(t *testing.T, seq uint32) []byte {
+	t.Helper()
+	lcm := xdr.LedgerCloseMeta{
+		V: 2,
+		V2: &xdr.LedgerCloseMetaV2{
+			LedgerHeader: xdr.LedgerHeaderHistoryEntry{
+				Header: xdr.LedgerHeader{
+					ScpValue:  xdr.StellarValue{CloseTime: xdr.TimePoint(0)},
+					LedgerSeq: xdr.Uint32(seq),
+				},
+			},
+			TxSet: xdr.GeneralizedTransactionSet{
+				V:       1,
+				V1TxSet: &xdr.TransactionSetV1{Phases: []xdr.TransactionPhase{}},
+			},
+			TxProcessing: []xdr.TransactionResultMetaV1{},
+		},
+	}
+	raw, err := lcm.MarshalBinary()
+	require.NoError(t, err)
+	return raw
+}
diff --git a/cmd/stellar-rpc/internal/fullhistory/pkg/stores/ledger/hot_store.go b/cmd/stellar-rpc/internal/fullhistory/pkg/stores/ledger/hot_store.go
index 2ba7afd4f..ad197fae0 100644
--- a/cmd/stellar-rpc/internal/fullhistory/pkg/stores/ledger/hot_store.go
+++ b/cmd/stellar-rpc/internal/fullhistory/pkg/stores/ledger/hot_store.go
@@ -17,6 +17,14 @@ import (
 	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/zstd"
 )
 
+// LedgersCF is the column family the hot ledger data lives in inside
+// the shared per-chunk hot DB (decision (a): one multi-CF RocksDB per
+// chunk). When the HotStore owns a dedicated single-purpose DB (the
+// standalone OpenHotStore path used by per-store tests and the cold
+// freeze readers), the same CF name is registered so the on-disk
+// layout is identical whether the store is shared or standalone.
+const LedgersCF = "ledgers"
+
 // Entry — one (sequence, uncompressed ledger bytes) pair. Both
 // hot and cold stores compress on write and decompress on read,
 // so callers always pass and receive raw ledger bytes here.
@@ -48,7 +56,13 @@ type Entry struct {
 type HotStore struct {
 	store   *rocksdb.Store
 	chunkID chunk.ID
-	dec     *zstd.Decompressor
+	// ownsStore is true when this HotStore opened its own dedicated
+	// rocksdb.Store (the standalone OpenHotStore path) and must close
+	// it on Close. It is false when the store is the SHARED per-chunk
+	// multi-CF DB injected by the hotchunk package — that DB is owned
+	// by hotchunk.DB and closed once, never by this facade's Close.
+	ownsStore bool
+	dec       *zstd.Decompressor
 	// compPool — per-store pool of zstd.Compressors. Each
 	// concurrent AddLedgers borrows one for the duration of its
 	// Encode call; the pool's GC finalizer (set inside
@@ -78,12 +92,25 @@ func OpenHotStore(path string, chunkID chunk.ID, logger *supportlog.Entry) (*Hot
 		return nil, stores.ErrInvalidConfig
 	}
 	store, err := rocksdb.New(rocksdb.Config{
-		Path:   path,
-		Logger: logger,
+		Path:           path,
+		ColumnFamilies: []string{LedgersCF},
+		Logger:         logger,
 	})
 	if err != nil {
 		return nil, err
 	}
+	h := NewWithStore(store, chunkID)
+	h.ownsStore = true
+	return h, nil
+}
+
+// NewWithStore wraps an ALREADY-OPEN rocksdb.Store as a ledger HotStore
+// operating on the LedgersCF column family. The store is NOT owned by
+// the returned HotStore (Close is a no-op on the shared DB) — this is
+// the constructor the hotchunk package uses to compose its ledger
+// facade over the one shared per-chunk DB (decision (a)). The
+// store must have been opened with LedgersCF registered.
+func NewWithStore(store *rocksdb.Store, chunkID chunk.ID) *HotStore {
 	return &HotStore{
 		store:   store,
 		chunkID: chunkID,
@@ -91,13 +118,21 @@ func OpenHotStore(path string, chunkID chunk.ID, logger *supportlog.Entry) (*Hot
 		compPool: sync.Pool{
 			New: func() any { return zstd.NewCompressor() },
 		},
-	}, nil
+	}
 }
 
-// Close releases the underlying RocksDB store. Idempotent —
-// delegates to rocksdb.Store.Close. Must not be called concurrently
-// with in-flight reads/writes on this HotStore.
-func (h *HotStore) Close() error { return h.store.Close() }
+// Close releases the underlying RocksDB store IF this HotStore owns it
+// (the standalone OpenHotStore path). When the store is the shared
+// per-chunk DB injected via NewWithStore, Close is a no-op — the
+// hotchunk.DB owns and closes the shared store exactly once.
+// Idempotent. Must not be called concurrently with in-flight
+// reads/writes on this HotStore.
+func (h *HotStore) Close() error {
+	if !h.ownsStore {
+		return nil
+	}
+	return h.store.Close()
+}
 
 // ChunkID returns the chunk this store is bound to (constructor-supplied;
 // never reads the store).
@@ -127,7 +162,7 @@ func (h *HotStore) AddLedgers(entries ...Entry) error {
 		if err != nil {
 			return err
 		}
-		return translateRocksErr(h.store.Put("", rocksdb.EncodeUint32(e.Seq), compressed))
+		return translateRocksErr(h.store.Put(LedgersCF, rocksdb.EncodeUint32(e.Seq), compressed))
 	}
 	// Multi-entry path: compress each into its own fresh slice so
 	// the batch can hold them all simultaneously (the compressor's
@@ -143,19 +178,40 @@ func (h *HotStore) AddLedgers(entries ...Entry) error {
 	}
 	return translateRocksErr(h.store.Batch(func(b *rocksdb.BatchWriter) error {
 		for i, e := range entries {
-			b.Put("", rocksdb.EncodeUint32(e.Seq), compressed[i])
+			b.Put(LedgersCF, rocksdb.EncodeUint32(e.Seq), compressed[i])
 		}
 		return nil
 	}))
 }
 
+// AddLedgerToBatch compresses one ledger and queues its single Put into
+// b (the LedgersCF) — the building block the hotchunk package uses to
+// fold the ledger write into the one atomic per-ledger WriteBatch
+// shared across all CFs (decision (a)). It does not commit: the caller
+// owns the batch and its single synced Write. Compression happens here
+// (synchronously into a fresh buffer that BatchWriter.Put copies), so
+// the caller's bytes need not outlive this call.
+func (h *HotStore) AddLedgerToBatch(b *rocksdb.BatchWriter, e Entry) error {
+	if h.store.IsClosed() {
+		return stores.ErrStoreClosed
+	}
+	c, _ := h.compPool.Get().(*zstd.Compressor)
+	defer h.compPool.Put(c)
+	compressed, err := c.Encode(nil, e.Bytes)
+	if err != nil {
+		return err
+	}
+	b.Put(LedgersCF, rocksdb.EncodeUint32(e.Seq), compressed)
+	return nil
+}
+
 // GetLedgerRaw decodes the ledger stored under seq into a fresh,
 // caller-owned buffer, or returns stores.ErrNotFound on miss. A zstd
 // decode failure surfaces as stores.ErrCorrupt. Sequential bulk readers
 // should prefer IterateLedgers, which yields borrows without the
 // per-ledger decode allocation.
 func (h *HotStore) GetLedgerRaw(seq uint32) ([]byte, error) {
-	v, found, err := h.store.Get("", rocksdb.EncodeUint32(seq))
+	v, found, err := h.store.Get(LedgersCF, rocksdb.EncodeUint32(seq))
 	if err != nil {
 		return nil, translateRocksErr(err)
 	}
@@ -184,7 +240,7 @@ func (h *HotStore) edgeSeq(last bool) (uint32, bool, error) {
 	if last {
 		edge = h.store.LastKey
 	}
-	k, ok, err := edge("")
+	k, ok, err := edge(LedgersCF)
 	if err != nil {
 		return 0, false, translateRocksErr(err)
 	}
@@ -213,7 +269,7 @@ func (h *HotStore) IterateLedgers(start, end uint32) iter.Seq2[Entry, error] {
 		// it past the loop body. The read benches consume each ledger in-scope,
 		// so this avoids a per-ledger decode allocation.
 		var scratch []byte
-		for e, err := range h.store.IterateRange("", rocksdb.EncodeUint32(start), rocksdb.EncodeUint32(end)) {
+		for e, err := range h.store.IterateRange(LedgersCF, rocksdb.EncodeUint32(start), rocksdb.EncodeUint32(end)) {
 			if err != nil {
 				yield(Entry{}, translateRocksErr(err))
 				return
diff --git a/cmd/stellar-rpc/internal/fullhistory/streaming/hotsource.go b/cmd/stellar-rpc/internal/fullhistory/streaming/hotsource.go
new file mode 100644
index 000000000..908e10a84
--- /dev/null
+++ b/cmd/stellar-rpc/internal/fullhistory/streaming/hotsource.go
@@ -0,0 +1,164 @@
+package streaming
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"iter"
+	"os"
+
+	"github.com/stellar/go-stellar-sdk/ingest/ledgerbackend"
+	supportlog "github.com/stellar/go-stellar-sdk/support/log"
+
+	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/ingest"
+	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/pkg/chunk"
+	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/pkg/stores/hotchunk"
+	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/pkg/stores/ledger"
+)
+
+// rocksHotProbe is the production HotProbe: it opens the chunk's SINGLE shared
+// per-chunk RocksDB hot DB (one multi-CF instance: ledgers + events CFs +
+// txhash CFs) at the path the daemon's hot-storage layout dictates, and answers
+// backfillSource's completeness question over it.
+//
+// Under decision (a) the hot tier is ONE DB whose every CF advances together in
+// one atomic synced WriteBatch per ledger, so "complete" is the single
+// authoritative maxCommittedSeq (the ledgers CF's last key) — no min-of-three,
+// no per-store frontier reconciliation.
+type rocksHotProbe struct {
+	hotRoot func(chunkID chunk.ID) string
+	logger  *supportlog.Entry
+}
+
+// NewRocksHotProbe returns the production HotProbe. hotChunkPath maps a chunk to
+// its hot-DB directory (the daemon passes Layout.HotChunkPath); logger is
+// forwarded to the shared-DB opener.
+//
+// Caller contract: the chunk passed to OpenHotChunk must NOT be the one captive
+// core is actively ingesting — that chunk holds its hot RocksDB open read-write,
+// and a second open of the same path fails on RocksDB's LOCK. The catch-up loop
+// excludes the live chunk by design (the partial resume chunk is finished by
+// ingestion, not by a freeze pass), so the probe only ever opens chunks
+// ingestion has already released.
+func NewRocksHotProbe(hotChunkPath func(chunk.ID) string, logger *supportlog.Entry) HotProbe {
+	return &rocksHotProbe{hotRoot: hotChunkPath, logger: logger}
+}
+
+func (p *rocksHotProbe) OpenHotChunk(chunkID chunk.ID) (HotChunk, bool, error) {
+	dir := p.hotRoot(chunkID)
+	if _, err := os.Stat(dir); err != nil {
+		if errors.Is(err, os.ErrNotExist) {
+			return nil, false, nil // dir absent — caller treats as loss under "ready"
+		}
+		return nil, false, fmt.Errorf("stat hot dir %s: %w", dir, err)
+	}
+
+	// One shared multi-CF DB at the chunk's hot dir — the same instance, opened
+	// with the same union of CFs, that the ingestion side writes.
+	db, err := hotchunk.Open(dir, chunkID, p.logger)
+	if err != nil {
+		return nil, false, fmt.Errorf("open hot chunk DB: %w", err)
+	}
+	return &rocksHotChunk{chunkID: chunkID, db: db}, true, nil
+}
+
+// rocksHotChunk is one chunk's opened hot tier — the single shared DB.
+type rocksHotChunk struct {
+	chunkID chunk.ID
+	db      *hotchunk.DB
+}
+
+// MaxCommittedSeq returns the single authoritative watermark (DECISION (a)):
+// the highest ledger seq the shared DB has durably committed, from the ledgers
+// CF's last key. Because every ledger commits as one atomic synced WriteBatch
+// across all CFs, this one value pins every CF's frontier — events and txhash
+// never trail or lead. ok=false on an empty DB.
+func (h *rocksHotChunk) MaxCommittedSeq() (uint32, bool, error) {
+	seq, ok, err := h.db.MaxCommittedSeq()
+	if err != nil {
+		return 0, false, fmt.Errorf("hot DB max committed seq: %w", err)
+	}
+	return seq, ok, nil
+}
+
+// Source streams the chunk's LCMs from the ledgers CF as a ChunkSource the cold
+// pipeline drains.
+func (h *rocksHotChunk) Source() ingest.ChunkSource {
+	return &hotLedgerSource{store: h.db.Ledgers()}
+}
+
+// Close releases the shared hot DB.
+func (h *rocksHotChunk) Close() error {
+	if h.db == nil {
+		return nil
+	}
+	return h.db.Close()
+}
+
+// ---------------------------------------------------------------------------
+// hotLedgerSource — an ingest.ChunkSource backed by a ledger.HotStore, so the
+// merged cold pipeline (RunColdChunk) can freeze a just-closed chunk straight
+// from its hot DB without a refetch.
+// ---------------------------------------------------------------------------
+
+type hotLedgerSource struct {
+	store *ledger.HotStore
+}
+
+// OpenStream returns a stream over the hot store's ledgers for the requested
+// chunk. The store is already chunk-bound; the stream honors the driver's
+// requested [from,to] range via IterateLedgers.
+func (s *hotLedgerSource) OpenStream(chunkID chunk.ID) (ledgerbackend.LedgerStream, error) {
+	if s.store == nil {
+		return nil, errors.New("streaming: hotLedgerSource has no store")
+	}
+	if s.store.ChunkID() != chunkID {
+		return nil, fmt.Errorf("streaming: hotLedgerSource bound to chunk %s, asked for %s",
+			s.store.ChunkID(), chunkID)
+	}
+	return &hotLedgerStream{store: s.store}, nil
+}
+
+type hotLedgerStream struct {
+	store *ledger.HotStore
+}
+
+var _ ledgerbackend.LedgerStream = (*hotLedgerStream)(nil)
+
+// RawLedgers yields each ledger's wire bytes for the requested range from the
+// hot store. The store's IterateLedgers yields BORROWED buffers (valid only to
+// the next step); the cold ingesters copy what they retain (HotIngester
+// contract), and the drain loop consumes each ledger fully before the next
+// yield, so the borrow is safe. ctx cancellation is observed between ledgers,
+// upholding the ChunkSource contract the drain loop relies on.
+func (st *hotLedgerStream) RawLedgers(
+	ctx context.Context, r ledgerbackend.Range, _ ...ledgerbackend.StreamOption,
+) iter.Seq2[[]byte, error] {
+	return func(yield func([]byte, error) bool) {
+		to := r.To()
+		if !r.Bounded() {
+			last, ok, err := st.store.LastSeq()
+			if err != nil {
+				yield(nil, err)
+				return
+			}
+			if !ok {
+				return
+			}
+			to = last
+		}
+		for e, ierr := range st.store.IterateLedgers(r.From(), to) {
+			if cerr := ctx.Err(); cerr != nil {
+				yield(nil, cerr)
+				return
+			}
+			if ierr != nil {
+				yield(nil, ierr)
+				return
+			}
+			if !yield(e.Bytes, nil) {
+				return
+			}
+		}
+	}
+}
diff --git a/cmd/stellar-rpc/internal/fullhistory/streaming/process.go b/cmd/stellar-rpc/internal/fullhistory/streaming/process.go
new file mode 100644
index 000000000..7d503c3e1
--- /dev/null
+++ b/cmd/stellar-rpc/internal/fullhistory/streaming/process.go
@@ -0,0 +1,364 @@
+package streaming
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"os"
+	"time"
+
+	supportlog "github.com/stellar/go-stellar-sdk/support/log"
+
+	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/ingest"
+	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/pkg/chunk"
+)
+
+// ErrHotVolumeLost is the case-4 fatal: a hot:chunk key is "ready" but its
+// directory is missing or unopenable. The hot DB is the SOLE copy of a chunk's
+// recently-ingested ledgers, so this is unrecoverable loss — never silently
+// healed. Loss is detected LAZILY, on the open that needs the DB
+// (lastCommittedLedger's one refinement open of the highest ready chunk before
+// ingestion starts, openHotTierForChunk's "ready" branch, or backfillSource's
+// hot branch), not by an eager all-ready-keys scan. It is returned as a
+// sentinel (not a process exit) so the daemon's top-level loop owns the
+// fatal-and-surface decision and tests can assert it.
+var ErrHotVolumeLost = errors.New("streaming: hot storage lost; run surgical recovery (case 4)")
+
+// ErrBackendCoverageTimeout is the bounded-wait fatal from backfillSource's bulk
+// branch: the configured backend's tip never advanced to cover a
+// genuinely-backend-only chunk within the deadline.
+var ErrBackendCoverageTimeout = errors.New("streaming: backend never covered chunk within deadline")
+
+// HotProbe opens the per-chunk shared hot DB for a chunk and answers the two
+// questions backfillSource's hot branch asks: (1) is the hot tier COMPLETE for
+// this chunk — DECISION (a): the single DB's maxCommittedSeq >= the chunk's
+// last ledger — and (2) if so, hand back a ChunkSource that streams the chunk's
+// LCMs from the ledgers CF so the just-closed chunk freezes without a refetch.
+//
+// It is injected so processChunk/backfillSource stay testable without the live
+// ingestion pipeline: production wires the real shared multi-CF RocksDB; tests
+// pass a fake. Under decision (a) the hot tier is ONE DB whose ledgers, events,
+// and txhash CFs all advance together in one atomic synced WriteBatch per
+// ledger, so completeness is a SINGLE watermark — no min-of-three.
+type HotProbe interface {
+	// OpenHotChunk opens the chunk's shared hot DB read-only-ish (the daemon
+	// owns the writer; this is a borrow for a freeze pass). It returns the
+	// opened handle, or an error the caller treats as case-4 loss when the
+	// catalog key said "ready". A nil error with ok==false means the dir is
+	// absent (also loss when "ready").
+	OpenHotChunk(chunkID chunk.ID) (HotChunk, bool, error)
+}
+
+// HotChunk is one chunk's opened hot tier: the single DB's completeness gate
+// plus an LCM source over the ledgers CF. Close releases the shared DB.
+type HotChunk interface {
+	// MaxCommittedSeq returns the single authoritative watermark — the highest
+	// ledger seq the shared DB has durably committed (every CF advances
+	// together, decision (a)) — and ok=false if the DB is empty (no committed
+	// seq, so the chunk cannot be complete).
+	MaxCommittedSeq() (seq uint32, ok bool, err error)
+	// Source yields the chunk's LCMs from the ledgers CF as a ChunkSource the
+	// cold pipeline (RunColdChunk) can drain.
+	Source() ingest.ChunkSource
+	// Close releases the shared hot DB.
+	Close() error
+}
+
+// BackendWaiter bounds backfillSource's bulk branch: it blocks until the
+// configured backend's tip covers chunkLastLedger, polling on a backoff, and
+// returns ErrBackendCoverageTimeout (wrapped) if the tip never advances within
+// the deadline. A chunk WITH a local copy never reaches here, so this never
+// gates a normal restart whose range is entirely local.
+//
+// It is an interface (not an inline poll) so the bulk source's tip query is
+// injectable: production wraps the configured LedgerBackend's tip; tests pass a
+// fake that is either immediately-covered or never-covered.
+type BackendWaiter interface {
+	WaitForCoverage(ctx context.Context, chunkLastLedger uint32) error
+}
+
+// ProcessConfig is the dependency bundle processChunk/backfillSource read. It is
+// the streaming spine's view of everything a freeze pass needs: the catalog
+// (key state + path layout), the hot probe, the bulk backend source + its
+// coverage waiter, and the metric sink/logger. Construction is the daemon's
+// job; the primitives below never reach around it.
+type ProcessConfig struct {
+	Catalog *Catalog
+	Logger  *supportlog.Entry
+	Sink    ingest.MetricSink
+
+	// HotProbe opens the per-chunk hot tier for the hot branch. Required.
+	HotProbe HotProbe
+
+	// Backend is the configured bulk LedgerBackend as a ChunkSource (BSB by
+	// default — the pack/datastore ChunkSource from ingest). It is the only
+	// source for a chunk with no local copy. May be nil in a frontfill
+	// deployment that never backfills; backfillSource errors loudly if a chunk
+	// actually reaches the bulk branch with no backend configured.
+	Backend ingest.ChunkSource
+
+	// BackendWaiter bounds the bulk branch's wait-for-coverage. Expected when
+	// Backend is set, but not enforced: a nil waiter simply skips the wait.
+	BackendWaiter BackendWaiter
+}
+
+func (cfg ProcessConfig) validate() error {
+	if cfg.Catalog == nil {
+		return errors.New("streaming: ProcessConfig.Catalog is nil")
+	}
+	if cfg.HotProbe == nil {
+		return errors.New("streaming: ProcessConfig.HotProbe is nil")
+	}
+	if cfg.Logger == nil {
+		return errors.New("streaming: ProcessConfig.Logger is nil")
+	}
+	return nil
+}
+
+// processChunk materializes the requested cold artifact kinds (ledgers/.pack, events
+// cold segment, txhash/.bin) for ONE chunk in a single streaming pass over its
+// ledgers, applying the Phase A one-write protocol per kind (rule 1):
+//
+//   - Per-kind idempotency: a kind whose chunk key is already "frozen" is
+//     dropped from the request (it self-skips); a "freezing"/"pruning"/absent
+//     key triggers re-materialization, itself idempotent (the cold ingesters
+//     overwrite at the canonical path).
+//   - Mark-then-write: every remaining kind's key is put "freezing" BEFORE any
+//     I/O, the cold pipeline (RunColdChunk) writes the files at their canonical
+//     paths from the source backfillSource chose, the files + their dirents are
+//     fsynced (barrierNewFile), and only then are the keys flipped to "frozen".
+//
+// The cold ingestion is the merged ingest.RunColdChunk over the same cold
+// ingester set RunCold uses — processChunk does not re-derive any extractor or
+// writer; it only chooses the LCM source (backfillSource) and drives the one
+// write protocol around the freeze.
+func processChunk(ctx context.Context, chunkID chunk.ID, artifacts ArtifactSet, cfg ProcessConfig) error {
+	if err := cfg.validate(); err != nil {
+		return err
+	}
+	cat := cfg.Catalog
+
+	// rule 1 per-kind idempotency: frozen kinds self-skip.
+	for _, kind := range artifacts.Kinds() {
+		state, err := cat.State(chunkID, kind)
+		if err != nil {
+			return fmt.Errorf("streaming: read state chunk %s kind %s: %w", chunkID, kind, err)
+		}
+		if state == StateFrozen {
+			artifacts = artifacts.Remove(kind)
+		}
+	}
+	if artifacts.Empty() {
+		return nil
+	}
+
+	// Choose the LCM source BEFORE marking "freezing": backfillSource may fatal
+	// (case-4 loss) or fall through sources, and we must not leave "freezing"
+	// debris for a chunk we then refuse to produce. The returned closer releases
+	// any opened hot stores once the freeze pass finishes.
+	source, closeSource, err := backfillSource(ctx, chunkID, artifacts, cfg)
+	if err != nil {
+		return err
+	}
+	defer func() { _ = closeSource() }()
+
+	// Mark-then-write: every requested kind "freezing" BEFORE any I/O.
+	if err := cat.MarkChunkFreezing(chunkID, artifacts.Kinds()...); err != nil {
+		return fmt.Errorf("streaming: mark freezing chunk %s %s: %w", chunkID, artifacts, err)
+	}
+
+	// Test-only observation point at the exact mark-then-write instant: every
+	// requested kind is now "freezing" and no file has been written yet. A no-op
+	// in production (hook nil); see crashHooks.afterMarkFreezing.
+	cat.hooks.fireAfterMarkFreezing()
+
+	// One streaming pass through the merged cold pipeline. The cold ingesters
+	// (re)create files at their canonical paths — re-materialization overwrites
+	// any partial from a crashed "freezing" attempt.
+	dirs := ingest.ColdDirs{
+		Ledgers: cat.layout.LedgersRoot(),
+	}
+	if rerr := ingest.RunColdChunk(ctx, cfg.Logger, source, dirs, chunkID, cfg.Sink, artifacts.ingestConfig()); rerr != nil {
+		return fmt.Errorf("streaming: cold ingest chunk %s %s: %w", chunkID, artifacts, rerr)
+	}
+
+	// Durability barrier: fsync each file + its parent dirent (+ grandparent
+	// when this chunk created a new bucket dir) BEFORE flipping to "frozen".
+	// The cold writers fsync file DATA on Finalize, but the one-write protocol
+	// also requires the directory entries be durable before the key flips —
+	// barrierNewFile is the exact two-level barrier (paths.go).
+	newBucket := uint32(chunkID)%chunk.ChunksPerBucket == 0
+	for _, kind := range artifacts.Kinds() {
+		for _, path := range cat.layout.ArtifactPaths(chunkID, kind) {
+			if berr := barrierNewFile(path, newBucket); berr != nil {
+				return fmt.Errorf("streaming: fsync barrier %s: %w", path, berr)
+			}
+		}
+	}
+
+	// Flip every produced kind to "frozen" in one atomic synced batch.
+	if ferr := cat.FlipChunkFrozen(chunkID, artifacts.Kinds()...); ferr != nil {
+		return fmt.Errorf("streaming: flip frozen chunk %s %s: %w", chunkID, artifacts, ferr)
+	}
+	return nil
+}
+
+// backfillSource implements rule 2's source-preference order for one chunk. It
+// returns the chosen ingest.ChunkSource, a closer (releasing any opened hot
+// stores; a no-op for the pack/bulk branches), and an error. The hot branch
+// fatals only on LOSS (a "ready" key whose dir is missing/unopenable —
+// ErrHotVolumeLost, detected lazily on this open); an incomplete-but-present
+// hot DB is STALENESS and falls through to the next source, because
+// re-derivation IS its recovery.
+//
+// Preference order:
+//  1. A ready, COMPLETE hot tier read locally — completeness is DECISION (a):
+//     the single shared DB's maxCommittedSeq >= chunkLastLedger.
+//  2. The frozen local .pack via the ledger cold reader, when ledgers is NOT among
+//     the requested outputs (re-derivation without a download).
+//  3. The configured bulk backend, gated by a bounded WaitForCoverage.
+func backfillSource(
+	ctx context.Context, chunkID chunk.ID, artifacts ArtifactSet, cfg ProcessConfig,
+) (ingest.ChunkSource, func() error, error) {
+	noClose := func() error { return nil }
+	cat := cfg.Catalog
+
+	// (1) Hot branch: only consult it when the chunk is owned by ingestion
+	// (hot key present) AND "ready". A "transient" key (mid creation/deletion or
+	// recovery-demoted) is NOT a read source — it falls through like any other
+	// non-ready state.
+	hotState, err := cat.HotState(chunkID)
+	if err != nil {
+		return nil, noClose, fmt.Errorf("streaming: read hot state chunk %s: %w", chunkID, err)
+	}
+	if hotState == HotReady {
+		src, closer, used, herr := tryHotSource(chunkID, cfg)
+		if herr != nil {
+			return nil, noClose, herr // case-4 loss is fatal
+		}
+		if used {
+			cfg.Logger.Debugf("backfillSource: chunk %s from complete hot tier", chunkID)
+			return src, closer, nil
+		}
+		// Present but incomplete: legitimate staleness — fall through.
+		cfg.Logger.Debugf("backfillSource: chunk %s hot tier present but incomplete; falling through", chunkID)
+	}
+
+	// (2) Frozen local .pack, only when ledgers is not requested (producing ledgers from
+	// the pack we'd write would be circular). The ledger cold reader is the same
+	// reader the merged pack ChunkSource opens.
+	ledgersState, err := cat.State(chunkID, KindLedgers)
+	if err != nil {
+		return nil, noClose, fmt.Errorf("streaming: read ledgers state chunk %s: %w", chunkID, err)
+	}
+	if ledgersState == StateFrozen && !artifacts.Has(KindLedgers) {
+		_, serr := os.Stat(cat.layout.LedgerPackPath(chunkID))
+		if serr == nil {
+			cfg.Logger.Debugf("backfillSource: chunk %s re-derived from frozen .pack", chunkID)
+			// ingest.NewPackSource composes {coldDir}/{bucket}/{chunk}.pack, which
+			// equals LedgerPackPath when coldDir is the ledgers root.
+			return ingest.NewPackSource(cat.layout.LedgersRoot()), noClose, nil
+		}
+		// A "frozen" ledgers key with no stat-able pack breaks the key invariant
+		// (frozen ⇒ file exists); surface it, with the cause, not a silent download.
+		return nil, noClose, fmt.Errorf("streaming: chunk %s ledgers is %q but pack file is missing at %s: %w",
+			chunkID, StateFrozen, cat.layout.LedgerPackPath(chunkID), serr)
+	}
+
+	// (3) Bulk backend — the only source for a chunk with no local copy.
+	if cfg.Backend == nil {
+		return nil, noClose, fmt.Errorf(
+			"streaming: chunk %s has no local copy and no bulk backend is configured", chunkID)
+	}
+	if cfg.BackendWaiter != nil {
+		if werr := cfg.BackendWaiter.WaitForCoverage(ctx, chunkID.LastLedger()); werr != nil {
+			return nil, noClose, werr
+		}
+	}
+	cfg.Logger.Debugf("backfillSource: chunk %s from bulk backend", chunkID)
+	return cfg.Backend, noClose, nil
+}
+
+// tryHotSource handles backfillSource's hot branch under a "ready" key. It
+// returns (source, closer, used, err): used=true with a source when the hot
+// tier is present AND complete (single-watermark gate); used=false (source nil)
+// when present but incomplete (staleness — caller falls through); a non-nil err
+// only for case-4 LOSS (dir missing/unopenable under a "ready" key).
+func tryHotSource(chunkID chunk.ID, cfg ProcessConfig) (ingest.ChunkSource, func() error, bool, error) {
+	hot, ok, err := cfg.HotProbe.OpenHotChunk(chunkID)
+	if err != nil {
+		// "ready" key but the DB cannot be opened — hot-volume loss.
+		return nil, nil, false, fmt.Errorf("%w: chunk %s: %w", ErrHotVolumeLost, chunkID, err)
+	}
+	if !ok {
+		// "ready" key but the dir is absent — hot-volume loss.
+		return nil, nil, false, fmt.Errorf("%w: chunk %s: hot directory absent", ErrHotVolumeLost, chunkID)
+	}
+	closer := hot.Close
+	maxSeq, present, merr := hot.MaxCommittedSeq()
+	if merr != nil {
+		_ = hot.Close()
+		// A read error against an opened DB is loss, not staleness: the
+		// DB opened but cannot answer its own progress.
+		return nil, nil, false, fmt.Errorf("%w: chunk %s: max committed seq: %w", ErrHotVolumeLost, chunkID, merr)
+	}
+	// DECISION (a): complete iff the single DB's maxCommittedSeq reaches the
+	// chunk's last ledger. An empty DB (present==false) cannot be complete.
+	if present && maxSeq >= chunkID.LastLedger() {
+		return hot.Source(), closer, true, nil
+	}
+	_ = hot.Close()
+	return nil, nil, false, nil
+}
+
+// ---------------------------------------------------------------------------
+// pollingBackendWaiter — the default BackendWaiter: poll a tip function on a
+// fixed backoff until it covers chunkLastLedger or the deadline expires.
+// ---------------------------------------------------------------------------
+
+// pollingBackendWaiter polls Tip on Interval until it reaches chunkLastLedger,
+// ctx is canceled, or Timeout elapses (ErrBackendCoverageTimeout). Tip is the
+// bulk backend's current network/object-store tip ledger.
+type pollingBackendWaiter struct {
+	Tip      func(ctx context.Context) (uint32, error)
+	Interval time.Duration
+	Timeout  time.Duration
+}
+
+// NewPollingBackendWaiter returns a BackendWaiter that polls tip on interval up
+// to timeout. A zero interval/timeout falls back to sane defaults.
+func NewPollingBackendWaiter(
+	tip func(ctx context.Context) (uint32, error), interval, timeout time.Duration,
+) BackendWaiter {
+	if interval <= 0 {
+		interval = time.Second
+	}
+	if timeout <= 0 {
+		timeout = 5 * time.Minute
+	}
+	return &pollingBackendWaiter{Tip: tip, Interval: interval, Timeout: timeout}
+}
+
+// WaitForCoverage polls Tip until it covers chunkLastLedger. Each sleep is
+// capped at the time left, so Timeout is not overshot by a whole Interval.
+func (w *pollingBackendWaiter) WaitForCoverage(ctx context.Context, chunkLastLedger uint32) error {
+	deadline := time.Now().Add(w.Timeout)
+	for {
+		tip, err := w.Tip(ctx)
+		if err != nil {
+			return fmt.Errorf("streaming: backend tip query: %w", err)
+		}
+		if tip >= chunkLastLedger {
+			return nil
+		}
+		left := time.Until(deadline)
+		if left <= 0 {
+			return fmt.Errorf("%w: tip %d < needed %d after %s", ErrBackendCoverageTimeout, tip, chunkLastLedger, w.Timeout)
+		}
+		select {
+		case <-ctx.Done():
+			return ctx.Err()
+		case <-time.After(min(w.Interval, left)):
+		}
+	}
+}
diff --git a/cmd/stellar-rpc/internal/fullhistory/streaming/process_test.go b/cmd/stellar-rpc/internal/fullhistory/streaming/process_test.go
new file mode 100644
index 000000000..e07fb009f
--- /dev/null
+++ b/cmd/stellar-rpc/internal/fullhistory/streaming/process_test.go
@@ -0,0 +1,537 @@
+package streaming
+
+import (
+	"context"
+	"errors"
+	"iter"
+	"os"
+	"path/filepath"
+	"sync/atomic"
+	"testing"
+
+	"github.com/stretchr/testify/require"
+
+	"github.com/stellar/go-stellar-sdk/ingest/ledgerbackend"
+	"github.com/stellar/go-stellar-sdk/xdr"
+
+	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/ingest"
+	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/pkg/chunk"
+	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/pkg/stores/hotchunk"
+	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/pkg/stores/ledger"
+)
+
+// ---------------------------------------------------------------------------
+// LCM fixtures + fake ChunkSource.
+// ---------------------------------------------------------------------------
+
+// zeroTxLCMBytes builds the wire bytes of a minimal valid zero-transaction V2
+// LedgerCloseMeta for seq. Zero-tx keeps the per-ledger work trivial so a full
+// 10,000-ledger chunk pass stays fast in tests.
+func zeroTxLCMBytes(t *testing.T, seq uint32) []byte {
+	t.Helper()
+	lcm := xdr.LedgerCloseMeta{
+		V: 2,
+		V2: &xdr.LedgerCloseMetaV2{
+			LedgerHeader: xdr.LedgerHeaderHistoryEntry{
+				Header: xdr.LedgerHeader{
+					ScpValue:  xdr.StellarValue{CloseTime: xdr.TimePoint(0)},
+					LedgerSeq: xdr.Uint32(seq),
+				},
+			},
+			TxSet: xdr.GeneralizedTransactionSet{
+				V:       1,
+				V1TxSet: &xdr.TransactionSetV1{Phases: nil},
+			},
+			TxProcessing: nil,
+		},
+	}
+	raw, err := lcm.MarshalBinary()
+	require.NoError(t, err)
+	return raw
+}
+
+// fullChunkStream is an in-memory ledgerbackend.LedgerStream yielding every
+// ledger in [from, to] from a per-seq LCM generator. It models a backend (or a
+// pack) that has the whole requested range. It does no counting itself; tests
+// that assert whether a source was (or was not) used wrap it in
+// countingChunkSource, which counts OpenStream calls.
+type fullChunkStream struct {
+	t   *testing.T
+	gen func(*testing.T, uint32) []byte
+}
+
+var _ ledgerbackend.LedgerStream = (*fullChunkStream)(nil)
+
+func (s *fullChunkStream) RawLedgers(
+	_ context.Context, r ledgerbackend.Range, _ ...ledgerbackend.StreamOption,
+) iter.Seq2[[]byte, error] {
+	return func(yield func([]byte, error) bool) {
+		for seq := r.From(); seq <= r.To(); seq++ {
+			if !yield(s.gen(s.t, seq), nil) {
+				return
+			}
+		}
+	}
+}
+
+// countingChunkSource wraps a stream factory and counts OpenStream calls, so a
+// test can assert which preference branch backfillSource picked.
+type countingChunkSource struct {
+	opens atomic.Int32
+	make  func(chunk.ID) (ledgerbackend.LedgerStream, error)
+}
+
+func (c *countingChunkSource) OpenStream(id chunk.ID) (ledgerbackend.LedgerStream, error) {
+	c.opens.Add(1)
+	return c.make(id)
+}
+
+func zeroTxBackend(t *testing.T) *countingChunkSource {
+	return &countingChunkSource{
+		make: func(chunk.ID) (ledgerbackend.LedgerStream, error) {
+			return &fullChunkStream{t: t, gen: zeroTxLCMBytes}, nil
+		},
+	}
+}
+
+// ---------------------------------------------------------------------------
+// fake HotProbe / HotChunk.
+// ---------------------------------------------------------------------------
+
+type fakeHotChunk struct {
+	maxSeq   uint32
+	present  bool
+	maxErr   error
+	source   ingest.ChunkSource
+	closedTo *atomic.Int32
+}
+
+func (h *fakeHotChunk) MaxCommittedSeq() (uint32, bool, error) {
+	return h.maxSeq, h.present, h.maxErr
+}
+func (h *fakeHotChunk) Source() ingest.ChunkSource { return h.source }
+func (h *fakeHotChunk) Close() error {
+	if h.closedTo != nil {
+		h.closedTo.Add(1)
+	}
+	return nil
+}
+
+type fakeHotProbe struct {
+	chunk    *fakeHotChunk
+	ok       bool
+	openErr  error
+	openedTo *atomic.Int32
+}
+
+func (p *fakeHotProbe) OpenHotChunk(chunk.ID) (HotChunk, bool, error) {
+	if p.openedTo != nil {
+		p.openedTo.Add(1)
+	}
+	if p.openErr != nil {
+		return nil, false, p.openErr
+	}
+	if !p.ok {
+		return nil, false, nil
+	}
+	return p.chunk, true, nil
+}
+
+// ---------------------------------------------------------------------------
+// fake BackendWaiter.
+// ---------------------------------------------------------------------------
+
+type fakeWaiter struct {
+	err    error
+	called atomic.Int32
+}
+
+func (w *fakeWaiter) WaitForCoverage(context.Context, uint32) error {
+	w.called.Add(1)
+	return w.err
+}
+
+// ---------------------------------------------------------------------------
+// process config helper.
+// ---------------------------------------------------------------------------
+
+func testProcessConfig(t *testing.T, cat *Catalog) ProcessConfig {
+	t.Helper()
+	return ProcessConfig{
+		Catalog:  cat,
+		Logger:   silentLogger(),
+		Sink:     ingest.NopSink{},
+		HotProbe: &fakeHotProbe{}, // not "ready" by default; tests override
+	}
+}
+
+// ---------------------------------------------------------------------------
+// processChunk — produces the ledger artifact and flips the key to frozen.
+// ---------------------------------------------------------------------------
+
+func TestProcessChunk_ProducesAllArtifactsAndFreezes(t *testing.T) {
+	cat, root := testCatalog(t)
+	cfg := testProcessConfig(t, cat)
+	backend := zeroTxBackend(t)
+	cfg.Backend = backend
+	cfg.BackendWaiter = &fakeWaiter{}
+
+	chunkID := chunk.ID(0)
+	require.NoError(t, processChunk(context.Background(), chunkID, AllArtifacts(), cfg))
+
+	// The ledgers catalog key flipped to frozen (verified via Phase A Catalog).
+	for _, kind := range AllKinds() {
+		state, err := cat.State(chunkID, kind)
+		require.NoError(t, err)
+		require.Equal(t, StateFrozen, state, "kind %s should be frozen", kind)
+	}
+
+	// The ledger artifact exists on disk at its canonical Layout path.
+	require.FileExists(t, cat.layout.LedgerPackPath(chunkID))
+
+	// The pack is a valid cold ledger pack covering the whole chunk.
+	cr, err := ledger.OpenColdReader(cat.layout.LedgerPackPath(chunkID))
+	require.NoError(t, err)
+	defer func() { _ = cr.Close() }()
+	last, err := cr.LastSeq()
+	require.NoError(t, err)
+	require.Equal(t, chunkID.LastLedger(), last)
+	_ = root
+}
+
+// ---------------------------------------------------------------------------
+// Idempotency: a frozen kind self-skips.
+// ---------------------------------------------------------------------------
+
+func TestProcessChunk_IdempotentSkipWhenFrozen(t *testing.T) {
+	cat, _ := testCatalog(t)
+	cfg := testProcessConfig(t, cat)
+	backend := zeroTxBackend(t)
+	cfg.Backend = backend
+	cfg.BackendWaiter = &fakeWaiter{}
+
+	chunkID := chunk.ID(0)
+	require.NoError(t, processChunk(context.Background(), chunkID, AllArtifacts(), cfg))
+	opensAfterFirst := backend.opens.Load()
+	require.Equal(t, int32(1), opensAfterFirst, "first pass opens the backend once")
+
+	// Second pass: every kind is frozen, so processChunk returns without opening
+	// any source.
+	require.NoError(t, processChunk(context.Background(), chunkID, AllArtifacts(), cfg))
+	require.Equal(t, opensAfterFirst, backend.opens.Load(),
+		"a fully-frozen chunk must not re-open the source")
+}
+
+// ---------------------------------------------------------------------------
+// Crash recovery: a "freezing" key (partial crash) is re-materialized.
+// ---------------------------------------------------------------------------
+
+func TestProcessChunk_RematerializesAfterFreezingCrash(t *testing.T) {
+	cat, _ := testCatalog(t)
+	cfg := testProcessConfig(t, cat)
+	cfg.Backend = zeroTxBackend(t)
+	cfg.BackendWaiter = &fakeWaiter{}
+
+	chunkID := chunk.ID(0)
+
+	// Simulate a crash mid-freeze: the keys are "freezing" and a stale/partial
+	// pack file exists at the canonical path.
+	require.NoError(t, cat.MarkChunkFreezing(chunkID, AllKinds()...))
+	require.NoError(t, os.MkdirAll(filepath.Dir(cat.layout.LedgerPackPath(chunkID)), 0o755))
+	require.NoError(t, os.WriteFile(cat.layout.LedgerPackPath(chunkID), []byte("PARTIAL-GARBAGE"), 0o644))
+
+	// Re-run: a "freezing" key triggers re-materialization (rule 1), overwriting
+	// the partial at the canonical path.
+	require.NoError(t, processChunk(context.Background(), chunkID, AllArtifacts(), cfg))
+
+	for _, kind := range AllKinds() {
+		state, err := cat.State(chunkID, kind)
+		require.NoError(t, err)
+		require.Equal(t, StateFrozen, state)
+	}
+	// The partial garbage was overwritten with a real pack.
+	cr, err := ledger.OpenColdReader(cat.layout.LedgerPackPath(chunkID))
+	require.NoError(t, err)
+	defer func() { _ = cr.Close() }()
+	last, err := cr.LastSeq()
+	require.NoError(t, err)
+	require.Equal(t, chunkID.LastLedger(), last)
+}
+
+// ---------------------------------------------------------------------------
+// Mark-then-write ORDERING: the core one-write-protocol invariant. At the
+// instant after MarkChunkFreezing and before any file I/O, every requested kind
+// must read "freezing" and no artifact file may exist yet. The afterMarkFreezing
+// crash hook (hooks.go) observes that exact instant from INSIDE processChunk, so
+// dropping the mark (keys would be absent) or reordering the write ahead of it
+// (a file would exist) is caught — neither could ship green.
+// ---------------------------------------------------------------------------
+
+func TestProcessChunk_MarksFreezingBeforeWrite(t *testing.T) {
+	for _, tc := range []struct {
+		name      string
+		artifacts ArtifactSet
+	}{
+		{"all kinds", AllArtifacts()},
+		{"ledgers only", NewArtifactSet(KindLedgers)},
+	} {
+		t.Run(tc.name, func(t *testing.T) {
+			cat, _ := testCatalog(t)
+			cfg := testProcessConfig(t, cat)
+			cfg.Backend = zeroTxBackend(t)
+			cfg.BackendWaiter = &fakeWaiter{}
+
+			chunkID := chunk.ID(0)
+			requested := tc.artifacts.Kinds()
+
+			var fired bool
+			cat.hooks.afterMarkFreezing = func() {
+				fired = true
+				// (1) Every requested kind reads "freezing" at the mark instant.
+				// Dropping MarkChunkFreezing would leave these absent (empty State).
+				for _, kind := range requested {
+					state, err := cat.State(chunkID, kind)
+					require.NoError(t, err)
+					require.Equal(t, StateFreezing, state,
+						"kind %s must be 'freezing' before any I/O", kind)
+				}
+				// (2) No artifact file exists yet. Reordering the write ahead of the
+				// mark (or writing without marking) would leave a file present here.
+				for _, kind := range requested {
+					for _, p := range cat.layout.ArtifactPaths(chunkID, kind) {
+						require.NoFileExists(t, p,
+							"no %s artifact file may exist at the mark instant", kind)
+					}
+				}
+			}
+
+			require.NoError(t, processChunk(context.Background(), chunkID, tc.artifacts, cfg))
+			require.True(t, fired, "afterMarkFreezing hook must have fired inside processChunk")
+
+			// And the freeze still completes: every requested kind ends "frozen".
+			for _, kind := range requested {
+				state, err := cat.State(chunkID, kind)
+				require.NoError(t, err)
+				require.Equal(t, StateFrozen, state)
+			}
+		})
+	}
+}
+
+// ---------------------------------------------------------------------------
+// backfillSource preference order.
+// ---------------------------------------------------------------------------
+
+func TestBackfillSource_PrefersCompleteHotTier(t *testing.T) {
+	cat, _ := testCatalog(t)
+	cfg := testProcessConfig(t, cat)
+
+	chunkID := chunk.ID(0)
+	// Mark the hot key "ready" and wire a complete hot tier (max committed seq
+	// reaches the chunk's last ledger).
+	require.NoError(t, cat.FlipHotReady(chunkID))
+	hotBackend := zeroTxBackend(t)
+	var closed atomic.Int32
+	cfg.HotProbe = &fakeHotProbe{
+		ok: true,
+		chunk: &fakeHotChunk{
+			maxSeq:   chunkID.LastLedger(),
+			present:  true,
+			source:   hotBackend,
+			closedTo: &closed,
+		},
+	}
+	// A bulk backend is configured but must NOT be used.
+	bulk := zeroTxBackend(t)
+	cfg.Backend = bulk
+	cfg.BackendWaiter = &fakeWaiter{}
+
+	src, closeSrc, err := backfillSource(context.Background(), chunkID, AllArtifacts(), cfg)
+	require.NoError(t, err)
+	require.Same(t, ingest.ChunkSource(hotBackend), src)
+	require.NoError(t, closeSrc())
+	require.Equal(t, int32(1), closed.Load(), "the closer releases the opened hot tier")
+	require.Equal(t, int32(0), bulk.opens.Load(), "the bulk backend was not consulted")
+}
+
+func TestBackfillSource_WatermarkGate_IncompleteFallsThrough(t *testing.T) {
+	cat, _ := testCatalog(t)
+	cfg := testProcessConfig(t, cat)
+
+	chunkID := chunk.ID(0)
+	require.NoError(t, cat.FlipHotReady(chunkID))
+	var closed atomic.Int32
+	// maxSeq is ONE BELOW the chunk's last ledger — i.e. the single DB's
+	// watermark has not reached completeness even though it is present. Under
+	// decision (a) every CF advances together, so a watermark short of the last
+	// ledger means the chunk is genuinely unfinished. It is staleness, not loss:
+	// fall through.
+	cfg.HotProbe = &fakeHotProbe{
+		ok: true,
+		chunk: &fakeHotChunk{
+			maxSeq:   chunkID.LastLedger() - 1,
+			present:  true,
+			closedTo: &closed,
+		},
+	}
+	bulk := zeroTxBackend(t)
+	cfg.Backend = bulk
+	cfg.BackendWaiter = &fakeWaiter{}
+
+	src, closeSrc, err := backfillSource(context.Background(), chunkID, AllArtifacts(), cfg)
+	require.NoError(t, err)
+	require.Same(t, ingest.ChunkSource(bulk), src, "incomplete hot tier falls through to bulk")
+	require.NoError(t, closeSrc())
+	require.GreaterOrEqual(t, closed.Load(), int32(1), "the incomplete hot tier was closed on fall-through")
+}
+
+func TestBackfillSource_LossIsFatal(t *testing.T) {
+	cat, _ := testCatalog(t)
+	cfg := testProcessConfig(t, cat)
+
+	chunkID := chunk.ID(0)
+	require.NoError(t, cat.FlipHotReady(chunkID))
+	// "ready" key but the probe reports the dir absent (ok=false) — case-4 loss.
+	cfg.HotProbe = &fakeHotProbe{ok: false}
+	cfg.Backend = zeroTxBackend(t)
+	cfg.BackendWaiter = &fakeWaiter{}
+
+	_, _, err := backfillSource(context.Background(), chunkID, AllArtifacts(), cfg)
+	require.Error(t, err)
+	require.ErrorIs(t, err, ErrHotVolumeLost)
+}
+
+func TestBackfillSource_LossOnOpenError(t *testing.T) {
+	cat, _ := testCatalog(t)
+	cfg := testProcessConfig(t, cat)
+
+	chunkID := chunk.ID(0)
+	require.NoError(t, cat.FlipHotReady(chunkID))
+	cfg.HotProbe = &fakeHotProbe{openErr: errors.New("cannot open hot dir")}
+	cfg.Backend = zeroTxBackend(t)
+	cfg.BackendWaiter = &fakeWaiter{}
+
+	_, _, err := backfillSource(context.Background(), chunkID, AllArtifacts(), cfg)
+	require.ErrorIs(t, err, ErrHotVolumeLost)
+}
+
+func TestBackfillSource_DoesNotUsePackWhenLFSRequested(t *testing.T) {
+	cat, _ := testCatalog(t)
+	cfg := testProcessConfig(t, cat)
+
+	chunkID := chunk.ID(0)
+	require.NoError(t, cat.MarkChunkFreezing(chunkID, KindLedgers))
+	require.NoError(t, os.MkdirAll(filepath.Dir(cat.layout.LedgerPackPath(chunkID)), 0o755))
+	writeRealPack(t, cat, chunkID)
+	require.NoError(t, cat.FlipChunkFrozen(chunkID, KindLedgers))
+
+	bulk := zeroTxBackend(t)
+	cfg.Backend = bulk
+	cfg.BackendWaiter = &fakeWaiter{}
+
+	// ledgers IS requested — the pack branch is skipped (circular), so it goes to bulk.
+	src, closeSrc, err := backfillSource(context.Background(), chunkID, AllArtifacts(), cfg)
+	require.NoError(t, err)
+	require.NoError(t, closeSrc())
+	require.Same(t, ingest.ChunkSource(bulk), src)
+}
+
+func TestBackfillSource_BulkWaitTimeoutFatal(t *testing.T) {
+	cat, _ := testCatalog(t)
+	cfg := testProcessConfig(t, cat)
+
+	chunkID := chunk.ID(0)
+	cfg.Backend = zeroTxBackend(t)
+	cfg.BackendWaiter = &fakeWaiter{err: ErrBackendCoverageTimeout}
+
+	_, _, err := backfillSource(context.Background(), chunkID, AllArtifacts(), cfg)
+	require.ErrorIs(t, err, ErrBackendCoverageTimeout)
+}
+
+func TestBackfillSource_NoBackendConfigured(t *testing.T) {
+	cat, _ := testCatalog(t)
+	cfg := testProcessConfig(t, cat)
+	cfg.Backend = nil
+
+	_, _, err := backfillSource(context.Background(), chunk.ID(0), AllArtifacts(), cfg)
+	require.Error(t, err)
+	require.Contains(t, err.Error(), "no bulk backend")
+}
+
+// writeRealPack writes a valid cold ledger pack for chunkID at its canonical
+// Layout path by driving the merged cold ledger ingester over a zero-tx stream.
+func writeRealPack(t *testing.T, cat *Catalog, chunkID chunk.ID) {
+	t.Helper()
+	src := &countingChunkSource{
+		make: func(chunk.ID) (ledgerbackend.LedgerStream, error) {
+			return &fullChunkStream{t: t, gen: zeroTxLCMBytes}, nil
+		},
+	}
+	dirs := ingest.ColdDirs{Ledgers: cat.layout.LedgersRoot()}
+	require.NoError(t, ingest.RunColdChunk(
+		context.Background(), silentLogger(), src, dirs, chunkID,
+		ingest.NopSink{}, ingest.Config{Ledgers: true}))
+	require.FileExists(t, cat.layout.LedgerPackPath(chunkID))
+}
+
+// ---------------------------------------------------------------------------
+// Real hot probe: single-watermark completeness over the shared multi-CF
+// RocksDB hot DB (decision (a)).
+// ---------------------------------------------------------------------------
+
+func TestRocksHotProbe_SingleWatermark_CompleteVsStale(t *testing.T) {
+	hotRoot := t.TempDir()
+	chunkID := chunk.ID(0)
+	chunkDir := filepath.Join(hotRoot, chunkID.String())
+
+	// Ingest a SHORT prefix of the chunk into the shared hot DB (one atomic
+	// batch per ledger across all CFs), so the single watermark is well below
+	// the chunk's last ledger (stale).
+	stalePrefix := chunkID.FirstLedger() + 4
+	ingestHotPrefix(t, chunkDir, chunkID, stalePrefix)
+
+	probe := NewRocksHotProbe(func(c chunk.ID) string {
+		return filepath.Join(hotRoot, c.String())
+	}, silentLogger())
+
+	hot, ok, err := probe.OpenHotChunk(chunkID)
+	require.NoError(t, err)
+	require.True(t, ok)
+	defer func() { _ = hot.Close() }()
+
+	maxSeq, present, err := hot.MaxCommittedSeq()
+	require.NoError(t, err)
+	require.True(t, present)
+	require.Equal(t, stalePrefix, maxSeq, "the single watermark equals the last committed ledger")
+	require.Less(t, maxSeq, chunkID.LastLedger(), "a stale prefix is not complete")
+}
+
+func TestRocksHotProbe_AbsentDirIsNotOpened(t *testing.T) {
+	hotRoot := t.TempDir()
+	probe := NewRocksHotProbe(func(c chunk.ID) string {
+		return filepath.Join(hotRoot, c.String())
+	}, silentLogger())
+	_, ok, err := probe.OpenHotChunk(chunk.ID(7))
+	require.NoError(t, err)
+	require.False(t, ok, "an absent hot dir reports ok=false (loss when key is ready)")
+}
+
+// ingestHotPrefix writes ledgers [chunk.First, throughSeq] into the chunk's
+// SHARED multi-CF hot DB via hotchunk.IngestLedger — one atomic synced
+// WriteBatch per ledger across all CFs (decision (a)) — then closes it so the
+// probe can reopen it.
+func ingestHotPrefix(t *testing.T, chunkDir string, chunkID chunk.ID, throughSeq uint32) {
+	t.Helper()
+	require.NoError(t, os.MkdirAll(chunkDir, 0o755))
+
+	db, err := hotchunk.Open(chunkDir, chunkID, silentLogger())
+	require.NoError(t, err)
+
+	cfg := hotchunk.Ingest{Ledgers: true}
+	for seq := chunkID.FirstLedger(); seq <= throughSeq; seq++ {
+		lcm := xdr.LedgerCloseMetaView(zeroTxLCMBytes(t, seq))
+		_, err := db.IngestLedger(seq, lcm, cfg)
+		require.NoError(t, err)
+	}
+	require.NoError(t, db.Close())
+}

From a79bbb0cd91c61c411f9e2d02db27464b3ba443f Mon Sep 17 00:00:00 2001
From: Simon Chow <simon.chow@stellar.org>
Date: Tue, 23 Jun 2026 17:40:16 -0400
Subject: [PATCH 2/4] docs+style(streaming): scope layer-2 doc.go to storage;
 fix lint

doc.go: add the Storage group (process.go, hotsource.go) to the file map
and switch the map header to 'by concern' now that the package spans two
layers; shrink the 'Later layers' note accordingly.

golangci-lint (this layer's own new findings):
- hotchunk.DB.MaxCommittedSeq: drop named returns (nonamedreturns)
- process.go / ingest driver.go: wrap two >120-char lines (lll)
---
 .../internal/fullhistory/ingest/driver.go     |  3 ++-
 .../pkg/stores/hotchunk/hotchunk.go           |  2 +-
 .../internal/fullhistory/streaming/doc.go     | 24 ++++++++++---------
 .../internal/fullhistory/streaming/process.go |  3 ++-
 4 files changed, 18 insertions(+), 14 deletions(-)

diff --git a/cmd/stellar-rpc/internal/fullhistory/ingest/driver.go b/cmd/stellar-rpc/internal/fullhistory/ingest/driver.go
index 5bb323c74..c333d5236 100644
--- a/cmd/stellar-rpc/internal/fullhistory/ingest/driver.go
+++ b/cmd/stellar-rpc/internal/fullhistory/ingest/driver.go
@@ -275,7 +275,8 @@ func RunColdChunk(
 		sink.ColdChunkTotal(time.Since(start))
 		return berr
 	}
-	logger.Debugf("RunColdChunk: ingesting chunk %d [%d, %d]", uint32(chunkID), chunkID.FirstLedger(), chunkID.LastLedger())
+	logger.Debugf("RunColdChunk: ingesting chunk %d [%d, %d]",
+		uint32(chunkID), chunkID.FirstLedger(), chunkID.LastLedger())
 	service := NewColdService(ings, sink)
 	defer func() {
 		if cerr := service.Close(); cerr != nil {
diff --git a/cmd/stellar-rpc/internal/fullhistory/pkg/stores/hotchunk/hotchunk.go b/cmd/stellar-rpc/internal/fullhistory/pkg/stores/hotchunk/hotchunk.go
index 29c0f4ab2..0563698b5 100644
--- a/cmd/stellar-rpc/internal/fullhistory/pkg/stores/hotchunk/hotchunk.go
+++ b/cmd/stellar-rpc/internal/fullhistory/pkg/stores/hotchunk/hotchunk.go
@@ -89,7 +89,7 @@ func (d *DB) Close() error { return d.store.Close() }
 // MaxCommittedSeq returns the single authoritative per-chunk watermark:
 // the highest ledger seq durably committed, read from the ledgers CF's
 // last key. ok=false on an empty DB (no ledger committed yet).
-func (d *DB) MaxCommittedSeq() (seq uint32, ok bool, err error) {
+func (d *DB) MaxCommittedSeq() (uint32, bool, error) {
 	return d.ledger.LastSeq()
 }
 
diff --git a/cmd/stellar-rpc/internal/fullhistory/streaming/doc.go b/cmd/stellar-rpc/internal/fullhistory/streaming/doc.go
index a9060879e..e63d3b565 100644
--- a/cmd/stellar-rpc/internal/fullhistory/streaming/doc.go
+++ b/cmd/stellar-rpc/internal/fullhistory/streaming/doc.go
@@ -4,9 +4,8 @@
 // (fullhistory/pkg/...). It is built ON that layer — the catalog WRAPS
 // metastore.Store rather than reinventing a RocksDB wrapper.
 //
-// This file map covers Slice 1 · Layer 1 (foundations): the durable-state
-// substrate only, with no daemon goroutines yet. The storage primitives,
-// orchestration, and daemon assembly stack on top in later layers (see "Later
+// This file map covers Slice 1 · Layers 1–2 (foundations + storage). The
+// orchestration and daemon assembly stack on top in later layers (see "Later
 // layers" below).
 //
 // # Data model (keys-first)
@@ -24,8 +23,8 @@
 // invariants are verified by fault-injection hooks fired from INSIDE the real
 // methods (see hooks.go), so the catalog, the one-write protocol, the sweeps,
 // and the I/O paths they protect must share a package to keep those hooks
-// package-private and the invariant tests meaningful. The files group by layer;
-// this layer adds:
+// package-private and the invariant tests meaningful. The files group by
+// concern:
 //
 //	Foundation     keys.go, paths.go
 //	                 the catalog key schema, the key↔path bijection, and chunk
@@ -39,16 +38,19 @@
 //	                 over the catalog + storage roots.
 //	Cross-cutting  artifacts.go
 //	                 the ArtifactSet/Kind abstraction the later layers subset.
+//	Storage        process.go, hotsource.go
+//	                 processChunk + backfillSource materialize a chunk's cold
+//	                 artifacts from the cheapest source (ready hot DB → frozen
+//	                 local .pack → bulk backend); hotsource exposes the hot tier
+//	                 as a freeze source.
 //	Test seam      hooks.go
 //	                 test-only crash-injection points fired from inside the real
 //	                 protocol/sweep methods (every field nil in production).
 //
 // # Later layers
 //
-// Slice 1 stacks on this foundation: Layer 2 adds the per-chunk hot DB and
-// processChunk (storage primitives); Layer 3 adds the postcondition
-// resolver/executor, the live ingestion loop, and the lifecycle tick
-// (orchestration); Layer 4 adds startStreaming, validateConfig, surgical
-// recovery, and the audit command (daemon assembly). Slices 2 and 3 then weave
-// in the events and tx-hash data types.
+// Layer 3 adds the postcondition resolver/executor, the live ingestion loop,
+// and the lifecycle tick (orchestration); Layer 4 adds startStreaming,
+// validateConfig, surgical recovery, and the audit command (daemon assembly).
+// Slices 2 and 3 then weave in the events and tx-hash data types.
 package streaming
diff --git a/cmd/stellar-rpc/internal/fullhistory/streaming/process.go b/cmd/stellar-rpc/internal/fullhistory/streaming/process.go
index 7d503c3e1..7bb83f7a8 100644
--- a/cmd/stellar-rpc/internal/fullhistory/streaming/process.go
+++ b/cmd/stellar-rpc/internal/fullhistory/streaming/process.go
@@ -178,7 +178,8 @@ func processChunk(ctx context.Context, chunkID chunk.ID, artifacts ArtifactSet,
 	dirs := ingest.ColdDirs{
 		Ledgers: cat.layout.LedgersRoot(),
 	}
-	if rerr := ingest.RunColdChunk(ctx, cfg.Logger, source, dirs, chunkID, cfg.Sink, artifacts.ingestConfig()); rerr != nil {
+	rerr := ingest.RunColdChunk(ctx, cfg.Logger, source, dirs, chunkID, cfg.Sink, artifacts.ingestConfig())
+	if rerr != nil {
 		return fmt.Errorf("streaming: cold ingest chunk %s %s: %w", chunkID, artifacts, rerr)
 	}
 

From 5f0071d08769455f4f1370424d42d170e0ce7c61 Mon Sep 17 00:00:00 2001
From: Simon Chow <simon.chow@stellar.org>
Date: Tue, 23 Jun 2026 13:25:52 -0400
Subject: [PATCH 3/4] =?UTF-8?q?feat(fullhistory):=20streaming=20daemon=20s?=
 =?UTF-8?q?lice=201=20=E2=80=94=20layer=203=20(orchestration)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../fullhistory/streaming/backfill_test.go    |  63 ++
 .../fullhistory/streaming/eligibility.go      | 117 ++++
 .../internal/fullhistory/streaming/execute.go | 217 +++++++
 .../fullhistory/streaming/execute_test.go     | 194 ++++++
 .../internal/fullhistory/streaming/ingest.go  | 316 ++++++++++
 .../fullhistory/streaming/ingest_test.go      | 442 ++++++++++++++
 .../fullhistory/streaming/lifecycle.go        | 389 ++++++++++++
 .../fullhistory/streaming/lifecycle_test.go   | 575 ++++++++++++++++++
 .../fullhistory/streaming/observability.go    | 339 +++++++++++
 .../fullhistory/streaming/progress.go         | 214 +++++++
 .../streaming/progress_realdb_test.go         | 104 ++++
 .../streaming/progress_shim_test.go           |  18 +
 .../fullhistory/streaming/progress_test.go    | 316 ++++++++++
 .../internal/fullhistory/streaming/resolve.go |  96 +++
 .../fullhistory/streaming/resolve_test.go     | 119 ++++
 .../fullhistory/streaming/retention.go        | 102 ++++
 16 files changed, 3621 insertions(+)
 create mode 100644 cmd/stellar-rpc/internal/fullhistory/streaming/backfill_test.go
 create mode 100644 cmd/stellar-rpc/internal/fullhistory/streaming/eligibility.go
 create mode 100644 cmd/stellar-rpc/internal/fullhistory/streaming/execute.go
 create mode 100644 cmd/stellar-rpc/internal/fullhistory/streaming/execute_test.go
 create mode 100644 cmd/stellar-rpc/internal/fullhistory/streaming/ingest.go
 create mode 100644 cmd/stellar-rpc/internal/fullhistory/streaming/ingest_test.go
 create mode 100644 cmd/stellar-rpc/internal/fullhistory/streaming/lifecycle.go
 create mode 100644 cmd/stellar-rpc/internal/fullhistory/streaming/lifecycle_test.go
 create mode 100644 cmd/stellar-rpc/internal/fullhistory/streaming/observability.go
 create mode 100644 cmd/stellar-rpc/internal/fullhistory/streaming/progress.go
 create mode 100644 cmd/stellar-rpc/internal/fullhistory/streaming/progress_realdb_test.go
 create mode 100644 cmd/stellar-rpc/internal/fullhistory/streaming/progress_shim_test.go
 create mode 100644 cmd/stellar-rpc/internal/fullhistory/streaming/progress_test.go
 create mode 100644 cmd/stellar-rpc/internal/fullhistory/streaming/resolve.go
 create mode 100644 cmd/stellar-rpc/internal/fullhistory/streaming/resolve_test.go
 create mode 100644 cmd/stellar-rpc/internal/fullhistory/streaming/retention.go

diff --git a/cmd/stellar-rpc/internal/fullhistory/streaming/backfill_test.go b/cmd/stellar-rpc/internal/fullhistory/streaming/backfill_test.go
new file mode 100644
index 000000000..65d3e2155
--- /dev/null
+++ b/cmd/stellar-rpc/internal/fullhistory/streaming/backfill_test.go
@@ -0,0 +1,63 @@
+package streaming
+
+import (
+	"context"
+	"sync/atomic"
+	"testing"
+
+	"github.com/stretchr/testify/require"
+)
+
+// ---------------------------------------------------------------------------
+// runBackfill end-to-end on the seamed executor: resolve the diff, then
+// executePlan runs the resolved plan. There is NO upfront producibility gate
+// (item R2-5); an unproducible chunk fatals from backfillSource per chunk when
+// the executor reaches it (exercised below through the real processChunk path).
+// ---------------------------------------------------------------------------
+
+func TestRunBackfill_ResolvesThenExecutes(t *testing.T) {
+	cat, _ := testCatalog(t)
+
+	var chunksRun atomic.Int32
+	cfg := ExecConfig{
+		Catalog: cat, Logger: silentLogger(), Workers: 2,
+		Process: ProcessConfig{Backend: zeroTxBackend(t)},
+		runChunk: func(context.Context, ChunkBuild, ExecConfig) error {
+			chunksRun.Add(1)
+			return nil
+		},
+	}
+
+	// Fresh catalog, range [0,3]: resolve schedules 4 chunk builds.
+	require.NoError(t, runBackfill(context.Background(), cfg, 0, 3))
+	require.Equal(t, int32(4), chunksRun.Load())
+}
+
+// No backend AND a genuine fall-through chunk (nothing local): the daemon still
+// fatals — now from backfillSource itself when the executor reaches the chunk
+// (item R2-5 folded the upfront gate into the per-chunk source selection). The
+// REAL processChunk path runs (no runChunk seam), so backfillSource picks the
+// (3) bulk-backend branch, finds no backend, and aborts the plan.
+func TestRunBackfill_NoBackendNoLocalCopyFatals(t *testing.T) {
+	cat, _ := testCatalog(t)
+	cfg := ExecConfig{
+		Catalog: cat, Logger: silentLogger(), Workers: 1,
+		Process: ProcessConfig{HotProbe: &fakeHotProbe{}}, // not "ready", no backend
+	}
+	err := runBackfill(context.Background(), cfg, 0, 0)
+	require.Error(t, err)
+	require.ErrorContains(t, err, "no bulk backend is configured")
+}
+
+// An inverted range (younger-than-one-chunk network) backfills nothing.
+func TestRunBackfill_InvertedRangeIsNoop(t *testing.T) {
+	cat, _ := testCatalog(t)
+	var ran int
+	cfg := ExecConfig{
+		Catalog: cat, Logger: silentLogger(), Workers: 1,
+		Process:  ProcessConfig{Backend: zeroTxBackend(t)},
+		runChunk: func(context.Context, ChunkBuild, ExecConfig) error { ran++; return nil },
+	}
+	require.NoError(t, runBackfill(context.Background(), cfg, 5, 4))
+	require.Zero(t, ran)
+}
diff --git a/cmd/stellar-rpc/internal/fullhistory/streaming/eligibility.go b/cmd/stellar-rpc/internal/fullhistory/streaming/eligibility.go
new file mode 100644
index 000000000..c3a999fb0
--- /dev/null
+++ b/cmd/stellar-rpc/internal/fullhistory/streaming/eligibility.go
@@ -0,0 +1,117 @@
+package streaming
+
+import (
+	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/pkg/chunk"
+)
+
+// The discard and prune eligibility scans. Each returns a list of zero-arg
+// callables (closures over the op and its arguments); the tick just calls them
+// in order. Both are PURE READS of the catalog — they decide eligibility from
+// durable keys alone, so re-running against the same snapshot after a tick
+// finishes yields nothing (the quiescence postcondition).
+
+// eligibleDiscardOps walks hot:chunk:* keys and returns a discard closure per
+// hot DB the cold artifacts now fully serve (or that fell past retention). Per
+// chunk:
+//
+//   - chunkLastLedger < floor (past retention OR below earliest_ledger): discard.
+//     Its artifact files, if any, carry their own keys and are picked up by the
+//     prune stage on the same tick.
+//   - complete (last ledger <= through) and nothing pending (cold artifacts fully
+//     serve it): discard.
+//   - otherwise (live, or still producing): leave alone.
+//
+// discardHotTierForChunk is idempotent and re-derives from durable keys, so a
+// crash between freeze and discard self-heals on the next tick.
+func eligibleDiscardOps(cfg LifecycleConfig, cat *Catalog, through uint32) ([]func() error, error) {
+	earliest, _, err := cat.EarliestLedger()
+	if err != nil {
+		return nil, err
+	}
+	// The discard scan's "past retention" test is the reader retention
+	// contract's ChunkBelowFloor (retention.go) — one definition shared with the
+	// read gate, so a hot DB is retired on exactly the floor the reader stops
+	// admitting its seqs at. A shortened retentionChunks raises this floor
+	// immediately (the gate is rebuilt from the live `through` each tick).
+	gate := NewRetentionGate(through, cfg.RetentionChunks, earliest)
+
+	hot, err := cat.HotChunkKeys()
+	if err != nil {
+		return nil, err
+	}
+
+	var ops []func() error
+	for _, c := range hot {
+		last := c.LastLedger()
+		switch {
+		case gate.ChunkBelowFloor(c):
+			ops = append(ops, func() error { return discardHotTierForChunk(cat, c) })
+		case last <= through:
+			pending, perr := pendingArtifacts(c, cat)
+			if perr != nil {
+				return nil, perr
+			}
+			if pending.Empty() {
+				ops = append(ops, func() error { return discardHotTierForChunk(cat, c) })
+			}
+			// else: still producing — leave alone.
+		}
+		// default (last > through): the live chunk or above — ingestion's, never
+		// the lifecycle's to touch.
+	}
+	return ops, nil
+}
+
+// pendingArtifacts lists which processChunk outputs chunk still needs: the
+// per-chunk kinds (currently just ledgers) that are not yet frozen.
+func pendingArtifacts(c chunk.ID, cat *Catalog) (ArtifactSet, error) {
+	var need ArtifactSet
+	for _, kind := range []Kind{KindLedgers} {
+		state, err := cat.State(c, kind)
+		if err != nil {
+			return need, err
+		}
+		if state != StateFrozen {
+			need = need.Add(kind)
+		}
+	}
+	return need, nil
+}
+
+// eligiblePruneOps is the system's only file-deleter, driven entirely by keys.
+// It returns one batched SweepChunkArtifacts closure for the chunk family.
+//
+// "Wholly below the floor" is the RetentionGate's predicate — the same one the
+// discard scan and the read path use, so prune deletes exactly what the reader
+// has stopped admitting. At a genesis floor the gate matches nothing (the
+// design's guard: nothing is below genesis), so no hand-rolled sentinel is needed.
+func eligiblePruneOps(cfg LifecycleConfig, cat *Catalog, through uint32) ([]func() error, error) {
+	earliest, _, err := cat.EarliestLedger()
+	if err != nil {
+		return nil, err
+	}
+	gate := NewRetentionGate(through, cfg.RetentionChunks, earliest)
+
+	var ops []func() error
+
+	// Chunk family: swept in one batch.
+	refs, err := cat.ChunkArtifactKeys()
+	if err != nil {
+		return nil, err
+	}
+	var sweep []ArtifactRef
+	for _, ref := range refs {
+		switch {
+		case gate.ChunkBelowFloor(ref.Chunk):
+			// Wholly past retention: any state goes.
+			sweep = append(sweep, ref)
+		case ref.State == StatePruning:
+			// In-retention artifact demoted by a recovery.
+			sweep = append(sweep, ref)
+		}
+	}
+	if len(sweep) > 0 {
+		ops = append(ops, func() error { return cat.SweepChunkArtifacts(sweep) })
+	}
+	return ops, nil
+}
diff --git a/cmd/stellar-rpc/internal/fullhistory/streaming/execute.go b/cmd/stellar-rpc/internal/fullhistory/streaming/execute.go
new file mode 100644
index 000000000..0259a0c6b
--- /dev/null
+++ b/cmd/stellar-rpc/internal/fullhistory/streaming/execute.go
@@ -0,0 +1,217 @@
+package streaming
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"runtime"
+
+	"golang.org/x/sync/errgroup"
+
+	supportlog "github.com/stellar/go-stellar-sdk/support/log"
+
+	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/pkg/chunk"
+)
+
+// ExecConfig is the scheduler's dependency bundle — everything resolve,
+// executePlan, and runBackfill read. It COMPOSES process.go's ProcessConfig
+// (which drives processChunk + backfillSource) rather than redeclaring its
+// fields, and adds the two scheduler knobs. The Catalog and Logger are shared,
+// so they live here and are projected down to the primitive; the rest of the
+// primitive config (HotProbe, Backend, …) is carried verbatim.
+//
+// This is the "one Config" the design's resolve/executePlan/runBackfill
+// pseudocode reads `cfg.Catalog`, `cfg.Workers`, and `cfg.MaxRetries` from; the
+// full daemon Config (retention, captive core, paths) is a superset assembled
+// at startup and is out of this issue's scope.
+type ExecConfig struct {
+	Catalog *Catalog
+	Logger  *supportlog.Entry
+
+	// Metrics is the streaming control-plane sink (observability.go) shared by
+	// backfill, the ingestion loop, and the lifecycle tick. nil ⇒ nopMetrics via
+	// WithDefaults, so every phase reports unconditionally. It is the DAEMON's
+	// phase sink, distinct from Process.Sink (the per-data-type ingest sink).
+	Metrics Metrics
+
+	// Process carries the primitive-specific dependencies. Its Catalog and
+	// Logger fields are filled from the shared ones above by the projection
+	// accessor, so a caller need not duplicate them.
+	Process ProcessConfig
+
+	// Workers is the ONLY concurrency knob: the size of the single bounded pool
+	// every chunk build draws from. Must be > 0 — a zero pool deadlocks
+	// executePlan (every task blocks acquiring a slot that never frees).
+	// Defaults to GOMAXPROCS via WithDefaults.
+	Workers int
+
+	// MaxRetries bounds per-task retries before a task aborts the whole plan
+	// (and, in production, the daemon). 0 means "try once, no retry".
+	MaxRetries int
+
+	// runChunk is a test-only seam: when nil (production) the executor runs the
+	// real processChunk. Tests override it to drive the failure paths
+	// deterministically without standing up the full ingestion pipeline. It
+	// never appears in production wiring.
+	runChunk func(ctx context.Context, cb ChunkBuild, cfg ExecConfig) error
+}
+
+// WithDefaults returns a copy of cfg with a non-positive Workers replaced by
+// GOMAXPROCS and a nil Metrics by nopMetrics{}. Validation is validate's job.
+func (cfg ExecConfig) WithDefaults() ExecConfig {
+	if cfg.Workers <= 0 {
+		cfg.Workers = runtime.GOMAXPROCS(0)
+	}
+	if cfg.Metrics == nil {
+		cfg.Metrics = nopMetrics{}
+	}
+	return cfg
+}
+
+// metrics returns the configured sink, or nopMetrics when unset — the read every
+// phase uses so it never nil-checks (WithDefaults fills it for the daemon path,
+// but a primitive called directly in a test may not have run WithDefaults).
+func (cfg ExecConfig) metrics() Metrics { return metricsOrNop(cfg.Metrics) }
+
+func (cfg ExecConfig) validate() error {
+	if cfg.Catalog == nil {
+		return errors.New("streaming: ExecConfig.Catalog is nil")
+	}
+	if cfg.Logger == nil {
+		return errors.New("streaming: ExecConfig.Logger is nil")
+	}
+	if cfg.Workers <= 0 {
+		// Loud, not silently corrected: a zero pool deadlocks executePlan, so the
+		// caller's miswiring must surface rather than hang.
+		return fmt.Errorf("streaming: ExecConfig.Workers must be > 0 (got %d) — a zero pool deadlocks executePlan", cfg.Workers)
+	}
+	return nil
+}
+
+// processConfig projects the ExecConfig down to the ProcessConfig processChunk
+// reads, filling the shared Catalog/Logger so callers configure them once.
+func (cfg ExecConfig) processConfig() ProcessConfig {
+	p := cfg.Process
+	p.Catalog = cfg.Catalog
+	p.Logger = cfg.Logger
+	return p
+}
+
+// executePlan runs a Plan on one bounded worker pool (cfg.Workers — the only
+// resource knob). It is the SAME executor both callers use: runBackfill (catch-
+// up) and the lifecycle tick. The structure is map without a job tracker —
+// chunk builds are the maps — and there is deliberately no task engine and no
+// persisted task state: resolve re-plans from durable keys on every run, so
+// there is nothing to resume.
+//
+// Each ChunkBuild acquires a worker slot, runs (with retries), and on SUCCESS
+// closes its done-channel AFTER its artifacts are durable (done-channels signal
+// SUCCESS, not mere completion). A build that exhausts its retries LEAVES the
+// channel open and RETURNS the error, which cancels gctx.
+//
+// At most Workers chunk builds execute at any instant. A task exhausting its
+// retries returns an error, which errgroup propagates: gctx is canceled, every
+// other task's slot-acquire/processChunk observes it, and g.Wait returns the
+// first error — the daemon aborts and a restart re-resolves from durable keys.
+func executePlan(ctx context.Context, plan Plan, cfg ExecConfig) error {
+	if err := cfg.validate(); err != nil {
+		return err
+	}
+
+	// One slot per worker — the single pool all chunk builds share.
+	slots := make(chan struct{}, cfg.Workers)
+
+	// One done-channel per planned chunk build, created up front.
+	done := make(map[chunk.ID]chan struct{}, len(plan.ChunkBuilds))
+	for _, cb := range plan.ChunkBuilds {
+		done[cb.Chunk] = make(chan struct{})
+	}
+
+	runChunk := cfg.runChunk
+	if runChunk == nil {
+		procCfg := cfg.processConfig()
+		runChunk = func(gctx context.Context, cb ChunkBuild, _ ExecConfig) error {
+			return processChunk(gctx, cb.Chunk, cb.Artifacts, procCfg)
+		}
+	}
+
+	g, gctx := errgroup.WithContext(ctx)
+
+	for _, cb := range plan.ChunkBuilds {
+		g.Go(func() error {
+			if err := acquireSlot(gctx, slots); err != nil {
+				return err
+			}
+			defer releaseSlot(slots)
+			if err := withRetries(gctx, cfg.MaxRetries, func() error {
+				return runChunk(gctx, cb, cfg)
+			}); err != nil {
+				// SUCCESS semantics: leave done[cb.Chunk] OPEN and return the error.
+				// errgroup cancels gctx and g.Wait returns the first error.
+				return err
+			}
+			// Success: artifacts are durable.
+			close(done[cb.Chunk])
+			return nil
+		})
+	}
+
+	return g.Wait()
+}
+
+// acquireSlot blocks until a worker slot is free or ctx is canceled. Pulling it
+// out of the goroutine bodies keeps the cancel-vs-acquire select in one place.
+func acquireSlot(ctx context.Context, slots chan struct{}) error {
+	select {
+	case slots <- struct{}{}:
+		return nil
+	case <-ctx.Done():
+		return ctx.Err()
+	}
+}
+
+// releaseSlot frees a previously-acquired worker slot. It never blocks (the
+// buffer always has room for a token this goroutine put there).
+func releaseSlot(slots chan struct{}) { <-slots }
+
+// withRetries runs fn up to maxRetries+1 times (one attempt plus maxRetries
+// retries), returning nil on the first success and the last error after the
+// budget is exhausted. A negative maxRetries is treated as 0, so fn is still
+// attempted once and an unattempted task can never be reported as a success.
+// A canceled ctx (e.g. a sibling task aborted the errgroup) returns ctx.Err().
+func withRetries(ctx context.Context, maxRetries int, fn func() error) error {
+	for attempt := 0; ; attempt++ {
+		if cerr := ctx.Err(); cerr != nil {
+			return cerr
+		}
+		// Stop on success, or on failure once the retry budget is spent.
+		err := fn()
+		if err == nil || attempt >= maxRetries {
+			return err
+		}
+	}
+}
+
+// runBackfill is backfill's entry point: resolve the missing work, then
+// executePlan over the resolver's diff. It is the SAME executePlan the lifecycle
+// tick uses — one scheduler, two callers, sharing one set of postconditions.
+//
+// There is NO upfront producibility gate (item R2-5 / the design "folded the
+// upfront gate into the per-chunk bounded wait"): a genuinely unproducible chunk
+// — no local copy and no configured bulk backend — fatals from backfillSource
+// itself when the executor reaches that chunk, on every retry. backfillSource's
+// bounded WaitForCoverage handles a fall-through chunk above a lagging-but-
+// advancing backend per chunk. The daemon therefore still fatals on an
+// unproducible chunk; only the surface point moved from a pre-flight check to
+// the per-chunk source selection.
+func runBackfill(ctx context.Context, cfg ExecConfig, rangeStart, rangeEnd chunk.ID) error {
+	cfg = cfg.WithDefaults()
+	if err := cfg.validate(); err != nil {
+		return err
+	}
+	plan, err := resolve(cfg, rangeStart, rangeEnd)
+	if err != nil {
+		return fmt.Errorf("streaming: runBackfill resolve [%s,%s]: %w", rangeStart, rangeEnd, err)
+	}
+	return executePlan(ctx, plan, cfg)
+}
diff --git a/cmd/stellar-rpc/internal/fullhistory/streaming/execute_test.go b/cmd/stellar-rpc/internal/fullhistory/streaming/execute_test.go
new file mode 100644
index 000000000..4d4738dbc
--- /dev/null
+++ b/cmd/stellar-rpc/internal/fullhistory/streaming/execute_test.go
@@ -0,0 +1,194 @@
+package streaming
+
+import (
+	"context"
+	"errors"
+	"sync"
+	"sync/atomic"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/require"
+
+	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/pkg/chunk"
+)
+
+// ---------------------------------------------------------------------------
+// Executor test harness. The runChunk seam lets a test drive the chunk-build
+// pool deterministically: a fake chunk build records its order and optionally
+// blocks on a release signal.
+// ---------------------------------------------------------------------------
+
+// execRecorder captures chunk task execution so a test can assert completion.
+// markChunkDone writes under mu — the executor runs tasks on many goroutines.
+type execRecorder struct {
+	mu sync.Mutex
+	// chunkDone[c] is set by markChunkDone(c); order appends "chunk:<c>" per call.
+	chunkDone map[chunk.ID]bool
+	order     []string
+}
+
+func newExecRecorder() *execRecorder {
+	return &execRecorder{chunkDone: map[chunk.ID]bool{}}
+}
+
+func (r *execRecorder) markChunkDone(c chunk.ID) {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+	r.chunkDone[c] = true
+	r.order = append(r.order, "chunk:"+c.String())
+}
+
+// execTestCfg returns an ExecConfig for executor tests: it carries cat, a
+// silent logger, Workers set to workers (the pool size), and runChunk installed
+// as the chunk-build seam so the test decides what each build does. Fields not
+// named here keep their zero values.
+func execTestCfg(cat *Catalog, workers int, runChunk func(context.Context, ChunkBuild, ExecConfig) error) ExecConfig {
+	cfg := ExecConfig{Catalog: cat, Logger: silentLogger(), Workers: workers}
+	// Install the seam last so it reads as the point of this helper.
+	cfg.runChunk = runChunk
+	return cfg
+}
+
+// ---------------------------------------------------------------------------
+// No deadlock at Workers=1; every planned chunk build runs.
+// ---------------------------------------------------------------------------
+
+func TestExecutePlan_RunsEveryChunkBuild_Workers1(t *testing.T) {
+	cat, _ := testCatalog(t)
+	rec := newExecRecorder()
+
+	// Four builds over chunks 0, 1, 4, 5 (note the gap), every artifact each.
+	ids := []chunk.ID{0, 1, 4, 5}
+	builds := make([]ChunkBuild, 0, len(ids))
+	for _, id := range ids {
+		builds = append(builds, ChunkBuild{Chunk: id, Artifacts: AllArtifacts()})
+	}
+	plan := Plan{ChunkBuilds: builds}
+
+	record := func(_ context.Context, cb ChunkBuild, _ ExecConfig) error {
+		rec.markChunkDone(cb.Chunk)
+		return nil
+	}
+	cfg := execTestCfg(cat, 1, record)
+
+	err := executePlan(context.Background(), plan, cfg)
+	require.NoError(t, err, "Workers=1 must not deadlock")
+	require.Len(t, rec.chunkDone, len(ids))
+}
+
+// Eight workers over four staggered builds: none of the builds may be lost.
+func TestExecutePlan_RunsEveryChunkBuildUnderConcurrency(t *testing.T) {
+	cat, _ := testCatalog(t)
+	rec := newExecRecorder()
+
+	const buildCount = 4
+	var plan Plan
+	for id := chunk.ID(0); id < buildCount; id++ {
+		plan.ChunkBuilds = append(plan.ChunkBuilds, ChunkBuild{Chunk: id, Artifacts: AllArtifacts()})
+	}
+
+	// Stagger completion: chunk c sleeps (c+1)*5ms before recording itself.
+	staggered := func(_ context.Context, cb ChunkBuild, _ ExecConfig) error {
+		delay := time.Duration(uint32(cb.Chunk)+1) * 5 * time.Millisecond
+		time.Sleep(delay)
+		rec.markChunkDone(cb.Chunk)
+		return nil
+	}
+	cfg := execTestCfg(cat, 8, staggered)
+
+	require.NoError(t, executePlan(context.Background(), plan, cfg))
+	require.Len(t, rec.chunkDone, buildCount)
+}
+
+// ---------------------------------------------------------------------------
+// SUCCESS semantics (item R2-2): a failed chunk build returns the error, which
+// cancels gctx; the plan ALWAYS aborts with the first error.
+// ---------------------------------------------------------------------------
+
+func TestExecutePlan_FailedChunkAbortsPlan(t *testing.T) {
+	cat, _ := testCatalog(t)
+	boom := errors.New("chunk build boom")
+
+	// A single build whose every attempt fails with the sentinel.
+	alwaysFail := func(context.Context, ChunkBuild, ExecConfig) error {
+		return boom
+	}
+	cfg := execTestCfg(cat, 1, alwaysFail)
+	plan := Plan{ChunkBuilds: []ChunkBuild{{Chunk: 0, Artifacts: AllArtifacts()}}}
+
+	got := executePlan(context.Background(), plan, cfg)
+	require.Error(t, got, "a task exhausting retries aborts the plan")
+	require.ErrorIs(t, got, boom, "the chunk failure propagates")
+}
+
+// ---------------------------------------------------------------------------
+// Retry budget + zero-workers guard.
+// ---------------------------------------------------------------------------
+
+func TestExecutePlan_RetriesThenSucceeds(t *testing.T) {
+	cat, _ := testCatalog(t)
+	var calls atomic.Int32
+
+	// Fails on calls 1 and 2, succeeds on call 3 — inside the MaxRetries=3 budget.
+	flaky := func(context.Context, ChunkBuild, ExecConfig) error {
+		if n := calls.Add(1); n >= 3 {
+			return nil
+		}
+		return errors.New("transient")
+	}
+	cfg := ExecConfig{Catalog: cat, Logger: silentLogger(), Workers: 1, MaxRetries: 3, runChunk: flaky}
+	plan := Plan{ChunkBuilds: []ChunkBuild{{Chunk: 0, Artifacts: AllArtifacts()}}}
+
+	require.NoError(t, executePlan(context.Background(), plan, cfg))
+	require.Equal(t, int32(3), calls.Load(), "fn runs until it succeeds within the budget")
+}
+
+func TestExecutePlan_ExhaustsRetriesAndAborts(t *testing.T) {
+	cat, _ := testCatalog(t)
+	var calls atomic.Int32
+
+	// Every attempt fails, so the whole budget is spent before the plan aborts.
+	hopeless := func(context.Context, ChunkBuild, ExecConfig) error {
+		calls.Add(1)
+		return errors.New("always fails")
+	}
+	cfg := ExecConfig{Catalog: cat, Logger: silentLogger(), Workers: 1, MaxRetries: 2, runChunk: hopeless}
+	plan := Plan{ChunkBuilds: []ChunkBuild{{Chunk: 0, Artifacts: AllArtifacts()}}}
+
+	require.Error(t, executePlan(context.Background(), plan, cfg))
+	require.Equal(t, int32(3), calls.Load(), "1 try + MaxRetries(2) = 3 attempts")
+}
+
+func TestExecutePlan_ZeroWorkersIsLoudNotADeadlock(t *testing.T) {
+	cat, _ := testCatalog(t)
+	plan := Plan{ChunkBuilds: []ChunkBuild{{Chunk: 0}}}
+	noWorkers := ExecConfig{Catalog: cat, Logger: silentLogger(), Workers: 0}
+	got := executePlan(context.Background(), plan, noWorkers)
+	require.ErrorContains(t, got, "Workers must be > 0", "a zero pool must be rejected, not deadlock")
+}
+
+// Context cancellation propagates: a long-running chunk build observing a
+// canceled context returns promptly and the whole plan aborts.
+func TestExecutePlan_ContextCancelAborts(t *testing.T) {
+	cat, _ := testCatalog(t)
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel() // release ctx even if executePlan returns before any build starts
+
+	plan := Plan{ChunkBuilds: []ChunkBuild{
+		{Chunk: 0, Artifacts: AllArtifacts()},
+		{Chunk: 1, Artifacts: AllArtifacts()},
+	}}
+	var once sync.Once
+	cfg := ExecConfig{
+		Catalog: cat, Logger: silentLogger(), Workers: 2,
+		runChunk: func(ctx context.Context, _ ChunkBuild, _ ExecConfig) error {
+			// The first build to start cancels the parent ctx itself, so no
+			// helper goroutine can be left waiting on a build that never ran.
+			once.Do(cancel)
+			<-ctx.Done()
+			return ctx.Err()
+		},
+	}
+	require.Error(t, executePlan(ctx, plan, cfg))
+}
diff --git a/cmd/stellar-rpc/internal/fullhistory/streaming/ingest.go b/cmd/stellar-rpc/internal/fullhistory/streaming/ingest.go
new file mode 100644
index 000000000..902d00bf0
--- /dev/null
+++ b/cmd/stellar-rpc/internal/fullhistory/streaming/ingest.go
@@ -0,0 +1,316 @@
+package streaming
+
+import (
+	"context"
+	"fmt"
+	"os"
+	"path/filepath"
+
+	supportlog "github.com/stellar/go-stellar-sdk/support/log"
+	"github.com/stellar/go-stellar-sdk/xdr"
+
+	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/pkg/chunk"
+	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/pkg/stores/hotchunk"
+)
+
+// The hot-DB ingestion loop (DECISION (a)). One goroutine polls one ledger
+// source by sequence (the design's indexed core.GetLedger(ctx, seq)) into the
+// per-chunk hot DB, committing each ledger as one atomic synced WriteBatch over
+// the ledger CF. A ledger is therefore fully present or fully absent, and the
+// per-chunk frontier is a SINGLE authoritative value — the DB's
+// MaxCommittedSeq. The loop keeps NO progress variable: the last synced batch IS
+// the watermark, re-derived from durable catalog state at the next startup (see
+// lastCommittedLedger).
+//
+// The loop's only outbound coupling is the lifecycle notification channel (see
+// the Concurrency model): at every chunk boundary it sends the just-completed
+// chunk id. The two goroutines share no in-memory state and never write the same
+// meta-store key or touch the same per-chunk hot RocksDB instance.
+//
+// CLEAN-SHUTDOWN vs CRASH is decided at the DAEMON TOP LEVEL, not here: the loop
+// returns whatever GetLedger returns (a ctx-cancelled error on a clean shutdown,
+// any other error on a crash), and superviseStreaming classifies a non-nil
+// return as clean iff ctx was cancelled (see daemon.go). The loop never tries to
+// tell the two apart itself.
+
+// LedgerGetter is the indexed-poll source the ingestion loop drives: it returns
+// the raw LedgerCloseMeta wire bytes (as an xdr.LedgerCloseMetaView) for one
+// ledger sequence, blocking until that ledger is available (the design's
+// core.GetLedger(ctx, seq)). Production wraps captive core; tests pass a fake.
+type LedgerGetter interface {
+	GetLedger(ctx context.Context, seq uint32) (xdr.LedgerCloseMetaView, error)
+}
+
+// allHotTypes is the hot tier's ingest selection: every data type the per-chunk
+// DB holds — here, Ledgers. The hot DB is the sole copy of a chunk's recently
+// ingested ledgers until the cold artifacts are frozen, so it ingests them in
+// the one atomic batch.
+//
+//nolint:gochecknoglobals // immutable selection, the production ingest config
+var allHotTypes = hotchunk.Ingest{Ledgers: true}
+
+// openHotTierForChunk opens (or recovers, or creates) the ONE shared hot DB for
+// chunkID under the Phase A catalog hot:chunk bracket, returning an open handle
+// the caller owns.
+//
+// Three states, two branches, keyed on the durable hot:chunk state (openHotDB):
+//
+//   - "ready": the bracket says the dir exists and is usable. Open it. A dir
+//     that is MISSING, cannot be stat'ed, or fails to open is hot-volume loss —
+//     the hot DB is the sole copy of the chunk's recently-ingested ledgers, so
+//     recreating empty would silently drop them. Refuse with ErrHotVolumeLost
+//     (case 4); never auto-heal.
+//   - "transient" (a crashed create/discard, or a recovery-demoted key) or
+//     absent (first use): wipe any leftover dir and create fresh, bracketing the
+//     creation as transient -> create+fsync dir+parent -> ready so a power loss
+//     mid-create can never fabricate the "ready but dir missing" fatal above.
+func openHotTierForChunk(cat *Catalog, chunkID chunk.ID, logger *supportlog.Entry) (*hotchunk.DB, error) {
+	dir := cat.layout.HotChunkPath(chunkID)
+
+	state, err := cat.HotState(chunkID)
+	if err != nil {
+		return nil, fmt.Errorf("streaming: read hot state chunk %s: %w", chunkID, err)
+	}
+
+	if state == HotReady {
+		if _, statErr := os.Stat(dir); statErr != nil {
+			if os.IsNotExist(statErr) {
+				// The key promises a DB the filesystem does not have — hot
+				// storage was lost out from under a surviving meta store. This
+				// is the same case-4 fatal lastCommittedLedger surfaces lazily
+				// on its refinement open; surface it as the sentinel so the
+				// daemon's top-level loop owns the fatal-and-surface decision.
+				return nil, fmt.Errorf(
+					"%w: chunk %s is %q but its hot dir %s is missing",
+					ErrHotVolumeLost, chunkID, HotReady, dir)
+			}
+			return nil, fmt.Errorf(
+				"%w: chunk %s: stat hot dir %s: %w",
+				ErrHotVolumeLost, chunkID, dir, statErr)
+		}
+		db, openErr := hotchunk.Open(dir, chunkID, logger)
+		if openErr != nil {
+			// The dir existed at the stat above; an open failure now is loss.
+			return nil, fmt.Errorf("%w: chunk %s: open hot DB: %w", ErrHotVolumeLost, chunkID, openErr)
+		}
+		return db, nil
+	}
+
+	// "transient" or absent — a crashed create/discard left debris, or this is
+	// first use. Wipe any leftover dir, then create fresh under the bracket.
+	if rmErr := os.RemoveAll(dir); rmErr != nil {
+		return nil, fmt.Errorf("streaming: wipe leftover hot dir %s: %w", dir, rmErr)
+	}
+	if putErr := cat.PutHotTransient(chunkID); putErr != nil {
+		return nil, fmt.Errorf("streaming: mark hot transient chunk %s: %w", chunkID, putErr)
+	}
+
+	db, openErr := hotchunk.Open(dir, chunkID, logger)
+	if openErr != nil {
+		return nil, fmt.Errorf("streaming: create hot DB chunk %s: %w", chunkID, openErr)
+	}
+
+	// The dir + its dirent must be durable BEFORE the key flips to "ready" —
+	// else a power crash between the flip and the dir's durability fabricates
+	// the "ready but dir missing" fatal above for a DB that was actually fine.
+	if syncErr := fsyncDir(dir); syncErr != nil {
+		_ = db.Close()
+		return nil, fmt.Errorf("streaming: fsync hot dir %s: %w", dir, syncErr)
+	}
+	if syncErr := fsyncDir(parentDir(dir)); syncErr != nil {
+		_ = db.Close()
+		return nil, fmt.Errorf("streaming: fsync hot parent dir %s: %w", parentDir(dir), syncErr)
+	}
+	if flipErr := cat.FlipHotReady(chunkID); flipErr != nil {
+		_ = db.Close()
+		return nil, fmt.Errorf("streaming: flip hot ready chunk %s: %w", chunkID, flipErr)
+	}
+	return db, nil
+}
+
+// discardHotTierForChunk retires a chunk's hot DB once every cold artifact
+// derived from it is durable (or it has fallen past retention). It is the
+// bracket's close end and the inverse of openHotTierForChunk's create branch:
+// key -> transient, RemoveAll dir, fsync parent, delete key — in that order.
+// Idempotent — a missing key is a no-op, and a crash mid-discard leaves the key
+// "transient" for the next discard scan (or the next open) to finish.
+//
+// The caller MUST have closed the chunk's write handle and confirmed no reader
+// holds it (the lifecycle's discard stage runs after executePlan froze the cold
+// artifacts, and readers hold independent handles resolved through meta keys).
+func discardHotTierForChunk(cat *Catalog, chunkID chunk.ID) error {
+	has, err := cat.Has(hotChunkKey(chunkID))
+	if err != nil {
+		return fmt.Errorf("streaming: read hot key chunk %s: %w", chunkID, err)
+	}
+	if !has {
+		return nil
+	}
+	if putErr := cat.PutHotTransient(chunkID); putErr != nil {
+		return fmt.Errorf("streaming: mark hot transient chunk %s: %w", chunkID, putErr)
+	}
+
+	dir := cat.layout.HotChunkPath(chunkID)
+	if rmErr := os.RemoveAll(dir); rmErr != nil {
+		return fmt.Errorf("streaming: rmdir hot dir %s: %w", dir, rmErr)
+	}
+	// The unlink must be durable BEFORE the key delete: the key outlives the
+	// durable rmdir, so a crash anywhere re-runs the discard rather than leaving
+	// a key-less dir.
+	if syncErr := fsyncDir(parentDir(dir)); syncErr != nil {
+		return fmt.Errorf("streaming: fsync hot parent dir %s: %w", parentDir(dir), syncErr)
+	}
+	if delErr := cat.DeleteHotKey(chunkID); delErr != nil {
+		return fmt.Errorf("streaming: delete hot key chunk %s: %w", chunkID, delErr)
+	}
+	return nil
+}
+
+// runIngestionLoop polls core for LCMs by sequence into hotDB, committing each
+// ledger as one atomic synced WriteBatch over the ledger CF, and at each chunk
+// boundary hands the live-chunk frontier forward by closing the just-filled DB
+// and opening the next chunk's. It returns the first error from the resume
+// derivation, GetLedger, IngestLedger, or a boundary step (never nil — the poll
+// is unbounded); the daemon top level classifies it: a ctx-cancelled return is
+// a clean shutdown, any other error is RESTARTABLE (the supervisor restarts;
+// startup re-derives the watermark from the last synced batch, losing nothing).
+//
+// The boundary's write order is load-bearing (the handoff fence): the DB is
+// CLOSED before the next chunk's hot:chunk key is created. Creating that key is
+// the act that makes THIS chunk visibly complete to the lifecycle's derivation,
+// so the write handle must already be released when the key appears — otherwise
+// a lifecycle tick (possibly still in flight from the previous notification)
+// could discard a dir whose writer is live. notify() therefore fires only AFTER
+// the next chunk's DB is open and its key created.
+//
+// ingestTypes selects which CFs each ledger's batch writes; production passes
+// allHotTypes. The loop keeps no progress variable — durability is the batch,
+// progress is derived.
+func runIngestionLoop(
+	ctx context.Context,
+	core LedgerGetter,
+	hotDB *hotchunk.DB,
+	cat *Catalog,
+	lifecycleCh chan<- chunk.ID,
+	ingestTypes hotchunk.Ingest,
+	logger *supportlog.Entry,
+	metrics Metrics,
+) (err error) {
+	metrics = metricsOrNop(metrics)
+
+	// notify hands the just-completed chunk id to the lifecycle. The channel is
+	// buffered (lifecycleQueueDepth); a FULL buffer means freeze has fallen that
+	// many boundaries behind ingestion — fail loud (a wedged lifecycle the daemon
+	// cannot recover from by continuing to ingest).
+	notify := func(complete chunk.ID) {
+		select {
+		case lifecycleCh <- complete:
+		default:
+			logger.Fatalf("streaming: lifecycle fell %d boundaries behind ingestion; investigate",
+				lifecycleQueueDepth)
+		}
+	}
+
+	// The loop owns hotDB for the rest of its life: it is the single writer, and
+	// it reopens hotDB at every boundary. On any exit, close the live handle so
+	// the process does not leak the rocksdb instance (boundary handoff already
+	// closed every prior chunk's DB). On the clean-shutdown and crash paths there
+	// is no live writer racing this close; on an error path the loop has stopped.
+	defer func() {
+		if hotDB != nil {
+			if cerr := hotDB.Close(); cerr != nil && err == nil {
+				err = fmt.Errorf("streaming: close live hot DB: %w", cerr)
+			}
+		}
+	}()
+
+	// The resume point is the live chunk's next un-committed ledger: one past the
+	// DB's authoritative watermark, or the chunk's first ledger on an empty resume
+	// DB. Re-derived here (not kept as a progress variable) so a duplicate
+	// already-committed ledger from the source is the idempotent retry the hot
+	// stores tolerate.
+	resume, err := nextIngestLedger(hotDB)
+	if err != nil {
+		return fmt.Errorf("streaming: derive resume ledger: %w", err)
+	}
+
+	// Indexed poll from the resume ledger. GetLedger blocks until ledger seq is
+	// available; a returned error (ctx-cancelled or otherwise) ends the loop and
+	// the daemon top level classifies it.
+	for seq := resume; ; seq++ {
+		lcm, gerr := core.GetLedger(ctx, seq)
+		if gerr != nil {
+			return fmt.Errorf("streaming: get ledger %d: %w", seq, gerr)
+		}
+
+		// One atomic, synced WriteBatch — a ledger is either fully in the hot DB
+		// or absent. The batch IS the durability boundary; no progress variable
+		// is kept.
+		if _, ierr := hotDB.IngestLedger(seq, lcm, ingestTypes); ierr != nil {
+			return fmt.Errorf("streaming: ingest ledger %d: %w", seq, ierr)
+		}
+
+		// Per-ledger liveness signal: the batch is durably synced, so seq is now
+		// the highest committed ledger. This is the daemon's moving steady-state
+		// health gauge — a wedged or slow ingester is detectable between chunk
+		// boundaries, which the watermark gauge (refreshed only on a boundary
+		// tick) cannot show. No network tip is available here, so the loop does
+		// NOT touch IngestionLag (a catch-up-only signal).
+		metrics.LastCommitted(seq)
+
+		// Chunk boundary: this seq is the chunk's last ledger.
+		if seq == chunk.IDFromLedger(seq).LastLedger() {
+			closed := chunk.IDFromLedger(seq)
+			next := closed + 1
+			// Close the write handle BEFORE creating the next chunk's hot key.
+			// The moment that key exists, a tick's derivation classifies THIS
+			// chunk as complete and may freeze and discard its hot DB, and no
+			// writer may hold it then.
+			if cerr := hotDB.Close(); cerr != nil {
+				hotDB = nil // closed (failed) — do not double-close in defer
+				return fmt.Errorf("streaming: close hot DB at boundary chunk %s: %w", closed, cerr)
+			}
+			hotDB = nil // released; reopen below republishes it for the defer
+
+			nextDB, oerr := openHotTierForChunk(cat, next, logger)
+			if oerr != nil {
+				return fmt.Errorf("streaming: open hot DB for chunk %s at boundary: %w", next, oerr)
+			}
+			hotDB = nextDB
+			// Creating chunk next's key (inside openHotTierForChunk) moved the
+			// partition; only now notify the lifecycle of the completed chunk.
+			notify(closed)
+
+			// Phase-boundary observability: the just-filled chunk is now visibly
+			// complete, the next chunk's DB is open. Count the handoff and log the
+			// boundary (the lifecycle tick the notify just woke will report the
+			// freeze/discard/prune of this chunk).
+			metrics.ChunkBoundary(uint32(closed))
+			logger.WithField("closed_chunk", closed.String()).
+				WithField("next_chunk", next.String()).
+				WithField("last_ledger", seq).
+				Info("streaming: ingestion chunk boundary — handed off to lifecycle")
+		}
+	}
+}
+
+// nextIngestLedger derives where polling starts for a just-opened live hot DB.
+// With a committed watermark it is the ledger right after it; with an empty DB
+// it is the first ledger of the chunk the DB is bound to. Nothing caches the
+// answer — the poll's start always comes from durable state, and a source that
+// re-delivers an already-committed ledger is the idempotent retry the hot
+// stores tolerate.
+func nextIngestLedger(db *hotchunk.DB) (uint32, error) {
+	watermark, committed, err := db.MaxCommittedSeq()
+	switch {
+	case err != nil:
+		return 0, err
+	case committed:
+		return watermark + 1, nil
+	}
+	return db.ChunkID().FirstLedger(), nil
+}
+
+// parentDir returns filepath.Dir(dir) — the directory whose entry for dir the
+// hot-tier create/discard barriers fsync, making that create or removal durable.
+func parentDir(dir string) string { return filepath.Dir(dir) }
diff --git a/cmd/stellar-rpc/internal/fullhistory/streaming/ingest_test.go b/cmd/stellar-rpc/internal/fullhistory/streaming/ingest_test.go
new file mode 100644
index 000000000..81c281c28
--- /dev/null
+++ b/cmd/stellar-rpc/internal/fullhistory/streaming/ingest_test.go
@@ -0,0 +1,442 @@
+package streaming
+
+import (
+	"context"
+	"errors"
+	"os"
+	"sync/atomic"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+
+	"github.com/stellar/go-stellar-sdk/xdr"
+
+	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/pkg/chunk"
+	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/pkg/stores/hotchunk"
+	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/pkg/stores/ledger"
+)
+
+// ledgerEntry returns the ledgers-CF entry for seq carrying a real zero-tx LCM,
+// i.e. bytes the cold pipeline can re-read if the chunk later freezes from hot.
+func ledgerEntry(t *testing.T, seq uint32) ledger.Entry {
+	t.Helper()
+	payload := zeroTxLCMBytes(t, seq)
+	return ledger.Entry{Seq: seq, Bytes: payload}
+}
+
+// ---------------------------------------------------------------------------
+// fakeLedgerGetter — an injectable LedgerGetter the ingestion loop polls by
+// sequence (the design's indexed core.GetLedger(ctx, seq)). For seqs it has a
+// programmed frame it returns those bytes; once the poll runs past the last
+// programmed seq it either blocks until ctx is cancelled (a live tip stream that
+// only ends on shutdown) or returns endErr (a crashed backend). It records the
+// FIRST seq it was asked for (the restart resume point) and the GetLedger call
+// count.
+// ---------------------------------------------------------------------------
+
+type fakeLedgerGetter struct {
+	frames     map[uint32][]byte // seq -> raw LCM bytes
+	maxSeq     uint32            // highest programmed seq
+	blockOnCtx bool              // past the last frame, block until ctx.Done
+	endErr     error             // past the last frame, return this (when not blocking)
+	yieldErrAt uint32            // if non-zero, return errAt at this seq instead of bytes
+	errAt      error             // the error returned at yieldErrAt
+
+	calls     atomic.Int32  // number of GetLedger calls so far
+	firstSeen atomic.Uint32 // seq passed to the first GetLedger call
+	sawFirst  atomic.Bool   // guards the one-time firstSeen store
+}
+
+var _ LedgerGetter = (*fakeLedgerGetter)(nil)
+
+func (g *fakeLedgerGetter) GetLedger(ctx context.Context, seq uint32) (xdr.LedgerCloseMetaView, error) {
+	g.calls.Add(1)
+	if first := g.sawFirst.CompareAndSwap(false, true); first {
+		g.firstSeen.Store(seq)
+	}
+	if cerr := ctx.Err(); cerr != nil {
+		return nil, cerr
+	}
+	if seq == g.yieldErrAt && g.yieldErrAt != 0 {
+		return nil, g.errAt
+	}
+	raw, programmed := g.frames[seq]
+	switch {
+	case programmed:
+		return xdr.LedgerCloseMetaView(raw), nil
+	case g.blockOnCtx: // past the programmed frames: park until ctx is done
+		<-ctx.Done()
+		return nil, ctx.Err()
+	case g.endErr != nil: // past the programmed frames: simulated backend failure
+		return nil, g.endErr
+	default:
+		return nil, errors.New("fakeLedgerGetter: no frame for seq")
+	}
+}
+
+// getterForSeqs builds a fakeLedgerGetter with one zero-tx LCM frame per seq in [from,to].
+func getterForSeqs(t *testing.T, from, to uint32) *fakeLedgerGetter {
+	t.Helper()
+	frames := map[uint32][]byte{}
+	for seq := from; seq <= to; seq++ {
+		frames[seq] = zeroTxLCMBytes(t, seq)
+	}
+	return &fakeLedgerGetter{frames: frames, maxSeq: to}
+}
+
+// openLiveHotDB opens (and brackets ready) the live hot DB for chunk c under
+// cat via the production opener, failing the test on error; returns the handle.
+func openLiveHotDB(t *testing.T, cat *Catalog, c chunk.ID) *hotchunk.DB {
+	t.Helper()
+	db, err := openHotTierForChunk(cat, c, silentLogger())
+	require.NoError(t, err)
+	return db
+}
+
+// seedWatermark commits one ledgers-CF entry at seq into chunk c's hot DB so
+// the indexed poll resumes at seq+1 — a boundary test can then drive the loop
+// over the last ledger or two of a chunk rather than all 10,000. It returns the
+// re-opened, ready live handle, which the loop takes ownership of.
+func seedWatermark(t *testing.T, cat *Catalog, c chunk.ID, seq uint32) *hotchunk.DB {
+	t.Helper()
+	seed := openLiveHotDB(t, cat, c)
+	require.NoError(t, seed.Ledgers().AddLedgers(ledgerEntry(t, seq)))
+	require.NoError(t, seed.Close())
+	live, err := openHotTierForChunk(cat, c, silentLogger())
+	require.NoError(t, err)
+	return live
+}
+
+// drainLifecycle collects every chunk id already buffered in the lifecycle
+// channel, without blocking; call it after the loop returned so no send races.
+func drainLifecycle(ch chan chunk.ID) []chunk.ID {
+	var got []chunk.ID
+	for {
+		select {
+		case c := <-ch:
+			got = append(got, c)
+		default:
+			return got
+		}
+	}
+}
+
+// ---------------------------------------------------------------------------
+// openHotTierForChunk / discardHotTierForChunk — the bracket.
+// ---------------------------------------------------------------------------
+
+// TestOpenHotTier_CreatesBracketAndDir: opening a chunk with no prior state
+// creates its dir, leaves the key "ready", and yields an empty DB.
+func TestOpenHotTier_CreatesBracketAndDir(t *testing.T) {
+	cat, _ := testCatalog(t)
+	id := chunk.ID(3)
+
+	hot, err := openHotTierForChunk(cat, id, silentLogger())
+	require.NoError(t, err)
+	t.Cleanup(func() { _ = hot.Close() })
+
+	got, err := cat.HotState(id)
+	require.NoError(t, err)
+	assert.Equal(t, HotReady, got, "open flips the key ready")
+
+	_, err = os.Stat(cat.layout.HotChunkPath(id))
+	require.NoError(t, err, "the dir exists")
+
+	next, err := nextIngestLedger(hot)
+	require.NoError(t, err)
+	assert.Equal(t, id.FirstLedger(), next, "an empty resume DB resumes at the chunk's first ledger")
+}
+
+// TestOpenHotTier_ReadyButDirMissingIsCase4 pins the case-4 fatal: when the key
+// says "ready" but the dir is absent, open reports hot-volume loss, no healing.
+func TestOpenHotTier_ReadyButDirMissingIsCase4(t *testing.T) {
+	cat, _ := testCatalog(t)
+	id := chunk.ID(5)
+	require.NoError(t, cat.PutHotTransient(id))
+	require.NoError(t, cat.FlipHotReady(id)) // ready key, but no dir was ever created
+
+	_, openErr := openHotTierForChunk(cat, id, silentLogger())
+	require.Error(t, openErr)
+	require.ErrorIs(t, openErr, ErrHotVolumeLost)
+}
+
+// TestOpenHotTier_TransientRecreatesFresh: a key left "transient" (as a crashed
+// create/discard would) is recovered by open, which ends with the key "ready".
+func TestOpenHotTier_TransientRecreatesFresh(t *testing.T) {
+	cat, _ := testCatalog(t)
+	id := chunk.ID(2)
+	require.NoError(t, cat.PutHotTransient(id)) // simulate the crash residue
+
+	hot, err := openHotTierForChunk(cat, id, silentLogger())
+	require.NoError(t, err)
+	t.Cleanup(func() { _ = hot.Close() })
+
+	got, err := cat.HotState(id)
+	require.NoError(t, err)
+	assert.Equal(t, HotReady, got)
+}
+
+// TestDiscardHotTier_RemovesDirAndKey: discarding a closed hot DB deletes both
+// its key and its dir, and discarding again afterwards is a harmless no-op.
+func TestDiscardHotTier_RemovesDirAndKey(t *testing.T) {
+	cat, _ := testCatalog(t)
+	id := chunk.ID(4)
+	hot := openLiveHotDB(t, cat, id)
+	require.NoError(t, hot.Close())
+
+	require.NoError(t, discardHotTierForChunk(cat, id))
+
+	keyPresent, err := cat.Has(hotChunkKey(id))
+	require.NoError(t, err)
+	assert.False(t, keyPresent, "the hot key is deleted")
+	_, statErr := os.Stat(cat.layout.HotChunkPath(id))
+	assert.True(t, os.IsNotExist(statErr), "the dir is removed")
+
+	require.NoError(t, discardHotTierForChunk(cat, id), "second discard is a no-op")
+}
+
+// ---------------------------------------------------------------------------
+// runIngestionLoop — atomic landing.
+// ---------------------------------------------------------------------------
+
+// TestRunIngestionLoop_LedgerLandsInLedgerCF: the loop polls a three-ledger
+// prefix of chunk 0, committing each ledger to the ledger CF, then returns the
+// getter's "backend crashed" error. Reopened afterwards, the DB's watermark is
+// the last committed seq and that ledger's raw bytes are readable.
+func TestRunIngestionLoop_LedgerLandsInLedgerCF(t *testing.T) {
+	cat, _ := testCatalog(t)
+	id := chunk.ID(0)
+	lo := id.FirstLedger()
+	hi := lo + 2
+
+	// Frames for [lo,hi] only; the poll for hi+1 finds none and gets endErr.
+	src := getterForSeqs(t, lo, hi)
+	src.endErr = errors.New("backend crashed")
+	ch := make(chan chunk.ID, lifecycleQueueDepth)
+	live := openLiveHotDB(t, cat, id)
+
+	runErr := runIngestionLoop(context.Background(), src, live, cat, ch, allHotTypes, silentLogger(), nil)
+	require.Error(t, runErr, "poll ran past the prefix and the getter errored")
+	require.NotErrorIs(t, runErr, ErrHotVolumeLost)
+
+	// The loop closed its handle on exit, so the dir can be opened again here.
+	reopened, err := hotchunk.Open(cat.layout.HotChunkPath(id), id, silentLogger())
+	require.NoError(t, err)
+	t.Cleanup(func() { _ = reopened.Close() })
+
+	watermark, committed, err := reopened.MaxCommittedSeq()
+	require.NoError(t, err)
+	require.True(t, committed)
+	assert.Equal(t, hi, watermark, "the single watermark is the last committed seq")
+
+	stored, err := reopened.Ledgers().GetLedgerRaw(hi)
+	require.NoError(t, err)
+	assert.NotEmpty(t, stored)
+}
+
+// ---------------------------------------------------------------------------
+// runIngestionLoop — boundary handoff: close BEFORE creating C+1's key.
+// ---------------------------------------------------------------------------
+
+// TestRunIngestionLoop_BoundaryClosesBeforeNextKey asserts the load-bearing
+// handoff order: at the chunk boundary the just-filled DB is CLOSED before the
+// next chunk's hot:chunk key is created. The beforeHotTransient hook fires at
+// the exact instant the next key appears; at that moment the predecessor's DB
+// directory must be reopenable (its RocksDB LOCK released = it is closed).
+func TestRunIngestionLoop_BoundaryClosesBeforeNextKey(t *testing.T) {
+	cat, _ := testCatalog(t)
+	c := chunk.ID(0)
+	last := c.LastLedger() // boundary ledger
+	next := c + 1
+
+	// Seed the watermark just below the boundary so the poll resumes at last and
+	// crosses the boundary in one step (instead of ingesting all 10,000 ledgers).
+	db := seedWatermark(t, cat, c, last-1)
+
+	var (
+		hookFired   atomic.Bool
+		closedFirst atomic.Bool
+	)
+	cat.hooks.beforeHotTransient = func(id chunk.ID) {
+		if id != next {
+			return // ignore the live chunk's own (already-done) bracket
+		}
+		hookFired.Store(true)
+		probe, openErr := hotchunk.Open(cat.layout.HotChunkPath(c), c, silentLogger())
+		if openErr == nil {
+			closedFirst.Store(true)
+			_ = probe.Close()
+		}
+	}
+
+	// Ledgers CF only — fast, and the boundary detection is seq-based. Poll
+	// the chunk's true last ledger (boundary 0->1), then the first ledger of the
+	// next chunk, then the getter errs.
+	ingestTypes := hotchunk.Ingest{Ledgers: true}
+	getter := &fakeLedgerGetter{frames: map[uint32][]byte{
+		last:               zeroTxLCMBytes(t, last),
+		next.FirstLedger(): zeroTxLCMBytes(t, next.FirstLedger()),
+	}, endErr: errors.New("end")}
+	ch := make(chan chunk.ID, lifecycleQueueDepth)
+
+	err := runIngestionLoop(context.Background(), getter, db, cat, ch, ingestTypes, silentLogger(), nil)
+	require.Error(t, err, "poll ran past the frames and the getter errored")
+
+	require.True(t, hookFired.Load(), "the next chunk's key was created")
+	require.True(t, closedFirst.Load(),
+		"the predecessor's DB was CLOSED before the next chunk's key was created")
+
+	state, err := cat.HotState(next)
+	require.NoError(t, err)
+	assert.Equal(t, HotReady, state)
+
+	// The boundary sent the just-completed chunk id (chunk 0) to the lifecycle.
+	sent := drainLifecycle(ch)
+	require.Contains(t, sent, c, "the boundary notified the lifecycle of the closed chunk")
+}
+
+// ---------------------------------------------------------------------------
+// runIngestionLoop — boundary notifications carry the completed chunk id.
+// ---------------------------------------------------------------------------
+
+// TestRunIngestionLoop_BoundaryNotifiesCompletedChunk: crossing the chunk 0 -> 1
+// boundary sends chunk 0 into the buffered lifecycle channel. The watermark is
+// seeded just below the boundary so the poll crosses it in one step. The buffer
+// is far above the at-most-one a healthy daemon holds, so it never blocks the
+// loop.
+func TestRunIngestionLoop_BoundaryNotifiesCompletedChunk(t *testing.T) {
+	cat, _ := testCatalog(t)
+	c := chunk.ID(0)
+	c1 := c + 1
+	db := seedWatermark(t, cat, c, c.LastLedger()-1)
+
+	ingestTypes := hotchunk.Ingest{Ledgers: true}
+	getter := &fakeLedgerGetter{frames: map[uint32][]byte{
+		c.LastLedger():   zeroTxLCMBytes(t, c.LastLedger()),   // boundary 0->1
+		c1.FirstLedger(): zeroTxLCMBytes(t, c1.FirstLedger()), // a ledger in chunk 1
+	}, endErr: errors.New("end")}
+	ch := make(chan chunk.ID, lifecycleQueueDepth)
+
+	done := make(chan error, 1)
+	go func() {
+		done <- runIngestionLoop(context.Background(), getter, db, cat, ch, ingestTypes, silentLogger(), nil)
+	}()
+
+	select {
+	case err := <-done:
+		require.Error(t, err, "poll ran dry")
+	case <-time.After(10 * time.Second):
+		t.Fatal("ingestion loop deadlocked")
+	}
+
+	sent := drainLifecycle(ch)
+	assert.Equal(t, []chunk.ID{c}, sent, "the completed chunk id was sent at the boundary")
+}
+
+// ---------------------------------------------------------------------------
+// runIngestionLoop — clean shutdown vs crash (classified at the daemon top
+// level: ctx-cancelled return is clean, any other error is restartable).
+// ---------------------------------------------------------------------------
+
+// TestRunIngestionLoop_CtxCancelReturnsCtxErr: a ctx cancellation while the poll
+// is blocking on the tip makes GetLedger return ctx.Err(); the loop returns that
+// (the daemon top level classifies a ctx-cancelled return as a clean shutdown).
+func TestRunIngestionLoop_CtxCancelReturnsCtxErr(t *testing.T) {
+	cat, _ := testCatalog(t)
+	c := chunk.ID(0)
+	first := c.FirstLedger()
+	db := openLiveHotDB(t, cat, c)
+
+	getter := getterForSeqs(t, first, first+1)
+	getter.blockOnCtx = true // after the frames, behave like a live tip stream
+	ch := make(chan chunk.ID, lifecycleQueueDepth)
+	ctx, cancel := context.WithCancel(context.Background())
+
+	done := make(chan error, 1)
+	go func() {
+		done <- runIngestionLoop(ctx, getter, db, cat, ch, allHotTypes, silentLogger(), nil)
+	}()
+
+	require.Eventually(t, func() bool {
+		return getter.calls.Load() >= 3 // ingested 2 frames, blocked on the 3rd
+	}, 5*time.Second, 5*time.Millisecond)
+	cancel()
+
+	select {
+	case err := <-done:
+		require.Error(t, err)
+		require.ErrorIs(t, err, context.Canceled, "the loop surfaces the ctx-cancelled GetLedger error")
+	case <-time.After(10 * time.Second):
+		t.Fatal("ingestion loop did not stop on ctx cancellation")
+	}
+}
+
+// TestRunIngestionLoop_GetLedgerErrorReturnsError: a GetLedger error (not a
+// shutdown) propagates as a restartable failure.
+func TestRunIngestionLoop_GetLedgerErrorReturnsError(t *testing.T) {
+	cat, _ := testCatalog(t)
+	c := chunk.ID(0)
+	first := c.FirstLedger()
+	db := openLiveHotDB(t, cat, c)
+
+	boom := errors.New("backend exploded")
+	getter := getterForSeqs(t, first, first)
+	getter.yieldErrAt = first + 1
+	getter.errAt = boom
+	ch := make(chan chunk.ID, lifecycleQueueDepth)
+
+	err := runIngestionLoop(context.Background(), getter, db, cat, ch, allHotTypes, silentLogger(), nil)
+	require.Error(t, err)
+	require.ErrorIs(t, err, boom)
+	require.NotErrorIs(t, err, ErrHotVolumeLost)
+}
+
+// ---------------------------------------------------------------------------
+// runIngestionLoop — restart resumes idempotently from the derived watermark.
+// ---------------------------------------------------------------------------
+
+// TestRunIngestionLoop_RestartResumesFromWatermark: after a first run commits a
+// prefix and exits, a second run over a FRESH open of the SAME hot dir resumes
+// at watermark+1 (asserted via the FIRST seq the getter is asked for). The
+// second getter is built from the already-committed seq onward, yet the loop's
+// first request skips it — the final watermark is the last delivered seq.
+func TestRunIngestionLoop_RestartResumesFromWatermark(t *testing.T) {
+	cat, _ := testCatalog(t)
+	c := chunk.ID(0)
+	first := c.FirstLedger()
+
+	// First run: commit [first, first+2], then the getter errs.
+	db1 := openLiveHotDB(t, cat, c)
+	getter1 := getterForSeqs(t, first, first+2)
+	getter1.endErr = errors.New("end")
+	ch := make(chan chunk.ID, lifecycleQueueDepth)
+	err := runIngestionLoop(context.Background(), getter1, db1, cat, ch, allHotTypes, silentLogger(), nil)
+	require.Error(t, err)
+	assert.Equal(t, first, getter1.firstSeen.Load(), "first run resumed at the chunk's first ledger")
+
+	// Restart: re-open the live DB the way startup would. The resume point must
+	// be watermark+1.
+	db2, err := openHotTierForChunk(cat, c, silentLogger())
+	require.NoError(t, err)
+	resume, err := nextIngestLedger(db2)
+	require.NoError(t, err)
+	assert.Equal(t, first+3, resume, "restart resumes one past the durable watermark")
+
+	// Second run: the getter is built from the last already-committed ledger
+	// (first+2), but the loop starts at first+3 and ingests three new ones.
+	getter2 := getterForSeqs(t, first+2, first+5)
+	getter2.endErr = errors.New("end")
+	err = runIngestionLoop(context.Background(), getter2, db2, cat, ch, allHotTypes, silentLogger(), nil)
+	require.Error(t, err)
+	assert.Equal(t, first+3, getter2.firstSeen.Load(), "second run resumed at watermark+1")
+
+	reopened, err := hotchunk.Open(cat.layout.HotChunkPath(c), c, silentLogger())
+	require.NoError(t, err)
+	t.Cleanup(func() { _ = reopened.Close() })
+	maxSeq, ok, err := reopened.MaxCommittedSeq()
+	require.NoError(t, err)
+	require.True(t, ok)
+	assert.Equal(t, first+5, maxSeq)
+}
diff --git a/cmd/stellar-rpc/internal/fullhistory/streaming/lifecycle.go b/cmd/stellar-rpc/internal/fullhistory/streaming/lifecycle.go
new file mode 100644
index 000000000..1fefc9ba2
--- /dev/null
+++ b/cmd/stellar-rpc/internal/fullhistory/streaming/lifecycle.go
@@ -0,0 +1,389 @@
+package streaming
+
+import (
+	"context"
+	"log"
+	"time"
+
+	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/pkg/chunk"
+)
+
+// The lifecycle goroutine runs one tick per notification (sent by the ingestion
+// loop at start — the startup seed — and at every chunk boundary, carrying the
+// just-completed chunk id), in three stages:
+//
+//  1. plan-and-execute — the SAME resolve + executePlan catch-up uses, over
+//     [floor, lastChunk]. This is where a just-closed chunk freezes (from its hot
+//     DB via backfillSource's hot branch). lastChunk is the id ingestion handed
+//     over — "how far to go"; what to build, discard, and prune is read from the
+//     catalog.
+//  2. discard scan — retire hot DBs the cold artifacts now fully serve (or that
+//     fell past retention).
+//  3. prune scan — sweep demoted and past-retention files, both key families.
+//
+// The retention floor plays two roles with OPPOSITE safe directions, kept
+// separate (design "Lifecycle"):
+//
+//   - As a RETENTION boundary (the prune scan, the reader gate) erring low is
+//     harmless — an extra chunk lingers briefly, or a read lands on already-
+//     pruned data and returns not-found via the reader's missing-file rule.
+//   - As a PRODUCTION boundary erring low is DANGEROUS — planning a build below
+//     existing storage demands chunks from a bulk source nobody validated it can
+//     produce. So the tick's plan range never starts below existing storage:
+//     start is RAISED to lowestMaterializedChunk when the floor sits lower.
+//     Extending the bottom of storage (retention widening) is exclusively catch-
+//     up's job; producibility is enforced lazily there, per chunk, by the cold
+//     ingest during the build (no pre-flight gate).
+//
+// The two goroutines (ingestion, lifecycle) share NO state: the tick is a pure
+// function of the catalog, deriving everything from durable keys on every run.
+
+// LifecycleConfig is the dependency bundle the lifecycle tick and loop read. It
+// COMPOSES the scheduler's ExecConfig (resolve/executePlan share one set of
+// postconditions and one worker pool with catch-up) and adds the retention knob
+// plus an injectable fatal sink.
+//
+// RetentionChunks is the sliding-floor width (0 means "fixed earliest-ledger
+// floor only", no sliding retention). Fatalf is the abort sink for the error
+// policy: a tick whose executePlan fails (retries exhausted) aborts the daemon,
+// because startup is the recovery path. Production wires log.Fatalf via
+// WithLifecycleDefaults; tests inject a recorder so an abort is observable
+// without killing the test process.
+type LifecycleConfig struct {
+	ExecConfig
+
+	// RetentionChunks bounds the sliding retention floor's width. 0 disables the
+	// sliding floor (the fixed earliest-ledger floor alone applies).
+	RetentionChunks uint32
+
+	// Fatalf aborts the daemon on a tick op failure (the error policy). nil in a
+	// caller's literal; WithLifecycleDefaults fills log.Fatalf. Tests override it.
+	Fatalf func(format string, args ...any)
+}
+
+// WithLifecycleDefaults returns a copy with ExecConfig defaults applied and
+// Fatalf defaulted to log.Fatalf when unset. The daemon calls this once at
+// startup before launching the loop.
+func (cfg LifecycleConfig) WithLifecycleDefaults() LifecycleConfig {
+	cfg.ExecConfig = cfg.ExecConfig.WithDefaults()
+	if cfg.Fatalf == nil {
+		cfg.Fatalf = log.Fatalf
+	}
+	return cfg
+}
+
+// effectiveRetentionFloor is the lower bound of the retention window, chunk-
+// aligned: the first ledger of the lowest in-scope chunk. It combines the
+// sliding retention floor (lastCompleteChunkAt(upperBound) - retentionChunks +
+// 1, when retentionChunks > 0) with the fixed earliest-ledger floor, taking the
+// HIGHER of the two.
+//
+// upperBound is ingestion's progress (completeThrough at runtime; the catch-up
+// loop passes max(network tip, derived watermark)). The signed slidingChunk
+// math is the underflow guard: a young store or a large retentionChunks drives
+// slidingChunk negative, which max(..., 0) clamps to chunk 0 before mapping to
+// its first ledger — never a uint32 wrap to MaxUint32.
+func effectiveRetentionFloor(upperBound, retentionChunks, earliest uint32) uint32 {
+	sliding := uint32(chunk.FirstLedgerSeq) // GenesisLedger
+	if retentionChunks > 0 {
+		slidingChunk := lastCompleteChunkAt(upperBound) - int64(retentionChunks) + 1
+		sliding = chunkFirstLedger(max(slidingChunk, 0))
+	}
+	return max(sliding, earliest)
+}
+
+// lastCompleteChunkAt is the inverse of chunk.ID.LastLedger: the largest chunk
+// whose last ledger is <= ledger, as a SIGNED int64 so a sub-genesis ledger
+// (the watermark sentinel) maps to -1 ("before the first chunk") rather than
+// wrapping. E.g. lastCompleteChunkAt(chunk 0's last ledger) == 0; a ledger
+// below the first chunk's last ledger yields -1.
+//
+// The cast-before-arithmetic keeps the whole computation in int64: ledger is
+// uint32, so (ledger + 1 - FirstLedgerSeq) would wrap for a ledger below
+// FirstLedgerSeq - 1; the int64 form does not. The result is the last chunk
+// fully at or below ledger, NOT the chunk containing it: one ledger past chunk
+// 0's last ledger is already inside chunk 1 yet still yields 0. The formula is
+// derived directly from the chunk geometry (see the body comment) rather than
+// from a containing-chunk shortcut such as (ledger - FirstLedgerSeq)/L.
+func lastCompleteChunkAt(ledger uint32) int64 {
+	// chunk c's last ledger is (c+1)*L + FirstLedgerSeq - 1, so the largest c
+	// with that value <= ledger is (ledger + 1 - FirstLedgerSeq)/L - 1. Go's `/`
+	// truncates toward zero, so a slightly negative numerator (ledger 0) gives
+	// quotient 0, not -1: every ledger below chunk 0's last ledger yields -1.
+	return (int64(ledger)+1-int64(chunk.FirstLedgerSeq))/int64(chunk.LedgersPerChunk) - 1
+}
+
+// chunkFirstLedger maps a non-negative signed chunk index to its first ledger.
+// It is the signed-domain companion of chunk.ID.FirstLedger used by
+// effectiveRetentionFloor after the max(..., 0) clamp.
+func chunkFirstLedger(c int64) uint32 {
+	return chunk.ID(c).FirstLedger() //nolint:gosec // c >= 0 (clamped) and bounded by real chunk ids
+}
+
+// chunkIDOfLedger maps a ledger to its chunk, signed so a ledger below
+// chunk.FirstLedgerSeq yields -1 ("before the first chunk") instead of
+// panicking like chunk.IDFromLedger. runLifecycleTick feeds it the effective
+// retention floor to get the signed start of the plan range.
+func chunkIDOfLedger(ledger uint32) int64 {
+	if ledger < chunk.FirstLedgerSeq {
+		return -1
+	}
+	return int64(chunk.IDFromLedger(ledger))
+}
+
+// lastCompleteChunkAtID is lastCompleteChunkAt mapped to a chunk.ID for the
+// resolver's rangeEnd. ok=false (with id 0) means lastCompleteChunkAt was
+// negative: no complete chunk exists at or below ledger. Callers must check ok
+// before using the id, since the 0 returned then is a placeholder, not chunk 0.
+func lastCompleteChunkAtID(ledger uint32) (chunk.ID, bool) {
+	c := lastCompleteChunkAt(ledger)
+	if c < 0 {
+		return 0, false
+	}
+	return chunk.ID(c), true //nolint:gosec // c >= 0
+}
+
+// lowestMaterializedChunk is the lowest chunk holding ANY chunk:* artifact key
+// or hot:chunk key — the bottom of existing storage. ok=false on an empty
+// catalog (a first frontfill tick, where resolve's inverted-range guard makes
+// the tick a no-op anyway). It is the production-boundary anchor: the tick's
+// plan never starts below it.
+func lowestMaterializedChunk(cat *Catalog) (chunk.ID, bool, error) {
+	lowest := chunk.ID(0)
+	found := false
+	note := func(c chunk.ID) {
+		if !found || c < lowest {
+			lowest, found = c, true
+		}
+	}
+
+	refs, err := cat.ChunkArtifactKeys()
+	if err != nil {
+		return 0, false, err
+	}
+	for _, ref := range refs {
+		note(ref.Chunk)
+	}
+
+	hot, err := cat.HotChunkKeys()
+	if err != nil {
+		return 0, false, err
+	}
+	for _, c := range hot {
+		note(c)
+	}
+	return lowest, found, nil
+}
+
+// runLifecycleTick runs ONE tick for the just-completed chunk lastChunk that
+// ingestion handed over. through is derived from lastChunk (its last ledger), so
+// every stage sees the same snapshot and a boundary committing mid-tick can't
+// make one stage contradict another (the new chunk is simply next tick's work).
+// The three stages run in order.
+//
+// lastChunk is the unit of "how far to go": the plan range is [floor, lastChunk]
+// (start raised to existing storage), and the discard/prune scans key off
+// through = lastChunk.LastLedger(). What to build/discard/prune is read from the
+// catalog, not from lastChunk.
+//
+// CLEAN-SHUTDOWN (binding): if executePlan returns an error AND ctx was
+// cancelled, the tick returns WITHOUT calling Fatalf — cancellation is a
+// shutdown request, never an op failure. Only a genuine failure (ctx still
+// live) aborts the daemon via Fatalf, per the error policy.
+func runLifecycleTick(ctx context.Context, cfg LifecycleConfig, cat *Catalog, lastChunk chunk.ID) {
+	metrics := cfg.metrics()
+	logger := cfg.Logger
+
+	// through is the last ledger of the chunk ingestion handed over — the one
+	// snapshot every stage shares.
+	through := lastChunk.LastLedger()
+
+	earliest, _, err := cat.EarliestLedger()
+	if err != nil {
+		if ctx.Err() != nil {
+			return
+		}
+		cfg.Fatalf("streaming: lifecycle tick: read earliest ledger: %v", err)
+		return
+	}
+	floor := effectiveRetentionFloor(through, cfg.RetentionChunks, earliest)
+
+	// Progress gauges, refreshed every tick from the snapshot: the derived
+	// watermark (completeThrough) and the effective retention floor.
+	metrics.Watermark(through, floor)
+	if logger != nil {
+		logger.WithField("through", through).
+			WithField("floor", floor).
+			Debug("streaming: lifecycle tick — derived snapshot")
+	}
+
+	// Plan range start = chunkID(floor), RAISED to lowestMaterializedChunk when
+	// that is higher — the production-boundary rule (never plan below existing
+	// storage; extending the bottom is catch-up's job).
+	start := chunkIDOfLedger(floor)
+	low, hasLow, err := lowestMaterializedChunk(cat)
+	if err != nil {
+		if ctx.Err() != nil {
+			return
+		}
+		cfg.Fatalf("streaming: lifecycle tick: lowest materialized chunk: %v", err)
+		return
+	}
+	if hasLow && int64(low) > start {
+		start = int64(low)
+	}
+
+	// Stage 1 — plan-and-execute (the freeze). Timed and counted as one phase;
+	// the plan's size is the chunk build count (0 when there is no producible
+	// range, still reported so the empty-tick rate is visible).
+	//
+	// rangeEnd is the just-completed chunk ingestion handed over (lastChunk), but
+	// CLAMPED to the highest chunk that is actually complete in durable storage:
+	// the production stage must never target the live or a not-yet-complete chunk
+	// (its hot DB is held open by ingestion, and freezing it would race a live
+	// writer — and on a young network nothing is complete at all). In the running
+	// daemon lastChunk IS that highest-complete chunk, so the clamp is a no-op
+	// there; it only bites on the seed/young-network/recovery edges. A negative
+	// result (no complete chunk) makes the range empty — production is skipped,
+	// while the discard and prune scans below still run.
+	freezeStart := time.Now()
+	var chunkBuilds int
+	durableThrough, derr := lastCommittedLedger(cat, nil) // chunk-granularity, no hot DB read
+	if derr != nil {
+		if ctx.Err() != nil {
+			return
+		}
+		cfg.Fatalf("streaming: lifecycle tick: derive durable through: %v", derr)
+		return
+	}
+	highestComplete, haveComplete := lastCompleteChunkAtID(durableThrough)
+	rangeEnd := lastChunk
+	if haveComplete && highestComplete < rangeEnd {
+		rangeEnd = highestComplete
+	}
+	if haveComplete && start >= 0 && start <= int64(rangeEnd) {
+		plan, perr := resolve(cfg.ExecConfig, chunk.ID(start), rangeEnd) //nolint:gosec // start >= 0
+		if perr != nil {
+			if ctx.Err() != nil {
+				return
+			}
+			cfg.Fatalf("streaming: lifecycle tick: resolve [%d,%s]: %v", start, rangeEnd, perr)
+			return
+		}
+		chunkBuilds = len(plan.ChunkBuilds)
+		if eerr := executePlan(ctx, plan, cfg.ExecConfig); eerr != nil {
+			// CLEAN-SHUTDOWN FIX: a cancelled ctx makes executePlan return ctx.Err()
+			// (every task's slot-acquire/wait observes the errgroup cancel). That is
+			// a shutdown, NOT an op failure — return before any Fatalf.
+			if ctx.Err() != nil {
+				return
+			}
+			cfg.Fatalf("streaming: lifecycle tick: %v", eerr)
+			return
+		}
+	}
+	// else: no complete chunk in range (young network / empty store) — skip
+	// production. The discard and prune scans still run: a past-retention hot DB
+	// or stale key can exist with no producible range.
+	metrics.Freeze(chunkBuilds, time.Since(freezeStart))
+	if logger != nil && chunkBuilds > 0 {
+		logger.WithField("chunk_builds", chunkBuilds).
+			Info("streaming: lifecycle freeze stage complete")
+	}
+
+	// Stage 2 — discard scan.
+	discardStart := time.Now()
+	discardOps, err := eligibleDiscardOps(cfg, cat, through)
+	if err != nil {
+		if ctx.Err() != nil {
+			return
+		}
+		cfg.Fatalf("streaming: lifecycle tick: eligible discard ops: %v", err)
+		return
+	}
+	for _, op := range discardOps {
+		if oerr := op(); oerr != nil {
+			if ctx.Err() != nil {
+				return
+			}
+			cfg.Fatalf("streaming: lifecycle tick: discard op: %v", oerr)
+			return
+		}
+	}
+	metrics.Discard(len(discardOps), time.Since(discardStart))
+	if logger != nil && len(discardOps) > 0 {
+		logger.WithField("discarded", len(discardOps)).Info("streaming: lifecycle discard stage complete")
+	}
+
+	// Live hot-chunk gauge after the discard stage (the live + awaiting-discard set).
+	if hot, herr := cat.HotChunkKeys(); herr == nil {
+		metrics.LiveHotChunks(len(hot))
+	}
+
+	// Stage 3 — prune scan.
+	pruneStart := time.Now()
+	pruneOps, err := eligiblePruneOps(cfg, cat, through)
+	if err != nil {
+		if ctx.Err() != nil {
+			return
+		}
+		cfg.Fatalf("streaming: lifecycle tick: eligible prune ops: %v", err)
+		return
+	}
+	for _, op := range pruneOps {
+		if oerr := op(); oerr != nil {
+			if ctx.Err() != nil {
+				return
+			}
+			cfg.Fatalf("streaming: lifecycle tick: prune op: %v", oerr)
+			return
+		}
+	}
+	metrics.Prune(len(pruneOps), time.Since(pruneStart))
+	if logger != nil && len(pruneOps) > 0 {
+		logger.WithField("pruned", len(pruneOps)).Info("streaming: lifecycle prune stage complete")
+	}
+
+	// Cold-tier footprint gauge after the prune stage (post-deletion size).
+	if bytes, berr := coldTierBytes(cat.layout); berr == nil {
+		metrics.ColdTierBytes(bytes)
+	}
+}
+
+// lifecycleQueueDepth is the lifecycle notification buffer depth — far above the
+// at-most-one boundary a healthy daemon holds in flight. A FULL buffer means
+// freeze has fallen this many boundaries behind ingestion, which is a fatal
+// condition the ingestion-side notify() reports (see runIngestionLoop).
+const lifecycleQueueDepth = 8
+
+// lifecycleLoop is the event-driven lifecycle goroutine. Each notification
+// carries the just-completed chunk id; the loop DRAINS the buffered channel to
+// the most-recent id (one tick covers every chunk queued behind it, since the
+// plan range is [floor, lastChunk] and chunk ids only increase) and runs one
+// tick up to it. It selects on BOTH ctx.Done() (return, clean shutdown) AND the
+// channel — so it never blocks forever and never fatals on shutdown.
+// Notifications come from one source (ingestion: each boundary plus the startup
+// seed); between them the goroutine is idle. NOTE(review): the receives ignore
+// "ok", so a CLOSED ch would spin in the drain — assumes ch is never closed.
+func lifecycleLoop(ctx context.Context, cfg LifecycleConfig, cat *Catalog, ch <-chan chunk.ID) {
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case lastChunk := <-ch:
+			// Drain to the most-recent queued chunk: one tick over [floor, lastChunk]
+			// subsumes every earlier boundary still sitting in the buffer.
+		drain:
+			for {
+				select {
+				case lastChunk = <-ch:
+				case <-ctx.Done():
+					return
+				default:
+					break drain
+				}
+			}
+			runLifecycleTick(ctx, cfg, cat, lastChunk)
+		}
+	}
+}
diff --git a/cmd/stellar-rpc/internal/fullhistory/streaming/lifecycle_test.go b/cmd/stellar-rpc/internal/fullhistory/streaming/lifecycle_test.go
new file mode 100644
index 000000000..f7782db52
--- /dev/null
+++ b/cmd/stellar-rpc/internal/fullhistory/streaming/lifecycle_test.go
@@ -0,0 +1,575 @@
+package streaming
+
+import (
+	"context"
+	"fmt"
+	"sync/atomic"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+
+	"github.com/stellar/go-stellar-sdk/xdr"
+
+	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/pkg/chunk"
+)
+
+// ---------------------------------------------------------------------------
+// Arithmetic: lastCompleteChunkAt, effectiveRetentionFloor.
+// ---------------------------------------------------------------------------
+
+func TestLastCompleteChunkAt(t *testing.T) {
+	tests := []struct {
+		name   string
+		ledger uint32
+		want   int64
+	}{
+		{"below first chunk's last ledger => sentinel -1", chunk.ID(0).LastLedger() - 1, -1},
+		{"genesis sentinel (FirstLedgerSeq-1) => -1", chunk.FirstLedgerSeq - 1, -1},
+		{"ledger 0 does not underflow => -1", 0, -1},
+		{"chunk 0's last ledger => 0", chunk.ID(0).LastLedger(), 0},
+		{"chunk 0's last ledger + 1 (into chunk 1) => still 0", chunk.ID(0).LastLedger() + 1, 0},
+		{"chunk 5's last ledger => 5", chunk.ID(5).LastLedger(), 5},
+		{"the doc's example 10_001 => 0", 10_001, 0},
+	}
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			require.Equal(t, tc.want, lastCompleteChunkAt(tc.ledger))
+		})
+	}
+}
+
+func TestEffectiveRetentionFloor(t *testing.T) {
+	genesis := uint32(chunk.FirstLedgerSeq)
+	tests := []struct {
+		name            string
+		upperBound      uint32
+		retentionChunks uint32
+		earliest        uint32
+		want            uint32
+	}{
+		{
+			name:            "no sliding (retention 0): earliest floor wins",
+			upperBound:      chunk.ID(100).LastLedger(),
+			retentionChunks: 0,
+			earliest:        chunk.ID(10).FirstLedger(),
+			want:            chunk.ID(10).FirstLedger(),
+		},
+		{
+			name:            "no sliding, no earliest pin: genesis",
+			upperBound:      chunk.ID(100).LastLedger(),
+			retentionChunks: 0,
+			earliest:        0,
+			want:            genesis,
+		},
+		{
+			name:            "sliding floor leads when above earliest",
+			upperBound:      chunk.ID(100).LastLedger(), // last complete chunk = 100
+			retentionChunks: 10,                         // floor chunk = 100-10+1 = 91
+			earliest:        0,
+			want:            chunk.ID(91).FirstLedger(),
+		},
+		{
+			name:            "earliest floor leads when above the sliding floor",
+			upperBound:      chunk.ID(100).LastLedger(),
+			retentionChunks: 10,                         // sliding floor chunk = 91
+			earliest:        chunk.ID(95).FirstLedger(), // higher
+			want:            chunk.ID(95).FirstLedger(),
+		},
+		{
+			name:            "retention wider than history clamps to chunk 0, never wraps",
+			upperBound:      chunk.ID(3).LastLedger(),
+			retentionChunks: 1000, // sliding chunk = 3-1000+1 < 0 => clamp to chunk 0
+			earliest:        0,
+			want:            chunk.ID(0).FirstLedger(),
+		},
+		{
+			name:            "young store (upperBound below first chunk) clamps to chunk 0",
+			upperBound:      chunk.FirstLedgerSeq + 5, // no complete chunk yet
+			retentionChunks: 5,
+			earliest:        0,
+			want:            chunk.ID(0).FirstLedger(),
+		},
+	}
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			require.Equal(t, tc.want, effectiveRetentionFloor(tc.upperBound, tc.retentionChunks, tc.earliest))
+		})
+	}
+}
+
+// ---------------------------------------------------------------------------
+// lowestMaterializedChunk.
+// ---------------------------------------------------------------------------
+
+func TestLowestMaterializedChunk(t *testing.T) {
+	t.Run("empty catalog => ok=false", func(t *testing.T) {
+		cat, _ := testCatalog(t)
+		_, ok, err := lowestMaterializedChunk(cat)
+		require.NoError(t, err)
+		require.False(t, ok)
+	})
+
+	t.Run("min over chunk artifact keys and hot keys", func(t *testing.T) {
+		cat, _ := testCatalog(t)
+		freezeKinds(t, cat, 7, KindLedgers)        // chunk artifact key at 7
+		require.NoError(t, cat.PutHotTransient(4)) // hot key at 4 (lower)
+		freezeKinds(t, cat, 9, KindLedgers)
+		low, ok, err := lowestMaterializedChunk(cat)
+		require.NoError(t, err)
+		require.True(t, ok)
+		require.Equal(t, chunk.ID(4), low)
+	})
+}
+
+// ---------------------------------------------------------------------------
+// End-to-end tick harness: real catalog + real hotchunk DBs.
+// ---------------------------------------------------------------------------
+
+// ingestFullHotChunk creates a "ready" hot DB for chunk c and ingests every
+// ledger in the chunk (contiguous from FirstLedger), then closes the write
+// handle — the post-boundary state the lifecycle freezes from. The hot key is
+// left "ready" and the dir is on disk, as the boundary handoff leaves it.
+func ingestFullHotChunk(t *testing.T, cat *Catalog, c chunk.ID) {
+	t.Helper()
+	db := openLiveHotDB(t, cat, c)
+	for seq := c.FirstLedger(); seq <= c.LastLedger(); seq++ {
+		_, err := db.IngestLedger(seq, xdr.LedgerCloseMetaView(zeroTxLCMBytes(t, seq)), allHotTypes)
+		require.NoError(t, err)
+	}
+	require.NoError(t, db.Close()) // release the write handle (boundary handoff)
+}
+
+// lifecycleTestConfig wires a LifecycleConfig over the real production primitives
+// (a real RocksHotProbe over the catalog's hot layout) plus a fatal recorder so a
+// tick abort is observable instead of killing the test process.
+func lifecycleTestConfig(t *testing.T, cat *Catalog, retentionChunks uint32) (LifecycleConfig, *fatalRecorder) {
+	t.Helper()
+	rec := &fatalRecorder{}
+	cfg := LifecycleConfig{
+		ExecConfig: ExecConfig{
+			Catalog: cat,
+			Logger:  silentLogger(),
+			Workers: 2,
+			Process: ProcessConfig{
+				HotProbe: NewRocksHotProbe(cat.layout.HotChunkPath, silentLogger()),
+			},
+		},
+		RetentionChunks: retentionChunks,
+		Fatalf:          rec.fatalf,
+	}
+	return cfg, rec
+}
+
+// fatalRecorder captures Fatalf calls so a test can assert a tick did (or did
+// NOT) abort the daemon.
+type fatalRecorder struct {
+	count atomic.Int32
+	last  atomic.Value // string
+}
+
+func (r *fatalRecorder) fatalf(format string, args ...any) {
+	r.count.Add(1)
+	r.last.Store(fmt.Sprintf(format, args...))
+}
+
+func (r *fatalRecorder) fired() bool { return r.count.Load() > 0 }
+
+// TestRunLifecycleTick_BoundaryFreezesDiscards is the "one boundary, end to
+// end" walk: chunk 0 just closed (its full hot DB is on disk, ready), chunk 1 is
+// the new live chunk. One tick must:
+//   - freeze chunk 0's cold ledger artifact FROM its hot DB (via processChunk's
+//     hot branch),
+//   - discard chunk 0's hot DB (cold artifacts now fully serve it),
+//   - leave the live chunk 1 untouched.
+//
+// Then re-running the tick is a no-op (quiescence).
+func TestRunLifecycleTick_BoundaryFreezesDiscards(t *testing.T) {
+	t.Parallel()             // full-chunk ingest; isolated TempDir/catalog — overlap with the other heavy tests to fit the gate's go-test timeout
+	cat, _ := testCatalog(t) // a chunk finalizes immediately
+	cfg, rec := lifecycleTestConfig(t, cat, 0)
+
+	// Chunk 0: just-closed, full hot DB on disk. Chunk 1: the new live chunk.
+	ingestFullHotChunk(t, cat, 0)
+	live := openLiveHotDB(t, cat, 1) // the live chunk's hot DB (held open by "ingestion")
+	t.Cleanup(func() { _ = live.Close() })
+
+	runTickForCatalog(context.Background(), t, cfg, cat)
+	require.False(t, rec.fired(), "a healthy tick never aborts: %v", rec.last.Load())
+
+	// Chunk 0's cold ledger artifact is frozen.
+	state, err := cat.State(0, KindLedgers)
+	require.NoError(t, err)
+	assert.Equal(t, StateFrozen, state, "chunk 0 ledgers frozen")
+
+	// Chunk 0's hot DB is discarded (cold artifacts fully serve it).
+	has, err := cat.Has(hotChunkKey(0))
+	require.NoError(t, err)
+	assert.False(t, has, "chunk 0's hot key is gone")
+
+	// The live chunk 1 is untouched: its hot key still "ready", no cold artifacts.
+	hotState, err := cat.HotState(1)
+	require.NoError(t, err)
+	assert.Equal(t, HotReady, hotState, "the live chunk's hot key is untouched")
+	lfs1, err := cat.State(1, KindLedgers)
+	require.NoError(t, err)
+	assert.Equal(t, State(""), lfs1, "the live chunk is not frozen")
+
+	// Quiescence: re-running the tick produces no work.
+	through, err := deriveCompleteThrough(cat)
+	require.NoError(t, err)
+	assertQuiescent(t, cfg, cat, through)
+}
+
+// TestRunLifecycleTick_DiscardWhenComplete: the discard scan offers a chunk's hot
+// DB only once its cold ledger artifact is frozen (nothing pending); while the
+// ledgers are not yet frozen the chunk is incomplete and its hot DB is kept.
+func TestRunLifecycleTick_DiscardWhenComplete(t *testing.T) {
+	cat, _ := testCatalog(t)
+	cfg, _ := lifecycleTestConfig(t, cat, 0)
+
+	// Chunk 0 has a "ready" hot DB on disk and no frozen ledgers: still pending.
+	makeReadyHotDirNoData(t, cat, 0)
+	// Chunk 1 is registered above it, so chunk 0 sits below the partition boundary.
+	require.NoError(t, cat.PutHotTransient(1))
+
+	completeThrough := chunk.ID(0).LastLedger() // chunk 0 complete via positional/cold
+	discards, err := eligibleDiscardOps(cfg, cat, completeThrough)
+	require.NoError(t, err)
+	require.Empty(t, discards, "ledgers not frozen yet: the hot DB stays")
+
+	// Freezing chunk 0's ledger artifact leaves nothing pending => discard eligible.
+	freezeKinds(t, cat, 0, KindLedgers)
+	discards, err = eligibleDiscardOps(cfg, cat, completeThrough)
+	require.NoError(t, err)
+	require.Len(t, discards, 1, "frozen + nothing pending => discard eligible")
+	require.NoError(t, discards[0]())
+
+	hotPresent, err := cat.Has(hotChunkKey(0))
+	require.NoError(t, err)
+	assert.False(t, hotPresent, "the now-complete chunk's hot DB is discarded")
+}
+
+// TestRunLifecycleTick_PastFloorPrune: chunks lying wholly below the effective
+// retention floor lose their artifact files, keys and hot DB, whatever their state.
+func TestRunLifecycleTick_PastFloorPrune(t *testing.T) {
+	cat, _ := testCatalog(t)
+	cfg, rec := lifecycleTestConfig(t, cat, 2) // retain ~2 chunks
+
+	// completeThrough will be chunk 5's last ledger (positional: live chunk 6).
+	// floor = lastCompleteChunkAt(through)-retention+1 = 5-2+1 = chunk 4's first
+	// ledger. So chunks 0..3 are wholly past the floor and must be swept.
+	for id := chunk.ID(0); id <= 5; id++ {
+		freezeKinds(t, cat, id, KindLedgers)
+		writeArtifact(t, cat.layout.LedgerPackPath(id))
+	}
+	// Chunk 1 additionally carries a (past-floor) hot DB.
+	makeReadyHotDirNoData(t, cat, 1)
+	liveDB := openLiveHotDB(t, cat, 6) // live chunk
+	t.Cleanup(func() { _ = liveDB.Close() })
+
+	completeThrough, err := deriveCompleteThrough(cat)
+	require.NoError(t, err)
+	require.Equal(t, chunk.ID(5).LastLedger(), completeThrough)
+	floor := effectiveRetentionFloor(completeThrough, cfg.RetentionChunks, 0)
+	require.Equal(t, chunk.ID(4).FirstLedger(), floor, "floor anchors 2 chunks back")
+
+	runTickForCatalog(context.Background(), t, cfg, cat)
+	require.False(t, rec.fired(), "prune tick never aborts: %v", rec.last.Load())
+
+	// Below the floor (chunks 0..3): ledger key, pack file and hot key are all gone.
+	for id := chunk.ID(0); id <= 3; id++ {
+		ledgerState, stateErr := cat.State(id, KindLedgers)
+		require.NoError(t, stateErr)
+		assert.Equal(t, State(""), ledgerState, "chunk %s ledgers key swept", id)
+		assert.NoFileExists(t, cat.layout.LedgerPackPath(id), "chunk %s pack swept", id)
+		hotPresent, hotErr := cat.Has(hotChunkKey(id))
+		require.NoError(t, hotErr)
+		assert.False(t, hotPresent, "chunk %s hot key swept", id)
+	}
+	// Inside retention (the floor chunk 4 and chunk 5): ledgers stay frozen.
+	for id := chunk.ID(4); id <= 5; id++ {
+		ledgerState, stateErr := cat.State(id, KindLedgers)
+		require.NoError(t, stateErr)
+		assert.Equal(t, StateFrozen, ledgerState, "chunk %s in retention survives", id)
+	}
+
+	assertQuiescent(t, cfg, cat, completeThrough)
+}
+
+// TestRunLifecycleTick_PrunesTransientChunkDebris: the prune scan sweeps a chunk
+// artifact key left in the "pruning" state (a recovery-demoted leftover).
+func TestRunLifecycleTick_PrunesTransientChunkDebris(t *testing.T) {
+	cat, _ := testCatalog(t)
+	cfg, rec := lifecycleTestConfig(t, cat, 0)
+
+	// Debris: a real pack file whose ledgers key says "pruning" (in-retention demotion).
+	writeArtifact(t, cat.layout.LedgerPackPath(0))
+	require.NoError(t, cat.store.Put(chunkKey(0, KindLedgers), string(StatePruning)))
+
+	completeThrough, err := deriveCompleteThrough(cat)
+	require.NoError(t, err)
+	sweeps, err := eligiblePruneOps(cfg, cat, completeThrough)
+	require.NoError(t, err)
+	require.Len(t, sweeps, 1, "the pruning debris is swept")
+	require.NoError(t, sweeps[0]())
+	require.False(t, rec.fired())
+
+	ledgerState, err := cat.State(0, KindLedgers)
+	require.NoError(t, err)
+	require.Equal(t, State(""), ledgerState, "the pruning chunk key is gone")
+}
+
+// ---------------------------------------------------------------------------
+// CLEAN SHUTDOWN: a ctx cancelled mid-tick returns WITHOUT fatal.
+// ---------------------------------------------------------------------------
+
+// TestRunLifecycleTick_CleanShutdownNoFatal: when executePlan returns because
+// ctx was cancelled, the tick must NOT call Fatalf — cancellation is a shutdown,
+// never an op failure. The plan stage's work is real (a backend-only chunk that
+// the cancelled ctx aborts), so executePlan genuinely returns an error here.
+func TestRunLifecycleTick_CleanShutdownNoFatal(t *testing.T) {
+	cat, _ := testCatalog(t)
+	rec := &fatalRecorder{}
+
+	// A READY live chunk 1 puts chunk 0 BELOW the partition, so it counts as complete
+	// (positional term => through = chunk 0's last ledger) and the plan range [0,0] is
+	// non-empty. With no frozen artifacts, resolve schedules a ChunkBuild we cancel.
+	readyHot(t, cat, 1)                        // live chunk (ready + dir)
+	require.NoError(t, cat.PutHotTransient(0)) // chunk 0 in storage, below live
+
+	// Block the chunk build long enough to cancel, then make it observe the cancel.
+	started := make(chan struct{})
+	cfg := LifecycleConfig{
+		ExecConfig: ExecConfig{
+			Catalog: cat,
+			Logger:  silentLogger(),
+			Workers: 1,
+			runChunk: func(ctx context.Context, _ ChunkBuild, _ ExecConfig) error {
+				close(started)
+				<-ctx.Done() // wait for the cancel, then return the ctx error
+				return ctx.Err()
+			},
+		},
+		RetentionChunks: 0,
+		Fatalf:          rec.fatalf,
+	}
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel() // also runs on a t.Fatal exit, releasing the build blocked on ctx.Done
+	done := make(chan struct{})
+	go func() {
+		runLifecycleTick(ctx, cfg, cat, 0) // lastChunk 0: plan range [0,0], the build we cancel
+		close(done)
+	}()
+
+	select {
+	case <-started:
+	case <-time.After(5 * time.Second):
+		t.Fatal("the chunk build never started")
+	}
+	cancel() // shutdown mid-tick
+
+	select {
+	case <-done:
+	case <-time.After(5 * time.Second):
+		t.Fatal("the tick did not return after ctx cancellation")
+	}
+	require.False(t, rec.fired(), "a cancelled ctx is a clean shutdown, NOT an op failure — no Fatalf")
+}
+
+// TestRunLifecycleTick_GenuineFailureAborts: a plan op that fails for a real
+// reason (NOT ctx cancellation) makes the tick abort via Fatalf per the error policy.
+func TestRunLifecycleTick_GenuineFailureAborts(t *testing.T) {
+	cat, _ := testCatalog(t)
+	rec := &fatalRecorder{}
+
+	readyHot(t, cat, 1)                        // ready live chunk => through = chunk 0 last ledger
+	require.NoError(t, cat.PutHotTransient(0)) // chunk 0 below live, no frozen artifacts
+
+	// The seamed chunk build fails outright; the ctx handed to the tick below is never
+	// cancelled, so this is a genuine, non-cancellation failure.
+	failingBuild := func(context.Context, ChunkBuild, ExecConfig) error {
+		return assertErr
+	}
+	cfg := LifecycleConfig{
+		ExecConfig: ExecConfig{Catalog: cat, Logger: silentLogger(), Workers: 1, runChunk: failingBuild},
+		Fatalf:     rec.fatalf,
+	}
+
+	// lastChunk 0: the plan range is [0,0], i.e. exactly the failing build.
+	runLifecycleTick(context.Background(), cfg, cat, 0)
+	require.True(t, rec.fired(), "a genuine op failure aborts the daemon")
+}
+
+// ---------------------------------------------------------------------------
+// lifecycleLoop: selects on BOTH ctx.Done and the notification channel; drains
+// to the most-recent queued chunk id.
+// ---------------------------------------------------------------------------
+
+// TestLifecycleLoop_RunsTickPerNotifyThenStopsOnCtx: a notification (a completed
+// chunk id) runs a tick; a ctx cancellation returns the loop. The loop never
+// blocks forever and never fatals on shutdown.
+func TestLifecycleLoop_RunsTickPerNotifyThenStopsOnCtx(t *testing.T) {
+	cat, _ := testCatalog(t)
+	cfg, rec := lifecycleTestConfig(t, cat, 0)
+
+	// Make the tick observable WITHOUT a slow full ingest: chunk 0 is already fully
+	// frozen, with a leftover "ready" hot DB on disk, so the plan stage is a no-op and
+	// the discard scan retires that hot DB. A live chunk 1 keeps chunk 0 below the partition.
+	freezeKinds(t, cat, 0, KindLedgers)
+	makeReadyHotDirNoData(t, cat, 0)
+	live := openLiveHotDB(t, cat, 1)
+	t.Cleanup(func() { _ = live.Close() })
+
+	ch := make(chan chunk.ID, lifecycleQueueDepth)
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel() // a failed assertion must still stop the loop before cleanup closes live
+	done := make(chan struct{})
+	go func() {
+		lifecycleLoop(ctx, cfg, cat, ch)
+		close(done)
+	}()
+
+	ch <- chunk.ID(0) // ingestion hands over the just-completed chunk 0
+	require.Eventually(t, func() bool {
+		has, err := cat.Has(hotChunkKey(0))
+		return err == nil && !has
+	}, 10*time.Second, 20*time.Millisecond, "the notification ran a tick that discarded chunk 0")
+	require.False(t, rec.fired())
+
+	cancel()
+	select {
+	case <-done:
+	case <-time.After(5 * time.Second):
+		t.Fatal("the loop did not return on ctx cancellation")
+	}
+}
+
+// TestLifecycleLoop_DrainsToMostRecent: chunk ids queued behind one notification
+// are meant to be coalesced into a tick over the most-recent. With chunks 0 and 1
+// both frozen+covered and a live chunk 2, sending 0 then 1 must leave both hot DBs
+// discarded. NOTE(review): only that end state is asserted, not the tick count.
+func TestLifecycleLoop_DrainsToMostRecent(t *testing.T) {
+	cat, _ := testCatalog(t)
+	cfg, rec := lifecycleTestConfig(t, cat, 0)
+
+	for c := chunk.ID(0); c <= 1; c++ {
+		freezeKinds(t, cat, c, KindLedgers)
+		makeReadyHotDirNoData(t, cat, c)
+	}
+	live := openLiveHotDB(t, cat, 2)
+	t.Cleanup(func() { _ = live.Close() })
+
+	ch := make(chan chunk.ID, lifecycleQueueDepth)
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+	done := make(chan struct{})
+	go func() {
+		lifecycleLoop(ctx, cfg, cat, ch)
+		close(done)
+	}()
+
+	ch <- chunk.ID(0)
+	ch <- chunk.ID(1) // if still queued behind 0, it is drained into one tick over [floor, 1]
+	require.Eventually(t, func() bool {
+		h0, e0 := cat.Has(hotChunkKey(0))
+		h1, e1 := cat.Has(hotChunkKey(1))
+		return e0 == nil && e1 == nil && !h0 && !h1
+	}, 10*time.Second, 20*time.Millisecond, "one drained tick discarded both completed chunks")
+	require.False(t, rec.fired())
+
+	cancel()
+	select {
+	case <-done:
+	case <-time.After(5 * time.Second):
+		t.Fatal("the loop did not return on ctx cancellation")
+	}
+}
+
+// TestLifecycleLoop_ReturnsImmediatelyOnAlreadyCancelledCtx: handed a ctx that is
+// cancelled before it starts, the loop returns instead of parking on the
+// notification channel forever.
+func TestLifecycleLoop_ReturnsImmediatelyOnAlreadyCancelledCtx(t *testing.T) {
+	cat, _ := testCatalog(t)
+	cfg, _ := lifecycleTestConfig(t, cat, 0)
+
+	ctx, cancel := context.WithCancel(context.Background())
+	cancel()
+
+	notify := make(chan chunk.ID) // unbuffered, never sent to
+	returned := make(chan struct{})
+	go func() {
+		lifecycleLoop(ctx, cfg, cat, notify)
+		close(returned)
+	}()
+	select {
+	case <-returned:
+	case <-time.After(5 * time.Second):
+		t.Fatal("the loop blocked instead of observing the cancelled ctx")
+	}
+}
+
+// ---------------------------------------------------------------------------
+// helpers.
+// ---------------------------------------------------------------------------
+
+// runTickForCatalog drives one lifecycle tick the way ingestion would at a chunk
+// boundary: the lastChunk it hands over is the highest complete chunk derived
+// from the catalog. When there is no complete chunk yet (young network) it passes
+// chunk 0 instead — the resolve range guard then makes the plan empty, matching
+// the design's young-network no-op.
+func runTickForCatalog(ctx context.Context, t *testing.T, cfg LifecycleConfig, cat *Catalog) {
+	t.Helper()
+	completeThrough, err := deriveCompleteThrough(cat)
+	require.NoError(t, err)
+	lastChunk, anyComplete := lastCompleteChunkAtID(completeThrough)
+	if !anyComplete {
+		lastChunk = 0
+	}
+	runLifecycleTick(ctx, cfg, cat, lastChunk)
+}
+
+// assertErr is the fixed, non-cancellation error the genuine-failure test's seamed build returns.
+var assertErr = errStr("streaming: synthetic op failure")
+
+type errStr string // minimal string-backed error type
+
+func (e errStr) Error() string { return string(e) } // Error implements the error interface
+
+// makeReadyHotDirNoData materializes an empty hot DB for c by opening a real one
+// and closing it straight away: its dir then exists on disk and its key is
+// "ready" — the state a discard scan inspects — without paying for a full ingest.
+func makeReadyHotDirNoData(t *testing.T, cat *Catalog, c chunk.ID) {
+	t.Helper()
+	hot, openErr := openHotTierForChunk(cat, c, silentLogger())
+	require.NoError(t, openErr)
+	require.NoError(t, hot.Close())
+}
+
+// assertQuiescent checks the quiescence postcondition: recomputing the tick's
+// three derivations from the SAME through snapshot must schedule no work at all.
+func assertQuiescent(t *testing.T, cfg LifecycleConfig, cat *Catalog, through uint32) {
+	t.Helper()
+	earliest, _, err := cat.EarliestLedger()
+	require.NoError(t, err)
+	// Plan range start: the retention floor's chunk, raised to the lowest materialized chunk.
+	planStart := chunkIDOfLedger(effectiveRetentionFloor(through, cfg.RetentionChunks, earliest))
+	lowest, materialized, err := lowestMaterializedChunk(cat)
+	require.NoError(t, err)
+	if materialized && planStart < int64(lowest) {
+		planStart = int64(lowest)
+	}
+	if planEnd, ok := lastCompleteChunkAtID(through); ok && planStart >= 0 {
+		plan, planErr := resolve(cfg.ExecConfig, chunk.ID(planStart), planEnd)
+		require.NoError(t, planErr)
+		assert.True(t, plan.Empty(), "re-resolve schedules no work at quiescence: %+v", plan)
+	}
+	discards, err := eligibleDiscardOps(cfg, cat, through)
+	require.NoError(t, err)
+	assert.Empty(t, discards, "re-scan finds no discard work at quiescence")
+	prunes, err := eligiblePruneOps(cfg, cat, through)
+	require.NoError(t, err)
+	assert.Empty(t, prunes, "re-scan finds no prune work at quiescence")
+}
diff --git a/cmd/stellar-rpc/internal/fullhistory/streaming/observability.go b/cmd/stellar-rpc/internal/fullhistory/streaming/observability.go
new file mode 100644
index 000000000..c03df6aba
--- /dev/null
+++ b/cmd/stellar-rpc/internal/fullhistory/streaming/observability.go
@@ -0,0 +1,339 @@
+package streaming
+
+import (
+	"io/fs"
+	"os"
+	"path/filepath"
+	"time"
+
+	"github.com/prometheus/client_golang/prometheus"
+)
+
+// Observability for the streaming daemon's own control plane — distinct from the
+// per-data-type ingest metrics (ingest.MetricSink / ingest.PrometheusSink), which
+// time the cold/hot ingesters themselves. THIS sink times and counts the daemon's
+// PHASES: the ingestion loop's chunk-boundary handoffs, catch-up backfill passes,
+// the three lifecycle-tick stages (freeze / discard / prune), and surgical
+// recovery — plus the derived progress gauges (ingestion lag, watermark, the
+// effective retention floor, live hot-chunk count, cold-tier footprint) that no
+// per-ingester sink can see because they are properties of the whole catalog.
+//
+// It is a SMALL interface so it is trivially testable: a test passes a recorder
+// (recordingMetrics in the tests) and asserts the daemon drove the expected
+// signals at the right phase boundaries, without standing up Prometheus. Every
+// call site reads cfg's Metrics through metricsOrNop, so a nil sink is a no-op and
+// no phase ever nil-checks.
+//
+// All methods MUST be safe for concurrent use: the ingestion loop, the lifecycle
+// goroutine, and (during catch-up) the worker pool all report concurrently.
+type Metrics interface {
+	// --- gauges (absolute, last-write-wins) ---
+
+	// IngestionLag sets the lag in ledgers: networkTip - lastCommitted (the
+	// Prometheus sink clamps a negative difference to 0). It is a CATCH-UP-ONLY
+	// signal: catch-up reports it each pass against the bulk tip (networkTip is the
+	// best tip currently known, lastCommitted the highest durably committed
+	// ledger). The steady-state ingestion loop runs at the live edge of captive
+	// core and holds no independent network-tip source to compare against, so it
+	// does NOT touch this gauge — its liveness signal is LastCommitted, refreshed
+	// per ledger. Once catch-up converges, ingestion_lag freezes at its final
+	// catch-up value by design; use LastCommitted as the steady-state health metric.
+	IngestionLag(networkTip, lastCommitted uint32)
+
+	// LastCommitted sets the highest durably committed ledger the ingestion loop
+	// has synced. It is the daemon's per-ledger steady-state liveness signal:
+	// runIngestionLoop refreshes it after every synced WriteBatch, so a wedged or
+	// slow ingester is detectable between chunk boundaries (the watermark gauge
+	// refreshes only on a chunk-boundary tick, ≈LedgersPerChunk apart, and the
+	// per-ledger hot write otherwise emits nothing). A stalled gauge with a live
+	// daemon means ingestion is not keeping up.
+	LastCommitted(seq uint32)
+
+	// Watermark sets the derived watermark (the highest durably committed ledger,
+	// deriveWatermark's result) and the effective retention floor (the lowest
+	// ledger inside the retention window). Reported by startStreaming after
+	// derivation and by every lifecycle tick.
+	Watermark(lastCommitted, retentionFloor uint32)
+
+	// CatchupProgress sets catch-up's position: the last ledger backfilled so far
+	// and the target (the tip-anchored upper bound of the catch-up window). Equal
+	// values mean catch-up has converged.
+	CatchupProgress(backfilledThrough, target uint32)
+
+	// LiveHotChunks sets the count of hot-chunk DBs currently on disk (the
+	// hot:chunk key count). Reported by every lifecycle tick after the discard
+	// stage so the gauge tracks the live + awaiting-discard set.
+	LiveHotChunks(count int)
+
+	// ColdTierBytes sets the cold-tier on-disk footprint in bytes (the summed size
+	// of the ledgers tree). Reported by every lifecycle tick after the prune
+	// stage.
+	ColdTierBytes(bytes int64)
+
+	// --- counters + durations (one call per completed phase action) ---
+
+	// ChunkBoundary counts one ingestion chunk-boundary handoff (a chunk filled,
+	// its DB closed, the next chunk's DB opened). closedChunk is the just-filled
+	// chunk's id; the Prometheus sink only counts the event and ignores the id.
+	ChunkBoundary(closedChunk uint32)
+
+	// CatchupPass counts one completed catch-up backfill pass over [lo, hi] and
+	// records its wall-clock (the Prometheus sink ignores lo/hi). A pass that
+	// backfilled nothing (converged) is not reported — only passes that ran runBackfill.
+	CatchupPass(lo, hi uint32, d time.Duration)
+
+	// Freeze counts one lifecycle-tick plan-and-execute stage (the freeze) and
+	// records its wall-clock. chunkBuilds is the plan's size — 0 when the tick had
+	// no producible range (the stage still reports, with a zero count, so the rate
+	// of empty ticks is observable).
+	Freeze(chunkBuilds int, d time.Duration)
+
+	// Discard counts the hot DBs a tick retired and records the stage wall-clock.
+	Discard(count int, d time.Duration)
+
+	// Prune counts the prune-stage sweep ops a tick ran and records the stage
+	// wall-clock.
+	Prune(count int, d time.Duration)
+
+	// Recovery counts one surgical-recovery apply and records how many keys it
+	// demoted across the cold/hot tiers.
+	Recovery(coldKeys, hotKeys int, d time.Duration)
+}
+
+// nopMetrics is the no-op Metrics sink: every method discards its arguments.
+// metricsOrNop substitutes it for a nil sink, so call sites never nil-check.
+type nopMetrics struct{}
+
+func (nopMetrics) IngestionLag(uint32, uint32)               {}
+func (nopMetrics) LastCommitted(uint32)                      {}
+func (nopMetrics) Watermark(uint32, uint32)                  {}
+func (nopMetrics) CatchupProgress(uint32, uint32)            {}
+func (nopMetrics) LiveHotChunks(int)                         {}
+func (nopMetrics) ColdTierBytes(int64)                       {}
+func (nopMetrics) ChunkBoundary(uint32)                      {}
+func (nopMetrics) CatchupPass(uint32, uint32, time.Duration) {}
+func (nopMetrics) Freeze(int, time.Duration)                 {}
+func (nopMetrics) Discard(int, time.Duration)                {}
+func (nopMetrics) Prune(int, time.Duration)                  {}
+func (nopMetrics) Recovery(int, int, time.Duration)          {}
+
+// metricsOrNop normalizes an optional sink: a non-nil m is handed back as-is and a
+// nil m becomes nopMetrics{}, so callers report phase signals without nil-checks.
+func metricsOrNop(m Metrics) Metrics {
+	if m != nil {
+		return m
+	}
+	return nopMetrics{}
+}
+
+// streamingSubsystem is the Prometheus subsystem of every streaming control-plane
+// metric: names render as <namespace>_fullhistory_streaming_<name>, the namespace
+// being the daemon's (interfaces.PrometheusNamespace). It is distinct from
+// ingest.metricsSubsystem ("fullhistory_ingest") so the two families never collide.
+const streamingSubsystem = "fullhistory_streaming"
+
+// phaseBuckets are the upper bounds (seconds) of the phase-duration histogram:
+// a chunk-boundary handoff is sub-millisecond, a freeze/rebuild over a full chunk
+// seconds to minutes, a catch-up pass over many chunks longer still. 12 buckets,
+// ×4 apart: 1ms … 0.001·4^11 ≈ 4194s (~70min) — the same wide span ingest's
+// coldStageBuckets use, so a single dashboard renders both families on one axis.
+//
+//nolint:gochecknoglobals // fixed bucket layout, read-only
+var phaseBuckets = prometheus.ExponentialBuckets(0.001, 4, 12)
+
+// PrometheusMetrics is the production Metrics sink: it records the streaming
+// daemon's phase signals into Prometheus collectors. Constructed via
+// NewPrometheusMetrics, which MustRegisters its collectors under a namespace +
+// the fullhistory_streaming subsystem — the same daemon convention
+// ingest.NewPrometheusSink follows.
+type PrometheusMetrics struct {
+	// Gauges — absolute, last-write-wins.
+	ingestionLag      prometheus.Gauge // ingestion_lag_ledgers
+	lastCommitted     prometheus.Gauge // last_committed_ledger
+	watermark         prometheus.Gauge // watermark_ledger
+	retentionFloor    prometheus.Gauge // retention_floor_ledger
+	catchupBackfilled prometheus.Gauge // catchup_backfilled_ledger
+	catchupTarget     prometheus.Gauge // catchup_target_ledger
+	liveHotChunks     prometheus.Gauge // live_hot_chunks
+	coldTierBytes     prometheus.Gauge // cold_tier_bytes
+
+	// Counters — monotonic event tallies.
+	chunkBoundaries prometheus.Counter     // chunk_boundaries_total
+	catchupPasses   prometheus.Counter     // catchup_passes_total
+	freezeChunks    prometheus.Counter     // freeze_chunks_total
+	discarded       prometheus.Counter     // discarded_hot_chunks_total
+	pruned          prometheus.Counter     // pruned_ops_total
+	recoveries      prometheus.Counter     // recoveries_total
+	recoveredKeys   *prometheus.CounterVec // recovered_keys_total, by tier
+
+	// Durations — per-phase wall-clock histograms, keyed by phase label.
+	phaseDuration *prometheus.HistogramVec // phase_duration_seconds, by phase
+}
+
+// Values of the "phase" label on the phase_duration_seconds histogram, one per timed phase.
+const (
+	phaseCatchupPass = "catchup_pass"
+	phaseFreeze      = "freeze"
+	phaseDiscard     = "discard"
+	phasePrune       = "prune"
+	phaseRecovery    = "recovery"
+)
+
+// NewPrometheusMetrics constructs the production sink and MustRegisters every
+// collector on registry (so a failed registration panics). Each collector is named
+// under namespace (the daemon's interfaces.PrometheusNamespace) + fullhistory_streaming.
+func NewPrometheusMetrics(registry *prometheus.Registry, namespace string) *PrometheusMetrics {
+	newGauge := func(name, help string) prometheus.Gauge {
+		return prometheus.NewGauge(prometheus.GaugeOpts{
+			Namespace: namespace, Subsystem: streamingSubsystem, Name: name, Help: help,
+		})
+	}
+	newCounter := func(name, help string) prometheus.Counter {
+		return prometheus.NewCounter(prometheus.CounterOpts{
+			Namespace: namespace, Subsystem: streamingSubsystem, Name: name, Help: help,
+		})
+	}
+
+	sink := &PrometheusMetrics{
+		ingestionLag:      newGauge("ingestion_lag_ledgers", "catch-up only: network tip minus last committed ledger"),
+		lastCommitted:     newGauge("last_committed_ledger", "highest ledger the ingestion loop has durably synced (per-ledger liveness)"),
+		watermark:         newGauge("watermark_ledger", "derived watermark — highest durably committed ledger"),
+		retentionFloor:    newGauge("retention_floor_ledger", "effective retention floor — lowest in-window ledger"),
+		catchupBackfilled: newGauge("catchup_backfilled_ledger", "last ledger catch-up has backfilled through"),
+		catchupTarget:     newGauge("catchup_target_ledger", "catch-up target — tip-anchored upper bound"),
+		liveHotChunks:     newGauge("live_hot_chunks", "count of hot-chunk DBs currently on disk"),
+		coldTierBytes:     newGauge("cold_tier_bytes", "cold-tier on-disk footprint in bytes"),
+
+		chunkBoundaries: newCounter("chunk_boundaries_total", "ingestion chunk-boundary handoffs"),
+		catchupPasses:   newCounter("catchup_passes_total", "completed catch-up backfill passes"),
+		freezeChunks:    newCounter("freeze_chunks_total", "chunks frozen by the lifecycle freeze stage"),
+		discarded:       newCounter("discarded_hot_chunks_total", "hot DBs retired by the discard stage"),
+		pruned:          newCounter("pruned_ops_total", "prune-stage sweep ops"),
+		recoveries:      newCounter("recoveries_total", "surgical-recovery applies"),
+		recoveredKeys: prometheus.NewCounterVec(prometheus.CounterOpts{
+			Namespace: namespace, Subsystem: streamingSubsystem,
+			Name: "recovered_keys_total", Help: "keys demoted by surgical recovery, by tier",
+		}, []string{"tier"}),
+
+		phaseDuration: prometheus.NewHistogramVec(prometheus.HistogramOpts{
+			Namespace: namespace, Subsystem: streamingSubsystem,
+			Name: "phase_duration_seconds", Help: "wall-clock of a daemon phase action",
+			Buckets: phaseBuckets,
+		}, []string{"phase"}),
+	}
+
+	registry.MustRegister(
+		sink.ingestionLag, sink.lastCommitted, sink.watermark, sink.retentionFloor, sink.catchupBackfilled, sink.catchupTarget,
+		sink.liveHotChunks, sink.coldTierBytes,
+		sink.chunkBoundaries, sink.catchupPasses, sink.freezeChunks,
+		sink.discarded, sink.pruned, sink.recoveries, sink.recoveredKeys,
+		sink.phaseDuration,
+	)
+	return sink
+}
+
+func (m *PrometheusMetrics) IngestionLag(networkTip, lastCommitted uint32) {
+	// Signed lag: a lagging bulk tip below the watermark yields 0, not a wrap.
+	lag := int64(networkTip) - int64(lastCommitted)
+	if lag < 0 {
+		lag = 0
+	}
+	m.ingestionLag.Set(float64(lag))
+}
+
+func (m *PrometheusMetrics) LastCommitted(seq uint32) { m.lastCommitted.Set(float64(seq)) } // gauge: last write wins
+
+func (m *PrometheusMetrics) Watermark(lastCommitted, retentionFloor uint32) {
+	m.watermark.Set(float64(lastCommitted))       // watermark_ledger
+	m.retentionFloor.Set(float64(retentionFloor)) // retention_floor_ledger
+}
+
+func (m *PrometheusMetrics) CatchupProgress(backfilledThrough, target uint32) {
+	m.catchupBackfilled.Set(float64(backfilledThrough)) // catchup_backfilled_ledger
+	m.catchupTarget.Set(float64(target))                // catchup_target_ledger
+}
+
+func (m *PrometheusMetrics) LiveHotChunks(count int) { m.liveHotChunks.Set(float64(count)) } // gauge: last write wins
+
+func (m *PrometheusMetrics) ColdTierBytes(bytes int64) { m.coldTierBytes.Set(float64(bytes)) } // gauge: last write wins
+
+func (m *PrometheusMetrics) ChunkBoundary(uint32) { m.chunkBoundaries.Inc() } // the closed chunk's id is not recorded
+
+func (m *PrometheusMetrics) CatchupPass(_, _ uint32, d time.Duration) {
+	m.catchupPasses.Inc() // the pass's [lo, hi] range is not recorded, only its count and duration
+	m.phaseDuration.WithLabelValues(phaseCatchupPass).Observe(d.Seconds())
+}
+
+func (m *PrometheusMetrics) Freeze(chunkBuilds int, d time.Duration) {
+	m.phaseDuration.WithLabelValues(phaseFreeze).Observe(d.Seconds())
+	if built := float64(chunkBuilds); built > 0 {
+		m.freezeChunks.Add(built) // an empty tick still recorded its duration above
+	}
+}
+
+func (m *PrometheusMetrics) Discard(count int, d time.Duration) {
+	m.phaseDuration.WithLabelValues(phaseDiscard).Observe(d.Seconds())
+	if retired := float64(count); retired > 0 {
+		m.discarded.Add(retired) // only a positive count advances the tally
+	}
+}
+
+func (m *PrometheusMetrics) Prune(count int, d time.Duration) {
+	m.phaseDuration.WithLabelValues(phasePrune).Observe(d.Seconds())
+	if swept := float64(count); swept > 0 {
+		m.pruned.Add(swept) // only a positive count advances the tally
+	}
+}
+
+func (m *PrometheusMetrics) Recovery(coldKeys, hotKeys int, d time.Duration) {
+	m.phaseDuration.WithLabelValues(phaseRecovery).Observe(d.Seconds())
+	m.recoveries.Inc()
+	if demoted := float64(coldKeys); demoted > 0 {
+		m.recoveredKeys.WithLabelValues("cold").Add(demoted)
+	}
+	if demoted := float64(hotKeys); demoted > 0 {
+		m.recoveredKeys.WithLabelValues("hot").Add(demoted)
+	}
+}
+
+// Compile-time check that *PrometheusMetrics implements every Metrics method.
+var _ Metrics = (*PrometheusMetrics)(nil)
+
+// coldTierBytes reports the cold tier's on-disk footprint: the summed size of
+// every non-directory entry under the ledgers tree (the hot tier and the meta
+// store are excluded: the hot tier is transient, the meta store tiny). A missing
+// tree — a frontfill deployment may not have materialized it — and entries that
+// vanish mid-walk count as zero bytes. Any other error stops the walk and is
+// returned with the partial sum so far; it is non-fatal — the lifecycle caller
+// treats a returned error as "skip the gauge this tick" rather than failing the
+// tick, so a transient FS hiccup never aborts the daemon.
+func coldTierBytes(layout Layout) (int64, error) {
+	var total int64
+	// visit is the WalkDir callback: it accumulates one entry's size into total.
+	visit := func(_ string, entry fs.DirEntry, walkErr error) error {
+		switch {
+		case walkErr != nil && os.IsNotExist(walkErr):
+			return nil // an un-materialized tree contributes nothing
+		case walkErr != nil:
+			return walkErr
+		case entry.IsDir():
+			return nil // only non-directory entries are summed
+		}
+
+		// Info can fail when the entry was removed after its directory was read.
+		info, err := entry.Info()
+		if err != nil {
+			if os.IsNotExist(err) {
+				return nil // raced with a prune unlink — count it as gone
+			}
+			return err
+		}
+		total += info.Size()
+		return nil
+	}
+
+	// Finish the walk BEFORE reading total: Go does not order a plain variable read
+	// against a function call inside one return statement.
+	err := filepath.WalkDir(layout.LedgersRoot(), visit)
+	return total, err
+}
diff --git a/cmd/stellar-rpc/internal/fullhistory/streaming/progress.go b/cmd/stellar-rpc/internal/fullhistory/streaming/progress.go
new file mode 100644
index 000000000..7aea790cf
--- /dev/null
+++ b/cmd/stellar-rpc/internal/fullhistory/streaming/progress.go
@@ -0,0 +1,214 @@
+package streaming
+
+import (
+	"fmt"
+
+	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/pkg/chunk"
+)
+
+// Progress derivation. There is NO stored watermark (see the data model's
+// "Progress is derived, never stored"): every consumer recomputes its bound
+// from durable catalog keys on every call. ONE derivation, lastCommittedLedger,
+// matching the design's lastCommittedLedger(cat[, probe]):
+//
+//   - probe == nil (the lifecycle tick): chunk granularity, a pure catalog read
+//     that opens no hot DB. The positional term is everything below the live
+//     (highest ready) chunk.
+//   - probe != nil (ingestion's resume point at startup): refined by exactly ONE
+//     read of the highest ready hot DB when the hot tier leads the cold tier —
+//     sub-chunk precision inside the live chunk plus boundary-crash recovery
+//     (the highest ready chunk may be a just-completed predecessor whose
+//     completion no key advertises). Hot-volume loss is detected LAZILY on that
+//     one open (no eager dir-existence scan over every ready key — see item 6 /
+//     the design's "detects loss lazily on open"); a ready-but-won't-open hot DB
+//     surfaces as ErrHotVolumeLost with the surgical-recovery guidance.
+//
+// SIGNED-DOMAIN arithmetic (the sentinel-underflow guard): chunk.ID is uint32
+// and CANNOT hold the pre-genesis sentinel -1, nor survive a `maxChunk-1` /
+// `earliest-1` underflow when the live chunk is chunk 0 or the floor pin is
+// absent. Every "highest complete chunk" computation below therefore happens in
+// int64, with -1 meaning "nothing below is complete"; completeThrough maps the
+// signed chunk index to its last ledger, returning the pre-genesis sentinel for
+// any negative input. A raw chunk.ID is never fed an underflowed value, and
+// ID(^uint32(0)) is never passed to LastLedger() (which would overflow — see
+// chunk.go's LastLedger note).
+
+// preGenesisLedger is the watermark when NOTHING below the floor is complete:
+// FirstLedgerSeq-1, i.e. "ingest from genesis". It is the value completeThrough
+// returns for the pre-genesis sentinel (a negative signed chunk index).
+const preGenesisLedger uint32 = chunk.FirstLedgerSeq - 1
+
+// completeThrough maps a SIGNED chunk index to the last ledger that chunk index
+// represents as a "complete through" bound:
+//
+//   - c < 0 (the pre-genesis sentinel): no chunk below is complete, so the bound
+//     is FirstLedgerSeq-1 — the design's chunkLastLedger(-1) = 1, computed here
+//     without uint32 wraparound.
+//   - c >= 0: chunk.ID(c).LastLedger().
+//
+// This is the single chokepoint that keeps the cold/positional/floor terms out
+// of the uint32 underflow trap the design pseudocode's signed math hid.
+func completeThrough(c int64) uint32 {
+	if c < 0 {
+		return preGenesisLedger
+	}
+	return chunk.ID(c).LastLedger() //nolint:gosec // c >= 0 and bounded by real chunk ids
+}
+
+// lastCommittedLedger is the single highest-durably-committed-ledger derivation
+// (the design's lastCommittedLedger(cat[, probe])). It maxes the cold term, the
+// hot term, and the earliest-1 floor, each computed in the signed domain and
+// mapped through completeThrough so a fresh/young store can never underflow to
+// MaxUint32:
+//
+//   - COLD term — the highest chunk whose artifacts are ALL durable
+//     (highestDurableChunk; -1 on a fresh start). Leads at startup, before
+//     ingestion has created any hot key.
+//   - HOT term — taken only when the hot tier LEADS the cold tier (hot > cold),
+//     which is the design's switch. It counts only "ready" hot keys; a "transient"
+//     key never advances the bound, which is what lets recovery demote any hot
+//     key without inflating it.
+//     · probe == nil: the POSITIONAL term — everything below the live (highest
+//     ready) chunk, completeThrough(hot-1). Pure catalog read.
+//     · probe != nil: ONE read of the highest ready hot DB's MaxCommittedSeq —
+//     sub-chunk precision plus the boundary-crash frontier (a "transient"
+//     live chunk leaves the highest *ready* chunk a just-completed
+//     predecessor whose completion no key advertises). Hot-volume loss is
+//     detected LAZILY on this one open: a ready-but-won't-open / absent-dir
+//     hot DB surfaces as ErrHotVolumeLost. It is safe to open here only
+//     because derivation runs before ingestion takes the live DB's exclusive
+//     lock. (Gating on hot > cold means the cold tier dominates whenever it
+//     leads, so the equivalent positional/refinement value is preserved
+//     exactly while avoiding a needless open.)
+//   - FLOOR term — EarliestLedger()-1, computed as int64(earliest)-1 so an
+//     absent/zero pin yields the pre-genesis sentinel rather than underflowing.
+func lastCommittedLedger(cat *Catalog, probe HotProbe) (uint32, error) {
+	cold, err := highestDurableChunk(cat)
+	if err != nil {
+		return 0, err
+	}
+	through := completeThrough(cold)
+
+	hot, err := highestReadyChunkSigned(cat)
+	if err != nil {
+		return 0, err
+	}
+	if hot > cold {
+		if probe == nil {
+			// Positional term: everything BELOW the live (highest ready) chunk.
+			through = max(through, completeThrough(hot-1))
+		} else {
+			// One refinement read of the highest ready hot DB. Loss is detected
+			// lazily on this open (no eager scan over every ready key).
+			refined, rerr := refineWithHotDB(cat, probe, hot)
+			if rerr != nil {
+				return 0, rerr
+			}
+			through = max(through, refined)
+		}
+	}
+
+	earliest, ok, err := cat.EarliestLedger()
+	if err != nil {
+		return 0, err
+	}
+	if ok {
+		// int64 before the -1 so a zero/genesis pin does not underflow.
+		floor := int64(earliest) - 1
+		if floor < 0 {
+			floor = 0
+		}
+		through = max(through, uint32(floor)) //nolint:gosec // floor >= 0, fits uint32
+	}
+
+	return through, nil
+}
+
+// refineWithHotDB opens the highest ready hot chunk read-only through probe and
+// returns its MaxCommittedSeq (or completeThrough(live-1) when the DB is empty —
+// the positional fallback). Loss is LAZY: a "ready" key whose dir is absent or
+// whose DB won't open surfaces as ErrHotVolumeLost with the surgical-recovery
+// guidance (item 6 — narrowed from the former eager all-ready-keys dir scan; the
+// per-chunk open here is the same loud, actionable fatal).
+func refineWithHotDB(cat *Catalog, probe HotProbe, live int64) (uint32, error) {
+	id := chunk.ID(live) //nolint:gosec // live > cold >= -1, so live >= 0
+	hot, ok, openErr := probe.OpenHotChunk(id)
+	if openErr != nil {
+		return 0, fmt.Errorf("%w: chunk %s is %q but its hot DB won't open (run surgical recovery): %w",
+			ErrHotVolumeLost, id, HotReady, openErr)
+	}
+	if !ok {
+		return 0, fmt.Errorf("%w: chunk %s is %q but its hot dir is missing (run surgical recovery)",
+			ErrHotVolumeLost, id, HotReady)
+	}
+	defer func() { _ = hot.Close() }()
+
+	maxSeq, present, seqErr := hot.MaxCommittedSeq()
+	if seqErr != nil {
+		return 0, fmt.Errorf("%w: chunk %s: max committed seq: %w", ErrHotVolumeLost, id, seqErr)
+	}
+	if present {
+		return maxSeq, nil
+	}
+	// Empty live DB: positional fallback (everything below it).
+	return completeThrough(live - 1), nil
+}
+
+// highestDurableChunk returns the highest chunk id whose artifacts are ALL
+// durable, or -1 when no chunk is fully durable (a fresh start). "All durable"
+// is the pendingArtifacts-empty test: every per-chunk kind (currently just
+// ledgers) frozen. A chunk whose only kind is not yet frozen DEGRADES the bound
+// and backfill repairs it.
+//
+// Returns int64 so the -1 sentinel is representable; lastCommittedLedger feeds
+// it through completeThrough.
+func highestDurableChunk(cat *Catalog) (int64, error) {
+	refs, err := cat.ChunkArtifactKeys()
+	if err != nil {
+		return 0, err
+	}
+
+	// Collect frozen per-kind state per chunk.
+	type kinds struct{ ledgers bool }
+	frozen := map[chunk.ID]*kinds{}
+	for _, ref := range refs {
+		if ref.State != StateFrozen {
+			continue
+		}
+		k := frozen[ref.Chunk]
+		if k == nil {
+			k = &kinds{}
+			frozen[ref.Chunk] = k
+		}
+		if ref.Kind == KindLedgers {
+			k.ledgers = true
+		}
+	}
+
+	highest := int64(-1)
+	for c, k := range frozen {
+		if !k.ledgers {
+			continue
+		}
+		if id := int64(c); id > highest {
+			highest = id
+		}
+	}
+	return highest, nil
+}
+
+// highestReadyChunkSigned returns the highest "ready" hot chunk id as int64, or
+// -1 when there is no ready hot key. The signed return lets completeThrough
+// compute the positional term (max ready - 1) without a uint32 underflow when the
+// live chunk is chunk 0.
+func highestReadyChunkSigned(cat *Catalog) (int64, error) {
+	ready, err := cat.ReadyHotChunkKeys()
+	if err != nil {
+		return 0, err
+	}
+	if len(ready) == 0 {
+		return -1, nil
+	}
+	// ReadyHotChunkKeys is sorted ascending; the last is the highest.
+	return int64(ready[len(ready)-1]), nil
+}
diff --git a/cmd/stellar-rpc/internal/fullhistory/streaming/progress_realdb_test.go b/cmd/stellar-rpc/internal/fullhistory/streaming/progress_realdb_test.go
new file mode 100644
index 000000000..c553aea13
--- /dev/null
+++ b/cmd/stellar-rpc/internal/fullhistory/streaming/progress_realdb_test.go
@@ -0,0 +1,104 @@
+package streaming
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/require"
+
+	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/pkg/chunk"
+	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/pkg/stores/ledger"
+)
+
+// TestDeriveWatermark_RealHotDB_RefinementIsNotStale exercises the watermark
+// refinement against a REAL per-chunk hotchunk DB read through the production
+// rocksHotProbe — the path the fakeHotProbe table tests stub out. It proves the
+// single-DB MaxCommittedSeq refinement reads the actual committed ledger frontier
+// (the ledgers CF's last key) and is not a stale/constant value: the bound rises
+// to exactly the highest seq committed to the live chunk's real DB.
+func TestDeriveWatermark_RealHotDB_RefinementIsNotStale(t *testing.T) {
+	cat, _ := testCatalog(t)
+
+	live := chunk.ID(5)
+	// Production bracket: creates the hot dir, opens the SINGLE shared multi-CF
+	// DB, flips the hot key "ready". This is exactly what ingestion does.
+	db := openLiveHotDB(t, cat, live)
+
+	// Commit two real ledgers into the ledgers CF (the CF MaxCommittedSeq reads).
+	first := live.FirstLedger()
+	committedTop := first + 200
+	require.NoError(t, db.Ledgers().AddLedgers(
+		ledger.Entry{Seq: first, Bytes: []byte("ledger-A")},
+		ledger.Entry{Seq: committedTop, Bytes: []byte("ledger-B")},
+	))
+	// Close the live writer before the probe re-opens read-only (RocksDB LOCK).
+	require.NoError(t, db.Close())
+
+	// Sanity: positional baseline (live chunk 5 ⇒ everything below 5) is chunk 4's
+	// last ledger, strictly below the committed top — so the assertion below can
+	// only pass if the refinement actually read the real DB.
+	baseline := mustDeriveCompleteThrough(t, cat)
+	require.Equal(t, chunk.ID(4).LastLedger(), baseline)
+	require.Greater(t, committedTop, baseline, "fixture must put the real frontier above the baseline")
+
+	probe := NewRocksHotProbe(cat.layout.HotChunkPath, silentLogger())
+	got, err := deriveWatermark(cat, probe)
+	require.NoError(t, err)
+	require.Equal(t, committedTop, got,
+		"watermark must equal the REAL ledgers-CF last key, not the positional baseline")
+}
+
+// TestDeriveWatermark_RealHotDB_OpensHighestReady proves the refinement opens the
+// HIGHEST ready chunk (the live chunk), not just any ready chunk. Two ready chunks
+// have independent real hot DBs with DIFFERENT committed frontiers; the watermark
+// must reflect the higher chunk's DB. The fakeHotProbe table tests CANNOT cover
+// this: fakeHotProbe.OpenHotChunk ignores its chunk-id argument and returns one
+// canned DB, so a "open ready[0] instead of ready[len-1]" regression is invisible
+// to them — only a real per-chunk probe distinguishes the two.
+func TestDeriveWatermark_RealHotDB_OpensHighestReady(t *testing.T) {
+	cat, _ := testCatalog(t)
+
+	lower, higher := chunk.ID(4), chunk.ID(7)
+
+	// Lower ready chunk: a real DB committed near the TOP of chunk 4. If the
+	// refinement wrongly opened the lower chunk, the bound would land here.
+	lowDB := openLiveHotDB(t, cat, lower)
+	lowTop := lower.FirstLedger() + 9000
+	require.NoError(t, lowDB.Ledgers().AddLedgers(ledger.Entry{Seq: lowTop, Bytes: []byte("low")}))
+	require.NoError(t, lowDB.Close())
+
+	// Higher ready chunk (the live chunk): committed mid-chunk 7.
+	highDB := openLiveHotDB(t, cat, higher)
+	highMid := higher.FirstLedger() + 1234
+	require.NoError(t, highDB.Ledgers().AddLedgers(ledger.Entry{Seq: highMid, Bytes: []byte("high")}))
+	require.NoError(t, highDB.Close())
+
+	// The two frontiers must be unambiguous: chunk 7 mid-seq is far above chunk 4's
+	// top, so reading the wrong chunk yields a strictly different (lower) answer.
+	require.Greater(t, highMid, lowTop)
+
+	probe := NewRocksHotProbe(cat.layout.HotChunkPath, silentLogger())
+	got, err := deriveWatermark(cat, probe)
+	require.NoError(t, err)
+	require.Equal(t, highMid, got,
+		"refinement must open the HIGHEST ready chunk (7), reading its committed mid-seq")
+}
+
+// TestDeriveWatermark_RealHotDB_EmptyLiveFallsBack is the count-only-ready case
+// against a real DB: a "ready" live chunk whose real hot DB has NO committed
+// ledger (MaxCommittedSeq ok=false) must fall back to deriveCompleteThrough, not
+// fabricate a frontier. Read through the production probe.
+func TestDeriveWatermark_RealHotDB_EmptyLiveFallsBack(t *testing.T) {
+	cat, _ := testCatalog(t)
+	makeChunkDurable(t, cat, 0) // cold term => chunk 0 last ledger
+
+	live := chunk.ID(3)
+	db := openLiveHotDB(t, cat, live) // ready key + real dir, but NOTHING committed
+	require.NoError(t, db.Close())
+
+	// Real probe reads the empty ledgers CF: ok=false, no refinement.
+	probe := NewRocksHotProbe(cat.layout.HotChunkPath, silentLogger())
+	got, err := deriveWatermark(cat, probe)
+	require.NoError(t, err)
+	require.Equal(t, chunk.ID(2).LastLedger(), got,
+		"empty live DB ⇒ positional baseline (max ready 3 - 1 = chunk 2), no fabricated frontier")
+}
diff --git a/cmd/stellar-rpc/internal/fullhistory/streaming/progress_shim_test.go b/cmd/stellar-rpc/internal/fullhistory/streaming/progress_shim_test.go
new file mode 100644
index 000000000..cca5e7baa
--- /dev/null
+++ b/cmd/stellar-rpc/internal/fullhistory/streaming/progress_shim_test.go
@@ -0,0 +1,18 @@
+package streaming
+
+// Test-only aliases for the consolidated progress derivation (item R2-4). The
+// design folded deriveCompleteThrough + deriveWatermark into ONE
+// lastCommittedLedger(cat[, probe]):
+//
+//   - deriveCompleteThrough(cat)      == lastCommittedLedger(cat, nil)   (chunk
+//     granularity, pure catalog read — the positional term, no hot DB open).
+//   - deriveWatermark(cat, probe)     == lastCommittedLedger(cat, probe) (one
+//     refinement read of the highest ready hot DB, loss detected LAZILY on it).
+//
+// These shims keep the existing tests' intent legible against the old names; the
+// production callers all use lastCommittedLedger directly.
+func deriveCompleteThrough(cat *Catalog) (uint32, error) { return lastCommittedLedger(cat, nil) }
+
+func deriveWatermark(cat *Catalog, probe HotProbe) (uint32, error) {
+	return lastCommittedLedger(cat, probe)
+}
diff --git a/cmd/stellar-rpc/internal/fullhistory/streaming/progress_test.go b/cmd/stellar-rpc/internal/fullhistory/streaming/progress_test.go
new file mode 100644
index 000000000..d3d6f15bf
--- /dev/null
+++ b/cmd/stellar-rpc/internal/fullhistory/streaming/progress_test.go
@@ -0,0 +1,316 @@
+package streaming
+
+import (
+	"errors"
+	"os"
+	"testing"
+
+	"github.com/stretchr/testify/require"
+
+	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/pkg/chunk"
+)
+
+// ---------------------------------------------------------------------------
+// progress derivation test helpers.
+// ---------------------------------------------------------------------------
+
+// makeChunkDurable flips ledgers to frozen for a chunk — the
+// pendingArtifacts-empty state highestDurableChunk counts.
+func makeChunkDurable(t *testing.T, cat *Catalog, c chunk.ID) {
+	t.Helper()
+	freezeKinds(t, cat, c, KindLedgers)
+}
+
+// makeHotDir creates the on-disk hot dir for a chunk, so a "ready" hot key is
+// paired with a dir (loss is detected lazily on open, not by a per-key scan).
+func makeHotDir(t *testing.T, cat *Catalog, c chunk.ID) {
+	t.Helper()
+	require.NoError(t, os.MkdirAll(cat.layout.HotChunkPath(c), 0o755))
+}
+
+// readyHot marks a chunk's hot key "ready" AND creates its dir, the production
+// pairing deriveWatermark expects (a ready key whose dir is missing is loss).
+func readyHot(t *testing.T, cat *Catalog, c chunk.ID) {
+	t.Helper()
+	require.NoError(t, cat.PutHotTransient(c))
+	require.NoError(t, cat.FlipHotReady(c))
+	makeHotDir(t, cat, c)
+}
+
+// ---------------------------------------------------------------------------
+// completeThrough — the sentinel-safe signed->ledger map. Proves the
+// pre-genesis sentinel resolves to FirstLedgerSeq-1 (=1), NOT a uint32 wrap.
+//
+// THE ALIASING TRAP this test exists to catch: a guard-less completeThrough
+// (chunk.ID(uint32(c)).LastLedger() with no `c<0` branch) does NOT fail on the
+// production sentinel -1, because chunk.ID(uint32(-1)=MaxUint32).LastLedger()
+// computes (MaxUint32+1)*LedgersPerChunk+FirstLedgerSeq-1, whose (MaxUint32+1)
+// overflows uint32 to 0 — yielding exactly 1 == preGenesisLedger. So a -1-only
+// test would pass even with the guard removed. Every OTHER negative input wraps
+// to a large, distinct value (e.g. -2 => 4294957297), so the guard is only
+// actually exercised by a negative sentinel that is NOT -1. The -2 and -100
+// rows below are the load-bearing underflow guards; -1 alone is decorative.
+// ---------------------------------------------------------------------------
+
+func TestCompleteThrough(t *testing.T) {
+	tests := []struct {
+		name string
+		in   int64
+		want uint32
+	}{
+		{"pre-genesis sentinel -1 => FirstLedgerSeq-1, not MaxUint32 (ALIASES the wrap; see trap above)", -1, preGenesisLedger},
+		{"sentinel -2 does NOT alias the wrap (guard-less would yield 4294957297)", -2, preGenesisLedger},
+		{"deeply negative still pre-genesis", -100, preGenesisLedger},
+		{"chunk 0 last ledger", 0, chunk.ID(0).LastLedger()},
+		{"chunk 5 last ledger", 5, chunk.ID(5).LastLedger()},
+	}
+	require.Equal(t, uint32(1), preGenesisLedger, "FirstLedgerSeq-1 == 1 (the doc's chunkLastLedger(-1))")
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			require.Equal(t, tc.want, completeThrough(tc.in))
+		})
+	}
+
+	// The aliasing trap, asserted directly so the comment above cannot rot: the
+	// production sentinel -1 wraps to exactly preGenesisLedger (which is why a
+	// -1-only test is blind to a dropped guard), while -2 wraps to a large,
+	// distinct value that the guard must squash. Computed from chunk arithmetic,
+	// not hardcoded, so it tracks LedgersPerChunk/FirstLedgerSeq.
+	guardlessWrap := func(c int64) uint32 {
+		return chunk.ID(uint32(c)).LastLedger() //nolint:gosec // deliberate wrap to model a guard-less impl
+	}
+	require.Equal(t, preGenesisLedger, guardlessWrap(-1),
+		"-1 aliases preGenesisLedger under the wrap — the coincidence this test must not rely on")
+	require.NotEqual(t, preGenesisLedger, guardlessWrap(-2),
+		"-2 must NOT alias — proving the guard (not a coincidence) is what makes completeThrough(-2) safe")
+}
+
+// ---------------------------------------------------------------------------
+// deriveCompleteThrough — chunk-granularity bound, pure catalog read.
+// ---------------------------------------------------------------------------
+
+func TestDeriveCompleteThrough(t *testing.T) {
+	t.Run("fresh store => pre-genesis sentinel, never MaxUint32", func(t *testing.T) {
+		// No durable chunk, no hot key, no earliest pin: every term is -1.
+		// A naive uint32 impl (chunkLastLedger(ID(-1)) / earliest-1) would wrap
+		// to MaxUint32 here; the signed domain must yield FirstLedgerSeq-1.
+		cat, _ := testCatalog(t)
+		got, err := deriveCompleteThrough(cat)
+		require.NoError(t, err)
+		require.Equal(t, preGenesisLedger, got)
+	})
+
+	t.Run("cold term leads: highest fully-durable chunk", func(t *testing.T) {
+		cat, _ := testCatalog(t)
+		makeChunkDurable(t, cat, 0)
+		makeChunkDurable(t, cat, 1)
+		makeChunkDurable(t, cat, 2)
+		got, err := deriveCompleteThrough(cat)
+		require.NoError(t, err)
+		require.Equal(t, chunk.ID(2).LastLedger(), got)
+	})
+
+	t.Run("incompletely-frozen tip degrades the bound (ledgers freezing, not frozen)", func(t *testing.T) {
+		cat, _ := testCatalog(t)
+		makeChunkDurable(t, cat, 0)
+		makeChunkDurable(t, cat, 1)
+		// Chunk 2: ledgers only "freezing" — a mid-freeze crash. It must NOT
+		// count: bound stays at chunk 1.
+		require.NoError(t, cat.MarkChunkFreezing(2, KindLedgers))
+		got, err := deriveCompleteThrough(cat)
+		require.NoError(t, err)
+		require.Equal(t, chunk.ID(1).LastLedger(), got)
+	})
+
+	t.Run("positional term leads in steady state: everything below the live chunk", func(t *testing.T) {
+		cat, _ := testCatalog(t)
+		// No cold artifacts yet (steady state: chunks complete before cold exists).
+		// Ready hot keys 3,4,5 => live chunk is 5 => everything below 5 complete.
+		readyHot(t, cat, 3)
+		readyHot(t, cat, 4)
+		readyHot(t, cat, 5)
+		got, err := deriveCompleteThrough(cat)
+		require.NoError(t, err)
+		require.Equal(t, chunk.ID(4).LastLedger(), got, "max ready (5) - 1 = chunk 4's last ledger")
+	})
+
+	t.Run("transient hot key does NOT advance the positional term", func(t *testing.T) {
+		cat, _ := testCatalog(t)
+		readyHot(t, cat, 3)
+		// A transient key above the highest ready one must be excluded.
+		require.NoError(t, cat.PutHotTransient(9))
+		got, err := deriveCompleteThrough(cat)
+		require.NoError(t, err)
+		require.Equal(t, chunk.ID(2).LastLedger(), got, "max READY (3) - 1, ignoring transient 9")
+	})
+
+	t.Run("live chunk 0 => positional term is pre-genesis, NOT MaxUint32", func(t *testing.T) {
+		// The exact uint32-underflow trap: max ready = 0, so 0-1 must be the
+		// pre-genesis sentinel, not ID(4294967295).LastLedger().
+		cat, _ := testCatalog(t)
+		readyHot(t, cat, 0)
+		got, err := deriveCompleteThrough(cat)
+		require.NoError(t, err)
+		require.Equal(t, preGenesisLedger, got)
+	})
+
+	t.Run("earliest pin floor leads when above cold/positional terms", func(t *testing.T) {
+		cat, _ := testCatalog(t)
+		// Floor pinned mid-chain, no chunks durable, no hot keys.
+		const floor = 50000
+		require.NoError(t, cat.PutEarliestLedger(floor))
+		got, err := deriveCompleteThrough(cat)
+		require.NoError(t, err)
+		require.Equal(t, uint32(floor-1), got)
+	})
+
+	t.Run("earliest pin == genesis (2) does not underflow", func(t *testing.T) {
+		cat, _ := testCatalog(t)
+		require.NoError(t, cat.PutEarliestLedger(chunk.FirstLedgerSeq))
+		got, err := deriveCompleteThrough(cat)
+		require.NoError(t, err)
+		require.Equal(t, preGenesisLedger, got, "earliest 2 - 1 = 1, not MaxUint32")
+	})
+
+	t.Run("max of all three terms", func(t *testing.T) {
+		cat, _ := testCatalog(t)
+		makeChunkDurable(t, cat, 0) // cold => chunk 0 last ledger
+		readyHot(t, cat, 4)         // positional => chunk 3 last ledger (highest)
+		require.NoError(t, cat.PutEarliestLedger(2))
+		got, err := deriveCompleteThrough(cat)
+		require.NoError(t, err)
+		require.Equal(t, chunk.ID(3).LastLedger(), got)
+	})
+}
+
+// ---------------------------------------------------------------------------
+// deriveWatermark — lastCommittedLedger with a probe: ONE refinement read of
+// the highest ready hot DB, with hot-volume loss detected lazily on that open.
+// ---------------------------------------------------------------------------
+
+func TestDeriveWatermark(t *testing.T) {
+	t.Run("no ready hot keys => equals deriveCompleteThrough, no open", func(t *testing.T) {
+		cat, _ := testCatalog(t)
+		makeChunkDurable(t, cat, 0)
+		probe := &fakeHotProbe{} // would error if opened with ok=false under "ready", but none ready
+		got, err := deriveWatermark(cat, probe)
+		require.NoError(t, err)
+		require.Equal(t, chunk.ID(0).LastLedger(), got)
+	})
+
+	t.Run("sub-chunk precision: refinement reads mid-chunk seq inside the live chunk", func(t *testing.T) {
+		cat, _ := testCatalog(t)
+		readyHot(t, cat, 5) // live chunk 5; positional term = chunk 4 last ledger
+		midLive := chunk.ID(5).FirstLedger() + 123
+		probe := &fakeHotProbe{ok: true, chunk: &fakeHotChunk{maxSeq: midLive, present: true}}
+		got, err := deriveWatermark(cat, probe)
+		require.NoError(t, err)
+		require.Equal(t, midLive, got, "refined to the live chunk's committed seq")
+	})
+
+	t.Run("boundary-crash under-count recovered by refinement", func(t *testing.T) {
+		// Live chunk crashed at a boundary and was demoted to "transient": the
+		// highest READY key is the just-completed predecessor (chunk 4), whose
+		// completion no key advertises (positional term = chunk 3). The refinement
+		// opens chunk 4 and reads its full committed seq = chunk 4's last ledger,
+		// recovering the frontier the positional term under-counted.
+		cat, _ := testCatalog(t)
+		readyHot(t, cat, 4)
+		require.NoError(t, cat.PutHotTransient(5)) // the crashed live chunk
+		require.Equal(t, chunk.ID(3).LastLedger(), mustDeriveCompleteThrough(t, cat),
+			"positional term alone under-counts to chunk 3")
+
+		chunk4Last := chunk.ID(4).LastLedger()
+		probe := &fakeHotProbe{ok: true, chunk: &fakeHotChunk{maxSeq: chunk4Last, present: true}}
+		got, err := deriveWatermark(cat, probe)
+		require.NoError(t, err)
+		require.Equal(t, chunk4Last, got, "refinement recovers the chunk-4 frontier")
+	})
+
+	t.Run("count-only-ready: an empty refinement DB falls back to deriveCompleteThrough", func(t *testing.T) {
+		cat, _ := testCatalog(t)
+		makeChunkDurable(t, cat, 0)
+		readyHot(t, cat, 3) // positional => chunk 2 last ledger
+		// DB present but empty (present=false): no refinement, the bound stays positional.
+		probe := &fakeHotProbe{ok: true, chunk: &fakeHotChunk{present: false}}
+		got, err := deriveWatermark(cat, probe)
+		require.NoError(t, err)
+		require.Equal(t, chunk.ID(2).LastLedger(), got)
+	})
+
+	t.Run("refinement only RAISES the bound, never lowers it", func(t *testing.T) {
+		cat, _ := testCatalog(t)
+		makeChunkDurable(t, cat, 0)
+		makeChunkDurable(t, cat, 1)
+		makeChunkDurable(t, cat, 2) // cold term => chunk 2 last ledger
+		readyHot(t, cat, 3)         // positional => chunk 2 last ledger
+		// Live DB reports a seq below the cold bound (e.g. just opened); max wins.
+		probe := &fakeHotProbe{ok: true, chunk: &fakeHotChunk{maxSeq: 5, present: true}}
+		got, err := deriveWatermark(cat, probe)
+		require.NoError(t, err)
+		require.Equal(t, chunk.ID(2).LastLedger(), got)
+	})
+
+	t.Run("LAZY loss (item R2-6): only the highest ready chunk is opened; a lower"+
+		" ready key's missing dir is NOT eagerly flagged", func(t *testing.T) {
+		cat, _ := testCatalog(t)
+		// Two ready keys; the LOWER one's dir is missing. Under the design's lazy
+		// detection (no eager all-ready-keys scan) only the HIGHEST ready chunk is
+		// opened, so the lower key's missing dir is not surfaced here — it surfaces
+		// later, when ingestion/discard reaches that chunk via openHotTierForChunk.
+		require.NoError(t, cat.PutHotTransient(2))
+		require.NoError(t, cat.FlipHotReady(2)) // ready key 2, NO dir (not opened here)
+		readyHot(t, cat, 5)                     // highest ready key 5 WITH dir (opened)
+		probe := &fakeHotProbe{ok: true, chunk: &fakeHotChunk{maxSeq: 10, present: true}}
+		got, err := deriveWatermark(cat, probe)
+		require.NoError(t, err)
+		require.Equal(t, uint32(10), got, "refined to the highest ready chunk's seq")
+	})
+
+	t.Run("fatal: a ready HIGHEST chunk whose dir is missing (lazy loss on open)", func(t *testing.T) {
+		cat, _ := testCatalog(t)
+		// The highest ready chunk's dir is missing: the one open the derivation
+		// performs surfaces the loss as ErrHotVolumeLost with recovery guidance.
+		require.NoError(t, cat.PutHotTransient(5))
+		require.NoError(t, cat.FlipHotReady(5)) // ready key 5, NO dir
+		probe := &fakeHotProbe{ok: false}       // OpenHotChunk reports dir absent
+		_, err := deriveWatermark(cat, probe)
+		require.Error(t, err)
+		require.ErrorIs(t, err, ErrHotVolumeLost)
+		require.Contains(t, err.Error(), "00000005")
+	})
+
+	t.Run("fatal: refinement open error on the highest ready chunk", func(t *testing.T) {
+		cat, _ := testCatalog(t)
+		readyHot(t, cat, 3) // dir present
+		probe := &fakeHotProbe{openErr: errors.New("rocksdb LOCK held")}
+		_, err := deriveWatermark(cat, probe)
+		require.Error(t, err)
+		require.ErrorIs(t, err, ErrHotVolumeLost)
+	})
+
+	t.Run("fatal: refinement read error", func(t *testing.T) {
+		cat, _ := testCatalog(t)
+		readyHot(t, cat, 3)
+		probe := &fakeHotProbe{ok: true, chunk: &fakeHotChunk{maxErr: errors.New("corrupt")}}
+		_, err := deriveWatermark(cat, probe)
+		require.Error(t, err)
+		require.ErrorIs(t, err, ErrHotVolumeLost)
+	})
+
+	t.Run("live chunk 0 ready, empty DB => pre-genesis, no underflow", func(t *testing.T) {
+		cat, _ := testCatalog(t)
+		readyHot(t, cat, 0)
+		probe := &fakeHotProbe{ok: true, chunk: &fakeHotChunk{present: false}}
+		got, err := deriveWatermark(cat, probe)
+		require.NoError(t, err)
+		require.Equal(t, preGenesisLedger, got)
+	})
+}
+
+func mustDeriveCompleteThrough(t *testing.T, cat *Catalog) uint32 {
+	t.Helper()
+	got, err := deriveCompleteThrough(cat)
+	require.NoError(t, err)
+	return got
+}
diff --git a/cmd/stellar-rpc/internal/fullhistory/streaming/resolve.go b/cmd/stellar-rpc/internal/fullhistory/streaming/resolve.go
new file mode 100644
index 000000000..a3aa23905
--- /dev/null
+++ b/cmd/stellar-rpc/internal/fullhistory/streaming/resolve.go
@@ -0,0 +1,96 @@
+package streaming
+
+import (
+	"slices"
+
+	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/pkg/chunk"
+)
+
+// ChunkBuild names one per-chunk freeze pass: the chunk plus the subset of kinds
+// it still needs. One processChunk pass produces all of Artifacts. It is pure
+// data — the executor interprets it (design-docs/full-history-streaming-
+// workflow.md "Postcondition-driven scheduling").
+type ChunkBuild struct {
+	Chunk     chunk.ID
+	Artifacts ArtifactSet
+}
+
+// Plan is the resolver's output: the per-chunk freeze work. It carries no
+// behavior — it can be logged, diffed, and tested without running it, which is
+// what makes "the plan is just a value" literally true.
+type Plan struct {
+	ChunkBuilds []ChunkBuild
+}
+
+// Empty reports whether the plan schedules no work — the steady-state /
+// quiescent case.
+func (p Plan) Empty() bool { return len(p.ChunkBuilds) == 0 }
+
+// resolve computes the diff between the desired state — every artifact derived
+// from every ledger in [rangeStart, rangeEnd] is durable and servable — and the
+// catalog, emitting the difference as a Plan. It is a PURE READ of the Phase A
+// catalog: it touches no file, marks no key, and recomputes from durable keys
+// on every run, so a restart re-plans from what is actually on disk with
+// nothing to reconcile (design-docs "Postcondition-driven scheduling").
+//
+// The kind rule:
+//
+//   - ledgers (per-chunk): chunk c is needed iff chunk:{c}:ledgers is not
+//     "frozen". A "freezing"/"pruning"/absent key re-materializes (idempotent
+//     inside processChunk); a "frozen" key self-skips here.
+//
+// Inverted range (rangeEnd < rangeStart, a network younger than one complete
+// chunk) returns the empty Plan.
+func resolve(cfg ExecConfig, rangeStart, rangeEnd chunk.ID) (Plan, error) {
+	if rangeEnd < rangeStart {
+		return Plan{}, nil // no complete chunk exists yet
+	}
+	cat := cfg.Catalog
+
+	// Per-chunk work, unioned across kinds; one ChunkBuild per chunk regardless
+	// of how many kinds it needs (one processChunk pass produces all).
+	needs := map[chunk.ID]ArtifactSet{}
+
+	// Per-chunk kinds: ledgers.
+	for c := rangeStart; ; c++ {
+		for _, kind := range []Kind{KindLedgers} {
+			state, err := cat.State(c, kind)
+			if err != nil {
+				return Plan{}, err
+			}
+			if state != StateFrozen {
+				needs[c] = needs[c].Add(kind)
+			}
+		}
+		if c == rangeEnd { // inclusive upper bound; guard chunk.ID wraparound
+			break
+		}
+	}
+
+	return Plan{ChunkBuilds: chunkBuildsFrom(needs)}, nil
+}
+
+// chunkBuildsFrom flattens the per-chunk needs map into a ChunkBuild slice,
+// sorted by chunk id so the plan is deterministic (loggable / diffable /
+// testable). Chunks whose set ended up empty (all kinds frozen) are omitted.
+func chunkBuildsFrom(needs map[chunk.ID]ArtifactSet) []ChunkBuild {
+	if len(needs) == 0 {
+		return nil
+	}
+	ids := make([]chunk.ID, 0, len(needs))
+	for c, set := range needs {
+		if set.Empty() {
+			continue
+		}
+		ids = append(ids, c)
+	}
+	if len(ids) == 0 {
+		return nil
+	}
+	slices.Sort(ids)
+	builds := make([]ChunkBuild, len(ids))
+	for i, c := range ids {
+		builds[i] = ChunkBuild{Chunk: c, Artifacts: needs[c]}
+	}
+	return builds
+}
diff --git a/cmd/stellar-rpc/internal/fullhistory/streaming/resolve_test.go b/cmd/stellar-rpc/internal/fullhistory/streaming/resolve_test.go
new file mode 100644
index 000000000..05f7d6a03
--- /dev/null
+++ b/cmd/stellar-rpc/internal/fullhistory/streaming/resolve_test.go
@@ -0,0 +1,119 @@
+package streaming
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/require"
+
+	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/pkg/chunk"
+)
+
+// ---------------------------------------------------------------------------
+// resolve test helpers — set catalog state directly through the Phase A
+// one-write protocol so resolve sees exactly the durable keys production would.
+// ---------------------------------------------------------------------------
+
+// freezeKinds flips the given kinds to "frozen" for chunkID via MarkChunkFreezing
+// then FlipChunkFrozen (no real file content needed — resolve reads keys only).
+func freezeKinds(t *testing.T, cat *Catalog, chunkID chunk.ID, kinds ...Kind) {
+	t.Helper()
+	require.NoError(t, cat.MarkChunkFreezing(chunkID, kinds...))
+	require.NoError(t, cat.FlipChunkFrozen(chunkID, kinds...))
+}
+
+// resolveCfg wires a minimal ExecConfig for resolve tests: the catalog, a silent
+// logger, one worker (resolve never runs a task, so the primitive deps stay nil).
+func resolveCfg(cat *Catalog) ExecConfig {
+	return ExecConfig{Catalog: cat, Logger: silentLogger(), Workers: 1}
+}
+
+// chunkSet collects the plan's ChunkBuild chunk ids, in plan order, for assertions.
+func chunkSet(p Plan) []chunk.ID {
+	out := make([]chunk.ID, len(p.ChunkBuilds))
+	for i, cb := range p.ChunkBuilds {
+		out[i] = cb.Chunk
+	}
+	return out
+}
+
+// findChunkBuild returns the first ChunkBuild whose Chunk equals c, or ok=false.
+func findChunkBuild(p Plan, c chunk.ID) (ChunkBuild, bool) {
+	for _, cb := range p.ChunkBuilds {
+		if cb.Chunk == c {
+			return cb, true
+		}
+	}
+	return ChunkBuild{}, false
+}
+
+// ---------------------------------------------------------------------------
+// Inverted range guard.
+// ---------------------------------------------------------------------------
+
+func TestResolve_InvertedRangeIsEmpty(t *testing.T) {
+	cat, _ := testCatalog(t)
+	plan, err := resolve(resolveCfg(cat), 5, 4) // rangeStart 5 > rangeEnd 4: inverted
+	require.NoError(t, err)
+	require.True(t, plan.Empty(), "rangeEnd < rangeStart must yield an empty plan")
+}
+
+// ---------------------------------------------------------------------------
+// Steady-state restart: a fully-frozen range resolves to nothing.
+// ---------------------------------------------------------------------------
+
+func TestResolve_SteadyStateRestartIsEmpty(t *testing.T) {
+	cat, _ := testCatalog(t)
+
+	// Every chunk in [0,3] has its ledgers frozen — the post-freeze steady state.
+	for c := chunk.ID(0); c <= 3; c++ {
+		freezeKinds(t, cat, c, KindLedgers)
+	}
+
+	plan, err := resolve(resolveCfg(cat), 0, 3) // resolve exactly the frozen range
+	require.NoError(t, err)
+	require.True(t, plan.Empty(),
+		"steady-state restart of fully-frozen chunks must schedule nothing, got %+v", plan)
+}
+
+// ---------------------------------------------------------------------------
+// A range with a partly-frozen middle: only the un-frozen chunks are scheduled,
+// and each scheduled chunk requests the ledgers artifact.
+// ---------------------------------------------------------------------------
+
+func TestResolve_SchedulesOnlyUnfrozenChunks(t *testing.T) {
+	cat, _ := testCatalog(t)
+
+	// Chunks 0,1,5 frozen; 2,3,4 absent (no key is ever written for them).
+	for _, c := range []chunk.ID{0, 1, 5} {
+		freezeKinds(t, cat, c, KindLedgers)
+	}
+
+	plan, err := resolve(resolveCfg(cat), 0, 5)
+	require.NoError(t, err)
+
+	require.Equal(t, []chunk.ID{2, 3, 4}, chunkSet(plan),
+		"only the un-frozen chunks need work; frozen chunks self-skip")
+	for _, c := range []chunk.ID{2, 3, 4} {
+		cb, ok := findChunkBuild(plan, c)
+		require.True(t, ok)
+		require.True(t, cb.Artifacts.Has(KindLedgers), "an un-frozen chunk requests ledgers")
+		require.Equal(t, AllArtifacts(), cb.Artifacts) // the set is exactly AllArtifacts()
+	}
+}
+
+// A "freezing" (not "frozen") key re-materializes: a partial/crashed freeze
+// attempt is re-scheduled by resolve, never trusted as complete.
+func TestResolve_FreezingKeyReMaterializes(t *testing.T) {
+	cat, _ := testCatalog(t)
+
+	// Chunk 1 is mid-freeze: marked "freezing" but never flipped to "frozen".
+	require.NoError(t, cat.MarkChunkFreezing(1, KindLedgers))
+
+	plan, err := resolve(resolveCfg(cat), 1, 1)
+	require.NoError(t, err)
+	require.Equal(t, []chunk.ID{1}, chunkSet(plan),
+		"a freezing (not frozen) key must be re-scheduled")
+	cb, ok := findChunkBuild(plan, 1)
+	require.True(t, ok)
+	require.True(t, cb.Artifacts.Has(KindLedgers))
+}
diff --git a/cmd/stellar-rpc/internal/fullhistory/streaming/retention.go b/cmd/stellar-rpc/internal/fullhistory/streaming/retention.go
new file mode 100644
index 000000000..88d4a109e
--- /dev/null
+++ b/cmd/stellar-rpc/internal/fullhistory/streaming/retention.go
@@ -0,0 +1,102 @@
+package streaming
+
+import (
+	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/pkg/chunk"
+)
+
+// The reader retention contract (design "Reader retention contract",
+// gettransaction §8.5 / §9). It is the single storage-side rule that lets the
+// prune and sweep stages remove a chunk's files the instant it passes the
+// retention floor WITHOUT coordinating with the index lifecycle:
+//
+//	A read for any seq below the effective retention floor is not-found,
+//	regardless of whether the underlying file still exists on disk.
+//
+// A read may land on a .pack that pruning has since deleted, or on one that
+// pruning is about to delete; a below-floor read is not-found either way. From
+// the storage layer's perspective, retention — not the set of files on disk —
+// is the source of truth for "is this data available?", and that is the entire
+// property prune/sweep rely on to unlink unilaterally (sweep.go,
+// eligibility.go).
+//
+// The floor plays two roles with OPPOSITE safe directions, and the system
+// keeps them strictly separate (design "Lifecycle"):
+//
+//   - RETENTION role (this gate, the prune scan): erring LOW is harmless. If
+//     the gate admits a seq an instant after pruning removed its data, the read
+//     still resolves to not-found via the reader's missing-file rule; if it
+//     rejects a seq an instant before pruning reaches it, it merely anticipates
+//     the prune. Either way the answer a reader sees is correct, so this role
+//     anchors on the same live completeThrough the prune scan uses.
+//   - PRODUCTION role (catch-up's plan range, NOT this file): erring low is
+//     DANGEROUS — it would demand chunks from a bulk source nobody validated it
+//     can produce. Production therefore never consults the floor below existing
+//     storage; extending the bottom of storage (retention widening) is
+//     exclusively catch-up's job, where producibility is enforced lazily per
+//     chunk by the cold ingest (no pre-flight gate). This gate is a retention
+//     consumer by construction (a read is harmless to reject), so it uses the
+//     floor directly.
+//
+// retentionFloorFor is the gate's floor: a direct call to
+// effectiveRetentionFloor with the SAME (completeThrough, RetentionChunks,
+// earliest_ledger) the prune and discard scans use, so a read and a concurrent
+// prune agree on where the floor sits within one tick's snapshot. Sliding the
+// floor is therefore atomic from the reader's perspective: shortening retention
+// raises it, and gate and prune scan both see the higher value on the next derivation.
+func retentionFloorFor(through, retentionChunks, earliest uint32) uint32 {
+	return effectiveRetentionFloor(through, retentionChunks, earliest)
+}
+
+// seqWithinRetention reports whether seq is at or above the effective retention
+// floor — the reader retention contract's admit/reject decision for one seq.
+// A false result means the read MUST resolve to not-found no matter what is on
+// disk; that guarantee is what makes it safe for pruning to unlink a chunk's
+// files the moment the chunk passes the floor.
+//
+// The comparison is "seq >= floor", chunk-aligned through effectiveRetentionFloor:
+// the floor is the first ledger of the lowest in-retention chunk, so a seq in a
+// straddling window resolves in-range when it sits in the floor chunk or above
+// and not-found when it sits in a below-floor chunk of the SAME window — the
+// window-straddling case (gettransaction §8.5: a stale .idx whose lo references
+// pruned chunks is tolerated precisely because this gate masks them).
+func seqWithinRetention(seq, through, retentionChunks, earliest uint32) bool {
+	return seq >= retentionFloorFor(through, retentionChunks, earliest)
+}
+
+// RetentionGate is the reader-facing handle the query-routing layer consults
+// before serving any seq: it pins one (completeThrough, RetentionChunks,
+// earliest_ledger) snapshot so every seq a single read examines is judged
+// against one floor. The serving side derives a fresh gate per request (or per
+// coverage refresh) — how it obtains completeThrough is the query-routing
+// design's concern; this type only fixes the contract's arithmetic so the read
+// path and the prune stage cannot drift.
+type RetentionGate struct {
+	floor uint32 // effective retention floor, set once by NewRetentionGate
+}
+
+// NewRetentionGate builds the gate for one snapshot of ingestion progress and
+// retention config, computing the floor once via retentionFloorFor. through is
+// completeThrough; retentionChunks/earliest are the knobs the prune scan reads.
+// A shorter retentionChunks raises the floor at once — no per-chunk state to migrate.
+func NewRetentionGate(through, retentionChunks, earliest uint32) RetentionGate {
+	return RetentionGate{floor: retentionFloorFor(through, retentionChunks, earliest)}
+}
+
+// Floor returns the gate's effective retention floor — the first ledger of the
+// lowest in-retention chunk. Exposed for the reader's coverage filtering (it
+// skips a window's .idx probe when the window is wholly below Floor, the §8.2
+// retention gate) and for tests.
+func (g RetentionGate) Floor() uint32 { return g.floor }
+
+// Admits reports whether a read for seq is within retention (seq >= floor). false
+// ⟹ the read is not-found regardless of on-disk state — the contract pruning relies on.
+func (g RetentionGate) Admits(seq uint32) bool { return seq >= g.floor }
+
+// ChunkBelowFloor reports whether an entire chunk sits below the floor, i.e. its
+// last ledger is below the floor. This is the same predicate the discard and
+// prune scans use (eligibility.go: last < floor), surfaced on the gate so the
+// reader and the lifecycle share one definition of "past retention" rather than
+// each open-coding the comparison.
+func (g RetentionGate) ChunkBelowFloor(c chunk.ID) bool {
+	return c.LastLedger() < g.floor // strict: a chunk ending exactly at the floor is in retention
+}

From 220867a96c02131756f8322d2c1d01d4ac357fe8 Mon Sep 17 00:00:00 2001
From: Simon Chow <simon.chow@stellar.org>
Date: Tue, 23 Jun 2026 17:58:12 -0400
Subject: [PATCH 4/4] docs+style(streaming): scope layer-3 doc.go to
 orchestration; fix lint

doc.go: add Planner (resolve/execute/eligibility), Ingestion (ingest),
Orchestration (progress/lifecycle/retention), and Observability groups;
update the coverage line and 'Later layers' note.

golangci-lint (this layer's own new findings):
- misspell: cancelled -> canceled (comments across ingest/lifecycle +
  their tests)
- modernize: if-guard -> max() in IngestionLag and lastCommittedLedger
- staticcheck QF1008: cfg.ExecConfig.WithDefaults() -> cfg.WithDefaults()
- revive: rename unused refineWithHotDB(cat) parameter to _
- errname: assertErr/errStr -> errSyntheticOp/stringError (test)
- lll: wrap/shorten 4 long lines
- godoclint: Metrics doc starts with the symbol name
- drop two dead //nolint:gosec directives; //nolint:gosec on uint32(floor)
  (now needed after the max() rewrite)
- //nolint for forward-/test-only or inherently-complex symbols:
  runIngestionLoop metrics (unparam), seqWithinRetention (unused),
  Metrics (interfacebloat), runLifecycleTick (gocognit/gocyclo/cyclop/
  funlen), the plan-and-execute guard (nestif)
---
 .../internal/fullhistory/streaming/doc.go     | 25 +++++++++++++-----
 .../internal/fullhistory/streaming/execute.go |  4 ++-
 .../internal/fullhistory/streaming/ingest.go  | 16 +++++++-----
 .../fullhistory/streaming/ingest_test.go      |  8 +++---
 .../fullhistory/streaming/lifecycle.go        | 10 ++++---
 .../fullhistory/streaming/lifecycle_test.go   | 26 ++++++++++---------
 .../fullhistory/streaming/observability.go    | 11 +++-----
 .../internal/fullhistory/streaming/process.go |  6 +++--
 .../fullhistory/streaming/process_test.go     |  3 ++-
 .../fullhistory/streaming/progress.go         |  9 +++----
 .../fullhistory/streaming/progress_test.go    |  4 +--
 .../fullhistory/streaming/retention.go        |  2 ++
 12 files changed, 71 insertions(+), 53 deletions(-)

diff --git a/cmd/stellar-rpc/internal/fullhistory/streaming/doc.go b/cmd/stellar-rpc/internal/fullhistory/streaming/doc.go
index e63d3b565..ec278846c 100644
--- a/cmd/stellar-rpc/internal/fullhistory/streaming/doc.go
+++ b/cmd/stellar-rpc/internal/fullhistory/streaming/doc.go
@@ -4,9 +4,9 @@
 // (fullhistory/pkg/...). It is built ON that layer — the catalog WRAPS
 // metastore.Store rather than reinventing a RocksDB wrapper.
 //
-// This file map covers Slice 1 · Layers 1–2 (foundations + storage). The
-// orchestration and daemon assembly stack on top in later layers (see "Later
-// layers" below).
+// This file map covers Slice 1 · Layers 1–3 (foundations + storage +
+// orchestration). Daemon assembly stacks on top in Layer 4 (see "Later layers"
+// below).
 //
 // # Data model (keys-first)
 //
@@ -43,14 +43,25 @@
 //	                 artifacts from the cheapest source (ready hot DB → frozen
 //	                 local .pack → bulk backend); hotsource exposes the hot tier
 //	                 as a freeze source.
+//	Planner        resolve.go, execute.go, eligibility.go
+//	                 the postcondition resolver (catalog diff → Plan), the
+//	                 bounded-worker executor, and discard/prune eligibility.
+//	Ingestion      ingest.go
+//	                 the live hot-DB ingestion loop (indexed GetLedger, one synced
+//	                 WriteBatch per ledger) and the chunk-boundary handoff.
+//	Orchestration  progress.go, lifecycle.go, retention.go
+//	                 derived progress (the resume point), the lifecycle tick
+//	                 (plan → discard → prune), and retention-floor arithmetic +
+//	                 the reader-retention gate.
+//	Observability  observability.go
+//	                 the metrics sink interface and the signals it emits.
 //	Test seam      hooks.go
 //	                 test-only crash-injection points fired from inside the real
 //	                 protocol/sweep methods (every field nil in production).
 //
 // # Later layers
 //
-// Layer 3 adds the postcondition resolver/executor, the live ingestion loop,
-// and the lifecycle tick (orchestration); Layer 4 adds startStreaming,
-// validateConfig, surgical recovery, and the audit command (daemon assembly).
-// Slices 2 and 3 then weave in the events and tx-hash data types.
+// Layer 4 adds startStreaming, validateConfig, surgical recovery, and the audit
+// command (daemon assembly). Slices 2 and 3 then weave in the events and
+// tx-hash data types.
 package streaming
diff --git a/cmd/stellar-rpc/internal/fullhistory/streaming/execute.go b/cmd/stellar-rpc/internal/fullhistory/streaming/execute.go
index 0259a0c6b..965ea0d47 100644
--- a/cmd/stellar-rpc/internal/fullhistory/streaming/execute.go
+++ b/cmd/stellar-rpc/internal/fullhistory/streaming/execute.go
@@ -83,7 +83,9 @@ func (cfg ExecConfig) validate() error {
 	if cfg.Workers <= 0 {
 		// Loud, not silently corrected: a zero pool deadlocks executePlan, so the
 		// caller's miswiring must surface rather than hang.
-		return fmt.Errorf("streaming: ExecConfig.Workers must be > 0 (got %d) — a zero pool deadlocks executePlan", cfg.Workers)
+		return fmt.Errorf(
+			"streaming: ExecConfig.Workers must be > 0 (got %d) — a zero pool deadlocks executePlan", cfg.Workers,
+		)
 	}
 	return nil
 }
diff --git a/cmd/stellar-rpc/internal/fullhistory/streaming/ingest.go b/cmd/stellar-rpc/internal/fullhistory/streaming/ingest.go
index 902d00bf0..cb36a26b1 100644
--- a/cmd/stellar-rpc/internal/fullhistory/streaming/ingest.go
+++ b/cmd/stellar-rpc/internal/fullhistory/streaming/ingest.go
@@ -28,9 +28,9 @@ import (
 // meta-store key or touch the same per-chunk hot RocksDB instance.
 //
 // CLEAN-SHUTDOWN vs CRASH is decided at the DAEMON TOP LEVEL, not here: the loop
-// returns whatever GetLedger returns (a ctx-cancelled error on a clean shutdown,
+// returns whatever GetLedger returns (a ctx-canceled error on a clean shutdown,
 // any other error on a crash), and superviseStreaming classifies a non-nil
-// return as clean iff ctx was cancelled (see daemon.go). The loop never tries to
+// return as clean iff ctx was canceled (see daemon.go). The loop never tries to
 // tell the two apart itself.
 
 // LedgerGetter is the indexed-poll source the ingestion loop drives: it returns
@@ -82,11 +82,13 @@ func openHotTierForChunk(cat *Catalog, chunkID chunk.ID, logger *supportlog.Entr
 				// daemon's top-level loop owns the fatal-and-surface decision.
 				return nil, fmt.Errorf(
 					"%w: chunk %s is %q but its hot dir %s is missing",
-					ErrHotVolumeLost, chunkID, HotReady, dir)
+					ErrHotVolumeLost, chunkID, HotReady, dir,
+				)
 			}
 			return nil, fmt.Errorf(
 				"%w: chunk %s: stat hot dir %s: %w",
-				ErrHotVolumeLost, chunkID, dir, statErr)
+				ErrHotVolumeLost, chunkID, dir, statErr,
+			)
 		}
 		db, openErr := hotchunk.Open(dir, chunkID, logger)
 		if openErr != nil {
@@ -171,7 +173,7 @@ func discardHotTierForChunk(cat *Catalog, chunkID chunk.ID) error {
 // boundary hands the live-chunk frontier forward by closing the just-filled DB
 // and opening the next chunk's. It returns the error GetLedger or a boundary
 // step produced (nil never, since the poll is unbounded) — the daemon top level
-// classifies it: a ctx-cancelled return is a clean shutdown, any other error is
+// classifies it: a ctx-canceled return is a clean shutdown, any other error is
 // RESTARTABLE (the supervisor restarts; startup re-derives the watermark from
 // the last synced batch, losing nothing).
 //
@@ -194,7 +196,7 @@ func runIngestionLoop(
 	lifecycleCh chan<- chunk.ID,
 	ingestTypes hotchunk.Ingest,
 	logger *supportlog.Entry,
-	metrics Metrics,
+	metrics Metrics, //nolint:unparam // non-nil in production (startStreaming, Layer 4) and in observability_test
 ) (err error) {
 	metrics = metricsOrNop(metrics)
 
@@ -235,7 +237,7 @@ func runIngestionLoop(
 	}
 
 	// Indexed poll from the resume ledger. GetLedger blocks until ledger seq is
-	// available; a returned error (ctx-cancelled or otherwise) ends the loop and
+	// available; a returned error (ctx-canceled or otherwise) ends the loop and
 	// the daemon top level classifies it.
 	for seq := resume; ; seq++ {
 		lcm, gerr := core.GetLedger(ctx, seq)
diff --git a/cmd/stellar-rpc/internal/fullhistory/streaming/ingest_test.go b/cmd/stellar-rpc/internal/fullhistory/streaming/ingest_test.go
index 81c281c28..d91baa1bf 100644
--- a/cmd/stellar-rpc/internal/fullhistory/streaming/ingest_test.go
+++ b/cmd/stellar-rpc/internal/fullhistory/streaming/ingest_test.go
@@ -30,7 +30,7 @@ func ledgerEntry(t *testing.T, seq uint32) ledger.Entry {
 // fakeLedgerGetter — an injectable LedgerGetter the ingestion loop polls by
 // sequence (the design's indexed core.GetLedger(ctx, seq)). For seqs it has a
 // programmed frame it returns those bytes; once the poll runs past the last
-// programmed seq it either blocks until ctx is cancelled (a live tip stream that
+// programmed seq it either blocks until ctx is canceled (a live tip stream that
 // only ends on shutdown) or returns endErr (a crashed backend). It records the
 // FIRST seq it was asked for (the restart resume point) and the GetLedger call
 // count.
@@ -337,12 +337,12 @@ func TestRunIngestionLoop_BoundaryNotifiesCompletedChunk(t *testing.T) {
 
 // ---------------------------------------------------------------------------
 // runIngestionLoop — clean shutdown vs crash (classified at the daemon top
-// level: ctx-cancelled return is clean, any other error is restartable).
+// level: ctx-canceled return is clean, any other error is restartable).
 // ---------------------------------------------------------------------------
 
 // TestRunIngestionLoop_CtxCancelReturnsCtxErr: a ctx cancellation while the poll
 // is blocking on the tip makes GetLedger return ctx.Err(); the loop returns that
-// (the daemon top level classifies a ctx-cancelled return as a clean shutdown).
+// (the daemon top level classifies a ctx-canceled return as a clean shutdown).
 func TestRunIngestionLoop_CtxCancelReturnsCtxErr(t *testing.T) {
 	cat, _ := testCatalog(t)
 	c := chunk.ID(0)
@@ -367,7 +367,7 @@ func TestRunIngestionLoop_CtxCancelReturnsCtxErr(t *testing.T) {
 	select {
 	case err := <-done:
 		require.Error(t, err)
-		require.ErrorIs(t, err, context.Canceled, "the loop surfaces the ctx-cancelled GetLedger error")
+		require.ErrorIs(t, err, context.Canceled, "the loop surfaces the ctx-canceled GetLedger error")
 	case <-time.After(10 * time.Second):
 		t.Fatal("ingestion loop did not stop on ctx cancellation")
 	}
diff --git a/cmd/stellar-rpc/internal/fullhistory/streaming/lifecycle.go b/cmd/stellar-rpc/internal/fullhistory/streaming/lifecycle.go
index 1fefc9ba2..32fbc3184 100644
--- a/cmd/stellar-rpc/internal/fullhistory/streaming/lifecycle.go
+++ b/cmd/stellar-rpc/internal/fullhistory/streaming/lifecycle.go
@@ -65,7 +65,7 @@ type LifecycleConfig struct {
 // Fatalf defaulted to log.Fatalf when unset. The daemon calls this once at
 // startup before launching the loop.
 func (cfg LifecycleConfig) WithLifecycleDefaults() LifecycleConfig {
-	cfg.ExecConfig = cfg.ExecConfig.WithDefaults()
+	cfg.ExecConfig = cfg.WithDefaults()
 	if cfg.Fatalf == nil {
 		cfg.Fatalf = log.Fatalf
 	}
@@ -187,9 +187,11 @@ func lowestMaterializedChunk(cat *Catalog) (chunk.ID, bool, error) {
 // catalog, not from lastChunk.
 //
 // CLEAN-SHUTDOWN (binding): if executePlan returns an error AND ctx was
-// cancelled, the tick returns WITHOUT calling Fatalf — cancellation is a
+// canceled, the tick returns WITHOUT calling Fatalf — cancellation is a
 // shutdown request, never an op failure. Only a genuine failure (ctx still
 // live) aborts the daemon via Fatalf, per the error policy.
+//
+//nolint:gocognit,gocyclo,cyclop,funlen // an inherently multi-stage sequence (plan → discard → prune)
 func runLifecycleTick(ctx context.Context, cfg LifecycleConfig, cat *Catalog, lastChunk chunk.ID) {
 	metrics := cfg.metrics()
 	logger := cfg.Logger
@@ -261,7 +263,7 @@ func runLifecycleTick(ctx context.Context, cfg LifecycleConfig, cat *Catalog, la
 	if haveComplete && highestComplete < rangeEnd {
 		rangeEnd = highestComplete
 	}
-	if haveComplete && start >= 0 && start <= int64(rangeEnd) {
+	if haveComplete && start >= 0 && start <= int64(rangeEnd) { //nolint:nestif // plan-and-execute guard
 		plan, perr := resolve(cfg.ExecConfig, chunk.ID(start), rangeEnd) //nolint:gosec // start >= 0
 		if perr != nil {
 			if ctx.Err() != nil {
@@ -272,7 +274,7 @@ func runLifecycleTick(ctx context.Context, cfg LifecycleConfig, cat *Catalog, la
 		}
 		chunkBuilds = len(plan.ChunkBuilds)
 		if eerr := executePlan(ctx, plan, cfg.ExecConfig); eerr != nil {
-			// CLEAN-SHUTDOWN FIX: a cancelled ctx makes executePlan return ctx.Err()
+			// CLEAN-SHUTDOWN FIX: a canceled ctx makes executePlan return ctx.Err()
 			// (every task's slot-acquire/wait observes the errgroup cancel). That is
 			// a shutdown, NOT an op failure — return before any Fatalf.
 			if ctx.Err() != nil {
diff --git a/cmd/stellar-rpc/internal/fullhistory/streaming/lifecycle_test.go b/cmd/stellar-rpc/internal/fullhistory/streaming/lifecycle_test.go
index f7782db52..6160ecfce 100644
--- a/cmd/stellar-rpc/internal/fullhistory/streaming/lifecycle_test.go
+++ b/cmd/stellar-rpc/internal/fullhistory/streaming/lifecycle_test.go
@@ -186,7 +186,9 @@ func (r *fatalRecorder) fired() bool { return r.count.Load() > 0 }
 //
 // Then re-running the tick is a no-op (quiescence).
 func TestRunLifecycleTick_BoundaryFreezesDiscards(t *testing.T) {
-	t.Parallel()             // full-chunk ingest; isolated TempDir/catalog — overlap with the other heavy tests to fit the gate's go-test timeout
+	// full-chunk ingest; isolated TempDir/catalog — overlaps the other heavy
+	// tests to fit the gate's go-test timeout.
+	t.Parallel()
 	cat, _ := testCatalog(t) // a chunk finalizes immediately
 	cfg, rec := lifecycleTestConfig(t, cat, 0)
 
@@ -322,13 +324,13 @@ func TestRunLifecycleTick_PrunesTransientChunkDebris(t *testing.T) {
 }
 
 // ---------------------------------------------------------------------------
-// CLEAN SHUTDOWN: a ctx cancelled mid-tick returns WITHOUT fatal.
+// CLEAN SHUTDOWN: a ctx canceled mid-tick returns WITHOUT fatal.
 // ---------------------------------------------------------------------------
 
 // TestRunLifecycleTick_CleanShutdownNoFatal: when executePlan returns because
-// ctx was cancelled, the tick must NOT call Fatalf — cancellation is a shutdown,
+// ctx was canceled, the tick must NOT call Fatalf — cancellation is a shutdown,
 // never an op failure. The plan stage's work is real (a backend-only chunk that
-// the cancelled ctx aborts), so executePlan genuinely returns an error here.
+// the canceled ctx aborts), so executePlan genuinely returns an error here.
 func TestRunLifecycleTick_CleanShutdownNoFatal(t *testing.T) {
 	cat, _ := testCatalog(t)
 	rec := &fatalRecorder{}
@@ -376,7 +378,7 @@ func TestRunLifecycleTick_CleanShutdownNoFatal(t *testing.T) {
 	case <-time.After(5 * time.Second):
 		t.Fatal("the tick did not return after ctx cancellation")
 	}
-	require.False(t, rec.fired(), "a cancelled ctx is a clean shutdown, NOT an op failure — no Fatalf")
+	require.False(t, rec.fired(), "a canceled ctx is a clean shutdown, NOT an op failure — no Fatalf")
 }
 
 // TestRunLifecycleTick_GenuineFailureAborts: when a plan op fails for a real
@@ -394,7 +396,7 @@ func TestRunLifecycleTick_GenuineFailureAborts(t *testing.T) {
 			Logger:  silentLogger(),
 			Workers: 1,
 			runChunk: func(context.Context, ChunkBuild, ExecConfig) error {
-				return assertErr // a genuine, non-cancellation failure
+				return errSyntheticOp // a genuine, non-cancellation failure
 			},
 		},
 		Fatalf: rec.fatalf,
@@ -488,7 +490,7 @@ func TestLifecycleLoop_DrainsToMostRecent(t *testing.T) {
 	}
 }
 
-// TestLifecycleLoop_ReturnsImmediatelyOnAlreadyCancelledCtx: an already-cancelled
+// TestLifecycleLoop_ReturnsImmediatelyOnAlreadyCancelledCtx: an already-canceled
 // ctx makes the loop return without running any tick (never blocks on the
 // channel forever).
 func TestLifecycleLoop_ReturnsImmediatelyOnAlreadyCancelledCtx(t *testing.T) {
@@ -507,7 +509,7 @@ func TestLifecycleLoop_ReturnsImmediatelyOnAlreadyCancelledCtx(t *testing.T) {
 	select {
 	case <-done:
 	case <-time.After(5 * time.Second):
-		t.Fatal("the loop blocked instead of observing the cancelled ctx")
+		t.Fatal("the loop blocked instead of observing the canceled ctx")
 	}
 }
 
@@ -531,12 +533,12 @@ func runTickForCatalog(ctx context.Context, t *testing.T, cfg LifecycleConfig, c
 	runLifecycleTick(ctx, cfg, cat, last)
 }
 
-// assertErr is a fixed non-cancellation error for the genuine-failure path.
-var assertErr = errStr("streaming: synthetic op failure")
+// errSyntheticOp is a fixed non-cancellation error for the genuine-failure path.
+var errSyntheticOp = stringError("streaming: synthetic op failure")
 
-type errStr string
+type stringError string
 
-func (e errStr) Error() string { return string(e) }
+func (e stringError) Error() string { return string(e) }
 
 // makeReadyHotDirNoData opens and closes a real (empty) hot DB for c so its dir
 // exists on disk and its key is "ready" — the state a discard scan inspects
diff --git a/cmd/stellar-rpc/internal/fullhistory/streaming/observability.go b/cmd/stellar-rpc/internal/fullhistory/streaming/observability.go
index c03df6aba..22f8550c3 100644
--- a/cmd/stellar-rpc/internal/fullhistory/streaming/observability.go
+++ b/cmd/stellar-rpc/internal/fullhistory/streaming/observability.go
@@ -9,7 +9,7 @@ import (
 	"github.com/prometheus/client_golang/prometheus"
 )
 
-// Observability for the streaming daemon's own control plane — distinct from the
+// Metrics is the streaming daemon's own control-plane sink — distinct from the
 // per-data-type ingest metrics (ingest.MetricSink / ingest.PrometheusSink), which
 // time the cold/hot ingesters themselves. THIS sink times and counts the daemon's
 // PHASES: the ingestion loop's chunk-boundary handoffs, catch-up backfill passes,
@@ -26,7 +26,7 @@ import (
 //
 // All methods MUST be safe for concurrent use: the ingestion loop, the lifecycle
 // goroutine, and (during catch-up) the worker pool all report concurrently.
-type Metrics interface {
+type Metrics interface { //nolint:interfacebloat // one cohesive control-plane sink for the daemon's phases
 	// --- gauges (absolute, last-write-wins) ---
 
 	// IngestionLag sets the lag in ledgers: networkTip - lastCommitted. This is a
@@ -196,7 +196,7 @@ func NewPrometheusMetrics(registry *prometheus.Registry, namespace string) *Prom
 
 	m := &PrometheusMetrics{
 		ingestionLag:      gauge("ingestion_lag_ledgers", "catch-up only: network tip minus last committed ledger"),
-		lastCommitted:     gauge("last_committed_ledger", "highest ledger the ingestion loop has durably synced (per-ledger liveness)"),
+		lastCommitted:     gauge("last_committed_ledger", "highest ledger the ingestion loop has durably synced"),
 		watermark:         gauge("watermark_ledger", "derived watermark — highest durably committed ledger"),
 		retentionFloor:    gauge("retention_floor_ledger", "effective retention floor — lowest in-window ledger"),
 		catchupBackfilled: gauge("catchup_backfilled_ledger", "last ledger catch-up has backfilled through"),
@@ -234,10 +234,7 @@ func NewPrometheusMetrics(registry *prometheus.Registry, namespace string) *Prom
 
 func (m *PrometheusMetrics) IngestionLag(networkTip, lastCommitted uint32) {
 	// Signed lag: a lagging bulk tip below the watermark yields 0, not a wrap.
-	lag := int64(networkTip) - int64(lastCommitted)
-	if lag < 0 {
-		lag = 0
-	}
+	lag := max(int64(networkTip)-int64(lastCommitted), 0)
 	m.ingestionLag.Set(float64(lag))
 }
 
diff --git a/cmd/stellar-rpc/internal/fullhistory/streaming/process.go b/cmd/stellar-rpc/internal/fullhistory/streaming/process.go
index 7bb83f7a8..e1bdfffd3 100644
--- a/cmd/stellar-rpc/internal/fullhistory/streaming/process.go
+++ b/cmd/stellar-rpc/internal/fullhistory/streaming/process.go
@@ -263,13 +263,15 @@ func backfillSource(
 		// (frozen ⇒ file exists); surface it rather than silently downloading.
 		return nil, noClose, fmt.Errorf(
 			"streaming: chunk %s ledgers is %q but pack file is missing at %s",
-			chunkID, StateFrozen, cat.layout.LedgerPackPath(chunkID))
+			chunkID, StateFrozen, cat.layout.LedgerPackPath(chunkID),
+		)
 	}
 
 	// (3) Bulk backend — the only source for a chunk with no local copy.
 	if cfg.Backend == nil {
 		return nil, noClose, fmt.Errorf(
-			"streaming: chunk %s has no local copy and no bulk backend is configured", chunkID)
+			"streaming: chunk %s has no local copy and no bulk backend is configured", chunkID,
+		)
 	}
 	if cfg.BackendWaiter != nil {
 		if werr := cfg.BackendWaiter.WaitForCoverage(ctx, chunkID.LastLedger()); werr != nil {
diff --git a/cmd/stellar-rpc/internal/fullhistory/streaming/process_test.go b/cmd/stellar-rpc/internal/fullhistory/streaming/process_test.go
index e07fb009f..caf84ff53 100644
--- a/cmd/stellar-rpc/internal/fullhistory/streaming/process_test.go
+++ b/cmd/stellar-rpc/internal/fullhistory/streaming/process_test.go
@@ -470,7 +470,8 @@ func writeRealPack(t *testing.T, cat *Catalog, chunkID chunk.ID) {
 	dirs := ingest.ColdDirs{Ledgers: cat.layout.LedgersRoot()}
 	require.NoError(t, ingest.RunColdChunk(
 		context.Background(), silentLogger(), src, dirs, chunkID,
-		ingest.NopSink{}, ingest.Config{Ledgers: true}))
+		ingest.NopSink{}, ingest.Config{Ledgers: true},
+	))
 	require.FileExists(t, cat.layout.LedgerPackPath(chunkID))
 }
 
diff --git a/cmd/stellar-rpc/internal/fullhistory/streaming/progress.go b/cmd/stellar-rpc/internal/fullhistory/streaming/progress.go
index 7aea790cf..8f21f4677 100644
--- a/cmd/stellar-rpc/internal/fullhistory/streaming/progress.go
+++ b/cmd/stellar-rpc/internal/fullhistory/streaming/progress.go
@@ -114,11 +114,8 @@ func lastCommittedLedger(cat *Catalog, probe HotProbe) (uint32, error) {
 	}
 	if ok {
 		// int64 before the -1 so a zero/genesis pin does not underflow.
-		floor := int64(earliest) - 1
-		if floor < 0 {
-			floor = 0
-		}
-		through = max(through, uint32(floor)) //nolint:gosec // floor >= 0, fits uint32
+		floor := max(int64(earliest)-1, 0)
+		through = max(through, uint32(floor)) //nolint:gosec // floor = max(.., 0) >= 0, fits uint32
 	}
 
 	return through, nil
@@ -130,7 +127,7 @@ func lastCommittedLedger(cat *Catalog, probe HotProbe) (uint32, error) {
 // whose DB won't open surfaces as ErrHotVolumeLost with the surgical-recovery
 // guidance (item 6 — narrowed from the former eager all-ready-keys dir scan; the
 // per-chunk open here is the same loud, actionable fatal).
-func refineWithHotDB(cat *Catalog, probe HotProbe, live int64) (uint32, error) {
+func refineWithHotDB(_ *Catalog, probe HotProbe, live int64) (uint32, error) {
 	id := chunk.ID(live) //nolint:gosec // live > cold >= -1, so live >= 0
 	hot, ok, openErr := probe.OpenHotChunk(id)
 	if openErr != nil {
diff --git a/cmd/stellar-rpc/internal/fullhistory/streaming/progress_test.go b/cmd/stellar-rpc/internal/fullhistory/streaming/progress_test.go
index d3d6f15bf..7f540a790 100644
--- a/cmd/stellar-rpc/internal/fullhistory/streaming/progress_test.go
+++ b/cmd/stellar-rpc/internal/fullhistory/streaming/progress_test.go
@@ -58,7 +58,7 @@ func TestCompleteThrough(t *testing.T) {
 		in   int64
 		want uint32
 	}{
-		{"pre-genesis sentinel -1 => FirstLedgerSeq-1, not MaxUint32 (ALIASES the wrap; see trap above)", -1, preGenesisLedger},
+		{"pre-genesis sentinel -1 => FirstLedgerSeq-1, not MaxUint32 (aliases the wrap)", -1, preGenesisLedger},
 		{"sentinel -2 does NOT alias the wrap (guard-less would yield 4294957297)", -2, preGenesisLedger},
 		{"deeply negative still pre-genesis", -100, preGenesisLedger},
 		{"chunk 0 last ledger", 0, chunk.ID(0).LastLedger()},
@@ -77,7 +77,7 @@ func TestCompleteThrough(t *testing.T) {
 	// distinct value that the guard must squash. Computed from chunk arithmetic,
 	// not hardcoded, so it tracks LedgersPerChunk/FirstLedgerSeq.
 	guardlessWrap := func(c int64) uint32 {
-		return chunk.ID(uint32(c)).LastLedger() //nolint:gosec // deliberate wrap to model a guard-less impl
+		return chunk.ID(uint32(c)).LastLedger() // deliberate wrap to model a guard-less impl
 	}
 	require.Equal(t, preGenesisLedger, guardlessWrap(-1),
 		"-1 aliases preGenesisLedger under the wrap — the coincidence this test must not rely on")
diff --git a/cmd/stellar-rpc/internal/fullhistory/streaming/retention.go b/cmd/stellar-rpc/internal/fullhistory/streaming/retention.go
index 88d4a109e..8d9b07af0 100644
--- a/cmd/stellar-rpc/internal/fullhistory/streaming/retention.go
+++ b/cmd/stellar-rpc/internal/fullhistory/streaming/retention.go
@@ -59,6 +59,8 @@ func retentionFloorFor(through, retentionChunks, earliest uint32) uint32 {
 // and not-found when it sits in a below-floor chunk of the SAME window — the
 // window-straddling case (gettransaction §8.5: a stale .idx whose lo references
 // pruned chunks is tolerated precisely because this gate masks them).
+//
+//nolint:unused // exercised by retention_test in a later layer
 func seqWithinRetention(seq, through, retentionChunks, earliest uint32) bool {
 	return seq >= retentionFloorFor(through, retentionChunks, earliest)
 }