From d756a038099da82b64dd0d269ddd3d6c8ec5c0e7 Mon Sep 17 00:00:00 2001 From: Sergey Vojtovich Date: Thu, 16 Apr 2026 23:22:07 +0400 Subject: [PATCH] MDEV-21423 - lock-free trx_sys gets performance regression caused by lf_find and ut_delay Under high concurrency, MVCC snapshot creation may spend a significant amount of time in lf_hash_iterate()/lfind() while collecting active read-write transaction identifiers. This overhead is particularly visible in sysbench oltp_read_write with transaction-isolation=READ-COMMITTED. Iteration cost becomes high due to significant TLB thrashing and poor memory locality in this hot code path because snapshot creation touches many rw_trx_hash nodes distributed across memory, including dummy nodes that are irrelevant for snapshot construction. In addition, traversing LF_HASH requires issuing heavyweight memory barriers. This is a performance regression after 53cc9aa5be6, which changed MVCC snapshot creation to scan LF_HASH instead of maintaining a global sorted vector protected by the global mutex. Add trx_sys.rw_trx_ids, a compact traversal-friendly vector of active read-write transaction identifiers and serialization numbers optimized for MVCC snapshot creation, while rw_trx_hash remains responsible for transaction lookup. The vector may contain empty slots corresponding to idle or read-only transactions that currently do not own a read-write transaction identifier. Such slots are skipped by snapshot creation. This reduces traversal overhead during MVCC snapshot creation by improving memory locality, reducing TLB pressure, and avoiding repeated memory barriers required for rw_trx_hash traversal. 
--- storage/innobase/include/trx0purge.h | 6 +- storage/innobase/include/trx0sys.h | 218 ++++++++++++++++++--------- storage/innobase/include/trx0trx.h | 2 + storage/innobase/include/ut0new.h | 1 + storage/innobase/read/read0read.cc | 2 +- storage/innobase/trx/trx0purge.cc | 10 +- storage/innobase/trx/trx0sys.cc | 2 + storage/innobase/trx/trx0trx.cc | 10 +- storage/innobase/ut/ut0new.cc | 2 + 9 files changed, 172 insertions(+), 81 deletions(-) diff --git a/storage/innobase/include/trx0purge.h b/storage/innobase/include/trx0purge.h index 8027cde5917c9..1b35722eed1ab 100644 --- a/storage/innobase/include/trx0purge.h +++ b/storage/innobase/include/trx0purge.h @@ -37,9 +37,11 @@ Created 3/26/1996 Heikki Tuuri Remove the undo log segment from the rseg slot if it is too big for reuse. @param[in] trx transaction @param[in,out] undo undo log -@param[in,out] mtr mini-transaction */ +@param[in,out] mtr mini-transaction +@param[in] end transaction serialisation number */ void -trx_purge_add_undo_to_history(const trx_t* trx, trx_undo_t*& undo, mtr_t* mtr); +trx_purge_add_undo_to_history(const trx_t* trx, trx_undo_t*& undo, mtr_t* mtr, + trx_id_t end); /** Remove unnecessary history data from rollback segments. NOTE that when this diff --git a/storage/innobase/include/trx0sys.h b/storage/innobase/include/trx0sys.h index cebaacb4e7b6a..3126275947ba4 100644 --- a/storage/innobase/include/trx0sys.h +++ b/storage/innobase/include/trx0sys.h @@ -340,14 +340,6 @@ struct rw_trx_hash_element_t trx_id_t id; /* lf_hash_init() relies on this to be first in the struct */ - - /** - Transaction serialization number. - - Assigned shortly before the transaction is moved to COMMITTED_IN_MEMORY - state. Initially set to TRX_ID_MAX. 
- */ - Atomic_counter no; trx_t *trx; srw_mutex mutex; }; @@ -443,7 +435,6 @@ class rw_trx_hash_t ut_ad(element->trx == 0); element->trx= trx; element->id= trx->id; - element->no= TRX_ID_MAX; trx->rw_trx_hash_element= element; } @@ -512,7 +503,6 @@ class rw_trx_hash_t if (element->trx) validate_element(element->trx); element->mutex.wr_unlock(); - ut_ad(element->id < element->no); return arg->action(element, arg->argument); } #endif @@ -849,6 +839,119 @@ class thread_safe_trx_ilist_t alignas(CPU_LEVEL1_DCACHE_LINESIZE) ilist trx_list; }; +/** + Active read-write transaction identifiers and serialisation numbers container. + + Unlike rw_trx_hash_t, which is optimized for direct lookup, this + structure is optimized for compact storage and traversal of active + transactions by MVCC read view construction. + + The vector may contain empty slots corresponding to idle or read-only + transactions that currently do not own an active read-write trx_id. + Such slots are skipped during traversal. +*/ +class rw_trx_vector +{ + struct rw_trx_id + { + Atomic_relaxed id{TRX_ID_MAX}; + Atomic_relaxed no{TRX_ID_MAX}; + trx_t *trx; + rw_trx_id(trx_t *t): trx(t) {} + }; + alignas(CPU_LEVEL1_DCACHE_LINESIZE) + std::vector> + ids{ut_allocator(mem_key_trx_sys_t_rw_trx_ids)}; + alignas(CPU_LEVEL1_DCACHE_LINESIZE) mutable srw_spin_lock_low latch; + +public: + void assign_new_trx_no(const trx_t *trx, trx_id_t no) noexcept + { + latch.rd_lock(); + ut_ad(trx->rw_trx_ids_slot < ids.size()); + ut_ad(ids[trx->rw_trx_ids_slot].trx == trx); + ut_ad(ids[trx->rw_trx_ids_slot].id == trx->id); + ut_ad(ids[trx->rw_trx_ids_slot].no == TRX_ID_MAX); + ids[trx->rw_trx_ids_slot].no= no; + latch.rd_unlock(); + } + trx_id_t snapshot_ids(trx_ids_t &view_ids, + const trx_id_t max_trx_id) const noexcept + { + trx_id_t min_trx_no{max_trx_id}; + view_ids.clear(); + latch.rd_lock(); + view_ids.reserve(ids.size()); + for (const auto &it : ids) + { + trx_id_t id{it.id}; + if (id < max_trx_id) + { + 
view_ids.push_back(id); + const trx_id_t no{it.no}; + if (no < min_trx_no) + min_trx_no= no; + } + } + latch.rd_unlock(); + return min_trx_no; + } + void register_rw(const trx_t *trx) noexcept + { + latch.rd_lock(); + ut_ad(trx->rw_trx_ids_slot < ids.size()); + ut_ad(ids[trx->rw_trx_ids_slot].trx == trx); + ut_ad(ids[trx->rw_trx_ids_slot].id == TRX_ID_MAX); + ut_ad(ids[trx->rw_trx_ids_slot].no == TRX_ID_MAX); + ids[trx->rw_trx_ids_slot].id= trx->id; + latch.rd_unlock(); + } + void deregister_rw(const trx_t *trx) noexcept + { + latch.rd_lock(); + ut_ad(trx->rw_trx_ids_slot < ids.size()); + rw_trx_id &slot= ids[trx->rw_trx_ids_slot]; + ut_ad(slot.trx == trx); + ut_ad(slot.id == trx->id); + slot.id= TRX_ID_MAX; + slot.no= TRX_ID_MAX; + latch.rd_unlock(); + } + void register_trx(trx_t *trx) noexcept + { + ut_ad(trx->rw_trx_ids_slot == std::numeric_limits::max()); + latch.wr_lock(); + trx->rw_trx_ids_slot= static_cast(ids.size()); + ids.emplace_back(trx); + latch.wr_unlock(); + } + void deregister_trx(trx_t *trx) noexcept + { + latch.wr_lock(); + ut_ad(trx->rw_trx_ids_slot < ids.size()); + ut_ad(ids[trx->rw_trx_ids_slot].trx == trx); + if (trx->rw_trx_ids_slot + 1 < ids.size()) + { + trx_t *move_trx= ids.back().trx; + ids[trx->rw_trx_ids_slot]= std::move(ids.back()); + move_trx->rw_trx_ids_slot= trx->rw_trx_ids_slot; + } + ids.pop_back(); + latch.wr_unlock(); + trx->rw_trx_ids_slot= std::numeric_limits::max(); + } + void create() noexcept + { + ut_ad(ids.size() == 0); + latch.init(); + } + void destroy() noexcept + { + ut_ad(ids.size() == 0); + latch.destroy(); + } +}; + /** The transaction system central memory data structure. */ class trx_sys_t { @@ -876,6 +979,15 @@ class trx_sys_t /** False if there is no undo log to purge or rollback */ bool undo_log_nonempty; public: + /** + Collection of active read-write transaction identifiers and serialization + numbers used for MVCC snapshot creation. 
+ + This complements rw_trx_hash with a traversal-friendly representation + optimized for collecting active transaction ids. + */ + rw_trx_vector rw_trx_ids; + /** List of all transactions. */ thread_safe_trx_ilist_t trx_list; @@ -1014,7 +1126,7 @@ class trx_sys_t next call to trx_sys.get_new_trx_id() */ - trx_id_t get_max_trx_id() + trx_id_t get_max_trx_id() const noexcept { return m_max_trx_id; } @@ -1037,7 +1149,7 @@ class trx_sys_t Allocates and assigns new transaction serialisation number. There's a gap between m_max_trx_id increment and transaction serialisation - number becoming visible through rw_trx_hash. While we're in this gap + number becoming visible through rw_trx_ids. While we're in this gap concurrent thread may come and do MVCC snapshot without seeing allocated but not yet assigned serialisation number. Then at some point purge thread may clone this view. As a result it won't see newly allocated serialisation @@ -1047,58 +1159,44 @@ class trx_sys_t m_rw_trx_hash_version is intended to solve this problem. MVCC snapshot has to wait until m_max_trx_id == m_rw_trx_hash_version, which effectively means that all transaction serialisation numbers up to m_max_trx_id are - available through rw_trx_hash. + available through rw_trx_ids. We rely on refresh_rw_trx_hash_version() to issue RELEASE memory barrier so - that m_rw_trx_hash_version increment happens after - trx->rw_trx_hash_element->no becomes visible through rw_trx_hash. + that m_rw_trx_hash_version increment happens after transaction serialisation + number becomes visible through rw_trx_ids. @param trx transaction */ - void assign_new_trx_no(trx_t *trx) + trx_id_t assign_new_trx_no(trx_t *trx) { - trx->rw_trx_hash_element->no= get_new_trx_id_no_refresh(); + trx_id_t no= get_new_trx_id_no_refresh(); + rw_trx_ids.assign_new_trx_no(trx, no); refresh_rw_trx_hash_version(); + return no; } /** Takes MVCC snapshot. - To reduce malloc probability we reserve rw_trx_hash.size() + 32 elements - in ids. 
- For details about get_rw_trx_hash_version() != get_max_trx_id() spin @sa register_rw() and @sa assign_new_trx_no(). We rely on get_rw_trx_hash_version() to issue ACQUIRE memory barrier so - that loading of m_rw_trx_hash_version happens before accessing rw_trx_hash. - - To optimise snapshot creation rw_trx_hash.iterate() is being used instead - of rw_trx_hash.iterate_no_dups(). It means that some transaction - identifiers may appear multiple times in ids. + that loading of m_rw_trx_hash_version happens before accessing rw_trx_ids. - @param[in,out] caller_trx used to get access to rw_trx_hash_pins @param[out] ids array to store registered transaction identifiers @param[out] max_trx_id variable to store m_max_trx_id value - @param[out] mix_trx_no variable to store min(no) value + + @return min(no) */ - void snapshot_ids(trx_t *caller_trx, trx_ids_t *ids, trx_id_t *max_trx_id, - trx_id_t *min_trx_no) + trx_id_t snapshot_ids(trx_ids_t &ids, trx_id_t &max_trx_id) const noexcept { - snapshot_ids_arg arg(ids); - - while ((arg.m_id= get_rw_trx_hash_version()) != get_max_trx_id()) + while ((max_trx_id= get_rw_trx_hash_version()) != get_max_trx_id()) ut_delay(1); - arg.m_no= arg.m_id; - - ids->clear(); - ids->reserve(rw_trx_hash.size() + 32); - rw_trx_hash.iterate(caller_trx, copy_one_id, &arg); - *max_trx_id= arg.m_id; - *min_trx_no= arg.m_no; + return rw_trx_ids.snapshot_ids(ids, max_trx_id); } @@ -1149,7 +1247,7 @@ class trx_sys_t Transaction becomes visible to MVCC. There's a gap between m_max_trx_id increment and transaction becoming - visible through rw_trx_hash. While we're in this gap concurrent thread may + visible through rw_trx_ids. While we're in this gap concurrent thread may come and do MVCC snapshot. As a result concurrent read view will be able to observe records owned by this transaction even before it was committed. 
@@ -1166,20 +1264,23 @@ class trx_sys_t void register_rw(trx_t *trx) { trx->id= get_new_trx_id_no_refresh(); - rw_trx_hash.insert(trx); + rw_trx_ids.register_rw(trx); refresh_rw_trx_hash_version(); + rw_trx_hash.insert(trx); } /** Deregisters read-write transaction. - Transaction is removed from rw_trx_hash, which releases all implicit locks. - MVCC snapshot won't see this transaction anymore. + After this call the transaction is no longer visible as active to MVCC read + views created subsequently, and all implicit locks held by the transaction + have been released. */ - void deregister_rw(trx_t *trx) + void deregister_rw(trx_t *trx) noexcept { + rw_trx_ids.deregister_rw(trx); rw_trx_hash.erase(trx); } @@ -1204,6 +1305,7 @@ class trx_sys_t void register_trx(trx_t *trx) { trx_list.push_front(*trx); + rw_trx_ids.register_trx(trx); } @@ -1214,6 +1316,7 @@ class trx_sys_t */ void deregister_trx(trx_t *trx) { + rw_trx_ids.deregister_trx(trx); trx_list.remove(*trx); } @@ -1266,33 +1369,8 @@ class trx_sys_t private: static my_bool find_same_or_older_callback(void *el, void *i) noexcept; - - struct snapshot_ids_arg - { - snapshot_ids_arg(trx_ids_t *ids): m_ids(ids) {} - trx_ids_t *m_ids; - trx_id_t m_id; - trx_id_t m_no; - }; - - - static my_bool copy_one_id(void* el, void *a) - { - auto element= static_cast(el); - auto arg= static_cast(a); - if (element->id < arg->m_id) - { - trx_id_t no= element->no; - arg->m_ids->push_back(element->id); - if (no < arg->m_no) - arg->m_no= no; - } - return 0; - } - - /** Getter for m_rw_trx_hash_version, must issue ACQUIRE memory barrier. 
*/ - trx_id_t get_rw_trx_hash_version() + trx_id_t get_rw_trx_hash_version() const noexcept { return m_rw_trx_hash_version.load(std::memory_order_acquire); } diff --git a/storage/innobase/include/trx0trx.h b/storage/innobase/include/trx0trx.h index 26fe376ec9618..b15ad1c57cb58 100644 --- a/storage/innobase/include/trx0trx.h +++ b/storage/innobase/include/trx0trx.h @@ -629,6 +629,8 @@ struct trx_t : ilist_node<> public: + /** trx_sys.rw_trx_ids index, protected by trx_sys.rw_trx_ids.latch */ + uint32_t rw_trx_ids_slot; /** Transaction identifier (0 if no locks were acquired). Set by trx_sys_t::register_rw() or trx_resurrect() before the transaction is added to trx_sys.rw_trx_hash. diff --git a/storage/innobase/include/ut0new.h b/storage/innobase/include/ut0new.h index bcc129601d11f..b8ea79d3c2ea4 100644 --- a/storage/innobase/include/ut0new.h +++ b/storage/innobase/include/ut0new.h @@ -174,6 +174,7 @@ extern PSI_memory_key mem_key_other; extern PSI_memory_key mem_key_row_log_buf; extern PSI_memory_key mem_key_row_merge_sort; extern PSI_memory_key mem_key_std; +extern PSI_memory_key mem_key_trx_sys_t_rw_trx_ids; /** Setup the internal objects needed for UT_NEW() to operate. This must be called before the first call to UT_NEW(). 
*/ diff --git a/storage/innobase/read/read0read.cc b/storage/innobase/read/read0read.cc index e522980e8391e..17a1bbec0491b 100644 --- a/storage/innobase/read/read0read.cc +++ b/storage/innobase/read/read0read.cc @@ -172,7 +172,7 @@ For details see: row_undo_mod_sec_is_unsafe() and row_purge_poss_sec() */ inline void ReadViewBase::snapshot(trx_t *trx) { - trx_sys.snapshot_ids(trx, &m_ids, &m_low_limit_id, &m_low_limit_no); + m_low_limit_no= trx_sys.snapshot_ids(m_ids, m_low_limit_id); if (m_ids.empty()) { m_up_limit_id= m_low_limit_id; diff --git a/storage/innobase/trx/trx0purge.cc b/storage/innobase/trx/trx0purge.cc index 9ba4a8f3e190f..3b453e05fb589 100644 --- a/storage/innobase/trx/trx0purge.cc +++ b/storage/innobase/trx/trx0purge.cc @@ -151,12 +151,14 @@ bool purge_sys_t::is_purgeable(trx_id_t trx_id) const noexcept Remove the undo log segment from the rseg slot if it is too big for reuse. @param[in] trx transaction @param[in,out] undo undo log -@param[in,out] mtr mini-transaction */ +@param[in,out] mtr mini-transaction +@param[in] end transaction serialisation number */ void -trx_purge_add_undo_to_history(const trx_t* trx, trx_undo_t*& undo, mtr_t* mtr) +trx_purge_add_undo_to_history(const trx_t* trx, trx_undo_t*& undo, mtr_t* mtr, + trx_id_t end) { DBUG_PRINT("trx", ("commit(" TRX_ID_FMT "," TRX_ID_FMT ")", - trx->id, trx_id_t{trx->rw_trx_hash_element->no})); + trx->id, end)); ut_ad(undo->id < TRX_RSEG_N_SLOTS); ut_ad(undo == trx->rsegs.m_redo.undo); trx_rseg_t *rseg= trx->rsegs.m_redo.rseg; @@ -257,7 +259,7 @@ trx_purge_add_undo_to_history(const trx_t* trx, trx_undo_t*& undo, mtr_t* mtr) mtr->write<2>(*undo_page, TRX_UNDO_SEG_HDR + TRX_UNDO_STATE + undo_page->page.frame, undo_state); mtr->write<8,mtr_t::MAYBE_NOP>(*undo_page, undo_header + TRX_UNDO_TRX_NO, - trx->rw_trx_hash_element->no); + end); } /** Free an undo log segment. 
diff --git a/storage/innobase/trx/trx0sys.cc b/storage/innobase/trx/trx0sys.cc index 2f2265a3df1fd..57e463d25057a 100644 --- a/storage/innobase/trx/trx0sys.cc +++ b/storage/innobase/trx/trx0sys.cc @@ -168,6 +168,7 @@ void trx_sys_t::create() m_initialised= true; trx_list.create(); rw_trx_hash.init(); + rw_trx_ids.create(); for (auto &rseg : temp_rsegs) rseg.init(nullptr, FIL_NULL); for (auto &rseg : rseg_array) @@ -361,6 +362,7 @@ trx_sys_t::close() } rw_trx_hash.destroy(); + rw_trx_ids.destroy(); /* There can't be any active transactions. */ for (auto& rseg : temp_rsegs) rseg.destroy(); diff --git a/storage/innobase/trx/trx0trx.cc b/storage/innobase/trx/trx0trx.cc index 640352a0aacc3..06bd8a561b567 100644 --- a/storage/innobase/trx/trx0trx.cc +++ b/storage/innobase/trx/trx0trx.cc @@ -188,6 +188,7 @@ struct TrxFactory { new(&trx->read_view) ReadView(); trx->rw_trx_hash_pins = 0; + trx->rw_trx_ids_slot = std::numeric_limits::max(); trx_init(trx); trx->dict_operation_lock_mode = false; @@ -708,6 +709,7 @@ static dberr_t trx_resurrect(trx_undo_t *undo, trx_rseg_t *rseg, trx_sys.rw_trx_hash.insert(trx); trx_sys.rw_trx_hash.put_pins(trx); + trx_sys.rw_trx_ids.register_rw(trx); if (trx_state_eq(trx, TRX_STATE_ACTIVE)) *rows_to_undo+= trx->undo_no; return trx_resurrect_table_locks(trx, *undo); @@ -1147,6 +1149,7 @@ inline void trx_t::write_serialisation_history(mtr_t *mtr) binlog_oob_context *binlog_ctx= nullptr; if (UNIV_LIKELY(undo != nullptr)) { + trx_id_t end; MONITOR_INC(MONITOR_TRX_COMMIT_UNDO); /* We have to hold exclusive rseg->latch because undo log headers have @@ -1173,8 +1176,7 @@ inline void trx_t::write_serialisation_history(mtr_t *mtr) thread can also fetch redo log records from rseg with greater last commit number before rseg with lesser one. 
*/ purge_sys.queue_lock(); - trx_sys.assign_new_trx_no(this); - const trx_id_t end{rw_trx_hash_element->no}; + end= trx_sys.assign_new_trx_no(this); rseg->last_page_no= undo->hdr_page_no; /* end cannot be less than anything in rseg. User threads only produce events when a rollback segment is empty. */ @@ -1183,7 +1185,7 @@ inline void trx_t::write_serialisation_history(mtr_t *mtr) purge_sys.queue_unlock(); } else - trx_sys.assign_new_trx_no(this); + end= trx_sys.assign_new_trx_no(this); /* Include binlog data in the commit record, if any. */ if (active_commit_ordered) @@ -1193,7 +1195,7 @@ inline void trx_t::write_serialisation_history(mtr_t *mtr) /* Change the undo log segment state from TRX_UNDO_ACTIVE, to define the transaction as committed in the file based domain, at mtr->commit_lsn() obtained in mtr->commit() below. */ - trx_purge_add_undo_to_history(this, undo, mtr); + trx_purge_add_undo_to_history(this, undo, mtr, end); done: rseg->release(); rseg->latch.wr_unlock(); diff --git a/storage/innobase/ut/ut0new.cc b/storage/innobase/ut/ut0new.cc index ad575016aeca7..f5b1ae7b59b99 100644 --- a/storage/innobase/ut/ut0new.cc +++ b/storage/innobase/ut/ut0new.cc @@ -47,6 +47,7 @@ PSI_memory_key mem_key_other; PSI_memory_key mem_key_row_log_buf; PSI_memory_key mem_key_row_merge_sort; PSI_memory_key mem_key_std; +PSI_memory_key mem_key_trx_sys_t_rw_trx_ids; #ifdef UNIV_PFS_MEMORY @@ -75,6 +76,7 @@ static PSI_memory_info pfs_info[] = { {&mem_key_row_log_buf, "row_log_buf", 0}, {&mem_key_row_merge_sort, "row_merge_sort", 0}, {&mem_key_std, "std", 0}, + {&mem_key_trx_sys_t_rw_trx_ids, "trx_sys_t::rw_trx_ids", 0}, }; static const int NKEYS = static_castUT_ARR_SIZE(auto_event_names)-1;