Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 9 additions & 3 deletions cloud/src/meta-service/meta_service_job.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -162,9 +162,15 @@ void start_compaction_job(MetaServiceCode& code, std::string& msg, std::stringst
return;
}
}

if (compaction.base_compaction_cnt() < stats.base_compaction_cnt() ||
compaction.cumulative_compaction_cnt() < stats.cumulative_compaction_cnt()) {
// STOP_TOKEN is a lock marker used by schema change to block concurrent compactions during
// delete bitmap recalculation on MOW tables. It does not perform actual compaction, so the
// stale tablet cache check (which guards against compacting on outdated rowset metadata) is
// not meaningful for it and must be skipped to avoid spurious failures when the BE's cached
// compaction counts lag behind the meta-service due to a concurrent compaction completing
// on another BE node (see CORE-5964).
if (compaction.type() != TabletCompactionJobPB::STOP_TOKEN &&
(compaction.base_compaction_cnt() < stats.base_compaction_cnt() ||
compaction.cumulative_compaction_cnt() < stats.cumulative_compaction_cnt())) {
code = MetaServiceCode::STALE_TABLET_CACHE;
SS << "could not perform compaction on expired tablet cache."
<< " req_base_compaction_cnt=" << compaction.base_compaction_cnt()
Expand Down
65 changes: 65 additions & 0 deletions cloud/test/meta_service_job_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1645,6 +1645,71 @@ void check_job_key(MetaServiceProxy* meta_service, std::string instance_id, int6
}
}

// Regression test for CORE-5964: STOP_TOKEN should not be rejected by the stale tablet
// cache check even when the BE's cached compaction counts lag behind the meta-service.
// STOP_TOKEN is a lock marker used by schema change (MOW table) to block concurrent
// compactions during delete bitmap recalculation -- it does not perform actual compaction
// work, so verifying compaction count freshness is meaningless for it.
TEST(MetaServiceJobTest, StopTokenSkipsStaleTabletCacheCheck) {
auto meta_service = get_meta_service();

auto sp = SyncPoint::get_instance();
DORIS_CLOUD_DEFER {
SyncPoint::get_instance()->clear_all_call_backs();
};
sp->set_call_back("get_instance_id", [&](auto&& args) {
auto* ret = try_any_cast_ret<std::string>(args);
ret->first = instance_id;
ret->second = true;
});
sp->enable_processing();

int64_t table_id = 1, index_id = 2, partition_id = 3, tablet_id = 101;

// Set up tablet index
auto index_key = meta_tablet_idx_key({instance_id, tablet_id});
TabletIndexPB idx_pb;
idx_pb.set_table_id(table_id);
idx_pb.set_index_id(index_id);
idx_pb.set_partition_id(partition_id);
idx_pb.set_tablet_id(tablet_id);
std::unique_ptr<Transaction> txn;
ASSERT_EQ(meta_service->txn_kv()->create_txn(&txn), TxnErrorCode::TXN_OK);
txn->put(index_key, idx_pb.SerializeAsString());

// Simulate meta-service state where cumulative_compaction_cnt=9 (advanced by another BE)
std::string stats_key =
stats_tablet_key({instance_id, table_id, index_id, partition_id, tablet_id});
TabletStatsPB stats;
stats.set_base_compaction_cnt(0);
stats.set_cumulative_compaction_cnt(9);
txn->put(stats_key, stats.SerializeAsString());
ASSERT_EQ(txn->commit(), TxnErrorCode::TXN_OK);

// A regular CUMULATIVE compaction with stale counts (req=8 < actual=9) must be rejected.
{
StartTabletJobResponse res;
start_compaction_job(meta_service.get(), tablet_id, "cumu_job", "ip:port",
/*base_cnt=*/0, /*cumu_cnt=*/8, TabletCompactionJobPB::CUMULATIVE,
res);
ASSERT_EQ(res.status().code(), MetaServiceCode::STALE_TABLET_CACHE)
<< "CUMULATIVE with stale counts should be rejected";
}

// A STOP_TOKEN with the same stale counts must NOT be rejected (CORE-5964 regression).
// The BE's cached cumulative_compaction_cnt=8 lags behind the actual value=9 on the
// meta-service side, but STOP_TOKEN registration must still succeed.
{
StartTabletJobResponse res;
start_compaction_job(meta_service.get(), tablet_id, "stop_token_job", "ip:port",
/*base_cnt=*/0, /*cumu_cnt=*/8, TabletCompactionJobPB::STOP_TOKEN,
res);
ASSERT_EQ(res.status().code(), MetaServiceCode::OK)
<< "STOP_TOKEN with stale counts should NOT be rejected; got: "
<< res.status().msg();
}
}

TEST(MetaServiceJobTest, DeleteBitmapUpdateLockCompatibilityTest) {
auto meta_service = get_meta_service();
auto sp = SyncPoint::get_instance();
Expand Down
Loading