diff --git a/build.gradle b/build.gradle index 7e79551..d966a29 100644 --- a/build.gradle +++ b/build.gradle @@ -1,16 +1,16 @@ -plugins { - id 'java' - id 'application' - id 'com.google.protobuf' version '0.9.5' - id 'io.freefair.lombok' version '8.13.1' - id 'com.gradleup.shadow' version '8.3.6' - id 'me.champeau.jmh' version '0.7.3' -} - +plugins { + id 'java' + id 'application' + id 'com.google.protobuf' version '0.9.5' + id 'io.freefair.lombok' version '8.13.1' + id 'com.gradleup.shadow' version '8.3.6' + id 'me.champeau.jmh' version '0.7.3' +} + group = 'io.ringbroker' version = '0.0.1-BETA' - -/* ---------- JVM ---------- */ + +/* ---------- JVM ---------- */ java { toolchain { languageVersion = JavaLanguageVersion.of(21) @@ -24,6 +24,19 @@ tasks.withType(JavaCompile).configureEach { options.encoding = 'UTF-8' } +/* ---------- Toolchain launcher (ensures JavaExec/Test run on JDK 21, not Gradle daemon JDK) ---------- */ +def jdk21Launcher = javaToolchains.launcherFor { + languageVersion = JavaLanguageVersion.of(21) +} + +tasks.withType(JavaExec).configureEach { + javaLauncher = jdk21Launcher +} + +tasks.withType(Test).configureEach { + javaLauncher = jdk21Launcher +} + /* ---------- Repos & Versions ---------- */ repositories { mavenCentral() } @@ -32,11 +45,11 @@ ext { protobufVersion = '3.25.7' jacksonVersion = '2.19.0' picocliVersion = '4.7.7' - nettyVersion = '4.2.1.Final' + nettyVersion = '4.2.1.Final' jmhVersion = '1.37' slf4jVersion = '2.0.17' jupiterVersion = '5.12.2' - junitPlatformVersion = '1.12.2' // Added matching platform version + junitPlatformVersion = '1.12.2' annotationVersion = '1.3.2' testcontainersVersion = '1.20.3' } @@ -46,25 +59,25 @@ dependencies { // gRPC and Protobuf implementation "io.grpc:grpc-netty-shaded:$grpcVersion" implementation "io.grpc:grpc-stub:$grpcVersion" - implementation "io.grpc:grpc-protobuf:$grpcVersion" - implementation "com.google.protobuf:protobuf-java:$protobufVersion" - implementation 
"io.netty:netty-all:$nettyVersion" - - // Lombok (enabled via plugin) - compileOnly "org.projectlombok:lombok" - annotationProcessor "org.projectlombok:lombok" - - // Jackson YAML for config parsing - implementation "com.fasterxml.jackson.core:jackson-databind:$jacksonVersion" - implementation "com.fasterxml.jackson.dataformat:jackson-dataformat-yaml:$jacksonVersion" - - // Picocli CLI support - implementation "info.picocli:picocli:$picocliVersion" - annotationProcessor "info.picocli:picocli-codegen:$picocliVersion" - - // SLF4J API (you can choose a backend like logback) - implementation "org.slf4j:slf4j-api:$slf4jVersion" - runtimeOnly "org.slf4j:slf4j-simple:$slf4jVersion" + implementation "io.grpc:grpc-protobuf:$grpcVersion" + implementation "com.google.protobuf:protobuf-java:$protobufVersion" + implementation "io.netty:netty-all:$nettyVersion" + + // Lombok (enabled via plugin) + compileOnly "org.projectlombok:lombok" + annotationProcessor "org.projectlombok:lombok" + + // Jackson YAML for config parsing + implementation "com.fasterxml.jackson.core:jackson-databind:$jacksonVersion" + implementation "com.fasterxml.jackson.dataformat:jackson-dataformat-yaml:$jacksonVersion" + + // Picocli CLI support + implementation "info.picocli:picocli:$picocliVersion" + annotationProcessor "info.picocli:picocli-codegen:$picocliVersion" + + // SLF4J API + simple backend + implementation "org.slf4j:slf4j-api:$slf4jVersion" + runtimeOnly "org.slf4j:slf4j-simple:$slf4jVersion" // JUnit 5 testImplementation "org.junit.jupiter:junit-jupiter:$jupiterVersion" @@ -75,25 +88,25 @@ dependencies { compileOnly "javax.annotation:javax.annotation-api:$annotationVersion" - // JMH dependencies - jmh "org.openjdk.jmh:jmh-core:${jmhVersion}" - jmh "org.openjdk.jmh:jmh-generator-annprocess:${jmhVersion}" -} - -/* ---------- Protobuf / gRPC code‑gen ---------- */ -protobuf { - protoc { artifact = "com.google.protobuf:protoc:${protobufVersion}" } - plugins { - grpc { artifact = 
"io.grpc:protoc-gen-grpc-java:${grpcVersion}" } - } - generateProtoTasks { - all().each { task -> - task.plugins { grpc {} } - } - } -} - -/* ---------- Application entry‑point ---------- */ + // ---- JMH (FIXED): ensure BenchmarkList is generated ---- + jmhImplementation "org.openjdk.jmh:jmh-core:${jmhVersion}" + jmhAnnotationProcessor "org.openjdk.jmh:jmh-generator-annprocess:${jmhVersion}" +} + +/* ---------- Protobuf / gRPC code-gen ---------- */ +protobuf { + protoc { artifact = "com.google.protobuf:protoc:${protobufVersion}" } + plugins { + grpc { artifact = "io.grpc:protoc-gen-grpc-java:${grpcVersion}" } + } + generateProtoTasks { + all().each { task -> + task.plugins { grpc {} } + } + } +} + +/* ---------- Application entry-point ---------- */ application { mainClass = 'io.ringbroker.Application' } @@ -105,8 +118,8 @@ test { // Fat jar is required for the Testcontainers-based cluster integration test. dependsOn tasks.named('shadowJar') } - -/* ---------- Jar manifest ---------- */ + +/* ---------- Jar manifest ---------- */ jar { manifest { attributes( @@ -116,21 +129,20 @@ jar { ) } } - -/* ---------- JMH Configuration ---------- */ + +/* ---------- JMH Configuration ---------- */ jmh { - includes = ['.*Benchmark.*'] // Include classes with "Benchmark" in their name - resultFormat = 'JSON' // Output format for results + includes = ['.*Benchmark.*'] + resultFormat = 'JSON' resultsFile = project.file("${project.buildDir}/reports/jmh/results.json") - timeOnIteration = '1s' // Time per iteration - warmupIterations = 2 // Number of warmup iterations - iterations = 5 // Number of measurement iterations - fork = 2 // Number of forks - failOnError = true // Fail build on errors during benchmarking - forceGC = true // Force GC between iterations - jvmArgsAppend = ['--enable-preview'] // Add any JVM args needed for your project - - // Allow quick overrides from the command line (e.g. -PjmhInclude=Foo -PjmhIterations=1). 
+ timeOnIteration = '1s' + warmupIterations = 2 + iterations = 5 + fork = 2 + failOnError = true + forceGC = true + jvmArgsAppend = ['--enable-preview'] + if (project.hasProperty('jmhInclude')) { includes = [project.property('jmhInclude')] } @@ -147,3 +159,27 @@ jmh { jvmArgsAppend += ['-Djmh.ignoreLock=true'] } } + +/* ---------- Custom Benchmarker runner ---------- */ +tasks.register('benchmarkProfile', JavaExec) { + group = 'benchmark' + description = 'Runs the custom JMH Benchmarker main (Windows: JFR via Benchmarker; non-Windows: async-profiler).' + + dependsOn tasks.named('jmhClasses') + + // IMPORTANT: include both JMH + MAIN outputs & deps so forks can load project classes + classpath = files( + sourceSets.jmh.runtimeClasspath, + sourceSets.main.runtimeClasspath + ) + + mainClass = 'io.ringbroker.benchmark.Benchmarker' + + jvmArgs '--enable-preview', '-XX:+UnlockDiagnosticVMOptions', '-XX:+DebugNonSafepoints' + + if (project.hasProperty('asyncLibPath')) systemProperty 'ringbroker.async.libPath', project.property('asyncLibPath') + if (project.hasProperty('asyncDir')) systemProperty 'ringbroker.async.dir', project.property('asyncDir') + if (project.hasProperty('asyncEvent')) systemProperty 'ringbroker.async.event', project.property('asyncEvent') + if (project.hasProperty('asyncOutput')) systemProperty 'ringbroker.async.output', project.property('asyncOutput') + if (project.hasProperty('profileDir')) systemProperty 'ringbroker.profile.dir', project.property('profileDir') +} \ No newline at end of file diff --git a/src/jmh/java/io/ringbroker/benchmark/Benchmarker.java b/src/jmh/java/io/ringbroker/benchmark/Benchmarker.java index b4bdb27..ca381b0 100644 --- a/src/jmh/java/io/ringbroker/benchmark/Benchmarker.java +++ b/src/jmh/java/io/ringbroker/benchmark/Benchmarker.java @@ -5,6 +5,11 @@ import org.openjdk.jmh.runner.options.Options; import org.openjdk.jmh.runner.options.OptionsBuilder; +import java.io.IOException; +import java.nio.file.Files; +import 
java.nio.file.Path; +import java.nio.file.Paths; + /** * Main benchmark suite runner for RingBroker performance testing. * This class serves as the entry point for running all benchmarks. @@ -12,12 +17,81 @@ public class Benchmarker { public static void main(final String[] args) throws RunnerException { - final Options opt = new OptionsBuilder() - .include("io.ringbroker.benchmark.*Benchmark") - .exclude(Benchmarker.class.getSimpleName()) - .exclude(RawTcpClient.class.getSimpleName()) - .build(); + final boolean isWindows = System.getProperty("os.name", "").toLowerCase().contains("win"); + + // Where all profiler artifacts go (JFR or async-profiler outputs). + final Path outDir = Paths.get(System.getProperty( + "ringbroker.profile.dir", + System.getProperty("ringbroker.async.dir", "build/reports/jmh/profile") + )).toAbsolutePath().normalize(); + ensureDirectory(outDir); + + final OptionsBuilder builder = new OptionsBuilder(); + builder.include("io.ringbroker.benchmark.*Benchmark"); + builder.exclude(Benchmarker.class.getSimpleName()); + builder.exclude(RawTcpClient.class.getSimpleName()); + + // Ensure forks also get these (NOT just the JavaExec runner JVM) + builder.jvmArgsAppend( + "--enable-preview", + "-XX:+UnlockDiagnosticVMOptions", + "-XX:+DebugNonSafepoints" + ); + + // Pick profiler backend: + // - Windows: use JFR (built-in, no native DLL needed) + // - Others: use async-profiler (as you had) + if (isWindows) { + final String jfrSettings = System.getProperty("ringbroker.jfr.settings", "profile"); + final int stackDepth = Integer.getInteger("ringbroker.jfr.stackdepth", 256); + + // JFR supports %p (pid) and %t (timestamp) filename expansion. 
+        final String jfrFile = outDir.resolve("ringbroker-%p-%t.jfr").toString(); +        builder.jvmArgsAppend( +                "-XX:StartFlightRecording=filename=" + jfrFile + ",settings=" + jfrSettings, +                "-XX:FlightRecorderOptions=stackdepth=" + stackDepth +        ); +    } else { +        final String asyncLibPath = firstNonBlank( +                System.getProperty("ringbroker.async.libPath"), +                System.getenv("ASYNC_PROFILER_LIB"), +                "/opt/async-profiler/lib/libasyncProfiler.so" +        ); +        // Important: JMH async profiler options are separated by ';' not ','. +        final String asyncProfilerOptions = String.join(";", +                "libPath=" + asyncLibPath, +                "event=" + System.getProperty("ringbroker.async.event", "cpu"), +                "output=" + System.getProperty("ringbroker.async.output", "flamegraph"), +                "dir=" + outDir +        ); +        builder.addProfiler("async", asyncProfilerOptions); +    } +    if (Boolean.getBoolean("ringbroker.profile.gc")) { +        builder.addProfiler("gc"); +    } +    final Options opt = builder.build(); new Runner(opt).run(); } -} + +    private static String firstNonBlank(final String... 
values) { +        for (final String value : values) { +            if (value != null && !value.isBlank()) { +                return value; +            } +        } +        throw new IllegalStateException("No non-blank value provided"); +    } + +    private static void ensureDirectory(final Path path) { +        try { +            Files.createDirectories(path); +        } catch (final IOException e) { +            throw new IllegalStateException("Failed to create profiler output directory: " + path, e); +        } +    } +} \ No newline at end of file diff --git a/src/jmh/java/io/ringbroker/benchmark/RingBrokerBenchmark.java b/src/jmh/java/io/ringbroker/benchmark/RingBrokerBenchmark.java index 89a1c72..d1795e2 100644 --- a/src/jmh/java/io/ringbroker/benchmark/RingBrokerBenchmark.java +++ b/src/jmh/java/io/ringbroker/benchmark/RingBrokerBenchmark.java @@ -406,4 +406,4 @@ private static void wipeDir(final Path root) throws IOException { }); } } -} +} \ No newline at end of file diff --git a/src/main/java/io/ringbroker/broker/ingress/ClusteredIngress.java b/src/main/java/io/ringbroker/broker/ingress/ClusteredIngress.java index 75e94f6..ce55595 100644 --- a/src/main/java/io/ringbroker/broker/ingress/ClusteredIngress.java +++ b/src/main/java/io/ringbroker/broker/ingress/ClusteredIngress.java @@ -1,5 +1,7 @@ package io.ringbroker.broker.ingress; +import com.google.protobuf.ByteString; +import com.google.protobuf.UnsafeByteOperations; import io.ringbroker.api.BrokerApi; import io.ringbroker.broker.delivery.Delivery; import io.ringbroker.broker.role.BrokerRole; @@ -19,6 +21,8 @@ import java.io.IOException; import java.lang.invoke.MethodHandles; import java.lang.invoke.VarHandle; +import java.nio.ByteBuffer; +import java.nio.MappedByteBuffer; import java.nio.file.Files; import java.nio.file.Path; import java.util.*; @@ -26,12 +30,14 @@ import java.util.concurrent.atomic.*; import java.util.concurrent.locks.LockSupport; import java.util.function.BiConsumer; +import java.util.zip.CRC32C; @Slf4j @Getter public final class ClusteredIngress { private static final CompletableFuture<Void> 
COMPLETED_FUTURE = CompletableFuture.completedFuture(null); +    private static final byte[] EMPTY_BYTES = new byte[0]; private static final long PARK_NANOS = 1_000L; @@ -42,9 +48,23 @@ public final class ClusteredIngress { private static final int PIPELINE_MAX_DRAIN = 8_192; private static final int PIPELINE_QUEUE_FACTOR = 8; -    // --- NEW: cap in-flight per partition so async doesn’t OOM --- +    // Cap in-flight work per partition. private static final int MAX_INFLIGHT_BATCHES_PER_PARTITION = 8_192; private static final long MAX_INFLIGHT_BYTES_PER_PARTITION = 256L * 1024 * 1024; // 256MB +    private static final int SUBSCRIBE_FETCH_BATCH = 512; +    private static final long SUBSCRIBE_IDLE_NANOS = 200_000L; // 0.2ms +    private static final int SUBSCRIBE_COMMIT_BATCH = 64; +    private static final long SUBSCRIBE_COMMIT_MAX_DELAY_NANOS = 2_000_000L; // 2ms +    private static final int FORWARD_MAX_RETRIES = 2; +    private static final int MAX_BACKFILL_REPLY_BYTES = 1 * 1024 * 1024; +    private static final int INITIAL_BACKFILL_SCRATCH_BYTES = 64 * 1024; +    private static final ThreadLocal<byte[]> BACKFILL_SCRATCH = +            ThreadLocal.withInitial(() -> new byte[INITIAL_BACKFILL_SCRATCH_BYTES]); +    private static final int INITIAL_REPLICA_APPEND_SCRATCH = 256; +    private static final ThreadLocal<byte[][]> REPLICA_APPEND_SCRATCH = +            ThreadLocal.withInitial(() -> new byte[INITIAL_REPLICA_APPEND_SCRATCH][]); +    private static final ThreadLocal<CRC32C> MESSAGE_ID_CRC = ThreadLocal.withInitial(CRC32C::new); +    private static final ThreadLocal<byte[]> MESSAGE_ID_INT_SCRATCH = ThreadLocal.withInitial(() -> new byte[Integer.BYTES]); private final BackfillPlanner backfillPlanner; private final int backfillBatchSize = 64; @@ -63,7 +83,7 @@ public final class ClusteredIngress { return t; }); -    // NEW: offload quorum replication and any blocking waits away from the per-partition pipeline thread +    // Offload quorum replication and blocking waits from partition pipelines. 
private final ExecutorService ioExecutor = Executors.newThreadPerTaskExecutor(Thread.ofVirtual().name("broker-io").factory()); @@ -232,7 +252,7 @@ public static ClusteredIngress create(final TopicRegistry registry, new io.ringbroker.ledger.orchestrator.VirtualLog(partDir, (int) segmentCapacity); vLog.discoverOnDisk(); - final Ingress ingress = Ingress.create(registry, ring, vLog, 0L, batchSize, forceDurable); + final Ingress ingress = Ingress.create(registry, ring, vLog, 0L, batchSize, forceDurable, true); ingressMap.put(pid, ingress); deliveryMap.put(pid, new Delivery(ring)); @@ -297,21 +317,20 @@ public CompletableFuture publish(final long correlationId, final byte[] payload) { final int partitionId = partitioner.selectPartition(key, totalPartitions); - final int ownerNode = Math.floorMod(partitionId, clusterSize); + if (!isValidPartitionId(partitionId)) { + return CompletableFuture.failedFuture( + new IllegalStateException("Partitioner returned out-of-range partition: " + partitionId) + ); + } + final int ownerNode = resolveWriteOwner(partitionId); if (ownerNode == myNodeId) { if (idempotentMode && shouldDropDuplicate(partitionId, key, payload)) return COMPLETED_FUTURE; return pipeline(partitionId).submitPublish(correlationId, topic, retries, payload); } - // forward - final RemoteBrokerClient ownerClient = clusterNodes.get(ownerNode); - if (ownerClient == null) { - return CompletableFuture.failedFuture(new IllegalStateException("No client for owner " + ownerNode)); - } - final BrokerApi.Envelope env = buildPublishEnvelope(correlationId, topic, key, payload, partitionId, retries); - return forwardWithRetry(ownerClient, env, partitionId, 0); + return forwardWithRetry(env, partitionId, 0); } public CompletableFuture publish(final String topic, final byte[] key, final byte[] payload) { @@ -319,6 +338,25 @@ public CompletableFuture publish(final String topic, final byte[] key, fin return publish(defaultCorrelationId, topic, key, 0, payload); } + public 
CompletableFuture publish(final long correlationId, + final String topic, + final com.google.protobuf.ByteString key, + final int retries, + final com.google.protobuf.ByteString payload) { + return publish( + correlationId, + topic, + byteStringToArrayOrNull(key), + retries, + byteStringToArray(payload) + ); + } + + @FunctionalInterface + public interface MessageViewHandler { + void accept(long lsn, ByteBuffer payload); + } + /** * Publish to a specific partition (used by internal forwarding to preserve routing). */ @@ -328,33 +366,83 @@ public CompletableFuture publishToPartition(final long correlationId, final byte[] key, final int retries, final byte[] payload) { - final int ownerNode = Math.floorMod(partitionId, clusterSize); + if (!isValidPartitionId(partitionId)) { + return CompletableFuture.failedFuture( + new IllegalArgumentException("partition_id out of range: " + partitionId) + ); + } + + final int ownerNode = resolveWriteOwner(partitionId); if (ownerNode == myNodeId) { if (idempotentMode && shouldDropDuplicate(partitionId, key, payload)) return COMPLETED_FUTURE; return pipeline(partitionId).submitPublish(correlationId, topic, retries, payload); } - final RemoteBrokerClient ownerClient = clusterNodes.get(ownerNode); - if (ownerClient == null) { - return CompletableFuture.failedFuture(new IllegalStateException("No client for owner " + ownerNode)); - } - final BrokerApi.Envelope env = buildPublishEnvelope(correlationId, topic, key, payload, partitionId, retries); - return forwardWithRetry(ownerClient, env, partitionId, 0); + return forwardWithRetry(env, partitionId, 0); + } + + public CompletableFuture publishToPartition(final long correlationId, + final String topic, + final int partitionId, + final com.google.protobuf.ByteString key, + final int retries, + final com.google.protobuf.ByteString payload) { + return publishToPartition( + correlationId, + topic, + partitionId, + byteStringToArrayOrNull(key), + retries, + byteStringToArray(payload) + ); + } + + 
private boolean isValidPartitionId(final int partitionId) { + return partitionId >= 0 && partitionId < totalPartitions; } public void subscribeTopic(final String topic, final String group, final BiConsumer handler) { if (!registry.contains(topic)) throw new IllegalArgumentException("Unknown topic: " + topic); - for (final Map.Entry entry : deliveryMap.entrySet()) { - final int partitionId = entry.getKey(); - final long committed = Math.max(0L, offsetStore.fetch(topic, group, partitionId)); + for (final int partitionId : ingressMap.keySet()) { + ioExecutor.submit(() -> runSubscriptionLoop( + topic, + group, + partitionId, + handler::accept, + payload -> payload, + (segBuf, payloadPos, payloadLen) -> { + final byte[] payload = new byte[payloadLen]; + final var dup = segBuf.duplicate(); + dup.position(payloadPos); + dup.get(payload, 0, payloadLen); + return payload; + } + )); + } + } - entry.getValue().subscribe(committed, (sequence, message) -> { - handler.accept(sequence, message); - offsetStore.commit(topic, group, partitionId, sequence); - }); + public void subscribeTopicZeroCopy(final String topic, + final String group, + final MessageViewHandler handler) { + if (!registry.contains(topic)) throw new IllegalArgumentException("Unknown topic: " + topic); + + for (final int partitionId : ingressMap.keySet()) { + ioExecutor.submit(() -> runSubscriptionLoop( + topic, + group, + partitionId, + handler::accept, + payload -> ByteBuffer.wrap(payload).asReadOnlyBuffer(), + (segBuf, payloadPos, payloadLen) -> { + final ByteBuffer view = segBuf.duplicate(); + view.position(payloadPos); + view.limit(payloadPos + payloadLen); + return view.slice().asReadOnlyBuffer(); + } + )); } } @@ -388,9 +476,7 @@ public CompletableFuture handleSealAndRollAsync(final return pipeline(s.getPartitionId()).submitSeal(s); } - /** - * NEW: async, do NOT block Netty event loop threads. - */ + /** Runs metadata updates asynchronously to keep Netty event loops non-blocking. 
*/ public CompletableFuture<Void> handleMetadataUpdateAsync(final BrokerApi.MetadataUpdate upd) { return pipeline(upd.getPartitionId()).submitMetadataUpdate(upd); } @@ -414,34 +500,36 @@ private static int nextPow2(final int v) { private final class PartitionPipeline implements Runnable { private static final int OFFER_SPIN_LIMIT = 256; -        private static final long OFFER_PARK_NANOS = 1_000L; // 1µs backoff when full +        private static final long OFFER_PARK_NANOS = 1_000L; +        private static final int FUTURE_ARRAY_POOL_MAX = 64; private final int pid; private final MpscQueue queue; private final Thread thread; -        // internal “never block” queue for commit completions (unbounded) +        // Internal queue for commit completions. private final ConcurrentLinkedQueue<Object> internalQ = new ConcurrentLinkedQueue<>(); -        // one-item defer slot for the (single) consumer thread (used by batching) +        // One deferred slot for the pipeline consumer. private Object deferred; -        // batch scratch (reused) — never allow 0-length +        // Batch scratch. private final int maxDrain = Math.max(1, Math.min(PIPELINE_MAX_DRAIN, Math.max(1, batchSize))); private final byte[][] payloads = new byte[maxDrain][]; @SuppressWarnings("unchecked") private final CompletableFuture<Void>[] publishFuts = (CompletableFuture<Void>[]) new CompletableFuture[maxDrain]; +        private final ArrayDeque<CompletableFuture<Void>[]> futureArrayPool = new ArrayDeque<>(); -        // replication targets scratch (avoid per-publish allocation) +        // Replication targets scratch. private int[] replicaScratch = new int[Math.max(1, clusterSize)]; -        // NEW: in-flight tracking for correctness + backpressure +        // In-flight tracking for backpressure. private final ArrayDeque<PendingBatch> pending = new ArrayDeque<>(); private int inflightBatches = 0; private long inflightBytes = 0; -        // NEW: ensure per-partition replication happens in-order even though it’s off-thread +        // Keep per-partition replication ordered, even when executed off-thread. 
private CompletableFuture replTail = COMPLETED_FUTURE; PartitionPipeline(final int pid, final int capacityPow2) { @@ -483,20 +571,15 @@ private void deferOne(final Object o) { } private boolean enqueueOrFail(final Object task, final CompletableFuture f) { - int spins = 0; - while (!queue.offer(task)) { - if (closed.get() || Thread.currentThread().isInterrupted()) { - f.completeExceptionally(new IllegalStateException("Broker is closed")); - return false; - } - if (spins++ < OFFER_SPIN_LIMIT) { - Thread.onSpinWait(); - } else { - spins = 0; - LockSupport.parkNanos(OFFER_PARK_NANOS); - } + if (closed.get() || Thread.currentThread().isInterrupted()) { + f.completeExceptionally(new IllegalStateException("Broker is closed")); + return false; } - return true; + if (queue.offer(task)) return true; + + f.completeExceptionally(new RejectedExecutionException( + "Partition pipeline queue full (pid=" + pid + ")")); + return false; } CompletableFuture submitPublish(final long correlationId, @@ -589,7 +672,7 @@ public void run() { // Fail pending publish batches PendingBatch pb; while ((pb = pending.pollFirst()) != null) { - pb.fail(stop); + completePendingFailure(pb, stop); } inflightBatches = 0; inflightBytes = 0; @@ -606,10 +689,13 @@ public void run() { Object in; while ((in = internalQ.poll()) != null) { - if (in instanceof CommitDoneTask cd) { - cd.pending.fail(stop); + if (in instanceof CommitDoneTask) { + // Pending batches were already failed above. Commit callbacks can race + // with shutdown and enqueue duplicate completion notifications here. + // Draining without re-completing avoids double-releasing pooled arrays. 
} } + futureArrayPool.clear(); } } @@ -655,8 +741,8 @@ private void onCommitDone(final CommitDoneTask cd) { inflightBatches = Math.max(0, inflightBatches - 1); inflightBytes = Math.max(0L, inflightBytes - cd.pending.bytes); - if (cd.error == null) cd.pending.succeed(); - else cd.pending.fail(unwrap(cd.error)); + if (cd.error == null) completePendingSuccess(cd.pending); + else completePendingFailure(cd.pending, unwrap(cd.error)); } private Throwable unwrap(final Throwable t) { @@ -665,6 +751,34 @@ private Throwable unwrap(final Throwable t) { return t; } + @SuppressWarnings("unchecked") + private CompletableFuture[] acquireFutureArray() { + final CompletableFuture[] arr = futureArrayPool.pollFirst(); + if (arr != null) return arr; + return (CompletableFuture[]) new CompletableFuture[maxDrain]; + } + + private void releaseFutureArray(final CompletableFuture[] arr, final int used) { + Arrays.fill(arr, 0, used, null); + if (futureArrayPool.size() < FUTURE_ARRAY_POOL_MAX) { + futureArrayPool.offerFirst(arr); + } + } + + private void completePendingSuccess(final PendingBatch pb) { + for (int i = 0; i < pb.futureCount; i++) { + pb.futures[i].complete(null); + } + releaseFutureArray(pb.futures, pb.futureCount); + } + + private void completePendingFailure(final PendingBatch pb, final Throwable t) { + for (int i = 0; i < pb.futureCount; i++) { + pb.futures[i].completeExceptionally(t); + } + releaseFutureArray(pb.futures, pb.futureCount); + } + private void drainAndProcessPublish(final PublishTask first) { if (!registry.contains(first.topic)) { first.future.completeExceptionally(new IllegalArgumentException("Unknown topic: " + first.topic)); @@ -687,14 +801,14 @@ private void drainAndProcessPublish(final PublishTask first) { count++; while (count < payloads.length) { - final Object o = queue.poll(); // IMPORTANT: do not consume deferred here + final Object o = queue.poll(); // Do not consume deferred here. 
if (o == null) break; if (!(o instanceof PublishTask p)) { deferOne(o); break; } - if (!Objects.equals(topic, p.topic) || retries != p.retries) { + if (!topic.equals(p.topic) || retries != p.retries) { deferOne(p); break; } @@ -755,9 +869,7 @@ private void drainAndProcessPublish(final PublishTask first) { final Ingress ing = getOrCreateIngress(pid, epoch); // enqueue into ingress queue (fast) - for (int i = 0; i < count; i++) { - ing.publishForEpoch(epoch, payloads[i]); - } + ing.publishBatchForEpoch(epoch, payloads, count); // figure replication targets final EpochPlacement placementCache = pe.activePlacement; @@ -778,12 +890,11 @@ private void drainAndProcessPublish(final PublishTask first) { if (id != myNodeId) replicaScratch[rc++] = id; } - // copy futures for this batch into a stable array (scratch will be cleared) - @SuppressWarnings("unchecked") - final CompletableFuture[] futs = (CompletableFuture[]) new CompletableFuture[count]; + // copy futures for this batch into a stable pooled array (scratch will be cleared) + final CompletableFuture[] futs = acquireFutureArray(); System.arraycopy(publishFuts, 0, futs, 0, count); - final PendingBatch pb = new PendingBatch(epoch, lastSeq, batchBytes, futs); + final PendingBatch pb = new PendingBatch(epoch, lastSeq, batchBytes, futs, count); pending.addLast(pb); inflightBatches++; inflightBytes += batchBytes; @@ -851,26 +962,24 @@ private CompletableFuture replicateOrderedAsync(final BrokerApi.Envelope e } } - // ---- NEW: pending publish batch ---- + // Pending publish batch. 
private static final class PendingBatch { final long epoch; final long lastSeq; final long bytes; final CompletableFuture[] futures; + final int futureCount; - PendingBatch(final long epoch, final long lastSeq, final long bytes, final CompletableFuture[] futures) { + PendingBatch(final long epoch, + final long lastSeq, + final long bytes, + final CompletableFuture[] futures, + final int futureCount) { this.epoch = epoch; this.lastSeq = lastSeq; this.bytes = bytes; this.futures = futures; - } - - void succeed() { - for (final CompletableFuture f : futures) f.complete(null); - } - - void fail(final Throwable t) { - for (final CompletableFuture f : futures) f.completeExceptionally(t); + this.futureCount = futureCount; } } @@ -883,6 +992,12 @@ private record OpenEpochTask(BrokerApi.OpenEpochRequest req, CompletableFuture future) {} private record CommitDoneTask(PendingBatch pending, Throwable error) {} + private static final class SubscriptionCommitState { + long pendingLsn = Long.MIN_VALUE; + int pendingCount = 0; + long lastCommitNanos = System.nanoTime(); + } + /** * Low-allocation MPSC ring queue. */ @@ -898,8 +1013,8 @@ private static final class MpscQueue { private final long[] sequence; private final Object[] buffer; - private final AtomicLong tail = new AtomicLong(0); - private final AtomicLong head = new AtomicLong(0); + private final PaddedCounter tail = new PaddedCounter(0L); + private final PaddedCounter head = new PaddedCounter(0L); MpscQueue(final int capacityPow2) { if (Integer.bitCount(capacityPow2) != 1) throw new IllegalArgumentException("capacity must be pow2"); @@ -951,12 +1066,42 @@ Object poll() { final int idx = (int) (h & mask); final Object item = BUF.getAcquire(buffer, idx); - // IMPORTANT: clear BEFORE making slot available + // Clear before making the slot available. 
BUF.setRelease(buffer, idx, null); SEQ.setRelease(sequence, idx, h + capacity); return item; } + + private static final class PaddedCounter { + private static final VarHandle VALUE; + + static { + try { + VALUE = MethodHandles.lookup().findVarHandle(PaddedCounter.class, "value", long.class); + } catch (final ReflectiveOperationException e) { + throw new ExceptionInInitializerError(e); + } + } + + @SuppressWarnings("unused") + private long p1, p2, p3, p4, p5, p6, p7; + private volatile long value; + @SuppressWarnings("unused") + private long q1, q2, q3, q4, q5, q6, q7; + + PaddedCounter(final long initial) { + VALUE.setRelease(this, initial); + } + + long get() { + return (long) VALUE.getVolatile(this); + } + + boolean compareAndSet(final long expect, final long update) { + return VALUE.compareAndSet(this, expect, update); + } + } } // ---------- Fast replica append paths (serialized => no CAS loops) ---------- @@ -1012,7 +1157,7 @@ private BrokerApi.ReplicationAck appendReplicaFast(final BrokerApi.AppendRequest } try { - ing.publishForEpoch(epoch, a.getPayload().toByteArray()); + ing.publishForEpoch(epoch, byteStringToArray(a.getPayload())); } catch (final Throwable t) { return BrokerApi.ReplicationAck.newBuilder() .setStatus(BrokerApi.ReplicationAck.Status.ERROR_PERSISTENCE_FAILED) @@ -1102,8 +1247,15 @@ private BrokerApi.ReplicationAck appendReplicaBatchFast(final BrokerApi.AppendBa } try { - for (int i = startIdx; i < n; i++) { - ing.publishForEpoch(epoch, payloads.get(i).toByteArray()); + final int toWrite = n - startIdx; + final byte[][] scratch = ensureReplicaAppendScratch(toWrite); + try { + for (int i = 0; i < toWrite; i++) { + scratch[i] = byteStringToArray(payloads.get(startIdx + i)); + } + ing.publishBatchForEpoch(epoch, scratch, toWrite); + } finally { + Arrays.fill(scratch, 0, toWrite, null); } } catch (final Throwable t) { return BrokerApi.ReplicationAck.newBuilder() @@ -1143,16 +1295,25 @@ private static BrokerApi.Envelope buildPublishEnvelope(final 
long correlationId, .build(); } - private CompletableFuture forwardWithRetry(final RemoteBrokerClient client, - final BrokerApi.Envelope env, + private CompletableFuture forwardWithRetry(final BrokerApi.Envelope env, final int partitionId, final int attempt) { + final int ownerNode = resolveWriteOwner(partitionId); + final RemoteBrokerClient client = clusterNodes.get(ownerNode); + if (client == null) { + return CompletableFuture.failedFuture(new IllegalStateException("No client for owner " + ownerNode)); + } + final CompletableFuture result = new CompletableFuture<>(); - client.sendEnvelopeWithAck(env).whenComplete((ack, err) -> { + final long timeoutMs = Math.max(1L, replicator.getTimeoutMillis()); + final CompletableFuture send = client.sendEnvelopeWithAck(env); + + send.orTimeout(timeoutMs, TimeUnit.MILLISECONDS).whenComplete((ack, err) -> { if (err != null) { - if (attempt < 1) { + send.cancel(true); + if (attempt < FORWARD_MAX_RETRIES) { refreshEpochFromMetadata(partitionId); - forwardWithRetry(client, env, partitionId, attempt + 1).whenComplete((v, e2) -> { + forwardWithRetry(env, partitionId, attempt + 1).whenComplete((v, e2) -> { if (e2 != null) result.completeExceptionally(e2); else result.complete(null); }); @@ -1162,6 +1323,14 @@ private CompletableFuture forwardWithRetry(final RemoteBrokerClient client return; } if (ack.getStatus() != BrokerApi.ReplicationAck.Status.SUCCESS) { + if (attempt < FORWARD_MAX_RETRIES) { + refreshEpochFromMetadata(partitionId); + forwardWithRetry(env, partitionId, attempt + 1).whenComplete((v, e2) -> { + if (e2 != null) result.completeExceptionally(e2); + else result.complete(null); + }); + return; + } result.completeExceptionally(new RuntimeException("Forwarding failed: " + ack.getStatus())); return; } @@ -1182,74 +1351,84 @@ private void backfillTick() { final long epoch = em.epoch(); if (!em.isSealed()) continue; if (!em.placement().getStorageNodes().contains(myNodeId)) continue; - if (ing.getVirtualLog().hasEpoch(epoch)) 
continue; + long nextOffset = 0L; + if (ing.getVirtualLog().hasEpoch(epoch)) { + final long localHwm = ing.getVirtualLog().forEpoch(epoch).getHighWaterMark(); + if (localHwm >= em.endSeq()) { + backfillPlanner.markPresent(pid, epoch); + continue; + } + nextOffset = Math.max(0L, localHwm + 1); + } + + boolean done = false; for (final int target : em.placement().getStorageNodesArray()) { if (target == myNodeId) continue; final RemoteBrokerClient client = clusterNodes.get(target); if (client == null) continue; try { - final BrokerApi.Envelope req = BrokerApi.Envelope.newBuilder() - .setBackfill(BrokerApi.BackfillRequest.newBuilder() - .setPartitionId(pid) - .setEpoch(epoch) - .setOffset(0) - .setMaxBytes(256 * 1024) - .build()) - .build(); - final BrokerApi.BackfillReply reply = client.sendBackfill(req).get(5, TimeUnit.SECONDS); - if (!reply.getRedirectNodesList().isEmpty()) continue; - final byte[] payload = reply.getPayload().toByteArray(); - if (payload.length == 0) continue; - - int pos = 0; - int count = 0; - final byte[][] batch = new byte[backfillBatchSize][]; - while (pos + Integer.BYTES <= payload.length && count < backfillBatchSize) { - final int len = (payload[pos] & 0xFF) | - ((payload[pos + 1] & 0xFF) << 8) | - ((payload[pos + 2] & 0xFF) << 16) | - ((payload[pos + 3] & 0xFF) << 24); - pos += Integer.BYTES; - if (pos + len > payload.length) break; - final byte[] rec = new byte[len]; - System.arraycopy(payload, pos, rec, 0, len); - batch[count++] = rec; - pos += len; - } - if (count > 0) { - ing.appendBackfillBatch(epoch, batch, count); - backfillPlanner.markPresent(pid, epoch); + for (;;) { + final BrokerApi.Envelope req = BrokerApi.Envelope.newBuilder() + .setBackfill(BrokerApi.BackfillRequest.newBuilder() + .setPartitionId(pid) + .setEpoch(epoch) + .setOffset(nextOffset) + .setMaxBytes(256 * 1024) + .build()) + .build(); + final BrokerApi.BackfillReply reply = client.sendBackfill(req).get(5, TimeUnit.SECONDS); + if 
(!reply.getRedirectNodesList().isEmpty()) break; + + final int count = ing.appendBackfillEncodedBatch( + epoch, + reply.getPayload().asReadOnlyByteBuffer(), + backfillBatchSize + ); + if (count > 0) { + nextOffset += count; + } + + if (reply.getEndOfEpoch()) { + backfillPlanner.markPresent(pid, epoch); + done = true; + break; + } + + // Avoid tight loops on malformed/empty responses. + if (count == 0) break; } - if (reply.getEndOfEpoch()) break; + if (done) break; } catch (final Exception ignored) { } } + if (done) continue; } } } private void loadFenceState(final Path partitionDir, final PartitionEpochs pe) { try { - Files.list(partitionDir) - .filter(p -> p.getFileName().toString().endsWith(".fence")) - .forEach(p -> { - final String name = p.getFileName().toString(); - try { - final String epochStr = name.substring("epoch-".length(), name.indexOf(".fence")); - final long epoch = Long.parseLong(epochStr); - final FenceStore.PartitionFence fence = FenceStore.loadEpochFence(partitionDir, epoch); - if (fence != null) { - final PartitionEpochState pes = new PartitionEpochState(); - pes.sealed.set(fence.sealed()); - pes.sealedEndSeq = fence.sealedEndSeq(); - pes.lastSeq.set(fence.lastSeq()); - pe.epochFences.put(epoch, pes); - pe.highestSeenEpoch.accumulateAndGet(epoch, Math::max); + try (var files = Files.list(partitionDir)) { + files.filter(p -> p.getFileName().toString().endsWith(".fence")) + .forEach(p -> { + final String name = p.getFileName().toString(); + try { + final String epochStr = name.substring("epoch-".length(), name.indexOf(".fence")); + final long epoch = Long.parseLong(epochStr); + final FenceStore.PartitionFence fence = FenceStore.loadEpochFence(partitionDir, epoch); + if (fence != null) { + final PartitionEpochState pes = new PartitionEpochState(); + pes.sealed.set(fence.sealed()); + pes.sealedEndSeq = fence.sealedEndSeq(); + pes.lastSeq.set(fence.lastSeq()); + pe.epochFences.put(epoch, pes); + pe.highestSeenEpoch.accumulateAndGet(epoch, 
Math::max); + } + } catch (final Exception ignored) { } - } catch (final Exception ignored) { - } - }); + }); + } } catch (final IOException ignored) { } } @@ -1401,7 +1580,7 @@ private BrokerApi.BackfillReply handleBackfill(final BrokerApi.BackfillRequest r final int pid = req.getPartitionId(); final long epoch = req.getEpoch(); final long offset = req.getOffset(); - final int maxBytes = Math.max(1, req.getMaxBytes()); + final int maxBytes = Math.max(1, Math.min(req.getMaxBytes(), MAX_BACKFILL_REPLY_BYTES)); final BrokerApi.BackfillReply.Builder reply = BrokerApi.BackfillReply.newBuilder(); @@ -1412,20 +1591,21 @@ private BrokerApi.BackfillReply handleBackfill(final BrokerApi.BackfillRequest r return reply.build(); } - final int[] written = new int[]{0}; - final byte[][] scratch = new byte[backfillBatchSize][]; + final byte[] out = ensureBackfillScratch(maxBytes); + final int[] encodedBytes = new int[]{0}; final int[] count = new int[]{0}; ing.fetchEpoch(epoch, offset, backfillBatchSize, (off, segBuf, payloadPos, payloadLen) -> { - if (written[0] + payloadLen + Integer.BYTES > maxBytes) return; - final byte[] buf = new byte[payloadLen + Integer.BYTES]; - buf[0] = (byte) (payloadLen); - buf[1] = (byte) (payloadLen >>> 8); - buf[2] = (byte) (payloadLen >>> 16); - buf[3] = (byte) (payloadLen >>> 24); - segBuf.position(payloadPos).get(buf, Integer.BYTES, payloadLen); - scratch[count[0]++] = buf; - written[0] += buf.length; + if (count[0] >= backfillBatchSize) return; + + final int frameBytes = Integer.BYTES + payloadLen; + if (encodedBytes[0] + frameBytes > maxBytes) return; + + writeLittleEndianInt(out, encodedBytes[0], payloadLen); + encodedBytes[0] += Integer.BYTES; + segBuf.get(payloadPos, out, encodedBytes[0], payloadLen); + encodedBytes[0] += payloadLen; + count[0]++; }); if (count[0] == 0) { @@ -1433,18 +1613,8 @@ private BrokerApi.BackfillReply handleBackfill(final BrokerApi.BackfillRequest r return reply.build(); } - int total = 0; - for (int i = 0; i < 
count[0]; i++) total += scratch[i].length; - final byte[] out = new byte[total]; - int pos = 0; - for (int i = 0; i < count[0]; i++) { - final byte[] src = scratch[i]; - System.arraycopy(src, 0, out, pos, src.length); - pos += src.length; - } - final long hwm = ing.getVirtualLog().forEpoch(epoch).getHighWaterMark(); - reply.setPayload(com.google.protobuf.ByteString.copyFrom(out)); + reply.setPayload(ByteString.copyFrom(out, 0, encodedBytes[0])); reply.setEndOfEpoch(offset + count[0] > hwm); return reply.build(); @@ -1470,7 +1640,7 @@ private Ingress getOrCreateIngress(final int partitionId, final long epoch) { new io.ringbroker.ledger.orchestrator.VirtualLog(partDir, (int) segmentCapacity); vLog.discoverOnDisk(); - final Ingress ingress = Ingress.create(registry, ring, vLog, epoch, batchSize, forceDurable); + final Ingress ingress = Ingress.create(registry, ring, vLog, epoch, batchSize, forceDurable, true); deliveryMap.putIfAbsent(pid, new Delivery(ring)); if (idempotentMode) { @@ -1518,6 +1688,191 @@ public Optional> placementForEpoch(final int partitionId, final lo return Optional.of(meta.placement().getStorageNodes()); } + private int resolveWriteOwner(final int partitionId) { + Optional cfg = metadataStore.current(partitionId); + if (cfg.isEmpty()) { + try { + cfg = Optional.of(bootstrapMetadataIfMissing(partitionId)); + } catch (final Throwable ignored) { + } + } + if (cfg.isPresent()) { + final int[] nodes = cfg.get().activeEpoch().placement().getStorageNodesArray(); + if (nodes.length > 0) { + final int preferred = nodes[0]; + if (preferred == myNodeId || clusterNodes.containsKey(preferred)) return preferred; + for (final int nodeId : nodes) { + if (nodeId == myNodeId || clusterNodes.containsKey(nodeId)) return nodeId; + } + return preferred; + } + } + return Math.floorMod(partitionId, clusterSize); + } + + private LogConfiguration bootstrapMetadataIfMissing(final int partitionId) { + final Optional existing = metadataStore.current(partitionId); + if 
(existing.isPresent()) return existing.get(); + + final List placement = replicaResolver.replicas(partitionId); + final EpochPlacement ep = new EpochPlacement(0L, placement, replicator.getAckQuorum()); + return metadataStore.bootstrapIfAbsent(partitionId, ep, 0L); + } + + @FunctionalInterface + private interface SubscriptionConsumer { + void accept(long lsn, T payload); + } + + @FunctionalInterface + private interface RingPayloadMapper { + T map(byte[] payload); + } + + @FunctionalInterface + private interface LedgerPayloadMapper { + T map(MappedByteBuffer segmentBuffer, int payloadPos, int payloadLen); + } + + private void runSubscriptionLoop(final String topic, + final String group, + final int partitionId, + final SubscriptionConsumer consumer, + final RingPayloadMapper ringMapper, + final LedgerPayloadMapper ledgerMapper) { + long cursor = Math.max(0L, offsetStore.fetch(topic, group, partitionId)); + if (cursor <= ((1L << 40) - 1)) { + cursor = Lsn.encode(0L, cursor); + } + final long[] nextSeq = new long[1]; + final SubscriptionCommitState commitState = new SubscriptionCommitState(); + + for (;;) { + if (closed.get() || Thread.currentThread().isInterrupted()) { + flushSubscriptionCommit(topic, group, partitionId, commitState, System.nanoTime()); + return; + } + try { + final Ingress ing = ingressMap.get(partitionId); + if (ing == null) { + maybeFlushSubscriptionCommit(topic, group, partitionId, commitState, System.nanoTime()); + LockSupport.parkNanos(SUBSCRIBE_IDLE_NANOS); + continue; + } + + refreshEpochFromMetadata(partitionId); + + final long epoch = Lsn.epoch(cursor); + final long seq = Lsn.seq(cursor); + final PartitionEpochs pe = partitionEpochs(partitionId); + final EpochState active = pe.active; + final long activeEpoch = (active != null) ? 
active.epochId : ing.getActiveEpoch(); + + if (epoch == activeEpoch && ing.hasRingMapping()) { + final long ringSeq = ing.ringSeqForLedgerSeq(seq); + final long ringCursor = ing.getRing().getCursor(); + final long minRingSeq = Math.max(0L, ringCursor - ringSize + 1L); + + if (ringSeq >= minRingSeq && ringSeq <= ringCursor) { + final byte[] msg = ing.getRing().get(ringSeq); + final long mappedSeq = ing.ledgerSeqForRingSeq(ringSeq); + if (mappedSeq == seq) { + final long lsn = Lsn.encode(epoch, seq); + consumer.accept(lsn, ringMapper.map(msg)); + markSubscriptionDelivered(lsn, commitState); + cursor = Lsn.encode(epoch, seq + 1); + maybeFlushSubscriptionCommit(topic, group, partitionId, commitState, System.nanoTime()); + continue; + } + } + } + + if (!ing.getVirtualLog().hasEpoch(epoch)) { + final Long nextStart = nextEpochStartSeq(partitionId, epoch); + if (nextStart != null) { + cursor = Lsn.encode(epoch + 1, nextStart); + continue; + } + maybeFlushSubscriptionCommit(topic, group, partitionId, commitState, System.nanoTime()); + LockSupport.parkNanos(SUBSCRIBE_IDLE_NANOS); + continue; + } + + nextSeq[0] = seq; + final int visited = ing.fetchEpoch(epoch, seq, SUBSCRIBE_FETCH_BATCH, (off, segBuf, payloadPos, payloadLen) -> { + final long lsn = Lsn.encode(epoch, off); + consumer.accept(lsn, ledgerMapper.map(segBuf, payloadPos, payloadLen)); + markSubscriptionDelivered(lsn, commitState); + nextSeq[0] = off + 1; + }); + + if (visited > 0) { + cursor = Lsn.encode(epoch, nextSeq[0]); + maybeFlushSubscriptionCommit(topic, group, partitionId, commitState, System.nanoTime()); + continue; + } + + final Long nextStart = nextEpochStartSeq(partitionId, epoch); + if (nextStart != null) { + cursor = Lsn.encode(epoch + 1, nextStart); + continue; + } + + maybeFlushSubscriptionCommit(topic, group, partitionId, commitState, System.nanoTime()); + LockSupport.parkNanos(SUBSCRIBE_IDLE_NANOS); + } catch (final InterruptedException ie) { + Thread.currentThread().interrupt(); + 
flushSubscriptionCommit(topic, group, partitionId, commitState, System.nanoTime()); + return; + } catch (final Throwable t) { + log.warn("Subscription loop error for topic={} group={} partition={}: {}", + topic, group, partitionId, t.toString()); + maybeFlushSubscriptionCommit(topic, group, partitionId, commitState, System.nanoTime()); + LockSupport.parkNanos(SUBSCRIBE_IDLE_NANOS); + } + } + } + + private static void markSubscriptionDelivered(final long lsn, final SubscriptionCommitState state) { + state.pendingLsn = lsn; + state.pendingCount++; + } + + private static boolean shouldFlushSubscriptionCommit(final SubscriptionCommitState state, final long nowNanos) { + if (state.pendingCount <= 0) return false; + if (state.pendingCount >= SUBSCRIBE_COMMIT_BATCH) return true; + return (nowNanos - state.lastCommitNanos) >= SUBSCRIBE_COMMIT_MAX_DELAY_NANOS; + } + + private void maybeFlushSubscriptionCommit(final String topic, + final String group, + final int partitionId, + final SubscriptionCommitState state, + final long nowNanos) { + if (!shouldFlushSubscriptionCommit(state, nowNanos)) return; + flushSubscriptionCommit(topic, group, partitionId, state, nowNanos); + } + + private void flushSubscriptionCommit(final String topic, + final String group, + final int partitionId, + final SubscriptionCommitState state, + final long nowNanos) { + if (state.pendingCount <= 0) return; + offsetStore.commit(topic, group, partitionId, state.pendingLsn); + state.pendingCount = 0; + state.pendingLsn = Long.MIN_VALUE; + state.lastCommitNanos = nowNanos; + } + + private Long nextEpochStartSeq(final int partitionId, final long epoch) { + final Optional cfg = metadataStore.current(partitionId); + if (cfg.isEmpty()) return null; + final EpochMetadata next = cfg.get().epoch(epoch + 1); + if (next == null) return null; + return next.startSeq(); + } + private EpochState ensureEpochState(final int partitionId, final long epoch) { final PartitionEpochs pe = partitionEpochs(partitionId); 
EpochState st = pe.active; @@ -1666,10 +2021,74 @@ public void shutdown() throws IOException { } } + private static byte[] byteStringToArray(final com.google.protobuf.ByteString bytes) { + if (bytes == null || bytes.isEmpty()) return EMPTY_BYTES; + final int len = bytes.size(); + final byte[] out = new byte[len]; + bytes.copyTo(out, 0); + return out; + } + + private static byte[] byteStringToArrayOrNull(final com.google.protobuf.ByteString bytes) { + if (bytes == null || bytes.isEmpty()) return null; + return byteStringToArray(bytes); + } + private long computeMessageId(final int partitionId, final byte[] key, final byte[] payload) { - final int keyHash = (key != null ? Arrays.hashCode(key) : 0); - final int payloadHash = Arrays.hashCode(payload); - final int combined = 31 * keyHash + payloadHash; - return (((long) partitionId) << 32) ^ (combined & 0xFFFF_FFFFL); + final CRC32C crc = MESSAGE_ID_CRC.get(); + crc.reset(); + crcUpdateInt(crc, partitionId); + + if (key != null) { + crcUpdateInt(crc, key.length); + crc.update(key, 0, key.length); + } else { + crcUpdateInt(crc, 0); + } + + crcUpdateInt(crc, payload.length); + crc.update(payload, 0, payload.length); + + return (((long) partitionId) << 32) ^ (crc.getValue() & 0xFFFF_FFFFL); + } + + private static void writeLittleEndianInt(final byte[] dst, final int pos, final int value) { + dst[pos] = (byte) (value); + dst[pos + 1] = (byte) (value >>> 8); + dst[pos + 2] = (byte) (value >>> 16); + dst[pos + 3] = (byte) (value >>> 24); + } + + private static void crcUpdateInt(final CRC32C crc, final int value) { + final byte[] scratch = MESSAGE_ID_INT_SCRATCH.get(); + scratch[0] = (byte) (value); + scratch[1] = (byte) (value >>> 8); + scratch[2] = (byte) (value >>> 16); + scratch[3] = (byte) (value >>> 24); + crc.update(scratch, 0, Integer.BYTES); + } + + private static byte[] ensureBackfillScratch(final int minSize) { + byte[] out = BACKFILL_SCRATCH.get(); + if (out.length >= minSize) return out; + int next = out.length; + 
while (next < minSize) { + next <<= 1; + } + out = new byte[next]; + BACKFILL_SCRATCH.set(out); + return out; + } + + private static byte[][] ensureReplicaAppendScratch(final int minSize) { + byte[][] scratch = REPLICA_APPEND_SCRATCH.get(); + if (scratch.length >= minSize) return scratch; + int next = scratch.length; + while (next < minSize) { + next <<= 1; + } + scratch = new byte[next][]; + REPLICA_APPEND_SCRATCH.set(scratch); + return scratch; } } diff --git a/src/main/java/io/ringbroker/broker/ingress/Ingress.java b/src/main/java/io/ringbroker/broker/ingress/Ingress.java index ab98320..dbbe70d 100644 --- a/src/main/java/io/ringbroker/broker/ingress/Ingress.java +++ b/src/main/java/io/ringbroker/broker/ingress/Ingress.java @@ -13,6 +13,7 @@ import java.io.IOException; import java.lang.invoke.MethodHandles; import java.lang.invoke.VarHandle; +import java.nio.ByteBuffer; import java.nio.MappedByteBuffer; import java.util.AbstractList; import java.util.Arrays; @@ -44,14 +45,22 @@ public final class Ingress { private final int batchSize; private final SlotRing queue; + private final int maxEnqueueBatch; private final byte[][] batchBuffer; private final ByteBatch batchView; private final boolean forceDurableWrites; + private final boolean publishRingMapping; private volatile Future writerTask; private volatile Throwable writerFailure; - // --- NEW: waiters completed by writer thread when HWM advances --- + // Mapping between ring cursor space and durable ledger sequence space. + // Updated by writer thread after each published batch. + private volatile long ringSeqDelta = Long.MIN_VALUE; // ledgerSeq = ringSeq + ringSeqDelta + private volatile long lastPublishedRingSeq = -1L; + private volatile long lastPublishedLedgerSeq = -1L; + + // Waiters completed by the writer thread as the HWM advances. 
private static final CompletableFuture DONE = CompletableFuture.completedFuture(null); private static final class SeqWaiter { @@ -71,7 +80,8 @@ private Ingress(final TopicRegistry registry, final VirtualLog virtualLog, final long epoch, final int batchSize, - final boolean forceDurableWrites) { + final boolean forceDurableWrites, + final boolean publishRingMapping) { this.registry = Objects.requireNonNull(registry, "registry"); this.ring = Objects.requireNonNull(ring, "ring"); @@ -82,9 +92,11 @@ private Ingress(final TopicRegistry registry, this.activeEpoch.set(epoch); this.batchSize = batchSize; this.forceDurableWrites = forceDurableWrites; + this.publishRingMapping = publishRingMapping; final int capacity = nextPowerOfTwo(batchSize * QUEUE_CAPACITY_FACTOR); this.queue = new SlotRing(capacity); + this.maxEnqueueBatch = batchSize; this.batchBuffer = new byte[batchSize][]; this.batchView = new ByteBatch(batchBuffer); } @@ -95,8 +107,18 @@ public static Ingress create(final TopicRegistry registry, final long epoch, final int batchSize, final boolean durable) throws IOException { + return create(registry, ring, log, epoch, batchSize, durable, false); + } + + public static Ingress create(final TopicRegistry registry, + final RingBuffer ring, + final VirtualLog log, + final long epoch, + final int batchSize, + final boolean durable, + final boolean publishRingMapping) throws IOException { - final Ingress ingress = new Ingress(registry, ring, log, epoch, batchSize, durable); + final Ingress ingress = new Ingress(registry, ring, log, epoch, batchSize, durable, publishRingMapping); ingress.writerTask = EXECUTOR.submit(ingress::writerLoop); return ingress; } @@ -116,9 +138,12 @@ public void publish(final String topic, final int retries, final byte[] rawPaylo Objects.requireNonNull(rawPayload, "rawPayload"); if (!registry.contains(topic)) throw new IllegalArgumentException("topic not registered: " + topic); - - final String outTopic = retries > MAX_RETRIES ? 
topic + ".DLQ" : topic; - if (!registry.contains(outTopic)) throw new IllegalArgumentException("topic not registered: " + outTopic); + if (retries > MAX_RETRIES) { + final String dlqTopic = topic + ".DLQ"; + if (!registry.contains(dlqTopic)) { + throw new IllegalArgumentException("topic not registered: " + dlqTopic); + } + } final long epoch = activeEpoch.get(); offerWithBackoff(rawPayload, epoch); @@ -129,14 +154,26 @@ public void publishForEpoch(final long epoch, final byte[] rawPayload) { offerWithBackoff(rawPayload, epoch); } + public void publishBatchForEpoch(final long epoch, final byte[][] rawPayloads, final int count) { + Objects.requireNonNull(rawPayloads, "rawPayloads"); + if (count <= 0) return; + if (count > rawPayloads.length) throw new IllegalArgumentException("count exceeds payload array length"); + + int index = 0; + while (index < count) { + final int chunk = Math.min(maxEnqueueBatch, count - index); + validatePayloadBatch(rawPayloads, index, chunk); + offerBatchWithBackoff(rawPayloads, index, chunk, epoch); + index += chunk; + } + } + /** - * NEW: completes when the epoch's high-watermark reaches at least seq (durable write done). - * This is completed by the writer thread, so the pipeline never blocks/spins/polls. + * Completes when an epoch high-watermark reaches at least {@code seq}. */ public CompletableFuture whenPersisted(final long epoch, final long seq) { if (seq < 0) return DONE; - // fast-path: already persisted try { if (highWaterMark(epoch) >= seq) return DONE; } catch (final Throwable t) { @@ -148,8 +185,7 @@ public CompletableFuture whenPersisted(final long epoch, final long seq) { waitersByEpoch.computeIfAbsent(epoch, __ -> new ConcurrentLinkedQueue<>()) .offer(new SeqWaiter(seq, f)); - // NOTE: if writer already advanced, it’ll complete it on the next write; - // but if the epoch goes idle, we avoid leaking by doing a final check: + // Best-effort completion for races where the writer already advanced. 
try { if (highWaterMark(epoch) >= seq) { // best-effort complete; writer may still drain later @@ -216,6 +252,37 @@ private void offerWithBackoff(final byte[] payload, final long epoch) { } } + private void offerBatchWithBackoff(final byte[][] payloads, final int offset, final int count, final long epoch) { + int spins = 0; + + for (;;) { + final Throwable wf = writerFailure; + if (wf != null) { + throw new IllegalStateException("Ingress writer failed", wf); + } + + if (queue.offerBatch(payloads, offset, count, epoch)) return; + + if (Thread.currentThread().isInterrupted()) { + throw new RuntimeException("Interrupted while publishing batch"); + } + + if ((++spins & 1023) == 0) { + LockSupport.parkNanos(PARK_NANOS); + } else { + Thread.onSpinWait(); + } + } + } + + private static void validatePayloadBatch(final byte[][] payloads, final int offset, final int count) { + for (int i = 0; i < count; i++) { + if (payloads[offset + i] == null) { + throw new IllegalArgumentException("payload cannot be null at index " + (offset + i)); + } + } + } + public void appendBackfillBatch(final long epoch, final byte[][] payloads, final int count) throws IOException { if (count == 0) return; for (int i = 0; i < count; i++) { @@ -236,13 +303,61 @@ public void appendBackfillBatch(final long epoch, final byte[][] payloads, final completeWaiters(epoch, ledger.getHighWaterMark()); } + public int appendBackfillEncodedBatch(final long epoch, final ByteBuffer framedPayloads, final int maxMessages) throws IOException { + Objects.requireNonNull(framedPayloads, "framedPayloads"); + if (maxMessages <= 0 || framedPayloads.remaining() < Integer.BYTES) return 0; + + final var ledger = virtualLog.forEpoch(epoch); + int appended = 0; + long lastOffset = -1L; + + while (appended < maxMessages && framedPayloads.remaining() >= Integer.BYTES) { + final int len = peekLittleEndianInt(framedPayloads); + if (len < 0) break; + final int frameBytes; + try { + frameBytes = Math.addExact(Integer.BYTES, len); + } 
catch (final ArithmeticException ignored) { + break; + } + if (framedPayloads.remaining() < frameBytes) break; + + final LedgerSegment segment = ledger.writable(len); + final int written = segment.appendFramedBatchNoOffsets(framedPayloads, maxMessages - appended); + if (written <= 0) { + throw new IOException("Failed to append framed backfill payload for epoch " + epoch); + } + appended += written; + lastOffset = segment.getLastOffset(); + } + + if (appended > 0) { + ledger.setHighWaterMark(lastOffset); + completeWaiters(epoch, ledger.getHighWaterMark()); + } + + return appended; + } + private int computeTotalBytes(final byte[][] payloads, final int count) { - int total = 0; + long total = 0L; for (int i = 0; i < count; i++) { final int len = payloads[i].length; - total = Math.addExact(total, Integer.BYTES + Integer.BYTES + len); + total += (long) Integer.BYTES + Integer.BYTES + len; + if (total > Integer.MAX_VALUE) { + throw new IllegalArgumentException("payload batch too large: " + total); + } } - return total; + return (int) total; + } + + private static int peekLittleEndianInt(final ByteBuffer src) { + final int pos = src.position(); + final int b0 = src.get(pos) & 0xFF; + final int b1 = src.get(pos + 1) & 0xFF; + final int b2 = src.get(pos + 2) & 0xFF; + final int b3 = src.get(pos + 3) & 0xFF; + return b0 | (b1 << 8) | (b2 << 16) | (b3 << 24); } @PostConstruct @@ -266,6 +381,8 @@ private void writerLoop() { final SlotRing.Entry entry = new SlotRing.Entry(); final SlotRing.Entry carry = new SlotRing.Entry(); boolean hasCarry = false; + long cachedEpoch = Long.MIN_VALUE; + LedgerOrchestrator cachedLedger = null; try { while (!Thread.currentThread().isInterrupted()) { @@ -286,11 +403,11 @@ private void writerLoop() { } int count = 0; - int totalBytes = 0; + long totalBytesLong = 0L; final long batchEpoch = entry.epoch; batchBuffer[count++] = entry.payload; - totalBytes = Math.addExact(totalBytes, Integer.BYTES + Integer.BYTES + entry.payload.length); + 
totalBytesLong += (long) Integer.BYTES + Integer.BYTES + entry.payload.length; while (count < batchSize) { if (!queue.pollInto(entry)) break; @@ -307,12 +424,21 @@ private void writerLoop() { } batchBuffer[count++] = entry.payload; - totalBytes = Math.addExact(totalBytes, Integer.BYTES + Integer.BYTES + entry.payload.length); + totalBytesLong += (long) Integer.BYTES + Integer.BYTES + entry.payload.length; + } + + if (totalBytesLong > Integer.MAX_VALUE) { + throw new IllegalStateException("Batch bytes exceed int range: " + totalBytesLong); } + final int totalBytes = (int) totalBytesLong; batchView.setSize(count); - final LedgerOrchestrator ledger = virtualLog.forEpoch(batchEpoch); + if (cachedLedger == null || cachedEpoch != batchEpoch) { + cachedLedger = virtualLog.forEpoch(batchEpoch); + cachedEpoch = batchEpoch; + } + final LedgerOrchestrator ledger = cachedLedger; final LedgerSegment segment = ledger.writable(totalBytes); if (forceDurableWrites) { @@ -321,13 +447,22 @@ private void writerLoop() { segment.appendBatchNoOffsets(batchView, totalBytes); } - ledger.setHighWaterMark(segment.getLastOffset()); + final long endLedgerSeq = segment.getLastOffset(); + ledger.setHighWaterMark(endLedgerSeq); - // NEW: complete durability waiters for this epoch up to new HWM completeWaiters(batchEpoch, ledger.getHighWaterMark()); final long endSeq = ring.next(count); - ring.publishBatch(endSeq, count, batchBuffer); + ring.publishBatchSingleProducer(endSeq, count, batchBuffer); + + if (publishRingMapping) { + final long startLedgerSeq = endLedgerSeq - count + 1; + final long startRingSeq = endSeq - count + 1; + // Publish mapping after ring visibility for tail-cache consumers. 
+ ringSeqDelta = startLedgerSeq - startRingSeq; + lastPublishedRingSeq = endSeq; + lastPublishedLedgerSeq = endLedgerSeq; + } Arrays.fill(batchBuffer, 0, count, null); } @@ -370,6 +505,36 @@ public long highWaterMark(final long epoch) { return virtualLog.forEpoch(epoch).getHighWaterMark(); } + public boolean hasRingMapping() { + return publishRingMapping && ringSeqDelta != Long.MIN_VALUE; + } + + public long ringSeqDelta() { + return ringSeqDelta; + } + + public long lastPublishedRingSeq() { + return lastPublishedRingSeq; + } + + public long lastPublishedLedgerSeq() { + return lastPublishedLedgerSeq; + } + + public long ledgerSeqForRingSeq(final long ringSeq) { + if (!publishRingMapping) throw new IllegalStateException("Ring mapping disabled"); + final long delta = ringSeqDelta; + if (delta == Long.MIN_VALUE) throw new IllegalStateException("Ring mapping unavailable"); + return ringSeq + delta; + } + + public long ringSeqForLedgerSeq(final long ledgerSeq) { + if (!publishRingMapping) throw new IllegalStateException("Ring mapping disabled"); + final long delta = ringSeqDelta; + if (delta == Long.MIN_VALUE) throw new IllegalStateException("Ring mapping unavailable"); + return ledgerSeq - delta; + } + // -------------------- SlotRing -------------------- static final class SlotRing { @@ -387,8 +552,8 @@ static final class SlotRing { private final long[] sequence; private final byte[][] buffer; - private final PaddedAtomicLong tail = new PaddedAtomicLong(0); - private final PaddedAtomicLong head = new PaddedAtomicLong(0); + private final PaddedCounter tail = new PaddedCounter(0L); + private final PaddedCounter head = new PaddedCounter(0L); SlotRing(final int capacityPow2) { if (Integer.bitCount(capacityPow2) != 1) throw new IllegalArgumentException("capacity must be power of two"); @@ -433,6 +598,52 @@ boolean offer(final byte[] element, final long epoch) { return true; } + boolean offerBatch(final byte[][] elements, final int offset, final int count, final long epoch) 
{ + if (count <= 0) return true; + if (count > capacity) return false; + + long tailSnapshot; + + while (true) { + tailSnapshot = tail.get(); + + boolean retry = false; + for (int i = 0; i < count; i++) { + final long seq = tailSnapshot + i; + final int index = (int) (seq & mask); + + final long seqVal = (long) SEQUENCE_HANDLE.getVolatile(this.sequence, index); + final long difference = seqVal - seq; + + if (difference == 0) { + continue; + } + if (difference < 0) { + return false; + } + retry = true; + break; + } + + if (retry) { + Thread.onSpinWait(); + continue; + } + + if (tail.compareAndSet(tailSnapshot, tailSnapshot + count)) break; + } + + for (int i = 0; i < count; i++) { + final long seq = tailSnapshot + i; + final int index = (int) (seq & mask); + + BUFFER_HANDLE.setRelease(buffer, index, elements[offset + i]); + EPOCH_HANDLE.setRelease(epochs, index, epoch); + SEQUENCE_HANDLE.setRelease(sequence, index, seq + 1); + } + return true; + } + boolean pollInto(final Entry out) { long headSnapshot; @@ -474,11 +685,15 @@ static final class Entry { } } - private static final class PaddedAtomicLong extends AtomicLong { + private static final class PaddedCounter extends AtomicLong { + @SuppressWarnings("unused") volatile long p1, p2, p3, p4, p5, p6, p7; + @SuppressWarnings("unused") volatile long q1, q2, q3, q4, q5, q6, q7; - PaddedAtomicLong(final long initial) { super(initial); } + PaddedCounter(final long initial) { + super(initial); + } } private static final class ByteBatch extends AbstractList { diff --git a/src/main/java/io/ringbroker/cluster/client/RemoteBrokerClient.java b/src/main/java/io/ringbroker/cluster/client/RemoteBrokerClient.java index 352c8bf..284b68c 100644 --- a/src/main/java/io/ringbroker/cluster/client/RemoteBrokerClient.java +++ b/src/main/java/io/ringbroker/cluster/client/RemoteBrokerClient.java @@ -1,5 +1,6 @@ package io.ringbroker.cluster.client; +import com.google.protobuf.ByteString; import io.ringbroker.api.BrokerApi; import 
java.util.concurrent.CompletableFuture; @@ -8,28 +9,41 @@ * Abstraction over the broker-to-broker transport. */ public interface RemoteBrokerClient extends AutoCloseable { + byte[] EMPTY_BYTES = new byte[0]; /** - * Legacy method — still used by classic single-owner forwarders. + * Legacy method still used by single-owner forwarders. */ void sendMessage(String topic, byte[] key, byte[] payload); /** - * NEW: zero-copy path for replication. Default impl falls back to - * {@link #sendMessage(String, byte[], byte[])} if you only have a - * basic client implementation. + * Envelope path for replication. Default implementation falls back to + * {@link #sendMessage(String, byte[], byte[])} for basic clients. */ default void sendEnvelope(final BrokerApi.Envelope envelope) { if (envelope.hasPublish()) { final var m = envelope.getPublish(); sendMessage(m.getTopic(), - m.getKey().isEmpty() ? null : m.getKey().toByteArray(), - m.getPayload().toByteArray()); + byteStringToArrayOrNull(m.getKey()), + byteStringToArray(m.getPayload())); } else { throw new UnsupportedOperationException("Unsupported envelope type"); } } + private static byte[] byteStringToArray(final ByteString bytes) { + if (bytes == null || bytes.isEmpty()) return EMPTY_BYTES; + final int len = bytes.size(); + final byte[] out = new byte[len]; + bytes.copyTo(out, 0); + return out; + } + + private static byte[] byteStringToArrayOrNull(final ByteString bytes) { + if (bytes == null || bytes.isEmpty()) return null; + return byteStringToArray(bytes); + } + CompletableFuture sendEnvelopeWithAck(final BrokerApi.Envelope envelope); default CompletableFuture sendBackfill(final BrokerApi.Envelope envelope) { diff --git a/src/main/java/io/ringbroker/cluster/client/impl/NettyClusterClient.java b/src/main/java/io/ringbroker/cluster/client/impl/NettyClusterClient.java index c1f01f9..6919b3d 100644 --- a/src/main/java/io/ringbroker/cluster/client/impl/NettyClusterClient.java +++ 
b/src/main/java/io/ringbroker/cluster/client/impl/NettyClusterClient.java @@ -21,13 +21,21 @@ import java.util.concurrent.CompletableFuture; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentMap; +import java.util.concurrent.ScheduledFuture; import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; @Slf4j public final class NettyClusterClient implements RemoteBrokerClient { + private static final Object SHARED_GROUP_LOCK = new Object(); + private static final AtomicInteger SHARED_GROUP_REFS = new AtomicInteger(0); + private static final int SHARED_GROUP_THREADS = Math.max(2, Math.min(8, Runtime.getRuntime().availableProcessors())); + private static volatile EventLoopGroup sharedGroup; + private final Channel channel; private final EventLoopGroup group; @@ -36,12 +44,19 @@ public final class NettyClusterClient implements RemoteBrokerClient { private final ConcurrentMap> pendingBackfill = new ConcurrentHashMap<>(); + private final long requestTimeoutMillis; private final AtomicLong corrSeq = new AtomicLong(1L); private final AtomicBoolean closed = new AtomicBoolean(false); public NettyClusterClient(final String host, final int port) throws InterruptedException { - final IoHandlerFactory factory = NioIoHandler.newFactory(); - this.group = new MultiThreadIoEventLoopGroup(1, factory); + this(host, port, Long.getLong("ringbroker.cluster.requestTimeoutMillis", 5_000L)); + } + + public NettyClusterClient(final String host, + final int port, + final long requestTimeoutMillis) throws InterruptedException { + this.group = acquireSharedGroup(); + this.requestTimeoutMillis = Math.max(1L, requestTimeoutMillis); final Bootstrap bootstrap = new Bootstrap() .group(group) @@ -60,9 +75,20 @@ protected void initChannel(final SocketChannel ch) { } }); - this.channel = 
bootstrap.connect(new InetSocketAddress(host, port)) - .sync() - .channel(); + try { + this.channel = bootstrap.connect(new InetSocketAddress(host, port)) + .sync() + .channel(); + } catch (final InterruptedException ie) { + releaseSharedGroup(); + throw ie; + } catch (final RuntimeException re) { + releaseSharedGroup(); + throw re; + } catch (final Error err) { + releaseSharedGroup(); + throw err; + } log.info("NettyClusterClient connected to {}:{}", host, port); } @@ -111,7 +137,13 @@ public CompletableFuture sendEnvelopeWithAck(final Bro final CompletableFuture future = new CompletableFuture<>(); pendingAcks.put(corrId, future); - future.whenComplete((res, ex) -> pendingAcks.remove(corrId)); + final ScheduledFuture timeoutTask = channel.eventLoop().schedule(() -> { + future.completeExceptionally(new TimeoutException("Replication ack timeout corrId=" + corrId)); + }, requestTimeoutMillis, TimeUnit.MILLISECONDS); + future.whenComplete((res, ex) -> { + pendingAcks.remove(corrId); + timeoutTask.cancel(false); + }); channel.writeAndFlush(toSend).addListener(f -> { if (!f.isSuccess()) { @@ -134,7 +166,13 @@ public CompletableFuture sendBackfill(final BrokerApi.E final CompletableFuture future = new CompletableFuture<>(); pendingBackfill.put(corrId, future); - future.whenComplete((res, ex) -> pendingBackfill.remove(corrId)); + final ScheduledFuture timeoutTask = channel.eventLoop().schedule(() -> { + future.completeExceptionally(new TimeoutException("Backfill timeout corrId=" + corrId)); + }, requestTimeoutMillis, TimeUnit.MILLISECONDS); + future.whenComplete((res, ex) -> { + pendingBackfill.remove(corrId); + timeoutTask.cancel(false); + }); channel.writeAndFlush(toSend).addListener(f -> { if (!f.isSuccess()) { @@ -159,9 +197,36 @@ public void close() { try { if (channel != null) channel.close().syncUninterruptibly(); } finally { - if (group != null) group.shutdownGracefully(0, 2, TimeUnit.SECONDS).syncUninterruptibly(); + releaseSharedGroup(); } 
log.info("NettyClusterClient closed."); } + + private static EventLoopGroup acquireSharedGroup() { + synchronized (SHARED_GROUP_LOCK) { + EventLoopGroup g = sharedGroup; + if (g == null || g.isShuttingDown() || g.isShutdown() || g.isTerminated()) { + g = new MultiThreadIoEventLoopGroup(SHARED_GROUP_THREADS, NioIoHandler.newFactory()); + sharedGroup = g; + } + SHARED_GROUP_REFS.incrementAndGet(); + return g; + } + } + + private static void releaseSharedGroup() { + EventLoopGroup g = null; + synchronized (SHARED_GROUP_LOCK) { + final int refs = SHARED_GROUP_REFS.decrementAndGet(); + if (refs <= 0) { + SHARED_GROUP_REFS.set(0); + g = sharedGroup; + sharedGroup = null; + } + } + if (g != null) { + g.shutdownGracefully(0, 2, TimeUnit.SECONDS).syncUninterruptibly(); + } + } } diff --git a/src/main/java/io/ringbroker/cluster/membership/channel/ClientReplicationHandler.java b/src/main/java/io/ringbroker/cluster/membership/channel/ClientReplicationHandler.java index 8f14625..e856031 100644 --- a/src/main/java/io/ringbroker/cluster/membership/channel/ClientReplicationHandler.java +++ b/src/main/java/io/ringbroker/cluster/membership/channel/ClientReplicationHandler.java @@ -14,6 +14,12 @@ */ @Slf4j public final class ClientReplicationHandler extends SimpleChannelInboundHandler { + private static final BrokerApi.ReplicationAck ACK_SUCCESS = BrokerApi.ReplicationAck.newBuilder() + .setStatus(BrokerApi.ReplicationAck.Status.SUCCESS) + .build(); + private static final BrokerApi.ReplicationAck ACK_PERSISTENCE_FAILED = BrokerApi.ReplicationAck.newBuilder() + .setStatus(BrokerApi.ReplicationAck.Status.ERROR_PERSISTENCE_FAILED) + .build(); private final ConcurrentMap> pendingAcks; private final ConcurrentMap> pendingBackfill; @@ -41,11 +47,7 @@ protected void channelRead0(final ChannelHandlerContext ctx, final BrokerApi.Env if (envelope.hasPublishReply()) { final CompletableFuture fut = pendingAcks.remove(corrId); if (fut != null) { - final BrokerApi.ReplicationAck.Status status = - 
envelope.getPublishReply().getSuccess() - ? BrokerApi.ReplicationAck.Status.SUCCESS - : BrokerApi.ReplicationAck.Status.ERROR_PERSISTENCE_FAILED; - fut.complete(BrokerApi.ReplicationAck.newBuilder().setStatus(status).build()); + fut.complete(envelope.getPublishReply().getSuccess() ? ACK_SUCCESS : ACK_PERSISTENCE_FAILED); } else { log.warn("PublishReply for unknown corrId {}", corrId); } diff --git a/src/main/java/io/ringbroker/cluster/membership/hash/HashingProvider.java b/src/main/java/io/ringbroker/cluster/membership/hash/HashingProvider.java index 1ee8941..6ee71e5 100644 --- a/src/main/java/io/ringbroker/cluster/membership/hash/HashingProvider.java +++ b/src/main/java/io/ringbroker/cluster/membership/hash/HashingProvider.java @@ -6,9 +6,7 @@ import java.util.ArrayList; import java.util.Collection; -import java.util.Comparator; import java.util.List; -import java.util.stream.Collectors; /** * Highest‑Random‑Weight hashing, stable under membership churn. @@ -49,18 +47,67 @@ public int primary(final int key, final Collection members) { public List topN(final int key, final int n, final Collection members) { - final List persistenceOnly = members.stream() - .filter(m -> m.role() == BrokerRole.PERSISTENCE) - .toList(); - - final List candidates = persistenceOnly.isEmpty() - ? 
new ArrayList<>(members) // fallback: no persistence nodes; use all - : persistenceOnly; - - return candidates.stream() - .sorted(Comparator.comparingLong(m -> -score(key, m.brokerId()))) - .limit(n) - .map(Member::brokerId) - .collect(Collectors.toList()); + if (n <= 0 || members.isEmpty()) { + return List.of(); + } + + final Member[] candidateBuf = new Member[members.size()]; + int candidateCount = 0; + + for (final Member m : members) { + if (m.role() == BrokerRole.PERSISTENCE) { + candidateBuf[candidateCount++] = m; + } + } + + if (candidateCount == 0) { + for (final Member m : members) { + candidateBuf[candidateCount++] = m; + } + } + + final int limit = Math.min(n, candidateCount); + final int[] bestIds = new int[limit]; + final long[] bestScores = new long[limit]; + int bestCount = 0; + + for (int i = 0; i < candidateCount; i++) { + final int brokerId = candidateBuf[i].brokerId(); + final long s = score(key, brokerId); + + int insertAt = bestCount; + while (insertAt > 0) { + final int prev = insertAt - 1; + final long prevScore = bestScores[prev]; + final int prevId = bestIds[prev]; + + final boolean shouldShift = + s > prevScore || (s == prevScore && brokerId < prevId); + if (!shouldShift) break; + insertAt--; + } + + if (insertAt >= limit) { + continue; + } + + final int upper = Math.min(bestCount, limit - 1); + for (int j = upper; j > insertAt; j--) { + bestScores[j] = bestScores[j - 1]; + bestIds[j] = bestIds[j - 1]; + } + + bestScores[insertAt] = s; + bestIds[insertAt] = brokerId; + if (bestCount < limit) { + bestCount++; + } + } + + final ArrayList out = new ArrayList<>(bestCount); + for (int i = 0; i < bestCount; i++) { + out.add(bestIds[i]); + } + return out; } } diff --git a/src/main/java/io/ringbroker/cluster/membership/replicator/AdaptiveReplicator.java b/src/main/java/io/ringbroker/cluster/membership/replicator/AdaptiveReplicator.java index d491334..9872165 100644 --- 
a/src/main/java/io/ringbroker/cluster/membership/replicator/AdaptiveReplicator.java +++ b/src/main/java/io/ringbroker/cluster/membership/replicator/AdaptiveReplicator.java @@ -5,36 +5,51 @@ import lombok.Getter; import lombok.extern.slf4j.Slf4j; +import java.lang.invoke.MethodHandles; +import java.lang.invoke.VarHandle; import java.util.List; import java.util.Map; import java.util.Objects; import java.util.concurrent.*; import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicLong; import java.util.concurrent.locks.LockSupport; /** - * Latency-aware quorum replicator (failover-safe). + * Latency-aware quorum replicator (failover-safe) optimized for high-throughput / low-latency. * - * Key properties: - * - keeps LIVE reference to clients map (supports wiring after construction) - * - no per-call sorting; O(n) selection for fastest candidates - * - waits completions in any order (no head-of-line blocking) - * - if a chosen replica fails/times out, automatically starts another to still reach quorum - * - * IMPORTANT: Hot-path overload uses primitive arrays to avoid boxing / List allocations. + * Fixes vs earlier perf implementation: + * - Treats CompletableFuture timeouts (TimeoutException) as retriable within the same replicate() call, + * like ERROR_REPLICA_NOT_READY, to avoid segment-creation warmup stalls wedging quorum. + * - Sizes completion queue to tolerate retries/bursts (avoids producer spinning if queue fills). */ @Slf4j @Getter public final class AdaptiveReplicator { + private static final int ACK_NONE = -1; + private static final int SUCCESS_ORDINAL = BrokerApi.ReplicationAck.Status.SUCCESS.ordinal(); + private static final int NOT_READY_ORDINAL = BrokerApi.ReplicationAck.Status.ERROR_REPLICA_NOT_READY.ordinal(); + + // Retry backoff for transient states within a single replicate() call. 
+ private static final long RETRY_INITIAL_NS = TimeUnit.MICROSECONDS.toNanos(50); + private static final long RETRY_TIMEOUT_INITIAL_NS = TimeUnit.MICROSECONDS.toNanos(200); + private static final long RETRY_MAX_NS = TimeUnit.MILLISECONDS.toNanos(5); + + // If no inflight and no scheduled retries, don't fail fast; briefly park and re-check until deadline. + private static final long NO_PROGRESS_PARK_NS = TimeUnit.MICROSECONDS.toNanos(200); + private final int ackQuorum; private final Map clients; // LIVE reference private final long timeoutMillis; - // EWMA latency (ns) per node - private final ConcurrentMap ewmaNs = new ConcurrentHashMap<>(); - private final double alpha = 0.2; - private final double defaultNs; + // EWMA latency in ns per node (integer EWMA). + private final ConcurrentMap ewmaNs = new ConcurrentHashMap<>(); + // new = prev + ((sample - prev) >> ewmaShift); ewmaShift=3 => alpha=1/8 + private final int ewmaShift = 3; + + private final long defaultNs; + private final long maxPenaltyNs; private final ExecutorService background = Executors.newSingleThreadExecutor(r -> { @@ -52,10 +67,12 @@ public AdaptiveReplicator(final int ackQuorum, this.ackQuorum = ackQuorum; this.clients = Objects.requireNonNull(clients, "clients"); this.timeoutMillis = timeoutMillis; + this.defaultNs = TimeUnit.MILLISECONDS.toNanos(1); + this.maxPenaltyNs = TimeUnit.SECONDS.toNanos(10); for (final Integer id : clients.keySet()) { - ewmaNs.put(id, defaultNs); + ewmaNs.put(id, new AtomicLong(defaultNs)); } } @@ -84,118 +101,190 @@ public void replicate(final BrokerApi.Envelope frame, replicate(frame, replicas, replicaCount, this.ackQuorum); } + private static int clampQuorum(final int quorumOverride, final int n) { + final int q = Math.max(1, quorumOverride); + return Math.min(q, n); + } + + private static int nextPow2AtLeast(final int x) { + int v = (x <= 2) ? 
2 : x; + v--; + v |= v >>> 1; + v |= v >>> 2; + v |= v >>> 4; + v |= v >>> 8; + v |= v >>> 16; + v++; + return v; + } + public void replicate(final BrokerApi.Envelope frame, final int[] replicas, final int replicaCount, final int quorumOverride) throws InterruptedException, TimeoutException { + if (frame == null) throw new NullPointerException("frame"); if (replicas == null || replicaCount <= 0) throw new TimeoutException("No replicas provided"); + if (replicaCount > replicas.length) throw new IllegalArgumentException("replicaCount > replicas.length"); final int n = replicaCount; - final int quorum = Math.min(Math.max(1, quorumOverride), n); + final int quorum = clampQuorum(quorumOverride, n); + + // Per-call client cache (refresh from LIVE map when null). + final RemoteBrokerClient[] clientCache = new RemoteBrokerClient[n]; + int availableAtStart = 0; + for (int i = 0; i < n; i++) { + final RemoteBrokerClient c = clients.get(replicas[i]); + clientCache[i] = c; + if (c != null) availableAtStart++; + } + if (availableAtStart < quorum) { + throw new TimeoutException("Not enough replicas available to start quorum=" + quorum + + " (availableAtStart=" + availableAtStart + ")"); + } final long deadlineNs = System.nanoTime() + TimeUnit.MILLISECONDS.toNanos(timeoutMillis); + // attempted[i] means "do not start right now" (either in-flight OR permanently exhausted by non-retriable failure). final boolean[] attempted = new boolean[n]; final boolean[] completed = new boolean[n]; @SuppressWarnings("unchecked") - final CompletableFuture[] inflight = - new CompletableFuture[n]; + final CompletableFuture[] inflight = new CompletableFuture[n]; + + // Written by completers before publishing idx into doneQ. + final int[] ackStatus = new int[n]; + final Throwable[] errs = new Throwable[n]; + final long[] latencyNs = new long[n]; + for (int i = 0; i < n; i++) ackStatus[i] = ACK_NONE; + + // Transient retry scheduling per idx. 
+ final long[] retryAfterNs = new long[n]; // 0 means "eligible now" + final long[] retryBackoffNs = new long[n]; + for (int i = 0; i < n; i++) retryBackoffNs[i] = RETRY_INITIAL_NS; - final ArrayBlockingQueue doneQ = new ArrayBlockingQueue<>(n); + // IMPORTANT: completions can exceed n when we retry; give headroom to avoid offer() spinning on producers. + final IntMpscQueue doneQ = new IntMpscQueue(nextPow2AtLeast(Math.max(2, n << 2))); int started = 0; int doneCount = 0; int successes = 0; - String firstFailure = null; - // Start enough attempts to be *capable* of reaching quorum. - while (started < quorum) { - final int idx = pickBestAvailableIndex(replicas, n, attempted); - if (idx < 0) { - final long rem = deadlineNs - System.nanoTime(); - if (rem <= 0) break; - LockSupport.parkNanos(Math.min(rem, TimeUnit.MILLISECONDS.toNanos(1))); - continue; - } + int firstFailNode = Integer.MIN_VALUE; + int firstFailStatus = ACK_NONE; + Throwable firstFailErr = null; - attempted[idx] = true; + while (successes < quorum) { + final long nowNs = System.nanoTime(); + if (nowNs >= deadlineNs) break; - final int nodeId = replicas[idx]; - final RemoteBrokerClient client = clients.get(nodeId); - if (client == null) continue; - - final long startNs = System.nanoTime(); - final CompletableFuture f = client.sendEnvelopeWithAck(frame); - inflight[idx] = f; - started++; - - f.whenComplete((ack, err) -> { - final long lat = System.nanoTime() - startNs; - // Never block a Netty event loop thread; offer is safe for n-sized queue. - doneQ.offer(new Done(idx, nodeId, ack, err, lat)); - }); - } + // Top-up: ensure successes + inflight >= quorum (no HOL blocking). 
+ while (successes + (started - doneCount) < quorum) { + final long now2 = System.nanoTime(); + if (now2 >= deadlineNs) break; - if (started < quorum) { - throw new TimeoutException("Not enough replicas available to start quorum=" + quorum + " (started=" + started + ")"); - } + final int idx = pickBestEligibleIndex(replicas, clientCache, attempted, retryAfterNs, n, now2); + if (idx < 0) break; - while (successes < quorum) { - long remainingNs = deadlineNs - System.nanoTime(); - if (remainingNs <= 0) break; + final int nodeId = replicas[idx]; + final RemoteBrokerClient client = getClient(idx, nodeId, clientCache); + if (client == null) { + // Don't mark attempted; allow late wiring within the same call. + retryAfterNs[idx] = 0L; + continue; + } + + // startAttempt(idx) + attempted[idx] = true; + completed[idx] = false; + started++; + + final long startNs = System.nanoTime(); + try { + final CompletableFuture f = client.sendEnvelopeWithAck(frame); + inflight[idx] = f; + + f.whenComplete((ack, err) -> { + ackStatus[idx] = (ack == null) ? ACK_NONE : ack.getStatus().ordinal(); + errs[idx] = err; + latencyNs[idx] = System.nanoTime() - startNs; + doneQ.offer(idx); + }); + } catch (final Throwable t) { + ackStatus[idx] = ACK_NONE; + errs[idx] = t; + latencyNs[idx] = System.nanoTime() - startNs; + doneQ.offer(idx); + } + } + + if (successes >= quorum) break; - final Done d = doneQ.poll(remainingNs, TimeUnit.NANOSECONDS); - if (d == null) break; + // If nothing is inflight, sleep until the next scheduled retry (or a short park) or deadline. 
+ long wakeNs = deadlineNs; + if (started == doneCount) { + final long nextRetry = findNextRetryNs(replicas, clientCache, attempted, retryAfterNs, n, nowNs); + if (nextRetry == Long.MAX_VALUE) { + wakeNs = Math.min(wakeNs, nowNs + NO_PROGRESS_PARK_NS); + } else { + wakeNs = Math.min(wakeNs, nextRetry); + } + } - if (!completed[d.idx]) { - completed[d.idx] = true; - doneCount++; + final int idx = doneQ.pollIntUntil(wakeNs); + if (idx < 0) { + // Woke up for retry/deadline; loop continues. + continue; } - if (d.success()) { + if (completed[idx]) continue; // defensive + completed[idx] = true; + doneCount++; + + final int nodeId = replicas[idx]; + final int st = ackStatus[idx]; + final Throwable err = errs[idx]; + final Throwable rootErr = unwrap(err); + + final boolean ok = (err == null && st == SUCCESS_ORDINAL); + if (ok) { successes++; - reward(d.nodeId, d.latencyNs); - } else { - penalize(d.nodeId); - if (firstFailure == null) { - final String status = (d.ack == null) - ? "no-ack" - : d.ack.getStatus().name(); - firstFailure = "node=" + d.nodeId + " status=" + status + - (d.err != null ? " err=" + d.err : ""); - } + retryBackoffNs[idx] = RETRY_INITIAL_NS; + retryAfterNs[idx] = 0L; + reward(nodeId, latencyNs[idx]); + continue; } - // If impossible to reach quorum with remaining inflight, start failover attempts. - while (successes + (started - doneCount) < quorum) { - remainingNs = deadlineNs - System.nanoTime(); - if (remainingNs <= 0) break; + if (firstFailNode == Integer.MIN_VALUE) { + firstFailNode = nodeId; + firstFailStatus = st; + firstFailErr = err; + } - final int idx = pickBestAvailableIndex(replicas, n, attempted); - if (idx < 0) break; + // ERROR_REPLICA_NOT_READY is retriable within call. 
+ if (err == null && st == NOT_READY_ORDINAL) { + penalizeNotReady(nodeId); - attempted[idx] = true; + attempted[idx] = false; // allow retry + scheduleRetry(idx, System.nanoTime(), deadlineNs, retryAfterNs, retryBackoffNs, RETRY_INITIAL_NS); + continue; + } - final int nodeId = replicas[idx]; - final RemoteBrokerClient client = clients.get(nodeId); - if (client == null) continue; + // TimeoutException from the per-replica CompletableFuture (often segment creation stalls) is retriable. + if (rootErr instanceof TimeoutException) { + penalizeNotReady(nodeId); // soft penalty; usually transient - final long startNs = System.nanoTime(); - final CompletableFuture f = client.sendEnvelopeWithAck(frame); - inflight[idx] = f; - started++; + attempted[idx] = false; // allow retry + final CompletableFuture f = inflight[idx]; + if (f != null) f.cancel(true); - f.whenComplete((ack, err) -> { - final long lat = System.nanoTime() - startNs; - doneQ.offer(new Done(idx, nodeId, ack, err, lat)); - }); + scheduleRetry(idx, System.nanoTime(), deadlineNs, retryAfterNs, retryBackoffNs, RETRY_TIMEOUT_INITIAL_NS); + continue; } - // EARLY EXIT: all attempted completed, none left, cannot reach quorum. - if (doneCount == started && started == n && successes < quorum) break; + // Non-success, non-retriable: penalize and keep attempted[idx]=true (do not retry this idx in this call). + penalize(nodeId); } if (successes < quorum) { @@ -203,17 +292,30 @@ public void replicate(final BrokerApi.Envelope frame, final CompletableFuture f = inflight[i]; if (f != null && !f.isDone()) f.cancel(true); } - final String cause = (firstFailure == null) ? "no responses" : firstFailure; - throw new TimeoutException("Quorum timed out: got " + successes + "/" + quorum + " (firstFailure=" + cause + ")"); + + final String cause; + if (firstFailNode == Integer.MIN_VALUE) { + cause = "no responses"; + } else { + final String statusStr = (firstFailStatus == ACK_NONE) + ? 
"no-ack" + : BrokerApi.ReplicationAck.Status.values()[firstFailStatus].name(); + cause = "node=" + firstFailNode + " status=" + statusStr + + (firstFailErr != null ? " err=" + firstFailErr : ""); + } + + final boolean timedOut = System.nanoTime() >= deadlineNs; + final String prefix = timedOut ? "Quorum timed out" : "Quorum failed"; + throw new TimeoutException(prefix + ": got " + successes + "/" + quorum + " (firstFailure=" + cause + ")"); } - // Background replicate remaining replicas (not attempted) + // Background replicate remaining replicas (not attempted), preserving original behavior. if (!closed.get()) { for (int i = 0; i < n; i++) { if (attempted[i]) continue; final int nodeId = replicas[i]; - final RemoteBrokerClient client = clients.get(nodeId); + final RemoteBrokerClient client = getClient(i, nodeId, clientCache); if (client == null) continue; background.execute(() -> { @@ -221,7 +323,8 @@ public void replicate(final BrokerApi.Envelope frame, try { final BrokerApi.ReplicationAck ack = client.sendEnvelopeWithAck(frame).get(timeoutMillis, TimeUnit.MILLISECONDS); - if (ack.getStatus() == BrokerApi.ReplicationAck.Status.SUCCESS) { + + if (ack != null && ack.getStatus() == BrokerApi.ReplicationAck.Status.SUCCESS) { reward(nodeId, System.nanoTime() - startNs); } else { penalize(nodeId); @@ -235,53 +338,106 @@ public void replicate(final BrokerApi.Envelope frame, } } - private static final class Done { - final int idx; - final int nodeId; - final BrokerApi.ReplicationAck ack; - final Throwable err; - final long latencyNs; - - Done(final int idx, final int nodeId, final BrokerApi.ReplicationAck ack, final Throwable err, final long latencyNs) { - this.idx = idx; - this.nodeId = nodeId; - this.ack = ack; - this.err = err; - this.latencyNs = latencyNs; - } + private static void scheduleRetry(final int idx, + final long nowNs, + final long deadlineNs, + final long[] retryAfterNs, + final long[] retryBackoffNs, + final long minInitialBackoffNs) { + long b = 
retryBackoffNs[idx]; + if (b < minInitialBackoffNs) b = minInitialBackoffNs; + + long next = nowNs + b; + if (next > deadlineNs) next = deadlineNs; + retryAfterNs[idx] = next; + + final long bumped = b << 1; + retryBackoffNs[idx] = (bumped <= 0L) ? RETRY_MAX_NS : Math.min(bumped, RETRY_MAX_NS); + } - boolean success() { - return err == null && ack != null && ack.getStatus() == BrokerApi.ReplicationAck.Status.SUCCESS; + private static Throwable unwrap(Throwable t) { + while (t instanceof CompletionException || t instanceof ExecutionException) { + final Throwable c = t.getCause(); + if (c == null) break; + t = c; } + return t; } - private void reward(final int nodeId, final long latencyNs) { - ewmaNs.compute(nodeId, (id, prev) -> { - final double p = (prev == null) ? defaultNs : prev; - return (1.0 - alpha) * p + alpha * (double) latencyNs; - }); + private RemoteBrokerClient getClient(final int idx, final int nodeId, final RemoteBrokerClient[] cache) { + RemoteBrokerClient c = cache[idx]; + if (c != null) return c; + c = clients.get(nodeId); + cache[idx] = c; + return c; + } + + private void reward(final int nodeId, final long sampleNs) { + final AtomicLong a = ewmaNs.computeIfAbsent(nodeId, id -> new AtomicLong(defaultNs)); + long prev, next; + do { + prev = a.get(); + next = prev + ((sampleNs - prev) >> ewmaShift); + if (next < defaultNs) next = defaultNs; + } while (!a.compareAndSet(prev, next)); } private void penalize(final int nodeId) { - ewmaNs.compute(nodeId, (id, prev) -> { - final double p = (prev == null) ? 
defaultNs : prev; - final double bumped = Math.min(p * 2.0, (double) TimeUnit.SECONDS.toNanos(10)); - return Math.max(bumped, defaultNs); - }); + final AtomicLong a = ewmaNs.computeIfAbsent(nodeId, id -> new AtomicLong(defaultNs)); + long prev, next; + do { + prev = a.get(); + if (prev >= (maxPenaltyNs >>> 1)) next = maxPenaltyNs; + else { + next = prev << 1; + if (next < defaultNs) next = defaultNs; + } + } while (!a.compareAndSet(prev, next)); + } + + // Softer penalty for transient states: bump EWMA by 25% to deprioritize without blacklisting. + private void penalizeNotReady(final int nodeId) { + final AtomicLong a = ewmaNs.computeIfAbsent(nodeId, id -> new AtomicLong(defaultNs)); + long prev, next; + do { + prev = a.get(); + final long bump = prev + (prev >>> 2); // *1.25 + next = Math.min(Math.max(bump, defaultNs), maxPenaltyNs); + } while (!a.compareAndSet(prev, next)); } - private int pickBestAvailableIndex(final int[] replicas, final int n, final boolean[] attempted) { + /** + * Pick best eligible candidate: + * - not attempted + * - client present (live map refresh via cache) + * - now >= retryAfterNs[i] + * - minimal EWMA + */ + private int pickBestEligibleIndex(final int[] replicas, + final RemoteBrokerClient[] clientCache, + final boolean[] attempted, + final long[] retryAfterNs, + final int n, + final long nowNs) { int bestIdx = -1; - double bestScore = Double.POSITIVE_INFINITY; + long bestScore = Long.MAX_VALUE; for (int i = 0; i < n; i++) { if (attempted[i]) continue; + final long ra = retryAfterNs[i]; + if (ra != 0L && nowNs < ra) continue; final int nodeId = replicas[i]; - final RemoteBrokerClient c = clients.get(nodeId); + RemoteBrokerClient c = clientCache[i]; + if (c == null) { + c = clients.get(nodeId); + clientCache[i] = c; + } if (c == null) continue; - final double score = ewmaNs.getOrDefault(nodeId, defaultNs); + final AtomicLong a = ewmaNs.get(nodeId); + final long score = (a == null) ? 
defaultNs : a.get(); + if (score < bestScore) { bestScore = score; bestIdx = i; @@ -294,4 +450,171 @@ public void shutdown() { if (!closed.compareAndSet(false, true)) return; background.shutdownNow(); } + + /** + * Find earliest retryAfterNs among candidates that are not attempted and have a client. + * Returns Long.MAX_VALUE if no scheduled retries exist. + */ + private long findNextRetryNs(final int[] replicas, + final RemoteBrokerClient[] clientCache, + final boolean[] attempted, + final long[] retryAfterNs, + final int n, + final long nowNs) { + long best = Long.MAX_VALUE; + for (int i = 0; i < n; i++) { + if (attempted[i]) continue; + + final long ra = retryAfterNs[i]; + if (ra == 0L || ra <= nowNs) continue; + + final int nodeId = replicas[i]; + RemoteBrokerClient c = clientCache[i]; + if (c == null) { + c = clients.get(nodeId); + clientCache[i] = c; + } + if (c == null) continue; + + if (ra < best) best = ra; + } + return best; + } + + /** + * Multi-producer / single-consumer bounded ring queue (Vyukov MPSC) for indices. + * sequence[] must be long (monotonic tickets). buffer[] is int for cache density. 
+ */ + private static final class IntMpscQueue { + private static final VarHandle SEQ, BUF; + + static { + SEQ = MethodHandles.arrayElementVarHandle(long[].class); + BUF = MethodHandles.arrayElementVarHandle(int[].class); + } + + private final int mask; + private final int capacity; + private final long[] sequence; + private final int[] buffer; + + private final PaddedCounter tail = new PaddedCounter(0L); + private final PaddedCounter head = new PaddedCounter(0L); + + private volatile Thread waiter; + + IntMpscQueue(final int capacityPow2) { + if (Integer.bitCount(capacityPow2) != 1) throw new IllegalArgumentException("capacity must be pow2"); + this.capacity = capacityPow2; + this.mask = capacityPow2 - 1; + this.sequence = new long[capacityPow2]; + this.buffer = new int[capacityPow2]; + for (int i = 0; i < capacityPow2; i++) sequence[i] = i; + } + + void offer(final int item) { + long t; + while (true) { + t = tail.get(); + final int idx = (int) (t & mask); + final long sv = (long) SEQ.getVolatile(sequence, idx); + final long dif = sv - t; + if (dif == 0) { + if (tail.compareAndSet(t, t + 1)) break; + } else { + Thread.onSpinWait(); + } + } + + final int idx = (int) (t & mask); + BUF.setRelease(buffer, idx, item); + SEQ.setRelease(sequence, idx, t + 1); + + final Thread w = waiter; + if (w != null) LockSupport.unpark(w); + } + + private int pollRaw() { + long h; + while (true) { + h = head.get(); + final int idx = (int) (h & mask); + final long sv = (long) SEQ.getVolatile(sequence, idx); + final long dif = sv - (h + 1); + if (dif == 0) { + if (head.compareAndSet(h, h + 1)) { + final int item = (int) BUF.getAcquire(buffer, idx); + BUF.setRelease(buffer, idx, 0); + SEQ.setRelease(sequence, idx, h + capacity); + return item; + } + } else if (dif < 0) { + return Integer.MIN_VALUE; // empty + } else { + Thread.onSpinWait(); + } + } + } + + /** + * Poll until deadlineNs, parking when empty. + * + * @return index [0..] 
or -1 on timeout + */ + int pollIntUntil(final long deadlineNs) throws InterruptedException { + for (int i = 0; i < 128; i++) { + final int v = pollRaw(); + if (v != Integer.MIN_VALUE) return v; + if (System.nanoTime() >= deadlineNs) return -1; + Thread.onSpinWait(); + } + + final Thread me = Thread.currentThread(); + waiter = me; + try { + while (true) { + final int v = pollRaw(); + if (v != Integer.MIN_VALUE) return v; + + final long remaining = deadlineNs - System.nanoTime(); + if (remaining <= 0) return -1; + + if (Thread.interrupted()) throw new InterruptedException(); + LockSupport.parkNanos(this, remaining); + } + } finally { + waiter = null; + } + } + + private static final class PaddedCounter { + private static final VarHandle VALUE; + + static { + try { + VALUE = MethodHandles.lookup().findVarHandle(PaddedCounter.class, "value", long.class); + } catch (final ReflectiveOperationException e) { + throw new ExceptionInInitializerError(e); + } + } + + @SuppressWarnings("unused") + private long p1, p2, p3, p4, p5, p6, p7; + private volatile long value; + @SuppressWarnings("unused") + private long q1, q2, q3, q4, q5, q6, q7; + + PaddedCounter(final long initial) { + VALUE.setRelease(this, initial); + } + + long get() { + return (long) VALUE.getVolatile(this); + } + + boolean compareAndSet(final long expect, final long update) { + return VALUE.compareAndSet(this, expect, update); + } + } + } } diff --git a/src/main/java/io/ringbroker/cluster/metadata/BroadcastingLogMetadataStore.java b/src/main/java/io/ringbroker/cluster/metadata/BroadcastingLogMetadataStore.java index 5d765a9..a93b3b8 100644 --- a/src/main/java/io/ringbroker/cluster/metadata/BroadcastingLogMetadataStore.java +++ b/src/main/java/io/ringbroker/cluster/metadata/BroadcastingLogMetadataStore.java @@ -5,6 +5,7 @@ import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; +import java.util.ArrayList; import java.util.Collection; import java.util.List; import java.util.Map; @@ -23,15 
+24,18 @@ public final class BroadcastingLogMetadataStore implements LogMetadataStore { private final java.util.function.Supplier> clusterView; public static LogConfiguration fromProto(final BrokerApi.MetadataUpdate update) { - final List epochs = update.getEpochsList().stream() - .map(ec -> new EpochMetadata( - ec.getEpoch(), - ec.getStartSeq(), - ec.getEndSeq(), - new EpochPlacement(ec.getEpoch(), ec.getStorageNodesList(), ec.getAckQuorum()), - ec.getTieBreaker() - )) - .toList(); + final int size = update.getEpochsCount(); + final List epochs = new ArrayList<>(size); + for (int i = 0; i < size; i++) { + final BrokerApi.EpochConfig ec = update.getEpochs(i); + epochs.add(new EpochMetadata( + ec.getEpoch(), + ec.getStartSeq(), + ec.getEndSeq(), + new EpochPlacement(ec.getEpoch(), ec.getStorageNodesList(), ec.getAckQuorum()), + ec.getTieBreaker() + )); + } return new LogConfiguration(update.getPartitionId(), update.getConfigVersion(), epochs); } diff --git a/src/main/java/io/ringbroker/cluster/metadata/EpochPlacement.java b/src/main/java/io/ringbroker/cluster/metadata/EpochPlacement.java index 75a51eb..a62afbe 100644 --- a/src/main/java/io/ringbroker/cluster/metadata/EpochPlacement.java +++ b/src/main/java/io/ringbroker/cluster/metadata/EpochPlacement.java @@ -3,7 +3,6 @@ import lombok.Getter; import lombok.ToString; -import java.util.Collections; import java.util.List; import java.util.Objects; @@ -30,12 +29,16 @@ public EpochPlacement(final long epoch, } this.epoch = epoch; this.storageNodes = List.copyOf(storageNodes); - this.storageNodesArray = storageNodes.stream().mapToInt(Integer::intValue).toArray(); + final int size = this.storageNodes.size(); + this.storageNodesArray = new int[size]; + for (int i = 0; i < size; i++) { + this.storageNodesArray[i] = this.storageNodes.get(i); + } this.ackQuorum = ackQuorum; } public List getStorageNodes() { - return Collections.unmodifiableList(storageNodes); + return storageNodes; } public int[] getStorageNodesArray() { diff 
--git a/src/main/java/io/ringbroker/core/barrier/Barrier.java b/src/main/java/io/ringbroker/core/barrier/Barrier.java index 2a90f91..5b5e308 100644 --- a/src/main/java/io/ringbroker/core/barrier/Barrier.java +++ b/src/main/java/io/ringbroker/core/barrier/Barrier.java @@ -13,11 +13,12 @@ * Coordinates blocking and wake-ups for a WaitStrategy. */ @RequiredArgsConstructor -public final class Barrier { - private final Sequence cursor; - private final WaitStrategy waitStrategy; - private final Lock lock = new ReentrantLock(); - private final Condition condition = lock.newCondition(); +public final class Barrier { + private final Sequence cursor; + private final WaitStrategy waitStrategy; + private final Lock lock = new ReentrantLock(); + private final Condition condition = lock.newCondition(); + private volatile int waiters; /** * -- GETTER -- * Check if an alert has been raised. @@ -35,14 +36,19 @@ public long waitFor(final long seq) throws InterruptedException { /** * Called by producers to wake up any blocked consumer. */ - public void signal() { - lock.lock(); - try { - condition.signalAll(); - } finally { - lock.unlock(); - } - // No need to call waitStrategy.signalAll(); wait strategies either don't block or use this Barrier. + public void signal() { + if (waiters == 0) { + return; + } + lock.lock(); + try { + if (waiters > 0) { + condition.signalAll(); + } + } finally { + lock.unlock(); + } + // No need to call waitStrategy.signalAll(); wait strategies either don't block or use this Barrier. } /** @@ -53,18 +59,26 @@ public void alert() { signal(); } - /** - * Called by wait strategies to block the consumer thread. - */ - public void block() throws InterruptedException { - lock.lock(); - try { - // Only wait if not alerted (to avoid waiting when we should be breaking out) - if (!alerted) { - condition.await(); - } - } finally { - lock.unlock(); - } - } -} + /** + * Called by wait strategies to block the consumer thread until {@code seq} is published. 
+ * Re-checks cursor under lock to avoid missed wakeups. + */ + public void block(final long seq) throws InterruptedException { + lock.lock(); + try { + if (alerted) return; + if (cursor.getValue() >= seq) return; + + waiters++; + try { + while (!alerted && cursor.getValue() < seq) { + condition.await(); + } + } finally { + waiters--; + } + } finally { + lock.unlock(); + } + } +} diff --git a/src/main/java/io/ringbroker/core/ring/RingBuffer.java b/src/main/java/io/ringbroker/core/ring/RingBuffer.java index 3b9c516..4389fe6 100644 --- a/src/main/java/io/ringbroker/core/ring/RingBuffer.java +++ b/src/main/java/io/ringbroker/core/ring/RingBuffer.java @@ -119,6 +119,24 @@ public void publishBatch(final long endSeq, final int count, final E[] batch) { barrier.signal(); } + /** + * Publish a batch when the caller is the only publisher for this ring. + * Skips inter-publisher ordering spin because there is no concurrent publisher. + */ + @SuppressWarnings("unchecked") + public void publishBatchSingleProducer(final long endSeq, final int count, final E[] batch) { + final Object[] localEntries = this.entries; + final int m = this.mask; + + long seq = endSeq - count + 1; + for (int i = 0; i < count; i++, seq++) { + ARRAY_HANDLE.setRelease(localEntries, (int) (seq & m), batch[i]); + } + + cursor.setValue(endSeq); + barrier.signal(); + } + /** * Retrieve an entry, blocking until it’s published. * @@ -142,10 +160,11 @@ public long getCursor() { final class PaddedSequence { private final AtomicLong value; - // 7 longs of pre‐padding + + // 7 longs of pre-padding @SuppressWarnings("unused") private long p1, p2, p3, p4, p5, p6, p7; - // 7 longs of post‐padding + // 7 longs of post-padding @SuppressWarnings("unused") private long p8, p9, p10, p11, p12, p13, p14; @@ -157,7 +176,7 @@ public PaddedSequence(final long initial) { * Atomically increments by one and returns the updated value. 
*/ public long incrementAndGet() { - return value.incrementAndGet(); + return addAndGet(1L); } /** diff --git a/src/main/java/io/ringbroker/core/wait/AdaptiveSpin.java b/src/main/java/io/ringbroker/core/wait/AdaptiveSpin.java index 6cd9b58..773decc 100644 --- a/src/main/java/io/ringbroker/core/wait/AdaptiveSpin.java +++ b/src/main/java/io/ringbroker/core/wait/AdaptiveSpin.java @@ -4,7 +4,7 @@ import io.ringbroker.core.sequence.Sequence; /** - * Spins briefly, then blocks on the Barrier’s condition to reduce CPU burn. + * Spins briefly, then blocks on the barrier condition to reduce CPU burn. */ public final class AdaptiveSpin implements WaitStrategy { private static final int SPIN_LIMIT = 1000; @@ -15,11 +15,14 @@ public long await(final long seq, final Sequence cursor, final Barrier barrier) int counter = 0; long available; while ((available = cursor.getValue()) < seq) { + if (barrier.isAlerted()) { + throw new RuntimeException("Consumer alerted"); + } if (counter < SPIN_LIMIT) { Thread.onSpinWait(); } else { - barrier.block(); // park on barrier’s condition - counter = 0; // reset spinning after wake + barrier.block(seq); + counter = 0; } counter++; } diff --git a/src/main/java/io/ringbroker/core/wait/Blocking.java b/src/main/java/io/ringbroker/core/wait/Blocking.java index 68baa25..2b11351 100644 --- a/src/main/java/io/ringbroker/core/wait/Blocking.java +++ b/src/main/java/io/ringbroker/core/wait/Blocking.java @@ -20,7 +20,7 @@ public long await(final long seq, final Sequence cursor, final Barrier barrier) if (barrier.isAlerted()) { throw new RuntimeException("Consumer alerted"); } - barrier.block(); // wait on the barrier's condition + barrier.block(seq); } return available; } diff --git a/src/main/java/io/ringbroker/ledger/orchestrator/LedgerOrchestrator.java b/src/main/java/io/ringbroker/ledger/orchestrator/LedgerOrchestrator.java index 0178c00..589af8d 100644 --- a/src/main/java/io/ringbroker/ledger/orchestrator/LedgerOrchestrator.java +++ 
b/src/main/java/io/ringbroker/ledger/orchestrator/LedgerOrchestrator.java @@ -10,12 +10,13 @@ import java.io.IOException; import java.nio.ByteBuffer; import java.nio.channels.Channels; -import java.nio.channels.FileChannel; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.StandardCopyOption; -import java.util.Comparator; -import java.util.List; +import java.nio.channels.FileChannel; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardCopyOption; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.List; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Future; @@ -28,7 +29,7 @@ import static java.nio.file.StandardOpenOption.WRITE; @Slf4j -public final class LedgerOrchestrator implements AutoCloseable { +public final class LedgerOrchestrator implements AutoCloseable { private static final ExecutorService INDEX_BUILDER = Executors.newSingleThreadExecutor(Thread.ofPlatform().name("ledger-idx-builder").daemon(true).factory()); @@ -46,7 +47,7 @@ public final class LedgerOrchestrator implements AutoCloseable { private Future nextSegmentFuture; // Immutable snapshot for fast binary-search selection in fetch. 
- private volatile LedgerSegment[] segmentSnapshot = new LedgerSegment[0]; + private volatile LedgerSegment[] segmentSnapshot = new LedgerSegment[0]; private LedgerOrchestrator(final Path directory, final int segmentCapacity, final long initialHwm) { this.directory = directory; @@ -54,16 +55,25 @@ private LedgerOrchestrator(final Path directory, final int segmentCapacity, fina this.highWaterMark = initialHwm; } - public static LedgerOrchestrator bootstrap(@NonNull final Path directory, final int segmentCapacity) throws IOException { - Files.createDirectories(directory); - - final List recoveredSegments = Files.list(directory) - .filter(p -> p.getFileName().toString().endsWith(LedgerConstant.SEGMENT_EXT)) - .sorted(Comparator.comparing(p -> p.getFileName().toString())) - .map(path -> recoverAndOpenSegment(path, directory)) - .filter(java.util.Objects::nonNull) - .sorted(Comparator.comparingLong(LedgerSegment::getFirstOffset).thenComparingLong(LedgerSegment::getLastOffset)) - .toList(); + public static LedgerOrchestrator bootstrap(@NonNull final Path directory, final int segmentCapacity) throws IOException { + Files.createDirectories(directory); + + final ArrayList segmentPaths = new ArrayList<>(); + try (final var files = Files.list(directory)) { + files.filter(p -> p.getFileName().toString().endsWith(LedgerConstant.SEGMENT_EXT)) + .forEach(segmentPaths::add); + } + segmentPaths.sort(Comparator.comparing(p -> p.getFileName().toString())); + + final ArrayList recoveredSegments = new ArrayList<>(segmentPaths.size()); + for (final Path path : segmentPaths) { + final LedgerSegment recovered = recoverAndOpenSegment(path, directory); + if (recovered != null) { + recoveredSegments.add(recovered); + } + } + recoveredSegments.sort(Comparator.comparingLong(LedgerSegment::getFirstOffset) + .thenComparingLong(LedgerSegment::getLastOffset)); // Offsets start at 0; if there are no segments yet, HWM is -1. final long currentHwm = recoveredSegments.isEmpty() ? 
-1L : recoveredSegments.getLast().getLastOffset(); @@ -97,9 +107,8 @@ public static LedgerOrchestrator bootstrap(@NonNull final Path directory, final } } - orchestrator.preAllocateNextSegment(); - return orchestrator; - } + return orchestrator; + } private static LedgerSegment recoverAndOpenSegment(final Path segmentPath, final Path baseDir) { Path tempRecoveryPath = null; @@ -236,28 +245,29 @@ private static void truncateChannel(final FileChannel ch, final long position) t /** * Returns the active segment, rolling if it cannot fit `requiredBytes`. */ - public LedgerSegment writable(final int requiredBytes) throws IOException { - LedgerSegment current = activeSegment.get(); - - if (current == null || !current.hasSpaceFor(requiredBytes)) { - if (current != null) { + public LedgerSegment writable(final int requiredBytes) throws IOException { + LedgerSegment current = activeSegment.get(); + + if (current == null || !current.hasSpaceFor(requiredBytes)) { + if (current != null) { log.debug("Rolling segment (capacity full for batch of {} bytes)", requiredBytes); } final LedgerSegment sealed = current; - current = rollToNextSegment(); - activeSegment.set(current); - addToSnapshotIfMissing(current); - preAllocateNextSegment(); - - // Build .idx off the hot path for sealed segments. - if (sealed != null && !sealed.isLogicallyEmpty()) { - INDEX_BUILDER.execute(sealed::buildDenseIndexIfMissingOrStale); - } - } - return current; - } + current = rollToNextSegment(); + activeSegment.set(current); + addToSnapshotIfMissing(current); + + // Build .idx off the hot path for sealed segments. 
+ if (sealed != null && !sealed.isLogicallyEmpty()) { + INDEX_BUILDER.execute(sealed::buildDenseIndexIfMissingOrStale); + } + } else { + maybePreAllocateNextSegment(current, requiredBytes); + } + return current; + } public LedgerSegment writable() throws IOException { return writable(1024); // Default safety margin @@ -274,25 +284,57 @@ private void addToSnapshotIfMissing(final LedgerSegment seg) { segmentSnapshot = next; } - private LedgerSegment rollToNextSegment() throws IOException { - LedgerSegment nextActiveSegment = null; - if (nextSegmentFuture != null && nextSegmentFuture.isDone()) { - try { - nextActiveSegment = nextSegmentFuture.get(); - } catch (final Exception ignored) { - } - } - - if (nextActiveSegment == null) { - final LedgerSegment previousActive = activeSegment.get(); - final long baseOffset = (previousActive != null) ? previousActive.getLastOffset() : this.highWaterMark; - nextActiveSegment = createNewSegment(baseOffset); - } - - // Once used (or ignored), clear the future reference. - nextSegmentFuture = null; - return nextActiveSegment; - } + private LedgerSegment rollToNextSegment() throws IOException { + final LedgerSegment previousActive = activeSegment.get(); + final long baseOffset = (previousActive != null) ? previousActive.getLastOffset() : this.highWaterMark; + + LedgerSegment nextActiveSegment = null; + if (nextSegmentFuture != null && nextSegmentFuture.isDone()) { + try { + final LedgerSegment candidate = nextSegmentFuture.get(); + if (candidate != null) { + if (candidate.isLogicallyEmpty()) { + if (candidate.getLastOffset() != baseOffset) { + candidate.rebaseEmpty(baseOffset); + } + nextActiveSegment = candidate; + } else { + discardPreallocatedSegment(candidate); + } + } + } catch (final Exception ignored) { + } + } + + if (nextActiveSegment == null) { + nextActiveSegment = createNewSegment(baseOffset); + } + + // Once used (or ignored), clear the future reference. 
+ nextSegmentFuture = null; + return nextActiveSegment; + } + + private void discardPreallocatedSegment(final LedgerSegment segment) { + try { segment.close(); } catch (final Exception ignored) {} + try { Files.deleteIfExists(segment.getFile()); } catch (final Exception ignored) {} + try { Files.deleteIfExists(LedgerSegment.indexPathForSegment(segment.getFile())); } catch (final Exception ignored) {} + } + + /** + * Defer preallocation until the active segment is reasonably close to full. + * This avoids background file creation I/O when rollover is not imminent. + */ + private void maybePreAllocateNextSegment(final LedgerSegment current, final int requiredBytes) { + if (current == null) return; + if (nextSegmentFuture != null && !nextSegmentFuture.isDone()) return; + + final int remaining = current.remainingBytes(); + final long trigger = Math.max((long) requiredBytes * 4L, Math.max(1L, ((long) segmentCapacity) >>> 3)); + if (remaining > trigger) return; + + preAllocateNextSegment(); + } private void preAllocateNextSegment() { if (nextSegmentFuture == null || nextSegmentFuture.isDone()) { @@ -439,4 +481,4 @@ private boolean isInSnapshot(final LedgerSegment seg) { public void setHighWaterMark(final long newHwm) { if (newHwm > this.highWaterMark) this.highWaterMark = newHwm; } -} \ No newline at end of file +} diff --git a/src/main/java/io/ringbroker/ledger/segment/LedgerSegment.java b/src/main/java/io/ringbroker/ledger/segment/LedgerSegment.java index c2af8f6..2bfdfda 100644 --- a/src/main/java/io/ringbroker/ledger/segment/LedgerSegment.java +++ b/src/main/java/io/ringbroker/ledger/segment/LedgerSegment.java @@ -196,9 +196,31 @@ private int countAcquire() { return (int) COUNT_HANDLE.getAcquire(this); } - public boolean isLogicallyEmpty() { - return firstOffset == FIRST_OFFSET_UNSET && countAcquire() == 0; - } + public boolean isLogicallyEmpty() { + return firstOffset == FIRST_OFFSET_UNSET && countAcquire() == 0; + } + + /** + * Rebase an empty segment to a new base 
last-offset so it can be safely reused + * as the next active segment after the previous segment is sealed. + */ + public void rebaseEmpty(final long baseLastOffset) { + if (!isLogicallyEmpty()) { + throw new IllegalStateException("Cannot rebase non-empty segment: " + file); + } + + // Empty segment state: no records, cursor at payload start. + buf.position(HEADER_SIZE); + COUNT_HANDLE.setRelease(this, 0); + + UNSAFE.putOrderedLong(this, FIRST_OFF_OFFSET, FIRST_OFFSET_UNSET); + this.firstOffset = FIRST_OFFSET_UNSET; + + UNSAFE.putOrderedLong(this, LAST_OFF_OFFSET, baseLastOffset); + this.lastOffset = baseLastOffset; + + updateHeaderOnDisk(); + } private void rebuildHintsAndCountFromSegment() { int pos = HEADER_SIZE; @@ -267,9 +289,17 @@ private void tryOpenDenseIndexIfValid() { } } - public boolean hasSpaceFor(final int payloadBytes) { - return (capacity - buf.position()) >= (payloadBytes + MIN_RECORD_OVERHEAD); - } + public boolean hasSpaceFor(final int payloadBytes) { + return (capacity - buf.position()) >= (payloadBytes + MIN_RECORD_OVERHEAD); + } + + /** + * Remaining writable bytes in this segment. + * Writer access is single-threaded per segment. 
+ */ + public int remainingBytes() { + return capacity - buf.position(); + } /** * Dense, O(1) offset index: @@ -335,10 +365,86 @@ public int visitFromOffset(final long offset, return endIdx - startIdx; } - // ---- Append APIs ---- - - public void appendBatchNoOffsets(final List msgs, final int totalBytes) throws IOException { - if (msgs.isEmpty()) return; + // ---- Append APIs ---- + + public int appendFramedBatchNoOffsets(final ByteBuffer framedPayloads, final int maxMessages) throws IOException { + if (maxMessages <= 0) return 0; + if (framedPayloads == null || framedPayloads.remaining() < Integer.BYTES) return 0; + + final int remainingCapacity = capacity - buf.position(); + if (remainingCapacity < MIN_RECORD_SIZE) return 0; + + final ByteBuffer scan = framedPayloads.duplicate(); + int count = 0; + int requiredBytes = 0; + + while (count < maxMessages && scan.remaining() >= Integer.BYTES) { + final int len = readLittleEndianInt(scan); + if (len < 0 || scan.remaining() < len) break; + + final int recordBytes = Math.addExact(MIN_RECORD_OVERHEAD, len); + if (recordBytes > (remainingCapacity - requiredBytes)) break; + + requiredBytes = Math.addExact(requiredBytes, recordBytes); + scan.position(scan.position() + len); + count++; + } + + if (count == 0) return 0; + + if (buf.position() + requiredBytes > capacity) { + throw new IOException("Segment full for framed batch: " + file); + } + + long curr = lastOffset; + boolean firstSet = (firstOffset != FIRST_OFFSET_UNSET); + final boolean doCrc = !skipRecordCrc; + final int baseIdx = countAcquire(); + + for (int i = 0; i < count; i++) { + final int len = readLittleEndianInt(framedPayloads); + final int payloadPos = framedPayloads.position(); + + final int ridx = baseIdx + i; + if ((ridx & HINT_MASK) == 0) { + hintPositions[ridx >>> HINT_SHIFT] = buf.position(); + } + + buf.putInt(len); + + final ByteBuffer payload = framedPayloads.duplicate(); + payload.position(payloadPos); + payload.limit(payloadPos + len); + + if 
(doCrc) { + recordCrc.reset(); + recordCrc.update(payload.duplicate()); + buf.putInt((int) recordCrc.getValue()); + } else { + buf.putInt(0); + } + + buf.put(payload); + framedPayloads.position(payloadPos + len); + + curr++; + if (!firstSet) { + UNSAFE.putOrderedLong(this, FIRST_OFF_OFFSET, curr); + firstOffset = curr; + firstSet = true; + } + } + + UNSAFE.putOrderedLong(this, LAST_OFF_OFFSET, curr); + lastOffset = curr; + + COUNT_HANDLE.setRelease(this, baseIdx + count); + updateHeaderOnDisk(); + return count; + } + + public void appendBatchNoOffsets(final List msgs, final int totalBytes) throws IOException { + if (msgs.isEmpty()) return; if (buf.position() + totalBytes > capacity) { throw new IOException("Segment full for batch: " + file); @@ -452,13 +558,21 @@ public long[] appendBatch(final List msgs, final int totalBytes) throws return outs; } - public long[] appendBatchAndForce(final List msgs, final int totalBytes) throws IOException { - final long[] offs = appendBatch(msgs, totalBytes); - if (!msgs.isEmpty()) buf.force(); - return offs; - } - - private void updateHeaderOnDisk() { + public long[] appendBatchAndForce(final List msgs, final int totalBytes) throws IOException { + final long[] offs = appendBatch(msgs, totalBytes); + if (!msgs.isEmpty()) buf.force(); + return offs; + } + + private static int readLittleEndianInt(final ByteBuffer src) { + final int b0 = src.get() & 0xFF; + final int b1 = src.get() & 0xFF; + final int b2 = src.get() & 0xFF; + final int b3 = src.get() & 0xFF; + return b0 | (b1 << 8) | (b2 << 16) | (b3 << 24); + } + + private void updateHeaderOnDisk() { final int p = buf.position(); try { buf.putLong(FIRST_OFFSET_POS, firstOffset); diff --git a/src/main/java/io/ringbroker/offset/InMemoryOffsetStore.java b/src/main/java/io/ringbroker/offset/InMemoryOffsetStore.java index 50559a7..f060e07 100644 --- a/src/main/java/io/ringbroker/offset/InMemoryOffsetStore.java +++ b/src/main/java/io/ringbroker/offset/InMemoryOffsetStore.java @@ -10,28 
+10,21 @@ import java.nio.ByteOrder; import java.nio.channels.FileChannel; import java.nio.charset.StandardCharsets; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.StandardOpenOption; -import java.util.ArrayList; -import java.util.Comparator; -import java.util.List; -import java.util.Map; -import java.util.Objects; -import java.util.concurrent.*; -import java.util.concurrent.atomic.AtomicBoolean; -import java.util.concurrent.locks.LockSupport; -import java.util.stream.Stream; - -/* - * Hyper-optimized, Durable, Low-Latency OffsetStore backed by the LedgerOrchestrator. - * - * Hot-path goals: - * - commit(): O(1) with minimal allocations & string work - * - fetch(): O(1) with simple nested map + array read - */ -@Slf4j -public final class InMemoryOffsetStore implements OffsetStore, AutoCloseable { +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardOpenOption; +import java.util.Comparator; +import java.util.List; +import java.util.Objects; +import java.util.concurrent.*; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.locks.LockSupport; +import java.util.stream.Stream; + +/** Durable in-memory offset store backed by a WAL. */ +@Slf4j +public final class InMemoryOffsetStore implements OffsetStore, AutoCloseable { /* 16MB segments as before. */ private static final int OFFSET_SEGMENT_CAPACITY = 16 * 1024 * 1024; @@ -39,8 +32,10 @@ public final class InMemoryOffsetStore implements OffsetStore, AutoCloseable { /* Batch size for WAL appends. Tune as needed. */ private static final int BATCH_SIZE = 1024; - /* Idle park duration for flusher. 1 microsecond. */ - private static final long PARK_NANOS = 1_000L; + /* Idle park duration for flusher. 1 microsecond. 
*/ + private static final long PARK_NANOS = 1_000L; + private static final int INITIAL_FRAMED_BATCH_CAPACITY = 1 << 20; // 1MB + private static final int MAX_COMMIT_POOL_SIZE = BATCH_SIZE * 8; private final Path storageDir; @@ -134,41 +129,58 @@ private byte[] groupBytes(final String group) { private final LedgerOrchestrator wal; - /* - * MPSC queue for commits. - */ - private final ConcurrentLinkedQueue commitQueue = new ConcurrentLinkedQueue<>(); - - private final ExecutorService flusherExecutor = Executors.newSingleThreadExecutor( - Thread.ofVirtual().name("offset-flusher").factory() - ); + private static final class PendingCommit { + byte[] topicBytes; + byte[] groupBytes; + int partition; + long offset; + + void set(final byte[] topicBytes, final byte[] groupBytes, final int partition, final long offset) { + this.topicBytes = topicBytes; + this.groupBytes = groupBytes; + this.partition = partition; + this.offset = offset; + } + + void clear() { + this.topicBytes = null; + this.groupBytes = null; + this.partition = 0; + this.offset = 0L; + } + } + + /* + * MPSC queue for commits. + */ + private final ConcurrentLinkedQueue commitQueue = new ConcurrentLinkedQueue<>(); + private final ConcurrentLinkedQueue commitPool = new ConcurrentLinkedQueue<>(); + private final AtomicInteger pooledCommitCount = new AtomicInteger(0); + + private final ExecutorService flusherExecutor = Executors.newSingleThreadExecutor( + Thread.ofPlatform().name("offset-flusher").factory() + ); private final AtomicBoolean running = new AtomicBoolean(true); - public InMemoryOffsetStore(final Path storageDir) throws IOException { - this.storageDir = Objects.requireNonNull(storageDir, "storageDir"); - Files.createDirectories(storageDir); - - // Phase 1: recovery from existing segments. - recoverStateFromDisk(); - - // Phase 2: WAL bootstrap. - this.wal = LedgerOrchestrator.bootstrap(storageDir, OFFSET_SEGMENT_CAPACITY); - - // Phase 3: start flusher loop. 
- flusherExecutor.submit(this::flusherLoop); - } - - @Override - public void commit(final String topic, final String group, final int partition, final long offset) { - // Fast in-memory update: nested map + array write. - final PartitionOffsets po = partitionOffsets(topic, group); - po.set(partition, offset); - - // Serialize for async WAL persistence. - final byte[] payload = serialize(topic, group, partition, offset); - commitQueue.offer(payload); - } + public InMemoryOffsetStore(final Path storageDir) throws IOException { + this.storageDir = Objects.requireNonNull(storageDir, "storageDir"); + Files.createDirectories(storageDir); + + recoverStateFromDisk(); + this.wal = LedgerOrchestrator.bootstrap(storageDir, OFFSET_SEGMENT_CAPACITY); + flusherExecutor.submit(this::flusherLoop); + } + + @Override + public void commit(final String topic, final String group, final int partition, final long offset) { + final PartitionOffsets po = partitionOffsets(topic, group); + po.set(partition, offset); + + final PendingCommit c = acquireCommit(); + c.set(topicBytes(topic), groupBytes(group), partition, offset); + commitQueue.offer(c); + } @Override public long fetch(final String topic, final String group, final int partition) { @@ -179,81 +191,114 @@ public long fetch(final String topic, final String group, final int partition) { return po.get(partition); } - /* - * Background flush loop: drain queue, batch, append to WAL. - */ - private void flusherLoop() { - final List batchBuffer = new ArrayList<>(BATCH_SIZE); - - while (running.get()) { - try { - byte[] element = commitQueue.poll(); - - if (element == null) { - // Flush any accumulated batch before idling. - if (!batchBuffer.isEmpty()) { - flushBatch(batchBuffer); - } - LockSupport.parkNanos(PARK_NANOS); - continue; - } - - batchBuffer.add(element); - - // Greedy drain up to BATCH_SIZE. 
- while (batchBuffer.size() < BATCH_SIZE) { - element = commitQueue.poll(); - if (element == null) break; - batchBuffer.add(element); - } - - flushBatch(batchBuffer); - } catch (final Throwable t) { - log.error("Offset flusher loop encountered error", t); - } - } - - // Final drain when running flag is cleared. - try { - if (!commitQueue.isEmpty()) { - final List remaining = new ArrayList<>(); - byte[] b; - while ((b = commitQueue.poll()) != null) { - remaining.add(b); - if (remaining.size() >= BATCH_SIZE) { - flushBatch(remaining); - } - } - if (!remaining.isEmpty()) { - flushBatch(remaining); - } - } - } catch (final Throwable t) { - log.error("Error while flushing remaining offsets on flusher shutdown", t); - } - } - - private void flushBatch(final List batch) { - if (batch.isEmpty()) return; - - int totalBytes = 0; - for (final byte[] b : batch) { - totalBytes += (8 + b.length); - } - - try { - wal.writable(totalBytes).appendBatch(batch, totalBytes); - batch.clear(); - } catch (final IOException e) { - log.error("Failed to persist offset batch.", e); - } - } + /* + * Background flush loop: drain queue, batch, append to WAL. + */ + private void flusherLoop() { + final PendingCommit[] batch = new PendingCommit[BATCH_SIZE]; + int batchCount = 0; + ByteBuffer framedBatch = ByteBuffer.allocateDirect(INITIAL_FRAMED_BATCH_CAPACITY) + .order(ByteOrder.LITTLE_ENDIAN); + + while (running.get()) { + try { + PendingCommit element = commitQueue.poll(); + + if (element == null) { + if (batchCount > 0) { + framedBatch = flushBatch(batch, batchCount, framedBatch); + recycleBatch(batch, batchCount); + batchCount = 0; + } + LockSupport.parkNanos(PARK_NANOS); + continue; + } + + batch[batchCount++] = element; + + // Greedy drain up to BATCH_SIZE. 
+ while (batchCount < BATCH_SIZE) { + element = commitQueue.poll(); + if (element == null) break; + batch[batchCount++] = element; + } + + framedBatch = flushBatch(batch, batchCount, framedBatch); + recycleBatch(batch, batchCount); + batchCount = 0; + } catch (final Throwable t) { + log.error("Offset flusher loop encountered error", t); + LockSupport.parkNanos(PARK_NANOS); + } + } + + // Final drain when running flag is cleared. + try { + for (;;) { + while (batchCount < BATCH_SIZE) { + final PendingCommit c = commitQueue.poll(); + if (c == null) break; + batch[batchCount++] = c; + } + if (batchCount == 0) break; + framedBatch = flushBatch(batch, batchCount, framedBatch); + recycleBatch(batch, batchCount); + batchCount = 0; + if (commitQueue.isEmpty()) { + break; + } + } + } catch (final Throwable t) { + log.error("Error while flushing remaining offsets on flusher shutdown", t); + } + } + + private ByteBuffer flushBatch(final PendingCommit[] batch, + final int count, + final ByteBuffer currentFramedBuffer) throws IOException { + if (count <= 0) return currentFramedBuffer; + + int framedBytes = 0; + for (int i = 0; i < count; i++) { + framedBytes = Math.addExact(framedBytes, Integer.BYTES + payloadSize(batch[i])); + } + + final ByteBuffer framed = ensureFramedCapacity(currentFramedBuffer, framedBytes); + framed.clear(); + + for (int i = 0; i < count; i++) { + final PendingCommit c = batch[i]; + final int payloadLen = payloadSize(c); + + framed.putInt(payloadLen); + framed.putInt(c.topicBytes.length); + framed.put(c.topicBytes); + framed.putInt(c.groupBytes.length); + framed.put(c.groupBytes); + framed.putInt(c.partition); + framed.putLong(c.offset); + } + + framed.flip(); + int remaining = count; + while (remaining > 0) { + final int nextPayloadLen = peekLittleEndianInt(framed); + final int requiredRecordBytes = Integer.BYTES + Integer.BYTES + nextPayloadLen; + final LedgerSegment segment = wal.writable(requiredRecordBytes); + final int written = 
segment.appendFramedBatchNoOffsets(framed, remaining); + if (written <= 0) { + throw new IOException("Failed to append offset WAL batch"); + } + remaining -= written; + } + return framed; + } @Override - public void close() throws Exception { - // Stop flusher loop. - running.set(false); - flusherExecutor.shutdown(); + public void close() throws Exception { + // Stop flusher loop. + running.set(false); + flusherExecutor.shutdown(); try { if (!flusherExecutor.awaitTermination(30, TimeUnit.SECONDS)) { log.warn("Offset flusher executor did not terminate within 30s"); @@ -261,14 +306,80 @@ public void close() throws Exception { } catch (final InterruptedException ie) { Thread.currentThread().interrupt(); } - - // WAL close. - wal.close(); - } - - private void recoverStateFromDisk() throws IOException { - log.info("Recovering offsets from: {}", storageDir); - try (final Stream files = Files.list(storageDir)) { + + // WAL close. + wal.close(); + } + + private PendingCommit acquireCommit() { + final PendingCommit reused = commitPool.poll(); + if (reused != null) { + pooledCommitCount.decrementAndGet(); + return reused; + } + return new PendingCommit(); + } + + private void recycleBatch(final PendingCommit[] batch, final int count) { + for (int i = 0; i < count; i++) { + final PendingCommit c = batch[i]; + batch[i] = null; + if (c != null) { + c.clear(); + tryOfferPooledCommit(c); + } + } + } + + private void tryOfferPooledCommit(final PendingCommit commit) { + if (tryReservePoolSlot()) { + commitPool.offer(commit); + } + } + + private boolean tryReservePoolSlot() { + for (;;) { + final int current = pooledCommitCount.get(); + if (current >= MAX_COMMIT_POOL_SIZE) { + return false; + } + if (pooledCommitCount.compareAndSet(current, current + 1)) { + return true; + } + } + } + + private static int payloadSize(final PendingCommit c) { + return Integer.BYTES + c.topicBytes.length + + Integer.BYTES + c.groupBytes.length + + Integer.BYTES + + Long.BYTES; + } + + private static 
ByteBuffer ensureFramedCapacity(final ByteBuffer current, final int requiredBytes) { + if (current.capacity() >= requiredBytes) { + return current; + } + + int next = current.capacity(); + while (next < requiredBytes) { + next <<= 1; + } + return ByteBuffer.allocateDirect(next).order(ByteOrder.LITTLE_ENDIAN); + } + + private static int peekLittleEndianInt(final ByteBuffer src) { + final int pos = src.position(); + final int b0 = src.get(pos) & 0xFF; + final int b1 = src.get(pos + 1) & 0xFF; + final int b2 = src.get(pos + 2) & 0xFF; + final int b3 = src.get(pos + 3) & 0xFF; + return b0 | (b1 << 8) | (b2 << 16) | (b3 << 24); + } + + private void recoverStateFromDisk() throws IOException { + log.info("Recovering offsets from: {}", storageDir); + try (final Stream files = Files.list(storageDir)) { final List segments = files .filter(p -> p.toString().endsWith(LedgerConstant.SEGMENT_EXT)) .sorted(Comparator.comparing(Path::getFileName)) @@ -289,15 +400,16 @@ private int replaySegment(final Path segmentPath) { final long fileSize = ch.size(); if (fileSize < LedgerSegment.HEADER_SIZE) return 0; - // Skip segment header in one shot. - ch.position(LedgerSegment.HEADER_SIZE); - - final ByteBuffer lenBuf = ByteBuffer.allocate(Integer.BYTES).order(ByteOrder.LITTLE_ENDIAN); - - while (ch.position() < fileSize) { - lenBuf.clear(); - final int n = ch.read(lenBuf); - if (n < Integer.BYTES) break; + // Skip segment header in one shot. + ch.position(LedgerSegment.HEADER_SIZE); + + final ByteBuffer lenBuf = ByteBuffer.allocate(Integer.BYTES).order(ByteOrder.LITTLE_ENDIAN); + ByteBuffer payloadBuf = ByteBuffer.allocate(4 * 1024).order(ByteOrder.LITTLE_ENDIAN); + + while (ch.position() < fileSize) { + lenBuf.clear(); + final int n = ch.read(lenBuf); + if (n < Integer.BYTES) break; lenBuf.flip(); final int payloadLen = lenBuf.getInt(); @@ -306,17 +418,21 @@ private int replaySegment(final Path segmentPath) { // Skip CRC (4 bytes). 
ch.position(ch.position() + Integer.BYTES); - if (payloadLen < 0 || payloadLen > (fileSize - ch.position())) { - // Bogus length, stop replaying this segment. - break; - } - - final ByteBuffer payloadBuf = ByteBuffer.allocate(payloadLen); - while (payloadBuf.hasRemaining()) { - final int r = ch.read(payloadBuf); - if (r < 0) { - // Torn record; stop. - break; + if (payloadLen < 0 || payloadLen > (fileSize - ch.position())) { + // Bogus length, stop replaying this segment. + break; + } + + if (payloadBuf.capacity() < payloadLen) { + payloadBuf = ByteBuffer.allocate(nextPowerOfTwo(payloadLen)).order(ByteOrder.LITTLE_ENDIAN); + } + payloadBuf.clear(); + payloadBuf.limit(payloadLen); + while (payloadBuf.hasRemaining()) { + final int r = ch.read(payloadBuf); + if (r < 0) { + // Torn record; stop. + break; } } if (payloadBuf.hasRemaining()) { @@ -331,8 +447,18 @@ private int replaySegment(final Path segmentPath) { } catch (final IOException e) { log.warn("Corrupt or partial segment found during recovery: {}", segmentPath, e); } - return replayed; - } + return replayed; + } + + private static int nextPowerOfTwo(final int value) { + int v = Math.max(1, value); + int hi = Integer.highestOneBit(v); + if (v == hi) { + return v; + } + hi <<= 1; + return (hi > 0) ? 
hi : Integer.MAX_VALUE; + } /* * Payload format (LE): @@ -363,50 +489,4 @@ private void deserializeAndUpdate(final ByteBuffer buf) { po.set(partition, offset); } - private byte[] serialize(final String topic, final String group, final int partition, final long offset) { - final byte[] tBytes = topicBytes(topic); - final byte[] gBytes = groupBytes(group); - - final int size = - 4 + tBytes.length + // topic length - 4 + gBytes.length + // group length - 4 + // partition - 8; // offset - - final byte[] out = new byte[size]; - int p = 0; - - p = putIntLE(out, p, tBytes.length); - System.arraycopy(tBytes, 0, out, p, tBytes.length); - p += tBytes.length; - - p = putIntLE(out, p, gBytes.length); - System.arraycopy(gBytes, 0, out, p, gBytes.length); - p += gBytes.length; - - p = putIntLE(out, p, partition); - p = putLongLE(out, p, offset); - - return out; - } - - private static int putIntLE(final byte[] arr, final int pos, final int value) { - arr[pos ] = (byte) (value & 0xFF); - arr[pos + 1] = (byte) ((value >> 8) & 0xFF); - arr[pos + 2] = (byte) ((value >> 16) & 0xFF); - arr[pos + 3] = (byte) ((value >> 24) & 0xFF); - return pos + 4; - } - - private static int putLongLE(final byte[] arr, final int pos, final long value) { - arr[pos ] = (byte) (value & 0xFFL); - arr[pos + 1] = (byte) ((value >> 8) & 0xFFL); - arr[pos + 2] = (byte) ((value >> 16) & 0xFFL); - arr[pos + 3] = (byte) ((value >> 24) & 0xFFL); - arr[pos + 4] = (byte) ((value >> 32) & 0xFFL); - arr[pos + 5] = (byte) ((value >> 40) & 0xFFL); - arr[pos + 6] = (byte) ((value >> 48) & 0xFFL); - arr[pos + 7] = (byte) ((value >> 56) & 0xFFL); - return pos + 8; - } -} +} diff --git a/src/main/java/io/ringbroker/transport/impl/NettyServerRequestHandler.java b/src/main/java/io/ringbroker/transport/impl/NettyServerRequestHandler.java index 9c207f4..86e78f2 100644 --- a/src/main/java/io/ringbroker/transport/impl/NettyServerRequestHandler.java +++ b/src/main/java/io/ringbroker/transport/impl/NettyServerRequestHandler.java @@ 
-7,22 +7,27 @@ import io.ringbroker.api.BrokerApi; import io.ringbroker.broker.ingress.ClusteredIngress; import io.ringbroker.broker.ingress.Ingress; -import io.ringbroker.core.lsn.Lsn; -import io.ringbroker.offset.OffsetStore; -import lombok.RequiredArgsConstructor; -import lombok.extern.slf4j.Slf4j; - -import java.nio.ByteBuffer; -import java.util.ArrayList; -import java.util.List; -import java.util.concurrent.CompletableFuture; +import io.ringbroker.core.lsn.Lsn; +import io.ringbroker.offset.OffsetStore; +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; + +import java.nio.ByteBuffer; +import java.util.List; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicLong; +import java.util.concurrent.atomic.AtomicReference; @Slf4j @RequiredArgsConstructor -public class NettyServerRequestHandler extends SimpleChannelInboundHandler { - - private final ClusteredIngress ingress; - private final OffsetStore offsetStore; +public class NettyServerRequestHandler extends SimpleChannelInboundHandler { + + private static final int SUBSCRIBE_FLUSH_BATCH = 64; + private static final long SUBSCRIBE_FLUSH_MAX_DELAY_NANOS = 1_000_000L; // 1ms + + private final ClusteredIngress ingress; + private final OffsetStore offsetStore; @Override protected void channelRead0(final ChannelHandlerContext ctx, final BrokerApi.Envelope env) { @@ -32,10 +37,7 @@ protected void channelRead0(final ChannelHandlerContext ctx, final BrokerApi.Env switch (env.getKindCase()) { case PUBLISH -> { final var m = env.getPublish(); - final int partitionId = m.getPartitionId(); - final var fut = (partitionId != 0) - ? 
ingress.publishToPartition(corrId, m.getTopic(), partitionId, m.getKey().toByteArray(), m.getRetries(), m.getPayload().toByteArray()) - : ingress.publish(corrId, m.getTopic(), m.getKey().toByteArray(), m.getRetries(), m.getPayload().toByteArray()); + final CompletableFuture fut = publishMessage(corrId, m); fut .whenComplete((v, ex) -> { if (ex != null) { @@ -50,30 +52,34 @@ protected void channelRead0(final ChannelHandlerContext ctx, final BrokerApi.Env }); } - case BATCH -> { + case BATCH -> { final var list = env.getBatch().getMessagesList(); - final List> futures = new ArrayList<>(list.size()); + if (list.isEmpty()) { + writeReply(ctx, corrId, BrokerApi.PublishReply.newBuilder().setSuccess(true).build()); + break; + } + final AtomicInteger remaining = new AtomicInteger(list.size()); + final AtomicReference firstError = new AtomicReference<>(); for (final var m : list) { - final int partitionId = m.getPartitionId(); - futures.add((partitionId != 0) - ? ingress.publishToPartition(corrId, m.getTopic(), partitionId, m.getKey().toByteArray(), m.getRetries(), m.getPayload().toByteArray()) - : ingress.publish(corrId, m.getTopic(), m.getKey().toByteArray(), m.getRetries(), m.getPayload().toByteArray())); + final CompletableFuture f = publishMessage(corrId, m); + f.whenComplete((v, ex) -> { + if (ex != null) firstError.compareAndSet(null, ex); + if (remaining.decrementAndGet() == 0) { + final Throwable err = firstError.get(); + if (err != null) { + log.error("Batch publish failed (corrId: {}): {}", corrId, err.getMessage()); + writeReply(ctx, corrId, BrokerApi.PublishReply.newBuilder() + .setSuccess(false) + .setError(String.valueOf(err.getMessage())) + .build()); + } else { + writeReply(ctx, corrId, BrokerApi.PublishReply.newBuilder().setSuccess(true).build()); + } + } + }); } - - CompletableFuture.allOf(futures.toArray(new CompletableFuture[0])) - .whenComplete((v, ex) -> { - if (ex != null) { - log.error("Batch publish failed (corrId: {}): {}", corrId, 
ex.getMessage()); - writeReply(ctx, corrId, BrokerApi.PublishReply.newBuilder() - .setSuccess(false) - .setError(String.valueOf(ex.getMessage())) - .build()); - } else { - writeReply(ctx, corrId, BrokerApi.PublishReply.newBuilder().setSuccess(true).build()); - } - }); - } + } case COMMIT -> { final var req = env.getCommit(); @@ -146,22 +152,39 @@ protected void channelRead0(final ChannelHandlerContext ctx, final BrokerApi.Env writeReply(ctx, corrId, fr.build()); } - case SUBSCRIBE -> { - final var s = env.getSubscribe(); - ingress.subscribeTopic(s.getTopic(), s.getGroup(), (seq, msg) -> { - if (ctx.channel().isActive()) { - ctx.writeAndFlush( - BrokerApi.Envelope.newBuilder() - .setMessageEvent(BrokerApi.MessageEvent.newBuilder() - .setTopic(s.getTopic()) - .setOffset(seq) - .setKey(ByteString.EMPTY) - .setPayload(UnsafeByteOperations.unsafeWrap(msg))) - .build() - ); - } - }); - } + case SUBSCRIBE -> { + final var s = env.getSubscribe(); + final AtomicInteger pendingWrites = new AtomicInteger(0); + final AtomicLong lastFlushNanos = new AtomicLong(System.nanoTime()); + + ingress.subscribeTopicZeroCopy(s.getTopic(), s.getGroup(), (lsn, payloadView) -> { + if (!ctx.channel().isActive()) return; + + ctx.executor().execute(() -> { + if (!ctx.channel().isActive()) return; + + ctx.write( + BrokerApi.Envelope.newBuilder() + .setMessageEvent(BrokerApi.MessageEvent.newBuilder() + .setTopic(s.getTopic()) + .setOffset(lsn) + .setKey(ByteString.EMPTY) + .setPayload(UnsafeByteOperations.unsafeWrap(payloadView))) + .build() + ); + + final int queued = pendingWrites.incrementAndGet(); + final long now = System.nanoTime(); + if (queued >= SUBSCRIBE_FLUSH_BATCH + || !ctx.channel().isWritable() + || (now - lastFlushNanos.get()) >= SUBSCRIBE_FLUSH_MAX_DELAY_NANOS) { + pendingWrites.set(0); + lastFlushNanos.set(now); + ctx.flush(); + } + }); + }); + } case APPEND -> { ingress.handleAppendAsync(env.getAppend()) @@ -271,7 +294,22 @@ private void writeReply(final ChannelHandlerContext 
ctx, log.warn("Unknown reply type: {}", reply.getClass().getName()); return; } - - ctx.writeAndFlush(b.build()); - } -} + + ctx.writeAndFlush(b.build()); + } + + private CompletableFuture publishMessage(final long corrId, final BrokerApi.Message m) { + if (m.hasPartitionId()) { + final int partitionId = m.getPartitionId(); + if (partitionId < 0 || partitionId >= ingress.getTotalPartitions()) { + return CompletableFuture.failedFuture( + new IllegalArgumentException("partition_id out of range: " + partitionId) + ); + } + return ingress.publishToPartition( + corrId, m.getTopic(), partitionId, m.getKey(), m.getRetries(), m.getPayload() + ); + } + return ingress.publish(corrId, m.getTopic(), m.getKey(), m.getRetries(), m.getPayload()); + } +} diff --git a/src/main/proto/broker.proto b/src/main/proto/broker.proto index 27df633..5c40bbe 100644 --- a/src/main/proto/broker.proto +++ b/src/main/proto/broker.proto @@ -35,13 +35,13 @@ message Envelope { } } -message Message { - string topic = 1; - int32 retries = 2; - bytes payload = 3; - bytes key = 4; - int32 partition_id = 5; -} +message Message { + string topic = 1; + int32 retries = 2; + bytes payload = 3; + bytes key = 4; + optional int32 partition_id = 5; +} message PublishReply { bool success = 1; diff --git a/src/test/java/io/ringbroker/broker/ingress/ClusteredIngressTest.java b/src/test/java/io/ringbroker/broker/ingress/ClusteredIngressTest.java index 61a7a34..0983d97 100644 --- a/src/test/java/io/ringbroker/broker/ingress/ClusteredIngressTest.java +++ b/src/test/java/io/ringbroker/broker/ingress/ClusteredIngressTest.java @@ -6,19 +6,32 @@ import io.ringbroker.cluster.membership.member.Member; import io.ringbroker.cluster.membership.replicator.AdaptiveReplicator; import io.ringbroker.cluster.membership.resolver.ReplicaSetResolver; +import io.ringbroker.cluster.metadata.EpochMetadata; +import io.ringbroker.cluster.metadata.EpochPlacement; +import io.ringbroker.cluster.metadata.LogConfiguration; import 
io.ringbroker.cluster.metadata.JournaledLogMetadataStore; +import io.ringbroker.core.lsn.Lsn; import io.ringbroker.core.wait.Blocking; import io.ringbroker.offset.InMemoryOffsetStore; import io.ringbroker.registry.TopicRegistry; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; +import java.nio.MappedByteBuffer; +import java.nio.ByteBuffer; import java.nio.file.Path; +import java.util.ArrayList; import java.util.List; import java.util.Map; import java.util.concurrent.CompletableFuture; import java.util.concurrent.CompletionException; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.CopyOnWriteArrayList; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicReference; import static org.junit.jupiter.api.Assertions.*; @@ -55,6 +68,303 @@ void replicationTimeoutSurfacesToCaller(@TempDir final Path dir) throws Exceptio c.close(); } + @Test + void publishPrefersMetadataOwnerOverModulo(@TempDir final Path dir) throws Exception { + final TopicRegistry registry = TopicRegistry.builder() + .topic("t", BrokerApi.Message.getDescriptor()) + .build(); + final InMemoryOffsetStore offsets = new InMemoryOffsetStore(dir.resolve("offsets")); + final JournaledLogMetadataStore metadata = new JournaledLogMetadataStore(dir.resolve("meta")); + + final RecordingRemoteClient owner2 = new RecordingRemoteClient(); + final ConcurrentHashMap clients = new ConcurrentHashMap<>(); + clients.put(2, owner2); + + final AdaptiveReplicator replicator = new AdaptiveReplicator(1, clients, 25); + final List members = List.of( + new Member(0, BrokerRole.PERSISTENCE, new java.net.InetSocketAddress("localhost", 0), System.currentTimeMillis(), 1), + new Member(1, BrokerRole.PERSISTENCE, new java.net.InetSocketAddress("localhost", 0), System.currentTimeMillis(), 1), + new Member(2, 
BrokerRole.PERSISTENCE, new java.net.InetSocketAddress("localhost", 0), System.currentTimeMillis(), 1) + ); + final ReplicaSetResolver resolver = new ReplicaSetResolver(2, () -> members); + + final ClusteredIngress ingress = ClusteredIngress.create( + registry, + (key, total) -> 0, + 1, + 0, + 3, + clients, + dir.resolve("data"), + 8, + new Blocking(), + 512, + 4, + false, + offsets, + BrokerRole.PERSISTENCE, + resolver, + replicator, + metadata + ); + + // Force placement owner to node 2, different from modulo owner 0. + final EpochPlacement owner2Placement = new EpochPlacement(0L, List.of(2, 0), 1); + final LogConfiguration cfg = new LogConfiguration(0, 99L, List.of( + new EpochMetadata(0L, 0L, -1L, owner2Placement, 0L) + )); + metadata.applyRemote(cfg); + + ingress.publish("t", null, "x".getBytes()).join(); + + assertEquals(1, owner2.ackCalls.get(), "publish should be forwarded to metadata owner"); + assertTrue(owner2.lastPublish.get() != null && owner2.lastPublish.get().getPublish().getPartitionId() == 0); + + ingress.shutdown(); + offsets.close(); + } + + @Test + void forwardRetriesOnTimeoutThenFails(@TempDir final Path dir) throws Exception { + final TopicRegistry registry = TopicRegistry.builder() + .topic("t", BrokerApi.Message.getDescriptor()) + .build(); + final InMemoryOffsetStore offsets = new InMemoryOffsetStore(dir.resolve("offsets")); + final JournaledLogMetadataStore metadata = new JournaledLogMetadataStore(dir.resolve("meta")); + + final NeverAckRemoteClient hung = new NeverAckRemoteClient(); + final ConcurrentHashMap clients = new ConcurrentHashMap<>(); + clients.put(1, hung); + + final AdaptiveReplicator replicator = new AdaptiveReplicator(1, clients, 10); + final List members = List.of( + new Member(0, BrokerRole.PERSISTENCE, new java.net.InetSocketAddress("localhost", 0), System.currentTimeMillis(), 1), + new Member(1, BrokerRole.PERSISTENCE, new java.net.InetSocketAddress("localhost", 0), System.currentTimeMillis(), 1) + ); + final 
ReplicaSetResolver resolver = new ReplicaSetResolver(2, () -> members); + + final ClusteredIngress ingress = ClusteredIngress.create( + registry, + (key, total) -> 0, + 1, + 0, + 2, + clients, + dir.resolve("data"), + 8, + new Blocking(), + 512, + 4, + false, + offsets, + BrokerRole.PERSISTENCE, + resolver, + replicator, + metadata + ); + + final EpochPlacement remoteOwner = new EpochPlacement(0L, List.of(1, 0), 1); + metadata.applyRemote(new LogConfiguration(0, 101L, List.of( + new EpochMetadata(0L, 0L, -1L, remoteOwner, 0L) + ))); + + final CompletionException ex = assertThrows(CompletionException.class, + () -> ingress.publish("t", null, "z".getBytes()).join()); + assertTrue(rootCause(ex) instanceof TimeoutException, "final error should be timeout"); + assertEquals(3, hung.calls.get(), "should attempt initial send + 2 retries"); + + ingress.shutdown(); + offsets.close(); + } + + @Test + void backfillPaginatesAndReconstructsEpoch(@TempDir final Path dir) throws Exception { + final TopicRegistry registry = TopicRegistry.builder() + .topic("t", BrokerApi.Message.getDescriptor()) + .build(); + final InMemoryOffsetStore offsets = new InMemoryOffsetStore(dir.resolve("offsets")); + final JournaledLogMetadataStore metadata = new JournaledLogMetadataStore(dir.resolve("meta")); + + final PagedBackfillClient backfillClient = new PagedBackfillClient( + List.of("a".getBytes(), "b".getBytes(), "c".getBytes()) + ); + final ConcurrentHashMap clients = new ConcurrentHashMap<>(); + clients.put(1, backfillClient); + + final AdaptiveReplicator replicator = new AdaptiveReplicator(1, clients, 50); + final List members = List.of( + new Member(0, BrokerRole.PERSISTENCE, new java.net.InetSocketAddress("localhost", 0), System.currentTimeMillis(), 1), + new Member(1, BrokerRole.PERSISTENCE, new java.net.InetSocketAddress("localhost", 0), System.currentTimeMillis(), 1) + ); + final ReplicaSetResolver resolver = new ReplicaSetResolver(2, () -> members); + + final ClusteredIngress ingress = 
ClusteredIngress.create( + registry, + (key, total) -> 0, + 1, + 0, + 2, + clients, + dir.resolve("data"), + 8, + new Blocking(), + 512, + 4, + false, + offsets, + BrokerRole.PERSISTENCE, + resolver, + replicator, + metadata + ); + + final EpochPlacement placement = new EpochPlacement(1L, List.of(0, 1), 1); + metadata.applyRemote(new LogConfiguration(0, 77L, List.of( + new EpochMetadata(1L, 0L, 2L, placement, 0L) + ))); + + final var tick = ClusteredIngress.class.getDeclaredMethod("backfillTick"); + tick.setAccessible(true); + tick.invoke(ingress); + + assertEquals(List.of(0L, 2L), backfillClient.requestedOffsets, "backfill should page offsets"); + + final List seen = new ArrayList<>(); + final Ingress local = ingress.getIngressMap().get(0); + assertNotNull(local); + local.fetchEpoch(1L, 0L, 10, (off, segBuf, payloadPos, payloadLen) -> { + final byte[] p = readPayload(segBuf, payloadPos, payloadLen); + seen.add(new String(p)); + }); + assertEquals(List.of("a", "b", "c"), seen); + + ingress.shutdown(); + offsets.close(); + } + + @Test + void subscribeResumesFromLsnAndFallsBackToLedgerWhenRingOverrun(@TempDir final Path dir) throws Exception { + final TopicRegistry registry = TopicRegistry.builder() + .topic("t", BrokerApi.Message.getDescriptor()) + .build(); + final InMemoryOffsetStore offsets = new InMemoryOffsetStore(dir.resolve("offsets")); + final JournaledLogMetadataStore metadata = new JournaledLogMetadataStore(dir.resolve("meta")); + final AdaptiveReplicator replicator = new AdaptiveReplicator(1, Map.of(), 100); + final List members = List.of( + new Member(0, BrokerRole.PERSISTENCE, new java.net.InetSocketAddress("localhost", 0), System.currentTimeMillis(), 1) + ); + final ReplicaSetResolver resolver = new ReplicaSetResolver(1, () -> members); + + final ClusteredIngress ingress = ClusteredIngress.create( + registry, + (key, total) -> 0, + 1, + 0, + 1, + Map.of(), + dir.resolve("data"), + 4, // intentionally tiny to force ring overrun + new Blocking(), + 512, 
+ 1, + false, + offsets, + BrokerRole.PERSISTENCE, + resolver, + replicator, + metadata + ); + + final int total = 12; + for (int i = 0; i < total; i++) { + ingress.publish("t", null, ("m" + i).getBytes()).join(); + } + + final long startSeq = 3L; + final long startLsn = Lsn.encode(0L, startSeq); + offsets.commit("t", "g", 0, startLsn); + + final CountDownLatch latch = new CountDownLatch(total - (int) startSeq); + final CopyOnWriteArrayList payloads = new CopyOnWriteArrayList<>(); + final CopyOnWriteArrayList offsetsSeen = new CopyOnWriteArrayList<>(); + + ingress.subscribeTopic("t", "g", (lsn, payload) -> { + payloads.add(new String(payload)); + offsetsSeen.add(lsn); + latch.countDown(); + }); + + assertTrue(latch.await(10, TimeUnit.SECONDS), "subscription should replay from committed LSN"); + assertEquals(total - (int) startSeq, payloads.size()); + assertEquals("m3", payloads.get(0)); + assertEquals("m11", payloads.get(payloads.size() - 1)); + assertEquals(Lsn.encode(0L, 3L), offsetsSeen.get(0)); + assertEquals(Lsn.encode(0L, 11L), offsets.fetch("t", "g", 0)); + + ingress.shutdown(); + offsets.close(); + } + + @Test + void appendBackfillEncodedBatchRollsAcrossSegments(@TempDir final Path dir) throws Exception { + final TopicRegistry registry = TopicRegistry.builder() + .topic("t", BrokerApi.Message.getDescriptor()) + .build(); + final InMemoryOffsetStore offsets = new InMemoryOffsetStore(dir.resolve("offsets")); + final JournaledLogMetadataStore metadata = new JournaledLogMetadataStore(dir.resolve("meta")); + final AdaptiveReplicator replicator = new AdaptiveReplicator(1, Map.of(), 100); + final List members = List.of( + new Member(0, BrokerRole.PERSISTENCE, new java.net.InetSocketAddress("localhost", 0), System.currentTimeMillis(), 1) + ); + final ReplicaSetResolver resolver = new ReplicaSetResolver(1, () -> members); + + final ClusteredIngress ingress = ClusteredIngress.create( + registry, + (key, total) -> 0, + 1, + 0, + 1, + Map.of(), + dir.resolve("data"), + 
8, + new Blocking(), + 96, // tiny segment to force multiple segment rolls + 1, + false, + offsets, + BrokerRole.PERSISTENCE, + resolver, + replicator, + metadata + ); + + final Ingress local = ingress.getIngressMap().get(0); + assertNotNull(local); + + final List records = List.of( + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa".getBytes(), + "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb".getBytes(), + "cccccccccccccccccccccccccccccccccccccccc".getBytes() + ); + + final int appended = local.appendBackfillEncodedBatch(0L, ByteBuffer.wrap(encodeFramed(records)), 64); + assertEquals(3, appended); + assertEquals(2L, local.highWaterMark(0L)); + + final List seen = new ArrayList<>(); + local.fetchEpoch(0L, 0L, 10, (off, segBuf, payloadPos, payloadLen) -> { + seen.add(new String(readPayload(segBuf, payloadPos, payloadLen))); + }); + assertEquals(List.of( + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb", + "cccccccccccccccccccccccccccccccccccccccc" + ), seen); + + ingress.shutdown(); + offsets.close(); + } + private Components singleNode(final Path base) throws Exception { final TopicRegistry registry = TopicRegistry.builder() .topic("t", BrokerApi.Message.getDescriptor()) @@ -141,6 +451,126 @@ public CompletableFuture sendEnvelopeWithAck(final Bro return new Components(ingress, offsets); } + private static Throwable rootCause(final Throwable t) { + Throwable cur = t; + while (cur.getCause() != null) cur = cur.getCause(); + return cur; + } + + private static byte[] readPayload(final MappedByteBuffer buffer, final int pos, final int len) { + final byte[] out = new byte[len]; + final var dup = buffer.duplicate(); + dup.position(pos); + dup.get(out, 0, len); + return out; + } + + private static byte[] encodeFramed(final List records) { + int totalBytes = 0; + for (final byte[] rec : records) totalBytes += Integer.BYTES + rec.length; + + final byte[] out = new byte[totalBytes]; + int pos = 0; + for (final byte[] rec : records) { + out[pos] = 
(byte) rec.length; + out[pos + 1] = (byte) (rec.length >>> 8); + out[pos + 2] = (byte) (rec.length >>> 16); + out[pos + 3] = (byte) (rec.length >>> 24); + pos += Integer.BYTES; + System.arraycopy(rec, 0, out, pos, rec.length); + pos += rec.length; + } + return out; + } + + private static final class RecordingRemoteClient implements RemoteBrokerClient { + final AtomicInteger ackCalls = new AtomicInteger(); + final AtomicReference lastPublish = new AtomicReference<>(); + + @Override + public void sendMessage(final String topic, final byte[] key, final byte[] payload) { + } + + @Override + public CompletableFuture sendEnvelopeWithAck(final BrokerApi.Envelope envelope) { + ackCalls.incrementAndGet(); + lastPublish.set(envelope); + return CompletableFuture.completedFuture(BrokerApi.ReplicationAck.newBuilder() + .setStatus(BrokerApi.ReplicationAck.Status.SUCCESS) + .build()); + } + } + + private static final class NeverAckRemoteClient implements RemoteBrokerClient { + final AtomicInteger calls = new AtomicInteger(); + + @Override + public void sendMessage(final String topic, final byte[] key, final byte[] payload) { + } + + @Override + public CompletableFuture sendEnvelopeWithAck(final BrokerApi.Envelope envelope) { + calls.incrementAndGet(); + return new CompletableFuture<>(); + } + } + + private static final class PagedBackfillClient implements RemoteBrokerClient { + private final List records; + private final CopyOnWriteArrayList requestedOffsets = new CopyOnWriteArrayList<>(); + + PagedBackfillClient(final List records) { + this.records = records; + } + + @Override + public void sendMessage(final String topic, final byte[] key, final byte[] payload) { + } + + @Override + public CompletableFuture sendEnvelopeWithAck(final BrokerApi.Envelope envelope) { + return CompletableFuture.completedFuture(BrokerApi.ReplicationAck.newBuilder() + .setStatus(BrokerApi.ReplicationAck.Status.SUCCESS) + .build()); + } + + @Override + public CompletableFuture sendBackfill(final 
BrokerApi.Envelope envelope) { + final long offset = envelope.getBackfill().getOffset(); + requestedOffsets.add(offset); + + final int start = (int) offset; + if (start >= records.size()) { + return CompletableFuture.completedFuture(BrokerApi.BackfillReply.newBuilder() + .setEndOfEpoch(true) + .build()); + } + + final int endExclusive = Math.min(records.size(), start + 2); // 2 records per page + int totalBytes = 0; + for (int i = start; i < endExclusive; i++) { + totalBytes += Integer.BYTES + records.get(i).length; + } + final byte[] payload = new byte[totalBytes]; + int pos = 0; + for (int i = start; i < endExclusive; i++) { + final byte[] rec = records.get(i); + payload[pos] = (byte) (rec.length); + payload[pos + 1] = (byte) (rec.length >>> 8); + payload[pos + 2] = (byte) (rec.length >>> 16); + payload[pos + 3] = (byte) (rec.length >>> 24); + pos += Integer.BYTES; + System.arraycopy(rec, 0, payload, pos, rec.length); + pos += rec.length; + } + + return CompletableFuture.completedFuture(BrokerApi.BackfillReply.newBuilder() + .setPayload(com.google.protobuf.ByteString.copyFrom(payload)) + .setEndOfEpoch(endExclusive >= records.size()) + .build()); + } + } + private record Components(ClusteredIngress ingress, AutoCloseable offsets) implements AutoCloseable { @Override public void close() throws Exception {