From e0a28667ce03d789f3d4e2a671bdc579a4be42e6 Mon Sep 17 00:00:00 2001 From: Xiaozhen Liu Date: Fri, 13 Feb 2026 15:44:16 -0800 Subject: [PATCH 001/152] fix(amber): improve region kill behavior error handling and add synchronous kill logic. --- .../controller/WorkflowScheduler.scala | 2 + .../WorkerExecutionCompletedHandler.scala | 6 +- .../RegionExecutionCoordinator.scala | 55 +++++++++++++++---- .../WorkflowExecutionCoordinator.scala | 45 ++++++++++----- .../worker/promisehandlers/EndHandler.scala | 2 +- 5 files changed, 83 insertions(+), 27 deletions(-) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/WorkflowScheduler.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/WorkflowScheduler.scala index 9dcf3ad4bfc..b1acb3c0650 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/WorkflowScheduler.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/WorkflowScheduler.scala @@ -54,4 +54,6 @@ class WorkflowScheduler( def getNextRegions: Set[Region] = if (!schedule.hasNext) Set() else schedule.next() + def hasPendingRegions: Boolean = schedule != null && schedule.hasNext + } diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/WorkerExecutionCompletedHandler.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/WorkerExecutionCompletedHandler.scala index 594673caa53..9f6871dcb83 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/WorkerExecutionCompletedHandler.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/WorkerExecutionCompletedHandler.scala @@ -60,7 +60,11 @@ trait WorkerExecutionCompletedHandler { .collect(Seq(statsRequest)) .flatMap(_ => { // if entire workflow is completed, clean up - if 
(cp.workflowExecution.isCompleted) { + val isWorkflowTerminal = + cp.workflowExecution.isCompleted && + !cp.workflowScheduler.hasPendingRegions && + !cp.workflowExecutionCoordinator.hasUnfinishedRegionCoordinators + if (isWorkflowTerminal) { // after query result come back: send completed event, cleanup ,and kill workflow sendToClient(ExecutionStateUpdate(cp.workflowExecution.getState)) cp.controllerTimerService.disableStatusUpdate() diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala index 7e5b228801f..9fee0226b6c 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala @@ -20,7 +20,7 @@ package org.apache.texera.amber.engine.architecture.scheduling import org.apache.pekko.pattern.gracefulStop -import com.twitter.util.{Future, Return, Throw} +import com.twitter.util.{Duration => TwitterDuration, Future, JavaTimer, Return, Throw, Timer} import org.apache.texera.amber.core.storage.DocumentFactory import org.apache.texera.amber.core.storage.VFSURIFactory.decodeURI import org.apache.texera.amber.core.virtualidentity.ActorVirtualIdentity @@ -60,7 +60,7 @@ import org.apache.texera.web.resource.dashboard.user.workflow.WorkflowExecutions import java.util.concurrent.TimeUnit import java.util.concurrent.atomic.AtomicReference -import scala.concurrent.duration.Duration +import scala.concurrent.duration.{Duration => ScalaDuration} /** * The executor of a region. 
@@ -108,10 +108,14 @@ class RegionExecutionCoordinator( private val currentPhaseRef: AtomicReference[RegionExecutionPhase] = new AtomicReference( Unexecuted ) + private val terminationFutureRef: AtomicReference[Future[Unit]] = new AtomicReference(null) + private val killRetryTimer: Timer = new JavaTimer(true) + private val killRetryDelay: TwitterDuration = TwitterDuration.fromMilliseconds(200) /** * Sync the status of `RegionExecution` and transition this coordinator's phase to `Completed` only when the - * coordinator is currently in `ExecutingNonDependeePortsPhase` and all the ports of this region are completed. + * coordinator is currently in `ExecutingNonDependeePortsPhase`, all the ports of this region are completed, and + * all workers in this region are terminated. * * Additionally, this method will also terminate all the workers of this region: * @@ -134,12 +138,22 @@ class RegionExecutionCoordinator( return Future.Unit } - // Set this coordinator's status to be completed so that subsequent regions can be started by - // WorkflowExecutionCoordinator. - setPhase(Completed) - - // Terminate all the workers in this region. - terminateWorkers(regionExecution) + val existingTerminationFuture = terminationFutureRef.get + if (existingTerminationFuture != null) { + existingTerminationFuture + } else { + val terminationFuture = terminateWorkersWithRetry(regionExecution).flatMap { _ => + // Set this coordinator's status to be completed so that subsequent regions can be started by + // WorkflowExecutionCoordinator. + setPhase(Completed) + Future.Unit + } + if (terminationFutureRef.compareAndSet(null, terminationFuture)) { + terminationFuture + } else { + terminationFutureRef.get + } + } } private def terminateWorkers(regionExecution: RegionExecution) = { @@ -166,7 +180,7 @@ class RegionExecutionCoordinator( val actorRef = actorRefService.getActorRef(workerId) // Remove the actorRef so that no other actors can find the worker and send messages. 
actorRefService.removeActorRef(workerId) - gracefulStop(actorRef, Duration(5, TimeUnit.SECONDS)).asTwitter() + gracefulStop(actorRef, ScalaDuration(5, TimeUnit.SECONDS)).asTwitter() } }.toSeq @@ -190,8 +204,29 @@ class RegionExecutionCoordinator( } } + private def terminateWorkersWithRetry( + regionExecution: RegionExecution, + attempt: Int = 1 + ): Future[Unit] = { + terminateWorkers(regionExecution).rescue { case err => + logger.warn( + s"Failed to terminate region ${region.id.id} on attempt $attempt. Retrying in ${killRetryDelay.inMilliseconds} ms.", + err + ) + Future + .sleep(killRetryDelay)(killRetryTimer) + .flatMap(_ => terminateWorkersWithRetry(regionExecution, attempt + 1)) + } + } + def isCompleted: Boolean = currentPhaseRef.get == Completed + /** + * Returns the region termination future if termination has been initiated. + * This is only set by `tryCompleteRegionExecution()`. + */ + def getTerminationFutureOpt: Option[Future[Unit]] = Option(terminationFutureRef.get) + /** * This will sync and transition the region execution phase from one to another depending on its current phase: * diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala index 05585f88d8d..1c3ae89471b 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala @@ -27,9 +27,11 @@ import org.apache.texera.amber.engine.architecture.common.{ AkkaActorService } import org.apache.texera.amber.engine.architecture.controller.ControllerConfig +import org.apache.texera.amber.engine.architecture.controller.ExecutionStateUpdate import org.apache.texera.amber.engine.architecture.controller.execution.WorkflowExecution import 
org.apache.texera.amber.engine.common.rpc.AsyncRPCClient +import java.util.concurrent.atomic.AtomicBoolean import scala.collection.mutable class WorkflowExecutionCoordinator( @@ -44,6 +46,7 @@ class WorkflowExecutionCoordinator( private val regionExecutionCoordinators : mutable.HashMap[RegionIdentity, RegionExecutionCoordinator] = mutable.HashMap() + private val completionNotified: AtomicBoolean = new AtomicBoolean(false) @transient var actorRefService: AkkaActorRefMappingService = _ @@ -59,18 +62,19 @@ class WorkflowExecutionCoordinator( * After the syncs, if there are no running region(s), it will start new regions (if available). */ def coordinateRegionExecutors(actorService: AkkaActorService): Future[Unit] = { - if (regionExecutionCoordinators.values.exists(!_.isCompleted)) { - // As this method is invoked by the completion of each port in a region, and regionExecutionCoordinator only - // lanuches each phase asynchronously, we need to let each current unfinished regionExecutionCoordinator - // sync its status and proceed with next phases if needed. - Future - .collect({ - regionExecutionCoordinators.values - .filter(!_.isCompleted) - .map(_.syncStatusAndTransitionRegionExecutionPhase()) - .toSeq - }) + val unfinishedRegionCoordinators = + regionExecutionCoordinators.values.filter(!_.isCompleted).toSeq + + // Trigger sync for each unfinished region. + unfinishedRegionCoordinators.foreach(_.syncStatusAndTransitionRegionExecutionPhase()) + + // Wait only for region termination futures (kill path), then re-run coordination. + val terminationFutures = unfinishedRegionCoordinators.flatMap(_.getTerminationFutureOpt) + if (terminationFutures.nonEmpty) { + return Future + .collect(terminationFutures) .unit + .flatMap(_ => coordinateRegionExecutors(actorService)) } if (regionExecutionCoordinators.values.exists(!_.isCompleted)) { @@ -79,10 +83,17 @@ class WorkflowExecutionCoordinator( } // All existing regions are completed. Start the next region (if any). 
+ val nextRegions = getNextRegions() + if (nextRegions.isEmpty) { + if (workflowExecution.isCompleted && completionNotified.compareAndSet(false, true)) { + asyncRPCClient.sendToClient(ExecutionStateUpdate(workflowExecution.getState)) + } + return Future.Unit + } + + executedRegions.append(nextRegions) Future - .collect({ - val nextRegions = getNextRegions() - executedRegions.append(nextRegions) + .collect( nextRegions .map(region => { workflowExecution.initRegionExecution(region) @@ -98,7 +109,7 @@ class WorkflowExecutionCoordinator( }) .map(_.syncStatusAndTransitionRegionExecutionPhase()) .toSeq - }) + ) .unit } @@ -116,4 +127,8 @@ class WorkflowExecutionCoordinator( .toSet } + def hasUnfinishedRegionCoordinators: Boolean = { + regionExecutionCoordinators.values.exists(!_.isCompleted) + } + } diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/worker/promisehandlers/EndHandler.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/worker/promisehandlers/EndHandler.scala index 2a6a20b3d3e..0504e66f52b 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/worker/promisehandlers/EndHandler.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/worker/promisehandlers/EndHandler.scala @@ -48,8 +48,8 @@ trait EndHandler { s"Received EndHandler before all messages are processed. Unprocessed messages: " + s"${dp.inputManager.inputMessageQueue.peek()}" ) + return Future.exception(new IllegalStateException("worker still has unprocessed messages")) } - assert(dp.inputManager.inputMessageQueue.isEmpty) // Now we can safely acknowledge that this worker can be terminated. 
EmptyReturn() } From d21a25271f8bf86a2ed3e19e351b6709142027e6 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Sun, 19 Apr 2026 01:27:00 -0700 Subject: [PATCH 002/152] refactor: isolate state serialization and materialization changes --- .../architecture/packaging/output_manager.py | 38 ++++++- amber/src/main/python/core/models/operator.py | 38 ++++++- amber/src/main/python/core/models/state.py | 97 ++++++++-------- .../python/core/runnables/data_processor.py | 1 + .../main/python/core/runnables/main_loop.py | 49 +++++++- .../python/core/runnables/network_receiver.py | 15 ++- .../python/core/runnables/network_sender.py | 23 ++-- .../python/core/storage/document_factory.py | 107 ++++++++++-------- ...ut_port_materialization_reader_runnable.py | 30 ++++- .../python/core/storage/vfs_uri_factory.py | 1 + .../messaginglayer/OutputManager.scala | 20 ++++ .../pythonworker/PythonProxyClient.scala | 7 +- .../pythonworker/PythonProxyServer.scala | 5 +- .../RegionExecutionCoordinator.scala | 40 +++++-- ...InputPortMaterializationReaderThread.scala | 26 ++++- .../core/executor/OperatorExecutor.scala | 8 +- .../texera/amber/core/state/State.scala | 83 +++++++++----- .../texera/amber/core/state/package.scala | 24 ++++ .../amber/core/storage/DocumentFactory.scala | 2 + .../amber/core/storage/VFSURIFactory.scala | 1 + 20 files changed, 452 insertions(+), 163 deletions(-) create mode 100644 common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/package.scala diff --git a/amber/src/main/python/core/architecture/packaging/output_manager.py b/amber/src/main/python/core/architecture/packaging/output_manager.py index bf4afbf396f..065b063f7d4 100644 --- a/amber/src/main/python/core/architecture/packaging/output_manager.py +++ b/amber/src/main/python/core/architecture/packaging/output_manager.py @@ -17,6 +17,7 @@ import threading import typing +import uuid from collections import OrderedDict from itertools import chain from loguru import logger @@ -43,7 +44,12 @@ ) from 
core.models import Tuple, Schema, StateFrame from core.models.payload import DataPayload, DataFrame -from core.models.state import State +from core.models.state import ( + State, + STATE_SCHEMA, + serialize_state, + state_uri_from_result_uri, +) from core.storage.document_factory import DocumentFactory from core.storage.runnables.port_storage_writer import ( PortStorageWriter, @@ -87,6 +93,8 @@ def __init__(self, worker_id: str): PortIdentity, typing.Tuple[Queue, PortStorageWriter, Thread] ] = dict() + self._storage_uris: typing.Dict[PortIdentity, str] = dict() + def is_missing_output_ports(self): """ This method is only used for ensuring correct region execution. @@ -126,6 +134,7 @@ def set_up_port_storage_writer(self, port_id: PortIdentity, storage_uri: str): Create a separate thread for saving output tuples of a port to storage in batch. """ + self._storage_uris[port_id] = storage_uri document, _ = DocumentFactory.open_document(storage_uri) buffered_item_writer = document.writer(str(get_worker_index(self.worker_id))) writer_queue = Queue() @@ -171,6 +180,31 @@ def save_tuple_to_storage_if_needed(self, tuple_: Tuple, port_id=None) -> None: PortStorageWriterElement(data_tuple=tuple_) ) + def save_state_to_storage_if_needed(self, state: State, port_id=None) -> None: + if port_id is None: + uris = self._storage_uris.values() + elif port_id in self._storage_uris: + uris = [self._storage_uris[port_id]] + else: + return + + for uri in uris: + state_uri = state_uri_from_result_uri(uri) + try: + document = DocumentFactory.open_document(state_uri)[0] + except ValueError: + document = DocumentFactory.create_document(state_uri, STATE_SCHEMA) + writer = document.writer(str(uuid.uuid4())) + writer.put_one(serialize_state(state)) + writer.close() + + def reset_output_storage(self) -> None: + port_id = self.get_port_ids()[0] + storage_uri = self._storage_uris[port_id] + self.close_port_storage_writers() + DocumentFactory.create_document(storage_uri, 
self._ports[port_id].get_schema()) + self.set_up_port_storage_writer(port_id, storage_uri) + def close_port_storage_writers(self) -> None: """ Flush the buffers of port storage writers and wait for all the @@ -248,7 +282,7 @@ def emit_state( receiver, ( StateFrame(payload) - if isinstance(payload, State) + if isinstance(payload, dict) else self.tuple_to_frame(payload) ), ) diff --git a/amber/src/main/python/core/models/operator.py b/amber/src/main/python/core/models/operator.py index 79050839958..5b9672988aa 100644 --- a/amber/src/main/python/core/models/operator.py +++ b/amber/src/main/python/core/models/operator.py @@ -108,14 +108,12 @@ def close(self) -> None: def process_state(self, state: State, port: int) -> Optional[State]: """ Process an input State from the given link. - The default implementation is to pass the State to all downstream operators - if the State has pass_to_all_downstream set to True. + The default implementation is to pass the State to downstream operators. :param state: State, a State from an input port to be processed. :param port: int, input port index of the current exhausted port. :return: State, producing one State object """ - if state.passToAllDownstream: - return state + return state def produce_state_on_start(self, port: int) -> State: """ @@ -293,3 +291,35 @@ def process_table(self, table: Table, port: int) -> Iterator[Optional[TableLike] time, or None. 
""" yield + + +class LoopStartOperator(TableOperator): + @overrides.final + def process_state(self, state: State, port: int) -> Optional[State]: + if "LoopStartStateURI" in state: + state["loop_counter"] += 1 + return state + self.state.update(state) + return None + + @overrides.final + def produce_state_on_finish(self, port: int) -> State: + from pickle import dumps + + self.state["table"] = dumps(Table(self._TableOperator__table_data[port])) + return dict(self.state) + + +class LoopEndOperator(TableOperator): + @overrides.final + def process_table(self, table: Table, port: int) -> Iterator[Optional[TableLike]]: + yield table + + @abstractmethod + def condition(self) -> None: + pass + + def loop_start_id(self) -> str: + del self.state["table"] + del self.state["output"] + return self.state["LoopStartId"] diff --git a/amber/src/main/python/core/models/state.py b/amber/src/main/python/core/models/state.py index 2c8a268dfb7..e5726cc3c2f 100644 --- a/amber/src/main/python/core/models/state.py +++ b/amber/src/main/python/core/models/state.py @@ -15,61 +15,64 @@ # specific language governing permissions and limitations # under the License. 
-from dataclasses import dataclass -from pandas import DataFrame -from pyarrow import Table -from typing import Optional +import base64 +import json +from typing import Any, Dict, TypeAlias -from .schema import Schema, AttributeType -from .schema.attribute_type import FROM_PYOBJECT_MAPPING +from .schema import Schema +from .tuple import Tuple +State: TypeAlias = Dict[str, Any] -@dataclass -class State: - def __init__( - self, table: Optional[Table] = None, pass_to_all_downstream: bool = False - ): - self.schema = Schema() - self.passToAllDownstream = pass_to_all_downstream - if table is not None: - self.__dict__.update(table.to_pandas().iloc[0].to_dict()) - self.schema = Schema(table.schema) +STATE_CONTENT = "content" +_TYPE_MARKER = "__texera_type__" +_PAYLOAD_MARKER = "payload" +_BYTES_TYPE = "bytes" - def add( - self, key: str, value: any, value_type: Optional[AttributeType] = None - ) -> None: - self.__dict__[key] = value - if value_type is not None: - self.schema.add(key, value_type) - elif key != "schema": - self.schema.add(key, FROM_PYOBJECT_MAPPING[type(value)]) +STATE_SCHEMA = Schema(raw_schema={STATE_CONTENT: "STRING"}) - def get(self, key: str) -> any: - return self.__dict__[key] - def to_table(self) -> Table: - return Table.from_pandas( - df=DataFrame([self.__dict__]), - schema=self.schema.as_arrow_schema(), - ) +def state_uri_from_result_uri(result_uri: str) -> str: + return result_uri.replace("/result", "/state") - def __setattr__(self, key: str, value: any) -> None: - self.add(key, value) - def __setitem__(self, key: str, value: any) -> None: - self.add(key, value) +def serialize_state(state: State) -> Tuple: + return Tuple( + { + STATE_CONTENT: json.dumps( + _to_json_value(state), separators=(",", ":") + ) + }, + schema=STATE_SCHEMA, + ) - def __getitem__(self, key: str) -> any: - return self.get(key) - def __str__(self) -> str: - content = ", ".join( - [ - repr(key) + ": " + repr(value) - for key, value in self.__dict__.items() - if key != "schema" 
- ] - ) - return f"State[{content}]" +def deserialize_state(row: Tuple) -> State: + return _from_json_value(json.loads(row[STATE_CONTENT])) - __repr__ = __str__ + +def _to_json_value(value: Any) -> Any: + if value is None or isinstance(value, (bool, int, float, str)): + return value + if isinstance(value, bytes): + return { + _TYPE_MARKER: _BYTES_TYPE, + _PAYLOAD_MARKER: base64.b64encode(value).decode("ascii"), + } + if isinstance(value, dict): + return {str(key): _to_json_value(inner) for key, inner in value.items()} + if isinstance(value, (list, tuple)): + return [_to_json_value(inner) for inner in value] + raise TypeError( + f"State value of type {type(value).__name__} is not JSON serializable" + ) + + +def _from_json_value(value: Any) -> Any: + if isinstance(value, list): + return [_from_json_value(inner) for inner in value] + if isinstance(value, dict): + if value.get(_TYPE_MARKER) == _BYTES_TYPE: + return base64.b64decode(value[_PAYLOAD_MARKER]) + return {key: _from_json_value(inner) for key, inner in value.items()} + return value diff --git a/amber/src/main/python/core/runnables/data_processor.py b/amber/src/main/python/core/runnables/data_processor.py index 4399b1a3a2f..815e85a6446 100644 --- a/amber/src/main/python/core/runnables/data_processor.py +++ b/amber/src/main/python/core/runnables/data_processor.py @@ -100,6 +100,7 @@ def process_state(self, state: State) -> None: self._context.worker_id, self._context.console_message_manager.print_buf, ): + self._switch_context() self._set_output_state(executor.process_state(state, port_id)) except Exception as err: diff --git a/amber/src/main/python/core/runnables/main_loop.py b/amber/src/main/python/core/runnables/main_loop.py index d73c655734f..ece5cf8e102 100644 --- a/amber/src/main/python/core/runnables/main_loop.py +++ b/amber/src/main/python/core/runnables/main_loop.py @@ -38,8 +38,15 @@ ECMElement, InternalQueueElement, ) -from core.models.state import State +from core.models.operator import 
LoopEndOperator, LoopStartOperator +from core.models.state import ( + State, + STATE_SCHEMA, + serialize_state, + state_uri_from_result_uri, +) from core.runnables.data_processor import DataProcessor +from core.storage.document_factory import DocumentFactory from core.util import StoppableQueueBlockingRunnable, get_one_of from core.util.console_message.timestamp import current_time_in_local_timezone from core.util.customized_queue.queue_base import QueueElement @@ -48,6 +55,7 @@ PortIdentity, ChannelIdentity, EmbeddedControlMessageIdentity, + OperatorIdentity, ) from proto.org.apache.texera.amber.engine.architecture.rpc import ( ConsoleMessage, @@ -61,6 +69,7 @@ EmbeddedControlMessage, AsyncRpcContext, ControlRequest, + IterationCompletedRequest, ) from proto.org.apache.texera.amber.engine.architecture.worker import ( WorkerState, @@ -87,6 +96,29 @@ def __init__( target=self.data_processor.run, daemon=True, name="data_processor_thread" ).start() + def _attach_loop_start_id(self, output_state: State) -> None: + if "LoopStartId" in output_state: + return + output_state["LoopStartId"] = self.context.worker_id.split("-", 1)[1].rsplit( + "-main-0", 1 + )[0] + output_state["LoopStartStateURI"] = state_uri_from_result_uri( + self.context.input_manager.get_input_state_result_uri() + ) + + def _next_iteration( + self, executor: LoopEndOperator, controller_interface + ) -> None: + controller_interface.iteration_completed( + IterationCompletedRequest(OperatorIdentity(executor.loop_start_id())) + ) + uri = executor.state["LoopStartStateURI"] + del executor.state["LoopStartStateURI"] + del executor.state["LoopStartId"] + writer = DocumentFactory.create_document(uri, STATE_SCHEMA).writer("0") + writer.put_one(serialize_state(executor.state)) + writer.close() + def complete(self) -> None: """ Complete the DataProcessor, marking state to COMPLETED, and notify the @@ -94,12 +126,15 @@ def complete(self) -> None: """ # flush the buffered console prints 
self._check_and_report_console_messages(force_flush=True) - self.context.executor_manager.executor.close() + controller_interface = self._async_rpc_client.controller_stub() + executor = self.context.executor_manager.executor + if isinstance(executor, LoopEndOperator) and executor.condition(): + self._next_iteration(executor, controller_interface) + executor.close() # stop the data processing thread self.data_processor.stop() self.context.state_manager.transit_to(WorkerState.COMPLETED) self.context.statistics_manager.update_total_execution_time(time.time_ns()) - controller_interface = self._async_rpc_client.controller_stub() controller_interface.worker_execution_completed(EmptyRequest()) self.context.close() @@ -188,6 +223,10 @@ def process_input_state(self) -> None: output_state = self.context.state_processing_manager.get_output_state() self._switch_context() if output_state is not None: + if isinstance(self.context.executor_manager.executor, LoopEndOperator): + self.context.output_manager.reset_output_storage() + if isinstance(self.context.executor_manager.executor, LoopStartOperator): + self._attach_loop_start_id(output_state) for to, batch in self.context.output_manager.emit_state(output_state): self._output_queue.put( DataElement( @@ -197,6 +236,7 @@ def process_input_state(self) -> None: payload=batch, ) ) + self.context.output_manager.save_state_to_storage_if_needed(output_state) def process_tuple_with_udf(self) -> Iterator[Optional[Tuple]]: """ @@ -241,6 +281,7 @@ def _process_tuple(self, tuple_: Tuple) -> None: def _process_state(self, state_: State) -> None: self.context.state_processing_manager.current_input_state = state_ + self._switch_context() self.process_input_state() self._check_and_process_control() @@ -329,7 +370,7 @@ def _process_ecm(self, ecm_element: ECMElement): if ecm.ecm_type != EmbeddedControlMessageType.NO_ALIGNMENT: self.context.pause_manager.resume(PauseType.ECM_PAUSE) - + self._switch_context() if 
self.context.tuple_processing_manager.current_internal_marker: { StartChannel: self._process_start_channel, diff --git a/amber/src/main/python/core/runnables/network_receiver.py b/amber/src/main/python/core/runnables/network_receiver.py index fd42a8f589b..e1815b08f7d 100644 --- a/amber/src/main/python/core/runnables/network_receiver.py +++ b/amber/src/main/python/core/runnables/network_receiver.py @@ -32,6 +32,7 @@ ) from core.models import ( DataFrame, + Tuple, StateFrame, ) from core.models.internal_queue import ( @@ -40,8 +41,8 @@ InternalQueue, ECMElement, ) -from core.models.state import State from core.proxy import ProxyServer +from core.models.state import STATE_SCHEMA, deserialize_state from core.util import Stoppable, get_one_of from core.util.runnable.runnable import Runnable from proto.org.apache.texera.amber.engine.architecture.rpc import EmbeddedControlMessage @@ -96,7 +97,17 @@ def data_handler(command: bytes, table: Table) -> int: "Data", lambda _: DataFrame(table), "State", - lambda _: StateFrame(State(table)), + lambda _: StateFrame( + deserialize_state( + Tuple( + { + name: table[name][0].as_py() + for name in STATE_SCHEMA.get_attr_names() + }, + schema=STATE_SCHEMA, + ) + ) + ), "ECM", lambda _: EmbeddedControlMessage().parse(table["payload"][0].as_py()), ) diff --git a/amber/src/main/python/core/runnables/network_sender.py b/amber/src/main/python/core/runnables/network_sender.py index 9595433fb70..f1bd8659ee9 100644 --- a/amber/src/main/python/core/runnables/network_sender.py +++ b/amber/src/main/python/core/runnables/network_sender.py @@ -20,13 +20,18 @@ from overrides import overrides from typing import Optional -from core.models import DataPayload, InternalQueue, DataFrame, StateFrame, State +from core.models import DataPayload, InternalQueue, DataFrame, StateFrame from core.models.internal_queue import ( InternalQueueElement, DataElement, DCMElement, ECMElement, ) +from core.models.state import ( + STATE_CONTENT, + STATE_SCHEMA, + 
serialize_state, +) from core.proxy import ProxyClient from core.util import StoppableQueueBlockingRunnable from proto.org.apache.texera.amber.core import ChannelIdentity @@ -98,13 +103,15 @@ def _send_data(self, to: ChannelIdentity, data_payload: DataPayload) -> None: data_header = PythonDataHeader(tag=to, payload_type="Data") self._proxy_client.send_data(bytes(data_header), data_payload.frame) elif isinstance(data_payload, StateFrame): - data_header = PythonDataHeader( - tag=to, payload_type=data_payload.frame.__class__.__name__ - ) - table = ( - data_payload.frame.to_table() - if isinstance(data_payload.frame, State) - else None + data_header = PythonDataHeader(tag=to, payload_type="State") + serialized_state = serialize_state(data_payload.frame) + table = pa.Table.from_pydict( + { + STATE_CONTENT: [ + serialized_state[STATE_CONTENT] + ], + }, + schema=STATE_SCHEMA.as_arrow_schema(), ) self._proxy_client.send_data(bytes(data_header), table) else: diff --git a/amber/src/main/python/core/storage/document_factory.py b/amber/src/main/python/core/storage/document_factory.py index 9b686ab66b6..8a4d6fe3c5f 100644 --- a/amber/src/main/python/core/storage/document_factory.py +++ b/amber/src/main/python/core/storage/document_factory.py @@ -61,30 +61,35 @@ def create_document(uri: str, schema: Schema) -> VirtualDocument: if parsed_uri.scheme == VFSURIFactory.VFS_FILE_URI_SCHEME: _, _, _, resource_type = VFSURIFactory.decode_uri(uri) - if resource_type in {VFSResourceType.RESULT}: - storage_key = DocumentFactory.sanitize_uri_path(parsed_uri) - - # Convert Amber Schema to Iceberg Schema with LARGE_BINARY - # field name encoding - iceberg_schema = amber_schema_to_iceberg_schema(schema) - - create_table( - IcebergCatalogInstance.get_instance(), - StorageConfig.ICEBERG_TABLE_RESULT_NAMESPACE, - storage_key, - iceberg_schema, - override_if_exists=True, - ) - - return IcebergDocument[Tuple]( - StorageConfig.ICEBERG_TABLE_RESULT_NAMESPACE, - storage_key, - iceberg_schema, - 
amber_tuples_to_arrow_table, - arrow_table_to_amber_tuples, - ) - else: - raise ValueError(f"Resource type {resource_type} is not supported") + match resource_type: + case VFSResourceType.RESULT: + namespace = StorageConfig.ICEBERG_TABLE_RESULT_NAMESPACE + case VFSResourceType.STATE: + namespace = "state" + case _: + raise ValueError(f"Resource type {resource_type} is not supported") + + storage_key = DocumentFactory.sanitize_uri_path(parsed_uri) + # Convert Amber Schema to Iceberg Schema with LARGE_BINARY + # field name encoding + iceberg_schema = amber_schema_to_iceberg_schema(schema) + + create_table( + IcebergCatalogInstance.get_instance(), + namespace, + storage_key, + iceberg_schema, + override_if_exists=True, + ) + + return IcebergDocument[Tuple]( + namespace, + storage_key, + iceberg_schema, + amber_tuples_to_arrow_table, + arrow_table_to_amber_tuples, + ) + else: raise NotImplementedError( f"Unsupported URI scheme: {parsed_uri.scheme} for creating the document" @@ -96,30 +101,36 @@ def open_document(uri: str) -> typing.Tuple[VirtualDocument, Optional[Schema]]: if parsed_uri.scheme == "vfs": _, _, _, resource_type = VFSURIFactory.decode_uri(uri) - if resource_type in {VFSResourceType.RESULT}: - storage_key = DocumentFactory.sanitize_uri_path(parsed_uri) - - table = load_table_metadata( - IcebergCatalogInstance.get_instance(), - StorageConfig.ICEBERG_TABLE_RESULT_NAMESPACE, - storage_key, - ) - - if table is None: - raise ValueError("No storage is found for the given URI") - - amber_schema = Schema(table.schema().as_arrow()) - - document = IcebergDocument( - StorageConfig.ICEBERG_TABLE_RESULT_NAMESPACE, - storage_key, - table.schema(), - amber_tuples_to_arrow_table, - arrow_table_to_amber_tuples, - ) - return document, amber_schema - else: - raise ValueError(f"Resource type {resource_type} is not supported") + match resource_type: + case VFSResourceType.RESULT: + namespace = StorageConfig.ICEBERG_TABLE_RESULT_NAMESPACE + case VFSResourceType.STATE: + 
namespace = "state" + case _: + raise ValueError(f"Resource type {resource_type} is not supported") + + storage_key = DocumentFactory.sanitize_uri_path(parsed_uri) + + table = load_table_metadata( + IcebergCatalogInstance.get_instance(), + namespace, + storage_key, + ) + + if table is None: + raise ValueError("No storage is found for the given URI") + + amber_schema = Schema(table.schema().as_arrow()) + + document = IcebergDocument( + namespace, + storage_key, + table.schema(), + amber_tuples_to_arrow_table, + arrow_table_to_amber_tuples, + ) + return document, amber_schema + else: raise NotImplementedError( f"Unsupported URI scheme: {parsed_uri.scheme} for opening the document" diff --git a/amber/src/main/python/core/storage/runnables/input_port_materialization_reader_runnable.py b/amber/src/main/python/core/storage/runnables/input_port_materialization_reader_runnable.py index e49c0316cc7..493ecf0a413 100644 --- a/amber/src/main/python/core/storage/runnables/input_port_materialization_reader_runnable.py +++ b/amber/src/main/python/core/storage/runnables/input_port_materialization_reader_runnable.py @@ -17,8 +17,8 @@ import typing from loguru import logger -from pyarrow import Table from typing import Union +from pyarrow import Table from core.architecture.sendsemantics.broad_cast_partitioner import ( BroadcastPartitioner, @@ -34,8 +34,9 @@ from core.architecture.sendsemantics.round_robin_partitioner import ( RoundRobinPartitioner, ) -from core.models import Tuple, InternalQueue, DataFrame, DataPayload +from core.models import Tuple, InternalQueue, DataFrame, DataPayload, State, StateFrame from core.models.internal_queue import DataElement, ECMElement +from core.models.state import deserialize_state, state_uri_from_result_uri from core.storage.document_factory import DocumentFactory from core.util import Stoppable, get_one_of from core.util.runnable.runnable import Runnable @@ -125,6 +126,15 @@ def tuple_to_batch_with_filter(self, tuple_: Tuple) -> 
typing.Iterator[DataFrame if receiver == self.worker_actor_id: yield self.tuples_to_data_frame(tuples) + def emit_state_with_filter(self, state: State) -> typing.Iterator[StateFrame]: + for receiver, payload in self.partitioner.flush_state(state): + if receiver == self.worker_actor_id: + yield ( + StateFrame(payload) + if isinstance(payload, dict) + else self.tuples_to_data_frame(payload) + ) + def run(self) -> None: """ Main execution logic that reads tuples from the materialized storage and @@ -138,8 +148,21 @@ def run(self) -> None: self.uri ) self.emit_ecm("StartChannel", EmbeddedControlMessageType.NO_ALIGNMENT) - storage_iterator = self.materialization.get() + try: + state_document, _ = DocumentFactory.open_document( + state_uri_from_result_uri(self.uri) + ) + state_iterator = state_document.get() + for state in state_iterator: + for state_frame in self.emit_state_with_filter( + deserialize_state(state) + ): + self.emit_payload(state_frame) + except ValueError: + pass + + storage_iterator = self.materialization.get() # Iterate and process tuples. 
for tup in storage_iterator: if self._stopped: @@ -149,6 +172,7 @@ def run(self) -> None: tup.cast_to_schema(self.tuple_schema) for data_frame in self.tuple_to_batch_with_filter(tup): self.emit_payload(data_frame) + self.emit_ecm("EndChannel", EmbeddedControlMessageType.PORT_ALIGNMENT) self._finished = True except Exception as err: diff --git a/amber/src/main/python/core/storage/vfs_uri_factory.py b/amber/src/main/python/core/storage/vfs_uri_factory.py index de0c5db56ec..0e23e607055 100644 --- a/amber/src/main/python/core/storage/vfs_uri_factory.py +++ b/amber/src/main/python/core/storage/vfs_uri_factory.py @@ -34,6 +34,7 @@ class VFSResourceType(str, Enum): RESULT = "result" RUNTIME_STATISTICS = "runtimeStatistics" CONSOLE_MESSAGES = "consoleMessages" + STATE = "state" class VFSURIFactory: diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/messaginglayer/OutputManager.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/messaginglayer/OutputManager.scala index 4ab3d18056f..53755b780cc 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/messaginglayer/OutputManager.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/messaginglayer/OutputManager.scala @@ -124,6 +124,8 @@ class OutputManager( : mutable.HashMap[PortIdentity, OutputPortResultWriterThread] = mutable.HashMap() + private val storageUris: mutable.HashMap[Int, URI] = mutable.HashMap() + /** * Add down stream operator and its corresponding Partitioner. 
* @@ -232,6 +234,23 @@ class OutputManager( }) } + def saveStateToStorageIfNeeded(state: State): Unit = { + try { + storageUris.foreach { + case (_, uri) => + val writer = DocumentFactory + .openDocument(State.stateUriFromResultUri(uri)) + ._1 + .writer(VirtualIdentityUtils.getWorkerIndex(actorId).toString) + .asInstanceOf[BufferedItemWriter[Tuple]] + writer.putOne(State.serialize(state)) + writer.close() + } + } catch { + case _: Exception => () + } + } + /** * Singal the port storage writer to flush the remaining buffer and wait for commits to finish so that * the output port is properly completed. If the output port does not need storage, no action will be done. @@ -280,6 +299,7 @@ class OutputManager( } private def setupOutputStorageWriterThread(portId: PortIdentity, storageUri: URI): Unit = { + this.storageUris(portId.id) = storageUri val bufferedItemWriter = DocumentFactory .openDocument(storageUri) ._1 diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonProxyClient.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonProxyClient.scala index 6618e857b1d..e53fccf8c02 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonProxyClient.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonProxyClient.scala @@ -21,6 +21,7 @@ package org.apache.texera.amber.engine.architecture.pythonworker import com.twitter.util.{Await, Promise} import org.apache.texera.amber.core.WorkflowRuntimeException +import org.apache.texera.amber.core.state.State import org.apache.texera.amber.core.tuple.{Schema, Tuple} import org.apache.texera.amber.core.virtualidentity.{ActorVirtualIdentity, ChannelIdentity} import org.apache.texera.amber.engine.architecture.pythonworker.WorkerBatchInternalQueue.{ @@ -125,7 +126,11 @@ class PythonProxyClient(portNumberPromise: Promise[Int], val actorId: ActorVirtu case DataFrame(frame) => 
writeArrowStream(mutable.Queue(ArraySeq.unsafeWrapArray(frame): _*), from, "Data") case StateFrame(state) => - writeArrowStream(mutable.Queue(state.toTuple), from, "State") + writeArrowStream( + mutable.Queue(State.serialize(state)), + from, + "State" + ) } } diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonProxyServer.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonProxyServer.scala index c904e436bcd..2a1e212ac88 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonProxyServer.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonProxyServer.scala @@ -128,7 +128,10 @@ private class AmberProducer( dataHeader.payloadType match { case "State" => assert(root.getRowCount == 1) - outputPort.sendTo(to, StateFrame(State(Some(ArrowUtils.getTexeraTuple(0, root))))) + outputPort.sendTo( + to, + StateFrame(State.deserialize(ArrowUtils.getTexeraTuple(0, root))) + ) case "ECM" => assert(root.getRowCount == 1) outputPort.sendTo( diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala index a0c73b6506d..a384f383e1f 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala @@ -21,6 +21,7 @@ package org.apache.texera.amber.engine.architecture.scheduling import org.apache.pekko.pattern.gracefulStop import com.twitter.util.{Duration => TwitterDuration, Future, JavaTimer, Return, Throw, Timer} +import org.apache.texera.amber.core.state.State import org.apache.texera.amber.core.storage.DocumentFactory import 
org.apache.texera.amber.core.storage.VFSURIFactory.decodeURI import org.apache.texera.amber.core.virtualidentity.ActorVirtualIdentity @@ -181,6 +182,8 @@ class RegionExecutionCoordinator( val actorRef = actorRefService.getActorRef(workerId) // Remove the actorRef so that no other actors can find the worker and send messages. actorRefService.removeActorRef(workerId) + asyncRPCClient.inputGateway.removeControlChannel(workerId) + asyncRPCClient.outputGateway.removeControlChannel(workerId) gracefulStop(actorRef, ScalaDuration(5, TimeUnit.SECONDS)).asTwitter() } }.toSeq @@ -209,14 +212,15 @@ class RegionExecutionCoordinator( regionExecution: RegionExecution, attempt: Int = 1 ): Future[Unit] = { - terminateWorkers(regionExecution).rescue { case err => - logger.warn( - s"Failed to terminate region ${region.id.id} on attempt $attempt. Retrying in ${killRetryDelay.inMilliseconds} ms.", - err - ) - Future - .sleep(killRetryDelay)(killRetryTimer) - .flatMap(_ => terminateWorkersWithRetry(regionExecution, attempt + 1)) + terminateWorkers(regionExecution).rescue { + case err => + logger.warn( + s"Failed to terminate region ${region.id.id} on attempt $attempt. 
Retrying in ${killRetryDelay.inMilliseconds} ms.", + err + ) + Future + .sleep(killRetryDelay)(killRetryTimer) + .flatMap(_ => terminateWorkersWithRetry(regionExecution, attempt + 1)) } } @@ -563,12 +567,30 @@ class RegionExecutionCoordinator( portConfigs.foreach { case (outputPortId, portConfig) => val storageUriToAdd = portConfig.storageURI + val stateUriToAdd = State.stateUriFromResultUri(storageUriToAdd) val (_, eid, _, _) = decodeURI(storageUriToAdd) val schemaOptional = region.getOperator(outputPortId.opId).outputPorts(outputPortId.portId)._3 val schema = schemaOptional.getOrElse(throw new IllegalStateException("Schema is missing")) - DocumentFactory.createDocument(storageUriToAdd, schema) + if (region.getOperators.exists(_.id.logicalOpId.id.startsWith("LoopEnd-operator-"))) { + try { + DocumentFactory.openDocument(storageUriToAdd) + } catch { + case _: Exception => + DocumentFactory.createDocument(storageUriToAdd, schema) + } + try { + DocumentFactory.openDocument(stateUriToAdd) + } catch { + case _: Exception => + DocumentFactory.createDocument(stateUriToAdd, State.schema) + } + } else { + DocumentFactory.createDocument(storageUriToAdd, schema) + DocumentFactory.createDocument(stateUriToAdd, State.schema) + } + WorkflowExecutionsResource.insertOperatorPortResultUri( eid = eid, globalPortId = outputPortId, diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/worker/managers/InputPortMaterializationReaderThread.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/worker/managers/InputPortMaterializationReaderThread.scala index 10fbbc44a2c..acada743bc6 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/worker/managers/InputPortMaterializationReaderThread.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/worker/managers/InputPortMaterializationReaderThread.scala @@ -21,6 +21,7 @@ package org.apache.texera.amber.engine.architecture.worker.managers import 
io.grpc.MethodDescriptor import org.apache.texera.amber.config.ApplicationConfig +import org.apache.texera.amber.core.state.State import org.apache.texera.amber.core.storage.DocumentFactory import org.apache.texera.amber.core.storage.model.VirtualDocument import org.apache.texera.amber.core.tuple.Tuple @@ -45,7 +46,11 @@ import org.apache.texera.amber.engine.architecture.worker.WorkflowWorker.{ DPInputQueueElement, FIFOMessageElement } -import org.apache.texera.amber.engine.common.ambermessage.{DataFrame, WorkflowFIFOMessage} +import org.apache.texera.amber.engine.common.ambermessage.{ + DataFrame, + StateFrame, + WorkflowFIFOMessage +} import org.apache.texera.amber.util.VirtualIdentityUtils.getFromActorIdForInputPortStorage import java.net.URI @@ -106,6 +111,25 @@ class InputPortMaterializationReaderThread( } // Flush any remaining tuples in the buffer. if (buffer.nonEmpty) flush() + + try { + val state_document = + DocumentFactory + .openDocument(State.stateUriFromResultUri(uri)) + ._1 + .asInstanceOf[VirtualDocument[Tuple]] + val stateReadIterator = state_document.get() + + while (stateReadIterator.hasNext) { + val state = State.deserialize(stateReadIterator.next()) + inputMessageQueue.put( + FIFOMessageElement(WorkflowFIFOMessage(channelId, getSequenceNumber, StateFrame(state))) + ) + } + } catch { + case _: Exception => + } + emitECM(METHOD_END_CHANNEL, PORT_ALIGNMENT) isFinished.set(true) } catch { diff --git a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/executor/OperatorExecutor.scala b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/executor/OperatorExecutor.scala index f99739acc04..9837213abbb 100644 --- a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/executor/OperatorExecutor.scala +++ b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/executor/OperatorExecutor.scala @@ -29,13 +29,7 @@ trait OperatorExecutor { def produceStateOnStart(port: Int): Option[State] = None - def 
processState(state: State, port: Int): Option[State] = { - if (state.isPassToAllDownstream) { - Some(state) - } else { - None - } - } + def processState(state: State, port: Int): Option[State] = Some(state) def processTupleMultiPort( tuple: Tuple, diff --git a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/State.scala b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/State.scala index 3226c9d2fe7..f76a314b7ae 100644 --- a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/State.scala +++ b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/State.scala @@ -19,39 +19,70 @@ package org.apache.texera.amber.core.state +import com.fasterxml.jackson.databind.JsonNode import org.apache.texera.amber.core.tuple.{Attribute, AttributeType, Schema, Tuple} +import org.apache.texera.amber.util.JSONUtils.objectMapper -import scala.collection.mutable +import java.net.URI +import java.util.Base64 +import scala.jdk.CollectionConverters.IteratorHasAsScala -final case class State(tuple: Option[Tuple] = None, passToAllDownstream: Boolean = false) { - val data: mutable.Map[String, (AttributeType, Any)] = mutable.LinkedHashMap() - add("passToAllDownstream", passToAllDownstream, AttributeType.BOOLEAN) - if (tuple.isDefined) { - tuple.get.getSchema.getAttributes.foreach { attribute => - add(attribute.getName, tuple.get.getField(attribute.getName), attribute.getType) - } - } +object State { + private val StateContent = "content" + private val BytesTypeMarker = "__texera_type__" + private val BytesValue = "bytes" + private val PayloadMarker = "payload" - def add(key: String, value: Any, valueType: AttributeType): Unit = - data.put(key, (valueType, value)) + val schema: Schema = new Schema( + new Attribute(StateContent, AttributeType.STRING) + ) - def get(key: String): Any = data(key)._2 + def stateUriFromResultUri(resultUri: URI): URI = + new URI(resultUri.toString.replace("/result", "/state")) - def 
isPassToAllDownstream: Boolean = get("passToAllDownstream").asInstanceOf[Boolean] + def serialize(state: State): Tuple = { + val payloadJson = objectMapper.writeValueAsString(toJsonValue(state)) + Tuple.builder(schema).addSequentially(Array(payloadJson)).build() + } - def apply(key: String): Any = get(key) + def deserialize(tuple: Tuple): State = { + val payload = tuple.getField[String](StateContent) + objectMapper.readTree(payload).fields().asScala.map(entry => entry.getKey -> fromJsonValue(entry.getValue)).toMap + } - def toTuple: Tuple = - Tuple - .builder( - Schema(data.map { - case (name, (attrType, _)) => - new Attribute(name, attrType) - }.toList) - ) - .addSequentially(data.values.map(_._2).toArray) - .build() + private def toJsonValue(value: Any): Any = + value match { + case null => null + case bytes: Array[Byte] => + Map(BytesTypeMarker -> BytesValue, PayloadMarker -> Base64.getEncoder.encodeToString(bytes)) + case map: State => + map.iterator.map { case (k, v) => k -> toJsonValue(v) }.toMap + case iterable: Iterable[_] => + iterable.map(toJsonValue).toList + case other => other + } - override def toString: String = - data.map { case (key, (_, value)) => s"$key: $value" }.mkString(", ") + private def fromJsonValue(node: JsonNode): Any = { + if (node == null || node.isNull) { + null + } else if (node.isObject) { + val fields = node.fields().asScala.map(entry => entry.getKey -> entry.getValue).toMap + fields.get(BytesTypeMarker) match { + case Some(typeNode) if typeNode.isTextual && typeNode.asText() == BytesValue => + Base64.getDecoder.decode(fields(PayloadMarker).asText()) + case _ => + fields.view.mapValues(fromJsonValue).toMap + } + } else if (node.isArray) { + node.elements().asScala.map(fromJsonValue).toList + } else if (node.isBoolean) { + node.asBoolean() + } else if (node.isIntegralNumber) { + node.longValue() + } else if (node.isFloatingPointNumber) { + node.doubleValue() + } else { + node.asText() + } + } } diff --git 
a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/package.scala b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/package.scala new file mode 100644 index 00000000000..c110f9d814f --- /dev/null +++ b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/package.scala @@ -0,0 +1,24 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.texera.amber.core + +package object state { + type State = Map[String, Any] +} diff --git a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/DocumentFactory.scala b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/DocumentFactory.scala index 15949ef4717..ae37def667e 100644 --- a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/DocumentFactory.scala +++ b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/DocumentFactory.scala @@ -72,6 +72,7 @@ object DocumentFactory { case RESULT => StorageConfig.icebergTableResultNamespace case CONSOLE_MESSAGES => StorageConfig.icebergTableConsoleMessagesNamespace case RUNTIME_STATISTICS => StorageConfig.icebergTableRuntimeStatisticsNamespace + case STATE => "state" case _ => throw new IllegalArgumentException(s"Resource type $resourceType is not supported") } @@ -119,6 +120,7 @@ object DocumentFactory { case RESULT => StorageConfig.icebergTableResultNamespace case CONSOLE_MESSAGES => StorageConfig.icebergTableConsoleMessagesNamespace case RUNTIME_STATISTICS => StorageConfig.icebergTableRuntimeStatisticsNamespace + case STATE => "state" case _ => throw new IllegalArgumentException(s"Resource type $resourceType is not supported") } diff --git a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/VFSURIFactory.scala b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/VFSURIFactory.scala index 3513ac5ecd8..990776a69f0 100644 --- a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/VFSURIFactory.scala +++ b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/VFSURIFactory.scala @@ -34,6 +34,7 @@ object VFSResourceType extends Enumeration { val RESULT: Value = Value("result") val RUNTIME_STATISTICS: Value = Value("runtimeStatistics") val CONSOLE_MESSAGES: Value = Value("consoleMessages") + val STATE: Value = Value("state") } object 
VFSURIFactory { From be81f7d231696bf486a1a5f23d39d3877d540a67 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Sun, 19 Apr 2026 01:32:04 -0700 Subject: [PATCH 003/152] fix: persist state in scala data processor --- .../texera/amber/engine/architecture/worker/DataProcessor.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/worker/DataProcessor.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/worker/DataProcessor.scala index 3aa5fa90a46..65c560ee594 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/worker/DataProcessor.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/worker/DataProcessor.scala @@ -126,6 +126,7 @@ class DataProcessor( val outputState = executor.processState(state, port) if (outputState.isDefined) { outputManager.emitState(outputState.get) + outputManager.saveStateToStorageIfNeeded(state) } } catch safely { case e => From b313e16f3cc7823b70b81016eb06a32c33e40951 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Sun, 19 Apr 2026 01:32:41 -0700 Subject: [PATCH 004/152] fix: update if operator state access --- .../org/apache/texera/amber/operator/ifStatement/IfOpExec.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/ifStatement/IfOpExec.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/ifStatement/IfOpExec.scala index 462bdd0969a..d2becc79a5b 100644 --- a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/ifStatement/IfOpExec.scala +++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/ifStatement/IfOpExec.scala @@ -34,7 +34,7 @@ class IfOpExec(descString: String) extends OperatorExecutor { //It can accept any value that can be converted to a boolean. For example, Int 1 will be converted to true. 
override def processState(state: State, port: Int): Option[State] = { outputPort = - if (state.get(desc.conditionName).asInstanceOf[Boolean]) PortIdentity(1) else PortIdentity() + if (state(desc.conditionName).asInstanceOf[Boolean]) PortIdentity(1) else PortIdentity() Some(state) } From 808a5e7ca2c6627db60bbf68920a5554fc2883f2 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Sun, 19 Apr 2026 15:51:47 -0700 Subject: [PATCH 005/152] fix fmt --- amber/src/main/python/core/models/operator.py | 32 ------------------- .../python/core/runnables/data_processor.py | 1 - .../main/python/core/runnables/main_loop.py | 29 ----------------- 3 files changed, 62 deletions(-) diff --git a/amber/src/main/python/core/models/operator.py b/amber/src/main/python/core/models/operator.py index 5b9672988aa..91c5b2cf27b 100644 --- a/amber/src/main/python/core/models/operator.py +++ b/amber/src/main/python/core/models/operator.py @@ -291,35 +291,3 @@ def process_table(self, table: Table, port: int) -> Iterator[Optional[TableLike] time, or None. 
""" yield - - -class LoopStartOperator(TableOperator): - @overrides.final - def process_state(self, state: State, port: int) -> Optional[State]: - if "LoopStartStateURI" in state: - state["loop_counter"] += 1 - return state - self.state.update(state) - return None - - @overrides.final - def produce_state_on_finish(self, port: int) -> State: - from pickle import dumps - - self.state["table"] = dumps(Table(self._TableOperator__table_data[port])) - return dict(self.state) - - -class LoopEndOperator(TableOperator): - @overrides.final - def process_table(self, table: Table, port: int) -> Iterator[Optional[TableLike]]: - yield table - - @abstractmethod - def condition(self) -> None: - pass - - def loop_start_id(self) -> str: - del self.state["table"] - del self.state["output"] - return self.state["LoopStartId"] diff --git a/amber/src/main/python/core/runnables/data_processor.py b/amber/src/main/python/core/runnables/data_processor.py index 815e85a6446..4399b1a3a2f 100644 --- a/amber/src/main/python/core/runnables/data_processor.py +++ b/amber/src/main/python/core/runnables/data_processor.py @@ -100,7 +100,6 @@ def process_state(self, state: State) -> None: self._context.worker_id, self._context.console_message_manager.print_buf, ): - self._switch_context() self._set_output_state(executor.process_state(state, port_id)) except Exception as err: diff --git a/amber/src/main/python/core/runnables/main_loop.py b/amber/src/main/python/core/runnables/main_loop.py index ece5cf8e102..e0104a755d8 100644 --- a/amber/src/main/python/core/runnables/main_loop.py +++ b/amber/src/main/python/core/runnables/main_loop.py @@ -69,7 +69,6 @@ EmbeddedControlMessage, AsyncRpcContext, ControlRequest, - IterationCompletedRequest, ) from proto.org.apache.texera.amber.engine.architecture.worker import ( WorkerState, @@ -96,29 +95,6 @@ def __init__( target=self.data_processor.run, daemon=True, name="data_processor_thread" ).start() - def _attach_loop_start_id(self, output_state: State) -> None: - if 
"LoopStartId" in output_state: - return - output_state["LoopStartId"] = self.context.worker_id.split("-", 1)[1].rsplit( - "-main-0", 1 - )[0] - output_state["LoopStartStateURI"] = state_uri_from_result_uri( - self.context.input_manager.get_input_state_result_uri() - ) - - def _next_iteration( - self, executor: LoopEndOperator, controller_interface - ) -> None: - controller_interface.iteration_completed( - IterationCompletedRequest(OperatorIdentity(executor.loop_start_id())) - ) - uri = executor.state["LoopStartStateURI"] - del executor.state["LoopStartStateURI"] - del executor.state["LoopStartId"] - writer = DocumentFactory.create_document(uri, STATE_SCHEMA).writer("0") - writer.put_one(serialize_state(executor.state)) - writer.close() - def complete(self) -> None: """ Complete the DataProcessor, marking state to COMPLETED, and notify the @@ -223,10 +199,6 @@ def process_input_state(self) -> None: output_state = self.context.state_processing_manager.get_output_state() self._switch_context() if output_state is not None: - if isinstance(self.context.executor_manager.executor, LoopEndOperator): - self.context.output_manager.reset_output_storage() - if isinstance(self.context.executor_manager.executor, LoopStartOperator): - self._attach_loop_start_id(output_state) for to, batch in self.context.output_manager.emit_state(output_state): self._output_queue.put( DataElement( @@ -236,7 +208,6 @@ def process_input_state(self) -> None: payload=batch, ) ) - self.context.output_manager.save_state_to_storage_if_needed(output_state) def process_tuple_with_udf(self) -> Iterator[Optional[Tuple]]: """ From 67c4e24f54a667183d2e009058dad3d03a9deca3 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Sun, 19 Apr 2026 15:59:01 -0700 Subject: [PATCH 006/152] fix fmt --- amber/src/main/python/core/runnables/main_loop.py | 10 +++------- .../WorkerExecutionCompletedHandler.scala | 6 +----- 2 files changed, 4 insertions(+), 12 deletions(-) diff --git 
a/amber/src/main/python/core/runnables/main_loop.py b/amber/src/main/python/core/runnables/main_loop.py index e0104a755d8..844ce95f653 100644 --- a/amber/src/main/python/core/runnables/main_loop.py +++ b/amber/src/main/python/core/runnables/main_loop.py @@ -102,15 +102,12 @@ def complete(self) -> None: """ # flush the buffered console prints self._check_and_report_console_messages(force_flush=True) - controller_interface = self._async_rpc_client.controller_stub() - executor = self.context.executor_manager.executor - if isinstance(executor, LoopEndOperator) and executor.condition(): - self._next_iteration(executor, controller_interface) - executor.close() + self.context.executor_manager.executor.close() # stop the data processing thread self.data_processor.stop() self.context.state_manager.transit_to(WorkerState.COMPLETED) self.context.statistics_manager.update_total_execution_time(time.time_ns()) + controller_interface = self._async_rpc_client.controller_stub() controller_interface.worker_execution_completed(EmptyRequest()) self.context.close() @@ -252,7 +249,6 @@ def _process_tuple(self, tuple_: Tuple) -> None: def _process_state(self, state_: State) -> None: self.context.state_processing_manager.current_input_state = state_ - self._switch_context() self.process_input_state() self._check_and_process_control() @@ -341,7 +337,7 @@ def _process_ecm(self, ecm_element: ECMElement): if ecm.ecm_type != EmbeddedControlMessageType.NO_ALIGNMENT: self.context.pause_manager.resume(PauseType.ECM_PAUSE) - self._switch_context() + if self.context.tuple_processing_manager.current_internal_marker: { StartChannel: self._process_start_channel, diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/WorkerExecutionCompletedHandler.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/WorkerExecutionCompletedHandler.scala index c3b3ddb234b..d54a22f26b9 100644 --- 
a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/WorkerExecutionCompletedHandler.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/WorkerExecutionCompletedHandler.scala @@ -61,11 +61,7 @@ trait WorkerExecutionCompletedHandler { .collect(Seq(statsRequest)) .flatMap(_ => { // if entire workflow is completed, clean up - val isWorkflowTerminal = - cp.workflowExecution.isCompleted && - !cp.workflowScheduler.hasPendingRegions && - !cp.workflowExecutionCoordinator.hasUnfinishedRegionCoordinators - if (isWorkflowTerminal) { + if (cp.workflowExecution.isCompleted) { // after query result come back: send completed event, cleanup ,and kill workflow sendToClient(ExecutionStateUpdate(cp.workflowExecution.getState)) cp.controllerTimerService.disableStatusUpdate() From 5cd9657696aa6a3482fbbdfb3196c56495cd8b23 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Sun, 19 Apr 2026 16:00:06 -0700 Subject: [PATCH 007/152] fix fmt --- amber/src/main/python/core/runnables/main_loop.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/amber/src/main/python/core/runnables/main_loop.py b/amber/src/main/python/core/runnables/main_loop.py index 844ce95f653..d73c655734f 100644 --- a/amber/src/main/python/core/runnables/main_loop.py +++ b/amber/src/main/python/core/runnables/main_loop.py @@ -38,15 +38,8 @@ ECMElement, InternalQueueElement, ) -from core.models.operator import LoopEndOperator, LoopStartOperator -from core.models.state import ( - State, - STATE_SCHEMA, - serialize_state, - state_uri_from_result_uri, -) +from core.models.state import State from core.runnables.data_processor import DataProcessor -from core.storage.document_factory import DocumentFactory from core.util import StoppableQueueBlockingRunnable, get_one_of from core.util.console_message.timestamp import current_time_in_local_timezone from core.util.customized_queue.queue_base import QueueElement @@ 
-55,7 +48,6 @@ PortIdentity, ChannelIdentity, EmbeddedControlMessageIdentity, - OperatorIdentity, ) from proto.org.apache.texera.amber.engine.architecture.rpc import ( ConsoleMessage, From 1173dd4599d6ad5a3ac4f30a5f2f4d1a515e66c9 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Sun, 19 Apr 2026 16:07:45 -0700 Subject: [PATCH 008/152] fix fmt --- .../runnables/input_port_materialization_reader_runnable.py | 1 - 1 file changed, 1 deletion(-) diff --git a/amber/src/main/python/core/storage/runnables/input_port_materialization_reader_runnable.py b/amber/src/main/python/core/storage/runnables/input_port_materialization_reader_runnable.py index 493ecf0a413..a600f878572 100644 --- a/amber/src/main/python/core/storage/runnables/input_port_materialization_reader_runnable.py +++ b/amber/src/main/python/core/storage/runnables/input_port_materialization_reader_runnable.py @@ -172,7 +172,6 @@ def run(self) -> None: tup.cast_to_schema(self.tuple_schema) for data_frame in self.tuple_to_batch_with_filter(tup): self.emit_payload(data_frame) - self.emit_ecm("EndChannel", EmbeddedControlMessageType.PORT_ALIGNMENT) self._finished = True except Exception as err: From 907dd10a4dbab580e7d00c43084b7d3c47e97ebd Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Sun, 19 Apr 2026 16:08:12 -0700 Subject: [PATCH 009/152] fix fmt --- .../engine/architecture/controller/WorkflowScheduler.scala | 2 -- 1 file changed, 2 deletions(-) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/WorkflowScheduler.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/WorkflowScheduler.scala index b1acb3c0650..9dcf3ad4bfc 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/WorkflowScheduler.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/WorkflowScheduler.scala @@ -54,6 +54,4 @@ class WorkflowScheduler( def getNextRegions: Set[Region] = if (!schedule.hasNext) Set() else 
schedule.next() - def hasPendingRegions: Boolean = schedule != null && schedule.hasNext - } From 42201cba499562faedd34a498ad6c1501f920928 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Sun, 19 Apr 2026 16:13:37 -0700 Subject: [PATCH 010/152] fix fmt --- .../controller/WorkflowScheduler.scala | 2 ++ .../WorkerExecutionCompletedHandler.scala | 6 +++++- .../RegionExecutionCoordinator.scala | 19 ++----------------- 3 files changed, 9 insertions(+), 18 deletions(-) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/WorkflowScheduler.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/WorkflowScheduler.scala index 9dcf3ad4bfc..b1acb3c0650 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/WorkflowScheduler.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/WorkflowScheduler.scala @@ -54,4 +54,6 @@ class WorkflowScheduler( def getNextRegions: Set[Region] = if (!schedule.hasNext) Set() else schedule.next() + def hasPendingRegions: Boolean = schedule != null && schedule.hasNext + } diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/WorkerExecutionCompletedHandler.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/WorkerExecutionCompletedHandler.scala index d54a22f26b9..c3b3ddb234b 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/WorkerExecutionCompletedHandler.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/WorkerExecutionCompletedHandler.scala @@ -61,7 +61,11 @@ trait WorkerExecutionCompletedHandler { .collect(Seq(statsRequest)) .flatMap(_ => { // if entire workflow is completed, clean up - if (cp.workflowExecution.isCompleted) { + val isWorkflowTerminal = + cp.workflowExecution.isCompleted && + 
!cp.workflowScheduler.hasPendingRegions && + !cp.workflowExecutionCoordinator.hasUnfinishedRegionCoordinators + if (isWorkflowTerminal) { // after query result come back: send completed event, cleanup ,and kill workflow sendToClient(ExecutionStateUpdate(cp.workflowExecution.getState)) cp.controllerTimerService.disableStatusUpdate() diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala index a384f383e1f..85c03081f61 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala @@ -573,23 +573,8 @@ class RegionExecutionCoordinator( region.getOperator(outputPortId.opId).outputPorts(outputPortId.portId)._3 val schema = schemaOptional.getOrElse(throw new IllegalStateException("Schema is missing")) - if (region.getOperators.exists(_.id.logicalOpId.id.startsWith("LoopEnd-operator-"))) { - try { - DocumentFactory.openDocument(storageUriToAdd) - } catch { - case _: Exception => - DocumentFactory.createDocument(storageUriToAdd, schema) - } - try { - DocumentFactory.openDocument(stateUriToAdd) - } catch { - case _: Exception => - DocumentFactory.createDocument(stateUriToAdd, State.schema) - } - } else { - DocumentFactory.createDocument(storageUriToAdd, schema) - DocumentFactory.createDocument(stateUriToAdd, State.schema) - } + DocumentFactory.createDocument(storageUriToAdd, schema) + DocumentFactory.createDocument(stateUriToAdd, State.schema) WorkflowExecutionsResource.insertOperatorPortResultUri( eid = eid, From 94b874c53b84391a60beea1b381d136735bd81ca Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Sun, 19 Apr 2026 16:14:27 -0700 Subject: [PATCH 011/152] fix fmt --- .../RegionExecutionCoordinator.scala | 19 
++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala index 85c03081f61..889c9c94eef 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala @@ -182,8 +182,6 @@ class RegionExecutionCoordinator( val actorRef = actorRefService.getActorRef(workerId) // Remove the actorRef so that no other actors can find the worker and send messages. actorRefService.removeActorRef(workerId) - asyncRPCClient.inputGateway.removeControlChannel(workerId) - asyncRPCClient.outputGateway.removeControlChannel(workerId) gracefulStop(actorRef, ScalaDuration(5, TimeUnit.SECONDS)).asTwitter() } }.toSeq @@ -212,15 +210,14 @@ class RegionExecutionCoordinator( regionExecution: RegionExecution, attempt: Int = 1 ): Future[Unit] = { - terminateWorkers(regionExecution).rescue { - case err => - logger.warn( - s"Failed to terminate region ${region.id.id} on attempt $attempt. Retrying in ${killRetryDelay.inMilliseconds} ms.", - err - ) - Future - .sleep(killRetryDelay)(killRetryTimer) - .flatMap(_ => terminateWorkersWithRetry(regionExecution, attempt + 1)) + terminateWorkers(regionExecution).rescue { case err => + logger.warn( + s"Failed to terminate region ${region.id.id} on attempt $attempt. 
Retrying in ${killRetryDelay.inMilliseconds} ms.", + err + ) + Future + .sleep(killRetryDelay)(killRetryTimer) + .flatMap(_ => terminateWorkersWithRetry(regionExecution, attempt + 1)) } } From 345fa41c782a8c3eff4327b7c7d6a2ebcad6f34a Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Sun, 19 Apr 2026 16:15:57 -0700 Subject: [PATCH 012/152] fix fmt --- .../architecture/pythonworker/PythonProxyClient.scala | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonProxyClient.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonProxyClient.scala index e53fccf8c02..cfdb6a82f86 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonProxyClient.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonProxyClient.scala @@ -126,11 +126,7 @@ class PythonProxyClient(portNumberPromise: Promise[Int], val actorId: ActorVirtu case DataFrame(frame) => writeArrowStream(mutable.Queue(ArraySeq.unsafeWrapArray(frame): _*), from, "Data") case StateFrame(state) => - writeArrowStream( - mutable.Queue(State.serialize(state)), - from, - "State" - ) + writeArrowStream(mutable.Queue(State.serialize(state)), from, "State") } } From b6d14ee05d7507aaff148d0dd4c54461a37113d0 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Sun, 19 Apr 2026 16:16:23 -0700 Subject: [PATCH 013/152] fix fmt --- .../engine/architecture/pythonworker/PythonProxyServer.scala | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonProxyServer.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonProxyServer.scala index 2a1e212ac88..463dc4b75a5 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonProxyServer.scala +++ 
b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonProxyServer.scala @@ -128,10 +128,7 @@ private class AmberProducer( dataHeader.payloadType match { case "State" => assert(root.getRowCount == 1) - outputPort.sendTo( - to, - StateFrame(State.deserialize(ArrowUtils.getTexeraTuple(0, root))) - ) + outputPort.sendTo(to, StateFrame(State.deserialize(ArrowUtils.getTexeraTuple(0, root)))) case "ECM" => assert(root.getRowCount == 1) outputPort.sendTo( From 92905d89ad9f55a8deb50377fba51d961503f911 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Sun, 19 Apr 2026 16:16:35 -0700 Subject: [PATCH 014/152] fix fmt --- .../architecture/scheduling/RegionExecutionCoordinator.scala | 1 - 1 file changed, 1 deletion(-) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala index 889c9c94eef..5be5d942e5c 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala @@ -572,7 +572,6 @@ class RegionExecutionCoordinator( schemaOptional.getOrElse(throw new IllegalStateException("Schema is missing")) DocumentFactory.createDocument(storageUriToAdd, schema) DocumentFactory.createDocument(stateUriToAdd, State.schema) - WorkflowExecutionsResource.insertOperatorPortResultUri( eid = eid, globalPortId = outputPortId, From fc8cdf83b081178ab1ea53c345410a099e1e6f38 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Sun, 19 Apr 2026 16:58:19 -0700 Subject: [PATCH 015/152] init --- amber/src/main/python/core/models/operator.py | 6 ++---- amber/src/main/python/core/models/state.py | 5 +---- .../texera/amber/core/executor/OperatorExecutor.scala | 8 +------- .../scala/org/apache/texera/amber/core/state/State.scala | 5 +---- 4 
files changed, 5 insertions(+), 19 deletions(-) diff --git a/amber/src/main/python/core/models/operator.py b/amber/src/main/python/core/models/operator.py index 79050839958..6c2b657d747 100644 --- a/amber/src/main/python/core/models/operator.py +++ b/amber/src/main/python/core/models/operator.py @@ -108,14 +108,12 @@ def close(self) -> None: def process_state(self, state: State, port: int) -> Optional[State]: """ Process an input State from the given link. - The default implementation is to pass the State to all downstream operators - if the State has pass_to_all_downstream set to True. + The default implementation is to pass the State to all downstream operators. :param state: State, a State from an input port to be processed. :param port: int, input port index of the current exhausted port. :return: State, producing one State object """ - if state.passToAllDownstream: - return state + return state def produce_state_on_start(self, port: int) -> State: """ diff --git a/amber/src/main/python/core/models/state.py b/amber/src/main/python/core/models/state.py index 2c8a268dfb7..feb35f2e274 100644 --- a/amber/src/main/python/core/models/state.py +++ b/amber/src/main/python/core/models/state.py @@ -26,11 +26,8 @@ @dataclass class State: - def __init__( - self, table: Optional[Table] = None, pass_to_all_downstream: bool = False - ): + def __init__(self, table: Optional[Table] = None): self.schema = Schema() - self.passToAllDownstream = pass_to_all_downstream if table is not None: self.__dict__.update(table.to_pandas().iloc[0].to_dict()) self.schema = Schema(table.schema) diff --git a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/executor/OperatorExecutor.scala b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/executor/OperatorExecutor.scala index f99739acc04..9837213abbb 100644 --- a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/executor/OperatorExecutor.scala +++ 
b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/executor/OperatorExecutor.scala @@ -29,13 +29,7 @@ trait OperatorExecutor { def produceStateOnStart(port: Int): Option[State] = None - def processState(state: State, port: Int): Option[State] = { - if (state.isPassToAllDownstream) { - Some(state) - } else { - None - } - } + def processState(state: State, port: Int): Option[State] = Some(state) def processTupleMultiPort( tuple: Tuple, diff --git a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/State.scala b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/State.scala index 3226c9d2fe7..2b3465473b7 100644 --- a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/State.scala +++ b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/State.scala @@ -23,9 +23,8 @@ import org.apache.texera.amber.core.tuple.{Attribute, AttributeType, Schema, Tup import scala.collection.mutable -final case class State(tuple: Option[Tuple] = None, passToAllDownstream: Boolean = false) { +final case class State(tuple: Option[Tuple] = None) { val data: mutable.Map[String, (AttributeType, Any)] = mutable.LinkedHashMap() - add("passToAllDownstream", passToAllDownstream, AttributeType.BOOLEAN) if (tuple.isDefined) { tuple.get.getSchema.getAttributes.foreach { attribute => add(attribute.getName, tuple.get.getField(attribute.getName), attribute.getType) @@ -37,8 +36,6 @@ final case class State(tuple: Option[Tuple] = None, passToAllDownstream: Boolean def get(key: String): Any = data(key)._2 - def isPassToAllDownstream: Boolean = get("passToAllDownstream").asInstanceOf[Boolean] - def apply(key: String): Any = get(key) def toTuple: Tuple = From 5c1d3696b8a730b9b827159a6b5219f392769791 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Sun, 19 Apr 2026 17:45:33 -0700 Subject: [PATCH 016/152] update --- amber/src/main/python/core/models/operator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) 
diff --git a/amber/src/main/python/core/models/operator.py b/amber/src/main/python/core/models/operator.py index 6c2b657d747..952e2a12c81 100644 --- a/amber/src/main/python/core/models/operator.py +++ b/amber/src/main/python/core/models/operator.py @@ -115,7 +115,7 @@ def process_state(self, state: State, port: int) -> Optional[State]: """ return state - def produce_state_on_start(self, port: int) -> State: + def produce_state_on_start(self, port: int) -> Optional[State]: """ Produce a State when the given link started. @@ -124,7 +124,7 @@ def produce_state_on_start(self, port: int) -> State: """ pass - def produce_state_on_finish(self, port: int) -> State: + def produce_state_on_finish(self, port: int) -> Optional[State]: """ Produce a State after the input port is exhausted. From 52eace6ecd4e55bd58caafa64313fe218faa4c29 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Mon, 20 Apr 2026 02:02:16 -0700 Subject: [PATCH 017/152] Apply xinyuan-loop-feb changes from 157156c onto main --- .../architecture/rpc/controlcommands.proto | 7 +- .../architecture/rpc/controllerservice.proto | 3 +- .../architecture/packaging/input_manager.py | 3 + .../architecture/packaging/output_manager.py | 38 +- amber/src/main/python/core/models/operator.py | 38 +- amber/src/main/python/core/models/state.py | 97 +- .../python/core/runnables/data_processor.py | 1 + .../main/python/core/runnables/main_loop.py | 49 +- .../python/core/runnables/network_receiver.py | 15 +- .../python/core/runnables/network_sender.py | 23 +- .../python/core/storage/document_factory.py | 107 +- .../core/storage/iceberg/iceberg_utils.py | 2 +- ...ut_port_materialization_reader_runnable.py | 30 +- .../python/core/storage/vfs_uri_factory.py | 1 + .../org/apache/texera/amber/core/__init__.py | 4 +- .../amber/engine/architecture/rpc/__init__.py | 1484 +++++++++-------- .../architecture/sendsemantics/__init__.py | 4 +- .../engine/architecture/worker/__init__.py | 4 +- .../texera/amber/engine/common/__init__.py | 42 +- 
amber/src/main/python/pytexera/__init__.py | 3 + .../architecture/common/AmberProcessor.scala | 4 +- ...ControllerAsyncRPCHandlerInitializer.scala | 1 + .../controller/ControllerProcessor.scala | 2 +- .../controller/WorkflowScheduler.scala | 4 +- .../execution/WorkflowExecution.scala | 1 + .../NextIterationHandler.scala | 47 + .../WorkerExecutionCompletedHandler.scala | 6 +- .../messaginglayer/NetworkInputGateway.scala | 4 + .../messaginglayer/NetworkOutputGateway.scala | 4 + .../messaginglayer/OutputManager.scala | 20 + .../pythonworker/PythonProxyClient.scala | 7 +- .../pythonworker/PythonProxyServer.scala | 5 +- .../RegionExecutionCoordinator.scala | 79 +- .../architecture/scheduling/Schedule.scala | 11 + .../WorkflowExecutionCoordinator.scala | 57 +- .../architecture/worker/DataProcessor.scala | 1 + ...InputPortMaterializationReaderThread.scala | 26 +- .../worker/promisehandlers/EndHandler.scala | 2 +- .../engine/common/rpc/AsyncRPCClient.scala | 8 +- .../workflow/WorkflowExecutionsResource.scala | 2 + .../core/executor/OperatorExecutor.scala | 8 +- .../texera/amber/core/state/State.scala | 83 +- .../texera/amber/core/state/package.scala | 24 + .../amber/core/storage/DocumentFactory.scala | 2 + .../amber/core/storage/VFSURIFactory.scala | 1 + .../result/iceberg/IcebergTableWriter.scala | 13 +- .../texera/amber/operator/LogicalOp.scala | 3 + .../amber/operator/ifStatement/IfOpExec.scala | 2 +- .../amber/operator/loop/LoopEndOpDesc.scala | 94 ++ .../amber/operator/loop/LoopStartOpDesc.scala | 88 + .../src/assets/operator_images/LoopEnd.png | Bin 0 -> 5865 bytes .../src/assets/operator_images/LoopStart.png | Bin 0 -> 2138 bytes 52 files changed, 1614 insertions(+), 950 deletions(-) create mode 100644 amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/NextIterationHandler.scala create mode 100644 common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/package.scala create mode 100644 
common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/loop/LoopEndOpDesc.scala create mode 100644 common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/loop/LoopStartOpDesc.scala create mode 100644 frontend/src/assets/operator_images/LoopEnd.png create mode 100644 frontend/src/assets/operator_images/LoopStart.png diff --git a/amber/src/main/protobuf/org/apache/texera/amber/engine/architecture/rpc/controlcommands.proto b/amber/src/main/protobuf/org/apache/texera/amber/engine/architecture/rpc/controlcommands.proto index d714f64a154..f5798af36d9 100644 --- a/amber/src/main/protobuf/org/apache/texera/amber/engine/architecture/rpc/controlcommands.proto +++ b/amber/src/main/protobuf/org/apache/texera/amber/engine/architecture/rpc/controlcommands.proto @@ -46,6 +46,7 @@ message ControlRequest { PortCompletedRequest portCompletedRequest = 9; WorkerStateUpdatedRequest workerStateUpdatedRequest = 10; LinkWorkersRequest linkWorkersRequest = 11; + NextIterationRequest nextIterationRequest = 12; // request for worker AddInputChannelRequest addInputChannelRequest = 50; @@ -278,4 +279,8 @@ enum StatisticsUpdateTarget { message QueryStatisticsRequest{ repeated core.ActorVirtualIdentity filterByWorkers = 1; StatisticsUpdateTarget updateTarget = 2; -} \ No newline at end of file +} + +message NextIterationRequest{ + core.OperatorIdentity LoopStartId = 1 [(scalapb.field).no_box = true]; +} diff --git a/amber/src/main/protobuf/org/apache/texera/amber/engine/architecture/rpc/controllerservice.proto b/amber/src/main/protobuf/org/apache/texera/amber/engine/architecture/rpc/controllerservice.proto index 70d189a3411..734966e1761 100644 --- a/amber/src/main/protobuf/org/apache/texera/amber/engine/architecture/rpc/controllerservice.proto +++ b/amber/src/main/protobuf/org/apache/texera/amber/engine/architecture/rpc/controllerservice.proto @@ -42,7 +42,8 @@ service ControllerService { rpc PauseWorkflow(EmptyRequest) returns (EmptyReturn); rpc 
WorkerStateUpdated(WorkerStateUpdatedRequest) returns (EmptyReturn); rpc WorkerExecutionCompleted(EmptyRequest) returns (EmptyReturn); + rpc NextIteration(NextIterationRequest) returns (EmptyReturn); rpc LinkWorkers(LinkWorkersRequest) returns (EmptyReturn); rpc ControllerInitiateQueryStatistics(QueryStatisticsRequest) returns (EmptyReturn); rpc RetryWorkflow(RetryWorkflowRequest) returns (EmptyReturn); -} \ No newline at end of file +} diff --git a/amber/src/main/python/core/architecture/packaging/input_manager.py b/amber/src/main/python/core/architecture/packaging/input_manager.py index 6cb6bdc08c4..af40423caec 100644 --- a/amber/src/main/python/core/architecture/packaging/input_manager.py +++ b/amber/src/main/python/core/architecture/packaging/input_manager.py @@ -173,3 +173,6 @@ def _process_data(self, table: Table) -> Iterator[Tuple]: yield Tuple( {name: field_accessor for name in table.column_names}, schema=schema ) + + def get_input_state_result_uri(self): + return next(iter(self._input_port_mat_reader_runnables.values()))[0].uri diff --git a/amber/src/main/python/core/architecture/packaging/output_manager.py b/amber/src/main/python/core/architecture/packaging/output_manager.py index bf4afbf396f..065b063f7d4 100644 --- a/amber/src/main/python/core/architecture/packaging/output_manager.py +++ b/amber/src/main/python/core/architecture/packaging/output_manager.py @@ -17,6 +17,7 @@ import threading import typing +import uuid from collections import OrderedDict from itertools import chain from loguru import logger @@ -43,7 +44,12 @@ ) from core.models import Tuple, Schema, StateFrame from core.models.payload import DataPayload, DataFrame -from core.models.state import State +from core.models.state import ( + State, + STATE_SCHEMA, + serialize_state, + state_uri_from_result_uri, +) from core.storage.document_factory import DocumentFactory from core.storage.runnables.port_storage_writer import ( PortStorageWriter, @@ -87,6 +93,8 @@ def __init__(self, worker_id: 
str): PortIdentity, typing.Tuple[Queue, PortStorageWriter, Thread] ] = dict() + self._storage_uris: typing.Dict[PortIdentity, str] = dict() + def is_missing_output_ports(self): """ This method is only used for ensuring correct region execution. @@ -126,6 +134,7 @@ def set_up_port_storage_writer(self, port_id: PortIdentity, storage_uri: str): Create a separate thread for saving output tuples of a port to storage in batch. """ + self._storage_uris[port_id] = storage_uri document, _ = DocumentFactory.open_document(storage_uri) buffered_item_writer = document.writer(str(get_worker_index(self.worker_id))) writer_queue = Queue() @@ -171,6 +180,31 @@ def save_tuple_to_storage_if_needed(self, tuple_: Tuple, port_id=None) -> None: PortStorageWriterElement(data_tuple=tuple_) ) + def save_state_to_storage_if_needed(self, state: State, port_id=None) -> None: + if port_id is None: + uris = self._storage_uris.values() + elif port_id in self._storage_uris: + uris = [self._storage_uris[port_id]] + else: + return + + for uri in uris: + state_uri = state_uri_from_result_uri(uri) + try: + document = DocumentFactory.open_document(state_uri)[0] + except ValueError: + document = DocumentFactory.create_document(state_uri, STATE_SCHEMA) + writer = document.writer(str(uuid.uuid4())) + writer.put_one(serialize_state(state)) + writer.close() + + def reset_output_storage(self) -> None: + port_id = self.get_port_ids()[0] + storage_uri = self._storage_uris[port_id] + self.close_port_storage_writers() + DocumentFactory.create_document(storage_uri, self._ports[port_id].get_schema()) + self.set_up_port_storage_writer(port_id, storage_uri) + def close_port_storage_writers(self) -> None: """ Flush the buffers of port storage writers and wait for all the @@ -248,7 +282,7 @@ def emit_state( receiver, ( StateFrame(payload) - if isinstance(payload, State) + if isinstance(payload, dict) else self.tuple_to_frame(payload) ), ) diff --git a/amber/src/main/python/core/models/operator.py 
b/amber/src/main/python/core/models/operator.py index 79050839958..5b9672988aa 100644 --- a/amber/src/main/python/core/models/operator.py +++ b/amber/src/main/python/core/models/operator.py @@ -108,14 +108,12 @@ def close(self) -> None: def process_state(self, state: State, port: int) -> Optional[State]: """ Process an input State from the given link. - The default implementation is to pass the State to all downstream operators - if the State has pass_to_all_downstream set to True. + The default implementation is to pass the State to downstream operators. :param state: State, a State from an input port to be processed. :param port: int, input port index of the current exhausted port. :return: State, producing one State object """ - if state.passToAllDownstream: - return state + return state def produce_state_on_start(self, port: int) -> State: """ @@ -293,3 +291,35 @@ def process_table(self, table: Table, port: int) -> Iterator[Optional[TableLike] time, or None. """ yield + + +class LoopStartOperator(TableOperator): + @overrides.final + def process_state(self, state: State, port: int) -> Optional[State]: + if "LoopStartStateURI" in state: + state["loop_counter"] += 1 + return state + self.state.update(state) + return None + + @overrides.final + def produce_state_on_finish(self, port: int) -> State: + from pickle import dumps + + self.state["table"] = dumps(Table(self._TableOperator__table_data[port])) + return dict(self.state) + + +class LoopEndOperator(TableOperator): + @overrides.final + def process_table(self, table: Table, port: int) -> Iterator[Optional[TableLike]]: + yield table + + @abstractmethod + def condition(self) -> None: + pass + + def loop_start_id(self) -> str: + del self.state["table"] + del self.state["output"] + return self.state["LoopStartId"] diff --git a/amber/src/main/python/core/models/state.py b/amber/src/main/python/core/models/state.py index 2c8a268dfb7..e5726cc3c2f 100644 --- a/amber/src/main/python/core/models/state.py +++ 
b/amber/src/main/python/core/models/state.py @@ -15,61 +15,64 @@ # specific language governing permissions and limitations # under the License. -from dataclasses import dataclass -from pandas import DataFrame -from pyarrow import Table -from typing import Optional +import base64 +import json +from typing import Any, Dict, TypeAlias -from .schema import Schema, AttributeType -from .schema.attribute_type import FROM_PYOBJECT_MAPPING +from .schema import Schema +from .tuple import Tuple +State: TypeAlias = Dict[str, Any] -@dataclass -class State: - def __init__( - self, table: Optional[Table] = None, pass_to_all_downstream: bool = False - ): - self.schema = Schema() - self.passToAllDownstream = pass_to_all_downstream - if table is not None: - self.__dict__.update(table.to_pandas().iloc[0].to_dict()) - self.schema = Schema(table.schema) +STATE_CONTENT = "content" +_TYPE_MARKER = "__texera_type__" +_PAYLOAD_MARKER = "payload" +_BYTES_TYPE = "bytes" - def add( - self, key: str, value: any, value_type: Optional[AttributeType] = None - ) -> None: - self.__dict__[key] = value - if value_type is not None: - self.schema.add(key, value_type) - elif key != "schema": - self.schema.add(key, FROM_PYOBJECT_MAPPING[type(value)]) +STATE_SCHEMA = Schema(raw_schema={STATE_CONTENT: "STRING"}) - def get(self, key: str) -> any: - return self.__dict__[key] - def to_table(self) -> Table: - return Table.from_pandas( - df=DataFrame([self.__dict__]), - schema=self.schema.as_arrow_schema(), - ) +def state_uri_from_result_uri(result_uri: str) -> str: + return result_uri.replace("/result", "/state") - def __setattr__(self, key: str, value: any) -> None: - self.add(key, value) - def __setitem__(self, key: str, value: any) -> None: - self.add(key, value) +def serialize_state(state: State) -> Tuple: + return Tuple( + { + STATE_CONTENT: json.dumps( + _to_json_value(state), separators=(",", ":") + ) + }, + schema=STATE_SCHEMA, + ) - def __getitem__(self, key: str) -> any: - return self.get(key) - def 
__str__(self) -> str: - content = ", ".join( - [ - repr(key) + ": " + repr(value) - for key, value in self.__dict__.items() - if key != "schema" - ] - ) - return f"State[{content}]" +def deserialize_state(row: Tuple) -> State: + return _from_json_value(json.loads(row[STATE_CONTENT])) - __repr__ = __str__ + +def _to_json_value(value: Any) -> Any: + if value is None or isinstance(value, (bool, int, float, str)): + return value + if isinstance(value, bytes): + return { + _TYPE_MARKER: _BYTES_TYPE, + _PAYLOAD_MARKER: base64.b64encode(value).decode("ascii"), + } + if isinstance(value, dict): + return {str(key): _to_json_value(inner) for key, inner in value.items()} + if isinstance(value, (list, tuple)): + return [_to_json_value(inner) for inner in value] + raise TypeError( + f"State value of type {type(value).__name__} is not JSON serializable" + ) + + +def _from_json_value(value: Any) -> Any: + if isinstance(value, list): + return [_from_json_value(inner) for inner in value] + if isinstance(value, dict): + if value.get(_TYPE_MARKER) == _BYTES_TYPE: + return base64.b64decode(value[_PAYLOAD_MARKER]) + return {key: _from_json_value(inner) for key, inner in value.items()} + return value diff --git a/amber/src/main/python/core/runnables/data_processor.py b/amber/src/main/python/core/runnables/data_processor.py index 4399b1a3a2f..815e85a6446 100644 --- a/amber/src/main/python/core/runnables/data_processor.py +++ b/amber/src/main/python/core/runnables/data_processor.py @@ -100,6 +100,7 @@ def process_state(self, state: State) -> None: self._context.worker_id, self._context.console_message_manager.print_buf, ): + self._switch_context() self._set_output_state(executor.process_state(state, port_id)) except Exception as err: diff --git a/amber/src/main/python/core/runnables/main_loop.py b/amber/src/main/python/core/runnables/main_loop.py index d73c655734f..6e9cff9c113 100644 --- a/amber/src/main/python/core/runnables/main_loop.py +++ 
b/amber/src/main/python/core/runnables/main_loop.py @@ -38,8 +38,15 @@ ECMElement, InternalQueueElement, ) -from core.models.state import State +from core.models.operator import LoopEndOperator, LoopStartOperator +from core.models.state import ( + State, + STATE_SCHEMA, + serialize_state, + state_uri_from_result_uri, +) from core.runnables.data_processor import DataProcessor +from core.storage.document_factory import DocumentFactory from core.util import StoppableQueueBlockingRunnable, get_one_of from core.util.console_message.timestamp import current_time_in_local_timezone from core.util.customized_queue.queue_base import QueueElement @@ -48,6 +55,7 @@ PortIdentity, ChannelIdentity, EmbeddedControlMessageIdentity, + OperatorIdentity, ) from proto.org.apache.texera.amber.engine.architecture.rpc import ( ConsoleMessage, @@ -61,6 +69,7 @@ EmbeddedControlMessage, AsyncRpcContext, ControlRequest, + NextIterationRequest, ) from proto.org.apache.texera.amber.engine.architecture.worker import ( WorkerState, @@ -87,6 +96,29 @@ def __init__( target=self.data_processor.run, daemon=True, name="data_processor_thread" ).start() + def _attach_loop_start_id(self, output_state: State) -> None: + if "LoopStartId" in output_state: + return + output_state["LoopStartId"] = self.context.worker_id.split("-", 1)[1].rsplit( + "-main-0", 1 + )[0] + output_state["LoopStartStateURI"] = state_uri_from_result_uri( + self.context.input_manager.get_input_state_result_uri() + ) + + def _next_iteration( + self, executor: LoopEndOperator, controller_interface + ) -> None: + controller_interface.next_iteration( + NextIterationRequest(OperatorIdentity(executor.loop_start_id())) + ) + uri = executor.state["LoopStartStateURI"] + del executor.state["LoopStartStateURI"] + del executor.state["LoopStartId"] + writer = DocumentFactory.create_document(uri, STATE_SCHEMA).writer("0") + writer.put_one(serialize_state(executor.state)) + writer.close() + def complete(self) -> None: """ Complete the DataProcessor, 
marking state to COMPLETED, and notify the @@ -94,12 +126,15 @@ def complete(self) -> None: """ # flush the buffered console prints self._check_and_report_console_messages(force_flush=True) - self.context.executor_manager.executor.close() + controller_interface = self._async_rpc_client.controller_stub() + executor = self.context.executor_manager.executor + if isinstance(executor, LoopEndOperator) and executor.condition(): + self._next_iteration(executor, controller_interface) + executor.close() # stop the data processing thread self.data_processor.stop() self.context.state_manager.transit_to(WorkerState.COMPLETED) self.context.statistics_manager.update_total_execution_time(time.time_ns()) - controller_interface = self._async_rpc_client.controller_stub() controller_interface.worker_execution_completed(EmptyRequest()) self.context.close() @@ -188,6 +223,10 @@ def process_input_state(self) -> None: output_state = self.context.state_processing_manager.get_output_state() self._switch_context() if output_state is not None: + if isinstance(self.context.executor_manager.executor, LoopEndOperator): + self.context.output_manager.reset_output_storage() + if isinstance(self.context.executor_manager.executor, LoopStartOperator): + self._attach_loop_start_id(output_state) for to, batch in self.context.output_manager.emit_state(output_state): self._output_queue.put( DataElement( @@ -197,6 +236,7 @@ def process_input_state(self) -> None: payload=batch, ) ) + self.context.output_manager.save_state_to_storage_if_needed(output_state) def process_tuple_with_udf(self) -> Iterator[Optional[Tuple]]: """ @@ -241,6 +281,7 @@ def _process_tuple(self, tuple_: Tuple) -> None: def _process_state(self, state_: State) -> None: self.context.state_processing_manager.current_input_state = state_ + self._switch_context() self.process_input_state() self._check_and_process_control() @@ -329,7 +370,7 @@ def _process_ecm(self, ecm_element: ECMElement): if ecm.ecm_type != 
EmbeddedControlMessageType.NO_ALIGNMENT: self.context.pause_manager.resume(PauseType.ECM_PAUSE) - + self._switch_context() if self.context.tuple_processing_manager.current_internal_marker: { StartChannel: self._process_start_channel, diff --git a/amber/src/main/python/core/runnables/network_receiver.py b/amber/src/main/python/core/runnables/network_receiver.py index fd42a8f589b..e1815b08f7d 100644 --- a/amber/src/main/python/core/runnables/network_receiver.py +++ b/amber/src/main/python/core/runnables/network_receiver.py @@ -32,6 +32,7 @@ ) from core.models import ( DataFrame, + Tuple, StateFrame, ) from core.models.internal_queue import ( @@ -40,8 +41,8 @@ InternalQueue, ECMElement, ) -from core.models.state import State from core.proxy import ProxyServer +from core.models.state import STATE_SCHEMA, deserialize_state from core.util import Stoppable, get_one_of from core.util.runnable.runnable import Runnable from proto.org.apache.texera.amber.engine.architecture.rpc import EmbeddedControlMessage @@ -96,7 +97,17 @@ def data_handler(command: bytes, table: Table) -> int: "Data", lambda _: DataFrame(table), "State", - lambda _: StateFrame(State(table)), + lambda _: StateFrame( + deserialize_state( + Tuple( + { + name: table[name][0].as_py() + for name in STATE_SCHEMA.get_attr_names() + }, + schema=STATE_SCHEMA, + ) + ) + ), "ECM", lambda _: EmbeddedControlMessage().parse(table["payload"][0].as_py()), ) diff --git a/amber/src/main/python/core/runnables/network_sender.py b/amber/src/main/python/core/runnables/network_sender.py index 9595433fb70..f1bd8659ee9 100644 --- a/amber/src/main/python/core/runnables/network_sender.py +++ b/amber/src/main/python/core/runnables/network_sender.py @@ -20,13 +20,18 @@ from overrides import overrides from typing import Optional -from core.models import DataPayload, InternalQueue, DataFrame, StateFrame, State +from core.models import DataPayload, InternalQueue, DataFrame, StateFrame from core.models.internal_queue import ( 
InternalQueueElement, DataElement, DCMElement, ECMElement, ) +from core.models.state import ( + STATE_CONTENT, + STATE_SCHEMA, + serialize_state, +) from core.proxy import ProxyClient from core.util import StoppableQueueBlockingRunnable from proto.org.apache.texera.amber.core import ChannelIdentity @@ -98,13 +103,15 @@ def _send_data(self, to: ChannelIdentity, data_payload: DataPayload) -> None: data_header = PythonDataHeader(tag=to, payload_type="Data") self._proxy_client.send_data(bytes(data_header), data_payload.frame) elif isinstance(data_payload, StateFrame): - data_header = PythonDataHeader( - tag=to, payload_type=data_payload.frame.__class__.__name__ - ) - table = ( - data_payload.frame.to_table() - if isinstance(data_payload.frame, State) - else None + data_header = PythonDataHeader(tag=to, payload_type="State") + serialized_state = serialize_state(data_payload.frame) + table = pa.Table.from_pydict( + { + STATE_CONTENT: [ + serialized_state[STATE_CONTENT] + ], + }, + schema=STATE_SCHEMA.as_arrow_schema(), ) self._proxy_client.send_data(bytes(data_header), table) else: diff --git a/amber/src/main/python/core/storage/document_factory.py b/amber/src/main/python/core/storage/document_factory.py index 9b686ab66b6..8a4d6fe3c5f 100644 --- a/amber/src/main/python/core/storage/document_factory.py +++ b/amber/src/main/python/core/storage/document_factory.py @@ -61,30 +61,35 @@ def create_document(uri: str, schema: Schema) -> VirtualDocument: if parsed_uri.scheme == VFSURIFactory.VFS_FILE_URI_SCHEME: _, _, _, resource_type = VFSURIFactory.decode_uri(uri) - if resource_type in {VFSResourceType.RESULT}: - storage_key = DocumentFactory.sanitize_uri_path(parsed_uri) - - # Convert Amber Schema to Iceberg Schema with LARGE_BINARY - # field name encoding - iceberg_schema = amber_schema_to_iceberg_schema(schema) - - create_table( - IcebergCatalogInstance.get_instance(), - StorageConfig.ICEBERG_TABLE_RESULT_NAMESPACE, - storage_key, - iceberg_schema, - override_if_exists=True, 
- ) - - return IcebergDocument[Tuple]( - StorageConfig.ICEBERG_TABLE_RESULT_NAMESPACE, - storage_key, - iceberg_schema, - amber_tuples_to_arrow_table, - arrow_table_to_amber_tuples, - ) - else: - raise ValueError(f"Resource type {resource_type} is not supported") + match resource_type: + case VFSResourceType.RESULT: + namespace = StorageConfig.ICEBERG_TABLE_RESULT_NAMESPACE + case VFSResourceType.STATE: + namespace = "state" + case _: + raise ValueError(f"Resource type {resource_type} is not supported") + + storage_key = DocumentFactory.sanitize_uri_path(parsed_uri) + # Convert Amber Schema to Iceberg Schema with LARGE_BINARY + # field name encoding + iceberg_schema = amber_schema_to_iceberg_schema(schema) + + create_table( + IcebergCatalogInstance.get_instance(), + namespace, + storage_key, + iceberg_schema, + override_if_exists=True, + ) + + return IcebergDocument[Tuple]( + namespace, + storage_key, + iceberg_schema, + amber_tuples_to_arrow_table, + arrow_table_to_amber_tuples, + ) + else: raise NotImplementedError( f"Unsupported URI scheme: {parsed_uri.scheme} for creating the document" @@ -96,30 +101,36 @@ def open_document(uri: str) -> typing.Tuple[VirtualDocument, Optional[Schema]]: if parsed_uri.scheme == "vfs": _, _, _, resource_type = VFSURIFactory.decode_uri(uri) - if resource_type in {VFSResourceType.RESULT}: - storage_key = DocumentFactory.sanitize_uri_path(parsed_uri) - - table = load_table_metadata( - IcebergCatalogInstance.get_instance(), - StorageConfig.ICEBERG_TABLE_RESULT_NAMESPACE, - storage_key, - ) - - if table is None: - raise ValueError("No storage is found for the given URI") - - amber_schema = Schema(table.schema().as_arrow()) - - document = IcebergDocument( - StorageConfig.ICEBERG_TABLE_RESULT_NAMESPACE, - storage_key, - table.schema(), - amber_tuples_to_arrow_table, - arrow_table_to_amber_tuples, - ) - return document, amber_schema - else: - raise ValueError(f"Resource type {resource_type} is not supported") + match resource_type: + case 
VFSResourceType.RESULT: + namespace = StorageConfig.ICEBERG_TABLE_RESULT_NAMESPACE + case VFSResourceType.STATE: + namespace = "state" + case _: + raise ValueError(f"Resource type {resource_type} is not supported") + + storage_key = DocumentFactory.sanitize_uri_path(parsed_uri) + + table = load_table_metadata( + IcebergCatalogInstance.get_instance(), + namespace, + storage_key, + ) + + if table is None: + raise ValueError("No storage is found for the given URI") + + amber_schema = Schema(table.schema().as_arrow()) + + document = IcebergDocument( + namespace, + storage_key, + table.schema(), + amber_tuples_to_arrow_table, + arrow_table_to_amber_tuples, + ) + return document, amber_schema + else: raise NotImplementedError( f"Unsupported URI scheme: {parsed_uri.scheme} for opening the document" diff --git a/amber/src/main/python/core/storage/iceberg/iceberg_utils.py b/amber/src/main/python/core/storage/iceberg/iceberg_utils.py index f973c72fe81..844ef3e00ff 100644 --- a/amber/src/main/python/core/storage/iceberg/iceberg_utils.py +++ b/amber/src/main/python/core/storage/iceberg/iceberg_utils.py @@ -148,7 +148,7 @@ def create_postgres_catalog( catalog_name, **{ "uri": f"postgresql+pg8000://{username}:{password}@{uri_without_scheme}", - "warehouse": f"file://{warehouse_path}", + "warehouse": warehouse_path, }, ) diff --git a/amber/src/main/python/core/storage/runnables/input_port_materialization_reader_runnable.py b/amber/src/main/python/core/storage/runnables/input_port_materialization_reader_runnable.py index e49c0316cc7..493ecf0a413 100644 --- a/amber/src/main/python/core/storage/runnables/input_port_materialization_reader_runnable.py +++ b/amber/src/main/python/core/storage/runnables/input_port_materialization_reader_runnable.py @@ -17,8 +17,8 @@ import typing from loguru import logger -from pyarrow import Table from typing import Union +from pyarrow import Table from core.architecture.sendsemantics.broad_cast_partitioner import ( BroadcastPartitioner, @@ -34,8 +34,9 
@@ from core.architecture.sendsemantics.round_robin_partitioner import ( RoundRobinPartitioner, ) -from core.models import Tuple, InternalQueue, DataFrame, DataPayload +from core.models import Tuple, InternalQueue, DataFrame, DataPayload, State, StateFrame from core.models.internal_queue import DataElement, ECMElement +from core.models.state import deserialize_state, state_uri_from_result_uri from core.storage.document_factory import DocumentFactory from core.util import Stoppable, get_one_of from core.util.runnable.runnable import Runnable @@ -125,6 +126,15 @@ def tuple_to_batch_with_filter(self, tuple_: Tuple) -> typing.Iterator[DataFrame if receiver == self.worker_actor_id: yield self.tuples_to_data_frame(tuples) + def emit_state_with_filter(self, state: State) -> typing.Iterator[StateFrame]: + for receiver, payload in self.partitioner.flush_state(state): + if receiver == self.worker_actor_id: + yield ( + StateFrame(payload) + if isinstance(payload, dict) + else self.tuples_to_data_frame(payload) + ) + def run(self) -> None: """ Main execution logic that reads tuples from the materialized storage and @@ -138,8 +148,21 @@ def run(self) -> None: self.uri ) self.emit_ecm("StartChannel", EmbeddedControlMessageType.NO_ALIGNMENT) - storage_iterator = self.materialization.get() + try: + state_document, _ = DocumentFactory.open_document( + state_uri_from_result_uri(self.uri) + ) + state_iterator = state_document.get() + for state in state_iterator: + for state_frame in self.emit_state_with_filter( + deserialize_state(state) + ): + self.emit_payload(state_frame) + except ValueError: + pass + + storage_iterator = self.materialization.get() # Iterate and process tuples. 
for tup in storage_iterator: if self._stopped: @@ -149,6 +172,7 @@ def run(self) -> None: tup.cast_to_schema(self.tuple_schema) for data_frame in self.tuple_to_batch_with_filter(tup): self.emit_payload(data_frame) + self.emit_ecm("EndChannel", EmbeddedControlMessageType.PORT_ALIGNMENT) self._finished = True except Exception as err: diff --git a/amber/src/main/python/core/storage/vfs_uri_factory.py b/amber/src/main/python/core/storage/vfs_uri_factory.py index de0c5db56ec..0e23e607055 100644 --- a/amber/src/main/python/core/storage/vfs_uri_factory.py +++ b/amber/src/main/python/core/storage/vfs_uri_factory.py @@ -34,6 +34,7 @@ class VFSResourceType(str, Enum): RESULT = "result" RUNTIME_STATISTICS = "runtimeStatistics" CONSOLE_MESSAGES = "consoleMessages" + STATE = "state" class VFSURIFactory: diff --git a/amber/src/main/python/proto/org/apache/texera/amber/core/__init__.py b/amber/src/main/python/proto/org/apache/texera/amber/core/__init__.py index 2d21638c263..d993a669eab 100644 --- a/amber/src/main/python/proto/org/apache/texera/amber/core/__init__.py +++ b/amber/src/main/python/proto/org/apache/texera/amber/core/__init__.py @@ -5,9 +5,7 @@ from dataclasses import dataclass from datetime import datetime -from typing import ( - List, -) +from typing import List import betterproto diff --git a/amber/src/main/python/proto/org/apache/texera/amber/engine/architecture/rpc/__init__.py b/amber/src/main/python/proto/org/apache/texera/amber/engine/architecture/rpc/__init__.py index b7522a696ae..910a583a438 100644 --- a/amber/src/main/python/proto/org/apache/texera/amber/engine/architecture/rpc/__init__.py +++ b/amber/src/main/python/proto/org/apache/texera/amber/engine/architecture/rpc/__init__.py @@ -23,7 +23,6 @@ worker as _worker__, ) - if TYPE_CHECKING: import grpclib.server from betterproto.grpc.grpclib_client import MetadataLike @@ -102,6 +101,9 @@ class ControlRequest(betterproto.Message): link_workers_request: "LinkWorkersRequest" = betterproto.message_field( 11, 
group="sealed_value" ) + next_iteration_request: "NextIterationRequest" = betterproto.message_field( + 12, group="sealed_value" + ) add_input_channel_request: "AddInputChannelRequest" = betterproto.message_field( 50, group="sealed_value" ) @@ -394,6 +396,11 @@ class QueryStatisticsRequest(betterproto.Message): update_target: "StatisticsUpdateTarget" = betterproto.enum_field(2) +@dataclass(eq=False, repr=False) +class NextIterationRequest(betterproto.Message): + loop_start_id: "___core__.OperatorIdentity" = betterproto.message_field(1) + + @dataclass(eq=False, repr=False) class ControlReturn(betterproto.Message): """The generic return message""" @@ -524,503 +531,522 @@ class WorkerMetricsResponse(betterproto.Message): metrics: "_worker__.WorkerMetrics" = betterproto.message_field(1) -class RpcTesterStub(betterproto.ServiceStub): - async def send_ping( +class ControllerServiceStub(betterproto.ServiceStub): + async def retrieve_workflow_state( self, - ping: "Ping", + empty_request: "EmptyRequest", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None - ) -> "IntResponse": + ) -> "RetrieveWorkflowStateResponse": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendPing", - ping, - IntResponse, + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/RetrieveWorkflowState", + empty_request, + RetrieveWorkflowStateResponse, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def send_pong( + async def propagate_embedded_control_message( self, - pong: "Pong", + propagate_embedded_control_message_request: "PropagateEmbeddedControlMessageRequest", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None - ) -> "IntResponse": + ) -> "PropagateEmbeddedControlMessageResponse": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendPong", - 
pong, - IntResponse, + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/PropagateEmbeddedControlMessage", + propagate_embedded_control_message_request, + PropagateEmbeddedControlMessageResponse, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def send_nested( + async def take_global_checkpoint( self, - nested: "Nested", + take_global_checkpoint_request: "TakeGlobalCheckpointRequest", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None - ) -> "StringResponse": + ) -> "TakeGlobalCheckpointResponse": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendNested", - nested, - StringResponse, + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/TakeGlobalCheckpoint", + take_global_checkpoint_request, + TakeGlobalCheckpointResponse, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def send_pass( + async def debug_command( self, - pass_: "Pass", + debug_command_request: "DebugCommandRequest", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None - ) -> "StringResponse": + ) -> "EmptyReturn": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendPass", - pass_, - StringResponse, + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/DebugCommand", + debug_command_request, + EmptyReturn, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def send_error_command( + async def evaluate_python_expression( self, - error_command: "ErrorCommand", + evaluate_python_expression_request: "EvaluatePythonExpressionRequest", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None - ) -> "StringResponse": + ) -> "EvaluatePythonExpressionResponse": return await self._unary_unary( - 
"/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendErrorCommand", - error_command, - StringResponse, + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/EvaluatePythonExpression", + evaluate_python_expression_request, + EvaluatePythonExpressionResponse, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def send_recursion( + async def console_message_triggered( self, - recursion: "Recursion", + console_message_triggered_request: "ConsoleMessageTriggeredRequest", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None - ) -> "StringResponse": + ) -> "EmptyReturn": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendRecursion", - recursion, - StringResponse, + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/ConsoleMessageTriggered", + console_message_triggered_request, + EmptyReturn, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def send_collect( + async def port_completed( self, - collect: "Collect", + port_completed_request: "PortCompletedRequest", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None - ) -> "StringResponse": + ) -> "EmptyReturn": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendCollect", - collect, - StringResponse, + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/PortCompleted", + port_completed_request, + EmptyReturn, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def send_generate_number( + async def start_workflow( self, - generate_number: "GenerateNumber", + empty_request: "EmptyRequest", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None - ) -> "IntResponse": + ) -> "StartWorkflowResponse": return await self._unary_unary( - 
"/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendGenerateNumber", - generate_number, - IntResponse, + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/StartWorkflow", + empty_request, + StartWorkflowResponse, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def send_multi_call( + async def resume_workflow( self, - multi_call: "MultiCall", + empty_request: "EmptyRequest", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None - ) -> "StringResponse": + ) -> "EmptyReturn": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendMultiCall", - multi_call, - StringResponse, + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/ResumeWorkflow", + empty_request, + EmptyReturn, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def send_chain( + async def pause_workflow( self, - chain: "Chain", + empty_request: "EmptyRequest", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None - ) -> "StringResponse": + ) -> "EmptyReturn": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendChain", - chain, - StringResponse, + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/PauseWorkflow", + empty_request, + EmptyReturn, timeout=timeout, deadline=deadline, metadata=metadata, ) - -class WorkerServiceStub(betterproto.ServiceStub): - async def add_input_channel( + async def worker_state_updated( self, - add_input_channel_request: "AddInputChannelRequest", + worker_state_updated_request: "WorkerStateUpdatedRequest", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None ) -> "EmptyReturn": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/AddInputChannel", - 
add_input_channel_request, + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/WorkerStateUpdated", + worker_state_updated_request, EmptyReturn, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def add_partitioning( + async def worker_execution_completed( self, - add_partitioning_request: "AddPartitioningRequest", + empty_request: "EmptyRequest", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None ) -> "EmptyReturn": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/AddPartitioning", - add_partitioning_request, + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/WorkerExecutionCompleted", + empty_request, EmptyReturn, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def assign_port( + async def next_iteration( self, - assign_port_request: "AssignPortRequest", + next_iteration_request: "NextIterationRequest", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None ) -> "EmptyReturn": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/AssignPort", - assign_port_request, + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/NextIteration", + next_iteration_request, EmptyReturn, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def finalize_checkpoint( + async def link_workers( self, - finalize_checkpoint_request: "FinalizeCheckpointRequest", + link_workers_request: "LinkWorkersRequest", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None - ) -> "FinalizeCheckpointResponse": + ) -> "EmptyReturn": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/FinalizeCheckpoint", - finalize_checkpoint_request, - FinalizeCheckpointResponse, + 
"/org.apache.texera.amber.engine.architecture.rpc.ControllerService/LinkWorkers", + link_workers_request, + EmptyReturn, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def flush_network_buffer( + async def controller_initiate_query_statistics( self, - empty_request: "EmptyRequest", + query_statistics_request: "QueryStatisticsRequest", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None ) -> "EmptyReturn": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/FlushNetworkBuffer", - empty_request, + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/ControllerInitiateQueryStatistics", + query_statistics_request, EmptyReturn, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def initialize_executor( + async def retry_workflow( self, - initialize_executor_request: "InitializeExecutorRequest", + retry_workflow_request: "RetryWorkflowRequest", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None ) -> "EmptyReturn": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/InitializeExecutor", - initialize_executor_request, + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/RetryWorkflow", + retry_workflow_request, EmptyReturn, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def open_executor( + +class RpcTesterStub(betterproto.ServiceStub): + async def send_ping( self, - empty_request: "EmptyRequest", + ping: "Ping", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None - ) -> "EmptyReturn": + ) -> "IntResponse": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/OpenExecutor", - empty_request, - EmptyReturn, + 
"/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendPing", + ping, + IntResponse, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def pause_worker( + async def send_pong( self, - empty_request: "EmptyRequest", + pong: "Pong", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None - ) -> "WorkerStateResponse": + ) -> "IntResponse": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/PauseWorker", - empty_request, - WorkerStateResponse, + "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendPong", + pong, + IntResponse, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def prepare_checkpoint( + async def send_nested( self, - prepare_checkpoint_request: "PrepareCheckpointRequest", + nested: "Nested", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None - ) -> "EmptyReturn": + ) -> "StringResponse": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/PrepareCheckpoint", - prepare_checkpoint_request, - EmptyReturn, + "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendNested", + nested, + StringResponse, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def query_statistics( + async def send_pass( self, - empty_request: "EmptyRequest", + pass_: "Pass", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None - ) -> "WorkerMetricsResponse": + ) -> "StringResponse": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/QueryStatistics", - empty_request, - WorkerMetricsResponse, + "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendPass", + pass_, + StringResponse, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def resume_worker( + async def 
send_error_command( self, - empty_request: "EmptyRequest", + error_command: "ErrorCommand", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None - ) -> "WorkerStateResponse": + ) -> "StringResponse": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/ResumeWorker", - empty_request, - WorkerStateResponse, + "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendErrorCommand", + error_command, + StringResponse, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def retrieve_state( + async def send_recursion( self, - empty_request: "EmptyRequest", + recursion: "Recursion", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None - ) -> "EmptyReturn": + ) -> "StringResponse": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/RetrieveState", - empty_request, - EmptyReturn, + "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendRecursion", + recursion, + StringResponse, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def retry_current_tuple( + async def send_collect( self, - empty_request: "EmptyRequest", + collect: "Collect", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None - ) -> "EmptyReturn": + ) -> "StringResponse": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/RetryCurrentTuple", - empty_request, - EmptyReturn, + "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendCollect", + collect, + StringResponse, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def start_worker( + async def send_generate_number( self, - empty_request: "EmptyRequest", + generate_number: "GenerateNumber", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, 
metadata: Optional["MetadataLike"] = None - ) -> "WorkerStateResponse": + ) -> "IntResponse": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/StartWorker", - empty_request, - WorkerStateResponse, + "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendGenerateNumber", + generate_number, + IntResponse, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def end_worker( + async def send_multi_call( self, - empty_request: "EmptyRequest", + multi_call: "MultiCall", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None - ) -> "EmptyReturn": + ) -> "StringResponse": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/EndWorker", - empty_request, - EmptyReturn, + "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendMultiCall", + multi_call, + StringResponse, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def start_channel( + async def send_chain( self, - empty_request: "EmptyRequest", + chain: "Chain", + *, + timeout: Optional[float] = None, + deadline: Optional["Deadline"] = None, + metadata: Optional["MetadataLike"] = None + ) -> "StringResponse": + return await self._unary_unary( + "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendChain", + chain, + StringResponse, + timeout=timeout, + deadline=deadline, + metadata=metadata, + ) + + +class WorkerServiceStub(betterproto.ServiceStub): + async def add_input_channel( + self, + add_input_channel_request: "AddInputChannelRequest", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None ) -> "EmptyReturn": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/StartChannel", - empty_request, + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/AddInputChannel", + add_input_channel_request, 
EmptyReturn, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def end_channel( + async def add_partitioning( self, - empty_request: "EmptyRequest", + add_partitioning_request: "AddPartitioningRequest", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None ) -> "EmptyReturn": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/EndChannel", - empty_request, + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/AddPartitioning", + add_partitioning_request, EmptyReturn, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def debug_command( + async def assign_port( self, - debug_command_request: "DebugCommandRequest", + assign_port_request: "AssignPortRequest", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None ) -> "EmptyReturn": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/DebugCommand", - debug_command_request, + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/AssignPort", + assign_port_request, EmptyReturn, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def evaluate_python_expression( + async def finalize_checkpoint( self, - evaluate_python_expression_request: "EvaluatePythonExpressionRequest", + finalize_checkpoint_request: "FinalizeCheckpointRequest", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None - ) -> "EvaluatedValue": + ) -> "FinalizeCheckpointResponse": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/EvaluatePythonExpression", - evaluate_python_expression_request, - EvaluatedValue, + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/FinalizeCheckpoint", + finalize_checkpoint_request, + FinalizeCheckpointResponse, timeout=timeout, 
deadline=deadline, metadata=metadata, ) - async def no_operation( + async def flush_network_buffer( self, empty_request: "EmptyRequest", *, @@ -1029,7 +1055,7 @@ async def no_operation( metadata: Optional["MetadataLike"] = None ) -> "EmptyReturn": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/NoOperation", + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/FlushNetworkBuffer", empty_request, EmptyReturn, timeout=timeout, @@ -1037,162 +1063,160 @@ async def no_operation( metadata=metadata, ) - -class ControllerServiceStub(betterproto.ServiceStub): - async def retrieve_workflow_state( + async def initialize_executor( self, - empty_request: "EmptyRequest", + initialize_executor_request: "InitializeExecutorRequest", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None - ) -> "RetrieveWorkflowStateResponse": + ) -> "EmptyReturn": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/RetrieveWorkflowState", - empty_request, - RetrieveWorkflowStateResponse, + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/InitializeExecutor", + initialize_executor_request, + EmptyReturn, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def propagate_embedded_control_message( + async def open_executor( self, - propagate_embedded_control_message_request: "PropagateEmbeddedControlMessageRequest", + empty_request: "EmptyRequest", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None - ) -> "PropagateEmbeddedControlMessageResponse": + ) -> "EmptyReturn": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/PropagateEmbeddedControlMessage", - propagate_embedded_control_message_request, - PropagateEmbeddedControlMessageResponse, + 
"/org.apache.texera.amber.engine.architecture.rpc.WorkerService/OpenExecutor", + empty_request, + EmptyReturn, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def take_global_checkpoint( + async def pause_worker( self, - take_global_checkpoint_request: "TakeGlobalCheckpointRequest", + empty_request: "EmptyRequest", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None - ) -> "TakeGlobalCheckpointResponse": + ) -> "WorkerStateResponse": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/TakeGlobalCheckpoint", - take_global_checkpoint_request, - TakeGlobalCheckpointResponse, + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/PauseWorker", + empty_request, + WorkerStateResponse, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def debug_command( + async def prepare_checkpoint( self, - debug_command_request: "DebugCommandRequest", + prepare_checkpoint_request: "PrepareCheckpointRequest", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None ) -> "EmptyReturn": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/DebugCommand", - debug_command_request, + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/PrepareCheckpoint", + prepare_checkpoint_request, EmptyReturn, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def evaluate_python_expression( + async def query_statistics( self, - evaluate_python_expression_request: "EvaluatePythonExpressionRequest", + empty_request: "EmptyRequest", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None - ) -> "EvaluatePythonExpressionResponse": + ) -> "WorkerMetricsResponse": return await self._unary_unary( - 
"/org.apache.texera.amber.engine.architecture.rpc.ControllerService/EvaluatePythonExpression", - evaluate_python_expression_request, - EvaluatePythonExpressionResponse, + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/QueryStatistics", + empty_request, + WorkerMetricsResponse, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def console_message_triggered( + async def resume_worker( self, - console_message_triggered_request: "ConsoleMessageTriggeredRequest", + empty_request: "EmptyRequest", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None - ) -> "EmptyReturn": + ) -> "WorkerStateResponse": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/ConsoleMessageTriggered", - console_message_triggered_request, - EmptyReturn, + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/ResumeWorker", + empty_request, + WorkerStateResponse, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def port_completed( + async def retrieve_state( self, - port_completed_request: "PortCompletedRequest", + empty_request: "EmptyRequest", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None ) -> "EmptyReturn": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/PortCompleted", - port_completed_request, + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/RetrieveState", + empty_request, EmptyReturn, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def start_workflow( + async def retry_current_tuple( self, empty_request: "EmptyRequest", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None - ) -> "StartWorkflowResponse": + ) -> "EmptyReturn": return await self._unary_unary( - 
"/org.apache.texera.amber.engine.architecture.rpc.ControllerService/StartWorkflow", + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/RetryCurrentTuple", empty_request, - StartWorkflowResponse, + EmptyReturn, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def resume_workflow( + async def start_worker( self, empty_request: "EmptyRequest", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None - ) -> "EmptyReturn": + ) -> "WorkerStateResponse": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/ResumeWorkflow", + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/StartWorker", empty_request, - EmptyReturn, + WorkerStateResponse, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def pause_workflow( + async def end_worker( self, empty_request: "EmptyRequest", *, @@ -1201,7 +1225,7 @@ async def pause_workflow( metadata: Optional["MetadataLike"] = None ) -> "EmptyReturn": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/PauseWorkflow", + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/EndWorker", empty_request, EmptyReturn, timeout=timeout, @@ -1209,24 +1233,24 @@ async def pause_workflow( metadata=metadata, ) - async def worker_state_updated( + async def start_channel( self, - worker_state_updated_request: "WorkerStateUpdatedRequest", + empty_request: "EmptyRequest", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None ) -> "EmptyReturn": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/WorkerStateUpdated", - worker_state_updated_request, + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/StartChannel", + empty_request, EmptyReturn, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def 
worker_execution_completed( + async def end_channel( self, empty_request: "EmptyRequest", *, @@ -1235,7 +1259,7 @@ async def worker_execution_completed( metadata: Optional["MetadataLike"] = None ) -> "EmptyReturn": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/WorkerExecutionCompleted", + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/EndChannel", empty_request, EmptyReturn, timeout=timeout, @@ -1243,51 +1267,51 @@ async def worker_execution_completed( metadata=metadata, ) - async def link_workers( + async def debug_command( self, - link_workers_request: "LinkWorkersRequest", + debug_command_request: "DebugCommandRequest", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None ) -> "EmptyReturn": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/LinkWorkers", - link_workers_request, + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/DebugCommand", + debug_command_request, EmptyReturn, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def controller_initiate_query_statistics( + async def evaluate_python_expression( self, - query_statistics_request: "QueryStatisticsRequest", + evaluate_python_expression_request: "EvaluatePythonExpressionRequest", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None - ) -> "EmptyReturn": + ) -> "EvaluatedValue": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/ControllerInitiateQueryStatistics", - query_statistics_request, - EmptyReturn, + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/EvaluatePythonExpression", + evaluate_python_expression_request, + EvaluatedValue, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def retry_workflow( + async def no_operation( self, - 
retry_workflow_request: "RetryWorkflowRequest", + empty_request: "EmptyRequest", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None ) -> "EmptyReturn": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/RetryWorkflow", - retry_workflow_request, + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/NoOperation", + empty_request, EmptyReturn, timeout=timeout, deadline=deadline, @@ -1295,806 +1319,824 @@ async def retry_workflow( ) -class RpcTesterBase(ServiceBase): - - async def send_ping(self, ping: "Ping") -> "IntResponse": - raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) +class ControllerServiceBase(ServiceBase): - async def send_pong(self, pong: "Pong") -> "IntResponse": + async def retrieve_workflow_state( + self, empty_request: "EmptyRequest" + ) -> "RetrieveWorkflowStateResponse": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def send_nested(self, nested: "Nested") -> "StringResponse": + async def propagate_embedded_control_message( + self, + propagate_embedded_control_message_request: "PropagateEmbeddedControlMessageRequest", + ) -> "PropagateEmbeddedControlMessageResponse": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def send_pass(self, pass_: "Pass") -> "StringResponse": + async def take_global_checkpoint( + self, take_global_checkpoint_request: "TakeGlobalCheckpointRequest" + ) -> "TakeGlobalCheckpointResponse": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def send_error_command( - self, error_command: "ErrorCommand" - ) -> "StringResponse": + async def debug_command( + self, debug_command_request: "DebugCommandRequest" + ) -> "EmptyReturn": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def send_recursion(self, recursion: "Recursion") -> "StringResponse": + async def evaluate_python_expression( + self, 
evaluate_python_expression_request: "EvaluatePythonExpressionRequest" + ) -> "EvaluatePythonExpressionResponse": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def send_collect(self, collect: "Collect") -> "StringResponse": + async def console_message_triggered( + self, console_message_triggered_request: "ConsoleMessageTriggeredRequest" + ) -> "EmptyReturn": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def send_generate_number( - self, generate_number: "GenerateNumber" - ) -> "IntResponse": + async def port_completed( + self, port_completed_request: "PortCompletedRequest" + ) -> "EmptyReturn": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def send_multi_call(self, multi_call: "MultiCall") -> "StringResponse": + async def start_workflow( + self, empty_request: "EmptyRequest" + ) -> "StartWorkflowResponse": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def send_chain(self, chain: "Chain") -> "StringResponse": + async def resume_workflow(self, empty_request: "EmptyRequest") -> "EmptyReturn": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def __rpc_send_ping( - self, stream: "grpclib.server.Stream[Ping, IntResponse]" + async def pause_workflow(self, empty_request: "EmptyRequest") -> "EmptyReturn": + raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) + + async def worker_state_updated( + self, worker_state_updated_request: "WorkerStateUpdatedRequest" + ) -> "EmptyReturn": + raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) + + async def worker_execution_completed( + self, empty_request: "EmptyRequest" + ) -> "EmptyReturn": + raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) + + async def next_iteration( + self, next_iteration_request: "NextIterationRequest" + ) -> "EmptyReturn": + raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) + + async def link_workers( + self, link_workers_request: "LinkWorkersRequest" + ) -> 
"EmptyReturn": + raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) + + async def controller_initiate_query_statistics( + self, query_statistics_request: "QueryStatisticsRequest" + ) -> "EmptyReturn": + raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) + + async def retry_workflow( + self, retry_workflow_request: "RetryWorkflowRequest" + ) -> "EmptyReturn": + raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) + + async def __rpc_retrieve_workflow_state( + self, + stream: "grpclib.server.Stream[EmptyRequest, RetrieveWorkflowStateResponse]", ) -> None: request = await stream.recv_message() - response = await self.send_ping(request) + response = await self.retrieve_workflow_state(request) await stream.send_message(response) - async def __rpc_send_pong( - self, stream: "grpclib.server.Stream[Pong, IntResponse]" + async def __rpc_propagate_embedded_control_message( + self, + stream: "grpclib.server.Stream[PropagateEmbeddedControlMessageRequest, PropagateEmbeddedControlMessageResponse]", ) -> None: request = await stream.recv_message() - response = await self.send_pong(request) + response = await self.propagate_embedded_control_message(request) await stream.send_message(response) - async def __rpc_send_nested( - self, stream: "grpclib.server.Stream[Nested, StringResponse]" + async def __rpc_take_global_checkpoint( + self, + stream: "grpclib.server.Stream[TakeGlobalCheckpointRequest, TakeGlobalCheckpointResponse]", ) -> None: request = await stream.recv_message() - response = await self.send_nested(request) + response = await self.take_global_checkpoint(request) await stream.send_message(response) - async def __rpc_send_pass( - self, stream: "grpclib.server.Stream[Pass, StringResponse]" + async def __rpc_debug_command( + self, stream: "grpclib.server.Stream[DebugCommandRequest, EmptyReturn]" ) -> None: request = await stream.recv_message() - response = await self.send_pass(request) + response = await self.debug_command(request) await 
stream.send_message(response) - async def __rpc_send_error_command( - self, stream: "grpclib.server.Stream[ErrorCommand, StringResponse]" + async def __rpc_evaluate_python_expression( + self, + stream: "grpclib.server.Stream[EvaluatePythonExpressionRequest, EvaluatePythonExpressionResponse]", ) -> None: request = await stream.recv_message() - response = await self.send_error_command(request) + response = await self.evaluate_python_expression(request) await stream.send_message(response) - async def __rpc_send_recursion( - self, stream: "grpclib.server.Stream[Recursion, StringResponse]" + async def __rpc_console_message_triggered( + self, + stream: "grpclib.server.Stream[ConsoleMessageTriggeredRequest, EmptyReturn]", ) -> None: request = await stream.recv_message() - response = await self.send_recursion(request) + response = await self.console_message_triggered(request) await stream.send_message(response) - async def __rpc_send_collect( - self, stream: "grpclib.server.Stream[Collect, StringResponse]" + async def __rpc_port_completed( + self, stream: "grpclib.server.Stream[PortCompletedRequest, EmptyReturn]" ) -> None: request = await stream.recv_message() - response = await self.send_collect(request) + response = await self.port_completed(request) await stream.send_message(response) - async def __rpc_send_generate_number( - self, stream: "grpclib.server.Stream[GenerateNumber, IntResponse]" + async def __rpc_start_workflow( + self, stream: "grpclib.server.Stream[EmptyRequest, StartWorkflowResponse]" ) -> None: request = await stream.recv_message() - response = await self.send_generate_number(request) + response = await self.start_workflow(request) await stream.send_message(response) - async def __rpc_send_multi_call( - self, stream: "grpclib.server.Stream[MultiCall, StringResponse]" + async def __rpc_resume_workflow( + self, stream: "grpclib.server.Stream[EmptyRequest, EmptyReturn]" ) -> None: request = await stream.recv_message() - response = await 
self.send_multi_call(request) + response = await self.resume_workflow(request) await stream.send_message(response) - async def __rpc_send_chain( - self, stream: "grpclib.server.Stream[Chain, StringResponse]" + async def __rpc_pause_workflow( + self, stream: "grpclib.server.Stream[EmptyRequest, EmptyReturn]" ) -> None: request = await stream.recv_message() - response = await self.send_chain(request) + response = await self.pause_workflow(request) + await stream.send_message(response) + + async def __rpc_worker_state_updated( + self, stream: "grpclib.server.Stream[WorkerStateUpdatedRequest, EmptyReturn]" + ) -> None: + request = await stream.recv_message() + response = await self.worker_state_updated(request) + await stream.send_message(response) + + async def __rpc_worker_execution_completed( + self, stream: "grpclib.server.Stream[EmptyRequest, EmptyReturn]" + ) -> None: + request = await stream.recv_message() + response = await self.worker_execution_completed(request) + await stream.send_message(response) + + async def __rpc_next_iteration( + self, stream: "grpclib.server.Stream[NextIterationRequest, EmptyReturn]" + ) -> None: + request = await stream.recv_message() + response = await self.next_iteration(request) + await stream.send_message(response) + + async def __rpc_link_workers( + self, stream: "grpclib.server.Stream[LinkWorkersRequest, EmptyReturn]" + ) -> None: + request = await stream.recv_message() + response = await self.link_workers(request) + await stream.send_message(response) + + async def __rpc_controller_initiate_query_statistics( + self, stream: "grpclib.server.Stream[QueryStatisticsRequest, EmptyReturn]" + ) -> None: + request = await stream.recv_message() + response = await self.controller_initiate_query_statistics(request) + await stream.send_message(response) + + async def __rpc_retry_workflow( + self, stream: "grpclib.server.Stream[RetryWorkflowRequest, EmptyReturn]" + ) -> None: + request = await stream.recv_message() + response = await 
self.retry_workflow(request) await stream.send_message(response) def __mapping__(self) -> Dict[str, grpclib.const.Handler]: return { - "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendPing": grpclib.const.Handler( - self.__rpc_send_ping, + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/RetrieveWorkflowState": grpclib.const.Handler( + self.__rpc_retrieve_workflow_state, grpclib.const.Cardinality.UNARY_UNARY, - Ping, - IntResponse, + EmptyRequest, + RetrieveWorkflowStateResponse, ), - "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendPong": grpclib.const.Handler( - self.__rpc_send_pong, + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/PropagateEmbeddedControlMessage": grpclib.const.Handler( + self.__rpc_propagate_embedded_control_message, grpclib.const.Cardinality.UNARY_UNARY, - Pong, - IntResponse, + PropagateEmbeddedControlMessageRequest, + PropagateEmbeddedControlMessageResponse, ), - "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendNested": grpclib.const.Handler( - self.__rpc_send_nested, + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/TakeGlobalCheckpoint": grpclib.const.Handler( + self.__rpc_take_global_checkpoint, grpclib.const.Cardinality.UNARY_UNARY, - Nested, - StringResponse, + TakeGlobalCheckpointRequest, + TakeGlobalCheckpointResponse, ), - "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendPass": grpclib.const.Handler( - self.__rpc_send_pass, + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/DebugCommand": grpclib.const.Handler( + self.__rpc_debug_command, grpclib.const.Cardinality.UNARY_UNARY, - Pass, - StringResponse, + DebugCommandRequest, + EmptyReturn, ), - "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendErrorCommand": grpclib.const.Handler( - self.__rpc_send_error_command, + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/EvaluatePythonExpression": grpclib.const.Handler( + 
self.__rpc_evaluate_python_expression, grpclib.const.Cardinality.UNARY_UNARY, - ErrorCommand, - StringResponse, + EvaluatePythonExpressionRequest, + EvaluatePythonExpressionResponse, ), - "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendRecursion": grpclib.const.Handler( - self.__rpc_send_recursion, + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/ConsoleMessageTriggered": grpclib.const.Handler( + self.__rpc_console_message_triggered, grpclib.const.Cardinality.UNARY_UNARY, - Recursion, - StringResponse, + ConsoleMessageTriggeredRequest, + EmptyReturn, ), - "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendCollect": grpclib.const.Handler( - self.__rpc_send_collect, + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/PortCompleted": grpclib.const.Handler( + self.__rpc_port_completed, grpclib.const.Cardinality.UNARY_UNARY, - Collect, - StringResponse, + PortCompletedRequest, + EmptyReturn, ), - "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendGenerateNumber": grpclib.const.Handler( - self.__rpc_send_generate_number, + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/StartWorkflow": grpclib.const.Handler( + self.__rpc_start_workflow, grpclib.const.Cardinality.UNARY_UNARY, - GenerateNumber, - IntResponse, + EmptyRequest, + StartWorkflowResponse, ), - "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendMultiCall": grpclib.const.Handler( - self.__rpc_send_multi_call, + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/ResumeWorkflow": grpclib.const.Handler( + self.__rpc_resume_workflow, grpclib.const.Cardinality.UNARY_UNARY, - MultiCall, - StringResponse, + EmptyRequest, + EmptyReturn, ), - "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendChain": grpclib.const.Handler( - self.__rpc_send_chain, + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/PauseWorkflow": grpclib.const.Handler( + 
self.__rpc_pause_workflow, grpclib.const.Cardinality.UNARY_UNARY, - Chain, - StringResponse, + EmptyRequest, + EmptyReturn, + ), + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/WorkerStateUpdated": grpclib.const.Handler( + self.__rpc_worker_state_updated, + grpclib.const.Cardinality.UNARY_UNARY, + WorkerStateUpdatedRequest, + EmptyReturn, + ), + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/WorkerExecutionCompleted": grpclib.const.Handler( + self.__rpc_worker_execution_completed, + grpclib.const.Cardinality.UNARY_UNARY, + EmptyRequest, + EmptyReturn, + ), + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/NextIteration": grpclib.const.Handler( + self.__rpc_next_iteration, + grpclib.const.Cardinality.UNARY_UNARY, + NextIterationRequest, + EmptyReturn, + ), + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/LinkWorkers": grpclib.const.Handler( + self.__rpc_link_workers, + grpclib.const.Cardinality.UNARY_UNARY, + LinkWorkersRequest, + EmptyReturn, + ), + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/ControllerInitiateQueryStatistics": grpclib.const.Handler( + self.__rpc_controller_initiate_query_statistics, + grpclib.const.Cardinality.UNARY_UNARY, + QueryStatisticsRequest, + EmptyReturn, + ), + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/RetryWorkflow": grpclib.const.Handler( + self.__rpc_retry_workflow, + grpclib.const.Cardinality.UNARY_UNARY, + RetryWorkflowRequest, + EmptyReturn, ), } -class WorkerServiceBase(ServiceBase): - - async def add_input_channel( - self, add_input_channel_request: "AddInputChannelRequest" - ) -> "EmptyReturn": - raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - - async def add_partitioning( - self, add_partitioning_request: "AddPartitioningRequest" - ) -> "EmptyReturn": - raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - - async def assign_port( - self, assign_port_request: 
"AssignPortRequest" - ) -> "EmptyReturn": - raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - - async def finalize_checkpoint( - self, finalize_checkpoint_request: "FinalizeCheckpointRequest" - ) -> "FinalizeCheckpointResponse": - raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - - async def flush_network_buffer( - self, empty_request: "EmptyRequest" - ) -> "EmptyReturn": - raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - - async def initialize_executor( - self, initialize_executor_request: "InitializeExecutorRequest" - ) -> "EmptyReturn": - raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - - async def open_executor(self, empty_request: "EmptyRequest") -> "EmptyReturn": - raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - - async def pause_worker( - self, empty_request: "EmptyRequest" - ) -> "WorkerStateResponse": - raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - - async def prepare_checkpoint( - self, prepare_checkpoint_request: "PrepareCheckpointRequest" - ) -> "EmptyReturn": - raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - - async def query_statistics( - self, empty_request: "EmptyRequest" - ) -> "WorkerMetricsResponse": - raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) +class RpcTesterBase(ServiceBase): - async def resume_worker( - self, empty_request: "EmptyRequest" - ) -> "WorkerStateResponse": + async def send_ping(self, ping: "Ping") -> "IntResponse": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def retrieve_state(self, empty_request: "EmptyRequest") -> "EmptyReturn": + async def send_pong(self, pong: "Pong") -> "IntResponse": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def retry_current_tuple(self, empty_request: "EmptyRequest") -> "EmptyReturn": + async def send_nested(self, nested: "Nested") -> "StringResponse": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def start_worker( - self, 
empty_request: "EmptyRequest" - ) -> "WorkerStateResponse": + async def send_pass(self, pass_: "Pass") -> "StringResponse": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def end_worker(self, empty_request: "EmptyRequest") -> "EmptyReturn": + async def send_error_command( + self, error_command: "ErrorCommand" + ) -> "StringResponse": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def start_channel(self, empty_request: "EmptyRequest") -> "EmptyReturn": + async def send_recursion(self, recursion: "Recursion") -> "StringResponse": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def end_channel(self, empty_request: "EmptyRequest") -> "EmptyReturn": + async def send_collect(self, collect: "Collect") -> "StringResponse": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def debug_command( - self, debug_command_request: "DebugCommandRequest" - ) -> "EmptyReturn": + async def send_generate_number( + self, generate_number: "GenerateNumber" + ) -> "IntResponse": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def evaluate_python_expression( - self, evaluate_python_expression_request: "EvaluatePythonExpressionRequest" - ) -> "EvaluatedValue": + async def send_multi_call(self, multi_call: "MultiCall") -> "StringResponse": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def no_operation(self, empty_request: "EmptyRequest") -> "EmptyReturn": + async def send_chain(self, chain: "Chain") -> "StringResponse": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def __rpc_add_input_channel( - self, stream: "grpclib.server.Stream[AddInputChannelRequest, EmptyReturn]" - ) -> None: - request = await stream.recv_message() - response = await self.add_input_channel(request) - await stream.send_message(response) - - async def __rpc_add_partitioning( - self, stream: "grpclib.server.Stream[AddPartitioningRequest, EmptyReturn]" - ) -> None: - request = await 
stream.recv_message() - response = await self.add_partitioning(request) - await stream.send_message(response) - - async def __rpc_assign_port( - self, stream: "grpclib.server.Stream[AssignPortRequest, EmptyReturn]" - ) -> None: - request = await stream.recv_message() - response = await self.assign_port(request) - await stream.send_message(response) - - async def __rpc_finalize_checkpoint( - self, - stream: "grpclib.server.Stream[FinalizeCheckpointRequest, FinalizeCheckpointResponse]", - ) -> None: - request = await stream.recv_message() - response = await self.finalize_checkpoint(request) - await stream.send_message(response) - - async def __rpc_flush_network_buffer( - self, stream: "grpclib.server.Stream[EmptyRequest, EmptyReturn]" - ) -> None: - request = await stream.recv_message() - response = await self.flush_network_buffer(request) - await stream.send_message(response) - - async def __rpc_initialize_executor( - self, stream: "grpclib.server.Stream[InitializeExecutorRequest, EmptyReturn]" - ) -> None: - request = await stream.recv_message() - response = await self.initialize_executor(request) - await stream.send_message(response) - - async def __rpc_open_executor( - self, stream: "grpclib.server.Stream[EmptyRequest, EmptyReturn]" - ) -> None: - request = await stream.recv_message() - response = await self.open_executor(request) - await stream.send_message(response) - - async def __rpc_pause_worker( - self, stream: "grpclib.server.Stream[EmptyRequest, WorkerStateResponse]" - ) -> None: - request = await stream.recv_message() - response = await self.pause_worker(request) - await stream.send_message(response) - - async def __rpc_prepare_checkpoint( - self, stream: "grpclib.server.Stream[PrepareCheckpointRequest, EmptyReturn]" - ) -> None: - request = await stream.recv_message() - response = await self.prepare_checkpoint(request) - await stream.send_message(response) - - async def __rpc_query_statistics( - self, stream: "grpclib.server.Stream[EmptyRequest, 
WorkerMetricsResponse]" - ) -> None: - request = await stream.recv_message() - response = await self.query_statistics(request) - await stream.send_message(response) - - async def __rpc_resume_worker( - self, stream: "grpclib.server.Stream[EmptyRequest, WorkerStateResponse]" + async def __rpc_send_ping( + self, stream: "grpclib.server.Stream[Ping, IntResponse]" ) -> None: request = await stream.recv_message() - response = await self.resume_worker(request) + response = await self.send_ping(request) await stream.send_message(response) - async def __rpc_retrieve_state( - self, stream: "grpclib.server.Stream[EmptyRequest, EmptyReturn]" + async def __rpc_send_pong( + self, stream: "grpclib.server.Stream[Pong, IntResponse]" ) -> None: request = await stream.recv_message() - response = await self.retrieve_state(request) + response = await self.send_pong(request) await stream.send_message(response) - async def __rpc_retry_current_tuple( - self, stream: "grpclib.server.Stream[EmptyRequest, EmptyReturn]" + async def __rpc_send_nested( + self, stream: "grpclib.server.Stream[Nested, StringResponse]" ) -> None: request = await stream.recv_message() - response = await self.retry_current_tuple(request) + response = await self.send_nested(request) await stream.send_message(response) - async def __rpc_start_worker( - self, stream: "grpclib.server.Stream[EmptyRequest, WorkerStateResponse]" + async def __rpc_send_pass( + self, stream: "grpclib.server.Stream[Pass, StringResponse]" ) -> None: request = await stream.recv_message() - response = await self.start_worker(request) + response = await self.send_pass(request) await stream.send_message(response) - - async def __rpc_end_worker( - self, stream: "grpclib.server.Stream[EmptyRequest, EmptyReturn]" + + async def __rpc_send_error_command( + self, stream: "grpclib.server.Stream[ErrorCommand, StringResponse]" ) -> None: request = await stream.recv_message() - response = await self.end_worker(request) + response = await 
self.send_error_command(request) await stream.send_message(response) - async def __rpc_start_channel( - self, stream: "grpclib.server.Stream[EmptyRequest, EmptyReturn]" + async def __rpc_send_recursion( + self, stream: "grpclib.server.Stream[Recursion, StringResponse]" ) -> None: request = await stream.recv_message() - response = await self.start_channel(request) + response = await self.send_recursion(request) await stream.send_message(response) - async def __rpc_end_channel( - self, stream: "grpclib.server.Stream[EmptyRequest, EmptyReturn]" + async def __rpc_send_collect( + self, stream: "grpclib.server.Stream[Collect, StringResponse]" ) -> None: request = await stream.recv_message() - response = await self.end_channel(request) + response = await self.send_collect(request) await stream.send_message(response) - async def __rpc_debug_command( - self, stream: "grpclib.server.Stream[DebugCommandRequest, EmptyReturn]" + async def __rpc_send_generate_number( + self, stream: "grpclib.server.Stream[GenerateNumber, IntResponse]" ) -> None: request = await stream.recv_message() - response = await self.debug_command(request) + response = await self.send_generate_number(request) await stream.send_message(response) - async def __rpc_evaluate_python_expression( - self, - stream: "grpclib.server.Stream[EvaluatePythonExpressionRequest, EvaluatedValue]", + async def __rpc_send_multi_call( + self, stream: "grpclib.server.Stream[MultiCall, StringResponse]" ) -> None: request = await stream.recv_message() - response = await self.evaluate_python_expression(request) + response = await self.send_multi_call(request) await stream.send_message(response) - async def __rpc_no_operation( - self, stream: "grpclib.server.Stream[EmptyRequest, EmptyReturn]" + async def __rpc_send_chain( + self, stream: "grpclib.server.Stream[Chain, StringResponse]" ) -> None: request = await stream.recv_message() - response = await self.no_operation(request) + response = await self.send_chain(request) await 
stream.send_message(response) def __mapping__(self) -> Dict[str, grpclib.const.Handler]: return { - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/AddInputChannel": grpclib.const.Handler( - self.__rpc_add_input_channel, - grpclib.const.Cardinality.UNARY_UNARY, - AddInputChannelRequest, - EmptyReturn, - ), - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/AddPartitioning": grpclib.const.Handler( - self.__rpc_add_partitioning, - grpclib.const.Cardinality.UNARY_UNARY, - AddPartitioningRequest, - EmptyReturn, - ), - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/AssignPort": grpclib.const.Handler( - self.__rpc_assign_port, - grpclib.const.Cardinality.UNARY_UNARY, - AssignPortRequest, - EmptyReturn, - ), - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/FinalizeCheckpoint": grpclib.const.Handler( - self.__rpc_finalize_checkpoint, - grpclib.const.Cardinality.UNARY_UNARY, - FinalizeCheckpointRequest, - FinalizeCheckpointResponse, - ), - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/FlushNetworkBuffer": grpclib.const.Handler( - self.__rpc_flush_network_buffer, - grpclib.const.Cardinality.UNARY_UNARY, - EmptyRequest, - EmptyReturn, - ), - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/InitializeExecutor": grpclib.const.Handler( - self.__rpc_initialize_executor, - grpclib.const.Cardinality.UNARY_UNARY, - InitializeExecutorRequest, - EmptyReturn, - ), - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/OpenExecutor": grpclib.const.Handler( - self.__rpc_open_executor, - grpclib.const.Cardinality.UNARY_UNARY, - EmptyRequest, - EmptyReturn, - ), - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/PauseWorker": grpclib.const.Handler( - self.__rpc_pause_worker, - grpclib.const.Cardinality.UNARY_UNARY, - EmptyRequest, - WorkerStateResponse, - ), - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/PrepareCheckpoint": grpclib.const.Handler( - 
self.__rpc_prepare_checkpoint, - grpclib.const.Cardinality.UNARY_UNARY, - PrepareCheckpointRequest, - EmptyReturn, - ), - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/QueryStatistics": grpclib.const.Handler( - self.__rpc_query_statistics, - grpclib.const.Cardinality.UNARY_UNARY, - EmptyRequest, - WorkerMetricsResponse, - ), - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/ResumeWorker": grpclib.const.Handler( - self.__rpc_resume_worker, + "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendPing": grpclib.const.Handler( + self.__rpc_send_ping, grpclib.const.Cardinality.UNARY_UNARY, - EmptyRequest, - WorkerStateResponse, + Ping, + IntResponse, ), - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/RetrieveState": grpclib.const.Handler( - self.__rpc_retrieve_state, + "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendPong": grpclib.const.Handler( + self.__rpc_send_pong, grpclib.const.Cardinality.UNARY_UNARY, - EmptyRequest, - EmptyReturn, + Pong, + IntResponse, ), - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/RetryCurrentTuple": grpclib.const.Handler( - self.__rpc_retry_current_tuple, + "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendNested": grpclib.const.Handler( + self.__rpc_send_nested, grpclib.const.Cardinality.UNARY_UNARY, - EmptyRequest, - EmptyReturn, + Nested, + StringResponse, ), - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/StartWorker": grpclib.const.Handler( - self.__rpc_start_worker, + "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendPass": grpclib.const.Handler( + self.__rpc_send_pass, grpclib.const.Cardinality.UNARY_UNARY, - EmptyRequest, - WorkerStateResponse, + Pass, + StringResponse, ), - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/EndWorker": grpclib.const.Handler( - self.__rpc_end_worker, + "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendErrorCommand": 
grpclib.const.Handler( + self.__rpc_send_error_command, grpclib.const.Cardinality.UNARY_UNARY, - EmptyRequest, - EmptyReturn, + ErrorCommand, + StringResponse, ), - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/StartChannel": grpclib.const.Handler( - self.__rpc_start_channel, + "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendRecursion": grpclib.const.Handler( + self.__rpc_send_recursion, grpclib.const.Cardinality.UNARY_UNARY, - EmptyRequest, - EmptyReturn, + Recursion, + StringResponse, ), - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/EndChannel": grpclib.const.Handler( - self.__rpc_end_channel, + "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendCollect": grpclib.const.Handler( + self.__rpc_send_collect, grpclib.const.Cardinality.UNARY_UNARY, - EmptyRequest, - EmptyReturn, + Collect, + StringResponse, ), - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/DebugCommand": grpclib.const.Handler( - self.__rpc_debug_command, + "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendGenerateNumber": grpclib.const.Handler( + self.__rpc_send_generate_number, grpclib.const.Cardinality.UNARY_UNARY, - DebugCommandRequest, - EmptyReturn, + GenerateNumber, + IntResponse, ), - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/EvaluatePythonExpression": grpclib.const.Handler( - self.__rpc_evaluate_python_expression, + "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendMultiCall": grpclib.const.Handler( + self.__rpc_send_multi_call, grpclib.const.Cardinality.UNARY_UNARY, - EvaluatePythonExpressionRequest, - EvaluatedValue, + MultiCall, + StringResponse, ), - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/NoOperation": grpclib.const.Handler( - self.__rpc_no_operation, + "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendChain": grpclib.const.Handler( + self.__rpc_send_chain, grpclib.const.Cardinality.UNARY_UNARY, - EmptyRequest, - 
EmptyReturn, + Chain, + StringResponse, ), } -class ControllerServiceBase(ServiceBase): +class WorkerServiceBase(ServiceBase): - async def retrieve_workflow_state( - self, empty_request: "EmptyRequest" - ) -> "RetrieveWorkflowStateResponse": + async def add_input_channel( + self, add_input_channel_request: "AddInputChannelRequest" + ) -> "EmptyReturn": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def propagate_embedded_control_message( - self, - propagate_embedded_control_message_request: "PropagateEmbeddedControlMessageRequest", - ) -> "PropagateEmbeddedControlMessageResponse": + async def add_partitioning( + self, add_partitioning_request: "AddPartitioningRequest" + ) -> "EmptyReturn": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def take_global_checkpoint( - self, take_global_checkpoint_request: "TakeGlobalCheckpointRequest" - ) -> "TakeGlobalCheckpointResponse": + async def assign_port( + self, assign_port_request: "AssignPortRequest" + ) -> "EmptyReturn": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def debug_command( - self, debug_command_request: "DebugCommandRequest" - ) -> "EmptyReturn": + async def finalize_checkpoint( + self, finalize_checkpoint_request: "FinalizeCheckpointRequest" + ) -> "FinalizeCheckpointResponse": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def evaluate_python_expression( - self, evaluate_python_expression_request: "EvaluatePythonExpressionRequest" - ) -> "EvaluatePythonExpressionResponse": + async def flush_network_buffer( + self, empty_request: "EmptyRequest" + ) -> "EmptyReturn": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def console_message_triggered( - self, console_message_triggered_request: "ConsoleMessageTriggeredRequest" + async def initialize_executor( + self, initialize_executor_request: "InitializeExecutorRequest" ) -> "EmptyReturn": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def 
port_completed( - self, port_completed_request: "PortCompletedRequest" + async def open_executor(self, empty_request: "EmptyRequest") -> "EmptyReturn": + raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) + + async def pause_worker( + self, empty_request: "EmptyRequest" + ) -> "WorkerStateResponse": + raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) + + async def prepare_checkpoint( + self, prepare_checkpoint_request: "PrepareCheckpointRequest" ) -> "EmptyReturn": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def start_workflow( + async def query_statistics( self, empty_request: "EmptyRequest" - ) -> "StartWorkflowResponse": + ) -> "WorkerMetricsResponse": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def resume_workflow(self, empty_request: "EmptyRequest") -> "EmptyReturn": + async def resume_worker( + self, empty_request: "EmptyRequest" + ) -> "WorkerStateResponse": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def pause_workflow(self, empty_request: "EmptyRequest") -> "EmptyReturn": + async def retrieve_state(self, empty_request: "EmptyRequest") -> "EmptyReturn": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def worker_state_updated( - self, worker_state_updated_request: "WorkerStateUpdatedRequest" - ) -> "EmptyReturn": + async def retry_current_tuple(self, empty_request: "EmptyRequest") -> "EmptyReturn": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def worker_execution_completed( + async def start_worker( self, empty_request: "EmptyRequest" - ) -> "EmptyReturn": + ) -> "WorkerStateResponse": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def link_workers( - self, link_workers_request: "LinkWorkersRequest" + async def end_worker(self, empty_request: "EmptyRequest") -> "EmptyReturn": + raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) + + async def start_channel(self, empty_request: "EmptyRequest") -> 
"EmptyReturn": + raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) + + async def end_channel(self, empty_request: "EmptyRequest") -> "EmptyReturn": + raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) + + async def debug_command( + self, debug_command_request: "DebugCommandRequest" ) -> "EmptyReturn": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def controller_initiate_query_statistics( - self, query_statistics_request: "QueryStatisticsRequest" - ) -> "EmptyReturn": + async def evaluate_python_expression( + self, evaluate_python_expression_request: "EvaluatePythonExpressionRequest" + ) -> "EvaluatedValue": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def retry_workflow( - self, retry_workflow_request: "RetryWorkflowRequest" - ) -> "EmptyReturn": + async def no_operation(self, empty_request: "EmptyRequest") -> "EmptyReturn": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def __rpc_retrieve_workflow_state( - self, - stream: "grpclib.server.Stream[EmptyRequest, RetrieveWorkflowStateResponse]", + async def __rpc_add_input_channel( + self, stream: "grpclib.server.Stream[AddInputChannelRequest, EmptyReturn]" ) -> None: request = await stream.recv_message() - response = await self.retrieve_workflow_state(request) + response = await self.add_input_channel(request) await stream.send_message(response) - async def __rpc_propagate_embedded_control_message( - self, - stream: "grpclib.server.Stream[PropagateEmbeddedControlMessageRequest, PropagateEmbeddedControlMessageResponse]", + async def __rpc_add_partitioning( + self, stream: "grpclib.server.Stream[AddPartitioningRequest, EmptyReturn]" ) -> None: request = await stream.recv_message() - response = await self.propagate_embedded_control_message(request) + response = await self.add_partitioning(request) await stream.send_message(response) - async def __rpc_take_global_checkpoint( + async def __rpc_assign_port( + self, stream: 
"grpclib.server.Stream[AssignPortRequest, EmptyReturn]" + ) -> None: + request = await stream.recv_message() + response = await self.assign_port(request) + await stream.send_message(response) + + async def __rpc_finalize_checkpoint( self, - stream: "grpclib.server.Stream[TakeGlobalCheckpointRequest, TakeGlobalCheckpointResponse]", + stream: "grpclib.server.Stream[FinalizeCheckpointRequest, FinalizeCheckpointResponse]", ) -> None: request = await stream.recv_message() - response = await self.take_global_checkpoint(request) + response = await self.finalize_checkpoint(request) await stream.send_message(response) - async def __rpc_debug_command( - self, stream: "grpclib.server.Stream[DebugCommandRequest, EmptyReturn]" + async def __rpc_flush_network_buffer( + self, stream: "grpclib.server.Stream[EmptyRequest, EmptyReturn]" ) -> None: request = await stream.recv_message() - response = await self.debug_command(request) + response = await self.flush_network_buffer(request) await stream.send_message(response) - async def __rpc_evaluate_python_expression( - self, - stream: "grpclib.server.Stream[EvaluatePythonExpressionRequest, EvaluatePythonExpressionResponse]", + async def __rpc_initialize_executor( + self, stream: "grpclib.server.Stream[InitializeExecutorRequest, EmptyReturn]" ) -> None: request = await stream.recv_message() - response = await self.evaluate_python_expression(request) + response = await self.initialize_executor(request) await stream.send_message(response) - async def __rpc_console_message_triggered( - self, - stream: "grpclib.server.Stream[ConsoleMessageTriggeredRequest, EmptyReturn]", + async def __rpc_open_executor( + self, stream: "grpclib.server.Stream[EmptyRequest, EmptyReturn]" ) -> None: request = await stream.recv_message() - response = await self.console_message_triggered(request) + response = await self.open_executor(request) await stream.send_message(response) - async def __rpc_port_completed( - self, stream: 
"grpclib.server.Stream[PortCompletedRequest, EmptyReturn]" + async def __rpc_pause_worker( + self, stream: "grpclib.server.Stream[EmptyRequest, WorkerStateResponse]" ) -> None: request = await stream.recv_message() - response = await self.port_completed(request) + response = await self.pause_worker(request) await stream.send_message(response) - async def __rpc_start_workflow( - self, stream: "grpclib.server.Stream[EmptyRequest, StartWorkflowResponse]" + async def __rpc_prepare_checkpoint( + self, stream: "grpclib.server.Stream[PrepareCheckpointRequest, EmptyReturn]" ) -> None: request = await stream.recv_message() - response = await self.start_workflow(request) + response = await self.prepare_checkpoint(request) await stream.send_message(response) - async def __rpc_resume_workflow( + async def __rpc_query_statistics( + self, stream: "grpclib.server.Stream[EmptyRequest, WorkerMetricsResponse]" + ) -> None: + request = await stream.recv_message() + response = await self.query_statistics(request) + await stream.send_message(response) + + async def __rpc_resume_worker( + self, stream: "grpclib.server.Stream[EmptyRequest, WorkerStateResponse]" + ) -> None: + request = await stream.recv_message() + response = await self.resume_worker(request) + await stream.send_message(response) + + async def __rpc_retrieve_state( self, stream: "grpclib.server.Stream[EmptyRequest, EmptyReturn]" ) -> None: request = await stream.recv_message() - response = await self.resume_workflow(request) + response = await self.retrieve_state(request) await stream.send_message(response) - async def __rpc_pause_workflow( + async def __rpc_retry_current_tuple( self, stream: "grpclib.server.Stream[EmptyRequest, EmptyReturn]" ) -> None: request = await stream.recv_message() - response = await self.pause_workflow(request) + response = await self.retry_current_tuple(request) await stream.send_message(response) - async def __rpc_worker_state_updated( - self, stream: 
"grpclib.server.Stream[WorkerStateUpdatedRequest, EmptyReturn]" + async def __rpc_start_worker( + self, stream: "grpclib.server.Stream[EmptyRequest, WorkerStateResponse]" ) -> None: request = await stream.recv_message() - response = await self.worker_state_updated(request) + response = await self.start_worker(request) await stream.send_message(response) - async def __rpc_worker_execution_completed( + async def __rpc_end_worker( self, stream: "grpclib.server.Stream[EmptyRequest, EmptyReturn]" ) -> None: request = await stream.recv_message() - response = await self.worker_execution_completed(request) + response = await self.end_worker(request) await stream.send_message(response) - async def __rpc_link_workers( - self, stream: "grpclib.server.Stream[LinkWorkersRequest, EmptyReturn]" + async def __rpc_start_channel( + self, stream: "grpclib.server.Stream[EmptyRequest, EmptyReturn]" ) -> None: request = await stream.recv_message() - response = await self.link_workers(request) + response = await self.start_channel(request) await stream.send_message(response) - async def __rpc_controller_initiate_query_statistics( - self, stream: "grpclib.server.Stream[QueryStatisticsRequest, EmptyReturn]" + async def __rpc_end_channel( + self, stream: "grpclib.server.Stream[EmptyRequest, EmptyReturn]" ) -> None: request = await stream.recv_message() - response = await self.controller_initiate_query_statistics(request) + response = await self.end_channel(request) await stream.send_message(response) - async def __rpc_retry_workflow( - self, stream: "grpclib.server.Stream[RetryWorkflowRequest, EmptyReturn]" + async def __rpc_debug_command( + self, stream: "grpclib.server.Stream[DebugCommandRequest, EmptyReturn]" ) -> None: request = await stream.recv_message() - response = await self.retry_workflow(request) + response = await self.debug_command(request) + await stream.send_message(response) + + async def __rpc_evaluate_python_expression( + self, + stream: 
"grpclib.server.Stream[EvaluatePythonExpressionRequest, EvaluatedValue]", + ) -> None: + request = await stream.recv_message() + response = await self.evaluate_python_expression(request) + await stream.send_message(response) + + async def __rpc_no_operation( + self, stream: "grpclib.server.Stream[EmptyRequest, EmptyReturn]" + ) -> None: + request = await stream.recv_message() + response = await self.no_operation(request) await stream.send_message(response) def __mapping__(self) -> Dict[str, grpclib.const.Handler]: return { - "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/RetrieveWorkflowState": grpclib.const.Handler( - self.__rpc_retrieve_workflow_state, + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/AddInputChannel": grpclib.const.Handler( + self.__rpc_add_input_channel, grpclib.const.Cardinality.UNARY_UNARY, - EmptyRequest, - RetrieveWorkflowStateResponse, + AddInputChannelRequest, + EmptyReturn, ), - "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/PropagateEmbeddedControlMessage": grpclib.const.Handler( - self.__rpc_propagate_embedded_control_message, + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/AddPartitioning": grpclib.const.Handler( + self.__rpc_add_partitioning, grpclib.const.Cardinality.UNARY_UNARY, - PropagateEmbeddedControlMessageRequest, - PropagateEmbeddedControlMessageResponse, + AddPartitioningRequest, + EmptyReturn, ), - "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/TakeGlobalCheckpoint": grpclib.const.Handler( - self.__rpc_take_global_checkpoint, + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/AssignPort": grpclib.const.Handler( + self.__rpc_assign_port, grpclib.const.Cardinality.UNARY_UNARY, - TakeGlobalCheckpointRequest, - TakeGlobalCheckpointResponse, + AssignPortRequest, + EmptyReturn, ), - "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/DebugCommand": grpclib.const.Handler( - self.__rpc_debug_command, + 
"/org.apache.texera.amber.engine.architecture.rpc.WorkerService/FinalizeCheckpoint": grpclib.const.Handler( + self.__rpc_finalize_checkpoint, grpclib.const.Cardinality.UNARY_UNARY, - DebugCommandRequest, + FinalizeCheckpointRequest, + FinalizeCheckpointResponse, + ), + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/FlushNetworkBuffer": grpclib.const.Handler( + self.__rpc_flush_network_buffer, + grpclib.const.Cardinality.UNARY_UNARY, + EmptyRequest, EmptyReturn, ), - "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/EvaluatePythonExpression": grpclib.const.Handler( - self.__rpc_evaluate_python_expression, + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/InitializeExecutor": grpclib.const.Handler( + self.__rpc_initialize_executor, grpclib.const.Cardinality.UNARY_UNARY, - EvaluatePythonExpressionRequest, - EvaluatePythonExpressionResponse, + InitializeExecutorRequest, + EmptyReturn, ), - "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/ConsoleMessageTriggered": grpclib.const.Handler( - self.__rpc_console_message_triggered, + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/OpenExecutor": grpclib.const.Handler( + self.__rpc_open_executor, grpclib.const.Cardinality.UNARY_UNARY, - ConsoleMessageTriggeredRequest, + EmptyRequest, EmptyReturn, ), - "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/PortCompleted": grpclib.const.Handler( - self.__rpc_port_completed, + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/PauseWorker": grpclib.const.Handler( + self.__rpc_pause_worker, grpclib.const.Cardinality.UNARY_UNARY, - PortCompletedRequest, + EmptyRequest, + WorkerStateResponse, + ), + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/PrepareCheckpoint": grpclib.const.Handler( + self.__rpc_prepare_checkpoint, + grpclib.const.Cardinality.UNARY_UNARY, + PrepareCheckpointRequest, EmptyReturn, ), - 
"/org.apache.texera.amber.engine.architecture.rpc.ControllerService/StartWorkflow": grpclib.const.Handler( - self.__rpc_start_workflow, + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/QueryStatistics": grpclib.const.Handler( + self.__rpc_query_statistics, grpclib.const.Cardinality.UNARY_UNARY, EmptyRequest, - StartWorkflowResponse, + WorkerMetricsResponse, ), - "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/ResumeWorkflow": grpclib.const.Handler( - self.__rpc_resume_workflow, + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/ResumeWorker": grpclib.const.Handler( + self.__rpc_resume_worker, + grpclib.const.Cardinality.UNARY_UNARY, + EmptyRequest, + WorkerStateResponse, + ), + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/RetrieveState": grpclib.const.Handler( + self.__rpc_retrieve_state, grpclib.const.Cardinality.UNARY_UNARY, EmptyRequest, EmptyReturn, ), - "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/PauseWorkflow": grpclib.const.Handler( - self.__rpc_pause_workflow, + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/RetryCurrentTuple": grpclib.const.Handler( + self.__rpc_retry_current_tuple, grpclib.const.Cardinality.UNARY_UNARY, EmptyRequest, EmptyReturn, ), - "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/WorkerStateUpdated": grpclib.const.Handler( - self.__rpc_worker_state_updated, + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/StartWorker": grpclib.const.Handler( + self.__rpc_start_worker, grpclib.const.Cardinality.UNARY_UNARY, - WorkerStateUpdatedRequest, + EmptyRequest, + WorkerStateResponse, + ), + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/EndWorker": grpclib.const.Handler( + self.__rpc_end_worker, + grpclib.const.Cardinality.UNARY_UNARY, + EmptyRequest, EmptyReturn, ), - "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/WorkerExecutionCompleted": grpclib.const.Handler( 
- self.__rpc_worker_execution_completed, + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/StartChannel": grpclib.const.Handler( + self.__rpc_start_channel, grpclib.const.Cardinality.UNARY_UNARY, EmptyRequest, EmptyReturn, ), - "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/LinkWorkers": grpclib.const.Handler( - self.__rpc_link_workers, + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/EndChannel": grpclib.const.Handler( + self.__rpc_end_channel, grpclib.const.Cardinality.UNARY_UNARY, - LinkWorkersRequest, + EmptyRequest, EmptyReturn, ), - "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/ControllerInitiateQueryStatistics": grpclib.const.Handler( - self.__rpc_controller_initiate_query_statistics, + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/DebugCommand": grpclib.const.Handler( + self.__rpc_debug_command, grpclib.const.Cardinality.UNARY_UNARY, - QueryStatisticsRequest, + DebugCommandRequest, EmptyReturn, ), - "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/RetryWorkflow": grpclib.const.Handler( - self.__rpc_retry_workflow, + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/EvaluatePythonExpression": grpclib.const.Handler( + self.__rpc_evaluate_python_expression, grpclib.const.Cardinality.UNARY_UNARY, - RetryWorkflowRequest, + EvaluatePythonExpressionRequest, + EvaluatedValue, + ), + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/NoOperation": grpclib.const.Handler( + self.__rpc_no_operation, + grpclib.const.Cardinality.UNARY_UNARY, + EmptyRequest, EmptyReturn, ), } diff --git a/amber/src/main/python/proto/org/apache/texera/amber/engine/architecture/sendsemantics/__init__.py b/amber/src/main/python/proto/org/apache/texera/amber/engine/architecture/sendsemantics/__init__.py index bc241806b5c..94ed31cce3f 100644 --- a/amber/src/main/python/proto/org/apache/texera/amber/engine/architecture/sendsemantics/__init__.py +++ 
b/amber/src/main/python/proto/org/apache/texera/amber/engine/architecture/sendsemantics/__init__.py @@ -4,9 +4,7 @@ # This file has been @generated from dataclasses import dataclass -from typing import ( - List, -) +from typing import List import betterproto diff --git a/amber/src/main/python/proto/org/apache/texera/amber/engine/architecture/worker/__init__.py b/amber/src/main/python/proto/org/apache/texera/amber/engine/architecture/worker/__init__.py index 6a7b210e185..072e7c8ce65 100644 --- a/amber/src/main/python/proto/org/apache/texera/amber/engine/architecture/worker/__init__.py +++ b/amber/src/main/python/proto/org/apache/texera/amber/engine/architecture/worker/__init__.py @@ -4,9 +4,7 @@ # This file has been @generated from dataclasses import dataclass -from typing import ( - List, -) +from typing import List import betterproto diff --git a/amber/src/main/python/proto/org/apache/texera/amber/engine/common/__init__.py b/amber/src/main/python/proto/org/apache/texera/amber/engine/common/__init__.py index 55c789aa395..8c1464cc76c 100644 --- a/amber/src/main/python/proto/org/apache/texera/amber/engine/common/__init__.py +++ b/amber/src/main/python/proto/org/apache/texera/amber/engine/common/__init__.py @@ -18,6 +18,27 @@ ) +@dataclass(eq=False, repr=False) +class Backpressure(betterproto.Message): + enable_backpressure: bool = betterproto.bool_field(1) + + +@dataclass(eq=False, repr=False) +class CreditUpdate(betterproto.Message): + pass + + +@dataclass(eq=False, repr=False) +class ActorCommand(betterproto.Message): + backpressure: "Backpressure" = betterproto.message_field(1, group="sealed_value") + credit_update: "CreditUpdate" = betterproto.message_field(2, group="sealed_value") + + +@dataclass(eq=False, repr=False) +class PythonActorMessage(betterproto.Message): + payload: "ActorCommand" = betterproto.message_field(1) + + @dataclass(eq=False, repr=False) class DirectControlMessagePayloadV2(betterproto.Message): control_invocation: 
"_architecture_rpc__.ControlInvocation" = ( @@ -133,24 +154,3 @@ class ExecutionMetadataStore(betterproto.Message): fatal_errors: List["__core__.WorkflowFatalError"] = betterproto.message_field(2) execution_id: "__core__.ExecutionIdentity" = betterproto.message_field(3) is_recovering: bool = betterproto.bool_field(4) - - -@dataclass(eq=False, repr=False) -class Backpressure(betterproto.Message): - enable_backpressure: bool = betterproto.bool_field(1) - - -@dataclass(eq=False, repr=False) -class CreditUpdate(betterproto.Message): - pass - - -@dataclass(eq=False, repr=False) -class ActorCommand(betterproto.Message): - backpressure: "Backpressure" = betterproto.message_field(1, group="sealed_value") - credit_update: "CreditUpdate" = betterproto.message_field(2, group="sealed_value") - - -@dataclass(eq=False, repr=False) -class PythonActorMessage(betterproto.Message): - payload: "ActorCommand" = betterproto.message_field(1) diff --git a/amber/src/main/python/pytexera/__init__.py b/amber/src/main/python/pytexera/__init__.py index e40d1a43fe0..c6001667380 100644 --- a/amber/src/main/python/pytexera/__init__.py +++ b/amber/src/main/python/pytexera/__init__.py @@ -19,6 +19,7 @@ from overrides import overrides from typing import Iterator, Optional, Union +from core.models.operator import LoopStartOperator, LoopEndOperator from pyamber import * from .storage.dataset_file_document import DatasetFileDocument from .storage.large_binary_input_stream import LargeBinaryInputStream @@ -43,6 +44,8 @@ "UDFTableOperator", "UDFBatchOperator", "UDFSourceOperator", + "LoopStartOperator", + "LoopEndOperator", "DatasetFileDocument", "largebinary", "LargeBinaryInputStream", diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/common/AmberProcessor.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/common/AmberProcessor.scala index e7763073232..22811b46417 100644 --- 
a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/common/AmberProcessor.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/common/AmberProcessor.scala @@ -43,7 +43,7 @@ abstract class AmberProcessor( with Serializable { /** FIFO & exactly once */ - val inputGateway: InputGateway = new NetworkInputGateway(this.actorId) + val inputGateway: NetworkInputGateway = new NetworkInputGateway(this.actorId) // 1. Unified Output val outputGateway: NetworkOutputGateway = @@ -55,7 +55,7 @@ abstract class AmberProcessor( } ) // 2. RPC Layer - val asyncRPCClient = new AsyncRPCClient(outputGateway, actorId) + val asyncRPCClient = new AsyncRPCClient(inputGateway, outputGateway, actorId) val asyncRPCServer: AsyncRPCServer = new AsyncRPCServer(outputGateway, actorId) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerAsyncRPCHandlerInitializer.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerAsyncRPCHandlerInitializer.scala index 4d9a36bab43..ea170b16ffe 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerAsyncRPCHandlerInitializer.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerAsyncRPCHandlerInitializer.scala @@ -34,6 +34,7 @@ class ControllerAsyncRPCHandlerInitializer( with AmberLogging with LinkWorkersHandler with WorkerExecutionCompletedHandler + with NextIterationHandler with WorkerStateUpdatedHandler with PauseHandler with QueryWorkerStatisticsHandler diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerProcessor.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerProcessor.scala index 7a8e94cf3a7..3461619cb36 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerProcessor.scala +++ 
b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerProcessor.scala @@ -44,7 +44,7 @@ class ControllerProcessor( val workflowScheduler: WorkflowScheduler = new WorkflowScheduler(workflowContext, actorId) val workflowExecutionCoordinator: WorkflowExecutionCoordinator = new WorkflowExecutionCoordinator( - () => this.workflowScheduler.getNextRegions, + workflowScheduler, workflowExecution, controllerConfig, asyncRPCClient diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/WorkflowScheduler.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/WorkflowScheduler.scala index 9dcf3ad4bfc..ff6df1f0a06 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/WorkflowScheduler.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/WorkflowScheduler.scala @@ -32,7 +32,7 @@ class WorkflowScheduler( actorId: ActorVirtualIdentity ) extends java.io.Serializable { var physicalPlan: PhysicalPlan = _ - private var schedule: Schedule = _ + var schedule: Schedule = _ def getSchedule: Schedule = schedule @@ -54,4 +54,6 @@ class WorkflowScheduler( def getNextRegions: Set[Region] = if (!schedule.hasNext) Set() else schedule.next() + def hasPendingRegions: Boolean = schedule != null && schedule.hasNext + } diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/execution/WorkflowExecution.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/execution/WorkflowExecution.scala index b806479b892..2de29f31fdd 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/execution/WorkflowExecution.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/execution/WorkflowExecution.scala @@ -44,6 +44,7 @@ case class WorkflowExecution() { * @throws AssertionError if the `RegionExecution` has already been 
initialized. */ def initRegionExecution(region: Region): RegionExecution = { + regionExecutions.remove(region.id) // ensure the region execution hasn't been initialized already. assert( !regionExecutions.contains(region.id), diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/NextIterationHandler.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/NextIterationHandler.scala new file mode 100644 index 00000000000..c8a8ad48659 --- /dev/null +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/NextIterationHandler.scala @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.texera.amber.engine.architecture.controller.promisehandlers + +import com.twitter.util.Future +import org.apache.texera.amber.engine.architecture.controller.ControllerAsyncRPCHandlerInitializer +import org.apache.texera.amber.engine.architecture.rpc.controlcommands.{ + AsyncRPCContext, + NextIterationRequest +} +import org.apache.texera.amber.engine.architecture.rpc.controlreturns.EmptyReturn + +/** handle a worker's request to run the next iteration of a loop + * i.e. 
rewind the schedule to the level whose region contains the + * operator identified by `msg.loopStartId`, so that the regions from + * that level onward are scheduled again. + * + * possible sender: worker + */ +trait NextIterationHandler { + this: ControllerAsyncRPCHandlerInitializer => + + override def nextIteration( + msg: NextIterationRequest, + ctx: AsyncRPCContext + ): Future[EmptyReturn] = { + cp.workflowExecutionCoordinator.loopBack(msg.loopStartId) + EmptyReturn() + } +} diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/WorkerExecutionCompletedHandler.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/WorkerExecutionCompletedHandler.scala index d54a22f26b9..c3b3ddb234b 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/WorkerExecutionCompletedHandler.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/WorkerExecutionCompletedHandler.scala @@ -61,7 +61,11 @@ trait WorkerExecutionCompletedHandler { .collect(Seq(statsRequest)) .flatMap(_ => { // if entire workflow is completed, clean up - if (cp.workflowExecution.isCompleted) { + val isWorkflowTerminal = + cp.workflowExecution.isCompleted && + !cp.workflowScheduler.hasPendingRegions && + !cp.workflowExecutionCoordinator.hasUnfinishedRegionCoordinators + if (isWorkflowTerminal) { // after query result come back: send completed event, cleanup ,and kill workflow sendToClient(ExecutionStateUpdate(cp.workflowExecution.getState)) cp.controllerTimerService.disableStatusUpdate() diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/messaginglayer/NetworkInputGateway.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/messaginglayer/NetworkInputGateway.scala index 5cfd8aabc04..1d3ee3cb72c 100644 ---
a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/messaginglayer/NetworkInputGateway.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/messaginglayer/NetworkInputGateway.scala @@ -86,4 +86,8 @@ class NetworkInputGateway(val actorId: ActorVirtualIdentity) enforcers += enforcer } + def removeControlChannel(from: ActorVirtualIdentity): Unit = { + inputChannels.remove(ChannelIdentity(from, actorId, isControl = true)) + } + } diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/messaginglayer/NetworkOutputGateway.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/messaginglayer/NetworkOutputGateway.scala index 929a30f4efa..e35e819d41f 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/messaginglayer/NetworkOutputGateway.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/messaginglayer/NetworkOutputGateway.scala @@ -94,4 +94,8 @@ class NetworkOutputGateway( idToSequenceNums.getOrElseUpdate(channelId, new AtomicLong()).getAndIncrement() } + def removeControlChannel(to: ActorVirtualIdentity): Unit = { + idToSequenceNums.remove(ChannelIdentity(actorId, to, isControl = true)) + } + } diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/messaginglayer/OutputManager.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/messaginglayer/OutputManager.scala index 4ab3d18056f..53755b780cc 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/messaginglayer/OutputManager.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/messaginglayer/OutputManager.scala @@ -124,6 +124,8 @@ class OutputManager( : mutable.HashMap[PortIdentity, OutputPortResultWriterThread] = mutable.HashMap() + private val storageUris: mutable.HashMap[Int, URI] = mutable.HashMap() + /** * Add down stream operator and its corresponding Partitioner. 
* @@ -232,6 +234,23 @@ class OutputManager( }) } + def saveStateToStorageIfNeeded(state: State): Unit = { + try { + storageUris.foreach { + case (_, uri) => + val writer = DocumentFactory + .openDocument(State.stateUriFromResultUri(uri)) + ._1 + .writer(VirtualIdentityUtils.getWorkerIndex(actorId).toString) + .asInstanceOf[BufferedItemWriter[Tuple]] + writer.putOne(State.serialize(state)) + writer.close() + } + } catch { + case _: Exception => () + } + } + /** * Singal the port storage writer to flush the remaining buffer and wait for commits to finish so that * the output port is properly completed. If the output port does not need storage, no action will be done. @@ -280,6 +299,7 @@ class OutputManager( } private def setupOutputStorageWriterThread(portId: PortIdentity, storageUri: URI): Unit = { + this.storageUris(portId.id) = storageUri val bufferedItemWriter = DocumentFactory .openDocument(storageUri) ._1 diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonProxyClient.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonProxyClient.scala index 6618e857b1d..e53fccf8c02 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonProxyClient.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonProxyClient.scala @@ -21,6 +21,7 @@ package org.apache.texera.amber.engine.architecture.pythonworker import com.twitter.util.{Await, Promise} import org.apache.texera.amber.core.WorkflowRuntimeException +import org.apache.texera.amber.core.state.State import org.apache.texera.amber.core.tuple.{Schema, Tuple} import org.apache.texera.amber.core.virtualidentity.{ActorVirtualIdentity, ChannelIdentity} import org.apache.texera.amber.engine.architecture.pythonworker.WorkerBatchInternalQueue.{ @@ -125,7 +126,11 @@ class PythonProxyClient(portNumberPromise: Promise[Int], val actorId: ActorVirtu case DataFrame(frame) => 
writeArrowStream(mutable.Queue(ArraySeq.unsafeWrapArray(frame): _*), from, "Data") case StateFrame(state) => - writeArrowStream(mutable.Queue(state.toTuple), from, "State") + writeArrowStream( + mutable.Queue(State.serialize(state)), + from, + "State" + ) } } diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonProxyServer.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonProxyServer.scala index c904e436bcd..2a1e212ac88 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonProxyServer.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonProxyServer.scala @@ -128,7 +128,10 @@ private class AmberProducer( dataHeader.payloadType match { case "State" => assert(root.getRowCount == 1) - outputPort.sendTo(to, StateFrame(State(Some(ArrowUtils.getTexeraTuple(0, root))))) + outputPort.sendTo( + to, + StateFrame(State.deserialize(ArrowUtils.getTexeraTuple(0, root))) + ) case "ECM" => assert(root.getRowCount == 1) outputPort.sendTo( diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala index e490cde3d9b..a384f383e1f 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala @@ -20,7 +20,8 @@ package org.apache.texera.amber.engine.architecture.scheduling import org.apache.pekko.pattern.gracefulStop -import com.twitter.util.{Future, Return, Throw} +import com.twitter.util.{Duration => TwitterDuration, Future, JavaTimer, Return, Throw, Timer} +import org.apache.texera.amber.core.state.State import org.apache.texera.amber.core.storage.DocumentFactory import 
org.apache.texera.amber.core.storage.VFSURIFactory.decodeURI import org.apache.texera.amber.core.virtualidentity.ActorVirtualIdentity @@ -61,7 +62,7 @@ import org.apache.texera.web.resource.dashboard.user.workflow.WorkflowExecutions import java.util.concurrent.TimeUnit import java.util.concurrent.atomic.AtomicReference -import scala.concurrent.duration.Duration +import scala.concurrent.duration.{Duration => ScalaDuration} /** * The executor of a region. @@ -109,10 +110,14 @@ class RegionExecutionCoordinator( private val currentPhaseRef: AtomicReference[RegionExecutionPhase] = new AtomicReference( Unexecuted ) + private val terminationFutureRef: AtomicReference[Future[Unit]] = new AtomicReference(null) + private val killRetryTimer: Timer = new JavaTimer(true) + private val killRetryDelay: TwitterDuration = TwitterDuration.fromMilliseconds(200) /** * Sync the status of `RegionExecution` and transition this coordinator's phase to `Completed` only when the - * coordinator is currently in `ExecutingNonDependeePortsPhase` and all the ports of this region are completed. + * coordinator is currently in `ExecutingNonDependeePortsPhase`, all the ports of this region are completed, and + * all workers in this region are terminated. * * Additionally, this method will also terminate all the workers of this region: * @@ -135,12 +140,22 @@ class RegionExecutionCoordinator( return Future.Unit } - // Set this coordinator's status to be completed so that subsequent regions can be started by - // WorkflowExecutionCoordinator. - setPhase(Completed) - - // Terminate all the workers in this region. - terminateWorkers(regionExecution) + val existingTerminationFuture = terminationFutureRef.get + if (existingTerminationFuture != null) { + existingTerminationFuture + } else { + val terminationFuture = terminateWorkersWithRetry(regionExecution).flatMap { _ => + // Set this coordinator's status to be completed so that subsequent regions can be started by + // WorkflowExecutionCoordinator. 
+ setPhase(Completed) + Future.Unit + } + if (terminationFutureRef.compareAndSet(null, terminationFuture)) { + terminationFuture + } else { + terminationFutureRef.get + } + } } private def terminateWorkers(regionExecution: RegionExecution) = { @@ -167,7 +182,9 @@ class RegionExecutionCoordinator( val actorRef = actorRefService.getActorRef(workerId) // Remove the actorRef so that no other actors can find the worker and send messages. actorRefService.removeActorRef(workerId) - gracefulStop(actorRef, Duration(5, TimeUnit.SECONDS)).asTwitter() + asyncRPCClient.inputGateway.removeControlChannel(workerId) + asyncRPCClient.outputGateway.removeControlChannel(workerId) + gracefulStop(actorRef, ScalaDuration(5, TimeUnit.SECONDS)).asTwitter() } }.toSeq @@ -191,8 +208,30 @@ class RegionExecutionCoordinator( } } + private def terminateWorkersWithRetry( + regionExecution: RegionExecution, + attempt: Int = 1 + ): Future[Unit] = { + terminateWorkers(regionExecution).rescue { + case err => + logger.warn( + s"Failed to terminate region ${region.id.id} on attempt $attempt. Retrying in ${killRetryDelay.inMilliseconds} ms.", + err + ) + Future + .sleep(killRetryDelay)(killRetryTimer) + .flatMap(_ => terminateWorkersWithRetry(regionExecution, attempt + 1)) + } + } + def isCompleted: Boolean = currentPhaseRef.get == Completed + /** + * Returns the region termination future if termination has been initiated. + * This is only set by `tryCompleteRegionExecution()`. 
+ */ + def getTerminationFutureOpt: Option[Future[Unit]] = Option(terminationFutureRef.get) + /** * This will sync and transition the region execution phase from one to another depending on its current phase: * @@ -528,12 +567,30 @@ class RegionExecutionCoordinator( portConfigs.foreach { case (outputPortId, portConfig) => val storageUriToAdd = portConfig.storageURI + val stateUriToAdd = State.stateUriFromResultUri(storageUriToAdd) val (_, eid, _, _) = decodeURI(storageUriToAdd) val schemaOptional = region.getOperator(outputPortId.opId).outputPorts(outputPortId.portId)._3 val schema = schemaOptional.getOrElse(throw new IllegalStateException("Schema is missing")) - DocumentFactory.createDocument(storageUriToAdd, schema) + if (region.getOperators.exists(_.id.logicalOpId.id.startsWith("LoopEnd-operator-"))) { + try { + DocumentFactory.openDocument(storageUriToAdd) + } catch { + case _: Exception => + DocumentFactory.createDocument(storageUriToAdd, schema) + } + try { + DocumentFactory.openDocument(stateUriToAdd) + } catch { + case _: Exception => + DocumentFactory.createDocument(stateUriToAdd, State.schema) + } + } else { + DocumentFactory.createDocument(storageUriToAdd, schema) + DocumentFactory.createDocument(stateUriToAdd, State.schema) + } + WorkflowExecutionsResource.insertOperatorPortResultUri( eid = eid, globalPortId = outputPortId, diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala index 6f34c9ed1e5..4d0828acc04 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala @@ -19,6 +19,8 @@ package org.apache.texera.amber.engine.architecture.scheduling +import org.apache.texera.amber.core.virtualidentity.OperatorIdentity + case class Schedule(private val levelSets: Map[Int, 
Set[Region]]) extends Iterator[Set[Region]] { private var currentLevel = levelSets.keys.minOption.getOrElse(0) @@ -31,4 +33,13 @@ case class Schedule(private val levelSets: Map[Int, Set[Region]]) extends Iterat currentLevel += 1 regions } + + def loopBack(loopStartId: OperatorIdentity): Unit = + levelSets + .collectFirst { + case (level, regions) + if regions.exists(_.getOperators.exists(_.id.logicalOpId == loopStartId)) => + level + } + .foreach(currentLevel = _) } diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala index 05585f88d8d..1ee471dd1b5 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala @@ -21,19 +21,25 @@ package org.apache.texera.amber.engine.architecture.scheduling import com.twitter.util.Future import com.typesafe.scalalogging.LazyLogging +import org.apache.texera.amber.core.virtualidentity.OperatorIdentity import org.apache.texera.amber.core.workflow.{GlobalPortIdentity, PhysicalLink} import org.apache.texera.amber.engine.architecture.common.{ AkkaActorRefMappingService, AkkaActorService } -import org.apache.texera.amber.engine.architecture.controller.ControllerConfig +import org.apache.texera.amber.engine.architecture.controller.{ + ControllerConfig, + ExecutionStateUpdate, + WorkflowScheduler +} import org.apache.texera.amber.engine.architecture.controller.execution.WorkflowExecution import org.apache.texera.amber.engine.common.rpc.AsyncRPCClient +import java.util.concurrent.atomic.AtomicBoolean import scala.collection.mutable class WorkflowExecutionCoordinator( - getNextRegions: () => Set[Region], + workflowScheduler: WorkflowScheduler, workflowExecution: WorkflowExecution, 
controllerConfig: ControllerConfig, asyncRPCClient: AsyncRPCClient @@ -44,6 +50,7 @@ class WorkflowExecutionCoordinator( private val regionExecutionCoordinators : mutable.HashMap[RegionIdentity, RegionExecutionCoordinator] = mutable.HashMap() + private val completionNotified: AtomicBoolean = new AtomicBoolean(false) @transient var actorRefService: AkkaActorRefMappingService = _ @@ -59,18 +66,19 @@ class WorkflowExecutionCoordinator( * After the syncs, if there are no running region(s), it will start new regions (if available). */ def coordinateRegionExecutors(actorService: AkkaActorService): Future[Unit] = { - if (regionExecutionCoordinators.values.exists(!_.isCompleted)) { - // As this method is invoked by the completion of each port in a region, and regionExecutionCoordinator only - // lanuches each phase asynchronously, we need to let each current unfinished regionExecutionCoordinator - // sync its status and proceed with next phases if needed. - Future - .collect({ - regionExecutionCoordinators.values - .filter(!_.isCompleted) - .map(_.syncStatusAndTransitionRegionExecutionPhase()) - .toSeq - }) + val unfinishedRegionCoordinators = + regionExecutionCoordinators.values.filter(!_.isCompleted).toSeq + + // Trigger sync for each unfinished region. + unfinishedRegionCoordinators.foreach(_.syncStatusAndTransitionRegionExecutionPhase()) + + // Wait only for region termination futures (kill path), then re-run coordination. + val terminationFutures = unfinishedRegionCoordinators.flatMap(_.getTerminationFutureOpt) + if (terminationFutures.nonEmpty) { + return Future + .collect(terminationFutures) .unit + .flatMap(_ => coordinateRegionExecutors(actorService)) } if (regionExecutionCoordinators.values.exists(!_.isCompleted)) { @@ -79,10 +87,17 @@ class WorkflowExecutionCoordinator( } // All existing regions are completed. Start the next region (if any). 
+ val nextRegions = workflowScheduler.getNextRegions + if (nextRegions.isEmpty) { + if (workflowExecution.isCompleted && completionNotified.compareAndSet(false, true)) { + asyncRPCClient.sendToClient(ExecutionStateUpdate(workflowExecution.getState)) + } + return Future.Unit + } + + executedRegions.append(nextRegions) Future - .collect({ - val nextRegions = getNextRegions() - executedRegions.append(nextRegions) + .collect( nextRegions .map(region => { workflowExecution.initRegionExecution(region) @@ -98,7 +113,7 @@ class WorkflowExecutionCoordinator( }) .map(_.syncStatusAndTransitionRegionExecutionPhase()) .toSeq - }) + ) .unit } @@ -116,4 +131,12 @@ class WorkflowExecutionCoordinator( .toSet } + def hasUnfinishedRegionCoordinators: Boolean = { + regionExecutionCoordinators.values.exists(!_.isCompleted) + } + + def loopBack(loopStartId: OperatorIdentity): Unit = { + workflowScheduler.schedule.loopBack(loopStartId) + } + } diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/worker/DataProcessor.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/worker/DataProcessor.scala index 3aa5fa90a46..65c560ee594 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/worker/DataProcessor.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/worker/DataProcessor.scala @@ -126,6 +126,9 @@ class DataProcessor( val outputState = executor.processState(state, port) if (outputState.isDefined) { outputManager.emitState(outputState.get) + // Persist the state that was emitted downstream (not the raw input state), so that + // readers of the materialized output port replay exactly what this operator produced. 
+ outputManager.saveStateToStorageIfNeeded(outputState.get) } } catch safely { case e => diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/worker/managers/InputPortMaterializationReaderThread.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/worker/managers/InputPortMaterializationReaderThread.scala index 10fbbc44a2c..acada743bc6 100644 ---
a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/worker/managers/InputPortMaterializationReaderThread.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/worker/managers/InputPortMaterializationReaderThread.scala @@ -21,6 +21,7 @@ package org.apache.texera.amber.engine.architecture.worker.managers import io.grpc.MethodDescriptor import org.apache.texera.amber.config.ApplicationConfig +import org.apache.texera.amber.core.state.State import org.apache.texera.amber.core.storage.DocumentFactory import org.apache.texera.amber.core.storage.model.VirtualDocument import org.apache.texera.amber.core.tuple.Tuple @@ -45,7 +46,11 @@ import org.apache.texera.amber.engine.architecture.worker.WorkflowWorker.{ DPInputQueueElement, FIFOMessageElement } -import org.apache.texera.amber.engine.common.ambermessage.{DataFrame, WorkflowFIFOMessage} +import org.apache.texera.amber.engine.common.ambermessage.{ + DataFrame, + StateFrame, + WorkflowFIFOMessage +} import org.apache.texera.amber.util.VirtualIdentityUtils.getFromActorIdForInputPortStorage import java.net.URI @@ -106,6 +111,25 @@ class InputPortMaterializationReaderThread( } // Flush any remaining tuples in the buffer. 
if (buffer.nonEmpty) flush() + + try { + val state_document = + DocumentFactory + .openDocument(State.stateUriFromResultUri(uri)) + ._1 + .asInstanceOf[VirtualDocument[Tuple]] + val stateReadIterator = state_document.get() + + while (stateReadIterator.hasNext) { + val state = State.deserialize(stateReadIterator.next()) + inputMessageQueue.put( + FIFOMessageElement(WorkflowFIFOMessage(channelId, getSequenceNumber, StateFrame(state))) + ) + } + } catch { + case _: Exception => + } + emitECM(METHOD_END_CHANNEL, PORT_ALIGNMENT) isFinished.set(true) } catch { diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/worker/promisehandlers/EndHandler.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/worker/promisehandlers/EndHandler.scala index 2a6a20b3d3e..0504e66f52b 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/worker/promisehandlers/EndHandler.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/worker/promisehandlers/EndHandler.scala @@ -48,8 +48,8 @@ trait EndHandler { s"Received EndHandler before all messages are processed. Unprocessed messages: " + s"${dp.inputManager.inputMessageQueue.peek()}" ) + return Future.exception(new IllegalStateException("worker still has unprocessed messages")) } - assert(dp.inputManager.inputMessageQueue.isEmpty) // Now we can safely acknowledge that this worker can be terminated. 
EmptyReturn() } diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/common/rpc/AsyncRPCClient.scala b/amber/src/main/scala/org/apache/texera/amber/engine/common/rpc/AsyncRPCClient.scala index 704ebd7f476..f7e26803b47 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/common/rpc/AsyncRPCClient.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/common/rpc/AsyncRPCClient.scala @@ -27,7 +27,10 @@ import org.apache.texera.amber.core.virtualidentity.{ EmbeddedControlMessageIdentity } import org.apache.texera.amber.engine.architecture.controller.ClientEvent -import org.apache.texera.amber.engine.architecture.messaginglayer.NetworkOutputGateway +import org.apache.texera.amber.engine.architecture.messaginglayer.{ + NetworkInputGateway, + NetworkOutputGateway +} import org.apache.texera.amber.engine.architecture.rpc.controlcommands._ import org.apache.texera.amber.engine.architecture.rpc.controllerservice.ControllerServiceFs2Grpc import org.apache.texera.amber.engine.architecture.rpc.controlreturns.{ @@ -125,7 +128,8 @@ object AsyncRPCClient { } class AsyncRPCClient( - outputGateway: NetworkOutputGateway, + val inputGateway: NetworkInputGateway, + val outputGateway: NetworkOutputGateway, val actorId: ActorVirtualIdentity ) extends AmberLogging { diff --git a/amber/src/main/scala/org/apache/texera/web/resource/dashboard/user/workflow/WorkflowExecutionsResource.scala b/amber/src/main/scala/org/apache/texera/web/resource/dashboard/user/workflow/WorkflowExecutionsResource.scala index 72fb1c364e5..92582afdd2b 100644 --- a/amber/src/main/scala/org/apache/texera/web/resource/dashboard/user/workflow/WorkflowExecutionsResource.scala +++ b/amber/src/main/scala/org/apache/texera/web/resource/dashboard/user/workflow/WorkflowExecutionsResource.scala @@ -247,6 +247,8 @@ object WorkflowExecutionsResource { OPERATOR_PORT_EXECUTIONS.RESULT_URI ) .values(eid.id.toInt, globalPortId.serializeAsString, uri.toString) + .onConflict() + .doNothing() 
.execute() } diff --git a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/executor/OperatorExecutor.scala b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/executor/OperatorExecutor.scala index f99739acc04..9837213abbb 100644 --- a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/executor/OperatorExecutor.scala +++ b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/executor/OperatorExecutor.scala @@ -29,13 +29,7 @@ trait OperatorExecutor { def produceStateOnStart(port: Int): Option[State] = None - def processState(state: State, port: Int): Option[State] = { - if (state.isPassToAllDownstream) { - Some(state) - } else { - None - } - } + def processState(state: State, port: Int): Option[State] = Some(state) def processTupleMultiPort( tuple: Tuple, diff --git a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/State.scala b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/State.scala index 3226c9d2fe7..f76a314b7ae 100644 --- a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/State.scala +++ b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/State.scala @@ -19,39 +19,70 @@ package org.apache.texera.amber.core.state +import com.fasterxml.jackson.databind.JsonNode import org.apache.texera.amber.core.tuple.{Attribute, AttributeType, Schema, Tuple} +import org.apache.texera.amber.util.JSONUtils.objectMapper -import scala.collection.mutable +import java.net.URI +import java.util.Base64 +import scala.jdk.CollectionConverters.IteratorHasAsScala -final case class State(tuple: Option[Tuple] = None, passToAllDownstream: Boolean = false) { - val data: mutable.Map[String, (AttributeType, Any)] = mutable.LinkedHashMap() - add("passToAllDownstream", passToAllDownstream, AttributeType.BOOLEAN) - if (tuple.isDefined) { - tuple.get.getSchema.getAttributes.foreach { attribute => - add(attribute.getName, 
tuple.get.getField(attribute.getName), attribute.getType) - } - } +object State { + private val StateContent = "content" + private val BytesTypeMarker = "__texera_type__" + private val BytesValue = "bytes" + private val PayloadMarker = "payload" - def add(key: String, value: Any, valueType: AttributeType): Unit = - data.put(key, (valueType, value)) + val schema: Schema = new Schema( + new Attribute(StateContent, AttributeType.STRING) + ) - def get(key: String): Any = data(key)._2 + def stateUriFromResultUri(resultUri: URI): URI = + new URI(resultUri.toString.replace("/result", "/state")) - def isPassToAllDownstream: Boolean = get("passToAllDownstream").asInstanceOf[Boolean] + def serialize(state: State): Tuple = { + val payloadJson = objectMapper.writeValueAsString(toJsonValue(state)) + Tuple.builder(schema).addSequentially(Array(payloadJson)).build() + } - def apply(key: String): Any = get(key) + def deserialize(tuple: Tuple): State = { + val payload = tuple.getField[String](StateContent) + objectMapper.readTree(payload).fields().asScala.map(entry => entry.getKey -> fromJsonValue(entry.getValue)).toMap + } - def toTuple: Tuple = - Tuple - .builder( - Schema(data.map { - case (name, (attrType, _)) => - new Attribute(name, attrType) - }.toList) - ) - .addSequentially(data.values.map(_._2).toArray) - .build() + private def toJsonValue(value: Any): Any = + value match { + case null => null + case bytes: Array[Byte] => + Map(BytesTypeMarker -> BytesValue, PayloadMarker -> Base64.getEncoder.encodeToString(bytes)) + case map: State => + map.iterator.map { case (k, v) => k -> toJsonValue(v) }.toMap + case iterable: Iterable[_] => + iterable.map(toJsonValue).toList + case other => other + } - override def toString: String = - data.map { case (key, (_, value)) => s"$key: $value" }.mkString(", ") + private def fromJsonValue(node: JsonNode): Any = { + if (node == null || node.isNull) { + null + } else if (node.isObject) { + val fields = node.fields().asScala.map(entry => 
entry.getKey -> entry.getValue).toMap + fields.get(BytesTypeMarker) match { + case Some(typeNode) if typeNode.isTextual && typeNode.asText() == BytesValue => + Base64.getDecoder.decode(fields(PayloadMarker).asText()) + case _ => + fields.view.mapValues(fromJsonValue).toMap + } + } else if (node.isArray) { + node.elements().asScala.map(fromJsonValue).toList + } else if (node.isBoolean) { + node.asBoolean() + } else if (node.isIntegralNumber) { + node.longValue() + } else if (node.isFloatingPointNumber) { + node.doubleValue() + } else { + node.asText() + } + } } diff --git a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/package.scala b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/package.scala new file mode 100644 index 00000000000..c110f9d814f --- /dev/null +++ b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/package.scala @@ -0,0 +1,24 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.texera.amber.core + +package object state { + type State = Map[String, Any] +} diff --git a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/DocumentFactory.scala b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/DocumentFactory.scala index 15949ef4717..ae37def667e 100644 --- a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/DocumentFactory.scala +++ b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/DocumentFactory.scala @@ -72,6 +72,7 @@ object DocumentFactory { case RESULT => StorageConfig.icebergTableResultNamespace case CONSOLE_MESSAGES => StorageConfig.icebergTableConsoleMessagesNamespace case RUNTIME_STATISTICS => StorageConfig.icebergTableRuntimeStatisticsNamespace + case STATE => "state" case _ => throw new IllegalArgumentException(s"Resource type $resourceType is not supported") } @@ -119,6 +120,7 @@ object DocumentFactory { case RESULT => StorageConfig.icebergTableResultNamespace case CONSOLE_MESSAGES => StorageConfig.icebergTableConsoleMessagesNamespace case RUNTIME_STATISTICS => StorageConfig.icebergTableRuntimeStatisticsNamespace + case STATE => "state" case _ => throw new IllegalArgumentException(s"Resource type $resourceType is not supported") } diff --git a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/VFSURIFactory.scala b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/VFSURIFactory.scala index 3513ac5ecd8..990776a69f0 100644 --- a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/VFSURIFactory.scala +++ b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/VFSURIFactory.scala @@ -34,6 +34,7 @@ object VFSResourceType extends Enumeration { val RESULT: Value = Value("result") val RUNTIME_STATISTICS: Value = Value("runtimeStatistics") val CONSOLE_MESSAGES: Value = Value("consoleMessages") + val STATE: Value = Value("state") } object 
VFSURIFactory { diff --git a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/result/iceberg/IcebergTableWriter.scala b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/result/iceberg/IcebergTableWriter.scala index 06d04e407f5..25b6df58001 100644 --- a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/result/iceberg/IcebergTableWriter.scala +++ b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/result/iceberg/IcebergTableWriter.scala @@ -29,6 +29,7 @@ import org.apache.iceberg.io.{DataWriter, OutputFile} import org.apache.iceberg.parquet.Parquet import org.apache.iceberg.{Schema, Table} +import java.nio.file.{Files, Path, Paths} import scala.collection.mutable.ArrayBuffer /** @@ -106,11 +107,13 @@ private[storage] class IcebergTableWriter[T]( private def flushBuffer(): Unit = { if (buffer.nonEmpty) { // Create a unique file path using the writer's identifier and the filename index - val location = table.location().stripSuffix("/") - val filepathString = s"$location/${writerIdentifier}_$filenameIdx" - // Increment the filename index by 1 - filenameIdx += 1 - val outputFile: OutputFile = table.io().newOutputFile(filepathString) + var filepath: Path = null + do { + filepath = Paths.get(table.location()).resolve(s"${writerIdentifier}_$filenameIdx") + filenameIdx += 1 + } while (Files.exists(filepath)) + + val outputFile: OutputFile = table.io().newOutputFile(filepath.toString) // Create a Parquet data writer to write a new file val dataWriter: DataWriter[Record] = Parquet .writeData(outputFile) diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/LogicalOp.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/LogicalOp.scala index d9b9cd9f100..c2e33bbb8ca 100644 --- a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/LogicalOp.scala +++ 
b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/LogicalOp.scala @@ -141,6 +141,7 @@ import org.apache.texera.amber.operator.visualization.waterfallChart.WaterfallCh import org.apache.texera.amber.operator.visualization.windRoseChart.WindRoseChartOpDesc import org.apache.texera.amber.operator.visualization.wordCloud.WordCloudOpDesc import org.apache.commons.lang3.builder.{EqualsBuilder, HashCodeBuilder, ToStringBuilder} +import org.apache.texera.amber.operator.loop.{LoopEndOpDesc, LoopStartOpDesc} import org.apache.texera.amber.operator.sklearn.testing.SklearnTestingOpDesc import org.apache.texera.amber.operator.source.scan.file.{FileScanOpDesc, FileScanSourceOpDesc} import org.apache.texera.amber.operator.visualization.stripChart.StripChartOpDesc @@ -215,6 +216,8 @@ trait StateTransferFunc new Type(value = classOf[TypeCastingOpDesc], name = "TypeCasting"), new Type(value = classOf[LimitOpDesc], name = "Limit"), new Type(value = classOf[SleepOpDesc], name = "Sleep"), + new Type(value = classOf[LoopStartOpDesc], name = "LoopStart"), + new Type(value = classOf[LoopEndOpDesc], name = "LoopEnd"), new Type(value = classOf[RandomKSamplingOpDesc], name = "RandomKSampling"), new Type(value = classOf[ReservoirSamplingOpDesc], name = "ReservoirSampling"), new Type(value = classOf[HashJoinOpDesc[String]], name = "HashJoin"), diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/ifStatement/IfOpExec.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/ifStatement/IfOpExec.scala index 462bdd0969a..d2becc79a5b 100644 --- a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/ifStatement/IfOpExec.scala +++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/ifStatement/IfOpExec.scala @@ -34,7 +34,7 @@ class IfOpExec(descString: String) extends OperatorExecutor { //It can accept any value that can be converted to a boolean. 
For example, Int 1 will be converted to true. override def processState(state: State, port: Int): Option[State] = { outputPort = - if (state.get(desc.conditionName).asInstanceOf[Boolean]) PortIdentity(1) else PortIdentity() + if (state(desc.conditionName).asInstanceOf[Boolean]) PortIdentity(1) else PortIdentity() Some(state) } diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/loop/LoopEndOpDesc.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/loop/LoopEndOpDesc.scala new file mode 100644 index 00000000000..cb911b3d369 --- /dev/null +++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/loop/LoopEndOpDesc.scala @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.texera.amber.operator.loop + +import com.fasterxml.jackson.annotation.JsonProperty +import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle +import org.apache.texera.amber.core.executor.OpExecWithCode +import org.apache.texera.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity} +import org.apache.texera.amber.core.workflow.{InputPort, OutputPort, PhysicalOp} +import org.apache.texera.amber.operator.LogicalOp +import org.apache.texera.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} + +class LoopEndOpDesc extends LogicalOp { + @JsonProperty(required = true, defaultValue = "i += 1") + @JsonSchemaTitle("Update") + var update: String = _ + + @JsonProperty(required = true, defaultValue = "i < len(table)") + @JsonSchemaTitle("Condition") + var condition: String = _ + + override def getPhysicalOp( + workflowId: WorkflowIdentity, + executionId: ExecutionIdentity + ): PhysicalOp = { + val pythonCode = + try { + generatePythonCode() + } catch { + case ex: Throwable => + s"#EXCEPTION DURING CODE GENERATION: ${ex.getMessage}" + } + PhysicalOp + .oneToOnePhysicalOp( + workflowId, + executionId, + operatorIdentifier, + OpExecWithCode(pythonCode, "python") + ) + .withInputPorts(operatorInfo.inputPorts) + .withOutputPorts(operatorInfo.outputPorts) + .withSuggestedWorkerNum(1) + .withParallelizable(false) + } + + override def operatorInfo: OperatorInfo = + OperatorInfo( + "Loop End", + "Loop End", + OperatorGroupConstants.CONTROL_GROUP, + inputPorts = List(InputPort()), + outputPorts = List(OutputPort()) + ) + + def generatePythonCode(): String = { + s""" + |from pytexera import * + |class ProcessLoopEndOperator(LoopEndOperator): + | @overrides + | def process_state(self, state: State, port: int) -> Optional[State]: + | loop_counter = int(state.get("loop_counter", 0)) + | if loop_counter > 0: + | state["loop_counter"] = loop_counter - 1 + | return state + | self.state = dict(state) + | from pickle import loads + | 
self.state["table"] = loads(self.state["table"]) + | exec("$update", {}, self.state) + | return None + | + | @overrides + | def condition(self) -> None: + | exec("output = $condition", {}, self.state) + | return self.state["output"] + |""".stripMargin + } +} diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/loop/LoopStartOpDesc.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/loop/LoopStartOpDesc.scala new file mode 100644 index 00000000000..baf1f4f4092 --- /dev/null +++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/loop/LoopStartOpDesc.scala @@ -0,0 +1,88 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.texera.amber.operator.loop + +import com.fasterxml.jackson.annotation.JsonProperty +import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle +import org.apache.texera.amber.core.executor.OpExecWithCode +import org.apache.texera.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity} +import org.apache.texera.amber.core.workflow.{InputPort, OutputPort, PhysicalOp} +import org.apache.texera.amber.operator.LogicalOp +import org.apache.texera.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} + +class LoopStartOpDesc extends LogicalOp { + @JsonProperty(required = true, defaultValue = "i = 0") + @JsonSchemaTitle("Initialization") + var initialization: String = _ + + @JsonProperty(required = true, defaultValue = "table.iloc[i]") + @JsonSchemaTitle("Output") + var output: String = _ + + override def getPhysicalOp( + workflowId: WorkflowIdentity, + executionId: ExecutionIdentity + ): PhysicalOp = { + val pythonCode = + try { + generatePythonCode() + } catch { + case ex: Throwable => + s"#EXCEPTION DURING CODE GENERATION: ${ex.getMessage}" + } + PhysicalOp + .oneToOnePhysicalOp( + workflowId, + executionId, + operatorIdentifier, + OpExecWithCode(pythonCode, "python") + ) + .withInputPorts(operatorInfo.inputPorts) + .withOutputPorts(operatorInfo.outputPorts) + .withSuggestedWorkerNum(1) + .withParallelizable(false) + } + + override def operatorInfo: OperatorInfo = + OperatorInfo( + "Loop Start", + "Loop Start", + OperatorGroupConstants.CONTROL_GROUP, + inputPorts = List(InputPort()), + outputPorts = List(OutputPort()) + ) + + def generatePythonCode(): String = { + s""" + |from pytexera import * + |class ProcessLoopStartOperator(LoopStartOperator): + | @overrides + | def open(self): + | self.state = {"loop_counter": 0} + | exec("$initialization", {}, self.state) + | + | @overrides + | def process_table(self, table: Table, port: int) -> Iterator[Optional[TableLike]]: + | self.state["table"] = table + | exec("output = 
$output", {}, self.state) + | yield self.state["output"] + |""".stripMargin + } +} diff --git a/frontend/src/assets/operator_images/LoopEnd.png b/frontend/src/assets/operator_images/LoopEnd.png new file mode 100644 index 0000000000000000000000000000000000000000..ee0f9ab6faccd328c102f214a7547d3e34d8359b GIT binary patch literal 5865 zcmVPy0qe(P?J^Czx6vRgiNy(U0Jp~gW+wJeo>FF6J zv)#V8`#arz@9BT?$35r!zVkc3^qfB5Mles!2yn63hL)Czs8n_X^S=Vv1(x+u03U(x zw}GeyL>8ERFmHopZ36HFlo}CsI${gOizOdGK38`k{#z@ z0LKG37R*P3=tuxoiu(3&tdALqidki^V-aG#x}8IXROixtAl!S_|ebFwduQ`ESz7 zbjWWA!Ukv4O65`j7wZ5mr#j}LP?(eVvu>^l%f;g1u&m1fd@ZLWO2h2?;QMz`x%|g8 zl*iSXOTu!ocp!+b29XmNfTo?#?1K3=DwqG9Ck%2$Se`Ls0yb^B0l<}c0y*Qh-j24m zSu}U<#*Et;n3o&Evg2F~=9?0Ai{@d6bl9le!$dcd>(0}mE{=}O6P9iJSP*>=z$tMS zYUV~lU>TUNAtNe;3TxWfHkYHj3yLDwV#I#U*Bvu(0mQ2kYVYyhm7M68OmOyH1TYUdIDwl7~3I>@WY#2Q?4@75X1sr2X zhye5BAi6jU(P5b&EIZC=0J;F|X-JY;1XL(v2f6Og)3h*k!m{HiPt~1if?#ylguvzG zx_8wyBf-I`5|(XyE{HBka6Lwp4g{F*AgLzA@=ZJefa|EJsK{hR*vTAn) zvr1r8XZ5{7q#gf6nX9h83WX`u)%BM|<&GgtD_}u%716^WQrV|@dR`3@0)bb+yaLQ` zpj=)>ij;LetjJs}?hm4aK{Oe_VPWFqp>>K(XBzV=FrTL@`b!&O*>R2s^P@S51$YU- zQ}F#~FmBwlG=KiPy4b+er|*m{TTX;!oebd906w8hcinpWViC?E*L}Wj84)LHA#CWm zQg?Z~M6@f7VN0Oy8(UkuY5x44Gz=I?Cr_U~5nH#aq)Bx@eO6ZN8fFKxDiCs}v}a75 zu;H8aiHwrl6>_cRw#T%;`Ts|C%qXH zCfrK1XKzk@12q}IGiUCAjT^58aD4{udArGVznJ9G6DMpY+^t0SgXji{KS@b~FDWZH zk}=N$^H(MHRF$2u8#htHW+0G7m8-mp%H>C^0@)-3xL7;~mi2uAs$6JVPhtTxi4iso z5qu&oM0(wxcVokb%jltpDk<8KN!5J*`33B>lZvqVW{L)?>#o?LiHQ(42)+hhPAQ(X zM|~kvSJxvcZbmLtb4SOy!A(gzw+Skq_L#Uq;K>lywwI)mDZsoGt*z6uM_=89)n4R+c?}AMbE&KA_w_8)%o!a4?&vrQmC7R^+D98s zbr$QWi?Ct&pck|)4p^x|>r59FImE**I!md%5|zs7v~b~@ za@%DjD+KtWiw;1cpzhLjIT@Cb>#892$WP6L4XOwN_e9nZXP`O;5Piu3M+uYzy z%ls>i$Ae31shU_5VZ$h_H#Nn#z6;Ab)tp&tBU}=rgEOt~KdI$IOvTTRAAdm9sIJ-w z>o`BqR!o@aG;yXu5>9Cha^-C^)lf%q~72=*S-=8>I zZ``p7Yuh*UW?tSTiiYoh$>d;(BAlr3Fh}fXi3%MmjQK|LyjdgZ7@M$;vqrQlHqrf5 zDw#g9qPw5GaBgqEPdFQQExGQ9kz=D0mK|pbfJMnKHT^Y 
zMPKzBD26Fh$#oYGoj(>~Ws6}j&mzyeF~)+-)D#h5+g5Ed5Pn*R{b*Koqh_IllnmX_TcsXlA7Gd8GCyKUP#v7*XDUU0(+ zYunF)=me3CV4g#sH&aBm5tOzqzWo0|1dvCo_Auu;Oz(-*2q$45}AkS$UK*6 zmjNaJQiJmKeQlwnp1t~Oa&1po+r9`yWg)Fklk5JwkZdD1Wdzu9{#&e32y>ACT^(h7uwg|uzS-Z)C6@fKtLWQ$zZF1f6 zV8Y5}7PPeNX>zr)Mz7@U33IlpahXP?$S!WN5%!?I-R4*32VR4%94T_&r0 z+mLo~v3NQxt6OAos%}9@*b8A<|0SZNrDYcrJ0OB=CWVHv1@DL$=llQBM_9*E318O> zD0!7!_b>suMr4K&V8?j{z@Y*r-rPr6+kOy4QxjLfd@p(4MTyHb0yB*O+x9{bjak8^ zt_3hJ>L=_=AgZfL!kCIpW3O=k>X`?T{}k@8&eZRfE`DLs;ZFHL?1=i%K+S&sMJ1Trbw{> zsCf}7Rs2{T39f}@-68;liP91PH7_EiqPEm6flSe2Q=Kp`+!C=%5_I;46G)W3 zyop>_$u>_DA|S;p^M-9gGU-^d+#A-MVRh_Z)9H{q!gG@pR>g2Kg~n&eLUL3G%~&99b$COJLi_g6M_Bu`tnEi9?zvfpXDW zCLT6VJjjKuvhlDc;~*C#$di$x83zgowr!Ksl_y4I+*ZLD>-8pUHshP0NAq-K&6>Id zd1OXLZI-V~U@E_4bnEkJmVD(GIp+ajYHQ__b#+>;r?!@zzhRSXiMBdJu9X|E(^v=Lkl0d8+U0?dD? zHZI2Rdy(q~9TvCi<{jS0J`17~VhA67D)SujyqO}hjnLE(VB4MzqDw^#V1AA~@5FxD z!wD-_r$Fi4jFy(&P2Vn&txZK}7(1{Izz!k?`2KZNF5fyjVW&;|7%G)lMU*R{Lktkp zV+3nKK(Lr#pZ$fxp|oJZYoii2JQhf~X_fdMluCP<-UnLvB&sCH-D+=F-#imV6on0u z2g@FIBnSvwEM5i6x=qB0KmlJ#o_B|sd?Q#30%Dm3{nq>b)l@EjKMG+d!Lt4$xaxJ- zamRgW_Uz4qI*e#R1bF7m9q`_JZ}nCU+FeYI@9#(D^6ILD4PJtdvlPG?V#b*Iv0~8b z5G+-laFFfY80wY`R)Y=F;W5xu0@PTs4?G{CN%rWo6 zvQDM0E_K*9k46NzqvI&}{*wT97FWx>9^=O!Ky&AAjH0V1!iH~IslHyr?PXpAqR)}* zz7b_HW=wnp*l`X9^RGZOqH8q1hG7yeA=jN3Rc_6M4N3bHfK#F>i8*d1DwWe|;lej# z$}!WcAQ1K@ctYHh^`RZh$aOzm%f{44SgBS?qg$>%_RcbI+FG`@wz0vR6~)Q=Oh5nL{*m9 zICD>%2$WjyS+}?NWOCi#)w76Lgbm)ZvULo`T9fgJ)wAQ_9Bm$~p*F@$^N&r~@My3E zMEYX@n3tlpb-D?diO$?0kY(%E`C4KFhML9vICw4S@MVZ&u6b746A zVWz-c0KS>l<^VXITsO9>H!;G7q`eHlohgE{625+>ast=4dB5}fo zZ(7+R98uP$>K*_$W5R@6o2kM~P?2Th#%lpwFI+bu$}Wsp3?s_K(ZWvu@e)4RuqJ(%1h8UwOO=g~Defj$~ql4M?kU!H9mTDSkF8 z-#3GK0W8bSbNTw9OuO$pAetstOgJi{gU6}(R_4!F%gw4sw3;JI*zi1iJeZ#V(Ju8G zDLC(G06)Q|O^fJ(2mT?bClS$MYoVzCz5?Kri3m+n2=hB2QrB0{Cn7BXQ5~6jL?%9%S$9hqCjpRS?!#EGZ;7SS)FIz@JxClEbC+dD!BO*Q8zQSaq1XAbdKbkwSR5e2s`LZ`*4On z#G>x;t6^K8SHQdi%x|DvUKQuoA^55<7WW6y!62Fp;4ly!3E%&v|=H`>}R 
zNcx_&n%Yzd+iw)xcKUAIY6zL%ha-@%yR{aGQzdNpF_w0~Of7hsu|Q&hEcJkyI$=Z7 zs?bDLnkbEU+Inzn#yKjUwIhiT!8qV$g0LZJ_XP92R5BOhAlM9Z`|i*$jvIWP1XP(J z?7+D;y&SP5@N7I8F>|nz1|)NY4d1rPGxo#WUwtLLTRP>bD=VUJNqeqpT~a0qJMgyE zU96*4^cr=1okcEmz>#Uf25;NGvPj?4Sj$TX?pf8b49wTmT0ThUVlq$Ifw!&WT-@t9 zx+y2I2s-yLzOKGlfE#P6A{bw3eG+m**x+s3=OMd6TT$iuFwNVn+tJoGOSi|WUXXG{ z*g-c52ZHEoO+6Rt#WvgXm|Za67NtykmKKsr!VV(t;jk>_DKq_QqqFKh`2JnGqN}2t z6>Vg$2|GaCzUFr-r%6=@H_ylj%tN6thZZb&EzbZo1YrlgarXexbP#m};d|y;3+65` z&mXOKYu+GgD8dc~pB-lkfUh@c2xB*Zd&zYd=Z%on>}^=W4tncO0x%UsUv6m6net3A z{}jL?DwkJR+xWcx-tdIo?yalNx?8uN4I-7)cS2q_ZNP5k=fK>J*4D@Jem0G|7fnUj zAx8!k9rjreoeJPnIXUyHl$_s!`AHD{It$TLQHhq+xTYoSPyoX&D#rsj7R*P734vBp zK-VG14-y}VmIF|oR-QLqbTkywsAHotF@-xjEv#PsaR8HHS%-jGm8(=`m-bE6J*?cK zz+1f^h^s)P%7wlTUk`V)0_`k4SDLq_~R%4M5l?=V1$00000NkvXXu0mjf{#zRE literal 0 HcmV?d00001 diff --git a/frontend/src/assets/operator_images/LoopStart.png b/frontend/src/assets/operator_images/LoopStart.png new file mode 100644 index 0000000000000000000000000000000000000000..7e5be023cdf6b64dd1bc140e5d75980455afeefb GIT binary patch literal 2138 zcmV-g2&MOlP)Px-6iGxuRCr$Pok@1vFbsy}TQ#e+JKw6wt+cDNN^a$bl4voq27m;>LBHSYqAkVY z$0rb+#7kG<@A`TD-M{p=3agGPM=EfMd@k!_*RSip?%sX$Fa2BAilA^a1?mO0>wb6l z?j5k2v68i*x_G;KNw`$Lu656Tc9Eo zuwW)j8SQ}zShIknU1*!20#9EcQB z6cz^>1a(cxTka+6Y@h~k>Vdf?xiG9J0X0-|fOc49G29qr3Tieum%s|bdO}cVHSgWK zRd`?mYB7L@zzV^7YEZqNBe1F-Ru$AD0sZM+llKS!)Id<7mwzo{Qv-F_Gg`;Ajf=V5 zcEPBP*Z_3}D+OT1L9Nzsk<(YzwwE$XlA1;kSW!?b@}lb|h!7T_9?%@2san@xK`cu5 z4NPGHDnVMNrYfLLkh*xZogYJ3fJ!!d(!e#T>u=Ub0F_K1J7MHlAR1IaI-00i9oPgi z9*BT73e;we!|nF0U1^@ zs7;RU1&&!a`tnAT$9|9)ZqY&gy58B9DER@RKu1jgs7DGB{-0nA5~$U$I4!jz2t$w; zqkgJLlynDUghNo9^B97VVtR@~AU#e0tKZ~t7t{tJ75|b-a~jdPl25tgMD{_gXRF?k zXh|dcz09jVg*qRav^@^AC+J29unB4*p2DAq9F9D=pW8RejG4((AV;*f{{Mr`-7oK{ z+EsgPgPL!YKX^Y6rKgkdgTv61*9U7K6zNnkYM3epFlN!gQDWcT~H&}?8PnHV1_x_cm~zVnY^)I>;EKn0{3-V~!vN)Oxq z1Xa7+7l8B!s6zQ=uyYedn!q}B{|`_Nx_vRvhYQil;ar^?Yf$$#9B^gF6_Z@Wbki%x 
ztAm4^J*sB6k1kr5l+3wnhXBsW4WL>Slvzs@udINoSG{+mM^%k8;7yegDI0JR1JDygFuPXM(N0BWhGCaCVhT#aw=V=z?de(toRjZVSEj!2b;lCwiOAI&NI40$xA|c}btX{twu`E7^XstQe;!h`;?wV2 z%Jo4-_kRvhHM@QHbw`UcQ|SIrz^bA*?VPi-I)#csRl~5VdQ*UDgxEX`@Jqdlp9+)We4}*#tzgGR)hetLR`( zU|H*=iJ9@^;87n?j-wUVwYy&yI3H%)=vntJ?akv=w?Xq4_}$+EM50(-LrH< z#{4!zlKpGYwa)zOG_vem6=HDBqaRts+|g(kR99Km4F_4&$W*8>sEl|tQ-hhs9_Vun z>gTnz5P+#bA;|?nZ4OexEW{zGgI7fekWRZ0s4d!pa$Jc7Dqv0LRAQSWb7U`2X@e|? zo0Q0)PVs8qDJuir)QU*(sEv*6XNeh_us{W@2^ces5|+~3$lf=OsRtia46fp6Wg4dG za4SL!HmH+;m@# zDsgnbrDHOvO~+D)^f3}JKy6IZ+tl4B?Q;+TwJ~8C^f8h$L2X1+(!N)UiejKPA}mF- zxHJ)4oya1?$hF7Y9dSk14)CI8Pk&?3lD+Fpmu$lmB4(g-(a!au8T+XF? z&3-<|(l-l&nh{o0IL%^I3J3n{a?L|445|^V{+yco^kdRID20W+G-h2trJ@70V|d`a zt^4IY5lN{uHH`wO6RauwE@t@doj)}B__y3}7UfZ?b`FmA)(Wr6Db&ar0hK8vsx4#E z73EQ58h~eJ1sL0ssI2 literal 0 HcmV?d00001 From cce8da28cd4c0742422e14a3e1dc08b787b0c8ad Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Mon, 20 Apr 2026 02:05:57 -0700 Subject: [PATCH 018/152] fix fmt --- .../runnables/input_port_materialization_reader_runnable.py | 1 - 1 file changed, 1 deletion(-) diff --git a/amber/src/main/python/core/storage/runnables/input_port_materialization_reader_runnable.py b/amber/src/main/python/core/storage/runnables/input_port_materialization_reader_runnable.py index 493ecf0a413..a600f878572 100644 --- a/amber/src/main/python/core/storage/runnables/input_port_materialization_reader_runnable.py +++ b/amber/src/main/python/core/storage/runnables/input_port_materialization_reader_runnable.py @@ -172,7 +172,6 @@ def run(self) -> None: tup.cast_to_schema(self.tuple_schema) for data_frame in self.tuple_to_batch_with_filter(tup): self.emit_payload(data_frame) - self.emit_ecm("EndChannel", EmbeddedControlMessageType.PORT_ALIGNMENT) self._finished = True except Exception as err: From 2540c8a8fc7c6d14da4b0f9d5eb17d88e587f0ec Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Mon, 20 Apr 2026 13:06:31 -0700 Subject: [PATCH 019/152] test: add multiple state processing regression --- 
.../python/core/runnables/test_main_loop.py | 59 +++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/amber/src/main/python/core/runnables/test_main_loop.py b/amber/src/main/python/core/runnables/test_main_loop.py index 5ad0afec9bc..3ff03672082 100644 --- a/amber/src/main/python/core/runnables/test_main_loop.py +++ b/amber/src/main/python/core/runnables/test_main_loop.py @@ -25,6 +25,8 @@ from core.models import ( DataFrame, InternalQueue, + State, + StateFrame, Tuple, ) from core.models.internal_queue import ( @@ -1077,6 +1079,63 @@ def send_resume( ), ) + @pytest.mark.timeout(2) + def test_process_state_can_emit_multiple_states( + self, + main_loop, + output_queue, + mock_data_output_channel, + monkeypatch, + ): + class DummyExecutor: + @staticmethod + def process_state(state: State, port: int) -> State: + return {"value": state["value"] + 1, "port": port} + + main_loop.context.executor_manager.executor = DummyExecutor() + monkeypatch.setattr(main_loop, "_check_and_process_control", lambda: None) + monkeypatch.setattr( + main_loop.context.output_manager, + "emit_state", + lambda state: [(mock_data_output_channel.to_worker_id, StateFrame(state))], + ) + + switch_count = {"value": 0} + + def fake_switch_context(): + switch_count["value"] += 1 + # xinyuan-state-only still uses the original two-switch state handshake: + # the DataProcessor produces output during the first switch of each + # process_input_state() call, before MainLoop reads current_output_state. 
+ if switch_count["value"] % 2 == 1: + current_input_state = ( + main_loop.context.state_processing_manager.current_input_state + ) + main_loop.context.state_processing_manager.current_output_state = ( + DummyExecutor.process_state(current_input_state, 0) + ) + + monkeypatch.setattr(main_loop, "_switch_context", fake_switch_context) + + first_state = {"value": 1} + second_state = {"value": 41} + + main_loop._process_state(first_state) + main_loop._process_state(second_state) + + first_output: DataElement = output_queue.get() + second_output: DataElement = output_queue.get() + + assert first_output.tag == mock_data_output_channel + assert isinstance(first_output.payload, StateFrame) + assert first_output.payload.frame["value"] == 2 + assert first_output.payload.frame["port"] == 0 + + assert second_output.tag == mock_data_output_channel + assert isinstance(second_output.payload, StateFrame) + assert second_output.payload.frame["value"] == 42 + assert second_output.payload.frame["port"] == 0 + @pytest.mark.timeout(5) def test_main_loop_thread_can_align_ecm( self, From e0bb804567a9f4d77015f1afcf0c9e2569b0d05c Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Mon, 20 Apr 2026 13:57:50 -0700 Subject: [PATCH 020/152] fix fmt --- .../controller/promisehandlers/NextIterationHandler.scala | 2 +- .../scheduling/WorkflowExecutionCoordinator.scala | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/NextIterationHandler.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/NextIterationHandler.scala index c8a8ad48659..70b73a88480 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/NextIterationHandler.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/NextIterationHandler.scala @@ -41,7 +41,7 @@ trait NextIterationHandler 
{ msg: NextIterationRequest, ctx: AsyncRPCContext ): Future[EmptyReturn] = { - cp.workflowExecutionCoordinator.loopBack(msg.loopStartId) + cp.workflowExecutionCoordinator.goto(msg.loopStartId) EmptyReturn() } } diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala index 1ee471dd1b5..dcab3582907 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala @@ -135,8 +135,8 @@ class WorkflowExecutionCoordinator( regionExecutionCoordinators.values.exists(!_.isCompleted) } - def loopBack(loopStartId: OperatorIdentity): Unit = { - workflowScheduler.schedule.loopBack(loopStartId) + def goto(OpId: OperatorIdentity): Unit = { + workflowScheduler.schedule.loopBack(OpId) } } From 151eb1521db1a077b2dcd34eadfa7e34e3125b22 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Mon, 20 Apr 2026 13:58:21 -0700 Subject: [PATCH 021/152] fix fmt --- .../amber/engine/architecture/scheduling/Schedule.scala | 4 ++-- .../scheduling/WorkflowExecutionCoordinator.scala | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala index 4d0828acc04..6b1837feb46 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala @@ -34,11 +34,11 @@ case class Schedule(private val levelSets: Map[Int, Set[Region]]) extends Iterat regions } - def loopBack(loopStartId: OperatorIdentity): Unit = + def goto(opId: 
OperatorIdentity): Unit = levelSets .collectFirst { case (level, regions) - if regions.exists(_.getOperators.exists(_.id.logicalOpId == loopStartId)) => + if regions.exists(_.getOperators.exists(_.id.logicalOpId == opId)) => level } .foreach(currentLevel = _) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala index dcab3582907..b7cc6af5e3c 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala @@ -136,7 +136,7 @@ class WorkflowExecutionCoordinator( } def goto(OpId: OperatorIdentity): Unit = { - workflowScheduler.schedule.loopBack(OpId) + workflowScheduler.schedule.goto(OpId) } } From 69bb7857a1e0fd525dcefe0d125696a0f13ae56d Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Tue, 21 Apr 2026 01:03:54 -0700 Subject: [PATCH 022/152] update --- .../architecture/rpc/controlcommands.proto | 6 ++-- .../architecture/rpc/controllerservice.proto | 2 +- .../main/python/core/runnables/main_loop.py | 10 +++---- .../amber/engine/architecture/rpc/__init__.py | 30 +++++++++---------- ...ControllerAsyncRPCHandlerInitializer.scala | 2 +- ...dler.scala => JumpToOperatorHandler.scala} | 18 ++++------- .../architecture/scheduling/Schedule.scala | 2 +- .../WorkflowExecutionCoordinator.scala | 4 +-- 8 files changed, 34 insertions(+), 40 deletions(-) rename amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/{NextIterationHandler.scala => JumpToOperatorHandler.scala} (75%) diff --git a/amber/src/main/protobuf/org/apache/texera/amber/engine/architecture/rpc/controlcommands.proto b/amber/src/main/protobuf/org/apache/texera/amber/engine/architecture/rpc/controlcommands.proto index 
f5798af36d9..d8d8b512d05 100644 --- a/amber/src/main/protobuf/org/apache/texera/amber/engine/architecture/rpc/controlcommands.proto +++ b/amber/src/main/protobuf/org/apache/texera/amber/engine/architecture/rpc/controlcommands.proto @@ -46,7 +46,7 @@ message ControlRequest { PortCompletedRequest portCompletedRequest = 9; WorkerStateUpdatedRequest workerStateUpdatedRequest = 10; LinkWorkersRequest linkWorkersRequest = 11; - NextIterationRequest nextIterationRequest = 12; + JumpToOperatorRequest jumpToOperatorRequest = 12; // request for worker AddInputChannelRequest addInputChannelRequest = 50; @@ -281,6 +281,6 @@ message QueryStatisticsRequest{ StatisticsUpdateTarget updateTarget = 2; } -message NextIterationRequest{ - core.OperatorIdentity LoopStartId = 1 [(scalapb.field).no_box = true]; +message JumpToOperatorRequest{ + core.OperatorIdentity targetOperatorId = 1 [(scalapb.field).no_box = true]; } diff --git a/amber/src/main/protobuf/org/apache/texera/amber/engine/architecture/rpc/controllerservice.proto b/amber/src/main/protobuf/org/apache/texera/amber/engine/architecture/rpc/controllerservice.proto index 734966e1761..25c90e3e936 100644 --- a/amber/src/main/protobuf/org/apache/texera/amber/engine/architecture/rpc/controllerservice.proto +++ b/amber/src/main/protobuf/org/apache/texera/amber/engine/architecture/rpc/controllerservice.proto @@ -42,7 +42,7 @@ service ControllerService { rpc PauseWorkflow(EmptyRequest) returns (EmptyReturn); rpc WorkerStateUpdated(WorkerStateUpdatedRequest) returns (EmptyReturn); rpc WorkerExecutionCompleted(EmptyRequest) returns (EmptyReturn); - rpc NextIteration(NextIterationRequest) returns (EmptyReturn); + rpc JumpToOperator(JumpToOperatorRequest) returns (EmptyReturn); rpc LinkWorkers(LinkWorkersRequest) returns (EmptyReturn); rpc ControllerInitiateQueryStatistics(QueryStatisticsRequest) returns (EmptyReturn); rpc RetryWorkflow(RetryWorkflowRequest) returns (EmptyReturn); diff --git 
a/amber/src/main/python/core/runnables/main_loop.py b/amber/src/main/python/core/runnables/main_loop.py index 6e9cff9c113..c4953f1d961 100644 --- a/amber/src/main/python/core/runnables/main_loop.py +++ b/amber/src/main/python/core/runnables/main_loop.py @@ -69,7 +69,7 @@ EmbeddedControlMessage, AsyncRpcContext, ControlRequest, - NextIterationRequest, + JumpToOperatorRequest, ) from proto.org.apache.texera.amber.engine.architecture.worker import ( WorkerState, @@ -106,11 +106,11 @@ def _attach_loop_start_id(self, output_state: State) -> None: self.context.input_manager.get_input_state_result_uri() ) - def _next_iteration( + def _jump_to_loop_start( self, executor: LoopEndOperator, controller_interface ) -> None: - controller_interface.next_iteration( - NextIterationRequest(OperatorIdentity(executor.loop_start_id())) + controller_interface.jump_to_operator( + JumpToOperatorRequest(OperatorIdentity(executor.loop_start_id())) ) uri = executor.state["LoopStartStateURI"] del executor.state["LoopStartStateURI"] @@ -129,7 +129,7 @@ def complete(self) -> None: controller_interface = self._async_rpc_client.controller_stub() executor = self.context.executor_manager.executor if isinstance(executor, LoopEndOperator) and executor.condition(): - self._next_iteration(executor, controller_interface) + self._jump_to_loop_start(executor, controller_interface) executor.close() # stop the data processing thread self.data_processor.stop() diff --git a/amber/src/main/python/proto/org/apache/texera/amber/engine/architecture/rpc/__init__.py b/amber/src/main/python/proto/org/apache/texera/amber/engine/architecture/rpc/__init__.py index 910a583a438..49438d3030f 100644 --- a/amber/src/main/python/proto/org/apache/texera/amber/engine/architecture/rpc/__init__.py +++ b/amber/src/main/python/proto/org/apache/texera/amber/engine/architecture/rpc/__init__.py @@ -101,7 +101,7 @@ class ControlRequest(betterproto.Message): link_workers_request: "LinkWorkersRequest" = betterproto.message_field( 11, 
group="sealed_value" ) - next_iteration_request: "NextIterationRequest" = betterproto.message_field( + jump_to_operator_request: "JumpToOperatorRequest" = betterproto.message_field( 12, group="sealed_value" ) add_input_channel_request: "AddInputChannelRequest" = betterproto.message_field( @@ -397,8 +397,8 @@ class QueryStatisticsRequest(betterproto.Message): @dataclass(eq=False, repr=False) -class NextIterationRequest(betterproto.Message): - loop_start_id: "___core__.OperatorIdentity" = betterproto.message_field(1) +class JumpToOperatorRequest(betterproto.Message): + target_operator_id: "___core__.OperatorIdentity" = betterproto.message_field(1) @dataclass(eq=False, repr=False) @@ -736,17 +736,17 @@ async def worker_execution_completed( metadata=metadata, ) - async def next_iteration( + async def jump_to_operator( self, - next_iteration_request: "NextIterationRequest", + jump_to_operator_request: "JumpToOperatorRequest", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None ) -> "EmptyReturn": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/NextIteration", - next_iteration_request, + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/JumpToOperator", + jump_to_operator_request, EmptyReturn, timeout=timeout, deadline=deadline, @@ -1378,8 +1378,8 @@ async def worker_execution_completed( ) -> "EmptyReturn": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def next_iteration( - self, next_iteration_request: "NextIterationRequest" + async def jump_to_operator( + self, jump_to_operator_request: "JumpToOperatorRequest" ) -> "EmptyReturn": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) @@ -1487,11 +1487,11 @@ async def __rpc_worker_execution_completed( response = await self.worker_execution_completed(request) await stream.send_message(response) - async def __rpc_next_iteration( - self, stream: 
"grpclib.server.Stream[NextIterationRequest, EmptyReturn]" + async def __rpc_jump_to_operator( + self, stream: "grpclib.server.Stream[JumpToOperatorRequest, EmptyReturn]" ) -> None: request = await stream.recv_message() - response = await self.next_iteration(request) + response = await self.jump_to_operator(request) await stream.send_message(response) async def __rpc_link_workers( @@ -1589,10 +1589,10 @@ def __mapping__(self) -> Dict[str, grpclib.const.Handler]: EmptyRequest, EmptyReturn, ), - "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/NextIteration": grpclib.const.Handler( - self.__rpc_next_iteration, + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/JumpToOperator": grpclib.const.Handler( + self.__rpc_jump_to_operator, grpclib.const.Cardinality.UNARY_UNARY, - NextIterationRequest, + JumpToOperatorRequest, EmptyReturn, ), "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/LinkWorkers": grpclib.const.Handler( diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerAsyncRPCHandlerInitializer.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerAsyncRPCHandlerInitializer.scala index ea170b16ffe..03a4d479683 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerAsyncRPCHandlerInitializer.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerAsyncRPCHandlerInitializer.scala @@ -34,7 +34,7 @@ class ControllerAsyncRPCHandlerInitializer( with AmberLogging with LinkWorkersHandler with WorkerExecutionCompletedHandler - with NextIterationHandler + with JumpToOperatorHandler with WorkerStateUpdatedHandler with PauseHandler with QueryWorkerStatisticsHandler diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/NextIterationHandler.scala 
b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/JumpToOperatorHandler.scala similarity index 75% rename from amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/NextIterationHandler.scala rename to amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/JumpToOperatorHandler.scala index 70b73a88480..aad72f08e90 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/NextIterationHandler.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/JumpToOperatorHandler.scala @@ -23,25 +23,19 @@ import com.twitter.util.Future import org.apache.texera.amber.engine.architecture.controller.ControllerAsyncRPCHandlerInitializer import org.apache.texera.amber.engine.architecture.rpc.controlcommands.{ AsyncRPCContext, - NextIterationRequest + JumpToOperatorRequest } import org.apache.texera.amber.engine.architecture.rpc.controlreturns.EmptyReturn -/** indicate a worker has completed its execution - * i.e. received and processed all data from upstreams - * note that this doesn't mean all the output of this worker - * has been received by the downstream workers. - * - * possible sender: worker - */ -trait NextIterationHandler { +/** Requests the scheduler to continue from the region containing the target operator. 
*/ +trait JumpToOperatorHandler { this: ControllerAsyncRPCHandlerInitializer => - override def nextIteration( - msg: NextIterationRequest, + override def jumpToOperator( + msg: JumpToOperatorRequest, ctx: AsyncRPCContext ): Future[EmptyReturn] = { - cp.workflowExecutionCoordinator.goto(msg.loopStartId) + cp.workflowExecutionCoordinator.jumpToOperator(msg.targetOperatorId) EmptyReturn() } } diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala index 6b1837feb46..fa8121cad5f 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala @@ -34,7 +34,7 @@ case class Schedule(private val levelSets: Map[Int, Set[Region]]) extends Iterat regions } - def goto(opId: OperatorIdentity): Unit = + def jumpToOperator(opId: OperatorIdentity): Unit = levelSets .collectFirst { case (level, regions) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala index b7cc6af5e3c..9e3e50beb66 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala @@ -135,8 +135,8 @@ class WorkflowExecutionCoordinator( regionExecutionCoordinators.values.exists(!_.isCompleted) } - def goto(OpId: OperatorIdentity): Unit = { - workflowScheduler.schedule.goto(OpId) + def jumpToOperator(opId: OperatorIdentity): Unit = { + workflowScheduler.schedule.jumpToOperator(opId) } } From 555e55df5483a103e4282b503e33ee3b0dcc1bb2 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Tue, 21 
Apr 2026 01:20:16 -0700 Subject: [PATCH 023/152] Add scheduler jump-to-operator support --- .../architecture/rpc/controlcommands.proto | 7 ++- .../architecture/rpc/controllerservice.proto | 3 +- .../amber/engine/architecture/rpc/__init__.py | 43 +++++++++++++++++++ ...ControllerAsyncRPCHandlerInitializer.scala | 1 + .../controller/ControllerProcessor.scala | 2 +- .../controller/WorkflowScheduler.scala | 3 ++ .../JumpToOperatorHandler.scala | 41 ++++++++++++++++++ .../architecture/scheduling/Schedule.scala | 11 +++++ .../WorkflowExecutionCoordinator.scala | 11 +++-- 9 files changed, 116 insertions(+), 6 deletions(-) create mode 100644 amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/JumpToOperatorHandler.scala diff --git a/amber/src/main/protobuf/org/apache/texera/amber/engine/architecture/rpc/controlcommands.proto b/amber/src/main/protobuf/org/apache/texera/amber/engine/architecture/rpc/controlcommands.proto index d714f64a154..d8d8b512d05 100644 --- a/amber/src/main/protobuf/org/apache/texera/amber/engine/architecture/rpc/controlcommands.proto +++ b/amber/src/main/protobuf/org/apache/texera/amber/engine/architecture/rpc/controlcommands.proto @@ -46,6 +46,7 @@ message ControlRequest { PortCompletedRequest portCompletedRequest = 9; WorkerStateUpdatedRequest workerStateUpdatedRequest = 10; LinkWorkersRequest linkWorkersRequest = 11; + JumpToOperatorRequest jumpToOperatorRequest = 12; // request for worker AddInputChannelRequest addInputChannelRequest = 50; @@ -278,4 +279,8 @@ enum StatisticsUpdateTarget { message QueryStatisticsRequest{ repeated core.ActorVirtualIdentity filterByWorkers = 1; StatisticsUpdateTarget updateTarget = 2; -} \ No newline at end of file +} + +message JumpToOperatorRequest{ + core.OperatorIdentity targetOperatorId = 1 [(scalapb.field).no_box = true]; +} diff --git a/amber/src/main/protobuf/org/apache/texera/amber/engine/architecture/rpc/controllerservice.proto 
b/amber/src/main/protobuf/org/apache/texera/amber/engine/architecture/rpc/controllerservice.proto index 70d189a3411..25c90e3e936 100644 --- a/amber/src/main/protobuf/org/apache/texera/amber/engine/architecture/rpc/controllerservice.proto +++ b/amber/src/main/protobuf/org/apache/texera/amber/engine/architecture/rpc/controllerservice.proto @@ -42,7 +42,8 @@ service ControllerService { rpc PauseWorkflow(EmptyRequest) returns (EmptyReturn); rpc WorkerStateUpdated(WorkerStateUpdatedRequest) returns (EmptyReturn); rpc WorkerExecutionCompleted(EmptyRequest) returns (EmptyReturn); + rpc JumpToOperator(JumpToOperatorRequest) returns (EmptyReturn); rpc LinkWorkers(LinkWorkersRequest) returns (EmptyReturn); rpc ControllerInitiateQueryStatistics(QueryStatisticsRequest) returns (EmptyReturn); rpc RetryWorkflow(RetryWorkflowRequest) returns (EmptyReturn); -} \ No newline at end of file +} diff --git a/amber/src/main/python/proto/org/apache/texera/amber/engine/architecture/rpc/__init__.py b/amber/src/main/python/proto/org/apache/texera/amber/engine/architecture/rpc/__init__.py index b7522a696ae..f946e7123e9 100644 --- a/amber/src/main/python/proto/org/apache/texera/amber/engine/architecture/rpc/__init__.py +++ b/amber/src/main/python/proto/org/apache/texera/amber/engine/architecture/rpc/__init__.py @@ -102,6 +102,9 @@ class ControlRequest(betterproto.Message): link_workers_request: "LinkWorkersRequest" = betterproto.message_field( 11, group="sealed_value" ) + jump_to_operator_request: "JumpToOperatorRequest" = betterproto.message_field( + 12, group="sealed_value" + ) add_input_channel_request: "AddInputChannelRequest" = betterproto.message_field( 50, group="sealed_value" ) @@ -394,6 +397,11 @@ class QueryStatisticsRequest(betterproto.Message): update_target: "StatisticsUpdateTarget" = betterproto.enum_field(2) +@dataclass(eq=False, repr=False) +class JumpToOperatorRequest(betterproto.Message): + target_operator_id: "___core__.OperatorIdentity" = betterproto.message_field(1) + + 
@dataclass(eq=False, repr=False) class ControlReturn(betterproto.Message): """The generic return message""" @@ -1243,6 +1251,23 @@ async def worker_execution_completed( metadata=metadata, ) + async def jump_to_operator( + self, + jump_to_operator_request: "JumpToOperatorRequest", + *, + timeout: Optional[float] = None, + deadline: Optional["Deadline"] = None, + metadata: Optional["MetadataLike"] = None + ) -> "EmptyReturn": + return await self._unary_unary( + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/JumpToOperator", + jump_to_operator_request, + EmptyReturn, + timeout=timeout, + deadline=deadline, + metadata=metadata, + ) + async def link_workers( self, link_workers_request: "LinkWorkersRequest", @@ -1880,6 +1905,11 @@ async def worker_execution_completed( ) -> "EmptyReturn": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) + async def jump_to_operator( + self, jump_to_operator_request: "JumpToOperatorRequest" + ) -> "EmptyReturn": + raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) + async def link_workers( self, link_workers_request: "LinkWorkersRequest" ) -> "EmptyReturn": @@ -1984,6 +2014,13 @@ async def __rpc_worker_execution_completed( response = await self.worker_execution_completed(request) await stream.send_message(response) + async def __rpc_jump_to_operator( + self, stream: "grpclib.server.Stream[JumpToOperatorRequest, EmptyReturn]" + ) -> None: + request = await stream.recv_message() + response = await self.jump_to_operator(request) + await stream.send_message(response) + async def __rpc_link_workers( self, stream: "grpclib.server.Stream[LinkWorkersRequest, EmptyReturn]" ) -> None: @@ -2079,6 +2116,12 @@ def __mapping__(self) -> Dict[str, grpclib.const.Handler]: EmptyRequest, EmptyReturn, ), + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/JumpToOperator": grpclib.const.Handler( + self.__rpc_jump_to_operator, + grpclib.const.Cardinality.UNARY_UNARY, + JumpToOperatorRequest, + 
EmptyReturn, + ), "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/LinkWorkers": grpclib.const.Handler( self.__rpc_link_workers, grpclib.const.Cardinality.UNARY_UNARY, diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerAsyncRPCHandlerInitializer.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerAsyncRPCHandlerInitializer.scala index 4d9a36bab43..03a4d479683 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerAsyncRPCHandlerInitializer.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerAsyncRPCHandlerInitializer.scala @@ -34,6 +34,7 @@ class ControllerAsyncRPCHandlerInitializer( with AmberLogging with LinkWorkersHandler with WorkerExecutionCompletedHandler + with JumpToOperatorHandler with WorkerStateUpdatedHandler with PauseHandler with QueryWorkerStatisticsHandler diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerProcessor.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerProcessor.scala index 7a8e94cf3a7..3461619cb36 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerProcessor.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerProcessor.scala @@ -44,7 +44,7 @@ class ControllerProcessor( val workflowScheduler: WorkflowScheduler = new WorkflowScheduler(workflowContext, actorId) val workflowExecutionCoordinator: WorkflowExecutionCoordinator = new WorkflowExecutionCoordinator( - () => this.workflowScheduler.getNextRegions, + workflowScheduler, workflowExecution, controllerConfig, asyncRPCClient diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/WorkflowScheduler.scala 
b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/WorkflowScheduler.scala index 9dcf3ad4bfc..c8a107e0451 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/WorkflowScheduler.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/WorkflowScheduler.scala @@ -20,6 +20,7 @@ package org.apache.texera.amber.engine.architecture.controller import org.apache.texera.amber.core.virtualidentity.ActorVirtualIdentity +import org.apache.texera.amber.core.virtualidentity.OperatorIdentity import org.apache.texera.amber.core.workflow.{PhysicalPlan, WorkflowContext} import org.apache.texera.amber.engine.architecture.scheduling.{ CostBasedScheduleGenerator, @@ -54,4 +55,6 @@ class WorkflowScheduler( def getNextRegions: Set[Region] = if (!schedule.hasNext) Set() else schedule.next() + def jumpToOperator(opId: OperatorIdentity): Unit = schedule.jumpToOperator(opId) + } diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/JumpToOperatorHandler.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/JumpToOperatorHandler.scala new file mode 100644 index 00000000000..aad72f08e90 --- /dev/null +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/JumpToOperatorHandler.scala @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.texera.amber.engine.architecture.controller.promisehandlers + +import com.twitter.util.Future +import org.apache.texera.amber.engine.architecture.controller.ControllerAsyncRPCHandlerInitializer +import org.apache.texera.amber.engine.architecture.rpc.controlcommands.{ + AsyncRPCContext, + JumpToOperatorRequest +} +import org.apache.texera.amber.engine.architecture.rpc.controlreturns.EmptyReturn + +/** Requests the scheduler to continue from the region containing the target operator. */ +trait JumpToOperatorHandler { + this: ControllerAsyncRPCHandlerInitializer => + + override def jumpToOperator( + msg: JumpToOperatorRequest, + ctx: AsyncRPCContext + ): Future[EmptyReturn] = { + cp.workflowExecutionCoordinator.jumpToOperator(msg.targetOperatorId) + EmptyReturn() + } +} diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala index 6f34c9ed1e5..fa8121cad5f 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala @@ -19,6 +19,8 @@ package org.apache.texera.amber.engine.architecture.scheduling +import org.apache.texera.amber.core.virtualidentity.OperatorIdentity + case class Schedule(private val levelSets: Map[Int, Set[Region]]) extends Iterator[Set[Region]] { private var currentLevel = levelSets.keys.minOption.getOrElse(0) @@ -31,4 +33,13 @@ case 
class Schedule(private val levelSets: Map[Int, Set[Region]]) extends Iterat currentLevel += 1 regions } + + def jumpToOperator(opId: OperatorIdentity): Unit = + levelSets + .collectFirst { + case (level, regions) + if regions.exists(_.getOperators.exists(_.id.logicalOpId == opId)) => + level + } + .foreach(currentLevel = _) } diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala index 05585f88d8d..2b8e3ce1450 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala @@ -21,19 +21,20 @@ package org.apache.texera.amber.engine.architecture.scheduling import com.twitter.util.Future import com.typesafe.scalalogging.LazyLogging +import org.apache.texera.amber.core.virtualidentity.OperatorIdentity import org.apache.texera.amber.core.workflow.{GlobalPortIdentity, PhysicalLink} import org.apache.texera.amber.engine.architecture.common.{ AkkaActorRefMappingService, AkkaActorService } -import org.apache.texera.amber.engine.architecture.controller.ControllerConfig +import org.apache.texera.amber.engine.architecture.controller.{ControllerConfig, WorkflowScheduler} import org.apache.texera.amber.engine.architecture.controller.execution.WorkflowExecution import org.apache.texera.amber.engine.common.rpc.AsyncRPCClient import scala.collection.mutable class WorkflowExecutionCoordinator( - getNextRegions: () => Set[Region], + workflowScheduler: WorkflowScheduler, workflowExecution: WorkflowExecution, controllerConfig: ControllerConfig, asyncRPCClient: AsyncRPCClient @@ -81,7 +82,7 @@ class WorkflowExecutionCoordinator( // All existing regions are completed. Start the next region (if any). 
Future .collect({ - val nextRegions = getNextRegions() + val nextRegions = workflowScheduler.getNextRegions executedRegions.append(nextRegions) nextRegions .map(region => { @@ -116,4 +117,8 @@ class WorkflowExecutionCoordinator( .toSet } + def jumpToOperator(opId: OperatorIdentity): Unit = { + workflowScheduler.jumpToOperator(opId) + } + } From 001a4090008fa0613d9ce6681eb5ec04abeaf9c6 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Tue, 21 Apr 2026 01:53:44 -0700 Subject: [PATCH 024/152] Add scheduler jump test --- .../scheduling/ScheduleSpec.scala | 63 +++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/ScheduleSpec.scala diff --git a/amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/ScheduleSpec.scala b/amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/ScheduleSpec.scala new file mode 100644 index 00000000000..6655874b756 --- /dev/null +++ b/amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/ScheduleSpec.scala @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.texera.amber.engine.architecture.scheduling + +import org.apache.texera.amber.core.executor.OpExecInitInfo +import org.apache.texera.amber.core.virtualidentity.{ + ExecutionIdentity, + OperatorIdentity, + PhysicalOpIdentity, + WorkflowIdentity +} +import org.apache.texera.amber.core.workflow.PhysicalOp +import org.scalatest.flatspec.AnyFlatSpec + +class ScheduleSpec extends AnyFlatSpec { + + private def region(regionId: Long, opId: String): Region = { + val physicalOp = PhysicalOp( + PhysicalOpIdentity(OperatorIdentity(opId), "main"), + WorkflowIdentity(0), + ExecutionIdentity(0), + OpExecInitInfo.Empty + ) + Region(RegionIdentity(regionId), Set(physicalOp), Set.empty) + } + + "Schedule.jumpToOperator" should "make the next scheduled region contain the target operator" in { + val firstRegion = region(1, "first") + val secondRegion = region(2, "second") + val thirdRegion = region(3, "third") + val schedule = Schedule( + Map( + 0 -> Set(firstRegion), + 1 -> Set(secondRegion), + 2 -> Set(thirdRegion) + ) + ) + + assert(schedule.next() == Set(firstRegion)) + assert(schedule.next() == Set(secondRegion)) + + schedule.jumpToOperator(OperatorIdentity("first")) + + assert(schedule.next() == Set(firstRegion)) + } +} From 58d851d2c9dbe7054c522a52b18f2f4e5af7625d Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Tue, 21 Apr 2026 17:58:48 -0700 Subject: [PATCH 025/152] update --- common/workflow-core/build.sbt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common/workflow-core/build.sbt b/common/workflow-core/build.sbt index e22d3553616..8088c536705 100644 --- a/common/workflow-core/build.sbt +++ b/common/workflow-core/build.sbt @@ -114,7 +114,7 @@ libraryDependencies ++= Seq( ///////////////////////////////////////////////////////////////////////////// // Arrow related -val arrowVersion = "14.0.1" +val arrowVersion = "15.0.2" val nettyVersion = "4.1.96.Final" val arrowDependencies = Seq( // 
https://mvnrepository.com/artifact/org.apache.arrow/flight-grpc @@ -225,4 +225,4 @@ libraryDependencies ++= Seq( "software.amazon.awssdk" % "sts" % "2.29.51" excludeAll( ExclusionRule(organization = "io.netty") ), -) \ No newline at end of file +) From a4a72ccaca81f2f85df44fb900088081ec345132 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Tue, 21 Apr 2026 18:41:06 -0700 Subject: [PATCH 026/152] update --- .../storage/result/iceberg/IcebergTableWriter.scala | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/result/iceberg/IcebergTableWriter.scala b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/result/iceberg/IcebergTableWriter.scala index 25b6df58001..2d4ffdd063d 100644 --- a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/result/iceberg/IcebergTableWriter.scala +++ b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/result/iceberg/IcebergTableWriter.scala @@ -107,13 +107,11 @@ private[storage] class IcebergTableWriter[T]( private def flushBuffer(): Unit = { if (buffer.nonEmpty) { // Create a unique file path using the writer's identifier and the filename index - var filepath: Path = null - do { - filepath = Paths.get(table.location()).resolve(s"${writerIdentifier}_$filenameIdx") - filenameIdx += 1 - } while (Files.exists(filepath)) - - val outputFile: OutputFile = table.io().newOutputFile(filepath.toString) + val location = table.location().stripSuffix("/") + val filepathString = s"$location/${writerIdentifier}_$filenameIdx" + // Increment the filename index by 1 + filenameIdx += 1 + val outputFile: OutputFile = table.io().newOutputFile(filepathString) // Create a Parquet data writer to write a new file val dataWriter: DataWriter[Record] = Parquet .writeData(outputFile) From d695dec39b2a1810c3be936d0ffd36abc031ab88 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Tue, 21 Apr 2026 22:35:55 
-0700 Subject: [PATCH 027/152] fix fmt --- .../texera/amber/engine/architecture/scheduling/Schedule.scala | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala index fa8121cad5f..4f38fbcf1c0 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala @@ -37,8 +37,7 @@ case class Schedule(private val levelSets: Map[Int, Set[Region]]) extends Iterat def jumpToOperator(opId: OperatorIdentity): Unit = levelSets .collectFirst { - case (level, regions) - if regions.exists(_.getOperators.exists(_.id.logicalOpId == opId)) => + case (level, regions) if regions.exists(_.getOperators.exists(_.id.logicalOpId == opId)) => level } .foreach(currentLevel = _) From 80afe67d99c09c95ad8bd690affff8d856fd72a9 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Wed, 22 Apr 2026 20:47:27 -0700 Subject: [PATCH 028/152] update --- .../controller/execution/WorkflowExecution.scala | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/execution/WorkflowExecution.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/execution/WorkflowExecution.scala index 2de29f31fdd..42907a41d2c 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/execution/WorkflowExecution.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/execution/WorkflowExecution.scala @@ -36,20 +36,13 @@ case class WorkflowExecution() { /** * Initializes or retrieves a `RegionExecution` for a given `Region`. 
If not already - * initialized, it creates and returns a new `RegionExecution`; otherwise, an assertion - * error is thrown if re-initialization is attempted. + * initialized, it creates and returns a new `RegionExecution`. * * @param region The `Region` for which to initialize or retrieve the `RegionExecution`. * @return The `RegionExecution` associated with the given `Region`. - * @throws AssertionError if the `RegionExecution` has already been initialized. */ def initRegionExecution(region: Region): RegionExecution = { regionExecutions.remove(region.id) - // ensure the region execution hasn't been initialized already. - assert( - !regionExecutions.contains(region.id), - s"RegionExecution of ${region.id} already initialized." - ) regionExecutions.getOrElseUpdate(region.id, RegionExecution(region)) } From 84bd376983dc933a567cf9a75576358a5bd52e1f Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Thu, 23 Apr 2026 00:48:33 -0700 Subject: [PATCH 029/152] refactor: keep only state changes on top of main --- .../architecture/packaging/output_manager.py | 38 ++++++- amber/src/main/python/core/models/operator.py | 6 +- amber/src/main/python/core/models/state.py | 97 ++++++++-------- .../python/core/runnables/network_receiver.py | 15 ++- .../python/core/runnables/network_sender.py | 23 ++-- .../python/core/runnables/test_main_loop.py | 59 ++++++++++ .../python/core/storage/document_factory.py | 107 ++++++++++-------- ...ut_port_materialization_reader_runnable.py | 29 ++++- .../python/core/storage/vfs_uri_factory.py | 1 + .../messaginglayer/OutputManager.scala | 20 ++++ .../pythonworker/PythonProxyClient.scala | 3 +- .../pythonworker/PythonProxyServer.scala | 2 +- .../RegionExecutionCoordinator.scala | 58 ++++++++-- .../architecture/worker/DataProcessor.scala | 1 + ...InputPortMaterializationReaderThread.scala | 26 ++++- .../core/executor/OperatorExecutor.scala | 8 +- .../texera/amber/core/state/State.scala | 83 +++++++++----- .../texera/amber/core/state/package.scala | 24 
++++ .../amber/core/storage/DocumentFactory.scala | 2 + .../amber/core/storage/VFSURIFactory.scala | 1 + .../amber/operator/ifStatement/IfOpExec.scala | 2 +- 21 files changed, 444 insertions(+), 161 deletions(-) create mode 100644 common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/package.scala diff --git a/amber/src/main/python/core/architecture/packaging/output_manager.py b/amber/src/main/python/core/architecture/packaging/output_manager.py index bf4afbf396f..065b063f7d4 100644 --- a/amber/src/main/python/core/architecture/packaging/output_manager.py +++ b/amber/src/main/python/core/architecture/packaging/output_manager.py @@ -17,6 +17,7 @@ import threading import typing +import uuid from collections import OrderedDict from itertools import chain from loguru import logger @@ -43,7 +44,12 @@ ) from core.models import Tuple, Schema, StateFrame from core.models.payload import DataPayload, DataFrame -from core.models.state import State +from core.models.state import ( + State, + STATE_SCHEMA, + serialize_state, + state_uri_from_result_uri, +) from core.storage.document_factory import DocumentFactory from core.storage.runnables.port_storage_writer import ( PortStorageWriter, @@ -87,6 +93,8 @@ def __init__(self, worker_id: str): PortIdentity, typing.Tuple[Queue, PortStorageWriter, Thread] ] = dict() + self._storage_uris: typing.Dict[PortIdentity, str] = dict() + def is_missing_output_ports(self): """ This method is only used for ensuring correct region execution. @@ -126,6 +134,7 @@ def set_up_port_storage_writer(self, port_id: PortIdentity, storage_uri: str): Create a separate thread for saving output tuples of a port to storage in batch. 
""" + self._storage_uris[port_id] = storage_uri document, _ = DocumentFactory.open_document(storage_uri) buffered_item_writer = document.writer(str(get_worker_index(self.worker_id))) writer_queue = Queue() @@ -171,6 +180,31 @@ def save_tuple_to_storage_if_needed(self, tuple_: Tuple, port_id=None) -> None: PortStorageWriterElement(data_tuple=tuple_) ) + def save_state_to_storage_if_needed(self, state: State, port_id=None) -> None: + if port_id is None: + uris = self._storage_uris.values() + elif port_id in self._storage_uris: + uris = [self._storage_uris[port_id]] + else: + return + + for uri in uris: + state_uri = state_uri_from_result_uri(uri) + try: + document = DocumentFactory.open_document(state_uri)[0] + except ValueError: + document = DocumentFactory.create_document(state_uri, STATE_SCHEMA) + writer = document.writer(str(uuid.uuid4())) + writer.put_one(serialize_state(state)) + writer.close() + + def reset_output_storage(self) -> None: + port_id = self.get_port_ids()[0] + storage_uri = self._storage_uris[port_id] + self.close_port_storage_writers() + DocumentFactory.create_document(storage_uri, self._ports[port_id].get_schema()) + self.set_up_port_storage_writer(port_id, storage_uri) + def close_port_storage_writers(self) -> None: """ Flush the buffers of port storage writers and wait for all the @@ -248,7 +282,7 @@ def emit_state( receiver, ( StateFrame(payload) - if isinstance(payload, State) + if isinstance(payload, dict) else self.tuple_to_frame(payload) ), ) diff --git a/amber/src/main/python/core/models/operator.py b/amber/src/main/python/core/models/operator.py index 79050839958..91c5b2cf27b 100644 --- a/amber/src/main/python/core/models/operator.py +++ b/amber/src/main/python/core/models/operator.py @@ -108,14 +108,12 @@ def close(self) -> None: def process_state(self, state: State, port: int) -> Optional[State]: """ Process an input State from the given link. 
- The default implementation is to pass the State to all downstream operators - if the State has pass_to_all_downstream set to True. + The default implementation is to pass the State to downstream operators. :param state: State, a State from an input port to be processed. :param port: int, input port index of the current exhausted port. :return: State, producing one State object """ - if state.passToAllDownstream: - return state + return state def produce_state_on_start(self, port: int) -> State: """ diff --git a/amber/src/main/python/core/models/state.py b/amber/src/main/python/core/models/state.py index 2c8a268dfb7..e5726cc3c2f 100644 --- a/amber/src/main/python/core/models/state.py +++ b/amber/src/main/python/core/models/state.py @@ -15,61 +15,64 @@ # specific language governing permissions and limitations # under the License. -from dataclasses import dataclass -from pandas import DataFrame -from pyarrow import Table -from typing import Optional +import base64 +import json +from typing import Any, Dict, TypeAlias -from .schema import Schema, AttributeType -from .schema.attribute_type import FROM_PYOBJECT_MAPPING +from .schema import Schema +from .tuple import Tuple +State: TypeAlias = Dict[str, Any] -@dataclass -class State: - def __init__( - self, table: Optional[Table] = None, pass_to_all_downstream: bool = False - ): - self.schema = Schema() - self.passToAllDownstream = pass_to_all_downstream - if table is not None: - self.__dict__.update(table.to_pandas().iloc[0].to_dict()) - self.schema = Schema(table.schema) +STATE_CONTENT = "content" +_TYPE_MARKER = "__texera_type__" +_PAYLOAD_MARKER = "payload" +_BYTES_TYPE = "bytes" - def add( - self, key: str, value: any, value_type: Optional[AttributeType] = None - ) -> None: - self.__dict__[key] = value - if value_type is not None: - self.schema.add(key, value_type) - elif key != "schema": - self.schema.add(key, FROM_PYOBJECT_MAPPING[type(value)]) +STATE_SCHEMA = Schema(raw_schema={STATE_CONTENT: "STRING"}) - def 
get(self, key: str) -> any: - return self.__dict__[key] - def to_table(self) -> Table: - return Table.from_pandas( - df=DataFrame([self.__dict__]), - schema=self.schema.as_arrow_schema(), - ) +def state_uri_from_result_uri(result_uri: str) -> str: + return result_uri.replace("/result", "/state") - def __setattr__(self, key: str, value: any) -> None: - self.add(key, value) - def __setitem__(self, key: str, value: any) -> None: - self.add(key, value) +def serialize_state(state: State) -> Tuple: + return Tuple( + { + STATE_CONTENT: json.dumps( + _to_json_value(state), separators=(",", ":") + ) + }, + schema=STATE_SCHEMA, + ) - def __getitem__(self, key: str) -> any: - return self.get(key) - def __str__(self) -> str: - content = ", ".join( - [ - repr(key) + ": " + repr(value) - for key, value in self.__dict__.items() - if key != "schema" - ] - ) - return f"State[{content}]" +def deserialize_state(row: Tuple) -> State: + return _from_json_value(json.loads(row[STATE_CONTENT])) - __repr__ = __str__ + +def _to_json_value(value: Any) -> Any: + if value is None or isinstance(value, (bool, int, float, str)): + return value + if isinstance(value, bytes): + return { + _TYPE_MARKER: _BYTES_TYPE, + _PAYLOAD_MARKER: base64.b64encode(value).decode("ascii"), + } + if isinstance(value, dict): + return {str(key): _to_json_value(inner) for key, inner in value.items()} + if isinstance(value, (list, tuple)): + return [_to_json_value(inner) for inner in value] + raise TypeError( + f"State value of type {type(value).__name__} is not JSON serializable" + ) + + +def _from_json_value(value: Any) -> Any: + if isinstance(value, list): + return [_from_json_value(inner) for inner in value] + if isinstance(value, dict): + if value.get(_TYPE_MARKER) == _BYTES_TYPE: + return base64.b64decode(value[_PAYLOAD_MARKER]) + return {key: _from_json_value(inner) for key, inner in value.items()} + return value diff --git a/amber/src/main/python/core/runnables/network_receiver.py 
b/amber/src/main/python/core/runnables/network_receiver.py index fd42a8f589b..e1815b08f7d 100644 --- a/amber/src/main/python/core/runnables/network_receiver.py +++ b/amber/src/main/python/core/runnables/network_receiver.py @@ -32,6 +32,7 @@ ) from core.models import ( DataFrame, + Tuple, StateFrame, ) from core.models.internal_queue import ( @@ -40,8 +41,8 @@ InternalQueue, ECMElement, ) -from core.models.state import State from core.proxy import ProxyServer +from core.models.state import STATE_SCHEMA, deserialize_state from core.util import Stoppable, get_one_of from core.util.runnable.runnable import Runnable from proto.org.apache.texera.amber.engine.architecture.rpc import EmbeddedControlMessage @@ -96,7 +97,17 @@ def data_handler(command: bytes, table: Table) -> int: "Data", lambda _: DataFrame(table), "State", - lambda _: StateFrame(State(table)), + lambda _: StateFrame( + deserialize_state( + Tuple( + { + name: table[name][0].as_py() + for name in STATE_SCHEMA.get_attr_names() + }, + schema=STATE_SCHEMA, + ) + ) + ), "ECM", lambda _: EmbeddedControlMessage().parse(table["payload"][0].as_py()), ) diff --git a/amber/src/main/python/core/runnables/network_sender.py b/amber/src/main/python/core/runnables/network_sender.py index 9595433fb70..f1bd8659ee9 100644 --- a/amber/src/main/python/core/runnables/network_sender.py +++ b/amber/src/main/python/core/runnables/network_sender.py @@ -20,13 +20,18 @@ from overrides import overrides from typing import Optional -from core.models import DataPayload, InternalQueue, DataFrame, StateFrame, State +from core.models import DataPayload, InternalQueue, DataFrame, StateFrame from core.models.internal_queue import ( InternalQueueElement, DataElement, DCMElement, ECMElement, ) +from core.models.state import ( + STATE_CONTENT, + STATE_SCHEMA, + serialize_state, +) from core.proxy import ProxyClient from core.util import StoppableQueueBlockingRunnable from proto.org.apache.texera.amber.core import ChannelIdentity @@ -98,13 +103,15 
@@ def _send_data(self, to: ChannelIdentity, data_payload: DataPayload) -> None: data_header = PythonDataHeader(tag=to, payload_type="Data") self._proxy_client.send_data(bytes(data_header), data_payload.frame) elif isinstance(data_payload, StateFrame): - data_header = PythonDataHeader( - tag=to, payload_type=data_payload.frame.__class__.__name__ - ) - table = ( - data_payload.frame.to_table() - if isinstance(data_payload.frame, State) - else None + data_header = PythonDataHeader(tag=to, payload_type="State") + serialized_state = serialize_state(data_payload.frame) + table = pa.Table.from_pydict( + { + STATE_CONTENT: [ + serialized_state[STATE_CONTENT] + ], + }, + schema=STATE_SCHEMA.as_arrow_schema(), ) self._proxy_client.send_data(bytes(data_header), table) else: diff --git a/amber/src/main/python/core/runnables/test_main_loop.py b/amber/src/main/python/core/runnables/test_main_loop.py index 5ad0afec9bc..3ff03672082 100644 --- a/amber/src/main/python/core/runnables/test_main_loop.py +++ b/amber/src/main/python/core/runnables/test_main_loop.py @@ -25,6 +25,8 @@ from core.models import ( DataFrame, InternalQueue, + State, + StateFrame, Tuple, ) from core.models.internal_queue import ( @@ -1077,6 +1079,63 @@ def send_resume( ), ) + @pytest.mark.timeout(2) + def test_process_state_can_emit_multiple_states( + self, + main_loop, + output_queue, + mock_data_output_channel, + monkeypatch, + ): + class DummyExecutor: + @staticmethod + def process_state(state: State, port: int) -> State: + return {"value": state["value"] + 1, "port": port} + + main_loop.context.executor_manager.executor = DummyExecutor() + monkeypatch.setattr(main_loop, "_check_and_process_control", lambda: None) + monkeypatch.setattr( + main_loop.context.output_manager, + "emit_state", + lambda state: [(mock_data_output_channel.to_worker_id, StateFrame(state))], + ) + + switch_count = {"value": 0} + + def fake_switch_context(): + switch_count["value"] += 1 + # xinyuan-state-only still uses the original 
two-switch state handshake: + # the DataProcessor produces output during the first switch of each + # process_input_state() call, before MainLoop reads current_output_state. + if switch_count["value"] % 2 == 1: + current_input_state = ( + main_loop.context.state_processing_manager.current_input_state + ) + main_loop.context.state_processing_manager.current_output_state = ( + DummyExecutor.process_state(current_input_state, 0) + ) + + monkeypatch.setattr(main_loop, "_switch_context", fake_switch_context) + + first_state = {"value": 1} + second_state = {"value": 41} + + main_loop._process_state(first_state) + main_loop._process_state(second_state) + + first_output: DataElement = output_queue.get() + second_output: DataElement = output_queue.get() + + assert first_output.tag == mock_data_output_channel + assert isinstance(first_output.payload, StateFrame) + assert first_output.payload.frame["value"] == 2 + assert first_output.payload.frame["port"] == 0 + + assert second_output.tag == mock_data_output_channel + assert isinstance(second_output.payload, StateFrame) + assert second_output.payload.frame["value"] == 42 + assert second_output.payload.frame["port"] == 0 + @pytest.mark.timeout(5) def test_main_loop_thread_can_align_ecm( self, diff --git a/amber/src/main/python/core/storage/document_factory.py b/amber/src/main/python/core/storage/document_factory.py index 9b686ab66b6..8a4d6fe3c5f 100644 --- a/amber/src/main/python/core/storage/document_factory.py +++ b/amber/src/main/python/core/storage/document_factory.py @@ -61,30 +61,35 @@ def create_document(uri: str, schema: Schema) -> VirtualDocument: if parsed_uri.scheme == VFSURIFactory.VFS_FILE_URI_SCHEME: _, _, _, resource_type = VFSURIFactory.decode_uri(uri) - if resource_type in {VFSResourceType.RESULT}: - storage_key = DocumentFactory.sanitize_uri_path(parsed_uri) - - # Convert Amber Schema to Iceberg Schema with LARGE_BINARY - # field name encoding - iceberg_schema = amber_schema_to_iceberg_schema(schema) - - 
create_table( - IcebergCatalogInstance.get_instance(), - StorageConfig.ICEBERG_TABLE_RESULT_NAMESPACE, - storage_key, - iceberg_schema, - override_if_exists=True, - ) - - return IcebergDocument[Tuple]( - StorageConfig.ICEBERG_TABLE_RESULT_NAMESPACE, - storage_key, - iceberg_schema, - amber_tuples_to_arrow_table, - arrow_table_to_amber_tuples, - ) - else: - raise ValueError(f"Resource type {resource_type} is not supported") + match resource_type: + case VFSResourceType.RESULT: + namespace = StorageConfig.ICEBERG_TABLE_RESULT_NAMESPACE + case VFSResourceType.STATE: + namespace = "state" + case _: + raise ValueError(f"Resource type {resource_type} is not supported") + + storage_key = DocumentFactory.sanitize_uri_path(parsed_uri) + # Convert Amber Schema to Iceberg Schema with LARGE_BINARY + # field name encoding + iceberg_schema = amber_schema_to_iceberg_schema(schema) + + create_table( + IcebergCatalogInstance.get_instance(), + namespace, + storage_key, + iceberg_schema, + override_if_exists=True, + ) + + return IcebergDocument[Tuple]( + namespace, + storage_key, + iceberg_schema, + amber_tuples_to_arrow_table, + arrow_table_to_amber_tuples, + ) + else: raise NotImplementedError( f"Unsupported URI scheme: {parsed_uri.scheme} for creating the document" @@ -96,30 +101,36 @@ def open_document(uri: str) -> typing.Tuple[VirtualDocument, Optional[Schema]]: if parsed_uri.scheme == "vfs": _, _, _, resource_type = VFSURIFactory.decode_uri(uri) - if resource_type in {VFSResourceType.RESULT}: - storage_key = DocumentFactory.sanitize_uri_path(parsed_uri) - - table = load_table_metadata( - IcebergCatalogInstance.get_instance(), - StorageConfig.ICEBERG_TABLE_RESULT_NAMESPACE, - storage_key, - ) - - if table is None: - raise ValueError("No storage is found for the given URI") - - amber_schema = Schema(table.schema().as_arrow()) - - document = IcebergDocument( - StorageConfig.ICEBERG_TABLE_RESULT_NAMESPACE, - storage_key, - table.schema(), - amber_tuples_to_arrow_table, - 
arrow_table_to_amber_tuples, - ) - return document, amber_schema - else: - raise ValueError(f"Resource type {resource_type} is not supported") + match resource_type: + case VFSResourceType.RESULT: + namespace = StorageConfig.ICEBERG_TABLE_RESULT_NAMESPACE + case VFSResourceType.STATE: + namespace = "state" + case _: + raise ValueError(f"Resource type {resource_type} is not supported") + + storage_key = DocumentFactory.sanitize_uri_path(parsed_uri) + + table = load_table_metadata( + IcebergCatalogInstance.get_instance(), + namespace, + storage_key, + ) + + if table is None: + raise ValueError("No storage is found for the given URI") + + amber_schema = Schema(table.schema().as_arrow()) + + document = IcebergDocument( + namespace, + storage_key, + table.schema(), + amber_tuples_to_arrow_table, + arrow_table_to_amber_tuples, + ) + return document, amber_schema + else: raise NotImplementedError( f"Unsupported URI scheme: {parsed_uri.scheme} for opening the document" diff --git a/amber/src/main/python/core/storage/runnables/input_port_materialization_reader_runnable.py b/amber/src/main/python/core/storage/runnables/input_port_materialization_reader_runnable.py index e49c0316cc7..a600f878572 100644 --- a/amber/src/main/python/core/storage/runnables/input_port_materialization_reader_runnable.py +++ b/amber/src/main/python/core/storage/runnables/input_port_materialization_reader_runnable.py @@ -17,8 +17,8 @@ import typing from loguru import logger -from pyarrow import Table from typing import Union +from pyarrow import Table from core.architecture.sendsemantics.broad_cast_partitioner import ( BroadcastPartitioner, @@ -34,8 +34,9 @@ from core.architecture.sendsemantics.round_robin_partitioner import ( RoundRobinPartitioner, ) -from core.models import Tuple, InternalQueue, DataFrame, DataPayload +from core.models import Tuple, InternalQueue, DataFrame, DataPayload, State, StateFrame from core.models.internal_queue import DataElement, ECMElement +from core.models.state import 
deserialize_state, state_uri_from_result_uri from core.storage.document_factory import DocumentFactory from core.util import Stoppable, get_one_of from core.util.runnable.runnable import Runnable @@ -125,6 +126,15 @@ def tuple_to_batch_with_filter(self, tuple_: Tuple) -> typing.Iterator[DataFrame if receiver == self.worker_actor_id: yield self.tuples_to_data_frame(tuples) + def emit_state_with_filter(self, state: State) -> typing.Iterator[StateFrame]: + for receiver, payload in self.partitioner.flush_state(state): + if receiver == self.worker_actor_id: + yield ( + StateFrame(payload) + if isinstance(payload, dict) + else self.tuples_to_data_frame(payload) + ) + def run(self) -> None: """ Main execution logic that reads tuples from the materialized storage and @@ -138,8 +148,21 @@ def run(self) -> None: self.uri ) self.emit_ecm("StartChannel", EmbeddedControlMessageType.NO_ALIGNMENT) - storage_iterator = self.materialization.get() + try: + state_document, _ = DocumentFactory.open_document( + state_uri_from_result_uri(self.uri) + ) + state_iterator = state_document.get() + for state in state_iterator: + for state_frame in self.emit_state_with_filter( + deserialize_state(state) + ): + self.emit_payload(state_frame) + except ValueError: + pass + + storage_iterator = self.materialization.get() # Iterate and process tuples. 
for tup in storage_iterator: if self._stopped: diff --git a/amber/src/main/python/core/storage/vfs_uri_factory.py b/amber/src/main/python/core/storage/vfs_uri_factory.py index de0c5db56ec..0e23e607055 100644 --- a/amber/src/main/python/core/storage/vfs_uri_factory.py +++ b/amber/src/main/python/core/storage/vfs_uri_factory.py @@ -34,6 +34,7 @@ class VFSResourceType(str, Enum): RESULT = "result" RUNTIME_STATISTICS = "runtimeStatistics" CONSOLE_MESSAGES = "consoleMessages" + STATE = "state" class VFSURIFactory: diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/messaginglayer/OutputManager.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/messaginglayer/OutputManager.scala index 4ab3d18056f..53755b780cc 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/messaginglayer/OutputManager.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/messaginglayer/OutputManager.scala @@ -124,6 +124,8 @@ class OutputManager( : mutable.HashMap[PortIdentity, OutputPortResultWriterThread] = mutable.HashMap() + private val storageUris: mutable.HashMap[Int, URI] = mutable.HashMap() + /** * Add down stream operator and its corresponding Partitioner. * @@ -232,6 +234,23 @@ class OutputManager( }) } + def saveStateToStorageIfNeeded(state: State): Unit = { + try { + storageUris.foreach { + case (_, uri) => + val writer = DocumentFactory + .openDocument(State.stateUriFromResultUri(uri)) + ._1 + .writer(VirtualIdentityUtils.getWorkerIndex(actorId).toString) + .asInstanceOf[BufferedItemWriter[Tuple]] + writer.putOne(State.serialize(state)) + writer.close() + } + } catch { + case _: Exception => () + } + } + /** * Singal the port storage writer to flush the remaining buffer and wait for commits to finish so that * the output port is properly completed. If the output port does not need storage, no action will be done. 
@@ -280,6 +299,7 @@ class OutputManager( } private def setupOutputStorageWriterThread(portId: PortIdentity, storageUri: URI): Unit = { + this.storageUris(portId.id) = storageUri val bufferedItemWriter = DocumentFactory .openDocument(storageUri) ._1 diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonProxyClient.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonProxyClient.scala index 6618e857b1d..cfdb6a82f86 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonProxyClient.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonProxyClient.scala @@ -21,6 +21,7 @@ package org.apache.texera.amber.engine.architecture.pythonworker import com.twitter.util.{Await, Promise} import org.apache.texera.amber.core.WorkflowRuntimeException +import org.apache.texera.amber.core.state.State import org.apache.texera.amber.core.tuple.{Schema, Tuple} import org.apache.texera.amber.core.virtualidentity.{ActorVirtualIdentity, ChannelIdentity} import org.apache.texera.amber.engine.architecture.pythonworker.WorkerBatchInternalQueue.{ @@ -125,7 +126,7 @@ class PythonProxyClient(portNumberPromise: Promise[Int], val actorId: ActorVirtu case DataFrame(frame) => writeArrowStream(mutable.Queue(ArraySeq.unsafeWrapArray(frame): _*), from, "Data") case StateFrame(state) => - writeArrowStream(mutable.Queue(state.toTuple), from, "State") + writeArrowStream(mutable.Queue(State.serialize(state)), from, "State") } } diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonProxyServer.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonProxyServer.scala index c904e436bcd..463dc4b75a5 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonProxyServer.scala +++ 
b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonProxyServer.scala @@ -128,7 +128,7 @@ private class AmberProducer( dataHeader.payloadType match { case "State" => assert(root.getRowCount == 1) - outputPort.sendTo(to, StateFrame(State(Some(ArrowUtils.getTexeraTuple(0, root))))) + outputPort.sendTo(to, StateFrame(State.deserialize(ArrowUtils.getTexeraTuple(0, root)))) case "ECM" => assert(root.getRowCount == 1) outputPort.sendTo( diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala index e490cde3d9b..5be5d942e5c 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala @@ -20,7 +20,8 @@ package org.apache.texera.amber.engine.architecture.scheduling import org.apache.pekko.pattern.gracefulStop -import com.twitter.util.{Future, Return, Throw} +import com.twitter.util.{Duration => TwitterDuration, Future, JavaTimer, Return, Throw, Timer} +import org.apache.texera.amber.core.state.State import org.apache.texera.amber.core.storage.DocumentFactory import org.apache.texera.amber.core.storage.VFSURIFactory.decodeURI import org.apache.texera.amber.core.virtualidentity.ActorVirtualIdentity @@ -61,7 +62,7 @@ import org.apache.texera.web.resource.dashboard.user.workflow.WorkflowExecutions import java.util.concurrent.TimeUnit import java.util.concurrent.atomic.AtomicReference -import scala.concurrent.duration.Duration +import scala.concurrent.duration.{Duration => ScalaDuration} /** * The executor of a region. 
@@ -109,10 +110,14 @@ class RegionExecutionCoordinator( private val currentPhaseRef: AtomicReference[RegionExecutionPhase] = new AtomicReference( Unexecuted ) + private val terminationFutureRef: AtomicReference[Future[Unit]] = new AtomicReference(null) + private val killRetryTimer: Timer = new JavaTimer(true) + private val killRetryDelay: TwitterDuration = TwitterDuration.fromMilliseconds(200) /** * Sync the status of `RegionExecution` and transition this coordinator's phase to `Completed` only when the - * coordinator is currently in `ExecutingNonDependeePortsPhase` and all the ports of this region are completed. + * coordinator is currently in `ExecutingNonDependeePortsPhase`, all the ports of this region are completed, and + * all workers in this region are terminated. * * Additionally, this method will also terminate all the workers of this region: * @@ -135,12 +140,22 @@ class RegionExecutionCoordinator( return Future.Unit } - // Set this coordinator's status to be completed so that subsequent regions can be started by - // WorkflowExecutionCoordinator. - setPhase(Completed) - - // Terminate all the workers in this region. - terminateWorkers(regionExecution) + val existingTerminationFuture = terminationFutureRef.get + if (existingTerminationFuture != null) { + existingTerminationFuture + } else { + val terminationFuture = terminateWorkersWithRetry(regionExecution).flatMap { _ => + // Set this coordinator's status to be completed so that subsequent regions can be started by + // WorkflowExecutionCoordinator. + setPhase(Completed) + Future.Unit + } + if (terminationFutureRef.compareAndSet(null, terminationFuture)) { + terminationFuture + } else { + terminationFutureRef.get + } + } } private def terminateWorkers(regionExecution: RegionExecution) = { @@ -167,7 +182,7 @@ class RegionExecutionCoordinator( val actorRef = actorRefService.getActorRef(workerId) // Remove the actorRef so that no other actors can find the worker and send messages. 
actorRefService.removeActorRef(workerId) - gracefulStop(actorRef, Duration(5, TimeUnit.SECONDS)).asTwitter() + gracefulStop(actorRef, ScalaDuration(5, TimeUnit.SECONDS)).asTwitter() } }.toSeq @@ -191,8 +206,29 @@ class RegionExecutionCoordinator( } } + private def terminateWorkersWithRetry( + regionExecution: RegionExecution, + attempt: Int = 1 + ): Future[Unit] = { + terminateWorkers(regionExecution).rescue { case err => + logger.warn( + s"Failed to terminate region ${region.id.id} on attempt $attempt. Retrying in ${killRetryDelay.inMilliseconds} ms.", + err + ) + Future + .sleep(killRetryDelay)(killRetryTimer) + .flatMap(_ => terminateWorkersWithRetry(regionExecution, attempt + 1)) + } + } + def isCompleted: Boolean = currentPhaseRef.get == Completed + /** + * Returns the region termination future if termination has been initiated. + * This is only set by `tryCompleteRegionExecution()`. + */ + def getTerminationFutureOpt: Option[Future[Unit]] = Option(terminationFutureRef.get) + /** * This will sync and transition the region execution phase from one to another depending on its current phase: * @@ -528,12 +564,14 @@ class RegionExecutionCoordinator( portConfigs.foreach { case (outputPortId, portConfig) => val storageUriToAdd = portConfig.storageURI + val stateUriToAdd = State.stateUriFromResultUri(storageUriToAdd) val (_, eid, _, _) = decodeURI(storageUriToAdd) val schemaOptional = region.getOperator(outputPortId.opId).outputPorts(outputPortId.portId)._3 val schema = schemaOptional.getOrElse(throw new IllegalStateException("Schema is missing")) DocumentFactory.createDocument(storageUriToAdd, schema) + DocumentFactory.createDocument(stateUriToAdd, State.schema) WorkflowExecutionsResource.insertOperatorPortResultUri( eid = eid, globalPortId = outputPortId, diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/worker/DataProcessor.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/worker/DataProcessor.scala index 
3aa5fa90a46..65c560ee594 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/worker/DataProcessor.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/worker/DataProcessor.scala @@ -126,6 +126,7 @@ class DataProcessor( val outputState = executor.processState(state, port) if (outputState.isDefined) { outputManager.emitState(outputState.get) + outputManager.saveStateToStorageIfNeeded(state) } } catch safely { case e => diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/worker/managers/InputPortMaterializationReaderThread.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/worker/managers/InputPortMaterializationReaderThread.scala index 10fbbc44a2c..acada743bc6 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/worker/managers/InputPortMaterializationReaderThread.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/worker/managers/InputPortMaterializationReaderThread.scala @@ -21,6 +21,7 @@ package org.apache.texera.amber.engine.architecture.worker.managers import io.grpc.MethodDescriptor import org.apache.texera.amber.config.ApplicationConfig +import org.apache.texera.amber.core.state.State import org.apache.texera.amber.core.storage.DocumentFactory import org.apache.texera.amber.core.storage.model.VirtualDocument import org.apache.texera.amber.core.tuple.Tuple @@ -45,7 +46,11 @@ import org.apache.texera.amber.engine.architecture.worker.WorkflowWorker.{ DPInputQueueElement, FIFOMessageElement } -import org.apache.texera.amber.engine.common.ambermessage.{DataFrame, WorkflowFIFOMessage} +import org.apache.texera.amber.engine.common.ambermessage.{ + DataFrame, + StateFrame, + WorkflowFIFOMessage +} import org.apache.texera.amber.util.VirtualIdentityUtils.getFromActorIdForInputPortStorage import java.net.URI @@ -106,6 +111,25 @@ class InputPortMaterializationReaderThread( } // Flush any remaining tuples in the buffer. 
if (buffer.nonEmpty) flush() + + try { + val state_document = + DocumentFactory + .openDocument(State.stateUriFromResultUri(uri)) + ._1 + .asInstanceOf[VirtualDocument[Tuple]] + val stateReadIterator = state_document.get() + + while (stateReadIterator.hasNext) { + val state = State.deserialize(stateReadIterator.next()) + inputMessageQueue.put( + FIFOMessageElement(WorkflowFIFOMessage(channelId, getSequenceNumber, StateFrame(state))) + ) + } + } catch { + case _: Exception => + } + emitECM(METHOD_END_CHANNEL, PORT_ALIGNMENT) isFinished.set(true) } catch { diff --git a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/executor/OperatorExecutor.scala b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/executor/OperatorExecutor.scala index f99739acc04..9837213abbb 100644 --- a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/executor/OperatorExecutor.scala +++ b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/executor/OperatorExecutor.scala @@ -29,13 +29,7 @@ trait OperatorExecutor { def produceStateOnStart(port: Int): Option[State] = None - def processState(state: State, port: Int): Option[State] = { - if (state.isPassToAllDownstream) { - Some(state) - } else { - None - } - } + def processState(state: State, port: Int): Option[State] = Some(state) def processTupleMultiPort( tuple: Tuple, diff --git a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/State.scala b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/State.scala index 3226c9d2fe7..f76a314b7ae 100644 --- a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/State.scala +++ b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/State.scala @@ -19,39 +19,70 @@ package org.apache.texera.amber.core.state +import com.fasterxml.jackson.databind.JsonNode import org.apache.texera.amber.core.tuple.{Attribute, AttributeType, Schema, Tuple} +import 
org.apache.texera.amber.util.JSONUtils.objectMapper -import scala.collection.mutable +import java.net.URI +import java.util.Base64 +import scala.jdk.CollectionConverters.IteratorHasAsScala -final case class State(tuple: Option[Tuple] = None, passToAllDownstream: Boolean = false) { - val data: mutable.Map[String, (AttributeType, Any)] = mutable.LinkedHashMap() - add("passToAllDownstream", passToAllDownstream, AttributeType.BOOLEAN) - if (tuple.isDefined) { - tuple.get.getSchema.getAttributes.foreach { attribute => - add(attribute.getName, tuple.get.getField(attribute.getName), attribute.getType) - } - } +object State { + private val StateContent = "content" + private val BytesTypeMarker = "__texera_type__" + private val BytesValue = "bytes" + private val PayloadMarker = "payload" - def add(key: String, value: Any, valueType: AttributeType): Unit = - data.put(key, (valueType, value)) + val schema: Schema = new Schema( + new Attribute(StateContent, AttributeType.STRING) + ) - def get(key: String): Any = data(key)._2 + def stateUriFromResultUri(resultUri: URI): URI = + new URI(resultUri.toString.replace("/result", "/state")) - def isPassToAllDownstream: Boolean = get("passToAllDownstream").asInstanceOf[Boolean] + def serialize(state: State): Tuple = { + val payloadJson = objectMapper.writeValueAsString(toJsonValue(state)) + Tuple.builder(schema).addSequentially(Array(payloadJson)).build() + } - def apply(key: String): Any = get(key) + def deserialize(tuple: Tuple): State = { + val payload = tuple.getField[String](StateContent) + objectMapper.readTree(payload).fields().asScala.map(entry => entry.getKey -> fromJsonValue(entry.getValue)).toMap + } - def toTuple: Tuple = - Tuple - .builder( - Schema(data.map { - case (name, (attrType, _)) => - new Attribute(name, attrType) - }.toList) - ) - .addSequentially(data.values.map(_._2).toArray) - .build() + private def toJsonValue(value: Any): Any = + value match { + case null => null + case bytes: Array[Byte] => + 
Map(BytesTypeMarker -> BytesValue, PayloadMarker -> Base64.getEncoder.encodeToString(bytes)) + case map: State => + map.iterator.map { case (k, v) => k -> toJsonValue(v) }.toMap + case iterable: Iterable[_] => + iterable.map(toJsonValue).toList + case other => other + } - override def toString: String = - data.map { case (key, (_, value)) => s"$key: $value" }.mkString(", ") + private def fromJsonValue(node: JsonNode): Any = { + if (node == null || node.isNull) { + null + } else if (node.isObject) { + val fields = node.fields().asScala.map(entry => entry.getKey -> entry.getValue).toMap + fields.get(BytesTypeMarker) match { + case Some(typeNode) if typeNode.isTextual && typeNode.asText() == BytesValue => + Base64.getDecoder.decode(fields(PayloadMarker).asText()) + case _ => + fields.view.mapValues(fromJsonValue).toMap + } + } else if (node.isArray) { + node.elements().asScala.map(fromJsonValue).toList + } else if (node.isBoolean) { + node.asBoolean() + } else if (node.isIntegralNumber) { + node.longValue() + } else if (node.isFloatingPointNumber) { + node.doubleValue() + } else { + node.asText() + } + } } diff --git a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/package.scala b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/package.scala new file mode 100644 index 00000000000..c110f9d814f --- /dev/null +++ b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/package.scala @@ -0,0 +1,24 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.texera.amber.core + +package object state { + type State = Map[String, Any] +} diff --git a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/DocumentFactory.scala b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/DocumentFactory.scala index 15949ef4717..ae37def667e 100644 --- a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/DocumentFactory.scala +++ b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/DocumentFactory.scala @@ -72,6 +72,7 @@ object DocumentFactory { case RESULT => StorageConfig.icebergTableResultNamespace case CONSOLE_MESSAGES => StorageConfig.icebergTableConsoleMessagesNamespace case RUNTIME_STATISTICS => StorageConfig.icebergTableRuntimeStatisticsNamespace + case STATE => "state" case _ => throw new IllegalArgumentException(s"Resource type $resourceType is not supported") } @@ -119,6 +120,7 @@ object DocumentFactory { case RESULT => StorageConfig.icebergTableResultNamespace case CONSOLE_MESSAGES => StorageConfig.icebergTableConsoleMessagesNamespace case RUNTIME_STATISTICS => StorageConfig.icebergTableRuntimeStatisticsNamespace + case STATE => "state" case _ => throw new IllegalArgumentException(s"Resource type $resourceType is not supported") } diff --git a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/VFSURIFactory.scala b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/VFSURIFactory.scala index 3513ac5ecd8..990776a69f0 100644 --- 
a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/VFSURIFactory.scala +++ b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/VFSURIFactory.scala @@ -34,6 +34,7 @@ object VFSResourceType extends Enumeration { val RESULT: Value = Value("result") val RUNTIME_STATISTICS: Value = Value("runtimeStatistics") val CONSOLE_MESSAGES: Value = Value("consoleMessages") + val STATE: Value = Value("state") } object VFSURIFactory { diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/ifStatement/IfOpExec.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/ifStatement/IfOpExec.scala index 462bdd0969a..d2becc79a5b 100644 --- a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/ifStatement/IfOpExec.scala +++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/ifStatement/IfOpExec.scala @@ -34,7 +34,7 @@ class IfOpExec(descString: String) extends OperatorExecutor { //It can accept any value that can be converted to a boolean. For example, Int 1 will be converted to true. 
override def processState(state: State, port: Int): Option[State] = { outputPort = - if (state.get(desc.conditionName).asInstanceOf[Boolean]) PortIdentity(1) else PortIdentity() + if (state(desc.conditionName).asInstanceOf[Boolean]) PortIdentity(1) else PortIdentity() Some(state) } From b570aaef732519554eaf87a0805cf795edfa460f Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Thu, 23 Apr 2026 00:56:30 -0700 Subject: [PATCH 030/152] refactor: keep state changes without materialization --- .../architecture/packaging/output_manager.py | 36 +----- amber/src/main/python/core/models/state.py | 4 - .../python/core/storage/document_factory.py | 107 ++++++++---------- ...ut_port_materialization_reader_runnable.py | 29 +---- .../python/core/storage/vfs_uri_factory.py | 1 - .../messaginglayer/OutputManager.scala | 20 ---- .../RegionExecutionCoordinator.scala | 58 ++-------- .../architecture/worker/DataProcessor.scala | 1 - ...InputPortMaterializationReaderThread.scala | 26 +---- .../texera/amber/core/state/State.scala | 4 - .../amber/core/storage/DocumentFactory.scala | 2 - .../amber/core/storage/VFSURIFactory.scala | 1 - 12 files changed, 63 insertions(+), 226 deletions(-) diff --git a/amber/src/main/python/core/architecture/packaging/output_manager.py b/amber/src/main/python/core/architecture/packaging/output_manager.py index 065b063f7d4..afa9127fe6e 100644 --- a/amber/src/main/python/core/architecture/packaging/output_manager.py +++ b/amber/src/main/python/core/architecture/packaging/output_manager.py @@ -17,7 +17,6 @@ import threading import typing -import uuid from collections import OrderedDict from itertools import chain from loguru import logger @@ -44,12 +43,7 @@ ) from core.models import Tuple, Schema, StateFrame from core.models.payload import DataPayload, DataFrame -from core.models.state import ( - State, - STATE_SCHEMA, - serialize_state, - state_uri_from_result_uri, -) +from core.models.state import State from core.storage.document_factory import DocumentFactory 
from core.storage.runnables.port_storage_writer import ( PortStorageWriter, @@ -93,8 +87,6 @@ def __init__(self, worker_id: str): PortIdentity, typing.Tuple[Queue, PortStorageWriter, Thread] ] = dict() - self._storage_uris: typing.Dict[PortIdentity, str] = dict() - def is_missing_output_ports(self): """ This method is only used for ensuring correct region execution. @@ -134,7 +126,6 @@ def set_up_port_storage_writer(self, port_id: PortIdentity, storage_uri: str): Create a separate thread for saving output tuples of a port to storage in batch. """ - self._storage_uris[port_id] = storage_uri document, _ = DocumentFactory.open_document(storage_uri) buffered_item_writer = document.writer(str(get_worker_index(self.worker_id))) writer_queue = Queue() @@ -180,31 +171,6 @@ def save_tuple_to_storage_if_needed(self, tuple_: Tuple, port_id=None) -> None: PortStorageWriterElement(data_tuple=tuple_) ) - def save_state_to_storage_if_needed(self, state: State, port_id=None) -> None: - if port_id is None: - uris = self._storage_uris.values() - elif port_id in self._storage_uris: - uris = [self._storage_uris[port_id]] - else: - return - - for uri in uris: - state_uri = state_uri_from_result_uri(uri) - try: - document = DocumentFactory.open_document(state_uri)[0] - except ValueError: - document = DocumentFactory.create_document(state_uri, STATE_SCHEMA) - writer = document.writer(str(uuid.uuid4())) - writer.put_one(serialize_state(state)) - writer.close() - - def reset_output_storage(self) -> None: - port_id = self.get_port_ids()[0] - storage_uri = self._storage_uris[port_id] - self.close_port_storage_writers() - DocumentFactory.create_document(storage_uri, self._ports[port_id].get_schema()) - self.set_up_port_storage_writer(port_id, storage_uri) - def close_port_storage_writers(self) -> None: """ Flush the buffers of port storage writers and wait for all the diff --git a/amber/src/main/python/core/models/state.py b/amber/src/main/python/core/models/state.py index 
e5726cc3c2f..a496d5c41c2 100644 --- a/amber/src/main/python/core/models/state.py +++ b/amber/src/main/python/core/models/state.py @@ -32,10 +32,6 @@ STATE_SCHEMA = Schema(raw_schema={STATE_CONTENT: "STRING"}) -def state_uri_from_result_uri(result_uri: str) -> str: - return result_uri.replace("/result", "/state") - - def serialize_state(state: State) -> Tuple: return Tuple( { diff --git a/amber/src/main/python/core/storage/document_factory.py b/amber/src/main/python/core/storage/document_factory.py index 8a4d6fe3c5f..9b686ab66b6 100644 --- a/amber/src/main/python/core/storage/document_factory.py +++ b/amber/src/main/python/core/storage/document_factory.py @@ -61,35 +61,30 @@ def create_document(uri: str, schema: Schema) -> VirtualDocument: if parsed_uri.scheme == VFSURIFactory.VFS_FILE_URI_SCHEME: _, _, _, resource_type = VFSURIFactory.decode_uri(uri) - match resource_type: - case VFSResourceType.RESULT: - namespace = StorageConfig.ICEBERG_TABLE_RESULT_NAMESPACE - case VFSResourceType.STATE: - namespace = "state" - case _: - raise ValueError(f"Resource type {resource_type} is not supported") - - storage_key = DocumentFactory.sanitize_uri_path(parsed_uri) - # Convert Amber Schema to Iceberg Schema with LARGE_BINARY - # field name encoding - iceberg_schema = amber_schema_to_iceberg_schema(schema) - - create_table( - IcebergCatalogInstance.get_instance(), - namespace, - storage_key, - iceberg_schema, - override_if_exists=True, - ) - - return IcebergDocument[Tuple]( - namespace, - storage_key, - iceberg_schema, - amber_tuples_to_arrow_table, - arrow_table_to_amber_tuples, - ) - + if resource_type in {VFSResourceType.RESULT}: + storage_key = DocumentFactory.sanitize_uri_path(parsed_uri) + + # Convert Amber Schema to Iceberg Schema with LARGE_BINARY + # field name encoding + iceberg_schema = amber_schema_to_iceberg_schema(schema) + + create_table( + IcebergCatalogInstance.get_instance(), + StorageConfig.ICEBERG_TABLE_RESULT_NAMESPACE, + storage_key, + iceberg_schema, + 
override_if_exists=True, + ) + + return IcebergDocument[Tuple]( + StorageConfig.ICEBERG_TABLE_RESULT_NAMESPACE, + storage_key, + iceberg_schema, + amber_tuples_to_arrow_table, + arrow_table_to_amber_tuples, + ) + else: + raise ValueError(f"Resource type {resource_type} is not supported") else: raise NotImplementedError( f"Unsupported URI scheme: {parsed_uri.scheme} for creating the document" @@ -101,36 +96,30 @@ def open_document(uri: str) -> typing.Tuple[VirtualDocument, Optional[Schema]]: if parsed_uri.scheme == "vfs": _, _, _, resource_type = VFSURIFactory.decode_uri(uri) - match resource_type: - case VFSResourceType.RESULT: - namespace = StorageConfig.ICEBERG_TABLE_RESULT_NAMESPACE - case VFSResourceType.STATE: - namespace = "state" - case _: - raise ValueError(f"Resource type {resource_type} is not supported") - - storage_key = DocumentFactory.sanitize_uri_path(parsed_uri) - - table = load_table_metadata( - IcebergCatalogInstance.get_instance(), - namespace, - storage_key, - ) - - if table is None: - raise ValueError("No storage is found for the given URI") - - amber_schema = Schema(table.schema().as_arrow()) - - document = IcebergDocument( - namespace, - storage_key, - table.schema(), - amber_tuples_to_arrow_table, - arrow_table_to_amber_tuples, - ) - return document, amber_schema - + if resource_type in {VFSResourceType.RESULT}: + storage_key = DocumentFactory.sanitize_uri_path(parsed_uri) + + table = load_table_metadata( + IcebergCatalogInstance.get_instance(), + StorageConfig.ICEBERG_TABLE_RESULT_NAMESPACE, + storage_key, + ) + + if table is None: + raise ValueError("No storage is found for the given URI") + + amber_schema = Schema(table.schema().as_arrow()) + + document = IcebergDocument( + StorageConfig.ICEBERG_TABLE_RESULT_NAMESPACE, + storage_key, + table.schema(), + amber_tuples_to_arrow_table, + arrow_table_to_amber_tuples, + ) + return document, amber_schema + else: + raise ValueError(f"Resource type {resource_type} is not supported") else: raise 
NotImplementedError( f"Unsupported URI scheme: {parsed_uri.scheme} for opening the document" diff --git a/amber/src/main/python/core/storage/runnables/input_port_materialization_reader_runnable.py b/amber/src/main/python/core/storage/runnables/input_port_materialization_reader_runnable.py index a600f878572..e49c0316cc7 100644 --- a/amber/src/main/python/core/storage/runnables/input_port_materialization_reader_runnable.py +++ b/amber/src/main/python/core/storage/runnables/input_port_materialization_reader_runnable.py @@ -17,8 +17,8 @@ import typing from loguru import logger -from typing import Union from pyarrow import Table +from typing import Union from core.architecture.sendsemantics.broad_cast_partitioner import ( BroadcastPartitioner, @@ -34,9 +34,8 @@ from core.architecture.sendsemantics.round_robin_partitioner import ( RoundRobinPartitioner, ) -from core.models import Tuple, InternalQueue, DataFrame, DataPayload, State, StateFrame +from core.models import Tuple, InternalQueue, DataFrame, DataPayload from core.models.internal_queue import DataElement, ECMElement -from core.models.state import deserialize_state, state_uri_from_result_uri from core.storage.document_factory import DocumentFactory from core.util import Stoppable, get_one_of from core.util.runnable.runnable import Runnable @@ -126,15 +125,6 @@ def tuple_to_batch_with_filter(self, tuple_: Tuple) -> typing.Iterator[DataFrame if receiver == self.worker_actor_id: yield self.tuples_to_data_frame(tuples) - def emit_state_with_filter(self, state: State) -> typing.Iterator[StateFrame]: - for receiver, payload in self.partitioner.flush_state(state): - if receiver == self.worker_actor_id: - yield ( - StateFrame(payload) - if isinstance(payload, dict) - else self.tuples_to_data_frame(payload) - ) - def run(self) -> None: """ Main execution logic that reads tuples from the materialized storage and @@ -148,21 +138,8 @@ def run(self) -> None: self.uri ) self.emit_ecm("StartChannel", 
EmbeddedControlMessageType.NO_ALIGNMENT) - - try: - state_document, _ = DocumentFactory.open_document( - state_uri_from_result_uri(self.uri) - ) - state_iterator = state_document.get() - for state in state_iterator: - for state_frame in self.emit_state_with_filter( - deserialize_state(state) - ): - self.emit_payload(state_frame) - except ValueError: - pass - storage_iterator = self.materialization.get() + # Iterate and process tuples. for tup in storage_iterator: if self._stopped: diff --git a/amber/src/main/python/core/storage/vfs_uri_factory.py b/amber/src/main/python/core/storage/vfs_uri_factory.py index 0e23e607055..de0c5db56ec 100644 --- a/amber/src/main/python/core/storage/vfs_uri_factory.py +++ b/amber/src/main/python/core/storage/vfs_uri_factory.py @@ -34,7 +34,6 @@ class VFSResourceType(str, Enum): RESULT = "result" RUNTIME_STATISTICS = "runtimeStatistics" CONSOLE_MESSAGES = "consoleMessages" - STATE = "state" class VFSURIFactory: diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/messaginglayer/OutputManager.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/messaginglayer/OutputManager.scala index 53755b780cc..4ab3d18056f 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/messaginglayer/OutputManager.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/messaginglayer/OutputManager.scala @@ -124,8 +124,6 @@ class OutputManager( : mutable.HashMap[PortIdentity, OutputPortResultWriterThread] = mutable.HashMap() - private val storageUris: mutable.HashMap[Int, URI] = mutable.HashMap() - /** * Add down stream operator and its corresponding Partitioner. 
* @@ -234,23 +232,6 @@ class OutputManager( }) } - def saveStateToStorageIfNeeded(state: State): Unit = { - try { - storageUris.foreach { - case (_, uri) => - val writer = DocumentFactory - .openDocument(State.stateUriFromResultUri(uri)) - ._1 - .writer(VirtualIdentityUtils.getWorkerIndex(actorId).toString) - .asInstanceOf[BufferedItemWriter[Tuple]] - writer.putOne(State.serialize(state)) - writer.close() - } - } catch { - case _: Exception => () - } - } - /** * Singal the port storage writer to flush the remaining buffer and wait for commits to finish so that * the output port is properly completed. If the output port does not need storage, no action will be done. @@ -299,7 +280,6 @@ class OutputManager( } private def setupOutputStorageWriterThread(portId: PortIdentity, storageUri: URI): Unit = { - this.storageUris(portId.id) = storageUri val bufferedItemWriter = DocumentFactory .openDocument(storageUri) ._1 diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala index 5be5d942e5c..e490cde3d9b 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala @@ -20,8 +20,7 @@ package org.apache.texera.amber.engine.architecture.scheduling import org.apache.pekko.pattern.gracefulStop -import com.twitter.util.{Duration => TwitterDuration, Future, JavaTimer, Return, Throw, Timer} -import org.apache.texera.amber.core.state.State +import com.twitter.util.{Future, Return, Throw} import org.apache.texera.amber.core.storage.DocumentFactory import org.apache.texera.amber.core.storage.VFSURIFactory.decodeURI import org.apache.texera.amber.core.virtualidentity.ActorVirtualIdentity @@ -62,7 +61,7 @@ import 
org.apache.texera.web.resource.dashboard.user.workflow.WorkflowExecutions import java.util.concurrent.TimeUnit import java.util.concurrent.atomic.AtomicReference -import scala.concurrent.duration.{Duration => ScalaDuration} +import scala.concurrent.duration.Duration /** * The executor of a region. @@ -110,14 +109,10 @@ class RegionExecutionCoordinator( private val currentPhaseRef: AtomicReference[RegionExecutionPhase] = new AtomicReference( Unexecuted ) - private val terminationFutureRef: AtomicReference[Future[Unit]] = new AtomicReference(null) - private val killRetryTimer: Timer = new JavaTimer(true) - private val killRetryDelay: TwitterDuration = TwitterDuration.fromMilliseconds(200) /** * Sync the status of `RegionExecution` and transition this coordinator's phase to `Completed` only when the - * coordinator is currently in `ExecutingNonDependeePortsPhase`, all the ports of this region are completed, and - * all workers in this region are terminated. + * coordinator is currently in `ExecutingNonDependeePortsPhase` and all the ports of this region are completed. * * Additionally, this method will also terminate all the workers of this region: * @@ -140,22 +135,12 @@ class RegionExecutionCoordinator( return Future.Unit } - val existingTerminationFuture = terminationFutureRef.get - if (existingTerminationFuture != null) { - existingTerminationFuture - } else { - val terminationFuture = terminateWorkersWithRetry(regionExecution).flatMap { _ => - // Set this coordinator's status to be completed so that subsequent regions can be started by - // WorkflowExecutionCoordinator. - setPhase(Completed) - Future.Unit - } - if (terminationFutureRef.compareAndSet(null, terminationFuture)) { - terminationFuture - } else { - terminationFutureRef.get - } - } + // Set this coordinator's status to be completed so that subsequent regions can be started by + // WorkflowExecutionCoordinator. + setPhase(Completed) + + // Terminate all the workers in this region. 
+ terminateWorkers(regionExecution) } private def terminateWorkers(regionExecution: RegionExecution) = { @@ -182,7 +167,7 @@ class RegionExecutionCoordinator( val actorRef = actorRefService.getActorRef(workerId) // Remove the actorRef so that no other actors can find the worker and send messages. actorRefService.removeActorRef(workerId) - gracefulStop(actorRef, ScalaDuration(5, TimeUnit.SECONDS)).asTwitter() + gracefulStop(actorRef, Duration(5, TimeUnit.SECONDS)).asTwitter() } }.toSeq @@ -206,29 +191,8 @@ class RegionExecutionCoordinator( } } - private def terminateWorkersWithRetry( - regionExecution: RegionExecution, - attempt: Int = 1 - ): Future[Unit] = { - terminateWorkers(regionExecution).rescue { case err => - logger.warn( - s"Failed to terminate region ${region.id.id} on attempt $attempt. Retrying in ${killRetryDelay.inMilliseconds} ms.", - err - ) - Future - .sleep(killRetryDelay)(killRetryTimer) - .flatMap(_ => terminateWorkersWithRetry(regionExecution, attempt + 1)) - } - } - def isCompleted: Boolean = currentPhaseRef.get == Completed - /** - * Returns the region termination future if termination has been initiated. - * This is only set by `tryCompleteRegionExecution()`. 
- */ - def getTerminationFutureOpt: Option[Future[Unit]] = Option(terminationFutureRef.get) - /** * This will sync and transition the region execution phase from one to another depending on its current phase: * @@ -564,14 +528,12 @@ class RegionExecutionCoordinator( portConfigs.foreach { case (outputPortId, portConfig) => val storageUriToAdd = portConfig.storageURI - val stateUriToAdd = State.stateUriFromResultUri(storageUriToAdd) val (_, eid, _, _) = decodeURI(storageUriToAdd) val schemaOptional = region.getOperator(outputPortId.opId).outputPorts(outputPortId.portId)._3 val schema = schemaOptional.getOrElse(throw new IllegalStateException("Schema is missing")) DocumentFactory.createDocument(storageUriToAdd, schema) - DocumentFactory.createDocument(stateUriToAdd, State.schema) WorkflowExecutionsResource.insertOperatorPortResultUri( eid = eid, globalPortId = outputPortId, diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/worker/DataProcessor.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/worker/DataProcessor.scala index 65c560ee594..3aa5fa90a46 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/worker/DataProcessor.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/worker/DataProcessor.scala @@ -126,7 +126,6 @@ class DataProcessor( val outputState = executor.processState(state, port) if (outputState.isDefined) { outputManager.emitState(outputState.get) - outputManager.saveStateToStorageIfNeeded(state) } } catch safely { case e => diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/worker/managers/InputPortMaterializationReaderThread.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/worker/managers/InputPortMaterializationReaderThread.scala index acada743bc6..10fbbc44a2c 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/worker/managers/InputPortMaterializationReaderThread.scala +++ 
b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/worker/managers/InputPortMaterializationReaderThread.scala @@ -21,7 +21,6 @@ package org.apache.texera.amber.engine.architecture.worker.managers import io.grpc.MethodDescriptor import org.apache.texera.amber.config.ApplicationConfig -import org.apache.texera.amber.core.state.State import org.apache.texera.amber.core.storage.DocumentFactory import org.apache.texera.amber.core.storage.model.VirtualDocument import org.apache.texera.amber.core.tuple.Tuple @@ -46,11 +45,7 @@ import org.apache.texera.amber.engine.architecture.worker.WorkflowWorker.{ DPInputQueueElement, FIFOMessageElement } -import org.apache.texera.amber.engine.common.ambermessage.{ - DataFrame, - StateFrame, - WorkflowFIFOMessage -} +import org.apache.texera.amber.engine.common.ambermessage.{DataFrame, WorkflowFIFOMessage} import org.apache.texera.amber.util.VirtualIdentityUtils.getFromActorIdForInputPortStorage import java.net.URI @@ -111,25 +106,6 @@ class InputPortMaterializationReaderThread( } // Flush any remaining tuples in the buffer. 
if (buffer.nonEmpty) flush() - - try { - val state_document = - DocumentFactory - .openDocument(State.stateUriFromResultUri(uri)) - ._1 - .asInstanceOf[VirtualDocument[Tuple]] - val stateReadIterator = state_document.get() - - while (stateReadIterator.hasNext) { - val state = State.deserialize(stateReadIterator.next()) - inputMessageQueue.put( - FIFOMessageElement(WorkflowFIFOMessage(channelId, getSequenceNumber, StateFrame(state))) - ) - } - } catch { - case _: Exception => - } - emitECM(METHOD_END_CHANNEL, PORT_ALIGNMENT) isFinished.set(true) } catch { diff --git a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/State.scala b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/State.scala index f76a314b7ae..4957f31a407 100644 --- a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/State.scala +++ b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/State.scala @@ -23,7 +23,6 @@ import com.fasterxml.jackson.databind.JsonNode import org.apache.texera.amber.core.tuple.{Attribute, AttributeType, Schema, Tuple} import org.apache.texera.amber.util.JSONUtils.objectMapper -import java.net.URI import java.util.Base64 import scala.jdk.CollectionConverters.IteratorHasAsScala @@ -37,9 +36,6 @@ object State { new Attribute(StateContent, AttributeType.STRING) ) - def stateUriFromResultUri(resultUri: URI): URI = - new URI(resultUri.toString.replace("/result", "/state")) - def serialize(state: State): Tuple = { val payloadJson = objectMapper.writeValueAsString(toJsonValue(state)) Tuple.builder(schema).addSequentially(Array(payloadJson)).build() diff --git a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/DocumentFactory.scala b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/DocumentFactory.scala index ae37def667e..15949ef4717 100644 --- a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/DocumentFactory.scala +++ 
b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/DocumentFactory.scala @@ -72,7 +72,6 @@ object DocumentFactory { case RESULT => StorageConfig.icebergTableResultNamespace case CONSOLE_MESSAGES => StorageConfig.icebergTableConsoleMessagesNamespace case RUNTIME_STATISTICS => StorageConfig.icebergTableRuntimeStatisticsNamespace - case STATE => "state" case _ => throw new IllegalArgumentException(s"Resource type $resourceType is not supported") } @@ -120,7 +119,6 @@ object DocumentFactory { case RESULT => StorageConfig.icebergTableResultNamespace case CONSOLE_MESSAGES => StorageConfig.icebergTableConsoleMessagesNamespace case RUNTIME_STATISTICS => StorageConfig.icebergTableRuntimeStatisticsNamespace - case STATE => "state" case _ => throw new IllegalArgumentException(s"Resource type $resourceType is not supported") } diff --git a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/VFSURIFactory.scala b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/VFSURIFactory.scala index 990776a69f0..3513ac5ecd8 100644 --- a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/VFSURIFactory.scala +++ b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/VFSURIFactory.scala @@ -34,7 +34,6 @@ object VFSResourceType extends Enumeration { val RESULT: Value = Value("result") val RUNTIME_STATISTICS: Value = Value("runtimeStatistics") val CONSOLE_MESSAGES: Value = Value("consoleMessages") - val STATE: Value = Value("state") } object VFSURIFactory { From 3c0c8164e7657234820a6798a42ebaa30d9f6abf Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Thu, 23 Apr 2026 00:57:09 -0700 Subject: [PATCH 031/152] feat: add state materialization support --- .../architecture/packaging/output_manager.py | 36 +++++- amber/src/main/python/core/models/state.py | 4 + .../python/core/storage/document_factory.py | 107 ++++++++++-------- ...ut_port_materialization_reader_runnable.py | 29 ++++- 
.../python/core/storage/vfs_uri_factory.py | 1 + .../messaginglayer/OutputManager.scala | 20 ++++ .../RegionExecutionCoordinator.scala | 58 ++++++++-- .../architecture/worker/DataProcessor.scala | 1 + ...InputPortMaterializationReaderThread.scala | 26 ++++- .../texera/amber/core/state/State.scala | 4 + .../amber/core/storage/DocumentFactory.scala | 2 + .../amber/core/storage/VFSURIFactory.scala | 1 + 12 files changed, 226 insertions(+), 63 deletions(-) diff --git a/amber/src/main/python/core/architecture/packaging/output_manager.py b/amber/src/main/python/core/architecture/packaging/output_manager.py index afa9127fe6e..065b063f7d4 100644 --- a/amber/src/main/python/core/architecture/packaging/output_manager.py +++ b/amber/src/main/python/core/architecture/packaging/output_manager.py @@ -17,6 +17,7 @@ import threading import typing +import uuid from collections import OrderedDict from itertools import chain from loguru import logger @@ -43,7 +44,12 @@ ) from core.models import Tuple, Schema, StateFrame from core.models.payload import DataPayload, DataFrame -from core.models.state import State +from core.models.state import ( + State, + STATE_SCHEMA, + serialize_state, + state_uri_from_result_uri, +) from core.storage.document_factory import DocumentFactory from core.storage.runnables.port_storage_writer import ( PortStorageWriter, @@ -87,6 +93,8 @@ def __init__(self, worker_id: str): PortIdentity, typing.Tuple[Queue, PortStorageWriter, Thread] ] = dict() + self._storage_uris: typing.Dict[PortIdentity, str] = dict() + def is_missing_output_ports(self): """ This method is only used for ensuring correct region execution. @@ -126,6 +134,7 @@ def set_up_port_storage_writer(self, port_id: PortIdentity, storage_uri: str): Create a separate thread for saving output tuples of a port to storage in batch. 
""" + self._storage_uris[port_id] = storage_uri document, _ = DocumentFactory.open_document(storage_uri) buffered_item_writer = document.writer(str(get_worker_index(self.worker_id))) writer_queue = Queue() @@ -171,6 +180,31 @@ def save_tuple_to_storage_if_needed(self, tuple_: Tuple, port_id=None) -> None: PortStorageWriterElement(data_tuple=tuple_) ) + def save_state_to_storage_if_needed(self, state: State, port_id=None) -> None: + if port_id is None: + uris = self._storage_uris.values() + elif port_id in self._storage_uris: + uris = [self._storage_uris[port_id]] + else: + return + + for uri in uris: + state_uri = state_uri_from_result_uri(uri) + try: + document = DocumentFactory.open_document(state_uri)[0] + except ValueError: + document = DocumentFactory.create_document(state_uri, STATE_SCHEMA) + writer = document.writer(str(uuid.uuid4())) + writer.put_one(serialize_state(state)) + writer.close() + + def reset_output_storage(self) -> None: + port_id = self.get_port_ids()[0] + storage_uri = self._storage_uris[port_id] + self.close_port_storage_writers() + DocumentFactory.create_document(storage_uri, self._ports[port_id].get_schema()) + self.set_up_port_storage_writer(port_id, storage_uri) + def close_port_storage_writers(self) -> None: """ Flush the buffers of port storage writers and wait for all the diff --git a/amber/src/main/python/core/models/state.py b/amber/src/main/python/core/models/state.py index a496d5c41c2..e5726cc3c2f 100644 --- a/amber/src/main/python/core/models/state.py +++ b/amber/src/main/python/core/models/state.py @@ -32,6 +32,10 @@ STATE_SCHEMA = Schema(raw_schema={STATE_CONTENT: "STRING"}) +def state_uri_from_result_uri(result_uri: str) -> str: + return result_uri.replace("/result", "/state") + + def serialize_state(state: State) -> Tuple: return Tuple( { diff --git a/amber/src/main/python/core/storage/document_factory.py b/amber/src/main/python/core/storage/document_factory.py index 9b686ab66b6..8a4d6fe3c5f 100644 --- 
a/amber/src/main/python/core/storage/document_factory.py +++ b/amber/src/main/python/core/storage/document_factory.py @@ -61,30 +61,35 @@ def create_document(uri: str, schema: Schema) -> VirtualDocument: if parsed_uri.scheme == VFSURIFactory.VFS_FILE_URI_SCHEME: _, _, _, resource_type = VFSURIFactory.decode_uri(uri) - if resource_type in {VFSResourceType.RESULT}: - storage_key = DocumentFactory.sanitize_uri_path(parsed_uri) - - # Convert Amber Schema to Iceberg Schema with LARGE_BINARY - # field name encoding - iceberg_schema = amber_schema_to_iceberg_schema(schema) - - create_table( - IcebergCatalogInstance.get_instance(), - StorageConfig.ICEBERG_TABLE_RESULT_NAMESPACE, - storage_key, - iceberg_schema, - override_if_exists=True, - ) - - return IcebergDocument[Tuple]( - StorageConfig.ICEBERG_TABLE_RESULT_NAMESPACE, - storage_key, - iceberg_schema, - amber_tuples_to_arrow_table, - arrow_table_to_amber_tuples, - ) - else: - raise ValueError(f"Resource type {resource_type} is not supported") + match resource_type: + case VFSResourceType.RESULT: + namespace = StorageConfig.ICEBERG_TABLE_RESULT_NAMESPACE + case VFSResourceType.STATE: + namespace = "state" + case _: + raise ValueError(f"Resource type {resource_type} is not supported") + + storage_key = DocumentFactory.sanitize_uri_path(parsed_uri) + # Convert Amber Schema to Iceberg Schema with LARGE_BINARY + # field name encoding + iceberg_schema = amber_schema_to_iceberg_schema(schema) + + create_table( + IcebergCatalogInstance.get_instance(), + namespace, + storage_key, + iceberg_schema, + override_if_exists=True, + ) + + return IcebergDocument[Tuple]( + namespace, + storage_key, + iceberg_schema, + amber_tuples_to_arrow_table, + arrow_table_to_amber_tuples, + ) + else: raise NotImplementedError( f"Unsupported URI scheme: {parsed_uri.scheme} for creating the document" @@ -96,30 +101,36 @@ def open_document(uri: str) -> typing.Tuple[VirtualDocument, Optional[Schema]]: if parsed_uri.scheme == "vfs": _, _, _, 
resource_type = VFSURIFactory.decode_uri(uri) - if resource_type in {VFSResourceType.RESULT}: - storage_key = DocumentFactory.sanitize_uri_path(parsed_uri) - - table = load_table_metadata( - IcebergCatalogInstance.get_instance(), - StorageConfig.ICEBERG_TABLE_RESULT_NAMESPACE, - storage_key, - ) - - if table is None: - raise ValueError("No storage is found for the given URI") - - amber_schema = Schema(table.schema().as_arrow()) - - document = IcebergDocument( - StorageConfig.ICEBERG_TABLE_RESULT_NAMESPACE, - storage_key, - table.schema(), - amber_tuples_to_arrow_table, - arrow_table_to_amber_tuples, - ) - return document, amber_schema - else: - raise ValueError(f"Resource type {resource_type} is not supported") + match resource_type: + case VFSResourceType.RESULT: + namespace = StorageConfig.ICEBERG_TABLE_RESULT_NAMESPACE + case VFSResourceType.STATE: + namespace = "state" + case _: + raise ValueError(f"Resource type {resource_type} is not supported") + + storage_key = DocumentFactory.sanitize_uri_path(parsed_uri) + + table = load_table_metadata( + IcebergCatalogInstance.get_instance(), + namespace, + storage_key, + ) + + if table is None: + raise ValueError("No storage is found for the given URI") + + amber_schema = Schema(table.schema().as_arrow()) + + document = IcebergDocument( + namespace, + storage_key, + table.schema(), + amber_tuples_to_arrow_table, + arrow_table_to_amber_tuples, + ) + return document, amber_schema + else: raise NotImplementedError( f"Unsupported URI scheme: {parsed_uri.scheme} for opening the document" diff --git a/amber/src/main/python/core/storage/runnables/input_port_materialization_reader_runnable.py b/amber/src/main/python/core/storage/runnables/input_port_materialization_reader_runnable.py index e49c0316cc7..a600f878572 100644 --- a/amber/src/main/python/core/storage/runnables/input_port_materialization_reader_runnable.py +++ b/amber/src/main/python/core/storage/runnables/input_port_materialization_reader_runnable.py @@ -17,8 +17,8 
@@ import typing from loguru import logger -from pyarrow import Table from typing import Union +from pyarrow import Table from core.architecture.sendsemantics.broad_cast_partitioner import ( BroadcastPartitioner, @@ -34,8 +34,9 @@ from core.architecture.sendsemantics.round_robin_partitioner import ( RoundRobinPartitioner, ) -from core.models import Tuple, InternalQueue, DataFrame, DataPayload +from core.models import Tuple, InternalQueue, DataFrame, DataPayload, State, StateFrame from core.models.internal_queue import DataElement, ECMElement +from core.models.state import deserialize_state, state_uri_from_result_uri from core.storage.document_factory import DocumentFactory from core.util import Stoppable, get_one_of from core.util.runnable.runnable import Runnable @@ -125,6 +126,15 @@ def tuple_to_batch_with_filter(self, tuple_: Tuple) -> typing.Iterator[DataFrame if receiver == self.worker_actor_id: yield self.tuples_to_data_frame(tuples) + def emit_state_with_filter(self, state: State) -> typing.Iterator[StateFrame]: + for receiver, payload in self.partitioner.flush_state(state): + if receiver == self.worker_actor_id: + yield ( + StateFrame(payload) + if isinstance(payload, dict) + else self.tuples_to_data_frame(payload) + ) + def run(self) -> None: """ Main execution logic that reads tuples from the materialized storage and @@ -138,8 +148,21 @@ def run(self) -> None: self.uri ) self.emit_ecm("StartChannel", EmbeddedControlMessageType.NO_ALIGNMENT) - storage_iterator = self.materialization.get() + try: + state_document, _ = DocumentFactory.open_document( + state_uri_from_result_uri(self.uri) + ) + state_iterator = state_document.get() + for state in state_iterator: + for state_frame in self.emit_state_with_filter( + deserialize_state(state) + ): + self.emit_payload(state_frame) + except ValueError: + pass + + storage_iterator = self.materialization.get() # Iterate and process tuples. 
for tup in storage_iterator: if self._stopped: diff --git a/amber/src/main/python/core/storage/vfs_uri_factory.py b/amber/src/main/python/core/storage/vfs_uri_factory.py index de0c5db56ec..0e23e607055 100644 --- a/amber/src/main/python/core/storage/vfs_uri_factory.py +++ b/amber/src/main/python/core/storage/vfs_uri_factory.py @@ -34,6 +34,7 @@ class VFSResourceType(str, Enum): RESULT = "result" RUNTIME_STATISTICS = "runtimeStatistics" CONSOLE_MESSAGES = "consoleMessages" + STATE = "state" class VFSURIFactory: diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/messaginglayer/OutputManager.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/messaginglayer/OutputManager.scala index 4ab3d18056f..53755b780cc 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/messaginglayer/OutputManager.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/messaginglayer/OutputManager.scala @@ -124,6 +124,8 @@ class OutputManager( : mutable.HashMap[PortIdentity, OutputPortResultWriterThread] = mutable.HashMap() + private val storageUris: mutable.HashMap[Int, URI] = mutable.HashMap() + /** * Add down stream operator and its corresponding Partitioner. * @@ -232,6 +234,23 @@ class OutputManager( }) } + def saveStateToStorageIfNeeded(state: State): Unit = { + try { + storageUris.foreach { + case (_, uri) => + val writer = DocumentFactory + .openDocument(State.stateUriFromResultUri(uri)) + ._1 + .writer(VirtualIdentityUtils.getWorkerIndex(actorId).toString) + .asInstanceOf[BufferedItemWriter[Tuple]] + writer.putOne(State.serialize(state)) + writer.close() + } + } catch { + case _: Exception => () + } + } + /** * Singal the port storage writer to flush the remaining buffer and wait for commits to finish so that * the output port is properly completed. If the output port does not need storage, no action will be done. 
@@ -280,6 +299,7 @@ class OutputManager( } private def setupOutputStorageWriterThread(portId: PortIdentity, storageUri: URI): Unit = { + this.storageUris(portId.id) = storageUri val bufferedItemWriter = DocumentFactory .openDocument(storageUri) ._1 diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala index e490cde3d9b..5be5d942e5c 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala @@ -20,7 +20,8 @@ package org.apache.texera.amber.engine.architecture.scheduling import org.apache.pekko.pattern.gracefulStop -import com.twitter.util.{Future, Return, Throw} +import com.twitter.util.{Duration => TwitterDuration, Future, JavaTimer, Return, Throw, Timer} +import org.apache.texera.amber.core.state.State import org.apache.texera.amber.core.storage.DocumentFactory import org.apache.texera.amber.core.storage.VFSURIFactory.decodeURI import org.apache.texera.amber.core.virtualidentity.ActorVirtualIdentity @@ -61,7 +62,7 @@ import org.apache.texera.web.resource.dashboard.user.workflow.WorkflowExecutions import java.util.concurrent.TimeUnit import java.util.concurrent.atomic.AtomicReference -import scala.concurrent.duration.Duration +import scala.concurrent.duration.{Duration => ScalaDuration} /** * The executor of a region. 
@@ -109,10 +110,14 @@ class RegionExecutionCoordinator( private val currentPhaseRef: AtomicReference[RegionExecutionPhase] = new AtomicReference( Unexecuted ) + private val terminationFutureRef: AtomicReference[Future[Unit]] = new AtomicReference(null) + private val killRetryTimer: Timer = new JavaTimer(true) + private val killRetryDelay: TwitterDuration = TwitterDuration.fromMilliseconds(200) /** * Sync the status of `RegionExecution` and transition this coordinator's phase to `Completed` only when the - * coordinator is currently in `ExecutingNonDependeePortsPhase` and all the ports of this region are completed. + * coordinator is currently in `ExecutingNonDependeePortsPhase`, all the ports of this region are completed, and + * all workers in this region are terminated. * * Additionally, this method will also terminate all the workers of this region: * @@ -135,12 +140,22 @@ class RegionExecutionCoordinator( return Future.Unit } - // Set this coordinator's status to be completed so that subsequent regions can be started by - // WorkflowExecutionCoordinator. - setPhase(Completed) - - // Terminate all the workers in this region. - terminateWorkers(regionExecution) + val existingTerminationFuture = terminationFutureRef.get + if (existingTerminationFuture != null) { + existingTerminationFuture + } else { + val terminationFuture = terminateWorkersWithRetry(regionExecution).flatMap { _ => + // Set this coordinator's status to be completed so that subsequent regions can be started by + // WorkflowExecutionCoordinator. + setPhase(Completed) + Future.Unit + } + if (terminationFutureRef.compareAndSet(null, terminationFuture)) { + terminationFuture + } else { + terminationFutureRef.get + } + } } private def terminateWorkers(regionExecution: RegionExecution) = { @@ -167,7 +182,7 @@ class RegionExecutionCoordinator( val actorRef = actorRefService.getActorRef(workerId) // Remove the actorRef so that no other actors can find the worker and send messages. 
actorRefService.removeActorRef(workerId) - gracefulStop(actorRef, Duration(5, TimeUnit.SECONDS)).asTwitter() + gracefulStop(actorRef, ScalaDuration(5, TimeUnit.SECONDS)).asTwitter() } }.toSeq @@ -191,8 +206,29 @@ class RegionExecutionCoordinator( } } + private def terminateWorkersWithRetry( + regionExecution: RegionExecution, + attempt: Int = 1 + ): Future[Unit] = { + terminateWorkers(regionExecution).rescue { case err => + logger.warn( + s"Failed to terminate region ${region.id.id} on attempt $attempt. Retrying in ${killRetryDelay.inMilliseconds} ms.", + err + ) + Future + .sleep(killRetryDelay)(killRetryTimer) + .flatMap(_ => terminateWorkersWithRetry(regionExecution, attempt + 1)) + } + } + def isCompleted: Boolean = currentPhaseRef.get == Completed + /** + * Returns the region termination future if termination has been initiated. + * This is only set by `tryCompleteRegionExecution()`. + */ + def getTerminationFutureOpt: Option[Future[Unit]] = Option(terminationFutureRef.get) + /** * This will sync and transition the region execution phase from one to another depending on its current phase: * @@ -528,12 +564,14 @@ class RegionExecutionCoordinator( portConfigs.foreach { case (outputPortId, portConfig) => val storageUriToAdd = portConfig.storageURI + val stateUriToAdd = State.stateUriFromResultUri(storageUriToAdd) val (_, eid, _, _) = decodeURI(storageUriToAdd) val schemaOptional = region.getOperator(outputPortId.opId).outputPorts(outputPortId.portId)._3 val schema = schemaOptional.getOrElse(throw new IllegalStateException("Schema is missing")) DocumentFactory.createDocument(storageUriToAdd, schema) + DocumentFactory.createDocument(stateUriToAdd, State.schema) WorkflowExecutionsResource.insertOperatorPortResultUri( eid = eid, globalPortId = outputPortId, diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/worker/DataProcessor.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/worker/DataProcessor.scala index 
3aa5fa90a46..65c560ee594 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/worker/DataProcessor.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/worker/DataProcessor.scala @@ -126,6 +126,7 @@ class DataProcessor( val outputState = executor.processState(state, port) if (outputState.isDefined) { outputManager.emitState(outputState.get) + outputManager.saveStateToStorageIfNeeded(state) } } catch safely { case e => diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/worker/managers/InputPortMaterializationReaderThread.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/worker/managers/InputPortMaterializationReaderThread.scala index 10fbbc44a2c..acada743bc6 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/worker/managers/InputPortMaterializationReaderThread.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/worker/managers/InputPortMaterializationReaderThread.scala @@ -21,6 +21,7 @@ package org.apache.texera.amber.engine.architecture.worker.managers import io.grpc.MethodDescriptor import org.apache.texera.amber.config.ApplicationConfig +import org.apache.texera.amber.core.state.State import org.apache.texera.amber.core.storage.DocumentFactory import org.apache.texera.amber.core.storage.model.VirtualDocument import org.apache.texera.amber.core.tuple.Tuple @@ -45,7 +46,11 @@ import org.apache.texera.amber.engine.architecture.worker.WorkflowWorker.{ DPInputQueueElement, FIFOMessageElement } -import org.apache.texera.amber.engine.common.ambermessage.{DataFrame, WorkflowFIFOMessage} +import org.apache.texera.amber.engine.common.ambermessage.{ + DataFrame, + StateFrame, + WorkflowFIFOMessage +} import org.apache.texera.amber.util.VirtualIdentityUtils.getFromActorIdForInputPortStorage import java.net.URI @@ -106,6 +111,25 @@ class InputPortMaterializationReaderThread( } // Flush any remaining tuples in the buffer. 
if (buffer.nonEmpty) flush() + + try { + val state_document = + DocumentFactory + .openDocument(State.stateUriFromResultUri(uri)) + ._1 + .asInstanceOf[VirtualDocument[Tuple]] + val stateReadIterator = state_document.get() + + while (stateReadIterator.hasNext) { + val state = State.deserialize(stateReadIterator.next()) + inputMessageQueue.put( + FIFOMessageElement(WorkflowFIFOMessage(channelId, getSequenceNumber, StateFrame(state))) + ) + } + } catch { + case _: Exception => + } + emitECM(METHOD_END_CHANNEL, PORT_ALIGNMENT) isFinished.set(true) } catch { diff --git a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/State.scala b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/State.scala index 4957f31a407..f76a314b7ae 100644 --- a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/State.scala +++ b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/State.scala @@ -23,6 +23,7 @@ import com.fasterxml.jackson.databind.JsonNode import org.apache.texera.amber.core.tuple.{Attribute, AttributeType, Schema, Tuple} import org.apache.texera.amber.util.JSONUtils.objectMapper +import java.net.URI import java.util.Base64 import scala.jdk.CollectionConverters.IteratorHasAsScala @@ -36,6 +37,9 @@ object State { new Attribute(StateContent, AttributeType.STRING) ) + def stateUriFromResultUri(resultUri: URI): URI = + new URI(resultUri.toString.replace("/result", "/state")) + def serialize(state: State): Tuple = { val payloadJson = objectMapper.writeValueAsString(toJsonValue(state)) Tuple.builder(schema).addSequentially(Array(payloadJson)).build() diff --git a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/DocumentFactory.scala b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/DocumentFactory.scala index 15949ef4717..ae37def667e 100644 --- a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/DocumentFactory.scala +++ 
b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/DocumentFactory.scala @@ -72,6 +72,7 @@ object DocumentFactory { case RESULT => StorageConfig.icebergTableResultNamespace case CONSOLE_MESSAGES => StorageConfig.icebergTableConsoleMessagesNamespace case RUNTIME_STATISTICS => StorageConfig.icebergTableRuntimeStatisticsNamespace + case STATE => "state" case _ => throw new IllegalArgumentException(s"Resource type $resourceType is not supported") } @@ -119,6 +120,7 @@ object DocumentFactory { case RESULT => StorageConfig.icebergTableResultNamespace case CONSOLE_MESSAGES => StorageConfig.icebergTableConsoleMessagesNamespace case RUNTIME_STATISTICS => StorageConfig.icebergTableRuntimeStatisticsNamespace + case STATE => "state" case _ => throw new IllegalArgumentException(s"Resource type $resourceType is not supported") } diff --git a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/VFSURIFactory.scala b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/VFSURIFactory.scala index 3513ac5ecd8..990776a69f0 100644 --- a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/VFSURIFactory.scala +++ b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/VFSURIFactory.scala @@ -34,6 +34,7 @@ object VFSResourceType extends Enumeration { val RESULT: Value = Value("result") val RUNTIME_STATISTICS: Value = Value("runtimeStatistics") val CONSOLE_MESSAGES: Value = Value("consoleMessages") + val STATE: Value = Value("state") } object VFSURIFactory { From bd1bc287ff71163bf8b1ee7da7fe7d6151cd4180 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Thu, 23 Apr 2026 01:05:16 -0700 Subject: [PATCH 032/152] fix: keep only materialization change in region coordinator --- AGENTS.md | 76 +++++++++++++++++++ .../RegionExecutionCoordinator.scala | 55 +++----------- 2 files changed, 86 insertions(+), 45 deletions(-) create mode 100644 AGENTS.md diff --git a/AGENTS.md b/AGENTS.md new file mode 
100644 index 00000000000..4038ee1733d --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,76 @@ +# AGENTS.md (Texera) + +Quick orientation for agents working on Apache Texera (Incubating). Pair this file with the root `README.md` and the developer wiki linked from it. + +## Big picture (modules + service boundaries) +- Repo is an **sbt multi-project Scala backend** (Scala 2.13.12, JDK-based) plus an **Angular frontend** under `frontend/`. +- Backend services (Dropwizard/Jersey) live at the top level and share code under `common/`. The sbt module graph is defined in `build.sbt`; note that sbt project names do not always match folder names (e.g., `amber/` is the sbt project `WorkflowExecutionService`). +- Shared libraries (`common/`): + - `common/dao` (DAO), `common/config` (Config), `common/auth` (Auth, JWT setup), `common/workflow-core` (WorkflowCore), `common/workflow-operator` (WorkflowOperator, operator definitions and descriptors), `common/pybuilder` (PyBuilder — a `pyb"..."` macro DSL for composing Python code for Python operators). +- Services (top-level folders): + - `amber/` — main web application + workflow execution engine ("Amber" actor-based dataflow runtime). Serves the Angular GUI, REST API, and the collaboration WebSocket. + - `workflow-compiling-service/` — compiles workflow JSON into executable plans. + - `file-service/` — datasets/files (works with LakeFS / Iceberg catalogs, see `sql/`). + - `config-service/` — runtime configuration. + - `access-control-service/` — ACL + the AI assistant chat/completion endpoints. + - `computing-unit-managing-service/` — lifecycle of compute units (master/worker pods, scaling). + - `pyright-language-service/` — Pyright-backed language server for Python UDF editing. +- Python runtime companion: `amber/src/main/python/` (`pytexera`, `pyamber`, `core`, `proto`, `texera_run_python_worker.py`) — used by Python workers spawned by the Amber engine. 
+- A secondary, source-only copy of the operator library lives at `core/workflow-operator/src/...` (legacy / build artifact source; prefer `common/workflow-operator` for new code). +- Build note: `build.sbt` injects ASF licensing files (`LICENSE`, `NOTICE`, `DISCLAIMER-WIP`) into `META-INF/` of every JAR via `asfLicensingSettings`. + +## Service port map (default config) +| Service | App port | Admin port | Source config | +| --- | --- | --- | --- | +| amber (`TexeraWebApplication`) | 8080 | 8081 | `amber/src/main/resources/web-config.yml` | +| workflow-compiling-service | 9090 | — | `workflow-compiling-service/src/main/resources/workflow-compiling-service-config.yaml` | +| file-service | 9092 | — | `file-service/src/main/resources/file-service-web-config.yaml` | +| config-service | 9094 | — | `config-service/src/main/resources/config-service-web-config.yaml` | +| access-control-service (AI assistant, models) | 9096 | — | `access-control-service/src/main/resources/access-control-service-web-config.yaml` | +| computing-unit-managing-service | 8888 | 8082 | `computing-unit-managing-service/src/main/resources/computing-unit-managing-service-config.yaml` | +| WebSocket (collaboration, `/wsapi`) | 8085 | — | served by `amber` | +| y-websocket (shared editing `/rtc`) | 1234 | — | `bin/shared-editing-server.sh`, `bin/y-websocket-server/` | + +Frontend dev proxy routing (`frontend/proxy.config.json`) mirrors this split — e.g., `/api/compile` → 9090, `/api/dataset` → 9092, `/api/config/**` → 9094, `/api/models` and `/api/chat/completion` → 9096, `/api/computing-unit` → 8888, everything else `/api` → 8080. + +## Runtime flow & cross-component communication +- **REST base path:** every service mounts Jersey at `/api/*` (`environment.jersey.setUrlPattern("/api/*")` in each `Application.run`). 
+- **Web GUI serving:** `amber` serves Angular static output via `FileAssetsBundle("../../frontend/dist", "/", "index.html")` and redirects 404s to `/` so Angular client-side routing works (`TexeraWebApplication.scala`). +- **WebSockets:** collaboration is wired through Dropwizard `WebsocketBundle(classOf[CollaborationResource])`; the Jetty WS idle timeout is explicitly set to 1 hour via `WebSocketUpgradeFilter` in `TexeraWebApplication.run(...)`. +- **Database init pattern:** services call `SqlServer.initConnection(StorageConfig.jdbcUrl, ...)` during startup (see `TexeraWebApplication.scala`, `WorkflowCompilingService.scala`, etc.). DDL lives in `sql/texera_ddl.sql`; Iceberg / LakeFS / Lakekeeper bootstrap SQL is also under `sql/`. +- **Auth:** JWT auth is installed via `setupJwtAuth(environment)` in `amber`, plus `AuthValueFactoryProvider.Binder[SessionUser]` and `RolesAllowedDynamicFeature`. Resources under `.../resource/auth/` (`AuthResource`, `GoogleAuthResource`) own login; `AuthResource.createAdminUser()` runs at startup. +- **Request logging filter:** every service adds a Jetty request-log filter that logs through SLF4J logger `org.eclipse.jetty.server.RequestLog` (level controlled by env var `TEXERA_SERVICE_LOG_LEVEL`). Note the servlet-API split: `amber` currently uses `javax.servlet.*` while `workflow-compiling-service` (and other newer services) use `jakarta.servlet.*`. There is a TODO to consolidate onto `common/auth`'s `RequestLoggingFilter.register()` once `amber` upgrades to Dropwizard 4.x. +- **Config loading:** every service uses `SubstitutingSourceProvider` + `EnvironmentVariableSubstitutor(false)` so YAML configs support `${ENV_VAR}` expansion. + +## Where to make changes (project-specific conventions) +- **New backend endpoint:** create a Jersey `*Resource` under `/src/main/scala/.../resource/` and **register it in that service's `Application.run(...)`** via `environment.jersey.register(classOf[YourResource])`. 
`amber`'s `TexeraWebApplication.run(...)` already registers a long list — `AuthResource`, `WorkflowResource`, `DashboardResource`, `ProjectResource`, `HubResource`, `GmailResource`, `AIAssistantResource`, etc. Follow that pattern; don't rely on classpath scanning. +- **Shared backend logic** belongs in `common/*` (honor the dependency graph in `build.sbt`): `common/dao` for DB, `common/config` for config, `common/auth` for auth, `common/workflow-core` + `common/workflow-operator` for dataflow model/operators, `common/pybuilder` for Python code generation. +- **New operators:** add to `common/workflow-operator/...`; Python-backed operators typically use `common/pybuilder` and interact with the Python worker under `amber/src/main/python/`. +- **Frontend code** is isolated under `frontend/src/` (Angular, yarn-managed). `amber` only serves the built output from `frontend/dist`. The app modules live in `frontend/src/app/{common,dashboard,hub,workspace}`. +- When adding Jackson-touched types, respect the per-service Jackson `dependencyOverrides` already in `build.sbt` — different services pin different versions for Dropwizard 3 vs 4 compatibility. + +## Critical developer workflows +- **Build everything:** `bin/build.sh` runs `bin/build-services.sh` (which runs `sbt clean dist` and unzips the `target/universal/*.zip` artifacts into per-service `target/` dirs) and then `bin/frontend.sh` (`yarn install && yarn run build` in `frontend/`). +- **Run locally after build:** + - `bin/server.sh` — starts `amber` from `amber/target/texera-*/bin/texera-web-application`. + - `bin/workflow-compiling-service.sh`, `bin/file-service.sh`, `bin/config-service.sh`, `bin/computing-unit-managing-service.sh`, `bin/workflow-computing-unit.sh` — start the other services from their unzipped `target/` dirs. + - `bin/frontend-dev.sh` — frontend dev server with the proxy config above. + - `bin/shared-editing-server.sh` — y-websocket server for `/rtc` collaboration. 
+ - `bin/python-language-service.sh` / `bin/pylsp/` — Python language service. +- **Docker images:** Dockerfiles in `bin/*.dockerfile` **must be built from the repo root** as context (see `bin/README.md`). Example: `docker build -f bin/texera-web-application.dockerfile -t your-repo/texera-web-application:test .`. Helpers: `bin/build-images.sh`, `bin/merge-image-tags.sh`. +- **Deployment references:** single-node Docker Compose at `bin/single-node/docker-compose.yml` (+ `nginx.conf`, `examples/`); Kubernetes Helm chart at `bin/k8s/` (`Chart.yaml`, `values.yaml`, `values-development.yaml`, `templates/`). +- **Formatting/lint:** `bin/fix-format.sh`; Scalafmt config at `.scalafmt.conf`, Scalafix at `.scalafix.conf`. +- **Proto codegen:** `bin/python-proto-gen.sh`, `bin/frontend-proto-gen.sh`. +- **Service entrypoints** typically call `new ().run("server", )`. The YAML path either resolves via `TEXERA_HOME` (e.g., `WorkflowCompilingService`) or via `Utils.amberHomePath` (e.g., `TexeraWebApplication`). 
+ +## Map of the code (high-signal entrypoints) +- sbt module graph and ASF licensing task: `build.sbt` +- `amber` web app + REST registration + GUI serving + WebSocket + JWT: `amber/src/main/scala/org/apache/texera/web/TexeraWebApplication.scala` +- Amber dataflow engine: `amber/src/main/scala/org/apache/texera/amber/engine/architecture/{controller,worker,pythonworker,scheduling,messaginglayer,sendsemantics,deploysemantics,logreplay,common}/` +- Compilation service: `workflow-compiling-service/src/main/scala/org/apache/texera/service/WorkflowCompilingService.scala` +- Other service entrypoints: `*/src/main/scala/org/apache/texera/service/*Service.scala` (`FileService`, `ConfigService`, `AccessControlService`, `ComputingUnitManagingService`) +- Dashboard/user/hub/admin/AI resources: `amber/src/main/scala/org/apache/texera/web/resource/` (top-level + `auth/`, `aiassistant/`, `dashboard/{admin,hub,user}/`) +- Python worker + pytexera SDK: `amber/src/main/python/` +- Deployment artifacts: `bin/*.dockerfile`, `bin/*.sh`, `bin/single-node/`, `bin/k8s/` +- SQL DDL and catalog bootstrap: `sql/texera_ddl.sql`, `sql/texera_lakefs.sql`, `sql/texera_lakekeeper.sql`, `sql/iceberg_postgres_catalog.sql`, `sql/updates/` +- Root docs: `README.md` (links to developer wiki), `CONTRIBUTING.md`, `SECURITY.md`, `DISCLAIMER-WIP`. 
diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala index 5be5d942e5c..6600721e938 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala @@ -20,7 +20,7 @@ package org.apache.texera.amber.engine.architecture.scheduling import org.apache.pekko.pattern.gracefulStop -import com.twitter.util.{Duration => TwitterDuration, Future, JavaTimer, Return, Throw, Timer} +import com.twitter.util.{Future, Return, Throw} import org.apache.texera.amber.core.state.State import org.apache.texera.amber.core.storage.DocumentFactory import org.apache.texera.amber.core.storage.VFSURIFactory.decodeURI @@ -62,7 +62,7 @@ import org.apache.texera.web.resource.dashboard.user.workflow.WorkflowExecutions import java.util.concurrent.TimeUnit import java.util.concurrent.atomic.AtomicReference -import scala.concurrent.duration.{Duration => ScalaDuration} +import scala.concurrent.duration.Duration /** * The executor of a region. @@ -110,14 +110,10 @@ class RegionExecutionCoordinator( private val currentPhaseRef: AtomicReference[RegionExecutionPhase] = new AtomicReference( Unexecuted ) - private val terminationFutureRef: AtomicReference[Future[Unit]] = new AtomicReference(null) - private val killRetryTimer: Timer = new JavaTimer(true) - private val killRetryDelay: TwitterDuration = TwitterDuration.fromMilliseconds(200) /** * Sync the status of `RegionExecution` and transition this coordinator's phase to `Completed` only when the - * coordinator is currently in `ExecutingNonDependeePortsPhase`, all the ports of this region are completed, and - * all workers in this region are terminated. 
+ * coordinator is currently in `ExecutingNonDependeePortsPhase` and all the ports of this region are completed. * * Additionally, this method will also terminate all the workers of this region: * @@ -140,22 +136,12 @@ class RegionExecutionCoordinator( return Future.Unit } - val existingTerminationFuture = terminationFutureRef.get - if (existingTerminationFuture != null) { - existingTerminationFuture - } else { - val terminationFuture = terminateWorkersWithRetry(regionExecution).flatMap { _ => - // Set this coordinator's status to be completed so that subsequent regions can be started by - // WorkflowExecutionCoordinator. - setPhase(Completed) - Future.Unit - } - if (terminationFutureRef.compareAndSet(null, terminationFuture)) { - terminationFuture - } else { - terminationFutureRef.get - } - } + // Set this coordinator's status to be completed so that subsequent regions can be started by + // WorkflowExecutionCoordinator. + setPhase(Completed) + + // Terminate all the workers in this region. + terminateWorkers(regionExecution) } private def terminateWorkers(regionExecution: RegionExecution) = { @@ -182,7 +168,7 @@ class RegionExecutionCoordinator( val actorRef = actorRefService.getActorRef(workerId) // Remove the actorRef so that no other actors can find the worker and send messages. actorRefService.removeActorRef(workerId) - gracefulStop(actorRef, ScalaDuration(5, TimeUnit.SECONDS)).asTwitter() + gracefulStop(actorRef, Duration(5, TimeUnit.SECONDS)).asTwitter() } }.toSeq @@ -206,29 +192,8 @@ class RegionExecutionCoordinator( } } - private def terminateWorkersWithRetry( - regionExecution: RegionExecution, - attempt: Int = 1 - ): Future[Unit] = { - terminateWorkers(regionExecution).rescue { case err => - logger.warn( - s"Failed to terminate region ${region.id.id} on attempt $attempt. 
Retrying in ${killRetryDelay.inMilliseconds} ms.", - err - ) - Future - .sleep(killRetryDelay)(killRetryTimer) - .flatMap(_ => terminateWorkersWithRetry(regionExecution, attempt + 1)) - } - } - def isCompleted: Boolean = currentPhaseRef.get == Completed - /** - * Returns the region termination future if termination has been initiated. - * This is only set by `tryCompleteRegionExecution()`. - */ - def getTerminationFutureOpt: Option[Future[Unit]] = Option(terminationFutureRef.get) - /** * This will sync and transition the region execution phase from one to another depending on its current phase: * From 272caff58491b26ff27a26c8852b1a7e4bf2ed96 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Thu, 23 Apr 2026 01:05:33 -0700 Subject: [PATCH 033/152] chore: drop local AGENTS file from branch --- AGENTS.md | 76 ------------------------------------------------------- 1 file changed, 76 deletions(-) delete mode 100644 AGENTS.md diff --git a/AGENTS.md b/AGENTS.md deleted file mode 100644 index 4038ee1733d..00000000000 --- a/AGENTS.md +++ /dev/null @@ -1,76 +0,0 @@ -# AGENTS.md (Texera) - -Quick orientation for agents working on Apache Texera (Incubating). Pair this file with the root `README.md` and the developer wiki linked from it. - -## Big picture (modules + service boundaries) -- Repo is an **sbt multi-project Scala backend** (Scala 2.13.12, JDK-based) plus an **Angular frontend** under `frontend/`. -- Backend services (Dropwizard/Jersey) live at the top level and share code under `common/`. The sbt module graph is defined in `build.sbt`; note that sbt project names do not always match folder names (e.g., `amber/` is the sbt project `WorkflowExecutionService`). 
-- Shared libraries (`common/`): - - `common/dao` (DAO), `common/config` (Config), `common/auth` (Auth, JWT setup), `common/workflow-core` (WorkflowCore), `common/workflow-operator` (WorkflowOperator, operator definitions and descriptors), `common/pybuilder` (PyBuilder — a `pyb"..."` macro DSL for composing Python code for Python operators). -- Services (top-level folders): - - `amber/` — main web application + workflow execution engine ("Amber" actor-based dataflow runtime). Serves the Angular GUI, REST API, and the collaboration WebSocket. - - `workflow-compiling-service/` — compiles workflow JSON into executable plans. - - `file-service/` — datasets/files (works with LakeFS / Iceberg catalogs, see `sql/`). - - `config-service/` — runtime configuration. - - `access-control-service/` — ACL + the AI assistant chat/completion endpoints. - - `computing-unit-managing-service/` — lifecycle of compute units (master/worker pods, scaling). - - `pyright-language-service/` — Pyright-backed language server for Python UDF editing. -- Python runtime companion: `amber/src/main/python/` (`pytexera`, `pyamber`, `core`, `proto`, `texera_run_python_worker.py`) — used by Python workers spawned by the Amber engine. -- A secondary, source-only copy of the operator library lives at `core/workflow-operator/src/...` (legacy / build artifact source; prefer `common/workflow-operator` for new code). -- Build note: `build.sbt` injects ASF licensing files (`LICENSE`, `NOTICE`, `DISCLAIMER-WIP`) into `META-INF/` of every JAR via `asfLicensingSettings`. 
- -## Service port map (default config) -| Service | App port | Admin port | Source config | -| --- | --- | --- | --- | -| amber (`TexeraWebApplication`) | 8080 | 8081 | `amber/src/main/resources/web-config.yml` | -| workflow-compiling-service | 9090 | — | `workflow-compiling-service/src/main/resources/workflow-compiling-service-config.yaml` | -| file-service | 9092 | — | `file-service/src/main/resources/file-service-web-config.yaml` | -| config-service | 9094 | — | `config-service/src/main/resources/config-service-web-config.yaml` | -| access-control-service (AI assistant, models) | 9096 | — | `access-control-service/src/main/resources/access-control-service-web-config.yaml` | -| computing-unit-managing-service | 8888 | 8082 | `computing-unit-managing-service/src/main/resources/computing-unit-managing-service-config.yaml` | -| WebSocket (collaboration, `/wsapi`) | 8085 | — | served by `amber` | -| y-websocket (shared editing `/rtc`) | 1234 | — | `bin/shared-editing-server.sh`, `bin/y-websocket-server/` | - -Frontend dev proxy routing (`frontend/proxy.config.json`) mirrors this split — e.g., `/api/compile` → 9090, `/api/dataset` → 9092, `/api/config/**` → 9094, `/api/models` and `/api/chat/completion` → 9096, `/api/computing-unit` → 8888, everything else `/api` → 8080. - -## Runtime flow & cross-component communication -- **REST base path:** every service mounts Jersey at `/api/*` (`environment.jersey.setUrlPattern("/api/*")` in each `Application.run`). -- **Web GUI serving:** `amber` serves Angular static output via `FileAssetsBundle("../../frontend/dist", "/", "index.html")` and redirects 404s to `/` so Angular client-side routing works (`TexeraWebApplication.scala`). -- **WebSockets:** collaboration is wired through Dropwizard `WebsocketBundle(classOf[CollaborationResource])`; the Jetty WS idle timeout is explicitly set to 1 hour via `WebSocketUpgradeFilter` in `TexeraWebApplication.run(...)`. 
-- **Database init pattern:** services call `SqlServer.initConnection(StorageConfig.jdbcUrl, ...)` during startup (see `TexeraWebApplication.scala`, `WorkflowCompilingService.scala`, etc.). DDL lives in `sql/texera_ddl.sql`; Iceberg / LakeFS / Lakekeeper bootstrap SQL is also under `sql/`. -- **Auth:** JWT auth is installed via `setupJwtAuth(environment)` in `amber`, plus `AuthValueFactoryProvider.Binder[SessionUser]` and `RolesAllowedDynamicFeature`. Resources under `.../resource/auth/` (`AuthResource`, `GoogleAuthResource`) own login; `AuthResource.createAdminUser()` runs at startup. -- **Request logging filter:** every service adds a Jetty request-log filter that logs through SLF4J logger `org.eclipse.jetty.server.RequestLog` (level controlled by env var `TEXERA_SERVICE_LOG_LEVEL`). Note the servlet-API split: `amber` currently uses `javax.servlet.*` while `workflow-compiling-service` (and other newer services) use `jakarta.servlet.*`. There is a TODO to consolidate onto `common/auth`'s `RequestLoggingFilter.register()` once `amber` upgrades to Dropwizard 4.x. -- **Config loading:** every service uses `SubstitutingSourceProvider` + `EnvironmentVariableSubstitutor(false)` so YAML configs support `${ENV_VAR}` expansion. - -## Where to make changes (project-specific conventions) -- **New backend endpoint:** create a Jersey `*Resource` under `/src/main/scala/.../resource/` and **register it in that service's `Application.run(...)`** via `environment.jersey.register(classOf[YourResource])`. `amber`'s `TexeraWebApplication.run(...)` already registers a long list — `AuthResource`, `WorkflowResource`, `DashboardResource`, `ProjectResource`, `HubResource`, `GmailResource`, `AIAssistantResource`, etc. Follow that pattern; don't rely on classpath scanning. 
-- **Shared backend logic** belongs in `common/*` (honor the dependency graph in `build.sbt`): `common/dao` for DB, `common/config` for config, `common/auth` for auth, `common/workflow-core` + `common/workflow-operator` for dataflow model/operators, `common/pybuilder` for Python code generation. -- **New operators:** add to `common/workflow-operator/...`; Python-backed operators typically use `common/pybuilder` and interact with the Python worker under `amber/src/main/python/`. -- **Frontend code** is isolated under `frontend/src/` (Angular, yarn-managed). `amber` only serves the built output from `frontend/dist`. The app modules live in `frontend/src/app/{common,dashboard,hub,workspace}`. -- When adding Jackson-touched types, respect the per-service Jackson `dependencyOverrides` already in `build.sbt` — different services pin different versions for Dropwizard 3 vs 4 compatibility. - -## Critical developer workflows -- **Build everything:** `bin/build.sh` runs `bin/build-services.sh` (which runs `sbt clean dist` and unzips the `target/universal/*.zip` artifacts into per-service `target/` dirs) and then `bin/frontend.sh` (`yarn install && yarn run build` in `frontend/`). -- **Run locally after build:** - - `bin/server.sh` — starts `amber` from `amber/target/texera-*/bin/texera-web-application`. - - `bin/workflow-compiling-service.sh`, `bin/file-service.sh`, `bin/config-service.sh`, `bin/computing-unit-managing-service.sh`, `bin/workflow-computing-unit.sh` — start the other services from their unzipped `target/` dirs. - - `bin/frontend-dev.sh` — frontend dev server with the proxy config above. - - `bin/shared-editing-server.sh` — y-websocket server for `/rtc` collaboration. - - `bin/python-language-service.sh` / `bin/pylsp/` — Python language service. -- **Docker images:** Dockerfiles in `bin/*.dockerfile` **must be built from the repo root** as context (see `bin/README.md`). 
Example: `docker build -f bin/texera-web-application.dockerfile -t your-repo/texera-web-application:test .`. Helpers: `bin/build-images.sh`, `bin/merge-image-tags.sh`. -- **Deployment references:** single-node Docker Compose at `bin/single-node/docker-compose.yml` (+ `nginx.conf`, `examples/`); Kubernetes Helm chart at `bin/k8s/` (`Chart.yaml`, `values.yaml`, `values-development.yaml`, `templates/`). -- **Formatting/lint:** `bin/fix-format.sh`; Scalafmt config at `.scalafmt.conf`, Scalafix at `.scalafix.conf`. -- **Proto codegen:** `bin/python-proto-gen.sh`, `bin/frontend-proto-gen.sh`. -- **Service entrypoints** typically call `new ().run("server", )`. The YAML path either resolves via `TEXERA_HOME` (e.g., `WorkflowCompilingService`) or via `Utils.amberHomePath` (e.g., `TexeraWebApplication`). - -## Map of the code (high-signal entrypoints) -- sbt module graph and ASF licensing task: `build.sbt` -- `amber` web app + REST registration + GUI serving + WebSocket + JWT: `amber/src/main/scala/org/apache/texera/web/TexeraWebApplication.scala` -- Amber dataflow engine: `amber/src/main/scala/org/apache/texera/amber/engine/architecture/{controller,worker,pythonworker,scheduling,messaginglayer,sendsemantics,deploysemantics,logreplay,common}/` -- Compilation service: `workflow-compiling-service/src/main/scala/org/apache/texera/service/WorkflowCompilingService.scala` -- Other service entrypoints: `*/src/main/scala/org/apache/texera/service/*Service.scala` (`FileService`, `ConfigService`, `AccessControlService`, `ComputingUnitManagingService`) -- Dashboard/user/hub/admin/AI resources: `amber/src/main/scala/org/apache/texera/web/resource/` (top-level + `auth/`, `aiassistant/`, `dashboard/{admin,hub,user}/`) -- Python worker + pytexera SDK: `amber/src/main/python/` -- Deployment artifacts: `bin/*.dockerfile`, `bin/*.sh`, `bin/single-node/`, `bin/k8s/` -- SQL DDL and catalog bootstrap: `sql/texera_ddl.sql`, `sql/texera_lakefs.sql`, `sql/texera_lakekeeper.sql`, 
`sql/iceberg_postgres_catalog.sql`, `sql/updates/` -- Root docs: `README.md` (links to developer wiki), `CONTRIBUTING.md`, `SECURITY.md`, `DISCLAIMER-WIP`. From e7a0f15f3d88ecadd871408372c91bb78f4438aa Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Thu, 23 Apr 2026 01:08:30 -0700 Subject: [PATCH 034/152] test: cover state materialization round trip --- .../storage/iceberg/test_iceberg_document.py | 40 +++++++++++++++++++ .../result/iceberg/IcebergDocumentSpec.scala | 28 +++++++++++++ 2 files changed, 68 insertions(+) diff --git a/amber/src/main/python/core/storage/iceberg/test_iceberg_document.py b/amber/src/main/python/core/storage/iceberg/test_iceberg_document.py index 9b374f7d5c7..29e43f249c0 100644 --- a/amber/src/main/python/core/storage/iceberg/test_iceberg_document.py +++ b/amber/src/main/python/core/storage/iceberg/test_iceberg_document.py @@ -23,6 +23,12 @@ from concurrent.futures.thread import ThreadPoolExecutor from core.models import Schema, Tuple +from core.models.state import ( + STATE_SCHEMA, + deserialize_state, + serialize_state, + state_uri_from_result_uri, +) from core.storage.document_factory import DocumentFactory from core.storage.storage_config import StorageConfig from core.storage.vfs_uri_factory import VFSURIFactory @@ -317,3 +323,37 @@ def test_get_counts(self, iceberg_document, sample_items): assert iceberg_document.get_count() == len(sample_items), ( "get_count should return the same number as the length of sample_items" ) + + def test_state_materialization_round_trip(self): + operator_uuid = str(uuid.uuid4()).replace("-", "") + result_uri = VFSURIFactory.create_result_uri( + WorkflowIdentity(id=0), + ExecutionIdentity(id=0), + GlobalPortIdentity( + op_id=PhysicalOpIdentity( + logical_op_id=OperatorIdentity(id=f"test_state_{operator_uuid}"), + layer_name="main", + ), + port_id=PortIdentity(id=0), + input=False, + ), + ) + state_uri = state_uri_from_result_uri(result_uri) + DocumentFactory.create_document(state_uri, STATE_SCHEMA) + 
document, _ = DocumentFactory.open_document(state_uri) + + state = { + "loop_counter": 3, + "name": "outer-loop", + "payload": b"\x00\x01state-bytes", + "nested": {"enabled": True, "values": [1, 2, 3]}, + } + + writer = document.writer(str(uuid.uuid4())) + writer.open() + writer.put_one(serialize_state(state)) + writer.close() + + stored_rows = list(document.get()) + assert len(stored_rows) == 1 + assert deserialize_state(stored_rows[0]) == state diff --git a/common/workflow-core/src/test/scala/org/apache/texera/amber/storage/result/iceberg/IcebergDocumentSpec.scala b/common/workflow-core/src/test/scala/org/apache/texera/amber/storage/result/iceberg/IcebergDocumentSpec.scala index 8fdf039f3ea..761efe63415 100644 --- a/common/workflow-core/src/test/scala/org/apache/texera/amber/storage/result/iceberg/IcebergDocumentSpec.scala +++ b/common/workflow-core/src/test/scala/org/apache/texera/amber/storage/result/iceberg/IcebergDocumentSpec.scala @@ -20,6 +20,7 @@ package org.apache.texera.amber.storage.result.iceberg import org.apache.texera.amber.config.StorageConfig +import org.apache.texera.amber.core.state.State import org.apache.texera.amber.core.storage.model.{VirtualDocument, VirtualDocumentSpec} import org.apache.texera.amber.core.storage.{DocumentFactory, IcebergCatalogInstance, VFSURIFactory} import org.apache.texera.amber.core.tuple.{Attribute, AttributeType, Schema, Tuple} @@ -141,6 +142,33 @@ class IcebergDocumentSpec extends VirtualDocumentSpec[Tuple] with BeforeAndAfter } } + it should "round trip materialized state documents" in { + val stateUri = State.stateUriFromResultUri(uri) + DocumentFactory.createDocument(stateUri, State.schema) + val stateDocument = + DocumentFactory.openDocument(stateUri)._1.asInstanceOf[VirtualDocument[Tuple]] + val state: State = Map( + "loop_counter" -> 3, + "name" -> "outer-loop", + "payload" -> Array[Byte](0, 1, 2, 3), + "nested" -> Map("enabled" -> true, "values" -> List(1, 2, 3)) + ) + + val writer = 
stateDocument.writer(UUID.randomUUID().toString) + writer.open() + writer.putOne(State.serialize(state)) + writer.close() + + val storedRows = stateDocument.get().toList + assert(storedRows.length == 1) + val deserialized = State.deserialize(storedRows.head) + assert(deserialized("loop_counter") == 3L) + assert(deserialized("name") == "outer-loop") + assert(deserialized("payload").asInstanceOf[Array[Byte]].sameElements(Array[Byte](0, 1, 2, 3))) + assert(deserialized("nested").asInstanceOf[Map[String, Any]]("enabled") == true) + assert(deserialized("nested").asInstanceOf[Map[String, Any]]("values") == List(1L, 2L, 3L)) + } + /** Returns a dynamic proxy for `realTable` that increments `counter` on every `refresh()` call. */ private def tableWithRefreshSpy(realTable: Table, counter: AtomicInteger): Table = Proxy From 3c4dbb8ff8c0e019e0849722afaaea7e35e96025 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Thu, 23 Apr 2026 01:16:36 -0700 Subject: [PATCH 035/152] test: cover multiple state rows in materialization --- .../storage/iceberg/test_iceberg_document.py | 44 +++++++++++++++++++ .../result/iceberg/IcebergDocumentSpec.scala | 38 ++++++++++++++++ 2 files changed, 82 insertions(+) diff --git a/amber/src/main/python/core/storage/iceberg/test_iceberg_document.py b/amber/src/main/python/core/storage/iceberg/test_iceberg_document.py index 29e43f249c0..f4d1c6b3449 100644 --- a/amber/src/main/python/core/storage/iceberg/test_iceberg_document.py +++ b/amber/src/main/python/core/storage/iceberg/test_iceberg_document.py @@ -357,3 +357,47 @@ def test_state_materialization_round_trip(self): stored_rows = list(document.get()) assert len(stored_rows) == 1 assert deserialize_state(stored_rows[0]) == state + + def test_multiple_states_materialize_as_rows_in_one_table(self): + operator_uuid = str(uuid.uuid4()).replace("-", "") + result_uri = VFSURIFactory.create_result_uri( + WorkflowIdentity(id=0), + ExecutionIdentity(id=0), + GlobalPortIdentity( + op_id=PhysicalOpIdentity( + 
logical_op_id=OperatorIdentity( + id=f"test_multiple_states_{operator_uuid}" + ), + layer_name="main", + ), + port_id=PortIdentity(id=0), + input=False, + ), + ) + state_uri = state_uri_from_result_uri(result_uri) + DocumentFactory.create_document(state_uri, STATE_SCHEMA) + document, _ = DocumentFactory.open_document(state_uri) + + states = [ + {"loop_counter": 0, "i": 1, "payload": b"first"}, + { + "loop_counter": 1, + "i": 2, + "payload": b"second", + "nested": {"values": [3, 4]}, + }, + ] + + writer = document.writer(str(uuid.uuid4())) + writer.open() + for state in states: + writer.put_one(serialize_state(state)) + writer.close() + + stored_rows = list(document.get()) + assert len(stored_rows) == len(states) + actual_states = sorted( + [deserialize_state(row) for row in stored_rows], + key=lambda state: state["loop_counter"], + ) + assert actual_states == states diff --git a/common/workflow-core/src/test/scala/org/apache/texera/amber/storage/result/iceberg/IcebergDocumentSpec.scala b/common/workflow-core/src/test/scala/org/apache/texera/amber/storage/result/iceberg/IcebergDocumentSpec.scala index 761efe63415..062d9d21cc2 100644 --- a/common/workflow-core/src/test/scala/org/apache/texera/amber/storage/result/iceberg/IcebergDocumentSpec.scala +++ b/common/workflow-core/src/test/scala/org/apache/texera/amber/storage/result/iceberg/IcebergDocumentSpec.scala @@ -169,6 +169,44 @@ class IcebergDocumentSpec extends VirtualDocumentSpec[Tuple] with BeforeAndAfter assert(deserialized("nested").asInstanceOf[Map[String, Any]]("values") == List(1L, 2L, 3L)) } + it should "materialize multiple states as rows in one state table" in { + val stateUri = State.stateUriFromResultUri(uri) + DocumentFactory.createDocument(stateUri, State.schema) + val stateDocument = + DocumentFactory.openDocument(stateUri)._1.asInstanceOf[VirtualDocument[Tuple]] + val states: List[State] = List( + Map("loop_counter" -> 0, "i" -> 1, "payload" -> Array[Byte](1, 2, 3)), + Map( + "loop_counter" -> 1, + 
"i" -> 2, + "payload" -> Array[Byte](4, 5, 6), + "nested" -> Map("values" -> List(3, 4)) + ) + ) + + val writer = stateDocument.writer(UUID.randomUUID().toString) + writer.open() + states.foreach(state => writer.putOne(State.serialize(state))) + writer.close() + + val deserializedStates = + stateDocument.get().toList.map(State.deserialize).sortBy(_("loop_counter").asInstanceOf[Long]) + assert(deserializedStates.length == states.length) + deserializedStates.zip(states).foreach { + case (actual, expected) => + assert(actual("loop_counter") == expected("loop_counter").asInstanceOf[Int].toLong) + assert(actual("i") == expected("i").asInstanceOf[Int].toLong) + assert( + actual("payload") + .asInstanceOf[Array[Byte]] + .sameElements(expected("payload").asInstanceOf[Array[Byte]]) + ) + } + assert( + deserializedStates(1)("nested").asInstanceOf[Map[String, Any]]("values") == List(3L, 4L) + ) + } + /** Returns a dynamic proxy for `realTable` that increments `counter` on every `refresh()` call. */ private def tableWithRefreshSpy(realTable: Table, counter: AtomicInteger): Table = Proxy From cb1d9e13b156dad783dd4bf50ead9b384217c0ac Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Thu, 23 Apr 2026 19:02:15 -0700 Subject: [PATCH 036/152] fix fmt --- amber/src/main/python/core/models/state.py | 6 +----- amber/src/main/python/core/runnables/network_sender.py | 4 +--- .../scala/org/apache/texera/amber/core/state/State.scala | 7 ++++++- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/amber/src/main/python/core/models/state.py b/amber/src/main/python/core/models/state.py index a496d5c41c2..897153d37a3 100644 --- a/amber/src/main/python/core/models/state.py +++ b/amber/src/main/python/core/models/state.py @@ -34,11 +34,7 @@ def serialize_state(state: State) -> Tuple: return Tuple( - { - STATE_CONTENT: json.dumps( - _to_json_value(state), separators=(",", ":") - ) - }, + {STATE_CONTENT: json.dumps(_to_json_value(state), separators=(",", ":"))}, schema=STATE_SCHEMA, ) diff 
--git a/amber/src/main/python/core/runnables/network_sender.py b/amber/src/main/python/core/runnables/network_sender.py index f1bd8659ee9..52d799d6f1f 100644 --- a/amber/src/main/python/core/runnables/network_sender.py +++ b/amber/src/main/python/core/runnables/network_sender.py @@ -107,9 +107,7 @@ def _send_data(self, to: ChannelIdentity, data_payload: DataPayload) -> None: serialized_state = serialize_state(data_payload.frame) table = pa.Table.from_pydict( { - STATE_CONTENT: [ - serialized_state[STATE_CONTENT] - ], + STATE_CONTENT: [serialized_state[STATE_CONTENT]], }, schema=STATE_SCHEMA.as_arrow_schema(), ) diff --git a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/State.scala b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/State.scala index 4957f31a407..779cc97a28c 100644 --- a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/State.scala +++ b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/State.scala @@ -43,7 +43,12 @@ object State { def deserialize(tuple: Tuple): State = { val payload = tuple.getField[String](StateContent) - objectMapper.readTree(payload).fields().asScala.map(entry => entry.getKey -> fromJsonValue(entry.getValue)).toMap + objectMapper + .readTree(payload) + .fields() + .asScala + .map(entry => entry.getKey -> fromJsonValue(entry.getValue)) + .toMap } private def toJsonValue(value: Any): Any = From a5b50110cb89f9c1a0003b01de9d3229dc35d018 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Fri, 24 Apr 2026 20:33:42 -0700 Subject: [PATCH 037/152] sync control channel cleanup on loop branch --- AGENTS.md | 76 +++++++++++++++++++ .../messaginglayer/NetworkOutputGateway.scala | 4 +- .../RegionExecutionCoordinator.scala | 2 + 3 files changed, 81 insertions(+), 1 deletion(-) create mode 100644 AGENTS.md diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 00000000000..4038ee1733d --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,76 @@ +# AGENTS.md (Texera) 
+ +Quick orientation for agents working on Apache Texera (Incubating). Pair this file with the root `README.md` and the developer wiki linked from it. + +## Big picture (modules + service boundaries) +- Repo is an **sbt multi-project Scala backend** (Scala 2.13.12, JDK-based) plus an **Angular frontend** under `frontend/`. +- Backend services (Dropwizard/Jersey) live at the top level and share code under `common/`. The sbt module graph is defined in `build.sbt`; note that sbt project names do not always match folder names (e.g., `amber/` is the sbt project `WorkflowExecutionService`). +- Shared libraries (`common/`): + - `common/dao` (DAO), `common/config` (Config), `common/auth` (Auth, JWT setup), `common/workflow-core` (WorkflowCore), `common/workflow-operator` (WorkflowOperator, operator definitions and descriptors), `common/pybuilder` (PyBuilder — a `pyb"..."` macro DSL for composing Python code for Python operators). +- Services (top-level folders): + - `amber/` — main web application + workflow execution engine ("Amber" actor-based dataflow runtime). Serves the Angular GUI, REST API, and the collaboration WebSocket. + - `workflow-compiling-service/` — compiles workflow JSON into executable plans. + - `file-service/` — datasets/files (works with LakeFS / Iceberg catalogs, see `sql/`). + - `config-service/` — runtime configuration. + - `access-control-service/` — ACL + the AI assistant chat/completion endpoints. + - `computing-unit-managing-service/` — lifecycle of compute units (master/worker pods, scaling). + - `pyright-language-service/` — Pyright-backed language server for Python UDF editing. +- Python runtime companion: `amber/src/main/python/` (`pytexera`, `pyamber`, `core`, `proto`, `texera_run_python_worker.py`) — used by Python workers spawned by the Amber engine. +- A secondary, source-only copy of the operator library lives at `core/workflow-operator/src/...` (legacy / build artifact source; prefer `common/workflow-operator` for new code). 
+- Build note: `build.sbt` injects ASF licensing files (`LICENSE`, `NOTICE`, `DISCLAIMER-WIP`) into `META-INF/` of every JAR via `asfLicensingSettings`. + +## Service port map (default config) +| Service | App port | Admin port | Source config | +| --- | --- | --- | --- | +| amber (`TexeraWebApplication`) | 8080 | 8081 | `amber/src/main/resources/web-config.yml` | +| workflow-compiling-service | 9090 | — | `workflow-compiling-service/src/main/resources/workflow-compiling-service-config.yaml` | +| file-service | 9092 | — | `file-service/src/main/resources/file-service-web-config.yaml` | +| config-service | 9094 | — | `config-service/src/main/resources/config-service-web-config.yaml` | +| access-control-service (AI assistant, models) | 9096 | — | `access-control-service/src/main/resources/access-control-service-web-config.yaml` | +| computing-unit-managing-service | 8888 | 8082 | `computing-unit-managing-service/src/main/resources/computing-unit-managing-service-config.yaml` | +| WebSocket (collaboration, `/wsapi`) | 8085 | — | served by `amber` | +| y-websocket (shared editing `/rtc`) | 1234 | — | `bin/shared-editing-server.sh`, `bin/y-websocket-server/` | + +Frontend dev proxy routing (`frontend/proxy.config.json`) mirrors this split — e.g., `/api/compile` → 9090, `/api/dataset` → 9092, `/api/config/**` → 9094, `/api/models` and `/api/chat/completion` → 9096, `/api/computing-unit` → 8888, everything else `/api` → 8080. + +## Runtime flow & cross-component communication +- **REST base path:** every service mounts Jersey at `/api/*` (`environment.jersey.setUrlPattern("/api/*")` in each `Application.run`). +- **Web GUI serving:** `amber` serves Angular static output via `FileAssetsBundle("../../frontend/dist", "/", "index.html")` and redirects 404s to `/` so Angular client-side routing works (`TexeraWebApplication.scala`). 
+- **WebSockets:** collaboration is wired through Dropwizard `WebsocketBundle(classOf[CollaborationResource])`; the Jetty WS idle timeout is explicitly set to 1 hour via `WebSocketUpgradeFilter` in `TexeraWebApplication.run(...)`. +- **Database init pattern:** services call `SqlServer.initConnection(StorageConfig.jdbcUrl, ...)` during startup (see `TexeraWebApplication.scala`, `WorkflowCompilingService.scala`, etc.). DDL lives in `sql/texera_ddl.sql`; Iceberg / LakeFS / Lakekeeper bootstrap SQL is also under `sql/`. +- **Auth:** JWT auth is installed via `setupJwtAuth(environment)` in `amber`, plus `AuthValueFactoryProvider.Binder[SessionUser]` and `RolesAllowedDynamicFeature`. Resources under `.../resource/auth/` (`AuthResource`, `GoogleAuthResource`) own login; `AuthResource.createAdminUser()` runs at startup. +- **Request logging filter:** every service adds a Jetty request-log filter that logs through SLF4J logger `org.eclipse.jetty.server.RequestLog` (level controlled by env var `TEXERA_SERVICE_LOG_LEVEL`). Note the servlet-API split: `amber` currently uses `javax.servlet.*` while `workflow-compiling-service` (and other newer services) use `jakarta.servlet.*`. There is a TODO to consolidate onto `common/auth`'s `RequestLoggingFilter.register()` once `amber` upgrades to Dropwizard 4.x. +- **Config loading:** every service uses `SubstitutingSourceProvider` + `EnvironmentVariableSubstitutor(false)` so YAML configs support `${ENV_VAR}` expansion. + +## Where to make changes (project-specific conventions) +- **New backend endpoint:** create a Jersey `*Resource` under `/src/main/scala/.../resource/` and **register it in that service's `Application.run(...)`** via `environment.jersey.register(classOf[YourResource])`. `amber`'s `TexeraWebApplication.run(...)` already registers a long list — `AuthResource`, `WorkflowResource`, `DashboardResource`, `ProjectResource`, `HubResource`, `GmailResource`, `AIAssistantResource`, etc. 
Follow that pattern; don't rely on classpath scanning. +- **Shared backend logic** belongs in `common/*` (honor the dependency graph in `build.sbt`): `common/dao` for DB, `common/config` for config, `common/auth` for auth, `common/workflow-core` + `common/workflow-operator` for dataflow model/operators, `common/pybuilder` for Python code generation. +- **New operators:** add to `common/workflow-operator/...`; Python-backed operators typically use `common/pybuilder` and interact with the Python worker under `amber/src/main/python/`. +- **Frontend code** is isolated under `frontend/src/` (Angular, yarn-managed). `amber` only serves the built output from `frontend/dist`. The app modules live in `frontend/src/app/{common,dashboard,hub,workspace}`. +- When adding Jackson-touched types, respect the per-service Jackson `dependencyOverrides` already in `build.sbt` — different services pin different versions for Dropwizard 3 vs 4 compatibility. + +## Critical developer workflows +- **Build everything:** `bin/build.sh` runs `bin/build-services.sh` (which runs `sbt clean dist` and unzips the `target/universal/*.zip` artifacts into per-service `target/` dirs) and then `bin/frontend.sh` (`yarn install && yarn run build` in `frontend/`). +- **Run locally after build:** + - `bin/server.sh` — starts `amber` from `amber/target/texera-*/bin/texera-web-application`. + - `bin/workflow-compiling-service.sh`, `bin/file-service.sh`, `bin/config-service.sh`, `bin/computing-unit-managing-service.sh`, `bin/workflow-computing-unit.sh` — start the other services from their unzipped `target/` dirs. + - `bin/frontend-dev.sh` — frontend dev server with the proxy config above. + - `bin/shared-editing-server.sh` — y-websocket server for `/rtc` collaboration. + - `bin/python-language-service.sh` / `bin/pylsp/` — Python language service. +- **Docker images:** Dockerfiles in `bin/*.dockerfile` **must be built from the repo root** as context (see `bin/README.md`). 
Example: `docker build -f bin/texera-web-application.dockerfile -t your-repo/texera-web-application:test .`. Helpers: `bin/build-images.sh`, `bin/merge-image-tags.sh`. +- **Deployment references:** single-node Docker Compose at `bin/single-node/docker-compose.yml` (+ `nginx.conf`, `examples/`); Kubernetes Helm chart at `bin/k8s/` (`Chart.yaml`, `values.yaml`, `values-development.yaml`, `templates/`). +- **Formatting/lint:** `bin/fix-format.sh`; Scalafmt config at `.scalafmt.conf`, Scalafix at `.scalafix.conf`. +- **Proto codegen:** `bin/python-proto-gen.sh`, `bin/frontend-proto-gen.sh`. +- **Service entrypoints** typically call `new ().run("server", )`. The YAML path either resolves via `TEXERA_HOME` (e.g., `WorkflowCompilingService`) or via `Utils.amberHomePath` (e.g., `TexeraWebApplication`). + +## Map of the code (high-signal entrypoints) +- sbt module graph and ASF licensing task: `build.sbt` +- `amber` web app + REST registration + GUI serving + WebSocket + JWT: `amber/src/main/scala/org/apache/texera/web/TexeraWebApplication.scala` +- Amber dataflow engine: `amber/src/main/scala/org/apache/texera/amber/engine/architecture/{controller,worker,pythonworker,scheduling,messaginglayer,sendsemantics,deploysemantics,logreplay,common}/` +- Compilation service: `workflow-compiling-service/src/main/scala/org/apache/texera/service/WorkflowCompilingService.scala` +- Other service entrypoints: `*/src/main/scala/org/apache/texera/service/*Service.scala` (`FileService`, `ConfigService`, `AccessControlService`, `ComputingUnitManagingService`) +- Dashboard/user/hub/admin/AI resources: `amber/src/main/scala/org/apache/texera/web/resource/` (top-level + `auth/`, `aiassistant/`, `dashboard/{admin,hub,user}/`) +- Python worker + pytexera SDK: `amber/src/main/python/` +- Deployment artifacts: `bin/*.dockerfile`, `bin/*.sh`, `bin/single-node/`, `bin/k8s/` +- SQL DDL and catalog bootstrap: `sql/texera_ddl.sql`, `sql/texera_lakefs.sql`, `sql/texera_lakekeeper.sql`, 
`sql/iceberg_postgres_catalog.sql`, `sql/updates/` +- Root docs: `README.md` (links to developer wiki), `CONTRIBUTING.md`, `SECURITY.md`, `DISCLAIMER-WIP`. diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/messaginglayer/NetworkOutputGateway.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/messaginglayer/NetworkOutputGateway.scala index e35e819d41f..ea7034e1d78 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/messaginglayer/NetworkOutputGateway.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/messaginglayer/NetworkOutputGateway.scala @@ -95,7 +95,9 @@ class NetworkOutputGateway( } def removeControlChannel(to: ActorVirtualIdentity): Unit = { - idToSequenceNums.remove(ChannelIdentity(actorId, to, isControl = true)) + synchronized { + idToSequenceNums.remove(ChannelIdentity(actorId, to, isControl = true)) + } } } diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala index 7eb3b871208..3a7616cd369 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala @@ -182,6 +182,8 @@ class RegionExecutionCoordinator( val actorRef = actorRefService.getActorRef(workerId) // Remove the actorRef so that no other actors can find the worker and send messages. actorRefService.removeActorRef(workerId) + // Restarted regions reuse actorId. Remove stale control channels so the + // controller does not reuse old control-message sequence numbers for new workers. 
asyncRPCClient.inputGateway.removeControlChannel(workerId) asyncRPCClient.outputGateway.removeControlChannel(workerId) gracefulStop(actorRef, ScalaDuration(5, TimeUnit.SECONDS)).asTwitter() From ca433166613a43d37d03a8d1f461e2349551ff1e Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Fri, 24 Apr 2026 20:33:51 -0700 Subject: [PATCH 038/152] remove accidental AGENTS file from loop branch --- AGENTS.md | 76 ------------------------------------------------------- 1 file changed, 76 deletions(-) delete mode 100644 AGENTS.md diff --git a/AGENTS.md b/AGENTS.md deleted file mode 100644 index 4038ee1733d..00000000000 --- a/AGENTS.md +++ /dev/null @@ -1,76 +0,0 @@ -# AGENTS.md (Texera) - -Quick orientation for agents working on Apache Texera (Incubating). Pair this file with the root `README.md` and the developer wiki linked from it. - -## Big picture (modules + service boundaries) -- Repo is an **sbt multi-project Scala backend** (Scala 2.13.12, JDK-based) plus an **Angular frontend** under `frontend/`. -- Backend services (Dropwizard/Jersey) live at the top level and share code under `common/`. The sbt module graph is defined in `build.sbt`; note that sbt project names do not always match folder names (e.g., `amber/` is the sbt project `WorkflowExecutionService`). -- Shared libraries (`common/`): - - `common/dao` (DAO), `common/config` (Config), `common/auth` (Auth, JWT setup), `common/workflow-core` (WorkflowCore), `common/workflow-operator` (WorkflowOperator, operator definitions and descriptors), `common/pybuilder` (PyBuilder — a `pyb"..."` macro DSL for composing Python code for Python operators). -- Services (top-level folders): - - `amber/` — main web application + workflow execution engine ("Amber" actor-based dataflow runtime). Serves the Angular GUI, REST API, and the collaboration WebSocket. - - `workflow-compiling-service/` — compiles workflow JSON into executable plans. - - `file-service/` — datasets/files (works with LakeFS / Iceberg catalogs, see `sql/`). 
- - `config-service/` — runtime configuration. - - `access-control-service/` — ACL + the AI assistant chat/completion endpoints. - - `computing-unit-managing-service/` — lifecycle of compute units (master/worker pods, scaling). - - `pyright-language-service/` — Pyright-backed language server for Python UDF editing. -- Python runtime companion: `amber/src/main/python/` (`pytexera`, `pyamber`, `core`, `proto`, `texera_run_python_worker.py`) — used by Python workers spawned by the Amber engine. -- A secondary, source-only copy of the operator library lives at `core/workflow-operator/src/...` (legacy / build artifact source; prefer `common/workflow-operator` for new code). -- Build note: `build.sbt` injects ASF licensing files (`LICENSE`, `NOTICE`, `DISCLAIMER-WIP`) into `META-INF/` of every JAR via `asfLicensingSettings`. - -## Service port map (default config) -| Service | App port | Admin port | Source config | -| --- | --- | --- | --- | -| amber (`TexeraWebApplication`) | 8080 | 8081 | `amber/src/main/resources/web-config.yml` | -| workflow-compiling-service | 9090 | — | `workflow-compiling-service/src/main/resources/workflow-compiling-service-config.yaml` | -| file-service | 9092 | — | `file-service/src/main/resources/file-service-web-config.yaml` | -| config-service | 9094 | — | `config-service/src/main/resources/config-service-web-config.yaml` | -| access-control-service (AI assistant, models) | 9096 | — | `access-control-service/src/main/resources/access-control-service-web-config.yaml` | -| computing-unit-managing-service | 8888 | 8082 | `computing-unit-managing-service/src/main/resources/computing-unit-managing-service-config.yaml` | -| WebSocket (collaboration, `/wsapi`) | 8085 | — | served by `amber` | -| y-websocket (shared editing `/rtc`) | 1234 | — | `bin/shared-editing-server.sh`, `bin/y-websocket-server/` | - -Frontend dev proxy routing (`frontend/proxy.config.json`) mirrors this split — e.g., `/api/compile` → 9090, `/api/dataset` → 9092, 
`/api/config/**` → 9094, `/api/models` and `/api/chat/completion` → 9096, `/api/computing-unit` → 8888, everything else `/api` → 8080. - -## Runtime flow & cross-component communication -- **REST base path:** every service mounts Jersey at `/api/*` (`environment.jersey.setUrlPattern("/api/*")` in each `Application.run`). -- **Web GUI serving:** `amber` serves Angular static output via `FileAssetsBundle("../../frontend/dist", "/", "index.html")` and redirects 404s to `/` so Angular client-side routing works (`TexeraWebApplication.scala`). -- **WebSockets:** collaboration is wired through Dropwizard `WebsocketBundle(classOf[CollaborationResource])`; the Jetty WS idle timeout is explicitly set to 1 hour via `WebSocketUpgradeFilter` in `TexeraWebApplication.run(...)`. -- **Database init pattern:** services call `SqlServer.initConnection(StorageConfig.jdbcUrl, ...)` during startup (see `TexeraWebApplication.scala`, `WorkflowCompilingService.scala`, etc.). DDL lives in `sql/texera_ddl.sql`; Iceberg / LakeFS / Lakekeeper bootstrap SQL is also under `sql/`. -- **Auth:** JWT auth is installed via `setupJwtAuth(environment)` in `amber`, plus `AuthValueFactoryProvider.Binder[SessionUser]` and `RolesAllowedDynamicFeature`. Resources under `.../resource/auth/` (`AuthResource`, `GoogleAuthResource`) own login; `AuthResource.createAdminUser()` runs at startup. -- **Request logging filter:** every service adds a Jetty request-log filter that logs through SLF4J logger `org.eclipse.jetty.server.RequestLog` (level controlled by env var `TEXERA_SERVICE_LOG_LEVEL`). Note the servlet-API split: `amber` currently uses `javax.servlet.*` while `workflow-compiling-service` (and other newer services) use `jakarta.servlet.*`. There is a TODO to consolidate onto `common/auth`'s `RequestLoggingFilter.register()` once `amber` upgrades to Dropwizard 4.x. 
-- **Config loading:** every service uses `SubstitutingSourceProvider` + `EnvironmentVariableSubstitutor(false)` so YAML configs support `${ENV_VAR}` expansion. - -## Where to make changes (project-specific conventions) -- **New backend endpoint:** create a Jersey `*Resource` under `/src/main/scala/.../resource/` and **register it in that service's `Application.run(...)`** via `environment.jersey.register(classOf[YourResource])`. `amber`'s `TexeraWebApplication.run(...)` already registers a long list — `AuthResource`, `WorkflowResource`, `DashboardResource`, `ProjectResource`, `HubResource`, `GmailResource`, `AIAssistantResource`, etc. Follow that pattern; don't rely on classpath scanning. -- **Shared backend logic** belongs in `common/*` (honor the dependency graph in `build.sbt`): `common/dao` for DB, `common/config` for config, `common/auth` for auth, `common/workflow-core` + `common/workflow-operator` for dataflow model/operators, `common/pybuilder` for Python code generation. -- **New operators:** add to `common/workflow-operator/...`; Python-backed operators typically use `common/pybuilder` and interact with the Python worker under `amber/src/main/python/`. -- **Frontend code** is isolated under `frontend/src/` (Angular, yarn-managed). `amber` only serves the built output from `frontend/dist`. The app modules live in `frontend/src/app/{common,dashboard,hub,workspace}`. -- When adding Jackson-touched types, respect the per-service Jackson `dependencyOverrides` already in `build.sbt` — different services pin different versions for Dropwizard 3 vs 4 compatibility. - -## Critical developer workflows -- **Build everything:** `bin/build.sh` runs `bin/build-services.sh` (which runs `sbt clean dist` and unzips the `target/universal/*.zip` artifacts into per-service `target/` dirs) and then `bin/frontend.sh` (`yarn install && yarn run build` in `frontend/`). 
-- **Run locally after build:** - - `bin/server.sh` — starts `amber` from `amber/target/texera-*/bin/texera-web-application`. - - `bin/workflow-compiling-service.sh`, `bin/file-service.sh`, `bin/config-service.sh`, `bin/computing-unit-managing-service.sh`, `bin/workflow-computing-unit.sh` — start the other services from their unzipped `target/` dirs. - - `bin/frontend-dev.sh` — frontend dev server with the proxy config above. - - `bin/shared-editing-server.sh` — y-websocket server for `/rtc` collaboration. - - `bin/python-language-service.sh` / `bin/pylsp/` — Python language service. -- **Docker images:** Dockerfiles in `bin/*.dockerfile` **must be built from the repo root** as context (see `bin/README.md`). Example: `docker build -f bin/texera-web-application.dockerfile -t your-repo/texera-web-application:test .`. Helpers: `bin/build-images.sh`, `bin/merge-image-tags.sh`. -- **Deployment references:** single-node Docker Compose at `bin/single-node/docker-compose.yml` (+ `nginx.conf`, `examples/`); Kubernetes Helm chart at `bin/k8s/` (`Chart.yaml`, `values.yaml`, `values-development.yaml`, `templates/`). -- **Formatting/lint:** `bin/fix-format.sh`; Scalafmt config at `.scalafmt.conf`, Scalafix at `.scalafix.conf`. -- **Proto codegen:** `bin/python-proto-gen.sh`, `bin/frontend-proto-gen.sh`. -- **Service entrypoints** typically call `new ().run("server", )`. The YAML path either resolves via `TEXERA_HOME` (e.g., `WorkflowCompilingService`) or via `Utils.amberHomePath` (e.g., `TexeraWebApplication`). 
- -## Map of the code (high-signal entrypoints) -- sbt module graph and ASF licensing task: `build.sbt` -- `amber` web app + REST registration + GUI serving + WebSocket + JWT: `amber/src/main/scala/org/apache/texera/web/TexeraWebApplication.scala` -- Amber dataflow engine: `amber/src/main/scala/org/apache/texera/amber/engine/architecture/{controller,worker,pythonworker,scheduling,messaginglayer,sendsemantics,deploysemantics,logreplay,common}/` -- Compilation service: `workflow-compiling-service/src/main/scala/org/apache/texera/service/WorkflowCompilingService.scala` -- Other service entrypoints: `*/src/main/scala/org/apache/texera/service/*Service.scala` (`FileService`, `ConfigService`, `AccessControlService`, `ComputingUnitManagingService`) -- Dashboard/user/hub/admin/AI resources: `amber/src/main/scala/org/apache/texera/web/resource/` (top-level + `auth/`, `aiassistant/`, `dashboard/{admin,hub,user}/`) -- Python worker + pytexera SDK: `amber/src/main/python/` -- Deployment artifacts: `bin/*.dockerfile`, `bin/*.sh`, `bin/single-node/`, `bin/k8s/` -- SQL DDL and catalog bootstrap: `sql/texera_ddl.sql`, `sql/texera_lakefs.sql`, `sql/texera_lakekeeper.sql`, `sql/iceberg_postgres_catalog.sql`, `sql/updates/` -- Root docs: `README.md` (links to developer wiki), `CONTRIBUTING.md`, `SECURITY.md`, `DISCLAIMER-WIP`. 
From 858539dee8a8c3a88d21af5ef518d53002e21e4a Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Fri, 24 Apr 2026 20:44:54 -0700 Subject: [PATCH 039/152] fix fmt --- .../amber/core/storage/result/iceberg/IcebergTableWriter.scala | 1 - 1 file changed, 1 deletion(-) diff --git a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/result/iceberg/IcebergTableWriter.scala b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/result/iceberg/IcebergTableWriter.scala index 2d4ffdd063d..06d04e407f5 100644 --- a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/result/iceberg/IcebergTableWriter.scala +++ b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/result/iceberg/IcebergTableWriter.scala @@ -29,7 +29,6 @@ import org.apache.iceberg.io.{DataWriter, OutputFile} import org.apache.iceberg.parquet.Parquet import org.apache.iceberg.{Schema, Table} -import java.nio.file.{Files, Path, Paths} import scala.collection.mutable.ArrayBuffer /** From f182695ec3581fd20ad9f5bf8bf0b3428ce55f3b Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Fri, 24 Apr 2026 20:46:39 -0700 Subject: [PATCH 040/152] fix fmt --- common/workflow-core/build.sbt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/workflow-core/build.sbt b/common/workflow-core/build.sbt index 16a841df10e..e71b2cd1d22 100644 --- a/common/workflow-core/build.sbt +++ b/common/workflow-core/build.sbt @@ -222,4 +222,4 @@ libraryDependencies ++= Seq( "software.amazon.awssdk" % "sts" % "2.29.51" excludeAll( ExclusionRule(organization = "io.netty") ), -) +) \ No newline at end of file From 2904bf6d3d898bfd6fc07ea333ee7aa054d2410c Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Sat, 25 Apr 2026 03:43:12 -0700 Subject: [PATCH 041/152] fix fmt --- .../texera/web/service/WorkflowExecutionService.scala | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git 
a/amber/src/main/scala/org/apache/texera/web/service/WorkflowExecutionService.scala b/amber/src/main/scala/org/apache/texera/web/service/WorkflowExecutionService.scala index 741687e02c9..c6bbbafc88d 100644 --- a/amber/src/main/scala/org/apache/texera/web/service/WorkflowExecutionService.scala +++ b/amber/src/main/scala/org/apache/texera/web/service/WorkflowExecutionService.scala @@ -21,13 +21,14 @@ package org.apache.texera.web.service import com.typesafe.scalalogging.LazyLogging import org.apache.texera.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity} -import org.apache.texera.amber.core.workflow.WorkflowContext +import org.apache.texera.amber.core.workflow.{ExecutionMode, WorkflowContext} import org.apache.texera.amber.engine.architecture.controller.{ControllerConfig, Workflow} import org.apache.texera.amber.engine.architecture.rpc.controlcommands.EmptyRequest import org.apache.texera.amber.engine.architecture.rpc.controlreturns.WorkflowAggregatedState._ import org.apache.texera.amber.engine.common.Utils import org.apache.texera.amber.engine.common.client.AmberClient import org.apache.texera.amber.engine.common.executionruntimestate.ExecutionMetadataStore +import org.apache.texera.amber.operator.loop.LoopStartOpDesc import org.apache.texera.web.model.websocket.event.{ TexeraWebSocketEvent, WorkflowErrorEvent, @@ -66,7 +67,12 @@ class WorkflowExecutionService( ) extends SubscriptionManager with LazyLogging { - workflowContext.workflowSettings = request.workflowSettings + workflowContext.workflowSettings = + if (request.logicalPlan.operators.exists(_.isInstanceOf[LoopStartOpDesc])) { + request.workflowSettings.copy(executionMode = ExecutionMode.MATERIALIZED) + } else { + request.workflowSettings + } val wsInput = new WebsocketInput(errorHandler) addSubscription( From 20e2cedbf7f59035a61e3d2e31c335a589febf00 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Sun, 26 Apr 2026 01:45:12 -0700 Subject: [PATCH 042/152] Refactor jump state into execution 
coordinator --- .../architecture/rpc/controlcommands.proto | 8 +---- .../amber/engine/architecture/rpc/__init__.py | 2 +- .../controller/WorkflowScheduler.scala | 15 ++++++--- .../architecture/scheduling/Schedule.scala | 23 +++----------- .../WorkflowExecutionCoordinator.scala | 29 +++++++++++++++-- .../scheduling/ScheduleSpec.scala | 31 ++++++++++++++++--- 6 files changed, 69 insertions(+), 39 deletions(-) diff --git a/amber/src/main/protobuf/org/apache/texera/amber/engine/architecture/rpc/controlcommands.proto b/amber/src/main/protobuf/org/apache/texera/amber/engine/architecture/rpc/controlcommands.proto index 0cedbf4166c..27ea417cde9 100644 --- a/amber/src/main/protobuf/org/apache/texera/amber/engine/architecture/rpc/controlcommands.proto +++ b/amber/src/main/protobuf/org/apache/texera/amber/engine/architecture/rpc/controlcommands.proto @@ -39,19 +39,13 @@ message ControlRequest { TakeGlobalCheckpointRequest takeGlobalCheckpointRequest = 2; DebugCommandRequest debugCommandRequest = 3; EvaluatePythonExpressionRequest evaluatePythonExpressionRequest = 4; - ModifyLogicRequest modifyLogicRequest = 5; - RetryWorkflowRequest retryWorkflowRequest = 6; - ConsoleMessageTriggeredRequest consoleMessageTriggeredRequest = 8; - PortCompletedRequest portCompletedRequest = 9; - WorkerStateUpdatedRequest workerStateUpdatedRequest = 10; - LinkWorkersRequest linkWorkersRequest = 11; - JumpToOperatorRequest jumpToOperatorRequest = 12; RetryWorkflowRequest retryWorkflowRequest = 5; ConsoleMessageTriggeredRequest consoleMessageTriggeredRequest = 6; PortCompletedRequest portCompletedRequest = 7; WorkerStateUpdatedRequest workerStateUpdatedRequest = 8; LinkWorkersRequest linkWorkersRequest = 9; WorkflowReconfigureRequest workflowReconfigureRequest = 10; + JumpToOperatorRequest jumpToOperatorRequest = 11; // request for worker AddInputChannelRequest addInputChannelRequest = 50; diff --git a/amber/src/main/python/proto/org/apache/texera/amber/engine/architecture/rpc/__init__.py 
b/amber/src/main/python/proto/org/apache/texera/amber/engine/architecture/rpc/__init__.py index 524c79fdc3b..ccc2a02deb4 100644 --- a/amber/src/main/python/proto/org/apache/texera/amber/engine/architecture/rpc/__init__.py +++ b/amber/src/main/python/proto/org/apache/texera/amber/engine/architecture/rpc/__init__.py @@ -102,7 +102,7 @@ class ControlRequest(betterproto.Message): betterproto.message_field(10, group="sealed_value") ) jump_to_operator_request: "JumpToOperatorRequest" = betterproto.message_field( - 12, group="sealed_value" + 11, group="sealed_value" ) add_input_channel_request: "AddInputChannelRequest" = betterproto.message_field( 50, group="sealed_value" diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/WorkflowScheduler.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/WorkflowScheduler.scala index c8a107e0451..cd91c58d13f 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/WorkflowScheduler.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/WorkflowScheduler.scala @@ -20,7 +20,6 @@ package org.apache.texera.amber.engine.architecture.controller import org.apache.texera.amber.core.virtualidentity.ActorVirtualIdentity -import org.apache.texera.amber.core.virtualidentity.OperatorIdentity import org.apache.texera.amber.core.workflow.{PhysicalPlan, WorkflowContext} import org.apache.texera.amber.engine.architecture.scheduling.{ CostBasedScheduleGenerator, @@ -34,6 +33,7 @@ class WorkflowScheduler( ) extends java.io.Serializable { var physicalPlan: PhysicalPlan = _ private var schedule: Schedule = _ + private var nextRegionLevel: Option[Int] = None def getSchedule: Schedule = schedule @@ -51,10 +51,15 @@ class WorkflowScheduler( ).generate() this.schedule = generatedSchedule this.physicalPlan = updatedPhysicalPlan + this.nextRegionLevel = Some(generatedSchedule.startingLevel) } - def getNextRegions: Set[Region] = 
if (!schedule.hasNext) Set() else schedule.next() - - def jumpToOperator(opId: OperatorIdentity): Unit = schedule.jumpToOperator(opId) - + def getNextRegions: Set[Region] = + nextRegionLevel + .filter(schedule.levelSets.contains) + .map { level => + nextRegionLevel = Some(level + 1) + schedule.levelSets(level) + } + .getOrElse(Set.empty) } diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala index 4f38fbcf1c0..65ed3f1fcaa 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala @@ -19,26 +19,11 @@ package org.apache.texera.amber.engine.architecture.scheduling -import org.apache.texera.amber.core.virtualidentity.OperatorIdentity - -case class Schedule(private val levelSets: Map[Int, Set[Region]]) extends Iterator[Set[Region]] { - private var currentLevel = levelSets.keys.minOption.getOrElse(0) +case class Schedule(levelSets: Map[Int, Set[Region]]) extends Iterable[Set[Region]] { + val startingLevel: Int = levelSets.keys.minOption.getOrElse(0) def getRegions: List[Region] = levelSets.values.flatten.toList - override def hasNext: Boolean = levelSets.isDefinedAt(currentLevel) - - override def next(): Set[Region] = { - val regions = levelSets(currentLevel) - currentLevel += 1 - regions - } - - def jumpToOperator(opId: OperatorIdentity): Unit = - levelSets - .collectFirst { - case (level, regions) if regions.exists(_.getOperators.exists(_.id.logicalOpId == opId)) => - level - } - .foreach(currentLevel = _) + override def iterator: Iterator[Set[Region]] = + levelSets.keys.toSeq.sorted.iterator.map(levelSets) } diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala 
b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala index 2b8e3ce1450..88b8c18d54d 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala @@ -41,6 +41,7 @@ class WorkflowExecutionCoordinator( ) extends LazyLogging { private val executedRegions: mutable.ListBuffer[Set[Region]] = mutable.ListBuffer() + private var nextRegionLevel: Option[Int] = None private val regionExecutionCoordinators : mutable.HashMap[RegionIdentity, RegionExecutionCoordinator] = @@ -52,6 +53,23 @@ class WorkflowExecutionCoordinator( this.actorRefService = actorRefService } + private def getNextRegions: Set[Region] = { + val schedule = workflowScheduler.getSchedule + if (schedule == null) { + return Set.empty + } + if (nextRegionLevel.isEmpty) { + nextRegionLevel = Some(schedule.startingLevel) + } + nextRegionLevel + .filter(schedule.levelSets.contains) + .map { level => + nextRegionLevel = Some(level + 1) + schedule.levelSets(level) + } + .getOrElse(Set.empty) + } + /** * Each invocation first syncs the internal statuses of each exisiting `RegionExecutionCoordintor`, after which each * of the `RegionExecutionCoordintor`s will launch the corresponding next phase of whenever needed until it is @@ -82,7 +100,7 @@ class WorkflowExecutionCoordinator( // All existing regions are completed. Start the next region (if any). 
Future .collect({ - val nextRegions = workflowScheduler.getNextRegions + val nextRegions = getNextRegions executedRegions.append(nextRegions) nextRegions .map(region => { @@ -118,7 +136,14 @@ class WorkflowExecutionCoordinator( } def jumpToOperator(opId: OperatorIdentity): Unit = { - workflowScheduler.jumpToOperator(opId) + val schedule = workflowScheduler.getSchedule + if (schedule == null) { + return + } + nextRegionLevel = schedule.levelSets.collectFirst { + case (level, regions) if regions.exists(_.getOperators.exists(_.id.logicalOpId == opId)) => + level + } } } diff --git a/amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/ScheduleSpec.scala b/amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/ScheduleSpec.scala index 6655874b756..3cbbdfc7759 100644 --- a/amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/ScheduleSpec.scala +++ b/amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/ScheduleSpec.scala @@ -21,12 +21,15 @@ package org.apache.texera.amber.engine.architecture.scheduling import org.apache.texera.amber.core.executor.OpExecInitInfo import org.apache.texera.amber.core.virtualidentity.{ + ActorVirtualIdentity, ExecutionIdentity, OperatorIdentity, PhysicalOpIdentity, WorkflowIdentity } import org.apache.texera.amber.core.workflow.PhysicalOp +import org.apache.texera.amber.engine.architecture.controller.WorkflowScheduler +import org.apache.texera.amber.engine.architecture.controller.execution.WorkflowExecution import org.scalatest.flatspec.AnyFlatSpec class ScheduleSpec extends AnyFlatSpec { @@ -41,7 +44,20 @@ class ScheduleSpec extends AnyFlatSpec { Region(RegionIdentity(regionId), Set(physicalOp), Set.empty) } - "Schedule.jumpToOperator" should "make the next scheduled region contain the target operator" in { + private def setSchedule(workflowScheduler: WorkflowScheduler, schedule: Schedule): Unit = { + val scheduleField = 
classOf[WorkflowScheduler].getDeclaredField("schedule") + scheduleField.setAccessible(true) + scheduleField.set(workflowScheduler, schedule) + } + + private def getNextRegions(coordinator: WorkflowExecutionCoordinator): Set[Region] = { + val getNextRegionsMethod = + classOf[WorkflowExecutionCoordinator].getDeclaredMethod("getNextRegions") + getNextRegionsMethod.setAccessible(true) + getNextRegionsMethod.invoke(coordinator).asInstanceOf[Set[Region]] + } + + "WorkflowExecutionCoordinator.jumpToOperator" should "make the next scheduled region contain the target operator" in { val firstRegion = region(1, "first") val secondRegion = region(2, "second") val thirdRegion = region(3, "third") @@ -52,12 +68,17 @@ class ScheduleSpec extends AnyFlatSpec { 2 -> Set(thirdRegion) ) ) + val workflowScheduler = + new WorkflowScheduler(null, ActorVirtualIdentity("controller")) + setSchedule(workflowScheduler, schedule) + val coordinator = + new WorkflowExecutionCoordinator(workflowScheduler, WorkflowExecution(), null, null) - assert(schedule.next() == Set(firstRegion)) - assert(schedule.next() == Set(secondRegion)) + assert(getNextRegions(coordinator) == Set(firstRegion)) + assert(getNextRegions(coordinator) == Set(secondRegion)) - schedule.jumpToOperator(OperatorIdentity("first")) + coordinator.jumpToOperator(OperatorIdentity("first")) - assert(schedule.next() == Set(firstRegion)) + assert(getNextRegions(coordinator) == Set(firstRegion)) } } From 4c8cb5f77899ebd1c8e4b61c3924dafa7163dc80 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Sun, 26 Apr 2026 01:50:31 -0700 Subject: [PATCH 043/152] Simplify coordinator jump scheduling --- .../controller/WorkflowScheduler.scala | 12 ------------ .../WorkflowExecutionCoordinator.scala | 2 +- .../architecture/scheduling/ScheduleSpec.scala | 13 +++---------- .../engine/e2e/BatchSizePropagationSpec.scala | 17 ++++++++--------- 4 files changed, 12 insertions(+), 32 deletions(-) diff --git 
a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/WorkflowScheduler.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/WorkflowScheduler.scala index cd91c58d13f..5135d693753 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/WorkflowScheduler.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/WorkflowScheduler.scala @@ -23,7 +23,6 @@ import org.apache.texera.amber.core.virtualidentity.ActorVirtualIdentity import org.apache.texera.amber.core.workflow.{PhysicalPlan, WorkflowContext} import org.apache.texera.amber.engine.architecture.scheduling.{ CostBasedScheduleGenerator, - Region, Schedule } @@ -33,7 +32,6 @@ class WorkflowScheduler( ) extends java.io.Serializable { var physicalPlan: PhysicalPlan = _ private var schedule: Schedule = _ - private var nextRegionLevel: Option[Int] = None def getSchedule: Schedule = schedule @@ -51,15 +49,5 @@ class WorkflowScheduler( ).generate() this.schedule = generatedSchedule this.physicalPlan = updatedPhysicalPlan - this.nextRegionLevel = Some(generatedSchedule.startingLevel) } - - def getNextRegions: Set[Region] = - nextRegionLevel - .filter(schedule.levelSets.contains) - .map { level => - nextRegionLevel = Some(level + 1) - schedule.levelSets(level) - } - .getOrElse(Set.empty) } diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala index 88b8c18d54d..759ac44273c 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala @@ -53,7 +53,7 @@ class WorkflowExecutionCoordinator( this.actorRefService = actorRefService } - private def 
getNextRegions: Set[Region] = { + private[scheduling] def getNextRegions: Set[Region] = { val schedule = workflowScheduler.getSchedule if (schedule == null) { return Set.empty diff --git a/amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/ScheduleSpec.scala b/amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/ScheduleSpec.scala index 3cbbdfc7759..26979cd9649 100644 --- a/amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/ScheduleSpec.scala +++ b/amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/ScheduleSpec.scala @@ -50,13 +50,6 @@ class ScheduleSpec extends AnyFlatSpec { scheduleField.set(workflowScheduler, schedule) } - private def getNextRegions(coordinator: WorkflowExecutionCoordinator): Set[Region] = { - val getNextRegionsMethod = - classOf[WorkflowExecutionCoordinator].getDeclaredMethod("getNextRegions") - getNextRegionsMethod.setAccessible(true) - getNextRegionsMethod.invoke(coordinator).asInstanceOf[Set[Region]] - } - "WorkflowExecutionCoordinator.jumpToOperator" should "make the next scheduled region contain the target operator" in { val firstRegion = region(1, "first") val secondRegion = region(2, "second") @@ -74,11 +67,11 @@ class ScheduleSpec extends AnyFlatSpec { val coordinator = new WorkflowExecutionCoordinator(workflowScheduler, WorkflowExecution(), null, null) - assert(getNextRegions(coordinator) == Set(firstRegion)) - assert(getNextRegions(coordinator) == Set(secondRegion)) + assert(coordinator.getNextRegions == Set(firstRegion)) + assert(coordinator.getNextRegions == Set(secondRegion)) coordinator.jumpToOperator(OperatorIdentity("first")) - assert(getNextRegions(coordinator) == Set(firstRegion)) + assert(coordinator.getNextRegions == Set(firstRegion)) } } diff --git a/amber/src/test/scala/org/apache/texera/amber/engine/e2e/BatchSizePropagationSpec.scala b/amber/src/test/scala/org/apache/texera/amber/engine/e2e/BatchSizePropagationSpec.scala 
index e9b830bdfdc..0a7dccf5db1 100644 --- a/amber/src/test/scala/org/apache/texera/amber/engine/e2e/BatchSizePropagationSpec.scala +++ b/amber/src/test/scala/org/apache/texera/amber/engine/e2e/BatchSizePropagationSpec.scala @@ -25,6 +25,7 @@ import org.apache.pekko.util.Timeout import org.apache.texera.amber.clustering.SingleNodeListener import org.apache.texera.amber.core.workflow.{PortIdentity, WorkflowContext, WorkflowSettings} import org.apache.texera.amber.engine.architecture.controller._ +import org.apache.texera.amber.engine.architecture.scheduling.Schedule import org.apache.texera.amber.engine.architecture.sendsemantics.partitionings._ import org.apache.texera.amber.engine.common.virtualidentity.util.CONTROLLER import org.apache.texera.amber.engine.e2e.TestUtils.buildWorkflow @@ -54,11 +55,10 @@ class BatchSizePropagationSpec } def verifyBatchSizeInPartitioning( - workflowScheduler: WorkflowScheduler, + schedule: Schedule, expectedBatchSize: Int ): Unit = { - var nextRegions = workflowScheduler.getNextRegions - while (nextRegions.nonEmpty) { + schedule.foreach { nextRegions => nextRegions.foreach { region => region.resourceConfig.foreach { resourceConfig => resourceConfig.linkConfigs.foreach { @@ -112,7 +112,6 @@ class BatchSizePropagationSpec } } } - nextRegions = workflowScheduler.getNextRegions } } @@ -135,7 +134,7 @@ class BatchSizePropagationSpec val workflowScheduler = new WorkflowScheduler(context, CONTROLLER) workflowScheduler.updateSchedule(workflow.physicalPlan) - verifyBatchSizeInPartitioning(workflowScheduler, 1) + verifyBatchSizeInPartitioning(workflowScheduler.getSchedule, 1) } "Engine" should "propagate the correct batch size for headerlessCsv->keyword workflow" in { @@ -165,7 +164,7 @@ class BatchSizePropagationSpec val workflowScheduler = new WorkflowScheduler(context, CONTROLLER) workflowScheduler.updateSchedule(workflow.physicalPlan) - verifyBatchSizeInPartitioning(workflowScheduler, 500) + 
verifyBatchSizeInPartitioning(workflowScheduler.getSchedule, 500) } "Engine" should "propagate the correct batch size for csv->keyword->count workflow" in { @@ -203,7 +202,7 @@ class BatchSizePropagationSpec val workflowScheduler = new WorkflowScheduler(context, CONTROLLER) workflowScheduler.updateSchedule(workflow.physicalPlan) - verifyBatchSizeInPartitioning(workflowScheduler, 100) + verifyBatchSizeInPartitioning(workflowScheduler.getSchedule, 100) } "Engine" should "propagate the correct batch size for csv->keyword->averageAndGroupBy workflow" in { @@ -244,7 +243,7 @@ class BatchSizePropagationSpec val workflowScheduler = new WorkflowScheduler(context, CONTROLLER) workflowScheduler.updateSchedule(workflow.physicalPlan) - verifyBatchSizeInPartitioning(workflowScheduler, 300) + verifyBatchSizeInPartitioning(workflowScheduler.getSchedule, 300) } "Engine" should "propagate the correct batch size for csv->(csv->)->join workflow" in { @@ -285,7 +284,7 @@ class BatchSizePropagationSpec val workflowScheduler = new WorkflowScheduler(context, CONTROLLER) workflowScheduler.updateSchedule(workflow.physicalPlan) - verifyBatchSizeInPartitioning(workflowScheduler, 1) + verifyBatchSizeInPartitioning(workflowScheduler.getSchedule, 1) } } From b7c2e3b90021ad351abb04b53f29ce091f9ed7d4 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Sun, 26 Apr 2026 01:57:02 -0700 Subject: [PATCH 044/152] fix fmt --- .../engine/architecture/controller/WorkflowScheduler.scala | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/WorkflowScheduler.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/WorkflowScheduler.scala index 5135d693753..9e777eb6a84 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/WorkflowScheduler.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/WorkflowScheduler.scala @@ -21,10 
+21,7 @@ package org.apache.texera.amber.engine.architecture.controller import org.apache.texera.amber.core.virtualidentity.ActorVirtualIdentity import org.apache.texera.amber.core.workflow.{PhysicalPlan, WorkflowContext} -import org.apache.texera.amber.engine.architecture.scheduling.{ - CostBasedScheduleGenerator, - Schedule -} +import org.apache.texera.amber.engine.architecture.scheduling.{CostBasedScheduleGenerator, Schedule} class WorkflowScheduler( workflowContext: WorkflowContext, From 0aa5017689df22f2421eff1bdc9b58a19f6a7243 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Sun, 26 Apr 2026 02:55:42 -0700 Subject: [PATCH 045/152] Clarify jump-to-region API --- .../architecture/rpc/controlcommands.proto | 4 +-- .../architecture/rpc/controllerservice.proto | 2 +- .../amber/engine/architecture/rpc/__init__.py | 28 +++++++++---------- ...ControllerAsyncRPCHandlerInitializer.scala | 2 +- .../controller/ControllerProcessor.scala | 2 +- ...cala => JumpToOperatorRegionHandler.scala} | 10 +++---- .../WorkflowExecutionCoordinator.scala | 10 +++---- ...=> WorkflowExecutionCoordinatorSpec.scala} | 19 +++---------- 8 files changed, 33 insertions(+), 44 deletions(-) rename amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/{JumpToOperatorHandler.scala => JumpToOperatorRegionHandler.scala} (85%) rename amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/{ScheduleSpec.scala => WorkflowExecutionCoordinatorSpec.scala} (71%) diff --git a/amber/src/main/protobuf/org/apache/texera/amber/engine/architecture/rpc/controlcommands.proto b/amber/src/main/protobuf/org/apache/texera/amber/engine/architecture/rpc/controlcommands.proto index 27ea417cde9..1f55927e4ae 100644 --- a/amber/src/main/protobuf/org/apache/texera/amber/engine/architecture/rpc/controlcommands.proto +++ b/amber/src/main/protobuf/org/apache/texera/amber/engine/architecture/rpc/controlcommands.proto @@ -45,7 +45,7 @@ message ControlRequest { 
WorkerStateUpdatedRequest workerStateUpdatedRequest = 8; LinkWorkersRequest linkWorkersRequest = 9; WorkflowReconfigureRequest workflowReconfigureRequest = 10; - JumpToOperatorRequest jumpToOperatorRequest = 11; + JumpToOperatorRegionRequest jumpToOperatorRegionRequest = 11; // request for worker AddInputChannelRequest addInputChannelRequest = 50; @@ -275,6 +275,6 @@ message QueryStatisticsRequest{ StatisticsUpdateTarget updateTarget = 2; } -message JumpToOperatorRequest{ +message JumpToOperatorRegionRequest{ core.OperatorIdentity targetOperatorId = 1 [(scalapb.field).no_box = true]; } diff --git a/amber/src/main/protobuf/org/apache/texera/amber/engine/architecture/rpc/controllerservice.proto b/amber/src/main/protobuf/org/apache/texera/amber/engine/architecture/rpc/controllerservice.proto index a351dec494a..0932a7b914a 100644 --- a/amber/src/main/protobuf/org/apache/texera/amber/engine/architecture/rpc/controllerservice.proto +++ b/amber/src/main/protobuf/org/apache/texera/amber/engine/architecture/rpc/controllerservice.proto @@ -42,7 +42,7 @@ service ControllerService { rpc PauseWorkflow(EmptyRequest) returns (EmptyReturn); rpc WorkerStateUpdated(WorkerStateUpdatedRequest) returns (EmptyReturn); rpc WorkerExecutionCompleted(EmptyRequest) returns (EmptyReturn); - rpc JumpToOperator(JumpToOperatorRequest) returns (EmptyReturn); + rpc JumpToOperatorRegion(JumpToOperatorRegionRequest) returns (EmptyReturn); rpc LinkWorkers(LinkWorkersRequest) returns (EmptyReturn); rpc ControllerInitiateQueryStatistics(QueryStatisticsRequest) returns (EmptyReturn); rpc RetryWorkflow(RetryWorkflowRequest) returns (EmptyReturn); diff --git a/amber/src/main/python/proto/org/apache/texera/amber/engine/architecture/rpc/__init__.py b/amber/src/main/python/proto/org/apache/texera/amber/engine/architecture/rpc/__init__.py index ccc2a02deb4..2bad2b0bfbc 100644 --- a/amber/src/main/python/proto/org/apache/texera/amber/engine/architecture/rpc/__init__.py +++ 
b/amber/src/main/python/proto/org/apache/texera/amber/engine/architecture/rpc/__init__.py @@ -101,7 +101,7 @@ class ControlRequest(betterproto.Message): workflow_reconfigure_request: "WorkflowReconfigureRequest" = ( betterproto.message_field(10, group="sealed_value") ) - jump_to_operator_request: "JumpToOperatorRequest" = betterproto.message_field( + jump_to_operator_region_request: "JumpToOperatorRegionRequest" = betterproto.message_field( 11, group="sealed_value" ) add_input_channel_request: "AddInputChannelRequest" = betterproto.message_field( @@ -389,7 +389,7 @@ class QueryStatisticsRequest(betterproto.Message): @dataclass(eq=False, repr=False) -class JumpToOperatorRequest(betterproto.Message): +class JumpToOperatorRegionRequest(betterproto.Message): target_operator_id: "___core__.OperatorIdentity" = betterproto.message_field(1) @@ -1259,17 +1259,17 @@ async def worker_execution_completed( metadata=metadata, ) - async def jump_to_operator( + async def jump_to_operator_region( self, - jump_to_operator_request: "JumpToOperatorRequest", + jump_to_operator_region_request: "JumpToOperatorRegionRequest", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None ) -> "EmptyReturn": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/JumpToOperator", - jump_to_operator_request, + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/JumpToOperatorRegion", + jump_to_operator_region_request, EmptyReturn, timeout=timeout, deadline=deadline, @@ -1948,8 +1948,8 @@ async def worker_execution_completed( ) -> "EmptyReturn": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def jump_to_operator( - self, jump_to_operator_request: "JumpToOperatorRequest" + async def jump_to_operator_region( + self, jump_to_operator_region_request: "JumpToOperatorRegionRequest" ) -> "EmptyReturn": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) @@ 
-2062,11 +2062,11 @@ async def __rpc_worker_execution_completed( response = await self.worker_execution_completed(request) await stream.send_message(response) - async def __rpc_jump_to_operator( - self, stream: "grpclib.server.Stream[JumpToOperatorRequest, EmptyReturn]" + async def __rpc_jump_to_operator_region( + self, stream: "grpclib.server.Stream[JumpToOperatorRegionRequest, EmptyReturn]" ) -> None: request = await stream.recv_message() - response = await self.jump_to_operator(request) + response = await self.jump_to_operator_region(request) await stream.send_message(response) async def __rpc_link_workers( @@ -2171,10 +2171,10 @@ def __mapping__(self) -> Dict[str, grpclib.const.Handler]: EmptyRequest, EmptyReturn, ), - "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/JumpToOperator": grpclib.const.Handler( - self.__rpc_jump_to_operator, + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/JumpToOperatorRegion": grpclib.const.Handler( + self.__rpc_jump_to_operator_region, grpclib.const.Cardinality.UNARY_UNARY, - JumpToOperatorRequest, + JumpToOperatorRegionRequest, EmptyReturn, ), "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/LinkWorkers": grpclib.const.Handler( diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerAsyncRPCHandlerInitializer.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerAsyncRPCHandlerInitializer.scala index 2ffa437f5c7..7e5a904716c 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerAsyncRPCHandlerInitializer.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerAsyncRPCHandlerInitializer.scala @@ -34,7 +34,7 @@ class ControllerAsyncRPCHandlerInitializer( with AmberLogging with LinkWorkersHandler with WorkerExecutionCompletedHandler - with JumpToOperatorHandler + with JumpToOperatorRegionHandler with 
WorkerStateUpdatedHandler with PauseHandler with QueryWorkerStatisticsHandler diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerProcessor.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerProcessor.scala index 3461619cb36..f3d461c9090 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerProcessor.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerProcessor.scala @@ -44,7 +44,7 @@ class ControllerProcessor( val workflowScheduler: WorkflowScheduler = new WorkflowScheduler(workflowContext, actorId) val workflowExecutionCoordinator: WorkflowExecutionCoordinator = new WorkflowExecutionCoordinator( - workflowScheduler, + () => this.workflowScheduler.getSchedule, workflowExecution, controllerConfig, asyncRPCClient diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/JumpToOperatorHandler.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/JumpToOperatorRegionHandler.scala similarity index 85% rename from amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/JumpToOperatorHandler.scala rename to amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/JumpToOperatorRegionHandler.scala index aad72f08e90..dbe71f58586 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/JumpToOperatorHandler.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/JumpToOperatorRegionHandler.scala @@ -23,19 +23,19 @@ import com.twitter.util.Future import org.apache.texera.amber.engine.architecture.controller.ControllerAsyncRPCHandlerInitializer import org.apache.texera.amber.engine.architecture.rpc.controlcommands.{ AsyncRPCContext, - 
JumpToOperatorRequest + JumpToOperatorRegionRequest } import org.apache.texera.amber.engine.architecture.rpc.controlreturns.EmptyReturn /** Requests the scheduler to continue from the region containing the target operator. */ -trait JumpToOperatorHandler { +trait JumpToOperatorRegionHandler { this: ControllerAsyncRPCHandlerInitializer => - override def jumpToOperator( - msg: JumpToOperatorRequest, + override def jumpToOperatorRegion( + msg: JumpToOperatorRegionRequest, ctx: AsyncRPCContext ): Future[EmptyReturn] = { - cp.workflowExecutionCoordinator.jumpToOperator(msg.targetOperatorId) + cp.workflowExecutionCoordinator.jumpToRegionContainingOperator(msg.targetOperatorId) EmptyReturn() } } diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala index 759ac44273c..2fbf5fc3051 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala @@ -27,14 +27,14 @@ import org.apache.texera.amber.engine.architecture.common.{ AkkaActorRefMappingService, AkkaActorService } -import org.apache.texera.amber.engine.architecture.controller.{ControllerConfig, WorkflowScheduler} +import org.apache.texera.amber.engine.architecture.controller.ControllerConfig import org.apache.texera.amber.engine.architecture.controller.execution.WorkflowExecution import org.apache.texera.amber.engine.common.rpc.AsyncRPCClient import scala.collection.mutable class WorkflowExecutionCoordinator( - workflowScheduler: WorkflowScheduler, + getSchedule: () => Schedule, workflowExecution: WorkflowExecution, controllerConfig: ControllerConfig, asyncRPCClient: AsyncRPCClient @@ -54,7 +54,7 @@ class WorkflowExecutionCoordinator( } private[scheduling] def 
getNextRegions: Set[Region] = { - val schedule = workflowScheduler.getSchedule + val schedule = getSchedule() if (schedule == null) { return Set.empty } @@ -135,8 +135,8 @@ class WorkflowExecutionCoordinator( .toSet } - def jumpToOperator(opId: OperatorIdentity): Unit = { - val schedule = workflowScheduler.getSchedule + def jumpToRegionContainingOperator(opId: OperatorIdentity): Unit = { + val schedule = getSchedule() if (schedule == null) { return } diff --git a/amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/ScheduleSpec.scala b/amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinatorSpec.scala similarity index 71% rename from amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/ScheduleSpec.scala rename to amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinatorSpec.scala index 26979cd9649..3138fd0543e 100644 --- a/amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/ScheduleSpec.scala +++ b/amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinatorSpec.scala @@ -21,18 +21,16 @@ package org.apache.texera.amber.engine.architecture.scheduling import org.apache.texera.amber.core.executor.OpExecInitInfo import org.apache.texera.amber.core.virtualidentity.{ - ActorVirtualIdentity, ExecutionIdentity, OperatorIdentity, PhysicalOpIdentity, WorkflowIdentity } import org.apache.texera.amber.core.workflow.PhysicalOp -import org.apache.texera.amber.engine.architecture.controller.WorkflowScheduler import org.apache.texera.amber.engine.architecture.controller.execution.WorkflowExecution import org.scalatest.flatspec.AnyFlatSpec -class ScheduleSpec extends AnyFlatSpec { +class WorkflowExecutionCoordinatorSpec extends AnyFlatSpec { private def region(regionId: Long, opId: String): Region = { val physicalOp = PhysicalOp( @@ -44,13 +42,7 @@ class ScheduleSpec extends 
AnyFlatSpec { Region(RegionIdentity(regionId), Set(physicalOp), Set.empty) } - private def setSchedule(workflowScheduler: WorkflowScheduler, schedule: Schedule): Unit = { - val scheduleField = classOf[WorkflowScheduler].getDeclaredField("schedule") - scheduleField.setAccessible(true) - scheduleField.set(workflowScheduler, schedule) - } - - "WorkflowExecutionCoordinator.jumpToOperator" should "make the next scheduled region contain the target operator" in { + "WorkflowExecutionCoordinator.jumpToRegionContainingOperator" should "make the next scheduled region contain the target operator's region" in { val firstRegion = region(1, "first") val secondRegion = region(2, "second") val thirdRegion = region(3, "third") @@ -61,16 +53,13 @@ class ScheduleSpec extends AnyFlatSpec { 2 -> Set(thirdRegion) ) ) - val workflowScheduler = - new WorkflowScheduler(null, ActorVirtualIdentity("controller")) - setSchedule(workflowScheduler, schedule) val coordinator = - new WorkflowExecutionCoordinator(workflowScheduler, WorkflowExecution(), null, null) + new WorkflowExecutionCoordinator(() => schedule, WorkflowExecution(), null, null) assert(coordinator.getNextRegions == Set(firstRegion)) assert(coordinator.getNextRegions == Set(secondRegion)) - coordinator.jumpToOperator(OperatorIdentity("first")) + coordinator.jumpToRegionContainingOperator(OperatorIdentity("first")) assert(coordinator.getNextRegions == Set(firstRegion)) } From 612206f40fe8040c4989be5b2c83c7a57452c273 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Sun, 26 Apr 2026 16:45:45 -0700 Subject: [PATCH 046/152] Restore iterator-style coordinator wiring --- .../controller/ControllerProcessor.scala | 30 ++++++++++++++++- .../WorkflowExecutionCoordinator.scala | 32 +++---------------- .../WorkflowExecutionCoordinatorSpec.scala | 29 ++++++++++++++--- 3 files changed, 59 insertions(+), 32 deletions(-) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerProcessor.scala 
b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerProcessor.scala index f3d461c9090..4e0fd1da760 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerProcessor.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerProcessor.scala @@ -33,6 +33,8 @@ import org.apache.texera.amber.engine.architecture.scheduling.WorkflowExecutionC import org.apache.texera.amber.engine.architecture.worker.WorkflowWorker.MainThreadDelegateMessage import org.apache.texera.amber.engine.common.ambermessage.WorkflowFIFOMessage +import scala.collection.mutable + class ControllerProcessor( workflowContext: WorkflowContext, controllerConfig: ControllerConfig, @@ -43,8 +45,34 @@ class ControllerProcessor( val workflowExecution: WorkflowExecution = WorkflowExecution() val workflowScheduler: WorkflowScheduler = new WorkflowScheduler(workflowContext, actorId) + private val nextRegionLevel: mutable.ArrayBuffer[Option[Int]] = mutable.ArrayBuffer(None) val workflowExecutionCoordinator: WorkflowExecutionCoordinator = new WorkflowExecutionCoordinator( - () => this.workflowScheduler.getSchedule, + () => { + val schedule = this.workflowScheduler.getSchedule + if (schedule == null) { + Set.empty + } else { + if (nextRegionLevel(0).isEmpty) { + nextRegionLevel(0) = Some(schedule.startingLevel) + } + nextRegionLevel(0) + .filter(schedule.levelSets.contains) + .map { level => + nextRegionLevel(0) = Some(level + 1) + schedule.levelSets(level) + } + .getOrElse(Set.empty) + } + }, + opId => { + val schedule = this.workflowScheduler.getSchedule + if (schedule != null) { + nextRegionLevel(0) = schedule.levelSets.collectFirst { + case (level, regions) if regions.exists(_.getOperators.exists(_.id.logicalOpId == opId)) => + level + } + } + }, workflowExecution, controllerConfig, asyncRPCClient diff --git 
a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala index 2fbf5fc3051..b880385a4c7 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala @@ -34,14 +34,14 @@ import org.apache.texera.amber.engine.common.rpc.AsyncRPCClient import scala.collection.mutable class WorkflowExecutionCoordinator( - getSchedule: () => Schedule, + getNextRegions: () => Set[Region], + jumpToRegionContainingOperatorCallback: OperatorIdentity => Unit, workflowExecution: WorkflowExecution, controllerConfig: ControllerConfig, asyncRPCClient: AsyncRPCClient ) extends LazyLogging { private val executedRegions: mutable.ListBuffer[Set[Region]] = mutable.ListBuffer() - private var nextRegionLevel: Option[Int] = None private val regionExecutionCoordinators : mutable.HashMap[RegionIdentity, RegionExecutionCoordinator] = @@ -53,22 +53,7 @@ class WorkflowExecutionCoordinator( this.actorRefService = actorRefService } - private[scheduling] def getNextRegions: Set[Region] = { - val schedule = getSchedule() - if (schedule == null) { - return Set.empty - } - if (nextRegionLevel.isEmpty) { - nextRegionLevel = Some(schedule.startingLevel) - } - nextRegionLevel - .filter(schedule.levelSets.contains) - .map { level => - nextRegionLevel = Some(level + 1) - schedule.levelSets(level) - } - .getOrElse(Set.empty) - } + private[scheduling] def pullNextRegions: Set[Region] = getNextRegions() /** * Each invocation first syncs the internal statuses of each exisiting `RegionExecutionCoordintor`, after which each @@ -100,7 +85,7 @@ class WorkflowExecutionCoordinator( // All existing regions are completed. Start the next region (if any). 
Future .collect({ - val nextRegions = getNextRegions + val nextRegions = pullNextRegions executedRegions.append(nextRegions) nextRegions .map(region => { @@ -136,14 +121,7 @@ class WorkflowExecutionCoordinator( } def jumpToRegionContainingOperator(opId: OperatorIdentity): Unit = { - val schedule = getSchedule() - if (schedule == null) { - return - } - nextRegionLevel = schedule.levelSets.collectFirst { - case (level, regions) if regions.exists(_.getOperators.exists(_.id.logicalOpId == opId)) => - level - } + jumpToRegionContainingOperatorCallback(opId) } } diff --git a/amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinatorSpec.scala b/amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinatorSpec.scala index 3138fd0543e..42bd70a5578 100644 --- a/amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinatorSpec.scala +++ b/amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinatorSpec.scala @@ -30,6 +30,8 @@ import org.apache.texera.amber.core.workflow.PhysicalOp import org.apache.texera.amber.engine.architecture.controller.execution.WorkflowExecution import org.scalatest.flatspec.AnyFlatSpec +import scala.collection.mutable + class WorkflowExecutionCoordinatorSpec extends AnyFlatSpec { private def region(regionId: Long, opId: String): Region = { @@ -53,14 +55,33 @@ class WorkflowExecutionCoordinatorSpec extends AnyFlatSpec { 2 -> Set(thirdRegion) ) ) + val nextRegionLevel: mutable.ArrayBuffer[Option[Int]] = mutable.ArrayBuffer(None) val coordinator = - new WorkflowExecutionCoordinator(() => schedule, WorkflowExecution(), null, null) + new WorkflowExecutionCoordinator( + () => + nextRegionLevel(0) + .orElse(Some(schedule.startingLevel)) + .filter(schedule.levelSets.contains) + .map { level => + nextRegionLevel(0) = Some(level + 1) + schedule.levelSets(level) + } + 
.getOrElse(Set.empty), + opId => + nextRegionLevel(0) = schedule.levelSets.collectFirst { + case (level, regions) if regions.exists(_.getOperators.exists(_.id.logicalOpId == opId)) => + level + }, + WorkflowExecution(), + null, + null + ) - assert(coordinator.getNextRegions == Set(firstRegion)) - assert(coordinator.getNextRegions == Set(secondRegion)) + assert(coordinator.pullNextRegions == Set(firstRegion)) + assert(coordinator.pullNextRegions == Set(secondRegion)) coordinator.jumpToRegionContainingOperator(OperatorIdentity("first")) - assert(coordinator.getNextRegions == Set(firstRegion)) + assert(coordinator.pullNextRegions == Set(firstRegion)) } } From 86cbdc5e7fda90836ab62df438321f805fb90d73 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Sun, 26 Apr 2026 16:58:23 -0700 Subject: [PATCH 047/152] fix fmt --- .../engine/architecture/controller/ControllerProcessor.scala | 3 ++- .../scheduling/WorkflowExecutionCoordinatorSpec.scala | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerProcessor.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerProcessor.scala index 4e0fd1da760..4667be10527 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerProcessor.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerProcessor.scala @@ -68,7 +68,8 @@ class ControllerProcessor( val schedule = this.workflowScheduler.getSchedule if (schedule != null) { nextRegionLevel(0) = schedule.levelSets.collectFirst { - case (level, regions) if regions.exists(_.getOperators.exists(_.id.logicalOpId == opId)) => + case (level, regions) + if regions.exists(_.getOperators.exists(_.id.logicalOpId == opId)) => level } } diff --git a/amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinatorSpec.scala 
b/amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinatorSpec.scala index 42bd70a5578..144056685ec 100644 --- a/amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinatorSpec.scala +++ b/amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinatorSpec.scala @@ -69,7 +69,8 @@ class WorkflowExecutionCoordinatorSpec extends AnyFlatSpec { .getOrElse(Set.empty), opId => nextRegionLevel(0) = schedule.levelSets.collectFirst { - case (level, regions) if regions.exists(_.getOperators.exists(_.id.logicalOpId == opId)) => + case (level, regions) + if regions.exists(_.getOperators.exists(_.id.logicalOpId == opId)) => level }, WorkflowExecution(), From baf5dbbe10b612098bf9db20e8c4557ae2fd6df9 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Sun, 26 Apr 2026 21:34:30 -0700 Subject: [PATCH 048/152] fix --- .../controller/ControllerProcessor.scala | 39 +++++++++---------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerProcessor.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerProcessor.scala index 4667be10527..3ee3f2d4630 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerProcessor.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerProcessor.scala @@ -33,8 +33,6 @@ import org.apache.texera.amber.engine.architecture.scheduling.WorkflowExecutionC import org.apache.texera.amber.engine.architecture.worker.WorkflowWorker.MainThreadDelegateMessage import org.apache.texera.amber.engine.common.ambermessage.WorkflowFIFOMessage -import scala.collection.mutable - class ControllerProcessor( workflowContext: WorkflowContext, controllerConfig: ControllerConfig, @@ -45,29 +43,30 @@ class ControllerProcessor( val 
workflowExecution: WorkflowExecution = WorkflowExecution() val workflowScheduler: WorkflowScheduler = new WorkflowScheduler(workflowContext, actorId) - private val nextRegionLevel: mutable.ArrayBuffer[Option[Int]] = mutable.ArrayBuffer(None) + // The coordinator consumes regions through callbacks rather than reading Schedule directly. + // This cursor tracks the next ranked level to execute and can be reset when control flow + // requests jumping back to the region containing a target operator. + private var nextRegionLevel: Option[Int] = None val workflowExecutionCoordinator: WorkflowExecutionCoordinator = new WorkflowExecutionCoordinator( () => { - val schedule = this.workflowScheduler.getSchedule - if (schedule == null) { - Set.empty - } else { - if (nextRegionLevel(0).isEmpty) { - nextRegionLevel(0) = Some(schedule.startingLevel) - } - nextRegionLevel(0) - .filter(schedule.levelSets.contains) - .map { level => - nextRegionLevel(0) = Some(level + 1) - schedule.levelSets(level) + Option(this.workflowScheduler.getSchedule) + .map { schedule => + if (nextRegionLevel.isEmpty) { + nextRegionLevel = Some(schedule.startingLevel) } - .getOrElse(Set.empty) - } + nextRegionLevel + .filter(schedule.levelSets.contains) + .map { level => + nextRegionLevel = Some(level + 1) + schedule.levelSets(level) + } + .getOrElse(Set.empty) + } + .getOrElse(Set.empty) }, opId => { - val schedule = this.workflowScheduler.getSchedule - if (schedule != null) { - nextRegionLevel(0) = schedule.levelSets.collectFirst { + Option(this.workflowScheduler.getSchedule).foreach { schedule => + nextRegionLevel = schedule.levelSets.collectFirst { case (level, regions) if regions.exists(_.getOperators.exists(_.id.logicalOpId == opId)) => level From ba6104fef853e9bfa0b6fa97e7fb7a28dfe3037e Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Sun, 26 Apr 2026 21:37:28 -0700 Subject: [PATCH 049/152] fix fmt --- .../engine/architecture/controller/ControllerProcessor.scala | 2 +- 1 file changed, 1 insertion(+), 
1 deletion(-) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerProcessor.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerProcessor.scala index 3ee3f2d4630..c0dd2c7ce80 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerProcessor.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerProcessor.scala @@ -55,7 +55,7 @@ class ControllerProcessor( nextRegionLevel = Some(schedule.startingLevel) } nextRegionLevel - .filter(schedule.levelSets.contains) + .filter(schedule.levelSets.contains) .map { level => nextRegionLevel = Some(level + 1) schedule.levelSets(level) From 560b670a6716021b623aba7cb9d34343d9d2c36e Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Mon, 27 Apr 2026 22:22:12 -0700 Subject: [PATCH 050/152] update --- .../controller/ControllerProcessor.scala | 64 +++++++++++-------- .../architecture/scheduling/Schedule.scala | 7 ++ 2 files changed, 44 insertions(+), 27 deletions(-) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerProcessor.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerProcessor.scala index c0dd2c7ce80..094bbb876c9 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerProcessor.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerProcessor.scala @@ -43,36 +43,46 @@ class ControllerProcessor( val workflowExecution: WorkflowExecution = WorkflowExecution() val workflowScheduler: WorkflowScheduler = new WorkflowScheduler(workflowContext, actorId) - // The coordinator consumes regions through callbacks rather than reading Schedule directly. 
- // This cursor tracks the next ranked level to execute and can be reset when control flow - // requests jumping back to the region containing a target operator. private var nextRegionLevel: Option[Int] = None - val workflowExecutionCoordinator: WorkflowExecutionCoordinator = new WorkflowExecutionCoordinator( - () => { - Option(this.workflowScheduler.getSchedule) - .map { schedule => - if (nextRegionLevel.isEmpty) { - nextRegionLevel = Some(schedule.startingLevel) - } - nextRegionLevel - .filter(schedule.levelSets.contains) - .map { level => - nextRegionLevel = Some(level + 1) - schedule.levelSets(level) - } - .getOrElse(Set.empty) - } - .getOrElse(Set.empty) - }, - opId => { - Option(this.workflowScheduler.getSchedule).foreach { schedule => - nextRegionLevel = schedule.levelSets.collectFirst { - case (level, regions) - if regions.exists(_.getOperators.exists(_.id.logicalOpId == opId)) => - level + + /** + * The coordinator consumes regions through this callback rather than reading the schedule directly. + * The controller owns the cursor so it can reset the next schedule level when control flow requests + * jumping back to the region containing a target operator. + */ + private def getNextScheduledRegions(): Set[org.apache.texera.amber.engine.architecture.scheduling.Region] = { + Option(this.workflowScheduler.getSchedule) + .map { schedule => + if (nextRegionLevel.isEmpty) { + nextRegionLevel = Some(schedule.startingLevel) } + nextRegionLevel + .filter(schedule.levelSets.contains) + .map { level => + nextRegionLevel = Some(level + 1) + schedule.levelSets(level) + } + .getOrElse(Set.empty) } - }, + .getOrElse(Set.empty) + } + + /** + * Resets the schedule cursor so the next coordinator pull starts from the region containing the + * given operator. Schedule precomputes the operator-to-level mapping because loop control flow may + * jump repeatedly and should avoid rescanning all level sets on each jump. 
+ */ + private def jumpToRegionContainingOperator( + opId: org.apache.texera.amber.core.virtualidentity.OperatorIdentity + ): Unit = { + Option(this.workflowScheduler.getSchedule).foreach { schedule => + nextRegionLevel = schedule.getLevelOfOperator(opId) + } + } + + val workflowExecutionCoordinator: WorkflowExecutionCoordinator = new WorkflowExecutionCoordinator( + getNextScheduledRegions, + jumpToRegionContainingOperator, workflowExecution, controllerConfig, asyncRPCClient diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala index 65ed3f1fcaa..f0e0c5b21db 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala @@ -19,11 +19,18 @@ package org.apache.texera.amber.engine.architecture.scheduling +import org.apache.texera.amber.core.virtualidentity.OperatorIdentity + case class Schedule(levelSets: Map[Int, Set[Region]]) extends Iterable[Set[Region]] { val startingLevel: Int = levelSets.keys.minOption.getOrElse(0) + private val operatorLevels = levelSets.iterator.flatMap { case (level, regions) => + regions.iterator.flatMap(region => region.getOperators.map(_.id.logicalOpId -> level)) + }.toMap def getRegions: List[Region] = levelSets.values.flatten.toList + def getLevelOfOperator(opId: OperatorIdentity): Option[Int] = operatorLevels.get(opId) + override def iterator: Iterator[Set[Region]] = levelSets.keys.toSeq.sorted.iterator.map(levelSets) } From 3579bc1566c43cbf9961b075c0a9c87772e2525a Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Mon, 27 Apr 2026 23:18:12 -0700 Subject: [PATCH 051/152] update --- .../architecture/controller/Controller.scala | 1 + .../controller/ControllerProcessor.scala | 46 +++---------------- .../architecture/scheduling/Schedule.scala | 45 +++++++++++++++--- 
.../WorkflowExecutionCoordinator.scala | 19 ++++++-- .../WorkflowExecutionCoordinatorSpec.scala | 19 +------- 5 files changed, 62 insertions(+), 68 deletions(-) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/Controller.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/Controller.scala index daa977d8575..a838b1ae3c0 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/Controller.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/Controller.scala @@ -116,6 +116,7 @@ class Controller( override def initState(): Unit = { attachRuntimeServicesToCPState() cp.workflowScheduler.updateSchedule(physicalPlan) + cp.updateExecutionSchedule(cp.workflowScheduler.getSchedule) val regions: List[(Long, List[String])] = cp.workflowScheduler.getSchedule.getRegions.map { region => diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerProcessor.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerProcessor.scala index 094bbb876c9..df000e92c1b 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerProcessor.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerProcessor.scala @@ -29,7 +29,7 @@ import org.apache.texera.amber.engine.architecture.common.{ } import org.apache.texera.amber.engine.architecture.controller.execution.WorkflowExecution import org.apache.texera.amber.engine.architecture.logreplay.ReplayLogManager -import org.apache.texera.amber.engine.architecture.scheduling.WorkflowExecutionCoordinator +import org.apache.texera.amber.engine.architecture.scheduling.{Schedule, WorkflowExecutionCoordinator} import org.apache.texera.amber.engine.architecture.worker.WorkflowWorker.MainThreadDelegateMessage import 
org.apache.texera.amber.engine.common.ambermessage.WorkflowFIFOMessage @@ -43,51 +43,17 @@ class ControllerProcessor( val workflowExecution: WorkflowExecution = WorkflowExecution() val workflowScheduler: WorkflowScheduler = new WorkflowScheduler(workflowContext, actorId) - private var nextRegionLevel: Option[Int] = None - - /** - * The coordinator consumes regions through this callback rather than reading the schedule directly. - * The controller owns the cursor so it can reset the next schedule level when control flow requests - * jumping back to the region containing a target operator. - */ - private def getNextScheduledRegions(): Set[org.apache.texera.amber.engine.architecture.scheduling.Region] = { - Option(this.workflowScheduler.getSchedule) - .map { schedule => - if (nextRegionLevel.isEmpty) { - nextRegionLevel = Some(schedule.startingLevel) - } - nextRegionLevel - .filter(schedule.levelSets.contains) - .map { level => - nextRegionLevel = Some(level + 1) - schedule.levelSets(level) - } - .getOrElse(Set.empty) - } - .getOrElse(Set.empty) - } - - /** - * Resets the schedule cursor so the next coordinator pull starts from the region containing the - * given operator. Schedule precomputes the operator-to-level mapping because loop control flow may - * jump repeatedly and should avoid rescanning all level sets on each jump. 
- */ - private def jumpToRegionContainingOperator( - opId: org.apache.texera.amber.core.virtualidentity.OperatorIdentity - ): Unit = { - Option(this.workflowScheduler.getSchedule).foreach { schedule => - nextRegionLevel = schedule.getLevelOfOperator(opId) - } - } - val workflowExecutionCoordinator: WorkflowExecutionCoordinator = new WorkflowExecutionCoordinator( - getNextScheduledRegions, - jumpToRegionContainingOperator, + Schedule(Map.empty), workflowExecution, controllerConfig, asyncRPCClient ) + def updateExecutionSchedule(schedule: Schedule): Unit = { + workflowExecutionCoordinator.replaceSchedule(schedule) + } + private val initializer = new ControllerAsyncRPCHandlerInitializer(this) @transient var controllerTimerService: ControllerTimerService = _ diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala index f0e0c5b21db..b4bd78e1767 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala @@ -21,16 +21,49 @@ package org.apache.texera.amber.engine.architecture.scheduling import org.apache.texera.amber.core.virtualidentity.OperatorIdentity -case class Schedule(levelSets: Map[Int, Set[Region]]) extends Iterable[Set[Region]] { - val startingLevel: Int = levelSets.keys.minOption.getOrElse(0) - private val operatorLevels = levelSets.iterator.flatMap { case (level, regions) => - regions.iterator.flatMap(region => region.getOperators.map(_.id.logicalOpId -> level)) +case class Schedule( + levelSets: Map[Int, Set[Region]], + baseLevels: Vector[Int] = Vector.empty, + executionLevels: Vector[Int] = Vector.empty, + currentLevelIndex: Int = 0 +) extends Iterable[Set[Region]] { + private val normalizedBaseLevels = + if (baseLevels.nonEmpty || levelSets.isEmpty) baseLevels else 
levelSets.keys.toVector.sorted + private val normalizedExecutionLevels = + if (executionLevels.nonEmpty || normalizedBaseLevels.isEmpty) executionLevels else normalizedBaseLevels + private val operatorLevelIndices = levelSets.iterator.flatMap { case (level, regions) => + val levelIndex = normalizedBaseLevels.indexOf(level) + regions.iterator.flatMap(region => region.getOperators.map(_.id.logicalOpId -> levelIndex)) }.toMap + val startingLevel: Int = normalizedBaseLevels.headOption.getOrElse(0) + def getRegions: List[Region] = levelSets.values.flatten.toList - def getLevelOfOperator(opId: OperatorIdentity): Option[Int] = operatorLevels.get(opId) + def getCurrentRegions: Set[Region] = + normalizedExecutionLevels + .lift(currentLevelIndex) + .flatMap(levelSets.get) + .getOrElse(Set.empty) + + def advance: Schedule = + copy( + baseLevels = normalizedBaseLevels, + executionLevels = normalizedExecutionLevels, + currentLevelIndex = currentLevelIndex + 1 + ) + + def getLevelIndexOfOperator(opId: OperatorIdentity): Option[Int] = operatorLevelIndices.get(opId) + + def rewriteExecutionFrom(levelIndex: Int): Schedule = + copy( + baseLevels = normalizedBaseLevels, + executionLevels = normalizedExecutionLevels.take(currentLevelIndex) ++ normalizedBaseLevels.drop( + levelIndex + ), + currentLevelIndex = currentLevelIndex + ) override def iterator: Iterator[Set[Region]] = - levelSets.keys.toSeq.sorted.iterator.map(levelSets) + normalizedExecutionLevels.iterator.map(levelSets) } diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala index 6e823185a37..e4637e9ca3e 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala @@ 
-34,13 +34,14 @@ import org.apache.texera.amber.engine.common.rpc.AsyncRPCClient import scala.collection.mutable class WorkflowExecutionCoordinator( - getNextRegions: () => Set[Region], - jumpToRegionContainingOperatorCallback: OperatorIdentity => Unit, + initialSchedule: Schedule, workflowExecution: WorkflowExecution, controllerConfig: ControllerConfig, asyncRPCClient: AsyncRPCClient ) extends LazyLogging { + private var schedule: Schedule = initialSchedule + private val executedRegions: mutable.ListBuffer[Set[Region]] = mutable.ListBuffer() private val regionExecutionCoordinators @@ -53,7 +54,15 @@ class WorkflowExecutionCoordinator( this.actorRefService = actorRefService } - private[scheduling] def pullNextRegions: Set[Region] = getNextRegions() + def replaceSchedule(newSchedule: Schedule): Unit = { + schedule = newSchedule + } + + private[scheduling] def pullNextRegions: Set[Region] = { + val nextRegions = schedule.getCurrentRegions + schedule = schedule.advance + nextRegions + } /** * Each invocation first syncs the internal statuses of each exisiting `RegionExecutionCoordintor`, after which each @@ -127,7 +136,9 @@ class WorkflowExecutionCoordinator( } def jumpToRegionContainingOperator(opId: OperatorIdentity): Unit = { - jumpToRegionContainingOperatorCallback(opId) + schedule.getLevelIndexOfOperator(opId).foreach { levelIndex => + schedule = schedule.rewriteExecutionFrom(levelIndex) + } } } diff --git a/amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinatorSpec.scala b/amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinatorSpec.scala index 144056685ec..1c733e4d1e8 100644 --- a/amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinatorSpec.scala +++ b/amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinatorSpec.scala @@ -30,8 +30,6 @@ import 
org.apache.texera.amber.core.workflow.PhysicalOp import org.apache.texera.amber.engine.architecture.controller.execution.WorkflowExecution import org.scalatest.flatspec.AnyFlatSpec -import scala.collection.mutable - class WorkflowExecutionCoordinatorSpec extends AnyFlatSpec { private def region(regionId: Long, opId: String): Region = { @@ -55,24 +53,9 @@ class WorkflowExecutionCoordinatorSpec extends AnyFlatSpec { 2 -> Set(thirdRegion) ) ) - val nextRegionLevel: mutable.ArrayBuffer[Option[Int]] = mutable.ArrayBuffer(None) val coordinator = new WorkflowExecutionCoordinator( - () => - nextRegionLevel(0) - .orElse(Some(schedule.startingLevel)) - .filter(schedule.levelSets.contains) - .map { level => - nextRegionLevel(0) = Some(level + 1) - schedule.levelSets(level) - } - .getOrElse(Set.empty), - opId => - nextRegionLevel(0) = schedule.levelSets.collectFirst { - case (level, regions) - if regions.exists(_.getOperators.exists(_.id.logicalOpId == opId)) => - level - }, + schedule, WorkflowExecution(), null, null From 802bdd0e718cb55085f36cc28ad879cb38bddebf Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Mon, 27 Apr 2026 23:26:25 -0700 Subject: [PATCH 052/152] update --- .github/workflows/build-and-push-images.yml | 6 +- .github/workflows/github-action-build.yml | 2 +- .../controller/WorkflowScheduler.scala | 9 ++- .../architecture/scheduling/Schedule.scala | 56 ++++++++----------- .../WorkflowExecutionCoordinator.scala | 4 +- .../engine/e2e/BatchSizePropagationSpec.scala | 17 +++--- 6 files changed, 45 insertions(+), 49 deletions(-) diff --git a/.github/workflows/build-and-push-images.yml b/.github/workflows/build-and-push-images.yml index b63743ec2c0..c7762ada03a 100644 --- a/.github/workflows/build-and-push-images.yml +++ b/.github/workflows/build-and-push-images.yml @@ -128,7 +128,7 @@ jobs: - name: Setup sbt launcher uses: sbt/setup-sbt@3e125ece5c3e5248e18da9ed8d2cce3d335ec8dd # v1.1.14 - - uses: coursier/cache-action@90c37294538be80a558fd665531fcdc2b467b475 # 
v8.1.0 + - uses: coursier/cache-action@4e2615869d13561d626ed48655e1a39e5b192b3c # v6.4.9 with: extraSbtFiles: '["*.sbt", "project/**.{scala,sbt}", "project/build.properties" ]' @@ -327,7 +327,7 @@ jobs: - name: Setup sbt launcher uses: sbt/setup-sbt@508b753e53cb6095967669e0911487d2b9bc9f41 # v1.1.22 - - uses: coursier/cache-action@90c37294538be80a558fd665531fcdc2b467b475 # v8.1.0 + - uses: coursier/cache-action@4e2615869d13561d626ed48655e1a39e5b192b3c # v6.4.9 with: extraSbtFiles: '["*.sbt", "project/**.{scala,sbt}", "project/build.properties" ]' @@ -407,7 +407,7 @@ jobs: - name: Setup sbt launcher uses: sbt/setup-sbt@508b753e53cb6095967669e0911487d2b9bc9f41 # v1.1.22 - - uses: coursier/cache-action@90c37294538be80a558fd665531fcdc2b467b475 # v8.1.0 + - uses: coursier/cache-action@4e2615869d13561d626ed48655e1a39e5b192b3c # v6.4.9 with: extraSbtFiles: '["*.sbt", "project/**.{scala,sbt}", "project/build.properties" ]' diff --git a/.github/workflows/github-action-build.yml b/.github/workflows/github-action-build.yml index f7a72b8da5a..21e70fcf509 100644 --- a/.github/workflows/github-action-build.yml +++ b/.github/workflows/github-action-build.yml @@ -122,7 +122,7 @@ jobs: if [ -f amber/operator-requirements.txt ]; then pip install -r amber/operator-requirements.txt; fi - name: Setup sbt launcher uses: sbt/setup-sbt@508b753e53cb6095967669e0911487d2b9bc9f41 # v1.1.22 - - uses: coursier/cache-action@90c37294538be80a558fd665531fcdc2b467b475 # v8.1.0 + - uses: coursier/cache-action@4e2615869d13561d626ed48655e1a39e5b192b3c # v6.4.9 with: extraSbtFiles: '["*.sbt", "project/**.{scala,sbt}", "project/build.properties" ]' - name: Lint with scalafmt diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/WorkflowScheduler.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/WorkflowScheduler.scala index 9e777eb6a84..9dcf3ad4bfc 100644 --- 
a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/WorkflowScheduler.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/WorkflowScheduler.scala @@ -21,7 +21,11 @@ package org.apache.texera.amber.engine.architecture.controller import org.apache.texera.amber.core.virtualidentity.ActorVirtualIdentity import org.apache.texera.amber.core.workflow.{PhysicalPlan, WorkflowContext} -import org.apache.texera.amber.engine.architecture.scheduling.{CostBasedScheduleGenerator, Schedule} +import org.apache.texera.amber.engine.architecture.scheduling.{ + CostBasedScheduleGenerator, + Region, + Schedule +} class WorkflowScheduler( workflowContext: WorkflowContext, @@ -47,4 +51,7 @@ class WorkflowScheduler( this.schedule = generatedSchedule this.physicalPlan = updatedPhysicalPlan } + + def getNextRegions: Set[Region] = if (!schedule.hasNext) Set() else schedule.next() + } diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala index b4bd78e1767..dd48f893e37 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala @@ -22,48 +22,38 @@ package org.apache.texera.amber.engine.architecture.scheduling import org.apache.texera.amber.core.virtualidentity.OperatorIdentity case class Schedule( - levelSets: Map[Int, Set[Region]], - baseLevels: Vector[Int] = Vector.empty, - executionLevels: Vector[Int] = Vector.empty, - currentLevelIndex: Int = 0 -) extends Iterable[Set[Region]] { - private val normalizedBaseLevels = - if (baseLevels.nonEmpty || levelSets.isEmpty) baseLevels else levelSets.keys.toVector.sorted + private val levelSets: Map[Int, Set[Region]], + executionLevels: Vector[Int] = Vector.empty +) extends Iterator[Set[Region]] { + private val 
baseLevels = levelSets.keys.toVector.sorted private val normalizedExecutionLevels = - if (executionLevels.nonEmpty || normalizedBaseLevels.isEmpty) executionLevels else normalizedBaseLevels + if (executionLevels.nonEmpty || baseLevels.isEmpty) executionLevels else baseLevels private val operatorLevelIndices = levelSets.iterator.flatMap { case (level, regions) => - val levelIndex = normalizedBaseLevels.indexOf(level) + val levelIndex = baseLevels.indexOf(level) regions.iterator.flatMap(region => region.getOperators.map(_.id.logicalOpId -> levelIndex)) }.toMap - - val startingLevel: Int = normalizedBaseLevels.headOption.getOrElse(0) + private var currentLevelIndex = 0 def getRegions: List[Region] = levelSets.values.flatten.toList - def getCurrentRegions: Set[Region] = - normalizedExecutionLevels - .lift(currentLevelIndex) - .flatMap(levelSets.get) - .getOrElse(Set.empty) - - def advance: Schedule = - copy( - baseLevels = normalizedBaseLevels, - executionLevels = normalizedExecutionLevels, - currentLevelIndex = currentLevelIndex + 1 - ) - def getLevelIndexOfOperator(opId: OperatorIdentity): Option[Int] = operatorLevelIndices.get(opId) - def rewriteExecutionFrom(levelIndex: Int): Schedule = - copy( - baseLevels = normalizedBaseLevels, - executionLevels = normalizedExecutionLevels.take(currentLevelIndex) ++ normalizedBaseLevels.drop( - levelIndex - ), - currentLevelIndex = currentLevelIndex + def rewriteExecutionFrom(levelIndex: Int): Schedule = { + val rewrittenSchedule = copy( + executionLevels = normalizedExecutionLevels.take(currentLevelIndex) ++ baseLevels.drop(levelIndex) ) + rewrittenSchedule.currentLevelIndex = currentLevelIndex + rewrittenSchedule + } + + override def hasNext: Boolean = currentLevelIndex < normalizedExecutionLevels.length - override def iterator: Iterator[Set[Region]] = - normalizedExecutionLevels.iterator.map(levelSets) + override def next(): Set[Region] = { + val regions = normalizedExecutionLevels + .lift(currentLevelIndex) + 
.flatMap(levelSets.get) + .getOrElse(Set.empty) + currentLevelIndex += 1 + regions + } } diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala index e4637e9ca3e..3616fe03d6e 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala @@ -59,9 +59,7 @@ class WorkflowExecutionCoordinator( } private[scheduling] def pullNextRegions: Set[Region] = { - val nextRegions = schedule.getCurrentRegions - schedule = schedule.advance - nextRegions + if (!schedule.hasNext) Set() else schedule.next() } /** diff --git a/amber/src/test/scala/org/apache/texera/amber/engine/e2e/BatchSizePropagationSpec.scala b/amber/src/test/scala/org/apache/texera/amber/engine/e2e/BatchSizePropagationSpec.scala index 0a7dccf5db1..e9b830bdfdc 100644 --- a/amber/src/test/scala/org/apache/texera/amber/engine/e2e/BatchSizePropagationSpec.scala +++ b/amber/src/test/scala/org/apache/texera/amber/engine/e2e/BatchSizePropagationSpec.scala @@ -25,7 +25,6 @@ import org.apache.pekko.util.Timeout import org.apache.texera.amber.clustering.SingleNodeListener import org.apache.texera.amber.core.workflow.{PortIdentity, WorkflowContext, WorkflowSettings} import org.apache.texera.amber.engine.architecture.controller._ -import org.apache.texera.amber.engine.architecture.scheduling.Schedule import org.apache.texera.amber.engine.architecture.sendsemantics.partitionings._ import org.apache.texera.amber.engine.common.virtualidentity.util.CONTROLLER import org.apache.texera.amber.engine.e2e.TestUtils.buildWorkflow @@ -55,10 +54,11 @@ class BatchSizePropagationSpec } def verifyBatchSizeInPartitioning( - schedule: Schedule, + workflowScheduler: WorkflowScheduler, 
expectedBatchSize: Int ): Unit = { - schedule.foreach { nextRegions => + var nextRegions = workflowScheduler.getNextRegions + while (nextRegions.nonEmpty) { nextRegions.foreach { region => region.resourceConfig.foreach { resourceConfig => resourceConfig.linkConfigs.foreach { @@ -112,6 +112,7 @@ class BatchSizePropagationSpec } } } + nextRegions = workflowScheduler.getNextRegions } } @@ -134,7 +135,7 @@ class BatchSizePropagationSpec val workflowScheduler = new WorkflowScheduler(context, CONTROLLER) workflowScheduler.updateSchedule(workflow.physicalPlan) - verifyBatchSizeInPartitioning(workflowScheduler.getSchedule, 1) + verifyBatchSizeInPartitioning(workflowScheduler, 1) } "Engine" should "propagate the correct batch size for headerlessCsv->keyword workflow" in { @@ -164,7 +165,7 @@ class BatchSizePropagationSpec val workflowScheduler = new WorkflowScheduler(context, CONTROLLER) workflowScheduler.updateSchedule(workflow.physicalPlan) - verifyBatchSizeInPartitioning(workflowScheduler.getSchedule, 500) + verifyBatchSizeInPartitioning(workflowScheduler, 500) } "Engine" should "propagate the correct batch size for csv->keyword->count workflow" in { @@ -202,7 +203,7 @@ class BatchSizePropagationSpec val workflowScheduler = new WorkflowScheduler(context, CONTROLLER) workflowScheduler.updateSchedule(workflow.physicalPlan) - verifyBatchSizeInPartitioning(workflowScheduler.getSchedule, 100) + verifyBatchSizeInPartitioning(workflowScheduler, 100) } "Engine" should "propagate the correct batch size for csv->keyword->averageAndGroupBy workflow" in { @@ -243,7 +244,7 @@ class BatchSizePropagationSpec val workflowScheduler = new WorkflowScheduler(context, CONTROLLER) workflowScheduler.updateSchedule(workflow.physicalPlan) - verifyBatchSizeInPartitioning(workflowScheduler.getSchedule, 300) + verifyBatchSizeInPartitioning(workflowScheduler, 300) } "Engine" should "propagate the correct batch size for csv->(csv->)->join workflow" in { @@ -284,7 +285,7 @@ class 
BatchSizePropagationSpec val workflowScheduler = new WorkflowScheduler(context, CONTROLLER) workflowScheduler.updateSchedule(workflow.physicalPlan) - verifyBatchSizeInPartitioning(workflowScheduler.getSchedule, 1) + verifyBatchSizeInPartitioning(workflowScheduler, 1) } } From f5ef44748afcac69bc17fe94892fbd0ed31d2528 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Mon, 27 Apr 2026 23:33:09 -0700 Subject: [PATCH 053/152] update --- .github/workflows/build-and-push-images.yml | 6 +++--- .github/workflows/github-action-build.yml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build-and-push-images.yml b/.github/workflows/build-and-push-images.yml index c7762ada03a..b63743ec2c0 100644 --- a/.github/workflows/build-and-push-images.yml +++ b/.github/workflows/build-and-push-images.yml @@ -128,7 +128,7 @@ jobs: - name: Setup sbt launcher uses: sbt/setup-sbt@3e125ece5c3e5248e18da9ed8d2cce3d335ec8dd # v1.1.14 - - uses: coursier/cache-action@4e2615869d13561d626ed48655e1a39e5b192b3c # v6.4.9 + - uses: coursier/cache-action@90c37294538be80a558fd665531fcdc2b467b475 # v8.1.0 with: extraSbtFiles: '["*.sbt", "project/**.{scala,sbt}", "project/build.properties" ]' @@ -327,7 +327,7 @@ jobs: - name: Setup sbt launcher uses: sbt/setup-sbt@508b753e53cb6095967669e0911487d2b9bc9f41 # v1.1.22 - - uses: coursier/cache-action@4e2615869d13561d626ed48655e1a39e5b192b3c # v6.4.9 + - uses: coursier/cache-action@90c37294538be80a558fd665531fcdc2b467b475 # v8.1.0 with: extraSbtFiles: '["*.sbt", "project/**.{scala,sbt}", "project/build.properties" ]' @@ -407,7 +407,7 @@ jobs: - name: Setup sbt launcher uses: sbt/setup-sbt@508b753e53cb6095967669e0911487d2b9bc9f41 # v1.1.22 - - uses: coursier/cache-action@4e2615869d13561d626ed48655e1a39e5b192b3c # v6.4.9 + - uses: coursier/cache-action@90c37294538be80a558fd665531fcdc2b467b475 # v8.1.0 with: extraSbtFiles: '["*.sbt", "project/**.{scala,sbt}", "project/build.properties" ]' diff --git 
a/.github/workflows/github-action-build.yml b/.github/workflows/github-action-build.yml index 21e70fcf509..f7a72b8da5a 100644 --- a/.github/workflows/github-action-build.yml +++ b/.github/workflows/github-action-build.yml @@ -122,7 +122,7 @@ jobs: if [ -f amber/operator-requirements.txt ]; then pip install -r amber/operator-requirements.txt; fi - name: Setup sbt launcher uses: sbt/setup-sbt@508b753e53cb6095967669e0911487d2b9bc9f41 # v1.1.22 - - uses: coursier/cache-action@4e2615869d13561d626ed48655e1a39e5b192b3c # v6.4.9 + - uses: coursier/cache-action@90c37294538be80a558fd665531fcdc2b467b475 # v8.1.0 with: extraSbtFiles: '["*.sbt", "project/**.{scala,sbt}", "project/build.properties" ]' - name: Lint with scalafmt From cf161950dbcd2206a37f807327b82879a6a19cbb Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Tue, 28 Apr 2026 21:08:34 -0700 Subject: [PATCH 054/152] update --- .../controller/execution/WorkflowExecution.scala | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/execution/WorkflowExecution.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/execution/WorkflowExecution.scala index effc282a2f2..c1e44bd5cc8 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/execution/WorkflowExecution.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/execution/WorkflowExecution.scala @@ -36,13 +36,19 @@ case class WorkflowExecution() { /** * Initializes or retrieves a `RegionExecution` for a given `Region`. If not already - * initialized, it creates and returns a new `RegionExecution`. + * initialized, it creates and returns a new `RegionExecution`; otherwise, an assertion + * error is thrown if re-initialization is attempted. * * @param region The `Region` for which to initialize or retrieve the `RegionExecution`. 
* @return The `RegionExecution` associated with the given `Region`. + * @throws AssertionError if the `RegionExecution` has already been initialized. */ def initRegionExecution(region: Region): RegionExecution = { - regionExecutions.remove(region.id) + // ensure the region execution hasn't been initialized already. + assert( + !regionExecutions.contains(region.id), + s"RegionExecution of ${region.id} already initialized." + ) regionExecutions.getOrElseUpdate(region.id, RegionExecution(region)) } From 488040ee80a33e523c0a80925fa88a1328be5ae5 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Tue, 28 Apr 2026 21:21:36 -0700 Subject: [PATCH 055/152] update --- .../architecture/rpc/controlcommands.proto | 1 + .../org/apache/texera/amber/core/__init__.py | 4 +- .../amber/engine/architecture/rpc/__init__.py | 1394 ++++++++--------- .../architecture/sendsemantics/__init__.py | 4 +- .../engine/architecture/worker/__init__.py | 4 +- .../texera/amber/engine/common/__init__.py | 42 +- 6 files changed, 670 insertions(+), 779 deletions(-) diff --git a/amber/src/main/protobuf/org/apache/texera/amber/engine/architecture/rpc/controlcommands.proto b/amber/src/main/protobuf/org/apache/texera/amber/engine/architecture/rpc/controlcommands.proto index 01bbb9fe318..e64a0b8be75 100644 --- a/amber/src/main/protobuf/org/apache/texera/amber/engine/architecture/rpc/controlcommands.proto +++ b/amber/src/main/protobuf/org/apache/texera/amber/engine/architecture/rpc/controlcommands.proto @@ -45,6 +45,7 @@ message ControlRequest { WorkerStateUpdatedRequest workerStateUpdatedRequest = 8; LinkWorkersRequest linkWorkersRequest = 9; WorkflowReconfigureRequest workflowReconfigureRequest = 10; + JumpToOperatorRequest jumpToOperatorRequest = 12; // request for worker AddInputChannelRequest addInputChannelRequest = 50; diff --git a/amber/src/main/python/proto/org/apache/texera/amber/core/__init__.py b/amber/src/main/python/proto/org/apache/texera/amber/core/__init__.py index d993a669eab..2d21638c263 
100644 --- a/amber/src/main/python/proto/org/apache/texera/amber/core/__init__.py +++ b/amber/src/main/python/proto/org/apache/texera/amber/core/__init__.py @@ -5,7 +5,9 @@ from dataclasses import dataclass from datetime import datetime -from typing import List +from typing import ( + List, +) import betterproto diff --git a/amber/src/main/python/proto/org/apache/texera/amber/engine/architecture/rpc/__init__.py b/amber/src/main/python/proto/org/apache/texera/amber/engine/architecture/rpc/__init__.py index 550b1fed46d..77d51933af6 100644 --- a/amber/src/main/python/proto/org/apache/texera/amber/engine/architecture/rpc/__init__.py +++ b/amber/src/main/python/proto/org/apache/texera/amber/engine/architecture/rpc/__init__.py @@ -22,6 +22,7 @@ worker as _worker__, ) + if TYPE_CHECKING: import grpclib.server from betterproto.grpc.grpclib_client import MetadataLike @@ -100,9 +101,6 @@ class ControlRequest(betterproto.Message): workflow_reconfigure_request: "WorkflowReconfigureRequest" = ( betterproto.message_field(10, group="sealed_value") ) - jump_to_operator_request: "JumpToOperatorRequest" = betterproto.message_field( - 12, group="sealed_value" - ) add_input_channel_request: "AddInputChannelRequest" = betterproto.message_field( 50, group="sealed_value" ) @@ -387,11 +385,6 @@ class QueryStatisticsRequest(betterproto.Message): update_target: "StatisticsUpdateTarget" = betterproto.enum_field(2) -@dataclass(eq=False, repr=False) -class JumpToOperatorRequest(betterproto.Message): - target_operator_id: "___core__.OperatorIdentity" = betterproto.message_field(1) - - @dataclass(eq=False, repr=False) class ControlReturn(betterproto.Message): """The generic return message""" @@ -522,522 +515,503 @@ class WorkerMetricsResponse(betterproto.Message): metrics: "_worker__.WorkerMetrics" = betterproto.message_field(1) -class ControllerServiceStub(betterproto.ServiceStub): - async def retrieve_workflow_state( +class RpcTesterStub(betterproto.ServiceStub): + async def send_ping( self, - 
empty_request: "EmptyRequest", + ping: "Ping", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None - ) -> "RetrieveWorkflowStateResponse": + ) -> "IntResponse": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/RetrieveWorkflowState", - empty_request, - RetrieveWorkflowStateResponse, + "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendPing", + ping, + IntResponse, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def propagate_embedded_control_message( + async def send_pong( self, - propagate_embedded_control_message_request: "PropagateEmbeddedControlMessageRequest", + pong: "Pong", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None - ) -> "PropagateEmbeddedControlMessageResponse": + ) -> "IntResponse": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/PropagateEmbeddedControlMessage", - propagate_embedded_control_message_request, - PropagateEmbeddedControlMessageResponse, + "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendPong", + pong, + IntResponse, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def take_global_checkpoint( + async def send_nested( self, - take_global_checkpoint_request: "TakeGlobalCheckpointRequest", + nested: "Nested", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None - ) -> "TakeGlobalCheckpointResponse": + ) -> "StringResponse": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/TakeGlobalCheckpoint", - take_global_checkpoint_request, - TakeGlobalCheckpointResponse, + "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendNested", + nested, + StringResponse, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def 
debug_command( + async def send_pass( self, - debug_command_request: "DebugCommandRequest", + pass_: "Pass", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None - ) -> "EmptyReturn": + ) -> "StringResponse": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/DebugCommand", - debug_command_request, - EmptyReturn, + "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendPass", + pass_, + StringResponse, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def evaluate_python_expression( + async def send_error_command( self, - evaluate_python_expression_request: "EvaluatePythonExpressionRequest", + error_command: "ErrorCommand", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None - ) -> "EvaluatePythonExpressionResponse": + ) -> "StringResponse": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/EvaluatePythonExpression", - evaluate_python_expression_request, - EvaluatePythonExpressionResponse, + "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendErrorCommand", + error_command, + StringResponse, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def console_message_triggered( + async def send_recursion( self, - console_message_triggered_request: "ConsoleMessageTriggeredRequest", + recursion: "Recursion", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None - ) -> "EmptyReturn": + ) -> "StringResponse": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/ConsoleMessageTriggered", - console_message_triggered_request, - EmptyReturn, + "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendRecursion", + recursion, + StringResponse, timeout=timeout, deadline=deadline, 
metadata=metadata, ) - async def port_completed( + async def send_collect( self, - port_completed_request: "PortCompletedRequest", + collect: "Collect", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None - ) -> "EmptyReturn": + ) -> "StringResponse": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/PortCompleted", - port_completed_request, - EmptyReturn, + "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendCollect", + collect, + StringResponse, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def start_workflow( + async def send_generate_number( self, - empty_request: "EmptyRequest", + generate_number: "GenerateNumber", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None - ) -> "StartWorkflowResponse": + ) -> "IntResponse": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/StartWorkflow", - empty_request, - StartWorkflowResponse, + "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendGenerateNumber", + generate_number, + IntResponse, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def resume_workflow( + async def send_multi_call( self, - empty_request: "EmptyRequest", + multi_call: "MultiCall", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None - ) -> "EmptyReturn": + ) -> "StringResponse": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/ResumeWorkflow", - empty_request, - EmptyReturn, + "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendMultiCall", + multi_call, + StringResponse, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def pause_workflow( + async def send_chain( self, - empty_request: "EmptyRequest", + chain: "Chain", *, 
timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None - ) -> "EmptyReturn": + ) -> "StringResponse": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/PauseWorkflow", - empty_request, - EmptyReturn, + "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendChain", + chain, + StringResponse, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def worker_state_updated( + +class WorkerServiceStub(betterproto.ServiceStub): + async def add_input_channel( self, - worker_state_updated_request: "WorkerStateUpdatedRequest", + add_input_channel_request: "AddInputChannelRequest", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None ) -> "EmptyReturn": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/WorkerStateUpdated", - worker_state_updated_request, + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/AddInputChannel", + add_input_channel_request, EmptyReturn, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def worker_execution_completed( + async def add_partitioning( self, - empty_request: "EmptyRequest", + add_partitioning_request: "AddPartitioningRequest", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None ) -> "EmptyReturn": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/WorkerExecutionCompleted", - empty_request, + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/AddPartitioning", + add_partitioning_request, EmptyReturn, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def jump_to_operator( + async def assign_port( self, - jump_to_operator_request: "JumpToOperatorRequest", + assign_port_request: "AssignPortRequest", *, timeout: Optional[float] = 
None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None ) -> "EmptyReturn": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/JumpToOperator", - jump_to_operator_request, + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/AssignPort", + assign_port_request, EmptyReturn, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def link_workers( + async def finalize_checkpoint( self, - link_workers_request: "LinkWorkersRequest", + finalize_checkpoint_request: "FinalizeCheckpointRequest", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None - ) -> "EmptyReturn": + ) -> "FinalizeCheckpointResponse": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/LinkWorkers", - link_workers_request, - EmptyReturn, + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/FinalizeCheckpoint", + finalize_checkpoint_request, + FinalizeCheckpointResponse, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def controller_initiate_query_statistics( + async def flush_network_buffer( self, - query_statistics_request: "QueryStatisticsRequest", + empty_request: "EmptyRequest", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None ) -> "EmptyReturn": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/ControllerInitiateQueryStatistics", - query_statistics_request, + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/FlushNetworkBuffer", + empty_request, EmptyReturn, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def retry_workflow( + async def initialize_executor( self, - retry_workflow_request: "RetryWorkflowRequest", + initialize_executor_request: "InitializeExecutorRequest", *, timeout: Optional[float] = None, 
deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None ) -> "EmptyReturn": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/RetryWorkflow", - retry_workflow_request, + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/InitializeExecutor", + initialize_executor_request, EmptyReturn, timeout=timeout, deadline=deadline, metadata=metadata, ) - -class RpcTesterStub(betterproto.ServiceStub): - async def send_ping( + async def open_executor( self, - ping: "Ping", + empty_request: "EmptyRequest", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None - ) -> "IntResponse": + ) -> "EmptyReturn": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendPing", - ping, - IntResponse, + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/OpenExecutor", + empty_request, + EmptyReturn, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def send_pong( + async def pause_worker( self, - pong: "Pong", + empty_request: "EmptyRequest", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None - ) -> "IntResponse": + ) -> "WorkerStateResponse": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendPong", - pong, - IntResponse, + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/PauseWorker", + empty_request, + WorkerStateResponse, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def send_nested( + async def prepare_checkpoint( self, - nested: "Nested", + prepare_checkpoint_request: "PrepareCheckpointRequest", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None - ) -> "StringResponse": + ) -> "EmptyReturn": return await self._unary_unary( - 
"/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendNested", - nested, - StringResponse, + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/PrepareCheckpoint", + prepare_checkpoint_request, + EmptyReturn, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def send_pass( + async def query_statistics( self, - pass_: "Pass", + empty_request: "EmptyRequest", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None - ) -> "StringResponse": + ) -> "WorkerMetricsResponse": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendPass", - pass_, - StringResponse, + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/QueryStatistics", + empty_request, + WorkerMetricsResponse, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def send_error_command( + async def resume_worker( self, - error_command: "ErrorCommand", + empty_request: "EmptyRequest", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None - ) -> "StringResponse": + ) -> "WorkerStateResponse": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendErrorCommand", - error_command, - StringResponse, + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/ResumeWorker", + empty_request, + WorkerStateResponse, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def send_recursion( + async def retrieve_state( self, - recursion: "Recursion", + empty_request: "EmptyRequest", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None - ) -> "StringResponse": + ) -> "EmptyReturn": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendRecursion", - recursion, - StringResponse, + 
"/org.apache.texera.amber.engine.architecture.rpc.WorkerService/RetrieveState", + empty_request, + EmptyReturn, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def send_collect( + async def retry_current_tuple( self, - collect: "Collect", + empty_request: "EmptyRequest", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None - ) -> "StringResponse": - return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendCollect", - collect, - StringResponse, - timeout=timeout, - deadline=deadline, - metadata=metadata, - ) - - async def send_generate_number( - self, - generate_number: "GenerateNumber", - *, - timeout: Optional[float] = None, - deadline: Optional["Deadline"] = None, - metadata: Optional["MetadataLike"] = None - ) -> "IntResponse": + ) -> "EmptyReturn": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendGenerateNumber", - generate_number, - IntResponse, + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/RetryCurrentTuple", + empty_request, + EmptyReturn, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def send_multi_call( + async def start_worker( self, - multi_call: "MultiCall", + empty_request: "EmptyRequest", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None - ) -> "StringResponse": + ) -> "WorkerStateResponse": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendMultiCall", - multi_call, - StringResponse, + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/StartWorker", + empty_request, + WorkerStateResponse, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def send_chain( + async def end_worker( self, - chain: "Chain", + empty_request: "EmptyRequest", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = 
None, metadata: Optional["MetadataLike"] = None - ) -> "StringResponse": + ) -> "EmptyReturn": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendChain", - chain, - StringResponse, + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/EndWorker", + empty_request, + EmptyReturn, timeout=timeout, deadline=deadline, metadata=metadata, ) - -class WorkerServiceStub(betterproto.ServiceStub): - async def add_input_channel( + async def start_channel( self, - add_input_channel_request: "AddInputChannelRequest", + empty_request: "EmptyRequest", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None ) -> "EmptyReturn": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/AddInputChannel", - add_input_channel_request, + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/StartChannel", + empty_request, EmptyReturn, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def add_partitioning( + async def end_channel( self, - add_partitioning_request: "AddPartitioningRequest", + empty_request: "EmptyRequest", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None ) -> "EmptyReturn": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/AddPartitioning", - add_partitioning_request, + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/EndChannel", + empty_request, EmptyReturn, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def assign_port( + async def debug_command( self, - assign_port_request: "AssignPortRequest", + debug_command_request: "DebugCommandRequest", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None ) -> "EmptyReturn": return await self._unary_unary( - 
"/org.apache.texera.amber.engine.architecture.rpc.WorkerService/AssignPort", - assign_port_request, + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/DebugCommand", + debug_command_request, EmptyReturn, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def finalize_checkpoint( + async def evaluate_python_expression( self, - finalize_checkpoint_request: "FinalizeCheckpointRequest", + evaluate_python_expression_request: "EvaluatePythonExpressionRequest", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None - ) -> "FinalizeCheckpointResponse": + ) -> "EvaluatedValue": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/FinalizeCheckpoint", - finalize_checkpoint_request, - FinalizeCheckpointResponse, + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/EvaluatePythonExpression", + evaluate_python_expression_request, + EvaluatedValue, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def flush_network_buffer( + async def no_operation( self, empty_request: "EmptyRequest", *, @@ -1046,7 +1020,7 @@ async def flush_network_buffer( metadata: Optional["MetadataLike"] = None ) -> "EmptyReturn": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/FlushNetworkBuffer", + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/NoOperation", empty_request, EmptyReturn, timeout=timeout, @@ -1075,158 +1049,158 @@ async def update_executor( class ControllerServiceStub(betterproto.ServiceStub): async def retrieve_workflow_state( self, - initialize_executor_request: "InitializeExecutorRequest", + empty_request: "EmptyRequest", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None - ) -> "EmptyReturn": + ) -> "RetrieveWorkflowStateResponse": return await self._unary_unary( - 
"/org.apache.texera.amber.engine.architecture.rpc.WorkerService/InitializeExecutor", - initialize_executor_request, - EmptyReturn, + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/RetrieveWorkflowState", + empty_request, + RetrieveWorkflowStateResponse, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def open_executor( + async def propagate_embedded_control_message( self, - empty_request: "EmptyRequest", + propagate_embedded_control_message_request: "PropagateEmbeddedControlMessageRequest", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None - ) -> "EmptyReturn": + ) -> "PropagateEmbeddedControlMessageResponse": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/OpenExecutor", - empty_request, - EmptyReturn, + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/PropagateEmbeddedControlMessage", + propagate_embedded_control_message_request, + PropagateEmbeddedControlMessageResponse, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def pause_worker( + async def take_global_checkpoint( self, - empty_request: "EmptyRequest", + take_global_checkpoint_request: "TakeGlobalCheckpointRequest", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None - ) -> "WorkerStateResponse": + ) -> "TakeGlobalCheckpointResponse": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/PauseWorker", - empty_request, - WorkerStateResponse, + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/TakeGlobalCheckpoint", + take_global_checkpoint_request, + TakeGlobalCheckpointResponse, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def prepare_checkpoint( + async def debug_command( self, - prepare_checkpoint_request: "PrepareCheckpointRequest", + debug_command_request: 
"DebugCommandRequest", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None ) -> "EmptyReturn": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/PrepareCheckpoint", - prepare_checkpoint_request, + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/DebugCommand", + debug_command_request, EmptyReturn, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def query_statistics( + async def evaluate_python_expression( self, - empty_request: "EmptyRequest", + evaluate_python_expression_request: "EvaluatePythonExpressionRequest", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None - ) -> "WorkerMetricsResponse": + ) -> "EvaluatePythonExpressionResponse": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/QueryStatistics", - empty_request, - WorkerMetricsResponse, + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/EvaluatePythonExpression", + evaluate_python_expression_request, + EvaluatePythonExpressionResponse, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def resume_worker( + async def console_message_triggered( self, - empty_request: "EmptyRequest", + console_message_triggered_request: "ConsoleMessageTriggeredRequest", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None - ) -> "WorkerStateResponse": + ) -> "EmptyReturn": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/ResumeWorker", - empty_request, - WorkerStateResponse, + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/ConsoleMessageTriggered", + console_message_triggered_request, + EmptyReturn, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def retrieve_state( + async def 
port_completed( self, - empty_request: "EmptyRequest", + port_completed_request: "PortCompletedRequest", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None ) -> "EmptyReturn": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/RetrieveState", - empty_request, + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/PortCompleted", + port_completed_request, EmptyReturn, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def retry_current_tuple( + async def start_workflow( self, empty_request: "EmptyRequest", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None - ) -> "EmptyReturn": + ) -> "StartWorkflowResponse": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/RetryCurrentTuple", + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/StartWorkflow", empty_request, - EmptyReturn, + StartWorkflowResponse, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def start_worker( + async def resume_workflow( self, empty_request: "EmptyRequest", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None - ) -> "WorkerStateResponse": + ) -> "EmptyReturn": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/StartWorker", + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/ResumeWorkflow", empty_request, - WorkerStateResponse, + EmptyReturn, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def end_worker( + async def pause_workflow( self, empty_request: "EmptyRequest", *, @@ -1235,7 +1209,7 @@ async def end_worker( metadata: Optional["MetadataLike"] = None ) -> "EmptyReturn": return await self._unary_unary( - 
"/org.apache.texera.amber.engine.architecture.rpc.WorkerService/EndWorker", + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/PauseWorkflow", empty_request, EmptyReturn, timeout=timeout, @@ -1243,24 +1217,24 @@ async def end_worker( metadata=metadata, ) - async def start_channel( + async def worker_state_updated( self, - empty_request: "EmptyRequest", + worker_state_updated_request: "WorkerStateUpdatedRequest", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None ) -> "EmptyReturn": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/StartChannel", - empty_request, + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/WorkerStateUpdated", + worker_state_updated_request, EmptyReturn, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def end_channel( + async def worker_execution_completed( self, empty_request: "EmptyRequest", *, @@ -1269,7 +1243,7 @@ async def end_channel( metadata: Optional["MetadataLike"] = None ) -> "EmptyReturn": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/EndChannel", + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/WorkerExecutionCompleted", empty_request, EmptyReturn, timeout=timeout, @@ -1277,51 +1251,51 @@ async def end_channel( metadata=metadata, ) - async def debug_command( + async def link_workers( self, - debug_command_request: "DebugCommandRequest", + link_workers_request: "LinkWorkersRequest", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None ) -> "EmptyReturn": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/DebugCommand", - debug_command_request, + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/LinkWorkers", + link_workers_request, EmptyReturn, timeout=timeout, 
deadline=deadline, metadata=metadata, ) - async def evaluate_python_expression( + async def controller_initiate_query_statistics( self, - evaluate_python_expression_request: "EvaluatePythonExpressionRequest", + query_statistics_request: "QueryStatisticsRequest", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None - ) -> "EvaluatedValue": + ) -> "EmptyReturn": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/EvaluatePythonExpression", - evaluate_python_expression_request, - EvaluatedValue, + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/ControllerInitiateQueryStatistics", + query_statistics_request, + EmptyReturn, timeout=timeout, deadline=deadline, metadata=metadata, ) - async def no_operation( + async def retry_workflow( self, - empty_request: "EmptyRequest", + retry_workflow_request: "RetryWorkflowRequest", *, timeout: Optional[float] = None, deadline: Optional["Deadline"] = None, metadata: Optional["MetadataLike"] = None ) -> "EmptyReturn": return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/NoOperation", - empty_request, + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/RetryWorkflow", + retry_workflow_request, EmptyReturn, timeout=timeout, deadline=deadline, @@ -1346,337 +1320,263 @@ async def reconfigure_workflow( ) -class ControllerServiceBase(ServiceBase): +class RpcTesterBase(ServiceBase): - async def retrieve_workflow_state( - self, empty_request: "EmptyRequest" - ) -> "RetrieveWorkflowStateResponse": + async def send_ping(self, ping: "Ping") -> "IntResponse": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def propagate_embedded_control_message( - self, - propagate_embedded_control_message_request: "PropagateEmbeddedControlMessageRequest", - ) -> "PropagateEmbeddedControlMessageResponse": + async def send_pong(self, pong: "Pong") -> 
"IntResponse": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def take_global_checkpoint( - self, take_global_checkpoint_request: "TakeGlobalCheckpointRequest" - ) -> "TakeGlobalCheckpointResponse": + async def send_nested(self, nested: "Nested") -> "StringResponse": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def debug_command( - self, debug_command_request: "DebugCommandRequest" - ) -> "EmptyReturn": + async def send_pass(self, pass_: "Pass") -> "StringResponse": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def evaluate_python_expression( - self, evaluate_python_expression_request: "EvaluatePythonExpressionRequest" - ) -> "EvaluatePythonExpressionResponse": + async def send_error_command( + self, error_command: "ErrorCommand" + ) -> "StringResponse": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def console_message_triggered( - self, console_message_triggered_request: "ConsoleMessageTriggeredRequest" - ) -> "EmptyReturn": + async def send_recursion(self, recursion: "Recursion") -> "StringResponse": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def port_completed( - self, port_completed_request: "PortCompletedRequest" - ) -> "EmptyReturn": - raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - - async def start_workflow( - self, empty_request: "EmptyRequest" - ) -> "StartWorkflowResponse": - raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - - async def resume_workflow(self, empty_request: "EmptyRequest") -> "EmptyReturn": - raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - - async def pause_workflow(self, empty_request: "EmptyRequest") -> "EmptyReturn": - raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - - async def worker_state_updated( - self, worker_state_updated_request: "WorkerStateUpdatedRequest" - ) -> "EmptyReturn": - raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - - async def 
worker_execution_completed( - self, empty_request: "EmptyRequest" - ) -> "EmptyReturn": - raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - - async def jump_to_operator( - self, jump_to_operator_request: "JumpToOperatorRequest" - ) -> "EmptyReturn": + async def send_collect(self, collect: "Collect") -> "StringResponse": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def link_workers( - self, link_workers_request: "LinkWorkersRequest" - ) -> "EmptyReturn": + async def send_generate_number( + self, generate_number: "GenerateNumber" + ) -> "IntResponse": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def controller_initiate_query_statistics( - self, query_statistics_request: "QueryStatisticsRequest" - ) -> "EmptyReturn": + async def send_multi_call(self, multi_call: "MultiCall") -> "StringResponse": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def retry_workflow( - self, retry_workflow_request: "RetryWorkflowRequest" - ) -> "EmptyReturn": + async def send_chain(self, chain: "Chain") -> "StringResponse": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def __rpc_retrieve_workflow_state( - self, - stream: "grpclib.server.Stream[EmptyRequest, RetrieveWorkflowStateResponse]", - ) -> None: - request = await stream.recv_message() - response = await self.retrieve_workflow_state(request) - await stream.send_message(response) - - async def __rpc_propagate_embedded_control_message( - self, - stream: "grpclib.server.Stream[PropagateEmbeddedControlMessageRequest, PropagateEmbeddedControlMessageResponse]", - ) -> None: - request = await stream.recv_message() - response = await self.propagate_embedded_control_message(request) - await stream.send_message(response) - - async def __rpc_take_global_checkpoint( - self, - stream: "grpclib.server.Stream[TakeGlobalCheckpointRequest, TakeGlobalCheckpointResponse]", - ) -> None: - request = await stream.recv_message() - response = await 
self.take_global_checkpoint(request) - await stream.send_message(response) - - async def __rpc_debug_command( - self, stream: "grpclib.server.Stream[DebugCommandRequest, EmptyReturn]" - ) -> None: - request = await stream.recv_message() - response = await self.debug_command(request) - await stream.send_message(response) - - async def __rpc_evaluate_python_expression( - self, - stream: "grpclib.server.Stream[EvaluatePythonExpressionRequest, EvaluatePythonExpressionResponse]", - ) -> None: - request = await stream.recv_message() - response = await self.evaluate_python_expression(request) - await stream.send_message(response) - - async def __rpc_console_message_triggered( - self, - stream: "grpclib.server.Stream[ConsoleMessageTriggeredRequest, EmptyReturn]", - ) -> None: - request = await stream.recv_message() - response = await self.console_message_triggered(request) - await stream.send_message(response) - - async def __rpc_port_completed( - self, stream: "grpclib.server.Stream[PortCompletedRequest, EmptyReturn]" + async def __rpc_send_ping( + self, stream: "grpclib.server.Stream[Ping, IntResponse]" ) -> None: request = await stream.recv_message() - response = await self.port_completed(request) + response = await self.send_ping(request) await stream.send_message(response) - async def __rpc_start_workflow( - self, stream: "grpclib.server.Stream[EmptyRequest, StartWorkflowResponse]" + async def __rpc_send_pong( + self, stream: "grpclib.server.Stream[Pong, IntResponse]" ) -> None: request = await stream.recv_message() - response = await self.start_workflow(request) + response = await self.send_pong(request) await stream.send_message(response) - async def __rpc_resume_workflow( - self, stream: "grpclib.server.Stream[EmptyRequest, EmptyReturn]" + async def __rpc_send_nested( + self, stream: "grpclib.server.Stream[Nested, StringResponse]" ) -> None: request = await stream.recv_message() - response = await self.resume_workflow(request) + response = await 
self.send_nested(request) await stream.send_message(response) - async def __rpc_pause_workflow( - self, stream: "grpclib.server.Stream[EmptyRequest, EmptyReturn]" + async def __rpc_send_pass( + self, stream: "grpclib.server.Stream[Pass, StringResponse]" ) -> None: request = await stream.recv_message() - response = await self.pause_workflow(request) + response = await self.send_pass(request) await stream.send_message(response) - async def __rpc_worker_state_updated( - self, stream: "grpclib.server.Stream[WorkerStateUpdatedRequest, EmptyReturn]" + async def __rpc_send_error_command( + self, stream: "grpclib.server.Stream[ErrorCommand, StringResponse]" ) -> None: request = await stream.recv_message() - response = await self.worker_state_updated(request) + response = await self.send_error_command(request) await stream.send_message(response) - async def __rpc_worker_execution_completed( - self, stream: "grpclib.server.Stream[EmptyRequest, EmptyReturn]" + async def __rpc_send_recursion( + self, stream: "grpclib.server.Stream[Recursion, StringResponse]" ) -> None: request = await stream.recv_message() - response = await self.worker_execution_completed(request) + response = await self.send_recursion(request) await stream.send_message(response) - async def __rpc_jump_to_operator( - self, stream: "grpclib.server.Stream[JumpToOperatorRequest, EmptyReturn]" + async def __rpc_send_collect( + self, stream: "grpclib.server.Stream[Collect, StringResponse]" ) -> None: request = await stream.recv_message() - response = await self.jump_to_operator(request) + response = await self.send_collect(request) await stream.send_message(response) - async def __rpc_link_workers( - self, stream: "grpclib.server.Stream[LinkWorkersRequest, EmptyReturn]" + async def __rpc_send_generate_number( + self, stream: "grpclib.server.Stream[GenerateNumber, IntResponse]" ) -> None: request = await stream.recv_message() - response = await self.link_workers(request) + response = await 
self.send_generate_number(request) await stream.send_message(response) - async def __rpc_controller_initiate_query_statistics( - self, stream: "grpclib.server.Stream[QueryStatisticsRequest, EmptyReturn]" + async def __rpc_send_multi_call( + self, stream: "grpclib.server.Stream[MultiCall, StringResponse]" ) -> None: request = await stream.recv_message() - response = await self.controller_initiate_query_statistics(request) + response = await self.send_multi_call(request) await stream.send_message(response) - async def __rpc_retry_workflow( - self, stream: "grpclib.server.Stream[RetryWorkflowRequest, EmptyReturn]" + async def __rpc_send_chain( + self, stream: "grpclib.server.Stream[Chain, StringResponse]" ) -> None: request = await stream.recv_message() - response = await self.retry_workflow(request) + response = await self.send_chain(request) await stream.send_message(response) def __mapping__(self) -> Dict[str, grpclib.const.Handler]: return { - "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/RetrieveWorkflowState": grpclib.const.Handler( - self.__rpc_retrieve_workflow_state, - grpclib.const.Cardinality.UNARY_UNARY, - EmptyRequest, - RetrieveWorkflowStateResponse, - ), - "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/PropagateEmbeddedControlMessage": grpclib.const.Handler( - self.__rpc_propagate_embedded_control_message, - grpclib.const.Cardinality.UNARY_UNARY, - PropagateEmbeddedControlMessageRequest, - PropagateEmbeddedControlMessageResponse, - ), - "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/TakeGlobalCheckpoint": grpclib.const.Handler( - self.__rpc_take_global_checkpoint, - grpclib.const.Cardinality.UNARY_UNARY, - TakeGlobalCheckpointRequest, - TakeGlobalCheckpointResponse, - ), - "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/DebugCommand": grpclib.const.Handler( - self.__rpc_debug_command, - grpclib.const.Cardinality.UNARY_UNARY, - DebugCommandRequest, - EmptyReturn, - ), - 
"/org.apache.texera.amber.engine.architecture.rpc.ControllerService/EvaluatePythonExpression": grpclib.const.Handler( - self.__rpc_evaluate_python_expression, - grpclib.const.Cardinality.UNARY_UNARY, - EvaluatePythonExpressionRequest, - EvaluatePythonExpressionResponse, - ), - "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/ConsoleMessageTriggered": grpclib.const.Handler( - self.__rpc_console_message_triggered, - grpclib.const.Cardinality.UNARY_UNARY, - ConsoleMessageTriggeredRequest, - EmptyReturn, - ), - "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/PortCompleted": grpclib.const.Handler( - self.__rpc_port_completed, + "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendPing": grpclib.const.Handler( + self.__rpc_send_ping, grpclib.const.Cardinality.UNARY_UNARY, - PortCompletedRequest, - EmptyReturn, + Ping, + IntResponse, ), - "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/StartWorkflow": grpclib.const.Handler( - self.__rpc_start_workflow, + "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendPong": grpclib.const.Handler( + self.__rpc_send_pong, grpclib.const.Cardinality.UNARY_UNARY, - EmptyRequest, - StartWorkflowResponse, + Pong, + IntResponse, ), - "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/ResumeWorkflow": grpclib.const.Handler( - self.__rpc_resume_workflow, + "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendNested": grpclib.const.Handler( + self.__rpc_send_nested, grpclib.const.Cardinality.UNARY_UNARY, - EmptyRequest, - EmptyReturn, + Nested, + StringResponse, ), - "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/PauseWorkflow": grpclib.const.Handler( - self.__rpc_pause_workflow, + "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendPass": grpclib.const.Handler( + self.__rpc_send_pass, grpclib.const.Cardinality.UNARY_UNARY, - EmptyRequest, - EmptyReturn, + Pass, + StringResponse, ), - 
"/org.apache.texera.amber.engine.architecture.rpc.ControllerService/WorkerStateUpdated": grpclib.const.Handler( - self.__rpc_worker_state_updated, + "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendErrorCommand": grpclib.const.Handler( + self.__rpc_send_error_command, grpclib.const.Cardinality.UNARY_UNARY, - WorkerStateUpdatedRequest, - EmptyReturn, + ErrorCommand, + StringResponse, ), - "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/WorkerExecutionCompleted": grpclib.const.Handler( - self.__rpc_worker_execution_completed, + "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendRecursion": grpclib.const.Handler( + self.__rpc_send_recursion, grpclib.const.Cardinality.UNARY_UNARY, - EmptyRequest, - EmptyReturn, + Recursion, + StringResponse, ), - "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/JumpToOperator": grpclib.const.Handler( - self.__rpc_jump_to_operator, + "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendCollect": grpclib.const.Handler( + self.__rpc_send_collect, grpclib.const.Cardinality.UNARY_UNARY, - JumpToOperatorRequest, - EmptyReturn, + Collect, + StringResponse, ), - "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/LinkWorkers": grpclib.const.Handler( - self.__rpc_link_workers, + "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendGenerateNumber": grpclib.const.Handler( + self.__rpc_send_generate_number, grpclib.const.Cardinality.UNARY_UNARY, - LinkWorkersRequest, - EmptyReturn, + GenerateNumber, + IntResponse, ), - "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/ControllerInitiateQueryStatistics": grpclib.const.Handler( - self.__rpc_controller_initiate_query_statistics, + "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendMultiCall": grpclib.const.Handler( + self.__rpc_send_multi_call, grpclib.const.Cardinality.UNARY_UNARY, - QueryStatisticsRequest, - EmptyReturn, + MultiCall, + StringResponse, ), - 
"/org.apache.texera.amber.engine.architecture.rpc.ControllerService/RetryWorkflow": grpclib.const.Handler( - self.__rpc_retry_workflow, + "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendChain": grpclib.const.Handler( + self.__rpc_send_chain, grpclib.const.Cardinality.UNARY_UNARY, - RetryWorkflowRequest, - EmptyReturn, + Chain, + StringResponse, ), } -class RpcTesterBase(ServiceBase): +class WorkerServiceBase(ServiceBase): - async def send_ping(self, ping: "Ping") -> "IntResponse": + async def add_input_channel( + self, add_input_channel_request: "AddInputChannelRequest" + ) -> "EmptyReturn": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def send_pong(self, pong: "Pong") -> "IntResponse": + async def add_partitioning( + self, add_partitioning_request: "AddPartitioningRequest" + ) -> "EmptyReturn": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def send_nested(self, nested: "Nested") -> "StringResponse": + async def assign_port( + self, assign_port_request: "AssignPortRequest" + ) -> "EmptyReturn": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def send_pass(self, pass_: "Pass") -> "StringResponse": + async def finalize_checkpoint( + self, finalize_checkpoint_request: "FinalizeCheckpointRequest" + ) -> "FinalizeCheckpointResponse": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def send_error_command( - self, error_command: "ErrorCommand" - ) -> "StringResponse": + async def flush_network_buffer( + self, empty_request: "EmptyRequest" + ) -> "EmptyReturn": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def send_recursion(self, recursion: "Recursion") -> "StringResponse": + async def initialize_executor( + self, initialize_executor_request: "InitializeExecutorRequest" + ) -> "EmptyReturn": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def send_collect(self, collect: "Collect") -> "StringResponse": + async def 
open_executor(self, empty_request: "EmptyRequest") -> "EmptyReturn": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def send_generate_number( - self, generate_number: "GenerateNumber" - ) -> "IntResponse": + async def pause_worker( + self, empty_request: "EmptyRequest" + ) -> "WorkerStateResponse": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def send_multi_call(self, multi_call: "MultiCall") -> "StringResponse": + async def prepare_checkpoint( + self, prepare_checkpoint_request: "PrepareCheckpointRequest" + ) -> "EmptyReturn": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def send_chain(self, chain: "Chain") -> "StringResponse": + async def query_statistics( + self, empty_request: "EmptyRequest" + ) -> "WorkerMetricsResponse": + raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) + + async def resume_worker( + self, empty_request: "EmptyRequest" + ) -> "WorkerStateResponse": + raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) + + async def retrieve_state(self, empty_request: "EmptyRequest") -> "EmptyReturn": + raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) + + async def retry_current_tuple(self, empty_request: "EmptyRequest") -> "EmptyReturn": + raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) + + async def start_worker( + self, empty_request: "EmptyRequest" + ) -> "WorkerStateResponse": + raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) + + async def end_worker(self, empty_request: "EmptyRequest") -> "EmptyReturn": + raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) + + async def start_channel(self, empty_request: "EmptyRequest") -> "EmptyReturn": + raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) + + async def end_channel(self, empty_request: "EmptyRequest") -> "EmptyReturn": + raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) + + async def debug_command( + self, debug_command_request: "DebugCommandRequest" + ) -> 
"EmptyReturn": + raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) + + async def evaluate_python_expression( + self, evaluate_python_expression_request: "EvaluatePythonExpressionRequest" + ) -> "EvaluatedValue": + raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) + + async def no_operation(self, empty_request: "EmptyRequest") -> "EmptyReturn": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) async def update_executor( @@ -1759,70 +1659,71 @@ async def __rpc_resume_worker( self, stream: "grpclib.server.Stream[EmptyRequest, WorkerStateResponse]" ) -> None: request = await stream.recv_message() - response = await self.send_ping(request) + response = await self.resume_worker(request) await stream.send_message(response) - async def __rpc_send_pong( - self, stream: "grpclib.server.Stream[Pong, IntResponse]" + async def __rpc_retrieve_state( + self, stream: "grpclib.server.Stream[EmptyRequest, EmptyReturn]" ) -> None: request = await stream.recv_message() - response = await self.send_pong(request) + response = await self.retrieve_state(request) await stream.send_message(response) - async def __rpc_send_nested( - self, stream: "grpclib.server.Stream[Nested, StringResponse]" + async def __rpc_retry_current_tuple( + self, stream: "grpclib.server.Stream[EmptyRequest, EmptyReturn]" ) -> None: request = await stream.recv_message() - response = await self.send_nested(request) + response = await self.retry_current_tuple(request) await stream.send_message(response) - async def __rpc_send_pass( - self, stream: "grpclib.server.Stream[Pass, StringResponse]" + async def __rpc_start_worker( + self, stream: "grpclib.server.Stream[EmptyRequest, WorkerStateResponse]" ) -> None: request = await stream.recv_message() - response = await self.send_pass(request) + response = await self.start_worker(request) await stream.send_message(response) - async def __rpc_send_error_command( - self, stream: "grpclib.server.Stream[ErrorCommand, StringResponse]" + async def 
__rpc_end_worker( + self, stream: "grpclib.server.Stream[EmptyRequest, EmptyReturn]" ) -> None: request = await stream.recv_message() - response = await self.send_error_command(request) + response = await self.end_worker(request) await stream.send_message(response) - async def __rpc_send_recursion( - self, stream: "grpclib.server.Stream[Recursion, StringResponse]" + async def __rpc_start_channel( + self, stream: "grpclib.server.Stream[EmptyRequest, EmptyReturn]" ) -> None: request = await stream.recv_message() - response = await self.send_recursion(request) + response = await self.start_channel(request) await stream.send_message(response) - async def __rpc_send_collect( - self, stream: "grpclib.server.Stream[Collect, StringResponse]" + async def __rpc_end_channel( + self, stream: "grpclib.server.Stream[EmptyRequest, EmptyReturn]" ) -> None: request = await stream.recv_message() - response = await self.send_collect(request) + response = await self.end_channel(request) await stream.send_message(response) - async def __rpc_send_generate_number( - self, stream: "grpclib.server.Stream[GenerateNumber, IntResponse]" + async def __rpc_debug_command( + self, stream: "grpclib.server.Stream[DebugCommandRequest, EmptyReturn]" ) -> None: request = await stream.recv_message() - response = await self.send_generate_number(request) + response = await self.debug_command(request) await stream.send_message(response) - async def __rpc_send_multi_call( - self, stream: "grpclib.server.Stream[MultiCall, StringResponse]" + async def __rpc_evaluate_python_expression( + self, + stream: "grpclib.server.Stream[EvaluatePythonExpressionRequest, EvaluatedValue]", ) -> None: request = await stream.recv_message() - response = await self.send_multi_call(request) + response = await self.evaluate_python_expression(request) await stream.send_message(response) - async def __rpc_send_chain( - self, stream: "grpclib.server.Stream[Chain, StringResponse]" + async def __rpc_no_operation( + self, stream: 
"grpclib.server.Stream[EmptyRequest, EmptyReturn]" ) -> None: request = await stream.recv_message() - response = await self.send_chain(request) + response = await self.no_operation(request) await stream.send_message(response) async def __rpc_update_executor( @@ -1834,65 +1735,125 @@ async def __rpc_update_executor( def __mapping__(self) -> Dict[str, grpclib.const.Handler]: return { - "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendPing": grpclib.const.Handler( - self.__rpc_send_ping, + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/AddInputChannel": grpclib.const.Handler( + self.__rpc_add_input_channel, grpclib.const.Cardinality.UNARY_UNARY, - Ping, - IntResponse, + AddInputChannelRequest, + EmptyReturn, ), - "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendPong": grpclib.const.Handler( - self.__rpc_send_pong, + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/AddPartitioning": grpclib.const.Handler( + self.__rpc_add_partitioning, grpclib.const.Cardinality.UNARY_UNARY, - Pong, - IntResponse, + AddPartitioningRequest, + EmptyReturn, ), - "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendNested": grpclib.const.Handler( - self.__rpc_send_nested, + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/AssignPort": grpclib.const.Handler( + self.__rpc_assign_port, grpclib.const.Cardinality.UNARY_UNARY, - Nested, - StringResponse, + AssignPortRequest, + EmptyReturn, ), - "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendPass": grpclib.const.Handler( - self.__rpc_send_pass, + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/FinalizeCheckpoint": grpclib.const.Handler( + self.__rpc_finalize_checkpoint, grpclib.const.Cardinality.UNARY_UNARY, - Pass, - StringResponse, + FinalizeCheckpointRequest, + FinalizeCheckpointResponse, ), - "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendErrorCommand": grpclib.const.Handler( - self.__rpc_send_error_command, 
+ "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/FlushNetworkBuffer": grpclib.const.Handler( + self.__rpc_flush_network_buffer, grpclib.const.Cardinality.UNARY_UNARY, - ErrorCommand, - StringResponse, + EmptyRequest, + EmptyReturn, ), - "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendRecursion": grpclib.const.Handler( - self.__rpc_send_recursion, + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/InitializeExecutor": grpclib.const.Handler( + self.__rpc_initialize_executor, grpclib.const.Cardinality.UNARY_UNARY, - Recursion, - StringResponse, + InitializeExecutorRequest, + EmptyReturn, ), - "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendCollect": grpclib.const.Handler( - self.__rpc_send_collect, + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/OpenExecutor": grpclib.const.Handler( + self.__rpc_open_executor, grpclib.const.Cardinality.UNARY_UNARY, - Collect, - StringResponse, + EmptyRequest, + EmptyReturn, ), - "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendGenerateNumber": grpclib.const.Handler( - self.__rpc_send_generate_number, + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/PauseWorker": grpclib.const.Handler( + self.__rpc_pause_worker, grpclib.const.Cardinality.UNARY_UNARY, - GenerateNumber, - IntResponse, + EmptyRequest, + WorkerStateResponse, ), - "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendMultiCall": grpclib.const.Handler( - self.__rpc_send_multi_call, + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/PrepareCheckpoint": grpclib.const.Handler( + self.__rpc_prepare_checkpoint, grpclib.const.Cardinality.UNARY_UNARY, - MultiCall, - StringResponse, + PrepareCheckpointRequest, + EmptyReturn, ), - "/org.apache.texera.amber.engine.architecture.rpc.RPCTester/SendChain": grpclib.const.Handler( - self.__rpc_send_chain, + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/QueryStatistics": 
grpclib.const.Handler( + self.__rpc_query_statistics, grpclib.const.Cardinality.UNARY_UNARY, - Chain, - StringResponse, + EmptyRequest, + WorkerMetricsResponse, + ), + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/ResumeWorker": grpclib.const.Handler( + self.__rpc_resume_worker, + grpclib.const.Cardinality.UNARY_UNARY, + EmptyRequest, + WorkerStateResponse, + ), + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/RetrieveState": grpclib.const.Handler( + self.__rpc_retrieve_state, + grpclib.const.Cardinality.UNARY_UNARY, + EmptyRequest, + EmptyReturn, + ), + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/RetryCurrentTuple": grpclib.const.Handler( + self.__rpc_retry_current_tuple, + grpclib.const.Cardinality.UNARY_UNARY, + EmptyRequest, + EmptyReturn, + ), + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/StartWorker": grpclib.const.Handler( + self.__rpc_start_worker, + grpclib.const.Cardinality.UNARY_UNARY, + EmptyRequest, + WorkerStateResponse, + ), + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/EndWorker": grpclib.const.Handler( + self.__rpc_end_worker, + grpclib.const.Cardinality.UNARY_UNARY, + EmptyRequest, + EmptyReturn, + ), + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/StartChannel": grpclib.const.Handler( + self.__rpc_start_channel, + grpclib.const.Cardinality.UNARY_UNARY, + EmptyRequest, + EmptyReturn, + ), + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/EndChannel": grpclib.const.Handler( + self.__rpc_end_channel, + grpclib.const.Cardinality.UNARY_UNARY, + EmptyRequest, + EmptyReturn, + ), + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/DebugCommand": grpclib.const.Handler( + self.__rpc_debug_command, + grpclib.const.Cardinality.UNARY_UNARY, + DebugCommandRequest, + EmptyReturn, + ), + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/EvaluatePythonExpression": grpclib.const.Handler( + 
self.__rpc_evaluate_python_expression, + grpclib.const.Cardinality.UNARY_UNARY, + EvaluatePythonExpressionRequest, + EvaluatedValue, + ), + "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/NoOperation": grpclib.const.Handler( + self.__rpc_no_operation, + grpclib.const.Cardinality.UNARY_UNARY, + EmptyRequest, + EmptyReturn, ), "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/UpdateExecutor": grpclib.const.Handler( self.__rpc_update_executor, @@ -1903,92 +1864,78 @@ def __mapping__(self) -> Dict[str, grpclib.const.Handler]: } -class WorkerServiceBase(ServiceBase): - - async def add_input_channel( - self, add_input_channel_request: "AddInputChannelRequest" - ) -> "EmptyReturn": - raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - - async def add_partitioning( - self, add_partitioning_request: "AddPartitioningRequest" - ) -> "EmptyReturn": - raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) +class ControllerServiceBase(ServiceBase): - async def assign_port( - self, assign_port_request: "AssignPortRequest" - ) -> "EmptyReturn": + async def retrieve_workflow_state( + self, empty_request: "EmptyRequest" + ) -> "RetrieveWorkflowStateResponse": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def finalize_checkpoint( - self, finalize_checkpoint_request: "FinalizeCheckpointRequest" - ) -> "FinalizeCheckpointResponse": + async def propagate_embedded_control_message( + self, + propagate_embedded_control_message_request: "PropagateEmbeddedControlMessageRequest", + ) -> "PropagateEmbeddedControlMessageResponse": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def flush_network_buffer( - self, empty_request: "EmptyRequest" - ) -> "EmptyReturn": + async def take_global_checkpoint( + self, take_global_checkpoint_request: "TakeGlobalCheckpointRequest" + ) -> "TakeGlobalCheckpointResponse": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def initialize_executor( - self, 
initialize_executor_request: "InitializeExecutorRequest" + async def debug_command( + self, debug_command_request: "DebugCommandRequest" ) -> "EmptyReturn": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def open_executor(self, empty_request: "EmptyRequest") -> "EmptyReturn": + async def evaluate_python_expression( + self, evaluate_python_expression_request: "EvaluatePythonExpressionRequest" + ) -> "EvaluatePythonExpressionResponse": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def pause_worker( - self, empty_request: "EmptyRequest" - ) -> "WorkerStateResponse": + async def console_message_triggered( + self, console_message_triggered_request: "ConsoleMessageTriggeredRequest" + ) -> "EmptyReturn": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def prepare_checkpoint( - self, prepare_checkpoint_request: "PrepareCheckpointRequest" + async def port_completed( + self, port_completed_request: "PortCompletedRequest" ) -> "EmptyReturn": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def query_statistics( + async def start_workflow( self, empty_request: "EmptyRequest" - ) -> "WorkerMetricsResponse": + ) -> "StartWorkflowResponse": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def resume_worker( - self, empty_request: "EmptyRequest" - ) -> "WorkerStateResponse": + async def resume_workflow(self, empty_request: "EmptyRequest") -> "EmptyReturn": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def retrieve_state(self, empty_request: "EmptyRequest") -> "EmptyReturn": + async def pause_workflow(self, empty_request: "EmptyRequest") -> "EmptyReturn": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def retry_current_tuple(self, empty_request: "EmptyRequest") -> "EmptyReturn": + async def worker_state_updated( + self, worker_state_updated_request: "WorkerStateUpdatedRequest" + ) -> "EmptyReturn": raise 
grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def start_worker( + async def worker_execution_completed( self, empty_request: "EmptyRequest" - ) -> "WorkerStateResponse": - raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - - async def end_worker(self, empty_request: "EmptyRequest") -> "EmptyReturn": - raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - - async def start_channel(self, empty_request: "EmptyRequest") -> "EmptyReturn": - raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - - async def end_channel(self, empty_request: "EmptyRequest") -> "EmptyReturn": + ) -> "EmptyReturn": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def debug_command( - self, debug_command_request: "DebugCommandRequest" + async def link_workers( + self, link_workers_request: "LinkWorkersRequest" ) -> "EmptyReturn": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def evaluate_python_expression( - self, evaluate_python_expression_request: "EvaluatePythonExpressionRequest" - ) -> "EvaluatedValue": + async def controller_initiate_query_statistics( + self, query_statistics_request: "QueryStatisticsRequest" + ) -> "EmptyReturn": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def no_operation(self, empty_request: "EmptyRequest") -> "EmptyReturn": + async def retry_workflow( + self, retry_workflow_request: "RetryWorkflowRequest" + ) -> "EmptyReturn": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) async def reconfigure_workflow( @@ -2001,142 +1948,109 @@ async def __rpc_retrieve_workflow_state( stream: "grpclib.server.Stream[EmptyRequest, RetrieveWorkflowStateResponse]", ) -> None: request = await stream.recv_message() - response = await self.add_input_channel(request) - await stream.send_message(response) - - async def __rpc_add_partitioning( - self, stream: "grpclib.server.Stream[AddPartitioningRequest, EmptyReturn]" - ) -> None: - request = await stream.recv_message() - 
response = await self.add_partitioning(request) - await stream.send_message(response) - - async def __rpc_assign_port( - self, stream: "grpclib.server.Stream[AssignPortRequest, EmptyReturn]" - ) -> None: - request = await stream.recv_message() - response = await self.assign_port(request) + response = await self.retrieve_workflow_state(request) await stream.send_message(response) - async def __rpc_finalize_checkpoint( + async def __rpc_propagate_embedded_control_message( self, - stream: "grpclib.server.Stream[FinalizeCheckpointRequest, FinalizeCheckpointResponse]", - ) -> None: - request = await stream.recv_message() - response = await self.finalize_checkpoint(request) - await stream.send_message(response) - - async def __rpc_flush_network_buffer( - self, stream: "grpclib.server.Stream[EmptyRequest, EmptyReturn]" - ) -> None: - request = await stream.recv_message() - response = await self.flush_network_buffer(request) - await stream.send_message(response) - - async def __rpc_initialize_executor( - self, stream: "grpclib.server.Stream[InitializeExecutorRequest, EmptyReturn]" + stream: "grpclib.server.Stream[PropagateEmbeddedControlMessageRequest, PropagateEmbeddedControlMessageResponse]", ) -> None: request = await stream.recv_message() - response = await self.initialize_executor(request) + response = await self.propagate_embedded_control_message(request) await stream.send_message(response) - async def __rpc_open_executor( - self, stream: "grpclib.server.Stream[EmptyRequest, EmptyReturn]" + async def __rpc_take_global_checkpoint( + self, + stream: "grpclib.server.Stream[TakeGlobalCheckpointRequest, TakeGlobalCheckpointResponse]", ) -> None: request = await stream.recv_message() - response = await self.open_executor(request) + response = await self.take_global_checkpoint(request) await stream.send_message(response) - async def __rpc_pause_worker( - self, stream: "grpclib.server.Stream[EmptyRequest, WorkerStateResponse]" + async def __rpc_debug_command( + self, stream: 
"grpclib.server.Stream[DebugCommandRequest, EmptyReturn]" ) -> None: request = await stream.recv_message() - response = await self.pause_worker(request) + response = await self.debug_command(request) await stream.send_message(response) - async def __rpc_prepare_checkpoint( - self, stream: "grpclib.server.Stream[PrepareCheckpointRequest, EmptyReturn]" + async def __rpc_evaluate_python_expression( + self, + stream: "grpclib.server.Stream[EvaluatePythonExpressionRequest, EvaluatePythonExpressionResponse]", ) -> None: request = await stream.recv_message() - response = await self.prepare_checkpoint(request) + response = await self.evaluate_python_expression(request) await stream.send_message(response) - async def __rpc_query_statistics( - self, stream: "grpclib.server.Stream[EmptyRequest, WorkerMetricsResponse]" + async def __rpc_console_message_triggered( + self, + stream: "grpclib.server.Stream[ConsoleMessageTriggeredRequest, EmptyReturn]", ) -> None: request = await stream.recv_message() - response = await self.query_statistics(request) + response = await self.console_message_triggered(request) await stream.send_message(response) - async def __rpc_resume_worker( - self, stream: "grpclib.server.Stream[EmptyRequest, WorkerStateResponse]" + async def __rpc_port_completed( + self, stream: "grpclib.server.Stream[PortCompletedRequest, EmptyReturn]" ) -> None: request = await stream.recv_message() - response = await self.resume_worker(request) + response = await self.port_completed(request) await stream.send_message(response) - async def __rpc_retrieve_state( - self, stream: "grpclib.server.Stream[EmptyRequest, EmptyReturn]" + async def __rpc_start_workflow( + self, stream: "grpclib.server.Stream[EmptyRequest, StartWorkflowResponse]" ) -> None: request = await stream.recv_message() - response = await self.retrieve_state(request) + response = await self.start_workflow(request) await stream.send_message(response) - async def __rpc_retry_current_tuple( + async def 
__rpc_resume_workflow( self, stream: "grpclib.server.Stream[EmptyRequest, EmptyReturn]" ) -> None: request = await stream.recv_message() - response = await self.retry_current_tuple(request) - await stream.send_message(response) - - async def __rpc_start_worker( - self, stream: "grpclib.server.Stream[EmptyRequest, WorkerStateResponse]" - ) -> None: - request = await stream.recv_message() - response = await self.start_worker(request) + response = await self.resume_workflow(request) await stream.send_message(response) - async def __rpc_end_worker( + async def __rpc_pause_workflow( self, stream: "grpclib.server.Stream[EmptyRequest, EmptyReturn]" ) -> None: request = await stream.recv_message() - response = await self.end_worker(request) + response = await self.pause_workflow(request) await stream.send_message(response) - async def __rpc_start_channel( - self, stream: "grpclib.server.Stream[EmptyRequest, EmptyReturn]" + async def __rpc_worker_state_updated( + self, stream: "grpclib.server.Stream[WorkerStateUpdatedRequest, EmptyReturn]" ) -> None: request = await stream.recv_message() - response = await self.start_channel(request) + response = await self.worker_state_updated(request) await stream.send_message(response) - async def __rpc_end_channel( + async def __rpc_worker_execution_completed( self, stream: "grpclib.server.Stream[EmptyRequest, EmptyReturn]" ) -> None: request = await stream.recv_message() - response = await self.end_channel(request) + response = await self.worker_execution_completed(request) await stream.send_message(response) - async def __rpc_debug_command( - self, stream: "grpclib.server.Stream[DebugCommandRequest, EmptyReturn]" + async def __rpc_link_workers( + self, stream: "grpclib.server.Stream[LinkWorkersRequest, EmptyReturn]" ) -> None: request = await stream.recv_message() - response = await self.debug_command(request) + response = await self.link_workers(request) await stream.send_message(response) - async def 
__rpc_evaluate_python_expression( - self, - stream: "grpclib.server.Stream[EvaluatePythonExpressionRequest, EvaluatedValue]", + async def __rpc_controller_initiate_query_statistics( + self, stream: "grpclib.server.Stream[QueryStatisticsRequest, EmptyReturn]" ) -> None: request = await stream.recv_message() - response = await self.evaluate_python_expression(request) + response = await self.controller_initiate_query_statistics(request) await stream.send_message(response) - async def __rpc_no_operation( - self, stream: "grpclib.server.Stream[EmptyRequest, EmptyReturn]" + async def __rpc_retry_workflow( + self, stream: "grpclib.server.Stream[RetryWorkflowRequest, EmptyReturn]" ) -> None: request = await stream.recv_message() - response = await self.no_operation(request) + response = await self.retry_workflow(request) await stream.send_message(response) async def __rpc_reconfigure_workflow( @@ -2148,124 +2062,94 @@ async def __rpc_reconfigure_workflow( def __mapping__(self) -> Dict[str, grpclib.const.Handler]: return { - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/AddInputChannel": grpclib.const.Handler( - self.__rpc_add_input_channel, - grpclib.const.Cardinality.UNARY_UNARY, - AddInputChannelRequest, - EmptyReturn, - ), - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/AddPartitioning": grpclib.const.Handler( - self.__rpc_add_partitioning, + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/RetrieveWorkflowState": grpclib.const.Handler( + self.__rpc_retrieve_workflow_state, grpclib.const.Cardinality.UNARY_UNARY, - AddPartitioningRequest, - EmptyReturn, + EmptyRequest, + RetrieveWorkflowStateResponse, ), - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/AssignPort": grpclib.const.Handler( - self.__rpc_assign_port, + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/PropagateEmbeddedControlMessage": grpclib.const.Handler( + self.__rpc_propagate_embedded_control_message, 
grpclib.const.Cardinality.UNARY_UNARY, - AssignPortRequest, - EmptyReturn, + PropagateEmbeddedControlMessageRequest, + PropagateEmbeddedControlMessageResponse, ), - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/FinalizeCheckpoint": grpclib.const.Handler( - self.__rpc_finalize_checkpoint, + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/TakeGlobalCheckpoint": grpclib.const.Handler( + self.__rpc_take_global_checkpoint, grpclib.const.Cardinality.UNARY_UNARY, - FinalizeCheckpointRequest, - FinalizeCheckpointResponse, + TakeGlobalCheckpointRequest, + TakeGlobalCheckpointResponse, ), - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/FlushNetworkBuffer": grpclib.const.Handler( - self.__rpc_flush_network_buffer, + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/DebugCommand": grpclib.const.Handler( + self.__rpc_debug_command, grpclib.const.Cardinality.UNARY_UNARY, - EmptyRequest, + DebugCommandRequest, EmptyReturn, ), - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/InitializeExecutor": grpclib.const.Handler( - self.__rpc_initialize_executor, + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/EvaluatePythonExpression": grpclib.const.Handler( + self.__rpc_evaluate_python_expression, grpclib.const.Cardinality.UNARY_UNARY, - InitializeExecutorRequest, - EmptyReturn, + EvaluatePythonExpressionRequest, + EvaluatePythonExpressionResponse, ), - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/OpenExecutor": grpclib.const.Handler( - self.__rpc_open_executor, + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/ConsoleMessageTriggered": grpclib.const.Handler( + self.__rpc_console_message_triggered, grpclib.const.Cardinality.UNARY_UNARY, - EmptyRequest, + ConsoleMessageTriggeredRequest, EmptyReturn, ), - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/PauseWorker": grpclib.const.Handler( - self.__rpc_pause_worker, - 
grpclib.const.Cardinality.UNARY_UNARY, - EmptyRequest, - WorkerStateResponse, - ), - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/PrepareCheckpoint": grpclib.const.Handler( - self.__rpc_prepare_checkpoint, + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/PortCompleted": grpclib.const.Handler( + self.__rpc_port_completed, grpclib.const.Cardinality.UNARY_UNARY, - PrepareCheckpointRequest, + PortCompletedRequest, EmptyReturn, ), - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/QueryStatistics": grpclib.const.Handler( - self.__rpc_query_statistics, - grpclib.const.Cardinality.UNARY_UNARY, - EmptyRequest, - WorkerMetricsResponse, - ), - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/ResumeWorker": grpclib.const.Handler( - self.__rpc_resume_worker, + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/StartWorkflow": grpclib.const.Handler( + self.__rpc_start_workflow, grpclib.const.Cardinality.UNARY_UNARY, EmptyRequest, - WorkerStateResponse, + StartWorkflowResponse, ), - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/RetrieveState": grpclib.const.Handler( - self.__rpc_retrieve_state, + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/ResumeWorkflow": grpclib.const.Handler( + self.__rpc_resume_workflow, grpclib.const.Cardinality.UNARY_UNARY, EmptyRequest, EmptyReturn, ), - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/RetryCurrentTuple": grpclib.const.Handler( - self.__rpc_retry_current_tuple, + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/PauseWorkflow": grpclib.const.Handler( + self.__rpc_pause_workflow, grpclib.const.Cardinality.UNARY_UNARY, EmptyRequest, EmptyReturn, ), - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/StartWorker": grpclib.const.Handler( - self.__rpc_start_worker, - grpclib.const.Cardinality.UNARY_UNARY, - EmptyRequest, - WorkerStateResponse, - ), - 
"/org.apache.texera.amber.engine.architecture.rpc.WorkerService/EndWorker": grpclib.const.Handler( - self.__rpc_end_worker, + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/WorkerStateUpdated": grpclib.const.Handler( + self.__rpc_worker_state_updated, grpclib.const.Cardinality.UNARY_UNARY, - EmptyRequest, + WorkerStateUpdatedRequest, EmptyReturn, ), - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/StartChannel": grpclib.const.Handler( - self.__rpc_start_channel, + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/WorkerExecutionCompleted": grpclib.const.Handler( + self.__rpc_worker_execution_completed, grpclib.const.Cardinality.UNARY_UNARY, EmptyRequest, EmptyReturn, ), - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/EndChannel": grpclib.const.Handler( - self.__rpc_end_channel, + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/LinkWorkers": grpclib.const.Handler( + self.__rpc_link_workers, grpclib.const.Cardinality.UNARY_UNARY, - EmptyRequest, + LinkWorkersRequest, EmptyReturn, ), - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/DebugCommand": grpclib.const.Handler( - self.__rpc_debug_command, + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/ControllerInitiateQueryStatistics": grpclib.const.Handler( + self.__rpc_controller_initiate_query_statistics, grpclib.const.Cardinality.UNARY_UNARY, - DebugCommandRequest, + QueryStatisticsRequest, EmptyReturn, ), - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/EvaluatePythonExpression": grpclib.const.Handler( - self.__rpc_evaluate_python_expression, - grpclib.const.Cardinality.UNARY_UNARY, - EvaluatePythonExpressionRequest, - EvaluatedValue, - ), - "/org.apache.texera.amber.engine.architecture.rpc.WorkerService/NoOperation": grpclib.const.Handler( - self.__rpc_no_operation, + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/RetryWorkflow": 
grpclib.const.Handler( + self.__rpc_retry_workflow, grpclib.const.Cardinality.UNARY_UNARY, - EmptyRequest, + RetryWorkflowRequest, EmptyReturn, ), "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/ReconfigureWorkflow": grpclib.const.Handler( diff --git a/amber/src/main/python/proto/org/apache/texera/amber/engine/architecture/sendsemantics/__init__.py b/amber/src/main/python/proto/org/apache/texera/amber/engine/architecture/sendsemantics/__init__.py index 94ed31cce3f..bc241806b5c 100644 --- a/amber/src/main/python/proto/org/apache/texera/amber/engine/architecture/sendsemantics/__init__.py +++ b/amber/src/main/python/proto/org/apache/texera/amber/engine/architecture/sendsemantics/__init__.py @@ -4,7 +4,9 @@ # This file has been @generated from dataclasses import dataclass -from typing import List +from typing import ( + List, +) import betterproto diff --git a/amber/src/main/python/proto/org/apache/texera/amber/engine/architecture/worker/__init__.py b/amber/src/main/python/proto/org/apache/texera/amber/engine/architecture/worker/__init__.py index 072e7c8ce65..6a7b210e185 100644 --- a/amber/src/main/python/proto/org/apache/texera/amber/engine/architecture/worker/__init__.py +++ b/amber/src/main/python/proto/org/apache/texera/amber/engine/architecture/worker/__init__.py @@ -4,7 +4,9 @@ # This file has been @generated from dataclasses import dataclass -from typing import List +from typing import ( + List, +) import betterproto diff --git a/amber/src/main/python/proto/org/apache/texera/amber/engine/common/__init__.py b/amber/src/main/python/proto/org/apache/texera/amber/engine/common/__init__.py index 8c1464cc76c..55c789aa395 100644 --- a/amber/src/main/python/proto/org/apache/texera/amber/engine/common/__init__.py +++ b/amber/src/main/python/proto/org/apache/texera/amber/engine/common/__init__.py @@ -18,27 +18,6 @@ ) -@dataclass(eq=False, repr=False) -class Backpressure(betterproto.Message): - enable_backpressure: bool = betterproto.bool_field(1) - - 
-@dataclass(eq=False, repr=False) -class CreditUpdate(betterproto.Message): - pass - - -@dataclass(eq=False, repr=False) -class ActorCommand(betterproto.Message): - backpressure: "Backpressure" = betterproto.message_field(1, group="sealed_value") - credit_update: "CreditUpdate" = betterproto.message_field(2, group="sealed_value") - - -@dataclass(eq=False, repr=False) -class PythonActorMessage(betterproto.Message): - payload: "ActorCommand" = betterproto.message_field(1) - - @dataclass(eq=False, repr=False) class DirectControlMessagePayloadV2(betterproto.Message): control_invocation: "_architecture_rpc__.ControlInvocation" = ( @@ -154,3 +133,24 @@ class ExecutionMetadataStore(betterproto.Message): fatal_errors: List["__core__.WorkflowFatalError"] = betterproto.message_field(2) execution_id: "__core__.ExecutionIdentity" = betterproto.message_field(3) is_recovering: bool = betterproto.bool_field(4) + + +@dataclass(eq=False, repr=False) +class Backpressure(betterproto.Message): + enable_backpressure: bool = betterproto.bool_field(1) + + +@dataclass(eq=False, repr=False) +class CreditUpdate(betterproto.Message): + pass + + +@dataclass(eq=False, repr=False) +class ActorCommand(betterproto.Message): + backpressure: "Backpressure" = betterproto.message_field(1, group="sealed_value") + credit_update: "CreditUpdate" = betterproto.message_field(2, group="sealed_value") + + +@dataclass(eq=False, repr=False) +class PythonActorMessage(betterproto.Message): + payload: "ActorCommand" = betterproto.message_field(1) From c3ac307fbe5cad8accd906e82ef12185d02f3c05 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Tue, 28 Apr 2026 21:25:37 -0700 Subject: [PATCH 056/152] update --- .../amber/engine/architecture/rpc/__init__.py | 43 +++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/amber/src/main/python/proto/org/apache/texera/amber/engine/architecture/rpc/__init__.py b/amber/src/main/python/proto/org/apache/texera/amber/engine/architecture/rpc/__init__.py index 
77d51933af6..524c79fdc3b 100644 --- a/amber/src/main/python/proto/org/apache/texera/amber/engine/architecture/rpc/__init__.py +++ b/amber/src/main/python/proto/org/apache/texera/amber/engine/architecture/rpc/__init__.py @@ -101,6 +101,9 @@ class ControlRequest(betterproto.Message): workflow_reconfigure_request: "WorkflowReconfigureRequest" = ( betterproto.message_field(10, group="sealed_value") ) + jump_to_operator_request: "JumpToOperatorRequest" = betterproto.message_field( + 12, group="sealed_value" + ) add_input_channel_request: "AddInputChannelRequest" = betterproto.message_field( 50, group="sealed_value" ) @@ -385,6 +388,11 @@ class QueryStatisticsRequest(betterproto.Message): update_target: "StatisticsUpdateTarget" = betterproto.enum_field(2) +@dataclass(eq=False, repr=False) +class JumpToOperatorRequest(betterproto.Message): + target_operator_id: "___core__.OperatorIdentity" = betterproto.message_field(1) + + @dataclass(eq=False, repr=False) class ControlReturn(betterproto.Message): """The generic return message""" @@ -1251,6 +1259,23 @@ async def worker_execution_completed( metadata=metadata, ) + async def jump_to_operator( + self, + jump_to_operator_request: "JumpToOperatorRequest", + *, + timeout: Optional[float] = None, + deadline: Optional["Deadline"] = None, + metadata: Optional["MetadataLike"] = None + ) -> "EmptyReturn": + return await self._unary_unary( + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/JumpToOperator", + jump_to_operator_request, + EmptyReturn, + timeout=timeout, + deadline=deadline, + metadata=metadata, + ) + async def link_workers( self, link_workers_request: "LinkWorkersRequest", @@ -1923,6 +1948,11 @@ async def worker_execution_completed( ) -> "EmptyReturn": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) + async def jump_to_operator( + self, jump_to_operator_request: "JumpToOperatorRequest" + ) -> "EmptyReturn": + raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) + async def 
link_workers( self, link_workers_request: "LinkWorkersRequest" ) -> "EmptyReturn": @@ -2032,6 +2062,13 @@ async def __rpc_worker_execution_completed( response = await self.worker_execution_completed(request) await stream.send_message(response) + async def __rpc_jump_to_operator( + self, stream: "grpclib.server.Stream[JumpToOperatorRequest, EmptyReturn]" + ) -> None: + request = await stream.recv_message() + response = await self.jump_to_operator(request) + await stream.send_message(response) + async def __rpc_link_workers( self, stream: "grpclib.server.Stream[LinkWorkersRequest, EmptyReturn]" ) -> None: @@ -2134,6 +2171,12 @@ def __mapping__(self) -> Dict[str, grpclib.const.Handler]: EmptyRequest, EmptyReturn, ), + "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/JumpToOperator": grpclib.const.Handler( + self.__rpc_jump_to_operator, + grpclib.const.Cardinality.UNARY_UNARY, + JumpToOperatorRequest, + EmptyReturn, + ), "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/LinkWorkers": grpclib.const.Handler( self.__rpc_link_workers, grpclib.const.Cardinality.UNARY_UNARY, From 62d285443ccde08478fe426a7636f1c356d38165 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Wed, 29 Apr 2026 16:19:23 -0700 Subject: [PATCH 057/152] update --- .../python/core/runnables/data_processor.py | 26 +-- .../main/python/core/runnables/main_loop.py | 8 +- .../python/core/runnables/test_main_loop.py | 175 +++++++++++++++++- 3 files changed, 177 insertions(+), 32 deletions(-) diff --git a/amber/src/main/python/core/runnables/data_processor.py b/amber/src/main/python/core/runnables/data_processor.py index 815e85a6446..776aa35b87a 100644 --- a/amber/src/main/python/core/runnables/data_processor.py +++ b/amber/src/main/python/core/runnables/data_processor.py @@ -49,16 +49,14 @@ def run(self) -> None: with self._context.tuple_processing_manager.context_switch_condition: self._context.tuple_processing_manager.context_switch_condition.wait() self._running.set() - 
self._switch_context() while self._running.is_set(): - marker = self._context.tuple_processing_manager.get_internal_marker() - state = self._context.state_processing_manager.get_input_state() - tuple_ = self._context.tuple_processing_manager.current_input_tuple - if marker is not None: - self.process_internal_marker(marker) - elif state is not None: - self.process_state(state) - elif tuple_ is not None: + tpm = self._context.tuple_processing_manager + spm = self._context.state_processing_manager + if tpm.current_internal_marker is not None: + self.process_internal_marker(tpm.get_internal_marker()) + elif spm.current_input_state is not None: + self.process_state(spm.get_input_state()) + elif tpm.current_input_tuple is not None: self.process_tuple() else: raise RuntimeError("No marker or tuple to process.") @@ -85,9 +83,6 @@ def process_internal_marker(self, internal_marker: InternalMarker) -> None: self._context.exception_manager.set_exception_info(exc_info) self._report_exception(exc_info) - finally: - self._switch_context() - def process_state(self, state: State) -> None: """ Process an input marker by invoking appropriate state @@ -100,7 +95,6 @@ def process_state(self, state: State) -> None: self._context.worker_id, self._context.console_message_manager.print_buf, ): - self._switch_context() self._set_output_state(executor.process_state(state, port_id)) except Exception as err: @@ -109,9 +103,6 @@ def process_state(self, state: State) -> None: self._context.exception_manager.set_exception_info(exc_info) self._report_exception(exc_info) - finally: - self._switch_context() - def process_tuple(self) -> None: """ Process an input tuple by invoking the executor's tuple processing method. 
@@ -134,9 +125,6 @@ def process_tuple(self) -> None: self._context.exception_manager.set_exception_info(exc_info) self._report_exception(exc_info) - finally: - self._switch_context() - def _set_output_tuple(self, output_iterator: Iterator[Optional[TupleLike]]) -> None: """ Set the output tuple after processing by the executor. diff --git a/amber/src/main/python/core/runnables/main_loop.py b/amber/src/main/python/core/runnables/main_loop.py index d42aedc7e44..8454808b058 100644 --- a/amber/src/main/python/core/runnables/main_loop.py +++ b/amber/src/main/python/core/runnables/main_loop.py @@ -225,9 +225,13 @@ def process_input_tuple(self) -> None: ) def process_input_state(self) -> None: + # Single switch handshake: DataProc parks at the run-loop's + # end-of-body switch (line 65) between tasks, so one switch from + # MainLoop drives a full pick-up -> executor -> output -> park-back + # cycle. By the time the switch returns, current_output_state holds + # the freshly produced output. self._switch_context() output_state = self.context.state_processing_manager.get_output_state() - self._switch_context() if output_state is not None: if isinstance(self.context.executor_manager.executor, LoopEndOperator): self.context.output_manager.reset_output_storage() @@ -287,7 +291,6 @@ def _process_tuple(self, tuple_: Tuple) -> None: def _process_state(self, state_: State) -> None: self.context.state_processing_manager.current_input_state = state_ - self._switch_context() self.process_input_state() self._check_and_process_control() @@ -376,7 +379,6 @@ def _process_ecm(self, ecm_element: ECMElement): if ecm.ecm_type != EmbeddedControlMessageType.NO_ALIGNMENT: self.context.pause_manager.resume(PauseType.ECM_PAUSE) - self._switch_context() if self.context.tuple_processing_manager.current_internal_marker: { StartChannel: self._process_start_channel, diff --git a/amber/src/main/python/core/runnables/test_main_loop.py b/amber/src/main/python/core/runnables/test_main_loop.py index 
62065e1b8ce..d99049a9a3b 100644 --- a/amber/src/main/python/core/runnables/test_main_loop.py +++ b/amber/src/main/python/core/runnables/test_main_loop.py @@ -361,6 +361,16 @@ def mock_initialize_executor( ) return DCMElement(tag=mock_control_input_channel, payload=payload) + @pytest.fixture + def mock_state_data_elements(self, mock_data_input_channel): + return [ + DataElement( + tag=mock_data_input_channel, + payload=StateFrame(frame={"value": value}), + ) + for value in (1, 2, 3, 4) + ] + @pytest.fixture def mock_initialize_batch_count_executor( self, @@ -1101,17 +1111,15 @@ def process_state(state: State, port: int) -> State: lambda state: [(mock_data_output_channel.to_worker_id, StateFrame(state))], ) - switch_count = {"value": 0} - def fake_switch_context(): - switch_count["value"] += 1 - # xinyuan-state-only still uses the original two-switch state handshake: - # the DataProcessor produces output during the first switch of each - # process_input_state() call, before MainLoop reads current_output_state. - if switch_count["value"] % 2 == 1: - current_input_state = ( - main_loop.context.state_processing_manager.current_input_state - ) + # process_input_state now uses a single switch per call, mirroring + # the per-iteration switch in process_tuple_with_udf. Each switch + # simulates DataProc consuming the queued input state and writing + # current_output_state. 
+ current_input_state = ( + main_loop.context.state_processing_manager.current_input_state + ) + if current_input_state is not None: main_loop.context.state_processing_manager.current_output_state = ( DummyExecutor.process_state(current_input_state, 0) ) @@ -1290,3 +1298,150 @@ def channel_size(channel: ChannelIdentity) -> int: "test-1" ] == b"pickle " + pickle.dumps(mock_binary_tuple["test-1"]) reraise() + + @pytest.mark.timeout(5) + def test_main_loop_thread_can_process_state( + self, + mock_data_output_channel, + mock_control_output_channel, + input_queue, + output_queue, + main_loop, + main_loop_thread, + mock_assign_input_port, + mock_assign_output_port, + mock_add_input_channel, + mock_add_partitioning, + mock_initialize_executor, + mock_data_element, + mock_state_data_elements, + mock_end_of_upstream, + command_sequence, + reraise, + ): + # End-to-end coverage of the state-processing path through the real + # MainLoop + DataProcessor threads. + # + # The cooperative-threading handshake works like this: + # - DataProcessor.run() peeks current_internal_marker / + # current_input_state / current_input_tuple every iteration and + # consumes only the slot whose branch it takes -- unhandled + # inputs survive into the next iteration. + # - process_state runs the executor inside replace_print() and + # writes the result to current_output_state, then notifies + # MainLoop via the finally _switch_context(). + # - MainLoop.process_input_state() switches twice and reads + # current_output_state after both switches, so the read sees + # the value DataProc has just written. + # + # The expected behavior is therefore: each state produces its own + # output in its own cycle (no lag), and an EndChannel ECM after + # the last state produces an additional output via + # produce_state_on_finish. 
+ main_loop_thread.start() + + for setup_msg in [ + mock_assign_input_port, + mock_assign_output_port, + mock_add_input_channel, + mock_add_partitioning, + mock_initialize_executor, + ]: + input_queue.put(setup_msg) + assert output_queue.get() == DCMElement( + tag=mock_control_output_channel, + payload=DirectControlMessagePayloadV2( + return_invocation=ReturnInvocation( + command_id=command_sequence, + return_value=ControlReturn(empty_return=EmptyReturn()), + ) + ), + ) + + # Replace the EchoOperator that mock_initialize_executor loaded with + # an in-process executor that tags processed states and emits a + # finish marker on EndChannel. Going through the InitializeExecutor + # RPC above sets up the rest of the worker state (output schema, + # partitioning bookkeeping); swapping the executor instance here + # lets the test observe whether process_state actually runs without + # depending on Python's cross-test module caching for the loaded + # operator class. + class StateProcessingExecutor: + @staticmethod + def process_tuple(tuple_, port): + yield tuple_ + + @staticmethod + def process_state(state, port): + return {**state, "processed_marker": "executed", "port": port} + + @staticmethod + def produce_state_on_finish(port): + return {"finish_marker": "produce_state_on_finish_ran"} + + @staticmethod + def on_finish(port): + yield + + @staticmethod + def close(): + pass + + main_loop.context.executor_manager.executor = StateProcessingExecutor() + + # Send four states directly -- no warm-up tuple needed. With the + # init switch in DataProc.run() removed, MainLoop's first switch + # lands DataProc directly in the while-loop where it processes + # the queued state, so even the first state cycle works. 
+ for state_element in mock_state_data_elements: + input_queue.put(state_element) + + for expected_value in (1, 2, 3, 4): + output_data_element: DataElement = output_queue.get() + assert output_data_element.tag == mock_data_output_channel + assert isinstance(output_data_element.payload, StateFrame), ( + f"expected StateFrame for value={expected_value}, got " + f"{type(output_data_element.payload).__name__}" + ) + output_state = output_data_element.payload.frame + assert output_state["value"] == expected_value, ( + f"state outputs arrived out of order: expected value=" + f"{expected_value}, got value={output_state['value']}" + ) + assert output_state["processed_marker"] == "executed" + assert output_state["port"] == 0 + + # Send EndChannel to drive _process_end_channel. The executor's + # produce_state_on_finish writes a finish-marker state into + # current_output_state inside DataProc's process_internal_marker; + # MainLoop's process_input_state then emits it. + input_queue.put(mock_end_of_upstream) + + # Drain the control reply messages so the next data + # output_queue.get() returns the post-EndChannel data emission. 
+ output_queue.disable_data(InternalQueue.DisableType.DISABLE_BY_PAUSE) + for _ in range(3): + control_reply = output_queue.get() + assert isinstance(control_reply, DCMElement), ( + f"expected DCMElement during EndChannel teardown, got " + f"{type(control_reply).__name__}" + ) + output_queue.enable_data(InternalQueue.DisableType.DISABLE_BY_PAUSE) + + end_channel_state_output: DataElement = output_queue.get() + assert end_channel_state_output.tag == mock_data_output_channel + assert isinstance(end_channel_state_output.payload, StateFrame), ( + f"expected StateFrame for the EndChannel-driven emission, got " + f"{type(end_channel_state_output.payload).__name__}" + ) + end_channel_state = end_channel_state_output.payload.frame + assert "finish_marker" in end_channel_state, ( + f"EndChannel emission should be the finish-marker state from " + f"produce_state_on_finish, got {end_channel_state!r}" + ) + assert ( + end_channel_state["finish_marker"] + == "produce_state_on_finish_ran" + ) + + reraise() From ed650eaa8b368e816267fb4eb464b7bdb684c76d Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Wed, 29 Apr 2026 17:31:28 -0700 Subject: [PATCH 058/152] fix(amber): keep per-task finally switches, drop run-loop end-of-body switch Mirrors the same change on state-handshake-redesign (the upstream PR candidate, #4560 review feedback). Functionally equivalent to the prior design that removed all three per-task finallys; this design keeps them and drops the run-loop's end-of-body _switch_context() instead, which keeps process_state / process_internal_marker / process_tuple unchanged from origin/main. 
--- amber/src/main/python/core/runnables/data_processor.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/amber/src/main/python/core/runnables/data_processor.py b/amber/src/main/python/core/runnables/data_processor.py index 776aa35b87a..e8e5171c96e 100644 --- a/amber/src/main/python/core/runnables/data_processor.py +++ b/amber/src/main/python/core/runnables/data_processor.py @@ -60,7 +60,6 @@ def run(self) -> None: self.process_tuple() else: raise RuntimeError("No marker or tuple to process.") - self._switch_context() def process_internal_marker(self, internal_marker: InternalMarker) -> None: try: @@ -83,6 +82,9 @@ def process_internal_marker(self, internal_marker: InternalMarker) -> None: self._context.exception_manager.set_exception_info(exc_info) self._report_exception(exc_info) + finally: + self._switch_context() + def process_state(self, state: State) -> None: """ Process an input marker by invoking appropriate state @@ -103,6 +105,9 @@ def process_state(self, state: State) -> None: self._context.exception_manager.set_exception_info(exc_info) self._report_exception(exc_info) + finally: + self._switch_context() + def process_tuple(self) -> None: """ Process an input tuple by invoking the executor's tuple processing method. @@ -125,6 +130,9 @@ def process_tuple(self) -> None: self._context.exception_manager.set_exception_info(exc_info) self._report_exception(exc_info) + finally: + self._switch_context() + def _set_output_tuple(self, output_iterator: Iterator[Optional[TupleLike]]) -> None: """ Set the output tuple after processing by the executor. 
From 60db011cc0d0e780a6ab74819ae77f13f90a0376 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Wed, 29 Apr 2026 18:00:29 -0700 Subject: [PATCH 059/152] fix: address state review comments --- .../python/core/runnables/network_receiver.py | 28 ++++++++------ .../python/core/runnables/test_main_loop.py | 7 ++-- .../core/runnables/test_network_receiver.py | 38 ++++++++++++++++++- .../pythonworker/PythonProxyClient.scala | 4 +- .../pythonworker/PythonProxyServer.scala | 4 +- .../texera/amber/core/state/State.scala | 4 +- 6 files changed, 64 insertions(+), 21 deletions(-) diff --git a/amber/src/main/python/core/runnables/network_receiver.py b/amber/src/main/python/core/runnables/network_receiver.py index 73478e3ad4b..8cd8a0d5375 100644 --- a/amber/src/main/python/core/runnables/network_receiver.py +++ b/amber/src/main/python/core/runnables/network_receiver.py @@ -97,17 +97,7 @@ def data_handler(command: bytes, table: Table) -> int: "Data", lambda _: DataFrame(table), "State", - lambda _: StateFrame( - deserialize_state( - Tuple( - { - name: table[name][0].as_py() - for name in STATE_SCHEMA.get_attr_names() - }, - schema=STATE_SCHEMA, - ) - ) - ), + lambda _: StateFrame(self._deserialize_state_payload(table)), "ECM", lambda _: EmbeddedControlMessage().parse(table["payload"][0].as_py()), ) @@ -155,6 +145,22 @@ def actor_message_handler(message: bytes) -> int: self._proxy_server.register_actor_message_handler(actor_message_handler) + @staticmethod + def _deserialize_state_payload(table: Table) -> dict: + # Each network State message carries exactly one serialized state row. + # Multiple states are sent as multiple State messages, not as multiple + # rows inside one network payload. 
+ assert len(table) == 1 + return deserialize_state( + Tuple( + { + name: table[name][0].as_py() + for name in STATE_SCHEMA.get_attr_names() + }, + schema=STATE_SCHEMA, + ) + ) + def register_shutdown(self, shutdown: callable) -> None: self._proxy_server.register( name="shutdown", action=ProxyServer.ack(msg="Bye bye!")(shutdown) diff --git a/amber/src/main/python/core/runnables/test_main_loop.py b/amber/src/main/python/core/runnables/test_main_loop.py index 9f3b7c5b760..d2af3b79466 100644 --- a/amber/src/main/python/core/runnables/test_main_loop.py +++ b/amber/src/main/python/core/runnables/test_main_loop.py @@ -26,6 +26,7 @@ from core.models import ( DataFrame, InternalQueue, + StateFrame, Tuple, ) from core.models.internal_queue import ( @@ -1079,7 +1080,7 @@ def send_resume( ) @pytest.mark.timeout(2) - def test_process_state_can_emit_multiple_states( + def test_process_state_can_emit_consecutive_states( self, main_loop, output_queue, @@ -1088,7 +1089,7 @@ def test_process_state_can_emit_multiple_states( ): class DummyExecutor: @staticmethod - def process_state(state: State, port: int) -> State: + def process_state(state, port: int): return {"value": state["value"] + 1, "port": port} main_loop.context.executor_manager.executor = DummyExecutor() @@ -1103,7 +1104,7 @@ def process_state(state: State, port: int) -> State: def fake_switch_context(): switch_count["value"] += 1 - # xinyuan-state-only still uses the original two-switch state handshake: + # The current state-processing handshake uses two context switches: # the DataProcessor produces output during the first switch of each # process_input_state() call, before MainLoop reads current_output_state. 
if switch_count["value"] % 2 == 1: diff --git a/amber/src/main/python/core/runnables/test_network_receiver.py b/amber/src/main/python/core/runnables/test_network_receiver.py index 2cc2541f2d5..00196e40960 100644 --- a/amber/src/main/python/core/runnables/test_network_receiver.py +++ b/amber/src/main/python/core/runnables/test_network_receiver.py @@ -25,7 +25,7 @@ DataElement, ECMElement, ) -from core.models.payload import DataFrame +from core.models.payload import DataFrame, StateFrame from core.proxy import ProxyClient from core.runnables.network_receiver import NetworkReceiver from core.runnables.network_sender import NetworkSender @@ -139,6 +139,42 @@ def test_network_receiver_can_receive_data_messages( assert len(element.payload.frame) == len(data_payload.frame) assert element.tag == channel_id + @pytest.mark.timeout(10) + def test_network_receiver_can_receive_consecutive_state_messages( + self, + output_queue, + input_queue, + network_receiver, + network_sender_thread, + ): + network_sender_thread.start() + worker_id = ActorVirtualIdentity(name="test") + channel_id = ChannelIdentity(worker_id, worker_id, False) + + input_queue.put( + DataElement( + tag=channel_id, + payload=StateFrame({"loop_counter": 0, "i": 1}), + ) + ) + input_queue.put( + DataElement( + tag=channel_id, + payload=StateFrame({"loop_counter": 1, "i": 2}), + ) + ) + + first_element: DataElement = output_queue.get() + second_element: DataElement = output_queue.get() + + assert isinstance(first_element.payload, StateFrame) + assert first_element.payload.frame == {"loop_counter": 0, "i": 1} + assert first_element.tag == channel_id + + assert isinstance(second_element.payload, StateFrame) + assert second_element.payload.frame == {"loop_counter": 1, "i": 2} + assert second_element.tag == channel_id + @pytest.mark.timeout(10) def test_network_receiver_can_receive_control_messages( self, diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonProxyClient.scala 
b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonProxyClient.scala index cfdb6a82f86..bb9a9dd6146 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonProxyClient.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonProxyClient.scala @@ -21,7 +21,7 @@ package org.apache.texera.amber.engine.architecture.pythonworker import com.twitter.util.{Await, Promise} import org.apache.texera.amber.core.WorkflowRuntimeException -import org.apache.texera.amber.core.state.State +import org.apache.texera.amber.core.state.StateJson import org.apache.texera.amber.core.tuple.{Schema, Tuple} import org.apache.texera.amber.core.virtualidentity.{ActorVirtualIdentity, ChannelIdentity} import org.apache.texera.amber.engine.architecture.pythonworker.WorkerBatchInternalQueue.{ @@ -126,7 +126,7 @@ class PythonProxyClient(portNumberPromise: Promise[Int], val actorId: ActorVirtu case DataFrame(frame) => writeArrowStream(mutable.Queue(ArraySeq.unsafeWrapArray(frame): _*), from, "Data") case StateFrame(state) => - writeArrowStream(mutable.Queue(State.serialize(state)), from, "State") + writeArrowStream(mutable.Queue(StateJson.serialize(state)), from, "State") } } diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonProxyServer.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonProxyServer.scala index 463dc4b75a5..508ea43baaf 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonProxyServer.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonProxyServer.scala @@ -21,7 +21,7 @@ package org.apache.texera.amber.engine.architecture.pythonworker import com.google.common.primitives.Longs import com.twitter.util.Promise -import org.apache.texera.amber.core.state.State +import 
org.apache.texera.amber.core.state.StateJson import org.apache.texera.amber.core.tuple.Tuple import org.apache.texera.amber.core.virtualidentity.{ActorVirtualIdentity, ChannelIdentity} import org.apache.texera.amber.engine.architecture.messaginglayer.NetworkOutputGateway @@ -128,7 +128,7 @@ private class AmberProducer( dataHeader.payloadType match { case "State" => assert(root.getRowCount == 1) - outputPort.sendTo(to, StateFrame(State.deserialize(ArrowUtils.getTexeraTuple(0, root)))) + outputPort.sendTo(to, StateFrame(StateJson.deserialize(ArrowUtils.getTexeraTuple(0, root)))) case "ECM" => assert(root.getRowCount == 1) outputPort.sendTo( diff --git a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/State.scala b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/State.scala index 779cc97a28c..8452bff3542 100644 --- a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/State.scala +++ b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/State.scala @@ -26,7 +26,7 @@ import org.apache.texera.amber.util.JSONUtils.objectMapper import java.util.Base64 import scala.jdk.CollectionConverters.IteratorHasAsScala -object State { +object StateJson { private val StateContent = "content" private val BytesTypeMarker = "__texera_type__" private val BytesValue = "bytes" @@ -56,7 +56,7 @@ object State { case null => null case bytes: Array[Byte] => Map(BytesTypeMarker -> BytesValue, PayloadMarker -> Base64.getEncoder.encodeToString(bytes)) - case map: State => + case map: Map[?, ?] 
=> map.iterator.map { case (k, v) => k -> toJsonValue(v) }.toMap case iterable: Iterable[_] => iterable.map(toJsonValue).toList From 74e2b1da7fd08bda09dd8deb6edd3b4895e83500 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Wed, 29 Apr 2026 18:35:39 -0700 Subject: [PATCH 060/152] update --- .../texera/amber/core/state/State.scala | 24 +++++++++++++------ .../texera/amber/core/state/package.scala | 24 ------------------- 2 files changed, 17 insertions(+), 31 deletions(-) delete mode 100644 common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/package.scala diff --git a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/State.scala b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/State.scala index 8452bff3542..70d6c92fff6 100644 --- a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/State.scala +++ b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/State.scala @@ -26,6 +26,14 @@ import org.apache.texera.amber.util.JSONUtils.objectMapper import java.util.Base64 import scala.jdk.CollectionConverters.IteratorHasAsScala +final case class State(data: Map[String, Any]) { + def apply(key: String): Any = data(key) + + def get(key: String): Option[Any] = data.get(key) + + def updated(key: String, value: Any): State = State(data.updated(key, value)) +} + object StateJson { private val StateContent = "content" private val BytesTypeMarker = "__texera_type__" @@ -37,18 +45,20 @@ object StateJson { ) def serialize(state: State): Tuple = { - val payloadJson = objectMapper.writeValueAsString(toJsonValue(state)) + val payloadJson = objectMapper.writeValueAsString(toJsonValue(state.data)) Tuple.builder(schema).addSequentially(Array(payloadJson)).build() } def deserialize(tuple: Tuple): State = { val payload = tuple.getField[String](StateContent) - objectMapper - .readTree(payload) - .fields() - .asScala - .map(entry => entry.getKey -> fromJsonValue(entry.getValue)) - .toMap + State( 
+ objectMapper + .readTree(payload) + .fields() + .asScala + .map(entry => entry.getKey -> fromJsonValue(entry.getValue)) + .toMap + ) } private def toJsonValue(value: Any): Any = diff --git a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/package.scala b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/package.scala deleted file mode 100644 index c110f9d814f..00000000000 --- a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/package.scala +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -package org.apache.texera.amber.core - -package object state { - type State = Map[String, Any] -} From 2fe32703c50b337893a726c8a8ececea5f5f283c Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Wed, 29 Apr 2026 18:44:53 -0700 Subject: [PATCH 061/152] update --- .../pythonworker/PythonProxyClient.scala | 4 ++-- .../pythonworker/PythonProxyServer.scala | 4 ++-- .../org/apache/texera/amber/core/state/State.scala | 12 +++--------- .../texera/amber/operator/ifStatement/IfOpExec.scala | 2 +- 4 files changed, 8 insertions(+), 14 deletions(-) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonProxyClient.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonProxyClient.scala index bb9a9dd6146..cfdb6a82f86 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonProxyClient.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonProxyClient.scala @@ -21,7 +21,7 @@ package org.apache.texera.amber.engine.architecture.pythonworker import com.twitter.util.{Await, Promise} import org.apache.texera.amber.core.WorkflowRuntimeException -import org.apache.texera.amber.core.state.StateJson +import org.apache.texera.amber.core.state.State import org.apache.texera.amber.core.tuple.{Schema, Tuple} import org.apache.texera.amber.core.virtualidentity.{ActorVirtualIdentity, ChannelIdentity} import org.apache.texera.amber.engine.architecture.pythonworker.WorkerBatchInternalQueue.{ @@ -126,7 +126,7 @@ class PythonProxyClient(portNumberPromise: Promise[Int], val actorId: ActorVirtu case DataFrame(frame) => writeArrowStream(mutable.Queue(ArraySeq.unsafeWrapArray(frame): _*), from, "Data") case StateFrame(state) => - writeArrowStream(mutable.Queue(StateJson.serialize(state)), from, "State") + writeArrowStream(mutable.Queue(State.serialize(state)), from, "State") } } diff --git 
a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonProxyServer.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonProxyServer.scala index 508ea43baaf..463dc4b75a5 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonProxyServer.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonProxyServer.scala @@ -21,7 +21,7 @@ package org.apache.texera.amber.engine.architecture.pythonworker import com.google.common.primitives.Longs import com.twitter.util.Promise -import org.apache.texera.amber.core.state.StateJson +import org.apache.texera.amber.core.state.State import org.apache.texera.amber.core.tuple.Tuple import org.apache.texera.amber.core.virtualidentity.{ActorVirtualIdentity, ChannelIdentity} import org.apache.texera.amber.engine.architecture.messaginglayer.NetworkOutputGateway @@ -128,7 +128,7 @@ private class AmberProducer( dataHeader.payloadType match { case "State" => assert(root.getRowCount == 1) - outputPort.sendTo(to, StateFrame(StateJson.deserialize(ArrowUtils.getTexeraTuple(0, root)))) + outputPort.sendTo(to, StateFrame(State.deserialize(ArrowUtils.getTexeraTuple(0, root)))) case "ECM" => assert(root.getRowCount == 1) outputPort.sendTo( diff --git a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/State.scala b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/State.scala index 70d6c92fff6..5560e69c805 100644 --- a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/State.scala +++ b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/State.scala @@ -26,15 +26,9 @@ import org.apache.texera.amber.util.JSONUtils.objectMapper import java.util.Base64 import scala.jdk.CollectionConverters.IteratorHasAsScala -final case class State(data: Map[String, Any]) { - def apply(key: String): Any = data(key) +final case class 
State(values: Map[String, Any]) - def get(key: String): Option[Any] = data.get(key) - - def updated(key: String, value: Any): State = State(data.updated(key, value)) -} - -object StateJson { +object State { private val StateContent = "content" private val BytesTypeMarker = "__texera_type__" private val BytesValue = "bytes" @@ -45,7 +39,7 @@ object StateJson { ) def serialize(state: State): Tuple = { - val payloadJson = objectMapper.writeValueAsString(toJsonValue(state.data)) + val payloadJson = objectMapper.writeValueAsString(toJsonValue(state.values)) Tuple.builder(schema).addSequentially(Array(payloadJson)).build() } diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/ifStatement/IfOpExec.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/ifStatement/IfOpExec.scala index d2becc79a5b..a3244f7eec8 100644 --- a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/ifStatement/IfOpExec.scala +++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/ifStatement/IfOpExec.scala @@ -34,7 +34,7 @@ class IfOpExec(descString: String) extends OperatorExecutor { //It can accept any value that can be converted to a boolean. For example, Int 1 will be converted to true. 
override def processState(state: State, port: Int): Option[State] = { outputPort = - if (state(desc.conditionName).asInstanceOf[Boolean]) PortIdentity(1) else PortIdentity() + if (state.values(desc.conditionName).asInstanceOf[Boolean]) PortIdentity(1) else PortIdentity() Some(state) } From 87330ecc2af753af2a6b83406bbdcd132521f7b1 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Wed, 29 Apr 2026 18:50:32 -0700 Subject: [PATCH 062/152] update --- .../apache/texera/amber/operator/ifStatement/IfOpExec.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/ifStatement/IfOpExec.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/ifStatement/IfOpExec.scala index a3244f7eec8..4634ad1c18c 100644 --- a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/ifStatement/IfOpExec.scala +++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/ifStatement/IfOpExec.scala @@ -34,7 +34,8 @@ class IfOpExec(descString: String) extends OperatorExecutor { //It can accept any value that can be converted to a boolean. For example, Int 1 will be converted to true. 
override def processState(state: State, port: Int): Option[State] = { outputPort = - if (state.values(desc.conditionName).asInstanceOf[Boolean]) PortIdentity(1) else PortIdentity() + if (state.values(desc.conditionName).asInstanceOf[Boolean]) PortIdentity(1) + else PortIdentity() Some(state) } From 83e6fb575b82e8bd3d5fe6ec0a3a84418f4f4d6b Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Wed, 29 Apr 2026 23:39:48 -0700 Subject: [PATCH 063/152] fix fmt --- .../architecture/controller/ControllerProcessor.scala | 5 ++++- .../engine/architecture/scheduling/Schedule.scala | 10 ++++++---- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerProcessor.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerProcessor.scala index df000e92c1b..ea437875992 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerProcessor.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerProcessor.scala @@ -29,7 +29,10 @@ import org.apache.texera.amber.engine.architecture.common.{ } import org.apache.texera.amber.engine.architecture.controller.execution.WorkflowExecution import org.apache.texera.amber.engine.architecture.logreplay.ReplayLogManager -import org.apache.texera.amber.engine.architecture.scheduling.{Schedule, WorkflowExecutionCoordinator} +import org.apache.texera.amber.engine.architecture.scheduling.{ + Schedule, + WorkflowExecutionCoordinator +} import org.apache.texera.amber.engine.architecture.worker.WorkflowWorker.MainThreadDelegateMessage import org.apache.texera.amber.engine.common.ambermessage.WorkflowFIFOMessage diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala index dd48f893e37..e359dc4e504 100644 --- 
a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala @@ -28,9 +28,10 @@ case class Schedule( private val baseLevels = levelSets.keys.toVector.sorted private val normalizedExecutionLevels = if (executionLevels.nonEmpty || baseLevels.isEmpty) executionLevels else baseLevels - private val operatorLevelIndices = levelSets.iterator.flatMap { case (level, regions) => - val levelIndex = baseLevels.indexOf(level) - regions.iterator.flatMap(region => region.getOperators.map(_.id.logicalOpId -> levelIndex)) + private val operatorLevelIndices = levelSets.iterator.flatMap { + case (level, regions) => + val levelIndex = baseLevels.indexOf(level) + regions.iterator.flatMap(region => region.getOperators.map(_.id.logicalOpId -> levelIndex)) }.toMap private var currentLevelIndex = 0 @@ -40,7 +41,8 @@ case class Schedule( def rewriteExecutionFrom(levelIndex: Int): Schedule = { val rewrittenSchedule = copy( - executionLevels = normalizedExecutionLevels.take(currentLevelIndex) ++ baseLevels.drop(levelIndex) + executionLevels = + normalizedExecutionLevels.take(currentLevelIndex) ++ baseLevels.drop(levelIndex) ) rewrittenSchedule.currentLevelIndex = currentLevelIndex rewrittenSchedule From 2ab9e001f5022c0ea848fa819ea61c58516d4e17 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Thu, 30 Apr 2026 15:47:32 -0700 Subject: [PATCH 064/152] test(amber): expand jump-to-operator-region coordinator coverage Add cases for multi-jump sequences, unknown-target rejection, jump before any pull, jump after schedule exhaustion, and forward jump. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../WorkflowExecutionCoordinatorSpec.scala | 104 +++++++++++++++--- 1 file changed, 87 insertions(+), 17 deletions(-) diff --git a/amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinatorSpec.scala b/amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinatorSpec.scala index 1c733e4d1e8..b62bbf754c3 100644 --- a/amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinatorSpec.scala +++ b/amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinatorSpec.scala @@ -42,30 +42,100 @@ class WorkflowExecutionCoordinatorSpec extends AnyFlatSpec { Region(RegionIdentity(regionId), Set(physicalOp), Set.empty) } - "WorkflowExecutionCoordinator.jumpToRegionContainingOperator" should "make the next scheduled region contain the target operator's region" in { - val firstRegion = region(1, "first") - val secondRegion = region(2, "second") - val thirdRegion = region(3, "third") + private def threeLevelSchedule(): (Region, Region, Region, Schedule) = { + val first = region(1, "first") + val second = region(2, "second") + val third = region(3, "third") val schedule = Schedule( Map( - 0 -> Set(firstRegion), - 1 -> Set(secondRegion), - 2 -> Set(thirdRegion) + 0 -> Set(first), + 1 -> Set(second), + 2 -> Set(third) ) ) - val coordinator = - new WorkflowExecutionCoordinator( - schedule, - WorkflowExecution(), - null, - null - ) + (first, second, third, schedule) + } + + private def newCoordinator(schedule: Schedule): WorkflowExecutionCoordinator = + new WorkflowExecutionCoordinator(schedule, WorkflowExecution(), null, null) + + "WorkflowExecutionCoordinator.jumpToRegionContainingOperator" should + "make the next scheduled region contain the target operator's region" in { + val (first, second, _, schedule) = threeLevelSchedule() + val coordinator = 
newCoordinator(schedule) + + assert(coordinator.pullNextRegions == Set(first)) + assert(coordinator.pullNextRegions == Set(second)) + + coordinator.jumpToRegionContainingOperator(OperatorIdentity("first")) + + assert(coordinator.pullNextRegions == Set(first)) + } + + it should "support multiple sequential jumps interleaved with region pulls" in { + val (first, second, third, schedule) = threeLevelSchedule() + val coordinator = newCoordinator(schedule) + + assert(coordinator.pullNextRegions == Set(first)) + assert(coordinator.pullNextRegions == Set(second)) + + coordinator.jumpToRegionContainingOperator(OperatorIdentity("first")) + assert(coordinator.pullNextRegions == Set(first)) + + coordinator.jumpToRegionContainingOperator(OperatorIdentity("second")) + assert(coordinator.pullNextRegions == Set(second)) + assert(coordinator.pullNextRegions == Set(third)) + + coordinator.jumpToRegionContainingOperator(OperatorIdentity("first")) + assert(coordinator.pullNextRegions == Set(first)) + } + + it should "be a no-op when the target operator is not in any scheduled region" in { + val (first, second, _, schedule) = threeLevelSchedule() + val coordinator = newCoordinator(schedule) + + assert(coordinator.pullNextRegions == Set(first)) + + coordinator.jumpToRegionContainingOperator(OperatorIdentity("does-not-exist")) + + // Iteration position must be unaffected by an unknown target. 
+ assert(coordinator.pullNextRegions == Set(second)) + } + + it should "leave the schedule untouched when called repeatedly with unknown operators" in { + val (first, second, third, schedule) = threeLevelSchedule() + val coordinator = newCoordinator(schedule) - assert(coordinator.pullNextRegions == Set(firstRegion)) - assert(coordinator.pullNextRegions == Set(secondRegion)) + coordinator.jumpToRegionContainingOperator(OperatorIdentity("ghost-1")) + coordinator.jumpToRegionContainingOperator(OperatorIdentity("ghost-2")) + coordinator.jumpToRegionContainingOperator(OperatorIdentity("ghost-3")) + + assert(coordinator.pullNextRegions == Set(first)) + assert(coordinator.pullNextRegions == Set(second)) + assert(coordinator.pullNextRegions == Set(third)) + } + + it should "allow jumping back to the first region after the schedule is exhausted" in { + val (first, second, third, schedule) = threeLevelSchedule() + val coordinator = newCoordinator(schedule) + + assert(coordinator.pullNextRegions == Set(first)) + assert(coordinator.pullNextRegions == Set(second)) + assert(coordinator.pullNextRegions == Set(third)) + assert(coordinator.pullNextRegions == Set.empty) coordinator.jumpToRegionContainingOperator(OperatorIdentity("first")) + assert(coordinator.pullNextRegions == Set(first)) + } + + it should "support jumping forward past regions that have not yet been pulled" in { + val (first, _, third, schedule) = threeLevelSchedule() + val coordinator = newCoordinator(schedule) + + assert(coordinator.pullNextRegions == Set(first)) - assert(coordinator.pullNextRegions == Set(firstRegion)) + coordinator.jumpToRegionContainingOperator(OperatorIdentity("third")) + assert(coordinator.pullNextRegions == Set(third)) + assert(coordinator.pullNextRegions == Set.empty) } } From 9472125611870b1c2abf4e986214e532fe3ee3e4 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Thu, 30 Apr 2026 15:57:03 -0700 Subject: [PATCH 065/152] feat(amber): require Schedule level keys to be contiguous from 0 
Reject any `Schedule` constructed with gaps or non-zero starting level keys. The schedule generator already produces contiguous-from-0 keys, so this only tightens the contract for direct callers and tests. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../amber/engine/architecture/scheduling/Schedule.scala | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala index e359dc4e504..5c3346947da 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala @@ -25,6 +25,11 @@ case class Schedule( private val levelSets: Map[Int, Set[Region]], executionLevels: Vector[Int] = Vector.empty ) extends Iterator[Set[Region]] { + require( + levelSets.keys.toSet == (0 until levelSets.size).toSet, + s"Schedule level keys must be contiguous starting at 0, got: ${levelSets.keys.toSeq.sorted}" + ) + private val baseLevels = levelSets.keys.toVector.sorted private val normalizedExecutionLevels = if (executionLevels.nonEmpty || baseLevels.isEmpty) executionLevels else baseLevels From 3b8ec8e324c871631ad0d425668440e21a68ac87 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Thu, 30 Apr 2026 16:30:39 -0700 Subject: [PATCH 066/152] fix: make python State runtime type --- .../architecture/packaging/output_manager.py | 2 +- amber/src/main/python/core/models/payload.py | 4 ++ amber/src/main/python/core/models/state.py | 8 +-- .../python/core/runnables/data_processor.py | 2 + .../python/core/runnables/test_main_loop.py | 52 +++++++------------ 5 files changed, 31 insertions(+), 37 deletions(-) diff --git a/amber/src/main/python/core/architecture/packaging/output_manager.py b/amber/src/main/python/core/architecture/packaging/output_manager.py index afa9127fe6e..bf4afbf396f 
100644 --- a/amber/src/main/python/core/architecture/packaging/output_manager.py +++ b/amber/src/main/python/core/architecture/packaging/output_manager.py @@ -248,7 +248,7 @@ def emit_state( receiver, ( StateFrame(payload) - if isinstance(payload, dict) + if isinstance(payload, State) else self.tuple_to_frame(payload) ), ) diff --git a/amber/src/main/python/core/models/payload.py b/amber/src/main/python/core/models/payload.py index 61a33294882..e8d733c1794 100644 --- a/amber/src/main/python/core/models/payload.py +++ b/amber/src/main/python/core/models/payload.py @@ -34,3 +34,7 @@ class DataFrame(DataPayload): @dataclass class StateFrame(DataPayload): frame: State + + def __post_init__(self): + if not isinstance(self.frame, State): + self.frame = State(self.frame) diff --git a/amber/src/main/python/core/models/state.py b/amber/src/main/python/core/models/state.py index 897153d37a3..8603c2124fd 100644 --- a/amber/src/main/python/core/models/state.py +++ b/amber/src/main/python/core/models/state.py @@ -17,12 +17,14 @@ import base64 import json -from typing import Any, Dict, TypeAlias +from typing import Any from .schema import Schema from .tuple import Tuple -State: TypeAlias = Dict[str, Any] + +class State(dict): + pass STATE_CONTENT = "content" _TYPE_MARKER = "__texera_type__" @@ -40,7 +42,7 @@ def serialize_state(state: State) -> Tuple: def deserialize_state(row: Tuple) -> State: - return _from_json_value(json.loads(row[STATE_CONTENT])) + return State(_from_json_value(json.loads(row[STATE_CONTENT]))) def _to_json_value(value: Any) -> Any: diff --git a/amber/src/main/python/core/runnables/data_processor.py b/amber/src/main/python/core/runnables/data_processor.py index 35d2a75d1d5..276a1669f55 100644 --- a/amber/src/main/python/core/runnables/data_processor.py +++ b/amber/src/main/python/core/runnables/data_processor.py @@ -168,6 +168,8 @@ def _set_output_state(self, output_state: State) -> None: """ Set the output state after processing by the executor. 
""" + if output_state is not None and not isinstance(output_state, State): + output_state = State(output_state) self._context.state_processing_manager.current_output_state = output_state def _switch_context(self) -> None: diff --git a/amber/src/main/python/core/runnables/test_main_loop.py b/amber/src/main/python/core/runnables/test_main_loop.py index 0dc8496c154..c9daa633f55 100644 --- a/amber/src/main/python/core/runnables/test_main_loop.py +++ b/amber/src/main/python/core/runnables/test_main_loop.py @@ -26,6 +26,7 @@ from core.models import ( DataFrame, InternalQueue, + State, StateFrame, Tuple, ) @@ -165,8 +166,7 @@ def mock_data_element(self, mock_tuple, mock_data_input_channel): def mock_state_data_elements(self, mock_data_input_channel): elements = [] for value in (1, 2, 3, 4): - state = State() - state.add("value", value) + state = State({"value": value}) elements.append( DataElement( tag=mock_data_input_channel, @@ -188,19 +188,16 @@ def process_tuple(tuple_, port): @staticmethod def process_state(state: State, port: int) -> State: - new_state = State() - for key, value in state.__dict__.items(): - if key != "schema": - new_state.add(key, value) - new_state.add("processed_marker", "executed") - new_state.add("port", port) + new_state = State( + {key: value for key, value in state.items() if key != "schema"} + ) + new_state["processed_marker"] = "executed" + new_state["port"] = port return new_state @staticmethod def produce_state_on_finish(port: int) -> State: - finish_state = State() - finish_state.add("finish_marker", "produce_state_on_finish_ran") - return finish_state + return State({"finish_marker": "produce_state_on_finish_ran"}) @staticmethod def on_finish(port): @@ -1141,7 +1138,7 @@ def test_process_state_can_emit_consecutive_states( class DummyExecutor: @staticmethod def process_state(state, port: int): - return {"value": state["value"] + 1, "port": port} + return State({"value": state["value"] + 1, "port": port}) 
main_loop.context.executor_manager.executor = DummyExecutor() monkeypatch.setattr(main_loop, "_check_and_process_control", lambda: None) @@ -1151,25 +1148,19 @@ def process_state(state, port: int): lambda state: [(mock_data_output_channel.to_worker_id, StateFrame(state))], ) - switch_count = {"value": 0} - def fake_switch_context(): - switch_count["value"] += 1 - # The current state-processing handshake uses two context switches: - # the DataProcessor produces output during the first switch of each - # process_input_state() call, before MainLoop reads current_output_state. - if switch_count["value"] % 2 == 1: - current_input_state = ( - main_loop.context.state_processing_manager.current_input_state - ) + current_input_state = ( + main_loop.context.state_processing_manager.current_input_state + ) + if current_input_state is not None: main_loop.context.state_processing_manager.current_output_state = ( DummyExecutor.process_state(current_input_state, 0) ) monkeypatch.setattr(main_loop, "_switch_context", fake_switch_context) - first_state = {"value": 1} - second_state = {"value": 41} + first_state = State({"value": 1}) + second_state = State({"value": 41}) main_loop._process_state(first_state) main_loop._process_state(second_state) @@ -1357,10 +1348,7 @@ def test_process_state_can_emit_multiple_states( class DummyExecutor: @staticmethod def process_state(state: State, port: int) -> State: - output_state = State() - output_state.add("value", state["value"] + 1) - output_state.add("port", port) - return output_state + return State({"value": state["value"] + 1, "port": port}) main_loop.context.executor_manager.executor = DummyExecutor() monkeypatch.setattr(main_loop, "_check_and_process_control", lambda: None) @@ -1381,10 +1369,8 @@ def fake_switch_context(): monkeypatch.setattr(main_loop, "_switch_context", fake_switch_context) - first_state = State() - first_state.add("value", 1) - second_state = State() - second_state.add("value", 41) + first_state = State({"value": 
1}) + second_state = State({"value": 41}) main_loop._process_state(first_state) main_loop._process_state(second_state) @@ -1499,7 +1485,7 @@ def test_main_loop_thread_can_process_state( f"{type(end_channel_state_output.payload).__name__}" ) end_channel_state = end_channel_state_output.payload.frame - assert "finish_marker" in end_channel_state.__dict__, ( + assert "finish_marker" in end_channel_state, ( f"EndChannel emission should be the finish-marker state from " f"produce_state_on_finish, got {end_channel_state!r}" ) From 075697202ad81f4687bd903e901d61f78eb9c0f7 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Thu, 30 Apr 2026 17:19:27 -0700 Subject: [PATCH 067/152] refactor: drop StateFrame.__post_init__ auto-coerce The __post_init__ that wrapped a non-State frame argument in `State(...)` was only triggered by two test_network_receiver.py call sites that passed plain dict literals. Pass `State({...})` explicitly at those call sites and remove the silent coerce -- callers already know the contract. 
--- amber/src/main/python/core/models/payload.py | 4 ---- .../src/main/python/core/runnables/test_network_receiver.py | 5 +++-- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/amber/src/main/python/core/models/payload.py b/amber/src/main/python/core/models/payload.py index e8d733c1794..61a33294882 100644 --- a/amber/src/main/python/core/models/payload.py +++ b/amber/src/main/python/core/models/payload.py @@ -34,7 +34,3 @@ class DataFrame(DataPayload): @dataclass class StateFrame(DataPayload): frame: State - - def __post_init__(self): - if not isinstance(self.frame, State): - self.frame = State(self.frame) diff --git a/amber/src/main/python/core/runnables/test_network_receiver.py b/amber/src/main/python/core/runnables/test_network_receiver.py index 00196e40960..bf890e4a2f0 100644 --- a/amber/src/main/python/core/runnables/test_network_receiver.py +++ b/amber/src/main/python/core/runnables/test_network_receiver.py @@ -26,6 +26,7 @@ ECMElement, ) from core.models.payload import DataFrame, StateFrame +from core.models.state import State from core.proxy import ProxyClient from core.runnables.network_receiver import NetworkReceiver from core.runnables.network_sender import NetworkSender @@ -154,13 +155,13 @@ def test_network_receiver_can_receive_consecutive_state_messages( input_queue.put( DataElement( tag=channel_id, - payload=StateFrame({"loop_counter": 0, "i": 1}), + payload=StateFrame(State({"loop_counter": 0, "i": 1})), ) ) input_queue.put( DataElement( tag=channel_id, - payload=StateFrame({"loop_counter": 1, "i": 2}), + payload=StateFrame(State({"loop_counter": 1, "i": 2})), ) ) From d92ed51b1cdb65effe962e65f6cb1138a97a383f Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Thu, 30 Apr 2026 18:01:53 -0700 Subject: [PATCH 068/152] fix fmt --- amber/src/main/python/core/models/state.py | 28 ++++++++++--------- .../python/core/runnables/network_receiver.py | 15 ++-------- .../python/core/runnables/network_sender.py | 14 ++-------- 3 files changed, 21 
insertions(+), 36 deletions(-) diff --git a/amber/src/main/python/core/models/state.py b/amber/src/main/python/core/models/state.py index 8603c2124fd..003aaa212ac 100644 --- a/amber/src/main/python/core/models/state.py +++ b/amber/src/main/python/core/models/state.py @@ -24,25 +24,27 @@ class State(dict): - pass + CONTENT = "content" + SCHEMA = Schema(raw_schema={CONTENT: "STRING"}) -STATE_CONTENT = "content" -_TYPE_MARKER = "__texera_type__" -_PAYLOAD_MARKER = "payload" -_BYTES_TYPE = "bytes" + def to_json(self) -> str: + return json.dumps(_to_json_value(self), separators=(",", ":")) -STATE_SCHEMA = Schema(raw_schema={STATE_CONTENT: "STRING"}) + def to_tuple(self) -> Tuple: + return Tuple({State.CONTENT: self.to_json()}, schema=State.SCHEMA) + @classmethod + def from_json(cls, payload: str) -> "State": + return cls(_from_json_value(json.loads(payload))) -def serialize_state(state: State) -> Tuple: - return Tuple( - {STATE_CONTENT: json.dumps(_to_json_value(state), separators=(",", ":"))}, - schema=STATE_SCHEMA, - ) + @classmethod + def from_tuple(cls, row: Tuple) -> "State": + return cls.from_json(row[cls.CONTENT]) -def deserialize_state(row: Tuple) -> State: - return State(_from_json_value(json.loads(row[STATE_CONTENT]))) +_TYPE_MARKER = "__texera_type__" +_PAYLOAD_MARKER = "payload" +_BYTES_TYPE = "bytes" def _to_json_value(value: Any) -> Any: diff --git a/amber/src/main/python/core/runnables/network_receiver.py b/amber/src/main/python/core/runnables/network_receiver.py index 8cd8a0d5375..739cf0788ec 100644 --- a/amber/src/main/python/core/runnables/network_receiver.py +++ b/amber/src/main/python/core/runnables/network_receiver.py @@ -32,7 +32,7 @@ ) from core.models import ( DataFrame, - Tuple, + State, StateFrame, ) from core.models.internal_queue import ( @@ -42,7 +42,6 @@ ECMElement, ) from core.proxy import ProxyServer -from core.models.state import STATE_SCHEMA, deserialize_state from core.util import Stoppable, get_one_of from core.util.runnable.runnable 
import Runnable from proto.org.apache.texera.amber.engine.architecture.rpc import EmbeddedControlMessage @@ -146,20 +145,12 @@ def actor_message_handler(message: bytes) -> int: self._proxy_server.register_actor_message_handler(actor_message_handler) @staticmethod - def _deserialize_state_payload(table: Table) -> dict: + def _deserialize_state_payload(table: Table) -> State: # Each network State message carries exactly one serialized state row. # Multiple states are sent as multiple State messages, not as multiple # rows inside one network payload. assert len(table) == 1 - return deserialize_state( - Tuple( - { - name: table[name][0].as_py() - for name in STATE_SCHEMA.get_attr_names() - }, - schema=STATE_SCHEMA, - ) - ) + return State.from_json(table[State.CONTENT][0].as_py()) def register_shutdown(self, shutdown: callable) -> None: self._proxy_server.register( diff --git a/amber/src/main/python/core/runnables/network_sender.py b/amber/src/main/python/core/runnables/network_sender.py index 52d799d6f1f..d8e3889ac11 100644 --- a/amber/src/main/python/core/runnables/network_sender.py +++ b/amber/src/main/python/core/runnables/network_sender.py @@ -20,18 +20,13 @@ from overrides import overrides from typing import Optional -from core.models import DataPayload, InternalQueue, DataFrame, StateFrame +from core.models import DataPayload, InternalQueue, DataFrame, State, StateFrame from core.models.internal_queue import ( InternalQueueElement, DataElement, DCMElement, ECMElement, ) -from core.models.state import ( - STATE_CONTENT, - STATE_SCHEMA, - serialize_state, -) from core.proxy import ProxyClient from core.util import StoppableQueueBlockingRunnable from proto.org.apache.texera.amber.core import ChannelIdentity @@ -104,12 +99,9 @@ def _send_data(self, to: ChannelIdentity, data_payload: DataPayload) -> None: self._proxy_client.send_data(bytes(data_header), data_payload.frame) elif isinstance(data_payload, StateFrame): data_header = PythonDataHeader(tag=to, 
payload_type="State") - serialized_state = serialize_state(data_payload.frame) table = pa.Table.from_pydict( - { - STATE_CONTENT: [serialized_state[STATE_CONTENT]], - }, - schema=STATE_SCHEMA.as_arrow_schema(), + {State.CONTENT: [data_payload.frame.to_json()]}, + schema=State.SCHEMA.as_arrow_schema(), ) self._proxy_client.send_data(bytes(data_header), table) else: From 2a8966e9363aa3c9b6d7945292f44145f60d9ba1 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Thu, 30 Apr 2026 18:09:29 -0700 Subject: [PATCH 069/152] refactor(scala): move State serializers into the State class Mirror the Python in-class restructure from d92ed51b1c on the Scala side. State now carries `toJson` / `toTuple` instance methods plus `fromJson` / `fromTuple` companion constructors; `Content` is exposed publicly to mirror Python's `State.CONTENT`. Drops the prior `State.serialize` / `State.deserialize` companion methods. --- .../pythonworker/PythonProxyClient.scala | 2 +- .../pythonworker/PythonProxyServer.scala | 2 +- .../texera/amber/core/state/State.scala | 24 ++++++++++--------- 3 files changed, 15 insertions(+), 13 deletions(-) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonProxyClient.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonProxyClient.scala index cfdb6a82f86..f3a3af9d503 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonProxyClient.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonProxyClient.scala @@ -126,7 +126,7 @@ class PythonProxyClient(portNumberPromise: Promise[Int], val actorId: ActorVirtu case DataFrame(frame) => writeArrowStream(mutable.Queue(ArraySeq.unsafeWrapArray(frame): _*), from, "Data") case StateFrame(state) => - writeArrowStream(mutable.Queue(State.serialize(state)), from, "State") + writeArrowStream(mutable.Queue(state.toTuple), from, "State") } } diff --git 
a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonProxyServer.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonProxyServer.scala index 463dc4b75a5..2ff866365bb 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonProxyServer.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonProxyServer.scala @@ -128,7 +128,7 @@ private class AmberProducer( dataHeader.payloadType match { case "State" => assert(root.getRowCount == 1) - outputPort.sendTo(to, StateFrame(State.deserialize(ArrowUtils.getTexeraTuple(0, root)))) + outputPort.sendTo(to, StateFrame(State.fromTuple(ArrowUtils.getTexeraTuple(0, root)))) case "ECM" => assert(root.getRowCount == 1) outputPort.sendTo( diff --git a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/State.scala b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/State.scala index 5560e69c805..d93e16a62cc 100644 --- a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/State.scala +++ b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/State.scala @@ -26,25 +26,26 @@ import org.apache.texera.amber.util.JSONUtils.objectMapper import java.util.Base64 import scala.jdk.CollectionConverters.IteratorHasAsScala -final case class State(values: Map[String, Any]) +final case class State(values: Map[String, Any]) { + + def toJson: String = + objectMapper.writeValueAsString(State.toJsonValue(values)) + + def toTuple: Tuple = + Tuple.builder(State.schema).addSequentially(Array(toJson)).build() +} object State { - private val StateContent = "content" + val Content = "content" private val BytesTypeMarker = "__texera_type__" private val BytesValue = "bytes" private val PayloadMarker = "payload" val schema: Schema = new Schema( - new Attribute(StateContent, AttributeType.STRING) + new Attribute(Content, 
AttributeType.STRING) ) - def serialize(state: State): Tuple = { - val payloadJson = objectMapper.writeValueAsString(toJsonValue(state.values)) - Tuple.builder(schema).addSequentially(Array(payloadJson)).build() - } - - def deserialize(tuple: Tuple): State = { - val payload = tuple.getField[String](StateContent) + def fromJson(payload: String): State = State( objectMapper .readTree(payload) @@ -53,7 +54,8 @@ object State { .map(entry => entry.getKey -> fromJsonValue(entry.getValue)) .toMap ) - } + + def fromTuple(row: Tuple): State = fromJson(row.getField[String](Content)) private def toJsonValue(value: Any): Any = value match { From 7303189e37c5a36f2bc569f2bcad5e66614d1db1 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Thu, 30 Apr 2026 19:21:19 -0700 Subject: [PATCH 070/152] fix fmt --- .../engine/architecture/pythonworker/PythonProxyClient.scala | 1 - 1 file changed, 1 deletion(-) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonProxyClient.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonProxyClient.scala index f3a3af9d503..6618e857b1d 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonProxyClient.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonProxyClient.scala @@ -21,7 +21,6 @@ package org.apache.texera.amber.engine.architecture.pythonworker import com.twitter.util.{Await, Promise} import org.apache.texera.amber.core.WorkflowRuntimeException -import org.apache.texera.amber.core.state.State import org.apache.texera.amber.core.tuple.{Schema, Tuple} import org.apache.texera.amber.core.virtualidentity.{ActorVirtualIdentity, ChannelIdentity} import org.apache.texera.amber.engine.architecture.pythonworker.WorkerBatchInternalQueue.{ From 0f907554d95b045517fcead4cd24456fa0ac6101 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Thu, 30 Apr 2026 20:11:22 -0700 Subject: [PATCH 071/152] 
update --- .../main/python/core/runnables/network_receiver.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/amber/src/main/python/core/runnables/network_receiver.py b/amber/src/main/python/core/runnables/network_receiver.py index 739cf0788ec..f8ef6361fe1 100644 --- a/amber/src/main/python/core/runnables/network_receiver.py +++ b/amber/src/main/python/core/runnables/network_receiver.py @@ -96,7 +96,9 @@ def data_handler(command: bytes, table: Table) -> int: "Data", lambda _: DataFrame(table), "State", - lambda _: StateFrame(self._deserialize_state_payload(table)), + lambda _: StateFrame( + State.from_json(table[State.CONTENT][0].as_py()) + ), "ECM", lambda _: EmbeddedControlMessage().parse(table["payload"][0].as_py()), ) @@ -144,14 +146,6 @@ def actor_message_handler(message: bytes) -> int: self._proxy_server.register_actor_message_handler(actor_message_handler) - @staticmethod - def _deserialize_state_payload(table: Table) -> State: - # Each network State message carries exactly one serialized state row. - # Multiple states are sent as multiple State messages, not as multiple - # rows inside one network payload. 
- assert len(table) == 1 - return State.from_json(table[State.CONTENT][0].as_py()) - def register_shutdown(self, shutdown: callable) -> None: self._proxy_server.register( name="shutdown", action=ProxyServer.ack(msg="Bye bye!")(shutdown) From 3e861588ad3222d81c8f5188631d365a31823053 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Thu, 30 Apr 2026 20:14:59 -0700 Subject: [PATCH 072/152] fix fmt --- amber/src/main/python/core/runnables/network_receiver.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/amber/src/main/python/core/runnables/network_receiver.py b/amber/src/main/python/core/runnables/network_receiver.py index f8ef6361fe1..659cd65c78d 100644 --- a/amber/src/main/python/core/runnables/network_receiver.py +++ b/amber/src/main/python/core/runnables/network_receiver.py @@ -96,9 +96,7 @@ def data_handler(command: bytes, table: Table) -> int: "Data", lambda _: DataFrame(table), "State", - lambda _: StateFrame( - State.from_json(table[State.CONTENT][0].as_py()) - ), + lambda _: StateFrame(State.from_json(table[State.CONTENT][0].as_py())), "ECM", lambda _: EmbeddedControlMessage().parse(table["payload"][0].as_py()), ) From 80037ce42286d818eb9ff714b4144d25229be47a Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Thu, 30 Apr 2026 20:16:13 -0700 Subject: [PATCH 073/152] fix fmt --- .../main/scala/org/apache/texera/amber/core/state/State.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/State.scala b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/State.scala index d93e16a62cc..ba146f1d57c 100644 --- a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/State.scala +++ b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/State.scala @@ -36,7 +36,7 @@ final case class State(values: Map[String, Any]) { } object State { - val Content = "content" + private val Content = "content" private val 
BytesTypeMarker = "__texera_type__" private val BytesValue = "bytes" private val PayloadMarker = "payload" From c1d19afc38031dab9d4b30c68bcba467ebea8500 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Thu, 30 Apr 2026 21:23:17 -0700 Subject: [PATCH 074/152] fix: route StateFrame through state.toTuple in PythonProxyClient Co-Authored-By: Claude Opus 4.7 (1M context) --- .../engine/architecture/pythonworker/PythonProxyClient.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonProxyClient.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonProxyClient.scala index cfdb6a82f86..f3a3af9d503 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonProxyClient.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonProxyClient.scala @@ -126,7 +126,7 @@ class PythonProxyClient(portNumberPromise: Promise[Int], val actorId: ActorVirtu case DataFrame(frame) => writeArrowStream(mutable.Queue(ArraySeq.unsafeWrapArray(frame): _*), from, "Data") case StateFrame(state) => - writeArrowStream(mutable.Queue(State.serialize(state)), from, "State") + writeArrowStream(mutable.Queue(state.toTuple), from, "State") } } From 407cd296ea07b90de0e7c1b9d6d513060ed1ee64 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Thu, 30 Apr 2026 21:43:58 -0700 Subject: [PATCH 075/152] add test cases --- .../src/main/python/core/models/test_state.py | 101 ++++++++++++++ .../texera/amber/core/state/StateSpec.scala | 131 ++++++++++++++++++ 2 files changed, 232 insertions(+) create mode 100644 amber/src/main/python/core/models/test_state.py create mode 100644 common/workflow-core/src/test/scala/org/apache/texera/amber/core/state/StateSpec.scala diff --git a/amber/src/main/python/core/models/test_state.py b/amber/src/main/python/core/models/test_state.py new file mode 100644 index 
00000000000..aef2297130b --- /dev/null +++ b/amber/src/main/python/core/models/test_state.py @@ -0,0 +1,101 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import pytest + +from core.models.state import State + + +class TestState: + def test_state_subclasses_dict(self): + state = State({"a": 1}) + assert isinstance(state, dict) + assert state["a"] == 1 + assert State() == {} + + def test_class_attributes(self): + assert State.CONTENT == "content" + assert State.SCHEMA.get_attr_names() == ["content"] + + def test_json_round_trip_primitives(self): + original = State( + { + "string": "hello", + "int": 42, + "float": 3.14, + "bool_true": True, + "bool_false": False, + "none_value": None, + } + ) + decoded = State.from_json(original.to_json()) + assert decoded == original + + def test_json_round_trip_empty(self): + assert State.from_json(State().to_json()) == State() + + def test_json_round_trip_bytes(self): + original = State({"payload": b"\x00\x01\x02\xff"}) + decoded = State.from_json(original.to_json()) + assert decoded["payload"] == b"\x00\x01\x02\xff" + assert isinstance(decoded["payload"], bytes) + + def test_json_round_trip_nested_dict(self): + original = State({"outer": {"inner": {"value": 1}}}) + decoded = 
State.from_json(original.to_json()) + assert decoded == original + + def test_json_round_trip_list_of_mixed_values(self): + original = State({"items": [1, "two", 3.0, True, None]}) + decoded = State.from_json(original.to_json()) + assert decoded == original + + def test_json_round_trip_bytes_inside_list_and_nested_dict(self): + original = State( + { + "blobs": [b"first", b"second"], + "nested": {"sub_blob": b"inside"}, + } + ) + decoded = State.from_json(original.to_json()) + assert decoded["blobs"] == [b"first", b"second"] + assert decoded["nested"]["sub_blob"] == b"inside" + + def test_to_json_rejects_non_serializable_value(self): + class Custom: + pass + + with pytest.raises(TypeError): + State({"bad": Custom()}).to_json() + + def test_tuple_round_trip(self): + original = State({"loop_counter": 3, "label": "outer", "blob": b"\x01\x02"}) + decoded = State.from_tuple(original.to_tuple()) + assert decoded == original + + def test_to_tuple_uses_state_schema(self): + tuple_ = State({"x": 1}).to_tuple() + # Single STRING column whose value is the JSON serialization. + assert tuple_[State.CONTENT] == '{"x":1}' + + def test_nested_dict_decodes_to_plain_dict(self): + # Top-level returns a State; nested dicts come back as plain dict. + # This is intentional -- only the outermost mapping is wrapped. + decoded = State.from_json('{"outer":{"inner":1}}') + assert isinstance(decoded, State) + assert isinstance(decoded["outer"], dict) + assert not isinstance(decoded["outer"], State) diff --git a/common/workflow-core/src/test/scala/org/apache/texera/amber/core/state/StateSpec.scala b/common/workflow-core/src/test/scala/org/apache/texera/amber/core/state/StateSpec.scala new file mode 100644 index 00000000000..976a585e31a --- /dev/null +++ b/common/workflow-core/src/test/scala/org/apache/texera/amber/core/state/StateSpec.scala @@ -0,0 +1,131 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.texera.amber.core.state + +import org.scalatest.flatspec.AnyFlatSpec + +class StateSpec extends AnyFlatSpec { + + "State" should "json-round-trip an empty state" in { + val original = State(Map.empty) + assert(State.fromJson(original.toJson) == original) + } + + it should "json-round-trip primitive values" in { + val original = State( + Map( + "string" -> "hello", + "long" -> 42L, + "double" -> 3.14, + "bool_true" -> true, + "bool_false" -> false + ) + ) + val decoded = State.fromJson(original.toJson) + assert(decoded.values("string") == "hello") + assert(decoded.values("long") == 42L) + assert(decoded.values("double") == 3.14) + assert(decoded.values("bool_true") == true) + assert(decoded.values("bool_false") == false) + } + + it should "drop null entries during JSON serialization" in { + // The shared `objectMapper` is configured with `Include.NON_NULL`, so + // null values are stripped before they hit the wire. Document the + // behavior here so callers know they cannot transport an explicit null + // through a State -- Python's serializer keeps nulls but Scala does not. 
+ val original = State(Map("present" -> "value", "absent" -> null)) + val decoded = State.fromJson(original.toJson) + assert(decoded.values.keySet == Set("present")) + assert(decoded.values("present") == "value") + } + + it should "json-round-trip byte arrays via the bytes type marker" in { + val payload = Array[Byte](0, 1, 2, -1) + val original = State(Map("payload" -> payload)) + val decoded = State.fromJson(original.toJson) + val decodedBytes = decoded.values("payload").asInstanceOf[Array[Byte]] + assert(decodedBytes.sameElements(payload)) + } + + it should "json-round-trip nested maps" in { + val original = State(Map("outer" -> Map("inner" -> Map("value" -> 1L)))) + val decoded = State.fromJson(original.toJson) + assert(decoded == original) + } + + it should "json-round-trip lists of mixed values" in { + val original = State(Map("items" -> List(1L, "two", 3.0, true, null))) + val decoded = State.fromJson(original.toJson) + assert(decoded == original) + } + + it should "json-round-trip byte arrays nested inside lists and maps" in { + val original = State( + Map( + "blobs" -> List(Array[Byte](1, 2), Array[Byte](3, 4)), + "nested" -> Map("sub_blob" -> Array[Byte](5, 6)) + ) + ) + val decoded = State.fromJson(original.toJson) + val blobs = decoded.values("blobs").asInstanceOf[List[Array[Byte]]] + assert(blobs.head.sameElements(Array[Byte](1, 2))) + assert(blobs(1).sameElements(Array[Byte](3, 4))) + val subBlob = decoded.values + .apply("nested") + .asInstanceOf[Map[String, Any]]("sub_blob") + .asInstanceOf[Array[Byte]] + assert(subBlob.sameElements(Array[Byte](5, 6))) + } + + it should "tuple-round-trip" in { + val original = State( + Map( + "loop_counter" -> 3L, + "label" -> "outer", + "blob" -> Array[Byte](1, 2) + ) + ) + val decoded = State.fromTuple(original.toTuple) + assert(decoded.values("loop_counter") == 3L) + assert(decoded.values("label") == "outer") + assert( + decoded.values("blob").asInstanceOf[Array[Byte]].sameElements(Array[Byte](1, 2)) + ) + } + + 
it should "produce a tuple whose payload is the JSON serialization" in { + val tuple = State(Map("x" -> 1L)).toTuple + assert(tuple.getSchema == State.schema) + assert(tuple.getField[String]("content") == """{"x":1}""") + } + + it should "decode a payload encoded by the Python serializer" in { + // Wire-format compatibility check: the bytes-marker keys and the + // single-row "content" column must match what core/models/state.py + // emits, otherwise cross-language transport breaks. + val pythonEmitted = """{"i":2,"blob":{"__texera_type__":"bytes","payload":"AQID"}}""" + val decoded = State.fromJson(pythonEmitted) + assert(decoded.values("i") == 2L) + assert( + decoded.values("blob").asInstanceOf[Array[Byte]].sameElements(Array[Byte](1, 2, 3)) + ) + } +} From 12bb7e45fc349f60db27d3c448321a7b24367a41 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Thu, 30 Apr 2026 22:23:09 -0700 Subject: [PATCH 076/152] refactor: drop redundant JumpToOperator API in favor of JumpToOperatorRegion Both APIs invoked the same coordinator path. Removing JumpToOperatorRequest collapses the proto message, generated Python bindings, the JumpToOperatorHandler trait, the alias method on WorkflowExecutionCoordinator, and the loop-end caller in main_loop.py. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../architecture/rpc/controlcommands.proto | 5 --- .../architecture/rpc/controllerservice.proto | 1 - .../main/python/core/runnables/main_loop.py | 6 +-- .../amber/engine/architecture/rpc/__init__.py | 43 ------------------- ...ControllerAsyncRPCHandlerInitializer.scala | 1 - .../JumpToOperatorHandler.scala | 41 ------------------ .../WorkflowExecutionCoordinator.scala | 2 - 7 files changed, 3 insertions(+), 96 deletions(-) delete mode 100644 amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/JumpToOperatorHandler.scala diff --git a/amber/src/main/protobuf/org/apache/texera/amber/engine/architecture/rpc/controlcommands.proto b/amber/src/main/protobuf/org/apache/texera/amber/engine/architecture/rpc/controlcommands.proto index d78eea5d6f1..1f55927e4ae 100644 --- a/amber/src/main/protobuf/org/apache/texera/amber/engine/architecture/rpc/controlcommands.proto +++ b/amber/src/main/protobuf/org/apache/texera/amber/engine/architecture/rpc/controlcommands.proto @@ -46,7 +46,6 @@ message ControlRequest { LinkWorkersRequest linkWorkersRequest = 9; WorkflowReconfigureRequest workflowReconfigureRequest = 10; JumpToOperatorRegionRequest jumpToOperatorRegionRequest = 11; - JumpToOperatorRequest jumpToOperatorRequest = 12; // request for worker AddInputChannelRequest addInputChannelRequest = 50; @@ -279,7 +278,3 @@ message QueryStatisticsRequest{ message JumpToOperatorRegionRequest{ core.OperatorIdentity targetOperatorId = 1 [(scalapb.field).no_box = true]; } - -message JumpToOperatorRequest{ - core.OperatorIdentity targetOperatorId = 1 [(scalapb.field).no_box = true]; -} diff --git a/amber/src/main/protobuf/org/apache/texera/amber/engine/architecture/rpc/controllerservice.proto b/amber/src/main/protobuf/org/apache/texera/amber/engine/architecture/rpc/controllerservice.proto index 02382ce53ce..0932a7b914a 100644 --- 
a/amber/src/main/protobuf/org/apache/texera/amber/engine/architecture/rpc/controllerservice.proto +++ b/amber/src/main/protobuf/org/apache/texera/amber/engine/architecture/rpc/controllerservice.proto @@ -42,7 +42,6 @@ service ControllerService { rpc PauseWorkflow(EmptyRequest) returns (EmptyReturn); rpc WorkerStateUpdated(WorkerStateUpdatedRequest) returns (EmptyReturn); rpc WorkerExecutionCompleted(EmptyRequest) returns (EmptyReturn); - rpc JumpToOperator(JumpToOperatorRequest) returns (EmptyReturn); rpc JumpToOperatorRegion(JumpToOperatorRegionRequest) returns (EmptyReturn); rpc LinkWorkers(LinkWorkersRequest) returns (EmptyReturn); rpc ControllerInitiateQueryStatistics(QueryStatisticsRequest) returns (EmptyReturn); diff --git a/amber/src/main/python/core/runnables/main_loop.py b/amber/src/main/python/core/runnables/main_loop.py index ffedc8647e6..7b5aa2d51a1 100644 --- a/amber/src/main/python/core/runnables/main_loop.py +++ b/amber/src/main/python/core/runnables/main_loop.py @@ -64,7 +64,7 @@ EmbeddedControlMessage, AsyncRpcContext, ControlRequest, - JumpToOperatorRequest, + JumpToOperatorRegionRequest, ) from proto.org.apache.texera.amber.engine.architecture.worker import ( WorkerState, @@ -104,8 +104,8 @@ def _attach_loop_start_id(self, output_state: State) -> None: def _jump_to_loop_start( self, executor: LoopEndOperator, controller_interface ) -> None: - controller_interface.jump_to_operator( - JumpToOperatorRequest(OperatorIdentity(executor.loop_start_id())) + controller_interface.jump_to_operator_region( + JumpToOperatorRegionRequest(OperatorIdentity(executor.loop_start_id())) ) uri = executor.state["LoopStartStateURI"] del executor.state["LoopStartStateURI"] diff --git a/amber/src/main/python/proto/org/apache/texera/amber/engine/architecture/rpc/__init__.py b/amber/src/main/python/proto/org/apache/texera/amber/engine/architecture/rpc/__init__.py index 2adf0b63492..2bad2b0bfbc 100644 --- 
a/amber/src/main/python/proto/org/apache/texera/amber/engine/architecture/rpc/__init__.py +++ b/amber/src/main/python/proto/org/apache/texera/amber/engine/architecture/rpc/__init__.py @@ -104,9 +104,6 @@ class ControlRequest(betterproto.Message): jump_to_operator_region_request: "JumpToOperatorRegionRequest" = betterproto.message_field( 11, group="sealed_value" ) - jump_to_operator_request: "JumpToOperatorRequest" = betterproto.message_field( - 12, group="sealed_value" - ) add_input_channel_request: "AddInputChannelRequest" = betterproto.message_field( 50, group="sealed_value" ) @@ -396,11 +393,6 @@ class JumpToOperatorRegionRequest(betterproto.Message): target_operator_id: "___core__.OperatorIdentity" = betterproto.message_field(1) -@dataclass(eq=False, repr=False) -class JumpToOperatorRequest(betterproto.Message): - target_operator_id: "___core__.OperatorIdentity" = betterproto.message_field(1) - - @dataclass(eq=False, repr=False) class ControlReturn(betterproto.Message): """The generic return message""" @@ -1267,23 +1259,6 @@ async def worker_execution_completed( metadata=metadata, ) - async def jump_to_operator( - self, - jump_to_operator_request: "JumpToOperatorRequest", - *, - timeout: Optional[float] = None, - deadline: Optional["Deadline"] = None, - metadata: Optional["MetadataLike"] = None - ) -> "EmptyReturn": - return await self._unary_unary( - "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/JumpToOperator", - jump_to_operator_request, - EmptyReturn, - timeout=timeout, - deadline=deadline, - metadata=metadata, - ) - async def jump_to_operator_region( self, jump_to_operator_region_request: "JumpToOperatorRegionRequest", @@ -1973,11 +1948,6 @@ async def worker_execution_completed( ) -> "EmptyReturn": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async def jump_to_operator( - self, jump_to_operator_request: "JumpToOperatorRequest" - ) -> "EmptyReturn": - raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) - async 
def jump_to_operator_region( self, jump_to_operator_region_request: "JumpToOperatorRegionRequest" ) -> "EmptyReturn": @@ -2092,13 +2062,6 @@ async def __rpc_worker_execution_completed( response = await self.worker_execution_completed(request) await stream.send_message(response) - async def __rpc_jump_to_operator( - self, stream: "grpclib.server.Stream[JumpToOperatorRequest, EmptyReturn]" - ) -> None: - request = await stream.recv_message() - response = await self.jump_to_operator(request) - await stream.send_message(response) - async def __rpc_jump_to_operator_region( self, stream: "grpclib.server.Stream[JumpToOperatorRegionRequest, EmptyReturn]" ) -> None: @@ -2208,12 +2171,6 @@ def __mapping__(self) -> Dict[str, grpclib.const.Handler]: EmptyRequest, EmptyReturn, ), - "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/JumpToOperator": grpclib.const.Handler( - self.__rpc_jump_to_operator, - grpclib.const.Cardinality.UNARY_UNARY, - JumpToOperatorRequest, - EmptyReturn, - ), "/org.apache.texera.amber.engine.architecture.rpc.ControllerService/JumpToOperatorRegion": grpclib.const.Handler( self.__rpc_jump_to_operator_region, grpclib.const.Cardinality.UNARY_UNARY, diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerAsyncRPCHandlerInitializer.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerAsyncRPCHandlerInitializer.scala index c79db7669fd..7e5a904716c 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerAsyncRPCHandlerInitializer.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerAsyncRPCHandlerInitializer.scala @@ -34,7 +34,6 @@ class ControllerAsyncRPCHandlerInitializer( with AmberLogging with LinkWorkersHandler with WorkerExecutionCompletedHandler - with JumpToOperatorHandler with JumpToOperatorRegionHandler with WorkerStateUpdatedHandler with PauseHandler diff --git 
a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/JumpToOperatorHandler.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/JumpToOperatorHandler.scala deleted file mode 100644 index aad72f08e90..00000000000 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/JumpToOperatorHandler.scala +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.texera.amber.engine.architecture.controller.promisehandlers - -import com.twitter.util.Future -import org.apache.texera.amber.engine.architecture.controller.ControllerAsyncRPCHandlerInitializer -import org.apache.texera.amber.engine.architecture.rpc.controlcommands.{ - AsyncRPCContext, - JumpToOperatorRequest -} -import org.apache.texera.amber.engine.architecture.rpc.controlreturns.EmptyReturn - -/** Requests the scheduler to continue from the region containing the target operator. 
*/ -trait JumpToOperatorHandler { - this: ControllerAsyncRPCHandlerInitializer => - - override def jumpToOperator( - msg: JumpToOperatorRequest, - ctx: AsyncRPCContext - ): Future[EmptyReturn] = { - cp.workflowExecutionCoordinator.jumpToOperator(msg.targetOperatorId) - EmptyReturn() - } -} diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala index 34fe0c88875..a7bfe40bf47 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala @@ -154,6 +154,4 @@ class WorkflowExecutionCoordinator( } } - def jumpToOperator(opId: OperatorIdentity): Unit = jumpToRegionContainingOperator(opId) - } From c6aba8b5eb5d5dcf28274a9565e00c297cd9d917 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Thu, 30 Apr 2026 22:36:26 -0700 Subject: [PATCH 077/152] add test cases --- amber/src/main/python/core/runnables/main_loop.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/amber/src/main/python/core/runnables/main_loop.py b/amber/src/main/python/core/runnables/main_loop.py index 7b5aa2d51a1..56bbf887a76 100644 --- a/amber/src/main/python/core/runnables/main_loop.py +++ b/amber/src/main/python/core/runnables/main_loop.py @@ -220,11 +220,6 @@ def process_input_tuple(self) -> None: ) def process_input_state(self) -> None: - # Single switch handshake: DataProc parks at the run-loop's - # end-of-body switch (line 65) between tasks, so one switch from - # MainLoop drives a full pick-up -> executor -> output -> park-back - # cycle. By the time the switch returns, current_output_state holds - # the freshly produced output. 
self._switch_context() output_state = self.context.state_processing_manager.get_output_state() if output_state is not None: From 481d7574b7b32955e976f1f0e0d560f7c4dead41 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Thu, 30 Apr 2026 23:16:35 -0700 Subject: [PATCH 078/152] add test cases --- .../main/scala/org/apache/texera/amber/core/state/State.scala | 4 ---- 1 file changed, 4 deletions(-) diff --git a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/State.scala b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/State.scala index 5bbdedbca81..532f355c17e 100644 --- a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/State.scala +++ b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/State.scala @@ -29,10 +29,6 @@ import scala.jdk.CollectionConverters.IteratorHasAsScala final case class State(values: Map[String, Any]) { - def apply(key: String): Any = values(key) - - def get(key: String): Option[Any] = values.get(key) - def toJson: String = objectMapper.writeValueAsString(State.toJsonValue(values)) From 1b7460d84b1c049136d61a1b1cc6872ae8a32092 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Thu, 30 Apr 2026 23:27:55 -0700 Subject: [PATCH 079/152] update --- amber/src/main/python/core/storage/document_factory.py | 4 ++-- .../python/core/storage/iceberg/test_iceberg_document.py | 1 + amber/src/main/python/core/storage/storage_config.py | 3 +++ .../python/pytexera/storage/test_large_binary_manager.py | 1 + amber/src/main/python/texera_run_python_worker.py | 2 ++ .../architecture/pythonworker/PythonWorkflowWorker.scala | 1 + .../architecture/scheduling/RegionExecutionCoordinator.scala | 5 ----- common/config/src/main/resources/storage.conf | 3 +++ .../apache/texera/amber/config/EnvironmentalVariable.scala | 1 + .../scala/org/apache/texera/amber/config/StorageConfig.scala | 3 +++ .../apache/texera/amber/core/storage/DocumentFactory.scala | 4 ++-- 11 files changed, 19 insertions(+), 
9 deletions(-) diff --git a/amber/src/main/python/core/storage/document_factory.py b/amber/src/main/python/core/storage/document_factory.py index 8a4d6fe3c5f..bd690ceb592 100644 --- a/amber/src/main/python/core/storage/document_factory.py +++ b/amber/src/main/python/core/storage/document_factory.py @@ -65,7 +65,7 @@ def create_document(uri: str, schema: Schema) -> VirtualDocument: case VFSResourceType.RESULT: namespace = StorageConfig.ICEBERG_TABLE_RESULT_NAMESPACE case VFSResourceType.STATE: - namespace = "state" + namespace = StorageConfig.ICEBERG_TABLE_STATE_NAMESPACE case _: raise ValueError(f"Resource type {resource_type} is not supported") @@ -105,7 +105,7 @@ def open_document(uri: str) -> typing.Tuple[VirtualDocument, Optional[Schema]]: case VFSResourceType.RESULT: namespace = StorageConfig.ICEBERG_TABLE_RESULT_NAMESPACE case VFSResourceType.STATE: - namespace = "state" + namespace = StorageConfig.ICEBERG_TABLE_STATE_NAMESPACE case _: raise ValueError(f"Resource type {resource_type} is not supported") diff --git a/amber/src/main/python/core/storage/iceberg/test_iceberg_document.py b/amber/src/main/python/core/storage/iceberg/test_iceberg_document.py index be8dd5d231d..032376ae314 100644 --- a/amber/src/main/python/core/storage/iceberg/test_iceberg_document.py +++ b/amber/src/main/python/core/storage/iceberg/test_iceberg_document.py @@ -45,6 +45,7 @@ rest_catalog_uri="http://localhost:8181/catalog/", rest_catalog_warehouse_name="texera", table_result_namespace="operator-port-result", + table_state_namespace="operator-port-state", directory_path="../../../../../../amber/user-resources/workflow-results", commit_batch_size=4096, s3_endpoint="http://localhost:9000", diff --git a/amber/src/main/python/core/storage/storage_config.py b/amber/src/main/python/core/storage/storage_config.py index 0e47bdb71ae..82335909874 100644 --- a/amber/src/main/python/core/storage/storage_config.py +++ b/amber/src/main/python/core/storage/storage_config.py @@ -32,6 +32,7 @@ class 
StorageConfig: ICEBERG_REST_CATALOG_URI = None ICEBERG_REST_CATALOG_WAREHOUSE_NAME = None ICEBERG_TABLE_RESULT_NAMESPACE = None + ICEBERG_TABLE_STATE_NAMESPACE = None ICEBERG_FILE_STORAGE_DIRECTORY_PATH = None ICEBERG_TABLE_COMMIT_BATCH_SIZE = None @@ -51,6 +52,7 @@ def initialize( rest_catalog_uri, rest_catalog_warehouse_name, table_result_namespace, + table_state_namespace, directory_path, commit_batch_size, s3_endpoint, @@ -71,6 +73,7 @@ def initialize( cls.ICEBERG_REST_CATALOG_WAREHOUSE_NAME = rest_catalog_warehouse_name cls.ICEBERG_TABLE_RESULT_NAMESPACE = table_result_namespace + cls.ICEBERG_TABLE_STATE_NAMESPACE = table_state_namespace cls.ICEBERG_FILE_STORAGE_DIRECTORY_PATH = directory_path cls.ICEBERG_TABLE_COMMIT_BATCH_SIZE = int(commit_batch_size) diff --git a/amber/src/main/python/pytexera/storage/test_large_binary_manager.py b/amber/src/main/python/pytexera/storage/test_large_binary_manager.py index 64c7080e520..1942e91f8bc 100644 --- a/amber/src/main/python/pytexera/storage/test_large_binary_manager.py +++ b/amber/src/main/python/pytexera/storage/test_large_binary_manager.py @@ -34,6 +34,7 @@ def setup_storage_config(self): rest_catalog_uri="http://localhost:8181/catalog/", rest_catalog_warehouse_name="texera", table_result_namespace="test", + table_state_namespace="test-state", directory_path="/tmp/test", commit_batch_size=1000, s3_endpoint="http://localhost:9000", diff --git a/amber/src/main/python/texera_run_python_worker.py b/amber/src/main/python/texera_run_python_worker.py index 8687298f819..9b21fa53343 100644 --- a/amber/src/main/python/texera_run_python_worker.py +++ b/amber/src/main/python/texera_run_python_worker.py @@ -52,6 +52,7 @@ def init_loguru_logger(stream_log_level) -> None: iceberg_rest_catalog_uri, iceberg_rest_catalog_warehouse_name, iceberg_table_namespace, + iceberg_table_state_namespace, iceberg_file_storage_directory_path, iceberg_table_commit_batch_size, s3_endpoint, @@ -68,6 +69,7 @@ def init_loguru_logger(stream_log_level) 
-> None: iceberg_rest_catalog_uri, iceberg_rest_catalog_warehouse_name, iceberg_table_namespace, + iceberg_table_state_namespace, iceberg_file_storage_directory_path, iceberg_table_commit_batch_size, s3_endpoint, diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonWorkflowWorker.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonWorkflowWorker.scala index 32e417f3c07..48c632022d1 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonWorkflowWorker.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonWorkflowWorker.scala @@ -188,6 +188,7 @@ class PythonWorkflowWorker( if (isRest) StorageConfig.icebergRESTCatalogUri else "", if (isRest) StorageConfig.icebergRESTCatalogWarehouseName else "", StorageConfig.icebergTableResultNamespace, + StorageConfig.icebergTableStateNamespace, StorageConfig.fileStorageDirectoryPath.toString, StorageConfig.icebergTableCommitBatchSize.toString, StorageConfig.s3Endpoint, diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala index 975d82ef42f..e452dc79aa8 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala @@ -577,11 +577,6 @@ class RegionExecutionCoordinator( schemaOptional.getOrElse(throw new IllegalStateException("Schema is missing")) DocumentFactory.createDocument(storageUriToAdd, schema) DocumentFactory.createDocument(stateUriToAdd, State.schema) - WorkflowExecutionsResource.insertOperatorPortResultUri( - eid = eid, - globalPortId = outputPortId, - uri = storageUriToAdd - ) if (!isRestart) { 
WorkflowExecutionsResource.insertOperatorPortResultUri( eid = eid, diff --git a/common/config/src/main/resources/storage.conf b/common/config/src/main/resources/storage.conf index 1f39359155c..da2f7ccc198 100644 --- a/common/config/src/main/resources/storage.conf +++ b/common/config/src/main/resources/storage.conf @@ -61,6 +61,9 @@ storage { runtime-statistics-namespace = "workflow-runtime-statistics" runtime-statistics-namespace = ${?STORAGE_ICEBERG_TABLE_RUNTIME_STATISTICS_NAMESPACE} + state-namespace = "operator-port-state" + state-namespace = ${?STORAGE_ICEBERG_TABLE_STATE_NAMESPACE} + commit { batch-size = 4096 # decide the buffer size of our IcebergTableWriter batch-size = ${?STORAGE_ICEBERG_TABLE_COMMIT_BATCH_SIZE} diff --git a/common/config/src/main/scala/org/apache/texera/amber/config/EnvironmentalVariable.scala b/common/config/src/main/scala/org/apache/texera/amber/config/EnvironmentalVariable.scala index 9ec52bba653..123c56505ee 100644 --- a/common/config/src/main/scala/org/apache/texera/amber/config/EnvironmentalVariable.scala +++ b/common/config/src/main/scala/org/apache/texera/amber/config/EnvironmentalVariable.scala @@ -67,6 +67,7 @@ object EnvironmentalVariable { "STORAGE_ICEBERG_TABLE_CONSOLE_MESSAGES_NAMESPACE" val ENV_ICEBERG_TABLE_RUNTIME_STATISTICS_NAMESPACE = "STORAGE_ICEBERG_TABLE_RUNTIME_STATISTICS_NAMESPACE" + val ENV_ICEBERG_TABLE_STATE_NAMESPACE = "STORAGE_ICEBERG_TABLE_STATE_NAMESPACE" val ENV_ICEBERG_TABLE_COMMIT_BATCH_SIZE = "STORAGE_ICEBERG_TABLE_COMMIT_BATCH_SIZE" val ENV_ICEBERG_TABLE_COMMIT_NUM_RETRIES = "STORAGE_ICEBERG_TABLE_COMMIT_NUM_RETRIES" val ENV_ICEBERG_TABLE_COMMIT_MIN_WAIT_MS = "STORAGE_ICEBERG_TABLE_COMMIT_MIN_WAIT_MS" diff --git a/common/config/src/main/scala/org/apache/texera/amber/config/StorageConfig.scala b/common/config/src/main/scala/org/apache/texera/amber/config/StorageConfig.scala index 728e3c0c2de..07447cfdbee 100644 --- a/common/config/src/main/scala/org/apache/texera/amber/config/StorageConfig.scala +++ 
b/common/config/src/main/scala/org/apache/texera/amber/config/StorageConfig.scala @@ -54,6 +54,8 @@ object StorageConfig { conf.getString("storage.iceberg.table.console-messages-namespace") val icebergTableRuntimeStatisticsNamespace: String = conf.getString("storage.iceberg.table.runtime-statistics-namespace") + val icebergTableStateNamespace: String = + conf.getString("storage.iceberg.table.state-namespace") val icebergTableCommitBatchSize: Int = conf.getInt("storage.iceberg.table.commit.batch-size") val icebergTableCommitNumRetries: Int = @@ -111,6 +113,7 @@ object StorageConfig { "STORAGE_ICEBERG_TABLE_CONSOLE_MESSAGES_NAMESPACE" val ENV_ICEBERG_TABLE_RUNTIME_STATISTICS_NAMESPACE = "STORAGE_ICEBERG_TABLE_RUNTIME_STATISTICS_NAMESPACE" + val ENV_ICEBERG_TABLE_STATE_NAMESPACE = "STORAGE_ICEBERG_TABLE_STATE_NAMESPACE" val ENV_ICEBERG_TABLE_COMMIT_BATCH_SIZE = "STORAGE_ICEBERG_TABLE_COMMIT_BATCH_SIZE" val ENV_ICEBERG_TABLE_COMMIT_NUM_RETRIES = "STORAGE_ICEBERG_TABLE_COMMIT_NUM_RETRIES" val ENV_ICEBERG_TABLE_COMMIT_MIN_WAIT_MS = "STORAGE_ICEBERG_TABLE_COMMIT_MIN_WAIT_MS" diff --git a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/DocumentFactory.scala b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/DocumentFactory.scala index ae37def667e..00f6c70ba73 100644 --- a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/DocumentFactory.scala +++ b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/DocumentFactory.scala @@ -72,7 +72,7 @@ object DocumentFactory { case RESULT => StorageConfig.icebergTableResultNamespace case CONSOLE_MESSAGES => StorageConfig.icebergTableConsoleMessagesNamespace case RUNTIME_STATISTICS => StorageConfig.icebergTableRuntimeStatisticsNamespace - case STATE => "state" + case STATE => StorageConfig.icebergTableStateNamespace case _ => throw new IllegalArgumentException(s"Resource type $resourceType is not supported") } @@ -120,7 +120,7 @@ object 
DocumentFactory { case RESULT => StorageConfig.icebergTableResultNamespace case CONSOLE_MESSAGES => StorageConfig.icebergTableConsoleMessagesNamespace case RUNTIME_STATISTICS => StorageConfig.icebergTableRuntimeStatisticsNamespace - case STATE => "state" + case STATE => StorageConfig.icebergTableStateNamespace case _ => throw new IllegalArgumentException(s"Resource type $resourceType is not supported") } From 47252b3cf9d7c9130696b6534565b326e60eca44 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Fri, 1 May 2026 00:01:16 -0700 Subject: [PATCH 080/152] update --- .../architecture/scheduling/RegionExecutionCoordinator.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala index e452dc79aa8..58fdf9f2428 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala @@ -21,6 +21,7 @@ package org.apache.texera.amber.engine.architecture.scheduling import org.apache.pekko.pattern.gracefulStop import com.twitter.util.{Duration => TwitterDuration, Future, JavaTimer, Return, Throw, Timer} +import org.apache.texera.amber.core.state.State import org.apache.texera.amber.core.storage.DocumentFactory import org.apache.texera.amber.core.storage.VFSURIFactory.decodeURI import org.apache.texera.amber.core.virtualidentity.ActorVirtualIdentity From df5d347e6b15055daf1f5c21ba72651d0d8d00b3 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Fri, 1 May 2026 00:11:23 -0700 Subject: [PATCH 081/152] update --- .../messaginglayer/OutputManager.scala | 21 +++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git 
a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/messaginglayer/OutputManager.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/messaginglayer/OutputManager.scala index 2af1ccce2c0..3b8caa4d671 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/messaginglayer/OutputManager.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/messaginglayer/OutputManager.scala @@ -124,7 +124,7 @@ class OutputManager( : mutable.HashMap[PortIdentity, OutputPortResultWriterThread] = mutable.HashMap() - private val storageUris: mutable.HashMap[Int, URI] = mutable.HashMap() + private val storageUris: mutable.ArrayBuffer[URI] = mutable.ArrayBuffer() /** * Add down stream operator and its corresponding Partitioner. @@ -236,15 +236,14 @@ class OutputManager( def saveStateToStorageIfNeeded(state: State): Unit = { try { - storageUris.foreach { - case (_, uri) => - val writer = DocumentFactory - .openDocument(State.uriFromResultUri(uri)) - ._1 - .writer(VirtualIdentityUtils.getWorkerIndex(actorId).toString) - .asInstanceOf[BufferedItemWriter[Tuple]] - writer.putOne(state.toTuple) - writer.close() + storageUris.foreach { uri => + val writer = DocumentFactory + .openDocument(State.uriFromResultUri(uri)) + ._1 + .writer(VirtualIdentityUtils.getWorkerIndex(actorId).toString) + .asInstanceOf[BufferedItemWriter[Tuple]] + writer.putOne(state.toTuple) + writer.close() } } catch { case _: Exception => () @@ -299,7 +298,7 @@ class OutputManager( } private def setupOutputStorageWriterThread(portId: PortIdentity, storageUri: URI): Unit = { - this.storageUris(portId.id) = storageUri + this.storageUris += storageUri val bufferedItemWriter = DocumentFactory .openDocument(storageUri) ._1 From e906f5a9c624ebfbbcbcd60653003a372821c3c2 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Fri, 1 May 2026 00:16:01 -0700 Subject: [PATCH 082/152] update --- .../python/core/architecture/packaging/output_manager.py | 7 ------- 
.../input_port_materialization_reader_runnable.py | 2 +- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/amber/src/main/python/core/architecture/packaging/output_manager.py b/amber/src/main/python/core/architecture/packaging/output_manager.py index 371df7cc30f..112c649c6a0 100644 --- a/amber/src/main/python/core/architecture/packaging/output_manager.py +++ b/amber/src/main/python/core/architecture/packaging/output_manager.py @@ -193,13 +193,6 @@ def save_state_to_storage_if_needed(self, state: State, port_id=None) -> None: writer.put_one(state.to_tuple()) writer.close() - def reset_output_storage(self) -> None: - port_id = self.get_port_ids()[0] - storage_uri = self._storage_uris[port_id] - self.close_port_storage_writers() - DocumentFactory.create_document(storage_uri, self._ports[port_id].get_schema()) - self.set_up_port_storage_writer(port_id, storage_uri) - def close_port_storage_writers(self) -> None: """ Flush the buffers of port storage writers and wait for all the diff --git a/amber/src/main/python/core/storage/runnables/input_port_materialization_reader_runnable.py b/amber/src/main/python/core/storage/runnables/input_port_materialization_reader_runnable.py index 6b077e78f75..22b9bce51a7 100644 --- a/amber/src/main/python/core/storage/runnables/input_port_materialization_reader_runnable.py +++ b/amber/src/main/python/core/storage/runnables/input_port_materialization_reader_runnable.py @@ -130,7 +130,7 @@ def emit_state_with_filter(self, state: State) -> typing.Iterator[StateFrame]: if receiver == self.worker_actor_id: yield ( StateFrame(payload) - if isinstance(payload, dict) + if isinstance(payload, State) else self.tuples_to_data_frame(payload) ) From 0b761b363e4da7c9d89ec320b6d27a71c2446281 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Fri, 1 May 2026 00:19:52 -0700 Subject: [PATCH 083/152] update --- .../python/core/architecture/packaging/output_manager.py | 7 +++++++ amber/src/main/python/core/runnables/main_loop.py | 2 +- 2 files 
changed, 8 insertions(+), 1 deletion(-) diff --git a/amber/src/main/python/core/architecture/packaging/output_manager.py b/amber/src/main/python/core/architecture/packaging/output_manager.py index 112c649c6a0..f68bffb0156 100644 --- a/amber/src/main/python/core/architecture/packaging/output_manager.py +++ b/amber/src/main/python/core/architecture/packaging/output_manager.py @@ -193,6 +193,13 @@ def save_state_to_storage_if_needed(self, state: State, port_id=None) -> None: writer.put_one(state.to_tuple()) writer.close() + def reset_loopend_storage(self) -> None: + port_id = self.get_port_ids()[0] + storage_uri = self._storage_uris[port_id] + self.close_port_storage_writers() + DocumentFactory.create_document(storage_uri, self._ports[port_id].get_schema()) + self.set_up_port_storage_writer(port_id, storage_uri) + def close_port_storage_writers(self) -> None: """ Flush the buffers of port storage writers and wait for all the diff --git a/amber/src/main/python/core/runnables/main_loop.py b/amber/src/main/python/core/runnables/main_loop.py index 56bbf887a76..20af171f755 100644 --- a/amber/src/main/python/core/runnables/main_loop.py +++ b/amber/src/main/python/core/runnables/main_loop.py @@ -224,7 +224,7 @@ def process_input_state(self) -> None: output_state = self.context.state_processing_manager.get_output_state() if output_state is not None: if isinstance(self.context.executor_manager.executor, LoopEndOperator): - self.context.output_manager.reset_output_storage() + self.context.output_manager.reset_loopend_storage() if isinstance(self.context.executor_manager.executor, LoopStartOperator): self._attach_loop_start_id(output_state) for to, batch in self.context.output_manager.emit_state(output_state): From d2f9c0e494290c80db3d1cc953ff6dec0851cf13 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Fri, 1 May 2026 15:40:25 -0700 Subject: [PATCH 084/152] refactor(amber): move schedule-rewrite logic out of Schedule Drop `Schedule.rewriteExecutionFrom`. 
The coordinator now constructs the rewritten Schedule inline in `jumpToRegionContainingOperator`, keeping Schedule a passive data container with no factory methods that produce new Schedules. Schedule changes: - `levelSets`, `baseLevels`, `effectiveExecutionLevels` are now public vals (previously private). - `currentLevelIndex` exposed as a read-only accessor. - New `initialLevelIndex` constructor parameter lets callers seed the iteration cursor when reconstructing. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../architecture/scheduling/Schedule.scala | 31 ++++++++----------- .../WorkflowExecutionCoordinator.scala | 5 ++- 2 files changed, 17 insertions(+), 19 deletions(-) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala index 5c3346947da..78bab08ac3c 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala @@ -22,45 +22,40 @@ package org.apache.texera.amber.engine.architecture.scheduling import org.apache.texera.amber.core.virtualidentity.OperatorIdentity case class Schedule( - private val levelSets: Map[Int, Set[Region]], - executionLevels: Vector[Int] = Vector.empty + levelSets: Map[Int, Set[Region]], + executionLevels: Vector[Int] = Vector.empty, + initialLevelIndex: Int = 0 ) extends Iterator[Set[Region]] { require( levelSets.keys.toSet == (0 until levelSets.size).toSet, s"Schedule level keys must be contiguous starting at 0, got: ${levelSets.keys.toSeq.sorted}" ) - private val baseLevels = levelSets.keys.toVector.sorted - private val normalizedExecutionLevels = + val baseLevels: Vector[Int] = levelSets.keys.toVector.sorted + val effectiveExecutionLevels: Vector[Int] = if (executionLevels.nonEmpty || baseLevels.isEmpty) executionLevels else baseLevels + private val 
operatorLevelIndices = levelSets.iterator.flatMap { case (level, regions) => val levelIndex = baseLevels.indexOf(level) regions.iterator.flatMap(region => region.getOperators.map(_.id.logicalOpId -> levelIndex)) }.toMap - private var currentLevelIndex = 0 + + private var _currentLevelIndex: Int = initialLevelIndex + def currentLevelIndex: Int = _currentLevelIndex def getRegions: List[Region] = levelSets.values.flatten.toList def getLevelIndexOfOperator(opId: OperatorIdentity): Option[Int] = operatorLevelIndices.get(opId) - def rewriteExecutionFrom(levelIndex: Int): Schedule = { - val rewrittenSchedule = copy( - executionLevels = - normalizedExecutionLevels.take(currentLevelIndex) ++ baseLevels.drop(levelIndex) - ) - rewrittenSchedule.currentLevelIndex = currentLevelIndex - rewrittenSchedule - } - - override def hasNext: Boolean = currentLevelIndex < normalizedExecutionLevels.length + override def hasNext: Boolean = _currentLevelIndex < effectiveExecutionLevels.length override def next(): Set[Region] = { - val regions = normalizedExecutionLevels - .lift(currentLevelIndex) + val regions = effectiveExecutionLevels + .lift(_currentLevelIndex) .flatMap(levelSets.get) .getOrElse(Set.empty) - currentLevelIndex += 1 + _currentLevelIndex += 1 regions } } diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala index a7bfe40bf47..c9e3c6cb7e3 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala @@ -150,7 +150,10 @@ class WorkflowExecutionCoordinator( def jumpToRegionContainingOperator(opId: OperatorIdentity): Unit = { schedule.getLevelIndexOfOperator(opId).foreach { levelIndex => - schedule = 
schedule.rewriteExecutionFrom(levelIndex) + val rewrittenLevels = + schedule.effectiveExecutionLevels.take(schedule.currentLevelIndex) ++ + schedule.baseLevels.drop(levelIndex) + schedule = Schedule(schedule.levelSets, rewrittenLevels, schedule.currentLevelIndex) } } From 837323a3327ffef9e62ab0ce78e7714de6fed985 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Fri, 1 May 2026 16:10:31 -0700 Subject: [PATCH 085/152] refactor(amber): use Schedule.copy in jumpToRegionContainingOperator Reconstruct via the case class's auto-generated `copy(...)` instead of the full constructor call, passing `initialLevelIndex = schedule.currentLevelIndex` to preserve the iteration cursor across the rewrite. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../scheduling/WorkflowExecutionCoordinator.scala | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala index c9e3c6cb7e3..5d8c2e46ba8 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala @@ -153,7 +153,10 @@ class WorkflowExecutionCoordinator( val rewrittenLevels = schedule.effectiveExecutionLevels.take(schedule.currentLevelIndex) ++ schedule.baseLevels.drop(levelIndex) - schedule = Schedule(schedule.levelSets, rewrittenLevels, schedule.currentLevelIndex) + schedule = schedule.copy( + executionLevels = rewrittenLevels, + initialLevelIndex = schedule.currentLevelIndex + ) } } From e1c6433fad1de9d2dd72e0986588552d7102fa4f Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Fri, 1 May 2026 16:58:12 -0700 Subject: [PATCH 086/152] refactor(amber): simplify jumpToRegionContainingOperator to a cursor reset MIME-Version: 1.0 
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `Schedule` always iterates through `baseLevels` (level keys 0..N-1) by construction, so jumping to a region just means resetting the cursor to that level — no need to rewrite `executionLevels`. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../scheduling/WorkflowExecutionCoordinator.scala | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala index 5d8c2e46ba8..fbdb0bb7cd9 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala @@ -150,13 +150,7 @@ class WorkflowExecutionCoordinator( def jumpToRegionContainingOperator(opId: OperatorIdentity): Unit = { schedule.getLevelIndexOfOperator(opId).foreach { levelIndex => - val rewrittenLevels = - schedule.effectiveExecutionLevels.take(schedule.currentLevelIndex) ++ - schedule.baseLevels.drop(levelIndex) - schedule = schedule.copy( - executionLevels = rewrittenLevels, - initialLevelIndex = schedule.currentLevelIndex - ) + schedule = schedule.copy(initialLevelIndex = levelIndex) } } From b89de48b43654d6faa7d02ba18e52bb4469499fb Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Fri, 1 May 2026 17:04:17 -0700 Subject: [PATCH 087/152] refactor(amber): slim down Schedule to data + cursor With the contiguous-from-0 invariant, level keys are 0..N-1 by construction, so the indirection through `executionLevels` / `effectiveExecutionLevels` / `baseLevels` is no longer needed: - Iteration walks `levelSets` directly via the cursor. 
- `operatorLevelIndices` maps operator id -> level (which is also the level index, since keys are contiguous from 0). Re-narrow accessors that no longer have external readers: - `levelSets`, `currentLevelIndex` are private again. - `executionLevels` / `effectiveExecutionLevels` / `baseLevels` removed. Only `initialLevelIndex` remains externally addressable, used by coordinator's `schedule.copy(initialLevelIndex = ...)` cursor reset. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../architecture/scheduling/Schedule.scala | 29 +++++++------------ 1 file changed, 10 insertions(+), 19 deletions(-) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala index 78bab08ac3c..be0acba4849 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala @@ -22,8 +22,7 @@ package org.apache.texera.amber.engine.architecture.scheduling import org.apache.texera.amber.core.virtualidentity.OperatorIdentity case class Schedule( - levelSets: Map[Int, Set[Region]], - executionLevels: Vector[Int] = Vector.empty, + private val levelSets: Map[Int, Set[Region]], initialLevelIndex: Int = 0 ) extends Iterator[Set[Region]] { require( @@ -31,31 +30,23 @@ case class Schedule( s"Schedule level keys must be contiguous starting at 0, got: ${levelSets.keys.toSeq.sorted}" ) - val baseLevels: Vector[Int] = levelSets.keys.toVector.sorted - val effectiveExecutionLevels: Vector[Int] = - if (executionLevels.nonEmpty || baseLevels.isEmpty) executionLevels else baseLevels + private val operatorLevelIndices: Map[OperatorIdentity, Int] = + levelSets.iterator.flatMap { + case (level, regions) => + regions.iterator.flatMap(region => region.getOperators.map(_.id.logicalOpId -> level)) + }.toMap - private val operatorLevelIndices = 
levelSets.iterator.flatMap { - case (level, regions) => - val levelIndex = baseLevels.indexOf(level) - regions.iterator.flatMap(region => region.getOperators.map(_.id.logicalOpId -> levelIndex)) - }.toMap - - private var _currentLevelIndex: Int = initialLevelIndex - def currentLevelIndex: Int = _currentLevelIndex + private var currentLevelIndex: Int = initialLevelIndex def getRegions: List[Region] = levelSets.values.flatten.toList def getLevelIndexOfOperator(opId: OperatorIdentity): Option[Int] = operatorLevelIndices.get(opId) - override def hasNext: Boolean = _currentLevelIndex < effectiveExecutionLevels.length + override def hasNext: Boolean = currentLevelIndex < levelSets.size override def next(): Set[Region] = { - val regions = effectiveExecutionLevels - .lift(_currentLevelIndex) - .flatMap(levelSets.get) - .getOrElse(Set.empty) - _currentLevelIndex += 1 + val regions = levelSets.getOrElse(currentLevelIndex, Set.empty) + currentLevelIndex += 1 regions } } From 0f55e3705eec931c661dbae39afc2d0ece01cdb6 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Fri, 1 May 2026 17:09:46 -0700 Subject: [PATCH 088/152] refactor(amber): rename Schedule cursor to currentLevel Drop the redundant `Index` suffix from the cursor field. Level keys are 0..N-1 by construction, so the cursor value is the level itself. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../amber/engine/architecture/scheduling/Schedule.scala | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala index be0acba4849..909d96ef60f 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala @@ -36,17 +36,17 @@ case class Schedule( regions.iterator.flatMap(region => region.getOperators.map(_.id.logicalOpId -> level)) }.toMap - private var currentLevelIndex: Int = initialLevelIndex + private var currentLevel: Int = initialLevelIndex def getRegions: List[Region] = levelSets.values.flatten.toList def getLevelIndexOfOperator(opId: OperatorIdentity): Option[Int] = operatorLevelIndices.get(opId) - override def hasNext: Boolean = currentLevelIndex < levelSets.size + override def hasNext: Boolean = currentLevel < levelSets.size override def next(): Set[Region] = { - val regions = levelSets.getOrElse(currentLevelIndex, Set.empty) - currentLevelIndex += 1 + val regions = levelSets.getOrElse(currentLevel, Set.empty) + currentLevel += 1 regions } } From b62d7bd00bf988f80e8fca03c1a10ff0c3768d3b Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Fri, 1 May 2026 17:10:58 -0700 Subject: [PATCH 089/152] refactor(amber): rename pullNextRegions to getNextRegions Aligns the coordinator's region-fetch method name with the existing `WorkflowScheduler.getNextRegions` convention. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../WorkflowExecutionCoordinator.scala | 4 +- .../WorkflowExecutionCoordinatorSpec.scala | 44 +++++++++---------- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala index fbdb0bb7cd9..3a8c159ac29 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala @@ -61,7 +61,7 @@ class WorkflowExecutionCoordinator( schedule = newSchedule } - private[scheduling] def pullNextRegions: Set[Region] = { + private[scheduling] def getNextRegions: Set[Region] = { if (!schedule.hasNext) Set() else schedule.next() } @@ -94,7 +94,7 @@ class WorkflowExecutionCoordinator( } // All existing regions are completed. Start the next region (if any). 
- val nextRegions = pullNextRegions + val nextRegions = getNextRegions if (nextRegions.isEmpty) { if (workflowExecution.isCompleted && completionNotified.compareAndSet(false, true)) { asyncRPCClient.sendToClient(ExecutionStateUpdate(workflowExecution.getState)) diff --git a/amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinatorSpec.scala b/amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinatorSpec.scala index fa48d704c97..371ca0a997d 100644 --- a/amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinatorSpec.scala +++ b/amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinatorSpec.scala @@ -131,42 +131,42 @@ class WorkflowExecutionCoordinatorSpec val (first, second, _, schedule) = threeLevelSchedule() val coordinator = newJumpCoordinator(schedule) - assert(coordinator.pullNextRegions == Set(first)) - assert(coordinator.pullNextRegions == Set(second)) + assert(coordinator.getNextRegions == Set(first)) + assert(coordinator.getNextRegions == Set(second)) coordinator.jumpToRegionContainingOperator(OperatorIdentity("first")) - assert(coordinator.pullNextRegions == Set(first)) + assert(coordinator.getNextRegions == Set(first)) } it should "support multiple sequential jumps interleaved with region pulls" in { val (first, second, third, schedule) = threeLevelSchedule() val coordinator = newJumpCoordinator(schedule) - assert(coordinator.pullNextRegions == Set(first)) - assert(coordinator.pullNextRegions == Set(second)) + assert(coordinator.getNextRegions == Set(first)) + assert(coordinator.getNextRegions == Set(second)) coordinator.jumpToRegionContainingOperator(OperatorIdentity("first")) - assert(coordinator.pullNextRegions == Set(first)) + assert(coordinator.getNextRegions == Set(first)) coordinator.jumpToRegionContainingOperator(OperatorIdentity("second")) - 
assert(coordinator.pullNextRegions == Set(second)) - assert(coordinator.pullNextRegions == Set(third)) + assert(coordinator.getNextRegions == Set(second)) + assert(coordinator.getNextRegions == Set(third)) coordinator.jumpToRegionContainingOperator(OperatorIdentity("first")) - assert(coordinator.pullNextRegions == Set(first)) + assert(coordinator.getNextRegions == Set(first)) } it should "be a no-op when the target operator is not in any scheduled region" in { val (first, second, _, schedule) = threeLevelSchedule() val coordinator = newJumpCoordinator(schedule) - assert(coordinator.pullNextRegions == Set(first)) + assert(coordinator.getNextRegions == Set(first)) coordinator.jumpToRegionContainingOperator(OperatorIdentity("does-not-exist")) // Iteration position must be unaffected by an unknown target. - assert(coordinator.pullNextRegions == Set(second)) + assert(coordinator.getNextRegions == Set(second)) } it should "leave the schedule untouched when called repeatedly with unknown operators" in { @@ -177,32 +177,32 @@ class WorkflowExecutionCoordinatorSpec coordinator.jumpToRegionContainingOperator(OperatorIdentity("ghost-2")) coordinator.jumpToRegionContainingOperator(OperatorIdentity("ghost-3")) - assert(coordinator.pullNextRegions == Set(first)) - assert(coordinator.pullNextRegions == Set(second)) - assert(coordinator.pullNextRegions == Set(third)) + assert(coordinator.getNextRegions == Set(first)) + assert(coordinator.getNextRegions == Set(second)) + assert(coordinator.getNextRegions == Set(third)) } it should "allow jumping back to the first region after the schedule is exhausted" in { val (first, second, third, schedule) = threeLevelSchedule() val coordinator = newJumpCoordinator(schedule) - assert(coordinator.pullNextRegions == Set(first)) - assert(coordinator.pullNextRegions == Set(second)) - assert(coordinator.pullNextRegions == Set(third)) - assert(coordinator.pullNextRegions == Set.empty) + assert(coordinator.getNextRegions == Set(first)) + 
assert(coordinator.getNextRegions == Set(second)) + assert(coordinator.getNextRegions == Set(third)) + assert(coordinator.getNextRegions == Set.empty) coordinator.jumpToRegionContainingOperator(OperatorIdentity("first")) - assert(coordinator.pullNextRegions == Set(first)) + assert(coordinator.getNextRegions == Set(first)) } it should "support jumping forward past regions that have not yet been pulled" in { val (first, _, third, schedule) = threeLevelSchedule() val coordinator = newJumpCoordinator(schedule) - assert(coordinator.pullNextRegions == Set(first)) + assert(coordinator.getNextRegions == Set(first)) coordinator.jumpToRegionContainingOperator(OperatorIdentity("third")) - assert(coordinator.pullNextRegions == Set(third)) - assert(coordinator.pullNextRegions == Set.empty) + assert(coordinator.getNextRegions == Set(third)) + assert(coordinator.getNextRegions == Set.empty) } } From 2fa51e0522e63d4dad4da697bf066caa0e20bed5 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Fri, 1 May 2026 17:15:44 -0700 Subject: [PATCH 090/152] refactor(amber): inline jump rewrite in JumpToOperatorRegionHandler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The handler now reads the current schedule from the coordinator, builds a copy seeded at the target operator's level, and calls `replaceSchedule(...)`. The coordinator-side `jumpToRegionContainingOperator` helper is gone — the coordinator only exposes `getSchedule` / `replaceSchedule` and is no longer the place that knows how to translate "operator id" into "schedule cursor". Test helper in `WorkflowExecutionCoordinatorSpec` mirrors the same read-copy-replace flow, keeping the existing scenarios at the coordinator level. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../JumpToOperatorRegionHandler.scala | 6 +++- .../WorkflowExecutionCoordinator.scala | 9 ++---- .../WorkflowExecutionCoordinatorSpec.scala | 31 ++++++++++++------- 3 files changed, 27 insertions(+), 19 deletions(-) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/JumpToOperatorRegionHandler.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/JumpToOperatorRegionHandler.scala index dbe71f58586..f71feffd6b3 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/JumpToOperatorRegionHandler.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/JumpToOperatorRegionHandler.scala @@ -35,7 +35,11 @@ trait JumpToOperatorRegionHandler { msg: JumpToOperatorRegionRequest, ctx: AsyncRPCContext ): Future[EmptyReturn] = { - cp.workflowExecutionCoordinator.jumpToRegionContainingOperator(msg.targetOperatorId) + val coordinator = cp.workflowExecutionCoordinator + val schedule = coordinator.getSchedule + schedule.getLevelIndexOfOperator(msg.targetOperatorId).foreach { levelIndex => + coordinator.replaceSchedule(schedule.copy(initialLevelIndex = levelIndex)) + } EmptyReturn() } } diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala index 3a8c159ac29..82c17dbc5ba 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala @@ -21,7 +21,6 @@ package org.apache.texera.amber.engine.architecture.scheduling import com.twitter.util.Future import com.typesafe.scalalogging.LazyLogging 
-import org.apache.texera.amber.core.virtualidentity.OperatorIdentity import org.apache.texera.amber.core.workflow.{GlobalPortIdentity, PhysicalLink} import org.apache.texera.amber.engine.architecture.common.{ AkkaActorRefMappingService, @@ -57,6 +56,8 @@ class WorkflowExecutionCoordinator( this.actorRefService = actorRefService } + def getSchedule: Schedule = schedule + def replaceSchedule(newSchedule: Schedule): Unit = { schedule = newSchedule } @@ -148,10 +149,4 @@ class WorkflowExecutionCoordinator( regionExecutionCoordinators.values.exists(!_.isCompleted) } - def jumpToRegionContainingOperator(opId: OperatorIdentity): Unit = { - schedule.getLevelIndexOfOperator(opId).foreach { levelIndex => - schedule = schedule.copy(initialLevelIndex = levelIndex) - } - } - } diff --git a/amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinatorSpec.scala b/amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinatorSpec.scala index 371ca0a997d..b25d8127f6f 100644 --- a/amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinatorSpec.scala +++ b/amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinatorSpec.scala @@ -76,6 +76,15 @@ class WorkflowExecutionCoordinatorSpec private def newJumpCoordinator(schedule: Schedule): WorkflowExecutionCoordinator = new WorkflowExecutionCoordinator(schedule, WorkflowExecution(), null, null) + // Mirrors what JumpToOperatorRegionHandler does: read the current schedule, find the level + // containing the target operator, and replace the schedule with a copy seeded at that level. 
+ private def jumpTo(coordinator: WorkflowExecutionCoordinator, opName: String): Unit = { + val schedule = coordinator.getSchedule + schedule.getLevelIndexOfOperator(OperatorIdentity(opName)).foreach { levelIndex => + coordinator.replaceSchedule(schedule.copy(initialLevelIndex = levelIndex)) + } + } + "WorkflowExecutionCoordinator" should "start the next region only after previous region termination succeeds" in { val firstOp = createSourceOp("first-op") @@ -126,7 +135,7 @@ class WorkflowExecutionCoordinatorSpec assert(rpcProbe.startedWorkers.contains(secondWorkerId)) } - "WorkflowExecutionCoordinator.jumpToRegionContainingOperator" should + "Jumping to an operator's region" should "make the next scheduled region contain the target operator's region" in { val (first, second, _, schedule) = threeLevelSchedule() val coordinator = newJumpCoordinator(schedule) @@ -134,7 +143,7 @@ class WorkflowExecutionCoordinatorSpec assert(coordinator.getNextRegions == Set(first)) assert(coordinator.getNextRegions == Set(second)) - coordinator.jumpToRegionContainingOperator(OperatorIdentity("first")) + jumpTo(coordinator, "first") assert(coordinator.getNextRegions == Set(first)) } @@ -146,14 +155,14 @@ class WorkflowExecutionCoordinatorSpec assert(coordinator.getNextRegions == Set(first)) assert(coordinator.getNextRegions == Set(second)) - coordinator.jumpToRegionContainingOperator(OperatorIdentity("first")) + jumpTo(coordinator, "first") assert(coordinator.getNextRegions == Set(first)) - coordinator.jumpToRegionContainingOperator(OperatorIdentity("second")) + jumpTo(coordinator, "second") assert(coordinator.getNextRegions == Set(second)) assert(coordinator.getNextRegions == Set(third)) - coordinator.jumpToRegionContainingOperator(OperatorIdentity("first")) + jumpTo(coordinator, "first") assert(coordinator.getNextRegions == Set(first)) } @@ -163,7 +172,7 @@ class WorkflowExecutionCoordinatorSpec assert(coordinator.getNextRegions == Set(first)) - 
coordinator.jumpToRegionContainingOperator(OperatorIdentity("does-not-exist")) + jumpTo(coordinator, "does-not-exist") // Iteration position must be unaffected by an unknown target. assert(coordinator.getNextRegions == Set(second)) @@ -173,9 +182,9 @@ class WorkflowExecutionCoordinatorSpec val (first, second, third, schedule) = threeLevelSchedule() val coordinator = newJumpCoordinator(schedule) - coordinator.jumpToRegionContainingOperator(OperatorIdentity("ghost-1")) - coordinator.jumpToRegionContainingOperator(OperatorIdentity("ghost-2")) - coordinator.jumpToRegionContainingOperator(OperatorIdentity("ghost-3")) + jumpTo(coordinator, "ghost-1") + jumpTo(coordinator, "ghost-2") + jumpTo(coordinator, "ghost-3") assert(coordinator.getNextRegions == Set(first)) assert(coordinator.getNextRegions == Set(second)) @@ -191,7 +200,7 @@ class WorkflowExecutionCoordinatorSpec assert(coordinator.getNextRegions == Set(third)) assert(coordinator.getNextRegions == Set.empty) - coordinator.jumpToRegionContainingOperator(OperatorIdentity("first")) + jumpTo(coordinator, "first") assert(coordinator.getNextRegions == Set(first)) } @@ -201,7 +210,7 @@ class WorkflowExecutionCoordinatorSpec assert(coordinator.getNextRegions == Set(first)) - coordinator.jumpToRegionContainingOperator(OperatorIdentity("third")) + jumpTo(coordinator, "third") assert(coordinator.getNextRegions == Set(third)) assert(coordinator.getNextRegions == Set.empty) } From fc05cbef9b7e09427627f381f81797d8f5aff0f6 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Fri, 1 May 2026 17:17:01 -0700 Subject: [PATCH 091/152] refactor(amber): inline coordinator lookup in JumpToOperatorRegionHandler Drop the `val coordinator = cp.workflowExecutionCoordinator` local in favor of inline `cp.workflowExecutionCoordinator.*` references. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../promisehandlers/JumpToOperatorRegionHandler.scala | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/JumpToOperatorRegionHandler.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/JumpToOperatorRegionHandler.scala index f71feffd6b3..ef951e42ccc 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/JumpToOperatorRegionHandler.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/JumpToOperatorRegionHandler.scala @@ -35,10 +35,11 @@ trait JumpToOperatorRegionHandler { msg: JumpToOperatorRegionRequest, ctx: AsyncRPCContext ): Future[EmptyReturn] = { - val coordinator = cp.workflowExecutionCoordinator - val schedule = coordinator.getSchedule + val schedule = cp.workflowExecutionCoordinator.getSchedule schedule.getLevelIndexOfOperator(msg.targetOperatorId).foreach { levelIndex => - coordinator.replaceSchedule(schedule.copy(initialLevelIndex = levelIndex)) + cp.workflowExecutionCoordinator.replaceSchedule( + schedule.copy(initialLevelIndex = levelIndex) + ) } EmptyReturn() } From 8c46d1ad0e4282614f957522d0d30b0511a7e207 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Fri, 1 May 2026 17:18:09 -0700 Subject: [PATCH 092/152] refactor(amber): direct levelSets lookup in Schedule.next MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The contiguous-from-0 invariant plus the `hasNext` guard means `levelSets(currentLevel)` is always defined when `next()` is called per the Iterator contract. Drop the `getOrElse(..., Set.empty)` fallback that silently masked misuse — out-of-range calls now raise `NoSuchElementException` like a standard Iterator. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../texera/amber/engine/architecture/scheduling/Schedule.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala index 909d96ef60f..02d31231385 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala @@ -45,7 +45,7 @@ case class Schedule( override def hasNext: Boolean = currentLevel < levelSets.size override def next(): Set[Region] = { - val regions = levelSets.getOrElse(currentLevel, Set.empty) + val regions = levelSets(currentLevel) currentLevel += 1 regions } From e47de64add6a0bf3c8518701257b6150d8e72240 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Fri, 1 May 2026 17:19:10 -0700 Subject: [PATCH 093/152] refactor(amber): use Map.isDefinedAt in Schedule.hasNext MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Slightly more semantically direct than a positional `< size` check — asks "is the cursor pointing at a defined level" rather than relying on the contiguous-from-0 invariant to translate that into a size comparison. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../texera/amber/engine/architecture/scheduling/Schedule.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala index 02d31231385..6bdd8e665b4 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala @@ -42,7 +42,7 @@ case class Schedule( def getLevelIndexOfOperator(opId: OperatorIdentity): Option[Int] = operatorLevelIndices.get(opId) - override def hasNext: Boolean = currentLevel < levelSets.size + override def hasNext: Boolean = levelSets.isDefinedAt(currentLevel) override def next(): Set[Region] = { val regions = levelSets(currentLevel) From daab8bef9567b64fb4d1b81c2947e07cf3ddcc77 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Fri, 1 May 2026 17:22:27 -0700 Subject: [PATCH 094/152] refactor(amber): inline coordinator getNextRegions Drop the `private[scheduling] def getNextRegions` indirection; its only internal call is now inlined in `coordinateRegionExecutors`. The spec uses a `nextRegions(coordinator)` private helper that mirrors the same hasNext/next() guard. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../WorkflowExecutionCoordinator.scala | 6 +-- .../WorkflowExecutionCoordinatorSpec.scala | 49 ++++++++++--------- 2 files changed, 28 insertions(+), 27 deletions(-) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala index 82c17dbc5ba..5f1ae75fbc5 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala @@ -62,10 +62,6 @@ class WorkflowExecutionCoordinator( schedule = newSchedule } - private[scheduling] def getNextRegions: Set[Region] = { - if (!schedule.hasNext) Set() else schedule.next() - } - /** * Each invocation first syncs the internal statuses of each exisiting `RegionExecutionCoordintor`, after which each * of the `RegionExecutionCoordintor`s will launch the corresponding next phase of whenever needed until it is @@ -95,7 +91,7 @@ class WorkflowExecutionCoordinator( } // All existing regions are completed. Start the next region (if any). 
- val nextRegions = getNextRegions + val nextRegions = if (!schedule.hasNext) Set.empty[Region] else schedule.next() if (nextRegions.isEmpty) { if (workflowExecution.isCompleted && completionNotified.compareAndSet(false, true)) { asyncRPCClient.sendToClient(ExecutionStateUpdate(workflowExecution.getState)) diff --git a/amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinatorSpec.scala b/amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinatorSpec.scala index b25d8127f6f..815d9536e7c 100644 --- a/amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinatorSpec.scala +++ b/amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinatorSpec.scala @@ -76,6 +76,11 @@ class WorkflowExecutionCoordinatorSpec private def newJumpCoordinator(schedule: Schedule): WorkflowExecutionCoordinator = new WorkflowExecutionCoordinator(schedule, WorkflowExecution(), null, null) + private def nextRegions(coordinator: WorkflowExecutionCoordinator): Set[Region] = { + val schedule = coordinator.getSchedule + if (schedule.hasNext) schedule.next() else Set.empty + } + // Mirrors what JumpToOperatorRegionHandler does: read the current schedule, find the level // containing the target operator, and replace the schedule with a copy seeded at that level. 
private def jumpTo(coordinator: WorkflowExecutionCoordinator, opName: String): Unit = { @@ -140,42 +145,42 @@ class WorkflowExecutionCoordinatorSpec val (first, second, _, schedule) = threeLevelSchedule() val coordinator = newJumpCoordinator(schedule) - assert(coordinator.getNextRegions == Set(first)) - assert(coordinator.getNextRegions == Set(second)) + assert(nextRegions(coordinator) == Set(first)) + assert(nextRegions(coordinator) == Set(second)) jumpTo(coordinator, "first") - assert(coordinator.getNextRegions == Set(first)) + assert(nextRegions(coordinator) == Set(first)) } it should "support multiple sequential jumps interleaved with region pulls" in { val (first, second, third, schedule) = threeLevelSchedule() val coordinator = newJumpCoordinator(schedule) - assert(coordinator.getNextRegions == Set(first)) - assert(coordinator.getNextRegions == Set(second)) + assert(nextRegions(coordinator) == Set(first)) + assert(nextRegions(coordinator) == Set(second)) jumpTo(coordinator, "first") - assert(coordinator.getNextRegions == Set(first)) + assert(nextRegions(coordinator) == Set(first)) jumpTo(coordinator, "second") - assert(coordinator.getNextRegions == Set(second)) - assert(coordinator.getNextRegions == Set(third)) + assert(nextRegions(coordinator) == Set(second)) + assert(nextRegions(coordinator) == Set(third)) jumpTo(coordinator, "first") - assert(coordinator.getNextRegions == Set(first)) + assert(nextRegions(coordinator) == Set(first)) } it should "be a no-op when the target operator is not in any scheduled region" in { val (first, second, _, schedule) = threeLevelSchedule() val coordinator = newJumpCoordinator(schedule) - assert(coordinator.getNextRegions == Set(first)) + assert(nextRegions(coordinator) == Set(first)) jumpTo(coordinator, "does-not-exist") // Iteration position must be unaffected by an unknown target. 
- assert(coordinator.getNextRegions == Set(second)) + assert(nextRegions(coordinator) == Set(second)) } it should "leave the schedule untouched when called repeatedly with unknown operators" in { @@ -186,32 +191,32 @@ class WorkflowExecutionCoordinatorSpec jumpTo(coordinator, "ghost-2") jumpTo(coordinator, "ghost-3") - assert(coordinator.getNextRegions == Set(first)) - assert(coordinator.getNextRegions == Set(second)) - assert(coordinator.getNextRegions == Set(third)) + assert(nextRegions(coordinator) == Set(first)) + assert(nextRegions(coordinator) == Set(second)) + assert(nextRegions(coordinator) == Set(third)) } it should "allow jumping back to the first region after the schedule is exhausted" in { val (first, second, third, schedule) = threeLevelSchedule() val coordinator = newJumpCoordinator(schedule) - assert(coordinator.getNextRegions == Set(first)) - assert(coordinator.getNextRegions == Set(second)) - assert(coordinator.getNextRegions == Set(third)) - assert(coordinator.getNextRegions == Set.empty) + assert(nextRegions(coordinator) == Set(first)) + assert(nextRegions(coordinator) == Set(second)) + assert(nextRegions(coordinator) == Set(third)) + assert(nextRegions(coordinator) == Set.empty) jumpTo(coordinator, "first") - assert(coordinator.getNextRegions == Set(first)) + assert(nextRegions(coordinator) == Set(first)) } it should "support jumping forward past regions that have not yet been pulled" in { val (first, _, third, schedule) = threeLevelSchedule() val coordinator = newJumpCoordinator(schedule) - assert(coordinator.getNextRegions == Set(first)) + assert(nextRegions(coordinator) == Set(first)) jumpTo(coordinator, "third") - assert(coordinator.getNextRegions == Set(third)) - assert(coordinator.getNextRegions == Set.empty) + assert(nextRegions(coordinator) == Set(third)) + assert(nextRegions(coordinator) == Set.empty) } } From 9872d8eb77a10da10816226ce79430b504cf6c23 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Fri, 1 May 2026 17:28:28 -0700 
Subject: [PATCH 095/152] refactor(amber): lazy-init WorkflowExecutionCoordinator with the real schedule MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `ControllerProcessor.workflowExecutionCoordinator` is now a `lazy val` that constructs with `workflowScheduler.getSchedule` on first access, removing the empty-schedule placeholder + handoff dance. `Controller.initState` reorders so `updateSchedule(physicalPlan)` runs before `attachRuntimeServicesToCPState()` — the latter triggers the lazy coordinator via `setupActorRefService`, which must observe a populated schedule. WorkflowScheduler.updateSchedule does not depend on any of the runtime services, so the reorder is safe. `updateExecutionSchedule` on ControllerProcessor is gone. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../architecture/controller/Controller.scala | 5 ++-- .../controller/ControllerProcessor.scala | 25 ++++++++----------- 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/Controller.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/Controller.scala index a838b1ae3c0..2f0d9a9adbc 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/Controller.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/Controller.scala @@ -114,9 +114,10 @@ class Controller( ) override def initState(): Unit = { - attachRuntimeServicesToCPState() + // updateSchedule must run before attachRuntimeServicesToCPState: the latter triggers + // the lazy `workflowExecutionCoordinator`, which reads `workflowScheduler.getSchedule`. 
cp.workflowScheduler.updateSchedule(physicalPlan) - cp.updateExecutionSchedule(cp.workflowScheduler.getSchedule) + attachRuntimeServicesToCPState() val regions: List[(Long, List[String])] = cp.workflowScheduler.getSchedule.getRegions.map { region => diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerProcessor.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerProcessor.scala index ea437875992..4b3d290f6e8 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerProcessor.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerProcessor.scala @@ -29,10 +29,7 @@ import org.apache.texera.amber.engine.architecture.common.{ } import org.apache.texera.amber.engine.architecture.controller.execution.WorkflowExecution import org.apache.texera.amber.engine.architecture.logreplay.ReplayLogManager -import org.apache.texera.amber.engine.architecture.scheduling.{ - Schedule, - WorkflowExecutionCoordinator -} +import org.apache.texera.amber.engine.architecture.scheduling.WorkflowExecutionCoordinator import org.apache.texera.amber.engine.architecture.worker.WorkflowWorker.MainThreadDelegateMessage import org.apache.texera.amber.engine.common.ambermessage.WorkflowFIFOMessage @@ -46,16 +43,16 @@ class ControllerProcessor( val workflowExecution: WorkflowExecution = WorkflowExecution() val workflowScheduler: WorkflowScheduler = new WorkflowScheduler(workflowContext, actorId) - val workflowExecutionCoordinator: WorkflowExecutionCoordinator = new WorkflowExecutionCoordinator( - Schedule(Map.empty), - workflowExecution, - controllerConfig, - asyncRPCClient - ) - - def updateExecutionSchedule(schedule: Schedule): Unit = { - workflowExecutionCoordinator.replaceSchedule(schedule) - } + + // Lazy: first access must happen *after* `workflowScheduler.updateSchedule(...)` has produced + // a real schedule. 
Controller.initState enforces this order. + lazy val workflowExecutionCoordinator: WorkflowExecutionCoordinator = + new WorkflowExecutionCoordinator( + workflowScheduler.getSchedule, + workflowExecution, + controllerConfig, + asyncRPCClient + ) private val initializer = new ControllerAsyncRPCHandlerInitializer(this) From ec76300561c5bfc5f5da9e534edb69f37fea655c Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Fri, 1 May 2026 17:31:44 -0700 Subject: [PATCH 096/152] revert(amber): drop lazy-init coordinator, keep direct replaceSchedule call Restore the eager `val workflowExecutionCoordinator` with the empty `Schedule(Map.empty)` placeholder. `Controller.initState` keeps the original order (services first, then schedule) and now calls `cp.workflowExecutionCoordinator.replaceSchedule(...)` directly instead of going through a now-removed `updateExecutionSchedule` wrapper. The lazy-val approach was rejected because of init-order fragility. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../engine/architecture/controller/Controller.scala | 5 ++--- .../controller/ControllerProcessor.scala | 13 ++++++++----- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/Controller.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/Controller.scala index 2f0d9a9adbc..50c3527b321 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/Controller.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/Controller.scala @@ -114,10 +114,9 @@ class Controller( ) override def initState(): Unit = { - // updateSchedule must run before attachRuntimeServicesToCPState: the latter triggers - // the lazy `workflowExecutionCoordinator`, which reads `workflowScheduler.getSchedule`. 
- cp.workflowScheduler.updateSchedule(physicalPlan) attachRuntimeServicesToCPState() + cp.workflowScheduler.updateSchedule(physicalPlan) + cp.workflowExecutionCoordinator.replaceSchedule(cp.workflowScheduler.getSchedule) val regions: List[(Long, List[String])] = cp.workflowScheduler.getSchedule.getRegions.map { region => diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerProcessor.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerProcessor.scala index 4b3d290f6e8..dcb038fb5aa 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerProcessor.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerProcessor.scala @@ -29,7 +29,10 @@ import org.apache.texera.amber.engine.architecture.common.{ } import org.apache.texera.amber.engine.architecture.controller.execution.WorkflowExecution import org.apache.texera.amber.engine.architecture.logreplay.ReplayLogManager -import org.apache.texera.amber.engine.architecture.scheduling.WorkflowExecutionCoordinator +import org.apache.texera.amber.engine.architecture.scheduling.{ + Schedule, + WorkflowExecutionCoordinator +} import org.apache.texera.amber.engine.architecture.worker.WorkflowWorker.MainThreadDelegateMessage import org.apache.texera.amber.engine.common.ambermessage.WorkflowFIFOMessage @@ -44,11 +47,11 @@ class ControllerProcessor( val workflowScheduler: WorkflowScheduler = new WorkflowScheduler(workflowContext, actorId) - // Lazy: first access must happen *after* `workflowScheduler.updateSchedule(...)` has produced - // a real schedule. Controller.initState enforces this order. - lazy val workflowExecutionCoordinator: WorkflowExecutionCoordinator = + // Constructed eagerly with an empty placeholder; Controller.initState calls + // `replaceSchedule(...)` once `workflowScheduler.updateSchedule(...)` has produced the real one. 
+ val workflowExecutionCoordinator: WorkflowExecutionCoordinator = new WorkflowExecutionCoordinator( - workflowScheduler.getSchedule, + Schedule(Map.empty), workflowExecution, controllerConfig, asyncRPCClient From 8444b4c8a007fe9b2ed0f2390d618a51b4ecff73 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Fri, 1 May 2026 17:36:45 -0700 Subject: [PATCH 097/152] chore(amber): drop placeholder-comment on ControllerProcessor coordinator Co-Authored-By: Claude Opus 4.7 (1M context) --- .../engine/architecture/controller/ControllerProcessor.scala | 2 -- 1 file changed, 2 deletions(-) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerProcessor.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerProcessor.scala index dcb038fb5aa..7620df7baf7 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerProcessor.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerProcessor.scala @@ -47,8 +47,6 @@ class ControllerProcessor( val workflowScheduler: WorkflowScheduler = new WorkflowScheduler(workflowContext, actorId) - // Constructed eagerly with an empty placeholder; Controller.initState calls - // `replaceSchedule(...)` once `workflowScheduler.updateSchedule(...)` has produced the real one. 
val workflowExecutionCoordinator: WorkflowExecutionCoordinator = new WorkflowExecutionCoordinator( Schedule(Map.empty), From 7d4a00edf0f9b0dea783c1f9ef5d18a6322dfb83 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Fri, 1 May 2026 18:02:37 -0700 Subject: [PATCH 098/152] style(amber): collapse ControllerProcessor coordinator declaration onto one line --- .../engine/architecture/controller/ControllerProcessor.scala | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerProcessor.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerProcessor.scala index 7620df7baf7..3d839674708 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerProcessor.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerProcessor.scala @@ -46,9 +46,7 @@ class ControllerProcessor( val workflowExecution: WorkflowExecution = WorkflowExecution() val workflowScheduler: WorkflowScheduler = new WorkflowScheduler(workflowContext, actorId) - - val workflowExecutionCoordinator: WorkflowExecutionCoordinator = - new WorkflowExecutionCoordinator( + val workflowExecutionCoordinator: WorkflowExecutionCoordinator = new WorkflowExecutionCoordinator( Schedule(Map.empty), workflowExecution, controllerConfig, From a32abcf56f081fb7ab5f5d6ed8e097be14cbcef1 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Fri, 1 May 2026 18:03:09 -0700 Subject: [PATCH 099/152] style(amber): single-line scheduling import and re-indent coordinator constructor args in ControllerProcessor --- .../controller/ControllerProcessor.scala | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerProcessor.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerProcessor.scala index 3d839674708..9a5ba2e0276 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerProcessor.scala +++
b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerProcessor.scala @@ -29,10 +29,7 @@ import org.apache.texera.amber.engine.architecture.common.{ } import org.apache.texera.amber.engine.architecture.controller.execution.WorkflowExecution import org.apache.texera.amber.engine.architecture.logreplay.ReplayLogManager -import org.apache.texera.amber.engine.architecture.scheduling.{ - Schedule, - WorkflowExecutionCoordinator -} +import org.apache.texera.amber.engine.architecture.scheduling.{Schedule, WorkflowExecutionCoordinator} import org.apache.texera.amber.engine.architecture.worker.WorkflowWorker.MainThreadDelegateMessage import org.apache.texera.amber.engine.common.ambermessage.WorkflowFIFOMessage @@ -47,11 +44,11 @@ class ControllerProcessor( val workflowScheduler: WorkflowScheduler = new WorkflowScheduler(workflowContext, actorId) val workflowExecutionCoordinator: WorkflowExecutionCoordinator = new WorkflowExecutionCoordinator( - Schedule(Map.empty), - workflowExecution, - controllerConfig, - asyncRPCClient - ) + Schedule(Map.empty), + workflowExecution, + controllerConfig, + asyncRPCClient + ) private val initializer = new ControllerAsyncRPCHandlerInitializer(this) From 7d0bd9622d75240a4945f325f04653849258373f Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Fri, 1 May 2026 18:26:02 -0700 Subject: [PATCH 100/152] feat(amber): record jumps as replay tails appended to the schedule MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Jumping back to an earlier operator now extends the execution sequence instead of resetting the cursor. Concretely, with `levelSets` covering levels `0..N-1`, a jump to `targetLevel` builds a new schedule whose `executionLevels` is the consumed prefix followed by `targetLevel..N-1`, with the cursor parked at the start of that appended tail. So `ABCDEF` with a jump from E back to C becomes `ABCDECDEF`; another jump from E back to C becomes `ABCDECDECDEF`. 
Schedule changes: - Re-introduce `executionLevels: Vector[Int]` constructor param plus `effectiveExecutionLevels` derived val (default `0..N-1`). `next()` walks `effectiveExecutionLevels`, not `levelSets` directly. - Rename the cursor `currentLevel` → `currentLevelIndex` since it now indexes into the (potentially extended) `effectiveExecutionLevels`. - Add `position` and `levelCount` accessors so the handler can build the extended sequence. JumpToOperatorRegionHandler builds the extended sequence and replaces the schedule via `coordinator.replaceSchedule(...)`. Adds a `WorkflowExecutionCoordinatorSpec` case that exercises the ABCDEF -> ABCDECDEF -> ABCDECDECDEF progression directly. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../JumpToOperatorRegionHandler.scala | 10 +++- .../architecture/scheduling/Schedule.scala | 21 ++++++-- .../WorkflowExecutionCoordinatorSpec.scala | 48 +++++++++++++++++-- 3 files changed, 68 insertions(+), 11 deletions(-) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/JumpToOperatorRegionHandler.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/JumpToOperatorRegionHandler.scala index ef951e42ccc..56837f11168 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/JumpToOperatorRegionHandler.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/JumpToOperatorRegionHandler.scala @@ -36,9 +36,15 @@ trait JumpToOperatorRegionHandler { ctx: AsyncRPCContext ): Future[EmptyReturn] = { val schedule = cp.workflowExecutionCoordinator.getSchedule - schedule.getLevelIndexOfOperator(msg.targetOperatorId).foreach { levelIndex => + schedule.getLevelIndexOfOperator(msg.targetOperatorId).foreach { targetLevel => + val extendedExecutionLevels = + schedule.effectiveExecutionLevels.take(schedule.position) ++ + Vector.range(targetLevel, 
schedule.levelCount) cp.workflowExecutionCoordinator.replaceSchedule( - schedule.copy(initialLevelIndex = levelIndex) + schedule.copy( + executionLevels = extendedExecutionLevels, + initialLevelIndex = schedule.position + ) ) } EmptyReturn() diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala index 6bdd8e665b4..2704dc88618 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala @@ -23,6 +23,7 @@ import org.apache.texera.amber.core.virtualidentity.OperatorIdentity case class Schedule( private val levelSets: Map[Int, Set[Region]], + executionLevels: Vector[Int] = Vector.empty, initialLevelIndex: Int = 0 ) extends Iterator[Set[Region]] { require( @@ -30,23 +31,33 @@ case class Schedule( s"Schedule level keys must be contiguous starting at 0, got: ${levelSets.keys.toSeq.sorted}" ) + // The actual sequence of levels iterated. Defaults to a single forward pass `0..N-1`; + // jump-driven extensions append a replay tail to this vector. 
+ val effectiveExecutionLevels: Vector[Int] = + if (executionLevels.nonEmpty) executionLevels + else (0 until levelSets.size).toVector + private val operatorLevelIndices: Map[OperatorIdentity, Int] = levelSets.iterator.flatMap { case (level, regions) => regions.iterator.flatMap(region => region.getOperators.map(_.id.logicalOpId -> level)) }.toMap - private var currentLevel: Int = initialLevelIndex + private var currentLevelIndex: Int = initialLevelIndex + + def levelCount: Int = levelSets.size + + def position: Int = currentLevelIndex def getRegions: List[Region] = levelSets.values.flatten.toList def getLevelIndexOfOperator(opId: OperatorIdentity): Option[Int] = operatorLevelIndices.get(opId) - override def hasNext: Boolean = levelSets.isDefinedAt(currentLevel) + override def hasNext: Boolean = effectiveExecutionLevels.isDefinedAt(currentLevelIndex) override def next(): Set[Region] = { - val regions = levelSets(currentLevel) - currentLevel += 1 - regions + val level = effectiveExecutionLevels(currentLevelIndex) + currentLevelIndex += 1 + levelSets(level) } } diff --git a/amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinatorSpec.scala b/amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinatorSpec.scala index 815d9536e7c..5b3c9e06406 100644 --- a/amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinatorSpec.scala +++ b/amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinatorSpec.scala @@ -81,12 +81,21 @@ class WorkflowExecutionCoordinatorSpec if (schedule.hasNext) schedule.next() else Set.empty } - // Mirrors what JumpToOperatorRegionHandler does: read the current schedule, find the level - // containing the target operator, and replace the schedule with a copy seeded at that level. 
+ // Mirrors what JumpToOperatorRegionHandler does: read the current schedule, look up the level + // containing the target operator, and replace the schedule with a copy whose execution levels + // are extended with a `targetLevel..N-1` replay tail. private def jumpTo(coordinator: WorkflowExecutionCoordinator, opName: String): Unit = { val schedule = coordinator.getSchedule - schedule.getLevelIndexOfOperator(OperatorIdentity(opName)).foreach { levelIndex => - coordinator.replaceSchedule(schedule.copy(initialLevelIndex = levelIndex)) + schedule.getLevelIndexOfOperator(OperatorIdentity(opName)).foreach { targetLevel => + val extendedExecutionLevels = + schedule.effectiveExecutionLevels.take(schedule.position) ++ + Vector.range(targetLevel, schedule.levelCount) + coordinator.replaceSchedule( + schedule.copy( + executionLevels = extendedExecutionLevels, + initialLevelIndex = schedule.position + ) + ) } } @@ -219,4 +228,35 @@ class WorkflowExecutionCoordinatorSpec assert(nextRegions(coordinator) == Set(third)) assert(nextRegions(coordinator) == Set.empty) } + + it should "extend the execution with a replay tail on each backward jump" in { + // Schedule ABCDEF: jumping from E back to C yields ABCDECDEF; jumping again from E back to C + // yields ABCDECDECDEF. 
+ val a = jumpRegion(1, "a") + val b = jumpRegion(2, "b") + val c = jumpRegion(3, "c") + val d = jumpRegion(4, "d") + val e = jumpRegion(5, "e") + val f = jumpRegion(6, "f") + val schedule = Schedule( + Map(0 -> Set(a), 1 -> Set(b), 2 -> Set(c), 3 -> Set(d), 4 -> Set(e), 5 -> Set(f)) + ) + val coordinator = newJumpCoordinator(schedule) + + Seq(a, b, c, d, e).foreach { region => + assert(nextRegions(coordinator) == Set(region)) + } + + jumpTo(coordinator, "c") + Seq(c, d, e).foreach { region => + assert(nextRegions(coordinator) == Set(region)) + } + + jumpTo(coordinator, "c") + Seq(c, d, e, f).foreach { region => + assert(nextRegions(coordinator) == Set(region)) + } + + assert(nextRegions(coordinator) == Set.empty) + } } From 664fdd945c9f7be2678a05e6fc1bc5725e053fe1 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Fri, 1 May 2026 18:37:48 -0700 Subject: [PATCH 101/152] refactor(amber): drop executionLevels, use cursor reset for jumps MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The replay-tail bookkeeping in `executionLevels` produced the same visible yield sequence as a plain cursor reset, since after each jump the new schedule's first effective level was the target. Drop `executionLevels` and `effectiveExecutionLevels` from `Schedule`, collapse the cursor back to a level index, and reduce `JumpToOperatorRegionHandler` to a one-liner `schedule.copy(initialLevelIndex = targetLevel)`. The 6-level ABCDEF replay test (`replay the target-onward range each time it jumps back`) keeps the same assertions — the visible yield sequence ABCDECDECDEF is produced equally by cursor reset. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../JumpToOperatorRegionHandler.scala | 8 +------ .../architecture/scheduling/Schedule.scala | 21 +++++-------------- .../WorkflowExecutionCoordinatorSpec.scala | 20 ++++++------------ 3 files changed, 12 insertions(+), 37 deletions(-) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/JumpToOperatorRegionHandler.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/JumpToOperatorRegionHandler.scala index 56837f11168..15663ef9035 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/JumpToOperatorRegionHandler.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/JumpToOperatorRegionHandler.scala @@ -37,14 +37,8 @@ trait JumpToOperatorRegionHandler { ): Future[EmptyReturn] = { val schedule = cp.workflowExecutionCoordinator.getSchedule schedule.getLevelIndexOfOperator(msg.targetOperatorId).foreach { targetLevel => - val extendedExecutionLevels = - schedule.effectiveExecutionLevels.take(schedule.position) ++ - Vector.range(targetLevel, schedule.levelCount) cp.workflowExecutionCoordinator.replaceSchedule( - schedule.copy( - executionLevels = extendedExecutionLevels, - initialLevelIndex = schedule.position - ) + schedule.copy(initialLevelIndex = targetLevel) ) } EmptyReturn() diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala index 2704dc88618..6bdd8e665b4 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala @@ -23,7 +23,6 @@ import org.apache.texera.amber.core.virtualidentity.OperatorIdentity case class Schedule( private val 
levelSets: Map[Int, Set[Region]], - executionLevels: Vector[Int] = Vector.empty, initialLevelIndex: Int = 0 ) extends Iterator[Set[Region]] { require( @@ -31,33 +30,23 @@ case class Schedule( s"Schedule level keys must be contiguous starting at 0, got: ${levelSets.keys.toSeq.sorted}" ) - // The actual sequence of levels iterated. Defaults to a single forward pass `0..N-1`; - // jump-driven extensions append a replay tail to this vector. - val effectiveExecutionLevels: Vector[Int] = - if (executionLevels.nonEmpty) executionLevels - else (0 until levelSets.size).toVector - private val operatorLevelIndices: Map[OperatorIdentity, Int] = levelSets.iterator.flatMap { case (level, regions) => regions.iterator.flatMap(region => region.getOperators.map(_.id.logicalOpId -> level)) }.toMap - private var currentLevelIndex: Int = initialLevelIndex - - def levelCount: Int = levelSets.size - - def position: Int = currentLevelIndex + private var currentLevel: Int = initialLevelIndex def getRegions: List[Region] = levelSets.values.flatten.toList def getLevelIndexOfOperator(opId: OperatorIdentity): Option[Int] = operatorLevelIndices.get(opId) - override def hasNext: Boolean = effectiveExecutionLevels.isDefinedAt(currentLevelIndex) + override def hasNext: Boolean = levelSets.isDefinedAt(currentLevel) override def next(): Set[Region] = { - val level = effectiveExecutionLevels(currentLevelIndex) - currentLevelIndex += 1 - levelSets(level) + val regions = levelSets(currentLevel) + currentLevel += 1 + regions } } diff --git a/amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinatorSpec.scala b/amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinatorSpec.scala index 5b3c9e06406..8ba381af3f7 100644 --- a/amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinatorSpec.scala +++ 
b/amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinatorSpec.scala @@ -82,20 +82,12 @@ class WorkflowExecutionCoordinatorSpec } // Mirrors what JumpToOperatorRegionHandler does: read the current schedule, look up the level - // containing the target operator, and replace the schedule with a copy whose execution levels - // are extended with a `targetLevel..N-1` replay tail. + // containing the target operator, and replace the schedule with a copy whose cursor is at + // that level. private def jumpTo(coordinator: WorkflowExecutionCoordinator, opName: String): Unit = { val schedule = coordinator.getSchedule schedule.getLevelIndexOfOperator(OperatorIdentity(opName)).foreach { targetLevel => - val extendedExecutionLevels = - schedule.effectiveExecutionLevels.take(schedule.position) ++ - Vector.range(targetLevel, schedule.levelCount) - coordinator.replaceSchedule( - schedule.copy( - executionLevels = extendedExecutionLevels, - initialLevelIndex = schedule.position - ) - ) + coordinator.replaceSchedule(schedule.copy(initialLevelIndex = targetLevel)) } } @@ -229,9 +221,9 @@ class WorkflowExecutionCoordinatorSpec assert(nextRegions(coordinator) == Set.empty) } - it should "extend the execution with a replay tail on each backward jump" in { - // Schedule ABCDEF: jumping from E back to C yields ABCDECDEF; jumping again from E back to C - // yields ABCDECDECDEF. + it should "replay the target-onward range each time it jumps back" in { + // Schedule ABCDEF: jumping from E back to C yields the visible sequence ABCDECDEF; jumping + // again from E back to C yields ABCDECDECDEF. 
val a = jumpRegion(1, "a") val b = jumpRegion(2, "b") val c = jumpRegion(3, "c") From dd9e738ce158d28cd09ebc39b4d12884e7a77289 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Fri, 1 May 2026 19:12:35 -0700 Subject: [PATCH 102/152] refactor(amber): move target-operator search into JumpToOperatorRegionHandler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The precomputed `operatorLevelIndices` in `Schedule` was rebuilt every time a Schedule was constructed (including the `copy(...)` on each jump), so the up-front O(N*R*Ops) build cost matched a single linear scan — the O(1) lookup didn't pay off when each lookup happens at most once before the schedule is replaced. Drop the precomputed map and `getLevelIndexOfOperator`; the handler now scans `levelSets` inline with `collectFirst`. `levelSets` becomes a public val so the handler can reach it. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../JumpToOperatorRegionHandler.scala | 6 +++++- .../engine/architecture/scheduling/Schedule.scala | 12 +----------- .../WorkflowExecutionCoordinatorSpec.scala | 12 ++++++++---- 3 files changed, 14 insertions(+), 16 deletions(-) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/JumpToOperatorRegionHandler.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/JumpToOperatorRegionHandler.scala index 15663ef9035..7eb841f8d0b 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/JumpToOperatorRegionHandler.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/JumpToOperatorRegionHandler.scala @@ -36,7 +36,11 @@ trait JumpToOperatorRegionHandler { ctx: AsyncRPCContext ): Future[EmptyReturn] = { val schedule = cp.workflowExecutionCoordinator.getSchedule - schedule.getLevelIndexOfOperator(msg.targetOperatorId).foreach { targetLevel => + 
schedule.levelSets.collectFirst { + case (level, regions) + if regions.exists(_.getOperators.exists(_.id.logicalOpId == msg.targetOperatorId)) => + level + }.foreach { targetLevel => cp.workflowExecutionCoordinator.replaceSchedule( schedule.copy(initialLevelIndex = targetLevel) ) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala index 6bdd8e665b4..9c03d07a62f 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/Schedule.scala @@ -19,10 +19,8 @@ package org.apache.texera.amber.engine.architecture.scheduling -import org.apache.texera.amber.core.virtualidentity.OperatorIdentity - case class Schedule( - private val levelSets: Map[Int, Set[Region]], + levelSets: Map[Int, Set[Region]], initialLevelIndex: Int = 0 ) extends Iterator[Set[Region]] { require( @@ -30,18 +28,10 @@ case class Schedule( s"Schedule level keys must be contiguous starting at 0, got: ${levelSets.keys.toSeq.sorted}" ) - private val operatorLevelIndices: Map[OperatorIdentity, Int] = - levelSets.iterator.flatMap { - case (level, regions) => - regions.iterator.flatMap(region => region.getOperators.map(_.id.logicalOpId -> level)) - }.toMap - private var currentLevel: Int = initialLevelIndex def getRegions: List[Region] = levelSets.values.flatten.toList - def getLevelIndexOfOperator(opId: OperatorIdentity): Option[Int] = operatorLevelIndices.get(opId) - override def hasNext: Boolean = levelSets.isDefinedAt(currentLevel) override def next(): Set[Region] = { diff --git a/amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinatorSpec.scala b/amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinatorSpec.scala index 8ba381af3f7..6ca9316abf1 
100644 --- a/amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinatorSpec.scala +++ b/amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinatorSpec.scala @@ -81,12 +81,16 @@ class WorkflowExecutionCoordinatorSpec if (schedule.hasNext) schedule.next() else Set.empty } - // Mirrors what JumpToOperatorRegionHandler does: read the current schedule, look up the level - // containing the target operator, and replace the schedule with a copy whose cursor is at - // that level. + // Mirrors what JumpToOperatorRegionHandler does: read the current schedule, scan for the + // level containing the target operator, and replace the schedule with a copy whose cursor is + // at that level. private def jumpTo(coordinator: WorkflowExecutionCoordinator, opName: String): Unit = { + val opId = OperatorIdentity(opName) val schedule = coordinator.getSchedule - schedule.getLevelIndexOfOperator(OperatorIdentity(opName)).foreach { targetLevel => + schedule.levelSets.collectFirst { + case (level, regions) if regions.exists(_.getOperators.exists(_.id.logicalOpId == opId)) => + level + }.foreach { targetLevel => coordinator.replaceSchedule(schedule.copy(initialLevelIndex = targetLevel)) } } From 0c681ed189be1ec1322c020464ca14714c05ffad Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Fri, 1 May 2026 19:34:22 -0700 Subject: [PATCH 103/152] refactor(amber): drop initialSchedule constructor param on coordinator The only caller (`ControllerProcessor`) always passed `Schedule(Map.empty)`; the real schedule arrives later via `replaceSchedule` in `Controller.initState`. Move the empty default inside the coordinator and drop the parameter. Tests that previously seeded the coordinator via the constructor now call `replaceSchedule` explicitly. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../architecture/controller/ControllerProcessor.scala | 3 +-- .../scheduling/WorkflowExecutionCoordinator.scala | 3 +-- .../scheduling/WorkflowExecutionCoordinatorSpec.scala | 11 ++++++++--- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerProcessor.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerProcessor.scala index 9a5ba2e0276..3ff8e7d978a 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerProcessor.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/ControllerProcessor.scala @@ -29,7 +29,7 @@ import org.apache.texera.amber.engine.architecture.common.{ } import org.apache.texera.amber.engine.architecture.controller.execution.WorkflowExecution import org.apache.texera.amber.engine.architecture.logreplay.ReplayLogManager -import org.apache.texera.amber.engine.architecture.scheduling.{Schedule, WorkflowExecutionCoordinator} +import org.apache.texera.amber.engine.architecture.scheduling.WorkflowExecutionCoordinator import org.apache.texera.amber.engine.architecture.worker.WorkflowWorker.MainThreadDelegateMessage import org.apache.texera.amber.engine.common.ambermessage.WorkflowFIFOMessage @@ -44,7 +44,6 @@ class ControllerProcessor( val workflowScheduler: WorkflowScheduler = new WorkflowScheduler(workflowContext, actorId) val workflowExecutionCoordinator: WorkflowExecutionCoordinator = new WorkflowExecutionCoordinator( - Schedule(Map.empty), workflowExecution, controllerConfig, asyncRPCClient diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala index 5f1ae75fbc5..e21305b1583 100644 --- 
a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala @@ -35,13 +35,12 @@ import java.util.concurrent.atomic.AtomicBoolean import scala.collection.mutable class WorkflowExecutionCoordinator( - initialSchedule: Schedule, workflowExecution: WorkflowExecution, controllerConfig: ControllerConfig, asyncRPCClient: AsyncRPCClient ) extends LazyLogging { - private var schedule: Schedule = initialSchedule + private var schedule: Schedule = Schedule(Map.empty) private val executedRegions: mutable.ListBuffer[Set[Region]] = mutable.ListBuffer() diff --git a/amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinatorSpec.scala b/amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinatorSpec.scala index 6ca9316abf1..64c6c9481a6 100644 --- a/amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinatorSpec.scala +++ b/amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinatorSpec.scala @@ -73,8 +73,11 @@ class WorkflowExecutionCoordinatorSpec (first, second, third, schedule) } - private def newJumpCoordinator(schedule: Schedule): WorkflowExecutionCoordinator = - new WorkflowExecutionCoordinator(schedule, WorkflowExecution(), null, null) + private def newJumpCoordinator(schedule: Schedule): WorkflowExecutionCoordinator = { + val coordinator = new WorkflowExecutionCoordinator(WorkflowExecution(), null, null) + coordinator.replaceSchedule(schedule) + coordinator + } private def nextRegions(coordinator: WorkflowExecutionCoordinator): Set[Region] = { val schedule = coordinator.getSchedule @@ -120,11 +123,13 @@ class WorkflowExecutionCoordinatorSpec registerLiveWorker(controller.actorRefService, secondWorkerId) val workflowCoordinator = 
new WorkflowExecutionCoordinator( - Schedule(Map(0 -> Set(firstRegion), 1 -> Set(secondRegion))), workflowExecution, ControllerConfig(None, None, None, None), rpcProbe.asyncRPCClient ) + workflowCoordinator.replaceSchedule( + Schedule(Map(0 -> Set(firstRegion), 1 -> Set(secondRegion))) + ) workflowCoordinator.setupActorRefService(controller.actorRefService) await(workflowCoordinator.coordinateRegionExecutors(controller.actorService)) From c646d5b82fd8e0f4e14721e04436bbd70e749490 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Fri, 1 May 2026 19:38:07 -0700 Subject: [PATCH 104/152] refactor(amber): expose coordinator schedule as a public var Replace `getSchedule` / `replaceSchedule` on `WorkflowExecutionCoordinator` with a public `var schedule`. Callers now read via `coordinator.schedule` and mutate via `coordinator.schedule = ...`. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../engine/architecture/controller/Controller.scala | 2 +- .../promisehandlers/JumpToOperatorRegionHandler.scala | 6 ++---- .../scheduling/WorkflowExecutionCoordinator.scala | 8 +------- .../scheduling/WorkflowExecutionCoordinatorSpec.scala | 11 +++++------ 4 files changed, 9 insertions(+), 18 deletions(-) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/Controller.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/Controller.scala index 50c3527b321..512a3342ce4 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/Controller.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/Controller.scala @@ -116,7 +116,7 @@ class Controller( override def initState(): Unit = { attachRuntimeServicesToCPState() cp.workflowScheduler.updateSchedule(physicalPlan) - cp.workflowExecutionCoordinator.replaceSchedule(cp.workflowScheduler.getSchedule) + cp.workflowExecutionCoordinator.schedule = cp.workflowScheduler.getSchedule val regions: List[(Long, List[String])] = 
cp.workflowScheduler.getSchedule.getRegions.map { region => diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/JumpToOperatorRegionHandler.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/JumpToOperatorRegionHandler.scala index 7eb841f8d0b..3e5760c331b 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/JumpToOperatorRegionHandler.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/JumpToOperatorRegionHandler.scala @@ -35,15 +35,13 @@ trait JumpToOperatorRegionHandler { msg: JumpToOperatorRegionRequest, ctx: AsyncRPCContext ): Future[EmptyReturn] = { - val schedule = cp.workflowExecutionCoordinator.getSchedule + val schedule = cp.workflowExecutionCoordinator.schedule schedule.levelSets.collectFirst { case (level, regions) if regions.exists(_.getOperators.exists(_.id.logicalOpId == msg.targetOperatorId)) => level }.foreach { targetLevel => - cp.workflowExecutionCoordinator.replaceSchedule( - schedule.copy(initialLevelIndex = targetLevel) - ) + cp.workflowExecutionCoordinator.schedule = schedule.copy(initialLevelIndex = targetLevel) } EmptyReturn() } diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala index e21305b1583..df504bf92d2 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala @@ -40,7 +40,7 @@ class WorkflowExecutionCoordinator( asyncRPCClient: AsyncRPCClient ) extends LazyLogging { - private var schedule: Schedule = Schedule(Map.empty) + var schedule: Schedule = Schedule(Map.empty) 
private val executedRegions: mutable.ListBuffer[Set[Region]] = mutable.ListBuffer() @@ -55,12 +55,6 @@ class WorkflowExecutionCoordinator( this.actorRefService = actorRefService } - def getSchedule: Schedule = schedule - - def replaceSchedule(newSchedule: Schedule): Unit = { - schedule = newSchedule - } - /** * Each invocation first syncs the internal statuses of each exisiting `RegionExecutionCoordintor`, after which each * of the `RegionExecutionCoordintor`s will launch the corresponding next phase of whenever needed until it is diff --git a/amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinatorSpec.scala b/amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinatorSpec.scala index 64c6c9481a6..7547f4aa7eb 100644 --- a/amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinatorSpec.scala +++ b/amber/src/test/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinatorSpec.scala @@ -75,12 +75,12 @@ class WorkflowExecutionCoordinatorSpec private def newJumpCoordinator(schedule: Schedule): WorkflowExecutionCoordinator = { val coordinator = new WorkflowExecutionCoordinator(WorkflowExecution(), null, null) - coordinator.replaceSchedule(schedule) + coordinator.schedule = schedule coordinator } private def nextRegions(coordinator: WorkflowExecutionCoordinator): Set[Region] = { - val schedule = coordinator.getSchedule + val schedule = coordinator.schedule if (schedule.hasNext) schedule.next() else Set.empty } @@ -89,12 +89,12 @@ class WorkflowExecutionCoordinatorSpec // at that level. 
private def jumpTo(coordinator: WorkflowExecutionCoordinator, opName: String): Unit = { val opId = OperatorIdentity(opName) - val schedule = coordinator.getSchedule + val schedule = coordinator.schedule schedule.levelSets.collectFirst { case (level, regions) if regions.exists(_.getOperators.exists(_.id.logicalOpId == opId)) => level }.foreach { targetLevel => - coordinator.replaceSchedule(schedule.copy(initialLevelIndex = targetLevel)) + coordinator.schedule = schedule.copy(initialLevelIndex = targetLevel) } } @@ -127,9 +127,8 @@ class WorkflowExecutionCoordinatorSpec ControllerConfig(None, None, None, None), rpcProbe.asyncRPCClient ) - workflowCoordinator.replaceSchedule( + workflowCoordinator.schedule = Schedule(Map(0 -> Set(firstRegion), 1 -> Set(secondRegion))) - ) workflowCoordinator.setupActorRefService(controller.actorRefService) await(workflowCoordinator.coordinateRegionExecutors(controller.actorService)) From ea3d5f1714f2a4ddac0cc0af76eeaa3cfb6c3983 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Fri, 1 May 2026 19:40:05 -0700 Subject: [PATCH 105/152] update --- .../promisehandlers/JumpToOperatorRegionHandler.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/JumpToOperatorRegionHandler.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/JumpToOperatorRegionHandler.scala index 3e5760c331b..bd742292846 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/JumpToOperatorRegionHandler.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/JumpToOperatorRegionHandler.scala @@ -35,13 +35,13 @@ trait JumpToOperatorRegionHandler { msg: JumpToOperatorRegionRequest, ctx: AsyncRPCContext ): Future[EmptyReturn] = { - val schedule = cp.workflowExecutionCoordinator.schedule - 
schedule.levelSets.collectFirst { + val coordinator = cp.workflowExecutionCoordinator + coordinator.schedule.levelSets.collectFirst { case (level, regions) if regions.exists(_.getOperators.exists(_.id.logicalOpId == msg.targetOperatorId)) => level }.foreach { targetLevel => - cp.workflowExecutionCoordinator.schedule = schedule.copy(initialLevelIndex = targetLevel) + coordinator.schedule = coordinator.schedule.copy(initialLevelIndex = targetLevel) } EmptyReturn() } From 9b9aba8ee1329e08f61ce361b19353de1ef51434 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Fri, 1 May 2026 19:46:58 -0700 Subject: [PATCH 106/152] update --- .../controller/promisehandlers/JumpToOperatorRegionHandler.scala | 1 - 1 file changed, 1 deletion(-) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/JumpToOperatorRegionHandler.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/JumpToOperatorRegionHandler.scala index bd742292846..5b15a082b29 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/JumpToOperatorRegionHandler.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/promisehandlers/JumpToOperatorRegionHandler.scala @@ -27,7 +27,6 @@ import org.apache.texera.amber.engine.architecture.rpc.controlcommands.{ } import org.apache.texera.amber.engine.architecture.rpc.controlreturns.EmptyReturn -/** Requests the scheduler to continue from the region containing the target operator. */ trait JumpToOperatorRegionHandler { this: ControllerAsyncRPCHandlerInitializer => From ff3a9d14b417fef81414b04373ad74cd3843519b Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Fri, 1 May 2026 19:54:06 -0700 Subject: [PATCH 107/152] fix(amber): IfOpExec uses State.values map after state refactor `State` is a `case class(values: Map[String, Any])` with no function-call apply, so `state(key)` no longer compiles. 
Read through `state.values(key)` instead. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../apache/texera/amber/operator/ifStatement/IfOpExec.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/ifStatement/IfOpExec.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/ifStatement/IfOpExec.scala index d2becc79a5b..4634ad1c18c 100644 --- a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/ifStatement/IfOpExec.scala +++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/ifStatement/IfOpExec.scala @@ -34,7 +34,8 @@ class IfOpExec(descString: String) extends OperatorExecutor { //It can accept any value that can be converted to a boolean. For example, Int 1 will be converted to true. override def processState(state: State, port: Int): Option[State] = { outputPort = - if (state(desc.conditionName).asInstanceOf[Boolean]) PortIdentity(1) else PortIdentity() + if (state.values(desc.conditionName).asInstanceOf[Boolean]) PortIdentity(1) + else PortIdentity() Some(state) } From b3266846c03a13c018b644d255af82ef654bdc43 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Sat, 2 May 2026 01:02:09 -0700 Subject: [PATCH 108/152] fix fmt --- amber/src/main/python/core/runnables/main_loop.py | 1 + 1 file changed, 1 insertion(+) diff --git a/amber/src/main/python/core/runnables/main_loop.py b/amber/src/main/python/core/runnables/main_loop.py index 20af171f755..e30d202334a 100644 --- a/amber/src/main/python/core/runnables/main_loop.py +++ b/amber/src/main/python/core/runnables/main_loop.py @@ -369,6 +369,7 @@ def _process_ecm(self, ecm_element: ECMElement): if ecm.ecm_type != EmbeddedControlMessageType.NO_ALIGNMENT: self.context.pause_manager.resume(PauseType.ECM_PAUSE) + if self.context.tuple_processing_manager.current_internal_marker: { StartChannel: self._process_start_channel, From 
37ce61d1a91fa915d3f5f4973c41cf87e5c13b87 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Sat, 2 May 2026 01:03:17 -0700 Subject: [PATCH 109/152] fix fmt --- .../engine/architecture/controller/WorkflowScheduler.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/WorkflowScheduler.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/WorkflowScheduler.scala index ff6df1f0a06..b1acb3c0650 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/WorkflowScheduler.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/controller/WorkflowScheduler.scala @@ -32,7 +32,7 @@ class WorkflowScheduler( actorId: ActorVirtualIdentity ) extends java.io.Serializable { var physicalPlan: PhysicalPlan = _ - var schedule: Schedule = _ + private var schedule: Schedule = _ def getSchedule: Schedule = schedule From d84314a7dd6c4cf83863c400e2d94815b8d27e00 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Sat, 2 May 2026 01:03:38 -0700 Subject: [PATCH 110/152] fix fmt --- .../dashboard/user/workflow/WorkflowExecutionsResource.scala | 2 -- 1 file changed, 2 deletions(-) diff --git a/amber/src/main/scala/org/apache/texera/web/resource/dashboard/user/workflow/WorkflowExecutionsResource.scala b/amber/src/main/scala/org/apache/texera/web/resource/dashboard/user/workflow/WorkflowExecutionsResource.scala index 92582afdd2b..72fb1c364e5 100644 --- a/amber/src/main/scala/org/apache/texera/web/resource/dashboard/user/workflow/WorkflowExecutionsResource.scala +++ b/amber/src/main/scala/org/apache/texera/web/resource/dashboard/user/workflow/WorkflowExecutionsResource.scala @@ -247,8 +247,6 @@ object WorkflowExecutionsResource { OPERATOR_PORT_EXECUTIONS.RESULT_URI ) .values(eid.id.toInt, globalPortId.serializeAsString, uri.toString) - .onConflict() - .doNothing() .execute() } From 6ad18e864da6186d6d611446dcccbc90e33eb05b Mon 
Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Sat, 2 May 2026 02:13:42 -0700 Subject: [PATCH 111/152] add tests --- .../packaging/test_output_manager.py | 153 ++++++++++++++++++ ...ut_port_materialization_reader_runnable.py | 101 ++++++++++++ 2 files changed, 254 insertions(+) create mode 100644 amber/src/main/python/core/architecture/packaging/test_output_manager.py create mode 100644 amber/src/main/python/core/storage/runnables/test_input_port_materialization_reader_runnable.py diff --git a/amber/src/main/python/core/architecture/packaging/test_output_manager.py b/amber/src/main/python/core/architecture/packaging/test_output_manager.py new file mode 100644 index 00000000000..49d077e8429 --- /dev/null +++ b/amber/src/main/python/core/architecture/packaging/test_output_manager.py @@ -0,0 +1,153 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +from unittest.mock import MagicMock, patch + +import pytest + +from core.architecture.packaging.output_manager import OutputManager +from core.models.state import State +from proto.org.apache.texera.amber.core import PortIdentity + + +class TestSaveStateToStorageIfNeeded: + @pytest.fixture + def output_manager(self): + return OutputManager(worker_id="Worker:WF0-test-main-0") + + @pytest.fixture + def port_a(self): + return PortIdentity(id=0, internal=False) + + @pytest.fixture + def port_b(self): + return PortIdentity(id=1, internal=False) + + @pytest.fixture + def state(self): + return State({"loop_counter": 1, "i": 2}) + + def _stub_document_factory(self, mock_factory): + document = MagicMock() + writer = MagicMock() + document.writer.return_value = writer + mock_factory.open_document.return_value = (document, MagicMock()) + return document, writer + + def test_no_storage_uris_is_a_noop(self, output_manager, state): + # save_state_to_storage_if_needed must not touch DocumentFactory when + # the worker has no provisioned output storage. 
+ with patch( + "core.architecture.packaging.output_manager.DocumentFactory" + ) as mock_factory: + output_manager.save_state_to_storage_if_needed(state) + mock_factory.open_document.assert_not_called() + mock_factory.create_document.assert_not_called() + + def test_unknown_port_id_is_a_noop(self, output_manager, state, port_a): + with patch( + "core.architecture.packaging.output_manager.DocumentFactory" + ) as mock_factory: + output_manager.save_state_to_storage_if_needed(state, port_id=port_a) + mock_factory.open_document.assert_not_called() + + def test_writes_to_every_port_when_port_id_omitted( + self, output_manager, state, port_a, port_b + ): + output_manager._storage_uris[port_a] = "vfs:///wf/0/exec/0/result/op-a" + output_manager._storage_uris[port_b] = "vfs:///wf/0/exec/0/result/op-b" + + with patch( + "core.architecture.packaging.output_manager.DocumentFactory" + ) as mock_factory: + _, writer = self._stub_document_factory(mock_factory) + + output_manager.save_state_to_storage_if_needed(state) + + assert mock_factory.open_document.call_count == 2 + opened_uris = { + call.args[0] for call in mock_factory.open_document.call_args_list + } + assert opened_uris == { + "vfs:///wf/0/exec/0/state/op-a", + "vfs:///wf/0/exec/0/state/op-b", + } + assert writer.put_one.call_count == 2 + assert writer.close.call_count == 2 + + def test_writes_only_to_selected_port_when_port_id_specified( + self, output_manager, state, port_a, port_b + ): + output_manager._storage_uris[port_a] = "vfs:///wf/0/exec/0/result/op-a" + output_manager._storage_uris[port_b] = "vfs:///wf/0/exec/0/result/op-b" + + with patch( + "core.architecture.packaging.output_manager.DocumentFactory" + ) as mock_factory: + self._stub_document_factory(mock_factory) + + output_manager.save_state_to_storage_if_needed(state, port_id=port_a) + + assert mock_factory.open_document.call_count == 1 + assert ( + mock_factory.open_document.call_args.args[0] + == "vfs:///wf/0/exec/0/state/op-a" + ) + + def 
test_creates_document_when_open_raises_value_error( + self, output_manager, state, port_a + ): + # The first time a state is saved, the state document does not yet + # exist; open_document raises ValueError and we must fall back to + # create_document so the state still gets written. + output_manager._storage_uris[port_a] = "vfs:///wf/0/exec/0/result/op-a" + + with patch( + "core.architecture.packaging.output_manager.DocumentFactory" + ) as mock_factory: + mock_factory.open_document.side_effect = ValueError("not found") + created_document = MagicMock() + writer = MagicMock() + created_document.writer.return_value = writer + mock_factory.create_document.return_value = created_document + + output_manager.save_state_to_storage_if_needed(state) + + mock_factory.create_document.assert_called_once_with( + "vfs:///wf/0/exec/0/state/op-a", State.SCHEMA + ) + writer.put_one.assert_called_once() + writer.close.assert_called_once() + + def test_uri_is_recorded_when_storage_writer_is_set_up( + self, output_manager, port_a + ): + # set_up_port_storage_writer should populate _storage_uris so that a + # subsequent save_state_to_storage_if_needed can find the URI. 
+ with patch( + "core.architecture.packaging.output_manager.DocumentFactory" + ) as mock_factory: + mock_factory.open_document.return_value = (MagicMock(), MagicMock()) + + output_manager.set_up_port_storage_writer( + port_a, "vfs:///wf/0/exec/0/result/op-a" + ) + + assert ( + output_manager._storage_uris[port_a] + == "vfs:///wf/0/exec/0/result/op-a" + ) diff --git a/amber/src/main/python/core/storage/runnables/test_input_port_materialization_reader_runnable.py b/amber/src/main/python/core/storage/runnables/test_input_port_materialization_reader_runnable.py new file mode 100644 index 00000000000..8e6addf174b --- /dev/null +++ b/amber/src/main/python/core/storage/runnables/test_input_port_materialization_reader_runnable.py @@ -0,0 +1,101 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +from unittest.mock import MagicMock + +import pytest + +from core.models import State, StateFrame, Tuple +from core.models.schema import Schema +from core.storage.runnables.input_port_materialization_reader_runnable import ( + InputPortMaterializationReaderRunnable, +) +from proto.org.apache.texera.amber.core import ActorVirtualIdentity + + +class TestEmitStateWithFilter: + """Cover the partitioner-filter logic for state payloads in + InputPortMaterializationReaderRunnable. These tests bypass __init__ + so we don't need a real partitioner or storage URI. + """ + + @pytest.fixture + def me(self): + return ActorVirtualIdentity(name="me") + + @pytest.fixture + def someone_else(self): + return ActorVirtualIdentity(name="other") + + @pytest.fixture + def runnable(self, me): + # __new__ skips __init__ so we can wire only the fields we need. + instance = InputPortMaterializationReaderRunnable.__new__( + InputPortMaterializationReaderRunnable + ) + instance.worker_actor_id = me + instance.partitioner = MagicMock() + instance.tuple_schema = Schema(raw_schema={"x": "INTEGER"}) + return instance + + def test_yields_state_frame_for_matching_receiver( + self, runnable, me + ): + state = State({"k": 1}) + runnable.partitioner.flush_state.return_value = [(me, state)] + + frames = list(runnable.emit_state_with_filter(state)) + + assert len(frames) == 1 + assert isinstance(frames[0], StateFrame) + assert frames[0].frame is state + + def test_filters_out_non_matching_receivers( + self, runnable, me, someone_else + ): + state = State({"k": 1}) + runnable.partitioner.flush_state.return_value = [ + (someone_else, state), + (me, state), + (someone_else, state), + ] + + frames = list(runnable.emit_state_with_filter(state)) + + assert len(frames) == 1 + assert isinstance(frames[0], StateFrame) + + def test_yields_data_frame_for_non_state_payload(self, runnable, me): + # When the partitioner produces a tuple-batch payload (BroadcastPartitioner + # case), the runnable must convert it to 
a DataFrame instead of wrapping + # it as a StateFrame. + state = State({"k": 1}) + tuples = [Tuple({"x": 7}, schema=runnable.tuple_schema)] + runnable.partitioner.flush_state.return_value = [(me, tuples)] + + frames = list(runnable.emit_state_with_filter(state)) + + assert len(frames) == 1 + # Should not be wrapped as a StateFrame. + assert not isinstance(frames[0], StateFrame) + assert frames[0].frame.num_rows == 1 + + def test_empty_partitioner_output_yields_nothing(self, runnable): + state = State({}) + runnable.partitioner.flush_state.return_value = [] + + assert list(runnable.emit_state_with_filter(state)) == [] From 03d81898ef85a6c1d809abe3c6894efd8b0497bf Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Sat, 2 May 2026 02:55:28 -0700 Subject: [PATCH 112/152] fix tests --- .../packaging/test_output_manager.py | 3 +-- ...ut_port_materialization_reader_runnable.py | 8 ++----- .../result/iceberg/IcebergDocumentSpec.scala | 23 +++++++++++++------ 3 files changed, 19 insertions(+), 15 deletions(-) diff --git a/amber/src/main/python/core/architecture/packaging/test_output_manager.py b/amber/src/main/python/core/architecture/packaging/test_output_manager.py index 49d077e8429..873b94a27a8 100644 --- a/amber/src/main/python/core/architecture/packaging/test_output_manager.py +++ b/amber/src/main/python/core/architecture/packaging/test_output_manager.py @@ -148,6 +148,5 @@ def test_uri_is_recorded_when_storage_writer_is_set_up( ) assert ( - output_manager._storage_uris[port_a] - == "vfs:///wf/0/exec/0/result/op-a" + output_manager._storage_uris[port_a] == "vfs:///wf/0/exec/0/result/op-a" ) diff --git a/amber/src/main/python/core/storage/runnables/test_input_port_materialization_reader_runnable.py b/amber/src/main/python/core/storage/runnables/test_input_port_materialization_reader_runnable.py index 8e6addf174b..9720bf4489a 100644 --- a/amber/src/main/python/core/storage/runnables/test_input_port_materialization_reader_runnable.py +++ 
b/amber/src/main/python/core/storage/runnables/test_input_port_materialization_reader_runnable.py @@ -52,9 +52,7 @@ def runnable(self, me): instance.tuple_schema = Schema(raw_schema={"x": "INTEGER"}) return instance - def test_yields_state_frame_for_matching_receiver( - self, runnable, me - ): + def test_yields_state_frame_for_matching_receiver(self, runnable, me): state = State({"k": 1}) runnable.partitioner.flush_state.return_value = [(me, state)] @@ -64,9 +62,7 @@ def test_yields_state_frame_for_matching_receiver( assert isinstance(frames[0], StateFrame) assert frames[0].frame is state - def test_filters_out_non_matching_receivers( - self, runnable, me, someone_else - ): + def test_filters_out_non_matching_receivers(self, runnable, me, someone_else): state = State({"k": 1}) runnable.partitioner.flush_state.return_value = [ (someone_else, state), diff --git a/common/workflow-core/src/test/scala/org/apache/texera/amber/storage/result/iceberg/IcebergDocumentSpec.scala b/common/workflow-core/src/test/scala/org/apache/texera/amber/storage/result/iceberg/IcebergDocumentSpec.scala index 46fa112d5b6..7f1d8573c2a 100644 --- a/common/workflow-core/src/test/scala/org/apache/texera/amber/storage/result/iceberg/IcebergDocumentSpec.scala +++ b/common/workflow-core/src/test/scala/org/apache/texera/amber/storage/result/iceberg/IcebergDocumentSpec.scala @@ -163,7 +163,7 @@ class IcebergDocumentSpec extends VirtualDocumentSpec[Tuple] with BeforeAndAfter val storedRows = stateDocument.get().toList assert(storedRows.length == 1) - val deserialized = State.fromTuple(storedRows.head) + val deserialized = State.fromTuple(storedRows.head).values assert(deserialized("loop_counter") == 3L) assert(deserialized("name") == "outer-loop") assert(deserialized("payload").asInstanceOf[Array[Byte]].sameElements(Array[Byte](0, 1, 2, 3))) @@ -194,20 +194,29 @@ class IcebergDocumentSpec extends VirtualDocumentSpec[Tuple] with BeforeAndAfter writer.close() val deserializedStates = - 
stateDocument.get().toList.map(State.fromTuple).sortBy(_("loop_counter").asInstanceOf[Long]) + stateDocument + .get() + .toList + .map(State.fromTuple) + .sortBy(_.values("loop_counter").asInstanceOf[Long]) assert(deserializedStates.length == states.length) deserializedStates.zip(states).foreach { case (actual, expected) => - assert(actual("loop_counter") == expected("loop_counter").asInstanceOf[Int].toLong) - assert(actual("i") == expected("i").asInstanceOf[Int].toLong) assert( - actual("payload") + actual.values("loop_counter") == expected.values("loop_counter").asInstanceOf[Int].toLong + ) + assert(actual.values("i") == expected.values("i").asInstanceOf[Int].toLong) + assert( + actual + .values("payload") .asInstanceOf[Array[Byte]] - .sameElements(expected("payload").asInstanceOf[Array[Byte]]) + .sameElements(expected.values("payload").asInstanceOf[Array[Byte]]) ) } assert( - deserializedStates(1)("nested").asInstanceOf[Map[String, Any]]("values") == List(3L, 4L) + deserializedStates(1) + .values("nested") + .asInstanceOf[Map[String, Any]]("values") == List(3L, 4L) ) } From fb1e038618872f29669b9ce5ccd4f37e24b6f1a6 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Sat, 2 May 2026 18:08:46 -0700 Subject: [PATCH 113/152] test(python): cover state-reader run() block and DocumentFactory routing Adds 9 unit tests targeting the codecov-flagged gaps in PR #4490: - InputPortMaterializationReaderRunnable.run() inner state-reading try-block, including the missing-state-document path (ValueError swallow). - DocumentFactory.create_document / open_document namespace routing for STATE and RESULT, plus the unsupported-resource-type and missing-table error paths. Iceberg dependencies are mocked at the document_factory import site so the tests run without Postgres. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- ...ut_port_materialization_reader_runnable.py | 97 ++++++++++++- .../core/storage/test_document_factory.py | 134 ++++++++++++++++++ 2 files changed, 229 insertions(+), 2 deletions(-) create mode 100644 amber/src/main/python/core/storage/test_document_factory.py diff --git a/amber/src/main/python/core/storage/runnables/test_input_port_materialization_reader_runnable.py b/amber/src/main/python/core/storage/runnables/test_input_port_materialization_reader_runnable.py index 9720bf4489a..3662d023f59 100644 --- a/amber/src/main/python/core/storage/runnables/test_input_port_materialization_reader_runnable.py +++ b/amber/src/main/python/core/storage/runnables/test_input_port_materialization_reader_runnable.py @@ -15,16 +15,20 @@ # specific language governing permissions and limitations # under the License. -from unittest.mock import MagicMock +from unittest.mock import MagicMock, patch import pytest from core.models import State, StateFrame, Tuple +from core.models.internal_queue import DataElement from core.models.schema import Schema from core.storage.runnables.input_port_materialization_reader_runnable import ( InputPortMaterializationReaderRunnable, ) -from proto.org.apache.texera.amber.core import ActorVirtualIdentity +from proto.org.apache.texera.amber.core import ( + ActorVirtualIdentity, + ChannelIdentity, +) class TestEmitStateWithFilter: @@ -95,3 +99,92 @@ def test_empty_partitioner_output_yields_nothing(self, runnable): runnable.partitioner.flush_state.return_value = [] assert list(runnable.emit_state_with_filter(state)) == [] + + +class TestRunStateReadingBlock: + """Cover the inner try-block in run() that opens the state document and + emits its rows as StateFrames. 
+ """ + + @pytest.fixture + def me(self): + return ActorVirtualIdentity(name="me") + + @pytest.fixture + def runnable(self, me): + instance = InputPortMaterializationReaderRunnable.__new__( + InputPortMaterializationReaderRunnable + ) + instance.uri = "vfs:///wf/0/exec/0/result/op-a" + instance.worker_actor_id = me + instance.tuple_schema = Schema(raw_schema={"x": "INTEGER"}) + instance._stopped = False + instance._finished = False + instance.channel_id = ChannelIdentity(me, me, is_control=False) + instance.queue = MagicMock() + instance.partitioner = MagicMock() + # No tuple-batches and no ECM-flush payloads in these tests. + instance.partitioner.flush.return_value = [] + return instance + + def test_state_rows_are_emitted_as_state_frames(self, runnable, me): + state_a = State({"loop_counter": 0}) + state_b = State({"loop_counter": 1}) + + # The state document yields opaque tuples; from_tuple deserializes + # them. Patch from_tuple so we don't have to wire a real serialization. + result_doc = MagicMock() + result_doc.get.return_value = iter([]) # No materialized tuples. + state_doc = MagicMock() + state_doc.get.return_value = iter(["row-a", "row-b"]) + + with ( + patch( + "core.storage.runnables.input_port_materialization_reader_runnable.DocumentFactory" + ) as mock_factory, + patch.object(State, "from_tuple") as mock_from_tuple, + ): + mock_factory.open_document.side_effect = [ + (result_doc, runnable.tuple_schema), + (state_doc, None), + ] + mock_from_tuple.side_effect = [state_a, state_b] + runnable.partitioner.flush_state.side_effect = [ + [(me, state_a)], + [(me, state_b)], + ] + + runnable.run() + + # Two StateFrames must have been put on the queue, in order. 
+ state_frames = [ + call.args[0] + for call in runnable.queue.put.call_args_list + if isinstance(call.args[0], DataElement) + and isinstance(call.args[0].payload, StateFrame) + ] + assert [sf.payload.frame for sf in state_frames] == [state_a, state_b] + assert runnable._finished is True + + def test_missing_state_document_does_not_abort_run(self, runnable): + # The inner try is meant to swallow ValueError when no state document + # is provisioned; the outer run() should still finish cleanly. + result_doc = MagicMock() + result_doc.get.return_value = iter([]) + + with patch( + "core.storage.runnables.input_port_materialization_reader_runnable.DocumentFactory" + ) as mock_factory: + mock_factory.open_document.side_effect = [ + (result_doc, runnable.tuple_schema), + ValueError("no storage"), + ] + + runnable.run() + + assert runnable._finished is True + # No StateFrames should have been emitted. + for call in runnable.queue.put.call_args_list: + element = call.args[0] + if isinstance(element, DataElement): + assert not isinstance(element.payload, StateFrame) diff --git a/amber/src/main/python/core/storage/test_document_factory.py b/amber/src/main/python/core/storage/test_document_factory.py new file mode 100644 index 00000000000..859c0040246 --- /dev/null +++ b/amber/src/main/python/core/storage/test_document_factory.py @@ -0,0 +1,134 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from unittest.mock import MagicMock, patch + +import pytest + +from core.models import Schema +from core.storage.document_factory import DocumentFactory +from core.storage.storage_config import StorageConfig +from core.storage.vfs_uri_factory import VFSResourceType + + +# Avoid initializing the real config (only initializable once per process). +StorageConfig.ICEBERG_TABLE_RESULT_NAMESPACE = "test-result-ns" +StorageConfig.ICEBERG_TABLE_STATE_NAMESPACE = "test-state-ns" + +VFS_URI = "vfs:///wid/0/eid/0/opid/test/main/0/0/result" + + +@pytest.fixture +def schema(): + return Schema(raw_schema={"x": "INTEGER"}) + + +def _decode_returning(resource_type): + """Helper: build a VFSURIFactory.decode_uri side_effect.""" + return lambda _uri: (None, None, None, resource_type) + + +@patch("core.storage.document_factory.IcebergDocument") +@patch("core.storage.document_factory.amber_schema_to_iceberg_schema") +@patch("core.storage.document_factory.create_table") +@patch("core.storage.document_factory.IcebergCatalogInstance") +@patch("core.storage.document_factory.VFSURIFactory") +class TestCreateDocumentNamespaceRouting: + def test_state_resource_type_uses_state_namespace( + self, mock_vfs, _icb, mock_create_table, _amber_schema, _doc, schema + ): + mock_vfs.VFS_FILE_URI_SCHEME = "vfs" + mock_vfs.decode_uri.side_effect = _decode_returning(VFSResourceType.STATE) + + DocumentFactory.create_document(VFS_URI, schema) + + args, _ = mock_create_table.call_args + assert args[1] == StorageConfig.ICEBERG_TABLE_STATE_NAMESPACE + + def 
test_result_resource_type_uses_result_namespace( + self, mock_vfs, _icb, mock_create_table, _amber_schema, _doc, schema + ): + mock_vfs.VFS_FILE_URI_SCHEME = "vfs" + mock_vfs.decode_uri.side_effect = _decode_returning(VFSResourceType.RESULT) + + DocumentFactory.create_document(VFS_URI, schema) + + args, _ = mock_create_table.call_args + assert args[1] == StorageConfig.ICEBERG_TABLE_RESULT_NAMESPACE + + def test_unsupported_resource_type_raises_value_error( + self, mock_vfs, _icb, _create_table, _amber_schema, _doc, schema + ): + mock_vfs.VFS_FILE_URI_SCHEME = "vfs" + # CONSOLE_MESSAGES has no namespace mapping in the Python factory. + mock_vfs.decode_uri.side_effect = _decode_returning( + VFSResourceType.CONSOLE_MESSAGES + ) + + with pytest.raises(ValueError, match="not supported"): + DocumentFactory.create_document(VFS_URI, schema) + + +def test_create_document_rejects_non_vfs_scheme(schema): + with pytest.raises(NotImplementedError, match="Unsupported URI scheme"): + DocumentFactory.create_document("file:///tmp/x", schema) + + +@patch("core.storage.document_factory.IcebergDocument") +@patch("core.storage.document_factory.Schema") +@patch("core.storage.document_factory.load_table_metadata") +@patch("core.storage.document_factory.IcebergCatalogInstance") +@patch("core.storage.document_factory.VFSURIFactory") +class TestOpenDocumentNamespaceRouting: + @staticmethod + def _stub_table(): + table = MagicMock() + table.schema.return_value.as_arrow.return_value = MagicMock() + return table + + def test_state_resource_type_uses_state_namespace( + self, mock_vfs, _icb, mock_load, _schema_cls, _doc + ): + mock_vfs.VFS_FILE_URI_SCHEME = "vfs" + mock_vfs.decode_uri.side_effect = _decode_returning(VFSResourceType.STATE) + mock_load.return_value = self._stub_table() + + DocumentFactory.open_document(VFS_URI) + + args, _ = mock_load.call_args + assert args[1] == StorageConfig.ICEBERG_TABLE_STATE_NAMESPACE + + def test_unsupported_resource_type_raises_value_error( + self, 
mock_vfs, _icb, _load, _schema_cls, _doc + ): + mock_vfs.VFS_FILE_URI_SCHEME = "vfs" + mock_vfs.decode_uri.side_effect = _decode_returning( + VFSResourceType.CONSOLE_MESSAGES + ) + + with pytest.raises(ValueError, match="not supported"): + DocumentFactory.open_document(VFS_URI) + + def test_missing_table_raises_value_error( + self, mock_vfs, _icb, mock_load, _schema_cls, _doc + ): + mock_vfs.VFS_FILE_URI_SCHEME = "vfs" + mock_vfs.decode_uri.side_effect = _decode_returning(VFSResourceType.STATE) + mock_load.return_value = None + + with pytest.raises(ValueError, match="No storage is found"): + DocumentFactory.open_document(VFS_URI) From ad60adbc1425a73b9472634dea9ddb9cba4f206b Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Mon, 4 May 2026 21:43:16 -0700 Subject: [PATCH 114/152] fix --- .../main/python/core/runnables/main_loop.py | 1 + .../python/core/runnables/test_main_loop.py | 76 +++++++++++++++++++ 2 files changed, 77 insertions(+) diff --git a/amber/src/main/python/core/runnables/main_loop.py b/amber/src/main/python/core/runnables/main_loop.py index ab35cda81b9..1334af12bfe 100644 --- a/amber/src/main/python/core/runnables/main_loop.py +++ b/amber/src/main/python/core/runnables/main_loop.py @@ -202,6 +202,7 @@ def process_input_state(self) -> None: payload=batch, ) ) + self.context.output_manager.save_state_to_storage_if_needed(output_state) def process_tuple_with_udf(self) -> Iterator[Optional[Tuple]]: """ diff --git a/amber/src/main/python/core/runnables/test_main_loop.py b/amber/src/main/python/core/runnables/test_main_loop.py index c9daa633f55..534493f0c21 100644 --- a/amber/src/main/python/core/runnables/test_main_loop.py +++ b/amber/src/main/python/core/runnables/test_main_loop.py @@ -1388,6 +1388,82 @@ def fake_switch_context(): assert second_output.payload.frame["value"] == 42 assert second_output.payload.frame["port"] == 0 + @pytest.mark.timeout(2) + def test_process_input_state_persists_output_state_to_storage( + self, + main_loop, + 
mock_data_output_channel, + monkeypatch, + ): + # process_input_state must invoke save_state_to_storage_if_needed + # with the freshly emitted output state, so every state that flows + # downstream is also durable on the upstream output port. + class DummyExecutor: + @staticmethod + def process_state(state: State, port: int) -> State: + return State({"value": state["value"] + 1, "port": port}) + + saved_states: list[State] = [] + main_loop.context.executor_manager.executor = DummyExecutor() + monkeypatch.setattr(main_loop, "_check_and_process_control", lambda: None) + monkeypatch.setattr( + main_loop.context.output_manager, + "emit_state", + lambda state: [(mock_data_output_channel.to_worker_id, StateFrame(state))], + ) + monkeypatch.setattr( + main_loop.context.output_manager, + "save_state_to_storage_if_needed", + lambda state: saved_states.append(state), + ) + + def fake_switch_context(): + current_input_state = ( + main_loop.context.state_processing_manager.current_input_state + ) + if current_input_state is not None: + main_loop.context.state_processing_manager.current_output_state = ( + DummyExecutor.process_state(current_input_state, 0) + ) + + monkeypatch.setattr(main_loop, "_switch_context", fake_switch_context) + + main_loop._process_state(State({"value": 1})) + main_loop._process_state(State({"value": 41})) + + # Each input state produced one output state, so both must have + # been persisted in order. + assert [s["value"] for s in saved_states] == [2, 42] + assert all(s["port"] == 0 for s in saved_states) + + @pytest.mark.timeout(2) + def test_process_input_state_does_not_save_when_no_output( + self, + main_loop, + monkeypatch, + ): + # When the executor returns no output state (process_state returned + # None), save_state_to_storage_if_needed must not be called -- no + # state means nothing to materialize. 
+ save_calls: list[State] = [] + monkeypatch.setattr(main_loop, "_check_and_process_control", lambda: None) + monkeypatch.setattr( + main_loop.context.output_manager, + "emit_state", + lambda state: [], + ) + monkeypatch.setattr( + main_loop.context.output_manager, + "save_state_to_storage_if_needed", + lambda state: save_calls.append(state), + ) + # Pretend DataProc consumed the input but produced no output. + monkeypatch.setattr(main_loop, "_switch_context", lambda: None) + + main_loop._process_state(State({"value": 1})) + + assert save_calls == [] + @pytest.mark.timeout(2) def test_main_loop_thread_can_process_state( self, From 5494240dff0933e2e614ba23814bb393ded3627a Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Mon, 4 May 2026 21:44:09 -0700 Subject: [PATCH 115/152] fix --- .../texera/amber/engine/architecture/worker/DataProcessor.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/worker/DataProcessor.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/worker/DataProcessor.scala index df6af1e48ea..b6c0c39aaff 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/worker/DataProcessor.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/worker/DataProcessor.scala @@ -126,7 +126,7 @@ class DataProcessor( val outputState = executor.processState(state, port) if (outputState.isDefined) { outputManager.emitState(outputState.get) - outputManager.saveStateToStorageIfNeeded(state) + outputManager.saveStateToStorageIfNeeded(outputState.get) } } catch safely { case e => From 7f8376ea80302392f27e22fb79416b60ab6b1b85 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Mon, 4 May 2026 21:48:17 -0700 Subject: [PATCH 116/152] fix --- .../runnables/input_port_materialization_reader_runnable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/amber/src/main/python/core/storage/runnables/input_port_materialization_reader_runnable.py b/amber/src/main/python/core/storage/runnables/input_port_materialization_reader_runnable.py index 22b9bce51a7..bc2f069157e 100644 --- a/amber/src/main/python/core/storage/runnables/input_port_materialization_reader_runnable.py +++ b/amber/src/main/python/core/storage/runnables/input_port_materialization_reader_runnable.py @@ -125,7 +125,7 @@ def tuple_to_batch_with_filter(self, tuple_: Tuple) -> typing.Iterator[DataFrame if receiver == self.worker_actor_id: yield self.tuples_to_data_frame(tuples) - def emit_state_with_filter(self, state: State) -> typing.Iterator[StateFrame]: + def emit_state_with_filter(self, state: State) -> typing.Iterator[DataPayload]: for receiver, payload in self.partitioner.flush_state(state): if receiver == self.worker_actor_id: yield ( From 1645717d7529f3e6a26384ea33e4e7a636854e75 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Mon, 4 May 2026 22:10:52 -0700 Subject: [PATCH 117/152] fix --- .../architecture/packaging/output_manager.py | 38 +++-- .../packaging/test_output_manager.py | 145 ++++++++---------- .../messaginglayer/OutputManager.scala | 30 ++-- 3 files changed, 96 insertions(+), 117 deletions(-) diff --git a/amber/src/main/python/core/architecture/packaging/output_manager.py b/amber/src/main/python/core/architecture/packaging/output_manager.py index 112c649c6a0..08fa210eca5 100644 --- a/amber/src/main/python/core/architecture/packaging/output_manager.py +++ b/amber/src/main/python/core/architecture/packaging/output_manager.py @@ -17,7 +17,6 @@ import threading import typing -import uuid from collections import OrderedDict from itertools import chain from loguru import logger @@ -88,7 +87,7 @@ def __init__(self, worker_id: str): PortIdentity, typing.Tuple[Queue, PortStorageWriter, Thread] ] = dict() - self._storage_uris: typing.Dict[PortIdentity, str] = dict() + self._state_writers: typing.Dict[PortIdentity, typing.Any] = dict() def 
is_missing_output_ports(self): """ @@ -127,9 +126,9 @@ def add_output_port( def set_up_port_storage_writer(self, port_id: PortIdentity, storage_uri: str): """ Create a separate thread for saving output tuples of a port - to storage in batch. + to storage in batch, and open a long-lived buffered writer for + state materialization on the same port. """ - self._storage_uris[port_id] = storage_uri document, _ = DocumentFactory.open_document(storage_uri) buffered_item_writer = document.writer(str(get_worker_index(self.worker_id))) writer_queue = Queue() @@ -148,6 +147,13 @@ def set_up_port_storage_writer(self, port_id: PortIdentity, storage_uri: str): writer_thread, ) + state_document, _ = DocumentFactory.open_document( + State.uri_from_result_uri(storage_uri) + ) + state_writer = state_document.writer(str(get_worker_index(self.worker_id))) + state_writer.open() + self._state_writers[port_id] = state_writer + def get_port(self, port_id=None) -> WorkerPort: return list(self._ports.values())[0] @@ -176,22 +182,17 @@ def save_tuple_to_storage_if_needed(self, tuple_: Tuple, port_id=None) -> None: ) def save_state_to_storage_if_needed(self, state: State, port_id=None) -> None: + # Buffer the state on each long-lived writer; the writer flushes + # itself when its buffer fills, and the remaining buffer is + # flushed in close_port_storage_writers. 
if port_id is None: - uris = self._storage_uris.values() - elif port_id in self._storage_uris: - uris = [self._storage_uris[port_id]] + writers = self._state_writers.values() + elif port_id in self._state_writers: + writers = [self._state_writers[port_id]] else: return - - for uri in uris: - state_uri = State.uri_from_result_uri(uri) - try: - document = DocumentFactory.open_document(state_uri)[0] - except ValueError: - document = DocumentFactory.create_document(state_uri, State.SCHEMA) - writer = document.writer(str(uuid.uuid4())) + for writer in writers: writer.put_one(state.to_tuple()) - writer.close() def close_port_storage_writers(self) -> None: """ @@ -206,6 +207,11 @@ def close_port_storage_writers(self) -> None: for _, _, writer_thread in self._port_storage_writers.values(): # This blocking call will wait for all the writer to finish commit writer_thread.join() + # Close the long-lived state writers so the remaining buffered + # states are committed in a single Iceberg snapshot per port. 
+ for state_writer in self._state_writers.values(): + state_writer.close() + self._state_writers.clear() def add_partitioning(self, tag: PhysicalLink, partitioning: Partitioning) -> None: """ diff --git a/amber/src/main/python/core/architecture/packaging/test_output_manager.py b/amber/src/main/python/core/architecture/packaging/test_output_manager.py index 873b94a27a8..8f1daf8052e 100644 --- a/amber/src/main/python/core/architecture/packaging/test_output_manager.py +++ b/amber/src/main/python/core/architecture/packaging/test_output_manager.py @@ -41,112 +41,87 @@ def port_b(self): def state(self): return State({"loop_counter": 1, "i": 2}) - def _stub_document_factory(self, mock_factory): - document = MagicMock() - writer = MagicMock() - document.writer.return_value = writer - mock_factory.open_document.return_value = (document, MagicMock()) - return document, writer - - def test_no_storage_uris_is_a_noop(self, output_manager, state): - # save_state_to_storage_if_needed must not touch DocumentFactory when - # the worker has no provisioned output storage. - with patch( - "core.architecture.packaging.output_manager.DocumentFactory" - ) as mock_factory: - output_manager.save_state_to_storage_if_needed(state) - mock_factory.open_document.assert_not_called() - mock_factory.create_document.assert_not_called() + def test_no_state_writers_is_a_noop(self, output_manager, state): + # With no port set up, save_state_to_storage_if_needed must not + # touch any writer. 
+ output_manager.save_state_to_storage_if_needed(state) # no-op, no exception def test_unknown_port_id_is_a_noop(self, output_manager, state, port_a): - with patch( - "core.architecture.packaging.output_manager.DocumentFactory" - ) as mock_factory: - output_manager.save_state_to_storage_if_needed(state, port_id=port_a) - mock_factory.open_document.assert_not_called() + output_manager.save_state_to_storage_if_needed(state, port_id=port_a) + # No assertion needed -- the absence of any writer means nothing + # was attempted. def test_writes_to_every_port_when_port_id_omitted( self, output_manager, state, port_a, port_b ): - output_manager._storage_uris[port_a] = "vfs:///wf/0/exec/0/result/op-a" - output_manager._storage_uris[port_b] = "vfs:///wf/0/exec/0/result/op-b" + writer_a = MagicMock() + writer_b = MagicMock() + output_manager._state_writers[port_a] = writer_a + output_manager._state_writers[port_b] = writer_b - with patch( - "core.architecture.packaging.output_manager.DocumentFactory" - ) as mock_factory: - _, writer = self._stub_document_factory(mock_factory) - - output_manager.save_state_to_storage_if_needed(state) + output_manager.save_state_to_storage_if_needed(state) - assert mock_factory.open_document.call_count == 2 - opened_uris = { - call.args[0] for call in mock_factory.open_document.call_args_list - } - assert opened_uris == { - "vfs:///wf/0/exec/0/state/op-a", - "vfs:///wf/0/exec/0/state/op-b", - } - assert writer.put_one.call_count == 2 - assert writer.close.call_count == 2 + writer_a.put_one.assert_called_once() + writer_b.put_one.assert_called_once() + # Long-lived writers must NOT be closed per state -- otherwise + # we'd be back to one Iceberg snapshot per state. 
+ writer_a.close.assert_not_called() + writer_b.close.assert_not_called() def test_writes_only_to_selected_port_when_port_id_specified( self, output_manager, state, port_a, port_b ): - output_manager._storage_uris[port_a] = "vfs:///wf/0/exec/0/result/op-a" - output_manager._storage_uris[port_b] = "vfs:///wf/0/exec/0/result/op-b" + writer_a = MagicMock() + writer_b = MagicMock() + output_manager._state_writers[port_a] = writer_a + output_manager._state_writers[port_b] = writer_b - with patch( - "core.architecture.packaging.output_manager.DocumentFactory" - ) as mock_factory: - self._stub_document_factory(mock_factory) + output_manager.save_state_to_storage_if_needed(state, port_id=port_a) - output_manager.save_state_to_storage_if_needed(state, port_id=port_a) + writer_a.put_one.assert_called_once() + writer_b.put_one.assert_not_called() - assert mock_factory.open_document.call_count == 1 - assert ( - mock_factory.open_document.call_args.args[0] - == "vfs:///wf/0/exec/0/state/op-a" - ) + def test_state_writer_is_opened_at_port_setup(self, output_manager, port_a): + # set_up_port_storage_writer should open the result document AND + # the state document, then cache the state writer for reuse. + result_doc = MagicMock() + state_doc = MagicMock() + state_writer = MagicMock() + state_doc.writer.return_value = state_writer - def test_creates_document_when_open_raises_value_error( - self, output_manager, state, port_a - ): - # The first time a state is saved, the state document does not yet - # exist; open_document raises ValueError and we must fall back to - # create_document so the state still gets written. 
- output_manager._storage_uris[port_a] = "vfs:///wf/0/exec/0/result/op-a" - - with patch( - "core.architecture.packaging.output_manager.DocumentFactory" - ) as mock_factory: - mock_factory.open_document.side_effect = ValueError("not found") - created_document = MagicMock() - writer = MagicMock() - created_document.writer.return_value = writer - mock_factory.create_document.return_value = created_document - - output_manager.save_state_to_storage_if_needed(state) - - mock_factory.create_document.assert_called_once_with( - "vfs:///wf/0/exec/0/state/op-a", State.SCHEMA - ) - writer.put_one.assert_called_once() - writer.close.assert_called_once() - - def test_uri_is_recorded_when_storage_writer_is_set_up( - self, output_manager, port_a - ): - # set_up_port_storage_writer should populate _storage_uris so that a - # subsequent save_state_to_storage_if_needed can find the URI. with patch( "core.architecture.packaging.output_manager.DocumentFactory" ) as mock_factory: - mock_factory.open_document.return_value = (MagicMock(), MagicMock()) + mock_factory.open_document.side_effect = [ + (result_doc, MagicMock()), + (state_doc, MagicMock()), + ] output_manager.set_up_port_storage_writer( port_a, "vfs:///wf/0/exec/0/result/op-a" ) - assert ( - output_manager._storage_uris[port_a] == "vfs:///wf/0/exec/0/result/op-a" - ) + opened = [c.args[0] for c in mock_factory.open_document.call_args_list] + assert opened == [ + "vfs:///wf/0/exec/0/result/op-a", + "vfs:///wf/0/exec/0/state/op-a", + ] + state_writer.open.assert_called_once() + assert output_manager._state_writers[port_a] is state_writer + + def test_close_port_storage_writers_flushes_state_writers( + self, output_manager, port_a, port_b + ): + # After the port completes, the long-lived state writer's buffer + # must be flushed and the writer closed (one Iceberg commit per + # port instead of one per state). 
+ writer_a = MagicMock() + writer_b = MagicMock() + output_manager._state_writers[port_a] = writer_a + output_manager._state_writers[port_b] = writer_b + + output_manager.close_port_storage_writers() + + writer_a.close.assert_called_once() + writer_b.close.assert_called_once() + assert output_manager._state_writers == {} diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/messaginglayer/OutputManager.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/messaginglayer/OutputManager.scala index 3b8caa4d671..80f22ace790 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/messaginglayer/OutputManager.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/messaginglayer/OutputManager.scala @@ -124,7 +124,8 @@ class OutputManager( : mutable.HashMap[PortIdentity, OutputPortResultWriterThread] = mutable.HashMap() - private val storageUris: mutable.ArrayBuffer[URI] = mutable.ArrayBuffer() + private val stateWriters: mutable.HashMap[PortIdentity, BufferedItemWriter[Tuple]] = + mutable.HashMap() /** * Add down stream operator and its corresponding Partitioner. 
@@ -235,19 +236,7 @@ class OutputManager( } def saveStateToStorageIfNeeded(state: State): Unit = { - try { - storageUris.foreach { uri => - val writer = DocumentFactory - .openDocument(State.uriFromResultUri(uri)) - ._1 - .writer(VirtualIdentityUtils.getWorkerIndex(actorId).toString) - .asInstanceOf[BufferedItemWriter[Tuple]] - writer.putOne(state.toTuple) - writer.close() - } - } catch { - case _: Exception => () - } + stateWriters.values.foreach(_.putOne(state.toTuple)) } /** @@ -263,7 +252,7 @@ class OutputManager( writerThread.join() case None => } - + this.stateWriters.remove(outputPortId).foreach(_.close()) } def getPort(portId: PortIdentity): WorkerPort = ports(portId) @@ -298,7 +287,6 @@ class OutputManager( } private def setupOutputStorageWriterThread(portId: PortIdentity, storageUri: URI): Unit = { - this.storageUris += storageUri val bufferedItemWriter = DocumentFactory .openDocument(storageUri) ._1 @@ -307,6 +295,16 @@ class OutputManager( val writerThread = new OutputPortResultWriterThread(bufferedItemWriter) this.outputPortResultWriterThreads(portId) = writerThread writerThread.start() + + // The state document is provisioned alongside the result document + // by RegionExecutionCoordinator, so it is always present. + val stateWriter = DocumentFactory + .openDocument(State.uriFromResultUri(storageUri)) + ._1 + .writer(VirtualIdentityUtils.getWorkerIndex(actorId).toString) + .asInstanceOf[BufferedItemWriter[Tuple]] + stateWriter.open() + this.stateWriters(portId) = stateWriter } } From f313ee58a6fe10659829fa05b0c17b078e86469e Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Thu, 14 May 2026 22:40:13 -0700 Subject: [PATCH 118/152] fix(pyamber): track _storage_uris so reset_loopend_storage works reset_loopend_storage referenced self._storage_uris[port_id] but the dict was never created or populated, so any LoopEnd iteration would raise AttributeError on the first reset call. Initialize the dict in __init__ and populate it in set_up_port_storage_writer. 
--- .../python/core/architecture/packaging/output_manager.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/amber/src/main/python/core/architecture/packaging/output_manager.py b/amber/src/main/python/core/architecture/packaging/output_manager.py index feb1d0e1a60..343c40a9770 100644 --- a/amber/src/main/python/core/architecture/packaging/output_manager.py +++ b/amber/src/main/python/core/architecture/packaging/output_manager.py @@ -92,6 +92,12 @@ def __init__(self, worker_id: str): PortIdentity, typing.Tuple[Queue, PortStorageWriter, Thread] ] = dict() + # Track the port base URI per output port so loop-end operators can + # recreate the storage documents on each loop iteration via + # `reset_loopend_storage`. Without this, the reset path has no way + # to look up which iceberg tables to drop and re-provision. + self._storage_uris: typing.Dict[PortIdentity, str] = dict() + def is_missing_output_ports(self): """ This method is only used for ensuring correct region execution. @@ -133,6 +139,9 @@ def set_up_port_storage_writer(self, port_id: PortIdentity, storage_uri_base: st state materialization on the same port. `storage_uri_base` is the port's base URI; the result and state URIs are derived from it. """ + # Remember the base URI so `reset_loopend_storage` can re-provision + # this port's iceberg tables on subsequent loop iterations. 
+ self._storage_uris[port_id] = storage_uri_base document, _ = DocumentFactory.open_document( VFSURIFactory.result_uri(storage_uri_base) ) From d635c57a2339631be39774e1110af88300f7ddf3 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Thu, 14 May 2026 23:09:35 -0700 Subject: [PATCH 119/152] update --- .../texera/amber/engine/architecture/worker/DataProcessor.scala | 2 -- 1 file changed, 2 deletions(-) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/worker/DataProcessor.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/worker/DataProcessor.scala index 0fc21129daf..84f1e8ec659 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/worker/DataProcessor.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/worker/DataProcessor.scala @@ -125,8 +125,6 @@ class DataProcessor( try { val outputState = executor.processState(state, port) if (outputState.isDefined) { - // emitState already persists the state via the per-port state - // writers, so no explicit save call is needed here. outputManager.emitState(outputState.get) } } catch safely { From 8294e9653c1538809c8b8878fd0d8885ce544953 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Thu, 14 May 2026 23:12:10 -0700 Subject: [PATCH 120/152] fix(loop): drop stale uri_from_result_uri / uriFromResultUri helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After the main merge, the materialization reader stores a *port base URI* (not a result URI). State.uri_from_result_uri's substring replace of "/result" → "/state" silently returns the base URI unchanged on base URIs (no "/result" substring), and the unchanged base URI then hits DocumentFactory.create_document → decode_uri, which throws on a missing resource-type segment. The loop-jump-back path was broken. - Replace State.uri_from_result_uri(...) 
at main_loop.py:100 with VFSURIFactory.state_uri(...), which is the canonical way to derive a state URI from a port base URI. - Rename InputManager.get_input_state_result_uri -> get_input_port_base_uri so the name matches what it returns now (a base URI, after the base-URI refactor that landed via PR #4490). - Delete State.uri_from_result_uri (Python) and State.uriFromResultUri (Scala). Both have no remaining callers and would silently corrupt any new call site that handed them a base URI. - Drop the now-unused java.net.URI import from State.scala. --- .../python/core/architecture/packaging/input_manager.py | 7 ++++++- amber/src/main/python/core/models/state.py | 4 ---- amber/src/main/python/core/runnables/main_loop.py | 5 +++-- .../scala/org/apache/texera/amber/core/state/State.scala | 4 ---- 4 files changed, 9 insertions(+), 11 deletions(-) diff --git a/amber/src/main/python/core/architecture/packaging/input_manager.py b/amber/src/main/python/core/architecture/packaging/input_manager.py index af40423caec..e9aae2127a2 100644 --- a/amber/src/main/python/core/architecture/packaging/input_manager.py +++ b/amber/src/main/python/core/architecture/packaging/input_manager.py @@ -174,5 +174,10 @@ def _process_data(self, table: Table) -> Iterator[Tuple]: {name: field_accessor for name in table.column_names}, schema=schema ) - def get_input_state_result_uri(self): + def get_input_port_base_uri(self): + """Return the port base URI of the first materialization reader. + + Use `VFSURIFactory.result_uri(...)` / `state_uri(...)` on the + returned value to get the actual result / state document URI. 
+ """ return next(iter(self._input_port_mat_reader_runnables.values()))[0].uri diff --git a/amber/src/main/python/core/models/state.py b/amber/src/main/python/core/models/state.py index 3ce610bbee5..003aaa212ac 100644 --- a/amber/src/main/python/core/models/state.py +++ b/amber/src/main/python/core/models/state.py @@ -41,10 +41,6 @@ def from_json(cls, payload: str) -> "State": def from_tuple(cls, row: Tuple) -> "State": return cls.from_json(row[cls.CONTENT]) - @staticmethod - def uri_from_result_uri(result_uri: str) -> str: - return result_uri.replace("/result", "/state") - _TYPE_MARKER = "__texera_type__" _PAYLOAD_MARKER = "payload" diff --git a/amber/src/main/python/core/runnables/main_loop.py b/amber/src/main/python/core/runnables/main_loop.py index e30d202334a..e2486616dbc 100644 --- a/amber/src/main/python/core/runnables/main_loop.py +++ b/amber/src/main/python/core/runnables/main_loop.py @@ -42,6 +42,7 @@ from core.models.state import State from core.runnables.data_processor import DataProcessor from core.storage.document_factory import DocumentFactory +from core.storage.vfs_uri_factory import VFSURIFactory from core.util import StoppableQueueBlockingRunnable, get_one_of from core.util.console_message.timestamp import current_time_in_local_timezone from core.util.customized_queue.queue_base import QueueElement @@ -97,8 +98,8 @@ def _attach_loop_start_id(self, output_state: State) -> None: output_state["LoopStartId"] = self.context.worker_id.split("-", 1)[1].rsplit( "-main-0", 1 )[0] - output_state["LoopStartStateURI"] = State.uri_from_result_uri( - self.context.input_manager.get_input_state_result_uri() + output_state["LoopStartStateURI"] = VFSURIFactory.state_uri( + self.context.input_manager.get_input_port_base_uri() ) def _jump_to_loop_start( diff --git a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/State.scala b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/State.scala index 532f355c17e..ba146f1d57c 
100644 --- a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/State.scala +++ b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/state/State.scala @@ -23,7 +23,6 @@ import com.fasterxml.jackson.databind.JsonNode import org.apache.texera.amber.core.tuple.{Attribute, AttributeType, Schema, Tuple} import org.apache.texera.amber.util.JSONUtils.objectMapper -import java.net.URI import java.util.Base64 import scala.jdk.CollectionConverters.IteratorHasAsScala @@ -58,9 +57,6 @@ object State { def fromTuple(row: Tuple): State = fromJson(row.getField[String](Content)) - def uriFromResultUri(resultUri: URI): URI = - new URI(resultUri.toString.replace("/result", "/state")) - private def toJsonValue(value: Any): Any = value match { case null => null From 85e8cc286e7c5bae3a1ac5ef4b22938178dd4170 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Thu, 14 May 2026 23:35:37 -0700 Subject: [PATCH 121/152] init --- amber/src/test/python/core/runnables/test_main_loop.py | 1 + 1 file changed, 1 insertion(+) diff --git a/amber/src/test/python/core/runnables/test_main_loop.py b/amber/src/test/python/core/runnables/test_main_loop.py index 39566a45339..c32f45b8886 100644 --- a/amber/src/test/python/core/runnables/test_main_loop.py +++ b/amber/src/test/python/core/runnables/test_main_loop.py @@ -1441,6 +1441,7 @@ def fake_switch_context(): assert [s["value"] for s in saved_states] == [2, 42] assert all(s["port"] == 0 for s in saved_states) + @pytest.mark.timeout(2) def test_process_start_channel_persists_produce_state_on_start_output( self, main_loop, From 08d3834cab7e091355e455f1c728cfb00a892401 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Thu, 14 May 2026 23:37:01 -0700 Subject: [PATCH 122/152] init --- .../worker/managers/InputPortMaterializationReaderThread.scala | 1 - 1 file changed, 1 deletion(-) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/worker/managers/InputPortMaterializationReaderThread.scala 
b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/worker/managers/InputPortMaterializationReaderThread.scala index 653b6118eab..e330c0bc4a7 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/worker/managers/InputPortMaterializationReaderThread.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/worker/managers/InputPortMaterializationReaderThread.scala @@ -140,7 +140,6 @@ class InputPortMaterializationReaderThread( } // Flush any remaining tuples in the buffer. if (buffer.nonEmpty) flush() - emitECM(METHOD_END_CHANNEL, PORT_ALIGNMENT) isFinished.set(true) } catch { From fffe35525e1d91e8cc10ba1c6f0b63932519caa9 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Thu, 14 May 2026 23:37:31 -0700 Subject: [PATCH 123/152] refactor(amber): extract workerIdx once in setupOutputStorageWriterThread The result-writer and state-writer setups in setupOutputStorageWriterThread both unwrapped VirtualIdentityUtils.getWorkerIndex(actorId).getOrElse(throw ...).toString inline. The merge resolution duplicated the 9-line boilerplate across both writers; factor it into a single local val so the two writers share the same worker-index format and a future change touches it once. 
--- .../messaginglayer/OutputManager.scala | 33 ++++++++----------- 1 file changed, 13 insertions(+), 20 deletions(-) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/messaginglayer/OutputManager.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/messaginglayer/OutputManager.scala index 456f3eecd33..c20015dcc8d 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/messaginglayer/OutputManager.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/messaginglayer/OutputManager.scala @@ -303,19 +303,21 @@ class OutputManager( } private def setupOutputStorageWriterThread(portId: PortIdentity, portBaseURI: URI): Unit = { + // Worker index ends up in the iceberg file name; `.toString` on + // Option would leak "Some(...)" into the file name, so unwrap. + val workerIdx = VirtualIdentityUtils + .getWorkerIndex(actorId) + .getOrElse( + throw new IllegalStateException( + s"Expected worker actor id for output storage writer, got: ${actorId.name}" + ) + ) + .toString + val bufferedItemWriter = DocumentFactory .openDocument(VFSURIFactory.resultURI(portBaseURI)) ._1 - .writer( - VirtualIdentityUtils - .getWorkerIndex(actorId) - .getOrElse( - throw new IllegalStateException( - s"Expected worker actor id for output storage writer, got: ${actorId.name}" - ) - ) - .toString - ) + .writer(workerIdx) .asInstanceOf[BufferedItemWriter[Tuple]] val writerThread = new OutputPortStorageWriterThread(bufferedItemWriter) this.outputPortResultWriterThreads(portId) = writerThread @@ -326,16 +328,7 @@ class OutputManager( val stateWriter = DocumentFactory .openDocument(VFSURIFactory.stateURI(portBaseURI)) ._1 - .writer( - VirtualIdentityUtils - .getWorkerIndex(actorId) - .getOrElse( - throw new IllegalStateException( - s"Expected worker actor id for output storage writer, got: ${actorId.name}" - ) - ) - .toString - ) + .writer(workerIdx) .asInstanceOf[BufferedItemWriter[Tuple]] val stateWriterThread = 
new OutputPortStorageWriterThread(stateWriter) this.stateWriterThreads(portId) = stateWriterThread From 0cf9a0fa3da43429f1f35e8f2921d8e6d0ffc9d0 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Thu, 14 May 2026 23:38:36 -0700 Subject: [PATCH 124/152] Revert "refactor(amber): extract workerIdx once in setupOutputStorageWriterThread" This reverts commit fffe35525e1d91e8cc10ba1c6f0b63932519caa9. --- .../messaginglayer/OutputManager.scala | 33 +++++++++++-------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/messaginglayer/OutputManager.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/messaginglayer/OutputManager.scala index c20015dcc8d..456f3eecd33 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/messaginglayer/OutputManager.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/messaginglayer/OutputManager.scala @@ -303,21 +303,19 @@ class OutputManager( } private def setupOutputStorageWriterThread(portId: PortIdentity, portBaseURI: URI): Unit = { - // Worker index ends up in the iceberg file name; `.toString` on - // Option would leak "Some(...)" into the file name, so unwrap. 
- val workerIdx = VirtualIdentityUtils - .getWorkerIndex(actorId) - .getOrElse( - throw new IllegalStateException( - s"Expected worker actor id for output storage writer, got: ${actorId.name}" - ) - ) - .toString - val bufferedItemWriter = DocumentFactory .openDocument(VFSURIFactory.resultURI(portBaseURI)) ._1 - .writer(workerIdx) + .writer( + VirtualIdentityUtils + .getWorkerIndex(actorId) + .getOrElse( + throw new IllegalStateException( + s"Expected worker actor id for output storage writer, got: ${actorId.name}" + ) + ) + .toString + ) .asInstanceOf[BufferedItemWriter[Tuple]] val writerThread = new OutputPortStorageWriterThread(bufferedItemWriter) this.outputPortResultWriterThreads(portId) = writerThread @@ -328,7 +326,16 @@ class OutputManager( val stateWriter = DocumentFactory .openDocument(VFSURIFactory.stateURI(portBaseURI)) ._1 - .writer(workerIdx) + .writer( + VirtualIdentityUtils + .getWorkerIndex(actorId) + .getOrElse( + throw new IllegalStateException( + s"Expected worker actor id for output storage writer, got: ${actorId.name}" + ) + ) + .toString + ) .asInstanceOf[BufferedItemWriter[Tuple]] val stateWriterThread = new OutputPortStorageWriterThread(stateWriter) this.stateWriterThreads(portId) = stateWriterThread From 7a320f603061edae22ad5ad287a023aa11052926 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Thu, 14 May 2026 23:52:10 -0700 Subject: [PATCH 125/152] init --- .../architecture/messaginglayer/OutputManager.scala | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/messaginglayer/OutputManager.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/messaginglayer/OutputManager.scala index 456f3eecd33..030fa3a3bbd 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/messaginglayer/OutputManager.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/messaginglayer/OutputManager.scala @@ -326,16 
+326,7 @@ class OutputManager( val stateWriter = DocumentFactory .openDocument(VFSURIFactory.stateURI(portBaseURI)) ._1 - .writer( - VirtualIdentityUtils - .getWorkerIndex(actorId) - .getOrElse( - throw new IllegalStateException( - s"Expected worker actor id for output storage writer, got: ${actorId.name}" - ) - ) - .toString - ) + .writer(VirtualIdentityUtils.getWorkerIndex(actorId).toString) .asInstanceOf[BufferedItemWriter[Tuple]] val stateWriterThread = new OutputPortStorageWriterThread(stateWriter) this.stateWriterThreads(portId) = stateWriterThread From 171f7d6ad693f37a3a4d09e3255a1daec61dfb2f Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Thu, 14 May 2026 23:55:38 -0700 Subject: [PATCH 126/152] fix fmt --- ...InputPortMaterializationReaderThread.scala | 30 +++++++------------ 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/worker/managers/InputPortMaterializationReaderThread.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/worker/managers/InputPortMaterializationReaderThread.scala index e330c0bc4a7..428d9fb48cb 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/worker/managers/InputPortMaterializationReaderThread.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/worker/managers/InputPortMaterializationReaderThread.scala @@ -97,25 +97,17 @@ class InputPortMaterializationReaderThread( // Notify the input port of start of input channel emitECM(METHOD_START_CHANNEL, NO_ALIGNMENT) try { - // Loop-specific: state document may not always be provisioned for - // a materialization in this branch, so guard the open(). Replay - // states before tuples so downstream operators have their state - // set up before processing the incoming tuples. 
- try { - val stateDocument = - DocumentFactory - .openDocument(VFSURIFactory.stateURI(uri)) - ._1 - .asInstanceOf[VirtualDocument[Tuple]] - val stateReadIterator = stateDocument.get() - while (stateReadIterator.hasNext) { - val state = State.fromTuple(stateReadIterator.next()) - inputMessageQueue.put( - FIFOMessageElement(WorkflowFIFOMessage(channelId, getSequenceNumber, StateFrame(state))) - ) - } - } catch { - case _: Exception => + val stateDocument = + DocumentFactory + .openDocument(VFSURIFactory.stateURI(uri)) + ._1 + .asInstanceOf[VirtualDocument[Tuple]] + val stateReadIterator = stateDocument.get() + while (stateReadIterator.hasNext) { + val state = State.fromTuple(stateReadIterator.next()) + inputMessageQueue.put( + FIFOMessageElement(WorkflowFIFOMessage(channelId, getSequenceNumber, StateFrame(state))) + ) } val materialization: VirtualDocument[Tuple] = DocumentFactory From 538a821fef582efefbf143c66434febc1aa9d20f Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Thu, 14 May 2026 23:57:20 -0700 Subject: [PATCH 127/152] fix fmt --- ...put_port_materialization_reader_runnable.py | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/amber/src/main/python/core/storage/runnables/input_port_materialization_reader_runnable.py b/amber/src/main/python/core/storage/runnables/input_port_materialization_reader_runnable.py index e3ee27e4157..8a7e426a04b 100644 --- a/amber/src/main/python/core/storage/runnables/input_port_materialization_reader_runnable.py +++ b/amber/src/main/python/core/storage/runnables/input_port_materialization_reader_runnable.py @@ -148,19 +148,11 @@ def run(self) -> None: ) self.emit_ecm("StartChannel", EmbeddedControlMessageType.NO_ALIGNMENT) - # State is broadcast to every downstream worker (no partitioner - # filtering, unlike the tuple loop) -- per the design comment - # above. 
Loop-specific: guard with try/except since the state - # document may not be provisioned on every materialization in - # this branch (the LoopEnd path open-or-creates it). - try: - state_document, _ = DocumentFactory.open_document( - VFSURIFactory.state_uri(self.uri) - ) - for state_row in state_document.get(): - self.emit_payload(StateFrame(State.from_tuple(state_row))) - except ValueError: - pass + state_document, _ = DocumentFactory.open_document( + VFSURIFactory.state_uri(self.uri) + ) + for state_row in state_document.get(): + self.emit_payload(StateFrame(State.from_tuple(state_row))) storage_iterator = self.materialization.get() # Iterate and process tuples. From 45025395ae798eea10e60b05b4e8078fb49290e8 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Fri, 15 May 2026 00:00:15 -0700 Subject: [PATCH 128/152] fix fmt --- .../runnables/input_port_materialization_reader_runnable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/amber/src/main/python/core/storage/runnables/input_port_materialization_reader_runnable.py b/amber/src/main/python/core/storage/runnables/input_port_materialization_reader_runnable.py index 8a7e426a04b..3e0e2d48ab5 100644 --- a/amber/src/main/python/core/storage/runnables/input_port_materialization_reader_runnable.py +++ b/amber/src/main/python/core/storage/runnables/input_port_materialization_reader_runnable.py @@ -17,8 +17,8 @@ import typing from loguru import logger -from typing import Union from pyarrow import Table +from typing import Union from core.architecture.sendsemantics.broad_cast_partitioner import ( BroadcastPartitioner, From 53e4d798f44fac91069c91590ff4b4f4c441e100 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Fri, 15 May 2026 00:03:45 -0700 Subject: [PATCH 129/152] refactor(storage): replace LoopEnd open-or-create try/catch with explicit existence check Add DocumentFactory.documentExists(uri) so callers don't have to call openDocument inside a try/catch to test whether a VFS document already exists. 
RegionExecutionCoordinator's LoopEnd branch was using that pattern to avoid clobbering tables from a previous iteration. The new shape is symmetric for both result and state URIs: if (!isLoopEndRegion || !DocumentFactory.documentExists(uri)) { DocumentFactory.createDocument(uri, schema) } Behavior is unchanged: LoopEnd regions preserve existing documents across iterations; all other regions always (re)create. --- .../RegionExecutionCoordinator.scala | 23 ++++--------- .../amber/core/storage/DocumentFactory.scala | 34 +++++++++++++++++++ 2 files changed, 41 insertions(+), 16 deletions(-) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala index 70a7e6de1cf..f5b3ace265a 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala @@ -578,23 +578,14 @@ class RegionExecutionCoordinator( schemaOptional.getOrElse(throw new IllegalStateException("Schema is missing")) // LoopEnd operators may re-execute the region multiple times; on // subsequent iterations the result/state documents already exist, - // so open-or-create instead of unconditional create to avoid - // clobbering existing data. - if (region.getOperators.exists(_.id.logicalOpId.id.startsWith("LoopEnd-operator-"))) { - try { - DocumentFactory.openDocument(resultURI) - } catch { - case _: Exception => - DocumentFactory.createDocument(resultURI, schema) - } - try { - DocumentFactory.openDocument(stateURI) - } catch { - case _: Exception => - DocumentFactory.createDocument(stateURI, State.schema) - } - } else { + // and `createDocument` (overrideIfExists=true) would clobber them. + // Skip the create call when the document is already there. 
+ val isLoopEndRegion = + region.getOperators.exists(_.id.logicalOpId.id.startsWith("LoopEnd-operator-")) + if (!isLoopEndRegion || !DocumentFactory.documentExists(resultURI)) { DocumentFactory.createDocument(resultURI, schema) + } + if (!isLoopEndRegion || !DocumentFactory.documentExists(stateURI)) { DocumentFactory.createDocument(stateURI, State.schema) } if (!isRestart) { diff --git a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/DocumentFactory.scala b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/DocumentFactory.scala index 00f6c70ba73..8356493ed23 100644 --- a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/DocumentFactory.scala +++ b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/DocumentFactory.scala @@ -103,6 +103,40 @@ object DocumentFactory { } } + /** + * Check whether a document exists at the given URI without opening it. + * + * Returns true iff the underlying storage already has an entry for this + * URI (e.g., an iceberg table at the resolved namespace + storage key). + * Useful for "create only if absent" flows that would otherwise have to + * call `openDocument` inside a try/catch to test existence. 
+ */ + def documentExists(uri: URI): Boolean = { + uri.getScheme match { + case VFS_FILE_URI_SCHEME => + val (_, _, _, resourceType) = decodeURI(uri) + val storageKey = sanitizeURIPath(uri) + + val namespace = resourceType match { + case RESULT => StorageConfig.icebergTableResultNamespace + case CONSOLE_MESSAGES => StorageConfig.icebergTableConsoleMessagesNamespace + case RUNTIME_STATISTICS => StorageConfig.icebergTableRuntimeStatisticsNamespace + case STATE => StorageConfig.icebergTableStateNamespace + case _ => + throw new IllegalArgumentException(s"Resource type $resourceType is not supported") + } + + IcebergUtil + .loadTableMetadata(IcebergCatalogInstance.getInstance(), namespace, storageKey) + .isDefined + + case unsupportedScheme => + throw new UnsupportedOperationException( + s"Unsupported URI scheme: $unsupportedScheme for checking the document" + ) + } + } + /** * Open a document specified by the uri. * If the document is storing structural data, the schema will also be returned From 2aec6495e7ab390fd7caebefb2fe251b83a0b57b Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Fri, 15 May 2026 00:06:19 -0700 Subject: [PATCH 130/152] refactor: hoist isLoopEndRegion branch outside the per-URI guards Splits the LoopEnd vs. normal-region paths cleanly: the LoopEnd branch does documentExists guards for both URIs, the non-LoopEnd branch just creates unconditionally. Equivalent behavior, easier to read. 
--- .../scheduling/RegionExecutionCoordinator.scala | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala index f5b3ace265a..0f85c465b65 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala @@ -582,10 +582,15 @@ class RegionExecutionCoordinator( // Skip the create call when the document is already there. val isLoopEndRegion = region.getOperators.exists(_.id.logicalOpId.id.startsWith("LoopEnd-operator-")) - if (!isLoopEndRegion || !DocumentFactory.documentExists(resultURI)) { + if (isLoopEndRegion) { + if (!DocumentFactory.documentExists(resultURI)) { + DocumentFactory.createDocument(resultURI, schema) + } + if (!DocumentFactory.documentExists(stateURI)) { + DocumentFactory.createDocument(stateURI, State.schema) + } + } else { DocumentFactory.createDocument(resultURI, schema) - } - if (!isLoopEndRegion || !DocumentFactory.documentExists(stateURI)) { DocumentFactory.createDocument(stateURI, State.schema) } if (!isRestart) { From d0d35c20eb69225360ea234b16f3671e566995c1 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Fri, 15 May 2026 00:09:34 -0700 Subject: [PATCH 131/152] test(pyamber): remove test_output_manager.py --- .../packaging/test_output_manager.py | 127 ------------------ .../packaging/test_output_manager.py | 107 --------------- 2 files changed, 234 deletions(-) delete mode 100644 amber/src/main/python/core/architecture/packaging/test_output_manager.py delete mode 100644 amber/src/test/python/core/architecture/packaging/test_output_manager.py diff --git a/amber/src/main/python/core/architecture/packaging/test_output_manager.py 
b/amber/src/main/python/core/architecture/packaging/test_output_manager.py deleted file mode 100644 index 8f1daf8052e..00000000000 --- a/amber/src/main/python/core/architecture/packaging/test_output_manager.py +++ /dev/null @@ -1,127 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from unittest.mock import MagicMock, patch - -import pytest - -from core.architecture.packaging.output_manager import OutputManager -from core.models.state import State -from proto.org.apache.texera.amber.core import PortIdentity - - -class TestSaveStateToStorageIfNeeded: - @pytest.fixture - def output_manager(self): - return OutputManager(worker_id="Worker:WF0-test-main-0") - - @pytest.fixture - def port_a(self): - return PortIdentity(id=0, internal=False) - - @pytest.fixture - def port_b(self): - return PortIdentity(id=1, internal=False) - - @pytest.fixture - def state(self): - return State({"loop_counter": 1, "i": 2}) - - def test_no_state_writers_is_a_noop(self, output_manager, state): - # With no port set up, save_state_to_storage_if_needed must not - # touch any writer. 
- output_manager.save_state_to_storage_if_needed(state) # no-op, no exception - - def test_unknown_port_id_is_a_noop(self, output_manager, state, port_a): - output_manager.save_state_to_storage_if_needed(state, port_id=port_a) - # No assertion needed -- the absence of any writer means nothing - # was attempted. - - def test_writes_to_every_port_when_port_id_omitted( - self, output_manager, state, port_a, port_b - ): - writer_a = MagicMock() - writer_b = MagicMock() - output_manager._state_writers[port_a] = writer_a - output_manager._state_writers[port_b] = writer_b - - output_manager.save_state_to_storage_if_needed(state) - - writer_a.put_one.assert_called_once() - writer_b.put_one.assert_called_once() - # Long-lived writers must NOT be closed per state -- otherwise - # we'd be back to one Iceberg snapshot per state. - writer_a.close.assert_not_called() - writer_b.close.assert_not_called() - - def test_writes_only_to_selected_port_when_port_id_specified( - self, output_manager, state, port_a, port_b - ): - writer_a = MagicMock() - writer_b = MagicMock() - output_manager._state_writers[port_a] = writer_a - output_manager._state_writers[port_b] = writer_b - - output_manager.save_state_to_storage_if_needed(state, port_id=port_a) - - writer_a.put_one.assert_called_once() - writer_b.put_one.assert_not_called() - - def test_state_writer_is_opened_at_port_setup(self, output_manager, port_a): - # set_up_port_storage_writer should open the result document AND - # the state document, then cache the state writer for reuse. 
- result_doc = MagicMock() - state_doc = MagicMock() - state_writer = MagicMock() - state_doc.writer.return_value = state_writer - - with patch( - "core.architecture.packaging.output_manager.DocumentFactory" - ) as mock_factory: - mock_factory.open_document.side_effect = [ - (result_doc, MagicMock()), - (state_doc, MagicMock()), - ] - - output_manager.set_up_port_storage_writer( - port_a, "vfs:///wf/0/exec/0/result/op-a" - ) - - opened = [c.args[0] for c in mock_factory.open_document.call_args_list] - assert opened == [ - "vfs:///wf/0/exec/0/result/op-a", - "vfs:///wf/0/exec/0/state/op-a", - ] - state_writer.open.assert_called_once() - assert output_manager._state_writers[port_a] is state_writer - - def test_close_port_storage_writers_flushes_state_writers( - self, output_manager, port_a, port_b - ): - # After the port completes, the long-lived state writer's buffer - # must be flushed and the writer closed (one Iceberg commit per - # port instead of one per state). - writer_a = MagicMock() - writer_b = MagicMock() - output_manager._state_writers[port_a] = writer_a - output_manager._state_writers[port_b] = writer_b - - output_manager.close_port_storage_writers() - - writer_a.close.assert_called_once() - writer_b.close.assert_called_once() - assert output_manager._state_writers == {} diff --git a/amber/src/test/python/core/architecture/packaging/test_output_manager.py b/amber/src/test/python/core/architecture/packaging/test_output_manager.py deleted file mode 100644 index dcf7ccde673..00000000000 --- a/amber/src/test/python/core/architecture/packaging/test_output_manager.py +++ /dev/null @@ -1,107 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from unittest.mock import MagicMock - -import pytest - -from core.architecture.packaging.output_manager import OutputManager -from core.models.state import State -from core.storage.runnables.port_storage_writer import PortStorageWriterElement -from proto.org.apache.texera.amber.core import PortIdentity - - -def _stub_state_writer(output_manager, port_id): - """Inject a (queue, writer, thread) triple as if a port were set up.""" - queue = MagicMock() - writer = MagicMock() - thread = MagicMock() - output_manager._port_state_writers[port_id] = (queue, writer, thread) - return queue, writer, thread - - -class TestSaveStateToStorageIfNeeded: - @pytest.fixture - def output_manager(self): - return OutputManager(worker_id="Worker:WF0-test-main-0") - - @pytest.fixture - def port_a(self): - return PortIdentity(id=0, internal=False) - - @pytest.fixture - def port_b(self): - return PortIdentity(id=1, internal=False) - - @pytest.fixture - def state(self): - return State({"loop_counter": 1, "i": 2}) - - def test_no_state_writers_is_a_noop(self, output_manager, state): - # With no port set up, save_state_to_storage_if_needed must not - # touch any writer. 
- output_manager.save_state_to_storage_if_needed(state) # no-op - - def test_unknown_port_id_is_a_noop(self, output_manager, state, port_a): - output_manager.save_state_to_storage_if_needed(state, port_id=port_a) - # No assertion needed -- the absence of any writer means nothing - # was attempted. - - def test_enqueues_to_every_port_when_port_id_omitted( - self, output_manager, state, port_a, port_b - ): - queue_a, _, _ = _stub_state_writer(output_manager, port_a) - queue_b, _, _ = _stub_state_writer(output_manager, port_b) - - output_manager.save_state_to_storage_if_needed(state) - - # Each port's writer queue receives one PortStorageWriterElement. - # Critically, save is non-blocking -- the call must not invoke - # put_one / close on the buffered writer directly (those happen - # off-thread). - assert queue_a.put.call_count == 1 - assert queue_b.put.call_count == 1 - assert isinstance(queue_a.put.call_args.args[0], PortStorageWriterElement) - assert isinstance(queue_b.put.call_args.args[0], PortStorageWriterElement) - - def test_enqueues_only_to_selected_port_when_port_id_specified( - self, output_manager, state, port_a, port_b - ): - queue_a, _, _ = _stub_state_writer(output_manager, port_a) - queue_b, _, _ = _stub_state_writer(output_manager, port_b) - - output_manager.save_state_to_storage_if_needed(state, port_id=port_a) - - assert queue_a.put.call_count == 1 - queue_b.put.assert_not_called() - - def test_close_port_storage_writers_stops_state_threads( - self, output_manager, port_a, port_b - ): - # After the port completes, every state-writer thread must be - # stopped and joined so the buffered writer's close() (which - # flushes the final Iceberg commit) actually runs. 
- _, writer_a, thread_a = _stub_state_writer(output_manager, port_a) - _, writer_b, thread_b = _stub_state_writer(output_manager, port_b) - - output_manager.close_port_storage_writers() - - writer_a.stop.assert_called_once() - writer_b.stop.assert_called_once() - thread_a.join.assert_called_once() - thread_b.join.assert_called_once() - assert output_manager._port_state_writers == {} From 5a86b97ea1f5ae1f635edefe03593834acc6b64b Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Fri, 15 May 2026 00:10:11 -0700 Subject: [PATCH 132/152] test(pyamber): remove test_document_factory.py --- .../core/storage/test_document_factory.py | 134 ------------------ .../core/storage/test_document_factory.py | 134 ------------------ 2 files changed, 268 deletions(-) delete mode 100644 amber/src/main/python/core/storage/test_document_factory.py delete mode 100644 amber/src/test/python/core/storage/test_document_factory.py diff --git a/amber/src/main/python/core/storage/test_document_factory.py b/amber/src/main/python/core/storage/test_document_factory.py deleted file mode 100644 index 859c0040246..00000000000 --- a/amber/src/main/python/core/storage/test_document_factory.py +++ /dev/null @@ -1,134 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -from unittest.mock import MagicMock, patch - -import pytest - -from core.models import Schema -from core.storage.document_factory import DocumentFactory -from core.storage.storage_config import StorageConfig -from core.storage.vfs_uri_factory import VFSResourceType - - -# Avoid initializing the real config (only initializable once per process). -StorageConfig.ICEBERG_TABLE_RESULT_NAMESPACE = "test-result-ns" -StorageConfig.ICEBERG_TABLE_STATE_NAMESPACE = "test-state-ns" - -VFS_URI = "vfs:///wid/0/eid/0/opid/test/main/0/0/result" - - -@pytest.fixture -def schema(): - return Schema(raw_schema={"x": "INTEGER"}) - - -def _decode_returning(resource_type): - """Helper: build a VFSURIFactory.decode_uri side_effect.""" - return lambda _uri: (None, None, None, resource_type) - - -@patch("core.storage.document_factory.IcebergDocument") -@patch("core.storage.document_factory.amber_schema_to_iceberg_schema") -@patch("core.storage.document_factory.create_table") -@patch("core.storage.document_factory.IcebergCatalogInstance") -@patch("core.storage.document_factory.VFSURIFactory") -class TestCreateDocumentNamespaceRouting: - def test_state_resource_type_uses_state_namespace( - self, mock_vfs, _icb, mock_create_table, _amber_schema, _doc, schema - ): - mock_vfs.VFS_FILE_URI_SCHEME = "vfs" - mock_vfs.decode_uri.side_effect = _decode_returning(VFSResourceType.STATE) - - DocumentFactory.create_document(VFS_URI, schema) - - args, _ = mock_create_table.call_args - assert args[1] == StorageConfig.ICEBERG_TABLE_STATE_NAMESPACE - - def test_result_resource_type_uses_result_namespace( - self, mock_vfs, _icb, mock_create_table, _amber_schema, _doc, schema - ): - mock_vfs.VFS_FILE_URI_SCHEME = "vfs" - mock_vfs.decode_uri.side_effect = _decode_returning(VFSResourceType.RESULT) - - DocumentFactory.create_document(VFS_URI, schema) - - args, _ = mock_create_table.call_args - assert args[1] == StorageConfig.ICEBERG_TABLE_RESULT_NAMESPACE - - def 
test_unsupported_resource_type_raises_value_error( - self, mock_vfs, _icb, _create_table, _amber_schema, _doc, schema - ): - mock_vfs.VFS_FILE_URI_SCHEME = "vfs" - # CONSOLE_MESSAGES has no namespace mapping in the Python factory. - mock_vfs.decode_uri.side_effect = _decode_returning( - VFSResourceType.CONSOLE_MESSAGES - ) - - with pytest.raises(ValueError, match="not supported"): - DocumentFactory.create_document(VFS_URI, schema) - - -def test_create_document_rejects_non_vfs_scheme(schema): - with pytest.raises(NotImplementedError, match="Unsupported URI scheme"): - DocumentFactory.create_document("file:///tmp/x", schema) - - -@patch("core.storage.document_factory.IcebergDocument") -@patch("core.storage.document_factory.Schema") -@patch("core.storage.document_factory.load_table_metadata") -@patch("core.storage.document_factory.IcebergCatalogInstance") -@patch("core.storage.document_factory.VFSURIFactory") -class TestOpenDocumentNamespaceRouting: - @staticmethod - def _stub_table(): - table = MagicMock() - table.schema.return_value.as_arrow.return_value = MagicMock() - return table - - def test_state_resource_type_uses_state_namespace( - self, mock_vfs, _icb, mock_load, _schema_cls, _doc - ): - mock_vfs.VFS_FILE_URI_SCHEME = "vfs" - mock_vfs.decode_uri.side_effect = _decode_returning(VFSResourceType.STATE) - mock_load.return_value = self._stub_table() - - DocumentFactory.open_document(VFS_URI) - - args, _ = mock_load.call_args - assert args[1] == StorageConfig.ICEBERG_TABLE_STATE_NAMESPACE - - def test_unsupported_resource_type_raises_value_error( - self, mock_vfs, _icb, _load, _schema_cls, _doc - ): - mock_vfs.VFS_FILE_URI_SCHEME = "vfs" - mock_vfs.decode_uri.side_effect = _decode_returning( - VFSResourceType.CONSOLE_MESSAGES - ) - - with pytest.raises(ValueError, match="not supported"): - DocumentFactory.open_document(VFS_URI) - - def test_missing_table_raises_value_error( - self, mock_vfs, _icb, mock_load, _schema_cls, _doc - ): - mock_vfs.VFS_FILE_URI_SCHEME = 
"vfs" - mock_vfs.decode_uri.side_effect = _decode_returning(VFSResourceType.STATE) - mock_load.return_value = None - - with pytest.raises(ValueError, match="No storage is found"): - DocumentFactory.open_document(VFS_URI) diff --git a/amber/src/test/python/core/storage/test_document_factory.py b/amber/src/test/python/core/storage/test_document_factory.py deleted file mode 100644 index 859c0040246..00000000000 --- a/amber/src/test/python/core/storage/test_document_factory.py +++ /dev/null @@ -1,134 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from unittest.mock import MagicMock, patch - -import pytest - -from core.models import Schema -from core.storage.document_factory import DocumentFactory -from core.storage.storage_config import StorageConfig -from core.storage.vfs_uri_factory import VFSResourceType - - -# Avoid initializing the real config (only initializable once per process). 
-StorageConfig.ICEBERG_TABLE_RESULT_NAMESPACE = "test-result-ns" -StorageConfig.ICEBERG_TABLE_STATE_NAMESPACE = "test-state-ns" - -VFS_URI = "vfs:///wid/0/eid/0/opid/test/main/0/0/result" - - -@pytest.fixture -def schema(): - return Schema(raw_schema={"x": "INTEGER"}) - - -def _decode_returning(resource_type): - """Helper: build a VFSURIFactory.decode_uri side_effect.""" - return lambda _uri: (None, None, None, resource_type) - - -@patch("core.storage.document_factory.IcebergDocument") -@patch("core.storage.document_factory.amber_schema_to_iceberg_schema") -@patch("core.storage.document_factory.create_table") -@patch("core.storage.document_factory.IcebergCatalogInstance") -@patch("core.storage.document_factory.VFSURIFactory") -class TestCreateDocumentNamespaceRouting: - def test_state_resource_type_uses_state_namespace( - self, mock_vfs, _icb, mock_create_table, _amber_schema, _doc, schema - ): - mock_vfs.VFS_FILE_URI_SCHEME = "vfs" - mock_vfs.decode_uri.side_effect = _decode_returning(VFSResourceType.STATE) - - DocumentFactory.create_document(VFS_URI, schema) - - args, _ = mock_create_table.call_args - assert args[1] == StorageConfig.ICEBERG_TABLE_STATE_NAMESPACE - - def test_result_resource_type_uses_result_namespace( - self, mock_vfs, _icb, mock_create_table, _amber_schema, _doc, schema - ): - mock_vfs.VFS_FILE_URI_SCHEME = "vfs" - mock_vfs.decode_uri.side_effect = _decode_returning(VFSResourceType.RESULT) - - DocumentFactory.create_document(VFS_URI, schema) - - args, _ = mock_create_table.call_args - assert args[1] == StorageConfig.ICEBERG_TABLE_RESULT_NAMESPACE - - def test_unsupported_resource_type_raises_value_error( - self, mock_vfs, _icb, _create_table, _amber_schema, _doc, schema - ): - mock_vfs.VFS_FILE_URI_SCHEME = "vfs" - # CONSOLE_MESSAGES has no namespace mapping in the Python factory. 
- mock_vfs.decode_uri.side_effect = _decode_returning( - VFSResourceType.CONSOLE_MESSAGES - ) - - with pytest.raises(ValueError, match="not supported"): - DocumentFactory.create_document(VFS_URI, schema) - - -def test_create_document_rejects_non_vfs_scheme(schema): - with pytest.raises(NotImplementedError, match="Unsupported URI scheme"): - DocumentFactory.create_document("file:///tmp/x", schema) - - -@patch("core.storage.document_factory.IcebergDocument") -@patch("core.storage.document_factory.Schema") -@patch("core.storage.document_factory.load_table_metadata") -@patch("core.storage.document_factory.IcebergCatalogInstance") -@patch("core.storage.document_factory.VFSURIFactory") -class TestOpenDocumentNamespaceRouting: - @staticmethod - def _stub_table(): - table = MagicMock() - table.schema.return_value.as_arrow.return_value = MagicMock() - return table - - def test_state_resource_type_uses_state_namespace( - self, mock_vfs, _icb, mock_load, _schema_cls, _doc - ): - mock_vfs.VFS_FILE_URI_SCHEME = "vfs" - mock_vfs.decode_uri.side_effect = _decode_returning(VFSResourceType.STATE) - mock_load.return_value = self._stub_table() - - DocumentFactory.open_document(VFS_URI) - - args, _ = mock_load.call_args - assert args[1] == StorageConfig.ICEBERG_TABLE_STATE_NAMESPACE - - def test_unsupported_resource_type_raises_value_error( - self, mock_vfs, _icb, _load, _schema_cls, _doc - ): - mock_vfs.VFS_FILE_URI_SCHEME = "vfs" - mock_vfs.decode_uri.side_effect = _decode_returning( - VFSResourceType.CONSOLE_MESSAGES - ) - - with pytest.raises(ValueError, match="not supported"): - DocumentFactory.open_document(VFS_URI) - - def test_missing_table_raises_value_error( - self, mock_vfs, _icb, mock_load, _schema_cls, _doc - ): - mock_vfs.VFS_FILE_URI_SCHEME = "vfs" - mock_vfs.decode_uri.side_effect = _decode_returning(VFSResourceType.STATE) - mock_load.return_value = None - - with pytest.raises(ValueError, match="No storage is found"): - DocumentFactory.open_document(VFS_URI) From 
240f9e6d76d9af1bd13dbb78c821930d2ca29a3e Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Fri, 15 May 2026 00:12:11 -0700 Subject: [PATCH 133/152] test(pyamber): restore test_output_manager.py and test_document_factory.py from main --- .../packaging/test_output_manager.py | 107 ++++++++++++++ .../core/storage/test_document_factory.py | 134 ++++++++++++++++++ 2 files changed, 241 insertions(+) create mode 100644 amber/src/test/python/core/architecture/packaging/test_output_manager.py create mode 100644 amber/src/test/python/core/storage/test_document_factory.py diff --git a/amber/src/test/python/core/architecture/packaging/test_output_manager.py b/amber/src/test/python/core/architecture/packaging/test_output_manager.py new file mode 100644 index 00000000000..dcf7ccde673 --- /dev/null +++ b/amber/src/test/python/core/architecture/packaging/test_output_manager.py @@ -0,0 +1,107 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +from unittest.mock import MagicMock + +import pytest + +from core.architecture.packaging.output_manager import OutputManager +from core.models.state import State +from core.storage.runnables.port_storage_writer import PortStorageWriterElement +from proto.org.apache.texera.amber.core import PortIdentity + + +def _stub_state_writer(output_manager, port_id): + """Inject a (queue, writer, thread) triple as if a port were set up.""" + queue = MagicMock() + writer = MagicMock() + thread = MagicMock() + output_manager._port_state_writers[port_id] = (queue, writer, thread) + return queue, writer, thread + + +class TestSaveStateToStorageIfNeeded: + @pytest.fixture + def output_manager(self): + return OutputManager(worker_id="Worker:WF0-test-main-0") + + @pytest.fixture + def port_a(self): + return PortIdentity(id=0, internal=False) + + @pytest.fixture + def port_b(self): + return PortIdentity(id=1, internal=False) + + @pytest.fixture + def state(self): + return State({"loop_counter": 1, "i": 2}) + + def test_no_state_writers_is_a_noop(self, output_manager, state): + # With no port set up, save_state_to_storage_if_needed must not + # touch any writer. + output_manager.save_state_to_storage_if_needed(state) # no-op + + def test_unknown_port_id_is_a_noop(self, output_manager, state, port_a): + output_manager.save_state_to_storage_if_needed(state, port_id=port_a) + # No assertion needed -- the absence of any writer means nothing + # was attempted. + + def test_enqueues_to_every_port_when_port_id_omitted( + self, output_manager, state, port_a, port_b + ): + queue_a, _, _ = _stub_state_writer(output_manager, port_a) + queue_b, _, _ = _stub_state_writer(output_manager, port_b) + + output_manager.save_state_to_storage_if_needed(state) + + # Each port's writer queue receives one PortStorageWriterElement. + # Critically, save is non-blocking -- the call must not invoke + # put_one / close on the buffered writer directly (those happen + # off-thread). 
+ assert queue_a.put.call_count == 1 + assert queue_b.put.call_count == 1 + assert isinstance(queue_a.put.call_args.args[0], PortStorageWriterElement) + assert isinstance(queue_b.put.call_args.args[0], PortStorageWriterElement) + + def test_enqueues_only_to_selected_port_when_port_id_specified( + self, output_manager, state, port_a, port_b + ): + queue_a, _, _ = _stub_state_writer(output_manager, port_a) + queue_b, _, _ = _stub_state_writer(output_manager, port_b) + + output_manager.save_state_to_storage_if_needed(state, port_id=port_a) + + assert queue_a.put.call_count == 1 + queue_b.put.assert_not_called() + + def test_close_port_storage_writers_stops_state_threads( + self, output_manager, port_a, port_b + ): + # After the port completes, every state-writer thread must be + # stopped and joined so the buffered writer's close() (which + # flushes the final Iceberg commit) actually runs. + _, writer_a, thread_a = _stub_state_writer(output_manager, port_a) + _, writer_b, thread_b = _stub_state_writer(output_manager, port_b) + + output_manager.close_port_storage_writers() + + writer_a.stop.assert_called_once() + writer_b.stop.assert_called_once() + thread_a.join.assert_called_once() + thread_b.join.assert_called_once() + assert output_manager._port_state_writers == {} diff --git a/amber/src/test/python/core/storage/test_document_factory.py b/amber/src/test/python/core/storage/test_document_factory.py new file mode 100644 index 00000000000..859c0040246 --- /dev/null +++ b/amber/src/test/python/core/storage/test_document_factory.py @@ -0,0 +1,134 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from unittest.mock import MagicMock, patch + +import pytest + +from core.models import Schema +from core.storage.document_factory import DocumentFactory +from core.storage.storage_config import StorageConfig +from core.storage.vfs_uri_factory import VFSResourceType + + +# Avoid initializing the real config (only initializable once per process). +StorageConfig.ICEBERG_TABLE_RESULT_NAMESPACE = "test-result-ns" +StorageConfig.ICEBERG_TABLE_STATE_NAMESPACE = "test-state-ns" + +VFS_URI = "vfs:///wid/0/eid/0/opid/test/main/0/0/result" + + +@pytest.fixture +def schema(): + return Schema(raw_schema={"x": "INTEGER"}) + + +def _decode_returning(resource_type): + """Helper: build a VFSURIFactory.decode_uri side_effect.""" + return lambda _uri: (None, None, None, resource_type) + + +@patch("core.storage.document_factory.IcebergDocument") +@patch("core.storage.document_factory.amber_schema_to_iceberg_schema") +@patch("core.storage.document_factory.create_table") +@patch("core.storage.document_factory.IcebergCatalogInstance") +@patch("core.storage.document_factory.VFSURIFactory") +class TestCreateDocumentNamespaceRouting: + def test_state_resource_type_uses_state_namespace( + self, mock_vfs, _icb, mock_create_table, _amber_schema, _doc, schema + ): + mock_vfs.VFS_FILE_URI_SCHEME = "vfs" + mock_vfs.decode_uri.side_effect = _decode_returning(VFSResourceType.STATE) + + DocumentFactory.create_document(VFS_URI, schema) + + args, _ = mock_create_table.call_args + assert args[1] == StorageConfig.ICEBERG_TABLE_STATE_NAMESPACE + + def 
test_result_resource_type_uses_result_namespace( + self, mock_vfs, _icb, mock_create_table, _amber_schema, _doc, schema + ): + mock_vfs.VFS_FILE_URI_SCHEME = "vfs" + mock_vfs.decode_uri.side_effect = _decode_returning(VFSResourceType.RESULT) + + DocumentFactory.create_document(VFS_URI, schema) + + args, _ = mock_create_table.call_args + assert args[1] == StorageConfig.ICEBERG_TABLE_RESULT_NAMESPACE + + def test_unsupported_resource_type_raises_value_error( + self, mock_vfs, _icb, _create_table, _amber_schema, _doc, schema + ): + mock_vfs.VFS_FILE_URI_SCHEME = "vfs" + # CONSOLE_MESSAGES has no namespace mapping in the Python factory. + mock_vfs.decode_uri.side_effect = _decode_returning( + VFSResourceType.CONSOLE_MESSAGES + ) + + with pytest.raises(ValueError, match="not supported"): + DocumentFactory.create_document(VFS_URI, schema) + + +def test_create_document_rejects_non_vfs_scheme(schema): + with pytest.raises(NotImplementedError, match="Unsupported URI scheme"): + DocumentFactory.create_document("file:///tmp/x", schema) + + +@patch("core.storage.document_factory.IcebergDocument") +@patch("core.storage.document_factory.Schema") +@patch("core.storage.document_factory.load_table_metadata") +@patch("core.storage.document_factory.IcebergCatalogInstance") +@patch("core.storage.document_factory.VFSURIFactory") +class TestOpenDocumentNamespaceRouting: + @staticmethod + def _stub_table(): + table = MagicMock() + table.schema.return_value.as_arrow.return_value = MagicMock() + return table + + def test_state_resource_type_uses_state_namespace( + self, mock_vfs, _icb, mock_load, _schema_cls, _doc + ): + mock_vfs.VFS_FILE_URI_SCHEME = "vfs" + mock_vfs.decode_uri.side_effect = _decode_returning(VFSResourceType.STATE) + mock_load.return_value = self._stub_table() + + DocumentFactory.open_document(VFS_URI) + + args, _ = mock_load.call_args + assert args[1] == StorageConfig.ICEBERG_TABLE_STATE_NAMESPACE + + def test_unsupported_resource_type_raises_value_error( + self, 
mock_vfs, _icb, _load, _schema_cls, _doc + ): + mock_vfs.VFS_FILE_URI_SCHEME = "vfs" + mock_vfs.decode_uri.side_effect = _decode_returning( + VFSResourceType.CONSOLE_MESSAGES + ) + + with pytest.raises(ValueError, match="not supported"): + DocumentFactory.open_document(VFS_URI) + + def test_missing_table_raises_value_error( + self, mock_vfs, _icb, mock_load, _schema_cls, _doc + ): + mock_vfs.VFS_FILE_URI_SCHEME = "vfs" + mock_vfs.decode_uri.side_effect = _decode_returning(VFSResourceType.STATE) + mock_load.return_value = None + + with pytest.raises(ValueError, match="No storage is found"): + DocumentFactory.open_document(VFS_URI) From 19b0cd9fbb4206a101e4ed85781b81cf4782da70 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Fri, 15 May 2026 00:28:55 -0700 Subject: [PATCH 134/152] refactor: collapse LoopEnd guards back to per-URI condition --- .../scheduling/RegionExecutionCoordinator.scala | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala index 0f85c465b65..f5b3ace265a 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala @@ -582,15 +582,10 @@ class RegionExecutionCoordinator( // Skip the create call when the document is already there. 
val isLoopEndRegion = region.getOperators.exists(_.id.logicalOpId.id.startsWith("LoopEnd-operator-")) - if (isLoopEndRegion) { - if (!DocumentFactory.documentExists(resultURI)) { - DocumentFactory.createDocument(resultURI, schema) - } - if (!DocumentFactory.documentExists(stateURI)) { - DocumentFactory.createDocument(stateURI, State.schema) - } - } else { + if (!isLoopEndRegion || !DocumentFactory.documentExists(resultURI)) { DocumentFactory.createDocument(resultURI, schema) + } + if (!isLoopEndRegion || !DocumentFactory.documentExists(stateURI)) { DocumentFactory.createDocument(stateURI, State.schema) } if (!isRestart) { From 7aeb5ee7fac2cfd85f62ca3802d4a5ea25ec96ce Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Fri, 15 May 2026 17:43:41 -0700 Subject: [PATCH 135/152] feat(storage): add DocumentFactory.documentExists for absence checks Enables "create only if absent" flows to test existence without catching exceptions from openDocument. Splits off from #4206. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../amber/core/storage/DocumentFactory.scala | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/DocumentFactory.scala b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/DocumentFactory.scala index 00f6c70ba73..8356493ed23 100644 --- a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/DocumentFactory.scala +++ b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/DocumentFactory.scala @@ -103,6 +103,40 @@ object DocumentFactory { } } + /** + * Check whether a document exists at the given URI without opening it. + * + * Returns true iff the underlying storage already has an entry for this + * URI (e.g., an iceberg table at the resolved namespace + storage key). + * Useful for "create only if absent" flows that would otherwise have to + * call `openDocument` inside a try/catch to test existence. 
+ */ + def documentExists(uri: URI): Boolean = { + uri.getScheme match { + case VFS_FILE_URI_SCHEME => + val (_, _, _, resourceType) = decodeURI(uri) + val storageKey = sanitizeURIPath(uri) + + val namespace = resourceType match { + case RESULT => StorageConfig.icebergTableResultNamespace + case CONSOLE_MESSAGES => StorageConfig.icebergTableConsoleMessagesNamespace + case RUNTIME_STATISTICS => StorageConfig.icebergTableRuntimeStatisticsNamespace + case STATE => StorageConfig.icebergTableStateNamespace + case _ => + throw new IllegalArgumentException(s"Resource type $resourceType is not supported") + } + + IcebergUtil + .loadTableMetadata(IcebergCatalogInstance.getInstance(), namespace, storageKey) + .isDefined + + case unsupportedScheme => + throw new UnsupportedOperationException( + s"Unsupported URI scheme: $unsupportedScheme for checking the document" + ) + } + } + /** * Open a document specified by the uri. * If the document is storing structural data, the schema will also be returned From 3ba674d2db2d3a5920ac3f3bbf5bda71bc62d58d Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Fri, 15 May 2026 17:45:14 -0700 Subject: [PATCH 136/152] chore: remove DocumentFactory.documentExists hunk (now in #5085) The documentExists helper is being landed separately in #5085. This branch keeps the callers in RegionExecutionCoordinator that depend on it, so this PR now depends on #5085 merging first. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../amber/core/storage/DocumentFactory.scala | 34 ------------------- 1 file changed, 34 deletions(-) diff --git a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/DocumentFactory.scala b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/DocumentFactory.scala index 8356493ed23..00f6c70ba73 100644 --- a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/DocumentFactory.scala +++ b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/DocumentFactory.scala @@ -103,40 +103,6 @@ object DocumentFactory { } } - /** - * Check whether a document exists at the given URI without opening it. - * - * Returns true iff the underlying storage already has an entry for this - * URI (e.g., an iceberg table at the resolved namespace + storage key). - * Useful for "create only if absent" flows that would otherwise have to - * call `openDocument` inside a try/catch to test existence. - */ - def documentExists(uri: URI): Boolean = { - uri.getScheme match { - case VFS_FILE_URI_SCHEME => - val (_, _, _, resourceType) = decodeURI(uri) - val storageKey = sanitizeURIPath(uri) - - val namespace = resourceType match { - case RESULT => StorageConfig.icebergTableResultNamespace - case CONSOLE_MESSAGES => StorageConfig.icebergTableConsoleMessagesNamespace - case RUNTIME_STATISTICS => StorageConfig.icebergTableRuntimeStatisticsNamespace - case STATE => StorageConfig.icebergTableStateNamespace - case _ => - throw new IllegalArgumentException(s"Resource type $resourceType is not supported") - } - - IcebergUtil - .loadTableMetadata(IcebergCatalogInstance.getInstance(), namespace, storageKey) - .isDefined - - case unsupportedScheme => - throw new UnsupportedOperationException( - s"Unsupported URI scheme: $unsupportedScheme for checking the document" - ) - } - } - /** * Open a document specified by the uri. 
* If the document is storing structural data, the schema will also be returned From d760f989fa65f180650ba104479937925df86f6a Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Fri, 15 May 2026 17:47:05 -0700 Subject: [PATCH 137/152] docs: drop try/catch rationale from documentExists comment Co-Authored-By: Claude Opus 4.7 (1M context) --- .../org/apache/texera/amber/core/storage/DocumentFactory.scala | 2 -- 1 file changed, 2 deletions(-) diff --git a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/DocumentFactory.scala b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/DocumentFactory.scala index 8356493ed23..4dcfb7a4838 100644 --- a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/DocumentFactory.scala +++ b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/DocumentFactory.scala @@ -108,8 +108,6 @@ object DocumentFactory { * * Returns true iff the underlying storage already has an entry for this * URI (e.g., an iceberg table at the resolved namespace + storage key). - * Useful for "create only if absent" flows that would otherwise have to - * call `openDocument` inside a try/catch to test existence. */ def documentExists(uri: URI): Boolean = { uri.getScheme match { From ceff249ee82e72360565a2bda108dbcaa0d609ae Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Fri, 15 May 2026 17:56:43 -0700 Subject: [PATCH 138/152] refactor(storage): address PR review on documentExists MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Extract resolveNamespace helper so createDocument, openDocument, and documentExists share one resourceType → namespace mapping. - Use catalog.tableExists via IcebergUtil instead of loadTableMetadata so transient catalog errors surface instead of becoming false negatives. - Tweak the unsupported-scheme message to mention "checking document existence" rather than "checking the document". 
- Add IcebergDocumentSpec cases for existing/fresh URIs and unsupported schemes. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../amber/core/storage/DocumentFactory.scala | 47 ++++++------------- .../texera/amber/util/IcebergUtil.scala | 7 +++ .../result/iceberg/IcebergDocumentSpec.scala | 25 ++++++++++ 3 files changed, 47 insertions(+), 32 deletions(-) diff --git a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/DocumentFactory.scala b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/DocumentFactory.scala index 4dcfb7a4838..81f83e58d1e 100644 --- a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/DocumentFactory.scala +++ b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/DocumentFactory.scala @@ -39,6 +39,16 @@ object DocumentFactory { private def sanitizeURIPath(uri: URI): String = uri.getPath.stripPrefix("/").replace("/", "_") + private def resolveNamespace(resourceType: VFSResourceType.Value): String = + resourceType match { + case RESULT => StorageConfig.icebergTableResultNamespace + case CONSOLE_MESSAGES => StorageConfig.icebergTableConsoleMessagesNamespace + case RUNTIME_STATISTICS => StorageConfig.icebergTableRuntimeStatisticsNamespace + case STATE => StorageConfig.icebergTableStateNamespace + case _ => + throw new IllegalArgumentException(s"Resource type $resourceType is not supported") + } + /** * Open a document specified by the uri for read purposes only. 
* @param fileUri the uri of the document @@ -67,15 +77,7 @@ object DocumentFactory { case VFS_FILE_URI_SCHEME => val (_, _, _, resourceType) = decodeURI(uri) val storageKey = sanitizeURIPath(uri) - - val namespace = resourceType match { - case RESULT => StorageConfig.icebergTableResultNamespace - case CONSOLE_MESSAGES => StorageConfig.icebergTableConsoleMessagesNamespace - case RUNTIME_STATISTICS => StorageConfig.icebergTableRuntimeStatisticsNamespace - case STATE => StorageConfig.icebergTableStateNamespace - case _ => - throw new IllegalArgumentException(s"Resource type $resourceType is not supported") - } + val namespace = resolveNamespace(resourceType) val icebergSchema = IcebergUtil.toIcebergSchema(schema) IcebergUtil.createTable( @@ -114,23 +116,12 @@ object DocumentFactory { case VFS_FILE_URI_SCHEME => val (_, _, _, resourceType) = decodeURI(uri) val storageKey = sanitizeURIPath(uri) - - val namespace = resourceType match { - case RESULT => StorageConfig.icebergTableResultNamespace - case CONSOLE_MESSAGES => StorageConfig.icebergTableConsoleMessagesNamespace - case RUNTIME_STATISTICS => StorageConfig.icebergTableRuntimeStatisticsNamespace - case STATE => StorageConfig.icebergTableStateNamespace - case _ => - throw new IllegalArgumentException(s"Resource type $resourceType is not supported") - } - - IcebergUtil - .loadTableMetadata(IcebergCatalogInstance.getInstance(), namespace, storageKey) - .isDefined + val namespace = resolveNamespace(resourceType) + IcebergUtil.tableExists(IcebergCatalogInstance.getInstance(), namespace, storageKey) case unsupportedScheme => throw new UnsupportedOperationException( - s"Unsupported URI scheme: $unsupportedScheme for checking the document" + s"Unsupported URI scheme: $unsupportedScheme for checking document existence" ) } } @@ -147,15 +138,7 @@ object DocumentFactory { case VFS_FILE_URI_SCHEME => val (_, _, _, resourceType) = decodeURI(uri) val storageKey = sanitizeURIPath(uri) - - val namespace = resourceType match { - 
case RESULT => StorageConfig.icebergTableResultNamespace - case CONSOLE_MESSAGES => StorageConfig.icebergTableConsoleMessagesNamespace - case RUNTIME_STATISTICS => StorageConfig.icebergTableRuntimeStatisticsNamespace - case STATE => StorageConfig.icebergTableStateNamespace - case _ => - throw new IllegalArgumentException(s"Resource type $resourceType is not supported") - } + val namespace = resolveNamespace(resourceType) val table = IcebergUtil .loadTableMetadata( diff --git a/common/workflow-core/src/main/scala/org/apache/texera/amber/util/IcebergUtil.scala b/common/workflow-core/src/main/scala/org/apache/texera/amber/util/IcebergUtil.scala index d6e406a53fc..c7b5c889ca4 100644 --- a/common/workflow-core/src/main/scala/org/apache/texera/amber/util/IcebergUtil.scala +++ b/common/workflow-core/src/main/scala/org/apache/texera/amber/util/IcebergUtil.scala @@ -227,6 +227,13 @@ object IcebergUtil { } } + def tableExists( + catalog: Catalog, + tableNamespace: String, + tableName: String + ): Boolean = + catalog.tableExists(TableIdentifier.of(tableNamespace, tableName)) + /** * Converts a custom Amber `Schema` to an Iceberg `Schema`. * Field names are encoded to preserve LARGE_BINARY type information. 
diff --git a/common/workflow-core/src/test/scala/org/apache/texera/amber/storage/result/iceberg/IcebergDocumentSpec.scala b/common/workflow-core/src/test/scala/org/apache/texera/amber/storage/result/iceberg/IcebergDocumentSpec.scala index b92562eeb77..fcfbc6fb5b8 100644 --- a/common/workflow-core/src/test/scala/org/apache/texera/amber/storage/result/iceberg/IcebergDocumentSpec.scala +++ b/common/workflow-core/src/test/scala/org/apache/texera/amber/storage/result/iceberg/IcebergDocumentSpec.scala @@ -144,6 +144,31 @@ class IcebergDocumentSpec extends VirtualDocumentSpec[Tuple] with BeforeAndAfter } } + it should "report documentExists=true for a created URI and false for a fresh one" in { + assert(DocumentFactory.documentExists(uri)) + + val freshBase = VFSURIFactory.createPortBaseURI( + WorkflowIdentity(0), + ExecutionIdentity(0), + GlobalPortIdentity( + PhysicalOpIdentity( + logicalOpId = + OperatorIdentity(s"fresh-${UUID.randomUUID().toString.replace("-", "")}"), + layerName = "main" + ), + PortIdentity() + ) + ) + val freshUri = VFSURIFactory.resultURI(freshBase) + assert(!DocumentFactory.documentExists(freshUri)) + } + + it should "throw UnsupportedOperationException for documentExists on an unsupported scheme" in { + intercept[UnsupportedOperationException] { + DocumentFactory.documentExists(new URI("file:///tmp/anything")) + } + } + it should "round trip materialized state documents" in { val stateUri = VFSURIFactory.stateURI(baseURI) DocumentFactory.createDocument(stateUri, State.schema) From 771f0ebce2f90b14534cc97255f84d1bfeeeb999 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Fri, 15 May 2026 18:03:02 -0700 Subject: [PATCH 139/152] refactor(storage): inline catalog.tableExists in documentExists Drops the single-use IcebergUtil.tableExists wrapper and calls catalog.tableExists directly via TableIdentifier. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../apache/texera/amber/core/storage/DocumentFactory.scala | 5 ++++- .../scala/org/apache/texera/amber/util/IcebergUtil.scala | 7 ------- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/DocumentFactory.scala b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/DocumentFactory.scala index 81f83e58d1e..7ec0c04e60f 100644 --- a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/DocumentFactory.scala +++ b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/DocumentFactory.scala @@ -27,6 +27,7 @@ import org.apache.texera.amber.core.storage.model._ import org.apache.texera.amber.core.storage.result.iceberg.IcebergDocument import org.apache.texera.amber.core.tuple.{Schema, Tuple} import org.apache.texera.amber.util.IcebergUtil +import org.apache.iceberg.catalog.TableIdentifier import org.apache.iceberg.data.Record import org.apache.iceberg.{Schema => IcebergSchema} @@ -117,7 +118,9 @@ object DocumentFactory { val (_, _, _, resourceType) = decodeURI(uri) val storageKey = sanitizeURIPath(uri) val namespace = resolveNamespace(resourceType) - IcebergUtil.tableExists(IcebergCatalogInstance.getInstance(), namespace, storageKey) + IcebergCatalogInstance + .getInstance() + .tableExists(TableIdentifier.of(namespace, storageKey)) case unsupportedScheme => throw new UnsupportedOperationException( diff --git a/common/workflow-core/src/main/scala/org/apache/texera/amber/util/IcebergUtil.scala b/common/workflow-core/src/main/scala/org/apache/texera/amber/util/IcebergUtil.scala index c7b5c889ca4..d6e406a53fc 100644 --- a/common/workflow-core/src/main/scala/org/apache/texera/amber/util/IcebergUtil.scala +++ b/common/workflow-core/src/main/scala/org/apache/texera/amber/util/IcebergUtil.scala @@ -227,13 +227,6 @@ object IcebergUtil { } } - def tableExists( - catalog: Catalog, - 
tableNamespace: String, - tableName: String - ): Boolean = - catalog.tableExists(TableIdentifier.of(tableNamespace, tableName)) - /** * Converts a custom Amber `Schema` to an Iceberg `Schema`. * Field names are encoded to preserve LARGE_BINARY type information. From 8c02359c7a7ce81d1af8bdab158a401489dcf985 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Fri, 15 May 2026 18:07:50 -0700 Subject: [PATCH 140/152] feat(storage,python): add DocumentFactory.document_exists Mirrors the Scala documentExists helper: - Extract _resolve_namespace so create_document, open_document, and document_exists share one resource_type -> namespace mapping. - document_exists calls catalog.table_exists directly so transient catalog errors surface instead of becoming false negatives. - Add unit tests covering true/false catalog responses, unsupported resource type, and unsupported URI scheme. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../python/core/storage/document_factory.py | 51 ++++++++++++------- .../core/storage/test_document_factory.py | 38 ++++++++++++++ 2 files changed, 71 insertions(+), 18 deletions(-) diff --git a/amber/src/main/python/core/storage/document_factory.py b/amber/src/main/python/core/storage/document_factory.py index bd690ceb592..2e536d8f2ec 100644 --- a/amber/src/main/python/core/storage/document_factory.py +++ b/amber/src/main/python/core/storage/document_factory.py @@ -43,6 +43,16 @@ class DocumentFactory: ICEBERG = "iceberg" + @staticmethod + def _resolve_namespace(resource_type: VFSResourceType) -> str: + match resource_type: + case VFSResourceType.RESULT: + return StorageConfig.ICEBERG_TABLE_RESULT_NAMESPACE + case VFSResourceType.STATE: + return StorageConfig.ICEBERG_TABLE_STATE_NAMESPACE + case _: + raise ValueError(f"Resource type {resource_type} is not supported") + @staticmethod def sanitize_uri_path(uri): """ @@ -60,15 +70,7 @@ def create_document(uri: str, schema: Schema) -> VirtualDocument: parsed_uri = urlparse(uri) if parsed_uri.scheme == 
VFSURIFactory.VFS_FILE_URI_SCHEME: _, _, _, resource_type = VFSURIFactory.decode_uri(uri) - - match resource_type: - case VFSResourceType.RESULT: - namespace = StorageConfig.ICEBERG_TABLE_RESULT_NAMESPACE - case VFSResourceType.STATE: - namespace = StorageConfig.ICEBERG_TABLE_STATE_NAMESPACE - case _: - raise ValueError(f"Resource type {resource_type} is not supported") - + namespace = DocumentFactory._resolve_namespace(resource_type) storage_key = DocumentFactory.sanitize_uri_path(parsed_uri) # Convert Amber Schema to Iceberg Schema with LARGE_BINARY # field name encoding @@ -96,19 +98,32 @@ def create_document(uri: str, schema: Schema) -> VirtualDocument: ) @staticmethod - def open_document(uri: str) -> typing.Tuple[VirtualDocument, Optional[Schema]]: + def document_exists(uri: str) -> bool: + """ + Check whether a document exists at the given URI without opening it. + + Returns True iff the underlying storage already has an entry for this + URI (e.g., an iceberg table at the resolved namespace + storage key). 
+ """ parsed_uri = urlparse(uri) - if parsed_uri.scheme == "vfs": + if parsed_uri.scheme == VFSURIFactory.VFS_FILE_URI_SCHEME: _, _, _, resource_type = VFSURIFactory.decode_uri(uri) + namespace = DocumentFactory._resolve_namespace(resource_type) + storage_key = DocumentFactory.sanitize_uri_path(parsed_uri) + return IcebergCatalogInstance.get_instance().table_exists( + f"{namespace}.{storage_key}" + ) - match resource_type: - case VFSResourceType.RESULT: - namespace = StorageConfig.ICEBERG_TABLE_RESULT_NAMESPACE - case VFSResourceType.STATE: - namespace = StorageConfig.ICEBERG_TABLE_STATE_NAMESPACE - case _: - raise ValueError(f"Resource type {resource_type} is not supported") + raise NotImplementedError( + f"Unsupported URI scheme: {parsed_uri.scheme} for checking document existence" + ) + @staticmethod + def open_document(uri: str) -> typing.Tuple[VirtualDocument, Optional[Schema]]: + parsed_uri = urlparse(uri) + if parsed_uri.scheme == "vfs": + _, _, _, resource_type = VFSURIFactory.decode_uri(uri) + namespace = DocumentFactory._resolve_namespace(resource_type) storage_key = DocumentFactory.sanitize_uri_path(parsed_uri) table = load_table_metadata( diff --git a/amber/src/test/python/core/storage/test_document_factory.py b/amber/src/test/python/core/storage/test_document_factory.py index 859c0040246..359b0c0aed9 100644 --- a/amber/src/test/python/core/storage/test_document_factory.py +++ b/amber/src/test/python/core/storage/test_document_factory.py @@ -132,3 +132,41 @@ def test_missing_table_raises_value_error( with pytest.raises(ValueError, match="No storage is found"): DocumentFactory.open_document(VFS_URI) + + +@patch("core.storage.document_factory.IcebergCatalogInstance") +@patch("core.storage.document_factory.VFSURIFactory") +class TestDocumentExists: + def test_returns_true_when_table_exists(self, mock_vfs, mock_icb): + mock_vfs.VFS_FILE_URI_SCHEME = "vfs" + mock_vfs.decode_uri.side_effect = _decode_returning(VFSResourceType.RESULT) + catalog = MagicMock() + 
catalog.table_exists.return_value = True + mock_icb.get_instance.return_value = catalog + + assert DocumentFactory.document_exists(VFS_URI) is True + identifier = catalog.table_exists.call_args.args[0] + assert identifier.startswith(f"{StorageConfig.ICEBERG_TABLE_RESULT_NAMESPACE}.") + + def test_returns_false_when_table_missing(self, mock_vfs, mock_icb): + mock_vfs.VFS_FILE_URI_SCHEME = "vfs" + mock_vfs.decode_uri.side_effect = _decode_returning(VFSResourceType.RESULT) + catalog = MagicMock() + catalog.table_exists.return_value = False + mock_icb.get_instance.return_value = catalog + + assert DocumentFactory.document_exists(VFS_URI) is False + + def test_unsupported_resource_type_raises_value_error(self, mock_vfs, _icb): + mock_vfs.VFS_FILE_URI_SCHEME = "vfs" + mock_vfs.decode_uri.side_effect = _decode_returning( + VFSResourceType.CONSOLE_MESSAGES + ) + + with pytest.raises(ValueError, match="not supported"): + DocumentFactory.document_exists(VFS_URI) + + +def test_document_exists_rejects_non_vfs_scheme(): + with pytest.raises(NotImplementedError, match="Unsupported URI scheme"): + DocumentFactory.document_exists("file:///tmp/x") From 641a8df83269057aa476b8bc46182d3c511cc7ac Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Fri, 15 May 2026 18:13:24 -0700 Subject: [PATCH 141/152] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> Signed-off-by: Xinyuan Lin --- amber/src/main/python/core/storage/document_factory.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/amber/src/main/python/core/storage/document_factory.py b/amber/src/main/python/core/storage/document_factory.py index 2e536d8f2ec..45bbffeb164 100644 --- a/amber/src/main/python/core/storage/document_factory.py +++ b/amber/src/main/python/core/storage/document_factory.py @@ -121,7 +121,7 @@ def document_exists(uri: str) -> bool: @staticmethod def open_document(uri: str) -> typing.Tuple[VirtualDocument, 
Optional[Schema]]: parsed_uri = urlparse(uri) - if parsed_uri.scheme == "vfs": + if parsed_uri.scheme == VFSURIFactory.VFS_FILE_URI_SCHEME: _, _, _, resource_type = VFSURIFactory.decode_uri(uri) namespace = DocumentFactory._resolve_namespace(resource_type) storage_key = DocumentFactory.sanitize_uri_path(parsed_uri) From 7b174eb860cfc853ba39b977cf259a9016e1e72b Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Fri, 15 May 2026 18:35:27 -0700 Subject: [PATCH 142/152] style: scalafmt IcebergDocumentSpec Co-Authored-By: Claude Opus 4.7 (1M context) --- .../amber/storage/result/iceberg/IcebergDocumentSpec.scala | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/common/workflow-core/src/test/scala/org/apache/texera/amber/storage/result/iceberg/IcebergDocumentSpec.scala b/common/workflow-core/src/test/scala/org/apache/texera/amber/storage/result/iceberg/IcebergDocumentSpec.scala index fcfbc6fb5b8..6801a859dc8 100644 --- a/common/workflow-core/src/test/scala/org/apache/texera/amber/storage/result/iceberg/IcebergDocumentSpec.scala +++ b/common/workflow-core/src/test/scala/org/apache/texera/amber/storage/result/iceberg/IcebergDocumentSpec.scala @@ -152,8 +152,7 @@ class IcebergDocumentSpec extends VirtualDocumentSpec[Tuple] with BeforeAndAfter ExecutionIdentity(0), GlobalPortIdentity( PhysicalOpIdentity( - logicalOpId = - OperatorIdentity(s"fresh-${UUID.randomUUID().toString.replace("-", "")}"), + logicalOpId = OperatorIdentity(s"fresh-${UUID.randomUUID().toString.replace("-", "")}"), layerName = "main" ), PortIdentity() From caf7ce476f6184b387071eb5c4ef515c55e2ee9d Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Fri, 15 May 2026 20:24:29 -0700 Subject: [PATCH 143/152] refactor(loop): drop loop_start_id() and inline state cleanup at the jump site MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit LoopEndOperator.loop_start_id() returned a state value but also had two hidden `del` side effects — a surprising shape for an 
_id() accessor. The cleanup logic was split across two locations (loop_start_id deleted table/output; _jump_to_loop_start deleted LoopStartId/LoopStartStateURI), so the strip-then-write step wasn't obvious in either place. Inline the four-key strip into _jump_to_loop_start so all the metadata trimming happens right next to the iceberg write that needs it, and the operator class stays purely declarative. --- amber/src/main/python/core/models/operator.py | 5 ----- amber/src/main/python/core/runnables/main_loop.py | 9 ++++++--- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/amber/src/main/python/core/models/operator.py b/amber/src/main/python/core/models/operator.py index c68bce09bf3..d3b9b590f1f 100644 --- a/amber/src/main/python/core/models/operator.py +++ b/amber/src/main/python/core/models/operator.py @@ -318,8 +318,3 @@ def process_table(self, table: Table, port: int) -> Iterator[Optional[TableLike] @abstractmethod def condition(self) -> None: pass - - def loop_start_id(self) -> str: - del self.state["table"] - del self.state["output"] - return self.state["LoopStartId"] diff --git a/amber/src/main/python/core/runnables/main_loop.py b/amber/src/main/python/core/runnables/main_loop.py index e2486616dbc..0e0bbac1cda 100644 --- a/amber/src/main/python/core/runnables/main_loop.py +++ b/amber/src/main/python/core/runnables/main_loop.py @@ -106,11 +106,14 @@ def _jump_to_loop_start( self, executor: LoopEndOperator, controller_interface ) -> None: controller_interface.jump_to_operator_region( - JumpToOperatorRegionRequest(OperatorIdentity(executor.loop_start_id())) + JumpToOperatorRegionRequest(OperatorIdentity(executor.state["LoopStartId"])) ) uri = executor.state["LoopStartStateURI"] - del executor.state["LoopStartStateURI"] - del executor.state["LoopStartId"] + # Strip the per-iteration scratch (`table`, `output`) and the + # loop metadata (`LoopStartId`, `LoopStartStateURI`) so only the + # user-visible loop state is written back to LoopStart's input. 
+ for key in ("table", "output", "LoopStartId", "LoopStartStateURI"): + executor.state.pop(key, None) writer = DocumentFactory.create_document(uri, State.SCHEMA).writer("0") writer.put_one(State(executor.state).to_tuple()) writer.close() From 5d893dc7a1013f0aabb5fb89e3315483cc9942ae Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Fri, 15 May 2026 21:18:14 -0700 Subject: [PATCH 144/152] refactor(loop): drop generatePythonCode try/catch and inline get_input_port_base_uri generatePythonCode is a pure string interpolation with no failure modes, so the broad try/catch in LoopStartOpDesc and LoopEndOpDesc just defers any future code-gen bug from compile time to a Python syntax error at the worker. Drop the wrappers. InputManager.get_input_port_base_uri had one caller; inline it using the existing public get_input_port_mat_reader_threads accessor. --- .../core/architecture/packaging/input_manager.py | 8 -------- .../src/main/python/core/runnables/main_loop.py | 16 +++++++++++----- .../amber/operator/loop/LoopEndOpDesc.scala | 12 ++---------- .../amber/operator/loop/LoopStartOpDesc.scala | 12 ++---------- 4 files changed, 15 insertions(+), 33 deletions(-) diff --git a/amber/src/main/python/core/architecture/packaging/input_manager.py b/amber/src/main/python/core/architecture/packaging/input_manager.py index e9aae2127a2..6cb6bdc08c4 100644 --- a/amber/src/main/python/core/architecture/packaging/input_manager.py +++ b/amber/src/main/python/core/architecture/packaging/input_manager.py @@ -173,11 +173,3 @@ def _process_data(self, table: Table) -> Iterator[Tuple]: yield Tuple( {name: field_accessor for name in table.column_names}, schema=schema ) - - def get_input_port_base_uri(self): - """Return the port base URI of the first materialization reader. - - Use `VFSURIFactory.result_uri(...)` / `state_uri(...)` on the - returned value to get the actual result / state document URI. 
- """ - return next(iter(self._input_port_mat_reader_runnables.values()))[0].uri diff --git a/amber/src/main/python/core/runnables/main_loop.py b/amber/src/main/python/core/runnables/main_loop.py index 0e0bbac1cda..64e9f44381d 100644 --- a/amber/src/main/python/core/runnables/main_loop.py +++ b/amber/src/main/python/core/runnables/main_loop.py @@ -98,24 +98,30 @@ def _attach_loop_start_id(self, output_state: State) -> None: output_state["LoopStartId"] = self.context.worker_id.split("-", 1)[1].rsplit( "-main-0", 1 )[0] + # The URI lives on the upstream operator's output port (which + # LoopStart's first materialization reader is reading from). + reader_runnables = ( + self.context.input_manager.get_input_port_mat_reader_threads() + ) output_state["LoopStartStateURI"] = VFSURIFactory.state_uri( - self.context.input_manager.get_input_port_base_uri() + next(iter(reader_runnables.values()))[0].uri ) def _jump_to_loop_start( self, executor: LoopEndOperator, controller_interface ) -> None: + state = executor.state controller_interface.jump_to_operator_region( - JumpToOperatorRegionRequest(OperatorIdentity(executor.state["LoopStartId"])) + JumpToOperatorRegionRequest(OperatorIdentity(state["LoopStartId"])) ) - uri = executor.state["LoopStartStateURI"] + uri = state["LoopStartStateURI"] # Strip the per-iteration scratch (`table`, `output`) and the # loop metadata (`LoopStartId`, `LoopStartStateURI`) so only the # user-visible loop state is written back to LoopStart's input. 
for key in ("table", "output", "LoopStartId", "LoopStartStateURI"): - executor.state.pop(key, None) + state.pop(key, None) writer = DocumentFactory.create_document(uri, State.SCHEMA).writer("0") - writer.put_one(State(executor.state).to_tuple()) + writer.put_one(State(state).to_tuple()) writer.close() def complete(self) -> None: diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/loop/LoopEndOpDesc.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/loop/LoopEndOpDesc.scala index cb911b3d369..cf095b8f2a4 100644 --- a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/loop/LoopEndOpDesc.scala +++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/loop/LoopEndOpDesc.scala @@ -39,26 +39,18 @@ class LoopEndOpDesc extends LogicalOp { override def getPhysicalOp( workflowId: WorkflowIdentity, executionId: ExecutionIdentity - ): PhysicalOp = { - val pythonCode = - try { - generatePythonCode() - } catch { - case ex: Throwable => - s"#EXCEPTION DURING CODE GENERATION: ${ex.getMessage}" - } + ): PhysicalOp = PhysicalOp .oneToOnePhysicalOp( workflowId, executionId, operatorIdentifier, - OpExecWithCode(pythonCode, "python") + OpExecWithCode(generatePythonCode(), "python") ) .withInputPorts(operatorInfo.inputPorts) .withOutputPorts(operatorInfo.outputPorts) .withSuggestedWorkerNum(1) .withParallelizable(false) - } override def operatorInfo: OperatorInfo = OperatorInfo( diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/loop/LoopStartOpDesc.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/loop/LoopStartOpDesc.scala index baf1f4f4092..ebdee746d04 100644 --- a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/loop/LoopStartOpDesc.scala +++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/loop/LoopStartOpDesc.scala @@ -39,26 +39,18 @@ class LoopStartOpDesc 
extends LogicalOp { override def getPhysicalOp( workflowId: WorkflowIdentity, executionId: ExecutionIdentity - ): PhysicalOp = { - val pythonCode = - try { - generatePythonCode() - } catch { - case ex: Throwable => - s"#EXCEPTION DURING CODE GENERATION: ${ex.getMessage}" - } + ): PhysicalOp = PhysicalOp .oneToOnePhysicalOp( workflowId, executionId, operatorIdentifier, - OpExecWithCode(pythonCode, "python") + OpExecWithCode(generatePythonCode(), "python") ) .withInputPorts(operatorInfo.inputPorts) .withOutputPorts(operatorInfo.outputPorts) .withSuggestedWorkerNum(1) .withParallelizable(false) - } override def operatorInfo: OperatorInfo = OperatorInfo( From a22b95e176c48ac537a1cf291bc599388fb80c1c Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Fri, 15 May 2026 22:18:50 -0700 Subject: [PATCH 145/152] refactor(loop): small cleanups in process_input_state, condition, and OperatorInfo - process_input_state: collapse the two mutually-exclusive isinstance branches into if/elif over a single bound `executor`. No behavior change; saves one attribute lookup and makes mutual exclusivity explicit. - LoopEndOperator.condition: annotate return as `bool` (was `None`). Callers and the generated subclass already treat it as a boolean. - Loop{Start,End}OpDesc: replace the duplicated userFriendlyName / description with a real one-line description so the UI tooltip isn't redundant. 
--- amber/src/main/python/core/models/operator.py | 2 +- amber/src/main/python/core/runnables/main_loop.py | 5 +++-- .../apache/texera/amber/operator/loop/LoopEndOpDesc.scala | 2 +- .../apache/texera/amber/operator/loop/LoopStartOpDesc.scala | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/amber/src/main/python/core/models/operator.py b/amber/src/main/python/core/models/operator.py index d3b9b590f1f..b51e284c55b 100644 --- a/amber/src/main/python/core/models/operator.py +++ b/amber/src/main/python/core/models/operator.py @@ -316,5 +316,5 @@ def process_table(self, table: Table, port: int) -> Iterator[Optional[TableLike] yield table @abstractmethod - def condition(self) -> None: + def condition(self) -> bool: pass diff --git a/amber/src/main/python/core/runnables/main_loop.py b/amber/src/main/python/core/runnables/main_loop.py index 64e9f44381d..e32b489a2eb 100644 --- a/amber/src/main/python/core/runnables/main_loop.py +++ b/amber/src/main/python/core/runnables/main_loop.py @@ -233,9 +233,10 @@ def process_input_state(self) -> None: self._switch_context() output_state = self.context.state_processing_manager.get_output_state() if output_state is not None: - if isinstance(self.context.executor_manager.executor, LoopEndOperator): + executor = self.context.executor_manager.executor + if isinstance(executor, LoopEndOperator): self.context.output_manager.reset_loopend_storage() - if isinstance(self.context.executor_manager.executor, LoopStartOperator): + elif isinstance(executor, LoopStartOperator): self._attach_loop_start_id(output_state) for to, batch in self.context.output_manager.emit_state(output_state): self._output_queue.put( diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/loop/LoopEndOpDesc.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/loop/LoopEndOpDesc.scala index cf095b8f2a4..d4247009f64 100644 --- 
a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/loop/LoopEndOpDesc.scala +++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/loop/LoopEndOpDesc.scala @@ -55,7 +55,7 @@ class LoopEndOpDesc extends LogicalOp { override def operatorInfo: OperatorInfo = OperatorInfo( "Loop End", - "Loop End", + "Close a loop body and decide whether to iterate again based on a condition; pairs with Loop Start.", OperatorGroupConstants.CONTROL_GROUP, inputPorts = List(InputPort()), outputPorts = List(OutputPort()) diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/loop/LoopStartOpDesc.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/loop/LoopStartOpDesc.scala index ebdee746d04..c00c99cc0e6 100644 --- a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/loop/LoopStartOpDesc.scala +++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/loop/LoopStartOpDesc.scala @@ -55,7 +55,7 @@ class LoopStartOpDesc extends LogicalOp { override def operatorInfo: OperatorInfo = OperatorInfo( "Loop Start", - "Loop Start", + "Begin a loop that iterates over rows of the input table; pairs with Loop End.", OperatorGroupConstants.CONTROL_GROUP, inputPorts = List(InputPort()), outputPorts = List(OutputPort()) From 4a48559bc806d37b14ad2a93e2c2727f50a4f51d Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Fri, 15 May 2026 22:46:51 -0700 Subject: [PATCH 146/152] refactor(loop): mark LoopEnd via PhysicalOp.isLoopEnd, not op-id prefix RegionExecutionCoordinator detected LoopEnd regions by matching on the auto-generated op-id prefix ("LoopEnd-operator-"). That coupling silently breaks the moment the op-id format changes or a user names a non-LoopEnd op with the same prefix. Add `isLoopEnd: Boolean = false` to PhysicalOp (with a small `withIsLoopEnd` builder), set it from LoopEndOpDesc, and check it directly in the scheduler. 
Matches the type-based pattern already used in WorkflowExecutionService for LoopStart. --- .../scheduling/RegionExecutionCoordinator.scala | 3 +-- .../apache/texera/amber/core/workflow/PhysicalOp.scala | 9 +++++++++ .../texera/amber/operator/loop/LoopEndOpDesc.scala | 1 + 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala index f5b3ace265a..6d083fa6d6b 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala @@ -580,8 +580,7 @@ class RegionExecutionCoordinator( // subsequent iterations the result/state documents already exist, // and `createDocument` (overrideIfExists=true) would clobber them. // Skip the create call when the document is already there. 
- val isLoopEndRegion = - region.getOperators.exists(_.id.logicalOpId.id.startsWith("LoopEnd-operator-")) + val isLoopEndRegion = region.getOperators.exists(_.isLoopEnd) if (!isLoopEndRegion || !DocumentFactory.documentExists(resultURI)) { DocumentFactory.createDocument(resultURI, schema) } diff --git a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/workflow/PhysicalOp.scala b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/workflow/PhysicalOp.scala index 494fd6f76dc..cc53e621ada 100644 --- a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/workflow/PhysicalOp.scala +++ b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/workflow/PhysicalOp.scala @@ -198,6 +198,7 @@ case class PhysicalOp( // schema propagation function propagateSchema: SchemaPropagationFunc = SchemaPropagationFunc(schemas => schemas), isOneToManyOp: Boolean = false, + isLoopEnd: Boolean = false, // hint for number of workers suggestedWorkerNum: Option[Int] = None ) extends LazyLogging { @@ -314,6 +315,14 @@ case class PhysicalOp( def withIsOneToManyOp(isOneToManyOp: Boolean): PhysicalOp = this.copy(isOneToManyOp = isOneToManyOp) + /** + * Creates a copy marked as a LoopEnd operator. Used by the region + * scheduler to preserve this operator's iceberg output across loop + * iterations instead of overwriting it on every region invocation. + */ + def withIsLoopEnd(isLoopEnd: Boolean): PhysicalOp = + this.copy(isLoopEnd = isLoopEnd) + /** * Creates a copy of the PhysicalOp with the schema of a specified input port updated. * The schema can either be a successful schema definition or an error represented as a Throwable. 
diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/loop/LoopEndOpDesc.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/loop/LoopEndOpDesc.scala index d4247009f64..004c9651d98 100644 --- a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/loop/LoopEndOpDesc.scala +++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/loop/LoopEndOpDesc.scala @@ -51,6 +51,7 @@ class LoopEndOpDesc extends LogicalOp { .withOutputPorts(operatorInfo.outputPorts) .withSuggestedWorkerNum(1) .withParallelizable(false) + .withIsLoopEnd(true) override def operatorInfo: OperatorInfo = OperatorInfo( From 245fe4bc1e6b6a6cf40a644679b6336300745c0e Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Fri, 15 May 2026 23:47:40 -0700 Subject: [PATCH 147/152] refactor(loop): match condition() annotation in generated code with base class --- .../org/apache/texera/amber/operator/loop/LoopEndOpDesc.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/loop/LoopEndOpDesc.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/loop/LoopEndOpDesc.scala index 004c9651d98..db4af48fc90 100644 --- a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/loop/LoopEndOpDesc.scala +++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/loop/LoopEndOpDesc.scala @@ -79,7 +79,7 @@ class LoopEndOpDesc extends LogicalOp { | return None | | @overrides - | def condition(self) -> None: + | def condition(self) -> bool: | exec("output = $condition", {}, self.state) | return self.state["output"] |""".stripMargin From 8db3f750fbdeaedfc53bdf721c04ff84835d868d Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Sat, 16 May 2026 01:39:34 -0700 Subject: [PATCH 148/152] refactor(loop): collapse per-port storage URI dict to single field LoopEnd has exactly one output port, 
so OutputManager only ever stored one entry in _storage_uris. Replace the dict with a single Optional[str] field. --- .../core/architecture/packaging/output_manager.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/amber/src/main/python/core/architecture/packaging/output_manager.py b/amber/src/main/python/core/architecture/packaging/output_manager.py index 343c40a9770..2744e854a13 100644 --- a/amber/src/main/python/core/architecture/packaging/output_manager.py +++ b/amber/src/main/python/core/architecture/packaging/output_manager.py @@ -92,11 +92,10 @@ def __init__(self, worker_id: str): PortIdentity, typing.Tuple[Queue, PortStorageWriter, Thread] ] = dict() - # Track the port base URI per output port so loop-end operators can - # recreate the storage documents on each loop iteration via - # `reset_loopend_storage`. Without this, the reset path has no way - # to look up which iceberg tables to drop and re-provision. - self._storage_uris: typing.Dict[PortIdentity, str] = dict() + # Loop-end operators have a single output port; remember its base + # URI so `reset_loopend_storage` can re-provision the iceberg + # tables on each loop iteration. + self._storage_uri_base: typing.Optional[str] = None def is_missing_output_ports(self): """ @@ -140,8 +139,8 @@ def set_up_port_storage_writer(self, port_id: PortIdentity, storage_uri_base: st port's base URI; the result and state URIs are derived from it. """ # Remember the base URI so `reset_loopend_storage` can re-provision - # this port's iceberg tables on subsequent loop iterations. - self._storage_uris[port_id] = storage_uri_base + # the iceberg tables on subsequent loop iterations. 
+ self._storage_uri_base = storage_uri_base document, _ = DocumentFactory.open_document( VFSURIFactory.result_uri(storage_uri_base) ) @@ -228,7 +227,7 @@ def save_state_to_storage_if_needed(self, state: State, port_id=None) -> None: def reset_loopend_storage(self) -> None: port_id = self.get_port_ids()[0] - storage_uri_base = self._storage_uris[port_id] + storage_uri_base = self._storage_uri_base self.close_port_storage_writers() DocumentFactory.create_document( VFSURIFactory.result_uri(storage_uri_base), From 71ca457f5e83e7cd6329b7b0fdb16d6f580bc2a1 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Sat, 16 May 2026 01:45:07 -0700 Subject: [PATCH 149/152] refactor(loop): stash storage_uri_base on PortStorageWriter Remove OutputManager._storage_uri_base by carrying the base URI on the result-port PortStorageWriter itself. reset_loopend_storage now reads the URI from the writer it is about to close, so OutputManager no longer holds any loop-specific bookkeeping field. --- .../architecture/packaging/output_manager.py | 19 +++++++++---------- .../storage/runnables/port_storage_writer.py | 13 ++++++++++++- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/amber/src/main/python/core/architecture/packaging/output_manager.py b/amber/src/main/python/core/architecture/packaging/output_manager.py index 2744e854a13..e6dd1e7f5be 100644 --- a/amber/src/main/python/core/architecture/packaging/output_manager.py +++ b/amber/src/main/python/core/architecture/packaging/output_manager.py @@ -92,11 +92,6 @@ def __init__(self, worker_id: str): PortIdentity, typing.Tuple[Queue, PortStorageWriter, Thread] ] = dict() - # Loop-end operators have a single output port; remember its base - # URI so `reset_loopend_storage` can re-provision the iceberg - # tables on each loop iteration. - self._storage_uri_base: typing.Optional[str] = None - def is_missing_output_ports(self): """ This method is only used for ensuring correct region execution. 
@@ -138,16 +133,19 @@ def set_up_port_storage_writer(self, port_id: PortIdentity, storage_uri_base: st state materialization on the same port. `storage_uri_base` is the port's base URI; the result and state URIs are derived from it. """ - # Remember the base URI so `reset_loopend_storage` can re-provision - # the iceberg tables on subsequent loop iterations. - self._storage_uri_base = storage_uri_base document, _ = DocumentFactory.open_document( VFSURIFactory.result_uri(storage_uri_base) ) buffered_item_writer = document.writer(str(get_worker_index(self.worker_id))) writer_queue = Queue() + # Stash the base URI on the result-port writer so + # `reset_loopend_storage` can re-provision the iceberg tables on + # subsequent loop iterations without OutputManager having to + # remember it separately. port_storage_writer = PortStorageWriter( - buffered_item_writer=buffered_item_writer, queue=writer_queue + buffered_item_writer=buffered_item_writer, + queue=writer_queue, + storage_uri_base=storage_uri_base, ) writer_thread = threading.Thread( target=port_storage_writer.run, @@ -227,7 +225,8 @@ def save_state_to_storage_if_needed(self, state: State, port_id=None) -> None: def reset_loopend_storage(self) -> None: port_id = self.get_port_ids()[0] - storage_uri_base = self._storage_uri_base + _, result_writer, _ = self._port_storage_writers[port_id] + storage_uri_base = result_writer.storage_uri_base self.close_port_storage_writers() DocumentFactory.create_document( VFSURIFactory.result_uri(storage_uri_base), diff --git a/amber/src/main/python/core/storage/runnables/port_storage_writer.py b/amber/src/main/python/core/storage/runnables/port_storage_writer.py index 5a026162526..23629bdbf6c 100644 --- a/amber/src/main/python/core/storage/runnables/port_storage_writer.py +++ b/amber/src/main/python/core/storage/runnables/port_storage_writer.py @@ -16,6 +16,8 @@ # under the License. 
from dataclasses import dataclass +from typing import Optional + from overrides import overrides from core.models import Tuple @@ -30,9 +32,18 @@ class PortStorageWriterElement(QueueElement): class PortStorageWriter(StoppableQueueBlockingRunnable): - def __init__(self, buffered_item_writer: BufferedItemWriter, queue: IQueue): + def __init__( + self, + buffered_item_writer: BufferedItemWriter, + queue: IQueue, + storage_uri_base: Optional[str] = None, + ): super().__init__(name=self.__class__.__name__, queue=queue) self.buffered_item_writer: BufferedItemWriter = buffered_item_writer + # The VFS base URI this writer was opened from. Only the result-port + # writer needs to remember this so loop-end can rebuild the iceberg + # tables on each iteration; for other writers it stays None. + self.storage_uri_base: Optional[str] = storage_uri_base @overrides def receive(self, next_entry: QueueElement) -> None: From 0d3125daa7f8d47c21795f63f4b4aeec340fdb8f Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Sat, 16 May 2026 01:45:49 -0700 Subject: [PATCH 150/152] Revert "refactor(loop): stash storage_uri_base on PortStorageWriter" This reverts commit 71ca457f5e83e7cd6329b7b0fdb16d6f580bc2a1. 
--- .../architecture/packaging/output_manager.py | 19 ++++++++++--------- .../storage/runnables/port_storage_writer.py | 13 +------------ 2 files changed, 11 insertions(+), 21 deletions(-) diff --git a/amber/src/main/python/core/architecture/packaging/output_manager.py b/amber/src/main/python/core/architecture/packaging/output_manager.py index e6dd1e7f5be..2744e854a13 100644 --- a/amber/src/main/python/core/architecture/packaging/output_manager.py +++ b/amber/src/main/python/core/architecture/packaging/output_manager.py @@ -92,6 +92,11 @@ def __init__(self, worker_id: str): PortIdentity, typing.Tuple[Queue, PortStorageWriter, Thread] ] = dict() + # Loop-end operators have a single output port; remember its base + # URI so `reset_loopend_storage` can re-provision the iceberg + # tables on each loop iteration. + self._storage_uri_base: typing.Optional[str] = None + def is_missing_output_ports(self): """ This method is only used for ensuring correct region execution. @@ -133,19 +138,16 @@ def set_up_port_storage_writer(self, port_id: PortIdentity, storage_uri_base: st state materialization on the same port. `storage_uri_base` is the port's base URI; the result and state URIs are derived from it. """ + # Remember the base URI so `reset_loopend_storage` can re-provision + # the iceberg tables on subsequent loop iterations. + self._storage_uri_base = storage_uri_base document, _ = DocumentFactory.open_document( VFSURIFactory.result_uri(storage_uri_base) ) buffered_item_writer = document.writer(str(get_worker_index(self.worker_id))) writer_queue = Queue() - # Stash the base URI on the result-port writer so - # `reset_loopend_storage` can re-provision the iceberg tables on - # subsequent loop iterations without OutputManager having to - # remember it separately. 
port_storage_writer = PortStorageWriter( - buffered_item_writer=buffered_item_writer, - queue=writer_queue, - storage_uri_base=storage_uri_base, + buffered_item_writer=buffered_item_writer, queue=writer_queue ) writer_thread = threading.Thread( target=port_storage_writer.run, @@ -225,8 +227,7 @@ def save_state_to_storage_if_needed(self, state: State, port_id=None) -> None: def reset_loopend_storage(self) -> None: port_id = self.get_port_ids()[0] - _, result_writer, _ = self._port_storage_writers[port_id] - storage_uri_base = result_writer.storage_uri_base + storage_uri_base = self._storage_uri_base self.close_port_storage_writers() DocumentFactory.create_document( VFSURIFactory.result_uri(storage_uri_base), diff --git a/amber/src/main/python/core/storage/runnables/port_storage_writer.py b/amber/src/main/python/core/storage/runnables/port_storage_writer.py index 23629bdbf6c..5a026162526 100644 --- a/amber/src/main/python/core/storage/runnables/port_storage_writer.py +++ b/amber/src/main/python/core/storage/runnables/port_storage_writer.py @@ -16,8 +16,6 @@ # under the License. from dataclasses import dataclass -from typing import Optional - from overrides import overrides from core.models import Tuple @@ -32,18 +30,9 @@ class PortStorageWriterElement(QueueElement): class PortStorageWriter(StoppableQueueBlockingRunnable): - def __init__( - self, - buffered_item_writer: BufferedItemWriter, - queue: IQueue, - storage_uri_base: Optional[str] = None, - ): + def __init__(self, buffered_item_writer: BufferedItemWriter, queue: IQueue): super().__init__(name=self.__class__.__name__, queue=queue) self.buffered_item_writer: BufferedItemWriter = buffered_item_writer - # The VFS base URI this writer was opened from. Only the result-port - # writer needs to remember this so loop-end can rebuild the iceberg - # tables on each iteration; for other writers it stays None. 
- self.storage_uri_base: Optional[str] = storage_uri_base @overrides def receive(self, next_entry: QueueElement) -> None: From 833056d8f072d39d9347c0550a743e43187afc6e Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Sat, 16 May 2026 01:53:56 -0700 Subject: [PATCH 151/152] fix fmt --- .../main/python/core/architecture/packaging/output_manager.py | 2 +- amber/src/main/python/core/runnables/main_loop.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/amber/src/main/python/core/architecture/packaging/output_manager.py b/amber/src/main/python/core/architecture/packaging/output_manager.py index 2744e854a13..7ef0ca804a2 100644 --- a/amber/src/main/python/core/architecture/packaging/output_manager.py +++ b/amber/src/main/python/core/architecture/packaging/output_manager.py @@ -225,7 +225,7 @@ def save_state_to_storage_if_needed(self, state: State, port_id=None) -> None: elif port_id in self._port_state_writers: self._port_state_writers[port_id][0].put(element) - def reset_loopend_storage(self) -> None: + def reset_storage(self) -> None: port_id = self.get_port_ids()[0] storage_uri_base = self._storage_uri_base self.close_port_storage_writers() diff --git a/amber/src/main/python/core/runnables/main_loop.py b/amber/src/main/python/core/runnables/main_loop.py index e32b489a2eb..faa57b2c31b 100644 --- a/amber/src/main/python/core/runnables/main_loop.py +++ b/amber/src/main/python/core/runnables/main_loop.py @@ -235,7 +235,7 @@ def process_input_state(self) -> None: if output_state is not None: executor = self.context.executor_manager.executor if isinstance(executor, LoopEndOperator): - self.context.output_manager.reset_loopend_storage() + self.context.output_manager.reset_storage() elif isinstance(executor, LoopStartOperator): self._attach_loop_start_id(output_state) for to, batch in self.context.output_manager.emit_state(output_state): From ca12ad0f3cbfaf31e09383f0d5696495c2d91310 Mon Sep 17 00:00:00 2001 From: Xinyuan Lin Date: Wed, 20 May 2026 14:45:21 
-0700 Subject: [PATCH 152/152] test(loop): cover LoopStart/LoopEnd codegen + flat/nested runtime behavior Adds three spec files exercising the loop PR's surface: * LoopStartOpDescSpec: pins operatorInfo, the generated Python code (init / output expression sites, LoopStartOperator subclassing), and PhysicalOp flags (non-parallelizable, single worker, not isLoopEnd). * LoopEndOpDescSpec: pins operatorInfo, the generated Python code (update / condition sites, condition() returning bool, the nested-loop loop_counter > 0 pass-through branch, the matching-loop pickle round-trip), and PhysicalOp flags including isLoopEnd=true. * test_loop_operators.py: drives stub subclasses that mirror the generated runtime classes through both single-loop scenarios (first-time merge, produce_state_on_finish pickling, matching branch update/condition) and nested-loop scenarios (loop_counter increments on LoopStart re-entry; decrements on LoopEnd pass-through; depth-symmetric round trip across outer x inner loop). --- .../python/core/models/test_loop_operators.py | 424 ++++++++++++++++++ .../operator/loop/LoopEndOpDescSpec.scala | 152 +++++++ .../operator/loop/LoopStartOpDescSpec.scala | 121 +++++ 3 files changed, 697 insertions(+) create mode 100644 amber/src/test/python/core/models/test_loop_operators.py create mode 100644 common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/loop/LoopEndOpDescSpec.scala create mode 100644 common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/loop/LoopStartOpDescSpec.scala diff --git a/amber/src/test/python/core/models/test_loop_operators.py b/amber/src/test/python/core/models/test_loop_operators.py new file mode 100644 index 00000000000..8fee19f09f2 --- /dev/null +++ b/amber/src/test/python/core/models/test_loop_operators.py @@ -0,0 +1,424 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements.
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Unit tests for the loop runtime: LoopStartOperator and LoopEndOperator. + +These exercise the abstract base classes in operator.py that the +generated `ProcessLoopStartOperator` / `ProcessLoopEndOperator` classes +extend. The tests use minimal stub subclasses that mirror what +`LoopStartOpDesc.generatePythonCode` / `LoopEndOpDesc.generatePythonCode` +emit so the behavior covered here is the same shape that ships at +runtime. + +Single-loop coverage: + - LoopStart's first-time state observation (merge into self.state). + - LoopEnd's process_table is the identity yield. + - End-to-end one-iteration loop driven through the matching-loop branch. + +Nested-loop coverage: + - LoopStart.process_state with `LoopStartStateURI` already present + must increment `loop_counter` and pass the state through downstream + (this is what makes inner LoopStart not consume outer-loop state). + - LoopEnd's generated process_state, when `loop_counter > 0`, must + decrement and return the state unchanged so the outer LoopEnd is + the one that runs the user's update / condition. + - Round-trip outer × inner loop preserves the nesting invariant + (loop_counter is symmetric across LoopStart/LoopEnd traversals). 
+""" + +from pickle import loads +from typing import Iterator, Optional + +import pytest + +from core.models import State, Table, TableLike, Tuple +from core.models.operator import LoopEndOperator, LoopStartOperator + + +# --------------------------------------------------------------------------- +# Stub subclasses that mirror the generated Python in +# LoopStart/LoopEnd OpDesc. Keeping them here (rather than reusing the +# real generator) lets the test pin behavior without spinning up a Scala +# runtime to produce code. +# --------------------------------------------------------------------------- + + +class _StubLoopStart(LoopStartOperator): + """Mirrors `ProcessLoopStartOperator` from LoopStartOpDesc codegen. + + open() seeds `loop_counter` to 0 and runs the user's `initialization`. + process_table runs the user's `output` expression and yields the + result for downstream. + """ + + def __init__(self, initialization="i = 0", output_expr="table.iloc[i]"): + super().__init__() + self._initialization = initialization + self._output_expr = output_expr + + def open(self) -> None: + self.state = {"loop_counter": 0} + exec(self._initialization, {}, self.state) + + def process_table(self, table: Table, port: int) -> Iterator[Optional[TableLike]]: + self.state["table"] = table + exec(f"output = {self._output_expr}", {}, self.state) + yield self.state["output"] + + +class _StubLoopEnd(LoopEndOperator): + """Mirrors `ProcessLoopEndOperator` from LoopEndOpDesc codegen. + + process_state recognises the nested-loop pass-through path + (`loop_counter > 0`) and decrements; on the matching-loop branch + it stashes the state, deserializes the pickled table, and runs the + user's `update`. condition() returns the boolean result of the + user's `condition` expression evaluated in self.state. 
+ """ + + def __init__(self, update="i += 1", condition_expr="i < 3"): + super().__init__() + self._update = update + self._condition_expr = condition_expr + self.state = {} + + def process_state(self, state: State, port: int) -> Optional[State]: + loop_counter = int(state.get("loop_counter", 0)) + if loop_counter > 0: + state["loop_counter"] = loop_counter - 1 + return state + self.state = dict(state) + self.state["table"] = loads(self.state["table"]) + exec(self._update, {}, self.state) + return None + + def condition(self) -> bool: + exec(f"output = {self._condition_expr}", {}, self.state) + return self.state["output"] + + +# --------------------------------------------------------------------------- +# LoopStartOperator — process_state +# --------------------------------------------------------------------------- + + +class TestLoopStartProcessState: + def test_first_time_state_is_merged_into_self_state_and_none_is_returned(self): + # First entry: state from upstream (no LoopStartStateURI). The + # base class must merge it into self.state and return None so + # nothing flows downstream of LoopStart until the table is in. + op = _StubLoopStart() + op.open() + op.state["i"] = 0 # simulate the user's initialization + + result = op.process_state(State({"upstream_key": "v"}), port=0) + + assert result is None, "first-time state must not be forwarded" + assert op.state["upstream_key"] == "v", "state was not merged into self.state" + # loop_counter is left at its open() value (0) on first entry. + assert op.state["loop_counter"] == 0 + + def test_reentry_state_with_loop_start_uri_increments_loop_counter(self): + # Re-entry from this LoopStart's own LoopEnd: the state carries + # LoopStartStateURI, so the base class must INCREMENT + # loop_counter and PASS THROUGH the state downstream. This is + # what main_loop's _attach_loop_start_id relies on. 
+ op = _StubLoopStart() + op.open() + incoming = State({"LoopStartStateURI": "vfs:///x", "loop_counter": 0, "i": 2}) + + result = op.process_state(incoming, port=0) + + assert result is not None, "re-entry state must be returned for downstream" + assert result["loop_counter"] == 1 + # The user variable rides along. + assert result["i"] == 2 + + def test_reentry_at_nested_loop_counter_bumps_one(self): + # Nested loop: an outer loop's re-entry state passes through this + # inner LoopStart with a loop_counter already > 0 (because the + # outer LoopStart bumped it on its own re-entry first). The + # invariant is that we only ever +1, never reset. + op = _StubLoopStart() + op.open() + incoming = State({"LoopStartStateURI": "vfs:///outer", "loop_counter": 5}) + + result = op.process_state(incoming, port=0) + + assert result["loop_counter"] == 6 + + +# --------------------------------------------------------------------------- +# LoopStartOperator — produce_state_on_finish +# --------------------------------------------------------------------------- + + +class TestLoopStartProduceStateOnFinish: + def test_pickles_buffered_table_into_state_table_field(self): + # produce_state_on_finish must serialize the buffered table via + # pickle (so the cross-region state stream can carry a heavy + # pandas DataFrame as bytes). The receiving LoopEnd unpickles + # it on the matching-loop branch. + op = _StubLoopStart() + op.open() + # Drive a couple of tuples through to populate the per-port buffer. + list(op.process_tuple(Tuple({"v": 1}), port=0)) + list(op.process_tuple(Tuple({"v": 2}), port=0)) + + produced = op.produce_state_on_finish(port=0) + + assert isinstance(produced, dict) + assert "table" in produced + assert isinstance(produced["table"], bytes), "table must be pickled bytes" + # Round-trip through pickle.loads must give back our two tuples. 
+ unpickled = loads(produced["table"]) + assert isinstance(unpickled, Table) + rows = list(unpickled.as_tuples()) + assert rows == [Tuple({"v": 1}), Tuple({"v": 2})] + + def test_user_state_fields_survive_into_produced_state(self): + # Any vars the user set in open() (e.g. i, accumulators) must + # ride along in the produced state so LoopEnd can run the user's + # `update` expression against them. + op = _StubLoopStart(initialization="i = 0; acc = []") + op.open() + list(op.process_tuple(Tuple({"v": 1}), port=0)) + + produced = op.produce_state_on_finish(port=0) + + assert produced["i"] == 0 + assert produced["acc"] == [] + assert produced["loop_counter"] == 0 + + +# --------------------------------------------------------------------------- +# LoopEndOperator — base class behaviour +# --------------------------------------------------------------------------- + + +class TestLoopEndBase: + def test_process_table_yields_input_table_unchanged(self): + # The base class finalizes process_table to a single identity + # yield. The user only ever overrides condition() and (via + # codegen) process_state. + op = _StubLoopEnd() + in_table = Table([Tuple({"x": 1}), Tuple({"x": 2})]) + out = list(op.process_table(in_table, port=0)) + assert out == [in_table] + + def test_condition_is_abstract_on_base_class(self): + # A class that extends LoopEndOperator without supplying + # condition() must be uninstantiable. This is what stops a + # user from shipping a loop with an empty exit condition. 
+ class _Missing(LoopEndOperator): + pass + + with pytest.raises(TypeError, match="condition"): + _Missing() + + +# --------------------------------------------------------------------------- +# Generated-style LoopEnd — single-loop matching branch +# --------------------------------------------------------------------------- + + +class TestLoopEndMatchingBranch: + def test_loop_counter_zero_runs_user_update_and_returns_none(self): + # The matching-loop branch (loop_counter == 0) is where the user's + # update expression runs. process_state must return None so no + # state flows downstream; the actual loop-back is driven by + # main_loop.complete() reading executor.state. + op = _StubLoopEnd(update="i += 1") + # Simulate LoopStart's produced state arriving here. + from pickle import dumps + + incoming = State( + { + "loop_counter": 0, + "i": 2, + "table": dumps(Table([Tuple({"v": 1})])), + "LoopStartId": "outer-loop", + "LoopStartStateURI": "vfs:///outer", + } + ) + + result = op.process_state(incoming, port=0) + + assert result is None, "matching-loop branch must not emit state downstream" + assert op.state["i"] == 3, "user's update did not run on the matching branch" + # The table is unpickled in-place so condition() can see it as + # a real Table without a second round of deserialization. + assert isinstance(op.state["table"], Table) + # Loop metadata is preserved so _jump_to_loop_start can read it. + assert op.state["LoopStartId"] == "outer-loop" + assert op.state["LoopStartStateURI"] == "vfs:///outer" + + def test_condition_evaluates_user_expression_against_stashed_state(self): + op = _StubLoopEnd(update="i += 1", condition_expr="i < 3") + from pickle import dumps + + # Drive process_state once so self.state is populated. + op.process_state( + State( + { + "loop_counter": 0, + "i": 1, + "table": dumps(Table([Tuple({"v": 1})])), + } + ), + port=0, + ) + assert op.condition() is True # i became 2, 2 < 3 + + # Run another iteration to push i past the threshold. 
+ op.process_state( + State( + { + "loop_counter": 0, + "i": 2, + "table": dumps(Table([Tuple({"v": 1})])), + } + ), + port=0, + ) + assert op.condition() is False # i became 3, 3 < 3 is False + + +# --------------------------------------------------------------------------- +# Nested loops — LoopEnd pass-through branch +# --------------------------------------------------------------------------- + + +class TestLoopEndNestedPassThrough: + def test_loop_counter_positive_decrements_and_passes_state_through(self): + # When the inner LoopEnd receives state with loop_counter > 0, + # the state belongs to an OUTER loop. The inner LoopEnd must + # decrement loop_counter and return the state for downstream + # routing (which eventually reaches the outer LoopEnd at + # loop_counter == 0). + op = _StubLoopEnd(update="i += 1") + op.state = {"sentinel": "must_not_be_overwritten"} + + incoming = State({"loop_counter": 2, "outer_var": "v"}) + result = op.process_state(incoming, port=0) + + assert result is not None, "pass-through branch must emit state downstream" + assert result["loop_counter"] == 1 + assert result["outer_var"] == "v" + # The pass-through branch must NOT overwrite self.state — the + # inner LoopEnd's own matching-loop state from a previous inner + # iteration must be preserved. + assert op.state == {"sentinel": "must_not_be_overwritten"} + + def test_pass_through_chain_collapses_to_matching_branch_at_zero(self): + # Walk a state with loop_counter=3 through three levels of + # nested LoopEnds: each strips one level, and the fourth + # (loop_counter == 0) is the matching loop that runs the + # user's update. This pins the depth-symmetric invariant + # nested-for-loop scheduling depends on. 
+ from pickle import dumps + + outer = _StubLoopEnd(update="i += 10") + middle = _StubLoopEnd(update="i += 100") + inner = _StubLoopEnd(update="i += 1000") + match = _StubLoopEnd(update="i += 1") + + state = State( + { + "loop_counter": 3, + "i": 0, + "table": dumps(Table([Tuple({"v": 1})])), + } + ) + + # Each outer→inner hop decrements once. + state = outer.process_state(state, port=0) + assert state["loop_counter"] == 2 + state = middle.process_state(state, port=0) + assert state["loop_counter"] == 1 + state = inner.process_state(state, port=0) + assert state["loop_counter"] == 0 + # At loop_counter == 0 the matching LoopEnd consumes the state + # and runs ITS user update — not any of the pass-through ops'. + result = match.process_state(state, port=0) + assert result is None + assert match.state["i"] == 1, "only the matching LoopEnd's update should fire" + + +# --------------------------------------------------------------------------- +# Nested loops — round trip +# --------------------------------------------------------------------------- + + +class TestNestedLoopRoundTrip: + def test_outer_then_inner_loop_state_keeps_counters_symmetric(self): + # Simulate the state flow for one outer iteration that itself + # triggers one inner iteration: + # + # outer LoopStart re-entry → loop_counter 0 → 1 + # inner LoopStart re-entry → loop_counter 1 → 2 + # inner LoopEnd → loop_counter 2 → 1 + # outer LoopEnd → loop_counter 1 → 0 (matching branch) + # + # The matching branch on the outer LoopEnd is reached iff every + # increment is mirrored by exactly one decrement. A bug in + # either side would land us in the wrong branch. + outer_start = _StubLoopStart() + inner_start = _StubLoopStart() + inner_end = _StubLoopEnd(update="outer_i += 100") + outer_end = _StubLoopEnd(update="outer_i += 1") + outer_start.open() + inner_start.open() + + from pickle import dumps + + # outer LoopEnd jumped back to outer LoopStart with this state. 
+ state_in = State( + { + "LoopStartStateURI": "vfs:///outer", + "loop_counter": 0, + "outer_i": 0, + "table": dumps(Table([Tuple({"v": 1})])), + } + ) + + # outer LoopStart re-entry: +1 + state_after_outer_start = outer_start.process_state(state_in, port=0) + assert state_after_outer_start["loop_counter"] == 1 + # inner LoopStart sees the same passing state and +1 again. + state_after_inner_start = inner_start.process_state( + state_after_outer_start, port=0 + ) + assert state_after_inner_start["loop_counter"] == 2 + # inner LoopEnd: pass-through branch (-1). + state_after_inner_end = inner_end.process_state(state_after_inner_start, port=0) + assert state_after_inner_end is not None + assert state_after_inner_end["loop_counter"] == 1 + # outer LoopEnd: pass-through (-1) lands at 0, the matching branch. + # Now process_state would have to run the matching branch path + # because loop_counter == 0. To get there we need one more hop: + result = outer_end.process_state(state_after_inner_end, port=0) + # NOT yet at 0 — pass-through decrements to 0 and returns. The + # NEXT hop is the matching branch. + assert result is not None + assert result["loop_counter"] == 0 + + # Final landing on the matching branch consumes the state and + # runs the outer update (outer_i += 1). + matching = _StubLoopEnd(update="outer_i += 1") + assert matching.process_state(result, port=0) is None + assert matching.state["outer_i"] == 1 diff --git a/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/loop/LoopEndOpDescSpec.scala b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/loop/LoopEndOpDescSpec.scala new file mode 100644 index 00000000000..fe0963f63fe --- /dev/null +++ b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/loop/LoopEndOpDescSpec.scala @@ -0,0 +1,152 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.texera.amber.operator.loop
+
+import org.apache.texera.amber.core.executor.OpExecWithCode
+import org.apache.texera.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity}
+import org.apache.texera.amber.operator.metadata.OperatorGroupConstants
+import org.scalatest.flatspec.AnyFlatSpec
+import org.scalatest.matchers.should.Matchers
+
+/**
+  * Unit tests for [[LoopEndOpDesc]], grouped by the member under test:
+  * `operatorInfo`, `generatePythonCode` and `getPhysicalOp`.
+  *
+  * The code-generation tests assert on substrings of the generated Python
+  * source; they do not execute it.
+  */
+class LoopEndOpDescSpec extends AnyFlatSpec with Matchers {
+
+  private val workflowId = WorkflowIdentity(1L)
+  private val executionId = ExecutionIdentity(1L)
+
+  /** Builds a descriptor whose `update` / `condition` fields hold the given expressions. */
+  private def desc(
+      update: String = "i += 1",
+      condition: String = "i < len(table)"
+  ): LoopEndOpDesc = {
+    val d = new LoopEndOpDesc()
+    d.update = update
+    d.condition = condition
+    d
+  }
+
+  "LoopEndOpDesc.operatorInfo" should "advertise the user-friendly name and Control group" in {
+    val info = desc().operatorInfo
+    info.userFriendlyName shouldBe "Loop End"
+    info.operatorGroupName shouldBe OperatorGroupConstants.CONTROL_GROUP
+    info.operatorDescription should include("loop")
+  }
+
+  it should "expose exactly one input port and one output port" in {
+    val info = desc().operatorInfo
+    info.inputPorts should have length 1
+    info.outputPorts should have length 1
+  }
+
+  "LoopEndOpDesc.generatePythonCode" should "embed the user-supplied update and condition expressions" in {
+    // Distinct sentinels make each user field traceable in the output.
+    // Presence alone cannot tell the two fields apart: if `update` and
+    // `condition` were swapped, both sentinels would still appear. So each
+    // sentinel is additionally pinned to the `exec(...)` shape this spec
+    // asserts for its field elsewhere (a bare statement for `update`, an
+    // `output = ...` assignment for `condition`).
+    val code = desc(update = "UPDATE_SENT", condition = "COND_SENT").generatePythonCode()
+    code should include("UPDATE_SENT")
+    code should include("COND_SENT")
+    code should include("exec(\"UPDATE_SENT\"")
+    code should include("exec(\"output = COND_SENT\"")
+  }
+
+  it should "subclass LoopEndOperator from pytexera" in {
+    // Runtime branch `isinstance(executor, LoopEndOperator)` in main_loop
+    // gates the loop-end reset path; a rename of either side must break
+    // this assertion.
+    val code = desc().generatePythonCode()
+    code should include("from pytexera import *")
+    code should include("class ProcessLoopEndOperator(LoopEndOperator)")
+  }
+
+  it should "declare condition() as returning bool, matching the abstract base" in {
+    // The abstract base in operator.py was fixed to `-> bool`; the
+    // generator template must agree. A `-> None` slip here would produce
+    // a class that disagrees with the abstract contract.
+    val code = desc().generatePythonCode()
+    code should include("def condition(self) -> bool:")
+  }
+
+  it should "decrement loop_counter and pass state through when loop_counter > 0 (nested-loop case)" in {
+    // For nested loops, the inner LoopEnd sees state belonging to an
+    // outer loop. The generated process_state recognises this by a
+    // positive loop_counter and just decrements + returns the state,
+    // leaving the actual loop-control work to the outer LoopEnd.
+    // The three lines below are the shape of that branch.
+    val code = desc().generatePythonCode()
+    code should include("loop_counter = int(state.get(\"loop_counter\", 0))")
+    code should include("if loop_counter > 0:")
+    code should include("state[\"loop_counter\"] = loop_counter - 1")
+  }
+
+  it should "stash state, deserialize the pickled table, and run the user update on the matching-loop branch" in {
+    val code = desc(update = "i = i + 7").generatePythonCode()
+    // The matching-loop branch is the path the user's `update` expression
+    // runs on. Pin the pickle round-trip and the exec call so a refactor
+    // of either is intentional.
+    code should include("self.state = dict(state)")
+    code should include("from pickle import loads")
+    code should include("self.state[\"table\"] = loads(self.state[\"table\"])")
+    code should include("exec(\"i = i + 7\"")
+  }
+
+  it should "evaluate the user condition in process-shared state" in {
+    val code = desc(condition = "i < 3").generatePythonCode()
+    // condition() must read from self.state (populated by the matching-
+    // loop branch) and assign into self.state["output"] before returning
+    // it. Pinning both the exec format and the assignment keeps a future
+    // "just return the expr" refactor from silently dropping the state
+    // side-effect main_loop.complete() depends on.
+    code should include("exec(\"output = i < 3\"")
+    code should include("self.state[\"output\"]")
+  }
+
+  "LoopEndOpDesc.getPhysicalOp" should "produce a non-parallelizable PhysicalOp pinned to a single worker" in {
+    // Same reasoning as LoopStart: the loop body's per-iteration state
+    // is per-instance, and the accumulated table must be a single buffer.
+    val physical = desc().getPhysicalOp(workflowId, executionId)
+    physical.parallelizable shouldBe false
+    physical.suggestedWorkerNum shouldBe Some(1)
+  }
+
+  it should "be tagged as a loop end so RegionExecutionCoordinator skips iceberg recreation" in {
+    // The isLoopEnd flag drives the
+    // `if (!isLoopEndRegion || !DocumentFactory.documentExists(...))`
+    // branch in RegionExecutionCoordinator. Without the tag, every loop
+    // iteration would unconditionally recreate the result/state tables
+    // and lose accumulated data. The flag must be set.
+    val physical = desc().getPhysicalOp(workflowId, executionId)
+    physical.isLoopEnd shouldBe true
+  }
+
+  it should "carry the generated Python code via OpExecWithCode" in {
+    val physical = desc().getPhysicalOp(workflowId, executionId)
+    physical.opExecInitInfo match {
+      case OpExecWithCode(code, language) =>
+        language shouldBe "python"
+        code should include("class ProcessLoopEndOperator(LoopEndOperator)")
+      case other =>
+        // Any other init-info variant means the Python source was not attached.
+        fail(s"expected OpExecWithCode, got $other")
+    }
+  }
+
+  it should "carry forward the operatorInfo input/output ports onto the PhysicalOp" in {
+    // Compare against the same descriptor instance the PhysicalOp came from.
+    val d = desc()
+    val physical = d.getPhysicalOp(workflowId, executionId)
+    physical.inputPorts.size shouldBe d.operatorInfo.inputPorts.size
+    physical.outputPorts.size shouldBe d.operatorInfo.outputPorts.size
+  }
+}
diff --git a/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/loop/LoopStartOpDescSpec.scala b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/loop/LoopStartOpDescSpec.scala
new file mode 100644
index 00000000000..2409b5a58c1
--- /dev/null
+++ b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/loop/LoopStartOpDescSpec.scala
@@ -0,0 +1,121 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.
The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.texera.amber.operator.loop
+
+import org.apache.texera.amber.core.executor.OpExecWithCode
+import org.apache.texera.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity}
+import org.apache.texera.amber.operator.metadata.OperatorGroupConstants
+import org.scalatest.flatspec.AnyFlatSpec
+import org.scalatest.matchers.should.Matchers
+
+/**
+  * Unit tests for [[LoopStartOpDesc]], grouped by the member under test:
+  * `operatorInfo`, `generatePythonCode` and `getPhysicalOp`.
+  *
+  * The code-generation tests assert on substrings of the generated Python
+  * source; they do not execute it.
+  */
+class LoopStartOpDescSpec extends AnyFlatSpec with Matchers {
+
+  private val workflowId = WorkflowIdentity(1L)
+  private val executionId = ExecutionIdentity(1L)
+
+  /** Builds a descriptor whose `initialization` / `output` fields hold the given expressions. */
+  private def desc(init: String = "i = 0", out: String = "table.iloc[i]"): LoopStartOpDesc = {
+    val d = new LoopStartOpDesc()
+    d.initialization = init
+    d.output = out
+    d
+  }
+
+  "LoopStartOpDesc.operatorInfo" should "advertise the user-friendly name and Control group" in {
+    val info = desc().operatorInfo
+    info.userFriendlyName shouldBe "Loop Start"
+    info.operatorGroupName shouldBe OperatorGroupConstants.CONTROL_GROUP
+    info.operatorDescription should include("loop")
+  }
+
+  it should "expose exactly one input port and one output port" in {
+    val info = desc().operatorInfo
+    info.inputPorts should have length 1
+    info.outputPorts should have length 1
+  }
+
+  "LoopStartOpDesc.generatePythonCode" should "embed the user-supplied initialization and output expressions" in {
+    // The init + output strings are interpolated directly into the generated
+    // class so the Python `exec` calls at runtime see the user-provided code.
+    // Distinct sentinels make each field traceable, but presence alone would
+    // not notice the two fields being swapped. Each sentinel is therefore
+    // also pinned to the `exec(...)` shape this spec asserts for its field
+    // elsewhere (a bare statement for `initialization`, an `output = ...`
+    // assignment for `output`).
+    val code = desc(init = "INIT_SENT", out = "OUT_SENT").generatePythonCode()
+    code should include("INIT_SENT")
+    code should include("OUT_SENT")
+    code should include("exec(\"INIT_SENT\"")
+    code should include("exec(\"output = OUT_SENT\"")
+  }
+
+  it should "subclass LoopStartOperator from pytexera" in {
+    // The generated class must extend the base LoopStartOperator (defined
+    // in core.models.operator) so the runtime's
+    // `isinstance(executor, LoopStartOperator)` branch in main_loop fires
+    // for it. A rename of either side should break this assertion.
+    val code = desc().generatePythonCode()
+    code should include("from pytexera import *")
+    code should include("class ProcessLoopStartOperator(LoopStartOperator)")
+  }
+
+  it should "wire the initialization expression into open() and the output expression into process_table()" in {
+    // The user's `initialization` runs once in `open()` to seed self.state
+    // (specifically self.state['loop_counter'] = 0 plus user vars); the
+    // user's `output` runs in `process_table()` against the buffered table.
+    // The assertions check that both method headers and both exec calls are
+    // present in the generated source.
+    val code = desc(init = "i = 0", out = "table.iloc[i]").generatePythonCode()
+    code should include("def open(self)")
+    code should include("\"loop_counter\": 0")
+    code should include("exec(\"i = 0\"")
+    code should include("def process_table(self, table: Table, port: int)")
+    code should include("exec(\"output = table.iloc[i]\"")
+  }
+
+  "LoopStartOpDesc.getPhysicalOp" should "produce a non-parallelizable PhysicalOp pinned to a single worker" in {
+    // LoopStart must run on exactly one worker because the loop state
+    // (self.state, the accumulated table) is per-instance, not distributed.
+    // Parallelizing it would fan-out the table and break the loop body's
+    // per-iteration invariants.
+    val physical = desc().getPhysicalOp(workflowId, executionId)
+    physical.parallelizable shouldBe false
+    physical.suggestedWorkerNum shouldBe Some(1)
+  }
+
+  it should "not be tagged as a loop end" in {
+    // The isLoopEnd flag is consumed by RegionExecutionCoordinator to skip
+    // recreating result/state tables across loop iterations. LoopStart
+    // must NOT carry the flag — only LoopEnd does.
+    val physical = desc().getPhysicalOp(workflowId, executionId)
+    physical.isLoopEnd shouldBe false
+  }
+
+  it should "carry the generated Python code via OpExecWithCode" in {
+    val physical = desc().getPhysicalOp(workflowId, executionId)
+    physical.opExecInitInfo match {
+      case OpExecWithCode(code, language) =>
+        language shouldBe "python"
+        code should include("class ProcessLoopStartOperator(LoopStartOperator)")
+      case other =>
+        // Any other init-info variant means the Python source was not attached.
+        fail(s"expected OpExecWithCode, got $other")
+    }
+  }
+
+  it should "carry forward the operatorInfo input/output ports onto the PhysicalOp" in {
+    // Compare against the same descriptor instance the PhysicalOp came from.
+    val d = desc()
+    val physical = d.getPhysicalOp(workflowId, executionId)
+    physical.inputPorts.size shouldBe d.operatorInfo.inputPorts.size
+    physical.outputPorts.size shouldBe d.operatorInfo.outputPorts.size
+  }
+}