diff --git a/docs/api/agents.md b/docs/api/agents.md index bc8073e..1257a30 100644 --- a/docs/api/agents.md +++ b/docs/api/agents.md @@ -27,4 +27,4 @@ AI agents currently interact with IOWarp through: - [CLIO Agent Overview](https://iowarp.ai/platform/clio-agent/) — Architecture and design - [CLIO Kit](https://docs.iowarp.ai) — Interactive MCP server showcase -- [GitHub Issues](https://github.com/iowarp/iowarp/issues) — Request API features +- [GitHub Issues](https://github.com/iowarp/clio-core/issues) — Request API features diff --git a/docs/deployment/configuration.md b/docs/deployment/configuration.md index b91b9b1..22a6bc2 100644 --- a/docs/deployment/configuration.md +++ b/docs/deployment/configuration.md @@ -37,7 +37,7 @@ Size values throughout the file accept: `B`, `KB`, `MB`, `GB`, `TB` (case-insens | Parameter | Default | Description | |-----------|---------|-------------| -| `port` | `5555` | ZeroMQ RPC listener port. Must match across all cluster nodes. | +| `port` | `9413` | ZeroMQ RPC listener port. Must match across all cluster nodes. Can be overridden by `CHI_PORT` env var. | | `neighborhood_size` | `32` | Maximum nodes queried when splitting range queries. | | `hostfile` | *(none)* | Path to a file listing cluster node IPs/hostnames, one per line. Required for multi-node deployments. | | `wait_for_restart` | `30` | Seconds to wait for peer nodes during startup. | @@ -45,7 +45,7 @@ Size values throughout the file accept: `B`, `KB`, `MB`, `GB`, `TB` (case-insens ```yaml networking: - port: 5555 + port: 9413 neighborhood_size: 32 # hostfile: /etc/iowarp/hostfile # Multi-node only wait_for_restart: 30 @@ -261,7 +261,7 @@ All fields are optional and override compile-time defaults. 
```yaml networking: - port: 5555 + port: 9413 runtime: num_threads: 4 @@ -291,7 +291,7 @@ compose: ```yaml networking: - port: 5555 + port: 9413 runtime: num_threads: 16 @@ -334,7 +334,7 @@ compose: ```yaml networking: - port: 5555 + port: 9413 neighborhood_size: 32 hostfile: /etc/iowarp/hostfile @@ -383,7 +383,7 @@ services: volumes: - ./chimaera.yaml:/home/iowarp/.chimaera/chimaera.yaml:ro ports: - - "5555:5555" + - "9413:9413" mem_limit: 8g command: ["chimaera", "runtime", "start"] restart: unless-stopped diff --git a/docs/deployment/hpc-cluster.md b/docs/deployment/hpc-cluster.md index b3e0c5a..af98a29 100644 --- a/docs/deployment/hpc-cluster.md +++ b/docs/deployment/hpc-cluster.md @@ -42,6 +42,13 @@ The following environment variables control runtime behavior. Set them before st export CHI_SERVER_CONF=/etc/iowarp/config.yaml ``` +### Networking Overrides + +| Variable | Default | Description | +|----------|---------|-------------| +| `CHI_PORT` | `9413` | Override the RPC port. Takes priority over the YAML `networking.port` setting. | +| `CHI_SERVER_ADDR` | `127.0.0.1` | Override the server address that clients connect to. | + ### IPC Transport Mode | Variable | Default | Description | @@ -107,7 +114,7 @@ Reference it in your config: ```yaml networking: - port: 5555 + port: 9413 hostfile: /etc/iowarp/hostfile ``` diff --git a/docs/faq.md b/docs/faq.md index bc079e7..2dd0a98 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -19,7 +19,7 @@ IOWarp is released under the [BSD 3-Clause License](https://opensource.org/licen ### Where can I get help? 
- **Zulip Chat**: [iowarp.zulipchat.com](https://iowarp.zulipchat.com) -- **GitHub Issues**: [github.com/iowarp/iowarp/issues](https://github.com/iowarp/iowarp/issues) +- **GitHub Issues**: [github.com/iowarp/clio-core/issues](https://github.com/iowarp/clio-core/issues) - **Discussions**: [github.com/orgs/iowarp/discussions](https://github.com/orgs/iowarp/discussions) - **Email**: grc@illinoistech.edu diff --git a/docs/getting-started/installation.mdx b/docs/getting-started/installation.mdx index f0582e9..3baa1d2 100644 --- a/docs/getting-started/installation.mdx +++ b/docs/getting-started/installation.mdx @@ -19,8 +19,8 @@ This uses conda internally to manage dependencies and produces a full-featured build with all optional components. ```bash -git clone --recurse-submodules https://github.com/iowarp/iowarp-core.git -cd iowarp-core +git clone --recurse-submodules https://github.com/iowarp/clio-core.git +cd clio-core ./install.sh ``` @@ -73,7 +73,7 @@ docker pull iowarp/deploy-cpu:latest Run the container: ```bash -docker run -d -p 5555:5555 --memory=8g --name iowarp iowarp/deploy-cpu:latest chimaera runtime start +docker run -d -p 9413:9413 --memory=8g --name iowarp iowarp/deploy-cpu:latest chimaera runtime start ``` ### Using Docker Compose @@ -89,7 +89,7 @@ services: volumes: - ./chimaera.yaml:/home/iowarp/.chimaera/chimaera.yaml:ro ports: - - "5555:5555" + - "9413:9413" mem_limit: 8g command: ["chimaera", "runtime", "start"] restart: unless-stopped @@ -120,8 +120,9 @@ echo ". ${PWD}/share/spack/setup-env.sh" >> ~/.bashrc 2. Add the IOWarp repository: ```bash -git clone https://github.com/iowarp/iowarp-install.git -spack repo add iowarp-install/iowarp-spack +git clone --recurse-submodules https://github.com/iowarp/clio-core.git +cd clio-core +spack repo add installers/spack ``` 3. 
Install IOWarp: diff --git a/docs/getting-started/quick-start.mdx b/docs/getting-started/quick-start.mdx index d43b27e..26c8b88 100644 --- a/docs/getting-started/quick-start.mdx +++ b/docs/getting-started/quick-start.mdx @@ -74,7 +74,7 @@ The runtime resolves its configuration in this order: | 2 | `~/.chimaera/chimaera.yaml` | | 3 | Built-in defaults | -The default configuration starts **4 worker threads** on **port 5555** and +The default configuration starts **4 worker threads** on **port 9413** and composes three modules automatically: | Module | Purpose | @@ -175,6 +175,8 @@ Done! |----------|-------------| | `CHI_SERVER_CONF` | Path to YAML configuration file (highest priority) | | `CHI_IPC_MODE` | Transport: `SHM` (shared memory), `TCP` (default), `IPC` (Unix socket) | +| `CHI_PORT` | Override the RPC port (default: `9413`). Takes priority over YAML config. | +| `CHI_SERVER_ADDR` | Override the server address clients connect to (default: `127.0.0.1`). | | `HSHM_LOG_LEVEL` | Logging verbosity: `debug`, `info`, `warning`, `error`, `fatal` | ## Next Steps @@ -190,7 +192,7 @@ Done! ## Support -- Open an issue on the [GitHub repository](https://github.com/iowarp/core) +- Open an issue on the [GitHub repository](https://github.com/iowarp/clio-core) - Join the [Zulip Chat](https://iowarp.zulipchat.com) - Visit the [IOWarp website](https://iowarp.ai) - Email: grc@illinoistech.edu diff --git a/docs/intro.md b/docs/intro.md index 9a57fef..3c8d657 100644 --- a/docs/intro.md +++ b/docs/intro.md @@ -44,6 +44,6 @@ IOWarp consists of three layers: ## Need Help? 
- **Community** — [Zulip Chat](https://iowarp.zulipchat.com) -- **Bug Reports** — [GitHub Issues](https://github.com/iowarp/iowarp/issues) +- **Bug Reports** — [GitHub Issues](https://github.com/iowarp/clio-core/issues) - **Feature Requests** — [GitHub Discussions](https://github.com/orgs/iowarp/discussions) - **Email** — grc@illinoistech.edu diff --git a/docs/sdk/context-runtime/2.module_dev_guide.md b/docs/sdk/context-runtime/2.module_dev_guide.md index 94671d4..3ceeb9f 100644 --- a/docs/sdk/context-runtime/2.module_dev_guide.md +++ b/docs/sdk/context-runtime/2.module_dev_guide.md @@ -2407,7 +2407,7 @@ struct SerializableTask : public chi::Task { ``` :::note GPU-Compatible Data Structures -For GPU-compatible modules, HSHM provides shared-memory data structures (`hshm::string`, `hshm::ipc::vector`, `hshm::ipc::ring_buffer`) with cross-platform annotations. See the [Data Structures Guide](/docs/sdk/context-transport-primitives/types/data_structures_guide) for details. +For GPU-compatible modules, HSHM provides shared-memory data structures (`hshm::string`, `hshm::ipc::vector`, `hshm::ipc::ring_buffer`) with cross-platform annotations. See the [Vector Guide](/docs/sdk/context-transport-primitives/types/vector_guide), [Ring Buffer Guide](/docs/sdk/context-transport-primitives/types/ring_buffer_guide), and [String Guide](/docs/sdk/context-transport-primitives/types/string_guide) for details. 
::: ### Bulk Transfer Support with ar.bulk diff --git a/docs/sdk/context-runtime/3.module_test_guide.md b/docs/sdk/context-runtime/3.module_test_guide.md index 747234b..c2541c6 100644 --- a/docs/sdk/context-runtime/3.module_test_guide.md +++ b/docs/sdk/context-runtime/3.module_test_guide.md @@ -33,7 +33,7 @@ A minimal test configuration: ```yaml networking: - port: 5555 + port: 9413 runtime: num_threads: 4 @@ -181,7 +181,7 @@ std::string CreateComposeConfig() { << " num_threads: 4\n" << "\n" << "networking:\n" - << " port: 5555\n" + << " port: 9413\n" << "\n" << "compose:\n" << "- mod_name: chimaera_bdev\n" @@ -405,7 +405,7 @@ ctest -R my_module rm -f /dev/shm/chimaera_* ``` -6. **Kill stale processes on port conflicts**: Tests bind to port 5555. If a previous run left a zombie: +6. **Kill stale processes on port conflicts**: Tests bind to port 9413. If a previous run left a zombie: ```bash - sudo kill -9 $(sudo lsof -t -i :5555) 2>/dev/null + sudo kill -9 $(sudo lsof -t -i :9413) 2>/dev/null ``` diff --git a/docs/sdk/context-runtime/5.scheduler.md b/docs/sdk/context-runtime/5.scheduler.md index 4aeaa7c..22111d6 100644 --- a/docs/sdk/context-runtime/5.scheduler.md +++ b/docs/sdk/context-runtime/5.scheduler.md @@ -2,17 +2,18 @@ ## Overview -The Chimaera runtime uses a pluggable scheduler architecture to control how tasks are mapped to workers and how workers are organized. This document explains how to build custom schedulers for the IOWarp runtime. +The Chimaera runtime uses a pluggable scheduler architecture to control how tasks are mapped to workers and how workers are organized. Scheduling happens at two levels: **container-level** scheduling resolves *what* to execute (via `Container::ScheduleTask`), and the **Scheduler** decides *where* to execute it (which worker thread). This document explains both levels and how to build custom schedulers. ## Table of Contents 1. [Architecture Overview](#architecture-overview) -2. 
[Scheduler Interface](#scheduler-interface) -3. [Worker Lifecycle](#worker-lifecycle) -4. [Implementing a Custom Scheduler](#implementing-a-custom-scheduler) -5. [DefaultScheduler Example](#defaultscheduler-example) -6. [Best Practices](#best-practices) -7. [Integration Points](#integration-points) +2. [Two-Level Scheduling](#two-level-scheduling) +3. [Scheduler Interface](#scheduler-interface) +4. [Worker Lifecycle](#worker-lifecycle) +5. [Implementing a Custom Scheduler](#implementing-a-custom-scheduler) +6. [DefaultScheduler Example](#defaultscheduler-example) +7. [Best Practices](#best-practices) +8. [Integration Points](#integration-points) ## Architecture Overview @@ -23,7 +24,8 @@ The IOWarp runtime separates concerns across three main components: - **ConfigManager**: Manages configuration (number of threads, queue depth, etc.) - **WorkOrchestrator**: Creates workers, spawns threads, assigns lanes to workers (1:1 mapping for all workers) - **Scheduler**: Decides worker partitioning, task-to-worker mapping, and load balancing -- **IpcManager**: Manages shared memory, queues, and provides task routing infrastructure +- **IpcManager**: Manages shared memory, queues, and provides task routing infrastructure (`RouteTask`, `RouteLocal`, `RouteGlobal`) +- **Container**: Provides per-pool dynamic scheduling via `ScheduleTask` ### Data Flow @@ -53,6 +55,69 @@ The IOWarp runtime separates concerns across three main components: └─────────────────┘ ``` +### Task Routing Flow + +When a task is submitted, `IpcManager::RouteTask` orchestrates the full routing pipeline: + +``` +RouteTask(future, force_enqueue) + │ + ├─ 1. Container::ScheduleTask() → resolve Dynamic pool query + │ (e.g., Dynamic → DirectHash) + │ + ├─ 2. ResolvePoolQuery() → resolve to physical node(s) + │ + ├─ 3. IsTaskLocal() → local or remote? 
+ │ │ + │ ├─ local: RouteLocal() → resolve container, pick worker + │ │ │ + │ │ ├─ RuntimeMapTask() → scheduler picks dest worker + │ │ │ + │ │ ├─ dest == current && !force_enqueue → execute directly + │ │ └─ otherwise → enqueue to dest worker's lane + │ │ + │ └─ remote: RouteGlobal() → enqueue to net_queue_ for SendIn + │ + └─ return true if task can be executed by current worker +``` + +## Two-Level Scheduling + +### Level 1: Container Scheduling (`ScheduleTask`) + +Before the Scheduler decides *which worker* runs a task, the container decides *which container instance* handles it. This is the `Container::ScheduleTask` virtual method. + +```cpp +class Container { +public: + virtual PoolQuery ScheduleTask(const hipc::FullPtr &task) { + return task->pool_query_; // Default: no transformation + } +}; +``` + +**Purpose**: Transform a high-level `PoolQuery` (often `Dynamic`) into a concrete routing mode (`DirectHash`, `Local`, `Broadcast`, etc.) based on the task's semantics. + +**Example**: A distributed filesystem container might schedule read tasks to the node holding the target block: + +```cpp +PoolQuery MyFsContainer::ScheduleTask(const hipc::FullPtr &task) { + if (task->method_ == kRead || task->method_ == kWrite) { + // Route to the container holding the target block + u64 block_id = GetBlockId(task); + return PoolQuery::DirectHash(block_id); + } + // Metadata ops stay local + return PoolQuery::Local(); +} +``` + +`ScheduleTask` is called by `RouteTask` **before** pool query resolution. The returned `PoolQuery` then goes through `ResolvePoolQuery` to determine concrete physical nodes and containers. + +### Level 2: Worker Scheduling (`RuntimeMapTask`) + +After container and node resolution, `RouteLocal` calls `Scheduler::RuntimeMapTask` to pick the specific worker thread. This is where I/O-size routing, task group affinity, and network worker pinning happen. 
+ ## Scheduler Interface All schedulers must inherit from the `Scheduler` base class and implement the following methods: @@ -71,13 +136,20 @@ public: virtual u32 ClientMapTask(IpcManager *ipc_manager, const Future &task) = 0; // Map tasks from runtime workers to other workers - virtual u32 RuntimeMapTask(Worker *worker, const Future &task) = 0; + virtual u32 RuntimeMapTask(Worker *worker, const Future &task, + Container *container) = 0; // Rebalance load across workers (called periodically by workers) virtual void RebalanceWorker(Worker *worker) = 0; // Adjust polling intervals for periodic tasks virtual void AdjustPolling(RunContext *run_ctx) = 0; + + // Get designated GPU worker (optional) + virtual Worker *GetGpuWorker() const { return nullptr; } + + // Get designated network worker (optional) + virtual Worker *GetNetWorker() const { return nullptr; } }; ``` @@ -91,8 +163,9 @@ public: **Responsibilities**: - Access workers via `work_orch->GetWorker(worker_id)` -- Organize workers into scheduler-specific groups (e.g., task workers, network worker) -- **Update IpcManager** with the total worker count via `IpcManager::SetNumSchedQueues()` +- Organize workers into scheduler-specific groups (e.g., scheduler worker, I/O workers, network worker, GPU worker) +- **Update IpcManager** with the scheduling queue count via `IpcManager::SetNumSchedQueues()` +- **Set the network lane** via `IpcManager::SetNetLane()` so the runtime knows where to enqueue network tasks **Important**: All workers are assigned lanes by `WorkOrchestrator::SpawnWorkerThreads()` using 1:1 mapping. The scheduler does NOT control lane assignment — it only tracks worker groups for routing decisions. 
@@ -101,21 +174,24 @@ public: void MyScheduler::DivideWorkers(WorkOrchestrator *work_orch) { u32 total_workers = work_orch->GetTotalWorkerCount(); - // Track workers: first N-1 are task workers, last is network - for (u32 i = 0; i < total_workers - 1; ++i) { - Worker *worker = work_orch->GetWorker(i); - if (worker) { - task_workers_.push_back(worker); - } + // Worker 0: scheduler worker (metadata + small I/O) + scheduler_worker_ = work_orch->GetWorker(0); + + // Workers 1..N-2: I/O workers (large I/O round-robin) + for (u32 i = 1; i < total_workers - 1; ++i) { + io_workers_.push_back(work_orch->GetWorker(i)); } - // Last worker is network worker + // Worker N-1: network worker net_worker_ = work_orch->GetWorker(total_workers - 1); - // IMPORTANT: Update IpcManager with worker count + // IMPORTANT: Update IpcManager IpcManager *ipc = CHI_IPC; if (ipc) { - ipc->SetNumSchedQueues(total_workers); + ipc->SetNumSchedQueues(1); // Client tasks go to scheduler worker + if (net_worker_) { + ipc->SetNetLane(net_worker_->GetLane()); + } } } ``` @@ -124,7 +200,7 @@ void MyScheduler::DivideWorkers(WorkOrchestrator *work_orch) { **Purpose**: Determine which worker lane a task from a client should be assigned to. -**Called**: When clients submit tasks to the runtime. +**Called**: When clients submit tasks to the runtime via `SendRuntimeClient`. 
**Responsibilities**: - Return a lane ID in range `[0, num_sched_queues)` @@ -138,45 +214,60 @@ u32 MyScheduler::ClientMapTask(IpcManager *ipc_manager, const Future &task u32 num_lanes = ipc_manager->GetNumSchedQueues(); if (num_lanes == 0) return 0; - // Route network tasks (Send/Recv from admin pool) to last worker + // Network tasks (Send/Recv from admin pool) → network worker Task *task_ptr = task.get(); if (task_ptr != nullptr && task_ptr->pool_id_ == chi::kAdminPoolId) { u32 method_id = task_ptr->method_; - if (method_id == 14 || method_id == 15) { // kSend or kRecv + if (method_id == 14 || method_id == 15 || + method_id == 20 || method_id == 21) { return num_lanes - 1; } } - // PID+TID hash-based mapping for other tasks - auto *sys_info = HSHM_SYSTEM_INFO; - pid_t pid = sys_info->pid_; - auto tid = HSHM_THREAD_MODEL->GetTid(); - - size_t hash = std::hash{}(pid) ^ (std::hash{}(&tid) << 1); - return static_cast(hash % num_lanes); + // Default: scheduler worker (lane 0) + return 0; } ``` -#### `RuntimeMapTask(Worker *worker, const Future &task)` +#### `RuntimeMapTask(Worker *worker, const Future &task, Container *container)` **Purpose**: Determine which worker should execute a task when routing from within the runtime. -**Called**: By `Worker::RouteLocal()` to decide whether a task should execute on the current worker or be forwarded to another. +**Called**: By `IpcManager::RouteLocal()` after the execution container has been resolved. If the returned worker ID differs from the current worker (or if `force_enqueue` is set), the task is enqueued to the destination worker's lane. + +**Parameters**: +- `worker`: The current worker (may be nullptr if called from a non-worker thread) +- `task`: The task being routed +- `container`: The resolved execution container. Used for task-group affinity lookups. May be nullptr when called without a resolved container. 
**Responsibilities**: - Return a worker ID for task execution - Route periodic network tasks (Send/Recv) to the dedicated network worker -- For all other tasks, return the current worker's ID (no migration) +- Implement I/O-size-based routing (large I/O → dedicated workers) +- Implement task group affinity (pin related tasks to the same worker) **Example**: ```cpp -u32 MyScheduler::RuntimeMapTask(Worker *worker, const Future &task) { - // Route periodic network tasks to the network worker +u32 MyScheduler::RuntimeMapTask(Worker *worker, const Future &task, + Container *container) { Task *task_ptr = task.get(); + + // Task group affinity: if this task's group is already pinned, use that worker + if (container != nullptr && task_ptr != nullptr && + !task_ptr->task_group_.IsNull()) { + int64_t group_id = task_ptr->task_group_.id_; + ScopedCoRwReadLock read_lock(container->task_group_lock_); + auto it = container->task_group_map_.find(group_id); + if (it != container->task_group_map_.end() && it->second != nullptr) { + return it->second->GetId(); + } + } + + // Periodic network tasks → network worker if (task_ptr != nullptr && task_ptr->IsPeriodic()) { if (task_ptr->pool_id_ == chi::kAdminPoolId) { u32 method_id = task_ptr->method_; - if (method_id == 14 || method_id == 15) { // kSend or kRecv + if (method_id == 14 || method_id == 15) { if (net_worker_ != nullptr) { return net_worker_->GetId(); } @@ -184,7 +275,20 @@ u32 MyScheduler::RuntimeMapTask(Worker *worker, const Future &task) { } } - // All other tasks execute on the current worker + // Large I/O → round-robin across I/O workers + if (task_ptr != nullptr && !io_workers_.empty()) { + if (task_ptr->stat_.io_size_ >= kLargeIOThreshold) { + u32 idx = next_io_idx_.fetch_add(1, std::memory_order_relaxed) + % static_cast(io_workers_.size()); + return io_workers_[idx]->GetId(); + } + } + + // Small I/O / metadata → scheduler worker + if (scheduler_worker_ != nullptr) { + return scheduler_worker_->GetId(); + } + return 
worker ? worker->GetId() : 0; } ``` @@ -218,6 +322,7 @@ void MyScheduler::RebalanceWorker(Worker *worker) { - Modify `run_ctx->yield_time_us_` based on `run_ctx->did_work_` - Implement adaptive polling (exponential backoff when idle) - Reduce CPU usage for idle periodic tasks +- **Important**: `co_await` on Futures sets `yield_time_us_` to 0, so this method must restore it to prevent busy-looping **Example**: ```cpp @@ -252,8 +357,9 @@ Understanding the worker lifecycle is crucial for scheduler implementation: - Calls Scheduler::DivideWorkers() ↓ 3. Scheduler::DivideWorkers() - - Tracks workers into functional groups (task workers, network worker) + - Tracks workers into functional groups (scheduler, I/O, network, GPU) - Updates IpcManager::SetNumSchedQueues() + - Sets network lane via IpcManager::SetNetLane() ↓ 4. WorkOrchestrator::StartWorkers() - Calls SpawnWorkerThreads() @@ -261,9 +367,12 @@ Understanding the worker lifecycle is crucial for scheduler implementation: - Spawns actual OS threads ↓ 5. 
Workers run task processing loops - - Process tasks from assigned lanes - - Call Scheduler::RuntimeMapTask() for task routing in RouteLocal() + - ProcessNewTask: pop futures from lane, route via RouteTask() + - RouteTask calls Container::ScheduleTask() for dynamic resolution + - RouteLocal calls Scheduler::RuntimeMapTask() for worker selection + - If dest != current worker (or force_enqueue), re-enqueue to dest lane - Call Scheduler::RebalanceWorker() periodically + - Call Scheduler::AdjustPolling() after periodic task execution ``` ## Implementing a Custom Scheduler @@ -276,6 +385,7 @@ Create `context-runtime/include/chimaera/scheduler/my_scheduler.h`: #ifndef CHIMAERA_INCLUDE_CHIMAERA_SCHEDULER_MY_SCHEDULER_H_ #define CHIMAERA_INCLUDE_CHIMAERA_SCHEDULER_MY_SCHEDULER_H_ +#include #include #include "chimaera/scheduler/scheduler.h" @@ -283,20 +393,25 @@ namespace chi { class MyScheduler : public Scheduler { public: - MyScheduler() : net_worker_(nullptr) {} + MyScheduler() : scheduler_worker_(nullptr), net_worker_(nullptr), + gpu_worker_(nullptr), next_io_idx_{0} {} ~MyScheduler() override = default; - // Implement required interface methods void DivideWorkers(WorkOrchestrator *work_orch) override; u32 ClientMapTask(IpcManager *ipc_manager, const Future &task) override; - u32 RuntimeMapTask(Worker *worker, const Future &task) override; + u32 RuntimeMapTask(Worker *worker, const Future &task, + Container *container) override; void RebalanceWorker(Worker *worker) override; void AdjustPolling(RunContext *run_ctx) override; + Worker *GetGpuWorker() const override { return gpu_worker_; } + Worker *GetNetWorker() const override { return net_worker_; } private: - // Your scheduler-specific state - std::vector scheduler_workers_; + Worker *scheduler_worker_; + std::vector io_workers_; Worker *net_worker_; + Worker *gpu_worker_; + std::atomic next_io_idx_{0}; }; } // namespace chi @@ -322,47 +437,73 @@ void MyScheduler::DivideWorkers(WorkOrchestrator *work_orch) { u32 
total_workers = work_orch->GetTotalWorkerCount(); - scheduler_workers_.clear(); - net_worker_ = nullptr; - - // Network worker is always the last worker + scheduler_worker_ = work_orch->GetWorker(0); net_worker_ = work_orch->GetWorker(total_workers - 1); - // Scheduler workers are all workers except the last one - u32 num_sched_workers = (total_workers == 1) ? 1 : (total_workers - 1); - for (u32 i = 0; i < num_sched_workers; ++i) { - Worker *worker = work_orch->GetWorker(i); - if (worker) { - scheduler_workers_.push_back(worker); + if (total_workers > 2) { + gpu_worker_ = work_orch->GetWorker(total_workers - 2); + for (u32 i = 1; i < total_workers - 1; ++i) { + io_workers_.push_back(work_orch->GetWorker(i)); } } - // CRITICAL: Update IpcManager with the number of workers IpcManager *ipc = CHI_IPC; if (ipc) { - ipc->SetNumSchedQueues(total_workers); + ipc->SetNumSchedQueues(1); + if (net_worker_) { + ipc->SetNetLane(net_worker_->GetLane()); + } } } u32 MyScheduler::ClientMapTask(IpcManager *ipc_manager, const Future &task) { u32 num_lanes = ipc_manager->GetNumSchedQueues(); if (num_lanes == 0) return 0; - - // Implement your mapping strategy here - return 0; // Simple: always map to lane 0 + return 0; // All client tasks → scheduler worker } -u32 MyScheduler::RuntimeMapTask(Worker *worker, const Future &task) { +u32 MyScheduler::RuntimeMapTask(Worker *worker, const Future &task, + Container *container) { + Task *task_ptr = task.get(); + + // Task group affinity + if (container != nullptr && task_ptr != nullptr && + !task_ptr->task_group_.IsNull()) { + int64_t group_id = task_ptr->task_group_.id_; + ScopedCoRwReadLock read_lock(container->task_group_lock_); + auto it = container->task_group_map_.find(group_id); + if (it != container->task_group_map_.end() && it->second != nullptr) { + return it->second->GetId(); + } + } + + // Periodic network tasks → network worker + if (task_ptr != nullptr && task_ptr->IsPeriodic() && + task_ptr->pool_id_ == chi::kAdminPoolId) { + 
u32 m = task_ptr->method_; + if ((m == 14 || m == 15 || m == 20 || m == 21) && net_worker_) { + return net_worker_->GetId(); + } + } + + // Large I/O → round-robin across I/O workers + if (task_ptr != nullptr && !io_workers_.empty() && + task_ptr->stat_.io_size_ >= 4096) { + u32 idx = next_io_idx_.fetch_add(1, std::memory_order_relaxed) + % static_cast(io_workers_.size()); + return io_workers_[idx]->GetId(); + } + + // Default → scheduler worker + if (scheduler_worker_) return scheduler_worker_->GetId(); return worker ? worker->GetId() : 0; } -void MyScheduler::RebalanceWorker(Worker *worker) { - (void)worker; -} +void MyScheduler::RebalanceWorker(Worker *worker) { (void)worker; } void MyScheduler::AdjustPolling(RunContext *run_ctx) { if (!run_ctx) return; - // Implement adaptive polling or leave with default behavior + run_ctx->yield_time_us_ = run_ctx->true_period_ns_ / 1000.0; } } // namespace chi @@ -406,27 +547,45 @@ runtime: ## DefaultScheduler Example -The `DefaultScheduler` provides a reference implementation with these characteristics: +The `DefaultScheduler` provides a reference implementation with I/O-size-based routing and task group affinity. ### Worker Partitioning -- Tracks all workers except the last as scheduler workers -- Last worker is designated as the network worker -- All workers get lanes assigned by WorkOrchestrator (1:1 mapping) -- `SetNumSchedQueues(total_workers)` includes all workers for client task mapping +- **Worker 0**: Scheduler worker — handles metadata and small I/O (< 4KB) +- **Workers 1..N-2**: I/O workers — handle large I/O via round-robin +- **Worker N-2**: Also serves as the GPU worker (polls GPU queues) +- **Worker N-1**: Network worker — handles all Send/Recv/ClientSend/ClientRecv tasks +- `SetNumSchedQueues(1)` — client tasks all go to the scheduler worker initially + +### Dynamic Scheduling via `ScheduleTask` + +The `DefaultScheduler` works hand-in-hand with `Container::ScheduleTask`. 
When a task arrives with a `Dynamic` pool query, the container's `ScheduleTask` resolves it before `RuntimeMapTask` picks the worker: + +1. Task submitted with `PoolQuery::Dynamic()` +2. `RouteTask` calls `container->ScheduleTask(task)` → returns e.g. `DirectHash(block_id)` +3. `ResolvePoolQuery` resolves `DirectHash` to a physical node + container +4. `IsTaskLocal` checks if the target is this node +5. `RouteLocal` calls `RuntimeMapTask` to pick the worker + +This two-level design means containers control *what* gets executed where, while the scheduler controls *how* workers are utilized. -### Task Mapping Strategy -- **Client Tasks**: PID+TID hash-based mapping for regular tasks - - Ensures different processes/threads use different lanes - - Network tasks (Send/Recv from admin pool, methods 14/15) are routed to the last worker (network worker) -- **Runtime Tasks**: Tasks execute on the current worker, except periodic Send/Recv tasks which are routed to the network worker +### Task Group Affinity -### Load Balancing -- No active rebalancing (simple design) -- Tasks processed by worker that picks them up +The `DefaultScheduler` implements task group affinity to pin related tasks to the same worker. Each `Container` maintains a `task_group_map_` mapping group IDs to workers: -### Polling Adjustment -- Currently disabled (early return) to avoid hanging issues -- When enabled, implements exponential backoff for idle periodic tasks +1. When `RuntimeMapTask` sees a task with a non-null `task_group_`, it checks the container's `task_group_map_` +2. If the group is already mapped to a worker, the task goes to that worker +3. If not, normal routing selects a worker and records the mapping + +This ensures tasks in the same group (e.g., operations on the same file handle) execute on the same worker, avoiding lock contention and improving cache locality. 
+ +### I/O-Size Routing +- Tasks with `stat_.io_size_ >= 4096` → round-robin across I/O workers +- Tasks with `stat_.io_size_ < 4096` → scheduler worker (worker 0) +- Network tasks (admin pool methods 14, 15, 20, 21) → network worker + +### Force Enqueue + +`RouteTask` accepts a `force_enqueue` parameter (default `false`). When `true`, `RouteLocal` always enqueues the task to the destination worker's lane, even if the destination is the current worker. This is used by `SendRuntime` (non-worker thread path) which cannot execute tasks directly and must always enqueue. ### Code Reference @@ -442,10 +601,12 @@ See implementation in: void MyScheduler::DivideWorkers(WorkOrchestrator *work_orch) { // ... partition workers ... - // CRITICAL: Update IpcManager with worker count IpcManager *ipc = CHI_IPC; if (ipc) { - ipc->SetNumSchedQueues(total_workers); + ipc->SetNumSchedQueues(num_client_lanes); + if (net_worker_) { + ipc->SetNetLane(net_worker_->GetLane()); + } } } ``` @@ -454,9 +615,13 @@ void MyScheduler::DivideWorkers(WorkOrchestrator *work_orch) { ### 2. Route Network Tasks to the Network Worker -Both `ClientMapTask` and `RuntimeMapTask` should route Send/Recv tasks (methods 14/15 from admin pool) to the dedicated network worker (last worker). This prevents network I/O from blocking task processing workers. +Both `ClientMapTask` and `RuntimeMapTask` should route Send/Recv tasks (methods 14/15/20/21 from admin pool) to the dedicated network worker (last worker). This prevents network I/O from blocking task processing workers. + +### 3. Implement Task Group Affinity -### 3. Validate Lane IDs +Use `container->task_group_map_` to pin related tasks to the same worker. Protect map access with `container->task_group_lock_` (read lock for lookups, write lock for updates). + +### 4. 
Validate Lane IDs ```cpp u32 MyScheduler::ClientMapTask(IpcManager *ipc_manager, const Future &task) { @@ -468,7 +633,7 @@ u32 MyScheduler::ClientMapTask(IpcManager *ipc_manager, const Future &task } ``` -### 4. Handle Null Pointers +### 5. Handle Null Pointers ```cpp void MyScheduler::DivideWorkers(WorkOrchestrator *work_orch) { @@ -476,19 +641,20 @@ void MyScheduler::DivideWorkers(WorkOrchestrator *work_orch) { // ... proceed ... } -u32 MyScheduler::RuntimeMapTask(Worker *worker, const Future &task) { +u32 MyScheduler::RuntimeMapTask(Worker *worker, const Future &task, + Container *container) { return worker ? worker->GetId() : 0; } ``` -### 5. Consider Thread Safety +### 6. Consider Thread Safety If your scheduler maintains shared state accessed by multiple workers: -- Use atomic operations for counters -- Use mutexes for complex data structures +- Use atomic operations for counters (e.g., `next_io_idx_`) +- Use `CoRwLock` for complex data structures (e.g., `task_group_map_`) - Prefer lock-free designs when possible -### 6. Test with Different Configurations +### 7. 
Test with Different Configurations Test your scheduler with various `num_threads` values: - Single thread (num_threads = 1): single worker serves dual role @@ -531,6 +697,22 @@ u32 id = worker->GetId(); TaskLane *lane = worker->GetLane(); ``` +### Container Access + +Containers expose scheduling-related state: + +```cpp +// Task group affinity map (per container) +container->task_group_map_ // std::unordered_map +container->task_group_lock_ // CoRwLock protecting the map + +// Override ScheduleTask for custom dynamic scheduling +PoolQuery MyContainer::ScheduleTask(const hipc::FullPtr &task) { + // Transform Dynamic queries into concrete routing modes + return PoolQuery::DirectHash(ComputeHash(task)); +} +``` + ### Logging Use Hermes logging macros: @@ -555,6 +737,35 @@ std::string sched_name = config->GetLocalSched(); ## Advanced Topics +### Custom Container Scheduling + +Override `ScheduleTask` to implement application-specific routing: + +```cpp +class DistributedKVStore : public Container { +public: + PoolQuery ScheduleTask(const hipc::FullPtr &task) override { + switch (task->method_) { + case kGet: + case kPut: + case kDelete: { + // Route key-value ops to the node owning the key's hash partition + u64 key_hash = ExtractKeyHash(task); + return PoolQuery::DirectHash(key_hash); + } + case kScan: + // Range scans may span multiple nodes + return PoolQuery::Range(ExtractStartKey(task), ExtractEndKey(task)); + case kCreateIndex: + // Metadata ops broadcast to all nodes + return PoolQuery::Broadcast(); + default: + return PoolQuery::Local(); + } + } +}; +``` + ### Work Stealing Implement work stealing in `RebalanceWorker`: @@ -563,7 +774,7 @@ Implement work stealing in `RebalanceWorker`: void MyScheduler::RebalanceWorker(Worker *worker) { TaskLane *my_lane = worker->GetLane(); if (my_lane->Empty()) { - for (Worker *victim : scheduler_workers_) { + for (Worker *victim : io_workers_) { if (victim == worker) continue; TaskLane *victim_lane = victim->GetLane(); @@ 
-579,20 +790,6 @@ void MyScheduler::RebalanceWorker(Worker *worker) { } ``` -### Locality-Aware Mapping - -Map tasks based on data locality: - -```cpp -u32 MyScheduler::ClientMapTask(IpcManager *ipc_manager, const Future &task) { - // Extract data location from task - PoolId pool_id = task->pool_id_; - - // Map to worker closest to data - return ComputeLocalityMap(pool_id, ipc_manager->GetNumSchedQueues()); -} -``` - ### Priority-Based Scheduling Use task priorities for scheduling: @@ -625,6 +822,7 @@ void MyScheduler::DivideWorkers(WorkOrchestrator *work_orch) { 1. Did you call `IpcManager::SetNumSchedQueues()` in `DivideWorkers`? 2. Are all workers getting lanes via WorkOrchestrator's 1:1 mapping? 3. Does `ClientMapTask` return lane IDs in valid range? +4. Is `IpcManager::SetNetLane()` called for the network worker? ### Client Mapping Errors @@ -635,6 +833,14 @@ void MyScheduler::DivideWorkers(WorkOrchestrator *work_orch) { 2. Did you check for `num_lanes == 0`? 3. Are you using modulo to wrap lane IDs? +### Tasks Hang After Re-enqueue + +**Symptom**: Tasks enqueued to a different worker never complete + +**Check**: +1. When `RouteLocal` re-enqueues to a different worker, `ProcessNewTask` on the destination worker updates the RunContext's `worker_id_`, `lane_`, and `event_queue_` to match the new worker. If RunContext fields are stale, subtask completion events go to the wrong worker. +2. Check that `TASK_STARTED` is respected — re-enqueued tasks with live coroutines must be resumed, not restarted. 
+ ### Worker Crashes **Symptom**: Workers crash during initialization @@ -648,6 +854,7 @@ void MyScheduler::DivideWorkers(WorkOrchestrator *work_orch) { - **Scheduler Interface**: `context-runtime/include/chimaera/scheduler/scheduler.h` - **DefaultScheduler**: `context-runtime/src/scheduler/default_sched.cc` +- **Container Base**: `context-runtime/include/chimaera/container.h` +- **IpcManager (RouteTask/RouteLocal)**: `context-runtime/src/ipc_manager.cc` - **WorkOrchestrator**: `context-runtime/src/work_orchestrator.cc` -- **IpcManager**: `context-runtime/src/ipc_manager.cc` - **Configuration**: [Configuration Reference](../../deployment/configuration) diff --git a/docs/sdk/context-transport-primitives/1.allocator/allocator_guide.md b/docs/sdk/context-transport-primitives/1.allocator/allocator_guide.md index b9c95e6..892edeb 100644 --- a/docs/sdk/context-transport-primitives/1.allocator/allocator_guide.md +++ b/docs/sdk/context-transport-primitives/1.allocator/allocator_guide.md @@ -339,4 +339,5 @@ wait $RANK0_PID $RANK1_PID $RANK2_PID ## Related Documentation - [Memory Backends Guide](./memory_backend_guide) - Backends that provide memory regions for these allocators -- [Data Structures Guide](../types/data_structures_guide) - Data structures that use these allocators +- [Vector Guide](../types/vector_guide) - Shared-memory vectors that use these allocators +- [Ring Buffer Guide](../types/ring_buffer_guide) - Lock-free circular queues diff --git a/docs/sdk/context-transport-primitives/1.allocator/memory_backend_guide.md b/docs/sdk/context-transport-primitives/1.allocator/memory_backend_guide.md index 5d0a076..1ab155f 100644 --- a/docs/sdk/context-transport-primitives/1.allocator/memory_backend_guide.md +++ b/docs/sdk/context-transport-primitives/1.allocator/memory_backend_guide.md @@ -135,4 +135,4 @@ GPU backends are only compiled when CUDA or ROCm is enabled: ## Related Documentation - [Allocator Guide](./allocator_guide) - Allocators that manage memory from 
these backends -- [Data Structures Guide](../types/data_structures_guide) - Data structures that use these allocators +- [Vector Guide](../types/vector_guide) - Shared-memory vectors that use these allocators diff --git a/docs/sdk/context-transport-primitives/2.types/data_structures_guide.md b/docs/sdk/context-transport-primitives/2.types/data_structures_guide.md deleted file mode 100644 index 5bd011e..0000000 --- a/docs/sdk/context-transport-primitives/2.types/data_structures_guide.md +++ /dev/null @@ -1,335 +0,0 @@ -# Data Structures Guide - -## Overview - -HSHM provides data structures designed for shared memory and GPU compatibility. These are alternatives to STL containers for use cases requiring cross-process sharing or GPU kernel access. - -For standard ChiMod development, use `std::string` and `std::vector`. The HSHM data structures below are needed when: -- Data must be accessible from GPU kernels -- Data must live in shared memory across processes -- You need lock-free concurrent queues - -## Vector - -HSHM provides two vector variants: `hshm::ipc::vector` for shared memory and `hshm::priv::vector` for private memory. - -### hshm::ipc::vector - -**Source:** `hermes_shm/data_structures/ipc/vector.h` - -A dynamic array stored in shared memory using offset-based pointers (`OffsetPtr`) for process-independent addressing. 
- -```cpp -#include - -// Create with an allocator -hshm::ipc::vector vec(alloc, 10); // 10 elements - -// Standard vector operations -vec.push_back(42); -vec.emplace_back(100); -int val = vec[0]; -vec.resize(20); -vec.reserve(50); -vec.clear(); - -// Iteration -for (auto it = vec.begin(); it != vec.end(); ++it) { - process(*it); -} -``` - -**Template Parameters:** -- `T` - Element type -- `AllocT` - Allocator type (determines shared vs private memory) - -**Key Differences from std::vector:** -- Requires an allocator at construction time -- Uses `OffsetPtr` internally instead of raw pointers -- Safe for cross-process access in shared memory -- Annotated with `HSHM_CROSS_FUN` for GPU compatibility - -### hshm::priv::vector - -**Source:** `hermes_shm/data_structures/priv/vector.h` - -A private-memory vector with allocator integration. Supports the same API as `std::vector` plus serialization. - -```cpp -#include - -// Standard construction -hshm::priv::vector vec = {1, 2, 3, 4, 5}; -hshm::priv::vector vec2(10, 0); // 10 zeros - -// Full STL-compatible API -vec.push_back(6); -vec.pop_back(); -vec.insert(vec.begin() + 2, 99); -vec.erase(vec.begin()); - -// Reverse iteration -for (auto it = vec.rbegin(); it != vec.rend(); ++it) { - process(*it); -} -``` - -**Optimizations:** -- Uses `memcpy`/`memmove` for trivially copyable types (POD optimization) -- Exponential capacity growth strategy -- Annotated with `HSHM_CROSS_FUN` for GPU compatibility - -### When to Use Each - -| Variant | Use Case | -|---------|----------| -| `std::vector` | Default choice for ChiMod task data | -| `hshm::priv::vector` | Private memory with serialization support or GPU access | -| `hshm::ipc::vector` | Cross-process shared memory regions | - -## Ring Buffer - -**Source:** `hermes_shm/data_structures/ipc/ring_buffer.h` - -A lock-free circular queue for concurrent producer-consumer patterns. Configurable via compile-time flags. 
- -### Configuration Flags - -```cpp -namespace hshm::ipc { -enum RingQueueFlag { - RING_BUFFER_SPSC_FLAGS = 0x01, // Single Producer Single Consumer - RING_BUFFER_MPSC_FLAGS = 0x02, // Multiple Producer Single Consumer - RING_BUFFER_WAIT_FOR_SPACE = 0x04, // Block until space available - RING_BUFFER_ERROR_ON_NO_SPACE = 0x08, // Return error if full - RING_BUFFER_DYNAMIC_SIZE = 0x10, // Resize when full - RING_BUFFER_FIXED_SIZE = 0x20, // Fixed-size buffer -}; -} -``` - -### Pre-defined Type Aliases - -| Alias | Flags | Description | -|-------|-------|-------------| -| `spsc_ring_buffer` | SPSC + Fixed + Error | Single-producer single-consumer, fixed size | -| `mpsc_ring_buffer` | MPSC + Fixed + Wait | Multi-producer single-consumer, blocks when full | -| `circular_mpsc_ring_buffer` | MPSC + Fixed + Error | Multi-producer single-consumer, wraps around | -| `ext_ring_buffer` | MPSC + Dynamic + Wait | Extensible, resizes when full | - -### Usage - -```cpp -#include - -// Create a fixed-size SPSC ring buffer with depth 1024 -hshm::ipc::spsc_ring_buffer rb(alloc, 1024); - -// Producer -rb.Push(42); -rb.Emplace(100); - -// Consumer -int val; -if (rb.Pop(val)) { - // Got value -} - -// Query state -size_t count = rb.Size(); -bool empty = rb.Empty(); -bool full = rb.Full(); -``` - -### RingBufferEntry - -Each entry has an atomic ready flag for lock-free synchronization: - -```cpp -template -struct RingBufferEntry { - bool IsReady(); // Check if entry has data - void SetReady(); // Mark entry as containing data - void ClearReady(); // Mark entry as consumed - T& GetData(); // Access the entry data -}; -``` - -### Internal Design - -- Uses atomic head/tail pointers for lock-free operation -- Head is the consumer pointer, tail is the producer pointer -- Queue capacity is `depth + 1` to distinguish full from empty -- MPSC mode uses atomic tail with CAS for concurrent producers -- SPSC mode uses non-atomic pointers for maximum performance -- Includes worker metadata: 
`assigned_worker_id_`, `signal_fd_`, `tid_`, `active_` - -## String - -**Source:** `hermes_shm/data_structures/priv/string.h` - -An SSO (Short String Optimization) string backed by `hshm::priv::vector`. - -```cpp -#include - -// Construction -hshm::string s1("hello"); -hshm::string s2(std::string("world")); -hshm::string s3(s1); // Copy - -// Standard string API -s1.append(" world"); -s1 += "!"; -size_t pos = s1.find("world"); -hshm::string sub = s1.substr(0, 5); -bool eq = (s1 == s2); - -// Access -const char* cstr = s1.c_str(); -char ch = s1[0]; -size_t len = s1.size(); - -// Conversion to/from std::string -std::string std_str = s1.str(); -std::string std_str2 = static_cast(s1); -``` - -**Template Parameters:** -- `T` - Character type (default: `char`) -- `AllocT` - Allocator type -- `SSOSize` - Short string buffer size (default: 32 bytes) - -**Key Features:** -- Short strings (32 bytes or fewer) stored inline without heap allocation -- Longer strings use `hshm::priv::vector` as backing store -- Full `std::string`-compatible API: `find`, `substr`, `replace`, `starts_with`, `ends_with` -- Annotated with `HSHM_CROSS_FUN` for GPU compatibility -- Serialization support via `save()`/`load()` - -**Type Alias:** `hshm::string` is a convenience alias for `hshm::priv::basic_string`. - -## Unordered Map (Vector of Lists) - -**Source:** `hermes_shm/data_structures/priv/unordered_map_ll.h` - -A hash map implementation using a vector of lists design that provides efficient concurrent access when combined with external locking. Each bucket contains a `std::list` of key-value pairs; the hash space is partitioned across a fixed number of buckets set at construction time. 
- -**Key Characteristics:** -- **Vector of Lists Design**: Uses a vector of buckets, each containing a list of key-value pairs -- **External Locking Required**: No internal mutexes - users must provide synchronization -- **Bucket Partitioning**: Hash space is partitioned across multiple buckets for better cache locality -- **Standard API**: Compatible with `std::unordered_map` interface -- **NOT Shared-Memory Compatible**: For runtime-only data structures, not task parameters - -### Basic Usage - -```cpp -#include - -// Create map with 32 buckets -hshm::priv::unordered_map_ll map(32); - -// Insert -auto [inserted, ptr] = map.insert(1, "hello"); -map.insert_or_assign(2, "world"); -map[3] = "foo"; - -// Lookup -std::string* val = map.find(1); // Returns nullptr if not found -const std::string& ref = map.at(2); // Throws if not found -bool exists = map.contains(3); - -// Remove -map.erase(1); -map.clear(); - -// Iterate -map.for_each([](const int& key, std::string& value) { - // Process each element -}); -``` - -### Constructor - -```cpp -hshm::priv::unordered_map_ll map(max_concurrency); -``` - -**Parameters:** -- `max_concurrency`: Number of buckets (default: 16). Higher values give better distribution at the cost of more memory. Typical values: 16-64. 
- -### API Reference - -```cpp -// Insertion operations -auto [inserted, value_ptr] = map.insert(key, value); // Insert if not exists -auto [inserted, value_ptr] = map.insert_or_assign(key, value); // Insert or update -T& ref = map[key]; // Insert default if missing - -// Lookup operations -T* ptr = map.find(key); // Returns nullptr if not found -const T* ptr = map.find(key) const; // Const version -T& ref = map.at(key); // Throws if not found -bool exists = map.contains(key); // Check existence -size_t count = map.count(key); // Returns 0 or 1 - -// Removal operations -size_t erased = map.erase(key); // Returns number of elements erased -map.clear(); // Remove all elements - -// Size operations -size_t s = map.size(); // Total element count -bool e = map.empty(); // Check if empty -size_t b = map.bucket_count(); // Number of buckets - -// Iteration -map.for_each([](const Key& key, T& value) { /* ... */ }); -``` - -Insert operations return `std::pair` where `first` is `true` if insertion occurred and `second` is a pointer to the value. 
- -### Key Differences from std::unordered_map - -| Feature | std::unordered_map | hshm::priv::unordered_map_ll | -|---------|-------------------|----------------------| -| Internal Structure | Implementation-defined | Vector of lists (explicit) | -| Bucket Count | Dynamic rehashing | Fixed at construction | -| Iterator Stability | Unstable across insertions | Stable (list-based) | -| Shared Memory | Not compatible | Not compatible | -| Return Values | Iterators | Pointers to values | - -### When to Use - -| Scenario | Recommendation | -|----------|---------------| -| Runtime container data structures (caches, registries) | `hshm::priv::unordered_map_ll` | -| Task input/output parameters | `std::unordered_map` or `chi::ipc::` types | -| Client-side code | `std::unordered_map` | -| Data requiring serialization | `std::unordered_map` with cereal | - -## GPU Compatibility - -All HSHM data structures use cross-platform annotations for CPU/GPU compilation: - -| Annotation | Purpose | -|-----------|---------| -| `HSHM_INLINE_CROSS_FUN` | Inline function callable from both CPU and GPU | -| `HSHM_CROSS_FUN` | Function callable from both CPU and GPU | -| `HSHM_IS_HOST` | Compile-time check: true when compiling for CPU | -| `HSHM_IS_GPU` | Compile-time check: true when compiling for GPU | - -These annotations expand to CUDA `__host__ __device__` or HIP equivalents when GPU support is enabled, and are no-ops on CPU-only builds. 
- -```cpp -// Example: Method accessible from both CPU and GPU -HSHM_INLINE_CROSS_FUN -T& vector::operator[](size_t index) { - return data_[index]; -} -``` - -## Related Documentation - -- [Allocator Guide](../allocator/allocator_guide) - Memory allocators used by these data structures -- [Atomic Types Guide](./atomic_types_guide) - Atomic primitives used in ring buffers diff --git a/docs/sdk/context-transport-primitives/2.types/ring_buffer_guide.md b/docs/sdk/context-transport-primitives/2.types/ring_buffer_guide.md new file mode 100644 index 0000000..1ed5e97 --- /dev/null +++ b/docs/sdk/context-transport-primitives/2.types/ring_buffer_guide.md @@ -0,0 +1,87 @@ +--- +sidebar_position: 2 +--- + +# Ring Buffer Guide + +## Overview + +**Source:** `hermes_shm/data_structures/ipc/ring_buffer.h` + +A lock-free circular queue for concurrent producer-consumer patterns. Configurable via compile-time flags. + +## Configuration Flags + +```cpp +namespace hshm::ipc { +enum RingQueueFlag { + RING_BUFFER_SPSC_FLAGS = 0x01, // Single Producer Single Consumer + RING_BUFFER_MPSC_FLAGS = 0x02, // Multiple Producer Single Consumer + RING_BUFFER_WAIT_FOR_SPACE = 0x04, // Block until space available + RING_BUFFER_ERROR_ON_NO_SPACE = 0x08, // Return error if full + RING_BUFFER_DYNAMIC_SIZE = 0x10, // Resize when full + RING_BUFFER_FIXED_SIZE = 0x20, // Fixed-size buffer +}; +} +``` + +## Pre-defined Type Aliases + +| Alias | Flags | Description | +|-------|-------|-------------| +| `spsc_ring_buffer` | SPSC + Fixed + Error | Single-producer single-consumer, fixed size | +| `mpsc_ring_buffer` | MPSC + Fixed + Wait | Multi-producer single-consumer, blocks when full | +| `circular_mpsc_ring_buffer` | MPSC + Fixed + Error | Multi-producer single-consumer, wraps around | +| `ext_ring_buffer` | MPSC + Dynamic + Wait | Extensible, resizes when full | + +## Usage + +```cpp +#include + +// Create a fixed-size SPSC ring buffer with depth 1024 +hshm::ipc::spsc_ring_buffer rb(alloc, 1024); + +// 
Producer +rb.Push(42); +rb.Emplace(100); + +// Consumer +int val; +if (rb.Pop(val)) { + // Got value +} + +// Query state +size_t count = rb.Size(); +bool empty = rb.Empty(); +bool full = rb.Full(); +``` + +## RingBufferEntry + +Each entry has an atomic ready flag for lock-free synchronization: + +```cpp +template +struct RingBufferEntry { + bool IsReady(); // Check if entry has data + void SetReady(); // Mark entry as containing data + void ClearReady(); // Mark entry as consumed + T& GetData(); // Access the entry data +}; +``` + +## Internal Design + +- Uses atomic head/tail pointers for lock-free operation +- Head is the consumer pointer, tail is the producer pointer +- Queue capacity is `depth + 1` to distinguish full from empty +- MPSC mode uses atomic tail with CAS for concurrent producers +- SPSC mode uses non-atomic pointers for maximum performance +- Includes worker metadata: `assigned_worker_id_`, `signal_fd_`, `tid_`, `active_` + +## Related Documentation + +- [Allocator Guide](../allocator/allocator_guide) - Memory allocators used by ring buffers +- [Atomic Types Guide](./atomic_types_guide) - Atomic primitives used in ring buffers diff --git a/docs/sdk/context-transport-primitives/2.types/string_guide.md b/docs/sdk/context-transport-primitives/2.types/string_guide.md new file mode 100644 index 0000000..e5fa671 --- /dev/null +++ b/docs/sdk/context-transport-primitives/2.types/string_guide.md @@ -0,0 +1,54 @@ +--- +sidebar_position: 3 +--- + +# String Guide + +## Overview + +**Source:** `hermes_shm/data_structures/priv/string.h` + +An SSO (Short String Optimization) string backed by `hshm::priv::vector`. Short strings (32 bytes or fewer) are stored inline without heap allocation. 
+
+## Usage
+
+```cpp
+#include <hermes_shm/data_structures/priv/string.h>
+
+// Construction
+hshm::string s1("hello");
+hshm::string s2(std::string("world"));
+hshm::string s3(s1); // Copy
+
+// Standard string API
+s1.append(" world");
+s1 += "!";
+size_t pos = s1.find("world");
+hshm::string sub = s1.substr(0, 5);
+bool eq = (s1 == s2);
+
+// Access
+const char* cstr = s1.c_str();
+char ch = s1[0];
+size_t len = s1.size();
+
+// Conversion to/from std::string
+std::string std_str = s1.str();
+std::string std_str2 = static_cast<std::string>(s1);
+```
+
+## Template Parameters
+
+- `T` - Character type (default: `char`)
+- `AllocT` - Allocator type
+- `SSOSize` - Short string buffer size (default: 32 bytes)
+
+## Key Features
+
+- Short strings (32 bytes or fewer) stored inline without heap allocation
+- Longer strings use `hshm::priv::vector` as backing store
+- Full `std::string`-compatible API: `find`, `substr`, `replace`, `starts_with`, `ends_with`
+- Annotated with `HSHM_CROSS_FUN` for GPU compatibility
+- Serialization support via `save()`/`load()`
+
+**Type Alias:** `hshm::string` is a convenience alias for `hshm::priv::basic_string`.
diff --git a/docs/sdk/context-transport-primitives/2.types/unordered_map_guide.md b/docs/sdk/context-transport-primitives/2.types/unordered_map_guide.md
new file mode 100644
index 0000000..b51a8f4
--- /dev/null
+++ b/docs/sdk/context-transport-primitives/2.types/unordered_map_guide.md
@@ -0,0 +1,104 @@
+---
+sidebar_position: 4
+---
+
+# Unordered Map Guide
+
+## Overview
+
+**Source:** `hermes_shm/data_structures/priv/unordered_map_ll.h`
+
+A hash map implementation using a vector of lists design that provides efficient concurrent access when combined with external locking. Each bucket contains a `std::list` of key-value pairs; the hash space is partitioned across a fixed number of buckets set at construction time.
+
+**Key Characteristics:**
+- **Vector of Lists Design**: Uses a vector of buckets, each containing a list of key-value pairs
+- **External Locking Required**: No internal mutexes — users must provide synchronization
+- **Bucket Partitioning**: Hash space is partitioned across multiple buckets for better cache locality
+- **Standard API**: Compatible with `std::unordered_map` interface
+- **NOT Shared-Memory Compatible**: For runtime-only data structures, not task parameters
+
+## Basic Usage
+
+```cpp
+#include <hermes_shm/data_structures/priv/unordered_map_ll.h>
+
+// Create map with 32 buckets
+hshm::priv::unordered_map_ll<int, std::string> map(32);
+
+// Insert
+auto [inserted, ptr] = map.insert(1, "hello");
+map.insert_or_assign(2, "world");
+map[3] = "foo";
+
+// Lookup
+std::string* val = map.find(1); // Returns nullptr if not found
+const std::string& ref = map.at(2); // Throws if not found
+bool exists = map.contains(3);
+
+// Remove
+map.erase(1);
+map.clear();
+
+// Iterate
+map.for_each([](const int& key, std::string& value) {
+ // Process each element
+});
+```
+
+## Constructor
+
+```cpp
+hshm::priv::unordered_map_ll<Key, T> map(max_concurrency);
+```
+
+**Parameters:**
+- `max_concurrency`: Number of buckets (default: 16). Higher values give better distribution at the cost of more memory. Typical values: 16-64.
+
+## API Reference
+
+```cpp
+// Insertion operations
+auto [inserted, value_ptr] = map.insert(key, value); // Insert if not exists
+auto [inserted, value_ptr] = map.insert_or_assign(key, value); // Insert or update
+T& ref = map[key]; // Insert default if missing
+
+// Lookup operations
+T* ptr = map.find(key); // Returns nullptr if not found
+const T* ptr = map.find(key) const; // Const version
+T& ref = map.at(key); // Throws if not found
+bool exists = map.contains(key); // Check existence
+size_t count = map.count(key); // Returns 0 or 1
+
+// Removal operations
+size_t erased = map.erase(key); // Returns number of elements erased
+map.clear(); // Remove all elements
+
+// Size operations
+size_t s = map.size(); // Total element count
+bool e = map.empty(); // Check if empty
+size_t b = map.bucket_count(); // Number of buckets
+
+// Iteration
+map.for_each([](const Key& key, T& value) { /* ... */ });
+```
+
+Insert operations return `std::pair<bool, T*>` where `first` is `true` if insertion occurred and `second` is a pointer to the value.
+ +## Key Differences from std::unordered_map + +| Feature | std::unordered_map | hshm::priv::unordered_map_ll | +|---------|-------------------|----------------------| +| Internal Structure | Implementation-defined | Vector of lists (explicit) | +| Bucket Count | Dynamic rehashing | Fixed at construction | +| Iterator Stability | Unstable across insertions | Stable (list-based) | +| Shared Memory | Not compatible | Not compatible | +| Return Values | Iterators | Pointers to values | + +## When to Use + +| Scenario | Recommendation | +|----------|---------------| +| Runtime container data structures (caches, registries) | `hshm::priv::unordered_map_ll` | +| Task input/output parameters | `std::unordered_map` or `chi::ipc::` types | +| Client-side code | `std::unordered_map` | +| Data requiring serialization | `std::unordered_map` with cereal | diff --git a/docs/sdk/context-transport-primitives/2.types/vector_guide.md b/docs/sdk/context-transport-primitives/2.types/vector_guide.md new file mode 100644 index 0000000..98f7fd3 --- /dev/null +++ b/docs/sdk/context-transport-primitives/2.types/vector_guide.md @@ -0,0 +1,87 @@ +--- +sidebar_position: 1 +--- + +# Vector Guide + +## Overview + +HSHM provides two vector variants: `hshm::ipc::vector` for shared memory and `hshm::priv::vector` for private memory. For standard ChiMod development, use `std::vector`. The HSHM vectors are needed when data must be accessible from GPU kernels or live in shared memory across processes. + +## hshm::ipc::vector + +**Source:** `hermes_shm/data_structures/ipc/vector.h` + +A dynamic array stored in shared memory using offset-based pointers (`OffsetPtr`) for process-independent addressing. 
+
+```cpp
+#include <hermes_shm/data_structures/ipc/vector.h>
+
+// Create with an allocator
+hshm::ipc::vector<int> vec(alloc, 10); // 10 elements
+
+// Standard vector operations
+vec.push_back(42);
+vec.emplace_back(100);
+int val = vec[0];
+vec.resize(20);
+vec.reserve(50);
+vec.clear();
+
+// Iteration
+for (auto it = vec.begin(); it != vec.end(); ++it) {
+ process(*it);
+}
+```
+
+**Template Parameters:**
+- `T` - Element type
+- `AllocT` - Allocator type (determines shared vs private memory)
+
+**Key Differences from std::vector:**
+- Requires an allocator at construction time
+- Uses `OffsetPtr` internally instead of raw pointers
+- Safe for cross-process access in shared memory
+- Annotated with `HSHM_CROSS_FUN` for GPU compatibility
+
+## hshm::priv::vector
+
+**Source:** `hermes_shm/data_structures/priv/vector.h`
+
+A private-memory vector with allocator integration. Supports the same API as `std::vector` plus serialization.
+
+```cpp
+#include <hermes_shm/data_structures/priv/vector.h>
+
+// Standard construction
+hshm::priv::vector<int> vec = {1, 2, 3, 4, 5};
+hshm::priv::vector<int> vec2(10, 0); // 10 zeros
+
+// Full STL-compatible API
+vec.push_back(6);
+vec.pop_back();
+vec.insert(vec.begin() + 2, 99);
+vec.erase(vec.begin());
+
+// Reverse iteration
+for (auto it = vec.rbegin(); it != vec.rend(); ++it) {
+ process(*it);
+}
+```
+
+**Optimizations:**
+- Uses `memcpy`/`memmove` for trivially copyable types (POD optimization)
+- Exponential capacity growth strategy
+- Annotated with `HSHM_CROSS_FUN` for GPU compatibility
+
+## When to Use Each
+
+| Variant | Use Case |
+|---------|----------|
+| `std::vector` | Default choice for ChiMod task data |
+| `hshm::priv::vector` | Private memory with serialization support or GPU access |
+| `hshm::ipc::vector` | Cross-process shared memory regions |
+
+## Related Documentation
+
+- [Allocator Guide](../allocator/allocator_guide) - Memory allocators used by these vectors
diff --git a/docusaurus.config.ts b/docusaurus.config.ts
index 9b1f119..826b45b 100644
--- a/docusaurus.config.ts
+++
b/docusaurus.config.ts @@ -131,7 +131,7 @@ const config: Config = { title: 'Community', items: [ {label: 'GitHub', href: 'https://github.com/iowarp'}, - {label: 'Issues', href: 'https://github.com/iowarp/iowarp/issues'}, + {label: 'Issues', href: 'https://github.com/iowarp/clio-core/issues'}, {label: 'Discussions', href: 'https://github.com/orgs/iowarp/discussions'}, {label: 'Zulip Chat', href: 'https://iowarp.zulipchat.com'}, ],