diff --git a/docs/api/agents.md b/docs/api/agents.md index bc8073e..1257a30 100644 --- a/docs/api/agents.md +++ b/docs/api/agents.md @@ -27,4 +27,4 @@ AI agents currently interact with IOWarp through: - [CLIO Agent Overview](https://iowarp.ai/platform/clio-agent/) — Architecture and design - [CLIO Kit](https://docs.iowarp.ai) — Interactive MCP server showcase -- [GitHub Issues](https://github.com/iowarp/iowarp/issues) — Request API features +- [GitHub Issues](https://github.com/iowarp/clio-core/issues) — Request API features diff --git a/docs/deployment/configuration.md b/docs/deployment/configuration.md index b91b9b1..22a6bc2 100644 --- a/docs/deployment/configuration.md +++ b/docs/deployment/configuration.md @@ -37,7 +37,7 @@ Size values throughout the file accept: `B`, `KB`, `MB`, `GB`, `TB` (case-insens | Parameter | Default | Description | |-----------|---------|-------------| -| `port` | `5555` | ZeroMQ RPC listener port. Must match across all cluster nodes. | +| `port` | `9413` | ZeroMQ RPC listener port. Must match across all cluster nodes. Can be overridden by `CHI_PORT` env var. | | `neighborhood_size` | `32` | Maximum nodes queried when splitting range queries. | | `hostfile` | *(none)* | Path to a file listing cluster node IPs/hostnames, one per line. Required for multi-node deployments. | | `wait_for_restart` | `30` | Seconds to wait for peer nodes during startup. | @@ -45,7 +45,7 @@ Size values throughout the file accept: `B`, `KB`, `MB`, `GB`, `TB` (case-insens ```yaml networking: - port: 5555 + port: 9413 neighborhood_size: 32 # hostfile: /etc/iowarp/hostfile # Multi-node only wait_for_restart: 30 @@ -261,7 +261,7 @@ All fields are optional and override compile-time defaults. 
```yaml networking: - port: 5555 + port: 9413 runtime: num_threads: 4 @@ -291,7 +291,7 @@ compose: ```yaml networking: - port: 5555 + port: 9413 runtime: num_threads: 16 @@ -334,7 +334,7 @@ compose: ```yaml networking: - port: 5555 + port: 9413 neighborhood_size: 32 hostfile: /etc/iowarp/hostfile @@ -383,7 +383,7 @@ services: volumes: - ./chimaera.yaml:/home/iowarp/.chimaera/chimaera.yaml:ro ports: - - "5555:5555" + - "9413:9413" mem_limit: 8g command: ["chimaera", "runtime", "start"] restart: unless-stopped diff --git a/docs/deployment/hpc-cluster.md b/docs/deployment/hpc-cluster.md index b3e0c5a..af98a29 100644 --- a/docs/deployment/hpc-cluster.md +++ b/docs/deployment/hpc-cluster.md @@ -42,6 +42,13 @@ The following environment variables control runtime behavior. Set them before st export CHI_SERVER_CONF=/etc/iowarp/config.yaml ``` +### Networking Overrides + +| Variable | Default | Description | +|----------|---------|-------------| +| `CHI_PORT` | `9413` | Override the RPC port. Takes priority over the YAML `networking.port` setting. | +| `CHI_SERVER_ADDR` | `127.0.0.1` | Override the server address that clients connect to. | + ### IPC Transport Mode | Variable | Default | Description | @@ -107,7 +114,7 @@ Reference it in your config: ```yaml networking: - port: 5555 + port: 9413 hostfile: /etc/iowarp/hostfile ``` diff --git a/docs/faq.md b/docs/faq.md index bc079e7..2dd0a98 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -19,7 +19,7 @@ IOWarp is released under the [BSD 3-Clause License](https://opensource.org/licen ### Where can I get help? 
- **Zulip Chat**: [iowarp.zulipchat.com](https://iowarp.zulipchat.com) -- **GitHub Issues**: [github.com/iowarp/iowarp/issues](https://github.com/iowarp/iowarp/issues) +- **GitHub Issues**: [github.com/iowarp/clio-core/issues](https://github.com/iowarp/clio-core/issues) - **Discussions**: [github.com/orgs/iowarp/discussions](https://github.com/orgs/iowarp/discussions) - **Email**: grc@illinoistech.edu diff --git a/docs/getting-started/installation.mdx b/docs/getting-started/installation.mdx index f0582e9..3baa1d2 100644 --- a/docs/getting-started/installation.mdx +++ b/docs/getting-started/installation.mdx @@ -19,8 +19,8 @@ This uses conda internally to manage dependencies and produces a full-featured build with all optional components. ```bash -git clone --recurse-submodules https://github.com/iowarp/iowarp-core.git -cd iowarp-core +git clone --recurse-submodules https://github.com/iowarp/clio-core.git +cd clio-core ./install.sh ``` @@ -73,7 +73,7 @@ docker pull iowarp/deploy-cpu:latest Run the container: ```bash -docker run -d -p 5555:5555 --memory=8g --name iowarp iowarp/deploy-cpu:latest chimaera runtime start +docker run -d -p 9413:9413 --memory=8g --name iowarp iowarp/deploy-cpu:latest chimaera runtime start ``` ### Using Docker Compose @@ -89,7 +89,7 @@ services: volumes: - ./chimaera.yaml:/home/iowarp/.chimaera/chimaera.yaml:ro ports: - - "5555:5555" + - "9413:9413" mem_limit: 8g command: ["chimaera", "runtime", "start"] restart: unless-stopped @@ -120,8 +120,9 @@ echo ". ${PWD}/share/spack/setup-env.sh" >> ~/.bashrc 2. Add the IOWarp repository: ```bash -git clone https://github.com/iowarp/iowarp-install.git -spack repo add iowarp-install/iowarp-spack +git clone --recurse-submodules https://github.com/iowarp/clio-core.git +cd clio-core +spack repo add installers/spack ``` 3. 
Install IOWarp: diff --git a/docs/getting-started/quick-start.mdx b/docs/getting-started/quick-start.mdx index d43b27e..26c8b88 100644 --- a/docs/getting-started/quick-start.mdx +++ b/docs/getting-started/quick-start.mdx @@ -74,7 +74,7 @@ The runtime resolves its configuration in this order: | 2 | `~/.chimaera/chimaera.yaml` | | 3 | Built-in defaults | -The default configuration starts **4 worker threads** on **port 5555** and +The default configuration starts **4 worker threads** on **port 9413** and composes three modules automatically: | Module | Purpose | @@ -175,6 +175,8 @@ Done! |----------|-------------| | `CHI_SERVER_CONF` | Path to YAML configuration file (highest priority) | | `CHI_IPC_MODE` | Transport: `SHM` (shared memory), `TCP` (default), `IPC` (Unix socket) | +| `CHI_PORT` | Override the RPC port (default: `9413`). Takes priority over YAML config. | +| `CHI_SERVER_ADDR` | Override the server address clients connect to (default: `127.0.0.1`). | | `HSHM_LOG_LEVEL` | Logging verbosity: `debug`, `info`, `warning`, `error`, `fatal` | ## Next Steps @@ -190,7 +192,7 @@ Done! ## Support -- Open an issue on the [GitHub repository](https://github.com/iowarp/core) +- Open an issue on the [GitHub repository](https://github.com/iowarp/clio-core) - Join the [Zulip Chat](https://iowarp.zulipchat.com) - Visit the [IOWarp website](https://iowarp.ai) - Email: grc@illinoistech.edu diff --git a/docs/intro.md b/docs/intro.md index 9a57fef..3c8d657 100644 --- a/docs/intro.md +++ b/docs/intro.md @@ -44,6 +44,6 @@ IOWarp consists of three layers: ## Need Help? 
- **Community** — [Zulip Chat](https://iowarp.zulipchat.com) -- **Bug Reports** — [GitHub Issues](https://github.com/iowarp/iowarp/issues) +- **Bug Reports** — [GitHub Issues](https://github.com/iowarp/clio-core/issues) - **Feature Requests** — [GitHub Discussions](https://github.com/orgs/iowarp/discussions) - **Email** — grc@illinoistech.edu diff --git a/docs/sdk/context-runtime/2.module_dev_guide.md b/docs/sdk/context-runtime/2.module_dev_guide.md index 94671d4..3ceeb9f 100644 --- a/docs/sdk/context-runtime/2.module_dev_guide.md +++ b/docs/sdk/context-runtime/2.module_dev_guide.md @@ -2407,7 +2407,7 @@ struct SerializableTask : public chi::Task { ``` :::note GPU-Compatible Data Structures -For GPU-compatible modules, HSHM provides shared-memory data structures (`hshm::string`, `hshm::ipc::vector`, `hshm::ipc::ring_buffer`) with cross-platform annotations. See the [Data Structures Guide](/docs/sdk/context-transport-primitives/types/data_structures_guide) for details. +For GPU-compatible modules, HSHM provides shared-memory data structures (`hshm::string`, `hshm::ipc::vector`, `hshm::ipc::ring_buffer`) with cross-platform annotations. See the [Vector Guide](/docs/sdk/context-transport-primitives/types/vector_guide), [Ring Buffer Guide](/docs/sdk/context-transport-primitives/types/ring_buffer_guide), and [String Guide](/docs/sdk/context-transport-primitives/types/string_guide) for details. 
::: ### Bulk Transfer Support with ar.bulk diff --git a/docs/sdk/context-runtime/3.module_test_guide.md b/docs/sdk/context-runtime/3.module_test_guide.md index 747234b..c2541c6 100644 --- a/docs/sdk/context-runtime/3.module_test_guide.md +++ b/docs/sdk/context-runtime/3.module_test_guide.md @@ -33,7 +33,7 @@ A minimal test configuration: ```yaml networking: - port: 5555 + port: 9413 runtime: num_threads: 4 @@ -181,7 +181,7 @@ std::string CreateComposeConfig() { << " num_threads: 4\n" << "\n" << "networking:\n" - << " port: 5555\n" + << " port: 9413\n" << "\n" << "compose:\n" << "- mod_name: chimaera_bdev\n" @@ -405,7 +405,7 @@ ctest -R my_module rm -f /dev/shm/chimaera_* ``` -6. **Kill stale processes on port conflicts**: Tests bind to port 5555. If a previous run left a zombie: +6. **Kill stale processes on port conflicts**: Tests bind to port 9413. If a previous run left a zombie: ```bash - sudo kill -9 $(sudo lsof -t -i :5555) 2>/dev/null + sudo kill -9 $(sudo lsof -t -i :9413) 2>/dev/null ``` diff --git a/docs/sdk/context-runtime/5.scheduler.md b/docs/sdk/context-runtime/5.scheduler.md index 4aeaa7c..22111d6 100644 --- a/docs/sdk/context-runtime/5.scheduler.md +++ b/docs/sdk/context-runtime/5.scheduler.md @@ -2,17 +2,18 @@ ## Overview -The Chimaera runtime uses a pluggable scheduler architecture to control how tasks are mapped to workers and how workers are organized. This document explains how to build custom schedulers for the IOWarp runtime. +The Chimaera runtime uses a pluggable scheduler architecture to control how tasks are mapped to workers and how workers are organized. Scheduling happens at two levels: **container-level** scheduling resolves *what* to execute (via `Container::ScheduleTask`), and the **Scheduler** decides *where* to execute it (which worker thread). This document explains both levels and how to build custom schedulers. ## Table of Contents 1. [Architecture Overview](#architecture-overview) -2. 
[Scheduler Interface](#scheduler-interface) -3. [Worker Lifecycle](#worker-lifecycle) -4. [Implementing a Custom Scheduler](#implementing-a-custom-scheduler) -5. [DefaultScheduler Example](#defaultscheduler-example) -6. [Best Practices](#best-practices) -7. [Integration Points](#integration-points) +2. [Two-Level Scheduling](#two-level-scheduling) +3. [Scheduler Interface](#scheduler-interface) +4. [Worker Lifecycle](#worker-lifecycle) +5. [Implementing a Custom Scheduler](#implementing-a-custom-scheduler) +6. [DefaultScheduler Example](#defaultscheduler-example) +7. [Best Practices](#best-practices) +8. [Integration Points](#integration-points) ## Architecture Overview @@ -23,7 +24,8 @@ The IOWarp runtime separates concerns across three main components: - **ConfigManager**: Manages configuration (number of threads, queue depth, etc.) - **WorkOrchestrator**: Creates workers, spawns threads, assigns lanes to workers (1:1 mapping for all workers) - **Scheduler**: Decides worker partitioning, task-to-worker mapping, and load balancing -- **IpcManager**: Manages shared memory, queues, and provides task routing infrastructure +- **IpcManager**: Manages shared memory, queues, and provides task routing infrastructure (`RouteTask`, `RouteLocal`, `RouteGlobal`) +- **Container**: Provides per-pool dynamic scheduling via `ScheduleTask` ### Data Flow @@ -53,6 +55,69 @@ The IOWarp runtime separates concerns across three main components: └─────────────────┘ ``` +### Task Routing Flow + +When a task is submitted, `IpcManager::RouteTask` orchestrates the full routing pipeline: + +``` +RouteTask(future, force_enqueue) + │ + ├─ 1. Container::ScheduleTask() → resolve Dynamic pool query + │ (e.g., Dynamic → DirectHash) + │ + ├─ 2. ResolvePoolQuery() → resolve to physical node(s) + │ + ├─ 3. IsTaskLocal() → local or remote? 
+ │ │ + │ ├─ local: RouteLocal() → resolve container, pick worker + │ │ │ + │ │ ├─ RuntimeMapTask() → scheduler picks dest worker + │ │ │ + │ │ ├─ dest == current && !force_enqueue → execute directly + │ │ └─ otherwise → enqueue to dest worker's lane + │ │ + │ └─ remote: RouteGlobal() → enqueue to net_queue_ for SendIn + │ + └─ return true if task can be executed by current worker +``` + +## Two-Level Scheduling + +### Level 1: Container Scheduling (`ScheduleTask`) + +Before the Scheduler decides *which worker* runs a task, the container decides *which container instance* handles it. This is the `Container::ScheduleTask` virtual method. + +```cpp +class Container { +public: + virtual PoolQuery ScheduleTask(const hipc::FullPtr &task) { + return task->pool_query_; // Default: no transformation + } +}; +``` + +**Purpose**: Transform a high-level `PoolQuery` (often `Dynamic`) into a concrete routing mode (`DirectHash`, `Local`, `Broadcast`, etc.) based on the task's semantics. + +**Example**: A distributed filesystem container might schedule read tasks to the node holding the target block: + +```cpp +PoolQuery MyFsContainer::ScheduleTask(const hipc::FullPtr &task) { + if (task->method_ == kRead || task->method_ == kWrite) { + // Route to the container holding the target block + u64 block_id = GetBlockId(task); + return PoolQuery::DirectHash(block_id); + } + // Metadata ops stay local + return PoolQuery::Local(); +} +``` + +`ScheduleTask` is called by `RouteTask` **before** pool query resolution. The returned `PoolQuery` then goes through `ResolvePoolQuery` to determine concrete physical nodes and containers. + +### Level 2: Worker Scheduling (`RuntimeMapTask`) + +After container and node resolution, `RouteLocal` calls `Scheduler::RuntimeMapTask` to pick the specific worker thread. This is where I/O-size routing, task group affinity, and network worker pinning happen. 
+ ## Scheduler Interface All schedulers must inherit from the `Scheduler` base class and implement the following methods: @@ -71,13 +136,20 @@ public: virtual u32 ClientMapTask(IpcManager *ipc_manager, const Future &task) = 0; // Map tasks from runtime workers to other workers - virtual u32 RuntimeMapTask(Worker *worker, const Future &task) = 0; + virtual u32 RuntimeMapTask(Worker *worker, const Future &task, + Container *container) = 0; // Rebalance load across workers (called periodically by workers) virtual void RebalanceWorker(Worker *worker) = 0; // Adjust polling intervals for periodic tasks virtual void AdjustPolling(RunContext *run_ctx) = 0; + + // Get designated GPU worker (optional) + virtual Worker *GetGpuWorker() const { return nullptr; } + + // Get designated network worker (optional) + virtual Worker *GetNetWorker() const { return nullptr; } }; ``` @@ -91,8 +163,9 @@ public: **Responsibilities**: - Access workers via `work_orch->GetWorker(worker_id)` -- Organize workers into scheduler-specific groups (e.g., task workers, network worker) -- **Update IpcManager** with the total worker count via `IpcManager::SetNumSchedQueues()` +- Organize workers into scheduler-specific groups (e.g., scheduler worker, I/O workers, network worker, GPU worker) +- **Update IpcManager** with the scheduling queue count via `IpcManager::SetNumSchedQueues()` +- **Set the network lane** via `IpcManager::SetNetLane()` so the runtime knows where to enqueue network tasks **Important**: All workers are assigned lanes by `WorkOrchestrator::SpawnWorkerThreads()` using 1:1 mapping. The scheduler does NOT control lane assignment — it only tracks worker groups for routing decisions. 
@@ -101,21 +174,24 @@ public: void MyScheduler::DivideWorkers(WorkOrchestrator *work_orch) { u32 total_workers = work_orch->GetTotalWorkerCount(); - // Track workers: first N-1 are task workers, last is network - for (u32 i = 0; i < total_workers - 1; ++i) { - Worker *worker = work_orch->GetWorker(i); - if (worker) { - task_workers_.push_back(worker); - } + // Worker 0: scheduler worker (metadata + small I/O) + scheduler_worker_ = work_orch->GetWorker(0); + + // Workers 1..N-2: I/O workers (large I/O round-robin) + for (u32 i = 1; i < total_workers - 1; ++i) { + io_workers_.push_back(work_orch->GetWorker(i)); } - // Last worker is network worker + // Worker N-1: network worker net_worker_ = work_orch->GetWorker(total_workers - 1); - // IMPORTANT: Update IpcManager with worker count + // IMPORTANT: Update IpcManager IpcManager *ipc = CHI_IPC; if (ipc) { - ipc->SetNumSchedQueues(total_workers); + ipc->SetNumSchedQueues(1); // Client tasks go to scheduler worker + if (net_worker_) { + ipc->SetNetLane(net_worker_->GetLane()); + } } } ``` @@ -124,7 +200,7 @@ void MyScheduler::DivideWorkers(WorkOrchestrator *work_orch) { **Purpose**: Determine which worker lane a task from a client should be assigned to. -**Called**: When clients submit tasks to the runtime. +**Called**: When clients submit tasks to the runtime via `SendRuntimeClient`. 
**Responsibilities**: - Return a lane ID in range `[0, num_sched_queues)` @@ -138,45 +214,60 @@ u32 MyScheduler::ClientMapTask(IpcManager *ipc_manager, const Future &task u32 num_lanes = ipc_manager->GetNumSchedQueues(); if (num_lanes == 0) return 0; - // Route network tasks (Send/Recv from admin pool) to last worker + // Network tasks (Send/Recv from admin pool) → network worker Task *task_ptr = task.get(); if (task_ptr != nullptr && task_ptr->pool_id_ == chi::kAdminPoolId) { u32 method_id = task_ptr->method_; - if (method_id == 14 || method_id == 15) { // kSend or kRecv + if (method_id == 14 || method_id == 15 || + method_id == 20 || method_id == 21) { return num_lanes - 1; } } - // PID+TID hash-based mapping for other tasks - auto *sys_info = HSHM_SYSTEM_INFO; - pid_t pid = sys_info->pid_; - auto tid = HSHM_THREAD_MODEL->GetTid(); - - size_t hash = std::hash{}(pid) ^ (std::hash{}(&tid) << 1); - return static_cast(hash % num_lanes); + // Default: scheduler worker (lane 0) + return 0; } ``` -#### `RuntimeMapTask(Worker *worker, const Future &task)` +#### `RuntimeMapTask(Worker *worker, const Future &task, Container *container)` **Purpose**: Determine which worker should execute a task when routing from within the runtime. -**Called**: By `Worker::RouteLocal()` to decide whether a task should execute on the current worker or be forwarded to another. +**Called**: By `IpcManager::RouteLocal()` after the execution container has been resolved. If the returned worker ID differs from the current worker (or if `force_enqueue` is set), the task is enqueued to the destination worker's lane. + +**Parameters**: +- `worker`: The current worker (may be nullptr if called from a non-worker thread) +- `task`: The task being routed +- `container`: The resolved execution container. Used for task-group affinity lookups. May be nullptr when called without a resolved container. 
**Responsibilities**: - Return a worker ID for task execution - Route periodic network tasks (Send/Recv) to the dedicated network worker -- For all other tasks, return the current worker's ID (no migration) +- Implement I/O-size-based routing (large I/O → dedicated workers) +- Implement task group affinity (pin related tasks to the same worker) **Example**: ```cpp -u32 MyScheduler::RuntimeMapTask(Worker *worker, const Future &task) { - // Route periodic network tasks to the network worker +u32 MyScheduler::RuntimeMapTask(Worker *worker, const Future &task, + Container *container) { Task *task_ptr = task.get(); + + // Task group affinity: if this task's group is already pinned, use that worker + if (container != nullptr && task_ptr != nullptr && + !task_ptr->task_group_.IsNull()) { + int64_t group_id = task_ptr->task_group_.id_; + ScopedCoRwReadLock read_lock(container->task_group_lock_); + auto it = container->task_group_map_.find(group_id); + if (it != container->task_group_map_.end() && it->second != nullptr) { + return it->second->GetId(); + } + } + + // Periodic network tasks → network worker if (task_ptr != nullptr && task_ptr->IsPeriodic()) { if (task_ptr->pool_id_ == chi::kAdminPoolId) { u32 method_id = task_ptr->method_; - if (method_id == 14 || method_id == 15) { // kSend or kRecv + if (method_id == 14 || method_id == 15) { if (net_worker_ != nullptr) { return net_worker_->GetId(); } @@ -184,7 +275,20 @@ u32 MyScheduler::RuntimeMapTask(Worker *worker, const Future &task) { } } - // All other tasks execute on the current worker + // Large I/O → round-robin across I/O workers + if (task_ptr != nullptr && !io_workers_.empty()) { + if (task_ptr->stat_.io_size_ >= kLargeIOThreshold) { + u32 idx = next_io_idx_.fetch_add(1, std::memory_order_relaxed) + % static_cast(io_workers_.size()); + return io_workers_[idx]->GetId(); + } + } + + // Small I/O / metadata → scheduler worker + if (scheduler_worker_ != nullptr) { + return scheduler_worker_->GetId(); + } + return 
worker ? worker->GetId() : 0; } ``` @@ -218,6 +322,7 @@ void MyScheduler::RebalanceWorker(Worker *worker) { - Modify `run_ctx->yield_time_us_` based on `run_ctx->did_work_` - Implement adaptive polling (exponential backoff when idle) - Reduce CPU usage for idle periodic tasks +- **Important**: `co_await` on Futures sets `yield_time_us_` to 0, so this method must restore it to prevent busy-looping **Example**: ```cpp @@ -252,8 +357,9 @@ Understanding the worker lifecycle is crucial for scheduler implementation: - Calls Scheduler::DivideWorkers() ↓ 3. Scheduler::DivideWorkers() - - Tracks workers into functional groups (task workers, network worker) + - Tracks workers into functional groups (scheduler, I/O, network, GPU) - Updates IpcManager::SetNumSchedQueues() + - Sets network lane via IpcManager::SetNetLane() ↓ 4. WorkOrchestrator::StartWorkers() - Calls SpawnWorkerThreads() @@ -261,9 +367,12 @@ Understanding the worker lifecycle is crucial for scheduler implementation: - Spawns actual OS threads ↓ 5. 
Workers run task processing loops - - Process tasks from assigned lanes - - Call Scheduler::RuntimeMapTask() for task routing in RouteLocal() + - ProcessNewTask: pop futures from lane, route via RouteTask() + - RouteTask calls Container::ScheduleTask() for dynamic resolution + - RouteLocal calls Scheduler::RuntimeMapTask() for worker selection + - If dest != current worker (or force_enqueue), re-enqueue to dest lane - Call Scheduler::RebalanceWorker() periodically + - Call Scheduler::AdjustPolling() after periodic task execution ``` ## Implementing a Custom Scheduler @@ -276,6 +385,7 @@ Create `context-runtime/include/chimaera/scheduler/my_scheduler.h`: #ifndef CHIMAERA_INCLUDE_CHIMAERA_SCHEDULER_MY_SCHEDULER_H_ #define CHIMAERA_INCLUDE_CHIMAERA_SCHEDULER_MY_SCHEDULER_H_ +#include #include #include "chimaera/scheduler/scheduler.h" @@ -283,20 +393,25 @@ namespace chi { class MyScheduler : public Scheduler { public: - MyScheduler() : net_worker_(nullptr) {} + MyScheduler() : scheduler_worker_(nullptr), net_worker_(nullptr), + gpu_worker_(nullptr), next_io_idx_{0} {} ~MyScheduler() override = default; - // Implement required interface methods void DivideWorkers(WorkOrchestrator *work_orch) override; u32 ClientMapTask(IpcManager *ipc_manager, const Future &task) override; - u32 RuntimeMapTask(Worker *worker, const Future &task) override; + u32 RuntimeMapTask(Worker *worker, const Future &task, + Container *container) override; void RebalanceWorker(Worker *worker) override; void AdjustPolling(RunContext *run_ctx) override; + Worker *GetGpuWorker() const override { return gpu_worker_; } + Worker *GetNetWorker() const override { return net_worker_; } private: - // Your scheduler-specific state - std::vector scheduler_workers_; + Worker *scheduler_worker_; + std::vector io_workers_; Worker *net_worker_; + Worker *gpu_worker_; + std::atomic next_io_idx_{0}; }; } // namespace chi @@ -322,47 +437,73 @@ void MyScheduler::DivideWorkers(WorkOrchestrator *work_orch) { u32 
total_workers = work_orch->GetTotalWorkerCount(); - scheduler_workers_.clear(); - net_worker_ = nullptr; - - // Network worker is always the last worker + scheduler_worker_ = work_orch->GetWorker(0); net_worker_ = work_orch->GetWorker(total_workers - 1); - // Scheduler workers are all workers except the last one - u32 num_sched_workers = (total_workers == 1) ? 1 : (total_workers - 1); - for (u32 i = 0; i < num_sched_workers; ++i) { - Worker *worker = work_orch->GetWorker(i); - if (worker) { - scheduler_workers_.push_back(worker); + if (total_workers > 2) { + gpu_worker_ = work_orch->GetWorker(total_workers - 2); + for (u32 i = 1; i < total_workers - 1; ++i) { + io_workers_.push_back(work_orch->GetWorker(i)); } } - // CRITICAL: Update IpcManager with the number of workers IpcManager *ipc = CHI_IPC; if (ipc) { - ipc->SetNumSchedQueues(total_workers); + ipc->SetNumSchedQueues(1); + if (net_worker_) { + ipc->SetNetLane(net_worker_->GetLane()); + } } } u32 MyScheduler::ClientMapTask(IpcManager *ipc_manager, const Future &task) { u32 num_lanes = ipc_manager->GetNumSchedQueues(); if (num_lanes == 0) return 0; - - // Implement your mapping strategy here - return 0; // Simple: always map to lane 0 + return 0; // All client tasks → scheduler worker } -u32 MyScheduler::RuntimeMapTask(Worker *worker, const Future &task) { +u32 MyScheduler::RuntimeMapTask(Worker *worker, const Future &task, + Container *container) { + Task *task_ptr = task.get(); + + // Task group affinity + if (container != nullptr && task_ptr != nullptr && + !task_ptr->task_group_.IsNull()) { + int64_t group_id = task_ptr->task_group_.id_; + ScopedCoRwReadLock read_lock(container->task_group_lock_); + auto it = container->task_group_map_.find(group_id); + if (it != container->task_group_map_.end() && it->second != nullptr) { + return it->second->GetId(); + } + } + + // Periodic network tasks → network worker + if (task_ptr != nullptr && task_ptr->IsPeriodic() && + task_ptr->pool_id_ == chi::kAdminPoolId) { + 
u32 m = task_ptr->method_; + if ((m == 14 || m == 15 || m == 20 || m == 21) && net_worker_) { + return net_worker_->GetId(); + } + } + + // Large I/O → round-robin across I/O workers + if (task_ptr != nullptr && !io_workers_.empty() && + task_ptr->stat_.io_size_ >= 4096) { + u32 idx = next_io_idx_.fetch_add(1, std::memory_order_relaxed) + % static_cast(io_workers_.size()); + return io_workers_[idx]->GetId(); + } + + // Default → scheduler worker + if (scheduler_worker_) return scheduler_worker_->GetId(); return worker ? worker->GetId() : 0; } -void MyScheduler::RebalanceWorker(Worker *worker) { - (void)worker; -} +void MyScheduler::RebalanceWorker(Worker *worker) { (void)worker; } void MyScheduler::AdjustPolling(RunContext *run_ctx) { if (!run_ctx) return; - // Implement adaptive polling or leave with default behavior + run_ctx->yield_time_us_ = run_ctx->true_period_ns_ / 1000.0; } } // namespace chi @@ -406,27 +547,45 @@ runtime: ## DefaultScheduler Example -The `DefaultScheduler` provides a reference implementation with these characteristics: +The `DefaultScheduler` provides a reference implementation with I/O-size-based routing and task group affinity. ### Worker Partitioning -- Tracks all workers except the last as scheduler workers -- Last worker is designated as the network worker -- All workers get lanes assigned by WorkOrchestrator (1:1 mapping) -- `SetNumSchedQueues(total_workers)` includes all workers for client task mapping +- **Worker 0**: Scheduler worker — handles metadata and small I/O (< 4KB) +- **Workers 1..N-2**: I/O workers — handle large I/O via round-robin +- **Worker N-2**: Also serves as the GPU worker (polls GPU queues) +- **Worker N-1**: Network worker — handles all Send/Recv/ClientSend/ClientRecv tasks +- `SetNumSchedQueues(1)` — client tasks all go to the scheduler worker initially + +### Dynamic Scheduling via `ScheduleTask` + +The `DefaultScheduler` works hand-in-hand with `Container::ScheduleTask`. 
When a task arrives with a `Dynamic` pool query, the container's `ScheduleTask` resolves it before `RuntimeMapTask` picks the worker: + +1. Task submitted with `PoolQuery::Dynamic()` +2. `RouteTask` calls `container->ScheduleTask(task)` → returns e.g. `DirectHash(block_id)` +3. `ResolvePoolQuery` resolves `DirectHash` to a physical node + container +4. `IsTaskLocal` checks if the target is this node +5. `RouteLocal` calls `RuntimeMapTask` to pick the worker + +This two-level design means containers control *what* gets executed where, while the scheduler controls *how* workers are utilized. -### Task Mapping Strategy -- **Client Tasks**: PID+TID hash-based mapping for regular tasks - - Ensures different processes/threads use different lanes - - Network tasks (Send/Recv from admin pool, methods 14/15) are routed to the last worker (network worker) -- **Runtime Tasks**: Tasks execute on the current worker, except periodic Send/Recv tasks which are routed to the network worker +### Task Group Affinity -### Load Balancing -- No active rebalancing (simple design) -- Tasks processed by worker that picks them up +The `DefaultScheduler` implements task group affinity to pin related tasks to the same worker. Each `Container` maintains a `task_group_map_` mapping group IDs to workers: -### Polling Adjustment -- Currently disabled (early return) to avoid hanging issues -- When enabled, implements exponential backoff for idle periodic tasks +1. When `RuntimeMapTask` sees a task with a non-null `task_group_`, it checks the container's `task_group_map_` +2. If the group is already mapped to a worker, the task goes to that worker +3. If not, normal routing selects a worker and records the mapping + +This ensures tasks in the same group (e.g., operations on the same file handle) execute on the same worker, avoiding lock contention and improving cache locality. 
+ +### I/O-Size Routing +- Tasks with `stat_.io_size_ >= 4096` → round-robin across I/O workers +- Tasks with `stat_.io_size_ < 4096` → scheduler worker (worker 0) +- Network tasks (admin pool methods 14, 15, 20, 21) → network worker + +### Force Enqueue + +`RouteTask` accepts a `force_enqueue` parameter (default `false`). When `true`, `RouteLocal` always enqueues the task to the destination worker's lane, even if the destination is the current worker. This is used by `SendRuntime` (non-worker thread path) which cannot execute tasks directly and must always enqueue. ### Code Reference @@ -442,10 +601,12 @@ See implementation in: void MyScheduler::DivideWorkers(WorkOrchestrator *work_orch) { // ... partition workers ... - // CRITICAL: Update IpcManager with worker count IpcManager *ipc = CHI_IPC; if (ipc) { - ipc->SetNumSchedQueues(total_workers); + ipc->SetNumSchedQueues(num_client_lanes); + if (net_worker_) { + ipc->SetNetLane(net_worker_->GetLane()); + } } } ``` @@ -454,9 +615,13 @@ void MyScheduler::DivideWorkers(WorkOrchestrator *work_orch) { ### 2. Route Network Tasks to the Network Worker -Both `ClientMapTask` and `RuntimeMapTask` should route Send/Recv tasks (methods 14/15 from admin pool) to the dedicated network worker (last worker). This prevents network I/O from blocking task processing workers. +Both `ClientMapTask` and `RuntimeMapTask` should route Send/Recv tasks (methods 14/15/20/21 from admin pool) to the dedicated network worker (last worker). This prevents network I/O from blocking task processing workers. + +### 3. Implement Task Group Affinity -### 3. Validate Lane IDs +Use `container->task_group_map_` to pin related tasks to the same worker. Protect map access with `container->task_group_lock_` (read lock for lookups, write lock for updates). + +### 4. 
Validate Lane IDs ```cpp u32 MyScheduler::ClientMapTask(IpcManager *ipc_manager, const Future &task) { @@ -468,7 +633,7 @@ u32 MyScheduler::ClientMapTask(IpcManager *ipc_manager, const Future &task } ``` -### 4. Handle Null Pointers +### 5. Handle Null Pointers ```cpp void MyScheduler::DivideWorkers(WorkOrchestrator *work_orch) { @@ -476,19 +641,20 @@ void MyScheduler::DivideWorkers(WorkOrchestrator *work_orch) { // ... proceed ... } -u32 MyScheduler::RuntimeMapTask(Worker *worker, const Future &task) { +u32 MyScheduler::RuntimeMapTask(Worker *worker, const Future &task, + Container *container) { return worker ? worker->GetId() : 0; } ``` -### 5. Consider Thread Safety +### 6. Consider Thread Safety If your scheduler maintains shared state accessed by multiple workers: -- Use atomic operations for counters -- Use mutexes for complex data structures +- Use atomic operations for counters (e.g., `next_io_idx_`) +- Use `CoRwLock` for complex data structures (e.g., `task_group_map_`) - Prefer lock-free designs when possible -### 6. Test with Different Configurations +### 7. 
Test with Different Configurations Test your scheduler with various `num_threads` values: - Single thread (num_threads = 1): single worker serves dual role @@ -531,6 +697,22 @@ u32 id = worker->GetId(); TaskLane *lane = worker->GetLane(); ``` +### Container Access + +Containers expose scheduling-related state: + +```cpp +// Task group affinity map (per container) +container->task_group_map_ // std::unordered_map +container->task_group_lock_ // CoRwLock protecting the map + +// Override ScheduleTask for custom dynamic scheduling +PoolQuery MyContainer::ScheduleTask(const hipc::FullPtr &task) { + // Transform Dynamic queries into concrete routing modes + return PoolQuery::DirectHash(ComputeHash(task)); +} +``` + ### Logging Use Hermes logging macros: @@ -555,6 +737,35 @@ std::string sched_name = config->GetLocalSched(); ## Advanced Topics +### Custom Container Scheduling + +Override `ScheduleTask` to implement application-specific routing: + +```cpp +class DistributedKVStore : public Container { +public: + PoolQuery ScheduleTask(const hipc::FullPtr &task) override { + switch (task->method_) { + case kGet: + case kPut: + case kDelete: { + // Route key-value ops to the node owning the key's hash partition + u64 key_hash = ExtractKeyHash(task); + return PoolQuery::DirectHash(key_hash); + } + case kScan: + // Range scans may span multiple nodes + return PoolQuery::Range(ExtractStartKey(task), ExtractEndKey(task)); + case kCreateIndex: + // Metadata ops broadcast to all nodes + return PoolQuery::Broadcast(); + default: + return PoolQuery::Local(); + } + } +}; +``` + ### Work Stealing Implement work stealing in `RebalanceWorker`: @@ -563,7 +774,7 @@ Implement work stealing in `RebalanceWorker`: void MyScheduler::RebalanceWorker(Worker *worker) { TaskLane *my_lane = worker->GetLane(); if (my_lane->Empty()) { - for (Worker *victim : scheduler_workers_) { + for (Worker *victim : io_workers_) { if (victim == worker) continue; TaskLane *victim_lane = victim->GetLane(); @@ 
-579,20 +790,6 @@ void MyScheduler::RebalanceWorker(Worker *worker) { } ``` -### Locality-Aware Mapping - -Map tasks based on data locality: - -```cpp -u32 MyScheduler::ClientMapTask(IpcManager *ipc_manager, const Future &task) { - // Extract data location from task - PoolId pool_id = task->pool_id_; - - // Map to worker closest to data - return ComputeLocalityMap(pool_id, ipc_manager->GetNumSchedQueues()); -} -``` - ### Priority-Based Scheduling Use task priorities for scheduling: @@ -625,6 +822,7 @@ void MyScheduler::DivideWorkers(WorkOrchestrator *work_orch) { 1. Did you call `IpcManager::SetNumSchedQueues()` in `DivideWorkers`? 2. Are all workers getting lanes via WorkOrchestrator's 1:1 mapping? 3. Does `ClientMapTask` return lane IDs in valid range? +4. Is `IpcManager::SetNetLane()` called for the network worker? ### Client Mapping Errors @@ -635,6 +833,14 @@ void MyScheduler::DivideWorkers(WorkOrchestrator *work_orch) { 2. Did you check for `num_lanes == 0`? 3. Are you using modulo to wrap lane IDs? +### Tasks Hang After Re-enqueue + +**Symptom**: Tasks enqueued to a different worker never complete + +**Check**: +1. When `RouteLocal` re-enqueues to a different worker, `ProcessNewTask` on the destination worker updates the RunContext's `worker_id_`, `lane_`, and `event_queue_` to match the new worker. If RunContext fields are stale, subtask completion events go to the wrong worker. +2. Check that `TASK_STARTED` is respected — re-enqueued tasks with live coroutines must be resumed, not restarted. 
+ ### Worker Crashes **Symptom**: Workers crash during initialization @@ -648,6 +854,7 @@ void MyScheduler::DivideWorkers(WorkOrchestrator *work_orch) { - **Scheduler Interface**: `context-runtime/include/chimaera/scheduler/scheduler.h` - **DefaultScheduler**: `context-runtime/src/scheduler/default_sched.cc` +- **Container Base**: `context-runtime/include/chimaera/container.h` +- **IpcManager (RouteTask/RouteLocal)**: `context-runtime/src/ipc_manager.cc` - **WorkOrchestrator**: `context-runtime/src/work_orchestrator.cc` -- **IpcManager**: `context-runtime/src/ipc_manager.cc` - **Configuration**: [Configuration Reference](../../deployment/configuration) diff --git a/docs/sdk/context-transport-primitives/1.allocator/allocator_guide.md b/docs/sdk/context-transport-primitives/1.allocator/allocator_guide.md index b9c95e6..892edeb 100644 --- a/docs/sdk/context-transport-primitives/1.allocator/allocator_guide.md +++ b/docs/sdk/context-transport-primitives/1.allocator/allocator_guide.md @@ -339,4 +339,5 @@ wait $RANK0_PID $RANK1_PID $RANK2_PID ## Related Documentation - [Memory Backends Guide](./memory_backend_guide) - Backends that provide memory regions for these allocators -- [Data Structures Guide](../types/data_structures_guide) - Data structures that use these allocators +- [Vector Guide](../types/vector_guide) - Shared-memory vectors that use these allocators +- [Ring Buffer Guide](../types/ring_buffer_guide) - Lock-free circular queues diff --git a/docs/sdk/context-transport-primitives/1.allocator/memory_backend_guide.md b/docs/sdk/context-transport-primitives/1.allocator/memory_backend_guide.md index 5d0a076..1ab155f 100644 --- a/docs/sdk/context-transport-primitives/1.allocator/memory_backend_guide.md +++ b/docs/sdk/context-transport-primitives/1.allocator/memory_backend_guide.md @@ -135,4 +135,4 @@ GPU backends are only compiled when CUDA or ROCm is enabled: ## Related Documentation - [Allocator Guide](./allocator_guide) - Allocators that manage memory from 
these backends -- [Data Structures Guide](../types/data_structures_guide) - Data structures that use these allocators +- [Vector Guide](../types/vector_guide) - Shared-memory vectors that use these allocators diff --git a/docs/sdk/context-transport-primitives/2.types/data_structures_guide.md b/docs/sdk/context-transport-primitives/2.types/data_structures_guide.md deleted file mode 100644 index 5bd011e..0000000 --- a/docs/sdk/context-transport-primitives/2.types/data_structures_guide.md +++ /dev/null @@ -1,335 +0,0 @@ -# Data Structures Guide - -## Overview - -HSHM provides data structures designed for shared memory and GPU compatibility. These are alternatives to STL containers for use cases requiring cross-process sharing or GPU kernel access. - -For standard ChiMod development, use `std::string` and `std::vector`. The HSHM data structures below are needed when: -- Data must be accessible from GPU kernels -- Data must live in shared memory across processes -- You need lock-free concurrent queues - -## Vector - -HSHM provides two vector variants: `hshm::ipc::vector` for shared memory and `hshm::priv::vector` for private memory. - -### hshm::ipc::vector - -**Source:** `hermes_shm/data_structures/ipc/vector.h` - -A dynamic array stored in shared memory using offset-based pointers (`OffsetPtr`) for process-independent addressing. 
- -```cpp -#include - -// Create with an allocator -hshm::ipc::vector vec(alloc, 10); // 10 elements - -// Standard vector operations -vec.push_back(42); -vec.emplace_back(100); -int val = vec[0]; -vec.resize(20); -vec.reserve(50); -vec.clear(); - -// Iteration -for (auto it = vec.begin(); it != vec.end(); ++it) { - process(*it); -} -``` - -**Template Parameters:** -- `T` - Element type -- `AllocT` - Allocator type (determines shared vs private memory) - -**Key Differences from std::vector:** -- Requires an allocator at construction time -- Uses `OffsetPtr` internally instead of raw pointers -- Safe for cross-process access in shared memory -- Annotated with `HSHM_CROSS_FUN` for GPU compatibility - -### hshm::priv::vector - -**Source:** `hermes_shm/data_structures/priv/vector.h` - -A private-memory vector with allocator integration. Supports the same API as `std::vector` plus serialization. - -```cpp -#include - -// Standard construction -hshm::priv::vector vec = {1, 2, 3, 4, 5}; -hshm::priv::vector vec2(10, 0); // 10 zeros - -// Full STL-compatible API -vec.push_back(6); -vec.pop_back(); -vec.insert(vec.begin() + 2, 99); -vec.erase(vec.begin()); - -// Reverse iteration -for (auto it = vec.rbegin(); it != vec.rend(); ++it) { - process(*it); -} -``` - -**Optimizations:** -- Uses `memcpy`/`memmove` for trivially copyable types (POD optimization) -- Exponential capacity growth strategy -- Annotated with `HSHM_CROSS_FUN` for GPU compatibility - -### When to Use Each - -| Variant | Use Case | -|---------|----------| -| `std::vector` | Default choice for ChiMod task data | -| `hshm::priv::vector` | Private memory with serialization support or GPU access | -| `hshm::ipc::vector` | Cross-process shared memory regions | - -## Ring Buffer - -**Source:** `hermes_shm/data_structures/ipc/ring_buffer.h` - -A lock-free circular queue for concurrent producer-consumer patterns. Configurable via compile-time flags. 
- -### Configuration Flags - -```cpp -namespace hshm::ipc { -enum RingQueueFlag { - RING_BUFFER_SPSC_FLAGS = 0x01, // Single Producer Single Consumer - RING_BUFFER_MPSC_FLAGS = 0x02, // Multiple Producer Single Consumer - RING_BUFFER_WAIT_FOR_SPACE = 0x04, // Block until space available - RING_BUFFER_ERROR_ON_NO_SPACE = 0x08, // Return error if full - RING_BUFFER_DYNAMIC_SIZE = 0x10, // Resize when full - RING_BUFFER_FIXED_SIZE = 0x20, // Fixed-size buffer -}; -} -``` - -### Pre-defined Type Aliases - -| Alias | Flags | Description | -|-------|-------|-------------| -| `spsc_ring_buffer` | SPSC + Fixed + Error | Single-producer single-consumer, fixed size | -| `mpsc_ring_buffer` | MPSC + Fixed + Wait | Multi-producer single-consumer, blocks when full | -| `circular_mpsc_ring_buffer` | MPSC + Fixed + Error | Multi-producer single-consumer, wraps around | -| `ext_ring_buffer` | MPSC + Dynamic + Wait | Extensible, resizes when full | - -### Usage - -```cpp -#include - -// Create a fixed-size SPSC ring buffer with depth 1024 -hshm::ipc::spsc_ring_buffer rb(alloc, 1024); - -// Producer -rb.Push(42); -rb.Emplace(100); - -// Consumer -int val; -if (rb.Pop(val)) { - // Got value -} - -// Query state -size_t count = rb.Size(); -bool empty = rb.Empty(); -bool full = rb.Full(); -``` - -### RingBufferEntry - -Each entry has an atomic ready flag for lock-free synchronization: - -```cpp -template -struct RingBufferEntry { - bool IsReady(); // Check if entry has data - void SetReady(); // Mark entry as containing data - void ClearReady(); // Mark entry as consumed - T& GetData(); // Access the entry data -}; -``` - -### Internal Design - -- Uses atomic head/tail pointers for lock-free operation -- Head is the consumer pointer, tail is the producer pointer -- Queue capacity is `depth + 1` to distinguish full from empty -- MPSC mode uses atomic tail with CAS for concurrent producers -- SPSC mode uses non-atomic pointers for maximum performance -- Includes worker metadata: 
`assigned_worker_id_`, `signal_fd_`, `tid_`, `active_` - -## String - -**Source:** `hermes_shm/data_structures/priv/string.h` - -An SSO (Short String Optimization) string backed by `hshm::priv::vector`. - -```cpp -#include - -// Construction -hshm::string s1("hello"); -hshm::string s2(std::string("world")); -hshm::string s3(s1); // Copy - -// Standard string API -s1.append(" world"); -s1 += "!"; -size_t pos = s1.find("world"); -hshm::string sub = s1.substr(0, 5); -bool eq = (s1 == s2); - -// Access -const char* cstr = s1.c_str(); -char ch = s1[0]; -size_t len = s1.size(); - -// Conversion to/from std::string -std::string std_str = s1.str(); -std::string std_str2 = static_cast(s1); -``` - -**Template Parameters:** -- `T` - Character type (default: `char`) -- `AllocT` - Allocator type -- `SSOSize` - Short string buffer size (default: 32 bytes) - -**Key Features:** -- Short strings (32 bytes or fewer) stored inline without heap allocation -- Longer strings use `hshm::priv::vector` as backing store -- Full `std::string`-compatible API: `find`, `substr`, `replace`, `starts_with`, `ends_with` -- Annotated with `HSHM_CROSS_FUN` for GPU compatibility -- Serialization support via `save()`/`load()` - -**Type Alias:** `hshm::string` is a convenience alias for `hshm::priv::basic_string`. - -## Unordered Map (Vector of Lists) - -**Source:** `hermes_shm/data_structures/priv/unordered_map_ll.h` - -A hash map implementation using a vector of lists design that provides efficient concurrent access when combined with external locking. Each bucket contains a `std::list` of key-value pairs; the hash space is partitioned across a fixed number of buckets set at construction time. 
- -**Key Characteristics:** -- **Vector of Lists Design**: Uses a vector of buckets, each containing a list of key-value pairs -- **External Locking Required**: No internal mutexes - users must provide synchronization -- **Bucket Partitioning**: Hash space is partitioned across multiple buckets for better cache locality -- **Standard API**: Compatible with `std::unordered_map` interface -- **NOT Shared-Memory Compatible**: For runtime-only data structures, not task parameters - -### Basic Usage - -```cpp -#include - -// Create map with 32 buckets -hshm::priv::unordered_map_ll map(32); - -// Insert -auto [inserted, ptr] = map.insert(1, "hello"); -map.insert_or_assign(2, "world"); -map[3] = "foo"; - -// Lookup -std::string* val = map.find(1); // Returns nullptr if not found -const std::string& ref = map.at(2); // Throws if not found -bool exists = map.contains(3); - -// Remove -map.erase(1); -map.clear(); - -// Iterate -map.for_each([](const int& key, std::string& value) { - // Process each element -}); -``` - -### Constructor - -```cpp -hshm::priv::unordered_map_ll map(max_concurrency); -``` - -**Parameters:** -- `max_concurrency`: Number of buckets (default: 16). Higher values give better distribution at the cost of more memory. Typical values: 16-64. 
- -### API Reference - -```cpp -// Insertion operations -auto [inserted, value_ptr] = map.insert(key, value); // Insert if not exists -auto [inserted, value_ptr] = map.insert_or_assign(key, value); // Insert or update -T& ref = map[key]; // Insert default if missing - -// Lookup operations -T* ptr = map.find(key); // Returns nullptr if not found -const T* ptr = map.find(key) const; // Const version -T& ref = map.at(key); // Throws if not found -bool exists = map.contains(key); // Check existence -size_t count = map.count(key); // Returns 0 or 1 - -// Removal operations -size_t erased = map.erase(key); // Returns number of elements erased -map.clear(); // Remove all elements - -// Size operations -size_t s = map.size(); // Total element count -bool e = map.empty(); // Check if empty -size_t b = map.bucket_count(); // Number of buckets - -// Iteration -map.for_each([](const Key& key, T& value) { /* ... */ }); -``` - -Insert operations return `std::pair` where `first` is `true` if insertion occurred and `second` is a pointer to the value. 
- -### Key Differences from std::unordered_map - -| Feature | std::unordered_map | hshm::priv::unordered_map_ll | -|---------|-------------------|----------------------| -| Internal Structure | Implementation-defined | Vector of lists (explicit) | -| Bucket Count | Dynamic rehashing | Fixed at construction | -| Iterator Stability | Unstable across insertions | Stable (list-based) | -| Shared Memory | Not compatible | Not compatible | -| Return Values | Iterators | Pointers to values | - -### When to Use - -| Scenario | Recommendation | -|----------|---------------| -| Runtime container data structures (caches, registries) | `hshm::priv::unordered_map_ll` | -| Task input/output parameters | `std::unordered_map` or `chi::ipc::` types | -| Client-side code | `std::unordered_map` | -| Data requiring serialization | `std::unordered_map` with cereal | - -## GPU Compatibility - -All HSHM data structures use cross-platform annotations for CPU/GPU compilation: - -| Annotation | Purpose | -|-----------|---------| -| `HSHM_INLINE_CROSS_FUN` | Inline function callable from both CPU and GPU | -| `HSHM_CROSS_FUN` | Function callable from both CPU and GPU | -| `HSHM_IS_HOST` | Compile-time check: true when compiling for CPU | -| `HSHM_IS_GPU` | Compile-time check: true when compiling for GPU | - -These annotations expand to CUDA `__host__ __device__` or HIP equivalents when GPU support is enabled, and are no-ops on CPU-only builds. 
- -```cpp -// Example: Method accessible from both CPU and GPU -HSHM_INLINE_CROSS_FUN -T& vector::operator[](size_t index) { - return data_[index]; -} -``` - -## Related Documentation - -- [Allocator Guide](../allocator/allocator_guide) - Memory allocators used by these data structures -- [Atomic Types Guide](./atomic_types_guide) - Atomic primitives used in ring buffers diff --git a/docs/sdk/context-transport-primitives/2.types/ring_buffer_guide.md b/docs/sdk/context-transport-primitives/2.types/ring_buffer_guide.md new file mode 100644 index 0000000..1ed5e97 --- /dev/null +++ b/docs/sdk/context-transport-primitives/2.types/ring_buffer_guide.md @@ -0,0 +1,87 @@ +--- +sidebar_position: 2 +--- + +# Ring Buffer Guide + +## Overview + +**Source:** `hermes_shm/data_structures/ipc/ring_buffer.h` + +A lock-free circular queue for concurrent producer-consumer patterns. Configurable via compile-time flags. + +## Configuration Flags + +```cpp +namespace hshm::ipc { +enum RingQueueFlag { + RING_BUFFER_SPSC_FLAGS = 0x01, // Single Producer Single Consumer + RING_BUFFER_MPSC_FLAGS = 0x02, // Multiple Producer Single Consumer + RING_BUFFER_WAIT_FOR_SPACE = 0x04, // Block until space available + RING_BUFFER_ERROR_ON_NO_SPACE = 0x08, // Return error if full + RING_BUFFER_DYNAMIC_SIZE = 0x10, // Resize when full + RING_BUFFER_FIXED_SIZE = 0x20, // Fixed-size buffer +}; +} +``` + +## Pre-defined Type Aliases + +| Alias | Flags | Description | +|-------|-------|-------------| +| `spsc_ring_buffer` | SPSC + Fixed + Error | Single-producer single-consumer, fixed size | +| `mpsc_ring_buffer` | MPSC + Fixed + Wait | Multi-producer single-consumer, blocks when full | +| `circular_mpsc_ring_buffer` | MPSC + Fixed + Error | Multi-producer single-consumer, wraps around | +| `ext_ring_buffer` | MPSC + Dynamic + Wait | Extensible, resizes when full | + +## Usage + +```cpp +#include + +// Create a fixed-size SPSC ring buffer with depth 1024 +hshm::ipc::spsc_ring_buffer rb(alloc, 1024); + +// 
Producer +rb.Push(42); +rb.Emplace(100); + +// Consumer +int val; +if (rb.Pop(val)) { + // Got value +} + +// Query state +size_t count = rb.Size(); +bool empty = rb.Empty(); +bool full = rb.Full(); +``` + +## RingBufferEntry + +Each entry has an atomic ready flag for lock-free synchronization: + +```cpp +template +struct RingBufferEntry { + bool IsReady(); // Check if entry has data + void SetReady(); // Mark entry as containing data + void ClearReady(); // Mark entry as consumed + T& GetData(); // Access the entry data +}; +``` + +## Internal Design + +- Uses atomic head/tail pointers for lock-free operation +- Head is the consumer pointer, tail is the producer pointer +- Queue capacity is `depth + 1` to distinguish full from empty +- MPSC mode uses atomic tail with CAS for concurrent producers +- SPSC mode uses non-atomic pointers for maximum performance +- Includes worker metadata: `assigned_worker_id_`, `signal_fd_`, `tid_`, `active_` + +## Related Documentation + +- [Allocator Guide](../allocator/allocator_guide) - Memory allocators used by ring buffers +- [Atomic Types Guide](./atomic_types_guide) - Atomic primitives used in ring buffers diff --git a/docs/sdk/context-transport-primitives/2.types/string_guide.md b/docs/sdk/context-transport-primitives/2.types/string_guide.md new file mode 100644 index 0000000..e5fa671 --- /dev/null +++ b/docs/sdk/context-transport-primitives/2.types/string_guide.md @@ -0,0 +1,54 @@ +--- +sidebar_position: 3 +--- + +# String Guide + +## Overview + +**Source:** `hermes_shm/data_structures/priv/string.h` + +An SSO (Short String Optimization) string backed by `hshm::priv::vector`. Short strings (32 bytes or fewer) are stored inline without heap allocation. 
+
+## Usage
+
+```cpp
+#include <hermes_shm/data_structures/priv/string.h>
+
+// Construction
+hshm::string s1("hello");
+hshm::string s2(std::string("world"));
+hshm::string s3(s1); // Copy
+
+// Standard string API
+s1.append(" world");
+s1 += "!";
+size_t pos = s1.find("world");
+hshm::string sub = s1.substr(0, 5);
+bool eq = (s1 == s2);
+
+// Access
+const char* cstr = s1.c_str();
+char ch = s1[0];
+size_t len = s1.size();
+
+// Conversion to/from std::string
+std::string std_str = s1.str();
+std::string std_str2 = static_cast<std::string>(s1);
+```
+
+## Template Parameters
+
+- `T` - Character type (default: `char`)
+- `AllocT` - Allocator type
+- `SSOSize` - Short string buffer size (default: 32 bytes)
+
+## Key Features
+
+- Short strings (32 bytes or fewer) stored inline without heap allocation
+- Longer strings use `hshm::priv::vector` as backing store
+- Full `std::string`-compatible API: `find`, `substr`, `replace`, `starts_with`, `ends_with`
+- Annotated with `HSHM_CROSS_FUN` for GPU compatibility
+- Serialization support via `save()`/`load()`
+
+**Type Alias:** `hshm::string` is a convenience alias for `hshm::priv::basic_string`.
diff --git a/docs/sdk/context-transport-primitives/2.types/unordered_map_guide.md b/docs/sdk/context-transport-primitives/2.types/unordered_map_guide.md
new file mode 100644
index 0000000..b51a8f4
--- /dev/null
+++ b/docs/sdk/context-transport-primitives/2.types/unordered_map_guide.md
@@ -0,0 +1,104 @@
+---
+sidebar_position: 4
+---
+
+# Unordered Map Guide
+
+## Overview
+
+**Source:** `hermes_shm/data_structures/priv/unordered_map_ll.h`
+
+A hash map implementation using a vector of lists design that provides efficient concurrent access when combined with external locking. Each bucket contains a `std::list` of key-value pairs; the hash space is partitioned across a fixed number of buckets set at construction time.
+
+**Key Characteristics:**
+- **Vector of Lists Design**: Uses a vector of buckets, each containing a list of key-value pairs
+- **External Locking Required**: No internal mutexes — users must provide synchronization
+- **Bucket Partitioning**: Hash space is partitioned across multiple buckets for better cache locality
+- **Standard API**: Compatible with `std::unordered_map` interface
+- **NOT Shared-Memory Compatible**: For runtime-only data structures, not task parameters
+
+## Basic Usage
+
+```cpp
+#include <hermes_shm/data_structures/priv/unordered_map_ll.h>
+
+// Create map with 32 buckets
+hshm::priv::unordered_map_ll<int, std::string> map(32);
+
+// Insert
+auto [inserted, ptr] = map.insert(1, "hello");
+map.insert_or_assign(2, "world");
+map[3] = "foo";
+
+// Lookup
+std::string* val = map.find(1); // Returns nullptr if not found
+const std::string& ref = map.at(2); // Throws if not found
+bool exists = map.contains(3);
+
+// Remove
+map.erase(1);
+map.clear();
+
+// Iterate
+map.for_each([](const int& key, std::string& value) {
+ // Process each element
+});
+```
+
+## Constructor
+
+```cpp
+hshm::priv::unordered_map_ll<Key, T> map(max_concurrency);
+```
+
+**Parameters:**
+- `max_concurrency`: Number of buckets (default: 16). Higher values give better distribution at the cost of more memory. Typical values: 16-64.
+
+## API Reference
+
+```cpp
+// Insertion operations
+auto [inserted, value_ptr] = map.insert(key, value); // Insert if not exists
+auto [inserted, value_ptr] = map.insert_or_assign(key, value); // Insert or update
+T& ref = map[key]; // Insert default if missing
+
+// Lookup operations
+T* ptr = map.find(key); // Returns nullptr if not found
+const T* ptr = map.find(key) const; // Const version
+T& ref = map.at(key); // Throws if not found
+bool exists = map.contains(key); // Check existence
+size_t count = map.count(key); // Returns 0 or 1
+
+// Removal operations
+size_t erased = map.erase(key); // Returns number of elements erased
+map.clear(); // Remove all elements
+
+// Size operations
+size_t s = map.size(); // Total element count
+bool e = map.empty(); // Check if empty
+size_t b = map.bucket_count(); // Number of buckets
+
+// Iteration
+map.for_each([](const Key& key, T& value) { /* ... */ });
+```
+
+Insert operations return `std::pair<bool, T*>` where `first` is `true` if insertion occurred and `second` is a pointer to the value.
+ +## Key Differences from std::unordered_map + +| Feature | std::unordered_map | hshm::priv::unordered_map_ll | +|---------|-------------------|----------------------| +| Internal Structure | Implementation-defined | Vector of lists (explicit) | +| Bucket Count | Dynamic rehashing | Fixed at construction | +| Iterator Stability | Unstable across insertions | Stable (list-based) | +| Shared Memory | Not compatible | Not compatible | +| Return Values | Iterators | Pointers to values | + +## When to Use + +| Scenario | Recommendation | +|----------|---------------| +| Runtime container data structures (caches, registries) | `hshm::priv::unordered_map_ll` | +| Task input/output parameters | `std::unordered_map` or `chi::ipc::` types | +| Client-side code | `std::unordered_map` | +| Data requiring serialization | `std::unordered_map` with cereal | diff --git a/docs/sdk/context-transport-primitives/2.types/vector_guide.md b/docs/sdk/context-transport-primitives/2.types/vector_guide.md new file mode 100644 index 0000000..98f7fd3 --- /dev/null +++ b/docs/sdk/context-transport-primitives/2.types/vector_guide.md @@ -0,0 +1,87 @@ +--- +sidebar_position: 1 +--- + +# Vector Guide + +## Overview + +HSHM provides two vector variants: `hshm::ipc::vector` for shared memory and `hshm::priv::vector` for private memory. For standard ChiMod development, use `std::vector`. The HSHM vectors are needed when data must be accessible from GPU kernels or live in shared memory across processes. + +## hshm::ipc::vector + +**Source:** `hermes_shm/data_structures/ipc/vector.h` + +A dynamic array stored in shared memory using offset-based pointers (`OffsetPtr`) for process-independent addressing. 
+
+```cpp
+#include <hermes_shm/data_structures/ipc/vector.h>
+
+// Create with an allocator
+hshm::ipc::vector<int> vec(alloc, 10); // 10 elements
+
+// Standard vector operations
+vec.push_back(42);
+vec.emplace_back(100);
+int val = vec[0];
+vec.resize(20);
+vec.reserve(50);
+vec.clear();
+
+// Iteration
+for (auto it = vec.begin(); it != vec.end(); ++it) {
+ process(*it);
+}
+```
+
+**Template Parameters:**
+- `T` - Element type
+- `AllocT` - Allocator type (determines shared vs private memory)
+
+**Key Differences from std::vector:**
+- Requires an allocator at construction time
+- Uses `OffsetPtr` internally instead of raw pointers
+- Safe for cross-process access in shared memory
+- Annotated with `HSHM_CROSS_FUN` for GPU compatibility
+
+## hshm::priv::vector
+
+**Source:** `hermes_shm/data_structures/priv/vector.h`
+
+A private-memory vector with allocator integration. Supports the same API as `std::vector` plus serialization.
+
+```cpp
+#include <hermes_shm/data_structures/priv/vector.h>
+
+// Standard construction
+hshm::priv::vector<int> vec = {1, 2, 3, 4, 5};
+hshm::priv::vector<int> vec2(10, 0); // 10 zeros
+
+// Full STL-compatible API
+vec.push_back(6);
+vec.pop_back();
+vec.insert(vec.begin() + 2, 99);
+vec.erase(vec.begin());
+
+// Reverse iteration
+for (auto it = vec.rbegin(); it != vec.rend(); ++it) {
+ process(*it);
+}
+```
+
+**Optimizations:**
+- Uses `memcpy`/`memmove` for trivially copyable types (POD optimization)
+- Exponential capacity growth strategy
+- Annotated with `HSHM_CROSS_FUN` for GPU compatibility
+
+## When to Use Each
+
+| Variant | Use Case |
+|---------|----------|
+| `std::vector` | Default choice for ChiMod task data |
+| `hshm::priv::vector` | Private memory with serialization support or GPU access |
+| `hshm::ipc::vector` | Cross-process shared memory regions |
+
+## Related Documentation
+
+- [Allocator Guide](../allocator/allocator_guide) - Memory allocators used by these vectors
diff --git a/docusaurus.config.ts b/docusaurus.config.ts
index 9b1f119..826b45b 100644
--- a/docusaurus.config.ts
+++
b/docusaurus.config.ts @@ -131,7 +131,7 @@ const config: Config = { title: 'Community', items: [ {label: 'GitHub', href: 'https://github.com/iowarp'}, - {label: 'Issues', href: 'https://github.com/iowarp/iowarp/issues'}, + {label: 'Issues', href: 'https://github.com/iowarp/clio-core/issues'}, {label: 'Discussions', href: 'https://github.com/orgs/iowarp/discussions'}, {label: 'Zulip Chat', href: 'https://iowarp.zulipchat.com'}, ],