diff --git a/docs/CONNECTION_POOL.md b/docs/CONNECTION_POOL.md new file mode 100644 index 0000000..fe12542 --- /dev/null +++ b/docs/CONNECTION_POOL.md @@ -0,0 +1,229 @@ +# Connection Pool Configuration + +This document explains how `rust_loadtest` manages HTTP connections and how to +configure pooling behavior for different test scenarios. + +## How Connection Pooling Works + +Each load test builds a single `reqwest::Client` that maintains a connection +pool per target host. When a request completes, the underlying TCP connection +(including its TLS session) is returned to the pool. Subsequent requests grab +an existing connection from the pool instead of performing a new TCP handshake +and TLS negotiation. + +This is the **default behavior** — no special configuration is needed to reuse +connections. + +### When connections are reused + +- Workers fire requests continuously (e.g., RPS >= 1) +- Idle connections haven't exceeded the idle timeout +- The pool hasn't reached the max idle limit + +### When new connections are created + +- First request from each worker (no pooled connection exists yet) +- Idle timeout expired — the pooled connection was closed +- `maxIdlePerHost` is set to 0 — pooling is effectively disabled +- The server closed the connection (e.g., server-side idle timeout) + +## Configuration + +Pool settings can be configured via **environment variables** (applied at +startup) or via the **YAML config** (applied per-test on `POST /config`). +YAML values override environment variables when present. 
+ +### Environment Variables + +| Variable | Default | Description | +|--------------------------|---------|--------------------------------------------------| +| `POOL_MAX_IDLE_PER_HOST` | `32` | Maximum idle connections kept per host | +| `POOL_IDLE_TIMEOUT_SECS` | `30` | Seconds an idle connection stays in the pool | +| `TCP_NODELAY` | `true` | Disable Nagle's algorithm for lower latency | +| `REQUEST_TIMEOUT_SECS` | `30` | Per-request timeout | + +### YAML Config + +Add an optional `pool` section under `config`: + +```yaml +config: + baseUrl: https://example.com + pool: + maxIdlePerHost: 32 + idleTimeoutSecs: 30 + metricsReuseThresholdMs: 100 +``` + +| Field | Default | Description | +|--------------------------|---------|--------------------------------------------------| +| `maxIdlePerHost` | `32` | Max idle connections per host. Set to `0` to disable pooling. | +| `idleTimeoutSecs` | `30` | Seconds before idle connections are closed. Set to `0` to close immediately. | +| `metricsReuseThresholdMs`| `100` | Latency threshold (ms) for the Prometheus metrics heuristic. Does **not** affect actual connection behavior — only how metrics classify requests as "new" vs "reused". | + +## Use Case: Force New Connection Per Request + +Use this when you need every request to perform a full TCP + TLS handshake. 
+Useful for testing: + +- TLS handshake latency and overhead +- Server-side connection establishment handling under load +- Certificate validation performance +- Load balancer connection distribution + +```yaml +version: "1.0" +config: + baseUrl: https://api.example.com + workers: 10 + duration: 5m + timeout: 30s + pool: + maxIdlePerHost: 0 + idleTimeoutSecs: 0 +load: + model: rps + target: 100 +scenarios: + - name: new-connection-test + weight: 100 + steps: + - name: request + request: + method: GET + path: /health + assertions: + - type: statusCode + expected: 200 +``` + +With environment variables: + +```bash +POOL_MAX_IDLE_PER_HOST=0 POOL_IDLE_TIMEOUT_SECS=0 +``` + +## Use Case: Reuse Connections (Default) + +Use this for standard load testing where you want realistic connection behavior. +Connections are established once and reused across requests, which is how most +production clients behave. + +```yaml +version: "1.0" +config: + baseUrl: https://api.example.com + workers: 25 + duration: 10m + timeout: 30s + # No pool section needed — defaults reuse connections +load: + model: rps + target: 1000 +scenarios: + - name: reuse-connection-test + weight: 100 + steps: + - name: request + request: + method: GET + path: /health + assertions: + - type: statusCode + expected: 200 +``` + +## Use Case: Long-Lived Connection Reuse with Infrequent Requests + +Use this when requests are spaced far apart (e.g., every 5 minutes) but you +want to keep the same TCP/TLS session alive between them. Increase the idle +timeout to prevent the pool from closing connections during gaps. 
+ +```yaml +version: "1.0" +config: + baseUrl: https://api.example.com + workers: 1 + duration: 1h + timeout: 30s + pool: + maxIdlePerHost: 1 + idleTimeoutSecs: 600 +load: + model: rps + target: 1 +scenarios: + - name: keepalive-test + weight: 100 + steps: + - name: request + request: + method: POST + path: /oauth2/v1/token + body: "grant_type=client_credentials&client_id=my_id&client_secret=my_secret" + headers: + Content-Type: application/x-www-form-urlencoded + assertions: + - type: statusCode + expected: 200 + thinkTime: + min: 4m + max: 5m +standby: + workers: 1 + rps: 1.0 +``` + +**Note:** Even with a high idle timeout, the remote server may close the +connection on its side (common server idle timeouts are 60-120s). The pool +will transparently open a new connection when this happens. + +## Monitoring Connection Reuse + +Prometheus metrics are available on port 9090: + +| Metric | Type | Description | +|-----------------------------------------|------------|------------------------------------------| +| `connection_pool_likely_new_total` | Counter | Requests classified as new connections | +| `connection_pool_likely_reused_total` | Counter | Requests classified as reused connections| +| `connection_pool_reuse_rate_percent` | Gauge | Current reuse percentage | +| `connection_pool_requests_total` | Counter | Total requests tracked | +| `connection_pool_max_idle_per_host` | Gauge | Configured max idle setting | +| `connection_pool_idle_timeout_seconds` | Gauge | Configured idle timeout setting | + +### Important: Metrics Are Heuristic-Based + +The "new" vs "reused" classification uses a **latency heuristic**, not actual +connection state (reqwest does not expose this). Requests slower than +`metricsReuseThresholdMs` (default: 100ms) are classified as "likely new +connection" because a TLS handshake typically adds 50-150ms. 
+ +This means: + +- Fast targets where TLS completes in <100ms will **undercount** new connections +- Slow targets where reused requests take >100ms will **overcount** new connections + +Tune `metricsReuseThresholdMs` in the YAML to match your target's typical TLS +handshake time for more accurate classification. For definitive connection +tracking, check server-side access logs. + +### Grafana Queries + +**New vs reused connections over time (time series panel):** + +| Query | Legend | +|-------------------------------------------------|----------| +| `rate(connection_pool_likely_reused_total[1m])` | Reused | +| `rate(connection_pool_likely_new_total[1m])` | New | + +**Reuse rate (single stat panel):** + +```promql +connection_pool_reuse_rate_percent +``` + +**Percentage of new connections (single stat panel):** + +```promql +connection_pool_likely_new_total / connection_pool_requests_total * 100 +``` diff --git a/src/config.rs b/src/config.rs index 5e5d65d..ae9385d 100644 --- a/src/config.rs +++ b/src/config.rs @@ -101,6 +101,7 @@ pub struct Config { // When Some, these override env-var defaults when building the HTTP client. pub pool_max_idle_per_host: Option, pub pool_idle_timeout_secs: Option, + pub pool_metrics_reuse_threshold_ms: Option, } /// Helper to get a required environment variable. 
@@ -235,10 +236,15 @@ impl Config { let auto_disable_percentiles_on_warning = env_bool("AUTO_DISABLE_PERCENTILES_ON_WARNING", true); - let (pool_max_idle_per_host, pool_idle_timeout_secs) = match &yaml_config.config.pool { - Some(p) => (p.max_idle_per_host, p.idle_timeout_secs), - None => (None, None), - }; + let (pool_max_idle_per_host, pool_idle_timeout_secs, pool_new_connection_threshold_ms) = + match &yaml_config.config.pool { + Some(p) => ( + p.max_idle_per_host, + p.idle_timeout_secs, + p.metrics_reuse_threshold_ms, + ), + None => (None, None, None), + }; let config = Config { target_url, @@ -263,6 +269,7 @@ impl Config { cluster: ClusterConfig::from_env(), pool_max_idle_per_host, pool_idle_timeout_secs, + pool_metrics_reuse_threshold_ms, }; config.validate()?; @@ -330,10 +337,15 @@ impl Config { let auto_disable_percentiles_on_warning = env_bool("AUTO_DISABLE_PERCENTILES_ON_WARNING", true); - let (pool_max_idle_per_host, pool_idle_timeout_secs) = match &yaml_config.config.pool { - Some(p) => (p.max_idle_per_host, p.idle_timeout_secs), - None => (None, None), - }; + let (pool_max_idle_per_host, pool_idle_timeout_secs, pool_new_connection_threshold_ms) = + match &yaml_config.config.pool { + Some(p) => ( + p.max_idle_per_host, + p.idle_timeout_secs, + p.metrics_reuse_threshold_ms, + ), + None => (None, None, None), + }; let config = Config { target_url, @@ -358,6 +370,7 @@ impl Config { cluster: ClusterConfig::from_env(), pool_max_idle_per_host, pool_idle_timeout_secs, + pool_metrics_reuse_threshold_ms, }; config.validate()?; @@ -525,6 +538,7 @@ impl Config { cluster: ClusterConfig::from_env(), pool_max_idle_per_host: None, pool_idle_timeout_secs: None, + pool_metrics_reuse_threshold_ms: None, }; config.validate()?; @@ -730,6 +744,7 @@ impl Config { cluster: ClusterConfig::for_testing(), pool_max_idle_per_host: None, pool_idle_timeout_secs: None, + pool_metrics_reuse_threshold_ms: None, } } diff --git a/src/connection_pool.rs b/src/connection_pool.rs index 
99c1452..ffd16c0 100644 --- a/src/connection_pool.rs +++ b/src/connection_pool.rs @@ -191,7 +191,7 @@ pub struct PoolStatsTracker { /// Threshold for considering a connection "likely new" (milliseconds) /// Requests slower than this are likely establishing new connections - new_connection_threshold_ms: u64, + new_connection_threshold_ms: Arc<Mutex<u64>>, } impl PoolStatsTracker { @@ -203,10 +203,15 @@ pub fn new(new_connection_threshold_ms: u64) -> Self { Self { stats: Arc::new(Mutex::new(ConnectionStats::default())), - new_connection_threshold_ms, + new_connection_threshold_ms: Arc::new(Mutex::new(new_connection_threshold_ms)), } } + /// Update the latency threshold used to classify new vs reused connections. + pub fn set_threshold_ms(&self, threshold_ms: u64) { + *self.new_connection_threshold_ms.lock().unwrap() = threshold_ms; + } + /// Record a request with timing information. /// /// Uses latency to infer connection reuse. Requests with very low latency @@ -214,6 +219,7 @@ /// may have established a new connection (including TLS handshake).
pub fn record_request(&self, latency_ms: u64) { let now = Instant::now(); + let threshold = *self.new_connection_threshold_ms.lock().unwrap(); let mut stats = self.stats.lock().unwrap(); stats.total_requests += 1; @@ -228,21 +234,19 @@ // Infer connection type based on latency // Fast requests (<threshold) likely reused a pooled connection - if latency_ms >= self.new_connection_threshold_ms { + if latency_ms >= threshold { stats.likely_new_connections += 1; CONNECTION_POOL_LIKELY_NEW.inc(); debug!( - latency_ms = latency_ms, - threshold = self.new_connection_threshold_ms, - "Request latency suggests new connection" + latency_ms, + threshold, "Request latency suggests new connection" ); } else { stats.likely_reused_connections += 1; CONNECTION_POOL_LIKELY_REUSED.inc(); debug!( - latency_ms = latency_ms, - threshold = self.new_connection_threshold_ms, - "Request latency suggests reused connection" + latency_ms, + threshold, "Request latency suggests reused connection" ); } diff --git a/src/main.rs b/src/main.rs index afab893..09060d4 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1136,6 +1136,12 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> { h.abort(); } + // Apply pool stats threshold from YAML and reset counters for new test. + if let Some(threshold_ms) = new_cfg.pool_metrics_reuse_threshold_ms { + GLOBAL_POOL_STATS.set_threshold_ms(threshold_ms); + } + GLOBAL_POOL_STATS.reset(); + // Rebuild HTTP client in case TLS/pool config changed. let new_client = match rust_loadtest::client::build_client(&new_cfg.to_client_config()) { diff --git a/src/yaml_config.rs b/src/yaml_config.rs index 1e72c1a..67462a5 100644 --- a/src/yaml_config.rs +++ b/src/yaml_config.rs @@ -118,6 +118,13 @@ pub struct YamlPoolConfig { /// Set to 0 to immediately close connections after each request. #[serde(rename = "idleTimeoutSecs")] pub idle_timeout_secs: Option<u64>, + + /// Latency threshold in milliseconds used by Prometheus metrics to classify + /// a request as a new connection vs a reused one (default: 100).
Requests + /// slower than this are counted as "likely new connection". Does NOT affect + /// actual connection behavior — only the metrics heuristic. + #[serde(rename = "metricsReuseThresholdMs")] + pub metrics_reuse_threshold_ms: Option<u64>, } fn default_timeout() -> YamlDuration {