6 changes: 3 additions & 3 deletions inference-platforms/README.md
@@ -12,7 +12,7 @@ This directory includes examples of OpenAI-accessible inference platforms and
proxies. Each is observable with OpenTelemetry-compatible backends such as
Elastic Stack.

-* [ArchGW](archgw) - [with tracing configuration][archgw]
+* [Plano](plano) - [with tracing configuration][plano]
* [Envoy AI Gateway](aigw) - with [OpenTelemetry tracing and metrics][aigw]
* [LiteLLM](litellm) - with [OpenTelemetry logging callbacks][litellm]
* [LlamaStack](llama-stack) - with [OpenTelemetry sinks][llama-stack]
@@ -138,12 +138,12 @@ To start and use Ollama, do the following:
1. Ensure `ollama` is installed
- On macOS/Linux: `brew install ollama`
- For Windows or otherwise, see the [download page][ollama-dl].
-2. In a separate terminal, run `OLLAMA_HOST=0.0.0.0 OLLAMA_CONTEXT_LENGTH=8192 ollama serve`
+2. In a separate terminal, run `OLLAMA_FLASH_ATTENTION=1 OLLAMA_KV_CACHE_TYPE=q8_0 ollama serve`
- This accepts OpenAI requests for any model on http://localhost:11434/v1
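Once `ollama serve` is running, the OpenAI-compatible endpoint can be sanity-checked with a short Python sketch. The helper names below are hypothetical, not part of this repository; only the URL and response shape come from the docs above:

```python
import json
from urllib.request import urlopen


def parse_model_ids(payload: dict) -> list[str]:
    """Extract model IDs from an OpenAI-style /v1/models response body."""
    return [model["id"] for model in payload.get("data", [])]


def list_models(base_url: str = "http://localhost:11434/v1") -> list[str]:
    """Fetch the model list from an OpenAI-compatible endpoint."""
    with urlopen(f"{base_url}/models", timeout=5) as resp:
        return parse_model_ids(json.load(resp))
```

If Ollama is up, `list_models()` should include any model you have pulled, such as `qwen3:0.6b`.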

---
[aigw]: https://aigateway.envoyproxy.io/docs/cli/aigwrun
-[archgw]: https://docs.planoai.dev/guides/observability/tracing.html
+[plano]: https://docs.planoai.dev/guides/observability/tracing.html
[litellm]: https://llama-stack.readthedocs.io/en/latest/building_applications/telemetry.html#configuration
[llama-stack]: https://llama-stack.readthedocs.io/en/latest/building_applications/telemetry.html#telemetry
[AgC]: https://github.com/masaic-ai-platform/AgC/blob/main/platform/README.md#setting-up-the-opentelemetry-collector
84 changes: 0 additions & 84 deletions inference-platforms/archgw/README.md

This file was deleted.

80 changes: 80 additions & 0 deletions inference-platforms/plano/README.md
@@ -0,0 +1,80 @@
# plano

This shows how to use [Plano][docs] as an OpenAI [LLM router][config], using
its `tracing` configuration for OpenTelemetry.

Plano defaults to native mode, downloading pre-compiled Envoy binaries to
`~/.plano/`. Envoy handles requests, collects telemetry, and forwards requests
to Ollama via the OpenAI API.

## Prerequisites

Start Ollama and your OpenTelemetry Collector via this repository's [README](../README.md).

## Run Plano

Plano is a Python command that runs Envoy natively (no Docker required). Run
`planoai` using `uv run` from [uv][uv]:

```bash
uv run --python 3.13 --with planoai -- planoai up plano_config.yaml
```

When finished, clean up like this:

```bash
uv run --python 3.13 --with planoai -- planoai down
```

## Call Plano with Python

Once Plano is running, use [uv][uv] to make an OpenAI request via
[chat.py](../chat.py):

```bash
uv run --exact -q --env-file env.local ../chat.py
```
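The request that `chat.py` sends can also be sketched directly with the standard library. The base URL, API key, and model name below come from `env.local`; the prompt and helper names are illustrative assumptions, not the actual contents of `chat.py`:

```python
import json
from urllib.request import Request, urlopen


def build_chat_request(model: str, prompt: str) -> dict:
    """Build a minimal OpenAI chat-completions payload."""
    return {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
    }


def chat(base_url: str = "http://localhost:12000/v1",
         model: str = "qwen3:0.6b",
         prompt: str = "Which ocean is Bouvet Island in? Answer with the ocean name only.") -> str:
    """POST a chat completion through Plano and return the reply text."""
    req = Request(
        f"{base_url}/chat/completions",
        data=json.dumps(build_chat_request(model, prompt)).encode(),
        headers={"Content-Type": "application/json",
                 "Authorization": "Bearer unused"},  # key is unused, per env.local
    )
    with urlopen(req, timeout=30) as resp:
        body = json.load(resp)
    return body["choices"][0]["message"]["content"]
```

Because the request goes through Plano's listener on port 12000 rather than straight to Ollama, Envoy records a trace and metrics for it.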

## Start Prometheus Scraping

### otel-tui

If you are using [otel-tui][otel-tui] to visualize OpenTelemetry data, you can
add Plano's Prometheus endpoint to it when starting, like this:

```bash
otel-tui --prom-target http://localhost:9901/stats?format=prometheus
```

### Elastic Stack

If your OpenTelemetry backend is Elasticsearch, you can pump Prometheus metrics
coming from Plano to Elasticsearch like this:

```bash
docker compose -f docker-compose-elastic.yml run --rm prometheus-pump
```

## Notes

OpenTelemetry signals are a function of native [Envoy support][envoy-otel]
and anything added in Plano's [wasm filter][plano-wasm].

* Traces come from Envoy, whose configuration is written by `planoai`. At the
  moment, this hard-codes some settings, including default ports.
* Prometheus metrics show the cluster as "openai_localhost" - the
provider_interface plus the first segment of the hostname.
* Until [this][openai-responses] resolves, don't use `--use-responses-api`.
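The cluster-naming rule in the second bullet can be sketched as a hypothetical helper (not Plano code), assuming the name is the provider interface joined by an underscore to the first label of the upstream hostname:

```python
def cluster_name(provider_interface: str, base_url: str) -> str:
    """Mimic the observed metric label: provider interface plus the first
    segment of the upstream hostname, joined with an underscore."""
    host = base_url.split("//", 1)[-1].split("/", 1)[0]  # strip scheme and path
    first_segment = host.split(":", 1)[0].split(".", 1)[0]  # drop port, keep first label
    return f"{provider_interface}_{first_segment}"
```

Under that assumption, `openai` + `http://localhost:11434` yields `openai_localhost`, and the old Docker setup's `ollama` + `host.docker.internal` yields `ollama_host`, matching the comment removed from the Prometheus config below.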

The chat prompt was designed to produce a repeatable answer, but LLM output is
not deterministic: you may see something besides 'South Atlantic Ocean.'. If
so, just run it again.

---
[docs]: https://docs.planoai.dev
[config]: https://docs.planoai.dev/guides/observability/tracing.html
[envoy-otel]: https://www.envoyproxy.io/docs/envoy/latest/api-v3/config/trace/v3/opentelemetry.proto#extension-envoy-tracers-opentelemetry
[plano-wasm]: https://github.com/katanemo/plano/blob/main/README.md
[uv]: https://docs.astral.sh/uv/getting-started/installation/
[openai-responses]: https://github.com/katanemo/plano/issues/476
[otel-tui]: https://github.com/ymtdzzz/otel-tui
@@ -1,8 +1,8 @@
configs:
-  # Configuration is simplified from archgw here:
-  # https://github.com/katanemo/archgw/blob/main/docs/source/guides/observability/monitoring.rst
+  # Configuration is simplified from plano here:
+  # https://github.com/katanemo/plano/blob/main/docs/source/guides/observability/monitoring.rst
#
-  # Note: The cluster name for ollama + host.docker.internal = ollama_host
+  # Note: The cluster name for openai + localhost = openai_localhost
prometheus-pump-config:
content: |
receivers:
@@ -11,16 +11,16 @@ configs:
global:
evaluation_interval: 5s
scrape_configs:
-  - job_name: 'archgw'
+  - job_name: 'plano'
honor_timestamps: true
scrape_interval: 5s
scrape_timeout: 5s
metrics_path: /stats
static_configs:
-  - targets: ['localhost:19901']
+  - targets: ['localhost:9901']
params:
format: ["prometheus"]

processors:
# Elastic Stack doesn't currently support cumulative metrics
cumulativetodelta:
@@ -36,7 +36,7 @@ configs:
enabled: true
flush:
interval: 1s # improve responsiveness in example apps (default 30s)

service:
pipelines:
metrics:
@@ -58,4 +58,3 @@ services:
mode: 0444
extra_hosts: # send localhost traffic to the docker host, e.g. your laptop
- "localhost:host-gateway"

@@ -1,9 +1,9 @@
-# Arch gateway endpoint
+# Plano endpoint
OPENAI_BASE_URL=http://localhost:12000/v1
OPENAI_API_KEY=unused
-CHAT_MODEL=qwen3:0.6B
+CHAT_MODEL=qwen3:0.6b

-OTEL_SERVICE_NAME=archgw
+OTEL_SERVICE_NAME=plano

# Disable resource detectors by default
OTEL_PYTHON_DISABLED_RESOURCE_DETECTORS=all
@@ -1,18 +1,18 @@
-version: v0.1.0
+version: v0.3.0

listeners:
egress_traffic:
- type: model
name: model_listener
address: 0.0.0.0
port: 12000
message_format: openai
timeout: 30s

-llm_providers:
+model_providers:
# Use ollama directly, since we can't inherit OPENAI_BASE_URL etc and need
# to hard-code the model anyway.
-  - model: ollama/qwen3:0.6b
-    # This configuration is converted to Envoy and run inside Docker.
-    base_url: http://host.docker.internal:11434
+  - model: openai/qwen3:0.6b
+    # In native mode, Envoy runs on the host alongside ollama.
> **Contributor Author:** notably no longer requires docker to run envoy
+    base_url: http://localhost:11434
default: true

tracing: