diff --git a/inference-platforms/README.md b/inference-platforms/README.md index 18ecc7e..ab9549d 100644 --- a/inference-platforms/README.md +++ b/inference-platforms/README.md @@ -12,7 +12,7 @@ This directory includes examples of OpenAI accessible inferences platforms and proxies. Each are observable with OpenTelemetry compatible backends such as Elastic Stack. -* [ArchGW](archgw) - [with tracing configuration][archgw] +* [Plano](plano) - [with tracing configuration][plano] * [Envoy AI Gateway](aigw) - with [OpenTelemetry tracing and metrics][aigw] * [LiteLLM](litellm) - with [OpenTelemetry logging callbacks][litellm] * [LlamaStack](llama-stack) - with [OpenTelemetry sinks][llama-stack] @@ -138,12 +138,12 @@ To start and use Ollama, do the following: 1. Ensure `ollama` is installed - On macOS/Linux: `brew install ollama` - For Windows or otherwise, see the [download page][ollama-dl]. -2. In a separate terminal, run `OLLAMA_HOST=0.0.0.0 OLLAMA_CONTEXT_LENGTH=8192 ollama serve` +2. In a separate terminal, run `OLLAMA_FLASH_ATTENTION=1 OLLAMA_KV_CACHE_TYPE=q8_0 ollama serve` - This accepts OpenAI requests for any model on http://localhost:11434/v1 --- [aigw]: https://aigateway.envoyproxy.io/docs/cli/aigwrun -[archgw]: https://docs.planoai.dev/guides/observability/tracing.html +[plano]: https://docs.planoai.dev/guides/observability/tracing.html [litellm]: https://llama-stack.readthedocs.io/en/latest/building_applications/telemetry.html#configuration [llama-stack]: https://llama-stack.readthedocs.io/en/latest/building_applications/telemetry.html#telemetry [AgC]: https://github.com/masaic-ai-platform/AgC/blob/main/platform/README.md#setting-up-the-opentelemetry-collector diff --git a/inference-platforms/archgw/README.md b/inference-platforms/archgw/README.md deleted file mode 100644 index 0ef4ce4..0000000 --- a/inference-platforms/archgw/README.md +++ /dev/null @@ -1,84 +0,0 @@ -# archgw - -This shows how to use the Arch Gateway as an OpenAI [LLM router][docs], using 
-its [`tracing` configuration][config] for OpenTelemetry. - -Arch Gateway does not serve OpenAI requests. Rather, it configures an Envoy -proxy according to its configuration. Envoy handles requests, collects -telemetry and forwards them to Ollama via the OpenAI API. - -## Setup - -Start ollama and the otel collector via this repository's [README](../../README.md). - -## Run Arch Gateway - -Arch Gateway is a python command that internally runs Docker. Hence, you need a -working Docker configuration. Run `archgw` using `uv run` from [uv][uv]: - -```bash -uv run --python 3.12 --with archgw -- archgw up arch_config.yaml -``` - -When finished, clean up like this: - -```bash -uv run --python 3.12 --with archgw -- archgw down -``` - -## Start Prometheus Scraping - -### Elastic Stack - -If your OpenTelemetry backend is Elasticsearch, you can pump Prometheus metrics -coming from Arch Gateway to Elasticsearch like this: - -```bash -docker compose -f docker-compose-elastic.yml run --rm prometheus-pump -``` - -### otel-tui - -If you are using [otel-tui][otel-tui] to visualize OpenTelemetry data, you can -add Arch Gateway's Prometheus endpoint to it when starting, like this: - -```bash -otel-tui --prom-target http://localhost:19901/stats?format=prometheus -``` - -## Call Arch Gateway with python - -Once Arch Gateway is running, use [uv][uv] to make an OpenAI request via -[chat.py](../chat.py): - -```bash -uv run --exact -q --env-file env.local ../chat.py -``` - -## Notes - -OpenTelemetry signals are a function of native [Envoy support][envoy-otel] -and anything added in Arch Gateway's [wasm filter][archgw-wasm]. - -* `archgw` invokes `envoy` in a Docker container, which is why this has no - instructions to run from Docker (to avoid nested docker). -* Traces come from Envoy, whose configuration is written by `archgw`. At the - moment, this hard-codes aspects including default ports. 
-* Prometheus metrics show the cluster as "ollama_host" - the provider_interface - plus the first segment of the hostname (dots truncate the rest). The "host" - comes from "host.docker.internal". -* Until [this][openai-responses] resolves, don't use `--use-responses-api`. -* This example uses Python 3.12 until torch has wheels for 3.14. - -The chat prompt was designed to be idempotent, but the results are not. You may -see something besides 'South Atlantic Ocean.'. -Just run it again until we find a way to make the results idempotent. - ---- -[docs]: https://github.com/katanemo/archgw?tab=readme-ov-file#use-arch-gateway-as-llm-router -[config]: https://docs.planoai.dev/guides/observability/tracing.html -[envoy-otel]: https://www.envoyproxy.io/docs/envoy/latest/api-v3/config/trace/v3/opentelemetry.proto#extension-envoy-tracers-opentelemetry -[archgw-wasm]: https://github.com/katanemo/plano/blob/main/README.md -[uv]: https://docs.astral.sh/uv/getting-started/installation/ -[openai-responses]: https://github.com/katanemo/plano/issues/476 -[otel-tui]: https://github.com/ymtdzzz/otel-tui diff --git a/inference-platforms/plano/README.md b/inference-platforms/plano/README.md new file mode 100644 index 0000000..fafc14d --- /dev/null +++ b/inference-platforms/plano/README.md @@ -0,0 +1,80 @@ +# plano + +This shows how to use [Plano][docs] as an OpenAI LLM router, using +its [`tracing` configuration][config] for OpenTelemetry. + +Plano defaults to native mode, downloading pre-compiled Envoy binaries to +`~/.plano/`. Envoy handles requests, collects telemetry and forwards them to +Ollama via the OpenAI API. + +## Prerequisites + +Start Ollama and your OpenTelemetry Collector via this repository's [README](../README.md). + +## Run Plano + +Plano is a python command that runs Envoy natively (no Docker required). 
Run +`planoai` using `uv run` from [uv][uv]: + +```bash +uv run --python 3.13 --with planoai -- planoai up plano_config.yaml +``` + +When finished, clean up like this: + +```bash +uv run --python 3.13 --with planoai -- planoai down +``` + +## Call Plano with python + +Once Plano is running, use [uv][uv] to make an OpenAI request via +[chat.py](../chat.py): + +```bash +uv run --exact -q --env-file env.local ../chat.py +``` + +## Start Prometheus Scraping + +### otel-tui + +If you are using [otel-tui][otel-tui] to visualize OpenTelemetry data, you can +add Plano's Prometheus endpoint to it when starting, like this: + +```bash +otel-tui --prom-target http://localhost:9901/stats?format=prometheus +``` + +### Elastic Stack + +If your OpenTelemetry backend is Elasticsearch, you can pump Prometheus metrics +coming from Plano to Elasticsearch like this: + +```bash +docker compose -f docker-compose-elastic.yml run --rm prometheus-pump +``` + +## Notes + +OpenTelemetry signals are a function of native [Envoy support][envoy-otel] +and anything added in Plano's [wasm filter][plano-wasm]. + +* Traces come from Envoy, whose configuration is written by `planoai`. At the + moment, this hard-codes aspects including default ports. +* Prometheus metrics show the cluster as "openai_localhost" - the + provider_interface plus the first segment of the hostname. +* Until [this][openai-responses] resolves, don't use `--use-responses-api`. + +The chat prompt was designed to be idempotent, but the results are not. You may +see something besides 'South Atlantic Ocean.'. +Just run it again until we find a way to make the results idempotent. 
+ +--- +[docs]: https://docs.planoai.dev +[config]: https://docs.planoai.dev/guides/observability/tracing.html +[envoy-otel]: https://www.envoyproxy.io/docs/envoy/latest/api-v3/config/trace/v3/opentelemetry.proto#extension-envoy-tracers-opentelemetry +[plano-wasm]: https://github.com/katanemo/plano/blob/main/README.md +[uv]: https://docs.astral.sh/uv/getting-started/installation/ +[openai-responses]: https://github.com/katanemo/plano/issues/476 +[otel-tui]: https://github.com/ymtdzzz/otel-tui diff --git a/inference-platforms/archgw/docker-compose-elastic.yml b/inference-platforms/plano/docker-compose-elastic.yml similarity index 83% rename from inference-platforms/archgw/docker-compose-elastic.yml rename to inference-platforms/plano/docker-compose-elastic.yml index 5793c84..6a74ce4 100644 --- a/inference-platforms/archgw/docker-compose-elastic.yml +++ b/inference-platforms/plano/docker-compose-elastic.yml @@ -1,8 +1,8 @@ configs: - # Configuration is simplified from archgw here: - # https://github.com/katanemo/archgw/blob/main/docs/source/guides/observability/monitoring.rst + # Configuration is simplified from plano here: + # https://github.com/katanemo/plano/blob/main/docs/source/guides/observability/monitoring.rst # - # Note: The cluster name for ollama + host.docker.internal = ollama_host + # Note: The cluster name for openai + localhost = openai_localhost prometheus-pump-config: content: | receivers: @@ -11,16 +11,16 @@ configs: global: evaluation_interval: 5s scrape_configs: - - job_name: 'archgw' + - job_name: 'plano' honor_timestamps: true scrape_interval: 5s scrape_timeout: 5s metrics_path: /stats static_configs: - - targets: ['localhost:19901'] + - targets: ['localhost:9901'] params: format: ["prometheus"] - + processors: # Elastic Stack doesn't currently support cumulative metrics cumulativetodelta: @@ -36,7 +36,7 @@ configs: enabled: true flush: interval: 1s # improve responsiveness in example apps (default 30s) - + service: pipelines: metrics: @@ -58,4 
+58,3 @@ services: mode: 0444 extra_hosts: # send localhost traffic to the docker host, e.g. your laptop - "localhost:host-gateway" - diff --git a/inference-platforms/archgw/env.local b/inference-platforms/plano/env.local similarity index 67% rename from inference-platforms/archgw/env.local rename to inference-platforms/plano/env.local index 055b469..39b7cac 100644 --- a/inference-platforms/archgw/env.local +++ b/inference-platforms/plano/env.local @@ -1,9 +1,9 @@ -# Arch gateway endpoint +# Plano endpoint OPENAI_BASE_URL=http://localhost:12000/v1 OPENAI_API_KEY=unused -CHAT_MODEL=qwen3:0.6B +CHAT_MODEL=qwen3:0.6b -OTEL_SERVICE_NAME=archgw +OTEL_SERVICE_NAME=plano # Disable resource detectors by default OTEL_PYTHON_DISABLED_RESOURCE_DETECTORS=all diff --git a/inference-platforms/archgw/arch_config.yaml b/inference-platforms/plano/plano_config.yaml similarity index 53% rename from inference-platforms/archgw/arch_config.yaml rename to inference-platforms/plano/plano_config.yaml index da6238a..b11511d 100644 --- a/inference-platforms/archgw/arch_config.yaml +++ b/inference-platforms/plano/plano_config.yaml @@ -1,18 +1,18 @@ -version: v0.1.0 +version: v0.3.0 listeners: - egress_traffic: + - type: model + name: model_listener address: 0.0.0.0 port: 12000 - message_format: openai timeout: 30s -llm_providers: +model_providers: # Use ollama directly, since we can't inherit OPENAI_BASE_URL etc and need # to hard-code the model anyway. - - model: ollama/qwen3:0.6b - # This configuration is converted to Envoy and run inside Docker. - base_url: http://host.docker.internal:11434 + - model: openai/qwen3:0.6b + # In native mode, Envoy runs on the host alongside ollama. + base_url: http://localhost:11434 default: true tracing: