From 01c570ea4d1263b7cb93b38c2545f9854125b771 Mon Sep 17 00:00:00 2001 From: Harshaneel Gokhale Date: Mon, 6 Apr 2026 18:38:51 -0700 Subject: [PATCH 1/2] chore: Added env vars for tuning perf of the container --- README.md | 61 ++++++++++++++++++++++++++++++++++++++++++++++----- entrypoint.sh | 25 ++++++++++++++++----- 2 files changed, 74 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 211876a..77d5569 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,11 @@ # localaik -[![CI](https://github.com/harshaneel/localaik/actions/workflows/release.yml/badge.svg)](https://github.com/harshaneel/localaik/actions/workflows/release.yml) -[![Docker Hub](https://img.shields.io/docker/v/gokhalh/localaik?sort=semver&label=Docker%20Hub)](https://hub.docker.com/r/gokhalh/localaik) -[![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE) -[![Go Report Card](https://goreportcard.com/badge/github.com/harshaneel/localaik)](https://goreportcard.com/report/github.com/harshaneel/localaik) -[![Go Version](https://img.shields.io/github/go-mod/go-version/harshaneel/localaik)](https://github.com/harshaneel/localaik/blob/main/go.mod) -[![Go Reference](https://pkg.go.dev/badge/github.com/harshaneel/localaik.svg)](https://pkg.go.dev/github.com/harshaneel/localaik) +[CI](https://github.com/harshaneel/localaik/actions/workflows/release.yml) +[Docker Hub](https://hub.docker.com/r/gokhalh/localaik) +[License: MIT](LICENSE) +[Go Report Card](https://goreportcard.com/report/github.com/harshaneel/localaik) +[Go Version](https://github.com/harshaneel/localaik/blob/main/go.mod) +[Go Reference](https://pkg.go.dev/github.com/harshaneel/localaik) A local compatibility server for the Gemini and OpenAI APIs. Run one container, point your SDK at `http://localhost:8090`, and get both protocol shapes on the same port for tests and development. @@ -58,6 +58,7 @@ More runnable samples (curl, Go, Python, JavaScript, Java) live under **[example ### Gemini SDK **Go:** + ```go client, err := genai.NewClient(ctx, &genai.ClientConfig{ APIKey: "test", @@ -66,6 +67,7 @@ client, err := genai.NewClient(ctx, &genai.ClientConfig{ ``` **Python:** + ```python from google import genai @@ -84,6 +86,7 @@ export GOOGLE_GEMINI_BASE_URL=http://localhost:8090 ### OpenAI SDK **Python:** + ```python from openai import OpenAI @@ -91,6 +94,7 @@ client = OpenAI(api_key="test", base_url="http://localhost:8090/v1") ``` **Go:** + ```go client := openai.NewClient( option.WithAPIKey("test"), @@ -109,6 +113,51 @@ client := openai.NewClient( Version-pinned tags follow the pattern `v0.1.1-gemma3-4b`, `v0.1.1-gemma3-12b`. +## Tuning (v0.1.3 onwards) + +Pass environment variables to tune the underlying model server: + +```bash +docker run -d -p 8090:8090 \ + -e LK_THREADS=8 \ + -e LK_CTX_SIZE=4096 \ + -e LK_FLASH_ATTN=1 \ + -e LK_CONT_BATCHING=1 \ + -e LK_PARALLEL=2 \ + gokhalh/localaik +``` + +Or with Docker Compose: + +```yaml +services: + localaik: + image: gokhalh/localaik + ports: + - "8090:8090" + environment: + LK_THREADS: 8 + LK_CTX_SIZE: 4096 + LK_FLASH_ATTN: 1 + LK_CONT_BATCHING: 1 + LK_PARALLEL: 2 +``` + + +| Variable | Default | Description | +| ------------------ | --------------- | ----------------------------------- | +| `LK_CTX_SIZE` | 8192 | Context window in tokens | +| `LK_THREADS` | auto | CPU threads for inference | +| `LK_THREADS_BATCH` | same as threads | CPU threads for prompt processing | +| `LK_BATCH_SIZE` | 2048 | Prompt processing batch size | +| `LK_UBATCH_SIZE` | 512 | Micro-batch size | +| `LK_GPU_LAYERS` | 0 | Layers offloaded to GPU (99 = all) | +| `LK_PARALLEL` | 1 | Max concurrent request slots | +| `LK_FLASH_ATTN` | 0 (off) | Flash attention (`1` to enable) | +| `LK_CONT_BATCHING` | 0 (off) | Continuous batching (`1` to enable) | +| `LK_MLOCK` | 0 (off) | Lock model in RAM (`1` to enable) | + + ## Implemented routes diff --git a/entrypoint.sh b/entrypoint.sh index ec9906a..d63367b 100644 --- a/entrypoint.sh +++ b/entrypoint.sh @@ -18,12 +18,25 @@ if [ ! -x "${LLAMA_SERVER_BIN}" ]; then fi fi -"${LLAMA_SERVER_BIN}" \ - --model /models/model.gguf \ - --mmproj /models/mmproj-model-f16.gguf \ - --port 8080 \ - --host 127.0.0.1 \ - --ctx-size 8192 & +SERVER_ARGS="--model /models/model.gguf" +SERVER_ARGS="${SERVER_ARGS} --mmproj /models/mmproj-model-f16.gguf" +SERVER_ARGS="${SERVER_ARGS} --port 8080" +SERVER_ARGS="${SERVER_ARGS} --host 127.0.0.1" +SERVER_ARGS="${SERVER_ARGS} --ctx-size ${LK_CTX_SIZE:-8192}" + +[ -n "${LK_THREADS:-}" ] && SERVER_ARGS="${SERVER_ARGS} --threads ${LK_THREADS}" +[ -n "${LK_THREADS_BATCH:-}" ] && SERVER_ARGS="${SERVER_ARGS} --threads-batch ${LK_THREADS_BATCH}" +[ -n "${LK_BATCH_SIZE:-}" ] && SERVER_ARGS="${SERVER_ARGS} --batch-size ${LK_BATCH_SIZE}" +[ -n "${LK_UBATCH_SIZE:-}" ] && SERVER_ARGS="${SERVER_ARGS} --ubatch-size ${LK_UBATCH_SIZE}" +[ -n "${LK_GPU_LAYERS:-}" ] && SERVER_ARGS="${SERVER_ARGS} --n-gpu-layers ${LK_GPU_LAYERS}" +[ -n "${LK_PARALLEL:-}" ] && SERVER_ARGS="${SERVER_ARGS} --parallel ${LK_PARALLEL}" + +[ "${LK_FLASH_ATTN:-0}" = "1" ] && SERVER_ARGS="${SERVER_ARGS} --flash-attn" +[ "${LK_CONT_BATCHING:-0}" = "1" ] && SERVER_ARGS="${SERVER_ARGS} --cont-batching" +[ "${LK_MLOCK:-0}" = "1" ] && SERVER_ARGS="${SERVER_ARGS} --mlock" + +# shellcheck disable=SC2086 +"${LLAMA_SERVER_BIN}" ${SERVER_ARGS} & echo "localaik: loading model..." tries=0 From 918630898febbaea5abffd7af05ae8f2118bff72 Mon Sep 17 00:00:00 2001 From: Harshaneel Gokhale Date: Mon, 6 Apr 2026 18:40:31 -0700 Subject: [PATCH 2/2] README --- README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 77d5569..b6a8813 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,11 @@ # localaik -[CI](https://github.com/harshaneel/localaik/actions/workflows/release.yml) -[Docker Hub](https://hub.docker.com/r/gokhalh/localaik) -[License: MIT](LICENSE) -[Go Report Card](https://goreportcard.com/report/github.com/harshaneel/localaik) -[Go Version](https://github.com/harshaneel/localaik/blob/main/go.mod) -[Go Reference](https://pkg.go.dev/github.com/harshaneel/localaik) +[![CI](https://github.com/harshaneel/localaik/actions/workflows/release.yml/badge.svg)](https://github.com/harshaneel/localaik/actions/workflows/release.yml) +[![Docker Hub](https://img.shields.io/docker/v/gokhalh/localaik?sort=semver&label=Docker%20Hub)](https://hub.docker.com/r/gokhalh/localaik) +[![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE) +[![Go Report Card](https://goreportcard.com/badge/github.com/harshaneel/localaik)](https://goreportcard.com/report/github.com/harshaneel/localaik) +[![Go Version](https://img.shields.io/github/go-mod/go-version/harshaneel/localaik)](https://github.com/harshaneel/localaik/blob/main/go.mod) +[![Go Reference](https://pkg.go.dev/badge/github.com/harshaneel/localaik.svg)](https://pkg.go.dev/github.com/harshaneel/localaik) A local compatibility server for the Gemini and OpenAI APIs. Run one container, point your SDK at `http://localhost:8090`, and get both protocol shapes on the same port for tests and development.