diff --git a/README.md b/README.md index 211876a..b6a8813 100644 --- a/README.md +++ b/README.md @@ -58,6 +58,7 @@ More runnable samples (curl, Go, Python, JavaScript, Java) live under **[example ### Gemini SDK **Go:** + ```go client, err := genai.NewClient(ctx, &genai.ClientConfig{ APIKey: "test", @@ -66,6 +67,7 @@ client, err := genai.NewClient(ctx, &genai.ClientConfig{ ``` **Python:** + ```python from google import genai @@ -84,6 +86,7 @@ export GOOGLE_GEMINI_BASE_URL=http://localhost:8090 ### OpenAI SDK **Python:** + ```python from openai import OpenAI @@ -91,6 +94,7 @@ client = OpenAI(api_key="test", base_url="http://localhost:8090/v1") ``` **Go:** + ```go client := openai.NewClient( option.WithAPIKey("test"), @@ -109,6 +113,51 @@ client := openai.NewClient( Version-pinned tags follow the pattern `v0.1.1-gemma3-4b`, `v0.1.1-gemma3-12b`. +## Tuning (v0.1.3 onwards) + +Pass environment variables to tune the underlying model server: + +```bash +docker run -d -p 8090:8090 \ + -e LK_THREADS=8 \ + -e LK_CTX_SIZE=4096 \ + -e LK_FLASH_ATTN=1 \ + -e LK_CONT_BATCHING=1 \ + -e LK_PARALLEL=2 \ + gokhalh/localaik +``` + +Or with Docker Compose: + +```yaml +services: + localaik: + image: gokhalh/localaik + ports: + - "8090:8090" + environment: + LK_THREADS: 8 + LK_CTX_SIZE: 4096 + LK_FLASH_ATTN: 1 + LK_CONT_BATCHING: 1 + LK_PARALLEL: 2 +``` + + +| Variable | Default | Description | +| ------------------ | --------------- | ----------------------------------- | +| `LK_CTX_SIZE` | 8192 | Context window in tokens | +| `LK_THREADS` | auto | CPU threads for inference | +| `LK_THREADS_BATCH` | same as threads | CPU threads for prompt processing | +| `LK_BATCH_SIZE` | 2048 | Prompt processing batch size | +| `LK_UBATCH_SIZE` | 512 | Micro-batch size | +| `LK_GPU_LAYERS` | 0 | Layers offloaded to GPU (99 = all) | +| `LK_PARALLEL` | 1 | Max concurrent request slots | +| `LK_FLASH_ATTN` | 0 (off) | Flash attention (`1` to enable) | +| `LK_CONT_BATCHING` | 0 (off) | Continuous batching (`1` to enable) | +| `LK_MLOCK` | 0 (off) | Lock model in RAM (`1` to enable) | + + ## Implemented routes diff --git a/entrypoint.sh b/entrypoint.sh index ec9906a..d63367b 100644 --- a/entrypoint.sh +++ b/entrypoint.sh @@ -18,12 +18,25 @@ if [ ! -x "${LLAMA_SERVER_BIN}" ]; then fi fi -"${LLAMA_SERVER_BIN}" \ - --model /models/model.gguf \ - --mmproj /models/mmproj-model-f16.gguf \ - --port 8080 \ - --host 127.0.0.1 \ - --ctx-size 8192 & +SERVER_ARGS="--model /models/model.gguf" +SERVER_ARGS="${SERVER_ARGS} --mmproj /models/mmproj-model-f16.gguf" +SERVER_ARGS="${SERVER_ARGS} --port 8080" +SERVER_ARGS="${SERVER_ARGS} --host 127.0.0.1" +SERVER_ARGS="${SERVER_ARGS} --ctx-size ${LK_CTX_SIZE:-8192}" + +[ -n "${LK_THREADS:-}" ] && SERVER_ARGS="${SERVER_ARGS} --threads ${LK_THREADS}" +[ -n "${LK_THREADS_BATCH:-}" ] && SERVER_ARGS="${SERVER_ARGS} --threads-batch ${LK_THREADS_BATCH}" +[ -n "${LK_BATCH_SIZE:-}" ] && SERVER_ARGS="${SERVER_ARGS} --batch-size ${LK_BATCH_SIZE}" +[ -n "${LK_UBATCH_SIZE:-}" ] && SERVER_ARGS="${SERVER_ARGS} --ubatch-size ${LK_UBATCH_SIZE}" +[ -n "${LK_GPU_LAYERS:-}" ] && SERVER_ARGS="${SERVER_ARGS} --n-gpu-layers ${LK_GPU_LAYERS}" +[ -n "${LK_PARALLEL:-}" ] && SERVER_ARGS="${SERVER_ARGS} --parallel ${LK_PARALLEL}" + +[ "${LK_FLASH_ATTN:-0}" = "1" ] && SERVER_ARGS="${SERVER_ARGS} --flash-attn" +[ "${LK_CONT_BATCHING:-0}" = "1" ] && SERVER_ARGS="${SERVER_ARGS} --cont-batching" +[ "${LK_MLOCK:-0}" = "1" ] && SERVER_ARGS="${SERVER_ARGS} --mlock" + +# shellcheck disable=SC2086 +"${LLAMA_SERVER_BIN}" ${SERVER_ARGS} & echo "localaik: loading model..." tries=0