harshaneel · harshaneel · Apr 7, 2026 · Apr 7, 2026 · Apr 7, 2026
diff --git a/README.md b/README.md
@@ -58,6 +58,7 @@ More runnable samples (curl, Go, Python, JavaScript, Java) live under **[example
 ### Gemini SDK
 
 **Go:**
+
 ```go
 client, err := genai.NewClient(ctx, &genai.ClientConfig{
     APIKey:      "test",
@@ -66,6 +67,7 @@ client, err := genai.NewClient(ctx, &genai.ClientConfig{
 ```
 
 **Python:**
+
 ```python
 from google import genai
 
@@ -84,13 +86,15 @@ export GOOGLE_GEMINI_BASE_URL=http://localhost:8090
 ### OpenAI SDK
 
 **Python:**
+
 ```python
 from openai import OpenAI
 
 client = OpenAI(api_key="test", base_url="http://localhost:8090/v1")
 ```
 
 **Go:**
+
 ```go
 client := openai.NewClient(
     option.WithAPIKey("test"),
@@ -109,6 +113,51 @@ client := openai.NewClient(
 
 Version-pinned tags follow the pattern `v0.1.1-gemma3-4b`, `v0.1.1-gemma3-12b`.
 
+## Tuning (v0.1.3 onwards)
+
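+Pass environment variables to tune the underlying model server. Only `LK_CTX_SIZE` is always applied (default 8192); any other variable left unset is not passed through, so the server's built-in default is used: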
+
+```bash
+docker run -d -p 8090:8090 \
+  -e LK_THREADS=8 \
+  -e LK_CTX_SIZE=4096 \
+  -e LK_FLASH_ATTN=1 \
+  -e LK_CONT_BATCHING=1 \
+  -e LK_PARALLEL=2 \
+  gokhalh/localaik
+```
+
+Or with Docker Compose:
+
+```yaml
+services:
+  localaik:
+    image: gokhalh/localaik
+    ports:
+      - "8090:8090"
+    environment:
+      LK_THREADS: 8
+      LK_CTX_SIZE: 4096
+      LK_FLASH_ATTN: 1
+      LK_CONT_BATCHING: 1
+      LK_PARALLEL: 2
+```
+
+
+| Variable           | Default         | Description                         |
+| ------------------ | --------------- | ----------------------------------- |
+| `LK_CTX_SIZE`      | 8192            | Context window in tokens            |
+| `LK_THREADS`       | auto            | CPU threads for inference           |
+| `LK_THREADS_BATCH` | same as threads | CPU threads for prompt processing   |
+| `LK_BATCH_SIZE`    | 2048            | Prompt processing batch size        |
+| `LK_UBATCH_SIZE`   | 512             | Micro-batch size                    |
+| `LK_GPU_LAYERS`    | 0               | Layers offloaded to GPU (99 = all)  |
+| `LK_PARALLEL`      | 1               | Max concurrent request slots        |
+| `LK_FLASH_ATTN`    | 0 (off)         | Flash attention (`1` to enable)     |
+| `LK_CONT_BATCHING` | 0 (off)         | Continuous batching (`1` to enable) |
+| `LK_MLOCK`         | 0 (off)         | Lock model in RAM (`1` to enable)   |
+
+
 ## Implemented routes
 
 

diff --git a/entrypoint.sh b/entrypoint.sh
@@ -18,12 +18,25 @@ if [ ! -x "${LLAMA_SERVER_BIN}" ]; then
   fi
 fi
 
-"${LLAMA_SERVER_BIN}" \
-  --model /models/model.gguf \
-  --mmproj /models/mmproj-model-f16.gguf \
-  --port 8080 \
-  --host 127.0.0.1 \
-  --ctx-size 8192 &
+SERVER_ARGS="--model /models/model.gguf"
+SERVER_ARGS="${SERVER_ARGS} --mmproj /models/mmproj-model-f16.gguf"
+SERVER_ARGS="${SERVER_ARGS} --port 8080"
+SERVER_ARGS="${SERVER_ARGS} --host 127.0.0.1"
+SERVER_ARGS="${SERVER_ARGS} --ctx-size ${LK_CTX_SIZE:-8192}"
+
+[ -n "${LK_THREADS:-}" ]       && SERVER_ARGS="${SERVER_ARGS} --threads ${LK_THREADS}"
+[ -n "${LK_THREADS_BATCH:-}" ] && SERVER_ARGS="${SERVER_ARGS} --threads-batch ${LK_THREADS_BATCH}"
+[ -n "${LK_BATCH_SIZE:-}" ]    && SERVER_ARGS="${SERVER_ARGS} --batch-size ${LK_BATCH_SIZE}"
+[ -n "${LK_UBATCH_SIZE:-}" ]   && SERVER_ARGS="${SERVER_ARGS} --ubatch-size ${LK_UBATCH_SIZE}"
+[ -n "${LK_GPU_LAYERS:-}" ]    && SERVER_ARGS="${SERVER_ARGS} --n-gpu-layers ${LK_GPU_LAYERS}"
+[ -n "${LK_PARALLEL:-}" ]      && SERVER_ARGS="${SERVER_ARGS} --parallel ${LK_PARALLEL}"
+
+[ "${LK_FLASH_ATTN:-0}" = "1" ]    && SERVER_ARGS="${SERVER_ARGS} --flash-attn"
+[ "${LK_CONT_BATCHING:-0}" = "1" ] && SERVER_ARGS="${SERVER_ARGS} --cont-batching"
+[ "${LK_MLOCK:-0}" = "1" ]         && SERVER_ARGS="${SERVER_ARGS} --mlock"
+
+# shellcheck disable=SC2086
+"${LLAMA_SERVER_BIN}" ${SERVER_ARGS} &
 
 echo "localaik: loading model..."
 tries=0