From f36972ea5d9d42cbcc216d95c033249acfa6b1a9 Mon Sep 17 00:00:00 2001 From: mikepapadim Date: Fri, 23 May 2025 15:05:06 +0300 Subject: [PATCH] Change default max tokens to 1024 and show performance metrics by default in interactive mode --- src/main/java/com/example/LlamaApp.java | 23 ++++++++----------- .../inference/engine/impl/Options.java | 2 +- 2 files changed, 10 insertions(+), 15 deletions(-) diff --git a/src/main/java/com/example/LlamaApp.java b/src/main/java/com/example/LlamaApp.java index 8328dd52..89d9d138 100644 --- a/src/main/java/com/example/LlamaApp.java +++ b/src/main/java/com/example/LlamaApp.java @@ -10,7 +10,6 @@ import com.example.inference.engine.impl.Options; import com.example.loader.weights.ModelLoader; import com.example.loader.weights.State; -import com.example.tokenizer.impl.Tokenizer; import com.example.tornadovm.FloatArrayUtils; import com.example.tornadovm.TornadoVMMasterPlan; import uk.ac.manchester.tornado.api.types.arrays.FloatArray; @@ -29,7 +28,8 @@ public class LlamaApp { public static final boolean USE_VECTOR_API = Boolean.parseBoolean(System.getProperty("llama.VectorAPI", "true")); // Enable Java Vector API for CPU acceleration public static final boolean USE_AOT = Boolean.parseBoolean(System.getProperty("llama.AOT", "false")); // Use Ahead-of-Time compilation public static final boolean USE_TORNADOVM = Boolean.parseBoolean(System.getProperty("use.tornadovm", "false")); // Use TornadoVM for GPU acceleration - public static final boolean SHOW_PERF_INTERACTIVE = Boolean.parseBoolean(System.getProperty("llama.ShowPerfInteractive", "false")); // Show performance metrics in interactive mode + public static final boolean SHOW_PERF_INTERACTIVE = Boolean.parseBoolean(System.getProperty("llama.ShowPerfInteractive", "true")); // Show performance metrics in interactive mode + /** * Creates and configures a sampler for token generation based on specified parameters. 
* @@ -115,7 +115,6 @@ static Sampler selectSampler(int vocabularySize, float temperature, float topp, return sampler; } - static void runInteractive(Llama model, Sampler sampler, Options options) { State state = null; List conversationTokens = new ArrayList<>(); @@ -162,15 +161,12 @@ static void runInteractive(Llama model, Sampler sampler, Options options) { // Choose between GPU and CPU path based on configuration if (USE_TORNADOVM) { // GPU path using TornadoVM - responseTokens = Llama.generateTokensGPU(model, state, startPosition, - conversationTokens.subList(startPosition, conversationTokens.size()), - stopTokens, options.maxTokens(), sampler, options.echo(), - options.stream() ? tokenConsumer : null, tornadoVMPlan); + responseTokens = Llama.generateTokensGPU(model, state, startPosition, conversationTokens.subList(startPosition, conversationTokens.size()), stopTokens, options.maxTokens(), + sampler, options.echo(), options.stream() ? tokenConsumer : null, tornadoVMPlan); } else { // CPU path - responseTokens = Llama.generateTokens(model, state, startPosition, - conversationTokens.subList(startPosition, conversationTokens.size()), - stopTokens, options.maxTokens(), sampler, options.echo(), tokenConsumer); + responseTokens = Llama.generateTokens(model, state, startPosition, conversationTokens.subList(startPosition, conversationTokens.size()), stopTokens, options.maxTokens(), sampler, + options.echo(), tokenConsumer); } // Include stop token in the prompt history, but not in the response displayed to the user. 
@@ -211,7 +207,7 @@ static void runInteractive(Llama model, Sampler sampler, Options options) { static void runInstructOnce(Llama model, Sampler sampler, Options options) { State state = model.createNewState(); ChatFormat chatFormat = new ChatFormat(model.tokenizer()); - TornadoVMMasterPlan tornadoVMPlan =null; + TornadoVMMasterPlan tornadoVMPlan = null; List promptTokens = new ArrayList<>(); promptTokens.add(chatFormat.beginOfText); @@ -233,10 +229,9 @@ static void runInstructOnce(Llama model, Sampler sampler, Options options) { Set stopTokens = chatFormat.getStopTokens(); if (USE_TORNADOVM) { - tornadoVMPlan = TornadoVMMasterPlan.initializeTornadoVMPlan(state, model); + tornadoVMPlan = TornadoVMMasterPlan.initializeTornadoVMPlan(state, model); // Call generateTokensGPU without the token consumer parameter - responseTokens = Llama.generateTokensGPU(model, state, 0, promptTokens, stopTokens, options.maxTokens(), - sampler, options.echo(), options.stream() ? tokenConsumer : null, tornadoVMPlan); + responseTokens = Llama.generateTokensGPU(model, state, 0, promptTokens, stopTokens, options.maxTokens(), sampler, options.echo(), options.stream() ? 
tokenConsumer : null, tornadoVMPlan); } else { // CPU path still uses the token consumer responseTokens = Llama.generateTokens(model, state, 0, promptTokens, stopTokens, options.maxTokens(), sampler, options.echo(), tokenConsumer); diff --git a/src/main/java/com/example/inference/engine/impl/Options.java b/src/main/java/com/example/inference/engine/impl/Options.java index ed66786c..7ee13194 100644 --- a/src/main/java/com/example/inference/engine/impl/Options.java +++ b/src/main/java/com/example/inference/engine/impl/Options.java @@ -7,7 +7,7 @@ public record Options(Path modelPath, String prompt, String systemPrompt, boolean interactive, float temperature, float topp, long seed, int maxTokens, boolean stream, boolean echo) { - public static final int DEFAULT_MAX_TOKENS = 512; + public static final int DEFAULT_MAX_TOKENS = 1024; public Options { require(modelPath != null, "Missing argument: --model is required");