dexwritescode · dexwritescode · May 15, 2026 · May 14, 2026 · May 15, 2026
diff --git a/.github/workflows/release-macos.yml b/.github/workflows/release-macos.yml
@@ -109,10 +109,10 @@ jobs:
           gh release upload "${{ needs.check-release.outputs.tag }}" \
             "Neurons-${{ needs.check-release.outputs.tag }}-arm64.dmg"
 
-      - name: Upload neuron binary to release
+      - name: Upload neurons binary to release
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         run: |
-          cp build/bin/neuron "neuron-${{ needs.check-release.outputs.tag }}-arm64"
+          cp build/bin/neurons "neurons-${{ needs.check-release.outputs.tag }}-arm64"
           gh release upload "${{ needs.check-release.outputs.tag }}" \
-            "neuron-${{ needs.check-release.outputs.tag }}-arm64"
+            "neurons-${{ needs.check-release.outputs.tag }}-arm64"
diff --git a/README.md b/README.md
@@ -14,7 +14,7 @@ A from-scratch LLM inference engine and chat application. Built to understand ho
 Neurons is a full-stack local AI system:
 
 - **`compute/`** — C++23 inference library. Implements the transformer forward pass from first principles: quantized matmul, RoPE, RMSNorm, KV cache, sampling. Pluggable backends (`ComputeBackend` interface).
-- **`service/`** — gRPC inference server + OpenAI-compatible HTTP endpoint. Embedded in the `neuron` binary (`neuron serve`). Runs on any machine on your network.
+- **`service/`** — gRPC inference server + OpenAI-compatible HTTP endpoint. Embedded in the `neurons` binary (`neurons server`). Runs on any machine on your network.
 - **`cli/`** — Terminal interface. Chat, download models, manage nodes, start a server.
 - **`gui/`** — Flutter macOS app. Chat UI, model browser, multi-node management, live tok/s stats.
 
@@ -204,7 +204,7 @@ The app opens on the Chats screen. Go to **Browse** to search HuggingFace, downl
 
 ```bash
 # Start with an HTTP endpoint on port 8080
-./build/bin/neuron server --http-port 8080 --model mlx-community/Qwen2.5-3B-Instruct-4bit
+./build/bin/neurons server --http-port 8080 --model mlx-community/Qwen2.5-3B-Instruct-4bit
 
 # Point any OpenAI client at it
 curl http://localhost:8080/v1/chat/completions \
@@ -234,7 +234,7 @@ neurons config  show/set         Configuration
 
 ## Remote nodes
 
-Neurons supports connecting multiple machines as inference nodes. Each node runs `neurons-service`; the GUI and CLI connect to all of them and route requests.
+Neurons supports connecting multiple machines as inference nodes. Each node runs `neurons server`; the GUI and CLI connect to all of them and route requests.
 
 ```bash
 # On the remote machine

diff --git a/cli/CMakeLists.txt b/cli/CMakeLists.txt
@@ -173,6 +173,6 @@ endif()
 
 # Set executable name
 set_target_properties(cli PROPERTIES
-    OUTPUT_NAME "neuron"
+    OUTPUT_NAME "neurons"
     RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin"
 )
diff --git a/cli/src/cli/core/cli_app.cpp b/cli/src/cli/core/cli_app.cpp
@@ -4,7 +4,7 @@
 namespace neurons::cli {
 
 CliApp::CliApp()
-    : app_{"neuron", "Neurons — local AI inference and service management"} {
+    : app_{"neurons", "Neurons — local AI inference and service management"} {
     app_.require_subcommand(1);
 }
 

diff --git a/compute/CMakeLists.txt b/compute/CMakeLists.txt
@@ -188,6 +188,8 @@ if(BUILD_TESTING)
         tests/compute/test_qwen3_transformer_moe_integration.cpp
         tests/compute/test_chat_template.cpp
         tests/compute/test_tool_runner.cpp
+        tests/compute/test_sampler.cpp
+        tests/compute/test_hf_tokenizer.cpp
     )
 
     target_link_libraries(compute_tests PRIVATE

diff --git a/compute/tests/compute/test_hf_tokenizer.cpp b/compute/tests/compute/test_hf_tokenizer.cpp
@@ -0,0 +1,111 @@
+#include <gtest/gtest.h>
+#include "compute/model/hf_tokenizer.h"
+#include "test_config.h"
+#include <filesystem>
+#include <memory>
+#include <string>
+#include <utility>
+
+namespace fs = std::filesystem;
+
+namespace compute {
+
+// ── Fixture ───────────────────────────────────────────────────────────────────
+// All tests require the TinyLlama model on disk; skip automatically if absent.
+
+class HFTokenizerTest : public ::testing::Test {
+protected:
+    // Loads the tokenizer once for the whole suite. On failure the reason is
+    // recorded so that SetUp() can skip each test instead of using a null tok_.
+    static void SetUpTestSuite() {
+        model_dir_ = TINYLLAMA_MODEL_DIR;
+        if (fs::exists(model_dir_)) {
+            auto loaded = HFTokenizer::from_model_dir(model_dir_);
+            if (loaded) {
+                tok_ = std::make_unique<HFTokenizer>(std::move(*loaded));
+            } else {
+                skip_reason_ = "Tokenizer load failed: " + loaded.error().message;
+            }
+        } else {
+            skip_reason_ = "Model not found: " + model_dir_.string();
+        }
+    }
+
+    static void TearDownTestSuite() { tok_.reset(); }
+
+    void SetUp() override { if (!skip_reason_.empty()) GTEST_SKIP() << skip_reason_; }
+
+    static fs::path                     model_dir_;    // set in SetUpTestSuite()
+    static std::string                  skip_reason_;  // non-empty => skip every test
+    static std::unique_ptr<HFTokenizer> tok_;          // stays null when loading failed
+};
+
+fs::path                     HFTokenizerTest::model_dir_;
+std::string                  HFTokenizerTest::skip_reason_;
+std::unique_ptr<HFTokenizer> HFTokenizerTest::tok_;
+
+// ── Encode / decode roundtrip ─────────────────────────────────────────────────
+
+TEST_F(HFTokenizerTest, EncodeProducesNonEmptyIds) {
+    // Plain text encoded without special tokens must still yield at least one id.
+    EXPECT_FALSE(tok_->encode("Hello, world!", /*add_special_tokens=*/false).empty());
+}
+
+TEST_F(HFTokenizerTest, DecodeRoundtrip) {
+    // Encoding then decoding plain text (no special tokens) must reproduce it.
+    const std::string original = "The capital of France is Paris.";
+    const auto token_ids = tok_->encode(original, /*add_special_tokens=*/false);
+    ASSERT_FALSE(token_ids.empty());
+    EXPECT_EQ(tok_->decode(token_ids, /*skip_special_tokens=*/true), original);
+}
+
+TEST_F(HFTokenizerTest, AddSpecialTokensInsertsBoS) {
+    const auto with    = tok_->encode("Hi", /*add_special_tokens=*/true);
+    const auto without = tok_->encode("Hi", /*add_special_tokens=*/false);
+    // ASSERT, not EXPECT: front() below is undefined on an empty container.
+    ASSERT_GT(with.size(), without.size());
+    EXPECT_EQ(with.front(), tok_->bos_token_id());  // BOS token is prepended
+}
+
+TEST_F(HFTokenizerTest, SkipSpecialTokensStripsBoS) {
+    const auto token_ids = tok_->encode("Hi", /*add_special_tokens=*/true);
+    ASSERT_FALSE(token_ids.empty());
+    // Dropping special tokens must yield strictly shorter text than keeping them.
+    const std::string stripped = tok_->decode(token_ids, /*skip_special_tokens=*/true);
+    const std::string verbatim = tok_->decode(token_ids, /*skip_special_tokens=*/false);
+    EXPECT_LT(stripped.size(), verbatim.size());
+}
+
+// ── find_token_id / get_token_string ─────────────────────────────────────────
+
+TEST_F(HFTokenizerTest, FindTokenIdForBosToken) {
+    const int bos_id = tok_->bos_token_id();
+    ASSERT_GE(bos_id, 0);
+    const std::string bos_text = tok_->get_token_string(bos_id);  // id -> text
+    EXPECT_EQ(tok_->find_token_id(bos_text), bos_id);             // text -> id
+}
+
+TEST_F(HFTokenizerTest, FindTokenIdUnknownReturnsNegativeOne) {
+    EXPECT_EQ(tok_->find_token_id("ZZZDEFINITELYNOTINVOCABZZZ"), -1);  // -1 is the expected "not found" result
+}
+
+// ── Metadata ─────────────────────────────────────────────────────────────────
+
+TEST_F(HFTokenizerTest, VocabSizeIsReasonable) {
+    EXPECT_GT(tok_->vocab_size(), 100u);  // loose lower bound, not the exact size
+    EXPECT_LT(tok_->vocab_size(), 300000u);  // loose upper bound
+}
+
+TEST_F(HFTokenizerTest, SpecialTokenIdsAreValid) {
+    EXPECT_GE(tok_->bos_token_id(), 0);  // a negative id is treated as invalid here
+    EXPECT_GE(tok_->eos_token_id(), 0);
+}
+
+// ── Error path ────────────────────────────────────────────────────────────────
+
+TEST(HFTokenizerErrorTest, MissingDirReturnsError) {
+    // Plain TEST (no fixture): this case does not depend on the model being on disk.
+    EXPECT_FALSE(HFTokenizer::from_model_dir("/nonexistent/path/to/model").has_value());
+}
+
+} // namespace compute
diff --git a/compute/tests/compute/test_sampler.cpp b/compute/tests/compute/test_sampler.cpp
@@ -0,0 +1,153 @@
+#include <gtest/gtest.h>
+#include "compute/model/sampler.h"
+#include <algorithm>
+#include <numeric>
+#include <vector>
+
+namespace compute {
+
+// ── Greedy (temperature = 0) ──────────────────────────────────────────────────
+
+TEST(SamplerTest, GreedyPicksArgmax) {
+    std::vector<float> logits{0.1f, 0.5f, 9.0f, 0.3f};  // largest value at index 2
+    SamplingParams greedy;
+    greedy.temperature = 0.0f;
+    EXPECT_EQ(Sampler::sample(logits, greedy, {}), 2);
+}
+
+TEST(SamplerTest, GreedyFirstTokenWinsOnTie) {
+    std::vector<float> tied{5.0f, 5.0f, 1.0f};  // indices 0 and 1 share the maximum
+    SamplingParams greedy;
+    greedy.temperature = 0.0f;
+    EXPECT_EQ(Sampler::sample(tied, greedy, {}), 0);
+}
+
+// ── Top-k ─────────────────────────────────────────────────────────────────────
+
+TEST(SamplerTest, TopKRestrictsToTopKTokens) {
+    // Flat 10-token distribution with two spikes (ids 3 and 7); with top_k=2
+    // every draw must come from those two ids.
+    const int vocab_size = 10;
+    std::vector<float> logits(vocab_size, 0.0f);
+    logits[3] = 10.0f;  // highest
+    logits[7] = 9.0f;   // second highest
+
+    SamplingParams params;
+    params.temperature = 1.0f;
+    params.top_k       = 2;
+    params.top_p       = 1.0f;
+
+    // Each of the 200 draws is checked as soon as it is produced, so no
+    // intermediate buffer of sampled ids is needed.
+    for (int draw = 0; draw < 200; ++draw) {
+        const int id = Sampler::sample(logits, params, {});
+        EXPECT_TRUE(id == 3 || id == 7) << "Got unexpected token id: " << id;
+    }
+}
+
+TEST(SamplerTest, TopKDisabledWhenZero) {
+    // top_k=0 together with temperature=0 must still yield the argmax (index 2).
+    std::vector<float> logits{1.0f, 2.0f, 5.0f};
+    SamplingParams params;
+    params.top_k       = 0;
+    params.temperature = 0.0f;
+    EXPECT_EQ(Sampler::sample(logits, params, {}), 2);
+}
+
+// ── Top-p (nucleus) ───────────────────────────────────────────────────────────
+
+TEST(SamplerTest, TopPDisabledAt1) {
+    // With top_p=1.0 the nucleus is the whole vocabulary, so nothing is filtered.
+    std::vector<float> logits{1.0f, 1.0f, 1.0f, 100.0f};
+    SamplingParams params;
+    params.top_p       = 1.0f;
+    params.temperature = 0.0f;
+    EXPECT_EQ(Sampler::sample(logits, params, {}), 3);
+}
+
+TEST(SamplerTest, TopPConcentratesOnDominantToken) {
+    // Token 0's logit is overwhelmingly high, so with top_p=0.9 it must always win.
+    std::vector<float> logits{100.0f, 0.0f, 0.0f, 0.0f};
+    SamplingParams params;
+    params.temperature = 1.0f;
+    params.top_k       = 0;
+    params.top_p       = 0.9f;
+    for (int draw = 0; draw < 50; ++draw) {
+        EXPECT_EQ(Sampler::sample(logits, params, {}), 0);
+    }
+}
+
+// ── Min-p ─────────────────────────────────────────────────────────────────────
+
+TEST(SamplerTest, MinPFiltersLowProbTokens) {
+    // Token 0 dominates; min_p=0.1 should clip the rest so only token 0 is drawn.
+    std::vector<float> logits{100.0f, 0.0f, 0.0f, 0.0f};
+    SamplingParams params;
+    params.temperature = 1.0f;
+    params.top_k       = 0;
+    params.top_p       = 1.0f;
+    params.min_p       = 0.1f;
+    for (int draw = 0; draw < 50; ++draw) {
+        EXPECT_EQ(Sampler::sample(logits, params, {}), 0);
+    }
+}
+
+// ── Repetition penalty ────────────────────────────────────────────────────────
+
+TEST(SamplerTest, RepPenaltyShiftsWinnerWhenTokenSeen) {
+    // Baseline (greedy, penalty 1.0): token 0 with logit 100 is the winner.
+    // Penalised run: top_k=1 leaves only the best-scoring token after all
+    // transforms, so the draw is deterministic. rep_penalty=1e4 on token 0
+    // drops its score to 0.01, below token 1's 10, so token 1 must win.
+    std::vector<float> logits{100.0f, 10.0f, 1.0f, 0.1f};
+    SamplingParams baseline;
+    baseline.rep_penalty = 1.0f;
+    baseline.temperature = 0.0f;
+    EXPECT_EQ(Sampler::sample(logits, baseline, {}), 0);
+
+    SamplingParams penalised;
+    penalised.rep_penalty = 1e4f;
+    penalised.temperature = 1.0f;
+    penalised.top_k       = 1;
+    penalised.top_p       = 1.0f;
+    EXPECT_EQ(Sampler::sample(logits, penalised, {0}), 1);
+}
+
+TEST(SamplerTest, RepPenaltyOnNegativeLogitPushesItFurtherDown) {
+    // A negative logit in context is multiplied by the penalty (more negative),
+    // not divided. The logits are chosen so the two behaviours disagree:
+    //   no penalty:          token 1 (-1.0) beats token 0 (-2.0)
+    //   multiply (expected): -1.0 * 3 = -3.0  < -2.0 -> token 0 wins
+    //   divide (wrong):      -1.0 / 3 ≈ -0.33 > -2.0 -> token 1 wins
+    // top_k=1 keeps only the best post-penalty token, making the draw deterministic.
+    std::vector<float> logits = {-2.0f, -1.0f};
+    SamplingParams p;
+    p.temperature = 1.0f;
+    p.top_k       = 1;
+    p.top_p       = 1.0f;
+    p.rep_penalty = 3.0f;
+    EXPECT_EQ(Sampler::sample(logits, p, {1}), 0);
+}
+
+TEST(SamplerTest, RepPenaltyIgnoresTokensNotInContext) {
+    // Only token 1 is in the context; token 0 is not, so it must stay the winner.
+    std::vector<float> logits{5.0f, 1.0f};
+    SamplingParams params;
+    params.rep_penalty = 100.0f;
+    params.temperature = 0.0f;
+    EXPECT_EQ(Sampler::sample(logits, params, {1}), 0);
+}
+
+TEST(SamplerTest, RepPenaltyDeduplicatesContext) {
+    // One penalty: 4/3 ≈ 1.33 > 1 -> token 0. Stacked 3×: 4/27 ≈ 0.15 < 1 -> token 1.
+    std::vector<float> logits = {4.0f, 1.0f};
+    SamplingParams p;
+    p.temperature = 1.0f;  // same non-greedy setup as the other penalty tests
+    p.top_k       = 1;     // deterministic: only the top-scoring token survives
+    p.top_p       = 1.0f;
+    p.rep_penalty = 3.0f;
+    EXPECT_EQ(Sampler::sample(logits, p, {0}), 0);
+    EXPECT_EQ(Sampler::sample(logits, p, {0, 0, 0}), 0);  // fails if penalty stacks
+}
+
+} // namespace compute
diff --git a/gui/README.md b/gui/README.md
@@ -1,6 +1,6 @@
 # neurons_gui
 
-Flutter GUI for the Neurons inference engine. Connects to the `neuron` inference server (gRPC) and provides a local-first chat interface.
+Flutter GUI for the Neurons inference engine. Connects to the `neurons` inference server (gRPC) and provides a local-first chat interface.
 
 ## Running the app (macOS)
 
@@ -17,7 +17,7 @@ This compiles `libneurons_core.dylib` and installs it to `gui/macos/neurons_core
 **Step 2 — Start the inference server** (from the repo root):
 
 ```bash
-./build/bin/neuron serve
+./build/bin/neurons server
 ```
 
 **Step 3 — Run the Flutter app** (from the `gui/` directory):
@@ -31,7 +31,7 @@ flutter run -d macos
 If you haven't changed any C++ code, skip Step 1:
 
 ```bash
-./build/bin/neuron serve &
+./build/bin/neurons server &
 flutter run -d macos
 ```