From df32038efc59d804aeb822ada791066d9e477677 Mon Sep 17 00:00:00 2001 From: Dex Date: Wed, 13 May 2026 22:44:32 -0400 Subject: [PATCH 1/2] feat: rename CLI binary from neuron to neurons (Neurons-8l2) Eliminates the cognitive friction of mapping project name "Neurons" to binary name "neuron". The binary, CLI app name, all docs, and release artifact name are now consistently "neurons". --- .github/workflows/release-macos.yml | 6 +++--- README.md | 6 +++--- cli/CMakeLists.txt | 2 +- cli/src/cli/core/cli_app.cpp | 2 +- gui/README.md | 6 +++--- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/.github/workflows/release-macos.yml b/.github/workflows/release-macos.yml index 058ea72..6891e77 100644 --- a/.github/workflows/release-macos.yml +++ b/.github/workflows/release-macos.yml @@ -109,10 +109,10 @@ jobs: gh release upload "${{ needs.check-release.outputs.tag }}" \ "Neurons-${{ needs.check-release.outputs.tag }}-arm64.dmg" - - name: Upload neuron binary to release + - name: Upload neurons binary to release env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | - cp build/bin/neuron "neuron-${{ needs.check-release.outputs.tag }}-arm64" + cp build/bin/neurons "neurons-${{ needs.check-release.outputs.tag }}-arm64" gh release upload "${{ needs.check-release.outputs.tag }}" \ - "neuron-${{ needs.check-release.outputs.tag }}-arm64" + "neurons-${{ needs.check-release.outputs.tag }}-arm64" diff --git a/README.md b/README.md index be39f6e..efd3587 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ A from-scratch LLM inference engine and chat application. Built to understand ho Neurons is a full-stack local AI system: - **`compute/`** — C++23 inference library. Implements the transformer forward pass from first principles: quantized matmul, RoPE, RMSNorm, KV cache, sampling. Pluggable backends (`ComputeBackend` interface). -- **`service/`** — gRPC inference server + OpenAI-compatible HTTP endpoint. Embedded in the `neuron` binary (`neuron serve`). 
Runs on any machine on your network. +- **`service/`** — gRPC inference server + OpenAI-compatible HTTP endpoint. Embedded in the `neurons` binary (`neurons server`). Runs on any machine on your network. - **`cli/`** — Terminal interface. Chat, download models, manage nodes, start a server. - **`gui/`** — Flutter macOS app. Chat UI, model browser, multi-node management, live tok/s stats. @@ -204,7 +204,7 @@ The app opens on the Chats screen. Go to **Browse** to search HuggingFace, downl ```bash # Start with an HTTP endpoint on port 8080 -./build/bin/neuron server --http-port 8080 --model mlx-community/Qwen2.5-3B-Instruct-4bit +./build/bin/neurons server --http-port 8080 --model mlx-community/Qwen2.5-3B-Instruct-4bit # Point any OpenAI client at it curl http://localhost:8080/v1/chat/completions \ @@ -234,7 +234,7 @@ neurons config show/set Configuration ## Remote nodes -Neurons supports connecting multiple machines as inference nodes. Each node runs `neurons-service`; the GUI and CLI connect to all of them and route requests. +Neurons supports connecting multiple machines as inference nodes. Each node runs `neurons server`; the GUI and CLI connect to all of them and route requests. 
```bash # On the remote machine diff --git a/cli/CMakeLists.txt b/cli/CMakeLists.txt index d837962..68fe673 100644 --- a/cli/CMakeLists.txt +++ b/cli/CMakeLists.txt @@ -173,6 +173,6 @@ endif() # Set executable name set_target_properties(cli PROPERTIES - OUTPUT_NAME "neuron" + OUTPUT_NAME "neurons" RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin" ) diff --git a/cli/src/cli/core/cli_app.cpp b/cli/src/cli/core/cli_app.cpp index d32d377..25d7fbf 100644 --- a/cli/src/cli/core/cli_app.cpp +++ b/cli/src/cli/core/cli_app.cpp @@ -4,7 +4,7 @@ namespace neurons::cli { CliApp::CliApp() - : app_{"neuron", "Neurons — local AI inference and service management"} { + : app_{"neurons", "Neurons — local AI inference and service management"} { app_.require_subcommand(1); } diff --git a/gui/README.md b/gui/README.md index 8921d2c..01e4af9 100644 --- a/gui/README.md +++ b/gui/README.md @@ -1,6 +1,6 @@ # neurons_gui -Flutter GUI for the Neurons inference engine. Connects to the `neuron` inference server (gRPC) and provides a local-first chat interface. +Flutter GUI for the Neurons inference engine. Connects to the `neurons` inference server (gRPC) and provides a local-first chat interface. ## Running the app (macOS) @@ -17,7 +17,7 @@ This compiles `libneurons_core.dylib` and installs it to `gui/macos/neurons_core **Step 2 — Start the inference server** (from the repo root): ```bash -./build/bin/neuron serve +./build/bin/neurons server ``` **Step 3 — Run the Flutter app** (from the `gui/` directory): @@ -31,7 +31,7 @@ flutter run -d macos If you haven't changed any C++ code, skip Step 1: ```bash -./build/bin/neuron serve & +./build/bin/neurons server & flutter run -d macos ``` From 208bc18b8917a52199a6404fb5cbaf7d43b076bd Mon Sep 17 00:00:00 2001 From: Dex Date: Thu, 14 May 2026 21:12:56 -0400 Subject: [PATCH 2/2] test: add unit tests for Sampler and HFTokenizer (Neurons-m3j) Both inference-path modules were previously only covered by integration tests that require a real model on disk. 
The new tests are self-contained (sampler) or skip automatically when TinyLlama is absent (HFTokenizer). - test_sampler.cpp: greedy argmax, top-k filtering, top-p nucleus, min-p, rep_penalty winner shift (uses top_k=1 for determinism), and dedup - test_hf_tokenizer.cpp: encode/decode roundtrip, add_special_tokens, skip_special_tokens, find_token_id, vocab size sanity, error path --- compute/CMakeLists.txt | 2 + compute/tests/compute/test_hf_tokenizer.cpp | 108 ++++++++++++++ compute/tests/compute/test_sampler.cpp | 153 ++++++++++++++++++++ 3 files changed, 263 insertions(+) create mode 100644 compute/tests/compute/test_hf_tokenizer.cpp create mode 100644 compute/tests/compute/test_sampler.cpp diff --git a/compute/CMakeLists.txt b/compute/CMakeLists.txt index fcf21ba..d64318a 100644 --- a/compute/CMakeLists.txt +++ b/compute/CMakeLists.txt @@ -188,6 +188,8 @@ if(BUILD_TESTING) tests/compute/test_qwen3_transformer_moe_integration.cpp tests/compute/test_chat_template.cpp tests/compute/test_tool_runner.cpp + tests/compute/test_sampler.cpp + tests/compute/test_hf_tokenizer.cpp ) target_link_libraries(compute_tests PRIVATE diff --git a/compute/tests/compute/test_hf_tokenizer.cpp b/compute/tests/compute/test_hf_tokenizer.cpp new file mode 100644 index 0000000..e77ddf3 --- /dev/null +++ b/compute/tests/compute/test_hf_tokenizer.cpp @@ -0,0 +1,108 @@ +#include +#include "compute/model/hf_tokenizer.h" +#include "test_config.h" +#include + +namespace fs = std::filesystem; + +namespace compute { + +// ── Fixture ─────────────────────────────────────────────────────────────────── +// All tests require the TinyLlama model on disk; skip automatically if absent. 
+ +class HFTokenizerTest : public ::testing::Test { +protected: + static void SetUpTestSuite() { + model_dir_ = TINYLLAMA_MODEL_DIR; + if (!fs::exists(model_dir_)) { + skip_reason_ = "Model not found: " + model_dir_.string(); + return; + } + auto result = HFTokenizer::from_model_dir(model_dir_); + if (!result) { + skip_reason_ = "Tokenizer load failed: " + result.error().message; + return; + } + tok_ = std::make_unique(std::move(*result)); + } + + static void TearDownTestSuite() { tok_.reset(); } + + void SetUp() override { + if (!skip_reason_.empty()) GTEST_SKIP() << skip_reason_; + } + + static fs::path model_dir_; + static std::string skip_reason_; + static std::unique_ptr tok_; +}; + +fs::path HFTokenizerTest::model_dir_; +std::string HFTokenizerTest::skip_reason_; +std::unique_ptr HFTokenizerTest::tok_; + +// ── Encode / decode roundtrip ───────────────────────────────────────────────── + +TEST_F(HFTokenizerTest, EncodeProducesNonEmptyIds) { + const auto ids = tok_->encode("Hello, world!", /*add_special_tokens=*/false); + EXPECT_FALSE(ids.empty()); +} + +TEST_F(HFTokenizerTest, DecodeRoundtrip) { + const std::string text = "The capital of France is Paris."; + const auto ids = tok_->encode(text, /*add_special_tokens=*/false); + ASSERT_FALSE(ids.empty()); + const std::string decoded = tok_->decode(ids, /*skip_special_tokens=*/true); + EXPECT_EQ(decoded, text); +} + +TEST_F(HFTokenizerTest, AddSpecialTokensInsertsBoS) { + const auto with = tok_->encode("Hi", /*add_special_tokens=*/true); + const auto without = tok_->encode("Hi", /*add_special_tokens=*/false); + EXPECT_GT(with.size(), without.size()); + // BOS token is prepended + EXPECT_EQ(with.front(), tok_->bos_token_id()); +} + +TEST_F(HFTokenizerTest, SkipSpecialTokensStripsBoS) { + const auto ids = tok_->encode("Hi", /*add_special_tokens=*/true); + ASSERT_FALSE(ids.empty()); + const std::string with_skip = tok_->decode(ids, /*skip_special_tokens=*/true); + const std::string without_skip = tok_->decode(ids, 
/*skip_special_tokens=*/false); + // Skipping removes the BOS marker; not-skipping keeps it + EXPECT_LT(with_skip.size(), without_skip.size()); +} + +// ── find_token_id / get_token_string ───────────────────────────────────────── + +TEST_F(HFTokenizerTest, FindTokenIdForBosToken) { + const int bos = tok_->bos_token_id(); + ASSERT_GE(bos, 0); + const std::string bos_str = tok_->get_token_string(bos); + EXPECT_EQ(tok_->find_token_id(bos_str), bos); +} + +TEST_F(HFTokenizerTest, FindTokenIdUnknownReturnsNegativeOne) { + EXPECT_EQ(tok_->find_token_id("ZZZDEFINITELYNOTINVOCABZZZ"), -1); +} + +// ── Metadata ───────────────────────────────────────────────────────────────── + +TEST_F(HFTokenizerTest, VocabSizeIsReasonable) { + EXPECT_GT(tok_->vocab_size(), 100u); + EXPECT_LT(tok_->vocab_size(), 300000u); +} + +TEST_F(HFTokenizerTest, SpecialTokenIdsAreValid) { + EXPECT_GE(tok_->bos_token_id(), 0); + EXPECT_GE(tok_->eos_token_id(), 0); +} + +// ── Error path ──────────────────────────────────────────────────────────────── + +TEST(HFTokenizerErrorTest, MissingDirReturnsError) { + const auto result = HFTokenizer::from_model_dir("/nonexistent/path/to/model"); + EXPECT_FALSE(result.has_value()); +} + +} // namespace compute diff --git a/compute/tests/compute/test_sampler.cpp b/compute/tests/compute/test_sampler.cpp new file mode 100644 index 0000000..2a55508 --- /dev/null +++ b/compute/tests/compute/test_sampler.cpp @@ -0,0 +1,153 @@ +#include +#include "compute/model/sampler.h" +#include +#include +#include + +namespace compute { + +// ── Greedy (temperature = 0) ────────────────────────────────────────────────── + +TEST(SamplerTest, GreedyPicksArgmax) { + std::vector logits = {0.1f, 0.5f, 9.0f, 0.3f}; + SamplingParams p; + p.temperature = 0.0f; + EXPECT_EQ(Sampler::sample(logits, p, {}), 2); +} + +TEST(SamplerTest, GreedyFirstTokenWinsOnTie) { + std::vector logits = {5.0f, 5.0f, 1.0f}; + SamplingParams p; + p.temperature = 0.0f; + EXPECT_EQ(Sampler::sample(logits, p, {}), 
0);
+}
+
+// ── Top-k ─────────────────────────────────────────────────────────────────────
+
+TEST(SamplerTest, TopKRestrictsToTopKTokens) {
+    // 10 near-uniform tokens so unfiltered draws would leak; only top-2 may appear
+    const int vocab = 10;
+    std::vector<float> logits(vocab, 0.0f);  // the 8 others hold ~65% of the unfiltered mass
+    logits[3] = 1.0f;                        // highest
+    logits[7] = 0.5f;                        // second highest
+
+    SamplingParams p;
+    p.temperature = 1.0f;
+    p.top_k = 2;
+    p.top_p = 1.0f;
+
+    // Run many samples — only ids 3 and 7 should appear
+    std::vector<int> seen;
+    for (int i = 0; i < 200; ++i)
+        seen.push_back(Sampler::sample(logits, p, {}));
+
+    for (int id : seen)
+        EXPECT_TRUE(id == 3 || id == 7) << "Got unexpected token id: " << id;
+}
+
+TEST(SamplerTest, TopKDisabledWhenZero) {
+    // With top_k=0 and temperature=0, still returns argmax
+    std::vector<float> logits = {1.0f, 2.0f, 5.0f};
+    SamplingParams p;
+    p.temperature = 0.0f;
+    p.top_k = 0;
+    EXPECT_EQ(Sampler::sample(logits, p, {}), 2);
+}
+
+// ── Top-p (nucleus) ───────────────────────────────────────────────────────────
+
+TEST(SamplerTest, TopPDisabledAt1) {
+    // top_p=1.0 doesn't filter anything — all tokens are in nucleus
+    std::vector<float> logits = {1.0f, 1.0f, 1.0f, 100.0f};
+    SamplingParams p;
+    p.temperature = 0.0f;
+    p.top_p = 1.0f;
+    EXPECT_EQ(Sampler::sample(logits, p, {}), 3);
+}
+
+TEST(SamplerTest, TopPConcentratesOnDominantToken) {
+    // softmax ≈ {0.71, 0.10, 0.10, 0.10}: token 0 alone already covers top_p=0.5
+    std::vector<float> logits = {2.0f, 0.0f, 0.0f, 0.0f};
+    SamplingParams p;
+    p.temperature = 1.0f;
+    p.top_k = 0;
+    p.top_p = 0.5f;
+
+    for (int i = 0; i < 50; ++i)  // unfiltered, 50 straight zeros has p ≈ 0.71^50 ≈ 4e-8
+        EXPECT_EQ(Sampler::sample(logits, p, {}), 0);
+}
+
+// ── Min-p ─────────────────────────────────────────────────────────────────────
+
+TEST(SamplerTest, MinPFiltersLowProbTokens) {
+    // softmax ≈ {0.71, 0.10, 0.10, 0.10}: tokens 1-3 fall under min_p=0.2 and must go
+    std::vector<float> logits = {2.0f, 0.0f, 0.0f, 0.0f};
+    SamplingParams p;
+    p.temperature = 1.0f;
+    p.top_k = 0;
+    p.top_p = 1.0f;
+    p.min_p = 0.2f;  // 0.10 is below both 0.2 and 0.2 * 0.71, whichever cutoff is used
+
+    for (int i = 0; i < 50; ++i)  // unfiltered, 50 straight zeros has p ≈ 0.71^50 ≈ 4e-8
+        EXPECT_EQ(Sampler::sample(logits, p, {}), 0);
+}
+
+// ── Repetition penalty ────────────────────────────────────────────────────────
+
+TEST(SamplerTest, RepPenaltyShiftsWinnerWhenTokenSeen) {
+    // top_k=1 makes sampling deterministic: only the single highest-scoring token
+    // after all transforms can be selected.
+    // Without penalty: token 0 (logit 100) wins.
+    // With rep_penalty=1e4 on token 0: score drops to 0.01 < token 1's 10 → token 1 wins.
+    std::vector<float> logits = {100.0f, 10.0f, 1.0f, 0.1f};
+    SamplingParams no_pen;
+    no_pen.temperature = 0.0f;
+    no_pen.rep_penalty = 1.0f;
+    EXPECT_EQ(Sampler::sample(logits, no_pen, {}), 0);
+
+    SamplingParams p;
+    p.temperature = 1.0f;
+    p.top_k = 1;
+    p.top_p = 1.0f;
+    p.rep_penalty = 1e4f;
+    EXPECT_EQ(Sampler::sample(logits, p, {0}), 1);
+}
+
+TEST(SamplerTest, RepPenaltyOnNegativeLogitPushesItFurtherDown) {
+    // Tokens with a negative logit in context get multiplied (more negative),
+    // not divided. Use top_k=1 to verify the non-penalised token wins after
+    // the negative token is pushed deep negative.
+    //   logits = {5.0, -0.01, 0.0, 0.0}; penalty on token 1 (negative):
+    //   scores[1] = -0.01 * 1e6 = -10000 — stays the worst
+    //   top_k=1 selects token 0 (score 5.0)
+    std::vector<float> logits = {5.0f, -0.01f, 0.0f, 0.0f};
+    SamplingParams p;
+    p.temperature = 1.0f;
+    p.top_k = 1;
+    p.top_p = 1.0f;
+    p.rep_penalty = 1e6f;
+    EXPECT_EQ(Sampler::sample(logits, p, {1}), 0);
+}
+
+TEST(SamplerTest, RepPenaltyIgnoresTokensNotInContext) {
+    std::vector<float> logits = {5.0f, 1.0f};
+    SamplingParams p;
+    p.temperature = 0.0f;
+    p.rep_penalty = 100.0f;
+    // Context doesn't include token 0, so penalty doesn't apply
+    EXPECT_EQ(Sampler::sample(logits, p, {1}), 0);
+}
+
+TEST(SamplerTest, RepPenaltyDeduplicatesContext) {
+    // Penalised once, token 0 stays ahead (2.0 / 1.5 ≈ 1.33 > 1.0); penalised once
+    // per occurrence it would lose (2.0 / 1.5^3 ≈ 0.59 < 1.0) and token 1 would win.
+    std::vector<float> logits = {2.0f, 1.0f};
+    SamplingParams p;
+    p.temperature = 1.0f;
+    p.top_k = 1;  // deterministic: only the best post-penalty token can be picked
+    p.top_p = 1.0f;
+    p.rep_penalty = 1.5f;
+    EXPECT_EQ(Sampler::sample(logits, p, {0, 0, 0}), 0);
+}
+
+} // namespace compute