diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index f3fecdd..d794c65 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -25,6 +25,7 @@ jobs:
       - name: Run benchmark
         run: ./bench --benchmark_out=benchmark.json --benchmark_out_format=json
       - name: Store benchmark result
+        if: ${{ github.event_name == 'push' }}
         uses: benchmark-action/github-action-benchmark@v1
         with:
           tool: 'googlecpp'
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 6d7381b..20ad38a 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -2,25 +2,33 @@ name: C++ CI
 
 on:
   push:
+    branches: [main]
   pull_request:
 
 jobs:
   build:
     runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        arrow: [0, 1]
     steps:
       - uses: actions/checkout@v3
       - name: Update submodules
        run: git submodule update --init --recursive
       - name: Install dependencies
        run: |
+          sudo apt update
+          sudo apt install -y ca-certificates lsb-release wget
+          wget https://packages.apache.org/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb
+          sudo apt install -y ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb
           sudo apt-get update
-          sudo apt-get install -y g++ make clang-tidy clang-format python3 python3-pip
+          sudo apt-get install -y g++ make clang-tidy clang-format python3 python3-pip libarrow-dev libparquet-dev
           pip3 install pandas pyarrow pytest
       - name: Build
-        run: make
+        run: make USE_ARROW=${{ matrix.arrow }}
      - name: Run clang-format
        run: make format && git diff --exit-code
      - name: Run clang-tidy
        run: make tidy
      - name: Run test
-       run: make test
+       run: make USE_ARROW=${{ matrix.arrow }} test
diff --git a/Makefile b/Makefile
index e75cb17..c4064d9 100644
--- a/Makefile
+++ b/Makefile
@@ -3,15 +3,21 @@
 TARGET = main
 BENCH = bench
 OBJ = main_bench.o
 
+USE_ARROW ?= 0
+ifeq ($(USE_ARROW),1)
+CXXFLAGS += $(shell pkg-config --cflags arrow parquet)
+LDFLAGS += $(shell pkg-config --libs arrow parquet)
+DEFINES += -DUSE_ARROW
+endif
 $(TARGET): main.cpp
-	$(CXX) -O3 -std=c++20 -fopenmp -march=native -o $(TARGET) $<
+	$(CXX) -O3 -std=c++20 -fopenmp -march=native $(DEFINES) $(CXXFLAGS) -o $(TARGET) $< $(LDFLAGS)
 
 $(OBJ): main.cpp
-	$(CXX) -O3 -std=c++20 -fopenmp -march=native -DBENCH_LIB -c main.cpp -o $(OBJ)
+	$(CXX) -O3 -std=c++20 -fopenmp -march=native -DBENCH_LIB $(DEFINES) $(CXXFLAGS) -c main.cpp -o $(OBJ)
 
 $(BENCH): $(OBJ) bench.cpp
-	$(CXX) -O3 -std=c++20 -fopenmp -march=native bench.cpp $(OBJ) -lbenchmark -lpthread -o $(BENCH)
+	$(CXX) -O3 -std=c++20 -fopenmp -march=native bench.cpp $(OBJ) -lbenchmark -lpthread $(LDFLAGS) -o $(BENCH)
 
 format:
 	clang-format -i main.cpp
diff --git a/README.md b/README.md
index 8506254..db47317 100644
--- a/README.md
+++ b/README.md
@@ -10,15 +10,20 @@ Use the provided `Makefile`:
 make
 ```
 
-This produces an executable named `main`.
+This produces an executable named `main`. To enable Apache Arrow support pass
+`USE_ARROW=1` (requires Arrow C++ libraries with `pkg-config` files).
 
 ## Running
 
-Run the simulator by providing a connectome CSV file and lists of active and silent neurons:
+Run the simulator by providing a connectome CSV file and lists of active and silent neurons. When built with Arrow support you may provide a Parquet file instead:
 
 ```bash
 ./main --csv connectome.csv --active active.txt --silent silent.txt --t 1000
 ```
+To load a Parquet file when Arrow support is enabled use:
+```bash
+./main --parquet Connectivity_783.parquet --active active.txt --silent silent.txt --t 1000
+```
 
 The output spike times are written to `spikes.bin`, and basic statistics are
 printed to stderr.
diff --git a/bench.cpp b/bench.cpp
index 73259e9..bd960a4 100644
--- a/bench.cpp
+++ b/bench.cpp
@@ -3,13 +3,14 @@
 #include <string>
 
 size_t run_simulation(const std::string &csv,
+                      const std::string &parquet,
                       const std::string &act,
                       const std::string &sil,
                       size_t T);
 
 static void BM_Run(benchmark::State &state) {
   for (auto _ : state) {
-    size_t spikes = run_simulation("test/test_connectome.csv",
+    size_t spikes = run_simulation("test/test_connectome.csv", "",
                                    "test/active.txt", "test/silent.txt", 5);
     benchmark::DoNotOptimize(spikes);
   }
diff --git a/docs/AGENT_PROMPTS.md b/docs/AGENT_PROMPTS.md
index ea6a045..85375ec 100644
--- a/docs/AGENT_PROMPTS.md
+++ b/docs/AGENT_PROMPTS.md
@@ -8,7 +8,7 @@ The project simulates spikes in a Drosophila connectome using a leaky integrate-
 
 - Keep the simulator self contained in `main.cpp` unless new modules are justified.
 - Unit tests live under `test/` and should be expanded when new features are added.
-- Large datasets are provided via the optional `Drosophila_brain_model` submodule and should not be committed directly to the repository.
+- Large datasets are provided via the optional `Drosophila_brain_model` submodule and should not be committed directly to the repository. Support for loading Parquet files via Apache Arrow is optional and can be enabled by building with `USE_ARROW=1`.
 
 ## Pull Request Guidance
 
diff --git a/main.cpp b/main.cpp
index 35f2150..b862152 100644
--- a/main.cpp
+++ b/main.cpp
@@ -14,6 +14,11 @@
 #include <string>
 #include <tuple>
 #include <vector>
+#ifdef USE_ARROW
+#include <arrow/api.h>
+#include <arrow/io/api.h>
+#include <parquet/arrow/reader.h>
+#endif
 
 using namespace std;
 
@@ -76,6 +81,73 @@ static CSR load_csv_to_csr(const string &csv, size_t N,
   partial_sum(g.row.begin(), g.row.end(), g.row.begin());
   return g;
 }
+#ifdef USE_ARROW
+static size_t count_neurons_parquet(const string &pq) {
+  std::shared_ptr<arrow::io::ReadableFile> infile;
+  PARQUET_ASSIGN_OR_THROW(infile, arrow::io::ReadableFile::Open(pq));
+  std::unique_ptr<parquet::arrow::FileReader> reader;
+  PARQUET_THROW_NOT_OK(
+      parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader));
+  std::shared_ptr<arrow::Table> table;
+  PARQUET_THROW_NOT_OK(
+      reader->ReadTable({"Presynaptic_Index", "Postsynaptic_Index"}, &table));
+  auto pre_chunks = table->GetColumnByName("Presynaptic_Index")->chunks();
+  auto post_chunks = table->GetColumnByName("Postsynaptic_Index")->chunks();
+  size_t N = 0;
+  for (size_t c = 0; c < pre_chunks.size(); ++c) {
+    auto pre = std::static_pointer_cast<arrow::UInt32Array>(pre_chunks[c]);
+    auto post = std::static_pointer_cast<arrow::UInt32Array>(post_chunks[c]);
+    for (int64_t i = 0; i < pre->length(); ++i) {
+      uint32_t a = pre->Value(i);
+      uint32_t b = post->Value(i);
+      N = max(N, size_t(max(a, b) + 1));
+    }
+  }
+  return N;
+}
+static CSR load_parquet_to_csr(const string &pq, size_t N,
+                               const vector<uint8_t> &silent) {
+  std::shared_ptr<arrow::io::ReadableFile> infile;
+  PARQUET_ASSIGN_OR_THROW(infile, arrow::io::ReadableFile::Open(pq));
+  std::unique_ptr<parquet::arrow::FileReader> reader;
+  PARQUET_THROW_NOT_OK(
+      parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader));
+  std::shared_ptr<arrow::Table> table;
+  PARQUET_THROW_NOT_OK(reader->ReadTable(&table));
+  auto pre_chunks = table->GetColumnByName("Presynaptic_Index")->chunks();
+  auto post_chunks = table->GetColumnByName("Postsynaptic_Index")->chunks();
+  auto w_chunks = table->GetColumnByName("Connectivity")->chunks();
+  vector<tuple<uint32_t, uint32_t, float>> edges;
+  size_t total = table->num_rows();
+  edges.reserve(total);
+  for (size_t c = 0; c < pre_chunks.size(); ++c) {
+    auto pre = std::static_pointer_cast<arrow::UInt32Array>(pre_chunks[c]);
+    auto post = std::static_pointer_cast<arrow::UInt32Array>(post_chunks[c]);
+    auto w = std::static_pointer_cast<arrow::FloatArray>(w_chunks[c]);
+    for (int64_t i = 0; i < pre->length(); ++i) {
+      uint32_t a = pre->Value(i);
+      uint32_t b = post->Value(i);
+      if (silent[a] || silent[b])
+        continue;
+      float wt = w->Value(i);
+      edges.emplace_back(a, b, wt);
+    }
+  }
+  sort(edges.begin(), edges.end(),
+       [](auto &x, auto &y) { return get<0>(x) < get<0>(y); });
+  CSR g;
+  g.row.assign(N + 1, 0);
+  g.col.reserve(edges.size());
+  g.w.reserve(edges.size());
+  for (auto [pre, post, w] : edges) {
+    ++g.row[pre + 1];
+    g.col.push_back(post);
+    g.w.push_back(w);
+  }
+  partial_sum(g.row.begin(), g.row.end(), g.row.begin());
+  return g;
+}
+#endif
 /* ---------- simulation ---------- */
 struct Simulator {
   size_t N;
@@ -169,10 +241,26 @@ struct Simulator {
   }
 };
 /* ---------- main ---------- */
-size_t run_simulation(const string &csv, const string &act, const string &sil,
-                      size_t T) {
-  /* step 1: count neurons (max id+1) */
+size_t run_simulation(const string &csv, const string &pq, const string &act,
+                      const string &sil, size_t T) {
   size_t N = 0;
+  CSR G;
+#ifdef USE_ARROW
+  if (!pq.empty()) {
+    N = count_neurons_parquet(pq);
+    vector<uint8_t> silent_vec_tmp(N, 0);
+    for (auto id : load_list(sil))
+      if (id < N)
+        silent_vec_tmp[id] = 1;
+    G = load_parquet_to_csr(pq, N, silent_vec_tmp);
+    Params P;
+    Simulator S(N, G, P, std::move(silent_vec_tmp));
+    S.run(T, load_list(act));
+    S.save_bin("spikes.bin");
+    return S.spikes.size();
+  }
+#endif
+  /* default CSV loader */
   {
     ifstream f(csv);
     string l;
@@ -188,7 +276,7 @@ size_t run_simulation(const string &csv, const string &act, const string &sil,
     if (id < N)
       silent_vec[id] = 1;
 
-  CSR G = load_csv_to_csr(csv, N, silent_vec);
+  G = load_csv_to_csr(csv, N, silent_vec);
   Params P;
   Simulator S(N, G, P, silent_vec);
   S.run(T, load_list(act));
@@ -198,12 +286,14 @@ size_t run_simulation(const string &csv, const string &act, const string &sil,
 
 #ifndef BENCH_LIB
 int main(int argc, char **argv) {
-  string csv, act, sil;
+  string csv, pq, act, sil;
   size_t T = 1000;
   for (int i = 1; i < argc; ++i) {
     string a = argv[i];
    if (a == "--csv")
      csv = argv[++i];
+    else if (a == "--parquet")
+      pq = argv[++i];
    else if (a == "--active")
      act = argv[++i];
    else if (a == "--silent")
@@ -211,11 +301,11 @@ int main(int argc, char **argv) {
    else if (a == "--t")
      T = stoul(argv[++i]);
  }
-  if (csv.empty()) {
-    cerr << "--csv required\n";
+  if (csv.empty() && pq.empty()) {
+    cerr << "--csv or --parquet required\n";
     return 1;
   }
-  size_t spikes = run_simulation(csv, act, sil, T);
+  size_t spikes = run_simulation(csv, pq, act, sil, T);
   cerr << "Spikes: " << spikes << "\n";
 }
 #endif