Merged
31 commits
f4b2ad8
Add support for a different API returning numpy objects
vladkvit Feb 5, 2026
de6b9af
Move the wrapper from Python into Rust
vladkvit Feb 5, 2026
9ad6e92
Fix casts, remove wrapper from bench
vladkvit Feb 5, 2026
2b662e7
Fix typing in test.py
vladkvit Feb 5, 2026
e87c904
Remove unsafe's
vladkvit Feb 5, 2026
077a3ed
Fix up dtypes, remove old wrapper code
vladkvit Feb 5, 2026
c6ed5a0
Delete unnecessary file
vladkvit Feb 5, 2026
476732b
Tweak imports
vladkvit Feb 5, 2026
cfb1e22
Split up lib.rs into smaller files
vladkvit Feb 5, 2026
08cf9c2
reduce copying
vladkvit Feb 5, 2026
7f0d8a9
profiling, fancier merge
vladkvit Feb 5, 2026
fecc90b
Tweak benchmark to better match Python code
vladkvit Feb 6, 2026
3617dfd
Performance optimization - reduce chunk size for flat API, option to …
vladkvit Feb 6, 2026
945a104
Restore preallocation
vladkvit Feb 6, 2026
ad5721a
Fix performance regression by not initializing unused fields
vladkvit Feb 6, 2026
ad0da35
Fix nit about a file being in multiple targets
vladkvit Feb 6, 2026
31701d2
Add perf comment, adjust bench timing
vladkvit Feb 6, 2026
2e41ef5
Optimize board serialization
vladkvit Feb 6, 2026
16b4b4f
Refactor optimized board serialization
vladkvit Feb 6, 2026
b7fc06e
Linter rearranged imports
vladkvit Feb 6, 2026
62fb05b
Parallel vec merge
vladkvit Feb 6, 2026
293c08e
Revert "Parallel vec merge"
vladkvit Feb 6, 2026
5c7898a
Tweak benchmark comments
vladkvit Feb 6, 2026
b2f9380
Rework to not require a vector merge
vladkvit Feb 6, 2026
e082bca
Tweak bench to be more apples-apples
vladkvit Feb 6, 2026
4f68999
swapping the main API to use the "flat" Structure of Arrays implement…
vladkvit Feb 8, 2026
e2a4ab7
More tests
vladkvit Feb 8, 2026
e645ffa
Update readme, simplify benchmark
vladkvit Feb 8, 2026
c2b3ddd
Rename bench_parse_games.py -> bench_data_access.py and simplify
vladkvit Feb 8, 2026
9150f60
Rename bench files to match API function names and update README
vladkvit Feb 8, 2026
1e28b3c
Remove the last "flat" reference
vladkvit Feb 8, 2026
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default.

5 changes: 3 additions & 2 deletions Cargo.toml
@@ -18,11 +18,12 @@ rayon = "1.11"
 num_cpus = "1.17"
 arrow-array = "57"
 pyo3-arrow = "0.15"
+numpy = "0.27"
+parquet = "57"
+arrow = "57"
 
 [dev-dependencies]
 criterion = "0.8"
-parquet = "57"
-arrow = "57"
 
 [[bench]]
 harness = false
41 changes: 30 additions & 11 deletions README.md
@@ -5,26 +5,45 @@ This project adds Python bindings to [rust-pgn-reader](https://github.com/niklas
 ## Installing
 `pip install rust_pgn_reader_python_binding`
 
+## API
+
+Three entry points are available:
+
+- `parse_game(pgn)` - Parse a single PGN string
+- `parse_games(chunked_array)` - Parse games from a PyArrow ChunkedArray (multithreaded)
+- `parse_games_from_strings(pgns)` - Parse a list of PGN strings (multithreaded)
+
+All return a `ParsedGames` container with flat NumPy arrays, supporting:
+- Indexing (`result[i]`), slicing (`result[1:3]`), and iteration (`for game in result`)
+- Per-game views (`PyGameView`) with zero-copy array slices
+- Position-to-game and move-to-game mapping for ML workflows
+- Optional comment storage (`store_comments=True`)
+- Optional legal move storage (`store_legal_moves=True`)
+
 ## Benchmarks
-Below are some benchmarks on Lichess's 2013-07 chess games (293,459 games) on an 7800X3D.
+Below are some benchmarks on Lichess's 2013-07 chess games (293,459 games) on a 7800X3D.
 
 | Parser | File format | Time |
 |----------------------------------------------------------------------------|-------------|--------|
 | [rust-pgn-reader](https://github.com/niklasf/rust-pgn-reader/tree/master) | PGN | 1s |
-| rust_pgn_reader_python_binding | PGN | 4.7s |
-| rust_pgn_reader_python_binding, parse_game (single_threaded) | parquet | 3.3s |
-| rust_pgn_reader_python_binding, parse_games (multithreaded) | parquet | 0.5s |
-| rust_pgn_reader_python_binding, parse_game_moves_arrow_chunked_array (multithreaded) | parquet | 0.35s |
+| rust_pgn_reader_python_binding, parse_games (multithreaded) | parquet | 0.35s |
+| rust_pgn_reader_python_binding, parse_games_from_strings (multithreaded) | parquet | 0.5s |
+| rust_pgn_reader_python_binding, parse_game (single-threaded) | parquet | 3.3s |
+| rust_pgn_reader_python_binding, parse_game (single-threaded) | PGN | 4.7s |
 | [chess-library](https://github.com/Disservin/chess-library) | PGN | 2s |
 | [python-chess](https://github.com/niklasf/python-chess) | PGN | 3+ min |
 
 To replicate, download `2013-07-train-00000-of-00001.parquet` and then run:
 
-`python bench_parquet.py` (single-threaded parse_game)
+`python src/bench_parse_games.py` (recommended — multithreaded parse_games via Arrow)
+
+`python src/bench_parse_games_from_strings.py` (multithreaded parse_games_from_strings)
+
+`python src/bench_parse_game.py` (single-threaded parse_game from parquet)
 
-`python bench_parquet_parallel.py` (multithreaded parse_games)
+`python src/bench_parse_game_pgn.py` (single-threaded parse_game from .pgn file)
 
-`python bench_parquet_arrow.py` (multithreaded parse_game_moves_arrow_chunked_array)
+`python src/bench_data_access.py 2013-07-train-00000-of-00001.parquet` (parsing + data access + memory)
 
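For context, a sketch of what the recommended path boils down to (illustrative, not part of the diff; it assumes pyarrow is installed, and the `movetext` column name is taken from the bench code further down):

    import pyarrow.parquet as pq
    import rust_pgn_reader_python_binding as pgn

    # Read only the movetext column; Table.column() returns a ChunkedArray,
    # which is what parse_games expects.
    table = pq.read_table("2013-07-train-00000-of-00001.parquet", columns=["movetext"])
    result = pgn.parse_games(table.column("movetext"))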
 ## Building
 `maturin develop`
@@ -34,12 +53,12 @@
 For a more thorough tutorial, follow https://lukesalamone.github.io/posts/how-to-create-rust-python-bindings/
 
 ## Profiling
-`py-spy record -s -F -f speedscope --output profile.speedscope -- python ./src/bench_parquet.py`
+`py-spy record -s -F -f speedscope --output profile.speedscope -- python ./src/bench_parse_games.py`
 
 Linux/WSL-only:
-`py-spy record -s -F -n -f speedscope --output profile.speedscope -- python ./src/bench_parquet.py`
+`py-spy record -s -F -n -f speedscope --output profile.speedscope -- python ./src/bench_parse_games.py`
 
 ## Testing
 `cargo test`
 
-`python -m unittest src/test.py`
+`python -m unittest src/test.py`
139 changes: 115 additions & 24 deletions benches/parquet_bench.rs
@@ -1,63 +1,154 @@
+//! Benchmark for PGN parsing API, designed to mirror the Python workflow.
+//!
+//! `cargo bench --bench parquet_bench`
+//! `samply record --rate 10000 cargo bench --bench parquet_bench`
+
 use arrow::array::{Array, StringArray};
 use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
+use rayon::prelude::*;
+use rayon::ThreadPoolBuilder;
 use std::fs::File;
 use std::path::Path;
 use std::time::Instant;
 
-use rust_pgn_reader_python_binding::parse_multiple_games_native;
+use rust_pgn_reader_python_binding::{parse_game_to_buffers, Buffers, ParseConfig};
 
+const FILE_PATH: &str = "2013-07-train-00000-of-00001.parquet";
+
-pub fn bench_parquet() {
-    let file_path = "2013-07-train-00000-of-00001.parquet";
+/// Chunk multiplier for explicit chunking.
+/// 1 = exactly num_threads chunks (minimal overhead)
+/// Higher values provide better load balancing at cost of more buffers.
+const CHUNK_MULTIPLIER: usize = 1;
 
-    // Open the Parquet file
+/// Read parquet file and return the raw Arrow StringArrays.
+/// This preserves Arrow's memory layout for zero-copy string access.
+fn read_parquet_to_string_arrays(file_path: &str) -> Vec<StringArray> {
     let file = File::open(Path::new(file_path)).expect("Unable to open file");
     let builder = ParquetRecordBatchReaderBuilder::try_new(file)
         .expect("Failed to create ParquetRecordBatchReaderBuilder");
     let mut reader = builder
         .build()
         .expect("Failed to build ParquetRecordBatchReader");
 
-    // Process record batches
-    let mut movetexts = Vec::new();
+    let mut arrays = Vec::new();
     while let Some(batch) = reader
         .next()
         .transpose()
         .expect("Error reading record batch")
     {
-        // Extract "movetext" column from the record batch
         if let Some(array) = batch
             .column_by_name("movetext")
             .and_then(|col| col.as_any().downcast_ref::<StringArray>())
         {
-            for i in 0..array.len() {
-                if array.is_valid(i) {
-                    movetexts.push(array.value(i).to_string());
-                }
-            }
+            arrays.push(array.clone());
         } else {
             panic!("movetext column not found or not a StringArray");
         }
     }
+    arrays
+}
 
+/// Extract &str slices from Arrow StringArrays (zero-copy).
+fn extract_str_slices<'a>(arrays: &'a [StringArray]) -> Vec<&'a str> {
+    let total_len: usize = arrays.iter().map(|a| a.len()).sum();
+    let mut slices = Vec::with_capacity(total_len);
+
+    for array in arrays {
+        for i in 0..array.len() {
+            if array.is_valid(i) {
+                slices.push(array.value(i));
+            }
+        }
+    }
+    slices
+}
+
-    println!("Read {} rows.", movetexts.len());
-    // Measure start time
+/// Benchmark the parsing API workflow.
+///
+/// 1. Read parquet to Arrow arrays
+/// 2. Extract &str slices from StringArray
+/// 3. Parse in parallel with explicit chunking (par_chunks) -> fixed number of Buffers
+///
+/// No merge step - the chunked architecture keeps per-thread buffers as-is.
+pub fn bench_parse_api() {
+    let config = ParseConfig {
+        store_comments: false,
+        store_legal_moves: false,
+    };
+
+    // Step 1: Read parquet to Arrow StringArrays
+    let arrays = read_parquet_to_string_arrays(FILE_PATH);
+
+    // Step 2: Extract &str slices (zero-copy)
+    let pgn_slices = extract_str_slices(&arrays);
+    println!("Read {} games from parquet.", pgn_slices.len());
+
+    // Step 3: Build thread pool and compute capacity estimates
+    let num_threads = num_cpus::get();
+    let n_games = pgn_slices.len();
+    let moves_per_game = 70;
+
+    // Calculate chunk size for explicit chunking
+    let num_chunks = num_threads * CHUNK_MULTIPLIER;
+    let chunk_size = (n_games + num_chunks - 1) / num_chunks;
+    let chunk_size = chunk_size.max(1);
+    let games_per_chunk = chunk_size;
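    // Illustration (not part of the diff): with the 293,459-game file above
    // on a 16-thread 7800X3D and CHUNK_MULTIPLIER = 1, this yields
    // num_chunks = 16 and chunk_size = ceil(293459 / 16) = 18342 games/chunk.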
+
+    println!(
+        "Using {} threads, {} chunks, {} games/chunk",
+        num_threads, num_chunks, games_per_chunk
+    );
+
+    let thread_pool = ThreadPoolBuilder::new()
+        .num_threads(num_threads)
+        .build()
+        .expect("Failed to build Rayon thread pool");
+
+    // Step 4: Parse in parallel using par_chunks
     let start = Instant::now();
 
-    let result = parse_multiple_games_native(&movetexts, None, false);
+    let chunk_results: Vec<Buffers> = thread_pool.install(|| {
+        pgn_slices
+            .par_chunks(chunk_size)
+            .map(|chunk| {
+                let mut buffers = Buffers::with_capacity(games_per_chunk, moves_per_game, &config);
+                for &pgn in chunk {
+                    let _ = parse_game_to_buffers(pgn, &mut buffers, &config);
+                }
+                buffers
+            })
+            .collect()
+    });
 
-    let duration = start.elapsed();
-    println!("Time taken: {:?}", duration);
+    let duration_parallel = start.elapsed();
+    println!("Parallel parsing time: {:?}", duration_parallel);
+    println!(
+        "Created {} Buffers chunks (no merge needed)",
+        chunk_results.len()
+    );
 
-    match result {
-        Ok(parsed) => println!("Parsed {} games.", parsed.len()),
-        Err(err) => eprintln!("Error parsing games: {}", err),
-    }
+    // Compute totals from chunks
+    let total_games: usize = chunk_results.iter().map(|b| b.num_games()).sum();
+    let total_positions: usize = chunk_results.iter().map(|b| b.total_positions()).sum();
 
+    let duration_total = start.elapsed();
+    println!("Total time (parsing, no merge): {:?}", duration_total);
+    println!(
+        "Parsed {} games, {} total positions.",
+        total_games, total_positions
+    );
+
-    let duration2 = start.elapsed();
+    // Measure cleanup time
+    let drop_start = Instant::now();
+    drop(chunk_results);
+    let drop_duration = drop_start.elapsed();
 
-    println!("Time after checks: {:?}", duration2);
+    let total_duration = start.elapsed();
+    println!("Cleanup time (drop): {:?}", drop_duration);
+    println!("Total time (parsing + cleanup): {:?}", total_duration);
 }
 
 fn main() {
-    bench_parquet();
+    println!("=== Parse API (Buffers) ===\n");
+    bench_parse_api();
 }