From d7110434326bab3888fed8a43b68a1583fb396e5 Mon Sep 17 00:00:00 2001 From: Adriano Meligrana <68152031+Tortar@users.noreply.github.com> Date: Fri, 15 Aug 2025 13:27:25 +0200 Subject: [PATCH 1/3] Use append in example --- docs/src/example.md | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/docs/src/example.md b/docs/src/example.md index a2c6c05..1a58828 100644 --- a/docs/src/example.md +++ b/docs/src/example.md @@ -31,12 +31,10 @@ function generate_file(filename, format) end end elseif format == :arrow - open(Arrow.Writer, filename) do writer - for i in 1:numchunks - starttpl, endtpl = (i-1)*chunktpl+1, min(i*chunktpl, totaltpl) - Arrow.write(writer, (data=map(i -> (a=rand(), b=rand(), c=rand(), d=rand()), - 1:endtpl-starttpl+1),)) - end + for i in 1:numchunks + starttpl, endtpl = (i-1)*chunktpl+1, min(i*chunktpl, totaltpl) + Arrow.append("random_data.arrow", (data=map(i -> (a=rand(), b=rand(), c=rand(), d=rand()), + 1:endtpl-starttpl+1),);file=false) end end end From 010381e27b8ae3c4d80e7b1a26e31051697ca31b Mon Sep 17 00:00:00 2001 From: Adriano Meligrana <68152031+Tortar@users.noreply.github.com> Date: Fri, 15 Aug 2025 13:27:54 +0200 Subject: [PATCH 2/3] Update Project.toml --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index e04485c..db0589b 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "StreamSampling" uuid = "ff63dad9-3335-55d8-95ec-f8139d39e468" -version = "0.7.3" +version = "0.7.4" [deps] Accessors = "7d9f7c33-5ae7-4f3b-8dc6-eff91059b697" From 66fe12120b7bfe795fd38dbd20072f5733d71c01 Mon Sep 17 00:00:00 2001 From: Adriano Meligrana <68152031+Tortar@users.noreply.github.com> Date: Fri, 15 Aug 2025 13:28:52 +0200 Subject: [PATCH 3/3] Update example.md --- docs/src/example.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/docs/src/example.md b/docs/src/example.md index 1a58828..1fb9d0f 100644 --- a/docs/src/example.md +++ b/docs/src/example.md @@ -127,9 +127,8 @@ rngs = [Xoshiro(i) for i in 1:Threads.nthreads()] As you can see, the speed-up is not linear in the number of threads for an hdf5 file. This is mainly due to the fact that accessing the chunks is single-threaded, so one would need to use -`MPI.jl` as explained at https://juliaio.github.io/HDF5.jl/stable/mpi/ to improve the multi-threading -performance. Though, we are already sampling at 500MB/s, which is not bad! - +`MPI.jl` as explained at [HDF5.jl/stable/mpi/](https://juliaio.github.io/HDF5.jl/stable/mpi/) to +improve the multi-threading performance. Though, we are already sampling at 500MB/s, which is not bad! Using `Arrow.jl` gives an even better performance, and a scalability which is better than linear somehow, reaching a 2GB/s sampling speed!