diff --git a/Project.toml b/Project.toml index e04485c..db0589b 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "StreamSampling" uuid = "ff63dad9-3335-55d8-95ec-f8139d39e468" -version = "0.7.3" +version = "0.7.4" [deps] Accessors = "7d9f7c33-5ae7-4f3b-8dc6-eff91059b697" diff --git a/docs/src/example.md b/docs/src/example.md index a2c6c05..1fb9d0f 100644 --- a/docs/src/example.md +++ b/docs/src/example.md @@ -31,12 +31,10 @@ function generate_file(filename, format) end end elseif format == :arrow - open(Arrow.Writer, filename) do writer - for i in 1:numchunks - starttpl, endtpl = (i-1)*chunktpl+1, min(i*chunktpl, totaltpl) - Arrow.write(writer, (data=map(i -> (a=rand(), b=rand(), c=rand(), d=rand()), - 1:endtpl-starttpl+1),)) - end + for i in 1:numchunks + starttpl, endtpl = (i-1)*chunktpl+1, min(i*chunktpl, totaltpl) + Arrow.append("random_data.arrow", (data=map(i -> (a=rand(), b=rand(), c=rand(), d=rand()), + 1:endtpl-starttpl+1),);file=false) end end end @@ -129,9 +127,8 @@ rngs = [Xoshiro(i) for i in 1:Threads.nthreads()] As you can see, the speed-up is not linear in the number of threads for an hdf5 file. This is mainly due to the fact that accessing the chunks is single-threaded, so one would need to use -`MPI.jl` as explained at https://juliaio.github.io/HDF5.jl/stable/mpi/ to improve the multi-threading -performance. Though, we are already sampling at 500MB/s, which is not bad! - +`MPI.jl` as explained in [the HDF5.jl MPI documentation](https://juliaio.github.io/HDF5.jl/stable/mpi/) to +improve the multi-threading performance. Even so, we are already sampling at 500MB/s, which is not bad! Using `Arrow.jl` gives an even better performance, and a scalability which is better than linear somehow, reaching a 2GB/s sampling speed!