mratsim · mratsim · Dec 3, 2024 · Nov 27, 2024 · Nov 28, 2024 · Dec 3, 2024
diff --git a/benchmarks/bench_eth_eip4844_kzg.nim b/benchmarks/bench_eth_eip4844_kzg.nim
@@ -74,6 +74,7 @@ proc benchBlobToKzgCommitment(b: BenchSet, ctx: ptr EthereumKZGContext, iters: i
 
   ## We require `tp` to be unintialized as even idle threads somehow reduce perf of serial benches
   let tp = Threadpool.new()
+  let numThreads = tp.numThreads
 
   let startParallel = getMonotime()
   block:
@@ -88,7 +89,7 @@ proc benchBlobToKzgCommitment(b: BenchSet, ctx: ptr EthereumKZGContext, iters: i
   let perfParallel = inNanoseconds((stopParallel-startParallel) div iters)
 
   let parallelSpeedup = float(perfSerial) / float(perfParallel)
-  echo &"Speedup ratio parallel {tp.numThreads} threads over serial: {parallelSpeedup:>6.3f}x"
+  echo &"Speedup ratio parallel {numThreads} threads over serial: {parallelSpeedup:>6.3f}x"
 
 proc benchComputeKzgProof(b: BenchSet, ctx: ptr EthereumKZGContext, iters: int) =
 
@@ -102,6 +103,7 @@ proc benchComputeKzgProof(b: BenchSet, ctx: ptr EthereumKZGContext, iters: int)
 
   ## We require `tp` to be unintialized as even idle threads somehow reduce perf of serial benches
   let tp = Threadpool.new()
+  let numThreads = tp.numThreads
 
   let startParallel = getMonotime()
   block:
@@ -117,7 +119,7 @@ proc benchComputeKzgProof(b: BenchSet, ctx: ptr EthereumKZGContext, iters: int)
   let perfParallel = inNanoseconds((stopParallel-startParallel) div iters)
 
   let parallelSpeedup = float(perfSerial) / float(perfParallel)
-  echo &"Speedup ratio parallel {tp.numThreads} threads over serial: {parallelSpeedup:>6.3f}x"
+  echo &"Speedup ratio parallel {numThreads} threads over serial: {parallelSpeedup:>6.3f}x"
 
 proc benchComputeBlobKzgProof(b: BenchSet, ctx: ptr EthereumKZGContext, iters: int) =
 
@@ -130,6 +132,7 @@ proc benchComputeBlobKzgProof(b: BenchSet, ctx: ptr EthereumKZGContext, iters: i
 
   ## We require `tp` to be unintialized as even idle threads somehow reduce perf of serial benches
   let tp = Threadpool.new()
+  let numThreads = tp.numThreads
 
   let startParallel = getMonotime()
   block:
@@ -144,7 +147,7 @@ proc benchComputeBlobKzgProof(b: BenchSet, ctx: ptr EthereumKZGContext, iters: i
   let perfParallel = inNanoseconds((stopParallel-startParallel) div iters)
 
   let parallelSpeedup = float(perfSerial) / float(perfParallel)
-  echo &"Speedup ratio parallel {tp.numThreads} threads over serial: {parallelSpeedup:>6.3f}x"
+  echo &"Speedup ratio parallel {numThreads} threads over serial: {parallelSpeedup:>6.3f}x"
 
 proc benchVerifyKzgProof(b: BenchSet, ctx: ptr EthereumKZGContext, iters: int) =
 
@@ -163,6 +166,7 @@ proc benchVerifyBlobKzgProof(b: BenchSet, ctx: ptr EthereumKZGContext, iters: in
 
   ## We require `tp` to be unintialized as even idle threads somehow reduce perf of serial benches
   let tp = Threadpool.new()
+  let numThreads = tp.numThreads
 
   let startParallel = getMonotime()
   block:
@@ -176,7 +180,7 @@ proc benchVerifyBlobKzgProof(b: BenchSet, ctx: ptr EthereumKZGContext, iters: in
   let perfParallel = inNanoseconds((stopParallel-startParallel) div iters)
 
   let parallelSpeedup = float(perfSerial) / float(perfParallel)
-  echo &"Speedup ratio parallel {tp.numThreads} threads over serial: {parallelSpeedup:>6.3f}x"
+  echo &"Speedup ratio parallel {numThreads} threads over serial: {parallelSpeedup:>6.3f}x"
 
 proc benchVerifyBlobKzgProofBatch(b: BenchSet, ctx: ptr EthereumKZGContext, iters: int) =
 
@@ -201,6 +205,7 @@ proc benchVerifyBlobKzgProofBatch(b: BenchSet, ctx: ptr EthereumKZGContext, iter
 
     ## We require `tp` to be unintialized as even idle threads somehow reduce perf of serial benches
     let tp = Threadpool.new()
+    let numThreads = tp.numThreads
 
     let startParallel = getMonotime()
     block:
@@ -220,7 +225,7 @@ proc benchVerifyBlobKzgProofBatch(b: BenchSet, ctx: ptr EthereumKZGContext, iter
     let perfParallel = inNanoseconds((stopParallel-startParallel) div iters)
 
     let parallelSpeedup = float(perfSerial) / float(perfParallel)
-    echo &"Speedup ratio parallel {tp.numThreads} threads over serial: {parallelSpeedup:>6.3f}x"
+    echo &"Speedup ratio parallel {numThreads} threads over serial: {parallelSpeedup:>6.3f}x"
     echo ""
 
     i *= 2
@@ -258,7 +263,7 @@ proc main() =
   echo ""
   benchVerifyBlobKzgProofBatch(b, ctx, Iters)
   separator()
-
+  ctx.trusted_setup_delete()
 
 when isMainModule:
   main()
diff --git a/constantine/math/elliptic/ec_multi_scalar_mul.nim b/constantine/math/elliptic/ec_multi_scalar_mul.nim
@@ -49,8 +49,8 @@ func multiScalarMulImpl_reference_vartime[bits: static int, EC, ECaff](
   const numBuckets = 1 shl c - 1 # bucket 0 is unused
   const numWindows = bits.ceilDiv_vartime(c)
 
-  let miniMSMs = allocHeapArray(EC, numWindows)
-  let buckets = allocHeapArray(EC, numBuckets)
+  let miniMSMs = allocHeapArrayAligned(EC, numWindows, alignment = 64)
+  let buckets = allocHeapArrayAligned(EC, numBuckets, alignment = 64)
 
   # Algorithm
   # ---------
@@ -91,8 +91,8 @@ func multiScalarMulImpl_reference_vartime[bits: static int, EC, ECaff](
 
   # Cleanup
   # -------
-  buckets.freeHeap()
-  miniMSMs.freeHeap()
+  buckets.freeHeapAligned()
+  miniMSMs.freeHeapAligned()
 
 func multiScalarMul_reference_dispatch_vartime[bits: static int, EC, ECaff](
        r: var EC,
@@ -151,7 +151,7 @@ func multiScalarMul_reference_vartime*[F, EC, ECaff](
   coefs_big.batchFromField(coefs, n)
   r.multiScalarMul_reference_vartime(coefs_big, points, n)
 
-  freeHeapAligned(coefs_big)
+  coefs_big.freeHeapAligned()
 
 func multiScalarMul_reference_vartime*[EC, ECaff](
        r: var EC,
@@ -264,7 +264,7 @@ func msmImpl_vartime[bits: static int, EC, ECaff](
   # -----
   const numBuckets = 1 shl (c-1)
 
-  let buckets = allocHeapArray(EC, numBuckets)
+  let buckets = allocHeapArrayAligned(EC, numBuckets, alignment = 64)
   for i in 0 ..< numBuckets:
     buckets[i].setNeutral()
 
@@ -293,7 +293,7 @@ func msmImpl_vartime[bits: static int, EC, ECaff](
 
   # Cleanup
   # -------
-  buckets.freeHeap()
+  buckets.freeHeapAligned()
 
 # Multi scalar multiplication with batched affine additions
 # -----------------------------------------------------------------------------------------------------------------------
@@ -357,8 +357,8 @@ func msmAffineImpl_vartime[bits: static int, EC, ECaff](
   # Setup
   # -----
   const (numBuckets, queueLen) = c.deriveSchedulerConstants()
-  let buckets = allocHeap(Buckets[numBuckets, EC, ECaff])
-  let sched = allocHeap(Scheduler[numBuckets, queueLen, EC, ECaff])
+  let buckets = allocHeapAligned(Buckets[numBuckets, EC, ECaff], alignment = 64)
+  let sched = allocHeapAligned(Scheduler[numBuckets, queueLen, EC, ECaff], alignment = 64)
   sched.init(points, buckets, 0, numBuckets.int32)
 
   # Algorithm
@@ -389,8 +389,8 @@ func msmAffineImpl_vartime[bits: static int, EC, ECaff](
 
   # Cleanup
   # -------
-  sched.freeHeap()
-  buckets.freeHeap()
+  sched.freeHeapAligned()
+  buckets.freeHeapAligned()
 
 # Endomorphism acceleration
 # -----------------------------------------------------------------------------------------------------------------------
@@ -410,8 +410,8 @@ proc applyEndomorphism[bits: static int, ECaff](
             else: ECaff.G
 
   const L = ECaff.getScalarField().bits().computeEndoRecodedLength(M)
-  let splitCoefs   = allocHeapArray(array[M, BigInt[L]], N)
-  let endoBasis    = allocHeapArray(array[M, ECaff], N)
+  let splitCoefs   = allocHeapArrayAligned(array[M, BigInt[L]], N, alignment = 64)
+  let endoBasis    = allocHeapArrayAligned(array[M, ECaff], N, alignment = 64)
 
   for i in 0 ..< N:
     var negatePoints {.noinit.}: array[M, SecretBool]
@@ -448,8 +448,8 @@ template withEndo[coefsBits: static int, EC, ECaff](
     # Given that bits and N changed, we are able to use a bigger `c`
     # but it has no significant impact on performance
     msmProc(r, endoCoefs, endoPoints, endoN, c)
-    freeHeap(endoCoefs)
-    freeHeap(endoPoints)
+    endoCoefs.freeHeapAligned()
+    endoPoints.freeHeapAligned()
   else:
     msmProc(r, coefs, points, N, c)
 
@@ -555,7 +555,7 @@ func multiScalarMul_vartime*[F, EC, ECaff](
   coefs_big.batchFromField(coefs, n)
   r.multiScalarMul_vartime(coefs_big, points, n)
 
-  freeHeapAligned(coefs_big)
+  coefs_big.freeHeapAligned()
 
 func multiScalarMul_vartime*[EC, ECaff](
        r: var EC,