diff --git a/benchmarks/bench_ec_msm_coeffs_with_zeroes.nim b/benchmarks/bench_ec_msm_coeffs_with_zeroes.nim new file mode 100644 index 000000000..e3c160fa6 --- /dev/null +++ b/benchmarks/bench_ec_msm_coeffs_with_zeroes.nim @@ -0,0 +1,128 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +import + # Internals + constantine/threadpool, + constantine/named/[algebras, zoo_subgroups], + constantine/math/arithmetic, + constantine/math/ec_shortweierstrass, + # Helpers + helpers/prng_unsafe, + ./bench_elliptic_parallel_template +# ./bench_msm_impl_optional_drop_windows + +# ############################################################ +# +# Benchmark of the G1 group of +# Short Weierstrass elliptic curves +# in (homogeneous) projective coordinates +# +# ############################################################ + +type + BenchTimes = tuple[numInputs: int, bits: int, bAll, bWo, oAll, oWo: float] + +proc msmBench*[EC](ctx: var BenchMsmContext[EC], numInputs: int, iters: int, bits: int): BenchTimes = + const bigIntBits = EC.getScalarField().bits() + type ECaff = affine(EC) + + template coefs: untyped = ctx.coefs.toOpenArray(0, numInputs-1) + template points: untyped = ctx.points.toOpenArray(0, numInputs-1) + + template benchIt(body: untyped): untyped = + block: + var useZeroWindows {.inject.} = true + let startAll = getMonotime() + block: + body + let stopAll = getMonoTime() + useZeroWindows = false + let startWo = getMonoTime() + block: + body + let stopWo = getMonotime() + (all: float inNanoseconds(stopAll - startAll), + wo: float inNanoseconds(stopWo - startWo)) + + var r{.noInit.}: EC + var startNaive, stopNaive, startbaseline, stopbaseline, startopt, stopopt, startpara, stoppara: MonoTime + + let (bAll, bWo) = benchIt: + bench(&"EC multi-scalar-mul baseline {align($numInputs, 10)} ({bigIntBits}-bit coefs, points), nonZeroBits = {bits}, useZeroWindows = {useZeroWindows}", EC, iters): + r.multiScalarMul_reference_vartime(coefs, points, useZeroWindows) + let (oAll, oWo) = benchIt: + bench(&"EC multi-scalar-mul optimized {align($numInputs, 10)} ({bigIntBits}-bit coefs, points), nonZeroBits = {bits}, useZeroWindows = {useZeroWindows}", EC, iters): + r.multiScalarMul_vartime(coefs, points, useZeroWindows) + + let pbAll = bAll / iters.float + let pbWo = bWo / iters.float + let poAll = oAll / iters.float + let poWo = oWo / iters.float + + echo &"total time baseline (useZeroWindows = true) = {bAll / 1e9} s" + echo &"total time baseline (useZeroWindows = false) = {bWo / 1e9} s" + echo &"total time optimized (useZeroWindows = true) = {oAll / 1e9} s" + echo &"total time optimized (useZeroWindows = false) = {oWo / 1e9} s" + + echo &"Speedup ratio baseline with & without all windows: {pbAll / pbWo:>6.3f}x" + echo &"Speedup ratio optimized with & without all windows: {poAll / poWo:>6.3f}x" + echo &"Speedup ratio optimized over baseline with all windows: {pbAll / poAll:>6.3f}x" + echo &"Speedup ratio optimized over baseline without all windows: {pbWo / poWo:>6.3f}x" + + result = (numInputs: numInputs, bits: bits, bAll: bAll, bWo: bWo, oAll: oAll, oWo: oWo) + +const Iters = 10_000 +const AvailableCurves = [ + BLS12_381, +] + +const testNumPoints = [2, 8, 64, 1024, 4096, 65536, 1048576] #, 4194304, 8388608, 16777216] + +template canImport(x: untyped): bool = + compiles: + import x + +when canImport(ggplotnim): + import ggplotnim +else: + {.error: "This benchmarks requires `ggplotnim` to produce a plot of the benchmark results.".} +proc main() = + separator() + staticFor i, 0, AvailableCurves.len: + const curve = AvailableCurves[i] + const maxBits = [1, 32, 128, 512] # [1, 8, 16, 32, 64, 128, 256, 512] # how many bits are set in the coefficients + var df = newDataFrame() + for bits in maxBits: + var ctx = createBenchMsmContext(EC_ShortW_Jac[Fp[curve], G1], testNumPoints, bits) + separator() + for numPoints in testNumPoints: + let batchIters = max(1, Iters div numPoints) + df.add ctx.msmBench(numPoints, batchIters, bits) + separator() + separator() + echo "\n\n\n" + separator() + separator() + + df = df.gather(["bAll", "bWo", "oAll", "oWo"], "Bench", "Time") + .mutate(f{"Time" ~ `Time` * 1e-9}) + df.writeCsv("/tmp/data.csv") + ggplot(df, aes("numInputs", "Time", shape = "Bench", color = "bits")) + + geom_point() + + scale_x_continuous() + + scale_x_log2(breaks = @testNumPoints) + scale_y_log10() + + xlab("Number of inputs of the MSM") + ylab("Time [s]") + + ggtitle("bits = number of bits set in coefficients") + + margin(right = 4) + + xMargin(0.05) + + theme_scale(1.2) + + ggsave("plots/bench_result.pdf") + +main() +notes() diff --git a/benchmarks/bench_elliptic_parallel_template.nim b/benchmarks/bench_elliptic_parallel_template.nim index 647a9f81b..19ccac6d6 100644 --- a/benchmarks/bench_elliptic_parallel_template.nim +++ b/benchmarks/bench_elliptic_parallel_template.nim @@ -29,6 +29,22 @@ import export bench_elliptic_template +from std / math import divmod +proc random_coefficient*[N: static int](rng: var RngState, maxBit: int = 0): BigInt[N] = + ## Initializes a random BigInt[N] with `maxBit` as the most significant bit + ## of it. + ## If `maxBit` is set to zero, the coefficient will utilize all bits. + const WordSize = 64 + let toShift = result.limbs.len * WordSize - maxBit + let (d, r) = divmod(toShift, WordSize) # how many limbs to zero & how many bits in next limb + result = rng.random_unsafe(BigInt[N]) + if maxBit == 0 or maxBit >= N: return # use all bits + let limbs = result.limbs.len + for i in countdown(limbs-1, limbs - d): + result.limbs[i] = SecretWord(0'u64) # zero most significant limbs + result.shiftRight(r) # shift right by remaining required + + # ############################################################ # # Parallel Benchmark definitions @@ -55,11 +71,14 @@ proc multiAddParallelBench*(EC: typedesc, numInputs: int, iters: int) = type BenchMsmContext*[EC] = object tp: Threadpool - numInputs: int - coefs: seq[getBigInt(EC.getName(), kScalarField)] - points: seq[affine(EC)] - -proc createBenchMsmContext*(EC: typedesc, inputSizes: openArray[int]): BenchMsmContext[EC] = + numInputs*: int + coefs*: seq[getBigInt(EC.getName(), kScalarField)] + points*: seq[affine(EC)] + +proc createBenchMsmContext*(EC: typedesc, inputSizes: openArray[int], + maxBit = 0): BenchMsmContext[EC] = + ## `maxBit` sets the maximum bit set in the coefficients that are randomly sampled. + ## Useful to benchmark MSM with many leading zeroes. result.tp = Threadpool.new() let maxNumInputs = inputSizes.max() @@ -70,7 +89,9 @@ proc createBenchMsmContext*(EC: typedesc, inputSizes: openArray[int]): BenchMsmC result.points = newSeq[ECaff](maxNumInputs) result.coefs = newSeq[BigInt[bits]](maxNumInputs) - proc genCoefPointPairsChunk[EC, ECaff](rngSeed: uint64, start, len: int, points: ptr ECaff, coefs: ptr BigInt[bits]) {.nimcall.} = + proc genCoefPointPairsChunk[EC, ECaff](rngSeed: uint64, start, len: int, + points: ptr ECaff, + coefs: ptr BigInt[bits], maxBit: int) {.nimcall.} = let points = cast[ptr UncheckedArray[ECaff]](points) let coefs = cast[ptr UncheckedArray[BigInt[bits]]](coefs) @@ -82,7 +103,7 @@ proc createBenchMsmContext*(EC: typedesc, inputSizes: openArray[int]): BenchMsmC var tmp = threadRng.random_unsafe(EC) tmp.clearCofactor() points[i].affine(tmp) - coefs[i] = threadRng.random_unsafe(BigInt[bits]) + coefs[i] = random_coefficient[bits](threadRng, maxBit) let chunks = balancedChunksPrioNumber(0, maxNumInputs, result.tp.numThreads) @@ -94,7 +115,10 @@ proc createBenchMsmContext*(EC: typedesc, inputSizes: openArray[int]): BenchMsmC syncScope: for (id, start, size) in items(chunks): - result.tp.spawn genCoefPointPairsChunk[EC, ECaff](rng.next(), start, size, result.points[0].addr, result.coefs[0].addr) + result.tp.spawn genCoefPointPairsChunk[EC, ECaff]( + rng.next(), start, size, + result.points[0].addr, result.coefs[0].addr, maxBit + ) # Even if child threads are sleeping, it seems like perf is lower when there are threads around # maybe because the kernel has more overhead or time quantum to keep track off so shut them down. diff --git a/constantine/math/elliptic/ec_multi_scalar_mul.nim b/constantine/math/elliptic/ec_multi_scalar_mul.nim index 25c1fc754..bf40584e5 100644 --- a/constantine/math/elliptic/ec_multi_scalar_mul.nim +++ b/constantine/math/elliptic/ec_multi_scalar_mul.nim @@ -37,17 +37,51 @@ export bestBucketBitSize, abstractions # # See the litterature references at the top of `ec_multi_scalar_mul_scheduler.nim` +proc determineBitsSet*[bits: static int](coefs: ptr UncheckedArray[BigInt[bits]], N: int): int = + ## Computes the highest bit set in the collection of all MSM coefficients. + ## Useful to avoid bucket computation in all most-significant-windows that + ## are fully zero in all coefficients. + const WordSize = 64 + var acc: BigInt[bits] + let numLimbs = acc.limbs.len + # OR all coefficients + for i in 0 ..< N: + let coef = coefs[i] + for j in 0 ..< numLimbs: + acc.limbs[j] = acc.limbs[j] or coef.limbs[j] + + # Determine most significant set bit + 1 (only first byte set -> msb == 8) + var msb = -1 + block MsbCheck: + + for i in countdown(numLimbs-1, 0): + for j in countdown(WordSize - 1, 0): # Check each bit + let val = (1'u64 shl j) + if (acc.limbs[i].uint64 and val) != 0: + msb = i * WordSize + j + 1 + break MsbCheck + result = msb + func multiScalarMulImpl_reference_vartime[bits: static int, EC, ECaff]( r: var EC, coefs: ptr UncheckedArray[BigInt[bits]], points: ptr UncheckedArray[ECaff], - N: int, c: static int) {.tags:[VarTime, HeapAlloc].} = + N: int, c: static int, + useZeroWindows: bool) {.tags:[VarTime, HeapAlloc].} = ## Inner implementation of MSM, for static dispatch over c, the bucket bit length ## This is a straightforward simple translation of BDLO12, section 4 - # Prologue # -------- + let numWindows = + if not useZeroWindows: + var msb = determineBitsSet[bits](coefs, N) + if msb < 0: # return early, nothing to do, all empty + return + # round up to next window of c + msb = ceilDiv_vartime(msb, c) # * c + msb + else: + bits.ceilDiv_vartime(c) const numBuckets = 1 shl c - 1 # bucket 0 is unused - const numWindows = bits.ceilDiv_vartime(c) let miniMSMs = allocHeapArray(EC, numWindows) let buckets = allocHeapArray(EC, numBuckets) @@ -88,7 +122,6 @@ func multiScalarMulImpl_reference_vartime[bits: static int, EC, ECaff]( for _ in 0 ..< c: r.double() r ~+= miniMSMs[w] - # Cleanup # ------- buckets.freeHeap() @@ -98,28 +131,29 @@ func multiScalarMul_reference_dispatch_vartime[bits: static int, EC, ECaff]( r: var EC, coefs: ptr UncheckedArray[BigInt[bits]], points: ptr UncheckedArray[ECaff], - N: int) {.tags:[VarTime, HeapAlloc].} = + N: int, + useZeroWindows: bool) {.tags:[VarTime, HeapAlloc].} = ## Multiscalar multiplication: ## r <- [a₀]P₀ + [a₁]P₁ + ... + [aₙ]Pₙ let c = bestBucketBitSize(N, bits, useSignedBuckets = false, useManualTuning = false) case c - of 2: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 2) - of 3: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 3) - of 4: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 4) - of 5: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 5) - of 6: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 6) - of 7: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 7) - of 8: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 8) - of 9: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 9) - of 10: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 10) - of 11: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 11) - of 12: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 12) - of 13: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 13) - of 14: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 14) - of 15: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 15) - - of 16..20: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 16) + of 2: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 2, useZeroWindows) + of 3: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 3, useZeroWindows) + of 4: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 4, useZeroWindows) + of 5: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 5, useZeroWindows) + of 6: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 6, useZeroWindows) + of 7: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 7, useZeroWindows) + of 8: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 8, useZeroWindows) + of 9: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 9, useZeroWindows) + of 10: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 10, useZeroWindows) + of 11: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 11, useZeroWindows) + of 12: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 12, useZeroWindows) + of 13: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 13, useZeroWindows) + of 14: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 14, useZeroWindows) + of 15: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 15, useZeroWindows) + + of 16..20: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 16, useZeroWindows) else: unreachable() @@ -127,41 +161,45 @@ func multiScalarMul_reference_vartime*[bits: static int, EC, ECaff]( r: var EC, coefs: ptr UncheckedArray[BigInt[bits]], points: ptr UncheckedArray[ECaff], - N: int) {.tags:[VarTime, HeapAlloc].} = + N: int, + useZeroWindows: bool) {.tags:[VarTime, HeapAlloc].} = ## Multiscalar multiplication: ## r <- [a₀]P₀ + [a₁]P₁ + ... + [aₙ]Pₙ - multiScalarMul_reference_dispatch_vartime(r, coefs, points, N) + multiScalarMul_reference_dispatch_vartime(r, coefs, points, N, useZeroWindows) -func multiScalarMul_reference_vartime*[EC, ECaff](r: var EC, coefs: openArray[BigInt], points: openArray[ECaff]) {.tags:[VarTime, HeapAlloc].} = +func multiScalarMul_reference_vartime*[EC, ECaff](r: var EC, coefs: openArray[BigInt], points: openArray[ECaff], + useZeroWindows: bool) {.tags:[VarTime, HeapAlloc].} = ## Multiscalar multiplication: ## r <- [a₀]P₀ + [a₁]P₁ + ... + [aₙ]Pₙ debug: doAssert coefs.len == points.len let N = points.len - multiScalarMul_reference_dispatch_vartime(r, coefs.asUnchecked(), points.asUnchecked(), N) + multiScalarMul_reference_dispatch_vartime(r, coefs.asUnchecked(), points.asUnchecked(), N, useZeroWindows) func multiScalarMul_reference_vartime*[F, EC, ECaff]( r: var EC, coefs: ptr UncheckedArray[F], points: ptr UncheckedArray[ECaff], - len: int) {.tags:[VarTime, Alloca, HeapAlloc], meter.} = + len: int, + useZeroWindows: bool) {.tags:[VarTime, Alloca, HeapAlloc].} = ## Multiscalar multiplication: ## r <- [a₀]P₀ + [a₁]P₁ + ... + [aₙ₋₁]Pₙ₋₁ let n = cast[int](len) let coefs_big = allocHeapArrayAligned(F.getBigInt(), n, alignment = 64) coefs_big.batchFromField(coefs, n) - r.multiScalarMul_reference_vartime(coefs_big, points, n) + r.multiScalarMul_reference_vartime(coefs_big, points, n, useZeroWindows) freeHeapAligned(coefs_big) func multiScalarMul_reference_vartime*[EC, ECaff]( r: var EC, coefs: openArray[Fr], - points: openArray[ECaff]) {.tags:[VarTime, Alloca, HeapAlloc], inline.} = + points: openArray[ECaff], + useZeroWindows: bool) {.tags:[VarTime, Alloca, HeapAlloc].} = ## Multiscalar multiplication: ## r <- [a₀]P₀ + [a₁]P₁ + ... + [aₙ₋₁]Pₙ₋₁ debug: doAssert coefs.len == points.len let N = points.len - multiScalarMul_reference_vartime(r, coefs.asUnchecked(), points.asUnchecked(), N) + multiScalarMul_reference_vartime(r, coefs.asUnchecked(), points.asUnchecked(), N, useZeroWindows) # ########################################################### # # # @@ -256,27 +294,41 @@ func miniMSM[bits: static int, EC, ECaff]( func msmImpl_vartime[bits: static int, EC, ECaff]( r: var EC, coefs: ptr UncheckedArray[BigInt[bits]], points: ptr UncheckedArray[ECaff], - N: int, c: static int) {.tags:[VarTime, HeapAlloc], meter.} = + N: int, c: static int, + useZeroWindows: bool) {.tags:[VarTime, HeapAlloc], meter.} = ## Multiscalar multiplication: ## r <- [a₀]P₀ + [a₁]P₁ + ... + [aₙ]Pₙ # Setup # ----- const numBuckets = 1 shl (c-1) - let buckets = allocHeapArray(EC, numBuckets) for i in 0 ..< numBuckets: buckets[i].setNeutral() # Algorithm # --------- - const excess = bits mod c - const top = bits - excess + var excess = bits mod c + let top = + if not useZeroWindows: + var msb = determineBitsSet[bits](coefs, N) + if msb < 0: # return early, nothing to do, all empty + return + # round up to next window of c + if msb < bits - excess: + msb = ceilDiv_vartime(msb, c) * c + excess = 0 + msb + else: # in this case, we don't gain anything by modiying windows + bits - excess + else: + bits - excess + var w = top r.setNeutral() - when top != 0: # Prologue - when excess != 0: + if top != 0: # Prologue + if excess != 0: r.miniMSM(buckets, w, kTopWindow, c, coefs, points, N) w -= c else: @@ -350,10 +402,10 @@ func miniMSM_affine[NumBuckets, QueueLen, EC, ECaff; bits: static int]( func msmAffineImpl_vartime[bits: static int, EC, ECaff]( r: var EC, coefs: ptr UncheckedArray[BigInt[bits]], points: ptr UncheckedArray[ECaff], - N: int, c: static int) {.tags:[VarTime, Alloca, HeapAlloc], meter.} = + N: int, c: static int, + useZeroWindows: bool) {.tags:[VarTime, Alloca, HeapAlloc], meter.} = ## Multiscalar multiplication: ## r <- [a₀]P₀ + [a₁]P₁ + ... + [aₙ]Pₙ - # Setup # ----- const (numBuckets, queueLen) = c.deriveSchedulerConstants() @@ -363,13 +415,27 @@ func msmAffineImpl_vartime[bits: static int, EC, ECaff]( # Algorithm # --------- - const excess = bits mod c - const top = bits - excess + var excess = bits mod c + let top = + if not useZeroWindows: + var msb = determineBitsSet[bits](coefs, N) + if msb < 0: # return early, nothing to do, all empty + return + # round up to next window of c + if msb < bits - excess: + msb = ceilDiv_vartime(msb, c) * c + excess = 0 + msb + else: # in this case, we don't gain anything by modiying windows + bits - excess + else: + bits - excess + var w = top r.setNeutral() - when top != 0: # Prologue - when excess != 0: + if top != 0: # Prologue + if excess != 0: # The top might use only a few bits, the affine scheduler would likely have significant collisions for i in 0 ..< numBuckets: sched.buckets.pt[i].setNeutral() @@ -436,7 +502,8 @@ template withEndo[coefsBits: static int, EC, ECaff]( r: var EC, coefs: ptr UncheckedArray[BigInt[coefsBits]], points: ptr UncheckedArray[ECaff], - N: int, c: static int) = + N: int, c: static int, + useZeroWindows: bool) = when hasEndomorphismAcceleration(EC.getName()) and EndomorphismThreshold <= coefsBits and coefsBits <= EC.getScalarField().bits() and @@ -447,7 +514,7 @@ template withEndo[coefsBits: static int, EC, ECaff]( let (endoCoefs, endoPoints, endoN) = applyEndomorphism(coefs, points, N) # Given that bits and N changed, we are able to use a bigger `c` # but it has no significant impact on performance - msmProc(r, endoCoefs, endoPoints, endoN, c) + msmProc(r, endoCoefs, endoPoints, endoN, c, useZeroWindows) freeHeap(endoCoefs) freeHeap(endoPoints) else: @@ -459,7 +526,8 @@ template withEndo[coefsBits: static int, EC, ECaff]( func msm_dispatch_vartime[bits: static int, F, G]( r: var (EC_ShortW_Jac[F, G] or EC_ShortW_Prj[F, G]), coefs: ptr UncheckedArray[BigInt[bits]], - points: ptr UncheckedArray[EC_ShortW_Aff[F, G]], N: int) = + points: ptr UncheckedArray[EC_ShortW_Aff[F, G]], N: int, + useZeroWindows: bool) = ## Multiscalar multiplication: ## r <- [a₀]P₀ + [a₁]P₁ + ... + [aₙ]Pₙ let c = bestBucketBitSize(N, bits, useSignedBuckets = true, useManualTuning = true) @@ -469,29 +537,30 @@ func msm_dispatch_vartime[bits: static int, F, G]( # but it has no significant impact on performance case c - of 2: withEndo(msmImpl_vartime, r, coefs, points, N, c = 2) - of 3: withEndo(msmImpl_vartime, r, coefs, points, N, c = 3) - of 4: withEndo(msmImpl_vartime, r, coefs, points, N, c = 4) - of 5: withEndo(msmImpl_vartime, r, coefs, points, N, c = 5) - of 6: withEndo(msmImpl_vartime, r, coefs, points, N, c = 6) - of 7: withEndo(msmImpl_vartime, r, coefs, points, N, c = 7) - of 8: withEndo(msmImpl_vartime, r, coefs, points, N, c = 8) - - of 9: withEndo(msmAffineImpl_vartime, r, coefs, points, N, c = 9) - of 10: withEndo(msmAffineImpl_vartime, r, coefs, points, N, c = 10) - of 11: withEndo(msmAffineImpl_vartime, r, coefs, points, N, c = 11) - of 12: withEndo(msmAffineImpl_vartime, r, coefs, points, N, c = 12) - of 13: withEndo(msmAffineImpl_vartime, r, coefs, points, N, c = 13) - of 14: msmAffineImpl_vartime(r, coefs, points, N, c = 14) - of 15: msmAffineImpl_vartime(r, coefs, points, N, c = 15) - - of 16..17: msmAffineImpl_vartime(r, coefs, points, N, c = 16) + of 2: withEndo(msmImpl_vartime, r, coefs, points, N, c = 2, useZeroWindows) + of 3: withEndo(msmImpl_vartime, r, coefs, points, N, c = 3, useZeroWindows) + of 4: withEndo(msmImpl_vartime, r, coefs, points, N, c = 4, useZeroWindows) + of 5: withEndo(msmImpl_vartime, r, coefs, points, N, c = 5, useZeroWindows) + of 6: withEndo(msmImpl_vartime, r, coefs, points, N, c = 6, useZeroWindows) + of 7: withEndo(msmImpl_vartime, r, coefs, points, N, c = 7, useZeroWindows) + of 8: withEndo(msmImpl_vartime, r, coefs, points, N, c = 8, useZeroWindows) + + of 9: withEndo(msmAffineImpl_vartime, r, coefs, points, N, c = 9, useZeroWindows) + of 10: withEndo(msmAffineImpl_vartime, r, coefs, points, N, c = 10, useZeroWindows) + of 11: withEndo(msmAffineImpl_vartime, r, coefs, points, N, c = 11, useZeroWindows) + of 12: withEndo(msmAffineImpl_vartime, r, coefs, points, N, c = 12, useZeroWindows) + of 13: withEndo(msmAffineImpl_vartime, r, coefs, points, N, c = 13, useZeroWindows) + of 14: msmAffineImpl_vartime(r, coefs, points, N, c = 14, useZeroWindows) + of 15: msmAffineImpl_vartime(r, coefs, points, N, c = 15, useZeroWindows) + + of 16..17: msmAffineImpl_vartime(r, coefs, points, N, c = 16, useZeroWindows) else: unreachable() func msm_dispatch_vartime[bits: static int, F]( r: var EC_TwEdw_Prj[F], coefs: ptr UncheckedArray[BigInt[bits]], - points: ptr UncheckedArray[EC_TwEdw_Aff[F]], N: int) = + points: ptr UncheckedArray[EC_TwEdw_Aff[F]], N: int, + useZeroWindows: bool) = ## Multiscalar multiplication: ## r <- [a₀]P₀ + [a₁]P₁ + ... + [aₙ]Pₙ @@ -503,22 +572,22 @@ func msm_dispatch_vartime[bits: static int, F]( # but it has no significant impact on performance case c - of 2: withEndo(msmImpl_vartime, r, coefs, points, N, c = 2) - of 3: withEndo(msmImpl_vartime, r, coefs, points, N, c = 3) - of 4: withEndo(msmImpl_vartime, r, coefs, points, N, c = 4) - of 5: withEndo(msmImpl_vartime, r, coefs, points, N, c = 5) - of 6: withEndo(msmImpl_vartime, r, coefs, points, N, c = 6) - of 7: withEndo(msmImpl_vartime, r, coefs, points, N, c = 7) - of 8: withEndo(msmImpl_vartime, r, coefs, points, N, c = 8) - of 9: withEndo(msmImpl_vartime, r, coefs, points, N, c = 9) - of 10: withEndo(msmImpl_vartime, r, coefs, points, N, c = 10) - of 11: withEndo(msmImpl_vartime, r, coefs, points, N, c = 11) - of 12: withEndo(msmImpl_vartime, r, coefs, points, N, c = 12) - of 13: withEndo(msmImpl_vartime, r, coefs, points, N, c = 13) - of 14: msmImpl_vartime(r, coefs, points, N, c = 14) - of 15: msmImpl_vartime(r, coefs, points, N, c = 15) - - of 16..17: msmImpl_vartime(r, coefs, points, N, c = 16) + of 2: withEndo(msmImpl_vartime, r, coefs, points, N, c = 2, useZeroWindows) + of 3: withEndo(msmImpl_vartime, r, coefs, points, N, c = 3, useZeroWindows) + of 4: withEndo(msmImpl_vartime, r, coefs, points, N, c = 4, useZeroWindows) + of 5: withEndo(msmImpl_vartime, r, coefs, points, N, c = 5, useZeroWindows) + of 6: withEndo(msmImpl_vartime, r, coefs, points, N, c = 6, useZeroWindows) + of 7: withEndo(msmImpl_vartime, r, coefs, points, N, c = 7, useZeroWindows) + of 8: withEndo(msmImpl_vartime, r, coefs, points, N, c = 8, useZeroWindows) + of 9: withEndo(msmImpl_vartime, r, coefs, points, N, c = 9, useZeroWindows) + of 10: withEndo(msmImpl_vartime, r, coefs, points, N, c = 10, useZeroWindows) + of 11: withEndo(msmImpl_vartime, r, coefs, points, N, c = 11, useZeroWindows) + of 12: withEndo(msmImpl_vartime, r, coefs, points, N, c = 12, useZeroWindows) + of 13: withEndo(msmImpl_vartime, r, coefs, points, N, c = 13, useZeroWindows) + of 14: msmImpl_vartime(r, coefs, points, N, c = 14, useZeroWindows) + of 15: msmImpl_vartime(r, coefs, points, N, c = 15, useZeroWindows) + + of 16..17: msmImpl_vartime(r, coefs, points, N, c = 16, useZeroWindows) else: unreachable() @@ -526,43 +595,47 @@ func multiScalarMul_vartime*[bits: static int, EC, ECaff]( r: var EC, coefs: ptr UncheckedArray[BigInt[bits]], points: ptr UncheckedArray[ECaff], - len: int) {.tags:[VarTime, Alloca, HeapAlloc], meter.} = + len: int, + useZeroWindows: bool) {.tags:[VarTime, Alloca, HeapAlloc], meter.} = ## Multiscalar multiplication: ## r <- [a₀]P₀ + [a₁]P₁ + ... + [aₙ₋₁]Pₙ₋₁ - msm_dispatch_vartime(r, coefs, points, len) + msm_dispatch_vartime(r, coefs, points, len, useZeroWindows) func multiScalarMul_vartime*[bits: static int, EC, ECaff]( r: var EC, coefs: openArray[BigInt[bits]], - points: openArray[ECaff]) {.tags:[VarTime, Alloca, HeapAlloc], meter.} = + points: openArray[ECaff], + useZeroWindows: bool) {.tags:[VarTime, Alloca, HeapAlloc], meter.} = ## Multiscalar multiplication: ## r <- [a₀]P₀ + [a₁]P₁ + ... + [aₙ₋₁]Pₙ₋₁ debug: doAssert coefs.len == points.len let N = points.len - msm_dispatch_vartime(r, coefs.asUnchecked(), points.asUnchecked(), N) + msm_dispatch_vartime(r, coefs.asUnchecked(), points.asUnchecked(), N, useZeroWindows) func multiScalarMul_vartime*[F, EC, ECaff]( r: var EC, coefs: ptr UncheckedArray[F], points: ptr UncheckedArray[ECaff], - len: int) {.tags:[VarTime, Alloca, HeapAlloc], meter.} = + len: int, + useZeroWindows: bool) {.tags:[VarTime, Alloca, HeapAlloc], meter.} = ## Multiscalar multiplication: ## r <- [a₀]P₀ + [a₁]P₁ + ... + [aₙ₋₁]Pₙ₋₁ let n = cast[int](len) let coefs_big = allocHeapArrayAligned(F.getBigInt(), n, alignment = 64) coefs_big.batchFromField(coefs, n) - r.multiScalarMul_vartime(coefs_big, points, n) + r.multiScalarMul_vartime(coefs_big, points, n, useZeroWindows) freeHeapAligned(coefs_big) func multiScalarMul_vartime*[EC, ECaff]( r: var EC, coefs: openArray[Fr], - points: openArray[ECaff]) {.tags:[VarTime, Alloca, HeapAlloc], inline.} = + points: openArray[ECaff], + useZeroWindows: bool) {.tags:[VarTime, Alloca, HeapAlloc], meter.} = ## Multiscalar multiplication: ## r <- [a₀]P₀ + [a₁]P₁ + ... + [aₙ₋₁]Pₙ₋₁ debug: doAssert coefs.len == points.len let N = points.len - multiScalarMul_vartime(r, coefs.asUnchecked(), points.asUnchecked(), N) + multiScalarMul_vartime(r, coefs.asUnchecked(), points.asUnchecked(), N, useZeroWindows) diff --git a/tests/math_elliptic_curves/t_ec_msm_zero_windows_sanity.nim b/tests/math_elliptic_curves/t_ec_msm_zero_windows_sanity.nim new file mode 100644 index 000000000..08c4a216e --- /dev/null +++ b/tests/math_elliptic_curves/t_ec_msm_zero_windows_sanity.nim @@ -0,0 +1,25 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +import + # Internals + constantine/named/algebras, + constantine/math/ec_shortweierstrass, + constantine/math/arithmetic, + # Test utilities + ./t_ec_template + +run_EC_multi_scalar_mul_zero_windows_sanity( + ec = EC_ShortW_Jac[Fp[BN254_Snarks], G1], + moduleName = "test_msm_zero_windows_sanity" & $BN254_Snarks + ) + +run_EC_multi_scalar_mul_zero_windows_sanity( + ec = EC_ShortW_Jac[Fp[BLS12_381], G1], + moduleName = "test_msm_zero_windows_sanity" & $BLS12_381 + ) diff --git a/tests/math_elliptic_curves/t_ec_shortw_jac_g1_msm_zero_windows_comparison.nim b/tests/math_elliptic_curves/t_ec_shortw_jac_g1_msm_zero_windows_comparison.nim new file mode 100644 index 000000000..c7dfadd70 --- /dev/null +++ b/tests/math_elliptic_curves/t_ec_shortw_jac_g1_msm_zero_windows_comparison.nim @@ -0,0 +1,29 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +import + # Internals + constantine/named/algebras, + constantine/math/ec_shortweierstrass, + constantine/math/arithmetic, + # Test utilities + ./t_ec_template + +const numPoints = [1, 2, 8, 16, 32, 64, 128, 1024, 2048, 4096] # 32768, 262144, 1048576] + +run_EC_multi_scalar_mul_different_zero_windows( + ec = EC_ShortW_Jac[Fp[BN254_Snarks], G1], + numPoints = numPoints, + moduleName = "test_ec_shortweierstrass_jacobian_multi_scalar_mul_different_zero_windows_" & $BN254_Snarks + ) + +run_EC_multi_scalar_mul_different_zero_windows( + ec = EC_ShortW_Jac[Fp[BLS12_381], G1], + numPoints = numPoints, + moduleName = "test_ec_shortweierstrass_jacobian_multi_scalar_mul_different_zero_windows_" & $BLS12_381 + ) diff --git a/tests/math_elliptic_curves/t_ec_template.nim b/tests/math_elliptic_curves/t_ec_template.nim index 2e003f3be..cbafdd4a3 100644 --- a/tests/math_elliptic_curves/t_ec_template.nim +++ b/tests/math_elliptic_curves/t_ec_template.nim @@ -14,7 +14,7 @@ import # Standard library - std/[unittest, times], + std/[unittest, times, strformat], # Internals constantine/platforms/abstractions, constantine/named/algebras, @@ -94,6 +94,21 @@ func random_point*(rng: var RngState, EC: typedesc, randZ: bool, gen: RandomGen) else: result = rng.random_long01Seq_with_randZ(EC) +from std / math import divmod +proc random_coefficient*[N: static int](rng: var RngState, maxBit: int = 0): BigInt[N] = + ## Initializes a random BigInt[N] with `maxBit` as the most significant bit + ## of it. + ## If `maxBit` is set to zero, the coefficient will utilize all bits. + const WordSize = 64 + let toShift = result.limbs.len * WordSize - maxBit + let (d, r) = divmod(toShift, WordSize) # how many limbs to zero & how many bits in next limb + result = rng.random_unsafe(BigInt[N]) + if maxBit == 0 or maxBit >= N: return # use all bits + let limbs = result.limbs.len + for i in countdown(limbs-1, limbs - d): + result.limbs[i] = SecretWord(0'u64) # zero most significant limbs + result.shiftRight(r) # shift right by remaining required + proc run_EC_addition_tests*( ec: typedesc, Iters: static int, @@ -1390,3 +1405,103 @@ proc run_EC_multi_scalar_mul_impl*[N: static int]( test(ec, gen = Uniform) test(ec, gen = HighHammingWeight) test(ec, gen = Long01Sequence) + +proc run_EC_multi_scalar_mul_zero_windows_sanity*( + ec: typedesc, + moduleName: string) = + # Random seed for reproducibility + var rng: RngState + let seed = uint32(getTime().toUnix() and (1'i64 shl 32 - 1)) # unixTime mod 2^32 + rng.seed(seed) + echo "\n------------------------------------------------------\n" + echo moduleName, " xoshiro512** seed: ", seed + + const testSuiteDesc = "Elliptic curve multi-scalar-multiplication" + const NumPoints = 4096 + + suite "Scalar coefficients with specific number of zero bits": + const maxBits = [1, 8, 16, 32, 64, 128, 256, 512] # how many bits are set in the coefficients + for bits in maxBits: + test "Zero bits " & $bits: + proc test(EC: typedesc) = + type T = BigInt[EC.getScalarField().bits()] + var coefs = newSeq[T](NumPoints) + for i in 0 ..< NumPoints: + coefs[i] = random_coefficient[EC.getScalarField().bits()](rng, bits) + # verify number of bits needed + let msb = determineBitsSet(cast[ptr UncheckedArray[T]](coefs[0].addr), NumPoints) + ## `determineBitsSet` returns the highest set bit in the list of coefficients + 1 + ## and the size of the BigInt if the target size is larger than the BigInt. + check msb == min(bits, bEC.getScalarField().bits()) + test(ec) + +import ../../benchmarks/bench_msm_impl_optional_drop_windows +proc run_EC_multi_scalar_mul_different_zero_windows*[N: static int]( + ec: typedesc, + numPoints: array[N, int], + moduleName: string) = + # Random seed for reproducibility + var rng: RngState + let seed = uint32(getTime().toUnix() and (1'i64 shl 32 - 1)) # unixTime mod 2^32 + rng.seed(seed) + echo "\n------------------------------------------------------\n" + echo moduleName, " xoshiro512** seed: ", seed + + const testSuiteDesc = "Elliptic curve multi-scalar-multiplication with all windows compared to dropping MSB zero windows" + + suite &"{testSuiteDesc} - {$ec} - [{WordBitWidth}-bit mode]": + const maxBits = [1, 8, 16, 32, 64, 128, 256, 512] # how many bits are set in the coefficients + # first check a selection of bits with different number of points + for bits in maxBits: + for n in numPoints: + let bucketBits = bestBucketBitSize(n, ec.getScalarField().bits(), useSignedBuckets = false, useManualTuning = false) + test &"{$ec} Multi-scalar-mul (N={n}, bucket bits: {bucketBits}, maxBitsSet={bits})": + proc test(EC: typedesc) = + var points = newSeq[affine(EC)](n) + var coefs = newSeq[BigInt[EC.getScalarField().bits()]](n) + + for i in 0 ..< n: + var tmp = rng.random_unsafe(EC) + tmp.clearCofactor() + points[i].affine(tmp) + coefs[i] = random_coefficient[EC.getScalarField().bits()](rng, bits) + + var refAll, refWo, optAll, optWo: EC + refAll.multiScalarMul_reference_vartime(coefs, points, useZeroWindows = true) + refWo.multiScalarMul_reference_vartime(coefs, points, useZeroWindows = false) + optAll.multiScalarMul_vartime(coefs, points, useZeroWindows = true) + optWo.multiScalarMul_vartime(coefs, points, useZeroWindows = false) + + doAssert bool(refAll == refWo) + doAssert bool(optAll == optWo) + doAssert bool(refWo == optWo) + + test(ec) + + # now check each possible bit explicitly + # Note: this uses a fixed bucket size, but given the above, that should be fine + for bits in 0 ..< ec.getScalarField().bits(): + const n = 32 + let bucketBits = bestBucketBitSize(n, ec.getScalarField().bits(), useSignedBuckets = false, useManualTuning = false) + test &"{$ec} Multi-scalar-mul (N={n}, bucket bits: {bucketBits}, maxBitsSet={bits})": + proc test(EC: typedesc) = + var points = newSeq[affine(EC)](n) + var coefs = newSeq[BigInt[EC.getScalarField().bits()]](n) + + for i in 0 ..< n: + var tmp = rng.random_unsafe(EC) + tmp.clearCofactor() + points[i].affine(tmp) + coefs[i] = random_coefficient[EC.getScalarField().bits()](rng, bits) + + var refAll, refWo, optAll, optWo: EC + refAll.multiScalarMul_reference_vartime(coefs, points, useZeroWindows = true) + refWo.multiScalarMul_reference_vartime(coefs, points, useZeroWindows = false) + optAll.multiScalarMul_vartime(coefs, points, useZeroWindows = true) + optWo.multiScalarMul_vartime(coefs, points, useZeroWindows = false) + + doAssert bool(refAll == refWo) + doAssert bool(optAll == optWo) + doAssert bool(refWo == optWo) + + test(ec)