diff --git a/benchmarks/bench_ec_msm_coeffs_with_zeroes.nim b/benchmarks/bench_ec_msm_coeffs_with_zeroes.nim
new file mode 100644
index 000000000..e3c160fa6
--- /dev/null
+++ b/benchmarks/bench_ec_msm_coeffs_with_zeroes.nim
@@ -0,0 +1,128 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import
+  # Internals
+  constantine/threadpool,
+  constantine/named/[algebras, zoo_subgroups],
+  constantine/math/arithmetic,
+  constantine/math/ec_shortweierstrass,
+  # Helpers
+  helpers/prng_unsafe,
+  ./bench_elliptic_parallel_template
+#  ./bench_msm_impl_optional_drop_windows
+
+# ############################################################
+#
+#               Benchmark of the G1 group of
+#            Short Weierstrass elliptic curves
+#          in (homogeneous) projective coordinates
+#
+# ############################################################
+
+type
+  BenchTimes = tuple[numInputs: int, bits: int, bAll, bWo, oAll, oWo: float]
+
+proc msmBench*[EC](ctx: var BenchMsmContext[EC], numInputs: int, iters: int, bits: int): BenchTimes =
+  const bigIntBits = EC.getScalarField().bits()
+  type ECaff = affine(EC)
+
+  template coefs: untyped = ctx.coefs.toOpenArray(0, numInputs-1)
+  template points: untyped = ctx.points.toOpenArray(0, numInputs-1)
+
+  template benchIt(body: untyped): untyped =
+    block:
+      var useZeroWindows {.inject.} = true
+      let startAll = getMonotime()
+      block:
+        body
+      let stopAll = getMonoTime()
+      useZeroWindows = false
+      let startWo = getMonoTime()
+      block:
+        body
+      let stopWo = getMonotime()
+      (all: float inNanoseconds(stopAll - startAll),
+       wo:  float inNanoseconds(stopWo  - startWo))
+
+  var r{.noInit.}: EC
+  var startNaive, stopNaive, startbaseline, stopbaseline, startopt, stopopt, startpara, stoppara: MonoTime
+
+  let (bAll, bWo) = benchIt:
+    bench(&"EC multi-scalar-mul baseline  {align($numInputs, 10)} ({bigIntBits}-bit coefs, points), nonZeroBits = {bits}, useZeroWindows = {useZeroWindows}", EC, iters):
+      r.multiScalarMul_reference_vartime(coefs, points, useZeroWindows)
+  let (oAll, oWo) = benchIt:
+    bench(&"EC multi-scalar-mul optimized {align($numInputs, 10)} ({bigIntBits}-bit coefs, points), nonZeroBits = {bits}, useZeroWindows = {useZeroWindows}", EC, iters):
+      r.multiScalarMul_vartime(coefs, points, useZeroWindows)
+
+  let pbAll = bAll / iters.float
+  let pbWo  = bWo / iters.float
+  let poAll = oAll / iters.float
+  let poWo  = oWo / iters.float
+
+  echo &"total time baseline  (useZeroWindows = true)  = {bAll / 1e9} s"
+  echo &"total time baseline  (useZeroWindows = false) = {bWo  / 1e9} s"
+  echo &"total time optimized (useZeroWindows = true)  = {oAll / 1e9} s"
+  echo &"total time optimized (useZeroWindows = false) = {oWo  / 1e9} s"
+
+  echo &"Speedup ratio baseline with & without all windows:         {pbAll / pbWo:>6.3f}x"
+  echo &"Speedup ratio optimized with & without all windows:        {poAll / poWo:>6.3f}x"
+  echo &"Speedup ratio optimized over baseline with all windows:    {pbAll / poAll:>6.3f}x"
+  echo &"Speedup ratio optimized over baseline without all windows: {pbWo  / poWo:>6.3f}x"
+
+  result = (numInputs: numInputs, bits: bits, bAll: bAll, bWo: bWo, oAll: oAll, oWo: oWo)
+
+const Iters = 10_000
+const AvailableCurves = [
+  BLS12_381,
+]
+
+const testNumPoints = [2, 8, 64, 1024, 4096, 65536, 1048576] #, 4194304, 8388608, 16777216]
+
+template canImport(x: untyped): bool =
+  compiles:
+    import x
+
+when canImport(ggplotnim):
+  import ggplotnim
+else:
+  {.error: "This benchmarks requires `ggplotnim` to produce a plot of the benchmark results.".}
+proc main() =
+  separator()
+  staticFor i, 0, AvailableCurves.len:
+    const curve = AvailableCurves[i]
+    const maxBits = [1, 32, 128, 512] # [1, 8, 16, 32, 64, 128, 256, 512] # how many bits are set in the coefficients
+    var df = newDataFrame()
+    for bits in maxBits:
+      var ctx = createBenchMsmContext(EC_ShortW_Jac[Fp[curve], G1], testNumPoints, bits)
+      separator()
+      for numPoints in testNumPoints:
+        let batchIters = max(1, Iters div numPoints)
+        df.add ctx.msmBench(numPoints, batchIters, bits)
+        separator()
+      separator()
+      echo "\n\n\n"
+      separator()
+    separator()
+
+    df = df.gather(["bAll", "bWo", "oAll", "oWo"], "Bench", "Time")
+      .mutate(f{"Time" ~ `Time` * 1e-9})
+    df.writeCsv("/tmp/data.csv")
+    ggplot(df, aes("numInputs", "Time", shape = "Bench", color = "bits")) +
+      geom_point() +
+      scale_x_continuous() +
+      scale_x_log2(breaks = @testNumPoints) + scale_y_log10() +
+      xlab("Number of inputs of the MSM") + ylab("Time [s]") +
+      ggtitle("bits = number of bits set in coefficients") +
+      margin(right = 4) +
+      xMargin(0.05) +
+      theme_scale(1.2) +
+      ggsave("plots/bench_result.pdf")
+
+main()
+notes()
diff --git a/benchmarks/bench_elliptic_parallel_template.nim b/benchmarks/bench_elliptic_parallel_template.nim
index 647a9f81b..19ccac6d6 100644
--- a/benchmarks/bench_elliptic_parallel_template.nim
+++ b/benchmarks/bench_elliptic_parallel_template.nim
@@ -29,6 +29,22 @@ import
 
 export bench_elliptic_template
 
+from std / math import divmod
+proc random_coefficient*[N: static int](rng: var RngState, maxBit: int = 0): BigInt[N] =
+  ## Initializes a random BigInt[N] with `maxBit` as the most significant bit
+  ## of it.
+  ## If `maxBit` is set to zero, the coefficient will utilize all bits.
+  const WordSize = 64
+  let toShift = result.limbs.len * WordSize - maxBit
+  let (d, r) = divmod(toShift, WordSize) # how many limbs to zero & how many bits in next limb
+  result = rng.random_unsafe(BigInt[N])
+  if maxBit == 0 or maxBit >= N: return # use all bits
+  let limbs = result.limbs.len
+  for i in countdown(limbs-1, limbs - d):
+    result.limbs[i] = SecretWord(0'u64)  # zero most significant limbs
+  result.shiftRight(r)                   # shift right by remaining required
+
+
 # ############################################################
 #
 #             Parallel Benchmark definitions
@@ -55,11 +71,14 @@ proc multiAddParallelBench*(EC: typedesc, numInputs: int, iters: int) =
 
 type BenchMsmContext*[EC] = object
   tp: Threadpool
-  numInputs: int
-  coefs: seq[getBigInt(EC.getName(), kScalarField)]
-  points: seq[affine(EC)]
-
-proc createBenchMsmContext*(EC: typedesc, inputSizes: openArray[int]): BenchMsmContext[EC] =
+  numInputs*: int
+  coefs*: seq[getBigInt(EC.getName(), kScalarField)]
+  points*: seq[affine(EC)]
+
+proc createBenchMsmContext*(EC: typedesc, inputSizes: openArray[int],
+                            maxBit = 0): BenchMsmContext[EC] =
+  ## `maxBit` sets the maximum bit set in the coefficients that are randomly sampled.
+  ## Useful to benchmark MSM with many leading zeroes.
   result.tp = Threadpool.new()
   let maxNumInputs = inputSizes.max()
 
@@ -70,7 +89,9 @@ proc createBenchMsmContext*(EC: typedesc, inputSizes: openArray[int]): BenchMsmC
   result.points = newSeq[ECaff](maxNumInputs)
   result.coefs = newSeq[BigInt[bits]](maxNumInputs)
 
-  proc genCoefPointPairsChunk[EC, ECaff](rngSeed: uint64, start, len: int, points: ptr ECaff, coefs: ptr BigInt[bits]) {.nimcall.} =
+  proc genCoefPointPairsChunk[EC, ECaff](rngSeed: uint64, start, len: int,
+                                         points: ptr ECaff,
+                                         coefs: ptr BigInt[bits], maxBit: int) {.nimcall.} =
     let points = cast[ptr UncheckedArray[ECaff]](points)
     let coefs = cast[ptr UncheckedArray[BigInt[bits]]](coefs)
 
@@ -82,7 +103,7 @@ proc createBenchMsmContext*(EC: typedesc, inputSizes: openArray[int]): BenchMsmC
       var tmp = threadRng.random_unsafe(EC)
       tmp.clearCofactor()
       points[i].affine(tmp)
-      coefs[i] = threadRng.random_unsafe(BigInt[bits])
+      coefs[i] = random_coefficient[bits](threadRng, maxBit)
 
   let chunks = balancedChunksPrioNumber(0, maxNumInputs, result.tp.numThreads)
 
@@ -94,7 +115,10 @@ proc createBenchMsmContext*(EC: typedesc, inputSizes: openArray[int]): BenchMsmC
 
   syncScope:
     for (id, start, size) in items(chunks):
-      result.tp.spawn genCoefPointPairsChunk[EC, ECaff](rng.next(), start, size, result.points[0].addr, result.coefs[0].addr)
+      result.tp.spawn genCoefPointPairsChunk[EC, ECaff](
+        rng.next(), start, size,
+        result.points[0].addr, result.coefs[0].addr, maxBit
+      )
 
   # Even if child threads are sleeping, it seems like perf is lower when there are threads around
   # maybe because the kernel has more overhead or time quantum to keep track off so shut them down.
diff --git a/constantine/math/elliptic/ec_multi_scalar_mul.nim b/constantine/math/elliptic/ec_multi_scalar_mul.nim
index 25c1fc754..bf40584e5 100644
--- a/constantine/math/elliptic/ec_multi_scalar_mul.nim
+++ b/constantine/math/elliptic/ec_multi_scalar_mul.nim
@@ -37,17 +37,51 @@ export bestBucketBitSize, abstractions
 #
 # See the litterature references at the top of `ec_multi_scalar_mul_scheduler.nim`
 
+proc determineBitsSet*[bits: static int](coefs: ptr UncheckedArray[BigInt[bits]], N: int): int =
+  ## Computes the highest bit set in the collection of all MSM coefficients.
+  ## Useful to avoid bucket computation in all most-significant-windows that
+  ## are fully zero in all coefficients.
+  const WordSize = 64
+  var acc: BigInt[bits]
+  let numLimbs = acc.limbs.len
+  # OR all coefficients
+  for i in 0 ..< N:
+    let coef = coefs[i]
+    for j in 0 ..< numLimbs:
+      acc.limbs[j] = acc.limbs[j] or coef.limbs[j]
+
+  # Determine most significant set bit + 1 (only first byte set -> msb == 8)
+  var msb = -1
+  block MsbCheck:
+
+    for i in countdown(numLimbs-1, 0):
+      for j in countdown(WordSize - 1, 0):  # Check each bit
+        let val = (1'u64 shl j)
+        if (acc.limbs[i].uint64 and val) != 0:
+          msb = i * WordSize + j + 1
+          break MsbCheck
+  result = msb
+
 func multiScalarMulImpl_reference_vartime[bits: static int, EC, ECaff](
        r: var EC,
        coefs: ptr UncheckedArray[BigInt[bits]], points: ptr UncheckedArray[ECaff],
-       N: int, c: static int) {.tags:[VarTime, HeapAlloc].} =
+       N: int, c: static int,
+       useZeroWindows: bool) {.tags:[VarTime, HeapAlloc].} =
   ## Inner implementation of MSM, for static dispatch over c, the bucket bit length
   ## This is a straightforward simple translation of BDLO12, section 4
-
   # Prologue
   # --------
+  let numWindows =
+    if not useZeroWindows:
+      var msb = determineBitsSet[bits](coefs, N)
+      if msb < 0: # return early, nothing to do, all empty
+        return
+      # round up to next window of c
+      msb = ceilDiv_vartime(msb, c) #  * c
+      msb
+    else:
+      bits.ceilDiv_vartime(c)
   const numBuckets = 1 shl c - 1 # bucket 0 is unused
-  const numWindows = bits.ceilDiv_vartime(c)
 
   let miniMSMs = allocHeapArray(EC, numWindows)
   let buckets = allocHeapArray(EC, numBuckets)
@@ -88,7 +122,6 @@ func multiScalarMulImpl_reference_vartime[bits: static int, EC, ECaff](
     for _ in 0 ..< c:
       r.double()
     r ~+= miniMSMs[w]
-
   # Cleanup
   # -------
   buckets.freeHeap()
@@ -98,28 +131,29 @@ func multiScalarMul_reference_dispatch_vartime[bits: static int, EC, ECaff](
        r: var EC,
        coefs: ptr UncheckedArray[BigInt[bits]],
        points: ptr UncheckedArray[ECaff],
-       N: int) {.tags:[VarTime, HeapAlloc].} =
+       N: int,
+       useZeroWindows: bool) {.tags:[VarTime, HeapAlloc].} =
   ## Multiscalar multiplication:
   ##   r <- [a₀]P₀ + [a₁]P₁ + ... + [aₙ]Pₙ
   let c = bestBucketBitSize(N, bits, useSignedBuckets = false, useManualTuning = false)
 
   case c
-  of  2: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c =  2)
-  of  3: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c =  3)
-  of  4: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c =  4)
-  of  5: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c =  5)
-  of  6: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c =  6)
-  of  7: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c =  7)
-  of  8: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c =  8)
-  of  9: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c =  9)
-  of 10: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 10)
-  of 11: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 11)
-  of 12: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 12)
-  of 13: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 13)
-  of 14: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 14)
-  of 15: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 15)
-
-  of 16..20: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 16)
+  of  2: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c =  2, useZeroWindows)
+  of  3: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c =  3, useZeroWindows)
+  of  4: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c =  4, useZeroWindows)
+  of  5: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c =  5, useZeroWindows)
+  of  6: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c =  6, useZeroWindows)
+  of  7: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c =  7, useZeroWindows)
+  of  8: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c =  8, useZeroWindows)
+  of  9: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c =  9, useZeroWindows)
+  of 10: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 10, useZeroWindows)
+  of 11: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 11, useZeroWindows)
+  of 12: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 12, useZeroWindows)
+  of 13: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 13, useZeroWindows)
+  of 14: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 14, useZeroWindows)
+  of 15: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 15, useZeroWindows)
+
+  of 16..20: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 16, useZeroWindows)
   else:
     unreachable()
 
@@ -127,41 +161,45 @@ func multiScalarMul_reference_vartime*[bits: static int, EC, ECaff](
        r: var EC,
        coefs: ptr UncheckedArray[BigInt[bits]],
        points: ptr UncheckedArray[ECaff],
-       N: int) {.tags:[VarTime, HeapAlloc].} =
+       N: int,
+       useZeroWindows: bool) {.tags:[VarTime, HeapAlloc].} =
   ## Multiscalar multiplication:
   ##   r <- [a₀]P₀ + [a₁]P₁ + ... + [aₙ]Pₙ
-  multiScalarMul_reference_dispatch_vartime(r, coefs, points, N)
+  multiScalarMul_reference_dispatch_vartime(r, coefs, points, N, useZeroWindows)
 
-func multiScalarMul_reference_vartime*[EC, ECaff](r: var EC, coefs: openArray[BigInt], points: openArray[ECaff]) {.tags:[VarTime, HeapAlloc].} =
+func multiScalarMul_reference_vartime*[EC, ECaff](r: var EC, coefs: openArray[BigInt], points: openArray[ECaff],
+                                                  useZeroWindows: bool) {.tags:[VarTime, HeapAlloc].} =
   ## Multiscalar multiplication:
   ##   r <- [a₀]P₀ + [a₁]P₁ + ... + [aₙ]Pₙ
   debug: doAssert coefs.len == points.len
   let N = points.len
-  multiScalarMul_reference_dispatch_vartime(r, coefs.asUnchecked(), points.asUnchecked(), N)
+  multiScalarMul_reference_dispatch_vartime(r, coefs.asUnchecked(), points.asUnchecked(), N, useZeroWindows)
 
 func multiScalarMul_reference_vartime*[F, EC, ECaff](
        r: var EC,
        coefs: ptr UncheckedArray[F],
        points: ptr UncheckedArray[ECaff],
-       len: int) {.tags:[VarTime, Alloca, HeapAlloc], meter.} =
+       len: int,
+       useZeroWindows: bool) {.tags:[VarTime, Alloca, HeapAlloc].} =
   ## Multiscalar multiplication:
   ##   r <- [a₀]P₀ + [a₁]P₁ + ... + [aₙ₋₁]Pₙ₋₁
   let n = cast[int](len)
   let coefs_big = allocHeapArrayAligned(F.getBigInt(), n, alignment = 64)
   coefs_big.batchFromField(coefs, n)
-  r.multiScalarMul_reference_vartime(coefs_big, points, n)
+  r.multiScalarMul_reference_vartime(coefs_big, points, n, useZeroWindows)
 
   freeHeapAligned(coefs_big)
 
 func multiScalarMul_reference_vartime*[EC, ECaff](
        r: var EC,
        coefs: openArray[Fr],
-       points: openArray[ECaff]) {.tags:[VarTime, Alloca, HeapAlloc], inline.} =
+       points: openArray[ECaff],
+       useZeroWindows: bool) {.tags:[VarTime, Alloca, HeapAlloc].} =
   ## Multiscalar multiplication:
   ##   r <- [a₀]P₀ + [a₁]P₁ + ... + [aₙ₋₁]Pₙ₋₁
   debug: doAssert coefs.len == points.len
   let N = points.len
-  multiScalarMul_reference_vartime(r, coefs.asUnchecked(), points.asUnchecked(), N)
+  multiScalarMul_reference_vartime(r, coefs.asUnchecked(), points.asUnchecked(), N, useZeroWindows)
 
 # ########################################################### #
 #                                                             #
@@ -256,27 +294,41 @@ func miniMSM[bits: static int, EC, ECaff](
 func msmImpl_vartime[bits: static int, EC, ECaff](
        r: var EC,
        coefs: ptr UncheckedArray[BigInt[bits]], points: ptr UncheckedArray[ECaff],
-       N: int, c: static int) {.tags:[VarTime, HeapAlloc], meter.} =
+       N: int, c: static int,
+       useZeroWindows: bool) {.tags:[VarTime, HeapAlloc], meter.} =
   ## Multiscalar multiplication:
   ##   r <- [a₀]P₀ + [a₁]P₁ + ... + [aₙ]Pₙ
 
   # Setup
   # -----
   const numBuckets = 1 shl (c-1)
-
   let buckets = allocHeapArray(EC, numBuckets)
   for i in 0 ..< numBuckets:
     buckets[i].setNeutral()
 
   # Algorithm
   # ---------
-  const excess = bits mod c
-  const top = bits - excess
+  var excess = bits mod c
+  let top =
+    if not useZeroWindows:
+      var msb = determineBitsSet[bits](coefs, N)
+      if msb < 0: # return early, nothing to do, all empty
+        return
+      # round up to next window of c
+      if msb < bits - excess:
+        msb = ceilDiv_vartime(msb, c) * c
+        excess = 0
+        msb
+      else: # in this case, we don't gain anything by modiying windows
+        bits - excess
+    else:
+      bits - excess
+
   var w = top
   r.setNeutral()
 
-  when top != 0:      # Prologue
-    when excess != 0:
+  if top != 0:      # Prologue
+    if excess != 0:
       r.miniMSM(buckets, w, kTopWindow, c, coefs, points, N)
       w -= c
     else:
@@ -350,10 +402,10 @@ func miniMSM_affine[NumBuckets, QueueLen, EC, ECaff; bits: static int](
 func msmAffineImpl_vartime[bits: static int, EC, ECaff](
        r: var EC,
        coefs: ptr UncheckedArray[BigInt[bits]], points: ptr UncheckedArray[ECaff],
-       N: int, c: static int) {.tags:[VarTime, Alloca, HeapAlloc], meter.} =
+       N: int, c: static int,
+       useZeroWindows: bool) {.tags:[VarTime, Alloca, HeapAlloc], meter.} =
   ## Multiscalar multiplication:
   ##   r <- [a₀]P₀ + [a₁]P₁ + ... + [aₙ]Pₙ
-
   # Setup
   # -----
   const (numBuckets, queueLen) = c.deriveSchedulerConstants()
@@ -363,13 +415,27 @@ func msmAffineImpl_vartime[bits: static int, EC, ECaff](
 
   # Algorithm
   # ---------
-  const excess = bits mod c
-  const top = bits - excess
+  var excess = bits mod c
+  let top =
+    if not useZeroWindows:
+      var msb = determineBitsSet[bits](coefs, N)
+      if msb < 0: # return early, nothing to do, all empty
+        return
+      # round up to next window of c
+      if msb < bits - excess:
+        msb = ceilDiv_vartime(msb, c) * c
+        excess = 0
+        msb
+      else: # in this case, we don't gain anything by modiying windows
+        bits - excess
+    else:
+      bits - excess
+
   var w = top
   r.setNeutral()
 
-  when top != 0:      # Prologue
-    when excess != 0:
+  if top != 0:      # Prologue
+    if excess != 0:
       # The top might use only a few bits, the affine scheduler would likely have significant collisions
       for i in 0 ..< numBuckets:
         sched.buckets.pt[i].setNeutral()
@@ -436,7 +502,8 @@ template withEndo[coefsBits: static int, EC, ECaff](
            r: var EC,
            coefs: ptr UncheckedArray[BigInt[coefsBits]],
            points: ptr UncheckedArray[ECaff],
-           N: int, c: static int) =
+           N: int, c: static int,
+           useZeroWindows: bool) =
   when hasEndomorphismAcceleration(EC.getName()) and
         EndomorphismThreshold <= coefsBits and
         coefsBits <= EC.getScalarField().bits() and
@@ -447,7 +514,7 @@ template withEndo[coefsBits: static int, EC, ECaff](
     let (endoCoefs, endoPoints, endoN) = applyEndomorphism(coefs, points, N)
     # Given that bits and N changed, we are able to use a bigger `c`
     # but it has no significant impact on performance
-    msmProc(r, endoCoefs, endoPoints, endoN, c)
+    msmProc(r, endoCoefs, endoPoints, endoN, c, useZeroWindows)
     freeHeap(endoCoefs)
     freeHeap(endoPoints)
   else:
@@ -459,7 +526,8 @@ template withEndo[coefsBits: static int, EC, ECaff](
 func msm_dispatch_vartime[bits: static int, F, G](
        r: var (EC_ShortW_Jac[F, G] or EC_ShortW_Prj[F, G]),
        coefs: ptr UncheckedArray[BigInt[bits]],
-       points: ptr UncheckedArray[EC_ShortW_Aff[F, G]], N: int) =
+       points: ptr UncheckedArray[EC_ShortW_Aff[F, G]], N: int,
+       useZeroWindows: bool) =
   ## Multiscalar multiplication:
   ##   r <- [a₀]P₀ + [a₁]P₁ + ... + [aₙ]Pₙ
   let c = bestBucketBitSize(N, bits, useSignedBuckets = true, useManualTuning = true)
@@ -469,29 +537,30 @@ func msm_dispatch_vartime[bits: static int, F, G](
   # but it has no significant impact on performance
 
   case c
-  of  2: withEndo(msmImpl_vartime, r, coefs, points, N, c =  2)
-  of  3: withEndo(msmImpl_vartime, r, coefs, points, N, c =  3)
-  of  4: withEndo(msmImpl_vartime, r, coefs, points, N, c =  4)
-  of  5: withEndo(msmImpl_vartime, r, coefs, points, N, c =  5)
-  of  6: withEndo(msmImpl_vartime, r, coefs, points, N, c =  6)
-  of  7: withEndo(msmImpl_vartime, r, coefs, points, N, c =  7)
-  of  8: withEndo(msmImpl_vartime, r, coefs, points, N, c =  8)
-
-  of  9: withEndo(msmAffineImpl_vartime, r, coefs, points, N, c =  9)
-  of 10: withEndo(msmAffineImpl_vartime, r, coefs, points, N, c = 10)
-  of 11: withEndo(msmAffineImpl_vartime, r, coefs, points, N, c = 11)
-  of 12: withEndo(msmAffineImpl_vartime, r, coefs, points, N, c = 12)
-  of 13: withEndo(msmAffineImpl_vartime, r, coefs, points, N, c = 13)
-  of 14: msmAffineImpl_vartime(r, coefs, points, N, c = 14)
-  of 15: msmAffineImpl_vartime(r, coefs, points, N, c = 15)
-
-  of 16..17: msmAffineImpl_vartime(r, coefs, points, N, c = 16)
+  of  2: withEndo(msmImpl_vartime, r, coefs, points, N, c =  2, useZeroWindows)
+  of  3: withEndo(msmImpl_vartime, r, coefs, points, N, c =  3, useZeroWindows)
+  of  4: withEndo(msmImpl_vartime, r, coefs, points, N, c =  4, useZeroWindows)
+  of  5: withEndo(msmImpl_vartime, r, coefs, points, N, c =  5, useZeroWindows)
+  of  6: withEndo(msmImpl_vartime, r, coefs, points, N, c =  6, useZeroWindows)
+  of  7: withEndo(msmImpl_vartime, r, coefs, points, N, c =  7, useZeroWindows)
+  of  8: withEndo(msmImpl_vartime, r, coefs, points, N, c =  8, useZeroWindows)
+
+  of  9: withEndo(msmAffineImpl_vartime, r, coefs, points, N, c =  9, useZeroWindows)
+  of 10: withEndo(msmAffineImpl_vartime, r, coefs, points, N, c = 10, useZeroWindows)
+  of 11: withEndo(msmAffineImpl_vartime, r, coefs, points, N, c = 11, useZeroWindows)
+  of 12: withEndo(msmAffineImpl_vartime, r, coefs, points, N, c = 12, useZeroWindows)
+  of 13: withEndo(msmAffineImpl_vartime, r, coefs, points, N, c = 13, useZeroWindows)
+  of 14: msmAffineImpl_vartime(r, coefs, points, N, c = 14, useZeroWindows)
+  of 15: msmAffineImpl_vartime(r, coefs, points, N, c = 15, useZeroWindows)
+
+  of 16..17: msmAffineImpl_vartime(r, coefs, points, N, c = 16, useZeroWindows)
   else:
     unreachable()
 
 func msm_dispatch_vartime[bits: static int, F](
        r: var EC_TwEdw_Prj[F], coefs: ptr UncheckedArray[BigInt[bits]],
-       points: ptr UncheckedArray[EC_TwEdw_Aff[F]], N: int) =
+       points: ptr UncheckedArray[EC_TwEdw_Aff[F]], N: int,
+       useZeroWindows: bool) =
   ## Multiscalar multiplication:
   ##   r <- [a₀]P₀ + [a₁]P₁ + ... + [aₙ]Pₙ
 
@@ -503,22 +572,22 @@ func msm_dispatch_vartime[bits: static int, F](
   # but it has no significant impact on performance
 
   case c
-  of  2: withEndo(msmImpl_vartime, r, coefs, points, N, c =  2)
-  of  3: withEndo(msmImpl_vartime, r, coefs, points, N, c =  3)
-  of  4: withEndo(msmImpl_vartime, r, coefs, points, N, c =  4)
-  of  5: withEndo(msmImpl_vartime, r, coefs, points, N, c =  5)
-  of  6: withEndo(msmImpl_vartime, r, coefs, points, N, c =  6)
-  of  7: withEndo(msmImpl_vartime, r, coefs, points, N, c =  7)
-  of  8: withEndo(msmImpl_vartime, r, coefs, points, N, c =  8)
-  of  9: withEndo(msmImpl_vartime, r, coefs, points, N, c =  9)
-  of 10: withEndo(msmImpl_vartime, r, coefs, points, N, c = 10)
-  of 11: withEndo(msmImpl_vartime, r, coefs, points, N, c = 11)
-  of 12: withEndo(msmImpl_vartime, r, coefs, points, N, c = 12)
-  of 13: withEndo(msmImpl_vartime, r, coefs, points, N, c = 13)
-  of 14: msmImpl_vartime(r, coefs, points, N, c = 14)
-  of 15: msmImpl_vartime(r, coefs, points, N, c = 15)
-
-  of 16..17: msmImpl_vartime(r, coefs, points, N, c = 16)
+  of  2: withEndo(msmImpl_vartime, r, coefs, points, N, c =  2, useZeroWindows)
+  of  3: withEndo(msmImpl_vartime, r, coefs, points, N, c =  3, useZeroWindows)
+  of  4: withEndo(msmImpl_vartime, r, coefs, points, N, c =  4, useZeroWindows)
+  of  5: withEndo(msmImpl_vartime, r, coefs, points, N, c =  5, useZeroWindows)
+  of  6: withEndo(msmImpl_vartime, r, coefs, points, N, c =  6, useZeroWindows)
+  of  7: withEndo(msmImpl_vartime, r, coefs, points, N, c =  7, useZeroWindows)
+  of  8: withEndo(msmImpl_vartime, r, coefs, points, N, c =  8, useZeroWindows)
+  of  9: withEndo(msmImpl_vartime, r, coefs, points, N, c =  9, useZeroWindows)
+  of 10: withEndo(msmImpl_vartime, r, coefs, points, N, c = 10, useZeroWindows)
+  of 11: withEndo(msmImpl_vartime, r, coefs, points, N, c = 11, useZeroWindows)
+  of 12: withEndo(msmImpl_vartime, r, coefs, points, N, c = 12, useZeroWindows)
+  of 13: withEndo(msmImpl_vartime, r, coefs, points, N, c = 13, useZeroWindows)
+  of 14: msmImpl_vartime(r, coefs, points, N, c = 14, useZeroWindows)
+  of 15: msmImpl_vartime(r, coefs, points, N, c = 15, useZeroWindows)
+
+  of 16..17: msmImpl_vartime(r, coefs, points, N, c = 16, useZeroWindows)
   else:
     unreachable()
 
@@ -526,43 +595,47 @@ func multiScalarMul_vartime*[bits: static int, EC, ECaff](
        r: var EC,
        coefs: ptr UncheckedArray[BigInt[bits]],
        points: ptr UncheckedArray[ECaff],
-       len: int) {.tags:[VarTime, Alloca, HeapAlloc], meter.} =
+       len: int,
+       useZeroWindows: bool) {.tags:[VarTime, Alloca, HeapAlloc], meter.} =
   ## Multiscalar multiplication:
   ##   r <- [a₀]P₀ + [a₁]P₁ + ... + [aₙ₋₁]Pₙ₋₁
 
-  msm_dispatch_vartime(r, coefs, points, len)
+  msm_dispatch_vartime(r, coefs, points, len, useZeroWindows)
 
 func multiScalarMul_vartime*[bits: static int, EC, ECaff](
        r: var EC,
        coefs: openArray[BigInt[bits]],
-       points: openArray[ECaff]) {.tags:[VarTime, Alloca, HeapAlloc], meter.} =
+       points: openArray[ECaff],
+       useZeroWindows: bool) {.tags:[VarTime, Alloca, HeapAlloc], meter.} =
   ## Multiscalar multiplication:
   ##   r <- [a₀]P₀ + [a₁]P₁ + ... + [aₙ₋₁]Pₙ₋₁
   debug: doAssert coefs.len == points.len
   let N = points.len
-  msm_dispatch_vartime(r, coefs.asUnchecked(), points.asUnchecked(), N)
+  msm_dispatch_vartime(r, coefs.asUnchecked(), points.asUnchecked(), N, useZeroWindows)
 
 func multiScalarMul_vartime*[F, EC, ECaff](
        r: var EC,
        coefs: ptr UncheckedArray[F],
        points: ptr UncheckedArray[ECaff],
-       len: int) {.tags:[VarTime, Alloca, HeapAlloc], meter.} =
+       len: int,
+       useZeroWindows: bool) {.tags:[VarTime, Alloca, HeapAlloc], meter.} =
   ## Multiscalar multiplication:
   ##   r <- [a₀]P₀ + [a₁]P₁ + ... + [aₙ₋₁]Pₙ₋₁
 
   let n = cast[int](len)
   let coefs_big = allocHeapArrayAligned(F.getBigInt(), n, alignment = 64)
   coefs_big.batchFromField(coefs, n)
-  r.multiScalarMul_vartime(coefs_big, points, n)
+  r.multiScalarMul_vartime(coefs_big, points, n, useZeroWindows)
 
   freeHeapAligned(coefs_big)
 
 func multiScalarMul_vartime*[EC, ECaff](
        r: var EC,
        coefs: openArray[Fr],
-       points: openArray[ECaff]) {.tags:[VarTime, Alloca, HeapAlloc], inline.} =
+       points: openArray[ECaff],
+       useZeroWindows: bool) {.tags:[VarTime, Alloca, HeapAlloc], meter.} =
   ## Multiscalar multiplication:
   ##   r <- [a₀]P₀ + [a₁]P₁ + ... + [aₙ₋₁]Pₙ₋₁
   debug: doAssert coefs.len == points.len
   let N = points.len
-  multiScalarMul_vartime(r, coefs.asUnchecked(), points.asUnchecked(), N)
+  multiScalarMul_vartime(r, coefs.asUnchecked(), points.asUnchecked(), N, useZeroWindows)
diff --git a/tests/math_elliptic_curves/t_ec_msm_zero_windows_sanity.nim b/tests/math_elliptic_curves/t_ec_msm_zero_windows_sanity.nim
new file mode 100644
index 000000000..08c4a216e
--- /dev/null
+++ b/tests/math_elliptic_curves/t_ec_msm_zero_windows_sanity.nim
@@ -0,0 +1,25 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import
+  # Internals
+  constantine/named/algebras,
+  constantine/math/ec_shortweierstrass,
+  constantine/math/arithmetic,
+  # Test utilities
+  ./t_ec_template
+
+run_EC_multi_scalar_mul_zero_windows_sanity(
+    ec = EC_ShortW_Jac[Fp[BN254_Snarks], G1],
+    moduleName = "test_msm_zero_windows_sanity" & $BN254_Snarks
+  )
+
+run_EC_multi_scalar_mul_zero_windows_sanity(
+    ec = EC_ShortW_Jac[Fp[BLS12_381], G1],
+    moduleName = "test_msm_zero_windows_sanity" & $BLS12_381
+  )
diff --git a/tests/math_elliptic_curves/t_ec_shortw_jac_g1_msm_zero_windows_comparison.nim b/tests/math_elliptic_curves/t_ec_shortw_jac_g1_msm_zero_windows_comparison.nim
new file mode 100644
index 000000000..c7dfadd70
--- /dev/null
+++ b/tests/math_elliptic_curves/t_ec_shortw_jac_g1_msm_zero_windows_comparison.nim
@@ -0,0 +1,29 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import
+  # Internals
+  constantine/named/algebras,
+  constantine/math/ec_shortweierstrass,
+  constantine/math/arithmetic,
+  # Test utilities
+  ./t_ec_template
+
+const numPoints =  [1, 2, 8, 16, 32, 64, 128, 1024, 2048, 4096] # 32768, 262144, 1048576]
+
+run_EC_multi_scalar_mul_different_zero_windows(
+    ec = EC_ShortW_Jac[Fp[BN254_Snarks], G1],
+    numPoints = numPoints,
+    moduleName = "test_ec_shortweierstrass_jacobian_multi_scalar_mul_different_zero_windows_" & $BN254_Snarks
+  )
+
+run_EC_multi_scalar_mul_different_zero_windows(
+    ec = EC_ShortW_Jac[Fp[BLS12_381], G1],
+    numPoints = numPoints,
+    moduleName = "test_ec_shortweierstrass_jacobian_multi_scalar_mul_different_zero_windows_" & $BLS12_381
+  )
diff --git a/tests/math_elliptic_curves/t_ec_template.nim b/tests/math_elliptic_curves/t_ec_template.nim
index 2e003f3be..cbafdd4a3 100644
--- a/tests/math_elliptic_curves/t_ec_template.nim
+++ b/tests/math_elliptic_curves/t_ec_template.nim
@@ -14,7 +14,7 @@
 
 import
   # Standard library
-  std/[unittest, times],
+  std/[unittest, times, strformat],
   # Internals
   constantine/platforms/abstractions,
   constantine/named/algebras,
@@ -94,6 +94,21 @@ func random_point*(rng: var RngState, EC: typedesc, randZ: bool, gen: RandomGen)
       else:
         result = rng.random_long01Seq_with_randZ(EC)
 
+from std / math import divmod
+proc random_coefficient*[N: static int](rng: var RngState, maxBit: int = 0): BigInt[N] =
+  ## Initializes a random BigInt[N] with `maxBit` as the most significant bit
+  ## of it.
+  ## If `maxBit` is set to zero, the coefficient will utilize all bits.
+  const WordSize = 64
+  let toShift = result.limbs.len * WordSize - maxBit
+  let (d, r) = divmod(toShift, WordSize) # how many limbs to zero & how many bits in next limb
+  result = rng.random_unsafe(BigInt[N])
+  if maxBit == 0 or maxBit >= N: return # use all bits
+  let limbs = result.limbs.len
+  for i in countdown(limbs-1, limbs - d):
+    result.limbs[i] = SecretWord(0'u64)  # zero most significant limbs
+  result.shiftRight(r)                   # shift right by remaining required
+
 proc run_EC_addition_tests*(
        ec: typedesc,
        Iters: static int,
@@ -1390,3 +1405,103 @@ proc run_EC_multi_scalar_mul_impl*[N: static int](
         test(ec, gen = Uniform)
         test(ec, gen = HighHammingWeight)
         test(ec, gen = Long01Sequence)
+
+proc run_EC_multi_scalar_mul_zero_windows_sanity*(
+       ec: typedesc,
+       moduleName: string) =
+  # Random seed for reproducibility
+  var rng: RngState
+  let seed = uint32(getTime().toUnix() and (1'i64 shl 32 - 1)) # unixTime mod 2^32
+  rng.seed(seed)
+  echo "\n------------------------------------------------------\n"
+  echo moduleName, " xoshiro512** seed: ", seed
+
+  const testSuiteDesc = "Elliptic curve multi-scalar-multiplication"
+  const NumPoints = 4096
+
+  suite "Scalar coefficients with specific number of zero bits":
+    const maxBits = [1, 8, 16, 32, 64, 128, 256, 512] # how many bits are set in the coefficients
+    for bits in maxBits:
+      test "Zero bits " & $bits:
+        proc test(EC: typedesc) =
+          type T = BigInt[EC.getScalarField().bits()]
+          var coefs = newSeq[T](NumPoints)
+          for i in 0 ..< NumPoints:
+            coefs[i] = random_coefficient[EC.getScalarField().bits()](rng, bits)
+          # verify number of bits needed
+          let msb = determineBitsSet(cast[ptr UncheckedArray[T]](coefs[0].addr), NumPoints)
+          ## `determineBitsSet` returns the highest set bit in the list of coefficients + 1
+          ## and the size of the BigInt if the target size is larger than the BigInt.
+          check msb == min(bits, bEC.getScalarField().bits())
+        test(ec)
+
+import ../../benchmarks/bench_msm_impl_optional_drop_windows
+proc run_EC_multi_scalar_mul_different_zero_windows*[N: static int](
+       ec: typedesc,
+       numPoints: array[N, int],
+       moduleName: string) =
+  # Random seed for reproducibility
+  var rng: RngState
+  let seed = uint32(getTime().toUnix() and (1'i64 shl 32 - 1)) # unixTime mod 2^32
+  rng.seed(seed)
+  echo "\n------------------------------------------------------\n"
+  echo moduleName, " xoshiro512** seed: ", seed
+
+  const testSuiteDesc = "Elliptic curve multi-scalar-multiplication with all windows compared to dropping MSB zero windows"
+
+  suite &"{testSuiteDesc} - {$ec} - [{WordBitWidth}-bit mode]":
+    const maxBits = [1, 8, 16, 32, 64, 128, 256, 512] # how many bits are set in the coefficients
+    # first check a selection of bits with different number of points
+    for bits in maxBits:
+      for n in numPoints:
+        let bucketBits = bestBucketBitSize(n, ec.getScalarField().bits(), useSignedBuckets = false, useManualTuning = false)
+        test &"{$ec} Multi-scalar-mul (N={n}, bucket bits: {bucketBits}, maxBitsSet={bits})":
+          proc test(EC: typedesc) =
+            var points = newSeq[affine(EC)](n)
+            var coefs = newSeq[BigInt[EC.getScalarField().bits()]](n)
+
+            for i in 0 ..< n:
+              var tmp = rng.random_unsafe(EC)
+              tmp.clearCofactor()
+              points[i].affine(tmp)
+              coefs[i] = random_coefficient[EC.getScalarField().bits()](rng, bits)
+
+            var refAll, refWo, optAll, optWo: EC
+            refAll.multiScalarMul_reference_vartime(coefs, points, useZeroWindows = true)
+            refWo.multiScalarMul_reference_vartime(coefs, points, useZeroWindows = false)
+            optAll.multiScalarMul_vartime(coefs, points, useZeroWindows = true)
+            optWo.multiScalarMul_vartime(coefs, points, useZeroWindows = false)
+
+            doAssert bool(refAll == refWo)
+            doAssert bool(optAll == optWo)
+            doAssert bool(refWo  == optWo)
+
+          test(ec)
+
+    # now check each possible bit explicitly
+    # Note: this uses a fixed bucket size, but given the above, that should be fine
+    for bits in 0 ..< ec.getScalarField().bits():
+      const n = 32
+      let bucketBits = bestBucketBitSize(n, ec.getScalarField().bits(), useSignedBuckets = false, useManualTuning = false)
+      test &"{$ec} Multi-scalar-mul (N={n}, bucket bits: {bucketBits}, maxBitsSet={bits})":
+        proc test(EC: typedesc) =
+          var points = newSeq[affine(EC)](n)
+          var coefs = newSeq[BigInt[EC.getScalarField().bits()]](n)
+
+          for i in 0 ..< n:
+            var tmp = rng.random_unsafe(EC)
+            tmp.clearCofactor()
+            points[i].affine(tmp)
+            coefs[i] = random_coefficient[EC.getScalarField().bits()](rng, bits)
+
+          var refAll, refWo, optAll, optWo: EC
+          refAll.multiScalarMul_reference_vartime(coefs, points, useZeroWindows = true)
+          refWo.multiScalarMul_reference_vartime(coefs, points, useZeroWindows = false)
+          optAll.multiScalarMul_vartime(coefs, points, useZeroWindows = true)
+          optWo.multiScalarMul_vartime(coefs, points, useZeroWindows = false)
+
+          doAssert bool(refAll == refWo)
+          doAssert bool(optAll == optWo)
+          doAssert bool(refWo  == optWo)
+
+        test(ec)