diff --git a/.gitignore b/.gitignore
index 9847c84..c84633f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,7 +1,61 @@
+*
+
+### Allowed files and directories ###
+
+!.gitignore
+!.github/
+!.github/workflows/
+!.github/workflows/*
+!.github/dependabot.yml
+
+!LICENSE.md
+!README.md
+!Project.toml
+
+!src/
+!src/*.jl
+
+!test/
+!test/*.jl
+
+!docs/
+!docs/src/
+!docs/src/*.md
+!docs/make.jl
+!docs/Project.toml
+
+!benchmarks/
+!benchmarks/*.jl
+!benchmarks/Project.toml
+!benchmarks/README.md
+
+### Denied even if allowed above ###
+
+# Files generated by invoking Julia with --code-coverage
 *.jl.cov
 *.jl.*.cov
+
+# Files generated by invoking Julia with --track-allocation
 *.jl.mem
-Manifest.toml
-docs/build
-docs/site
-docs/Manifest.toml
+
+# System-specific files and directories generated by the BinaryProvider and BinDeps packages
+# They contain absolute paths specific to the host computer, and so should not be committed
+deps/deps.jl
+deps/build.log
+deps/downloads/
+deps/usr/
+deps/src/
+
+# Build artifacts for creating documentation generated by the Documenter package
+docs/build/
+docs/site/
+
+# File generated by Pkg, the package manager, based on a corresponding Project.toml
+# It records a fixed state of all packages used by the project. As such, it should not be
+# committed for packages, but should be committed for applications that require a static
+# environment.
+Manifest*.toml
+
+# File generated by the Preferences package to store local preferences
+LocalPreferences.toml
+JuliaLocalPreferences.toml
\ No newline at end of file
diff --git a/Project.toml b/Project.toml
index 460e0eb..f4d7abb 100644
--- a/Project.toml
+++ b/Project.toml
@@ -5,14 +5,16 @@ version = "0.5.7"
 [deps]
 IntegerMathUtils = "18e54dd8-cb9d-406c-a71d-865a43cbb235"
 
+[compat]
+BitIntegers = "0.3"
+IntegerMathUtils = "0.1.1"
+julia = "1.6"
+
 [extras]
-IntegerMathUtils = "18e54dd8-cb9d-406c-a71d-865a43cbb235"
+BitIntegers = "c3b6d118-76ef-56ca-8cc7-ebb389d030a1"
 DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
+IntegerMathUtils = "18e54dd8-cb9d-406c-a71d-865a43cbb235"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["DataStructures", "IntegerMathUtils", "Test"]
-
-[compat]
-IntegerMathUtils = "0.1.1"
-julia = "1.6"
+test = ["BitIntegers", "DataStructures", "IntegerMathUtils", "Test"]
diff --git a/benchmarks/Project.toml b/benchmarks/Project.toml
new file mode 100644
index 0000000..76683dd
--- /dev/null
+++ b/benchmarks/Project.toml
@@ -0,0 +1,7 @@
+[deps]
+BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
+BitIntegers = "c3b6d118-76ef-56ca-8cc7-ebb389d030a1"
+Primes = "27ebfcd6-29c5-5fa9-bf4b-fb8fc14df3ae"
+
+[sources]
+Primes = {path = ".."}
diff --git a/benchmarks/README.md b/benchmarks/README.md
new file mode 100644
index 0000000..d17802f
--- /dev/null
+++ b/benchmarks/README.md
@@ -0,0 +1,61 @@
+# Benchmarks
+
+Benchmark suite for Primes.jl, comparing GMP-optimised in-place arithmetic against
+the generic (allocating) code path for BigInt operations, and measuring end-to-end
+factorisation performance across different number sizes.
+
+## Prerequisites
+
+```bash
+julia --project=benchmarks -e 'using Pkg; Pkg.instantiate()'
+```
+
+## Running
+
+Run all benchmarks:
+
+```bash
+julia --project=benchmarks benchmarks/run_all.jl
+```
+
+Or run individual suites:
+
+```bash
+# GMP in-place vs generic (allocating) ECM — the core comparison
+julia --project=benchmarks benchmarks/ecm_gmp_vs_generic.jl
+
+# Micro-benchmarks for individual ECM operations
+julia --project=benchmarks benchmarks/ecm_microbenchmarks.jl
+
+# End-to-end factor(n) at various sizes
+julia --project=benchmarks benchmarks/factorization_endtoend.jl
+```
+
+## Benchmark Suites
+
+### `ecm_gmp_vs_generic.jl`
+
+Compares the two ECM code paths for `BigInt` inputs:
+
+| Path | Style | Allocations |
+|------|-------|-------------|
+| `ecm_factor(n::BigInt, ...)` | In-place GMP (`Base.GMP.MPZ.*!`) | Near-zero in hot loop |
+| Generic via `_ecm_scalar_mul` | Functional (new BigInt per op) | O(k) per scalar multiply |
+
+Tests at two scales: a ~40-digit and a ~55-digit semiprime.
+
+### `ecm_microbenchmarks.jl`
+
+Benchmarks individual ECM building blocks across three execution modes:
+
+| Operation | GMP in-place | Generic BigInt | UInt128 (LLVM) |
+|-----------|-------------|----------------|----------------|
+| `_mulmod` | `_mulmod!` | `_mulmod` | `_mulmod` |
+| `_ecm_add` | `_ecm_add!` | `_ecm_add` | `_ecm_add` |
+| `_ecm_double` | `_ecm_double!` | `_ecm_double` | `_ecm_double` |
+| `_ecm_scalar_mul` | `_ecm_scalar_mul!` | `_ecm_scalar_mul` | `_ecm_scalar_mul` |
+
+### `factorization_endtoend.jl`
+
+Benchmarks `factor(n)` for semiprimes ranging from 12 to 45+ digits,
+showing how the polyalgorithm (trial division → Pollard rho → ECM → MPQS) scales.
diff --git a/benchmarks/ecm_gmp_vs_generic.jl b/benchmarks/ecm_gmp_vs_generic.jl
new file mode 100644
index 0000000..bcb4de2
--- /dev/null
+++ b/benchmarks/ecm_gmp_vs_generic.jl
@@ -0,0 +1,116 @@
+# ECM Benchmark: GMP In-Place vs Generic (Allocating) BigInt Path
+#
+# Compares the two ECM code paths for BigInt inputs:
+#   1. BigInt-specialised: uses in-place GMP arithmetic (zero-allocation hot loop)
+#   2. Generic: functional style, allocates new BigInts per operation
+#
+# Usage:
+#   julia --project=benchmarks benchmarks/ecm_gmp_vs_generic.jl
+
+using Primes
+using BenchmarkTools
+
+# Reproduce the generic (allocating) ECM path for BigInt, bypassing the
+# BigInt-specialised method that normally intercepts dispatch.
+function ecm_factor_generic(n::BigInt, B1::Int, num_curves::Int)
+    prime_powers = Primes._ecm_prime_powers(B1)
+    T = BigInt
+    for _ in 1:num_curves
+        curve = Primes._ecm_suyama(n)
+        curve === nothing && continue
+        curve isa Tuple || return curve
+        x0, z0, a24 = curve
+
+        QX, QZ = x0, z0
+        degenerate = false
+        acc = one(T)
+        batch_count = 0
+        for pk in prime_powers
+            QX, QZ = Primes._ecm_scalar_mul(pk, QX, QZ, n, a24)
+            acc = Primes._mulmod(acc, QZ, n)
+            batch_count += 1
+            if batch_count >= 100
+                g = gcd(acc, n)
+                if 1 < g < n
+                    return g
+                end
+                if g == n
+                    degenerate = true
+                    break
+                end
+                acc = one(T)
+                batch_count = 0
+            end
+        end
+        degenerate && continue
+        if batch_count > 0
+            g = gcd(acc, n)
+            1 < g < n && return g
+        end
+    end
+    return nothing
+end
+
+# ---------------------------------------------------------------------------
+# Test cases at different scales
+# ---------------------------------------------------------------------------
+const CASES = [
+    (
+        name  = "small (~40-digit semiprime)",
+        n     = big"824633720831" * big"1000000007",
+        B1    = 2_000,
+        curves = 50,
+    ),
+    (
+        name  = "medium (~55-digit semiprime)",
+        n     = big"780002082420426809" * big"810735269523504809437013569",
+        B1    = 11_000,
+        curves = 200,
+    ),
+]
+
+# ---------------------------------------------------------------------------
+# Run benchmarks
+# ---------------------------------------------------------------------------
+println("=" ^ 70)
+println("  ECM Benchmark: GMP In-Place vs Generic (Allocating) BigInt")
+println("=" ^ 70)
+
+for case in CASES
+    (; name, n, B1, curves) = case
+    println("\n--- $name ---")
+    println("  n      = $n  ($(ndigits(n)) digits)")
+    println("  B1     = $B1")
+    println("  curves = $curves")
+
+    # Warm-up both paths
+    Primes.ecm_factor(n, B1, curves)
+    ecm_factor_generic(n, B1, curves)
+
+    println("\n  [GMP in-place]")
+    b_gmp = @benchmark Primes.ecm_factor($n, $B1, $curves) samples=20 evals=1
+    display(b_gmp)
+
+    println("\n  [Generic (allocating)]")
+    b_gen = @benchmark ecm_factor_generic($n, $B1, $curves) samples=20 evals=1
+    display(b_gen)
+
+    med_gmp = median(b_gmp).time / 1e6
+    med_gen = median(b_gen).time / 1e6
+    alloc_gmp = median(b_gmp).memory / 1024
+    alloc_gen = median(b_gen).memory / 1024
+
+    println("\n  Summary (median):")
+    println("    GMP in-place : $(round(med_gmp; digits=2)) ms,  $(round(alloc_gmp; digits=1)) KiB")
+    println("    Generic alloc: $(round(med_gen; digits=2)) ms,  $(round(alloc_gen; digits=1)) KiB")
+    if med_gmp > 0
+        println("    Speedup      : $(round(med_gmp / med_gen; digits=2))×")
+    end
+    if alloc_gen > 0
+        println("    Memory saved : $(round((1 - alloc_gmp / alloc_gen) * 100; digits=1))%")
+    end
+end
+
+println("\n" * "=" ^ 70)
+println("  Done.")
+println("=" ^ 70)
diff --git a/benchmarks/ecm_microbenchmarks.jl b/benchmarks/ecm_microbenchmarks.jl
new file mode 100644
index 0000000..613a0f4
--- /dev/null
+++ b/benchmarks/ecm_microbenchmarks.jl
@@ -0,0 +1,172 @@
+# ECM Micro-Benchmarks
+#
+# Benchmarks individual ECM building blocks:
+#   - _mulmod / _mulmod! (modular multiplication)
+#   - _ecm_add / _ecm_add! (differential point addition)
+#   - _ecm_double / _ecm_double! (point doubling)
+#   - _ecm_scalar_mul / _ecm_scalar_mul! (Montgomery ladder)
+#
+# Each operation is tested in four modes:
+#   1. BigInt GMP in-place (specialised path)
+#   2. BigInt generic (allocating, functional)
+#   3. UInt128 generic (note: widen(UInt128) == BigInt, so still allocates)
+#   4. UInt256 generic via BitIntegers.jl (widen stays in LLVM fixed-width)
+#
+# Usage:
+#   julia --project=benchmarks benchmarks/ecm_microbenchmarks.jl
+
+using Primes
+using BenchmarkTools
+using BitIntegers
+
+# ---------------------------------------------------------------------------
+# Setup: representative values for a ~40-digit modulus
+# ---------------------------------------------------------------------------
+const N_BIG = big"824633720831" * big"1000000007"
+const A24_BIG = mod(big"123456789012345", N_BIG)
+const PX_BIG  = mod(big"314159265358979323846", N_BIG)
+const PZ_BIG  = mod(big"271828182845904523536", N_BIG)
+const QX_BIG  = mod(big"141421356237309504880", N_BIG)
+const QZ_BIG  = mod(big"173205080756887729352", N_BIG)
+const DX_BIG  = mod(big"223606797749978969640", N_BIG)
+const DZ_BIG  = mod(big"264575131106459059050", N_BIG)
+
+# UInt128 versions (note: widen(UInt128) == BigInt in base Julia)
+const N_U128   = UInt128(824633720831) * UInt128(1000000007)
+const A24_U128 = UInt128(mod(big"123456789012345", big(N_U128)))
+const PX_U128  = UInt128(mod(big"314159265358979323846", big(N_U128)))
+const PZ_U128  = UInt128(mod(big"271828182845904523536", big(N_U128)))
+const QX_U128  = UInt128(mod(big"141421356237309504880", big(N_U128)))
+const QZ_U128  = UInt128(mod(big"173205080756887729352", big(N_U128)))
+const DX_U128  = UInt128(mod(big"223606797749978969640", big(N_U128)))
+const DZ_U128  = UInt128(mod(big"264575131106459059050", big(N_U128)))
+
+# UInt256 versions (widen(UInt256) == UInt512, stays in LLVM fixed-width)
+const N_U256   = UInt256(824633720831) * UInt256(1000000007)
+const A24_U256 = UInt256(mod(big"123456789012345", big(N_U256)))
+const PX_U256  = UInt256(mod(big"314159265358979323846", big(N_U256)))
+const PZ_U256  = UInt256(mod(big"271828182845904523536", big(N_U256)))
+const QX_U256  = UInt256(mod(big"141421356237309504880", big(N_U256)))
+const QZ_U256  = UInt256(mod(big"173205080756887729352", big(N_U256)))
+const DX_U256  = UInt256(mod(big"223606797749978969640", big(N_U256)))
+const DZ_U256  = UInt256(mod(big"264575131106459059050", big(N_U256)))
+
+# GMP buffers
+const BUF = Primes.ECMBuffers()
+const TMP_MUL = BigInt()
+
+println("=" ^ 70)
+println("  ECM Micro-Benchmarks")
+println("=" ^ 70)
+println()
+println("  Note: widen(UInt128) == BigInt   → still allocates (no LLVM benefit)")
+println("        widen(UInt256) == UInt512  → pure LLVM fixed-width arithmetic")
+
+# ---------------------------------------------------------------------------
+# 1. Modular multiplication
+# ---------------------------------------------------------------------------
+println("\n--- _mulmod ---")
+
+print("  BigInt GMP in-place (_mulmod!):   ")
+dst = BigInt()
+b1 = @benchmark Primes._mulmod!($dst, $PX_BIG, $PZ_BIG, $N_BIG, $TMP_MUL)
+println("$(round(median(b1).time; digits=1)) ns,  $(median(b1).memory) bytes")
+
+print("  BigInt generic    (_mulmod):      ")
+b2 = @benchmark Primes._mulmod($PX_BIG, $PZ_BIG, $N_BIG)
+println("$(round(median(b2).time; digits=1)) ns,  $(median(b2).memory) bytes")
+
+print("  UInt128 generic   (_mulmod):      ")
+b3 = @benchmark Primes._mulmod($PX_U128, $PZ_U128, $N_U128)
+println("$(round(median(b3).time; digits=1)) ns,  $(median(b3).memory) bytes")
+
+print("  UInt256 generic   (_mulmod):      ")
+b3b = @benchmark Primes._mulmod($PX_U256, $PZ_U256, $N_U256)
+println("$(round(median(b3b).time; digits=1)) ns,  $(median(b3b).memory) bytes")
+
+# ---------------------------------------------------------------------------
+# 2. Differential point addition
+# ---------------------------------------------------------------------------
+println("\n--- _ecm_add ---")
+
+res_X, res_Z = BigInt(), BigInt()
+
+print("  BigInt GMP in-place (_ecm_add!):  ")
+b4 = @benchmark Primes._ecm_add!($res_X, $res_Z,
+    $PX_BIG, $PZ_BIG, $QX_BIG, $QZ_BIG, $DX_BIG, $DZ_BIG, $N_BIG, $BUF)
+println("$(round(median(b4).time; digits=1)) ns,  $(median(b4).memory) bytes")
+
+print("  BigInt generic    (_ecm_add):     ")
+b5 = @benchmark Primes._ecm_add($PX_BIG, $PZ_BIG, $QX_BIG, $QZ_BIG, $DX_BIG, $DZ_BIG, $N_BIG)
+println("$(round(median(b5).time; digits=1)) ns,  $(median(b5).memory) bytes")
+
+print("  UInt128 generic   (_ecm_add):     ")
+b6 = @benchmark Primes._ecm_add($PX_U128, $PZ_U128, $QX_U128, $QZ_U128, $DX_U128, $DZ_U128, $N_U128)
+println("$(round(median(b6).time; digits=1)) ns,  $(median(b6).memory) bytes")
+
+print("  UInt256 generic   (_ecm_add):     ")
+b6b = @benchmark Primes._ecm_add($PX_U256, $PZ_U256, $QX_U256, $QZ_U256, $DX_U256, $DZ_U256, $N_U256)
+println("$(round(median(b6b).time; digits=1)) ns,  $(median(b6b).memory) bytes")
+
+# ---------------------------------------------------------------------------
+# 3. Point doubling
+# ---------------------------------------------------------------------------
+println("\n--- _ecm_double ---")
+
+print("  BigInt GMP in-place (_ecm_double!): ")
+b7 = @benchmark Primes._ecm_double!($res_X, $res_Z, $PX_BIG, $PZ_BIG, $N_BIG, $A24_BIG, $BUF)
+println("$(round(median(b7).time; digits=1)) ns,  $(median(b7).memory) bytes")
+
+print("  BigInt generic    (_ecm_double):    ")
+b8 = @benchmark Primes._ecm_double($PX_BIG, $PZ_BIG, $N_BIG, $A24_BIG)
+println("$(round(median(b8).time; digits=1)) ns,  $(median(b8).memory) bytes")
+
+print("  UInt128 generic   (_ecm_double):    ")
+b9 = @benchmark Primes._ecm_double($PX_U128, $PZ_U128, $N_U128, $A24_U128)
+println("$(round(median(b9).time; digits=1)) ns,  $(median(b9).memory) bytes")
+
+print("  UInt256 generic   (_ecm_double):    ")
+b9b = @benchmark Primes._ecm_double($PX_U256, $PZ_U256, $N_U256, $A24_U256)
+println("$(round(median(b9b).time; digits=1)) ns,  $(median(b9b).memory) bytes")
+
+# ---------------------------------------------------------------------------
+# 4. Montgomery ladder scalar multiplication
+# ---------------------------------------------------------------------------
+println("\n--- _ecm_scalar_mul (k=997, prime) ---")
+
+const K = 997
+
+print("  BigInt GMP in-place (_ecm_scalar_mul!): ")
+b10 = @benchmark Primes._ecm_scalar_mul!($res_X, $res_Z, $K, $PX_BIG, $PZ_BIG, $N_BIG, $A24_BIG, $BUF)
+println("$(round(median(b10).time / 1e3; digits=2)) μs,  $(median(b10).memory) bytes")
+
+print("  BigInt generic    (_ecm_scalar_mul):    ")
+b11 = @benchmark Primes._ecm_scalar_mul($K, $PX_BIG, $PZ_BIG, $N_BIG, $A24_BIG)
+println("$(round(median(b11).time / 1e3; digits=2)) μs,  $(median(b11).memory) bytes")
+
+print("  UInt128 generic   (_ecm_scalar_mul):    ")
+b12 = @benchmark Primes._ecm_scalar_mul($K, $PX_U128, $PZ_U128, $N_U128, $A24_U128)
+println("$(round(median(b12).time / 1e3; digits=2)) μs,  $(median(b12).memory) bytes")
+
+print("  UInt256 generic   (_ecm_scalar_mul):    ")
+b12b = @benchmark Primes._ecm_scalar_mul($K, $PX_U256, $PZ_U256, $N_U256, $A24_U256)
+println("$(round(median(b12b).time / 1e3; digits=2)) μs,  $(median(b12b).memory) bytes")
+
+# ---------------------------------------------------------------------------
+# Summary table
+# ---------------------------------------------------------------------------
+println("\n" * "=" ^ 70)
+println("  Summary: median time")
+println("  (GMP in-place → Generic BigInt → UInt128 → UInt256)")
+println("=" ^ 70)
+
+function fmt_speedup(base, other)
+    s = base / other
+    "$(round(s; digits=2))×"
+end
+
+println("  _mulmod:         $(round(median(b1).time; digits=0)) ns → $(round(median(b2).time; digits=0)) ns ($(fmt_speedup(median(b1).time, median(b2).time))) → $(round(median(b3).time; digits=0)) ns ($(fmt_speedup(median(b1).time, median(b3).time))) → $(round(median(b3b).time; digits=0)) ns ($(fmt_speedup(median(b1).time, median(b3b).time)))")
+println("  _ecm_add:        $(round(median(b4).time; digits=0)) ns → $(round(median(b5).time; digits=0)) ns ($(fmt_speedup(median(b4).time, median(b5).time))) → $(round(median(b6).time; digits=0)) ns ($(fmt_speedup(median(b4).time, median(b6).time))) → $(round(median(b6b).time; digits=0)) ns ($(fmt_speedup(median(b4).time, median(b6b).time)))")
+println("  _ecm_double:     $(round(median(b7).time; digits=0)) ns → $(round(median(b8).time; digits=0)) ns ($(fmt_speedup(median(b7).time, median(b8).time))) → $(round(median(b9).time; digits=0)) ns ($(fmt_speedup(median(b7).time, median(b9).time))) → $(round(median(b9b).time; digits=0)) ns ($(fmt_speedup(median(b7).time, median(b9b).time)))")
+println("  _ecm_scalar_mul: $(round(median(b10).time/1e3; digits=1)) μs → $(round(median(b11).time/1e3; digits=1)) μs ($(fmt_speedup(median(b10).time, median(b11).time))) → $(round(median(b12).time/1e3; digits=1)) μs ($(fmt_speedup(median(b10).time, median(b12).time))) → $(round(median(b12b).time/1e3; digits=1)) μs ($(fmt_speedup(median(b10).time, median(b12b).time)))")
+println("=" ^ 70)
diff --git a/benchmarks/factorization_endtoend.jl b/benchmarks/factorization_endtoend.jl
new file mode 100644
index 0000000..17887cf
--- /dev/null
+++ b/benchmarks/factorization_endtoend.jl
@@ -0,0 +1,52 @@
+# End-to-End Factorization Benchmarks
+#
+# Benchmarks `factor(n)` for semiprimes of increasing size to show the
+# polyalgorithm's performance across different regimes:
+#   - Small (< 20 digits): trial division + Pollard rho
+#   - Medium (20-40 digits): ECM kicks in
+#   - Large (40-60 digits): ECM + MPQS
+#   - Very large (60+ digits): MPQS-dominated
+#
+# Usage:
+#   julia --project=benchmarks benchmarks/factorization_endtoend.jl
+
+using Primes
+using BenchmarkTools
+
+const CASES = [
+    (name = "~12 digits",  n = big"824633720831"),
+    (name = "~20 digits",  n = big"99999999999999999877" ),  # 20-digit semiprime-like
+    (name = "~30 digits",  n = big"824633720831" * big"1000000007"),
+    (name = "~40 digits",  n = big"2152302898747" * big"99999999999999999877"),
+    (name = "~45 digits",  n = big"780002082420426809" * big"810735269523504809437013569"),
+]
+
+println("=" ^ 70)
+println("  End-to-End factor(n) Benchmarks")
+println("=" ^ 70)
+
+for case in CASES
+    (; name, n) = case
+    println("\n--- $name  ($(ndigits(n)) digits) ---")
+    println("  n = $n")
+
+    # Warm-up and verify correctness
+    f = factor(n)
+    @assert prodfactors(f) == n "Factorization verification failed for $name"
+    println("  factor(n) = $f")
+
+    # Benchmark
+    samples = ndigits(n) > 40 ? 5 : 20
+    b = @benchmark factor($n) samples=samples evals=1
+    display(b)
+
+    med = median(b).time
+    unit = med < 1e6 ? ("$(round(med / 1e3; digits=2)) μs") :
+           med < 1e9 ? ("$(round(med / 1e6; digits=2)) ms") :
+                       ("$(round(med / 1e9; digits=2)) s")
+    println("  Median: $unit, $(round(median(b).memory / 1024; digits=1)) KiB allocated")
+end
+
+println("\n" * "=" ^ 70)
+println("  Done.")
+println("=" ^ 70)
diff --git a/benchmarks/run_all.jl b/benchmarks/run_all.jl
new file mode 100644
index 0000000..90e8c83
--- /dev/null
+++ b/benchmarks/run_all.jl
@@ -0,0 +1,25 @@
+# Run All Benchmarks
+#
+# Usage:
+#   julia --project=benchmarks benchmarks/run_all.jl
+#
+# Or run individual benchmarks:
+#   julia --project=benchmarks benchmarks/ecm_gmp_vs_generic.jl
+#   julia --project=benchmarks benchmarks/ecm_microbenchmarks.jl
+#   julia --project=benchmarks benchmarks/factorization_endtoend.jl
+
+const BENCHDIR = @__DIR__
+
+const SCRIPTS = [
+    "ecm_microbenchmarks.jl",
+    "ecm_gmp_vs_generic.jl",
+    "factorization_endtoend.jl",
+]
+
+for script in SCRIPTS
+    path = joinpath(BENCHDIR, script)
+    println("\n" * "#" ^ 70)
+    println("# Running: $script")
+    println("#" ^ 70 * "\n")
+    include(path)
+end
diff --git a/docs/make.jl b/docs/make.jl
index 73117d9..a0552bd 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -7,7 +7,8 @@ makedocs(
     checkdocs = :none,
     pages = Any[
         "Home" => "index.md",
-        "Functions" => "api.md"
+        "Functions" => "api.md",
+        "Benchmarks" => "benchmarks.md"
     ],
 )
 
diff --git a/docs/src/benchmarks.md b/docs/src/benchmarks.md
new file mode 100644
index 0000000..cd8fc34
--- /dev/null
+++ b/docs/src/benchmarks.md
@@ -0,0 +1,61 @@
+# Benchmarks
+
+Primes.jl ships a benchmark suite in `benchmarks/` for measuring the performance of
+its factorisation algorithms, and comparing the GMP-optimised BigInt code path against
+the generic (allocating) implementation.
+
+## Running Benchmarks
+
+Install dependencies once:
+
+```bash
+julia --project=benchmarks -e 'using Pkg; Pkg.instantiate()'
+```
+
+Run all benchmarks:
+
+```bash
+julia --project=benchmarks benchmarks/run_all.jl
+```
+
+Or run individual suites:
+
+```bash
+julia --project=benchmarks benchmarks/ecm_gmp_vs_generic.jl
+julia --project=benchmarks benchmarks/ecm_microbenchmarks.jl
+julia --project=benchmarks benchmarks/factorization_endtoend.jl
+```
+
+## Benchmark Suites
+
+### ECM: GMP In-Place vs Generic
+
+`ecm_gmp_vs_generic.jl` compares the two ECM code paths for `BigInt` inputs:
+
+- **GMP in-place** (`ecm_factor(n::BigInt, ...)`): Uses `Base.GMP.MPZ.*!` functions for
+  zero-allocation arithmetic in the Montgomery ladder hot loop.
+- **Generic** (via `_ecm_scalar_mul`): Functional style that allocates a new `BigInt` for
+  every modular operation.
+
+### ECM Micro-Benchmarks
+
+`ecm_microbenchmarks.jl` measures individual ECM building blocks across four execution
+modes:
+
+| Operation | GMP in-place | Generic BigInt | UInt128 | UInt256 (LLVM) |
+|-----------|-------------|----------------|---------|----------------|
+| Modular multiply | `_mulmod!` | `_mulmod` | `_mulmod` | `_mulmod` |
+| Point addition | `_ecm_add!` | `_ecm_add` | `_ecm_add` | `_ecm_add` |
+| Point doubling | `_ecm_double!` | `_ecm_double` | `_ecm_double` | `_ecm_double` |
+| Scalar multiply | `_ecm_scalar_mul!` | `_ecm_scalar_mul` | `_ecm_scalar_mul` | `_ecm_scalar_mul` |
+
+!!! note
+    `widen(UInt128) == BigInt` in base Julia, so UInt128 still allocates through BigInt.
+    `widen(UInt256) == UInt512` via BitIntegers.jl, giving pure LLVM fixed-width arithmetic
+    with zero allocations — but GMP's hand-tuned assembly is faster per-operation.
+
+### End-to-End Factorisation
+
+`factorization_endtoend.jl` benchmarks `factor(n)` for semiprimes ranging from 12 to 45+
+digits, showing how the polyalgorithm (trial division → Pollard rho → ECM → MPQS) scales
+with input size.
diff --git a/src/Primes.jl b/src/Primes.jl
index b95fdac..9880b89 100644
--- a/src/Primes.jl
+++ b/src/Primes.jl
@@ -12,6 +12,7 @@ export isprime, primes, primesmask, factor, eachfactor, divisors, ismersenneprim
        nextprime, nextprimes, prevprime, prevprimes, prime, prodfactors, radical, totient
 
 include("factorization.jl")
+include("ecm.jl")
 
 # Primes generating functions
 #     https://en.wikipedia.org/wiki/Sieve_of_Eratosthenes
diff --git a/src/ecm.jl b/src/ecm.jl
new file mode 100644
index 0000000..0aed909
--- /dev/null
+++ b/src/ecm.jl
@@ -0,0 +1,585 @@
+# Elliptic Curve Method (ECM) for integer factorization
+# Ref: Lenstra (1987) "Factoring integers with elliptic curves"
+# Ref: Montgomery (1987) "Speeding the Pollard and Elliptic Curve Methods of Factorization"
+
+# =============================================================================
+# Generic modular arithmetic helpers (work for any T<:Integer, including
+# fixed-width types from BitIntegers.jl such as UInt256, UInt512, etc.)
+# Uses widen(T) for overflow-safe multiplication; for BitIntegers types,
+# widen returns the next fixed-width type (e.g. widen(UInt256) == UInt512),
+# so no BigInt allocation occurs.
+# =============================================================================
+
+# Modular addition: (a + b) mod n, for a, b ∈ [0, n).
+# Avoids computing a + b directly, which could overflow for unsigned T
+# when n is close to typemax(T).
+@inline function _addmod(a::T, b::T, n::T) where {T<:Integer}
+    a >= n - b ? a - (n - b) : a + b
+end
+
+# Modular subtraction: (a - b) mod n, for a, b ∈ [0, n).
+# Written as n - (b - a) when a < b to avoid underflow in unsigned types.
+@inline function _submod(a::T, b::T, n::T) where {T<:Integer}
+    a >= b ? a - b : n - (b - a)
+end
+
+# Modular multiplication: (a * b) mod n.
+# Uses widen(T) to avoid overflow for fixed-width integers.
+# For BigInt, widen returns BigInt itself, so this is simply mod(a * b, n).
+@inline function _mulmod(a::T, b::T, n::T) where {T<:Integer}
+    mod(a * b, n)
+end
+
+# =============================================================================
+# Shared helpers used by both the generic and BigInt-optimized ECM paths.
+# =============================================================================
+
+# Precompute prime powers p^k ≤ B1 for each prime p ≤ B1.
+function _ecm_prime_powers(B1::Int)
+    result = Int[]
+    for p in primes(B1)
+        pk = p
+        while pk * p <= B1
+            pk *= p
+        end
+        push!(result, pk)
+    end
+    result
+end
+
+# Suyama's parametrization: generate a random Montgomery curve and initial point.
+# Returns (x0, z0, a24) on success, a lucky factor g::T, or nothing for a
+# degenerate curve (gcd == n).
+function _ecm_suyama(n::T) where {T<:Integer}
+    σ = T(rand(6:10^9))
+    u  = _submod(_mulmod(σ, σ, n), T(5), n)
+    v  = _mulmod(T(4), σ, n)
+    x0 = _mulmod(_mulmod(u, u, n), u, n)
+    z0 = _mulmod(_mulmod(v, v, n), v, n)
+
+    vu_diff  = _submod(v, u, n)
+    vu_diff3 = _mulmod(_mulmod(vu_diff, vu_diff, n), vu_diff, n)
+    a24_num  = _mulmod(vu_diff3, _addmod(_mulmod(T(3), u, n), v, n), n)
+    a24_den  = _mulmod(_mulmod(T(16), x0, n), v, n)
+
+    g = gcd(a24_den, n)
+    1 < g < n && return g
+    g == n && return nothing
+
+    a24 = _mulmod(a24_num, invmod(a24_den, n), n)
+    (x0, z0, a24)
+end
+
+# =============================================================================
+# GMP-optimized path for BigInt: in-place arithmetic avoids allocations in
+# the hot Montgomery ladder loop, which matters for large-integer factoring.
+# =============================================================================
+
+# In-place modular reduction: r = n mod d (non-negative remainder).
+function _mpz_fdiv_r!(r::BigInt, n::BigInt, d::BigInt)
+    ccall((:__gmpz_fdiv_r, :libgmp), Cvoid, (Ref{BigInt}, Ref{BigInt}, Ref{BigInt}), r, n, d)
+end
+
+# Preallocated scratch space for the BigInt ECM inner loop.
+# t1–t6: scratch for add!/double!; R0/R1/tmp: scratch for scalar_mul!
+struct ECMBuffers
+    t1::BigInt
+    t2::BigInt
+    t3::BigInt
+    t4::BigInt
+    t5::BigInt
+    t6::BigInt
+    R0_X::BigInt
+    R0_Z::BigInt
+    R1_X::BigInt
+    R1_Z::BigInt
+    tmp_X::BigInt
+    tmp_Z::BigInt
+end
+
+ECMBuffers() = ECMBuffers(BigInt(), BigInt(), BigInt(), BigInt(), BigInt(), BigInt(),
+                          BigInt(), BigInt(), BigInt(), BigInt(), BigInt(), BigInt())
+
+# In-place: dst = (a * b) mod n, using tmp as scratch.
+@inline function _mulmod!(dst::BigInt, a::BigInt, b::BigInt, n::BigInt, tmp::BigInt)
+    Base.GMP.MPZ.mul!(tmp, a, b)
+    _mpz_fdiv_r!(dst, tmp, n)
+end
+
+# Differential addition on Montgomery curve (BigInt in-place version).
+# Given projective P, Q and their difference P-Q (as diff), computes P+Q.
+function _ecm_add!(res_X::BigInt, res_Z::BigInt,
+                   P_X::BigInt, P_Z::BigInt, Q_X::BigInt, Q_Z::BigInt,
+                   diff_X::BigInt, diff_Z::BigInt, n::BigInt, buf::ECMBuffers)
+    t1, t2, t3, t4, t5, t6 = buf.t1, buf.t2, buf.t3, buf.t4, buf.t5, buf.t6
+    Base.GMP.MPZ.sub!(t1, P_X, P_Z)
+    Base.GMP.MPZ.add!(t2, Q_X, Q_Z)
+    _mulmod!(t5, t1, t2, n, t3)             # t5 = (P.X-P.Z)(Q.X+Q.Z) mod n
+    Base.GMP.MPZ.add!(t1, P_X, P_Z)
+    Base.GMP.MPZ.sub!(t2, Q_X, Q_Z)
+    _mulmod!(t6, t1, t2, n, t3)             # t6 = (P.X+P.Z)(Q.X-Q.Z) mod n
+    Base.GMP.MPZ.add!(t1, t5, t6)           # t1 = t5 + t6
+    Base.GMP.MPZ.sub!(t2, t5, t6)           # t2 = t5 - t6
+    _mulmod!(t3, t1, t1, n, t4)
+    _mulmod!(res_X, diff_Z, t3, n, t4)      # res_X = diff.Z * (t5+t6)^2 mod n
+    _mulmod!(t3, t2, t2, n, t4)
+    _mulmod!(res_Z, diff_X, t3, n, t4)      # res_Z = diff.X * (t5-t6)^2 mod n
+end
+
+# Point doubling on Montgomery curve with a24 = (a+2)/4 (BigInt in-place version).
+function _ecm_double!(res_X::BigInt, res_Z::BigInt,
+                      P_X::BigInt, P_Z::BigInt,
+                      n::BigInt, a24::BigInt, buf::ECMBuffers)
+    t1, t2, t3, t4, t5, t6 = buf.t1, buf.t2, buf.t3, buf.t4, buf.t5, buf.t6
+    Base.GMP.MPZ.add!(t1, P_X, P_Z)
+    _mulmod!(t5, t1, t1, n, t3)             # t5 = (P.X+P.Z)^2 mod n
+    Base.GMP.MPZ.sub!(t1, P_X, P_Z)
+    _mulmod!(t6, t1, t1, n, t3)             # t6 = (P.X-P.Z)^2 mod n
+    Base.GMP.MPZ.sub!(t1, t5, t6)           # t1 = t5 - t6
+    _mulmod!(res_X, t5, t6, n, t3)          # res_X = t5 * t6 mod n
+    _mulmod!(t2, a24, t1, n, t3)
+    Base.GMP.MPZ.add!(t2, t6)
+    _mulmod!(res_Z, t1, t2, n, t3)          # res_Z = (t5-t6) * (t6 + a24*(t5-t6)) mod n
+end
+
+# Montgomery ladder scalar multiplication (BigInt in-place version).
+function _ecm_scalar_mul!(res_X::BigInt, res_Z::BigInt,
+                          k::Integer, P_X::BigInt, P_Z::BigInt,
+                          n::BigInt, a24::BigInt, buf::ECMBuffers)
+    R0_X, R0_Z = buf.R0_X, buf.R0_Z
+    R1_X, R1_Z = buf.R1_X, buf.R1_Z
+    tmp_X, tmp_Z = buf.tmp_X, buf.tmp_Z
+    Base.GMP.MPZ.set!(R0_X, P_X)
+    Base.GMP.MPZ.set!(R0_Z, P_Z)
+    _ecm_double!(R1_X, R1_Z, P_X, P_Z, n, a24, buf)
+    bits = ndigits(k, base=2)
+    for i in (bits - 2):-1:0
+        if isodd(k >> i)
+            _ecm_add!(tmp_X, tmp_Z, R0_X, R0_Z, R1_X, R1_Z, P_X, P_Z, n, buf)
+            Base.GMP.MPZ.set!(R0_X, tmp_X)
+            Base.GMP.MPZ.set!(R0_Z, tmp_Z)
+            _ecm_double!(tmp_X, tmp_Z, R1_X, R1_Z, n, a24, buf)
+            Base.GMP.MPZ.set!(R1_X, tmp_X)
+            Base.GMP.MPZ.set!(R1_Z, tmp_Z)
+        else
+            _ecm_add!(tmp_X, tmp_Z, R0_X, R0_Z, R1_X, R1_Z, P_X, P_Z, n, buf)
+            Base.GMP.MPZ.set!(R1_X, tmp_X)
+            Base.GMP.MPZ.set!(R1_Z, tmp_Z)
+            _ecm_double!(tmp_X, tmp_Z, R0_X, R0_Z, n, a24, buf)
+            Base.GMP.MPZ.set!(R0_X, tmp_X)
+            Base.GMP.MPZ.set!(R0_Z, tmp_Z)
+        end
+    end
+    Base.GMP.MPZ.set!(res_X, R0_X)
+    Base.GMP.MPZ.set!(res_Z, R0_Z)
+end
+
+# =============================================================================
+# Generic ECM functions (functional style, no mutation).
+# Work for any T<:Integer; the compiler specialises for each concrete type.
+# For BitIntegers.jl types the widen chain stays in fixed-width LLVM integers,
+# making these significantly faster than going through BigInt.
+# =============================================================================
+
+# Differential addition on Montgomery curve (generic version).
+# Given projective P=(PX:PZ), Q=(QX:QZ) and their difference P-Q=(diffX:diffZ),
+# returns (res_X, res_Z) = P+Q.
+function _ecm_add(PX::T, PZ::T, QX::T, QZ::T,
+                  diffX::T, diffZ::T, n::T) where {T<:Integer}
+    u = _mulmod(_submod(PX, PZ, n), _addmod(QX, QZ, n), n)
+    v = _mulmod(_addmod(PX, PZ, n), _submod(QX, QZ, n), n)
+    ad = _addmod(u, v, n)
+    sb = _submod(u, v, n)
+    resX = _mulmod(diffZ, _mulmod(ad, ad, n), n)
+    resZ = _mulmod(diffX, _mulmod(sb, sb, n), n)
+    resX, resZ
+end
+
+# Point doubling on Montgomery curve with a24 = (a+2)/4 (generic version).
+# Returns (res_X, res_Z).
+function _ecm_double(PX::T, PZ::T, n::T, a24::T) where {T<:Integer}
+    s = _addmod(PX, PZ, n)
+    d = _submod(PX, PZ, n)
+    u = _mulmod(s, s, n)
+    v = _mulmod(d, d, n)
+    diff = _submod(u, v, n)
+    resX = _mulmod(u, v, n)
+    resZ = _mulmod(diff, _addmod(v, _mulmod(a24, diff, n), n), n)
+    resX, resZ
+end
+
+# Montgomery ladder scalar multiplication (generic version).
+# Computes [k]P on the Montgomery curve; k is a small integer (prime power ≤ B1).
+# Returns (res_X, res_Z).
+function _ecm_scalar_mul(k::Integer, PX::T, PZ::T, n::T, a24::T) where {T<:Integer}
+    R0X, R0Z = PX, PZ
+    R1X, R1Z = _ecm_double(PX, PZ, n, a24)
+    nbits = ndigits(k, base=2)
+    for i in (nbits - 2):-1:0
+        if isodd(k >> i)
+            tmpX, tmpZ = _ecm_add(R0X, R0Z, R1X, R1Z, PX, PZ, n)
+            R0X, R0Z = tmpX, tmpZ
+            R1X, R1Z = _ecm_double(R1X, R1Z, n, a24)
+        else
+            tmpX, tmpZ = _ecm_add(R0X, R0Z, R1X, R1Z, PX, PZ, n)
+            R1X, R1Z = tmpX, tmpZ
+            R0X, R0Z = _ecm_double(R0X, R0Z, n, a24)
+        end
+    end
+    R0X, R0Z
+end
+
+# =============================================================================
+# Stage 2: standard continuation (baby-step giant-step).
+#
+# After Stage 1 computes Q = [∏ p^k ≤ B1] P, Stage 2 checks whether the
+# group order has exactly one prime factor p in (B1, B2].  Uses a stride D
+# and precomputed baby-step table [d]Q for d ∈ {1,3,5,...,D-1} with gcd(d,6)=1
+# to cover all primes in the interval with only differential additions.
+#
+# Ref: Montgomery (1987) §10, Brent (1986) "Some integer factorization
+#      algorithms using elliptic curves".
+# =============================================================================
+
+# Baby-step offsets: odd values d with 1 ≤ d < D and gcd(d, 6) == 1.
+function _stage2_baby_offsets(D::Int)
+    [d for d in 1:2:(D-1) if gcd(d, 6) == 1]
+end
+
+# Generic Stage 2 continuation.
+# Returns a non-trivial factor of n, or nothing.
+function _ecm_stage2(QX::T, QZ::T, n::T, a24::T, B1::Int, B2::Int) where {T<:Integer}
+    D = 2 * isqrt(B2 - B1 + 1)
+    D = max(D, 6)
+    # Round D up to next multiple of 6 for alignment with prime residues.
+    D = D + (6 - mod(D, 6)) % 6
+
+    offsets = _stage2_baby_offsets(D)
+    isempty(offsets) && return nothing
+
+    # Precompute baby-step table: baby[d] = [d]Q for each offset d.
+    # Also need [2]Q for building the table via differential addition chain.
+    S2X, S2Z = _ecm_double(QX, QZ, n, a24)  # [2]Q
+
+    # Build multiples of Q: muls[k] = [k]Q for k = 1, 2, ..., max(D-1, D)
+    # We only need odd multiples with gcd(d,6)==1, but building a full table
+    # up to D is simpler and the table is small.
+    max_d = maximum(offsets)
+    # mulX[k], mulZ[k] = [k]Q
+    mulX = Vector{T}(undef, max_d)
+    mulZ = Vector{T}(undef, max_d)
+    mulX[1], mulZ[1] = QX, QZ
+    if max_d >= 2
+        mulX[2], mulZ[2] = S2X, S2Z
+    end
+    for k in 3:max_d
+        # [k]Q = [k-1]Q + [1]Q, diff = [k-2]Q
+        mulX[k], mulZ[k] = _ecm_add(mulX[k-1], mulZ[k-1], QX, QZ,
+                                      mulX[k-2], mulZ[k-2], n)
+    end
+
+    # Precompute [D]Q for giant steps.
+    # Build [D]Q using the ladder.
+    DX, DZ = _ecm_scalar_mul(D, QX, QZ, n, a24)
+
+    # Giant step: start at the first multiple of D above B1.
+    # For each giant step center c = j*D, check primes c ± d for d ∈ offsets.
+    j_start = div(B1, D) + 1
+    j_end   = div(B2, D) + 1
+
+    # Compute R = [j_start * D]Q
+    RX, RZ = _ecm_scalar_mul(j_start * D, QX, QZ, n, a24)
+
+    # We also need the previous giant step for differential addition:
+    # Rprev = [(j_start - 1) * D]Q
+    RprevX, RprevZ = _ecm_scalar_mul((j_start - 1) * D, QX, QZ, n, a24)
+
+    acc = one(T)
+    batch_count = 0
+
+    for j in j_start:j_end
+        c = j * D
+        for d in offsets
+            p_plus  = c + d
+            p_minus = c - d
+            # Only check values in (B1, B2].
+            if B1 < p_plus <= B2
+                # The factor reveals itself when [p]Q = O, i.e. Z = 0.
+                # With R = [c]Q, baby = [d]Q:
+                # [c+d]Q and [c-d]Q share the property that their Z-coords
+                # vanish iff c±d divides the group order.
+                # The key identity: for Montgomery curves,
+                #   X([c]Q) * Z([d]Q) - Z([c]Q) * X([d]Q) = 0
+                # iff [c-d]Q or [c+d]Q is the point at infinity.
+                # We accumulate: acc *= (R_X * baby_Z - R_Z * baby_X)
+                val = _submod(_mulmod(RX, mulZ[d], n), _mulmod(RZ, mulX[d], n), n)
+                acc = _mulmod(acc, val, n)
+                batch_count += 1
+            end
+            if B1 < p_minus <= B2 && p_minus != p_plus
+                val = _submod(_mulmod(RX, mulZ[d], n), _mulmod(RZ, mulX[d], n), n)
+                # For c-d we use the same identity (same expression).
+                acc = _mulmod(acc, val, n)
+                batch_count += 1
+            end
+        end
+
+        if batch_count >= 100
+            g = gcd(acc, n)
+            1 < g < n && return g
+            g == n && return nothing  # degenerate
+            acc = one(T)
+            batch_count = 0
+        end
+
+        # Advance giant step: R_next = R + [D]Q, diff = R_prev
+        if j < j_end
+            RnextX, RnextZ = _ecm_add(RX, RZ, DX, DZ, RprevX, RprevZ, n)
+            RprevX, RprevZ = RX, RZ
+            RX, RZ = RnextX, RnextZ
+        end
+    end
+
+    # Final GCD check
+    if batch_count > 0
+        g = gcd(acc, n)
+        1 < g < n && return g
+    end
+    return nothing
+end
+
+# BigInt GMP in-place Stage 2 continuation.
+function _ecm_stage2!(Q_X::BigInt, Q_Z::BigInt, n::BigInt, a24::BigInt,
+                      B1::Int, B2::Int, buf::ECMBuffers)::Union{BigInt, Nothing}
+    D = 2 * isqrt(B2 - B1 + 1)
+    D = max(D, 6)
+    D = D + (6 - mod(D, 6)) % 6
+
+    offsets = _stage2_baby_offsets(D)
+    isempty(offsets) && return nothing
+
+    max_d = maximum(offsets)
+
+    # Precompute baby-step table (vectors of BigInt).
+    mulX = [BigInt() for _ in 1:max_d]
+    mulZ = [BigInt() for _ in 1:max_d]
+    Base.GMP.MPZ.set!(mulX[1], Q_X)
+    Base.GMP.MPZ.set!(mulZ[1], Q_Z)
+
+    # [2]Q
+    S2X, S2Z = BigInt(), BigInt()
+    _ecm_double!(S2X, S2Z, Q_X, Q_Z, n, a24, buf)
+    if max_d >= 2
+        Base.GMP.MPZ.set!(mulX[2], S2X)
+        Base.GMP.MPZ.set!(mulZ[2], S2Z)
+    end
+    for k in 3:max_d
+        _ecm_add!(mulX[k], mulZ[k], mulX[k-1], mulZ[k-1], Q_X, Q_Z,
+                  mulX[k-2], mulZ[k-2], n, buf)
+    end
+
+    # [D]Q via scalar multiplication
+    DX, DZ = BigInt(), BigInt()
+    _ecm_scalar_mul!(DX, DZ, D, Q_X, Q_Z, n, a24, buf)
+
+    # Giant step state
+    j_start = div(B1, D) + 1
+    j_end   = div(B2, D) + 1
+
+    RX, RZ = BigInt(), BigInt()
+    _ecm_scalar_mul!(RX, RZ, j_start * D, Q_X, Q_Z, n, a24, buf)
+
+    RprevX, RprevZ = BigInt(), BigInt()
+    _ecm_scalar_mul!(RprevX, RprevZ, (j_start - 1) * D, Q_X, Q_Z, n, a24, buf)
+
+    RnextX, RnextZ = BigInt(), BigInt()
+    acc = BigInt(1)
+    tmp_mul = BigInt()
+    tmp1 = BigInt()
+    tmp2 = BigInt()
+    batch_count = 0
+
+    for j in j_start:j_end
+        c = j * D
+        for d in offsets
+            p_plus  = c + d
+            p_minus = c - d
+            if B1 < p_plus <= B2
+                # val = R_X * baby_Z - R_Z * baby_X  (mod n)
+                _mulmod!(tmp1, RX, mulZ[d], n, tmp_mul)
+                _mulmod!(tmp2, RZ, mulX[d], n, tmp_mul)
+                Base.GMP.MPZ.sub!(tmp1, tmp2)
+                Base.GMP.MPZ.mul!(tmp_mul, acc, tmp1)
+                _mpz_fdiv_r!(acc, tmp_mul, n)
+                batch_count += 1
+            end
+            if B1 < p_minus <= B2 && p_minus != p_plus
+                _mulmod!(tmp1, RX, mulZ[d], n, tmp_mul)
+                _mulmod!(tmp2, RZ, mulX[d], n, tmp_mul)
+                Base.GMP.MPZ.sub!(tmp1, tmp2)
+                Base.GMP.MPZ.mul!(tmp_mul, acc, tmp1)
+                _mpz_fdiv_r!(acc, tmp_mul, n)
+                batch_count += 1
+            end
+        end
+
+        if batch_count >= 100
+            g = gcd(acc, n)
+            1 < g < n && return g
+            g == n && return nothing
+            Base.GMP.MPZ.set_si!(acc, 1)
+            batch_count = 0
+        end
+
+        if j < j_end
+            _ecm_add!(RnextX, RnextZ, RX, RZ, DX, DZ, RprevX, RprevZ, n, buf)
+            Base.GMP.MPZ.set!(RprevX, RX)
+            Base.GMP.MPZ.set!(RprevZ, RZ)
+            Base.GMP.MPZ.set!(RX, RnextX)
+            Base.GMP.MPZ.set!(RZ, RnextZ)
+        end
+    end
+
+    if batch_count > 0
+        g = gcd(acc, n)
+        1 < g < n && return g
+    end
+    return nothing
+end
+
+# =============================================================================
+# Public API
+# =============================================================================
+
+"""
+    ecm_factor(n::T, B1::Int, num_curves::Int) where {T<:Integer} -> Union{T, Nothing}
+
+Attempt to find a non-trivial factor of `n` using the Elliptic Curve Method (ECM)
+with Stage 1 (bound `B1`) and Stage 2 continuation (bound `B2 = 100 * B1`).
+
+Uses Montgomery curves with Suyama's parametrization and a batched GCD to reduce
+the number of `gcd` calls in the inner loop.  Stage 2 uses a baby-step giant-step
+approach to efficiently check primes in `(B1, B2]`.
+
+This generic method works for any integer type `T`, including fixed-width types from
+[BitIntegers.jl](https://github.com/rfourquet/BitIntegers.jl) (e.g. `UInt256`, `UInt512`).
+For those types modular multiplication stays within fixed-width LLVM integers via
+`widen(T)`, avoiding BigInt allocation entirely.
+
+Returns a non-trivial factor of `n`, or `nothing` if none is found within the budget.
+
+See also the `BigInt`-optimised overload which uses in-place GMP arithmetic.
+"""
+function ecm_factor(n::T, B1::Int, num_curves::Int) where {T<:Integer}
+    prime_powers = _ecm_prime_powers(B1)
+    B2 = 100 * B1
+
+    for _ in 1:num_curves
+        curve = _ecm_suyama(n)
+        curve === nothing && continue
+        curve isa Tuple || return curve  # lucky factor from gcd
+        x0, z0, a24 = curve
+
+        QX, QZ = x0, z0
+
+        # Stage 1: multiply Q by each prime power, batching the GCD checks.
+        degenerate = false
+        acc = one(T)
+        batch_count = 0
+        for pk in prime_powers
+            QX, QZ = _ecm_scalar_mul(pk, QX, QZ, n, a24)
+            acc = _mulmod(acc, QZ, n)
+            batch_count += 1
+            if batch_count >= 100
+                g = gcd(acc, n)
+                if 1 < g < n
+                    return g
+                end
+                if g == n
+                    degenerate = true
+                    break
+                end
+                acc = one(T)
+                batch_count = 0
+            end
+        end
+
+        degenerate && continue
+
+        if batch_count > 0
+            g = gcd(acc, n)
+            1 < g < n && return g
+            g == n && continue
+        end
+
+        # Stage 2: standard continuation for primes in (B1, B2].
+        s2 = _ecm_stage2(QX, QZ, n, a24, B1, B2)
+        s2 !== nothing && return s2
+    end
+    return nothing
+end
+
+"""
+    ecm_factor(n::BigInt, B1::Int, num_curves::Int) -> Union{BigInt, Nothing}
+
+BigInt-optimised ECM using in-place GMP arithmetic to minimise allocations in the
+hot Montgomery ladder loop.  Includes Stage 2 continuation (`B2 = 100 * B1`) using
+baby-step giant-step.  For fixed-width integer types use the generic method.
+
+Returns a non-trivial factor of `n`, or `nothing` if none is found within the budget.
+"""
+function ecm_factor(n::BigInt, B1::Int, num_curves::Int)::Union{BigInt, Nothing}
+    prime_powers = _ecm_prime_powers(B1)
+    B2 = 100 * B1
+
+    buf     = ECMBuffers()
+    Q_X     = BigInt()
+    Q_Z     = BigInt()
+    tmp_mul = BigInt()
+
+    for _ in 1:num_curves
+        curve = _ecm_suyama(n)
+        curve === nothing && continue
+        curve isa Tuple || return curve  # lucky factor from gcd
+        x0, z0, a24 = curve
+
+        Base.GMP.MPZ.set!(Q_X, x0)
+        Base.GMP.MPZ.set!(Q_Z, z0)
+
+        # Stage 1: multiply Q by each prime power, with batched GCD.
+        degenerate  = false
+        acc         = BigInt(1)
+        batch_count = 0
+        for pk in prime_powers
+            _ecm_scalar_mul!(Q_X, Q_Z, pk, Q_X, Q_Z, n, a24, buf)
+            Base.GMP.MPZ.mul!(tmp_mul, acc, Q_Z)
+            _mpz_fdiv_r!(acc, tmp_mul, n)
+            batch_count += 1
+            if batch_count >= 100
+                g = gcd(acc, n)
+                if 1 < g < n
+                    return g
+                end
+                if g == n
+                    degenerate = true
+                    break
+                end
+                Base.GMP.MPZ.set_si!(acc, 1)
+                batch_count = 0
+            end
+        end
+
+        degenerate && continue
+
+        if batch_count > 0
+            g = gcd(acc, n)
+            1 < g < n && return g
+            g == n && continue
+        end
+
+        # Stage 2: standard continuation for primes in (B1, B2].
+        s2 = _ecm_stage2!(Q_X, Q_Z, n, a24, B1, B2, buf)
+        s2 !== nothing && return s2
+    end
+    return nothing
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index f63e59a..e00ebcf 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -511,3 +511,168 @@ end
     end
     @test primes(2^31-20, 2^31-1) == [2147483629, 2147483647]
 end
+
+@testset "ECM factorization" begin
+    # Semi-prime with a ~40-bit factor
+    p1 = big"824633720831"   # 40-bit prime
+    q1 = big"1000000007"     # 30-bit prime
+    n1 = p1 * q1
+    f1 = factor(n1)
+    @test prodfactors(f1) == n1
+    @test all(isprime(p) for (p, _) in f1)
+
+    # p ~40 bits, q ~80 bits
+    p2 = big"824633720831"
+    q2 = big"1180591620717411303449"  # ~80-bit prime
+    n2 = p2 * q2
+    f2 = factor(n2)
+    @test prodfactors(f2) == n2
+    @test all(isprime(p) for (p, _) in f2)
+end
+
+@testset "MPQS factorization" begin
+    # Semi-prime with two ~60-bit factors
+    p1 = big"780002082420426809"
+    q1 = big"810735269523504809437013569"
+    n1 = p1 * q1
+    f1 = factor(n1)
+    @test prodfactors(f1) == n1
+    @test all(isprime(p) for (p, _) in f1)
+end
+
+@testset "BigInt factorization" begin
+    # factor(BigInt(n)) returns Factorization{BigInt}
+    n1 = big"2152302898747"
+    f1 = factor(n1)
+    @test f1 isa Primes.Factorization{BigInt}
+    @test prodfactors(f1) == n1
+
+    # BigInt prime returns single-entry
+    p = big"359334085968622831041960188598043661065388726959079837"
+    f2 = factor(p)
+    @test length(f2) == 1
+    @test first(f2) == (p => 1)
+end
+
+@testset "ECM edge cases" begin
+    # ECM with a small semi-prime (BigInt path — GMP in-place)
+    n = big"1000000007" * big"1000000009"
+    result = Primes.ecm_factor(n, 2000, 50)
+    @test result !== nothing
+    @test mod(n, result) == 0
+
+    # ECM returns nothing when curves are insufficient for a hard number
+    hard = nextprime(big"10"^30) * nextprime(big"10"^30 + 1000)
+    result2 = Primes.ecm_factor(hard, 100, 1)
+    # May or may not find a factor with just 1 curve — no assertion on result
+end
+
+@testset "ECM Stage 2" begin
+    # Stage 2 should improve success rate for low B1 values where Stage 1
+    # alone frequently fails. With B1=200 and B2=100*B1, the continuation
+    # catches group orders with one large prime factor in (B1, B2].
+
+    n = big"824633720831" * big"1000000007"
+
+    # With B1=200 and many curves, Stage 2 should find the factor reliably.
+    # (Stage 1 alone at B1=200 finds it only ~8% of the time per 5-curve attempt.)
+    result = Primes.ecm_factor(n, 200, 50)
+    @test result !== nothing
+    @test mod(n, result) == 0
+
+    # BigInt path: verify GMP in-place Stage 2 works
+    n_big = big"1000000007" * big"1000000009"
+    result_big = Primes.ecm_factor(n_big, 200, 50)
+    @test result_big !== nothing
+    @test mod(n_big, result_big) == 0
+
+    # Generic path: UInt128
+    n_u128 = UInt128(1_000_000_007) * UInt128(1_000_000_009)
+    result_u128 = Primes.ecm_factor(n_u128, 200, 50)
+    @test result_u128 !== nothing
+    @test mod(n_u128, result_u128) == 0
+    @test result_u128 isa UInt128
+end
+
+@testset "ECM Stage 2 finds factors Stage 1 misses" begin
+    # Run Stage 1 alone on many curves with low B1.  Collect curves where
+    # Stage 1 failed (gcd == 1), then run Stage 2 on those and verify it
+    # finds at least one factor that Stage 1 could not.
+    n = big"824633720831" * big"1000000007"
+    B1 = 200
+    B2 = 100 * B1
+    pp = Primes._ecm_prime_powers(B1)
+
+    stage1_only_found  = 0
+    stage2_extra_found = 0
+    curves_tested      = 0
+    stage1_missed      = 0
+
+    for _ in 1:200
+        curve = Primes._ecm_suyama(n)
+        curve === nothing && continue
+        !(curve isa Tuple) && continue
+        x0, z0, a24 = curve
+        curves_tested += 1
+
+        # Run Stage 1 manually
+        QX, QZ = x0, z0
+        for pk in pp
+            QX, QZ = Primes._ecm_scalar_mul(pk, QX, QZ, n, a24)
+        end
+        g1 = gcd(QZ, n)
+        if 1 < g1 < n
+            stage1_only_found += 1
+            continue
+        end
+
+        # Stage 1 did not find a factor on this curve
+        stage1_missed += 1
+
+        # Now run Stage 2 on the same post-Stage-1 point
+        s2 = Primes._ecm_stage2(QX, QZ, n, a24, B1, B2)
+        if s2 !== nothing && 1 < s2 < n
+            stage2_extra_found += 1
+            @test mod(n, s2) == 0
+        end
+    end
+
+    # With B1=200 on this number, Stage 1 finds ~8% of curves.
+    # Stage 2 should rescue additional curves, proving it adds value.
+    @test curves_tested >= 100  # enough curves for a meaningful test
+    @test stage1_missed > 0     # Stage 1 didn't find everything
+    @test stage2_extra_found > 0  # Stage 2 found at least one that Stage 1 missed
+    @test stage2_extra_found > stage1_only_found  # Stage 2 rescues more than Stage 1 found
+end
+
+@testset "ECM generic integer types" begin
+    # UInt128: widen(UInt128) == BigInt in base Julia — exercises the generic
+    # _mulmod/_addmod/_submod helpers via the BigInt widen path.
+    p = UInt128(1_000_000_007) * UInt128(999_999_937)
+    q = UInt128(1_000_000_009)
+    n = p * q
+    result = Primes.ecm_factor(n, 2000, 50)
+    @test result !== nothing
+    @test mod(n, result) == 0
+    @test result isa UInt128
+
+    # UInt256 / UInt512 from BitIntegers.jl: widen stays in LLVM fixed-width
+    # integers (UInt256 → UInt512), so no BigInt allocation in the inner loop.
+    using BitIntegers
+    p256 = UInt256(1_000_000_007) * UInt256(999_999_937)
+    q256 = UInt256(1_000_000_009)
+    n256 = p256 * q256
+    r256 = Primes.ecm_factor(n256, 2000, 50)
+    @test r256 !== nothing
+    @test mod(n256, r256) == 0
+    @test r256 isa UInt256
+
+    # Int256 (signed BitIntegers type)
+    p256s = Int256(1_000_000_007) * Int256(999_999_937)
+    q256s = Int256(1_000_000_009)
+    n256s = p256s * q256s
+    r256s = Primes.ecm_factor(n256s, 2000, 50)
+    @test r256s !== nothing
+    @test mod(n256s, r256s) == 0
+    @test r256s isa Int256
+end