diff --git a/.gitignore b/.gitignore index 9847c84..c84633f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,61 @@ +* + +### Allowed files and directories ### + +!.gitignore +!.github/ +!.github/workflows/ +!.github/workflows/* +!.github/dependabot.yml + +!LICENSE.md +!README.md +!Project.toml + +!src/ +!src/*.jl + +!test/ +!test/*.jl + +!docs/ +!docs/src/ +!docs/src/*.md +!docs/make.jl +!docs/Project.toml + +!benchmarks/ +!benchmarks/*.jl +!benchmarks/Project.toml +!benchmarks/README.md + +### Denied even if allowed above ### + +# Files generated by invoking Julia with --code-coverage *.jl.cov *.jl.*.cov + +# Files generated by invoking Julia with --track-allocation *.jl.mem -Manifest.toml -docs/build -docs/site -docs/Manifest.toml + +# System-specific files and directories generated by the BinaryProvider and BinDeps packages +# They contain absolute paths specific to the host computer, and so should not be committed +deps/deps.jl +deps/build.log +deps/downloads/ +deps/usr/ +deps/src/ + +# Build artifacts for creating documentation generated by the Documenter package +docs/build/ +docs/site/ + +# File generated by Pkg, the package manager, based on a corresponding Project.toml +# It records a fixed state of all packages used by the project. As such, it should not be +# committed for packages, but should be committed for applications that require a static +# environment. +Manifest*.toml + +# File generated by the Preferences package to store local preferences +LocalPreferences.toml +JuliaLocalPreferences.toml \ No newline at end of file diff --git a/Project.toml b/Project.toml index 460e0eb..f4d7abb 100644 --- a/Project.toml +++ b/Project.toml @@ -5,14 +5,16 @@ version = "0.5.7" [deps] IntegerMathUtils = "18e54dd8-cb9d-406c-a71d-865a43cbb235" +[compat] +BitIntegers = "0.3" +IntegerMathUtils = "0.1.1" +julia = "1.6" + [extras] -IntegerMathUtils = "18e54dd8-cb9d-406c-a71d-865a43cbb235" +BitIntegers = "c3b6d118-76ef-56ca-8cc7-ebb389d030a1" DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" +IntegerMathUtils = "18e54dd8-cb9d-406c-a71d-865a43cbb235" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["DataStructures", "IntegerMathUtils", "Test"] - -[compat] -IntegerMathUtils = "0.1.1" -julia = "1.6" +test = ["BitIntegers", "DataStructures", "IntegerMathUtils", "Test"] diff --git a/benchmarks/Project.toml b/benchmarks/Project.toml new file mode 100644 index 0000000..76683dd --- /dev/null +++ b/benchmarks/Project.toml @@ -0,0 +1,7 @@ +[deps] +BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" +BitIntegers = "c3b6d118-76ef-56ca-8cc7-ebb389d030a1" +Primes = "27ebfcd6-29c5-5fa9-bf4b-fb8fc14df3ae" + +[sources] +Primes = {path = ".."} diff --git a/benchmarks/README.md b/benchmarks/README.md new file mode 100644 index 0000000..d17802f --- /dev/null +++ b/benchmarks/README.md @@ -0,0 +1,61 @@ +# Benchmarks + +Benchmark suite for Primes.jl, comparing GMP-optimised in-place arithmetic against +the generic (allocating) code path for BigInt operations, and measuring end-to-end +factorisation performance across different number sizes. + +## Prerequisites + +```bash +julia --project=benchmarks -e 'using Pkg; Pkg.instantiate()' +``` + +## Running + +Run all benchmarks: + +```bash +julia --project=benchmarks benchmarks/run_all.jl +``` + +Or run individual suites: + +```bash +# GMP in-place vs generic (allocating) ECM — the core comparison +julia --project=benchmarks benchmarks/ecm_gmp_vs_generic.jl + +# Micro-benchmarks for individual ECM operations +julia --project=benchmarks benchmarks/ecm_microbenchmarks.jl + +# End-to-end factor(n) at various sizes +julia --project=benchmarks benchmarks/factorization_endtoend.jl +``` + +## Benchmark Suites + +### `ecm_gmp_vs_generic.jl` + +Compares the two ECM code paths for `BigInt` inputs: + +| Path | Style | Allocations | +|------|-------|-------------| +| `ecm_factor(n::BigInt, ...)` | In-place GMP (`Base.GMP.MPZ.*!`) | Near-zero in hot loop | +| Generic via `_ecm_scalar_mul` | Functional (new BigInt per op) | O(k) per scalar multiply | + +Tests at two scales: a ~40-digit and a ~55-digit semiprime. + +### `ecm_microbenchmarks.jl` + +Benchmarks individual ECM building blocks across three execution modes: + +| Operation | GMP in-place | Generic BigInt | UInt128 (LLVM) | +|-----------|-------------|----------------|----------------| +| `_mulmod` | `_mulmod!` | `_mulmod` | `_mulmod` | +| `_ecm_add` | `_ecm_add!` | `_ecm_add` | `_ecm_add` | +| `_ecm_double` | `_ecm_double!` | `_ecm_double` | `_ecm_double` | +| `_ecm_scalar_mul` | `_ecm_scalar_mul!` | `_ecm_scalar_mul` | `_ecm_scalar_mul` | + +### `factorization_endtoend.jl` + +Benchmarks `factor(n)` for semiprimes ranging from 12 to 45+ digits, +showing how the polyalgorithm (trial division → Pollard rho → ECM → MPQS) scales. diff --git a/benchmarks/ecm_gmp_vs_generic.jl b/benchmarks/ecm_gmp_vs_generic.jl new file mode 100644 index 0000000..bcb4de2 --- /dev/null +++ b/benchmarks/ecm_gmp_vs_generic.jl @@ -0,0 +1,116 @@ +# ECM Benchmark: GMP In-Place vs Generic (Allocating) BigInt Path +# +# Compares the two ECM code paths for BigInt inputs: +# 1. BigInt-specialised: uses in-place GMP arithmetic (zero-allocation hot loop) +# 2. Generic: functional style, allocates new BigInts per operation +# +# Usage: +# julia --project=benchmarks benchmarks/ecm_gmp_vs_generic.jl + +using Primes +using BenchmarkTools + +# Reproduce the generic (allocating) ECM path for BigInt, bypassing the +# BigInt-specialised method that normally intercepts dispatch. +function ecm_factor_generic(n::BigInt, B1::Int, num_curves::Int) + prime_powers = Primes._ecm_prime_powers(B1) + T = BigInt + for _ in 1:num_curves + curve = Primes._ecm_suyama(n) + curve === nothing && continue + curve isa Tuple || return curve + x0, z0, a24 = curve + + QX, QZ = x0, z0 + degenerate = false + acc = one(T) + batch_count = 0 + for pk in prime_powers + QX, QZ = Primes._ecm_scalar_mul(pk, QX, QZ, n, a24) + acc = Primes._mulmod(acc, QZ, n) + batch_count += 1 + if batch_count >= 100 + g = gcd(acc, n) + if 1 < g < n + return g + end + if g == n + degenerate = true + break + end + acc = one(T) + batch_count = 0 + end + end + degenerate && continue + if batch_count > 0 + g = gcd(acc, n) + 1 < g < n && return g + end + end + return nothing +end + +# --------------------------------------------------------------------------- +# Test cases at different scales +# --------------------------------------------------------------------------- +const CASES = [ + ( + name = "small (~40-digit semiprime)", + n = big"824633720831" * big"1000000007", + B1 = 2_000, + curves = 50, + ), + ( + name = "medium (~55-digit semiprime)", + n = big"780002082420426809" * big"810735269523504809437013569", + B1 = 11_000, + curves = 200, + ), +] + +# --------------------------------------------------------------------------- +# Run benchmarks +# --------------------------------------------------------------------------- +println("=" ^ 70) +println(" ECM Benchmark: GMP In-Place vs Generic (Allocating) BigInt") +println("=" ^ 70) + +for case in CASES + (; name, n, B1, curves) = case + println("\n--- $name ---") + println(" n = $n ($(ndigits(n)) digits)") + println(" B1 = $B1") + println(" curves = $curves") + + # Warm-up both paths + Primes.ecm_factor(n, B1, curves) + ecm_factor_generic(n, B1, curves) + + println("\n [GMP in-place]") + b_gmp = @benchmark Primes.ecm_factor($n, $B1, $curves) samples=20 evals=1 + display(b_gmp) + + println("\n [Generic (allocating)]") + b_gen = @benchmark ecm_factor_generic($n, $B1, $curves) samples=20 evals=1 + display(b_gen) + + med_gmp = median(b_gmp).time / 1e6 + med_gen = median(b_gen).time / 1e6 + alloc_gmp = median(b_gmp).memory / 1024 + alloc_gen = median(b_gen).memory / 1024 + + println("\n Summary (median):") + println(" GMP in-place : $(round(med_gmp; digits=2)) ms, $(round(alloc_gmp; digits=1)) KiB") + println(" Generic alloc: $(round(med_gen; digits=2)) ms, $(round(alloc_gen; digits=1)) KiB") + if med_gmp > 0 + println(" Speedup : $(round(med_gmp / med_gen; digits=2))×") + end + if alloc_gen > 0 + println(" Memory saved : $(round((1 - alloc_gmp / alloc_gen) * 100; digits=1))%") + end +end + +println("\n" * "=" ^ 70) +println(" Done.") +println("=" ^ 70) diff --git a/benchmarks/ecm_microbenchmarks.jl b/benchmarks/ecm_microbenchmarks.jl new file mode 100644 index 0000000..613a0f4 --- /dev/null +++ b/benchmarks/ecm_microbenchmarks.jl @@ -0,0 +1,172 @@ +# ECM Micro-Benchmarks +# +# Benchmarks individual ECM building blocks: +# - _mulmod / _mulmod! (modular multiplication) +# - _ecm_add / _ecm_add! (differential point addition) +# - _ecm_double / _ecm_double! (point doubling) +# - _ecm_scalar_mul / _ecm_scalar_mul! (Montgomery ladder) +# +# Each operation is tested in four modes: +# 1. BigInt GMP in-place (specialised path) +# 2. BigInt generic (allocating, functional) +# 3. UInt128 generic (note: widen(UInt128) == BigInt, so still allocates) +# 4. UInt256 generic via BitIntegers.jl (widen stays in LLVM fixed-width) +# +# Usage: +# julia --project=benchmarks benchmarks/ecm_microbenchmarks.jl + +using Primes +using BenchmarkTools +using BitIntegers + +# --------------------------------------------------------------------------- +# Setup: representative values for a ~40-digit modulus +# --------------------------------------------------------------------------- +const N_BIG = big"824633720831" * big"1000000007" +const A24_BIG = mod(big"123456789012345", N_BIG) +const PX_BIG = mod(big"314159265358979323846", N_BIG) +const PZ_BIG = mod(big"271828182845904523536", N_BIG) +const QX_BIG = mod(big"141421356237309504880", N_BIG) +const QZ_BIG = mod(big"173205080756887729352", N_BIG) +const DX_BIG = mod(big"223606797749978969640", N_BIG) +const DZ_BIG = mod(big"264575131106459059050", N_BIG) + +# UInt128 versions (note: widen(UInt128) == BigInt in base Julia) +const N_U128 = UInt128(824633720831) * UInt128(1000000007) +const A24_U128 = UInt128(mod(big"123456789012345", big(N_U128))) +const PX_U128 = UInt128(mod(big"314159265358979323846", big(N_U128))) +const PZ_U128 = UInt128(mod(big"271828182845904523536", big(N_U128))) +const QX_U128 = UInt128(mod(big"141421356237309504880", big(N_U128))) +const QZ_U128 = UInt128(mod(big"173205080756887729352", big(N_U128))) +const DX_U128 = UInt128(mod(big"223606797749978969640", big(N_U128))) +const DZ_U128 = UInt128(mod(big"264575131106459059050", big(N_U128))) + +# UInt256 versions (widen(UInt256) == UInt512, stays in LLVM fixed-width) +const N_U256 = UInt256(824633720831) * UInt256(1000000007) +const A24_U256 = UInt256(mod(big"123456789012345", big(N_U256))) +const PX_U256 = UInt256(mod(big"314159265358979323846", big(N_U256))) +const PZ_U256 = UInt256(mod(big"271828182845904523536", big(N_U256))) +const QX_U256 = UInt256(mod(big"141421356237309504880", big(N_U256))) +const QZ_U256 = UInt256(mod(big"173205080756887729352", big(N_U256))) +const DX_U256 = UInt256(mod(big"223606797749978969640", big(N_U256))) +const DZ_U256 = UInt256(mod(big"264575131106459059050", big(N_U256))) + +# GMP buffers +const BUF = Primes.ECMBuffers() +const TMP_MUL = BigInt() + +println("=" ^ 70) +println(" ECM Micro-Benchmarks") +println("=" ^ 70) +println() +println(" Note: widen(UInt128) == BigInt → still allocates (no LLVM benefit)") +println(" widen(UInt256) == UInt512 → pure LLVM fixed-width arithmetic") + +# --------------------------------------------------------------------------- +# 1. Modular multiplication +# --------------------------------------------------------------------------- +println("\n--- _mulmod ---") + +print(" BigInt GMP in-place (_mulmod!): ") +dst = BigInt() +b1 = @benchmark Primes._mulmod!($dst, $PX_BIG, $PZ_BIG, $N_BIG, $TMP_MUL) +println("$(round(median(b1).time; digits=1)) ns, $(median(b1).memory) bytes") + +print(" BigInt generic (_mulmod): ") +b2 = @benchmark Primes._mulmod($PX_BIG, $PZ_BIG, $N_BIG) +println("$(round(median(b2).time; digits=1)) ns, $(median(b2).memory) bytes") + +print(" UInt128 generic (_mulmod): ") +b3 = @benchmark Primes._mulmod($PX_U128, $PZ_U128, $N_U128) +println("$(round(median(b3).time; digits=1)) ns, $(median(b3).memory) bytes") + +print(" UInt256 generic (_mulmod): ") +b3b = @benchmark Primes._mulmod($PX_U256, $PZ_U256, $N_U256) +println("$(round(median(b3b).time; digits=1)) ns, $(median(b3b).memory) bytes") + +# --------------------------------------------------------------------------- +# 2. Differential point addition +# --------------------------------------------------------------------------- +println("\n--- _ecm_add ---") + +res_X, res_Z = BigInt(), BigInt() + +print(" BigInt GMP in-place (_ecm_add!): ") +b4 = @benchmark Primes._ecm_add!($res_X, $res_Z, + $PX_BIG, $PZ_BIG, $QX_BIG, $QZ_BIG, $DX_BIG, $DZ_BIG, $N_BIG, $BUF) +println("$(round(median(b4).time; digits=1)) ns, $(median(b4).memory) bytes") + +print(" BigInt generic (_ecm_add): ") +b5 = @benchmark Primes._ecm_add($PX_BIG, $PZ_BIG, $QX_BIG, $QZ_BIG, $DX_BIG, $DZ_BIG, $N_BIG) +println("$(round(median(b5).time; digits=1)) ns, $(median(b5).memory) bytes") + +print(" UInt128 generic (_ecm_add): ") +b6 = @benchmark Primes._ecm_add($PX_U128, $PZ_U128, $QX_U128, $QZ_U128, $DX_U128, $DZ_U128, $N_U128) +println("$(round(median(b6).time; digits=1)) ns, $(median(b6).memory) bytes") + +print(" UInt256 generic (_ecm_add): ") +b6b = @benchmark Primes._ecm_add($PX_U256, $PZ_U256, $QX_U256, $QZ_U256, $DX_U256, $DZ_U256, $N_U256) +println("$(round(median(b6b).time; digits=1)) ns, $(median(b6b).memory) bytes") + +# --------------------------------------------------------------------------- +# 3. Point doubling +# --------------------------------------------------------------------------- +println("\n--- _ecm_double ---") + +print(" BigInt GMP in-place (_ecm_double!): ") +b7 = @benchmark Primes._ecm_double!($res_X, $res_Z, $PX_BIG, $PZ_BIG, $N_BIG, $A24_BIG, $BUF) +println("$(round(median(b7).time; digits=1)) ns, $(median(b7).memory) bytes") + +print(" BigInt generic (_ecm_double): ") +b8 = @benchmark Primes._ecm_double($PX_BIG, $PZ_BIG, $N_BIG, $A24_BIG) +println("$(round(median(b8).time; digits=1)) ns, $(median(b8).memory) bytes") + +print(" UInt128 generic (_ecm_double): ") +b9 = @benchmark Primes._ecm_double($PX_U128, $PZ_U128, $N_U128, $A24_U128) +println("$(round(median(b9).time; digits=1)) ns, $(median(b9).memory) bytes") + +print(" UInt256 generic (_ecm_double): ") +b9b = @benchmark Primes._ecm_double($PX_U256, $PZ_U256, $N_U256, $A24_U256) +println("$(round(median(b9b).time; digits=1)) ns, $(median(b9b).memory) bytes") + +# --------------------------------------------------------------------------- +# 4. Montgomery ladder scalar multiplication +# --------------------------------------------------------------------------- +println("\n--- _ecm_scalar_mul (k=997, prime) ---") + +const K = 997 + +print(" BigInt GMP in-place (_ecm_scalar_mul!): ") +b10 = @benchmark Primes._ecm_scalar_mul!($res_X, $res_Z, $K, $PX_BIG, $PZ_BIG, $N_BIG, $A24_BIG, $BUF) +println("$(round(median(b10).time / 1e3; digits=2)) μs, $(median(b10).memory) bytes") + +print(" BigInt generic (_ecm_scalar_mul): ") +b11 = @benchmark Primes._ecm_scalar_mul($K, $PX_BIG, $PZ_BIG, $N_BIG, $A24_BIG) +println("$(round(median(b11).time / 1e3; digits=2)) μs, $(median(b11).memory) bytes") + +print(" UInt128 generic (_ecm_scalar_mul): ") +b12 = @benchmark Primes._ecm_scalar_mul($K, $PX_U128, $PZ_U128, $N_U128, $A24_U128) +println("$(round(median(b12).time / 1e3; digits=2)) μs, $(median(b12).memory) bytes") + +print(" UInt256 generic (_ecm_scalar_mul): ") +b12b = @benchmark Primes._ecm_scalar_mul($K, $PX_U256, $PZ_U256, $N_U256, $A24_U256) +println("$(round(median(b12b).time / 1e3; digits=2)) μs, $(median(b12b).memory) bytes") + +# --------------------------------------------------------------------------- +# Summary table +# --------------------------------------------------------------------------- +println("\n" * "=" ^ 70) +println(" Summary: median time") +println(" (GMP in-place → Generic BigInt → UInt128 → UInt256)") +println("=" ^ 70) + +function fmt_speedup(base, other) + s = base / other + "$(round(s; digits=2))×" +end + +println(" _mulmod: $(round(median(b1).time; digits=0)) ns → $(round(median(b2).time; digits=0)) ns ($(fmt_speedup(median(b1).time, median(b2).time))) → $(round(median(b3).time; digits=0)) ns ($(fmt_speedup(median(b1).time, median(b3).time))) → $(round(median(b3b).time; digits=0)) ns ($(fmt_speedup(median(b1).time, median(b3b).time)))") +println(" _ecm_add: $(round(median(b4).time; digits=0)) ns → $(round(median(b5).time; digits=0)) ns ($(fmt_speedup(median(b4).time, median(b5).time))) → $(round(median(b6).time; digits=0)) ns ($(fmt_speedup(median(b4).time, median(b6).time))) → $(round(median(b6b).time; digits=0)) ns ($(fmt_speedup(median(b4).time, median(b6b).time)))") +println(" _ecm_double: $(round(median(b7).time; digits=0)) ns → $(round(median(b8).time; digits=0)) ns ($(fmt_speedup(median(b7).time, median(b8).time))) → $(round(median(b9).time; digits=0)) ns ($(fmt_speedup(median(b7).time, median(b9).time))) → $(round(median(b9b).time; digits=0)) ns ($(fmt_speedup(median(b7).time, median(b9b).time)))") +println(" _ecm_scalar_mul: $(round(median(b10).time/1e3; digits=1)) μs → $(round(median(b11).time/1e3; digits=1)) μs ($(fmt_speedup(median(b10).time, median(b11).time))) → $(round(median(b12).time/1e3; digits=1)) μs ($(fmt_speedup(median(b10).time, median(b12).time))) → $(round(median(b12b).time/1e3; digits=1)) μs ($(fmt_speedup(median(b10).time, median(b12b).time)))") +println("=" ^ 70) diff --git a/benchmarks/factorization_endtoend.jl b/benchmarks/factorization_endtoend.jl new file mode 100644 index 0000000..17887cf --- /dev/null +++ b/benchmarks/factorization_endtoend.jl @@ -0,0 +1,52 @@ +# End-to-End Factorization Benchmarks +# +# Benchmarks `factor(n)` for semiprimes of increasing size to show the +# polyalgorithm's performance across different regimes: +# - Small (< 20 digits): trial division + Pollard rho +# - Medium (20-40 digits): ECM kicks in +# - Large (40-60 digits): ECM + MPQS +# - Very large (60+ digits): MPQS-dominated +# +# Usage: +# julia --project=benchmarks benchmarks/factorization_endtoend.jl + +using Primes +using BenchmarkTools + +const CASES = [ + (name = "~12 digits", n = big"824633720831"), + (name = "~20 digits", n = big"99999999999999999877" ), # 20-digit semiprime-like + (name = "~30 digits", n = big"824633720831" * big"1000000007"), + (name = "~40 digits", n = big"2152302898747" * big"99999999999999999877"), + (name = "~45 digits", n = big"780002082420426809" * big"810735269523504809437013569"), +] + +println("=" ^ 70) +println(" End-to-End factor(n) Benchmarks") +println("=" ^ 70) + +for case in CASES + (; name, n) = case + println("\n--- $name ($(ndigits(n)) digits) ---") + println(" n = $n") + + # Warm-up and verify correctness + f = factor(n) + @assert prodfactors(f) == n "Factorization verification failed for $name" + println(" factor(n) = $f") + + # Benchmark + samples = ndigits(n) > 40 ? 5 : 20 + b = @benchmark factor($n) samples=samples evals=1 + display(b) + + med = median(b).time + unit = med < 1e6 ? ("$(round(med / 1e3; digits=2)) μs") : + med < 1e9 ? ("$(round(med / 1e6; digits=2)) ms") : + ("$(round(med / 1e9; digits=2)) s") + println(" Median: $unit, $(round(median(b).memory / 1024; digits=1)) KiB allocated") +end + +println("\n" * "=" ^ 70) +println(" Done.") +println("=" ^ 70) diff --git a/benchmarks/run_all.jl b/benchmarks/run_all.jl new file mode 100644 index 0000000..90e8c83 --- /dev/null +++ b/benchmarks/run_all.jl @@ -0,0 +1,25 @@ +# Run All Benchmarks +# +# Usage: +# julia --project=benchmarks benchmarks/run_all.jl +# +# Or run individual benchmarks: +# julia --project=benchmarks benchmarks/ecm_gmp_vs_generic.jl +# julia --project=benchmarks benchmarks/ecm_microbenchmarks.jl +# julia --project=benchmarks benchmarks/factorization_endtoend.jl + +const BENCHDIR = @__DIR__ + +const SCRIPTS = [ + "ecm_microbenchmarks.jl", + "ecm_gmp_vs_generic.jl", + "factorization_endtoend.jl", +] + +for script in SCRIPTS + path = joinpath(BENCHDIR, script) + println("\n" * "#" ^ 70) + println("# Running: $script") + println("#" ^ 70 * "\n") + include(path) +end diff --git a/docs/make.jl b/docs/make.jl index 73117d9..a0552bd 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -7,7 +7,8 @@ makedocs( checkdocs = :none, pages = Any[ "Home" => "index.md", - "Functions" => "api.md" + "Functions" => "api.md", + "Benchmarks" => "benchmarks.md" ], ) diff --git a/docs/src/benchmarks.md b/docs/src/benchmarks.md new file mode 100644 index 0000000..cd8fc34 --- /dev/null +++ b/docs/src/benchmarks.md @@ -0,0 +1,61 @@ +# Benchmarks + +Primes.jl ships a benchmark suite in `benchmarks/` for measuring the performance of +its factorisation algorithms, and comparing the GMP-optimised BigInt code path against +the generic (allocating) implementation. + +## Running Benchmarks + +Install dependencies once: + +```bash +julia --project=benchmarks -e 'using Pkg; Pkg.instantiate()' +``` + +Run all benchmarks: + +```bash +julia --project=benchmarks benchmarks/run_all.jl +``` + +Or run individual suites: + +```bash +julia --project=benchmarks benchmarks/ecm_gmp_vs_generic.jl +julia --project=benchmarks benchmarks/ecm_microbenchmarks.jl +julia --project=benchmarks benchmarks/factorization_endtoend.jl +``` + +## Benchmark Suites + +### ECM: GMP In-Place vs Generic + +`ecm_gmp_vs_generic.jl` compares the two ECM code paths for `BigInt` inputs: + +- **GMP in-place** (`ecm_factor(n::BigInt, ...)`): Uses `Base.GMP.MPZ.*!` functions for + zero-allocation arithmetic in the Montgomery ladder hot loop. +- **Generic** (via `_ecm_scalar_mul`): Functional style that allocates a new `BigInt` for + every modular operation. + +### ECM Micro-Benchmarks + +`ecm_microbenchmarks.jl` measures individual ECM building blocks across four execution +modes: + +| Operation | GMP in-place | Generic BigInt | UInt128 | UInt256 (LLVM) | +|-----------|-------------|----------------|---------|----------------| +| Modular multiply | `_mulmod!` | `_mulmod` | `_mulmod` | `_mulmod` | +| Point addition | `_ecm_add!` | `_ecm_add` | `_ecm_add` | `_ecm_add` | +| Point doubling | `_ecm_double!` | `_ecm_double` | `_ecm_double` | `_ecm_double` | +| Scalar multiply | `_ecm_scalar_mul!` | `_ecm_scalar_mul` | `_ecm_scalar_mul` | `_ecm_scalar_mul` | + +!!! note + `widen(UInt128) == BigInt` in base Julia, so UInt128 still allocates through BigInt. + `widen(UInt256) == UInt512` via BitIntegers.jl, giving pure LLVM fixed-width arithmetic + with zero allocations — but GMP's hand-tuned assembly is faster per-operation. + +### End-to-End Factorisation + +`factorization_endtoend.jl` benchmarks `factor(n)` for semiprimes ranging from 12 to 45+ +digits, showing how the polyalgorithm (trial division → Pollard rho → ECM → MPQS) scales +with input size. diff --git a/src/Primes.jl b/src/Primes.jl index b95fdac..9880b89 100644 --- a/src/Primes.jl +++ b/src/Primes.jl @@ -12,6 +12,7 @@ export isprime, primes, primesmask, factor, eachfactor, divisors, ismersenneprim nextprime, nextprimes, prevprime, prevprimes, prime, prodfactors, radical, totient include("factorization.jl") +include("ecm.jl") # Primes generating functions # https://en.wikipedia.org/wiki/Sieve_of_Eratosthenes diff --git a/src/ecm.jl b/src/ecm.jl new file mode 100644 index 0000000..0aed909 --- /dev/null +++ b/src/ecm.jl @@ -0,0 +1,585 @@ +# Elliptic Curve Method (ECM) for integer factorization +# Ref: Lenstra (1987) "Factoring integers with elliptic curves" +# Ref: Montgomery (1987) "Speeding the Pollard and Elliptic Curve Methods of Factorization" + +# ============================================================================= +# Generic modular arithmetic helpers (work for any T<:Integer, including +# fixed-width types from BitIntegers.jl such as UInt256, UInt512, etc.) +# Uses widen(T) for overflow-safe multiplication; for BitIntegers types, +# widen returns the next fixed-width type (e.g. widen(UInt256) == UInt512), +# so no BigInt allocation occurs. +# ============================================================================= + +# Modular addition: (a + b) mod n, for a, b ∈ [0, n). +# Avoids computing a + b directly, which could overflow for unsigned T +# when n is close to typemax(T). +@inline function _addmod(a::T, b::T, n::T) where {T<:Integer} + a >= n - b ? a - (n - b) : a + b +end + +# Modular subtraction: (a - b) mod n, for a, b ∈ [0, n). +# Written as n - (b - a) when a < b to avoid underflow in unsigned types. +@inline function _submod(a::T, b::T, n::T) where {T<:Integer} + a >= b ? a - b : n - (b - a) +end + +# Modular multiplication: (a * b) mod n. +# Uses widen(T) to avoid overflow for fixed-width integers. +# For BigInt, widen returns BigInt itself, so this is simply mod(a * b, n). +@inline function _mulmod(a::T, b::T, n::T) where {T<:Integer} + mod(a * b, n) +end + +# ============================================================================= +# Shared helpers used by both the generic and BigInt-optimized ECM paths. +# ============================================================================= + +# Precompute prime powers p^k ≤ B1 for each prime p ≤ B1. +function _ecm_prime_powers(B1::Int) + result = Int[] + for p in primes(B1) + pk = p + while pk * p <= B1 + pk *= p + end + push!(result, pk) + end + result +end + +# Suyama's parametrization: generate a random Montgomery curve and initial point. +# Returns (x0, z0, a24) on success, a lucky factor g::T, or nothing for a +# degenerate curve (gcd == n). +function _ecm_suyama(n::T) where {T<:Integer} + σ = T(rand(6:10^9)) + u = _submod(_mulmod(σ, σ, n), T(5), n) + v = _mulmod(T(4), σ, n) + x0 = _mulmod(_mulmod(u, u, n), u, n) + z0 = _mulmod(_mulmod(v, v, n), v, n) + + vu_diff = _submod(v, u, n) + vu_diff3 = _mulmod(_mulmod(vu_diff, vu_diff, n), vu_diff, n) + a24_num = _mulmod(vu_diff3, _addmod(_mulmod(T(3), u, n), v, n), n) + a24_den = _mulmod(_mulmod(T(16), x0, n), v, n) + + g = gcd(a24_den, n) + 1 < g < n && return g + g == n && return nothing + + a24 = _mulmod(a24_num, invmod(a24_den, n), n) + (x0, z0, a24) +end + +# ============================================================================= +# GMP-optimized path for BigInt: in-place arithmetic avoids allocations in +# the hot Montgomery ladder loop, which matters for large-integer factoring. +# ============================================================================= + +# In-place modular reduction: r = n mod d (non-negative remainder). +function _mpz_fdiv_r!(r::BigInt, n::BigInt, d::BigInt) + ccall((:__gmpz_fdiv_r, :libgmp), Cvoid, (Ref{BigInt}, Ref{BigInt}, Ref{BigInt}), r, n, d) +end + +# Preallocated scratch space for the BigInt ECM inner loop. +# t1–t6: scratch for add!/double!; R0/R1/tmp: scratch for scalar_mul! +struct ECMBuffers + t1::BigInt + t2::BigInt + t3::BigInt + t4::BigInt + t5::BigInt + t6::BigInt + R0_X::BigInt + R0_Z::BigInt + R1_X::BigInt + R1_Z::BigInt + tmp_X::BigInt + tmp_Z::BigInt +end + +ECMBuffers() = ECMBuffers(BigInt(), BigInt(), BigInt(), BigInt(), BigInt(), BigInt(), + BigInt(), BigInt(), BigInt(), BigInt(), BigInt(), BigInt()) + +# In-place: dst = (a * b) mod n, using tmp as scratch. +@inline function _mulmod!(dst::BigInt, a::BigInt, b::BigInt, n::BigInt, tmp::BigInt) + Base.GMP.MPZ.mul!(tmp, a, b) + _mpz_fdiv_r!(dst, tmp, n) +end + +# Differential addition on Montgomery curve (BigInt in-place version). +# Given projective P, Q and their difference P-Q (as diff), computes P+Q. +function _ecm_add!(res_X::BigInt, res_Z::BigInt, + P_X::BigInt, P_Z::BigInt, Q_X::BigInt, Q_Z::BigInt, + diff_X::BigInt, diff_Z::BigInt, n::BigInt, buf::ECMBuffers) + t1, t2, t3, t4, t5, t6 = buf.t1, buf.t2, buf.t3, buf.t4, buf.t5, buf.t6 + Base.GMP.MPZ.sub!(t1, P_X, P_Z) + Base.GMP.MPZ.add!(t2, Q_X, Q_Z) + _mulmod!(t5, t1, t2, n, t3) # t5 = (P.X-P.Z)(Q.X+Q.Z) mod n + Base.GMP.MPZ.add!(t1, P_X, P_Z) + Base.GMP.MPZ.sub!(t2, Q_X, Q_Z) + _mulmod!(t6, t1, t2, n, t3) # t6 = (P.X+P.Z)(Q.X-Q.Z) mod n + Base.GMP.MPZ.add!(t1, t5, t6) # t1 = t5 + t6 + Base.GMP.MPZ.sub!(t2, t5, t6) # t2 = t5 - t6 + _mulmod!(t3, t1, t1, n, t4) + _mulmod!(res_X, diff_Z, t3, n, t4) # res_X = diff.Z * (t5+t6)^2 mod n + _mulmod!(t3, t2, t2, n, t4) + _mulmod!(res_Z, diff_X, t3, n, t4) # res_Z = diff.X * (t5-t6)^2 mod n +end + +# Point doubling on Montgomery curve with a24 = (a+2)/4 (BigInt in-place version). +function _ecm_double!(res_X::BigInt, res_Z::BigInt, + P_X::BigInt, P_Z::BigInt, + n::BigInt, a24::BigInt, buf::ECMBuffers) + t1, t2, t3, t4, t5, t6 = buf.t1, buf.t2, buf.t3, buf.t4, buf.t5, buf.t6 + Base.GMP.MPZ.add!(t1, P_X, P_Z) + _mulmod!(t5, t1, t1, n, t3) # t5 = (P.X+P.Z)^2 mod n + Base.GMP.MPZ.sub!(t1, P_X, P_Z) + _mulmod!(t6, t1, t1, n, t3) # t6 = (P.X-P.Z)^2 mod n + Base.GMP.MPZ.sub!(t1, t5, t6) # t1 = t5 - t6 + _mulmod!(res_X, t5, t6, n, t3) # res_X = t5 * t6 mod n + _mulmod!(t2, a24, t1, n, t3) + Base.GMP.MPZ.add!(t2, t6) + _mulmod!(res_Z, t1, t2, n, t3) # res_Z = (t5-t6) * (t6 + a24*(t5-t6)) mod n +end + +# Montgomery ladder scalar multiplication (BigInt in-place version). +function _ecm_scalar_mul!(res_X::BigInt, res_Z::BigInt, + k::Integer, P_X::BigInt, P_Z::BigInt, + n::BigInt, a24::BigInt, buf::ECMBuffers) + R0_X, R0_Z = buf.R0_X, buf.R0_Z + R1_X, R1_Z = buf.R1_X, buf.R1_Z + tmp_X, tmp_Z = buf.tmp_X, buf.tmp_Z + Base.GMP.MPZ.set!(R0_X, P_X) + Base.GMP.MPZ.set!(R0_Z, P_Z) + _ecm_double!(R1_X, R1_Z, P_X, P_Z, n, a24, buf) + bits = ndigits(k, base=2) + for i in (bits - 2):-1:0 + if isodd(k >> i) + _ecm_add!(tmp_X, tmp_Z, R0_X, R0_Z, R1_X, R1_Z, P_X, P_Z, n, buf) + Base.GMP.MPZ.set!(R0_X, tmp_X) + Base.GMP.MPZ.set!(R0_Z, tmp_Z) + _ecm_double!(tmp_X, tmp_Z, R1_X, R1_Z, n, a24, buf) + Base.GMP.MPZ.set!(R1_X, tmp_X) + Base.GMP.MPZ.set!(R1_Z, tmp_Z) + else + _ecm_add!(tmp_X, tmp_Z, R0_X, R0_Z, R1_X, R1_Z, P_X, P_Z, n, buf) + Base.GMP.MPZ.set!(R1_X, tmp_X) + Base.GMP.MPZ.set!(R1_Z, tmp_Z) + _ecm_double!(tmp_X, tmp_Z, R0_X, R0_Z, n, a24, buf) + Base.GMP.MPZ.set!(R0_X, tmp_X) + Base.GMP.MPZ.set!(R0_Z, tmp_Z) + end + end + Base.GMP.MPZ.set!(res_X, R0_X) + Base.GMP.MPZ.set!(res_Z, R0_Z) +end + +# ============================================================================= +# Generic ECM functions (functional style, no mutation). +# Work for any T<:Integer; the compiler specialises for each concrete type. +# For BitIntegers.jl types the widen chain stays in fixed-width LLVM integers, +# making these significantly faster than going through BigInt. +# ============================================================================= + +# Differential addition on Montgomery curve (generic version). +# Given projective P=(PX:PZ), Q=(QX:QZ) and their difference P-Q=(diffX:diffZ), +# returns (res_X, res_Z) = P+Q. +function _ecm_add(PX::T, PZ::T, QX::T, QZ::T, + diffX::T, diffZ::T, n::T) where {T<:Integer} + u = _mulmod(_submod(PX, PZ, n), _addmod(QX, QZ, n), n) + v = _mulmod(_addmod(PX, PZ, n), _submod(QX, QZ, n), n) + ad = _addmod(u, v, n) + sb = _submod(u, v, n) + resX = _mulmod(diffZ, _mulmod(ad, ad, n), n) + resZ = _mulmod(diffX, _mulmod(sb, sb, n), n) + resX, resZ +end + +# Point doubling on Montgomery curve with a24 = (a+2)/4 (generic version). +# Returns (res_X, res_Z). +function _ecm_double(PX::T, PZ::T, n::T, a24::T) where {T<:Integer} + s = _addmod(PX, PZ, n) + d = _submod(PX, PZ, n) + u = _mulmod(s, s, n) + v = _mulmod(d, d, n) + diff = _submod(u, v, n) + resX = _mulmod(u, v, n) + resZ = _mulmod(diff, _addmod(v, _mulmod(a24, diff, n), n), n) + resX, resZ +end + +# Montgomery ladder scalar multiplication (generic version). +# Computes [k]P on the Montgomery curve; k is a small integer (prime power ≤ B1). +# Returns (res_X, res_Z). +function _ecm_scalar_mul(k::Integer, PX::T, PZ::T, n::T, a24::T) where {T<:Integer} + R0X, R0Z = PX, PZ + R1X, R1Z = _ecm_double(PX, PZ, n, a24) + nbits = ndigits(k, base=2) + for i in (nbits - 2):-1:0 + if isodd(k >> i) + tmpX, tmpZ = _ecm_add(R0X, R0Z, R1X, R1Z, PX, PZ, n) + R0X, R0Z = tmpX, tmpZ + R1X, R1Z = _ecm_double(R1X, R1Z, n, a24) + else + tmpX, tmpZ = _ecm_add(R0X, R0Z, R1X, R1Z, PX, PZ, n) + R1X, R1Z = tmpX, tmpZ + R0X, R0Z = _ecm_double(R0X, R0Z, n, a24) + end + end + R0X, R0Z +end + +# ============================================================================= +# Stage 2: standard continuation (baby-step giant-step). +# +# After Stage 1 computes Q = [∏ p^k ≤ B1] P, Stage 2 checks whether the +# group order has exactly one prime factor p in (B1, B2]. Uses a stride D +# and precomputed baby-step table [d]Q for d ∈ {1,3,5,...,D-1} with gcd(d,6)=1 +# to cover all primes in the interval with only differential additions. +# +# Ref: Montgomery (1987) §10, Brent (1986) "Some integer factorization +# algorithms using elliptic curves". +# ============================================================================= + +# Baby-step offsets: odd values d with 1 ≤ d < D and gcd(d, 6) == 1. +function _stage2_baby_offsets(D::Int) + [d for d in 1:2:(D-1) if gcd(d, 6) == 1] +end + +# Generic Stage 2 continuation. +# Returns a non-trivial factor of n, or nothing. +function _ecm_stage2(QX::T, QZ::T, n::T, a24::T, B1::Int, B2::Int) where {T<:Integer} + D = 2 * isqrt(B2 - B1 + 1) + D = max(D, 6) + # Round D up to next multiple of 6 for alignment with prime residues. + D = D + (6 - mod(D, 6)) % 6 + + offsets = _stage2_baby_offsets(D) + isempty(offsets) && return nothing + + # Precompute baby-step table: baby[d] = [d]Q for each offset d. + # Also need [2]Q for building the table via differential addition chain. + S2X, S2Z = _ecm_double(QX, QZ, n, a24) # [2]Q + + # Build multiples of Q: muls[k] = [k]Q for k = 1, 2, ..., max(D-1, D) + # We only need odd multiples with gcd(d,6)==1, but building a full table + # up to D is simpler and the table is small. + max_d = maximum(offsets) + # mulX[k], mulZ[k] = [k]Q + mulX = Vector{T}(undef, max_d) + mulZ = Vector{T}(undef, max_d) + mulX[1], mulZ[1] = QX, QZ + if max_d >= 2 + mulX[2], mulZ[2] = S2X, S2Z + end + for k in 3:max_d + # [k]Q = [k-1]Q + [1]Q, diff = [k-2]Q + mulX[k], mulZ[k] = _ecm_add(mulX[k-1], mulZ[k-1], QX, QZ, + mulX[k-2], mulZ[k-2], n) + end + + # Precompute [D]Q for giant steps. + # Build [D]Q using the ladder. + DX, DZ = _ecm_scalar_mul(D, QX, QZ, n, a24) + + # Giant step: start at the first multiple of D above B1. + # For each giant step center c = j*D, check primes c ± d for d ∈ offsets. + j_start = div(B1, D) + 1 + j_end = div(B2, D) + 1 + + # Compute R = [j_start * D]Q + RX, RZ = _ecm_scalar_mul(j_start * D, QX, QZ, n, a24) + + # We also need the previous giant step for differential addition: + # Rprev = [(j_start - 1) * D]Q + RprevX, RprevZ = _ecm_scalar_mul((j_start - 1) * D, QX, QZ, n, a24) + + acc = one(T) + batch_count = 0 + + for j in j_start:j_end + c = j * D + for d in offsets + p_plus = c + d + p_minus = c - d + # Only check values in (B1, B2]. + if B1 < p_plus <= B2 + # The factor reveals itself when [p]Q = O, i.e. Z = 0. + # With R = [c]Q, baby = [d]Q: + # [c+d]Q and [c-d]Q share the property that their Z-coords + # vanish iff c±d divides the group order. + # The key identity: for Montgomery curves, + # X([c]Q) * Z([d]Q) - Z([c]Q) * X([d]Q) = 0 + # iff [c-d]Q or [c+d]Q is the point at infinity. + # We accumulate: acc *= (R_X * baby_Z - R_Z * baby_X) + val = _submod(_mulmod(RX, mulZ[d], n), _mulmod(RZ, mulX[d], n), n) + acc = _mulmod(acc, val, n) + batch_count += 1 + end + if B1 < p_minus <= B2 && p_minus != p_plus + val = _submod(_mulmod(RX, mulZ[d], n), _mulmod(RZ, mulX[d], n), n) + # For c-d we use the same identity (same expression). + acc = _mulmod(acc, val, n) + batch_count += 1 + end + end + + if batch_count >= 100 + g = gcd(acc, n) + 1 < g < n && return g + g == n && return nothing # degenerate + acc = one(T) + batch_count = 0 + end + + # Advance giant step: R_next = R + [D]Q, diff = R_prev + if j < j_end + RnextX, RnextZ = _ecm_add(RX, RZ, DX, DZ, RprevX, RprevZ, n) + RprevX, RprevZ = RX, RZ + RX, RZ = RnextX, RnextZ + end + end + + # Final GCD check + if batch_count > 0 + g = gcd(acc, n) + 1 < g < n && return g + end + return nothing +end + +# BigInt GMP in-place Stage 2 continuation. +function _ecm_stage2!(Q_X::BigInt, Q_Z::BigInt, n::BigInt, a24::BigInt, + B1::Int, B2::Int, buf::ECMBuffers)::Union{BigInt, Nothing} + D = 2 * isqrt(B2 - B1 + 1) + D = max(D, 6) + D = D + (6 - mod(D, 6)) % 6 + + offsets = _stage2_baby_offsets(D) + isempty(offsets) && return nothing + + max_d = maximum(offsets) + + # Precompute baby-step table (vectors of BigInt). + mulX = [BigInt() for _ in 1:max_d] + mulZ = [BigInt() for _ in 1:max_d] + Base.GMP.MPZ.set!(mulX[1], Q_X) + Base.GMP.MPZ.set!(mulZ[1], Q_Z) + + # [2]Q + S2X, S2Z = BigInt(), BigInt() + _ecm_double!(S2X, S2Z, Q_X, Q_Z, n, a24, buf) + if max_d >= 2 + Base.GMP.MPZ.set!(mulX[2], S2X) + Base.GMP.MPZ.set!(mulZ[2], S2Z) + end + for k in 3:max_d + _ecm_add!(mulX[k], mulZ[k], mulX[k-1], mulZ[k-1], Q_X, Q_Z, + mulX[k-2], mulZ[k-2], n, buf) + end + + # [D]Q via scalar multiplication + DX, DZ = BigInt(), BigInt() + _ecm_scalar_mul!(DX, DZ, D, Q_X, Q_Z, n, a24, buf) + + # Giant step state + j_start = div(B1, D) + 1 + j_end = div(B2, D) + 1 + + RX, RZ = BigInt(), BigInt() + _ecm_scalar_mul!(RX, RZ, j_start * D, Q_X, Q_Z, n, a24, buf) + + RprevX, RprevZ = BigInt(), BigInt() + _ecm_scalar_mul!(RprevX, RprevZ, (j_start - 1) * D, Q_X, Q_Z, n, a24, buf) + + RnextX, RnextZ = BigInt(), BigInt() + acc = BigInt(1) + tmp_mul = BigInt() + tmp1 = BigInt() + tmp2 = BigInt() + batch_count = 0 + + for j in j_start:j_end + c = j * D + for d in offsets + p_plus = c + d + p_minus = c - d + if B1 < p_plus <= B2 + # val = R_X * baby_Z - R_Z * baby_X (mod n) + _mulmod!(tmp1, RX, mulZ[d], n, tmp_mul) + _mulmod!(tmp2, RZ, mulX[d], n, tmp_mul) + Base.GMP.MPZ.sub!(tmp1, tmp2) + Base.GMP.MPZ.mul!(tmp_mul, acc, tmp1) + _mpz_fdiv_r!(acc, tmp_mul, n) + batch_count += 1 + end + if B1 < p_minus <= B2 && p_minus != p_plus + _mulmod!(tmp1, RX, mulZ[d], n, tmp_mul) + _mulmod!(tmp2, RZ, mulX[d], n, tmp_mul) + Base.GMP.MPZ.sub!(tmp1, tmp2) + Base.GMP.MPZ.mul!(tmp_mul, acc, tmp1) + _mpz_fdiv_r!(acc, tmp_mul, n) + batch_count += 1 + end + end + + if batch_count >= 100 + g = gcd(acc, n) + 1 < g < n && return g + g == n && return nothing + Base.GMP.MPZ.set_si!(acc, 1) + batch_count = 0 + end + + if j < j_end + _ecm_add!(RnextX, RnextZ, RX, RZ, DX, DZ, RprevX, RprevZ, n, buf) + Base.GMP.MPZ.set!(RprevX, RX) + Base.GMP.MPZ.set!(RprevZ, RZ) + Base.GMP.MPZ.set!(RX, RnextX) + Base.GMP.MPZ.set!(RZ, RnextZ) + end + end + + if batch_count > 0 + g = gcd(acc, n) + 1 < g < n && return g + end + return nothing +end + +# ============================================================================= +# Public API +# ============================================================================= + +""" + ecm_factor(n::T, B1::Int, num_curves::Int) where {T<:Integer} -> Union{T, Nothing} + +Attempt to find a non-trivial factor of `n` using the Elliptic Curve Method (ECM) +with Stage 1 (bound `B1`) and Stage 2 continuation (bound `B2 = 100 * B1`). + +Uses Montgomery curves with Suyama's parametrization and a batched GCD to reduce +the number of `gcd` calls in the inner loop. Stage 2 uses a baby-step giant-step +approach to efficiently check primes in `(B1, B2]`. + +This generic method works for any integer type `T`, including fixed-width types from +[BitIntegers.jl](https://github.com/rfourquet/BitIntegers.jl) (e.g. `UInt256`, `UInt512`). +For those types modular multiplication stays within fixed-width LLVM integers via +`widen(T)`, avoiding BigInt allocation entirely. + +Returns a non-trivial factor of `n`, or `nothing` if none is found within the budget. + +See also the `BigInt`-optimised overload which uses in-place GMP arithmetic. +""" +function ecm_factor(n::T, B1::Int, num_curves::Int) where {T<:Integer} + prime_powers = _ecm_prime_powers(B1) + B2 = 100 * B1 + + for _ in 1:num_curves + curve = _ecm_suyama(n) + curve === nothing && continue + curve isa Tuple || return curve # lucky factor from gcd + x0, z0, a24 = curve + + QX, QZ = x0, z0 + + # Stage 1: multiply Q by each prime power, batching the GCD checks. + degenerate = false + acc = one(T) + batch_count = 0 + for pk in prime_powers + QX, QZ = _ecm_scalar_mul(pk, QX, QZ, n, a24) + acc = _mulmod(acc, QZ, n) + batch_count += 1 + if batch_count >= 100 + g = gcd(acc, n) + if 1 < g < n + return g + end + if g == n + degenerate = true + break + end + acc = one(T) + batch_count = 0 + end + end + + degenerate && continue + + if batch_count > 0 + g = gcd(acc, n) + 1 < g < n && return g + g == n && continue + end + + # Stage 2: standard continuation for primes in (B1, B2]. + s2 = _ecm_stage2(QX, QZ, n, a24, B1, B2) + s2 !== nothing && return s2 + end + return nothing +end + +""" + ecm_factor(n::BigInt, B1::Int, num_curves::Int) -> Union{BigInt, Nothing} + +BigInt-optimised ECM using in-place GMP arithmetic to minimise allocations in the +hot Montgomery ladder loop. Includes Stage 2 continuation (`B2 = 100 * B1`) using +baby-step giant-step. For fixed-width integer types use the generic method. + +Returns a non-trivial factor of `n`, or `nothing` if none is found within the budget. +""" +function ecm_factor(n::BigInt, B1::Int, num_curves::Int)::Union{BigInt, Nothing} + prime_powers = _ecm_prime_powers(B1) + B2 = 100 * B1 + + buf = ECMBuffers() + Q_X = BigInt() + Q_Z = BigInt() + tmp_mul = BigInt() + + for _ in 1:num_curves + curve = _ecm_suyama(n) + curve === nothing && continue + curve isa Tuple || return curve # lucky factor from gcd + x0, z0, a24 = curve + + Base.GMP.MPZ.set!(Q_X, x0) + Base.GMP.MPZ.set!(Q_Z, z0) + + # Stage 1: multiply Q by each prime power, with batched GCD. + degenerate = false + acc = BigInt(1) + batch_count = 0 + for pk in prime_powers + _ecm_scalar_mul!(Q_X, Q_Z, pk, Q_X, Q_Z, n, a24, buf) + Base.GMP.MPZ.mul!(tmp_mul, acc, Q_Z) + _mpz_fdiv_r!(acc, tmp_mul, n) + batch_count += 1 + if batch_count >= 100 + g = gcd(acc, n) + if 1 < g < n + return g + end + if g == n + degenerate = true + break + end + Base.GMP.MPZ.set_si!(acc, 1) + batch_count = 0 + end + end + + degenerate && continue + + if batch_count > 0 + g = gcd(acc, n) + 1 < g < n && return g + g == n && continue + end + + # Stage 2: standard continuation for primes in (B1, B2]. + s2 = _ecm_stage2!(Q_X, Q_Z, n, a24, B1, B2, buf) + s2 !== nothing && return s2 + end + return nothing +end diff --git a/test/runtests.jl b/test/runtests.jl index f63e59a..e00ebcf 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -511,3 +511,168 @@ end end @test primes(2^31-20, 2^31-1) == [2147483629, 2147483647] end + +@testset "ECM factorization" begin + # Semi-prime with a ~40-bit factor + p1 = big"824633720831" # 40-bit prime + q1 = big"1000000007" # 30-bit prime + n1 = p1 * q1 + f1 = factor(n1) + @test prodfactors(f1) == n1 + @test all(isprime(p) for (p, _) in f1) + + # p ~40 bits, q ~80 bits + p2 = big"824633720831" + q2 = big"1180591620717411303449" # ~80-bit prime + n2 = p2 * q2 + f2 = factor(n2) + @test prodfactors(f2) == n2 + @test all(isprime(p) for (p, _) in f2) +end + +@testset "MPQS factorization" begin + # Semi-prime with two ~60-bit factors + p1 = big"780002082420426809" + q1 = big"810735269523504809437013569" + n1 = p1 * q1 + f1 = factor(n1) + @test prodfactors(f1) == n1 + @test all(isprime(p) for (p, _) in f1) +end + +@testset "BigInt factorization" begin + # factor(BigInt(n)) returns Factorization{BigInt} + n1 = big"2152302898747" + f1 = factor(n1) + @test f1 isa Primes.Factorization{BigInt} + @test prodfactors(f1) == n1 + + # BigInt prime returns single-entry + p = big"359334085968622831041960188598043661065388726959079837" + f2 = factor(p) + @test length(f2) == 1 + @test first(f2) == (p => 1) +end + +@testset "ECM edge cases" begin + # ECM with a small semi-prime (BigInt path — GMP in-place) + n = big"1000000007" * big"1000000009" + result = Primes.ecm_factor(n, 2000, 50) + @test result !== nothing + @test mod(n, result) == 0 + + # ECM returns nothing when curves are insufficient for a hard number + hard = nextprime(big"10"^30) * nextprime(big"10"^30 + 1000) + result2 = Primes.ecm_factor(hard, 100, 1) + # May or may not find a factor with just 1 curve — no assertion on result +end + +@testset "ECM Stage 2" begin + # Stage 2 should improve success rate for low B1 values where Stage 1 + # alone frequently fails. With B1=200 and B2=100*B1, the continuation + # catches group orders with one large prime factor in (B1, B2]. + + n = big"824633720831" * big"1000000007" + + # With B1=200 and many curves, Stage 2 should find the factor reliably. + # (Stage 1 alone at B1=200 finds it only ~8% of the time per 5-curve attempt.) + result = Primes.ecm_factor(n, 200, 50) + @test result !== nothing + @test mod(n, result) == 0 + + # BigInt path: verify GMP in-place Stage 2 works + n_big = big"1000000007" * big"1000000009" + result_big = Primes.ecm_factor(n_big, 200, 50) + @test result_big !== nothing + @test mod(n_big, result_big) == 0 + + # Generic path: UInt128 + n_u128 = UInt128(1_000_000_007) * UInt128(1_000_000_009) + result_u128 = Primes.ecm_factor(n_u128, 200, 50) + @test result_u128 !== nothing + @test mod(n_u128, result_u128) == 0 + @test result_u128 isa UInt128 +end + +@testset "ECM Stage 2 finds factors Stage 1 misses" begin + # Run Stage 1 alone on many curves with low B1. Collect curves where + # Stage 1 failed (gcd == 1), then run Stage 2 on those and verify it + # finds at least one factor that Stage 1 could not. + n = big"824633720831" * big"1000000007" + B1 = 200 + B2 = 100 * B1 + pp = Primes._ecm_prime_powers(B1) + + stage1_only_found = 0 + stage2_extra_found = 0 + curves_tested = 0 + stage1_missed = 0 + + for _ in 1:200 + curve = Primes._ecm_suyama(n) + curve === nothing && continue + !(curve isa Tuple) && continue + x0, z0, a24 = curve + curves_tested += 1 + + # Run Stage 1 manually + QX, QZ = x0, z0 + for pk in pp + QX, QZ = Primes._ecm_scalar_mul(pk, QX, QZ, n, a24) + end + g1 = gcd(QZ, n) + if 1 < g1 < n + stage1_only_found += 1 + continue + end + + # Stage 1 did not find a factor on this curve + stage1_missed += 1 + + # Now run Stage 2 on the same post-Stage-1 point + s2 = Primes._ecm_stage2(QX, QZ, n, a24, B1, B2) + if s2 !== nothing && 1 < s2 < n + stage2_extra_found += 1 + @test mod(n, s2) == 0 + end + end + + # With B1=200 on this number, Stage 1 finds ~8% of curves. + # Stage 2 should rescue additional curves, proving it adds value. + @test curves_tested >= 100 # enough curves for a meaningful test + @test stage1_missed > 0 # Stage 1 didn't find everything + @test stage2_extra_found > 0 # Stage 2 found at least one that Stage 1 missed + @test stage2_extra_found > stage1_only_found # Stage 2 rescues more than Stage 1 found +end + +@testset "ECM generic integer types" begin + # UInt128: widen(UInt128) == BigInt in base Julia — exercises the generic + # _mulmod/_addmod/_submod helpers via the BigInt widen path. + p = UInt128(1_000_000_007) * UInt128(999_999_937) + q = UInt128(1_000_000_009) + n = p * q + result = Primes.ecm_factor(n, 2000, 50) + @test result !== nothing + @test mod(n, result) == 0 + @test result isa UInt128 + + # UInt256 / UInt512 from BitIntegers.jl: widen stays in LLVM fixed-width + # integers (UInt256 → UInt512), so no BigInt allocation in the inner loop. + using BitIntegers + p256 = UInt256(1_000_000_007) * UInt256(999_999_937) + q256 = UInt256(1_000_000_009) + n256 = p256 * q256 + r256 = Primes.ecm_factor(n256, 2000, 50) + @test r256 !== nothing + @test mod(n256, r256) == 0 + @test r256 isa UInt256 + + # Int256 (signed BitIntegers type) + p256s = Int256(1_000_000_007) * Int256(999_999_937) + q256s = Int256(1_000_000_009) + n256s = p256s * q256s + r256s = Primes.ecm_factor(n256s, 2000, 50) + @test r256s !== nothing + @test mod(n256s, r256s) == 0 + @test r256s isa Int256 +end