feat(MachineLearning/PAC): PAC Learning Definitions and Sample Complexity Lower Bounds #483
Changes from all commits: 525a336, 8b68590, 321a1c3, a327384, d58580c, 0282b8e, bd24357, a803850
@@ -0,0 +1,178 @@ (new file)
/-
Copyright (c) 2026 Samuel Schlesinger. All rights reserved.
Released under Apache 2.0 license as described in the file LICENSE.
Authors: Samuel Schlesinger
-/

module

public import Cslib.Init
public import Mathlib.MeasureTheory.Measure.MeasureSpace
public import Mathlib.MeasureTheory.Constructions.Pi
public import Mathlib.Order.SymmDiff

@[expose] public section
/-! # PAC Learning

This file defines the Probably Approximately Correct (PAC) learning model
introduced by Valiant [Valiant1984]. A concept class `C` over a domain `α` is a
collection of subsets of `α`. A learning algorithm receives a labeled sample
drawn i.i.d. from an unknown distribution and must produce a hypothesis that,
with high probability, has low error with respect to the true concept.

## Main definitions

- `ConceptClass`: a concept class over domain `α`, i.e., a set of subsets.
- `LabeledSample`: a finite sequence of (point, label) pairs.
- `sampleOf`: constructs a labeled sample from a sequence of points and a concept.
- `hypothesisError`: the total error of a hypothesis with respect to a concept under a
  distribution, defined as the measure of the symmetric difference.
- `falsePositiveError`: the false positive error `P(h \ c)`.
- `falseNegativeError`: the false negative error `P(c \ h)`.
- `Learner`: a function from labeled samples to hypotheses.
- `IsPACLearner`: the property that a deterministic learner produces a hypothesis
  with error at most `ε` with probability at least `1 - δ`, for every distribution
  and concept from the class.
- `IsRPACLearner`: the randomized variant, where the learner draws internal
  randomness from a probability space `(Ω, Q)`.
- `sampleComplexity`: the smallest sample size admitting a deterministic PAC learner.
- `rsampleComplexity`: the smallest sample size admitting a randomized PAC learner.

## Main statements

- `IsPACLearner.toIsRPACLearner`: every deterministic PAC learner is in particular
  a randomized PAC learner (with the trivial randomness space `PUnit`).
- `hypothesisError_eq_add`: total error decomposes as the sum of false positive and
  false negative errors.

## References

* [L. G. Valiant, *A Theory of the Learnable*][Valiant1984]
* [A. Ehrenfeucht, D. Haussler, M. Kearns, L. Valiant,
  *A General Lower Bound on the Number of Examples Needed for Learning*][EHKV1989]
-/
open MeasureTheory Set
open scoped ENNReal

namespace Cslib.MachineLearning

/-- A *concept class* over domain `α` is a collection of subsets of `α`. Each subset represents
a concept (i.e., a binary classifier). -/
abbrev ConceptClass (α : Type*) := Set (Set α)

/-- A *labeled sample* of size `m` over domain `α` is a sequence of `(point, label)` pairs. -/
abbrev LabeledSample (α : Type*) (m : ℕ) := Fin m → (α × Bool)
Contributor:
> You don't need a vector here. Labelled samples are traditionally defined as sets. Their order is only important in online learning. Even in that case, lists are better.

Author:
> The indexed-function representation …
> I might not have as much experience as you with these APIs, so if you could share an alternative definition which works more cleanly, I am happy to integrate it.

Contributor:
> Lists would be better than Sets. You can use https://leanprover-community.github.io/mathlib4_docs/find/?pattern=MeasureTheory.Measure.tprod#doc

Author:
> tprod constructs a product measure over a List of measurable spaces; I think it ends up being uglier than the product construction here.
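To make the representation concrete, here is a minimal sketch (not part of the diff) of a size-2 labeled sample over `ℕ`, using Mathlib's `![...]` vector notation (this assumes `Mathlib.Data.Fin.VecNotation` is in scope):

```lean
-- A hypothetical size-2 labeled sample: the point 0 labeled `true` and the
-- point 3 labeled `false`. `![...]` builds a function `Fin 2 → ℕ × Bool`.
example : LabeledSample ℕ 2 := ![(0, true), (3, false)]
```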
open Classical in
/-- Construct a labeled sample from a sequence of points `xs` and a concept `c`.
Each point is labeled `true` if it belongs to the concept and `false` otherwise. -/
noncomputable def sampleOf {α : Type*} {m : ℕ} (c : Set α) (xs : Fin m → α) :
    LabeledSample α m :=
  fun i => (xs i, decide (xs i ∈ c))
Contributor:
> If you still want to use Finvecs (which I think are not the best idea here), you should use FinVec.map. If you switch to lists, then …

Author:
> It would just add a `FinVec.map` invocation.

Contributor:
> "It would just add a FinVec.map invocation" => Yes, here are API lemmas for FinVec.map. Another point: I think you are using classical PAC learning as the basis. In the interests of generality one must use agnostic PAC and specialize it to classical PAC (where the target concept comes from the concept class) and noisy PAC.

Author:
> What do you mean?
>
> Is this really necessary for a first PR about this subject, to do the most general thing, or can we prove things about the simpler definitions to start and then generalize later? Given that they should provably reduce to these definitions in their specialized case, it should be simple to do later on.

Contributor:
> CSLib inherits the mathlib tradition of "let's define it in the greatest possible generality". This allows the greatest possible extent of reuse of the same definition in many places. So generalisation where possible is very important.

Author:
> I believe we shouldn't preclude the more general definitions, but we don't need to include them right away. For me, a comparable instance is Lambda Calculus. We could have defined a Pure Type System right away and done things in full generality, but that isn't how it went AFAICT.

Contributor:
> That's a nice idea in general, but in Lean the library-building philosophy is to build a unified library. This has a long history in the Lean community at this point, and is a deeply entrenched principle for good reasons. So it is not for me to debate. You could raise it on Zulip if you like. This particular definition is not maximally general and will result in unnecessary API duplication.

Author:
> Alright, I'll give a more general definition a go in the definitions PR. I'll split that off tonight.
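As a quick sanity check on `sampleOf` (a sketch, not part of the PR), the produced label agrees with membership in the concept; `decide_eq_true_eq` is a `simp` lemma, so unfolding the definition should suffice:

```lean
-- The label computed by `sampleOf` is `true` exactly on points of the concept.
example {α : Type*} {m : ℕ} (c : Set α) (xs : Fin m → α) (i : Fin m) :
    (sampleOf c xs i).2 = true ↔ xs i ∈ c := by
  simp [sampleOf]
```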
/-- The *error* of a hypothesis `h` with respect to a target concept `c` under distribution `P`,
defined as the measure of their symmetric difference `h ∆ c`. -/
noncomputable def hypothesisError {α : Type*} [MeasurableSpace α] (P : Measure α)
    (h c : Set α) : ℝ≥0∞ :=
  P (symmDiff h c)

Contributor:
> You want to differentiate between positive and negative errors (false positives and false negatives, respectively), and then define error as the combination of them. This will matter in some problems, even textbook exercises.

Author:
> This is a great idea, I will do this.

Author:
> Done in the last commit.
/-- The *false positive error* of a hypothesis `h` with respect to a target concept `c`
under distribution `P`, defined as the measure of `h \ c` — points classified positive
but not in the concept. -/
noncomputable def falsePositiveError {α : Type*} [MeasurableSpace α] (P : Measure α)
    (h c : Set α) : ℝ≥0∞ :=
  P (h \ c)

/-- The *false negative error* of a hypothesis `h` with respect to a target concept `c`
under distribution `P`, defined as the measure of `c \ h` — points in the concept but
classified negative. -/
noncomputable def falseNegativeError {α : Type*} [MeasurableSpace α] (P : Measure α)
    (h c : Set α) : ℝ≥0∞ :=
  P (c \ h)

/-- The total hypothesis error decomposes as the sum of false positive and false negative
errors, since `h ∆ c = (h \ c) ∪ (c \ h)` is a disjoint union. -/
theorem hypothesisError_eq_add {α : Type*} [MeasurableSpace α] {P : Measure α}
    {h c : Set α} (hh : MeasurableSet h) (hc : MeasurableSet c) :
    hypothesisError P h c = falsePositiveError P h c + falseNegativeError P h c := by
  simp only [hypothesisError, falsePositiveError, falseNegativeError, symmDiff_def, sup_eq_union]
  exact measure_union disjoint_sdiff_sdiff (hc.diff hh)
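A small companion fact (a hedged sketch, not in the diff): the total error is symmetric in `h` and `c`, since the symmetric difference is commutative:

```lean
-- Swapping hypothesis and concept leaves the total error unchanged,
-- by commutativity of `symmDiff`.
example {α : Type*} [MeasurableSpace α] (P : Measure α) (h c : Set α) :
    hypothesisError P h c = hypothesisError P c h := by
  unfold hypothesisError
  rw [symmDiff_comm]
```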
/-- A learner using `m` samples is a function that takes a labeled sample of size `m` and produces
a hypothesis (a subset of the domain). -/
abbrev Learner (α : Type*) (m : ℕ) := LabeledSample α m → Set α

variable {α : Type*} [MeasurableSpace α]
/-- `IsPACLearner m ε δ C` asserts that there exists a learner using `m` samples that is
`(ε, δ)`-correct for the concept class `C`: for every probability measure `P` on `α` and every
target concept `c ∈ C`, the probability (over i.i.d. samples from `P`) that the learner's
hypothesis has error greater than `ε` is at most `δ`.

More precisely, we require that the set of sample vectors whose induced hypothesis has error
exceeding `ε` has `P^m`-measure at most `δ`. -/
def IsPACLearner (m : ℕ) (ε δ : ℝ≥0∞) (C : ConceptClass α) : Prop :=
  ∃ A : Learner α m,
    ∀ (P : Measure α) [IsProbabilityMeasure P],
      ∀ c ∈ C,
        (Measure.pi (fun _ : Fin m => P))
          {xs : Fin m → α | hypothesisError P (A (sampleOf c xs)) c > ε} ≤ δ
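A minimal sketch of the definition in action (hypothetical, not part of the PR): for a singleton class `{c}`, the constant learner that always outputs `c` is `(ε, δ)`-PAC for every `ε` and `δ`, because its failure set is empty:

```lean
-- The constant learner `fun _ => c` PAC-learns the singleton class `{c}`:
-- its hypothesis has zero error, so no sample vector is "bad".
example (c : Set α) (m : ℕ) (ε δ : ℝ≥0∞) : IsPACLearner m ε δ {c} := by
  refine ⟨fun _ => c, fun P _ c' hc' => ?_⟩
  have hcc : c' = c := Set.eq_of_mem_singleton hc'
  -- the hypothesis equals the target, so the error is `P (c ∆ c) = 0`
  have hzero : hypothesisError P c c' = 0 := by
    simp [hypothesisError, hcc]
  -- the failure set `{xs | 0 > ε}` is empty, and `0 ≤ δ` is trivial
  simp [hzero]
```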
/-- `IsRPACLearner m ε δ C` asserts that there exists a *randomized* learner using `m` samples
that is `(ε, δ)`-correct for the concept class `C`. A randomized learner draws internal
randomness `ω` from a probability space `(Ω, Q)` and then acts as the deterministic learner
`A(ω)`.

For every probability measure `P` on `α` and every target concept `c ∈ C`, the failure
probability function `ω ↦ P^m{xs | error(A(ω)(xs), c) > ε}` must be `Q`-a.e. measurable,
and its expectation over `ω` must be at most `δ`.

A deterministic PAC learner (`IsPACLearner`) is the special case `Ω = PUnit`;
see `IsPACLearner.toIsRPACLearner`. -/
def IsRPACLearner (m : ℕ) (ε δ : ℝ≥0∞) (C : ConceptClass α) : Prop :=
  ∃ (Ω : Type*) (_ : MeasurableSpace Ω) (Q : Measure Ω) (_ : IsProbabilityMeasure Q)
    (A : Ω → Learner α m),
    ∀ (P : Measure α) [IsProbabilityMeasure P],
      ∀ c ∈ C,
        AEMeasurable (fun ω => (Measure.pi (fun _ : Fin m => P))
          {xs : Fin m → α | hypothesisError P ((A ω) (sampleOf c xs)) c > ε}) Q ∧
        ∫⁻ ω, (Measure.pi (fun _ : Fin m => P))
          {xs : Fin m → α | hypothesisError P ((A ω) (sampleOf c xs)) c > ε} ∂Q ≤ δ
/-- Every deterministic PAC learner is in particular a randomized PAC learner
(with the trivial one-point randomness space `PUnit`). -/
theorem IsPACLearner.toIsRPACLearner {m : ℕ} {ε δ : ℝ≥0∞} {C : ConceptClass α}
    (h : IsPACLearner m ε δ C) : IsRPACLearner m ε δ C := by
  obtain ⟨A, hA⟩ := h
  refine ⟨PUnit, inferInstance, Measure.dirac PUnit.unit, inferInstance, fun _ => A, ?_⟩
  intro P _ c hc
  exact ⟨measurable_const.aemeasurable, by
    simp only [gt_iff_lt, lintegral_const, measure_univ, mul_one]; exact hA P c hc⟩
/-- The *deterministic sample complexity* of a concept class `C` at accuracy `ε` and confidence `δ`
is the smallest sample size `m` such that a deterministic `(ε, δ)`-PAC learner for `C` exists
using `m` samples. See also `rsampleComplexity` for the randomized variant.

**Caveat**: because `sInf` on `ℕ` returns `0` for the empty set, this definition returns `0` when
no deterministic learner exists (e.g., when `C` has infinite VC dimension). It is only meaningful
when the defining set `{m | IsPACLearner m ε δ C}` is nonempty. -/
noncomputable def sampleComplexity (C : ConceptClass α) (ε δ : ℝ≥0∞) : ℕ :=
  sInf {m : ℕ | IsPACLearner m ε δ C}
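The caveat mirrors `Nat.sInf_empty` in Mathlib; a one-line illustration (a sketch, not in the diff):

```lean
-- `sInf` on `ℕ` sends the empty set to `0`, which is why an unlearnable class
-- receives sample complexity `0` under this definition.
example : sInf (∅ : Set ℕ) = 0 := Nat.sInf_empty
```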
/-- The *randomized sample complexity* of a concept class `C` at accuracy `ε` and confidence `δ`
is the smallest sample size `m` such that a randomized `(ε, δ)`-PAC learner for `C` exists
using `m` samples. This is at most `sampleComplexity C ε δ` since every deterministic learner
is also a randomized learner (see `IsPACLearner.toIsRPACLearner`).

The universe of the randomness space `Ω` is pinned to `Type 0` (via `.{_, 0}`) so that the
`sInf` is taken over a definite set; without the pin the existential quantifier over `Ω : Type*`
would range over all universe levels, making the set ill-defined.

**Caveat**: because `sInf` on `ℕ` returns `0` for the empty set, this definition returns `0` when
no randomized learner exists. It is only meaningful when the defining set is nonempty. -/
noncomputable def rsampleComplexity (C : ConceptClass α) (ε δ : ℝ≥0∞) : ℕ :=
  sInf {m : ℕ | IsRPACLearner.{_, 0} m ε δ C}
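The comparison claimed in the docstring can be made precise; a hedged sketch (not part of the diff), assuming the set of deterministic learners is nonempty:

```lean
-- Randomized sample complexity never exceeds the deterministic one:
-- the deterministic minimizer is itself a randomized learner.
example (C : ConceptClass α) (ε δ : ℝ≥0∞)
    (hne : {m : ℕ | IsPACLearner m ε δ C}.Nonempty) :
    rsampleComplexity C ε δ ≤ sampleComplexity C ε δ := by
  have hmem : IsPACLearner (sampleComplexity C ε δ) ε δ C := Nat.sInf_mem hne
  exact Nat.sInf_le (IsPACLearner.toIsRPACLearner.{_, 0} hmem)
```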
end Cslib.MachineLearning
@@ -0,0 +1,191 @@ (new file)
/-
Copyright (c) 2026 Samuel Schlesinger. All rights reserved.
Released under Apache 2.0 license as described in the file LICENSE.
Authors: Samuel Schlesinger
-/

module

public import Cslib.MachineLearning.PACLearning.SampleComplexityLower.EHKVProof

@[expose] public section

/-! # Sample Complexity Lower Bound

We use the prefix `ehkv` for Ehrenfeucht–Haussler–Kearns–Valiant throughout.

This module formalizes the main result of [EHKV1989]: a lower bound on the
number of examples required for distribution-free PAC learning of a concept
class, in terms of its Vapnik-Chervonenkis dimension.

**Theorem 1** [EHKV1989, Theorem 1]: Assume `0 < ε ≤ 1/8`,
`0 < δ < 1/14`, and `VCdim(C) ≥ 2`. Then any `(ε, δ)`-learning
algorithm for `C` must use sample size at least `(VCdim(C) - 1) / (32ε)`.
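For instance, with `VCdim(C) = 33` and `ε = 1/8`, the bound specializes to
`m ≥ 32 / (32 · (1/8)) = 8` examples.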
|
|
||
| The proof constructs an adversarial distribution `P` on `d + 1` points | ||
| (where `d = VCdim(C) - 1`). Via a Markov/Bernoulli bound (Lemma 3), | ||
| "bad" samples — those which do not reveal enough of the shattered | ||
| set — occur with probability `> 1/2` when the sample size is too | ||
| small. An involution pairing argument (Lemma 2) shows that for each | ||
| bad sample, at least half of the `2^d` concepts obtained from | ||
| shattering force large error, and a counting/contradiction argument | ||
| then produces a single concept whose failure probability exceeds `δ`. | ||
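The Bernoulli step uses the standard inequality `1 - m·x ≤ (1 - x)^m`. A hedged
sketch of one way to obtain it from Mathlib's `one_add_mul_le_pow` (the exact
form used in `Helpers` may differ):

```lean
example (x : ℝ) (hx : x ≤ 2) (m : ℕ) : 1 - m * x ≤ (1 - x) ^ m := by
  -- `one_add_mul_le_pow : -2 ≤ a → 1 + n * a ≤ (1 + a) ^ n`, with `a = -x`
  have h := one_add_mul_le_pow (a := -x) (by linarith) m
  simpa [sub_eq_add_neg] using h
```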
## Proof structure (submodules)

- `SampleComplexityLower.Helpers`: generic lemmas (Bernoulli inequality,
  product measure support, `seenElements`, integration bound)
- `SampleComplexityLower.AdversarialMeasure`: construction of the
  adversarial probability distribution on `d + 1` points
- `SampleComplexityLower.InvolutionPairing`: the involution/pairing
  argument and complementary-error contradiction
- `SampleComplexityLower.EHKVProof`: Markov bound on bad samples,
  half-fraction sum lower bound, and the assembled contradiction

## Main statements

- `sample_complexity_lower_bound_randomized`: **Theorem 1** of [EHKV1989] for
  randomized learners — the full strength of the result.
- `sample_complexity_lower_bound`: deterministic corollary via
  `IsPACLearner.toIsRPACLearner`.
- `sample_complexity_lower_bound_vcDim`: the bound stated in terms of `vcDim`.
- `sampleComplexity_lower_bound_vcDim`: lower bound on `sampleComplexity` via `vcDim`.
- `rsampleComplexity_lower_bound_vcDim`: lower bound on `rsampleComplexity` via `vcDim`.

## References

* [A. Ehrenfeucht, D. Haussler, M. Kearns, L. Valiant,
  *A General Lower Bound on the Number of Examples Needed
  for Learning*][EHKV1989]
-/
open MeasureTheory Set Finset
open scoped ENNReal

noncomputable section

namespace Cslib.MachineLearning
/-- **Theorem 1 (randomized)** [EHKV1989]: The sample-complexity lower bound
`(VCdim(C) - 1) / (32 * ε) ≤ m` holds for *randomized* `(ε, δ)`-PAC
learners. This is the full strength of the EHKV result. -/
theorem sample_complexity_lower_bound_randomized
    {α : Type*} [MeasurableSpace α] [MeasurableSingletonClass α]
    {C : ConceptClass α}
    {W : Finset α} (hW : SetShatters C (↑W))
    {m : ℕ} {ε δ : ℝ≥0∞}
    (hε_pos : 0 < ε) (hε_le : ε ≤ ENNReal.ofReal (1 / 8))
    (hδ_lt : δ < ENNReal.ofReal (1 / 14))
    (hW_card : 2 ≤ W.card)
    (hlearn : IsRPACLearner m ε δ C) :
    (W.card - 1 : ℝ) / (32 * ε.toReal) ≤ m := by
  by_contra h
  push Not at h
  have hε_ne_top : ε ≠ ⊤ := ne_top_of_le_ne_top ENNReal.ofReal_ne_top hε_le
  have hε'_pos : 0 < ε.toReal := ENNReal.toReal_pos (ne_of_gt hε_pos) hε_ne_top
  have h32ε_pos : (0 : ℝ) < 32 * ε.toReal := by positivity
  have hW_sub : (0 : ℝ) < (W.card : ℝ) - 1 := by
    have : (2 : ℝ) ≤ (W.card : ℝ) := by exact_mod_cast hW_card
    linarith
  have hm_ennreal : (↑m : ℝ≥0∞) < ENNReal.ofReal
      ((W.card - 1 : ℝ) / (32 * ε.toReal)) := by
    rw [← ENNReal.ofReal_natCast (n := m)]
    exact ENNReal.ofReal_lt_ofReal_iff (div_pos hW_sub h32ε_pos) |>.mpr h
  obtain ⟨Ω, mΩ, Q, hQ, A, hA⟩ := hlearn
  have hA_aem : ∀ (P : Measure α) [IsProbabilityMeasure P], ∀ c ∈ C,
      AEMeasurable (fun ω => (Measure.pi (fun _ : Fin m => P))
        {xs : Fin m → α | hypothesisError P ((A ω) (sampleOf c xs)) c > ε}) Q :=
    fun P _ c hc => (hA P c hc).1
  obtain ⟨P, hP, c, hc, hbad⟩ :=
    exists_bad_distribution_and_concept_randomized hW hW_card
      hε_pos hε_le hδ_lt hm_ennreal Q A hA_aem
  haveI := hP
  exact absurd ((hA P c hc).2) (not_le_of_gt hbad)
/-- **Theorem 1** [EHKV1989]: Assume `0 < ε ≤ 1/8`, `0 < δ < 1/14`,
and `VCdim(C) ≥ 2`. Then any deterministic `(ε, δ)`-learning algorithm
for `C` must use sample size `m` satisfying `(VCdim(C) - 1) / (32 * ε) ≤ m`.

This is a corollary of the stronger `sample_complexity_lower_bound_randomized`
via `IsPACLearner.toIsRPACLearner`. -/
theorem sample_complexity_lower_bound
    {α : Type*} [MeasurableSpace α] [MeasurableSingletonClass α]
    {C : ConceptClass α}
    {W : Finset α} (hW : SetShatters C (↑W))
    {m : ℕ} {ε δ : ℝ≥0∞}
    (hε_pos : 0 < ε) (hε_le : ε ≤ ENNReal.ofReal (1 / 8))
    (hδ_lt : δ < ENNReal.ofReal (1 / 14))
    (hW_card : 2 ≤ W.card)
    (hlearn : IsPACLearner m ε δ C) :
    (W.card - 1 : ℝ) / (32 * ε.toReal) ≤ m :=
  sample_complexity_lower_bound_randomized hW hε_pos hε_le hδ_lt hW_card
    (IsPACLearner.toIsRPACLearner.{_, 0} hlearn)
/-- **Corollary**: The EHKV sample-complexity lower bound stated in terms of `vcDim`.

If the VC dimension of `C` is at least `2` (and is finite, i.e., the defining set is bounded
above), then any randomized `(ε, δ)`-PAC learner for `C` must use at least
`(vcDim C - 1) / (32ε)` samples.

This wraps `sample_complexity_lower_bound_randomized` by extracting a shattered witness from
`vcDim`. -/
theorem sample_complexity_lower_bound_vcDim
    {α : Type*} [MeasurableSpace α] [MeasurableSingletonClass α]
    {C : ConceptClass α}
    {m : ℕ} {ε δ : ℝ≥0∞}
    (hε_pos : 0 < ε) (hε_le : ε ≤ ENNReal.ofReal (1 / 8))
    (hδ_lt : δ < ENNReal.ofReal (1 / 14))
    (hvc : 2 ≤ vcDim C)
    (hbdd : BddAbove {n : ℕ | ∃ W : Finset α, W.card = n ∧ SetShatters C (↑W)})
    (hlearn : IsRPACLearner m ε δ C) :
    (vcDim C - 1 : ℝ) / (32 * ε.toReal) ≤ m := by
  set S := {n : ℕ | ∃ W : Finset α, W.card = n ∧ SetShatters C (↑W)}
  have hne : S.Nonempty := by
    by_contra hempty
    rw [Set.not_nonempty_iff_eq_empty] at hempty
    have : (2 : ℕ) ≤ sSup (∅ : Set ℕ) := hempty ▸ hvc
    simp at this
  obtain ⟨W, hWcard, hW⟩ := Nat.sSup_mem hne hbdd
  have hW_card : 2 ≤ W.card := hWcard ▸ hvc
  have hvc_eq : vcDim C = W.card := hWcard.symm
  simp only [hvc_eq]
  exact sample_complexity_lower_bound_randomized hW hε_pos hε_le hδ_lt hW_card hlearn
/-- Lower bound on deterministic sample complexity in terms of `vcDim`.

If the VC dimension is at least `2` and finite, then
`(vcDim C - 1) / (32ε) ≤ sampleComplexity C ε δ`,
provided the concept class is learnable (some deterministic PAC learner exists). -/
theorem sampleComplexity_lower_bound_vcDim
    {α : Type*} [MeasurableSpace α] [MeasurableSingletonClass α]
    {C : ConceptClass α}
    {ε δ : ℝ≥0∞}
    (hε_pos : 0 < ε) (hε_le : ε ≤ ENNReal.ofReal (1 / 8))
    (hδ_lt : δ < ENNReal.ofReal (1 / 14))
    (hvc : 2 ≤ vcDim C)
    (hbdd : BddAbove {n : ℕ | ∃ W : Finset α, W.card = n ∧ SetShatters C (↑W)})
    (hlearnable : {m : ℕ | IsPACLearner m ε δ C}.Nonempty) :
    (vcDim C - 1 : ℝ) / (32 * ε.toReal) ≤ sampleComplexity C ε δ := by
  have hmem := Nat.sInf_mem hlearnable
  exact sample_complexity_lower_bound_vcDim hε_pos hε_le hδ_lt hvc hbdd
    (IsPACLearner.toIsRPACLearner.{_, 0} hmem)
/-- Lower bound on randomized sample complexity in terms of `vcDim`.

If the VC dimension is at least `2` and finite, then
`(vcDim C - 1) / (32ε) ≤ rsampleComplexity C ε δ`,
provided the concept class is learnable by a randomized learner. -/
theorem rsampleComplexity_lower_bound_vcDim
    {α : Type*} [MeasurableSpace α] [MeasurableSingletonClass α]
    {C : ConceptClass α}
    {ε δ : ℝ≥0∞}
    (hε_pos : 0 < ε) (hε_le : ε ≤ ENNReal.ofReal (1 / 8))
    (hδ_lt : δ < ENNReal.ofReal (1 / 14))
    (hvc : 2 ≤ vcDim C)
    (hbdd : BddAbove {n : ℕ | ∃ W : Finset α, W.card = n ∧ SetShatters C (↑W)})
    (hlearnable : {m : ℕ | IsRPACLearner.{_, 0} m ε δ C}.Nonempty) :
    (vcDim C - 1 : ℝ) / (32 * ε.toReal) ≤ rsampleComplexity C ε δ := by
  have hmem := Nat.sInf_mem hlearnable
  exact sample_complexity_lower_bound_vcDim hε_pos hε_le hδ_lt hvc hbdd hmem
end Cslib.MachineLearning
> Textbook citations are better. Perhaps the Kearns and Vazirani book?

> I think, given the entire formalization is around these papers, that citing these papers is highly appropriate.

> How about we add both?

> That's a better idea. Do recall that, as this is a library, we want to use relatively standard definitions and give people a textbook they can consult for them. For example, mathlib probability theory cites Kallenberg's textbook.