diff --git a/Cslib.lean b/Cslib.lean index 43d44b938..e12e99c03 100644 --- a/Cslib.lean +++ b/Cslib.lean @@ -121,3 +121,4 @@ public import Cslib.Logics.LinearLogic.CLL.EtaExpansion public import Cslib.Logics.LinearLogic.CLL.PhaseSemantics.Basic public import Cslib.Logics.Propositional.Defs public import Cslib.MachineLearning.PACLearning.Defs +public import Cslib.MachineLearning.PACLearning.VersionSpace diff --git a/Cslib/MachineLearning/PACLearning/VersionSpace.lean b/Cslib/MachineLearning/PACLearning/VersionSpace.lean new file mode 100644 index 000000000..80ce42209 --- /dev/null +++ b/Cslib/MachineLearning/PACLearning/VersionSpace.lean @@ -0,0 +1,158 @@ +/- +Copyright (c) 2026 Dhruv Gupta. All rights reserved. +Released under Apache 2.0 license as described in the file LICENSE. +Authors: Dhruv Gupta +-/ + +module + +public import Cslib.MachineLearning.PACLearning.Defs + +@[expose] public section + +/-! # Version Space + +The *version space* of a concept class `C` given a labeled sample `S` is the set +of concepts in `C` whose labels agree with `S` on every observed point. It is the +classical abstraction of "concepts still consistent with the data," introduced by +Mitchell (1982) and systematised by Angluin (1980) and Mitchell (1997). + +Version space is the structural companion to `sampleComplexity`: where sample +complexity monotonicity expresses "more samples make PAC easier," version space +antitonicity expresses "more data shrinks the consistent concept set." Together +they describe the two monotonicities of the passive learning setup. + +## Main definitions + +- `VersionSpace C S`: the subset of `C` whose concepts agree with `S` on every + sample point. +- `IsConsistent A C`: a learner is consistent with `C` if its output always lies + in the version space at the received sample. + +## Main results + +- `versionSpace_subset`: the version space is a subset of the original class. +- `versionSpace_empty_sample`: with no data, the version space is the whole + concept class. +- `versionSpace_antitone`: *structural monotonicity* — more samples give a + smaller version space. +- `IsConsistent.output_mem_conceptClass`: consistent learners output concepts + in the class. +- `mem_versionSpace_of_realizable`: under a realizable sample (labeled by some + `c ∈ C`), the version space is nonempty. + +## Cross-link to sample complexity + +Under a realizable distribution on `α × β` (where labels are deterministically +given by some target `c ∈ C`), a consistent learner's output lies in +`VersionSpace C S` for every sample by definition. By `versionSpace_antitone`, +this version space shrinks monotonically as the sample size grows. Combined with +finiteness of `C` (not assumed here — deferred to future work via Occam / +Chernoff characterisations), this yields realizable PAC learnability via the +classical consistent-learning-implies-PAC theorem. + +## References + +* [T. Mitchell, *Generalization as Search*, Artificial Intelligence 18(2)][Mitchell1982] +* [D. Angluin, *Inductive Inference of Formal Languages from Positive Data*, + Information and Control 45(2)][Angluin1980] +* [T. Mitchell, *Machine Learning*, McGraw-Hill][Mitchell1997] +-/ + +open Set + +namespace Cslib.MachineLearning.PACLearning + +variable {α : Type*} {β : Type*} + +/-! ### Version Space -/ + +/-- The *version space* of a concept class `C` given a labeled sample `S`: +the set of concepts in `C` whose labels agree with `S` on every observed point. + +Equivalently, `VersionSpace C S = C ∩ {h | ∀ i, h (S i).1 = (S i).2}`. -/ +def VersionSpace {m : ℕ} (C : ConceptClass α β) (S : LabeledSample α β m) : + ConceptClass α β := + {h ∈ C | ∀ i : Fin m, h (S i).1 = (S i).2} + +/-- Membership in the version space unfolds to concept membership plus +per-sample consistency. -/ +theorem mem_versionSpace_iff {m : ℕ} {C : ConceptClass α β} + {S : LabeledSample α β m} {h : α → β} : + h ∈ VersionSpace C S ↔ h ∈ C ∧ ∀ i : Fin m, h (S i).1 = (S i).2 := Iff.rfl + +/-- The version space is a subset of the original concept class. -/ +theorem versionSpace_subset {m : ℕ} (C : ConceptClass α β) + (S : LabeledSample α β m) : + VersionSpace C S ⊆ C := fun _ hh => hh.1 + +/-- Version space on the empty sample equals the whole concept class: no data, +no constraint on hypotheses. -/ +theorem versionSpace_empty_sample (C : ConceptClass α β) + (S : LabeledSample α β 0) : + VersionSpace C S = C := by + ext h + refine ⟨fun hh => hh.1, fun hh => ⟨hh, fun i => i.elim0⟩⟩ + +/-- *Version space antitonicity.* Given a sample of size `n` and `m ≤ n`, the +version space on all `n` observations is a subset of the version space on the +first `m` observations. More data never enlarges the version space. + +This is the structural dual to sample complexity monotonicity: where sample +complexity says more samples make PAC easier to achieve, version space +antitonicity says more samples make the consistent concept set smaller. +Together these describe the two monotonicities of the passive learning setup. -/ +theorem versionSpace_antitone {m n : ℕ} (hmn : m ≤ n) (C : ConceptClass α β) + (S : LabeledSample α β n) : + VersionSpace C S ⊆ VersionSpace C (fun i => S (Fin.castLE hmn i)) := by + intro h hh + exact ⟨hh.1, fun i => hh.2 (Fin.castLE hmn i)⟩ + +/-! ### Consistent Learners -/ + +/-- A learner is *consistent* with the concept class `C` if, on every labeled +sample it receives, its output hypothesis lies in the version space of `C` at +that sample. Equivalently, the learner's output always agrees with every +observed labeled pair, provided a consistent hypothesis exists in `C`. + +This is the classical consistency condition, central to Angluin's inductive +inference framework and Mitchell's candidate-elimination algorithm. -/ +def IsConsistent {m : ℕ} (A : Learner α β m) (C : ConceptClass α β) : Prop := + ∀ S : LabeledSample α β m, A S ∈ VersionSpace C S + +/-- A consistent learner's output is always in the concept class. -/ +theorem IsConsistent.output_mem_conceptClass {m : ℕ} {A : Learner α β m} + {C : ConceptClass α β} (hA : IsConsistent A C) (S : LabeledSample α β m) : + A S ∈ C := (hA S).1 + +/-- A consistent learner's output agrees with the sample on every observed +point. -/ +theorem IsConsistent.output_consistent {m : ℕ} {A : Learner α β m} + {C : ConceptClass α β} (hA : IsConsistent A C) (S : LabeledSample α β m) + (i : Fin m) : + A S (S i).1 = (S i).2 := (hA S).2 i + +/-! ### Realizable case -/ + +/-- *Realizable version-space nonemptiness.* If a target concept `c` lies in +`C` and the sample `S` is labeled by `c` (i.e. every `(S i).2 = c (S i).1`), +then `c` itself lies in the version space `VersionSpace C S`, so the version +space is nonempty. + +This is the realizable-case correctness condition: when data comes from a +concept in the class, a consistent learner always has at least one legal +choice. -/ +theorem mem_versionSpace_of_realizable {m : ℕ} {C : ConceptClass α β} + (c : α → β) (hc : c ∈ C) (S : LabeledSample α β m) + (hS : ∀ i : Fin m, (S i).2 = c (S i).1) : + c ∈ VersionSpace C S := + ⟨hc, fun i => (hS i).symm⟩ + +/-- Corollary: under realizable data, the version space is nonempty. -/ +theorem versionSpace_nonempty_of_realizable {m : ℕ} {C : ConceptClass α β} + {c : α → β} (hc : c ∈ C) (S : LabeledSample α β m) + (hS : ∀ i : Fin m, (S i).2 = c (S i).1) : + (VersionSpace C S).Nonempty := + ⟨c, mem_versionSpace_of_realizable c hc S hS⟩ + +end Cslib.MachineLearning.PACLearning diff --git a/references.bib b/references.bib index 3e8c26190..ae466156b 100644 --- a/references.bib +++ b/references.bib @@ -337,3 +337,33 @@ @incollection{WinskelNielsen1995 url = {https://doi.org/10.1093/oso/9780198537809.003.0001}, eprint = {https://academic.oup.com/book/0/chapter/421962123/chapter-pdf/52352653/isbn-9780198537809-book-part-1.pdf}, } + +@article{Mitchell1982, + author = {Mitchell, Tom M.}, + title = {Generalization as Search}, + journal = {Artificial Intelligence}, + volume = {18}, + number = {2}, + pages = {203--226}, + year = {1982}, + doi = {10.1016/0004-3702(82)90040-6} +} + +@article{Angluin1980, + author = {Angluin, Dana}, + title = {Inductive Inference of Formal Languages from Positive Data}, + journal = {Information and Control}, + volume = {45}, + number = {2}, + pages = {117--135}, + year = {1980}, + doi = {10.1016/S0019-9958(80)90285-5} +} + +@book{Mitchell1997, + author = {Mitchell, Tom M.}, + title = {Machine Learning}, + year = {1997}, + publisher = {McGraw-Hill}, + isbn = {0070428077} +}