From 0f813bc7d7130c6c40e74614680a4f14b52a5a3f Mon Sep 17 00:00:00 2001
From: bocklund <brandonbocklund@gmail.com>
Date: Sun, 17 Aug 2025 12:40:17 -0700
Subject: [PATCH 01/14] Implement pydantic models for datasets

Currently we just pass through the models and dump them out to dicts to put in the PickleableTinyDB
---
 espei/datasets/__init__.py            |  2 +
 espei/datasets/dataset_models.py      | 97 +++++++++++++++++++++++++++
 espei/{datasets.py => datasets/db.py} | 28 +++++---
 3 files changed, 119 insertions(+), 8 deletions(-)
 create mode 100644 espei/datasets/__init__.py
 create mode 100644 espei/datasets/dataset_models.py
 rename espei/{datasets.py => datasets/db.py} (94%)

diff --git a/espei/datasets/__init__.py b/espei/datasets/__init__.py
new file mode 100644
index 00000000..18dc966a
--- /dev/null
+++ b/espei/datasets/__init__.py
@@ -0,0 +1,2 @@
+from .dataset_models import *
+from .db import *
\ No newline at end of file
diff --git a/espei/datasets/dataset_models.py b/espei/datasets/dataset_models.py
new file mode 100644
index 00000000..2f17a921
--- /dev/null
+++ b/espei/datasets/dataset_models.py
@@ -0,0 +1,97 @@
+from typing import Literal, Optional, Union, TypeAlias
+from pydantic import BaseModel, Field
+
+__all__ = [
+    "Dataset",
+    "BroadcastSinglePhaseFixedConfigurationDataset",
+    "ActivityPropertyDataset",
+    "EquilibriumPropertyDataset",
+    "ZPFDataset",
+]
+
+ComponentName: TypeAlias = str
+PhaseName: TypeAlias = str
+PhaseCompositionType: TypeAlias = Union[
+    tuple[PhaseName, list[ComponentName], list[float | None]],       # The usual definition ["LIQUID", ["B"], [0.5]]
+    tuple[PhaseName, list[ComponentName], list[float | None], bool]  # Handle the disordered flag
+]
+PhaseRegionType: TypeAlias = list[PhaseCompositionType]
+
+class Dataset(BaseModel):
+    pass
+
+class Solver(BaseModel):
+    mode: Literal["manual"] = Field(default="manual")
+    sublattice_site_ratios: list[float]
+    # TODO: migrate to list[list[list[float]]]
+    sublattice_configurations: list[list[ComponentName | list[ComponentName]]]
+    sublattice_occupancies: list[list[float | list[float]]] # TODO: optional and validate against configurations
+
+class BroadcastSinglePhaseFixedConfigurationDataset(Dataset):
+    components: list[ComponentName] = Field(min_length=1)
+    phases: list[PhaseName] = Field(min_length=1, max_length=1)
+    solver: Solver
+    conditions: dict[str, float | list[float]]
+    output: str
+    values: list[list[list[float]]]
+    excluded_model_contributions: list[str] = Field(default_factory=list)
+    reference: str = Field(default="")
+    bibtex: str = Field(default="")
+    dataset_author: str = Field(default="")
+    comment: str = Field(default="")
+    disabled: bool = Field(default=False)
+
+
+# TODO: would be great to remove
+class ActivityDataReferenceState(BaseModel):
+    phases: list[PhaseName] = Field(min_length=1)
+    conditions: dict[str, float]
+
+
+# TODO: refactor to merge this with EquilibriumPropertyDataset
+class ActivityPropertyDataset(Dataset):
+    components: list[ComponentName] = Field(min_length=1)
+    phases: list[PhaseName] = Field(min_length=1, max_length=1)
+    conditions: dict[str, float | list[float]]
+    reference_state: ActivityDataReferenceState
+    output: str
+    values: list[list[list[float]]]
+    reference: str = Field(default="")
+    bibtex: str = Field(default="")
+    dataset_author: str = Field(default="")
+    comment: str = Field(default="")
+    disabled: bool = Field(default=False)
+
+
+class ReferenceStates(BaseModel):
+    phase: PhaseName
+    fixed_state_variables: dict[str, float] | None = Field(default=None, description="Fixed potentials for the reference state", examples=[{"T": 298.15, "P": 101325}])
+
+
+class EquilibriumPropertyDataset(Dataset):
+    components: list[ComponentName] = Field(min_length=1)
+    phases: list[PhaseName] = Field(min_length=1, max_length=1)
+    conditions: dict[str, float | list[float]]
+    reference_states: dict[ComponentName, ReferenceStates]
+    output: str
+    values: list[list[list[float]]]
+    reference: str = Field(default="")
+    bibtex: str = Field(default="")
+    dataset_author: str = Field(default="")
+    comment: str = Field(default="")
+    disabled: bool = Field(default=False)
+
+
+class ZPFDataset(Dataset):
+    components: list[ComponentName] = Field(min_length=1)
+    phases: list[str] = Field(min_length=1)
+    conditions: dict[str, float | list[float]]
+    broadcast_conditions: Literal[False] = Field(default=False)  # TODO: migrate and remove, since True was never supported
+    output: Literal["ZPF"]
+    values: list[PhaseRegionType]  # TODO: validate to be of same shape as conditions
+    excluded_model_contributions: list[str] = Field(default_factory=list)
+    reference: str = Field(default="")
+    bibtex: str = Field(default="")
+    dataset_author: str = Field(default="")
+    comment: str = Field(default="")
+    disabled: bool = Field(default=False)
diff --git a/espei/datasets.py b/espei/datasets/db.py
similarity index 94%
rename from espei/datasets.py
rename to espei/datasets/db.py
index ffa27f98..27ea23ad 100644
--- a/espei/datasets.py
+++ b/espei/datasets/db.py
@@ -1,5 +1,5 @@
 import fnmatch, json, os
-from typing import Any, Dict, List
+from typing import Any, Dict, List, TypeAlias
 
 import numpy as np
 from tinydb.storages import MemoryStorage
@@ -7,8 +7,8 @@
 
 from espei.utils import PickleableTinyDB
 
-# Create a type
-Dataset = Dict[str, Any]
+from .dataset_models import Dataset, ActivityPropertyDataset, BroadcastSinglePhaseFixedConfigurationDataset, EquilibriumPropertyDataset, ZPFDataset
+
 
 class DatasetError(Exception):
     """Exception raised when datasets are invalid."""
@@ -42,7 +42,7 @@ def recursive_map(f, x):
         return f(x)
 
 
-def check_dataset(dataset: Dataset):
+def check_dataset(dataset: dict[str, Any]) -> Dataset:
     """Ensure that the dataset is valid and consistent.
 
     Currently supports the following validation checks:
@@ -64,7 +64,7 @@ def check_dataset(dataset: Dataset):
 
     Returns
     -------
-    None
+    Dataset
 
     Raises
     ------
@@ -206,8 +206,20 @@ def check_dataset(dataset: Dataset):
                     if isinstance(subl, (list, tuple)) and sorted(subl) != subl:
                         raise DatasetError('Sublattice {} in configuration {} is must be sorted in alphabetic order ({})'.format(subl, configuration, sorted(subl)))
 
+    if is_zpf:
+        dataset_obj = ZPFDataset(**clean_dataset(dataset))
+    elif is_activity:
+        dataset_obj = ActivityPropertyDataset(**clean_dataset(dataset))
+    elif is_equilibrium:
+        dataset_obj = EquilibriumPropertyDataset(**clean_dataset(dataset))
+    elif is_single_phase:
+        dataset_obj = BroadcastSinglePhaseFixedConfigurationDataset(**clean_dataset(dataset))
+    else:
+        raise ValueError(f"Unknown dataset type for dataset {dataset}")
+    return dataset_obj
+
 
-def clean_dataset(dataset: Dataset) -> Dataset:
+def clean_dataset(dataset: dict[str, Any]) -> dict[str, Any]:
     """
     Clean an ESPEI dataset dictionary.
 
@@ -333,8 +345,8 @@ def load_datasets(dataset_filenames, include_disabled=False) -> PickleableTinyDB
                 if not include_disabled and d.get('disabled', False):
                     # The dataset is disabled and not included
                     continue
-                check_dataset(d)
-                ds_database.insert(clean_dataset(d))
+                dataset_obj = check_dataset(d)
+                ds_database.insert(dataset_obj.model_dump())
             except ValueError as e:
                 raise ValueError('JSON Error in {}: {}'.format(fname, e))
             except DatasetError as e:

From b48213cef9a5317065bc40e39db5dc215ba7d9f3 Mon Sep 17 00:00:00 2001
From: bocklund <brandonbocklund@gmail.com>
Date: Sun, 17 Aug 2025 13:46:49 -0700
Subject: [PATCH 02/14] Migrate ZPF-specific check_dataset functions to
 ZPFDataset validator

---
 espei/datasets/dataset_models.py | 72 +++++++++++++++++++++++++++++-
 espei/datasets/db.py             | 75 ++------------------------------
 tests/test_datasets.py           | 26 ++++++++++-
 3 files changed, 98 insertions(+), 75 deletions(-)

diff --git a/espei/datasets/dataset_models.py b/espei/datasets/dataset_models.py
index 2f17a921..fae2d1f0 100644
--- a/espei/datasets/dataset_models.py
+++ b/espei/datasets/dataset_models.py
@@ -1,5 +1,6 @@
-from typing import Literal, Optional, Union, TypeAlias
-from pydantic import BaseModel, Field
+from typing import Literal, Optional, Union, TypeAlias, Self
+from pydantic import BaseModel, Field, model_validator, field_validator
+import numpy as np
 
 __all__ = [
     "Dataset",
@@ -9,6 +10,10 @@
     "ZPFDataset",
 ]
 
+class DatasetError(Exception):
+    """Exception raised when datasets are invalid."""
+    pass
+
 ComponentName: TypeAlias = str
 PhaseName: TypeAlias = str
 PhaseCompositionType: TypeAlias = Union[
@@ -95,3 +100,66 @@ class ZPFDataset(Dataset):
     dataset_author: str = Field(default="")
     comment: str = Field(default="")
     disabled: bool = Field(default=False)
+
+    @model_validator(mode="after")
+    def validate_condition_value_shape_agreement(self) -> Self:
+        values_shape = (len(self.values),)
+        num_temperature = np.atleast_1d(self.conditions["T"]).size
+        num_pressure = np.atleast_1d(self.conditions["P"]).size
+        if num_pressure != 1:
+            raise DatasetError("Non-scalar pressures are not currently supported")
+        conditions_shape = (num_temperature,)
+        if conditions_shape != values_shape:
+            raise DatasetError("Shape of conditions (T): {} does not match the shape of the values {}.".format(conditions_shape, values_shape))
+        return self
+
+    @model_validator(mode="after")
+    def validate_phases_entered_match_phases_used(self) -> Self:
+        phases_entered = set(self.phases)
+        phases_used = set()
+        for phase_region in self.values:
+            for phase_composition in phase_region:
+                phases_used.add(phase_composition[0])
+        if len(phases_entered - phases_used) > 0:
+            raise DatasetError("Phases entered {} do not match phases used {}.".format(phases_entered, phases_used))
+        return self
+
+    @model_validator(mode="after")
+    def validate_components_entered_match_components_used(self) -> Self:  # raises DatasetError when a phase composition disagrees with `components`
+        components_entered = set(self.components)
+        for i, phase_region in enumerate(self.values):
+            for j, phase_compositions in enumerate(phase_region):
+                phase_composition_components = set(phase_compositions[1])
+                if not components_entered.issuperset(phase_composition_components):  # every component listed in a phase composition must be declared
+                    raise DatasetError("Components were used in phase region {} ({}) for phase composition {} ({}) that are not specified as components in the dataset ({})".format(i, phase_region, j, phase_compositions, components_entered))
+                independent_components = components_entered - phase_composition_components - {'VA'}  # vacancies ('VA') are not counted
+                if len(independent_components) != 1:  # exactly one declared component must be left out of each phase composition
+                    raise DatasetError('Degree of freedom error: expected 1 independent component, got {} for entered components {} and phase composition components {} in phase region {} ({}) for phase composition {} ({})'.format(len(independent_components), components_entered, phase_composition_components, i, phase_region, j, phase_compositions))
+        return self
+
+    @field_validator("values", mode="after")
+    @classmethod
+    def validate_phase_compositions(cls, values: list[PhaseRegionType]) -> list[PhaseRegionType]:
+        for i, phase_region in enumerate(values):
+            for j, phase_composition in enumerate(phase_region):
+                phase = phase_composition[0]
+                component_list = phase_composition[1]
+                mole_fraction_list = phase_composition[2]
+                # check that the phase is a string, components a list of strings,
+                #  and the fractions are a list of float
+                if not isinstance(phase, str):
+                    raise DatasetError('The first element in phase composition {} ({}) for phase region {} ({}) should be a string. Instead it is a {} of value {}'.format(j, phase_composition, i, phase_region, type(phase), phase))
+                if not all([isinstance(comp, str) for comp in component_list]):
+                    raise DatasetError('The second element in phase composition {} ({}) for phase region {} ({}) should be a list of strings. Instead it is a {} of value {}'.format(j, phase_composition, i, phase_region, type(component_list), component_list))
+                if not all([(isinstance(mole_frac, (int, float)) or mole_frac is None)  for mole_frac in mole_fraction_list]):
+                    raise DatasetError('The last element in phase composition {} ({}) for phase region {} ({}) should be a list of numbers. Instead it is a {} of value {}'.format(j, phase_composition, i, phase_region, type(mole_fraction_list), mole_fraction_list))
+                # check that the shape of components list and mole fractions list is the same
+                if len(component_list) != len(mole_fraction_list):
+                    raise DatasetError('The length of the components list and mole fractions list in phase composition {} ({}) for phase region {} ({}) should be the same.'.format(j, phase_composition, i, phase_region))
+                # check that the given mole fractions sum to at most one and that none are negative
+                mf_sum = np.nansum(np.array(mole_fraction_list, dtype=np.float64))
+                if any([mf is not None for mf in mole_fraction_list]) and mf_sum > 1.0:
+                    raise DatasetError('Mole fractions for phase composition {} ({}) for phase region {} ({}) sum to greater than one.'.format(j, phase_composition, i, phase_region))
+                if any([(mf is not None) and (mf < 0.0) for mf in mole_fraction_list]):
+                    raise DatasetError('Got unallowed negative mole fraction for phase composition {} ({}) for phase region {} ({}).'.format(j, phase_composition, i, phase_region))
+        return values
\ No newline at end of file
diff --git a/espei/datasets/db.py b/espei/datasets/db.py
index 27ea23ad..40a9b902 100644
--- a/espei/datasets/db.py
+++ b/espei/datasets/db.py
@@ -7,13 +7,9 @@
 
 from espei.utils import PickleableTinyDB
 
-from .dataset_models import Dataset, ActivityPropertyDataset, BroadcastSinglePhaseFixedConfigurationDataset, EquilibriumPropertyDataset, ZPFDataset
+from .dataset_models import Dataset, ActivityPropertyDataset, BroadcastSinglePhaseFixedConfigurationDataset, EquilibriumPropertyDataset, ZPFDataset, DatasetError
 
 
-class DatasetError(Exception):
-    """Exception raised when datasets are invalid."""
-    pass
-
 
 def recursive_map(f, x):
     """
@@ -73,10 +69,7 @@ def check_dataset(dataset: dict[str, Any]) -> Dataset:
     """
     is_equilibrium = 'solver' not in dataset.keys() and dataset['output'] != 'ZPF'
     is_activity = dataset['output'].startswith('ACR')
-    is_zpf = dataset['output'] == 'ZPF'
     is_single_phase = 'solver' in dataset.keys()
-    if not any((is_equilibrium, is_single_phase, is_zpf)):
-        raise DatasetError("Cannot determine type of dataset")
     components = dataset['components']
     conditions = dataset['conditions']
     values = dataset['values']
@@ -121,21 +114,6 @@ def check_dataset(dataset: dict[str, Any]) -> Dataset:
         conditions_shape = (num_pressure, num_temperature, num_configs)
         if conditions_shape != values_shape:
             raise DatasetError('Shape of conditions (P, T, configs): {} does not match the shape of the values {}.'.format(conditions_shape, values_shape))
-    elif is_zpf:
-        values_shape = (len(values))
-        conditions_shape = (num_temperature)
-        if conditions_shape != values_shape:
-            raise DatasetError('Shape of conditions (T): {} does not match the shape of the values {}.'.format(conditions_shape, values_shape))
-
-    # check that all of the correct phases are present
-    if is_zpf:
-        phases_entered = set(phases)
-        phases_used = set()
-        for zpf in values:
-            for tieline in zpf:
-                phases_used.add(tieline[0])
-        if len(phases_entered - phases_used) > 0:
-            raise DatasetError('Phases entered {} do not match phases used {}.'.format(phases_entered, phases_used))
 
     # check that all of the components used match the components entered
     components_entered = set(components)
@@ -152,41 +130,9 @@ def check_dataset(dataset: dict[str, Any]) -> Dataset:
         components_used.update({c.split('_')[1] for c in comp_conditions.keys()})
         # mass balance of components
         comp_dof = len(comp_conditions.keys())
-    elif is_zpf:
-        for zpf in values:
-            for tieline in zpf:
-                tieline_comps = set(tieline[1])
-                components_used.update(tieline_comps)
-                if len(components_entered - tieline_comps - {'VA'}) != 1:
-                    raise DatasetError('Degree of freedom error for entered components {} in tieline {} of ZPF {}'.format(components_entered, tieline, zpf))
-        # handle special case of mass balance in ZPFs
-        comp_dof = 1
-    if len(components_entered - components_used - {'VA'}) > comp_dof or len(components_used - components_entered) > 0:
+    if (is_single_phase or is_activity or is_equilibrium) and (len(components_entered - components_used - {'VA'}) > comp_dof or len(components_used - components_entered) > 0):
         raise DatasetError('Components entered {} do not match components used {}.'.format(components_entered, components_used))
 
-    # check that the ZPF values are formatted properly
-    if is_zpf:
-        for zpf in values:
-            for tieline in zpf:
-                phase = tieline[0]
-                component_list = tieline[1]
-                mole_fraction_list = tieline[2]
-                # check that the phase is a string, components a list of strings,
-                #  and the fractions are a list of float
-                if not isinstance(phase, str):
-                    raise DatasetError('The first element in the tieline {} for the ZPF point {} should be a string. Instead it is a {} of value {}'.format(tieline, zpf, type(phase), phase))
-                if not all([isinstance(comp, str) for comp in component_list]):
-                    raise DatasetError('The second element in the tieline {} for the ZPF point {} should be a list of strings. Instead it is a {} of value {}'.format(tieline, zpf, type(component_list), component_list))
-                if not all([(isinstance(mole_frac, (int, float)) or mole_frac is None)  for mole_frac in mole_fraction_list]):
-                    raise DatasetError('The last element in the tieline {} for the ZPF point {} should be a list of numbers. Instead it is a {} of value {}'.format(tieline, zpf, type(mole_fraction_list), mole_fraction_list))
-                # check that the shape of components list and mole fractions list is the same
-                if len(component_list) != len(mole_fraction_list):
-                    raise DatasetError('The length of the components list and mole fractions list in tieline {} for the ZPF point {} should be the same.'.format(tieline, zpf))
-                # check that all mole fractions are less than one
-                mf_sum = np.nansum(np.array(mole_fraction_list, dtype=np.float64))
-                if any([mf is not None for mf in mole_fraction_list]) and mf_sum > 1.0:
-                    raise DatasetError('Mole fractions for tieline {} for the ZPF point {} sum to greater than one.'.format(tieline, zpf))
-
     # check that the site ratios are valid as well as site occupancies, if applicable
     if is_single_phase:
         nconfigs = len(sublattice_configurations)
@@ -206,7 +152,7 @@ def check_dataset(dataset: dict[str, Any]) -> Dataset:
                     if isinstance(subl, (list, tuple)) and sorted(subl) != subl:
                         raise DatasetError('Sublattice {} in configuration {} is must be sorted in alphabetic order ({})'.format(subl, configuration, sorted(subl)))
 
-    if is_zpf:
+    if dataset["output"] == "ZPF":
         dataset_obj = ZPFDataset(**clean_dataset(dataset))
     elif is_activity:
         dataset_obj = ActivityPropertyDataset(**clean_dataset(dataset))
@@ -248,20 +194,7 @@ def clean_dataset(dataset: dict[str, Any]) -> dict[str, Any]:
         if occupancies is not None:
             solver["sublattice_occupancies"] = recursive_map(float, occupancies)
 
-    if dataset["output"] == "ZPF":
-        values = dataset["values"]
-        new_values = []
-        for tieline in values:
-            new_tieline = []
-            for tieline_point in tieline:
-                if all([comp is None for comp in tieline_point[2]]):
-                    # this is a null tieline point
-                    new_tieline.append(tieline_point)
-                else:
-                    new_tieline.append([tieline_point[0], tieline_point[1], recursive_map(float, tieline_point[2])])
-            new_values.append(new_tieline)
-        dataset["values"] = new_values
-    else:
+    if dataset["output"] != "ZPF":
         # values should be all numerical
         dataset["values"] = recursive_map(float, dataset["values"])
 
diff --git a/tests/test_datasets.py b/tests/test_datasets.py
index 65d914c2..2734a517 100644
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -4,6 +4,7 @@
 
 from .testing_data import CU_MG_EXP_ACTIVITY, CU_MG_DATASET_THERMOCHEMICAL_STRING_VALUES, CU_MG_DATASET_ZPF_STRING_VALUES, LI_SN_LIQUID_DATA, dataset_multi_valid_ternary
 from .fixtures import datasets_db
+from pydantic import ValidationError
 
 dataset_single_valid = {
     "components": ["AL", "NI", "VA"],
@@ -294,6 +295,21 @@
     ],
 }
 
+dataset_zpf_negative_mole_fraction = {
+    "components": ["AL", "NI", "VA"],
+    "phases": ["AL3NI2", "BCC_B2"],
+    "conditions": {
+        "P": 101325,
+        "T": [1348]
+    },
+    "output": "ZPF",
+    "values": [
+        [["AL3NI2", ["NI"], [-0.5]], ["BCC_B2", ["NI"], [None]]], # mole fraction is negative
+    ],
+}
+
+
+
 dataset_single_unsorted_interaction = {
     "components": ["AL", "NI", "VA"],
     "phases": ["BCC_B2"],
@@ -382,7 +398,7 @@ def test_check_datasets_raises_with_incorrect_components():
 
 def test_check_datasets_raises_with_malformed_zpf():
     """Passed datasets that have malformed ZPF values should raise."""
-    with pytest.raises(DatasetError):
+    with pytest.raises((DatasetError, ValidationError)):
         check_dataset(dataset_multi_malformed_zpfs_components_not_list)
     with pytest.raises(DatasetError):
         check_dataset(dataset_multi_malformed_zpfs_fractions_do_not_match_components)
@@ -409,6 +425,12 @@ def test_check_datasets_raises_with_zpf_fractions_greater_than_one():
         check_dataset(dataset_multi_mole_fractions_as_percents)
 
 
+def test_check_datasets_raises_with_negative_zpf_fractions():
+    """Passed datasets that have negative mole fractions should raise."""
+    with pytest.raises(DatasetError):
+        check_dataset(dataset_zpf_negative_mole_fraction)
+
+
 def test_check_datasets_raises_with_unsorted_interactions():
     """Passed datasets that have sublattice interactions not in sorted order should raise."""
     with pytest.raises(DatasetError):
@@ -425,7 +447,7 @@ def test_datasets_convert_thermochemical_string_values_producing_correct_value(d
 
 def test_datasets_convert_zpf_string_values_producing_correct_value(datasets_db):
     """Strings where floats are expected should give correct answers for ZPF datasets"""
-    ds = clean_dataset(CU_MG_DATASET_ZPF_STRING_VALUES)
+    ds = check_dataset(CU_MG_DATASET_ZPF_STRING_VALUES).model_dump()
     assert np.issubdtype(np.array([t[0][2] for t in ds['values']]).dtype, np.number)
     assert np.issubdtype(np.array(ds['conditions']['T']).dtype, np.number)
     assert np.issubdtype(np.array(ds['conditions']['P']).dtype, np.number)

From fb9cc5d48011c0b759d7d7570e917c0d4aca2061 Mon Sep 17 00:00:00 2001
From: bocklund <brandonbocklund@gmail.com>
Date: Sun, 17 Aug 2025 14:01:58 -0700
Subject: [PATCH 03/14] Deprecate clean_dataset as the behavior is in pydantic
 now

---
 espei/datasets/dataset_models.py |  4 ++-
 espei/datasets/db.py             | 42 ++++++--------------------------
 tests/test_datasets.py           |  7 +++---
 3 files changed, 14 insertions(+), 39 deletions(-)

diff --git a/espei/datasets/dataset_models.py b/espei/datasets/dataset_models.py
index fae2d1f0..4dc7209f 100644
--- a/espei/datasets/dataset_models.py
+++ b/espei/datasets/dataset_models.py
@@ -30,7 +30,9 @@ class Solver(BaseModel):
     sublattice_site_ratios: list[float]
     # TODO: migrate to list[list[list[float]]]
     sublattice_configurations: list[list[ComponentName | list[ComponentName]]]
-    sublattice_occupancies: list[list[float | list[float]]] # TODO: optional and validate against configurations
+    sublattice_occupancies: list[list[float | list[float]]] | None = Field(default=None)
+
+
 
 class BroadcastSinglePhaseFixedConfigurationDataset(Dataset):
     components: list[ComponentName] = Field(min_length=1)
diff --git a/espei/datasets/db.py b/espei/datasets/db.py
index 40a9b902..44b4dfa8 100644
--- a/espei/datasets/db.py
+++ b/espei/datasets/db.py
@@ -1,6 +1,6 @@
 import fnmatch, json, os
 from typing import Any, Dict, List, TypeAlias
-
+import warnings
 import numpy as np
 from tinydb.storages import MemoryStorage
 from tinydb import where
@@ -153,13 +153,13 @@ def check_dataset(dataset: dict[str, Any]) -> Dataset:
                         raise DatasetError('Sublattice {} in configuration {} is must be sorted in alphabetic order ({})'.format(subl, configuration, sorted(subl)))
 
     if dataset["output"] == "ZPF":
-        dataset_obj = ZPFDataset(**clean_dataset(dataset))
+        dataset_obj = ZPFDataset(**dataset)
     elif is_activity:
-        dataset_obj = ActivityPropertyDataset(**clean_dataset(dataset))
+        dataset_obj = ActivityPropertyDataset(**dataset)
     elif is_equilibrium:
-        dataset_obj = EquilibriumPropertyDataset(**clean_dataset(dataset))
+        dataset_obj = EquilibriumPropertyDataset(**dataset)
     elif is_single_phase:
-        dataset_obj = BroadcastSinglePhaseFixedConfigurationDataset(**clean_dataset(dataset))
+        dataset_obj = BroadcastSinglePhaseFixedConfigurationDataset(**dataset)
     else:
         raise ValueError(f"Unknown dataset type for dataset {dataset}")
     return dataset_obj
@@ -167,37 +167,9 @@ def check_dataset(dataset: dict[str, Any]) -> Dataset:
 
 def clean_dataset(dataset: dict[str, Any]) -> dict[str, Any]:
     """
-    Clean an ESPEI dataset dictionary.
-
-    Parameters
-    ----------
-    dataset: Dataset
-        Dictionary of the standard ESPEI dataset.   dataset : dic
-
-    Returns
-    -------
-    Dataset
-        Modified dataset that has been cleaned
-
-    Notes
-    -----
-    Assumes a valid, checked dataset. Currently handles
-    * Converting expected numeric values to floats
-
+    Deprecated no-op: emit a DeprecationWarning and return ``dataset`` unchanged.
     """
-    dataset["conditions"] = {k: recursive_map(float, v) for k, v in dataset["conditions"].items()}
-
-    solver = dataset.get("solver")
-    if solver is not None:
-        solver["sublattice_site_ratios"] = recursive_map(float, solver["sublattice_site_ratios"])
-        occupancies = solver.get("sublattice_occupancies")
-        if occupancies is not None:
-            solver["sublattice_occupancies"] = recursive_map(float, occupancies)
-
-    if dataset["output"] != "ZPF":
-        # values should be all numerical
-        dataset["values"] = recursive_map(float, dataset["values"])
-
+    warnings.warn("clean_dataset deprecated will be removed in ESPEI 0.11. Behavior has been migrated to the pydantic dataset implementations in espei.datasets.dataset_models.", DeprecationWarning, stacklevel=2)  # stacklevel=2 attributes the warning to the caller
     return dataset
 
 
diff --git a/tests/test_datasets.py b/tests/test_datasets.py
index 2734a517..9c6f3642 100644
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -1,6 +1,7 @@
+from copy import deepcopy
 import pytest
 import numpy as np
-from espei.datasets import DatasetError, check_dataset, clean_dataset, apply_tags
+from espei.datasets import DatasetError, check_dataset, apply_tags
 
 from .testing_data import CU_MG_EXP_ACTIVITY, CU_MG_DATASET_THERMOCHEMICAL_STRING_VALUES, CU_MG_DATASET_ZPF_STRING_VALUES, LI_SN_LIQUID_DATA, dataset_multi_valid_ternary
 from .fixtures import datasets_db
@@ -439,7 +440,7 @@ def test_check_datasets_raises_with_unsorted_interactions():
 
 def test_datasets_convert_thermochemical_string_values_producing_correct_value(datasets_db):
     """Strings where floats are expected should give correct answers for thermochemical datasets"""
-    ds = clean_dataset(CU_MG_DATASET_THERMOCHEMICAL_STRING_VALUES)
+    ds = check_dataset(CU_MG_DATASET_THERMOCHEMICAL_STRING_VALUES).model_dump()
     assert np.issubdtype(np.array(ds['values']).dtype, np.number)
     assert np.issubdtype(np.array(ds['conditions']['T']).dtype, np.number)
     assert np.issubdtype(np.array(ds['conditions']['P']).dtype, np.number)
@@ -468,7 +469,7 @@ def test_non_equilibrium_thermo_data_with_species_passes_checker():
 
 def test_applying_tags(datasets_db):
     """Test that applying tags updates the appropriate values"""
-    dataset = clean_dataset(CU_MG_DATASET_THERMOCHEMICAL_STRING_VALUES)
+    dataset = deepcopy(CU_MG_DATASET_THERMOCHEMICAL_STRING_VALUES)
     # overwrite tags for this test
     dataset["tags"] = ["testtag"]
     datasets_db.insert(dataset)

From b3d8e9bd35865e45c52a7877479c2309f0898ef7 Mon Sep 17 00:00:00 2001
From: bocklund <brandonbocklund@gmail.com>
Date: Sun, 17 Aug 2025 16:10:01 -0700
Subject: [PATCH 04/14] Fix max length

---
 espei/datasets/dataset_models.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/espei/datasets/dataset_models.py b/espei/datasets/dataset_models.py
index 4dc7209f..02d8cef9 100644
--- a/espei/datasets/dataset_models.py
+++ b/espei/datasets/dataset_models.py
@@ -58,7 +58,7 @@ class ActivityDataReferenceState(BaseModel):
 # TODO: refactor to merge this with EquilibriumPropertyDataset
 class ActivityPropertyDataset(Dataset):
     components: list[ComponentName] = Field(min_length=1)
-    phases: list[PhaseName] = Field(min_length=1, max_length=1)
+    phases: list[PhaseName] = Field(min_length=1)
     conditions: dict[str, float | list[float]]
     reference_state: ActivityDataReferenceState
     output: str
@@ -77,7 +77,7 @@ class ReferenceStates(BaseModel):
 
 class EquilibriumPropertyDataset(Dataset):
     components: list[ComponentName] = Field(min_length=1)
-    phases: list[PhaseName] = Field(min_length=1, max_length=1)
+    phases: list[PhaseName] = Field(min_length=1)
     conditions: dict[str, float | list[float]]
     reference_states: dict[ComponentName, ReferenceStates]
     output: str

From 5a0b8879b9ef9107c7c5dfa4217a6e400ed853bf Mon Sep 17 00:00:00 2001
From: bocklund <brandonbocklund@gmail.com>
Date: Sun, 17 Aug 2025 16:39:36 -0700
Subject: [PATCH 05/14] Migrate check_datasets validators to pydantic models

---
 espei/datasets/dataset_models.py | 61 ++++++++++++++++++++++++++++++++
 espei/datasets/db.py             | 58 ++++--------------------------
 2 files changed, 67 insertions(+), 52 deletions(-)

diff --git a/espei/datasets/dataset_models.py b/espei/datasets/dataset_models.py
index 02d8cef9..89290558 100644
--- a/espei/datasets/dataset_models.py
+++ b/espei/datasets/dataset_models.py
@@ -49,6 +49,67 @@ class BroadcastSinglePhaseFixedConfigurationDataset(Dataset):
     disabled: bool = Field(default=False)
 
 
+    @model_validator(mode="after")
+    def validate_components_entered_match_components_used(self) -> Self:
+        components_entered = set(self.components)
+        components_used = set()
+        for config in self.solver.sublattice_configurations:
+            for subl in config:
+                if isinstance(subl, list):
+                    components_used.update(set(subl))
+                else:
+                    components_used.add(subl)
+        # Don't count vacancies as a component here
+        components_difference = components_entered.symmetric_difference(components_used) - {"VA"}
+        if len(components_difference) != 0:
+            raise DatasetError(f'Components entered {components_entered} do not match components used {components_used} ({components_difference} different).')
+        return self
+
+    @model_validator(mode="after")
+    def validate_condition_value_shape_agreement(self) -> Self:
+        values_shape = np.array(self.values).shape
+        num_configs = len(self.solver.sublattice_configurations)
+        num_temperature = np.atleast_1d(self.conditions["T"]).size
+        num_pressure = np.atleast_1d(self.conditions["P"]).size
+        conditions_shape = (num_pressure, num_temperature, num_configs)
+        if conditions_shape != values_shape:
+            raise DatasetError(f'Shape of conditions (P, T, configs): {conditions_shape} does not match the shape of the values {values_shape}.')
+        return self
+
+    @model_validator(mode="after")
+    def validate_configuration_occupancy_shape_agreement(self) -> Self:
+        sublattice_configurations = self.solver.sublattice_configurations
+        sublattice_site_ratios = self.solver.sublattice_site_ratios
+        sublattice_occupancies = self.solver.sublattice_occupancies
+        # check for mixing
+        is_mixing = any([any([isinstance(subl, list) for subl in config]) for config in sublattice_configurations])
+        # pad the values of sublattice occupancies if there is no mixing
+        # just for the purposes of checking validity
+        if sublattice_occupancies is None and not is_mixing:
+            sublattice_occupancies = [None]*len(sublattice_configurations)
+        elif sublattice_occupancies is None:
+            raise DatasetError(f'At least one sublattice in the following sublattice configurations is mixing, but the "sublattice_occupancies" key is empty: {sublattice_configurations}')
+
+        # check that the site ratios are valid as well as site occupancies, if applicable
+        nconfigs = len(sublattice_configurations)
+        noccupancies = len(sublattice_occupancies)
+        if nconfigs != noccupancies:
+            raise DatasetError(f'Number of sublattice configurations ({nconfigs}) does not match the number of sublattice occupancies ({noccupancies})')
+        for configuration, occupancy in zip(sublattice_configurations, sublattice_occupancies):
+            if len(configuration) != len(sublattice_site_ratios):
+                raise DatasetError(f'Sublattice configuration {configuration} and sublattice site ratio {sublattice_site_ratios} describe different numbers of sublattices ({len(configuration)} and {len(sublattice_site_ratios)}).')
+            if is_mixing:
+                configuration_shape = tuple(len(sl) if isinstance(sl, list) else 1 for sl in configuration)
+                occupancy_shape = tuple(len(sl) if isinstance(sl, list) else 1 for sl in occupancy)
+                if configuration_shape != occupancy_shape:
+                    raise DatasetError(f'The shape of sublattice configuration {configuration} ({configuration_shape}) does not match the shape of occupancies {occupancy} ({occupancy_shape})')
+                # check that sublattice interactions are in sorted. Related to sorting in espei.core_utils.get_samples
+                for subl in configuration:
+                    if isinstance(subl, (list, tuple)) and sorted(subl) != subl:
+                        raise DatasetError(f'Sublattice {subl} in configuration {configuration} is must be sorted in alphabetic order ({sorted(subl)})')
+        return self
+
+
 # TODO: would be great to remove
 class ActivityDataReferenceState(BaseModel):
     phases: list[PhaseName] = Field(min_length=1)
diff --git a/espei/datasets/db.py b/espei/datasets/db.py
index 44b4dfa8..94a262f7 100644
--- a/espei/datasets/db.py
+++ b/espei/datasets/db.py
@@ -69,23 +69,10 @@ def check_dataset(dataset: dict[str, Any]) -> Dataset:
     """
     is_equilibrium = 'solver' not in dataset.keys() and dataset['output'] != 'ZPF'
     is_activity = dataset['output'].startswith('ACR')
-    is_single_phase = 'solver' in dataset.keys()
     components = dataset['components']
     conditions = dataset['conditions']
     values = dataset['values']
     phases = dataset['phases']
-    if is_single_phase:
-        solver = dataset['solver']
-        sublattice_configurations = solver['sublattice_configurations']
-        sublattice_site_ratios = solver['sublattice_site_ratios']
-        sublattice_occupancies = solver.get('sublattice_occupancies', None)
-        # check for mixing
-        is_mixing = any([any([isinstance(subl, list) for subl in config]) for config in sublattice_configurations])
-        # pad the values of sublattice occupancies if there is no mixing
-        if sublattice_occupancies is None and not is_mixing:
-            sublattice_occupancies = [None]*len(sublattice_configurations)
-        elif sublattice_occupancies is None:
-            raise DatasetError('At least one sublattice in the following sublattice configurations is mixing, but the "sublattice_occupancies" key is empty: {}'.format(sublattice_configurations))
     if is_equilibrium:
         conditions = dataset['conditions']
         comp_conditions = {k: v for k, v in conditions.items() if k.startswith('X_')}
@@ -108,49 +95,16 @@ def check_dataset(dataset: dict[str, Any]) -> Dataset:
         conditions_shape = (num_pressure, num_temperature, num_x_conds[0])
         if conditions_shape != values_shape:
             raise DatasetError('Shape of conditions (P, T, compositions): {} does not match the shape of the values {}.'.format(conditions_shape, values_shape))
-    elif is_single_phase:
-        values_shape = np.array(values).shape
-        num_configs = len(dataset['solver']['sublattice_configurations'])
-        conditions_shape = (num_pressure, num_temperature, num_configs)
-        if conditions_shape != values_shape:
-            raise DatasetError('Shape of conditions (P, T, configs): {} does not match the shape of the values {}.'.format(conditions_shape, values_shape))
 
     # check that all of the components used match the components entered
-    components_entered = set(components)
-    components_used = set()
-    if is_single_phase:
-        for config in sublattice_configurations:
-            for sl in config:
-                if isinstance(sl, list):
-                    components_used.update(set(sl))
-                else:
-                    components_used.add(sl)
-        comp_dof = 0
-    elif is_equilibrium:
+    if is_equilibrium:  # and is_activity
+        components_entered = set(components)
+        components_used = set()
         components_used.update({c.split('_')[1] for c in comp_conditions.keys()})
         # mass balance of components
         comp_dof = len(comp_conditions.keys())
-    if (is_single_phase or is_activity or is_equilibrium) and (len(components_entered - components_used - {'VA'}) > comp_dof or len(components_used - components_entered) > 0):
-        raise DatasetError('Components entered {} do not match components used {}.'.format(components_entered, components_used))
-
-    # check that the site ratios are valid as well as site occupancies, if applicable
-    if is_single_phase:
-        nconfigs = len(sublattice_configurations)
-        noccupancies = len(sublattice_occupancies)
-        if nconfigs != noccupancies:
-            raise DatasetError('Number of sublattice configurations ({}) does not match the number of sublattice occupancies ({})'.format(nconfigs, noccupancies))
-        for configuration, occupancy in zip(sublattice_configurations, sublattice_occupancies):
-            if len(configuration) != len(sublattice_site_ratios):
-                raise DatasetError('Sublattice configuration {} and sublattice site ratio {} describe different numbers of sublattices ({} and {}).'.format(configuration, sublattice_site_ratios, len(configuration), len(sublattice_site_ratios)))
-            if is_mixing:
-                configuration_shape = tuple(len(sl) if isinstance(sl, list) else 1 for sl in configuration)
-                occupancy_shape = tuple(len(sl) if isinstance(sl, list) else 1 for sl in occupancy)
-                if configuration_shape != occupancy_shape:
-                    raise DatasetError('The shape of sublattice configuration {} ({}) does not match the shape of occupancies {} ({})'.format(configuration, configuration_shape, occupancy, occupancy_shape))
-                # check that sublattice interactions are in sorted. Related to sorting in espei.core_utils.get_samples
-                for subl in configuration:
-                    if isinstance(subl, (list, tuple)) and sorted(subl) != subl:
-                        raise DatasetError('Sublattice {} in configuration {} is must be sorted in alphabetic order ({})'.format(subl, configuration, sorted(subl)))
+        if len(components_entered - components_used - {'VA'}) > comp_dof or len(components_used - components_entered) > 0:
+            raise DatasetError('Components entered {} do not match components used {}.'.format(components_entered, components_used))
 
     if dataset["output"] == "ZPF":
         dataset_obj = ZPFDataset(**dataset)
@@ -158,7 +112,7 @@ def check_dataset(dataset: dict[str, Any]) -> Dataset:
         dataset_obj = ActivityPropertyDataset(**dataset)
     elif is_equilibrium:
         dataset_obj = EquilibriumPropertyDataset(**dataset)
-    elif is_single_phase:
+    elif 'solver' in dataset.keys():
         dataset_obj = BroadcastSinglePhaseFixedConfigurationDataset(**dataset)
     else:
         raise ValueError(f"Unknown dataset type for dataset {dataset}")

From d2f64271906d835067c67d993ed2ab275f10bac8 Mon Sep 17 00:00:00 2001
From: bocklund <brandonbocklund@gmail.com>
Date: Sun, 17 Aug 2025 16:48:27 -0700
Subject: [PATCH 06/14] Cleanup of activity check_dataset stuff

Activity-specific checks weren't being done at all; everything is subsumed by the equilibrium checks
---
 espei/datasets/dataset_models.py |  2 --
 espei/datasets/db.py             | 12 +++++-------
 2 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/espei/datasets/dataset_models.py b/espei/datasets/dataset_models.py
index 89290558..82e765cd 100644
--- a/espei/datasets/dataset_models.py
+++ b/espei/datasets/dataset_models.py
@@ -33,7 +33,6 @@ class Solver(BaseModel):
     sublattice_occupancies: list[list[float | list[float]]] | None = Field(default=None)
 
 
-
 class BroadcastSinglePhaseFixedConfigurationDataset(Dataset):
     components: list[ComponentName] = Field(min_length=1)
     phases: list[PhaseName] = Field(min_length=1, max_length=1)
@@ -48,7 +47,6 @@ class BroadcastSinglePhaseFixedConfigurationDataset(Dataset):
     comment: str = Field(default="")
     disabled: bool = Field(default=False)
 
-
     @model_validator(mode="after")
     def validate_components_entered_match_components_used(self) -> Self:
         components_entered = set(self.components)
diff --git a/espei/datasets/db.py b/espei/datasets/db.py
index 94a262f7..3cd2844e 100644
--- a/espei/datasets/db.py
+++ b/espei/datasets/db.py
@@ -68,17 +68,11 @@ def check_dataset(dataset: dict[str, Any]) -> Dataset:
         If an error is found in the dataset
     """
     is_equilibrium = 'solver' not in dataset.keys() and dataset['output'] != 'ZPF'
-    is_activity = dataset['output'].startswith('ACR')
     components = dataset['components']
     conditions = dataset['conditions']
     values = dataset['values']
     phases = dataset['phases']
     if is_equilibrium:
-        conditions = dataset['conditions']
-        comp_conditions = {k: v for k, v in conditions.items() if k.startswith('X_')}
-    if is_activity:
-        ref_state = dataset['reference_state']
-    elif is_equilibrium:
         for el, vals in dataset.get('reference_states', {}).items():
             if 'phase' not in vals:
                 raise DatasetError(f'Reference state for element {el} must define the `phase` key with the reference phase name.')
@@ -87,6 +81,8 @@ def check_dataset(dataset: dict[str, Any]) -> Dataset:
     num_pressure = np.atleast_1d(conditions['P']).size
     num_temperature = np.atleast_1d(conditions['T']).size
     if is_equilibrium:
+        conditions = dataset['conditions']
+        comp_conditions = {k: v for k, v in conditions.items() if k.startswith('X_')}
         values_shape = np.array(values).shape
         # check each composition condition is the same shape
         num_x_conds = [len(v) for _, v in comp_conditions.items()]
@@ -98,6 +94,8 @@ def check_dataset(dataset: dict[str, Any]) -> Dataset:
 
     # check that all of the components used match the components entered
     if is_equilibrium:  # and is_activity
+        conditions = dataset['conditions']
+        comp_conditions = {k: v for k, v in conditions.items() if k.startswith('X_')}
         components_entered = set(components)
         components_used = set()
         components_used.update({c.split('_')[1] for c in comp_conditions.keys()})
@@ -108,7 +106,7 @@ def check_dataset(dataset: dict[str, Any]) -> Dataset:
 
     if dataset["output"] == "ZPF":
         dataset_obj = ZPFDataset(**dataset)
-    elif is_activity:
+    elif dataset['output'].startswith('ACR'):
         dataset_obj = ActivityPropertyDataset(**dataset)
     elif is_equilibrium:
         dataset_obj = EquilibriumPropertyDataset(**dataset)

From a3610feed7be93635fdd603daf3e1535b4e7c558 Mon Sep 17 00:00:00 2001
From: bocklund <brandonbocklund@gmail.com>
Date: Sun, 17 Aug 2025 18:20:08 -0700
Subject: [PATCH 07/14] Migrate equilibrium and activity check_datasets
 functionality to pydantic models

Added some new tests for behavior that was previously not covered by tests
---
 espei/datasets/dataset_models.py              | 77 ++++++++++++++-
 espei/datasets/db.py                          | 41 +-------
 .../equilibrium_thermochemical_error.py       |  2 +-
 tests/test_datasets.py                        | 96 +++++++++++++++++++
 4 files changed, 173 insertions(+), 43 deletions(-)

diff --git a/espei/datasets/dataset_models.py b/espei/datasets/dataset_models.py
index 82e765cd..1a9d82d7 100644
--- a/espei/datasets/dataset_models.py
+++ b/espei/datasets/dataset_models.py
@@ -109,12 +109,15 @@ def validate_configuration_occupancy_shape_agreement(self) -> Self:
 
 
 # TODO: would be great to remove
-class ActivityDataReferenceState(BaseModel):
+class ActivityDataReferenceState(Dataset):
     phases: list[PhaseName] = Field(min_length=1)
     conditions: dict[str, float]
 
 
 # TODO: refactor to merge this with EquilibriumPropertyDataset
+# The validator functions are exactly duplicated in EquilibriumPropertyDataset
+# The duplication simplifies the implementation since the activity special case is
+# ultimately meant to be removed once activity is a PyCalphad Workspace property
 class ActivityPropertyDataset(Dataset):
     components: list[ComponentName] = Field(min_length=1)
     phases: list[PhaseName] = Field(min_length=1)
@@ -128,6 +131,36 @@ class ActivityPropertyDataset(Dataset):
     comment: str = Field(default="")
     disabled: bool = Field(default=False)
 
+    @model_validator(mode="after")
+    def validate_condition_value_shape_agreement(self) -> Self:
+        conditions = self.conditions
+        comp_conditions = {k: v for k, v in conditions.items() if k.startswith('X_')}
+        num_temperature = np.atleast_1d(self.conditions["T"]).size
+        num_pressure = np.atleast_1d(self.conditions["P"]).size
+        # check each composition condition is the same shape
+        num_x_conds = [np.atleast_1d(vals).size for _, vals in comp_conditions.items()]
+        if num_x_conds.count(num_x_conds[0]) != len(num_x_conds):
+            raise DatasetError(f'All compositions in conditions are not the same shape. Note that conditions cannot be broadcast. Composition conditions are {comp_conditions}')
+        conditions_shape = (num_pressure, num_temperature, num_x_conds[0])
+        values_shape = np.array(self.values).shape
+        if conditions_shape != values_shape:
+            raise DatasetError(f'Shape of conditions (P, T, compositions): {conditions_shape} does not match the shape of the values {values_shape}.')
+        return self
+
+    @model_validator(mode="after")
+    def validate_components_entered_match_components_used(self) -> Self:
+        conditions = self.conditions
+        comp_conditions = {ky: vl for ky, vl in conditions.items() if ky.startswith('X_')}
+        components_entered = set(self.components)
+        components_used = set()
+        components_used.update({c.split('_')[1] for c in comp_conditions.keys()})
+        if not components_entered.issuperset(components_used):
+            raise DatasetError(f"Components were used as conditions that are not present in the specified components: {components_used - components_entered}.")
+        independent_components = components_entered - components_used - {'VA'}
+        if len(independent_components) != 1:
+            raise DatasetError(f"Degree of freedom error: expected 1 independent component, got {len(independent_components)} for entered components {components_entered} and {components_used} used in the conditions.")
+        return self
+
 
 class ReferenceStates(BaseModel):
     phase: PhaseName
@@ -138,15 +171,55 @@ class EquilibriumPropertyDataset(Dataset):
     components: list[ComponentName] = Field(min_length=1)
     phases: list[PhaseName] = Field(min_length=1)
     conditions: dict[str, float | list[float]]
-    reference_states: dict[ComponentName, ReferenceStates]
     output: str
     values: list[list[list[float]]]
+    reference_states: dict[ComponentName, ReferenceStates] | None = Field(default=None)
     reference: str = Field(default="")
     bibtex: str = Field(default="")
     dataset_author: str = Field(default="")
     comment: str = Field(default="")
     disabled: bool = Field(default=False)
 
+    @model_validator(mode="after")
+    def validate_condition_value_shape_agreement(self) -> Self:
+        conditions = self.conditions
+        comp_conditions = {k: v for k, v in conditions.items() if k.startswith('X_')}
+        num_temperature = np.atleast_1d(self.conditions["T"]).size
+        num_pressure = np.atleast_1d(self.conditions["P"]).size
+        # check each composition condition is the same shape
+        num_x_conds = [np.atleast_1d(vals).size for _, vals in comp_conditions.items()]
+        if num_x_conds.count(num_x_conds[0]) != len(num_x_conds):
+            raise DatasetError(f'All compositions in conditions are not the same shape. Note that conditions cannot be broadcast. Composition conditions are {comp_conditions}')
+        conditions_shape = (num_pressure, num_temperature, num_x_conds[0])
+        values_shape = np.array(self.values).shape
+        if conditions_shape != values_shape:
+            raise DatasetError(f'Shape of conditions (P, T, compositions): {conditions_shape} does not match the shape of the values {values_shape}.')
+        return self
+
+    @model_validator(mode="after")
+    def validate_components_entered_match_components_used(self) -> Self:
+        conditions = self.conditions
+        comp_conditions = {ky: vl for ky, vl in conditions.items() if ky.startswith('X_')}
+        components_entered = set(self.components)
+        components_used = set()
+        components_used.update({c.split('_')[1] for c in comp_conditions.keys()})
+        if not components_entered.issuperset(components_used):
+            raise DatasetError(f"Components were used as conditions that are not present in the specified components: {components_used - components_entered}.")
+        independent_components = components_entered - components_used - {'VA'}
+        if len(independent_components) != 1:
+            raise DatasetError(f"Degree of freedom error: expected 1 independent component, got {len(independent_components)} for entered components {components_entered} and {components_used} used in the conditions.")
+        return self
+
+    @model_validator(mode="after")
+    def validate_reference_state_fully_specified_if_used(self) -> Self:
+        """If there is a reference state specified, the components in the reference state must match the dataset components"""
+        components_entered = set(self.components) - {"VA"}
+        if self.reference_states is not None:
+            reference_state_components = set(self.reference_states.keys()) - {"VA"}
+            if components_entered != reference_state_components:
+                raise DatasetError(f"If used, reference states in equilibrium property must define a reference state for all components in the calculation. Got {components_entered} entered components and {reference_state_components} in the reference states ({components_entered.symmetric_difference(reference_state_components)} non-matching).")
+        return self
+
 
 class ZPFDataset(Dataset):
     components: list[ComponentName] = Field(min_length=1)
diff --git a/espei/datasets/db.py b/espei/datasets/db.py
index 3cd2844e..34ebde77 100644
--- a/espei/datasets/db.py
+++ b/espei/datasets/db.py
@@ -67,53 +67,14 @@ def check_dataset(dataset: dict[str, Any]) -> Dataset:
     DatasetError
         If an error is found in the dataset
     """
-    is_equilibrium = 'solver' not in dataset.keys() and dataset['output'] != 'ZPF'
-    components = dataset['components']
-    conditions = dataset['conditions']
-    values = dataset['values']
-    phases = dataset['phases']
-    if is_equilibrium:
-        for el, vals in dataset.get('reference_states', {}).items():
-            if 'phase' not in vals:
-                raise DatasetError(f'Reference state for element {el} must define the `phase` key with the reference phase name.')
-
-    # check that the shape of conditions match the values
-    num_pressure = np.atleast_1d(conditions['P']).size
-    num_temperature = np.atleast_1d(conditions['T']).size
-    if is_equilibrium:
-        conditions = dataset['conditions']
-        comp_conditions = {k: v for k, v in conditions.items() if k.startswith('X_')}
-        values_shape = np.array(values).shape
-        # check each composition condition is the same shape
-        num_x_conds = [len(v) for _, v in comp_conditions.items()]
-        if num_x_conds.count(num_x_conds[0]) != len(num_x_conds):
-            raise DatasetError('All compositions in conditions are not the same shape. Note that conditions cannot be broadcast. Composition conditions are {}'.format(comp_conditions))
-        conditions_shape = (num_pressure, num_temperature, num_x_conds[0])
-        if conditions_shape != values_shape:
-            raise DatasetError('Shape of conditions (P, T, compositions): {} does not match the shape of the values {}.'.format(conditions_shape, values_shape))
-
-    # check that all of the components used match the components entered
-    if is_equilibrium:  # and is_activity
-        conditions = dataset['conditions']
-        comp_conditions = {k: v for k, v in conditions.items() if k.startswith('X_')}
-        components_entered = set(components)
-        components_used = set()
-        components_used.update({c.split('_')[1] for c in comp_conditions.keys()})
-        # mass balance of components
-        comp_dof = len(comp_conditions.keys())
-        if len(components_entered - components_used - {'VA'}) > comp_dof or len(components_used - components_entered) > 0:
-            raise DatasetError('Components entered {} do not match components used {}.'.format(components_entered, components_used))
-
     if dataset["output"] == "ZPF":
         dataset_obj = ZPFDataset(**dataset)
     elif dataset['output'].startswith('ACR'):
         dataset_obj = ActivityPropertyDataset(**dataset)
-    elif is_equilibrium:
-        dataset_obj = EquilibriumPropertyDataset(**dataset)
     elif 'solver' in dataset.keys():
         dataset_obj = BroadcastSinglePhaseFixedConfigurationDataset(**dataset)
     else:
-        raise ValueError(f"Unknown dataset type for dataset {dataset}")
+        dataset_obj = EquilibriumPropertyDataset(**dataset)
     return dataset_obj
 
 
diff --git a/espei/error_functions/equilibrium_thermochemical_error.py b/espei/error_functions/equilibrium_thermochemical_error.py
index bd96194c..6352811d 100644
--- a/espei/error_functions/equilibrium_thermochemical_error.py
+++ b/espei/error_functions/equilibrium_thermochemical_error.py
@@ -87,7 +87,7 @@ def build_eqpropdata(data: tinydb.database.Document,
 
     # Models are now modified in response to the data from this data
     # TODO: build a reference state MetaProperty with the reference state information, maybe just-in-time, below
-    if 'reference_states' in data:
+    if data.get("reference_states") is not None:
         property_output = output[:-1] if output.endswith('R') else output  # unreferenced model property so we can tell shift_reference_state what to build.
         reference_states = []
         for el, vals in data['reference_states'].items():
diff --git a/tests/test_datasets.py b/tests/test_datasets.py
index 9c6f3642..72078f97 100644
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -396,6 +396,36 @@ def test_check_datasets_raises_with_incorrect_components():
     with pytest.raises(DatasetError):
         check_dataset(dataset_multi_incorrect_components_underspecified)
 
+    # equilibrium datasets underspecified
+    ds_eq_underspecified = {
+    "components": ["NI"],
+    "phases": ["LIQUID"],
+    "conditions": {
+        "P": 101325,
+        "T": [1348, 1176, 977],
+        "X_NI": 0.5
+    },
+    "output": "HM",
+    "values": [[[-1000], [-900], [-800]]]
+    }
+    with pytest.raises(DatasetError):
+        check_dataset(ds_eq_underspecified)
+
+    # equilibrium datasets overspecified
+    ds_eq_overspecified = {
+    "components": ["CU", "MG", "NI"],
+    "phases": ["LIQUID"],
+    "conditions": {
+        "P": 101325,
+        "T": [1348, 1176, 977],
+        "X_NI": 0.5
+    },
+    "output": "HM",
+    "values": [[[-1000], [-900], [-800]]]
+    }
+    with pytest.raises(DatasetError):
+        check_dataset(ds_eq_overspecified)
+
 
 def test_check_datasets_raises_with_malformed_zpf():
     """Passed datasets that have malformed ZPF values should raise."""
@@ -415,6 +445,72 @@ def test_check_datasets_raises_with_malformed_sublattice_configurations():
        check_dataset(dataset_single_malformed_site_ratios)
 
 
+def test_check_datasets_raises_with_equilibrium_conditions_and_values_shapes_mismatch():
+    """Passed equilibrium datasets that have mismatched condition and values shapes should raise."""
+    COND_VALS_SHAPE_GOOD = {
+        "components": ["CU", "MG"],
+        "phases": ["LIQUID"],
+        "conditions": {"P": [101325, 1e5], "T": [1400, 1500, 1600], "X_MG": [0.5, 0.6, 0.7, 0.8]},
+        "reference_states": {
+            "CU": {"phase": "LIQUID"},
+            "MG": {"phase": "LIQUID"}
+        },
+        "output": "HMR",
+        "values": np.zeros((2, 3, 4)).tolist(),
+        "reference": "equilibrium thermochemical tests",
+    }
+    # Good shape should not raise
+    check_dataset(COND_VALS_SHAPE_GOOD)
+
+    COND_VALS_SHAPE_DISAGREEMENT_1_1_2 = {
+        "components": ["CU", "MG", "NI"],
+        "phases": ["LIQUID"],
+        "conditions": {"P": 101325, "T": [1400], "X_MG": [0.5, 0.6], "X_NI": [0.5, 0.6]},
+        "reference_states": {
+            "CU": {"phase": "LIQUID"},
+            "MG": {"phase": "LIQUID"},
+            "NI": {"phase": "LIQUID"}
+        },
+        "output": "HMR",
+        "values": [[[0]]],
+        "reference": "equilibrium thermochemical tests",
+    }
+    with pytest.raises(DatasetError):
+        check_dataset(COND_VALS_SHAPE_DISAGREEMENT_1_1_2)
+
+    COND_VALS_SHAPE_DISAGREEMENT_1_2_2 = {
+        "components": ["CU", "MG"],
+        "phases": ["LIQUID"],
+        "conditions": {"P": 101325, "T": [1400, 1500], "X_MG": [0.5, 0.6]},
+        "reference_states": {
+            "CU": {"phase": "LIQUID"},
+            "MG": {"phase": "LIQUID"}
+        },
+        "output": "HMR",
+        "values": [[[0, 0]]],
+        "reference": "equilibrium thermochemical tests",
+    }
+    with pytest.raises(DatasetError):
+        check_dataset(COND_VALS_SHAPE_DISAGREEMENT_1_2_2)
+
+    # we don't broadcast over compositions, so composition conditions shapes need to match
+    MISMATCHED_COMPOSITION_CONDS = {
+        "components": ["CU", "MG", "NI"],
+        "phases": ["LIQUID"],
+        "conditions": {"P": 101325, "T": [1400], "X_MG": [0.5, 0.6], "X_NI": [0.5]},
+        "reference_states": {
+            "CU": {"phase": "LIQUID"},
+            "MG": {"phase": "LIQUID"},
+            "NI": {"phase": "LIQUID"}
+        },
+        "output": "HMR",
+        "values": [[[0, 0]]],
+        "reference": "equilibrium thermochemical tests",
+    }
+    with pytest.raises(DatasetError):
+        check_dataset(MISMATCHED_COMPOSITION_CONDS)
+
+
 def test_check_datasets_works_on_activity_data():
     """Passed activity datasets should work correctly."""
     check_dataset(CU_MG_EXP_ACTIVITY)

From 1fa67243cf8d337aecac191775d9a0a69911a0c1 Mon Sep 17 00:00:00 2001
From: bocklund <brandonbocklund@gmail.com>
Date: Sun, 17 Aug 2025 18:23:54 -0700
Subject: [PATCH 08/14] Delete recursive_map as dead code

---
 espei/datasets/db.py     | 28 ----------------------------
 tests/test_core_utils.py | 14 --------------
 2 files changed, 42 deletions(-)

diff --git a/espei/datasets/db.py b/espei/datasets/db.py
index 34ebde77..87e593a9 100644
--- a/espei/datasets/db.py
+++ b/espei/datasets/db.py
@@ -10,34 +10,6 @@
 from .dataset_models import Dataset, ActivityPropertyDataset, BroadcastSinglePhaseFixedConfigurationDataset, EquilibriumPropertyDataset, ZPFDataset, DatasetError
 
 
-
-def recursive_map(f, x):
-    """
-    map, but over nested lists
-
-    Parameters
-    ----------
-    f : callable
-        Function to apply to x
-    x : list or value
-        Value passed to v
-
-    Returns
-    -------
-    list or value
-    """
-    if isinstance(x, list):
-        if [isinstance(xx, list) for xx in x]:
-            # we got a nested list
-            return [recursive_map(f, xx) for xx in x]
-        else:
-            # it's a list with some values inside
-            return list(map(f, x))
-    else:
-        # not a list, probably just a singular value
-        return f(x)
-
-
 def check_dataset(dataset: dict[str, Any]) -> Dataset:
     """Ensure that the dataset is valid and consistent.
 
diff --git a/tests/test_core_utils.py b/tests/test_core_utils.py
index 5c150f1e..9fd45dca 100644
--- a/tests/test_core_utils.py
+++ b/tests/test_core_utils.py
@@ -2,7 +2,6 @@
 import tinydb
 
 from espei.core_utils import get_prop_data, filter_configurations, filter_temperatures, symmetry_filter, ravel_zpf_values
-from espei.datasets import recursive_map
 from espei.sublattice_tools import recursive_tuplify
 from espei.utils import PickleableTinyDB, MemoryStorage
 from espei.error_functions.non_equilibrium_thermochemical_error import get_prop_samples
@@ -55,19 +54,6 @@ def test_get_data_for_a_minimal_example():
     assert desired_data['values'] == np.array([[[34720.0]]])
 
 
-def test_recursive_map():
-    """Test that recursive map function works"""
-
-    strings = [[["1.0"], ["5.5", "8.8"], ["10.7"]]]
-    floats = [[[1.0], [5.5, 8.8], [10.7]]]
-
-    assert recursive_map(float, strings) == floats
-    assert recursive_map(str, floats) == strings
-    assert recursive_map(float, "1.234") == 1.234
-    assert recursive_map(int, ["1", "2", "5"]) == [1, 2, 5]
-    assert recursive_map(float, ["1.0", ["0.5", "0.5"]]) == [1.0, [0.5, 0.5]]
-
-
 def test_get_prop_samples_ravels_correctly():
     """get_prop_samples should ravel non-equilibrium thermochemical data correctly"""
     desired_data = [{

From c31d9ae447ba4a62c0b689b02291da941515f118 Mon Sep 17 00:00:00 2001
From: bocklund <brandonbocklund@gmail.com>
Date: Sun, 17 Aug 2025 18:44:10 -0700
Subject: [PATCH 09/14] Ensure tags are present in the dataset models

---
 espei/datasets/dataset_models.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/espei/datasets/dataset_models.py b/espei/datasets/dataset_models.py
index 1a9d82d7..d702ca6b 100644
--- a/espei/datasets/dataset_models.py
+++ b/espei/datasets/dataset_models.py
@@ -8,6 +8,7 @@
     "ActivityPropertyDataset",
     "EquilibriumPropertyDataset",
     "ZPFDataset",
+    "DatasetError",
 ]
 
 class DatasetError(Exception):
@@ -46,6 +47,7 @@ class BroadcastSinglePhaseFixedConfigurationDataset(Dataset):
     dataset_author: str = Field(default="")
     comment: str = Field(default="")
     disabled: bool = Field(default=False)
+    tags: list[str] = Field(default_factory=list)
 
     @model_validator(mode="after")
     def validate_components_entered_match_components_used(self) -> Self:
@@ -130,6 +132,7 @@ class ActivityPropertyDataset(Dataset):
     dataset_author: str = Field(default="")
     comment: str = Field(default="")
     disabled: bool = Field(default=False)
+    tags: list[str] = Field(default_factory=list)
 
     @model_validator(mode="after")
     def validate_condition_value_shape_agreement(self) -> Self:
@@ -179,6 +182,7 @@ class EquilibriumPropertyDataset(Dataset):
     dataset_author: str = Field(default="")
     comment: str = Field(default="")
     disabled: bool = Field(default=False)
+    tags: list[str] = Field(default_factory=list)
 
     @model_validator(mode="after")
     def validate_condition_value_shape_agreement(self) -> Self:
@@ -234,6 +238,7 @@ class ZPFDataset(Dataset):
     dataset_author: str = Field(default="")
     comment: str = Field(default="")
     disabled: bool = Field(default=False)
+    tags: list[str] = Field(default_factory=list)
 
     @model_validator(mode="after")
     def validate_condition_value_shape_agreement(self) -> Self:

From 8a9d38f14822bcafa33f4ea283496c7ddb8935a5 Mon Sep 17 00:00:00 2001
From: bocklund <brandonbocklund@gmail.com>
Date: Sun, 17 Aug 2025 18:47:10 -0700
Subject: [PATCH 10/14] Multiple dataset cleanups:

- Add __all__ for datasets
- implement to_Dataset
- deprecate check_dataset
---
 espei/datasets/dataset_models.py | 32 +++++++++++++-
 espei/datasets/db.py             | 63 +++++++--------------------
 tests/test_datasets.py           | 74 ++++++++++++++++----------------
 3 files changed, 84 insertions(+), 85 deletions(-)

diff --git a/espei/datasets/dataset_models.py b/espei/datasets/dataset_models.py
index d702ca6b..74673031 100644
--- a/espei/datasets/dataset_models.py
+++ b/espei/datasets/dataset_models.py
@@ -1,4 +1,4 @@
-from typing import Literal, Optional, Union, TypeAlias, Self
+from typing import Any, Literal, Union, TypeAlias, Self
 from pydantic import BaseModel, Field, model_validator, field_validator
 import numpy as np
 
@@ -9,6 +9,7 @@
     "EquilibriumPropertyDataset",
     "ZPFDataset",
     "DatasetError",
+    "to_Dataset",
 ]
 
 class DatasetError(Exception):
@@ -301,4 +302,31 @@ def validate_phase_compositions(cls, values: list[PhaseRegionType]) -> list[Phas
                     raise DatasetError('Mole fractions for phase composition {} ({}) for phase region {} ({}) sum to greater than one.'.format(j, phase_composition, i, phase_region))
                 if any([(mf is not None) and (mf < 0.0) for mf in mole_fraction_list]):
                     raise DatasetError('Got unallowed negative mole fraction for phase composition {} ({}) for phase region {} ({}).'.format(j, phase_composition, i, phase_region))
-        return values
\ No newline at end of file
+        return values
+
+
+def to_Dataset(candidate: dict[str, Any]) -> Dataset:
+    """Return a validated Dataset object for a dataset dict. Raises if a validated dataset cannot be created.
+
+    Parameters
+    ----------
+    candidate : dict[str, Any]
+        Dictionary describing an ESPEI dataset.
+
+    Returns
+    -------
+    Dataset
+
+    Raises
+    ------
+    DatasetError
+        If an error is found in the dataset
+    """
+    if candidate["output"] == "ZPF":
+        return ZPFDataset.model_validate(candidate)
+    elif candidate['output'].startswith('ACR'):
+        return ActivityPropertyDataset.model_validate(candidate)
+    elif 'solver' in candidate.keys():
+        return BroadcastSinglePhaseFixedConfigurationDataset.model_validate(candidate)
+    else:
+        return EquilibriumPropertyDataset.model_validate(candidate)
diff --git a/espei/datasets/db.py b/espei/datasets/db.py
index 87e593a9..53124e8c 100644
--- a/espei/datasets/db.py
+++ b/espei/datasets/db.py
@@ -6,56 +6,26 @@
 from tinydb import where
 
 from espei.utils import PickleableTinyDB
+from .dataset_models import to_Dataset, Dataset, ActivityPropertyDataset, BroadcastSinglePhaseFixedConfigurationDataset, EquilibriumPropertyDataset, ZPFDataset, DatasetError
 
-from .dataset_models import Dataset, ActivityPropertyDataset, BroadcastSinglePhaseFixedConfigurationDataset, EquilibriumPropertyDataset, ZPFDataset, DatasetError
+__all__ = [
+    "load_datasets",
+    "recursive_glob",
+    "apply_tags",
+    "check_dataset",
+    "clean_dataset"
+]
 
-
-def check_dataset(dataset: dict[str, Any]) -> Dataset:
-    """Ensure that the dataset is valid and consistent.
-
-    Currently supports the following validation checks:
-    * data shape is valid
-    * phases and components used match phases and components entered
-    * individual shapes of keys, such as ZPF, sublattice configs and site ratios
-
-    Planned validation checks:
-    * all required keys are present
-
-    Note that this follows some of the implicit assumptions in ESPEI at the time
-    of writing, such that conditions are only P, T, configs for single phase and
-    essentially only T for ZPF data.
-
-    Parameters
-    ----------
-    dataset : Dataset
-        Dictionary of the standard ESPEI dataset.
-
-    Returns
-    -------
-    Dataset
-
-    Raises
-    ------
-    DatasetError
-        If an error is found in the dataset
-    """
-    if dataset["output"] == "ZPF":
-        dataset_obj = ZPFDataset(**dataset)
-    elif dataset['output'].startswith('ACR'):
-        dataset_obj = ActivityPropertyDataset(**dataset)
-    elif 'solver' in dataset.keys():
-        dataset_obj = BroadcastSinglePhaseFixedConfigurationDataset(**dataset)
-    else:
-        dataset_obj = EquilibriumPropertyDataset(**dataset)
-    return dataset_obj
+def check_dataset(dataset: dict[str, Any]) -> dict[str, Any]:
+    """Ensure that the dataset is valid and consistent by round-tripping through pydantic."""
+    warnings.warn(f"check_dataset is deprecated will be removed in ESPEI 0.11. Behavior has been migrated to the pydantic dataset implementations in espei.datasets.dataset_models. To get a Dataset object, use espei.datasets.to_Dataset.", DeprecationWarning)
+    return to_Dataset(dataset).model_dump()
 
 
 def clean_dataset(dataset: dict[str, Any]) -> dict[str, Any]:
-    """
-    No-op
-    """
-    warnings.warn(f"clean_dataset deprecated will be removed in ESPEI 0.11. Behavior has been migrated to the pydantic dataset implementations in espei.datasets.dataset_models.", DeprecationWarning)
-    return dataset
+    """Ensure that the dataset is valid and consistent by round-tripping through pydantic."""
+    warnings.warn(f"clean_dataset is deprecated will be removed in ESPEI 0.11. Behavior has been migrated to the pydantic dataset implementations in espei.datasets.dataset_models. To get a Dataset object, use espei.datasets.to_Dataset.", DeprecationWarning)
+    return to_Dataset(dataset).model_dump()
 
 
 def apply_tags(datasets: PickleableTinyDB, tags):
@@ -135,8 +105,7 @@ def load_datasets(dataset_filenames, include_disabled=False) -> PickleableTinyDB
                 if not include_disabled and d.get('disabled', False):
                     # The dataset is disabled and not included
                     continue
-                dataset_obj = check_dataset(d)
-                ds_database.insert(dataset_obj.model_dump())
+                ds_database.insert(to_Dataset(d).model_dump())
             except ValueError as e:
                 raise ValueError('JSON Error in {}: {}'.format(fname, e))
             except DatasetError as e:
diff --git a/tests/test_datasets.py b/tests/test_datasets.py
index 72078f97..437423bc 100644
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -1,7 +1,7 @@
 from copy import deepcopy
 import pytest
 import numpy as np
-from espei.datasets import DatasetError, check_dataset, apply_tags
+from espei.datasets import DatasetError, to_Dataset, apply_tags, BroadcastSinglePhaseFixedConfigurationDataset, ZPFDataset
 
 from .testing_data import CU_MG_EXP_ACTIVITY, CU_MG_DATASET_THERMOCHEMICAL_STRING_VALUES, CU_MG_DATASET_ZPF_STRING_VALUES, LI_SN_LIQUID_DATA, dataset_multi_valid_ternary
 from .fixtures import datasets_db
@@ -366,35 +366,35 @@
 
 def test_check_datasets_run_on_good_data():
     """Passed valid datasets that should raise DatasetError."""
-    check_dataset(dataset_single_valid)
-    check_dataset(dataset_multi_valid)
-    check_dataset(dataset_multi_valid_ternary)
+    to_Dataset(dataset_single_valid)
+    to_Dataset(dataset_multi_valid)
+    to_Dataset(dataset_multi_valid_ternary)
 
 
 def test_check_datasets_raises_on_misaligned_data():
     """Passed datasets that have misaligned data and conditions should raise DatasetError."""
     with pytest.raises(DatasetError):
-        check_dataset(dataset_single_misaligned)
+        to_Dataset(dataset_single_misaligned)
     with pytest.raises(DatasetError):
-        check_dataset(dataset_multi_misaligned)
+        to_Dataset(dataset_multi_misaligned)
 
 
 def test_check_datasets_raises_with_incorrect_zpf_phases():
     """Passed datasets that have incorrect phases entered than used should raise."""
     with pytest.raises(DatasetError):
-        check_dataset(dataset_multi_incorrect_phases)
+        to_Dataset(dataset_multi_incorrect_phases)
 
 
 def test_check_datasets_raises_with_incorrect_components():
     """Passed datasets that have incorrect components entered vs. used should raise."""
     with pytest.raises(DatasetError):
-        check_dataset(dataset_single_incorrect_components_overspecified)
+        to_Dataset(dataset_single_incorrect_components_overspecified)
     with pytest.raises(DatasetError):
-        check_dataset(dataset_single_incorrect_components_underspecified)
+        to_Dataset(dataset_single_incorrect_components_underspecified)
     with pytest.raises(DatasetError):
-        check_dataset(dataset_multi_incorrect_components_overspecified)
+        to_Dataset(dataset_multi_incorrect_components_overspecified)
     with pytest.raises(DatasetError):
-        check_dataset(dataset_multi_incorrect_components_underspecified)
+        to_Dataset(dataset_multi_incorrect_components_underspecified)
 
     # equilibrium datasets underspecified
     ds_eq_underspecified = {
@@ -409,7 +409,7 @@ def test_check_datasets_raises_with_incorrect_components():
     "values": [[[-1000], [-900], [-800]]]
     }
     with pytest.raises(DatasetError):
-        check_dataset(ds_eq_underspecified)
+        to_Dataset(ds_eq_underspecified)
 
     # equilibrium datasets overspecified
     ds_eq_overspecified = {
@@ -424,25 +424,25 @@ def test_check_datasets_raises_with_incorrect_components():
     "values": [[[-1000], [-900], [-800]]]
     }
     with pytest.raises(DatasetError):
-        check_dataset(ds_eq_overspecified)
+        to_Dataset(ds_eq_overspecified)
 
 
 def test_check_datasets_raises_with_malformed_zpf():
     """Passed datasets that have malformed ZPF values should raise."""
     with pytest.raises((DatasetError, ValidationError)):
-        check_dataset(dataset_multi_malformed_zpfs_components_not_list)
+        to_Dataset(dataset_multi_malformed_zpfs_components_not_list)
     with pytest.raises(DatasetError):
-        check_dataset(dataset_multi_malformed_zpfs_fractions_do_not_match_components)
+        to_Dataset(dataset_multi_malformed_zpfs_fractions_do_not_match_components)
     with pytest.raises(DatasetError):
-        check_dataset(dataset_multi_malformed_zpfs_components_do_not_match_fractions)
+        to_Dataset(dataset_multi_malformed_zpfs_components_do_not_match_fractions)
 
 
 def test_check_datasets_raises_with_malformed_sublattice_configurations():
     """Passed datasets that have malformed ZPF values should raise."""
     with pytest.raises(DatasetError):
-        check_dataset(dataset_single_malformed_site_occupancies)
+        to_Dataset(dataset_single_malformed_site_occupancies)
     with pytest.raises(DatasetError):
-       check_dataset(dataset_single_malformed_site_ratios)
+       to_Dataset(dataset_single_malformed_site_ratios)
 
 
 def test_check_datasets_raises_with_equilibrium_conditions_and_values_shapes_mismatch():
@@ -460,7 +460,7 @@ def test_check_datasets_raises_with_equilibrium_conditions_and_values_shapes_mis
         "reference": "equilibrium thermochemical tests",
     }
     # Good shape should not raise
-    check_dataset(COND_VALS_SHAPE_GOOD)
+    to_Dataset(COND_VALS_SHAPE_GOOD)
 
     COND_VALS_SHAPE_DISAGREEMENT_1_1_2 = {
         "components": ["CU", "MG", "NI"],
@@ -476,7 +476,7 @@ def test_check_datasets_raises_with_equilibrium_conditions_and_values_shapes_mis
         "reference": "equilibrium thermochemical tests",
     }
     with pytest.raises(DatasetError):
-        check_dataset(COND_VALS_SHAPE_DISAGREEMENT_1_1_2)
+        to_Dataset(COND_VALS_SHAPE_DISAGREEMENT_1_1_2)
 
     COND_VALS_SHAPE_DISAGREEMENT_1_2_2 = {
         "components": ["CU", "MG"],
@@ -491,7 +491,7 @@ def test_check_datasets_raises_with_equilibrium_conditions_and_values_shapes_mis
         "reference": "equilibrium thermochemical tests",
     }
     with pytest.raises(DatasetError):
-        check_dataset(COND_VALS_SHAPE_DISAGREEMENT_1_2_2)
+        to_Dataset(COND_VALS_SHAPE_DISAGREEMENT_1_2_2)
 
     # we don't broadcast over compositions, so composition conditions shapes need to match
     MISMATCHED_COMPOSITION_CONDS = {
@@ -508,51 +508,53 @@ def test_check_datasets_raises_with_equilibrium_conditions_and_values_shapes_mis
         "reference": "equilibrium thermochemical tests",
     }
     with pytest.raises(DatasetError):
-        check_dataset(MISMATCHED_COMPOSITION_CONDS)
+        to_Dataset(MISMATCHED_COMPOSITION_CONDS)
 
 
 def test_check_datasets_works_on_activity_data():
     """Passed activity datasets should work correctly."""
-    check_dataset(CU_MG_EXP_ACTIVITY)
+    to_Dataset(CU_MG_EXP_ACTIVITY)
 
 
 def test_check_datasets_raises_with_zpf_fractions_greater_than_one():
     """Passed datasets that have mole fractions greater than one should raise."""
     with pytest.raises(DatasetError):
-        check_dataset(dataset_multi_mole_fractions_as_percents)
+        to_Dataset(dataset_multi_mole_fractions_as_percents)
 
 
 def test_check_datasets_raises_with_negative_zpf_fractions():
     """Passed datasets that have negative mole fractions should raise."""
     with pytest.raises(DatasetError):
-        check_dataset(dataset_zpf_negative_mole_fraction)
+        to_Dataset(dataset_zpf_negative_mole_fraction)
 
 
 def test_check_datasets_raises_with_unsorted_interactions():
     """Passed datasets that have sublattice interactions not in sorted order should raise."""
     with pytest.raises(DatasetError):
-        check_dataset(dataset_single_unsorted_interaction)
+        to_Dataset(dataset_single_unsorted_interaction)
 
 
 def test_datasets_convert_thermochemical_string_values_producing_correct_value(datasets_db):
     """Strings where floats are expected should give correct answers for thermochemical datasets"""
-    ds = check_dataset(CU_MG_DATASET_THERMOCHEMICAL_STRING_VALUES).model_dump()
-    assert np.issubdtype(np.array(ds['values']).dtype, np.number)
-    assert np.issubdtype(np.array(ds['conditions']['T']).dtype, np.number)
-    assert np.issubdtype(np.array(ds['conditions']['P']).dtype, np.number)
+    ds = to_Dataset(CU_MG_DATASET_THERMOCHEMICAL_STRING_VALUES)
+    assert isinstance(ds, BroadcastSinglePhaseFixedConfigurationDataset)
+    assert np.issubdtype(np.array(ds.values).dtype, np.number)
+    assert np.issubdtype(np.array(ds.conditions['T']).dtype, np.number)
+    assert np.issubdtype(np.array(ds.conditions['P']).dtype, np.number)
 
 
 def test_datasets_convert_zpf_string_values_producing_correct_value(datasets_db):
     """Strings where floats are expected should give correct answers for ZPF datasets"""
-    ds = check_dataset(CU_MG_DATASET_ZPF_STRING_VALUES).model_dump()
-    assert np.issubdtype(np.array([t[0][2] for t in ds['values']]).dtype, np.number)
-    assert np.issubdtype(np.array(ds['conditions']['T']).dtype, np.number)
-    assert np.issubdtype(np.array(ds['conditions']['P']).dtype, np.number)
+    ds = to_Dataset(CU_MG_DATASET_ZPF_STRING_VALUES)
+    assert isinstance(ds, ZPFDataset)
+    assert np.issubdtype(np.array([t[0][2] for t in ds.values]).dtype, np.number)
+    assert np.issubdtype(np.array(ds.conditions['T']).dtype, np.number)
+    assert np.issubdtype(np.array(ds.conditions['P']).dtype, np.number)
 
 def test_check_datasets_raises_if_configs_occupancies_not_aligned(datasets_db):
     """Checking datasets that don't have the same number/shape of configurations/occupancies should raise."""
     with pytest.raises(DatasetError):
-        check_dataset(dataset_mismatched_configs_occupancies)
+        to_Dataset(dataset_mismatched_configs_occupancies)
 
 
 # Expected to fail, since the dataset checker cannot determine that species are used in the configurations and components should only contain pure elements.
@@ -560,7 +562,7 @@ def test_check_datasets_raises_if_configs_occupancies_not_aligned(datasets_db):
 def test_non_equilibrium_thermo_data_with_species_passes_checker():
     """Non-equilibrium thermochemical data that use species in the configurations should pass the dataset checker.
     """
-    check_dataset(LI_SN_LIQUID_DATA)
+    to_Dataset(LI_SN_LIQUID_DATA)
 
 
 def test_applying_tags(datasets_db):

From 02299fdaf10e3cda111310c9476fbcaa76b0191f Mon Sep 17 00:00:00 2001
From: bocklund <brandonbocklund@gmail.com>
Date: Sun, 17 Aug 2025 18:56:58 -0700
Subject: [PATCH 11/14] Refactor modules back to simple datasets module

---
 .../dataset_models.py => datasets.py}         | 155 +++++++++++++++++-
 espei/datasets/__init__.py                    |   2 -
 espei/datasets/db.py                          | 137 ----------------
 3 files changed, 149 insertions(+), 145 deletions(-)
 rename espei/{datasets/dataset_models.py => datasets.py} (78%)
 delete mode 100644 espei/datasets/__init__.py
 delete mode 100644 espei/datasets/db.py

diff --git a/espei/datasets/dataset_models.py b/espei/datasets.py
similarity index 78%
rename from espei/datasets/dataset_models.py
rename to espei/datasets.py
index 74673031..bd81f9cb 100644
--- a/espei/datasets/dataset_models.py
+++ b/espei/datasets.py
@@ -1,21 +1,38 @@
 from typing import Any, Literal, Union, TypeAlias, Self
+import warnings
 from pydantic import BaseModel, Field, model_validator, field_validator
 import numpy as np
+import fnmatch, json, os
+from tinydb.storages import MemoryStorage
+from tinydb import where
+
+from espei.utils import PickleableTinyDB
 
 __all__ = [
+    # Models
     "Dataset",
     "BroadcastSinglePhaseFixedConfigurationDataset",
     "ActivityPropertyDataset",
     "EquilibriumPropertyDataset",
     "ZPFDataset",
+
+    # Errors (when validating models)
     "DatasetError",
+
+    # User-facing API
+    "load_datasets",
+    "recursive_glob",
+    "apply_tags",
     "to_Dataset",
+
+    # Deprecated
+    "check_dataset",
+    "clean_dataset",
 ]
 
-class DatasetError(Exception):
-    """Exception raised when datasets are invalid."""
-    pass
 
+# Type aliases - used to clarify intent
+# e.g. when we want a ComponentName rather than a str (even though that's what it is)
 ComponentName: TypeAlias = str
 PhaseName: TypeAlias = str
 PhaseCompositionType: TypeAlias = Union[
@@ -24,9 +41,16 @@ class DatasetError(Exception):
 ]
 PhaseRegionType: TypeAlias = list[PhaseCompositionType]
 
+
+class DatasetError(Exception):
+    """Exception raised when datasets are invalid."""
+    pass
+
+
 class Dataset(BaseModel):
     pass
 
+
 class Solver(BaseModel):
     mode: Literal["manual"] = Field(default="manual")
     sublattice_site_ratios: list[float]
@@ -111,13 +135,11 @@ def validate_configuration_occupancy_shape_agreement(self) -> Self:
         return self
 
 
-# TODO: would be great to remove
 class ActivityDataReferenceState(Dataset):
     phases: list[PhaseName] = Field(min_length=1)
     conditions: dict[str, float]
 
-
-# TODO: refactor to merge this with EquilibriumPropertyDataset
+# TODO: refactor ActivityPropertyDataset to merge with EquilibriumPropertyDataset
 # The validator functions are exactly duplicated in EquilibriumPropertyDataset
 # The duplication simplifies the implementation since the activity special case is
 # ultimately meant to be removed once activity is a PyCalphad Workspace property
@@ -330,3 +352,124 @@ def to_Dataset(candidate: dict[str, Any]) -> Dataset:
         return BroadcastSinglePhaseFixedConfigurationDataset.model_validate(candidate)
     else:
         return EquilibriumPropertyDataset.model_validate(candidate)
+
+
+def apply_tags(datasets: PickleableTinyDB, tags):
+    """
+    Modify datasets using the tags system
+
+    Parameters
+    ----------
+    datasets : PickleableTinyDB
+        Datasets to modify
+    tags : dict
+        Dictionary of {tag: update_dict}
+
+    Returns
+    -------
+    None
+
+    Notes
+    -----
+    In general, everything replaces or is additive. We use the following update rules:
+    1. If the update value is a list, extend the existing list (empty list if key does not exist)
+    2. If the update value is scalar, override the previous (deleting any old value, if present)
+    3. If the update value is a dict, update the existing dict (empty dict if key does not exist)
+    4. Otherwise, the value is updated, overriding the previous
+
+    Examples
+    --------
+    >>> from espei.utils import PickleableTinyDB
+    >>> from tinydb.storages import MemoryStorage
+    >>> ds = PickleableTinyDB(storage=MemoryStorage)
+    >>> doc_id = ds.insert({'tags': ['dft'], 'excluded_model_contributions': ['contrib']})
+    >>> my_tags = {'dft': {'excluded_model_contributions': ['idmix', 'mag'], 'weight': 5.0}}
+    >>> from espei.datasets import apply_tags
+    >>> apply_tags(ds, my_tags)
+    >>> all_data = ds.all()
+    >>> all(d['excluded_model_contributions'] == ['contrib', 'idmix', 'mag'] for d in all_data)
+    True
+    >>> all(d['weight'] == 5.0 for d in all_data)
+    True
+
+    """
+    for tag, update_dict in tags.items():
+        matching_datasets = datasets.search(where("tags").test(lambda x: tag in x))
+        for newkey, newval in update_dict.items():
+            for match in matching_datasets:
+                if isinstance(newval, list):
+                    match[newkey] = match.get(newkey, []) + newval
+                elif np.isscalar(newval):
+                    match[newkey] = newval
+                elif isinstance(newval, dict):
+                    d = match.get(newkey, dict())
+                    d.update(newval)
+                    match[newkey] = d
+                else:
+                    match[newkey] = newval
+                datasets.update(match, doc_ids=[match.doc_id])
+
+
+def load_datasets(dataset_filenames, include_disabled=False) -> PickleableTinyDB:
+    """
+    Create a PickleableTinyDB with the data from a list of filenames.
+
+    Parameters
+    ----------
+    dataset_filenames : [str]
+        List of filenames to load as datasets
+
+    Returns
+    -------
+    PickleableTinyDB
+    """
+    ds_database = PickleableTinyDB(storage=MemoryStorage)
+    for fname in dataset_filenames:
+        with open(fname) as file_:
+            try:
+                d = json.load(file_)
+                if not include_disabled and d.get('disabled', False):
+                    # The dataset is disabled and not included
+                    continue
+                ds_database.insert(to_Dataset(d).model_dump())
+            except ValueError as e:
+                raise ValueError('JSON Error in {}: {}'.format(fname, e))
+            except DatasetError as e:
+                raise DatasetError('Dataset Error in {}: {}'.format(fname, e))
+    return ds_database
+
+
+def recursive_glob(start, pattern='*.json'):
+    """
+    Recursively glob for the given pattern from the start directory.
+
+    Parameters
+    ----------
+    start : str
+        Path of the directory to walk while globbing for files
+    pattern : str
+        Filename pattern to match in the glob.
+
+    Returns
+    -------
+    [str]
+        List of matched filenames
+
+    """
+    matches = []
+    for root, dirnames, filenames in os.walk(start, followlinks=True):
+        for filename in fnmatch.filter(filenames, pattern):
+            matches.append(os.path.join(root, filename))
+    return sorted(matches)
+
+
+def check_dataset(dataset: dict[str, Any]) -> dict[str, Any]:
+    """Ensure that the dataset is valid and consistent by round-tripping through pydantic."""
+    warnings.warn(f"check_dataset is deprecated will be removed in ESPEI 0.11. Behavior has been migrated to the pydantic dataset implementations in espei.datasets.dataset_models. To get a Dataset object, use espei.datasets.to_Dataset.", DeprecationWarning)
+    return to_Dataset(dataset).model_dump()
+
+
+def clean_dataset(dataset: dict[str, Any]) -> dict[str, Any]:
+    """Ensure that the dataset is valid and consistent by round-tripping through pydantic."""
+    warnings.warn(f"clean_dataset is deprecated will be removed in ESPEI 0.11. Behavior has been migrated to the pydantic dataset implementations in espei.datasets.dataset_models. To get a Dataset object, use espei.datasets.to_Dataset.", DeprecationWarning)
+    return to_Dataset(dataset).model_dump()
diff --git a/espei/datasets/__init__.py b/espei/datasets/__init__.py
deleted file mode 100644
index 18dc966a..00000000
--- a/espei/datasets/__init__.py
+++ /dev/null
@@ -1,2 +0,0 @@
-from .dataset_models import *
-from .db import *
\ No newline at end of file
diff --git a/espei/datasets/db.py b/espei/datasets/db.py
deleted file mode 100644
index 53124e8c..00000000
--- a/espei/datasets/db.py
+++ /dev/null
@@ -1,137 +0,0 @@
-import fnmatch, json, os
-from typing import Any, Dict, List, TypeAlias
-import warnings
-import numpy as np
-from tinydb.storages import MemoryStorage
-from tinydb import where
-
-from espei.utils import PickleableTinyDB
-from .dataset_models import to_Dataset, Dataset, ActivityPropertyDataset, BroadcastSinglePhaseFixedConfigurationDataset, EquilibriumPropertyDataset, ZPFDataset, DatasetError
-
-__all__ = [
-    "load_datasets",
-    "recursive_glob",
-    "apply_tags",
-    "check_dataset",
-    "clean_dataset"
-]
-
-def check_dataset(dataset: dict[str, Any]) -> dict[str, Any]:
-    """Ensure that the dataset is valid and consistent by round-tripping through pydantic."""
-    warnings.warn(f"check_dataset is deprecated will be removed in ESPEI 0.11. Behavior has been migrated to the pydantic dataset implementations in espei.datasets.dataset_models. To get a Dataset object, use espei.datasets.to_Dataset.", DeprecationWarning)
-    return to_Dataset(dataset).model_dump()
-
-
-def clean_dataset(dataset: dict[str, Any]) -> dict[str, Any]:
-    """Ensure that the dataset is valid and consistent by round-tripping through pydantic."""
-    warnings.warn(f"clean_dataset is deprecated will be removed in ESPEI 0.11. Behavior has been migrated to the pydantic dataset implementations in espei.datasets.dataset_models. To get a Dataset object, use espei.datasets.to_Dataset.", DeprecationWarning)
-    return to_Dataset(dataset).model_dump()
-
-
-def apply_tags(datasets: PickleableTinyDB, tags):
-    """
-    Modify datasets using the tags system
-
-    Parameters
-    ----------
-    datasets : PickleableTinyDB
-        Datasets to modify
-    tags : dict
-        Dictionary of {tag: update_dict}
-
-    Returns
-    -------
-    None
-
-    Notes
-    -----
-    In general, everything replaces or is additive. We use the following update rules:
-    1. If the update value is a list, extend the existing list (empty list if key does not exist)
-    2. If the update value is scalar, override the previous (deleting any old value, if present)
-    3. If the update value is a dict, update the exist dict (empty dict if dict does not exist)
-    4. Otherwise, the value is updated, overriding the previous
-
-    Examples
-    --------
-    >>> from espei.utils import PickleableTinyDB
-    >>> from tinydb.storages import MemoryStorage
-    >>> ds = PickleableTinyDB(storage=MemoryStorage)
-    >>> doc_id = ds.insert({'tags': ['dft'], 'excluded_model_contributions': ['contrib']})
-    >>> my_tags = {'dft': {'excluded_model_contributions': ['idmix', 'mag'], 'weight': 5.0}}
-    >>> from espei.datasets import apply_tags
-    >>> apply_tags(ds, my_tags)
-    >>> all_data = ds.all()
-    >>> all(d['excluded_model_contributions'] == ['contrib', 'idmix', 'mag'] for d in all_data)
-    True
-    >>> all(d['weight'] == 5.0 for d in all_data)
-    True
-
-    """
-    for tag, update_dict in tags.items():
-        matching_datasets = datasets.search(where("tags").test(lambda x: tag in x))
-        for newkey, newval in update_dict.items():
-            for match in matching_datasets:
-                if isinstance(newval, list):
-                    match[newkey] = match.get(newkey, []) + newval
-                elif np.isscalar(newval):
-                    match[newkey] = newval
-                elif isinstance(newval, dict):
-                    d = match.get(newkey, dict())
-                    d.update(newval)
-                    match[newkey] = d
-                else:
-                    match[newkey] = newval
-                datasets.update(match, doc_ids=[match.doc_id])
-
-
-def load_datasets(dataset_filenames, include_disabled=False) -> PickleableTinyDB:
-    """
-    Create a PickelableTinyDB with the data from a list of filenames.
-
-    Parameters
-    ----------
-    dataset_filenames : [str]
-        List of filenames to load as datasets
-
-    Returns
-    -------
-    PickleableTinyDB
-    """
-    ds_database = PickleableTinyDB(storage=MemoryStorage)
-    for fname in dataset_filenames:
-        with open(fname) as file_:
-            try:
-                d = json.load(file_)
-                if not include_disabled and d.get('disabled', False):
-                    # The dataset is disabled and not included
-                    continue
-                ds_database.insert(to_Dataset(d).model_dump())
-            except ValueError as e:
-                raise ValueError('JSON Error in {}: {}'.format(fname, e))
-            except DatasetError as e:
-                raise DatasetError('Dataset Error in {}: {}'.format(fname, e))
-    return ds_database
-
-
-def recursive_glob(start, pattern='*.json'):
-    """
-    Recursively glob for the given pattern from the start directory.
-
-    Parameters
-    ----------
-    start : str
-        Path of the directory to walk while for file globbing
-    pattern : str
-        Filename pattern to match in the glob.
-
-    Returns
-    -------
-    [str]
-        List of matched filenames
-
-    """
-    matches = []
-    for root, dirnames, filenames in os.walk(start, followlinks=True):
-        for filename in fnmatch.filter(filenames, pattern):
-            matches.append(os.path.join(root, filename))
-    return sorted(matches)

From 42a6e3a355ba9cee111d78c0f39e5ace76768c91 Mon Sep 17 00:00:00 2001
From: bocklund <brandonbocklund@gmail.com>
Date: Sun, 17 Aug 2025 18:57:51 -0700
Subject: [PATCH 12/14] Ruff check datasets.py

---
 espei/datasets.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/espei/datasets.py b/espei/datasets.py
index bd81f9cb..092dda3d 100644
--- a/espei/datasets.py
+++ b/espei/datasets.py
@@ -2,7 +2,9 @@
 import warnings
 from pydantic import BaseModel, Field, model_validator, field_validator
 import numpy as np
-import fnmatch, json, os
+import fnmatch
+import json
+import os
 from tinydb.storages import MemoryStorage
 from tinydb import where
 
@@ -465,11 +467,11 @@ def recursive_glob(start, pattern='*.json'):
 
 def check_dataset(dataset: dict[str, Any]) -> dict[str, Any]:
     """Ensure that the dataset is valid and consistent by round-tripping through pydantic."""
-    warnings.warn(f"check_dataset is deprecated will be removed in ESPEI 0.11. Behavior has been migrated to the pydantic dataset implementations in espei.datasets.dataset_models. To get a Dataset object, use espei.datasets.to_Dataset.", DeprecationWarning)
+    warnings.warn("check_dataset is deprecated will be removed in ESPEI 0.11. Behavior has been migrated to the pydantic dataset implementations in espei.datasets.dataset_models. To get a Dataset object, use espei.datasets.to_Dataset.", DeprecationWarning)
     return to_Dataset(dataset).model_dump()
 
 
 def clean_dataset(dataset: dict[str, Any]) -> dict[str, Any]:
     """Ensure that the dataset is valid and consistent by round-tripping through pydantic."""
-    warnings.warn(f"clean_dataset is deprecated will be removed in ESPEI 0.11. Behavior has been migrated to the pydantic dataset implementations in espei.datasets.dataset_models. To get a Dataset object, use espei.datasets.to_Dataset.", DeprecationWarning)
+    warnings.warn("clean_dataset is deprecated will be removed in ESPEI 0.11. Behavior has been migrated to the pydantic dataset implementations in espei.datasets.dataset_models. To get a Dataset object, use espei.datasets.to_Dataset.", DeprecationWarning)
     return to_Dataset(dataset).model_dump()

From b01668a20f7608410f3ca97a1d54bb796fc3e643 Mon Sep 17 00:00:00 2001
From: bocklund <brandonbocklund@gmail.com>
Date: Sun, 17 Aug 2025 18:59:54 -0700
Subject: [PATCH 13/14] Delete comment field

---
 espei/datasets.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/espei/datasets.py b/espei/datasets.py
index 092dda3d..e433bac0 100644
--- a/espei/datasets.py
+++ b/espei/datasets.py
@@ -72,7 +72,6 @@ class BroadcastSinglePhaseFixedConfigurationDataset(Dataset):
     reference: str = Field(default="")
     bibtex: str = Field(default="")
     dataset_author: str = Field(default="")
-    comment: str = Field(default="")
     disabled: bool = Field(default=False)
     tags: list[str] = Field(default_factory=list)
 
@@ -155,7 +154,6 @@ class ActivityPropertyDataset(Dataset):
     reference: str = Field(default="")
     bibtex: str = Field(default="")
     dataset_author: str = Field(default="")
-    comment: str = Field(default="")
     disabled: bool = Field(default=False)
     tags: list[str] = Field(default_factory=list)
 
@@ -205,7 +203,6 @@ class EquilibriumPropertyDataset(Dataset):
     reference: str = Field(default="")
     bibtex: str = Field(default="")
     dataset_author: str = Field(default="")
-    comment: str = Field(default="")
     disabled: bool = Field(default=False)
     tags: list[str] = Field(default_factory=list)
 
@@ -261,7 +258,6 @@ class ZPFDataset(Dataset):
     reference: str = Field(default="")
     bibtex: str = Field(default="")
     dataset_author: str = Field(default="")
-    comment: str = Field(default="")
     disabled: bool = Field(default=False)
     tags: list[str] = Field(default_factory=list)
 

From 2ffd7d61ebca6d79364fb3c21baaa9563abe9ddb Mon Sep 17 00:00:00 2001
From: bocklund <brandonbocklund@gmail.com>
Date: Sun, 17 Aug 2025 19:07:59 -0700
Subject: [PATCH 14/14] Refactor shared fields into a common Dataset base
 class. Subclasses mostly only have to implement `values` and the validators

---
 espei/datasets.py     | 83 +++++++++++++++++--------------------------
 tests/testing_data.py |  5 ---
 2 files changed, 32 insertions(+), 56 deletions(-)

diff --git a/espei/datasets.py b/espei/datasets.py
index e433bac0..0bf65eaf 100644
--- a/espei/datasets.py
+++ b/espei/datasets.py
@@ -49,10 +49,7 @@ class DatasetError(Exception):
     pass
 
 
-class Dataset(BaseModel):
-    pass
-
-
+# Used by BroadcastSinglePhaseFixedConfigurationDataset to define internal DOF
 class Solver(BaseModel):
     mode: Literal["manual"] = Field(default="manual")
     sublattice_site_ratios: list[float]
@@ -61,19 +58,41 @@ class Solver(BaseModel):
     sublattice_occupancies: list[list[float | list[float]]] | None = Field(default=None)
 
 
-class BroadcastSinglePhaseFixedConfigurationDataset(Dataset):
+# Activity dataset special case reference state
+class ActivityDataReferenceState(BaseModel):
+    phases: list[PhaseName] = Field(min_length=1)
+    conditions: dict[str, float]
+
+
+# More general reference states for equilibrium property datasets
+class ReferenceStates(BaseModel):
+    phase: PhaseName
+    fixed_state_variables: dict[str, float] | None = Field(default=None, description="Fixed potentials for the reference state", examples=[{"T": 298.15, "P": 101325}])
+
+
+class Dataset(BaseModel):
     components: list[ComponentName] = Field(min_length=1)
-    phases: list[PhaseName] = Field(min_length=1, max_length=1)
-    solver: Solver
+    phases: list[PhaseName] = Field(min_length=1)
     conditions: dict[str, float | list[float]]
     output: str
-    values: list[list[list[float]]]
-    excluded_model_contributions: list[str] = Field(default_factory=list)
+    # TODO: weights
+
+    # Control
+    disabled: bool = Field(default=False)
+    tags: list[str] = Field(default_factory=list)
+
+    # Metadata
     reference: str = Field(default="")
     bibtex: str = Field(default="")
     dataset_author: str = Field(default="")
-    disabled: bool = Field(default=False)
-    tags: list[str] = Field(default_factory=list)
+
+
+class BroadcastSinglePhaseFixedConfigurationDataset(Dataset):
+    phases: list[PhaseName] = Field(min_length=1, max_length=1)
+    values: list[list[list[float]]]
+    solver: Solver
+    conditions: dict[str, float | list[float]]
+    excluded_model_contributions: list[str] = Field(default_factory=list)
 
     @model_validator(mode="after")
     def validate_components_entered_match_components_used(self) -> Self:
@@ -136,26 +155,13 @@ def validate_configuration_occupancy_shape_agreement(self) -> Self:
         return self
 
 
-class ActivityDataReferenceState(Dataset):
-    phases: list[PhaseName] = Field(min_length=1)
-    conditions: dict[str, float]
-
 # TODO: refactor ActivityPropertyDataset to merge with EquilibriumPropertyDataset
 # The validator functions are exactly duplicated in EquilibriumPropertyDataset
 # The duplication simplifies the implementation since the activity special case is
 # ultimately meant to be removed once activity is a PyCalphad Workspace property
 class ActivityPropertyDataset(Dataset):
-    components: list[ComponentName] = Field(min_length=1)
-    phases: list[PhaseName] = Field(min_length=1)
-    conditions: dict[str, float | list[float]]
-    reference_state: ActivityDataReferenceState
-    output: str
     values: list[list[list[float]]]
-    reference: str = Field(default="")
-    bibtex: str = Field(default="")
-    dataset_author: str = Field(default="")
-    disabled: bool = Field(default=False)
-    tags: list[str] = Field(default_factory=list)
+    reference_state: ActivityDataReferenceState
 
     @model_validator(mode="after")
     def validate_condition_value_shape_agreement(self) -> Self:
@@ -188,23 +194,9 @@ def validate_components_entered_match_components_used(self) -> Self:
         return self
 
 
-class ReferenceStates(BaseModel):
-    phase: PhaseName
-    fixed_state_variables: dict[str, float] | None = Field(default=None, description="Fixed potentials for the reference state", examples=[{"T": 298.15, "P": 101325}])
-
-
 class EquilibriumPropertyDataset(Dataset):
-    components: list[ComponentName] = Field(min_length=1)
-    phases: list[PhaseName] = Field(min_length=1)
-    conditions: dict[str, float | list[float]]
-    output: str
     values: list[list[list[float]]]
     reference_states: dict[ComponentName, ReferenceStates] | None = Field(default=None)
-    reference: str = Field(default="")
-    bibtex: str = Field(default="")
-    dataset_author: str = Field(default="")
-    disabled: bool = Field(default=False)
-    tags: list[str] = Field(default_factory=list)
 
     @model_validator(mode="after")
     def validate_condition_value_shape_agreement(self) -> Self:
@@ -248,18 +240,7 @@ def validate_reference_state_fully_specified_if_used(self) -> Self:
 
 
 class ZPFDataset(Dataset):
-    components: list[ComponentName] = Field(min_length=1)
-    phases: list[str] = Field(min_length=1)
-    conditions: dict[str, float | list[float]]
-    broadcast_conditions: Literal[False] = Field(default=False)  # TODO: migrate and remove, since True was never supported
-    output: Literal["ZPF"]
-    values: list[PhaseRegionType]  # TODO: validate to be of same shape as conditions
-    excluded_model_contributions: list[str] = Field(default_factory=list)
-    reference: str = Field(default="")
-    bibtex: str = Field(default="")
-    dataset_author: str = Field(default="")
-    disabled: bool = Field(default=False)
-    tags: list[str] = Field(default_factory=list)
+    values: list[PhaseRegionType]
 
     @model_validator(mode="after")
     def validate_condition_value_shape_agreement(self) -> Self:
diff --git a/tests/testing_data.py b/tests/testing_data.py
index 73c9540e..dc528c19 100644
--- a/tests/testing_data.py
+++ b/tests/testing_data.py
@@ -584,7 +584,6 @@
       "P": 101325,
       "T": [1337.97, 1262.238]
     },
-    "broadcast_conditions": false,
     "output": "ZPF",
     "values":   [
         [["LIQUID", ["MG"], [0.0246992]], ["FCC_A1", ["MG"],  [null]]],
@@ -695,7 +694,6 @@
       "P": "101325",
       "T": ["1337.97", "1262.238"]
     },
-    "broadcast_conditions": false,
     "output": "ZPF",
     "values":   [
         [["LIQUID", ["MG"], ["0.0246992"]], ["FCC_A1", ["MG"],  [null]]],
@@ -713,7 +711,6 @@
     "P": 101325,
     "T": [733.15]
   },
-  "broadcast_conditions": false,
   "output": "ZPF",
   "values":   [
     [["__HYPERPLANE__", ["CU"], [0.05]], ["HCP_A3", ["CU"], [null]], ["CUMG2", ["CU"], [null]]]
@@ -964,7 +961,6 @@
 CR_NI_ZPF_DATA = {
     "components": ["CR", "NI", "VA"],
     "phases": ["BCC_A2", "FCC_A1"],
-    "broadcast_conditions": False,
     "conditions": {
         "T": [1073, 1173, 1273, 1373, 1548],
         "P": [101325.0]
@@ -1478,7 +1474,6 @@
 LI_SN_ZPF_DATA = {
     "components": ["LI", "SN"],
     "phases": ["LIQUID", "LI7SN2"],
-    "broadcast_conditions": False,
     "conditions": {
         "T": [1040],
         "P": [101325.0]