From b4fe89677dab2f8817ddb9d2a1ad32d47e171b11 Mon Sep 17 00:00:00 2001
From: Sevy Harris <sevy.harris@gmail.com>
Date: Wed, 27 May 2026 12:06:23 -0400
Subject: [PATCH 1/4] Add uncertainty tests for correlated library match

Add some tests to make sure the correct index is referenced when assembling thermo sources that use libraries. For example, if CH3 is estimated from a CH4 library + a radical correction, we want to make sure the uncertainty source points to CH4 as the index for the library value used (as opposed to CH3)
---
 test/rmgpy/tools/uncertaintyTest.py | 44 +++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/test/rmgpy/tools/uncertaintyTest.py b/test/rmgpy/tools/uncertaintyTest.py
index 35ac31049a..56df45cc8c 100644
--- a/test/rmgpy/tools/uncertaintyTest.py
+++ b/test/rmgpy/tools/uncertaintyTest.py
@@ -180,6 +180,50 @@ def test_uncertainty_assignment(self):
             rtol=1e-4
         )
 
+    def test_source_correlations(self):
+        # Check some examples of different species containing the same sources
+
+        # ------------------------------------------------------------------------------
+        # Make sure CH3 (Library + Radical) has a library index/value in common with CH4
+        i_CH4 = rmgpy.tools.uncertainty.get_i_thing(rmgpy.species.Species(smiles='C'), self.uncertainty.species_list)
+        assert i_CH4 >= 0
+
+        i_CH3 = rmgpy.tools.uncertainty.get_i_thing(rmgpy.species.Species(smiles='[CH3]'), self.uncertainty.species_list)
+        assert i_CH3 >= 0
+
+        self.uncertainty.extract_sources_from_model()
+        self.uncertainty.assign_parameter_uncertainties(correlated=True)
+        
+        src1 = self.uncertainty.species_sources_dict[self.uncertainty.species_list[i_CH4]]  # CH4
+        src2 = self.uncertainty.species_sources_dict[self.uncertainty.species_list[i_CH3]]  # CH3
+
+        assert 'Library' in src1
+        assert 'Library' in src2
+        assert 'GAV' in src2
+        assert src1['Library'] == src2['Library']  # make sure they refer to the same library source
+
+        # -----------------------------------------------------------------------------
+        # Make sure CH3X (Library + GAV + Adsorption Correction) has a library index/value in common with CH4 (Library)
+        i_CH3X = rmgpy.tools.uncertainty.get_i_thing(rmgpy.species.Species(smiles='C*'), self.uncertainty.species_list)
+        assert i_CH3X == -1
+        # This is not in the model, so add it to the species list
+        CH3X = rmgpy.species.Species(smiles='C*')
+        CH3X.thermo = self.uncertainty.database.thermo.get_thermo_data(CH3X)
+        self.uncertainty.species_list.append(CH3X)
+
+        self.uncertainty.extract_sources_from_model()
+        self.uncertainty.assign_parameter_uncertainties(correlated=True)
+
+        src1 = self.uncertainty.species_sources_dict[self.uncertainty.species_list[i_CH4]]  # CH4
+        src2 = self.uncertainty.species_sources_dict[self.uncertainty.species_list[i_CH3X]]  # CH3X
+
+        assert 'Library' in src1
+        assert 'Library' in src2
+        assert 'ADS' in src2
+        assert 'GAV' in src2
+        assert src1['Library'] == src2['Library']  # make sure they refer to the same library source
+        self.uncertainty.species_list.pop()  # remove the extra species so it doesn't affect other tests
+
     def test_local_analysis(self):
         """
         Test to run uncorrelated and then correlated local_analysis and make sure the results are expected

From c72e5217ef163edb2883ab55099f5b6c207c5de8 Mon Sep 17 00:00:00 2001
From: Sevy Harris <sevy.harris@gmail.com>
Date: Wed, 27 May 2026 16:52:49 -0400
Subject: [PATCH 2/4] refactor extract_source_from_comments in thermo.py

---
 rmgpy/data/thermo.py | 250 +++++++++++++++++++++++++++++--------------
 1 file changed, 169 insertions(+), 81 deletions(-)

diff --git a/rmgpy/data/thermo.py b/rmgpy/data/thermo.py
index 6f3e4d35ba..7aac848ead 100644
--- a/rmgpy/data/thermo.py
+++ b/rmgpy/data/thermo.py
@@ -2835,82 +2835,10 @@ def get_ring_groups_from_comments(self, thermo_data):
 
         return ring_groups, polycyclic_groups
 
-    def extract_source_from_comments(self, species):
-        """
-        `species`: A species object containing thermo data and thermo data comments
-        
-        Parses the verbose string of comments from the thermo data of the species object,
-        and extracts the thermo sources.
-
-        Returns a dictionary with keys of 'Library', 'QM', 'ADS', and/or 'GAV'.
-        Commonly, species thermo are estimated using only one of these sources.
-        However, a radical can be estimated with more than one type of source, for 
-        instance a saturated library value and a GAV HBI correction, or a QM saturated value
-        and a GAV HBI correction. Adsorbates can be estimated using a single library
-        for the adsorbate or a combination of a gas phase library for the
-        gas phase portion and an adsorption correction.
-        
-        source = {'Library': String_Name_of_Library_Used,
-                  'QM': String_of_Method_Used,
-                  'GAV': Dictionary_of_Groups_Used,
-                  'ADS': Dictionary_of_Adsorption_Group_Used,
-                  }
-                  
-        The Dictionary_of_Groups_Used looks like 
-        {'groupType':[List of tuples containing (Entry, Weight)]
-        """
-        comment = species.thermo.comment
-        tokens = comment.split()
 
-        source = {}
-
-        if comment.startswith('Thermo library'):
-            # Store name of the library source, which is the 3rd token in the comments
-            source['Library'] = tokens[2]
-
-        elif comment.startswith('QM'):
-            # Store the level of the calculation, which is the 2nd token in the comments
-            source['QM'] = tokens[1]
-
-        elif comment.startswith('Gas phase thermo'):
-            # Handle adsorption correction thermo data of the following format:
-            # Library example
-            # Gas phase thermo for C(T) from Thermo library: primaryThermoLibrary.
-            # Adsorption correction: + Thermo group additivity estimation: adsorptionPt111(Cq*)
-
-            # GAV example
-            # Gas phase thermo for [CH]CC from Thermo group additivity estimation: group(Cs-CsCsHH) + group(Cs-CsHHH) + group(Cs-CsHHH) + radical(CCJ2_triplet).
-            # Adsorption correction: + Thermo group additivity estimation: adsorptionPt111(C=*RCR3)"
-
-            comment = comment.replace(r'\n', ' ')
-            comment = comment.replace('\n', ' ')
-            if 'Adsorption correction:' not in comment:
-                raise ValueError(f'adsorption correction in unrecognized format {comment}')
-
-            # Handle the gas-phase portion first
-            gas_comment = comment.split('Adsorption correction: + ')[0].strip()
-            if gas_comment.endswith('.'):
-                gas_comment = gas_comment[:-1]  # delete the . at the end if it exists
-            gas_comment = gas_comment[gas_comment.find('from ', len('Gas phase thermo for ')) + len('from '):]
-            dummy_gas_phase_species = Species()
-            dummy_gas_phase_species.thermo = NASA()
-            dummy_gas_phase_species.thermo.comment = gas_comment
-            source = self.extract_source_from_comments(dummy_gas_phase_species)
-
-            # This is an adsorption correction
-            # comment is split into two parts: the gas phase, and the surface adsorption correction
-            ads_correction_comment = comment.split('Adsorption correction: +')[-1].strip()
-            dummy_adsorption_correction_species = Species()
-            dummy_adsorption_correction_species.thermo = NASA()
-            dummy_adsorption_correction_species.thermo.comment = ads_correction_comment
-            source['ADS'] = self.extract_source_from_comments(dummy_adsorption_correction_species)['GAV']
-
-            return source
-
-        # Check for group additivity contributions to the thermo in this species            
-
-        # The contribution of the groups can be either additive or subtracting
-        # after changes to the polycyclic algorithm
+    def _parse_gav_groups(self, comment):
+        """Extract the groups from the comment"""
+        groups = {}
 
         comment = comment.replace(' + ', ' +')
         comment = comment.replace(' - ', ' -')
@@ -2922,8 +2850,7 @@ def extract_source_from_comments(self, species):
         comment = comment.replace(')\n-', ') -')
         comment = comment.replace('\n', '')
         tokens = comment.split(' ')
-
-        groups = {}
+        
         group_types = list(self.groups.keys())
 
         regex = r"\((.*)\)"  # only hit outermost parentheses
@@ -2951,22 +2878,183 @@ def extract_source_from_comments(self, species):
 
         if groups:
             # Indicate that group additivity is used when it is either an HBI correction
-            # onto a  thermo library or QM value, or if the entire molecule is estimated using group additivity
+            # onto a thermo library or QM value, or if the entire molecule is estimated using group additivity
             # Save the groups into the source dictionary
 
             # Convert groups back into tuples 
             for groupType, groupDict in groups.items():
                 groups[groupType] = list(groupDict.items())
 
-            source['GAV'] = groups
+        return groups
+
+    def _parse_library_source(self, comment, library_species):
+        # handle the library source comment, which looks like "Thermo library: library_name"
+        # we then need to retrieve the specific library entry given the species
+        # but may have unfortunate line breaks in the middle
+
+        # trim the comment down to just the library portion so it starts with Thermo library:
+        split_loc = comment.find('Thermo library:')
+        if split_loc == -1:
+            raise ValueError(f"Expected 'Thermo library:' in comment, got {comment}")
+
+        comment = comment[split_loc:]
+
+        # library name is the token that comes immediately after 'Thermo library:'
+        assert 'Thermo library:' in comment, f"Expected 'Thermo library:' in comment, got {comment}"
+        tokens = comment.split()
+        library_name = tokens[2]
+
+        results = self.get_thermo_data_from_library(library_species, self.libraries[library_name])
+        if results is None:
+            raise DatabaseError(f"Could not find a library match for {library_species} in library {library_name}")
+
+        data, thermo_library, library_entry = results
+        return (library_name, library_entry)
+
+
+    def _parse_adsorption_correction(self, comment):
+        # handle the adsorption correction comment, which looks like
+        # "Adsorption correction: + Thermo group additivity estimation: adsorptionPt111(C-XR2CR3)"
+        # but may have unfortunate line breaks in the middle
+
+        # check that the number of tokens matches our expectation for an adsorption correction
+        # should be 8, maybe 9 if there was a weird line break
+        tokens = comment.split()
+        if len(tokens) not in [8, 9]:
+            raise ValueError(f"Expected 8 or 9 tokens in adsorption correction comment, got {len(tokens)}: {comment}")
+
+        ADS = self._parse_gav_groups(comment)
+
+        if len(ADS) > 1:
+            raise ValueError("Only adsorption corrections should be present in the adsorption correction portion of the comment. Found: {}".format(ADS))
+
+        return ADS
+
+    def extract_source_from_comments(self, species):
+        """
+        `species`: A species object containing thermo data and thermo data comments
+        
+        Parses the verbose string of comments from the thermo data of the species object,
+        and extracts the thermo sources.
+
+        Returns a dictionary with keys of 'Library', 'QM', 'ADS', and/or 'GAV'.
+        Commonly, species thermo are estimated using only one of these sources.
+        However, a radical can be estimated with more than one type of source, for 
+        instance a saturated library value and a GAV HBI correction, or a QM saturated value
+        and a GAV HBI correction. Adsorbates can be estimated using a single library
+        for the adsorbate or a combination of a gas phase library for the
+        gas phase portion and an adsorption correction.
+        
+        source = {'Library': String_Name_of_Library_Used,
+                  'QM': String_of_Method_Used,
+                  'GAV': Dictionary_of_Groups_Used,
+                  'ADS': Dictionary_of_Adsorption_Group_Used,
+                  }
+                  
+        The Dictionary_of_Groups_Used looks like 
+        {'groupType':[List of tuples containing (Entry, Weight)]
+        """
+
+        # TODO: solvent, electrocat, LSR
+        source = {}
+
+        comment = species.thermo.comment
+        tokens = comment.split()
+
+        ads_correction = 'Gas phase thermo' in comment and 'Adsorption correction:' in comment
+        library = 'Thermo library' in comment
+        QM = 'QM' in tokens
+        GAV = 'Thermo group additivity estimation:' in comment  # ambiguous since ads correction looks identical to group
+
+        # the biggest thing to split on first is the adsorption correction
+        if ads_correction:
+            # The source options here are:
+            # (Library(gas-phase species), Adsorption correction)
+            # (QM(gas-phase species), Adsorption correction) <--- not really, QM is dead/sleeping
+            # (GAV(gas-phase species), Adsorption correction)
+            # (Library(gas-phase species), GAV(radical correction), Adsorption correction)
+            # (QM(gas-phase species), GAV(radical correction), Adsorption correction) <--- not really, QM is dead/sleeping
+
+            # split the comment into the gas phase thermo portion and the adsorption correction portion
+            split_loc = comment.find('Adsorption correction:')
+            if split_loc == -1:
+                raise ValueError(f"Expected 'Adsorption correction:' in comment, got {comment}")
+            gas_comment = comment[:split_loc].strip()
+            ads_correction_comment = comment[split_loc:].strip()
+            source['ADS'] = self._parse_adsorption_correction(ads_correction_comment)
+            GAV = 'Thermo group additivity estimation:' in gas_comment
+            if GAV:
+                # Get groups first
+                source['GAV'] = self._parse_gav_groups(gas_comment)
+
+                if library:  # (Library(gas-phase species), GAV(radical correction), Adsorption correction)
+                    # this means the library species is the desorbed, saturated gas-phase version of the adsorbate
+                    desorbed_gas_species = Species()
+                    desorbed_gas_species.molecule = [species.molecule[0].get_desorbed_molecules()]  # does deepcopy
+
+                    assert desorbed_gas_species.molecule[0].is_radical(), "Method only valid for radicals."
+                    molecule = desorbed_gas_species.molecule[0]  # no need to deepcopy again since get_desorbed_molecules already does deepcopy
+                    molecule.saturate_radicals()  # note, this returns a dictionary instead of the Molecule object, but it modifies the molecule in place, so we can just ignore the returned dictionary
+                    saturated_desorbed_gas_species = Species(molecule=[molecule])
+                    source['Library'] = self._parse_library_source(gas_comment, saturated_desorbed_gas_species)
+                if QM:  # (QM(gas-phase species), GAV(radical correction), Adsorption correction) <--- not really, QM is dead/sleeping
+                    # whatever token comes immediately after 'QM' is the method used
+                    source['QM'] = tokens[tokens.index('QM') + 1]
+
+            else:
+                # no groups, so this is (Library + ADS) or (QM + ADS)
+                if library:
+                    # in this case, the library species is the desorbed gas-phase molecule of the adsorbate
+                    desorbed_gas_species = Species(molecule=[species.molecule[0].get_desorbed_molecules()])  # does deepcopy
+                    source['Library'] = self._parse_library_source(gas_comment, desorbed_gas_species)
+                if QM:
+                    # whatever token comes immediately after 'QM' is the method used
+                    source['QM'] = tokens[tokens.index('QM') + 1]
+
+        else:
+            # gas phase only, source options are:
+            # (Library)
+            # (QM)
+            # (GAV)
+            # (Library, GAV)
+            # (QM, GAV)
+
+            groups = self._parse_gav_groups(comment)
+            GAV = 'Thermo group additivity estimation:' in comment
+            if GAV and not groups:
+                raise ValueError("No groups were found in the comments but 'Thermo group additivity estimation:' was in the comment. Comment: {}".format(comment))
+            elif not GAV and groups:
+                if 'radical' not in groups.keys():
+                    raise ValueError("Groups were found in the comments but 'Thermo group additivity estimation:' was not in the comment. Comment: {}".format(comment))
+   
+            if groups:
+                # Get groups first
+                source['GAV'] = groups
+                if library:  # (Library, GAV)
+                    # get the saturated species for the library source
+                    if not 'radical' in groups.keys():
+                        raise ValueError("Method only valid for radicals, but no radical groups were found. Comment: {}".format(comment))
+                    
+                    molecule = deepcopy(species.molecule[0])
+                    assert molecule.is_radical(), "Method only valid for radicals."
+                    molecule.saturate_radicals()  # note, this returns a dictionary instead of the Molecule object, but it modifies the molecule in place, so we can just ignore the returned dictionary
+                    saturated_species = Species(molecule=[molecule])
+                    source['Library'] = self._parse_library_source(comment, saturated_species)
+                if QM:  # (QM, GAV) <--- not really, QM is dead/sleeping
+                    # whatever token comes immediately after 'QM' is the method used
+                    source['QM'] = tokens[tokens.index('QM') + 1]
+            else:  # (Library) or (QM)
+                if library:
+                    source['Library'] = self._parse_library_source(comment, species)
+                if QM:
+                    # whatever token comes immediately after 'QM' is the method used
+                    source['QM'] = tokens[tokens.index('QM') + 1]
 
-        # Perform a sanity check that this molecule is estimated by at least one method
         if not list(source.keys()):
             raise ValueError('Species {0} thermo appears to not be estimated using any methods.'.format(species))
 
         return source
 
-
 class ThermoCentralDatabaseInterface(object):
     """
     A class for interfacing with RMG online thermo central database.

From a94ac261765b772d416d31986d47bfd5368e3c0e Mon Sep 17 00:00:00 2001
From: Sevy Harris <sevy.harris@gmail.com>
Date: Wed, 27 May 2026 18:03:19 -0400
Subject: [PATCH 3/4] trying to update tests with new parse comment

---
 test/rmgpy/data/thermoTest.py | 106 ++++++++++------------------------
 1 file changed, 29 insertions(+), 77 deletions(-)

diff --git a/test/rmgpy/data/thermoTest.py b/test/rmgpy/data/thermoTest.py
index f28d73676f..1ea0ebf014 100644
--- a/test/rmgpy/data/thermoTest.py
+++ b/test/rmgpy/data/thermoTest.py
@@ -66,7 +66,7 @@ def setUpModule():
     database = RMGDatabase()
     database.load_thermo(
         os.path.join(settings["database.directory"], "thermo"),
-        thermo_libraries=["DFT_QCI_thermo", "SABIC_aromatics", "primaryThermoLibrary"],
+        thermo_libraries=["DFT_QCI_thermo", "SABIC_aromatics", "primaryThermoLibrary", "surfaceThermoPt111"],
         surface=True,
     )
     database.load_surface(os.path.join(settings["database.directory"], "surface"))
@@ -263,95 +263,47 @@ def test_parse_thermo_comments(self):
         assert len(source["GAV"]["ring"]) == 1
 
         # Pure library thermo
-        dipk = Species(
+        propane = Species(
             index=1,
-            label="DIPK",
+            label="propane",
             thermo=NASA(
                 polynomials=[
-                    NASAPolynomial(
-                        coeffs=[
-                            3.35002,
-                            0.017618,
-                            -2.46235e-05,
-                            1.7684e-08,
-                            -4.87962e-12,
-                            35555.7,
-                            5.75335,
-                        ],
-                        Tmin=(100, "K"),
-                        Tmax=(888.28, "K"),
-                    ),
-                    NASAPolynomial(
-                        coeffs=[
-                            6.36001,
-                            0.00406378,
-                            -1.73509e-06,
-                            5.05949e-10,
-                            -4.49975e-14,
-                            35021,
-                            -8.41155,
-                        ],
-                        Tmin=(888.28, "K"),
-                        Tmax=(5000, "K"),
-                    ),
+                    NASAPolynomial(coeffs=[0.4987,0.0308692,-7.94064e-06,-5.75209e-09,3.10575e-12,-14122,21.6027], Tmin=(298,'K'), Tmax=(1036.31,'K')),
+                    NASAPolynomial(coeffs=[2.0491,0.0302287,-1.47485e-05,3.60335e-09,-3.51534e-13,-14730.3,12.6832], Tmin=(1036.31,'K'), Tmax=(3000,'K'))
                 ],
-                Tmin=(100, "K"),
-                Tmax=(5000, "K"),
-                comment="""Thermo library: DIPK""",
+                Tmin=(298,'K'), Tmax=(3000,'K'), E0=(-119.854,'kJ/mol'), Cp0=(33.2579,'J/(mol*K)'), CpInf=(249.434,'J/(mol*K)'),
+                label="""C3H8""",
+                comment="""Thermo library: DFT_QCI_thermo"""
             ),
-            molecule=[Molecule(smiles="CC(C)C(=O)C(C)C")],
+            molecule=[Molecule(smiles="CCC")],
         )
 
-        source = self.database.extract_source_from_comments(dipk)
-        assert "Library" in source
+        CCC_source = self.database.extract_source_from_comments(propane)
+        assert "Library" in CCC_source
 
         # Mixed library and HBI thermo
-        dipk_rad = Species(
+        propane_rad = Species(
             index=4,
-            label="R_tert",
+            label="propane_rad",
             thermo=NASA(
                 polynomials=[
-                    NASAPolynomial(
-                        coeffs=[
-                            2.90061,
-                            0.0298018,
-                            -7.06268e-05,
-                            6.9636e-08,
-                            -2.42414e-11,
-                            54431,
-                            5.44492,
-                        ],
-                        Tmin=(100, "K"),
-                        Tmax=(882.19, "K"),
-                    ),
-                    NASAPolynomial(
-                        coeffs=[
-                            6.70999,
-                            0.000201027,
-                            6.65617e-07,
-                            -7.99543e-11,
-                            4.08721e-15,
-                            54238.6,
-                            -9.73662,
-                        ],
-                        Tmin=(882.19, "K"),
-                        Tmax=(5000, "K"),
-                    ),
+                    NASAPolynomial(coeffs=[0.599885,0.0320439,-2.63211e-05,1.15072e-08,-2.01662e-12,66381.4,20.2736], Tmin=(298,'K'), Tmax=(1331.58,'K')),
+                    NASAPolynomial(coeffs=[6.43539,0.0145141,-6.57382e-06,1.62046e-09,-1.60381e-13,64827.3,-9.55036], Tmin=(1331.58,'K'), Tmax=(3000,'K'))
                 ],
-                Tmin=(100, "K"),
-                Tmax=(5000, "K"),
-                comment="""Thermo library: DIPK + radical(C2CJCHO)""",
+                Tmin=(298,'K'), Tmax=(3000,'K'), E0=(549.676,'kJ/mol'), Cp0=(33.2579,'J/(mol*K)'), CpInf=(249.434,'J/(mol*K)'),
+                comment="""Thermo library: DFT_QCI_thermo + radical(CJ3)"""
             ),
             molecule=[
-                Molecule(smiles="C[C](C)C(=O)C(C)C"),
-                Molecule(smiles="CC(C)=C([O])C(C)C"),
+                Molecule(smiles="CC[C]")
             ],
         )
 
-        source = self.database.extract_source_from_comments(dipk_rad)
-        assert "Library" in source
-        assert "GAV" in source
-        assert len(source["GAV"]["radical"]) == 1
+        CCC_rad_source = self.database.extract_source_from_comments(propane_rad)
+        assert "Library" in CCC_rad_source
+        assert "GAV" in CCC_rad_source
+        assert len(CCC_rad_source["GAV"]["radical"]) == 1
+
+        assert CCC_source["Library"] == CCC_rad_source["Library"], "The library source should be the same for the radical and non-radical species since they come from the same library entry."
 
         # Pure QM thermo
         cineole = Species(
@@ -579,7 +531,7 @@ def test_parse_thermo_comments(self):
         OX.thermo.comment = 'Thermo library: surfaceThermoPt111'
         source = self.database.extract_source_from_comments(OX)
         assert "Library" in source
-        assert source["Library"] == "surfaceThermoPt111"
+        assert source["Library"][0] == "surfaceThermoPt111"
 
         # Gas library + adsorption correction
         CH2X = rmgpy.species.Species(smiles="[CH2]=*")
@@ -587,7 +539,7 @@ def test_parse_thermo_comments(self):
         CH2X.thermo.comment = 'Gas phase thermo for CH2(T) from Thermo library: primaryThermoLibrary. Adsorption correction: + Thermo group additivity estimation:\nadsorptionPt111(C=XR2)'
         source = self.database.extract_source_from_comments(CH2X)
         assert "Library" in source
-        assert source["Library"] == "primaryThermoLibrary"
+        assert source["Library"][0] == "primaryThermoLibrary"
         assert "ADS" in source
         assert source['ADS']['adsorptionPt111'][0][0].label == 'C=XR2'
         assert source['ADS']['adsorptionPt111'][0][1] == 1  # weight should be 1
@@ -613,7 +565,7 @@ def test_parse_thermo_comments(self):
         CHOX.thermo.comment = 'Gas phase thermo for [CH]=O from Thermo library: primaryThermoLibrary + radical(HCdsJO). Adsorption correction: + Thermo group additivity estimation: adsorptionPt111(C-XR3)'
         source = self.database.extract_source_from_comments(CHOX)
         assert "Library" in source
-        assert source["Library"] == "primaryThermoLibrary"
+        assert source["Library"][0] == "primaryThermoLibrary"
         assert "GAV" in source
         assert source["GAV"]["radical"][0][0].label == "HCdsJO"
         assert source["GAV"]["radical"][0][1] == 1  # weight should be 1
@@ -629,7 +581,7 @@ def test_parse_thermo_comments(self):
         OX.thermo.comment = OX_comment  # set the comment to be the generated comment
         source = self.database.extract_source_from_comments(OX)
         assert "Library" in source
-        assert source["Library"] == "primaryThermoLibrary"
+        assert source["Library"][0] == "primaryThermoLibrary"
         assert 'ADS' in source
         assert source['ADS']['adsorptionPt111'][0][0].label == 'OX'
         assert source['ADS']['adsorptionPt111'][0][1] == 1  # weight should be 1
@@ -640,7 +592,7 @@ def test_parse_thermo_comments(self):
         CH2CHCH_ads.thermo = self.database.get_thermo_data(CH2CHCH_ads)
         source = self.database.extract_source_from_comments(CH2CHCH_ads)
         assert "Library" in source
-        assert source["Library"] == "DFT_QCI_thermo"
+        assert source["Library"][0] == "DFT_QCI_thermo"
         assert "GAV" in source
         assert source["GAV"]["radical"][0][0].label == "AllylJ2_triplet"
         assert source["GAV"]["radical"][0][1] == 1  # weight should be 1

From e6b5f6021a87da60d260d8e369ddd444dda9f1ed Mon Sep 17 00:00:00 2001
From: Sevy Harris <sevy.harris@gmail.com>
Date: Wed, 27 May 2026 18:30:10 -0400
Subject: [PATCH 4/4] almost got it, updates to thermoTest

---
 rmgpy/data/thermo.py          | 13 ++++++++++---
 test/rmgpy/data/thermoTest.py | 16 ++++++++++++----
 2 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/rmgpy/data/thermo.py b/rmgpy/data/thermo.py
index 7aac848ead..fca8f42993 100644
--- a/rmgpy/data/thermo.py
+++ b/rmgpy/data/thermo.py
@@ -2848,9 +2848,12 @@ def _parse_gav_groups(self, comment):
         # groups are still split by spaces
         comment = comment.replace(')\n+', ') +')
         comment = comment.replace(')\n-', ') -')
+        # `Thermo group additivity estimation:\nadsorptionPt111(...)` shows up in
+        # adsorbate comments - keep the trailing colon separated from the group token.
+        comment = comment.replace(':\n', ': ')
         comment = comment.replace('\n', '')
         tokens = comment.split(' ')
-        
+
         group_types = list(self.groups.keys())
 
         regex = r"\((.*)\)"  # only hit outermost parentheses
@@ -2980,6 +2983,8 @@ def extract_source_from_comments(self, species):
             if split_loc == -1:
                 raise ValueError(f"Expected 'Adsorption correction:' in comment, got {comment}")
             gas_comment = comment[:split_loc].strip()
+            if gas_comment.endswith('.'):
+                gas_comment = gas_comment[:-1]  # the period that closed the gas-phase sentence
             ads_correction_comment = comment[split_loc:].strip()
             source['ADS'] = self._parse_adsorption_correction(ads_correction_comment)
             GAV = 'Thermo group additivity estimation:' in gas_comment
@@ -2990,7 +2995,8 @@ def extract_source_from_comments(self, species):
                 if library:  # (Library(gas-phase species), GAV(radical correction), Adsorption correction)
                     # this means the library species is the desorbed, saturated gas-phase version of the adsorbate
                     desorbed_gas_species = Species()
-                    desorbed_gas_species.molecule = [species.molecule[0].get_desorbed_molecules()]  # does deepcopy
+                    # get_desorbed_molecules already returns a list of Molecule objects
+                    desorbed_gas_species.molecule = species.molecule[0].get_desorbed_molecules()  # does deepcopy
 
                     assert desorbed_gas_species.molecule[0].is_radical(), "Method only valid for radicals."
                     molecule = desorbed_gas_species.molecule[0]  # no need to deepcopy again since get_desorbed_molecules already does deepcopy
@@ -3005,7 +3011,8 @@ def extract_source_from_comments(self, species):
                 # no groups, so this is (Library + ADS) or (QM + ADS)
                 if library:
                     # in this case, the library species is the desorbed gas-phase molecule of the adsorbate
-                    desorbed_gas_species = Species(molecule=[species.molecule[0].get_desorbed_molecules()])  # does deepcopy
+                    # get_desorbed_molecules already returns a list of Molecule objects
+                    desorbed_gas_species = Species(molecule=species.molecule[0].get_desorbed_molecules())  # does deepcopy
                     source['Library'] = self._parse_library_source(gas_comment, desorbed_gas_species)
                 if QM:
                     # whatever token comes immediately after 'QM' is the method used
diff --git a/test/rmgpy/data/thermoTest.py b/test/rmgpy/data/thermoTest.py
index 1ea0ebf014..b253da4042 100644
--- a/test/rmgpy/data/thermoTest.py
+++ b/test/rmgpy/data/thermoTest.py
@@ -66,7 +66,7 @@ def setUpModule():
     database = RMGDatabase()
     database.load_thermo(
         os.path.join(settings["database.directory"], "thermo"),
-        thermo_libraries=["DFT_QCI_thermo", "SABIC_aromatics", "primaryThermoLibrary", "surfaceThermoPt111"],
+        thermo_libraries=["DFT_QCI_thermo", "SABIC_aromatics", "primaryThermoLibrary"],
         surface=True,
     )
     database.load_surface(os.path.join(settings["database.directory"], "surface"))
@@ -123,6 +123,14 @@ def setup_class(cls):
         )
         cls.databaseWithoutLibraries.set_binding_energies("Pt111")
 
+        cls.databaseWithSurfaceThermoPt111 = ThermoDatabase()
+        cls.databaseWithSurfaceThermoPt111.load(
+            os.path.join(settings["database.directory"], "thermo"),
+            libraries=["DFT_QCI_thermo", "SABIC_aromatics", "primaryThermoLibrary", "surfaceThermoPt111"],
+            surface=True,
+        )
+        cls.databaseWithSurfaceThermoPt111.set_binding_energies("Pt111")
+
         # Set up ML estimator - temporarily removed, see rmgpy.ml.estimator
         # models_path = os.path.join(settings["database.directory"], "thermo", "ml", "main")
         # hf298_path = os.path.join(models_path, "hf298")
@@ -529,7 +537,7 @@ def test_parse_thermo_comments(self):
         OX = rmgpy.species.Species(smiles="O=*")
         OX.thermo = rmgpy.thermo.NASA()
         OX.thermo.comment = 'Thermo library: surfaceThermoPt111'
-        source = self.database.extract_source_from_comments(OX)
+        source = self.databaseWithSurfaceThermoPt111.extract_source_from_comments(OX)
         assert "Library" in source
         assert source["Library"][0] == "surfaceThermoPt111"
 
@@ -562,10 +570,10 @@ def test_parse_thermo_comments(self):
         # Gas library + radical for HBI + adsorption correction
         CHOX = rmgpy.species.Species(smiles="O=[CH]*")
         CHOX.thermo = rmgpy.thermo.NASA()
-        CHOX.thermo.comment = 'Gas phase thermo for [CH]=O from Thermo library: primaryThermoLibrary + radical(HCdsJO). Adsorption correction: + Thermo group additivity estimation: adsorptionPt111(C-XR3)'
+        CHOX.thermo.comment = 'Gas phase thermo for [CH]=O from Thermo library: DFT_QCI_thermo + radical(HCdsJO). Adsorption correction: + Thermo group additivity estimation: adsorptionPt111(C-XR3)'
         source = self.database.extract_source_from_comments(CHOX)
         assert "Library" in source
-        assert source["Library"][0] == "primaryThermoLibrary"
+        assert source["Library"][0] == "DFT_QCI_thermo"
         assert "GAV" in source
         assert source["GAV"]["radical"][0][0].label == "HCdsJO"
         assert source["GAV"]["radical"][0][1] == 1  # weight should be 1