diff --git a/rmgpy/data/thermo.py b/rmgpy/data/thermo.py index 6f3e4d35ba..fca8f42993 100644 --- a/rmgpy/data/thermo.py +++ b/rmgpy/data/thermo.py @@ -2835,82 +2835,10 @@ def get_ring_groups_from_comments(self, thermo_data): return ring_groups, polycyclic_groups - def extract_source_from_comments(self, species): - """ - `species`: A species object containing thermo data and thermo data comments - - Parses the verbose string of comments from the thermo data of the species object, - and extracts the thermo sources. - - Returns a dictionary with keys of 'Library', 'QM', 'ADS', and/or 'GAV'. - Commonly, species thermo are estimated using only one of these sources. - However, a radical can be estimated with more than one type of source, for - instance a saturated library value and a GAV HBI correction, or a QM saturated value - and a GAV HBI correction. Adsorbates can be estimated using a single library - for the adsorbate or a combination of a gas phase library for the - gas phase portion and an adsorption correction. - - source = {'Library': String_Name_of_Library_Used, - 'QM': String_of_Method_Used, - 'GAV': Dictionary_of_Groups_Used, - 'ADS': Dictionary_of_Adsorption_Group_Used, - } - - The Dictionary_of_Groups_Used looks like - {'groupType':[List of tuples containing (Entry, Weight)] - """ - comment = species.thermo.comment - tokens = comment.split() - - source = {} - - if comment.startswith('Thermo library'): - # Store name of the library source, which is the 3rd token in the comments - source['Library'] = tokens[2] - - elif comment.startswith('QM'): - # Store the level of the calculation, which is the 2nd token in the comments - source['QM'] = tokens[1] - - elif comment.startswith('Gas phase thermo'): - # Handle adsorption correction thermo data of the following format: - # Library example - # Gas phase thermo for C(T) from Thermo library: primaryThermoLibrary. - # Adsorption correction: + Thermo group additivity estimation: adsorptionPt111(Cq*) - - # GAV example - # Gas phase thermo for [CH]CC from Thermo group additivity estimation: group(Cs-CsCsHH) + group(Cs-CsHHH) + group(Cs-CsHHH) + radical(CCJ2_triplet). - # Adsorption correction: + Thermo group additivity estimation: adsorptionPt111(C=*RCR3)" - - comment = comment.replace(r'\n', ' ') - comment = comment.replace('\n', ' ') - if 'Adsorption correction:' not in comment: - raise ValueError(f'adsorption correction in unrecognized format {comment}') - - # Handle the gas-phase portion first - gas_comment = comment.split('Adsorption correction: + ')[0].strip() - if gas_comment.endswith('.'): - gas_comment = gas_comment[:-1] # delete the . at the end if it exists - gas_comment = gas_comment[gas_comment.find('from ', len('Gas phase thermo for ')) + len('from '):] - dummy_gas_phase_species = Species() - dummy_gas_phase_species.thermo = NASA() - dummy_gas_phase_species.thermo.comment = gas_comment - source = self.extract_source_from_comments(dummy_gas_phase_species) - # This is an adsorption correction - # comment is split into two parts: the gas phase, and the surface adsorption correction - ads_correction_comment = comment.split('Adsorption correction: +')[-1].strip() - dummy_adsorption_correction_species = Species() - dummy_adsorption_correction_species.thermo = NASA() - dummy_adsorption_correction_species.thermo.comment = ads_correction_comment - source['ADS'] = self.extract_source_from_comments(dummy_adsorption_correction_species)['GAV'] - - return source - - # Check for group additivity contributions to the thermo in this species - - # The contribution of the groups can be either additive or subtracting - # after changes to the polycyclic algorithm + def _parse_gav_groups(self, comment): + """Extract the groups from the comment""" + groups = {} comment = comment.replace(' + ', ' +') comment = comment.replace(' - ', ' -') @@ -2920,10 +2848,12 @@ def extract_source_from_comments(self, species): # groups are still split by spaces comment = comment.replace(')\n+', ') +') comment = comment.replace(')\n-', ') -') + # `Thermo group additivity estimation:\nadsorptionPt111(...)` shows up in + # adsorbate comments - keep the trailing colon separated from the group token. + comment = comment.replace(':\n', ': ') comment = comment.replace('\n', '') tokens = comment.split(' ') - groups = {} group_types = list(self.groups.keys()) regex = r"\((.*)\)" # only hit outermost parentheses @@ -2951,22 +2881,187 @@ def extract_source_from_comments(self, species): if groups: # Indicate that group additivity is used when it is either an HBI correction - # onto a thermo library or QM value, or if the entire molecule is estimated using group additivity + # onto a thermo library or QM value, or if the entire molecule is estimated using group additivity # Save the groups into the source dictionary # Convert groups back into tuples for groupType, groupDict in groups.items(): groups[groupType] = list(groupDict.items()) - source['GAV'] = groups + return groups + + def _parse_library_source(self, comment, library_species): + # handle the library source comment, which looks like "Thermo library: library_name" + # we then need to retrieve the specific library entry given the species + # but may have unfortunate line breaks in the middle + + # trim the comment down to just the library portion so it starts with Thermo library: + split_loc = comment.find('Thermo library:') + if split_loc == -1: + raise ValueError(f"Expected 'Thermo library:' in comment, got {comment}") + + comment = comment[split_loc:] + + # library name is the token that comes immediately after 'Thermo library:' + assert 'Thermo library:' in comment, f"Expected 'Thermo library:' in comment, got {comment}" + tokens = comment.split() + library_name = tokens[2] + + results = self.get_thermo_data_from_library(library_species, self.libraries[library_name]) + if results is None: + raise DatabaseError(f"Could not find a library match for {library_species} in library {library_name}") + + data, thermo_library, library_entry = results + return (library_name, library_entry) + + + def _parse_adsorption_correction(self, comment): + # handle the adsorption correction comment, which looks like + # "Adsorption correction: + Thermo group additivity estimation: adsorptionPt111(C-XR2CR3)" + # but may have unfortunate line breaks in the middle + + # check that the number of tokens matches our expectation for an adsorption correction + # should be 8, maybe 9 if there was a weird line break + tokens = comment.split() + if len(tokens) not in [8, 9]: + raise ValueError(f"Expected 8 or 9 tokens in adsorption correction comment, got {len(tokens)}: {comment}") + + ADS = self._parse_gav_groups(comment) + + if len(ADS) > 1: + raise ValueError("Only adsorption corrections should be present in the adsorption correction portion of the comment. Found: {}".format(ADS)) + + return ADS + + def extract_source_from_comments(self, species): + """ + `species`: A species object containing thermo data and thermo data comments + + Parses the verbose string of comments from the thermo data of the species object, + and extracts the thermo sources. + + Returns a dictionary with keys of 'Library', 'QM', 'ADS', and/or 'GAV'. + Commonly, species thermo are estimated using only one of these sources. + However, a radical can be estimated with more than one type of source, for + instance a saturated library value and a GAV HBI correction, or a QM saturated value + and a GAV HBI correction. Adsorbates can be estimated using a single library + for the adsorbate or a combination of a gas phase library for the + gas phase portion and an adsorption correction. + + source = {'Library': String_Name_of_Library_Used, + 'QM': String_of_Method_Used, + 'GAV': Dictionary_of_Groups_Used, + 'ADS': Dictionary_of_Adsorption_Group_Used, + } + + The Dictionary_of_Groups_Used looks like + {'groupType':[List of tuples containing (Entry, Weight)] + """ + + # TODO: solvent, electrocat, LSR + source = {} + + comment = species.thermo.comment + tokens = comment.split() + + ads_correction = 'Gas phase thermo' in comment and 'Adsorption correction:' in comment + library = 'Thermo library' in comment + QM = 'QM' in tokens + GAV = 'Thermo group additivity estimation:' in comment # ambiguous since ads correction looks identical to group + + # the biggest thing to split on first is the adsorption correction + if ads_correction: + # The source options here are: + # (Library(gas-phase species), Adsorption correction) + # (QM(gas-phase species), Adsorption correction) <--- not really, QM is dead/sleeping + # (GAV(gas-phase species), Adsorption correction) + # (Library(gas-phase species), GAV(radical correction), Adsorption correction) + # (QM(gas-phase species), GAV(radical correction), Adsorption correction) <--- not really, QM is dead/sleeping + + # split the comment into the gas phase thermo portion and the adsorption correction portion + split_loc = comment.find('Adsorption correction:') + if split_loc == -1: + raise ValueError(f"Expected 'Adsorption correction:' in comment, got {comment}") + gas_comment = comment[:split_loc].strip() + if gas_comment.endswith('.'): + gas_comment = gas_comment[:-1] # the period that closed the gas-phase sentence + ads_correction_comment = comment[split_loc:].strip() + source['ADS'] = self._parse_adsorption_correction(ads_correction_comment) + GAV = 'Thermo group additivity estimation:' in gas_comment + if GAV: + # Get groups first + source['GAV'] = self._parse_gav_groups(gas_comment) + + if library: # (Library(gas-phase species), GAV(radical correction), Adsorption correction) + # this means the library species is the desorbed, saturated gas-phase version of the adsorbate + desorbed_gas_species = Species() + # get_desorbed_molecules already returns a list of Molecule objects + desorbed_gas_species.molecule = species.molecule[0].get_desorbed_molecules() # does deepcopy + + assert desorbed_gas_species.molecule[0].is_radical(), "Method only valid for radicals." + molecule = desorbed_gas_species.molecule[0] # no need to deepcopy again since get_desorbed_molecules already does deepcopy + molecule.saturate_radicals() # note, this returns a dictionary instead of the Molecule object, but it modifies the molecule in place, so we can just ignore the returned dictionary + saturated_desorbed_gas_species = Species(molecule=[molecule]) + source['Library'] = self._parse_library_source(gas_comment, saturated_desorbed_gas_species) + if QM: # (QM(gas-phase species), GAV(radical correction), Adsorption correction) <--- not really, QM is dead/sleeping + # whatever token comes immediately after 'QM' is the method used + source['QM'] = tokens[tokens.index('QM') + 1] + + else: + # no groups, so this is (Library + ADS) or (QM + ADS) + if library: + # in this case, the library species is the desorbed gas-phase molecule of the adsorbate + # get_desorbed_molecules already returns a list of Molecule objects + desorbed_gas_species = Species(molecule=species.molecule[0].get_desorbed_molecules()) # does deepcopy + source['Library'] = self._parse_library_source(gas_comment, desorbed_gas_species) + if QM: + # whatever token comes immediately after 'QM' is the method used + source['QM'] = tokens[tokens.index('QM') + 1] + + else: + # gas phase only, source options are: + # (Library) + # (QM) + # (GAV) + # (Library, GAV) + # (QM, GAV) + + groups = self._parse_gav_groups(comment) + GAV = 'Thermo group additivity estimation:' in comment + if GAV and not groups: + raise ValueError("No groups were found in the comments but 'Thermo group additivity estimation:' was in the comment. Comment: {}".format(comment)) + elif not GAV and groups: + if 'radical' not in groups.keys(): + raise ValueError("Groups were found in the comments but 'Thermo group additivity estimation:' was not in the comment. Comment: {}".format(comment)) + + if groups: + # Get groups first + source['GAV'] = groups + if library: # (Library, GAV) + # get the saturated species for the library source + if not 'radical' in groups.keys(): + raise ValueError("Method only valid for radicals, but no radical groups were found. Comment: {}".format(comment)) + + molecule = deepcopy(species.molecule[0]) + assert molecule.is_radical(), "Method only valid for radicals." + molecule.saturate_radicals() # note, this returns a dictionary instead of the Molecule object, but it modifies the molecule in place, so we can just ignore the returned dictionary + saturated_species = Species(molecule=[molecule]) + source['Library'] = self._parse_library_source(comment, saturated_species) + if QM: # (QM, GAV) <--- not really, QM is dead/sleeping + # whatever token comes immediately after 'QM' is the method used + source['QM'] = tokens[tokens.index('QM') + 1] + else: # (Library) or (QM) + if library: + source['Library'] = self._parse_library_source(comment, species) + if QM: + # whatever token comes immediately after 'QM' is the method used + source['QM'] = tokens[tokens.index('QM') + 1] - # Perform a sanity check that this molecule is estimated by at least one method if not list(source.keys()): raise ValueError('Species {0} thermo appears to not be estimated using any methods.'.format(species)) return source - class ThermoCentralDatabaseInterface(object): """ A class for interfacing with RMG online thermo central database. diff --git a/test/rmgpy/data/thermoTest.py b/test/rmgpy/data/thermoTest.py index f28d73676f..b253da4042 100644 --- a/test/rmgpy/data/thermoTest.py +++ b/test/rmgpy/data/thermoTest.py @@ -123,6 +123,14 @@ def setup_class(cls): ) cls.databaseWithoutLibraries.set_binding_energies("Pt111") + cls.databaseWithSurfaceThermoPt111 = ThermoDatabase() + cls.databaseWithSurfaceThermoPt111.load( + os.path.join(settings["database.directory"], "thermo"), + libraries=["DFT_QCI_thermo", "SABIC_aromatics", "primaryThermoLibrary", "surfaceThermoPt111"], + surface=True, + ) + cls.databaseWithSurfaceThermoPt111.set_binding_energies("Pt111") + # Set up ML estimator - temporarily removed, see rmgpy.ml.estimator # models_path = os.path.join(settings["database.directory"], "thermo", "ml", "main") # hf298_path = os.path.join(models_path, "hf298") @@ -263,95 +271,47 @@ def test_parse_thermo_comments(self): assert len(source["GAV"]["ring"]) == 1 # Pure library thermo - dipk = Species( + propane = Species( index=1, - label="DIPK", + label="propane", thermo=NASA( polynomials=[ - NASAPolynomial( - coeffs=[ - 3.35002, - 0.017618, - -2.46235e-05, - 1.7684e-08, - -4.87962e-12, - 35555.7, - 5.75335, - ], - Tmin=(100, "K"), - Tmax=(888.28, "K"), - ), - NASAPolynomial( - coeffs=[ - 6.36001, - 0.00406378, - -1.73509e-06, - 5.05949e-10, - -4.49975e-14, - 35021, - -8.41155, - ], - Tmin=(888.28, "K"), - Tmax=(5000, "K"), - ), + NASAPolynomial(coeffs=[0.4987,0.0308692,-7.94064e-06,-5.75209e-09,3.10575e-12,-14122,21.6027], Tmin=(298,'K'), Tmax=(1036.31,'K')), + NASAPolynomial(coeffs=[2.0491,0.0302287,-1.47485e-05,3.60335e-09,-3.51534e-13,-14730.3,12.6832], Tmin=(1036.31,'K'), Tmax=(3000,'K')) ], - Tmin=(100, "K"), - Tmax=(5000, "K"), - comment="""Thermo library: DIPK""", + Tmin=(298,'K'), Tmax=(3000,'K'), E0=(-119.854,'kJ/mol'), Cp0=(33.2579,'J/(mol*K)'), CpInf=(249.434,'J/(mol*K)'), + label="""C3H8""", + comment="""Thermo library: DFT_QCI_thermo""" ), - molecule=[Molecule(smiles="CC(C)C(=O)C(C)C")], + molecule=[Molecule(smiles="CCC")], ) - source = self.database.extract_source_from_comments(dipk) - assert "Library" in source + CCC_source = self.database.extract_source_from_comments(propane) + assert "Library" in CCC_source # Mixed library and HBI thermo - dipk_rad = Species( + propane_rad = Species( index=4, - label="R_tert", + label="propane_rad", thermo=NASA( polynomials=[ - NASAPolynomial( - coeffs=[ - 2.90061, - 0.0298018, - -7.06268e-05, - 6.9636e-08, - -2.42414e-11, - 54431, - 5.44492, - ], - Tmin=(100, "K"), - Tmax=(882.19, "K"), - ), - NASAPolynomial( - coeffs=[ - 6.70999, - 0.000201027, - 6.65617e-07, - -7.99543e-11, - 4.08721e-15, - 54238.6, - -9.73662, - ], - Tmin=(882.19, "K"), - Tmax=(5000, "K"), - ), + NASAPolynomial(coeffs=[0.599885,0.0320439,-2.63211e-05,1.15072e-08,-2.01662e-12,66381.4,20.2736], Tmin=(298,'K'), Tmax=(1331.58,'K')), + NASAPolynomial(coeffs=[6.43539,0.0145141,-6.57382e-06,1.62046e-09,-1.60381e-13,64827.3,-9.55036], Tmin=(1331.58,'K'), Tmax=(3000,'K')) ], - Tmin=(100, "K"), - Tmax=(5000, "K"), - comment="""Thermo library: DIPK + radical(C2CJCHO)""", + Tmin=(298,'K'), Tmax=(3000,'K'), E0=(549.676,'kJ/mol'), Cp0=(33.2579,'J/(mol*K)'), CpInf=(249.434,'J/(mol*K)'), + comment="""Thermo library: DFT_QCI_thermo + radical(CJ3)""" ), molecule=[ - Molecule(smiles="C[C](C)C(=O)C(C)C"), - Molecule(smiles="CC(C)=C([O])C(C)C"), + Molecule(smiles="CC[C]") ], ) - source = self.database.extract_source_from_comments(dipk_rad) - assert "Library" in source - assert "GAV" in source - assert len(source["GAV"]["radical"]) == 1 + CCC_rad_source = self.database.extract_source_from_comments(propane_rad) + assert "Library" in CCC_rad_source + assert "GAV" in CCC_rad_source + assert len(CCC_rad_source["GAV"]["radical"]) == 1 + + assert CCC_source["Library"] == CCC_rad_source["Library"], "The library source should be the same for the radical and non-radical species since they come from the same library entry." # Pure QM thermo cineole = Species( @@ -577,9 +537,9 @@ def test_parse_thermo_comments(self): OX = rmgpy.species.Species(smiles="O=*") OX.thermo = rmgpy.thermo.NASA() OX.thermo.comment = 'Thermo library: surfaceThermoPt111' - source = self.database.extract_source_from_comments(OX) + source = self.databaseWithSurfaceThermoPt111.extract_source_from_comments(OX) assert "Library" in source - assert source["Library"] == "surfaceThermoPt111" + assert source["Library"][0] == "surfaceThermoPt111" # Gas library + adsorption correction CH2X = rmgpy.species.Species(smiles="[CH2]=*") @@ -587,7 +547,7 @@ def test_parse_thermo_comments(self): CH2X.thermo.comment = 'Gas phase thermo for CH2(T) from Thermo library: primaryThermoLibrary. Adsorption correction: + Thermo group additivity estimation:\nadsorptionPt111(C=XR2)' source = self.database.extract_source_from_comments(CH2X) assert "Library" in source - assert source["Library"] == "primaryThermoLibrary" + assert source["Library"][0] == "primaryThermoLibrary" assert "ADS" in source assert source['ADS']['adsorptionPt111'][0][0].label == 'C=XR2' assert source['ADS']['adsorptionPt111'][0][1] == 1 # weight should be 1 @@ -610,10 +570,10 @@ def test_parse_thermo_comments(self): # Gas library + radical for HBI + adsorption correction CHOX = rmgpy.species.Species(smiles="O=[CH]*") CHOX.thermo = rmgpy.thermo.NASA() - CHOX.thermo.comment = 'Gas phase thermo for [CH]=O from Thermo library: primaryThermoLibrary + radical(HCdsJO). Adsorption correction: + Thermo group additivity estimation: adsorptionPt111(C-XR3)' + CHOX.thermo.comment = 'Gas phase thermo for [CH]=O from Thermo library: DFT_QCI_thermo + radical(HCdsJO). Adsorption correction: + Thermo group additivity estimation: adsorptionPt111(C-XR3)' source = self.database.extract_source_from_comments(CHOX) assert "Library" in source - assert source["Library"] == "primaryThermoLibrary" + assert source["Library"][0] == "DFT_QCI_thermo" assert "GAV" in source assert source["GAV"]["radical"][0][0].label == "HCdsJO" assert source["GAV"]["radical"][0][1] == 1 # weight should be 1 @@ -629,7 +589,7 @@ def test_parse_thermo_comments(self): OX.thermo.comment = OX_comment # set the comment to be the generated comment source = self.database.extract_source_from_comments(OX) assert "Library" in source - assert source["Library"] == "primaryThermoLibrary" + assert source["Library"][0] == "primaryThermoLibrary" assert 'ADS' in source assert source['ADS']['adsorptionPt111'][0][0].label == 'OX' assert source['ADS']['adsorptionPt111'][0][1] == 1 # weight should be 1 @@ -640,7 +600,7 @@ def test_parse_thermo_comments(self): CH2CHCH_ads.thermo = self.database.get_thermo_data(CH2CHCH_ads) source = self.database.extract_source_from_comments(CH2CHCH_ads) assert "Library" in source - assert source["Library"] == "DFT_QCI_thermo" + assert source["Library"][0] == "DFT_QCI_thermo" assert "GAV" in source assert source["GAV"]["radical"][0][0].label == "AllylJ2_triplet" assert source["GAV"]["radical"][0][1] == 1 # weight should be 1 diff --git a/test/rmgpy/tools/uncertaintyTest.py b/test/rmgpy/tools/uncertaintyTest.py index 35ac31049a..56df45cc8c 100644 --- a/test/rmgpy/tools/uncertaintyTest.py +++ b/test/rmgpy/tools/uncertaintyTest.py @@ -180,6 +180,50 @@ def test_uncertainty_assignment(self): rtol=1e-4 ) + def test_source_correlations(self): + # Check some examples of different species containing the same sources + + # ------------------------------------------------------------------------------ + # Make sure CH3 (Library + Radical) has a library index/value in common with CH4 + i_CH4 = rmgpy.tools.uncertainty.get_i_thing(rmgpy.species.Species(smiles='C'), self.uncertainty.species_list) + assert i_CH4 >= 0 + + i_CH3 = rmgpy.tools.uncertainty.get_i_thing(rmgpy.species.Species(smiles='[CH3]'), self.uncertainty.species_list) + assert i_CH3 >= 0 + + self.uncertainty.extract_sources_from_model() + self.uncertainty.assign_parameter_uncertainties(correlated=True) + + src1 = self.uncertainty.species_sources_dict[self.uncertainty.species_list[i_CH4]] # CH4 + src2 = self.uncertainty.species_sources_dict[self.uncertainty.species_list[i_CH3]] # CH3 + + assert 'Library' in src1 + assert 'Library' in src2 + assert 'GAV' in src2 + assert src1['Library'] == src2['Library'] # make sure they refer to the same library source + + # ----------------------------------------------------------------------------- + # Make sure CH3X (Library + GAV + Adsorption Correction) has a library index/value in common with CH4 (Library) + i_CH3X = rmgpy.tools.uncertainty.get_i_thing(rmgpy.species.Species(smiles='C*'), self.uncertainty.species_list) + assert i_CH3X == -1 + # This is not in the model, so add it to the species list + CH3X = rmgpy.species.Species(smiles='C*') + CH3X.thermo = self.uncertainty.database.thermo.get_thermo_data(CH3X) + self.uncertainty.species_list.append(CH3X) + + self.uncertainty.extract_sources_from_model() + self.uncertainty.assign_parameter_uncertainties(correlated=True) + + src1 = self.uncertainty.species_sources_dict[self.uncertainty.species_list[i_CH4]] # CH4 + src2 = self.uncertainty.species_sources_dict[self.uncertainty.species_list[i_CH3X]] # CH3X + + assert 'Library' in src1 + assert 'Library' in src2 + assert 'ADS' in src2 + assert 'GAV' in src2 + assert src1['Library'] == src2['Library'] # make sure they refer to the same library source + self.uncertainty.species_list.pop() # remove the extra species so it doesn't affect other tests + def test_local_analysis(self): """ Test to run uncorrelated and then correlated local_analysis and make sure the results are expected