IBM · RaulFD-creator · Dec 2, 2025 · Sep 11, 2025 · Oct 2, 2025 · Nov 4, 2025
diff --git a/.github/workflows/publish-pages-doc.yml b/.github/workflows/publish-pages-doc.yml
@@ -5,6 +5,8 @@ on:
     branches:
       - main
       - master
+  workflow_dispatch:
+
 jobs:
   deploy:
     runs-on: ubuntu-latest

diff --git a/README.md b/README.md
@@ -60,10 +60,10 @@ To use version 1.0, which may be necessary for retrocompatibility with previousl
 
 ## Model builder <a name="helper"></a>
 
-In order to build a new model, AutoPeptideML (v.2.0), introduces a new utility to automatically prepare an experiment configuration file, to i) improve the reproducibility of the pipeline and ii) to keep a user-friendly interface despite the much increased flexibility.
+In order to build a new model, AutoPeptideML (v.2.0) guides you through the process with a series of prompts.
 
 ```bash
-autopeptideml prepare-config --config-path <config-path>
+autopeptideml build-model
 ```
 This launches an interactive CLI that walks you through:
 
@@ -72,7 +72,6 @@ This launches an interactive CLI that walks you through:
 - Picking models and representations
 - Automatically sampling negatives
 
-
 You’ll be prompted to answer various questions like:
 
 ```
@@ -86,13 +85,13 @@ You’ll be prompted to answer various questions like:
 And so on. The final config is written to:
 
 ```
-<config-path>.yml
+<outputdir>/setup-config.yml
 ```
 
 This config file makes the results easy to reproduce, so that anyone can repeat the training process. You can check the configuration file and make any changes you deem necessary. Finally, you can build the model by simply running:
 
 ```
-autopeptideml build-model --outdir <outdir> --config-path <outputdir>/config.yml
+autopeptideml build-model --outdir <outdir> --config-path <outputdir>/setup-config.yml
 ```
 
 ## Prediction <a name="prediction"></a>
@@ -120,6 +119,7 @@ Installing in a conda environment is recommended. For creating the environment,
 ```bash
 conda create -n autopeptideml python
 conda activate autopeptideml
+conda install quarto -c conda-forge
 ```
 
 ### 1. Python Package
@@ -190,45 +190,6 @@ To use PeptideCLM:
 pip install smilesPE
 ```
 
-## Documentation <a name="documentation"></a>
-
-### Configuration file
-
-
-```yaml
-datasets:
-  main:
-    feat-fields: # Column with peptide sequence/SMILES
-    label-field: # Column with labels/ "Assume all entries are positives"
-    path: # Path to dataset
-  neg-db:
-    activities-to-exclude: # List of activities to exclude
-      - activity-1
-      - activity-2
-      ...
-    feat-fields: null # Column with peptide sequence/SMILES (only if using custom database)
-    path: # Path to custom database or choose: canonical, non-canonical, both
-device: # Device for computing representations. Choose: cpu, mps, cuda
-direction: # Direction of optimization. Choose: maximize or minimize
-metric: # Metric for optimization. mse, mae require direction minimize
-models: # List of machine learning algorithms to explore. List:
-        # knn, svm, rf, gradboost, xgboost, lightgbm
-  - model-1
-  - model-2
-  ...
-n-trials: # Number of optimization steps. Recommended 100-200
-pipeline: to-smiles # Pipeline for preprocessing. Choose: to-smiles, to-sequences
-reps: # List of peptide representations to explore. List:
-      # ecfp, chemberta-2, molformer-xl, peptide-clm, esm2-8m, ...
-  - rep-1
-  - rep-2
-  ...
-
-split-strategy: min # Strategy for splitting train/test. Choose: min, random. 
-task: class # Machine learning type of problem. Choose: class or reg.
-n-jobs: # Number of processes to launch. -1 uses all possible CPU cores.
-```
-
 ### More details about API
 
 Please check the [Code reference documentation](https://ibm.github.io/AutoPeptideML/autopeptideml/)

diff --git a/autopeptideml/apml.py b/autopeptideml/apml.py
@@ -148,7 +148,7 @@ def sample_negatives(
         else:
             path = target_db
         if not self.preprocessed:
-            self._preprocessing_data(n_jobs=n_jobs)
+            self._preprocessing_data(n_jobs=n_jobs, verbose=verbose)
 
         self.metadata['status'] = 'sampling-negatives'
         self.metadata['negative-sampling-metadata'] = {
@@ -245,6 +245,10 @@ def build_models(
         if task not in ['class', 'reg']:
             raise ValueError(f"Task: {task} is not valid.",
                              "Choose one: `class, reg`")
+        if task == 'class':
+            from .utils.dataset_parsing import _is_string_series
+            if _is_string_series(self.df[self.label_field]):
+                raise ValueError("The target column contains textual values, please substitute for '1' for the positive class and '0' for the negative.")
 
         if split_strategy == 'good':
             raise NotImplementedError(
@@ -283,6 +287,7 @@ def build_models(
             random_state=random_state,
             n_jobs=n_jobs,
             n_trials=n_trials,
+            verbose=verbose,
             model_configs=model_configs
         )
         self._evaluating(
@@ -309,19 +314,13 @@ def represent(
     ):
         repengine = self.repengines[rep]
         if rep in PLMs or rep == 'one-hot':
-            if 'apml-seqs' in self.df:
-                seqs = self.df['apml-seqs']
-            else:
-                seqs = self._use_pipeline(mols, 'to-sequences', n_jobs=n_jobs,
-                                          verbose=verbose)
+            seqs = self._use_pipeline(mols, 'to-sequences', n_jobs=n_jobs,
+                                      verbose=verbose)
             x = {rep: repengine.compute_reps(seqs, verbose=verbose,
                                              batch_size=32)}
         else:
-            if 'apml-smiles' in self.df:
-                mols = self.df['apml-smiles']
-            else:
-                mols = self._use_pipeline(mols, 'to-smiles', n_jobs=n_jobs,
-                                          verbose=verbose)
+            mols = self._use_pipeline(mols, 'to-smiles', n_jobs=n_jobs,
+                                      verbose=verbose)
             x = {rep: repengine.compute_reps(mols, verbose=verbose,
                                              batch_size=32)}
         return x
@@ -336,6 +335,7 @@ def _hpo(
         n_folds: int,
         n_jobs: int,
         n_trials: int,
+        verbose: bool,
         random_state: int
     ):
         if task == 'class':
@@ -372,6 +372,7 @@ def _hpo(
             n_trials=n_trials,
             patience=n_trials//5,
             custom_hpspace=model_configs,
+            verbose=2 if verbose else 0,
             random_state=random_state,
             n_jobs=n_jobs,
             db_file=osp.join(self.meta_dir, 'database.sql'),

diff --git a/autopeptideml/data/h_param_search/xgboost_class.yml b/autopeptideml/data/h_param_search/xgboost_class.yml
@@ -81,13 +81,6 @@ tree_method:
     - approx
     - hist
 
-booster:
-  type: categorical
-  values:
-    - gbtree
-    - dart
-    - gblinear
-
 grow_policy:
   type: categorical
   values:

diff --git a/autopeptideml/data/h_param_search/xgboost_reg.yml b/autopeptideml/data/h_param_search/xgboost_reg.yml
@@ -81,13 +81,6 @@ tree_method:
     - approx
     - hist
 
-booster:
-  type: categorical
-  values:
-    - gbtree
-    - dart
-    - gblinear
-
 grow_policy:
   type: categorical
   values:

diff --git a/autopeptideml/reps/__init__.py b/autopeptideml/reps/__init__.py
@@ -5,5 +5,5 @@
         'esm2-15b', 'esm2-3b', 'esm2-650m', 'esm1b', 'esm2-150m',
         'esm2-35m', 'esm2-8m', 'esmc-600m', 'esmc-300m', 'ankh-base',
         'ankh-large']
-CLMs = ['molformer-xl', 'chemberta-2', 'peptideclm']
+CLMs = ['molformer-xl', 'chemberta-2', 'chemberta-3', 'peptideclm']
 FPs = ['ecfp', 'morgan', 'fcfp', 'pepfunn']
diff --git a/autopeptideml/reps/lms.py b/autopeptideml/reps/lms.py
@@ -32,6 +32,7 @@
     'ankh-large': 1536,
     'MoLFormer-XL-both-10pct': 768,
     'ChemBERTa-77M-MLM': 384,
+    'ChemBERTa-100M-MLM': 768,
     'PeptideCLM-23M-all': 768
 }
 
@@ -53,8 +54,8 @@
     'ankh-large': 'ankh-large',
     'molformer-xl': 'MoLFormer-XL-both-10pct',
     'chemberta-2': 'ChemBERTa-77M-MLM',
+    'chemberta-3': 'ChemBERTa-100M-MLM',
     'peptideclm': 'PeptideCLM-23M-all'
-
 }
 
 
@@ -149,17 +150,22 @@ def max_len(self) -> int:
             return 1022
         elif self.lab.lower() == 'evolutionaryscale':
             return 2046
+        elif self.lab.lower() == 'deepchem':
+            return 512
         else:
             return 2046
 
-    def get_num_params(self) -> int:
+    def get_num_params(self, human_readable: bool = False) -> int:
         """
         Returns the total number of parameters in the model.
 
         :rtype: int
           :return: The number of parameters in the model.
         """
-        return sum(p.numel() for p in self.model.parameters())
+        if not human_readable:
+            return sum(p.numel() for p in self.model.parameters())
+        else:
+            return f"{sum(p.numel() for p in self.model.parameters())/1e6:,.3f}M"
 
     def _load_model(self, model: str):
         """
@@ -239,7 +245,7 @@ def _load_model(self, model: str):
                 self.tokenizer = self.model.tokenizer
             else:
                 self.tokenizer = AutoTokenizer.from_pretrained(
-                    f'{self.lab}/{model}', trust_remote_code=True
+                    f'{self.lab}/{model}', trust_remote_code=True,
                 )
 
         self.dimension = AVAILABLE_MODELS[model]
@@ -277,6 +283,8 @@ def _rep_batch(
         :rtype: List[np.ndarray]
           :return: A list of numpy arrays representing the embeddings of each input sequence.
         """
+        batch = [b if len(b) <= self.max_len() else b[:self.max_len()]
+                 for b in batch]
         inputs = self.tokenizer(batch, add_special_tokens=True,
                                 truncation=True,
                                 padding="longest", return_tensors="pt")

diff --git a/autopeptideml/utils/dataset_parsing.py b/autopeptideml/utils/dataset_parsing.py
@@ -52,3 +52,14 @@ def _read_csv(path: str) -> pd.DataFrame:
         return pd.read_csv(path, sep=',')
     elif path.endswith(".tsv"):
         return pd.read_csv(path, sep='\t')
+
+
+def _is_string_series(s: pd.Series):
+    if isinstance(s.dtype, pd.StringDtype):
+        # The series was explicitly created as a string series (Pandas>=1.0.0)
+        return True
+    elif s.dtype == 'object':
+        # Object series, check each value
+        return all((v is None) or isinstance(v, str) for v in s)
+    else:
+        return False
-Original file line number
+Diff line change
@@ Expand Up / @@ -5,6 +5,8 @@ on: @@
         branches:
           - main
           - master
+      workflow_dispatch:
     jobs:
       deploy:
         runs-on: ubuntu-latest
@@ Expand Down @@