Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/workflows/publish-pages-doc.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ on:
branches:
- main
- master
workflow_dispatch:

jobs:
deploy:
runs-on: ubuntu-latest
Expand Down
49 changes: 5 additions & 44 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,10 +60,10 @@ To use version 1.0, which may be necessary for retrocompatibility with previousl

## Model builder <a name="helper"></a>

In order to build a new model, AutoPeptideML (v.2.0) introduces a new utility to automatically prepare an experiment configuration file, to i) improve the reproducibility of the pipeline and ii) keep a user-friendly interface despite the much increased flexibility.
In order to build a new model, AutoPeptideML (v.2.0) guides you through the process with a series of interactive prompts.

```bash
autopeptideml prepare-config --config-path <config-path>
autopeptideml build-model
```
This launches an interactive CLI that walks you through:

Expand All @@ -72,7 +72,6 @@ This launches an interactive CLI that walks you through:
- Picking models and representations
- Automatically sampling negatives


You’ll be prompted to answer various questions like:

```
Expand All @@ -86,13 +85,13 @@ You’ll be prompted to answer various questions like:
And so on. The final config is written to:

```
<config-path>.yml
<outputdir>/setup-config.yml
```

This config file allows for easy reproducibility of the results, so that anyone can repeat the training processes. You can check the configuration file and make any changes you deem necessary. Finally, you can build the model by simply running:

```
autopeptideml build-model --outdir <outdir> --config-path <outputdir>/config.yml
autopeptideml build-model --outdir <outputdir> --config-path <outputdir>/setup-config.yml
```

## Prediction <a name="prediction"></a>
Expand Down Expand Up @@ -120,6 +119,7 @@ Installing in a conda environment is recommended. For creating the environment,
```bash
conda create -n autopeptideml python
conda activate autopeptideml
conda install quarto -c conda-forge
```

### 1. Python Package
Expand Down Expand Up @@ -190,45 +190,6 @@ To use PeptideCLM:
pip install smilesPE
```

## Documentation <a name="documentation"></a>

### Configuration file


```yaml
datasets:
main:
feat-fields: # Column with peptide sequence/SMILES
label-field: # Column with labels/ "Assume all entries are positives"
path: # Path to dataset
neg-db:
activities-to-exclude: # List of activities to exclude
- activity-1
- activity-2
...
feat-fields: null # Column with peptide sequence/SMILES (only if using custom database)
path: # Path to custom database or choose: canonical, non-canonical, both
device: # Device for computing representations. Choose: cpu, mps, cuda
direction: # Direction of optimization. Choose: maximize or minimize
metric: # Metric for optimization. mse, mae require direction minimize
models: # List of machine learning algorithms to explore. List:
# knn, svm, rf, gradboost, xgboost, lightgbm
- model-1
- model-2
...
n-trials: # Number of optimization steps. Recommended 100-200
pipeline: to-smiles # Pipeline for preprocessing. Choose: to-smiles, to-sequences
reps: # List of peptide representations to explore. List:
# ecfp, chemberta-2, molformer-xl, peptide-clm, esm2-8m, ...
- rep-1
- rep-2
...

split-strategy: min # Strategy for splitting train/test. Choose: min, random.
task: class # Machine learning type of problem. Choose: class or reg.
n-jobs: # Number of processes to launch. -1 uses all possible CPU cores.
```

### More details about API

Please check the [Code reference documentation](https://ibm.github.io/AutoPeptideML/autopeptideml/)
Expand Down
23 changes: 12 additions & 11 deletions autopeptideml/apml.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ def sample_negatives(
else:
path = target_db
if not self.preprocessed:
self._preprocessing_data(n_jobs=n_jobs)
self._preprocessing_data(n_jobs=n_jobs, verbose=verbose)

self.metadata['status'] = 'sampling-negatives'
self.metadata['negative-sampling-metadata'] = {
Expand Down Expand Up @@ -245,6 +245,10 @@ def build_models(
if task not in ['class', 'reg']:
raise ValueError(f"Task: {task} is not valid.",
"Choose one: `class, reg`")
if task == 'class':
from .utils.dataset_parsing import _is_string_series
if _is_string_series(self.df[self.label_field]):
raise ValueError("The target column contains textual values, please substitute for '1' for the positive class and '0' for the negative.")

if split_strategy == 'good':
raise NotImplementedError(
Expand Down Expand Up @@ -283,6 +287,7 @@ def build_models(
random_state=random_state,
n_jobs=n_jobs,
n_trials=n_trials,
verbose=verbose,
model_configs=model_configs
)
self._evaluating(
Expand All @@ -309,19 +314,13 @@ def represent(
):
repengine = self.repengines[rep]
if rep in PLMs or rep == 'one-hot':
if 'apml-seqs' in self.df:
seqs = self.df['apml-seqs']
else:
seqs = self._use_pipeline(mols, 'to-sequences', n_jobs=n_jobs,
verbose=verbose)
seqs = self._use_pipeline(mols, 'to-sequences', n_jobs=n_jobs,
verbose=verbose)
x = {rep: repengine.compute_reps(seqs, verbose=verbose,
batch_size=32)}
else:
if 'apml-smiles' in self.df:
mols = self.df['apml-smiles']
else:
mols = self._use_pipeline(mols, 'to-smiles', n_jobs=n_jobs,
verbose=verbose)
mols = self._use_pipeline(mols, 'to-smiles', n_jobs=n_jobs,
verbose=verbose)
x = {rep: repengine.compute_reps(mols, verbose=verbose,
batch_size=32)}
return x
Expand All @@ -336,6 +335,7 @@ def _hpo(
n_folds: int,
n_jobs: int,
n_trials: int,
verbose: bool,
random_state: int
):
if task == 'class':
Expand Down Expand Up @@ -372,6 +372,7 @@ def _hpo(
n_trials=n_trials,
patience=n_trials//5,
custom_hpspace=model_configs,
verbose=2 if verbose else 0,
random_state=random_state,
n_jobs=n_jobs,
db_file=osp.join(self.meta_dir, 'database.sql'),
Expand Down
7 changes: 0 additions & 7 deletions autopeptideml/data/h_param_search/xgboost_class.yml
Original file line number Diff line number Diff line change
Expand Up @@ -81,13 +81,6 @@ tree_method:
- approx
- hist

booster:
type: categorical
values:
- gbtree
- dart
- gblinear

grow_policy:
type: categorical
values:
Expand Down
7 changes: 0 additions & 7 deletions autopeptideml/data/h_param_search/xgboost_reg.yml
Original file line number Diff line number Diff line change
Expand Up @@ -81,13 +81,6 @@ tree_method:
- approx
- hist

booster:
type: categorical
values:
- gbtree
- dart
- gblinear

grow_policy:
type: categorical
values:
Expand Down
2 changes: 1 addition & 1 deletion autopeptideml/reps/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,5 @@
'esm2-15b', 'esm2-3b', 'esm2-650m', 'esm1b', 'esm2-150m',
'esm2-35m', 'esm2-8m', 'esmc-600m', 'esmc-300m', 'ankh-base',
'ankh-large']
CLMs = ['molformer-xl', 'chemberta-2', 'peptideclm']
CLMs = ['molformer-xl', 'chemberta-2', 'chemberta-3', 'peptideclm']
FPs = ['ecfp', 'morgan', 'fcfp', 'pepfunn']
16 changes: 12 additions & 4 deletions autopeptideml/reps/lms.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
'ankh-large': 1536,
'MoLFormer-XL-both-10pct': 768,
'ChemBERTa-77M-MLM': 384,
'ChemBERTa-100M-MLM': 768,
'PeptideCLM-23M-all': 768
}

Expand All @@ -53,8 +54,8 @@
'ankh-large': 'ankh-large',
'molformer-xl': 'MoLFormer-XL-both-10pct',
'chemberta-2': 'ChemBERTa-77M-MLM',
'chemberta-3': 'ChemBERTa-100M-MLM',
'peptideclm': 'PeptideCLM-23M-all'

}


Expand Down Expand Up @@ -149,17 +150,22 @@ def max_len(self) -> int:
return 1022
elif self.lab.lower() == 'evolutionaryscale':
return 2046
elif self.lab.lower() == 'deepchem':
return 512
else:
return 2046

def get_num_params(self, human_readable: bool = False) -> 'int | str':
    """
    Returns the total number of parameters in the model.

    :type human_readable: bool
    :param human_readable: If `True`, return the count formatted as a
        string expressed in millions with three decimals (e.g. `'7.840M'`).
        If `False`, return the raw integer count. Default: `False`.
    :rtype: int | str
    :return: The number of parameters in the model, as an integer or,
        when `human_readable` is set, as a formatted string.
    """
    # Count once; both output forms derive from the same total.
    n_params = sum(p.numel() for p in self.model.parameters())
    if not human_readable:
        return n_params
    return f"{n_params/1e6:,.3f}M"

def _load_model(self, model: str):
"""
Expand Down Expand Up @@ -239,7 +245,7 @@ def _load_model(self, model: str):
self.tokenizer = self.model.tokenizer
else:
self.tokenizer = AutoTokenizer.from_pretrained(
f'{self.lab}/{model}', trust_remote_code=True
f'{self.lab}/{model}', trust_remote_code=True,
)

self.dimension = AVAILABLE_MODELS[model]
Expand Down Expand Up @@ -277,6 +283,8 @@ def _rep_batch(
:rtype: List[np.ndarray]
:return: A list of numpy arrays representing the embeddings of each input sequence.
"""
batch = [b if len(b) <= self.max_len() else b[:self.max_len()]
for b in batch]
inputs = self.tokenizer(batch, add_special_tokens=True,
truncation=True,
padding="longest", return_tensors="pt")
Expand Down
11 changes: 11 additions & 0 deletions autopeptideml/utils/dataset_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,3 +52,14 @@ def _read_csv(path: str) -> pd.DataFrame:
return pd.read_csv(path, sep=',')
elif path.endswith(".tsv"):
return pd.read_csv(path, sep='\t')


def _is_string_series(s: pd.Series):
if isinstance(s.dtype, pd.StringDtype):
# The series was explicitly created as a string series (Pandas>=1.0.0)
return True
elif s.dtype == 'object':
# Object series, check each value
return all((v is None) or isinstance(v, str) for v in s)
else:
return False
Loading