Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ public and private test set.

## Structure of the bundle

- `competition.yaml`: configuration file for the codabench competition,
- `competition.yaml`: configuration file for the codabench competition,
specifying phases, tasks, and evaluation metrics.
- `ingestion_program/`: contains the ingestion program that will be run on
participant's submissions. It is responsible for loading the code from the
Expand Down
19 changes: 7 additions & 12 deletions competition.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
version: 2
title: Templat competition - Dummy classification
description: Dummy classification task
title: ChemSmiles classification
description: 'Predict the inhi
image: logo.png
registration_auto_approve: False # if True, do not require approval from admin to join the comp

Expand All @@ -16,7 +16,7 @@ pages:
tasks:
- index: 0
name: Development Task
description: 'Tune models with training data, test against examples contained in public test data'
description: 'Tune models with training data'
input_data: dev_phase/input_data/
reference_data: dev_phase/reference_data/
ingestion_program: ingestion_program/
Expand All @@ -32,8 +32,8 @@ solutions:
phases:
- name: Development Phase
description: 'Development phase: tune your models.'
start: 10-07-2025
end: 03-31-2026
start: 01-04-2026
end: 01-07-2026
tasks:
- 0

Expand All @@ -45,16 +45,11 @@ leaderboards:
key: test
index: 0
sorting: asc
- title: Private Test Accuracy
key: private_test
index: 1
sorting: asc
hidden: True
- title: Train time
key: train_time
index: 2
index: 1
sorting: desc
- title: Test time
key: test_time
index: 3
index: 2
sorting: desc
61 changes: 61 additions & 0 deletions data_extraction.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import pandas as pd
from rdkit import Chem # type: ignore
from rdkit.Chem import Descriptors # type: ignore

def property_split(df, smiles_col='SMILES', label_col='Answer', property_type='MW', train_frac=0.8):
    """
    Split a dataset into training and test sets along a molecular property.

    Each SMILES string is parsed with RDKit and a descriptor is computed for
    it: ``Descriptors.MolWt`` when ``property_type == 'MW'`` and
    ``Descriptors.MolLogP`` for any other value. Rows whose SMILES cannot be
    parsed are dropped. The remaining rows are sorted by ascending descriptor
    value; the first ``train_frac`` share forms the training set and the rest
    (the highest descriptor values) forms the test set.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is copied, never modified in place.
    smiles_col : str
        Name of the column holding the SMILES strings.
    label_col : str
        Unused by the split; kept so existing callers keep working.
    property_type : str
        Descriptor selector, also used as the name of the added column.
    train_frac : float
        Fraction of the valid rows assigned to the training set, in [0, 1].

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_df, test_df)``, both carrying the extra descriptor column
        and a fresh 0..n-1 index taken from the sorted frame.

    Raises
    ------
    ValueError
        If ``train_frac`` is outside [0, 1].
    """
    # A negative fraction would silently slice from the end of the frame and
    # a fraction above 1 would produce an empty test set, so reject both.
    if not 0.0 <= train_frac <= 1.0:
        raise ValueError(f"train_frac must be between 0 and 1, got {train_frac!r}")

    def calculate_property(smiles):
        mol = Chem.MolFromSmiles(str(smiles))
        if mol is None:
            # Unparseable SMILES: return None so the row is dropped below.
            return None
        if property_type == 'MW':
            return Descriptors.MolWt(mol)
        return Descriptors.MolLogP(mol)

    df_split = df.copy()
    df_split[property_type] = df_split[smiles_col].apply(calculate_property)
    df_split = df_split.dropna(subset=[property_type])

    # Use a stable sort so rows with identical descriptor values keep their
    # input order; this makes the train/test boundary reproducible.
    df_split = df_split.sort_values(
        by=property_type, ascending=True, kind="mergesort"
    ).reset_index(drop=True)
    split_index = int(len(df_split) * train_frac)

    train_df = df_split.iloc[:split_index].copy()
    test_df = df_split.iloc[split_index:].copy()

    return train_df, test_df


def clean_and_format_dataset(df):
    """
    Reduce a raw dataset to a two-column ``SMILES`` / ``Label`` frame.

    The ``TargetMolecule`` and ``Answer`` columns are kept and renamed to
    ``SMILES`` and ``Label``. The textual answers
    ``'<boolean>No</boolean>'`` and ``'<boolean>Yes</boolean>'`` are encoded
    as 0 and 1. Rows with a missing SMILES, a missing label, or a label that
    is not one of those two strings are dropped.

    Duplicate molecules are NOT removed here; callers that need unique
    molecules must deduplicate beforehand.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the ``TargetMolecule`` and ``Answer``
        columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Frame with a string ``SMILES`` column and an integer ``Label``
        column, keeping the original index of the surviving rows.
    """
    label_map = {
        '<boolean>No</boolean>': 0,
        '<boolean>Yes</boolean>': 1,
    }

    # Rename to a standard format.
    df_clean = df[['TargetMolecule', 'Answer']].copy()
    df_clean = df_clean.rename(columns={
        'TargetMolecule': 'SMILES',
        'Answer': 'Label',
    })

    # Convert text labels to numbers. ``map`` turns anything outside the
    # mapping into NaN, so unexpected labels are removed by the dropna below
    # instead of staying in the column as raw strings.
    df_clean['Label'] = df_clean['Label'].map(label_map)

    # Drop incomplete rows, then restore an integer dtype (the NaN
    # placeholders force a float column until they are removed).
    df_clean = df_clean.dropna(subset=['SMILES', 'Label']).astype({'Label': int})

    return df_clean

def main():
    """
    Build the competition CSV files from the source parquet file.

    Reads the parquet file, removes rows with a duplicated
    ``TargetMolecule``, splits the result by molecular weight with
    ``property_split``, cleans both parts with ``clean_and_format_dataset``
    and writes them to ``train.csv`` and ``test.csv`` in the current
    working directory.
    """
    source = "hf://datasets/molvision/BACE-V-SMILES-0/data/train-00000-of-00001.parquet"
    unique_molecules = (
        pd.read_parquet(source)
        .drop_duplicates(subset=['TargetMolecule'])
        .reset_index(drop=True)
    )

    parts = property_split(
        unique_molecules,
        smiles_col='TargetMolecule',
        label_col='Answer',
        property_type='MW',
    )

    # Clean both parts before writing anything, so no file is produced if
    # either cleaning step fails.
    cleaned_parts = [clean_and_format_dataset(part) for part in parts]
    for frame, target in zip(cleaned_parts, ("train.csv", "test.csv")):
        frame.to_csv(target)


if __name__ == "__main__":
    main()
15 changes: 8 additions & 7 deletions ingestion_program/ingestion.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,20 +6,20 @@
import pandas as pd


EVAL_SETS = ["test", "private_test"]
EVAL_SETS = ["test"]


def evaluate_model(model, X_test):
    """
    Run ``model.predict`` on ``X_test`` and wrap the result in a DataFrame.

    Parameters
    ----------
    model : object
        Any object exposing a ``predict`` method.
    X_test : object
        Inputs passed unchanged to ``model.predict``.

    Returns
    -------
    pandas.DataFrame
        The predictions, as built by ``pd.DataFrame`` from the returned value.
    """
    return pd.DataFrame(model.predict(X_test))


def get_train_data(data_dir):
    """
    Load the training inputs and labels from ``train.csv``.

    Parameters
    ----------
    data_dir : str or pathlib.Path
        Directory that directly contains ``train.csv``.

    Returns
    -------
    tuple of pandas.Series
        ``(X_train, y_train)``: the ``SMILES`` column and the ``Label``
        column of the file.
    """
    # Only train.csv is read. The earlier reads of train/train_features.csv
    # and train/train_labels.csv were overwritten straight away and failed
    # whenever those legacy files were missing.
    df = pd.read_csv(Path(data_dir) / "train.csv")
    X_train = df["SMILES"]
    y_train = df["Label"]
    return X_train, y_train


Expand All @@ -42,8 +42,9 @@ def main(data_dir, output_dir):
start = time.time()
res = {}
for eval_set in EVAL_SETS:
X_test = pd.read_csv(data_dir / eval_set / f"{eval_set}_features.csv")
res[eval_set] = evaluate_model(model, X_test)
X_test = pd.read_csv(data_dir / eval_set / f"{eval_set}.csv")
X_smiles = X_test["SMILES"]
res[eval_set] = evaluate_model(model, X_smiles)
test_time = time.time() - start
print("-" * 10)
duration = train_time + test_time
Expand Down
Binary file modified logo.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
11 changes: 11 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,13 @@
pandas
scikit-learn
rdkit
transformers
torch
torchvision
tokenizers
numpy
matplotlib
seaborn
jupyterlab
notebook
ipywidgets
14 changes: 7 additions & 7 deletions scoring_program/scoring.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
import json
from pathlib import Path

from sklearn.metrics import cohen_kappa_score
import pandas as pd

EVAL_SETS = ["test", "private_test"]
EVAL_SETS = ["test"]


def compute_accuracy(predictions, targets):
def compute_kappa(predictions, targets):
# Make sure there is no NaN, as pandas ignores them in mean computation
predictions = predictions.fillna(-10).values
# Return mean of correct predictions
return (predictions == targets.values).mean()
return cohen_kappa_score(predictions, targets)


def main(reference_dir, prediction_dir, output_dir):
Expand All @@ -22,10 +22,10 @@ def main(reference_dir, prediction_dir, output_dir):
prediction_dir / f'{eval_set}_predictions.csv'
)
targets = pd.read_csv(
reference_dir / f'{eval_set}_labels.csv'
reference_dir / f'{eval_set}.csv'
)

scores[eval_set] = float(compute_accuracy(predictions, targets))
y_true = targets["Label"]
scores[eval_set] = float(compute_kappa(predictions, y_true))

# Add train and test times in the score
json_durations = (prediction_dir / 'metadata.json').read_text()
Expand Down
251 changes: 230 additions & 21 deletions template_starting_kit.ipynb

Large diffs are not rendered by default.

Loading