x-datascience-datacamp · OsirisYetna · Mar 4, 2026 · Mar 6, 2026
diff --git a/README.md b/README.md
@@ -7,7 +7,7 @@ public and private test set.
 
 ## Structure of the bundle
 
-- `competition.yaml`: configuration file for the codabench competition,
+- `competition.yaml`:  configuration file for the codabench competition,
   specifying phases, tasks, and evaluation metrics.
 - `ingestion_program/`: contains the ingestion program that will be run on
   participant's submissions. It is responsible for loading the code from the

diff --git a/competition.yaml b/competition.yaml
@@ -1,6 +1,6 @@
 version: 2
-title: Templat competition - Dummy classification
-description: Dummy classification task
+title: ChemSmiles classification
+description: 'Predict the inhibition of BACE-1 from SMILES'
 image: logo.png
 registration_auto_approve: False  # if True, do not require approval from admin to join the comp
 
@@ -16,7 +16,7 @@ pages:
 tasks:
   - index: 0
     name: Developement Task
-    description: 'Tune models with training data, test against examples contained in public test data'
+    description: 'Tune models with training data'
     input_data: dev_phase/input_data/
     reference_data: dev_phase/reference_data/
     ingestion_program: ingestion_program/
@@ -32,8 +32,8 @@ solutions:
 phases:
   - name: Development Phase
     description: 'Development phase: tune your models.'
-    start: 10-07-2025
-    end: 03-31-2026
+    start: 01-04-2026
+    end: 01-07-2026
     tasks:
       - 0
 
@@ -45,16 +45,11 @@ leaderboards:
         key: test
         index: 0
         sorting: asc
-      - title: Private Test Accuracy
-        key: private_test
-        index: 1
-        sorting: asc
-        hidden: True
       - title: Train time
         key: train_time
-        index: 2
+        index: 1
         sorting: desc
       - title: Test time
         key: test_time
-        index: 3
+        index: 2
         sorting: desc
diff --git a/data_extraction.py b/data_extraction.py
@@ -0,0 +1,61 @@
+import pandas as pd
+from rdkit import Chem # type: ignore
+from rdkit.Chem import Descriptors # type: ignore
+
+def property_split(df, smiles_col='SMILES', label_col='Answer', property_type='MW', train_frac=0.8):
+    """
+    Split df into (train_df, test_df) by sorting rows on a molecular property computed from smiles_col; label_col is accepted but not used.
+    """
+    df_split = df.copy()
+
+    def calculate_property(smiles):
+        mol = Chem.MolFromSmiles(str(smiles))
+        if mol is None: return None  # no molecule for this SMILES; the row is removed by dropna below
+        return Descriptors.MolWt(mol) if property_type == 'MW' else Descriptors.MolLogP(mol)  # any non-'MW' value selects MolLogP
+
+    df_split[property_type] = df_split[smiles_col].apply(calculate_property)
+    df_split = df_split.dropna(subset=[property_type])
+
+    df_split = df_split.sort_values(by=property_type, ascending=True).reset_index(drop=True)
+    split_index = int(len(df_split) * train_frac)  # rows before this index (lowest property values) form the train set
+
+    train_df = df_split.iloc[:split_index].copy()
+    test_df = df_split.iloc[split_index:].copy()
+
+    return train_df, test_df
+
+
+def clean_and_format_dataset(df):  # keep only the molecule and answer columns, renamed to SMILES / Label
+    df_clean = df[['TargetMolecule', 'Answer']].copy()
+
+    # Rename to a standard format
+    df_clean = df_clean.rename(columns={
+        'TargetMolecule': 'SMILES',
+        'Answer': 'Label'
+    })
+
+    # Convert the text labels to numbers (No -> 0, Yes -> 1)
+    df_clean['Label'] = df_clean['Label'].replace({
+        '<boolean>No</boolean>': 0, 
+        '<boolean>Yes</boolean>': 1
+    })
+
+    # Drop incomplete rows (duplicate molecules are NOT removed here)
+    df_clean = df_clean.dropna(subset=['SMILES', 'Label'])
+
+    return df_clean
+
+def main():  # build train.csv / test.csv from the BACE-V-SMILES-0 parquet file using a molecular-weight split
+    df = pd.read_parquet("hf://datasets/molvision/BACE-V-SMILES-0/data/train-00000-of-00001.parquet")
+    df = df.drop_duplicates(subset=['TargetMolecule']).reset_index(drop=True)  # one row per molecule
+
+    train_df, test_df = property_split(df, smiles_col='TargetMolecule', label_col='Answer', property_type='MW')
+
+    train_df_clean = clean_and_format_dataset(train_df)
+    test_df_clean = clean_and_format_dataset(test_df)
+
+    train_df_clean.to_csv("train.csv")  # NOTE(review): index=False is not passed, so the row index is presumably written as an extra column - confirm intended
+    test_df_clean.to_csv("test.csv")
+
+if __name__ == "__main__":
+    main()
diff --git a/ingestion_program/ingestion.py b/ingestion_program/ingestion.py
@@ -6,20 +6,20 @@
 import pandas as pd
 
 
-EVAL_SETS = ["test", "private_test"]
+EVAL_SETS = ["test"]
 
 
 def evaluate_model(model, X_test):
-
     y_pred = model.predict(X_test)
     return pd.DataFrame(y_pred)
 
 
 def get_train_data(data_dir):
     data_dir = Path(data_dir)
-    training_dir = data_dir / "train"
-    X_train = pd.read_csv(training_dir / "train_features.csv")
-    y_train = pd.read_csv(training_dir / "train_labels.csv")
+    training_dir = data_dir  # NOTE(review): train.csv is read from data_dir itself, while eval sets are read from data_dir/<set>/<set>.csv - confirm the layout
+    df = pd.read_csv(training_dir / "train.csv")
+    X_train = df["SMILES"]  # features are the SMILES column only
+    y_train = df["Label"]
     return X_train, y_train
 
 
@@ -42,8 +42,9 @@ def main(data_dir, output_dir):
     start = time.time()
     res = {}
     for eval_set in EVAL_SETS:
-        X_test = pd.read_csv(data_dir / eval_set / f"{eval_set}_features.csv")
-        res[eval_set] = evaluate_model(model, X_test)
+        X_test = pd.read_csv(data_dir / eval_set / f"{eval_set}.csv")
+        X_smiles = X_test["SMILES"]
+        res[eval_set] = evaluate_model(model, X_smiles)
     test_time = time.time() - start
     print("-" * 10)
     duration = train_time + test_time

diff --git a/logo.png b/logo.png
diff --git a/requirements.txt b/requirements.txt
@@ -1,2 +1,13 @@
 pandas
 scikit-learn
+rdkit
+transformers
+torch
+torchvision
+tokenizers
+numpy
+matplotlib
+seaborn
+jupyterlab
+notebook
+ipywidgets
diff --git a/scoring_program/scoring.py b/scoring_program/scoring.py
@@ -1,16 +1,16 @@
 import json
 from pathlib import Path
-
+from sklearn.metrics import cohen_kappa_score
 import pandas as pd
 
-EVAL_SETS = ["test", "private_test"]
+EVAL_SETS = ["test"]
 
 
-def compute_accuracy(predictions, targets):
+def compute_kappa(predictions, targets):  # Cohen's kappa between the predictions and the reference labels
     # Make sure there is no NaN, as pandas ignores them in mean computation
     predictions = predictions.fillna(-10).values
     # Return mean of correct predictions
-    return (predictions == targets.values).mean()
+    return cohen_kappa_score(predictions, targets)  # returns kappa, not accuracy; NaN predictions were replaced by -10 above
 
 
 def main(reference_dir, prediction_dir, output_dir):
@@ -22,10 +22,10 @@ def main(reference_dir, prediction_dir, output_dir):
             prediction_dir / f'{eval_set}_predictions.csv'
         )
         targets = pd.read_csv(
-            reference_dir / f'{eval_set}_labels.csv'
+            reference_dir / f'{eval_set}.csv'
         )
-
-        scores[eval_set] = float(compute_accuracy(predictions, targets))
+        y_true = targets["Label"]
+        scores[eval_set] = float(compute_kappa(predictions, y_true))
 
     # Add train and test times in the score
     json_durations = (prediction_dir / 'metadata.json').read_text()

diff --git a/template_starting_kit.ipynb b/template_starting_kit.ipynb