From 6dfc03b35ee58660602b388d6f508d2a1374562a Mon Sep 17 00:00:00 2001
From: Pedro Marim
Date: Sun, 22 Feb 2026 17:29:09 +0100
Subject: [PATCH 1/2] Setup data

---
Review notes (placed after the "---" separator, so they are outside the
commit message and outside the diff):

* The copy of this series under review had its line breaks collapsed.
  Line structure, indentation and the position of blank context lines
  were reconstructed so that they agree with the hunk header
  (-1,53 +1,74) and the diffstat (52 insertions, 31 deletions). Verify
  against the real blobs (5bdc3a9 / edbe29e) before applying.
* NOTE(review): the From: header carries no e-mail address in this copy;
  git am may reject it, plain git apply is not affected.
* The split is positional: X.iloc[:train_end] and friends follow the row
  order of sp500_raw.csv. It is only chronological if that file is
  already sorted by date. TODO confirm, or sort before slicing.
* The summary loop reads X_split["Date"], so the raw CSV needs a "Date"
  column in addition to TARGET_COL. .iloc[0] / .iloc[-1] raise
  IndexError when a split is empty (very small inputs).
* RAW_DATA_PATH is relative, so the script has to be run from the
  directory that contains raw_data/. Patch 2/2 only adds
  raw_data/.gitkeep; sp500_raw.csv itself is not part of this series.
* make_csv() now calls data.to_csv() directly, so it only accepts
  objects that provide to_csv(); the old pd.DataFrame(data) wrapper that
  also accepted arrays is gone.
* The --seed command-line option is removed together with argparse.

 tools/setup_data.py | 83 ++++++++++++++++++++++++++++-----------------
 1 file changed, 52 insertions(+), 31 deletions(-)

diff --git a/tools/setup_data.py b/tools/setup_data.py
index 5bdc3a9..edbe29e 100644
--- a/tools/setup_data.py
+++ b/tools/setup_data.py
@@ -1,53 +1,74 @@
-# Script to download the data from a given source and create the splits
-# This is a mock version that generate fake problems
+# Script to load the S&P500 data and create the splits for the benchmark
 
 from pathlib import Path
 
-import numpy as np
 import pandas as pd
-from sklearn.datasets import make_classification
-from sklearn.model_selection import train_test_split
 
-PHASE = 'dev_phase'
+PHASE = "dev_phase"
 
-DATA_DIR = Path(PHASE) / 'input_data'
-REF_DIR = Path(PHASE) / 'reference_data'
+DATA_DIR = Path(PHASE) / "input_data"
+REF_DIR = Path(PHASE) / "reference_data"
+
+RAW_DATA_PATH = Path("raw_data") / "sp500_raw.csv"
+TARGET_COL = "Target"
 
 
 def make_csv(data, filepath):
     filepath.parent.mkdir(parents=True, exist_ok=True)
-    pd.DataFrame(data).to_csv(filepath, index=False)
+    data.to_csv(filepath, index=False)
 
 
 if __name__ == "__main__":
-    import argparse
-    parser = argparse.ArgumentParser(
-        description='Load or generate data for the benchmark'
-    )
-    parser.add_argument('--seed', type=int, default=42,
-                        help='Random seed for data generation')
-    args = parser.parse_args()
-
-    # Generate and split the data
-    rng = np.random.RandomState(args.seed)
-    X, y = make_classification(n_samples=500, n_features=5, random_state=rng)
-    X_train, X_test, y_train, y_test = train_test_split(
-        X, y, test_size=0.4, random_state=rng
-    )
-    X_test, X_private_test, y_test, y_private_test = train_test_split(
-        X_test, y_test, test_size=0.5, random_state=rng
-    )
+    # Load the S&P500 data
+    print(f"Loading data from {RAW_DATA_PATH}")
+    df = pd.read_csv(RAW_DATA_PATH)
+
+    # Separate features and target
+    y = df[TARGET_COL]
+    X = df.drop(columns=[TARGET_COL])
+
+    n = len(df)
+    train_end = int(n * 0.6)
+    test_end = int(n * 0.8)
+
+    # Split chronologically: 60% train, 20% test, 20% private_test
+    X_train, y_train = X.iloc[:train_end], y.iloc[:train_end]
+    X_test, y_test = X.iloc[train_end:test_end], y.iloc[train_end:test_end]
+    X_private_test, y_private_test = X.iloc[test_end:], y.iloc[test_end:]
+
+    print(f"Dataset shape: {df.shape}")
+    print(f"Features: {X.shape[1]}, Samples: {n}")
+    print(f"Target distribution:\n{y.value_counts()}")
 
     # Store the data in the correct folders:
     # - input_data contains train data (both features and labels) and only
     #   test features so the test labels are kept secret
     # - reference_data contains the test labels for scoring
     for split, X_split, y_split in [
-        ('train', X_train, y_train),
-        ('test', X_test, y_test),
-        ('private_test', X_private_test, y_private_test),
+        ("train", X_train, y_train),
+        ("test", X_test, y_test),
+        ("private_test", X_private_test, y_private_test),
     ]:
         split_dir = DATA_DIR / split
-        make_csv(X_split, split_dir / f'{split}_features.csv')
+        make_csv(X_split, split_dir / f"{split}_features.csv")
         label_dir = split_dir if split == "train" else REF_DIR
-        make_csv(y_split, label_dir / f'{split}_labels.csv')
\ No newline at end of file
+        make_csv(
+            pd.DataFrame({TARGET_COL: y_split}),
+            label_dir / f"{split}_labels.csv",
+        )
+
+    print("\nData splits created successfully!")
+    print(
+        f"{'Split':<15} {'Samples':<10} {'First Date':<15} {'Last Date':<15}"
+    )
+    print("-" * 55)
+    for split, X_split in [
+        ("train", X_train),
+        ("test", X_test),
+        ("private_test", X_private_test),
+    ]:
+        first_date = X_split["Date"].iloc[0]
+        last_date = X_split["Date"].iloc[-1]
+        print(
+            f"{split:<15} {len(X_split):<10} {first_date:<15} {last_date:<15}"
+        )

From eee907b0e9e8bc332a6267713d9221f39756780e Mon Sep 17 00:00:00 2001
From: Pedro Marim
Date: Sun, 22 Feb 2026 17:34:50 +0100
Subject: [PATCH 2/2] Folder for raw data

---
 raw_data/.gitkeep | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 raw_data/.gitkeep

diff --git a/raw_data/.gitkeep b/raw_data/.gitkeep
new file mode 100644
index 0000000..e69de29