diff --git a/.gitignore b/.gitignore index 9728749..4c2f2aa 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,4 @@ ingestion_res/* scoring_res/* dev_phase/* +*.pth diff --git a/competition.yaml b/competition.yaml index 3a1e4de..44a4a74 100755 --- a/competition.yaml +++ b/competition.yaml @@ -1,8 +1,26 @@ version: 2 -title: Templat competition - Dummy classification -description: Dummy classification task +title: "Autoregressive Forecasting of the S&P 500 Index" +description: > + Can you predict whether the S&P 500 will close up or down — using only what you know by mid-morning? + + Each trading day, participants receive a feature vector built from: + - Intraday morning signals: the day's open price and early price action + (e.g. open-to-first-hour return, morning high/low range, opening gap vs previous close). + - Historical context: past N days of daily OHLCV data, log-returns, and + rolling statistics (volatility, momentum) up to and including the previous close. + + The target label is binary: **1** if the day's close is strictly above the previous close, + **0** otherwise. No look-ahead is permitted — only information available before noon (ET) + may be used as features for the current day. + + Participants submit a PyTorch model via a `submission.py` file + exposing a `get_model(train_loader)` function. The model is trained server-side on historical + data and evaluated on a held-out test window using **ROC AUC** + (area under the ROC curve of the predicted up-probabilities). + + This is a DataCamp challenge organised at École Polytechnique (INF554 / MAP583). 
image: logo.png -registration_auto_approve: False # if True, do not require approval from admin to join the comp +registration_auto_approve: False # set to True to skip manual approval terms: pages/terms.md pages: @@ -15,8 +33,15 @@ pages: tasks: - index: 0 - name: Developement Task - description: 'Tune models with training data, test against examples contained in public test data' + name: Development Task + description: > + Same-day close direction forecasting of the S&P 500 using morning information. + Each sample consists of: (i) intraday morning features for the current trading day + (opening gap, open price, early price action) and (ii) historical daily features + from the past N sessions (log-returns, OHLCV, rolling volatility, momentum). + The label is 1 if today's close > previous close, 0 otherwise. + No information after the morning window may be used; models are scored on + ROC AUC over a public held-out test window. input_data: dev_phase/input_data/ reference_data: dev_phase/reference_data/ ingestion_program: ingestion_program/ @@ -25,13 +50,15 @@ tasks: solutions: - index: 0 tasks: - - 0 + - 0 path: solution/ - phases: - name: Development Phase - description: 'Development phase: tune your models.' + description: > + Tune and validate your autoregressive model using the provided historical + S&P 500 training data. Your predictions are scored against a public test set + so you can iterate quickly. Unlimited submissions are allowed in this phase. 
import time
from pathlib import Path

import numpy as np
import pandas as pd
import torch

# Number of past trading days fed as a sequence to the model.
# Must be consistent between training and inference.
WINDOW_SIZE = 50

EVAL_SETS = ["test", "private_test"]


class SP500Dataset(torch.utils.data.Dataset):
    """PyTorch Dataset for the S&P 500 direction-forecasting challenge.

    Each sample is a sliding window of shape (WINDOW_SIZE, n_features)
    ending at day `idx`. The target is the binary label of that last day
    (1 = close > prev_close, 0 otherwise).

    For the first WINDOW_SIZE-1 days, the window is left-padded with zeros.

    Parameters
    ----------
    features_path : Path
        Path to the features CSV (columns = feature names, rows = trading days
        in chronological order).
    labels_path : Path or None
        Path to the labels CSV (single column, same row order as features).
        Pass None for test sets where labels are withheld.
    window_size : int
        Number of past days (inclusive of the current day) in each sequence.
    """

    def __init__(self, features_path, labels_path=None, window_size=WINDOW_SIZE):
        self.window_size = window_size
        # index_col=0: the first column is the row index saved by setup_data.py,
        # not a feature — must be excluded from the data arrays.
        self.X = pd.read_csv(features_path, index_col=0).values.astype(np.float32)
        self.n_features = self.X.shape[1]
        if labels_path is not None:
            # Labels CSV also carries the saved row index as its first column.
            self.y = (
                pd.read_csv(labels_path, index_col=0)
                .values.astype(np.float32)
                .ravel()
            )
        else:
            self.y = None  # test mode — labels are unknown

    def __len__(self):
        # One sample per trading day (early days get zero-padded windows).
        return len(self.X)

    def __getitem__(self, idx):
        """Return (window, label) where window has shape (window_size, n_features).

        The label is the binary target for day `idx` (the last day of the window).
        During test mode (no labels), only the window tensor is returned.
        """
        window_start = max(0, idx - self.window_size + 1)
        window = self.X[window_start : idx + 1]  # (<=window_size, n_features)

        # Left-pad with zeros if we are at the beginning of the series
        if len(window) < self.window_size:
            padding = np.zeros(
                (self.window_size - len(window), self.n_features),
                dtype=np.float32,
            )
            window = np.concatenate([padding, window], axis=0)

        x = torch.tensor(window, dtype=torch.float32)  # (window_size, n_features)

        if self.y is not None:
            y = torch.tensor(self.y[idx], dtype=torch.float32)  # scalar
            return x, y
        return x  # test mode


def get_train_dataset(data_dir):
    """Build the training Dataset from separate features and labels CSVs."""
    data_dir = Path(data_dir)
    features_path = data_dir / "train" / "train_features.csv"
    labels_path = data_dir / "train" / "train_labels.csv"
    return SP500Dataset(features_path, labels_path)


def get_test_dataset(data_dir, eval_set):
    """Build a test Dataset (no labels) for a given evaluation split."""
    data_dir = Path(data_dir)
    features_path = data_dir / eval_set / f"{eval_set}_features.csv"
    return SP500Dataset(features_path, labels_path=None)


def evaluate_model(model, test_dataset):
    """Run inference over a test Dataset and return a DataFrame of probabilities.

    The model outputs probabilities in [0, 1] (sigmoid already applied).
    The scoring program computes ROC AUC directly from these probabilities;
    no decision threshold is applied anywhere in the pipeline.
    """
    device = next(model.parameters()).device
    loader = torch.utils.data.DataLoader(
        test_dataset, batch_size=64, shuffle=False
    )
    probs = []
    model.eval()
    with torch.no_grad():
        for x in loader:
            # test_dataset returns bare tensors (no label) — x is already the input
            x = x.to(device)
            batch_probs = model(x).cpu().numpy().tolist()  # floats in [0, 1]
            probs.extend(batch_probs)
    return pd.DataFrame({"Probability": probs})
print("=" * 40) print("Evaluate the model") start = time.time() res = {} for eval_set in EVAL_SETS: - X_test = pd.read_csv(data_dir / eval_set / f"{eval_set}_features.csv") - res[eval_set] = evaluate_model(model, X_test) + test_dataset = get_test_dataset(data_dir, eval_set) + res[eval_set] = evaluate_model(model, test_dataset) test_time = time.time() - start - print("-" * 10) - duration = train_time + test_time - print(f"Completed Prediction. Total duration: {duration}") + print( + f"Completed Prediction. Total duration: {train_time + test_time:.1f}s" + ) - # Write output files + # ── Write outputs ───────────────────────────────────────────────────────── output_dir.mkdir(parents=True, exist_ok=True) with open(output_dir / "metadata.json", "w+") as f: json.dump(dict(train_time=train_time, test_time=test_time), f) @@ -69,19 +172,19 @@ def main(data_dir, output_dir): parser.add_argument( "--data-dir", type=str, - default="/app/input_data", - help="", + default="dev_phase/input_data", + help="Root folder containing train/, test/, and private_test/ splits.", ) parser.add_argument( "--output-dir", type=str, - default="/app/output", - help="", + default="ingestion_res", + help="Folder where prediction CSVs and metadata.json will be written.", ) parser.add_argument( "--submission-dir", type=str, - default="/app/ingested_program", + default="solution", help="", ) diff --git a/logo.png b/logo.png index 9616456..5255f04 100644 Binary files a/logo.png and b/logo.png differ diff --git a/raw_data/.gitkeep b/raw_data/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/requirements.txt b/requirements.txt index fda4dd6..41980e6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,11 @@ -pandas -scikit-learn +# ── Core runtime (ingestion + scoring + submission) ─────────────────────────── +numpy==2.2.6 +pandas==2.3.3 +scikit-learn==1.7.2 + +# PyTorch CPU build — participants may swap for a GPU wheel if needed +torch==2.8.0 + +# ── Local development tools 
from sklearn.metrics import roc_auc_score

EVAL_SETS = ["test", "private_test"]


def compute_roc_auc(predictions, targets):
    """Return the ROC AUC of predicted probabilities against binary labels.

    Parameters
    ----------
    predictions : pd.DataFrame
        Predictions CSV as read from disk. The probabilities are taken from
        the LAST column, so a leading saved row-index column (if present)
        is ignored.
    targets : pd.DataFrame
        Labels CSV as read from disk. The labels are taken from the LAST
        column for the same reason — setup_data.py writes the integer row
        index as the first CSV column.

    Returns
    -------
    float
        ROC AUC in [0, 1]; higher is better.
    """
    # NaN probabilities are mapped to the non-informative value 0.5.
    y_score = predictions.iloc[:, -1].fillna(0.5).to_numpy()
    y_true = targets.iloc[:, -1].to_numpy()
    # Pass 1-D arrays: feeding raw 2-D .values (as before) could include the
    # saved index column as data and trip sklearn's input validation.
    return roc_auc_score(y_true, y_score)


def main(reference_dir, prediction_dir, output_dir):
    """Score both evaluation splits and write scores.json.

    Reads `{split}_predictions.csv` from prediction_dir and
    `{split}_labels.csv` from reference_dir for each split in EVAL_SETS,
    merges in the train/test durations from metadata.json, and writes the
    combined dict to output_dir/scores.json.
    """
    scores = {}
    for eval_set in EVAL_SETS:
        print(f"Scoring {eval_set}")
        predictions = pd.read_csv(prediction_dir / f"{eval_set}_predictions.csv")
        targets = pd.read_csv(reference_dir / f"{eval_set}_labels.csv")

        scores[eval_set] = float(compute_roc_auc(predictions, targets))

    # Add train and test times in the score
    json_durations = (prediction_dir / "metadata.json").read_text()
    durations = json.loads(json_durations)
    scores.update(**durations)
    print(scores)

    # Write output scores
    output_dir.mkdir(parents=True, exist_ok=True)
    (output_dir / "scores.json").write_text(json.dumps(scores))
"""
Reference LSTM baseline for the S&P 500 direction-forecasting challenge.

The ingestion program will call:

    model = get_model(train_loader)

where `train_loader` is a torch.utils.data.DataLoader that yields
(x, y) batches with:

    x : FloatTensor of shape (batch, WINDOW_SIZE, n_features)
    y : FloatTensor of shape (batch,) — binary labels (1 = up, 0 = down)

`get_model` must return a trained torch.nn.Module whose forward pass accepts
a tensor of shape (batch, WINDOW_SIZE, n_features) and returns probabilities
in [0, 1] of shape (batch,). The scoring program computes ROC AUC directly
from these probabilities — no 0.5 decision threshold is applied anywhere in
the pipeline.
"""

import torch
import torch.nn as nn


# ── Hyper-parameters (feel free to tune) ─────────────────────────────────────
HIDDEN_SIZE = 128
NUM_LAYERS = 3
DROPOUT = 0.1
N_EPOCHS = 3
LEARNING_RATE = 1e-4
# ─────────────────────────────────────────────────────────────────────────────


class LSTMClassifier(nn.Module):
    """Sequence-to-one LSTM for binary direction prediction.

    Takes a window of shape (batch, seq_len, input_size) and returns
    a probability per sample (shape: (batch,)).

    Architecture
    ------------
    LSTM (num_layers, hidden_size, dropout) → hidden state of last timestep
    → Linear(hidden_size → 1) → squeeze → Sigmoid → probability in [0, 1]
    """

    def __init__(
        self,
        input_size: int,
        hidden_size: int = HIDDEN_SIZE,
        num_layers: int = NUM_LAYERS,
        dropout: float = DROPOUT,
    ):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            # nn.LSTM warns if dropout is set with a single layer.
            dropout=dropout if num_layers > 1 else 0.0,
        )
        self.head = nn.Linear(hidden_size, 1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (batch, seq_len, input_size)
        out, _ = self.lstm(x)  # (batch, seq_len, hidden_size)
        last = out[:, -1, :]  # (batch, hidden_size) — last timestep
        logit = self.head(last).squeeze(-1)  # (batch,)
        return torch.sigmoid(logit)  # (batch,) — probability in [0, 1]


def get_model(train_loader: torch.utils.data.DataLoader) -> nn.Module:
    """Train an LSTM on the provided DataLoader and return the trained model.

    Parameters
    ----------
    train_loader : DataLoader
        Yields (x, y) batches where x has shape (batch, WINDOW_SIZE, n_features)
        and y has shape (batch,) with values in {0, 1}.

    Returns
    -------
    model : nn.Module (in eval mode)
        Trained LSTMClassifier whose forward pass returns probabilities in [0, 1].
    """
    # Infer input size from the first batch
    x_sample, _ = next(iter(train_loader))
    input_size = x_sample.shape[-1]  # n_features

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Training on: {device}")

    model = LSTMClassifier(input_size=input_size).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
    criterion = nn.BCELoss()  # model already applies sigmoid

    model.train()
    for epoch in range(N_EPOCHS):
        total_loss = 0.0
        for x, y in train_loader:
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            probs = model(x)  # (batch,) — probabilities in [0, 1]
            loss = criterion(probs, y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        avg_loss = total_loss / len(train_loader)
        print(f" Epoch {epoch + 1:>2}/{N_EPOCHS} loss={avg_loss:.4f}")

    model.eval()
    return model
# Script to load the S&P500 data and create the splits for the benchmark
from pathlib import Path

import pandas as pd

PHASE = "dev_phase"

DATA_DIR = Path(PHASE) / "input_data"
REF_DIR = Path(PHASE) / "reference_data"

RAW_DATA_PATH = Path("raw_data") / "sp500_raw.csv"
TARGET_COL = "Target"


def make_csv(data, filepath):
    """Write *data* to *filepath* as CSV, creating parent folders as needed.

    The integer row index is kept (saved as the first CSV column) so that
    downstream readers can restore each row's chronological position with
    index_col=0.
    """
    out_path = Path(filepath)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    data.to_csv(out_path, index=True)


if __name__ == "__main__":
    # Load the raw S&P500 table.
    print(f"Loading data from {RAW_DATA_PATH}")
    df = pd.read_csv(RAW_DATA_PATH)

    # Target column out; Date is metadata, not a model input.
    y = df[TARGET_COL]
    X = df.drop(columns=[TARGET_COL, "Date"]).reset_index(drop=True)

    n = len(df)
    cut_train = int(n * 0.6)
    cut_test = int(n * 0.8)

    # Chronological split: 60% train, 20% public test, 20% private test
    # (insertion order of this dict fixes the processing order below).
    splits = {
        "train": (X.iloc[:cut_train], y.iloc[:cut_train]),
        "test": (X.iloc[cut_train:cut_test], y.iloc[cut_train:cut_test]),
        "private_test": (X.iloc[cut_test:], y.iloc[cut_test:]),
    }

    print(f"Dataset shape: {df.shape}")
    print(f"Features: {X.shape[1]}, Samples: {n}")
    print(f"Target distribution:\n{y.value_counts()}")

    # Store the data in the correct folders:
    # - input_data contains train data (both features and labels) and only
    #   test features so the test labels are kept secret
    # - reference_data contains the test labels for scoring
    for split, (X_split, y_split) in splits.items():
        split_dir = DATA_DIR / split
        make_csv(X_split, split_dir / f"{split}_features.csv")
        label_dir = split_dir if split == "train" else REF_DIR
        make_csv(
            pd.DataFrame({TARGET_COL: y_split}),
            label_dir / f"{split}_labels.csv",
        )

    print("\nData splits created successfully!")
    print(
        f"{'Split':<15} {'Samples':<10} {'Index start':<15} {'Index end':<15}"
    )
    print("-" * 55)
    for split, (X_split, _) in splits.items():
        print(
            f"{split:<15} {len(X_split):<10} {X_split.index[0]:<15} {X_split.index[-1]:<15}"
        )