diff --git a/.gitignore b/.gitignore index 9728749..4c2f2aa 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,4 @@ ingestion_res/* scoring_res/* dev_phase/* +*.pth diff --git a/competition.yaml b/competition.yaml index 3a1e4de..44a4a74 100755 --- a/competition.yaml +++ b/competition.yaml @@ -1,8 +1,26 @@ version: 2 -title: Templat competition - Dummy classification -description: Dummy classification task +title: "Autoregressive Forecasting of the S&P 500 Index" +description: > + Can you predict whether the S&P 500 will close up or down — using only what you know by mid-morning? + + Each trading day, participants receive a feature vector built from: + - Intraday morning signals: the day's open price and early price action + (e.g. open-to-first-hour return, morning high/low range, opening gap vs previous close). + - Historical context: past N days of daily OHLCV data, log-returns, and + rolling statistics (volatility, momentum) up to and including the previous close. + + The target label is binary: **1** if the day's close is strictly above the previous close, + **0** otherwise. No look-ahead is permitted — only information available before noon (ET) + may be used as features for the current day. + + Participants submit a scikit-learn–compatible model via a `submission.py` file + exposing a `get_model()` function. The model is trained server-side on historical + data and evaluated on a held-out test window using **directional accuracy** + (fraction of days where the predicted direction matches the actual close direction). + + This is a DataCamp challenge organised at École Polytechnique (INF554 / MAP583). 
image: logo.png -registration_auto_approve: False # if True, do not require approval from admin to join the comp +registration_auto_approve: False # set to True to skip manual approval terms: pages/terms.md pages: @@ -15,8 +33,15 @@ pages: tasks: - index: 0 - name: Developement Task - description: 'Tune models with training data, test against examples contained in public test data' + name: Development Task + description: > + Same-day close direction forecasting of the S&P 500 using morning information. + Each sample consists of: (i) intraday morning features for the current trading day + (opening gap, open price, early price action) and (ii) historical daily features + from the past N sessions (log-returns, OHLCV, rolling volatility, momentum). + The label is 1 if today's close > previous close, 0 otherwise. + No information after the morning window may be used; models are scored on + directional accuracy over a public held-out test window. input_data: dev_phase/input_data/ reference_data: dev_phase/reference_data/ ingestion_program: ingestion_program/ @@ -25,13 +50,15 @@ tasks: solutions: - index: 0 tasks: - - 0 + - 0 path: solution/ - phases: - name: Development Phase - description: 'Development phase: tune your models.' + description: > + Tune and validate your autoregressive model using the provided historical + S&P 500 training data. Your predictions are scored against a public test set + so you can iterate quickly. Unlimited submissions are allowed in this phase. 
start: 10-07-2025 end: 03-31-2026 tasks: @@ -41,20 +68,20 @@ leaderboards: - title: Results key: main columns: - - title: Test Accuracy + - title: Directional Accuracy (public test) key: test index: 0 - sorting: asc - - title: Private Test Accuracy + sorting: desc # higher is better + - title: Directional Accuracy (private test) key: private_test index: 1 - sorting: asc - hidden: True - - title: Train time + sorting: desc + hidden: True # revealed only after the phase ends + - title: Train Time (s) key: train_time index: 2 - sorting: desc - - title: Test time + sorting: asc # lower is better + - title: Predict Time (s) key: test_time index: 3 - sorting: desc + sorting: asc diff --git a/ingestion_program/ingestion.py b/ingestion_program/ingestion.py index f150b05..3c3143f 100755 --- a/ingestion_program/ingestion.py +++ b/ingestion_program/ingestion.py @@ -3,53 +3,156 @@ import time from pathlib import Path +import numpy as np import pandas as pd +import torch +# Number of past trading days fed as a sequence to the model. +# Must be consistent between training and inference. +WINDOW_SIZE = 50 EVAL_SETS = ["test", "private_test"] -def evaluate_model(model, X_test): - - y_pred = model.predict(X_test) - return pd.DataFrame(y_pred) +class SP500Dataset(torch.utils.data.Dataset): + """PyTorch Dataset for the S&P 500 direction-forecasting challenge. + + Each sample is a sliding window of shape (WINDOW_SIZE, n_features) + ending at day `idx`. The target is the binary label of that last day + (1 = close > prev_close, 0 otherwise). + + For the first WINDOW_SIZE-1 days, the window is left-padded with zeros. + + Parameters + ---------- + features_path : Path + Path to the features CSV (columns = feature names, rows = trading days + in chronological order). + labels_path : Path or None + Path to the labels CSV (single column, same row order as features). + Pass None for test sets where labels are withheld. 
+ window_size : int + Number of past days (inclusive of the current day) in each sequence. + """ + + def __init__( + self, features_path, labels_path=None, window_size=WINDOW_SIZE + ): + self.window_size = window_size + # index_col=0: the first column is the row index saved by setup_data.py, + # not a feature — must be excluded from the data arrays. + self.X = pd.read_csv(features_path, index_col=0).values.astype( + np.float32 + ) + self.n_features = self.X.shape[1] + if labels_path is not None: + self.y = ( + pd.read_csv(labels_path, index_col=0) + .values.astype(np.float32) + .ravel() + ) + else: + self.y = None # test mode — labels are unknown + + def __len__(self): + return len(self.X) + + def __getitem__(self, idx): + """Return (window, label) where window has shape (window_size, n_features). + + The label is the binary target for day `idx` (the last day of the window). + During test mode (no labels), only the window tensor is returned. + """ + window_start = max(0, idx - self.window_size + 1) + window = self.X[window_start : idx + 1] # (<=window_size, n_features) + + # Left-pad with zeros if we are at the beginning of the series + if len(window) < self.window_size: + padding = np.zeros( + (self.window_size - len(window), self.n_features), + dtype=np.float32, + ) + window = np.concatenate([padding, window], axis=0) + + x = torch.tensor( + window, dtype=torch.float32 + ) # (window_size, n_features) + + if self.y is not None: + y = torch.tensor(self.y[idx], dtype=torch.float32) # scalar + return x, y + return x # test mode + + +def get_train_dataset(data_dir): + """Build the training Dataset from separate features and labels CSVs.""" + data_dir = Path(data_dir) + features_path = data_dir / "train" / "train_features.csv" + labels_path = data_dir / "train" / "train_labels.csv" + return SP500Dataset(features_path, labels_path) -def get_train_data(data_dir): +def get_test_dataset(data_dir, eval_set): + """Build a test Dataset (no labels) for a given evaluation 
split.""" data_dir = Path(data_dir) - training_dir = data_dir / "train" - X_train = pd.read_csv(training_dir / "train_features.csv") - y_train = pd.read_csv(training_dir / "train_labels.csv") - return X_train, y_train + features_path = data_dir / eval_set / f"{eval_set}_features.csv" + return SP500Dataset(features_path, labels_path=None) -def main(data_dir, output_dir): - # Here, you can import info from the submission module, to evaluate the - # submission - from submission import get_model +def evaluate_model(model, test_dataset): + """Run inference over a test Dataset and return a DataFrame of probabilities. - X_train, y_train = get_train_data(data_dir) + The model outputs probabilities in [0, 1] (sigmoid already applied). + The scoring program is responsible for applying the decision threshold. + """ + device = next(model.parameters()).device + loader = torch.utils.data.DataLoader( + test_dataset, batch_size=64, shuffle=False + ) + probs = [] + model.eval() + with torch.no_grad(): + for x in loader: + # test_dataset returns bare tensors (no label) — x is already the input + x = x.to(device) + batch_probs = model(x).cpu().numpy().tolist() # floats in [0, 1] + probs.extend(batch_probs) + return pd.DataFrame({"Probability": probs}) - print("Training the model") - model = get_model() +def main(data_dir, output_dir): + from submission import ( + get_model, + ) # imported here so sys.path is set first + data_dir = Path(data_dir) + output_dir = Path(output_dir) + + # ── Training ────────────────────────────────────────────────────────────── + train_dataset = get_train_dataset(data_dir) + train_loader = torch.utils.data.DataLoader( + train_dataset, batch_size=32, shuffle=True + ) + + print("Training the model") start = time.time() - model.fit(X_train, y_train) + model = get_model(train_loader) # participant trains and returns the model train_time = time.time() - start - print("-" * 10) + + # ── Evaluation ──────────────────────────────────────────────────────────── + 
print("=" * 40) print("Evaluate the model") start = time.time() res = {} for eval_set in EVAL_SETS: - X_test = pd.read_csv(data_dir / eval_set / f"{eval_set}_features.csv") - res[eval_set] = evaluate_model(model, X_test) + test_dataset = get_test_dataset(data_dir, eval_set) + res[eval_set] = evaluate_model(model, test_dataset) test_time = time.time() - start - print("-" * 10) - duration = train_time + test_time - print(f"Completed Prediction. Total duration: {duration}") + print( + f"Completed Prediction. Total duration: {train_time + test_time:.1f}s" + ) - # Write output files + # ── Write outputs ───────────────────────────────────────────────────────── output_dir.mkdir(parents=True, exist_ok=True) with open(output_dir / "metadata.json", "w+") as f: json.dump(dict(train_time=train_time, test_time=test_time), f) @@ -70,19 +173,25 @@ def main(data_dir, output_dir): "--data-dir", type=str, default="/app/input_data", - help="", + help="Root folder containing train/, test/, and private_test/ splits. " + "Codabench mounts data at /app/input_data. " + "For local testing pass: --data-dir dev_phase/input_data", ) parser.add_argument( "--output-dir", type=str, default="/app/output", - help="", + help="Folder where prediction CSVs and metadata.json will be written. " + "Codabench expects output at /app/output. " + "For local testing pass: --output-dir ingestion_res", ) parser.add_argument( "--submission-dir", type=str, default="/app/ingested_program", - help="", + help="Directory containing submission.py. " + "Codabench mounts participant code at /app/ingested_program. 
" + "For local testing pass: --submission-dir solution", ) args = parser.parse_args() diff --git a/logo.png b/logo.png index 9616456..5255f04 100644 Binary files a/logo.png and b/logo.png differ diff --git a/pages/participate.md b/pages/participate.md index 4d5427b..c44336d 100755 --- a/pages/participate.md +++ b/pages/participate.md @@ -1,10 +1,55 @@ -# How to participate +# How to Participate -You should submit an untrained model in a python file `model.py` which contains -your `class Model`, which will be imported, trained, and tested on Codalab. +## Objective -See the "Seed" page for the outline of a `Model` class, with the expected -function names. +Build a model that predicts whether the S&P 500 index will **close higher or lower** than the previous day, +using only information available before noon (ET) on the trading day in question. -See the "Timeline" page for additional information about the phases of this -competition +## Input Features + +Each sample in the dataset is a row in a CSV with the following columns (all values are for the **current trading day** or computed from past days only): + +| Column | Description | +|--------|-------------| +| `Open` | Opening price of the current trading day | +| `High` | Intraday high up to the morning window | +| `Low` | Intraday low up to the morning window | +| `Close` | Previous day's closing price | +| `Volume` | Trading volume up to the morning window | + +The ingestion program constructs **sliding windows** of the last 20 trading days for each sample and feeds them to your model as tensors of shape `(batch, 20, n_features)`. + +## Target Label + +- **1** — today's close will be **strictly above** the previous close +- **0** — today's close will be **at or below** the previous close + +## What to Submit + +Submit a single file named **`submission.py`** containing a function: + +```python +def get_model(train_loader): + ... 
+ return model +``` + +`train_loader` is a `torch.utils.data.DataLoader` yielding `(x, y)` batches where: +- `x` has shape `(batch, 20, n_features)` — a sequence of 20 daily feature vectors +- `y` has shape `(batch,)` — binary labels `{0, 1}` + +Your `get_model` function must **train the model** using the provided loader and return a trained `torch.nn.Module` whose `forward(x)` outputs **probabilities in [0, 1]** of shape `(batch,)` — i.e. sigmoid must already be applied. + +See the **Seed** page for a working skeleton to get started. + +## Evaluation Metric + +Submissions are ranked by **ROC-AUC score** on the held-out test set. +A perfect model scores 1.0; random guessing scores ~0.5. + +## Rules + +- Your model may only use information in the provided feature set — no external data sources. +- External Python libraries (e.g. `torch`, `sklearn`, `numpy`) are allowed. +- You may submit as many times as you like during the Development Phase. +- The private test set is only revealed after the phase ends. diff --git a/pages/seed.md b/pages/seed.md index 9b15f6a..0610971 100644 --- a/pages/seed.md +++ b/pages/seed.md @@ -1,21 +1,85 @@ -# Seed: +# Seed — Starter Template +Copy this file as `submission.py` and implement your model inside `get_model`. + +The ingestion program will call `get_model(train_loader)` and expect back a trained +`torch.nn.Module` whose `forward(x)` returns probabilities in **[0, 1]**. + +```python +import torch +import torch.nn as nn + + +def get_model(train_loader): + """ + Train a model on the S&P 500 direction-forecasting task and return it. + + Parameters + ---------- + train_loader : torch.utils.data.DataLoader + Yields (x, y) batches where: + x — FloatTensor of shape (batch, 20, n_features) + A sliding window of the last 20 daily feature vectors. + Features: Open, High, Low, Close, Volume (current and past days). + y — FloatTensor of shape (batch,) + Binary label: 1 if today's close > previous close, else 0. 
+ + Returns + ------- + model : torch.nn.Module + Trained model in eval() mode. + forward(x) must accept shape (batch, 20, n_features) + and return probabilities in [0, 1] of shape (batch,). + The ingestion program applies a 0.5 threshold to produce 0/1 predictions. + """ + + # --- Infer input size from the first batch --- + x_sample, _ = next(iter(train_loader)) + input_size = x_sample.shape[-1] # number of features per timestep + seq_len = x_sample.shape[1] # window size (20) + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + # --- Define your model here --- + # Example: single-layer LSTM + linear head + sigmoid + class MyModel(nn.Module): + def __init__(self): + super().__init__() + self.lstm = nn.LSTM(input_size, hidden_size=64, + num_layers=1, batch_first=True) + self.head = nn.Linear(64, 1) + + def forward(self, x): + out, _ = self.lstm(x) # (batch, seq_len, 64) + last = out[:, -1, :] # (batch, 64) — last timestep + return torch.sigmoid(self.head(last).squeeze(-1)) # (batch,) + + model = MyModel().to(device) + optimizer = torch.optim.Adam(model.parameters(), lr=1e-3) + criterion = nn.BCELoss() # BCELoss because sigmoid is already applied + + # --- Training loop --- + N_EPOCHS = 10 + model.train() + for epoch in range(N_EPOCHS): + total_loss = 0.0 + for x, y in train_loader: + x, y = x.to(device), y.to(device) + optimizer.zero_grad() + probs = model(x) # (batch,) + loss = criterion(probs, y) + loss.backward() + optimizer.step() + total_loss += loss.item() + print(f"Epoch {epoch+1}/{N_EPOCHS} loss={total_loss/len(train_loader):.4f}") + + model.eval() + return model ``` -class Model: - def fit(self, X_train, y_train): - """ - This should handle the logic of training your model - :param X_train: np.array of training data - :param y_train: np.array of the same length as X_train. 
Contains classifications of X_train - """ - pass - - def predict(self, X_test): - """ - This should handle making predictions with a trained model - :param X_test: np.array of testing data - :return: np.array of the same length as X_test containing predictions to each point in X_test - """ - pass - -``` \ No newline at end of file + +## Tips + +- You can replace the LSTM with a GRU (`nn.GRU`), Transformer (`nn.TransformerEncoder`), or any other architecture. +- The window size is fixed at **20** timesteps by the ingestion program. +- Keep training time reasonable — the Codabench environment has limited CPU resources. +- You are free to add dropout, batch normalisation, learning rate schedulers, etc. diff --git a/pages/terms.md b/pages/terms.md index 0d69023..ece5c3b 100755 --- a/pages/terms.md +++ b/pages/terms.md @@ -1,18 +1,73 @@ # Terms and Conditions -## Lorem Ipsum - -Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. -Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure -dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non -proident, sunt in culpa qui officia deserunt mollit anim id est laborum. - -## Sed ut perspiciatis - -Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae -ab illo inventore veritatis et quasi architecto beatae vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit -aspernatur aut odit aut fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. Neque porro quisquam -est, qui dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore -et dolore magnam aliquam quaerat voluptatem. 
Ut enim ad minima veniam, quis nostrum exercitationem ullam corporis suscipit laboriosam, -nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure reprehenderit qui in ea voluptate velit esse quam nihil molestiae -consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla pariatur? +## 1. Eligibility + +This challenge is open to students enrolled in the DataCamp courses at École Polytechnique (INF554 / MAP583). Participation is voluntary and free of charge. + +Participants may compete individually or in teams, subject to the rules specified by the course instructors. +2. Data Usage + +The dataset provided for this challenge (historical S&P 500 daily OHLCV data sourced from public market data) is intended solely for educational and research purposes within the scope of this course. + +Participants agree to: + + Use the data only for the purpose of this challenge. + Not redistribute the data outside the course or upload it to public repositories. + Not attempt to identify, reverse-engineer, or misuse the data beyond its intended scientific context. + +3. Training and Evaluation Restrictions + +Participants must comply with the following rules: + + Only the provided training split may be used for training and validation. + The test and private test splits are reserved for evaluation only and must not be used in the training loop. + Any attempt to directly or indirectly train on evaluation data will result in disqualification. + +4. Submission Rules + +Participants must submit: + + A compressed folder containing the code necessary to train a model (see page seed.md). + Any accompanying code or documentation as specified by the instructors. + +Submissions must be the original work of the participants. +5. Academic Integrity + +Participants are expected to adhere to the École Polytechnique’s academic integrity policies. + +Specifically: + + Plagiarism, including copying code or solutions without proper attribution, is prohibited. 
+ The use of external libraries and pre-trained models is allowed unless otherwise stated, provided their use is clearly documented. + Collaboration between teams is not allowed unless explicitly permitted by the instructors. + +Violations of academic integrity rules may result in penalties, including disqualification or academic sanctions. +6. Intellectual Property + +Participants retain ownership of the code and models they develop as part of this challenge. + +By submitting their results, participants grant the course instructors and the hosting institution a non-exclusive, royalty-free right to: + + Use the submissions for grading and evaluation. + Use anonymized results or visualizations for teaching, presentations, or future course materials. + +7. Liability + +The organizers provide the data and evaluation infrastructure “as is” and make no guarantees regarding accuracy, completeness, or fitness for a particular purpose. + +The organizers are not responsible for: + + Technical issues, data loss, or submission failures. + Any damages or losses arising from participation in the challenge. + +8. Modification and Termination + +The organizers reserve the right to: + + Modify the challenge rules, datasets, or evaluation criteria if necessary. + Terminate the challenge in case of technical issues or unforeseen circumstances. + +Any changes will be communicated to participants in a timely manner. +9. Acceptance of Terms + +By participating in the challenge, participants acknowledge that they have read, understood, and agreed to these Terms and Conditions. diff --git a/pages/timeline.md b/pages/timeline.md index 4e613bf..015ea39 100644 --- a/pages/timeline.md +++ b/pages/timeline.md @@ -1,11 +1,28 @@ # Timeline -## Development phase -This phase should be used to tune your models, testing against a small -set of testing data - -## Final phase -Resubmit your preferred submission from the development phase to test -against a new set of testing data. 
Your model will be trained against -the same set of training data as in the development phase. You may only -make one submission to this phase, so choose wisely. +## Development Phase — October 7, 2025 → March 31, 2026 + +The development phase is open for the full duration of the course. + +- **Training data**: historical S&P 500 daily OHLCV data (roughly 2006–2022). +- **Public test set**: a held-out window of ~250 trading days (~2022–2023). + Your submission is scored against this set after every submission. +- **Submissions**: unlimited — iterate freely and track your progress on the leaderboard. +- **Goal**: tune your model architecture, features, and hyper-parameters to maximise ROC-AUC on the public test set. + +## Private Leaderboard — revealed at end of Development Phase + +Once the development phase closes on **March 31, 2026**, the private test set +(a further ~250 trading days, ~2023–2024) is scored for all submissions. +Final rankings are based on the **private test ROC-AUC**. + +The private test set is completely hidden during the development phase — optimising +solely for the public leaderboard may not generalise. 
+ +## Key Dates + +| Date | Event | +|------|-------| +| October 7, 2025 | Competition opens, development phase begins | +| March 31, 2026 | Development phase closes, no further submissions accepted | +| Early April 2026 | Private leaderboard revealed, final rankings published | diff --git a/raw_data/.gitkeep b/raw_data/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/requirements.txt b/requirements.txt index fda4dd6..41980e6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,11 @@ -pandas -scikit-learn +# ── Core runtime (ingestion + scoring + submission) ─────────────────────────── +numpy==2.2.6 +pandas==2.3.3 +scikit-learn==1.7.2 + +# PyTorch CPU build — participants may swap for a GPU wheel if needed +torch==2.8.0 + +# ── Local development tools ─────────────────────────────────────────────────── +# Required only for tools/run_docker.py (not installed inside the Docker image) +docker diff --git a/scoring_program/scoring.py b/scoring_program/scoring.py index 701f78f..971e3ec 100755 --- a/scoring_program/scoring.py +++ b/scoring_program/scoring.py @@ -2,40 +2,41 @@ from pathlib import Path import pandas as pd +from sklearn.metrics import roc_auc_score EVAL_SETS = ["test", "private_test"] -def compute_accuracy(predictions, targets): - # Make sure there is no NaN, as pandas ignores them in mean computation - predictions = predictions.fillna(-10).values - # Return mean of correct predictions - return (predictions == targets.values).mean() +def compute_roc_auc(predictions, targets): + # Make sure there is no NaN + predictions = predictions.fillna(0.5).values + # Return ROC AUC score + return roc_auc_score(targets, predictions) def main(reference_dir, prediction_dir, output_dir): scores = {} for eval_set in EVAL_SETS: - print(f'Scoring {eval_set}') + print(f"Scoring {eval_set}") predictions = pd.read_csv( - prediction_dir / f'{eval_set}_predictions.csv' - ) - targets = pd.read_csv( - reference_dir / f'{eval_set}_labels.csv' + 
prediction_dir / f"{eval_set}_predictions.csv" ) + targets = pd.read_csv(reference_dir / f"{eval_set}_labels.csv") - scores[eval_set] = float(compute_accuracy(predictions, targets)) + scores[eval_set] = float( + compute_roc_auc(predictions, targets["Target"].values) + ) # Add train and test times in the score - json_durations = (prediction_dir / 'metadata.json').read_text() + json_durations = (prediction_dir / "metadata.json").read_text() durations = json.loads(json_durations) scores.update(**durations) print(scores) # Write output scores output_dir.mkdir(parents=True, exist_ok=True) - (output_dir / 'scores.json').write_text(json.dumps(scores)) + (output_dir / "scores.json").write_text(json.dumps(scores)) if __name__ == "__main__": @@ -68,5 +69,5 @@ def main(reference_dir, prediction_dir, output_dir): main( Path(args.reference_dir), Path(args.prediction_dir), - Path(args.output_dir) + Path(args.output_dir), ) diff --git a/solution/submission.py b/solution/submission.py index a8076b0..436486b 100644 --- a/solution/submission.py +++ b/solution/submission.py @@ -1,7 +1,108 @@ -from sklearn.ensemble import RandomForestClassifier +""" +Reference LSTM baseline for the S&P 500 direction-forecasting challenge. +The ingestion program will call: -# The submission here should simply be a function that returns a model -# compatible with scikit-learn API -def get_model(): - return RandomForestClassifier() + model = get_model(train_loader) + +where `train_loader` is a torch.utils.data.DataLoader that yields +(x, y) batches with: + x : FloatTensor of shape (batch, WINDOW_SIZE, n_features) + y : FloatTensor of shape (batch,) — binary labels (1 = up, 0 = down) + +`get_model` must return a trained torch.nn.Module whose forward pass accepts +a tensor of shape (batch, WINDOW_SIZE, n_features) and returns probabilities +in [0, 1] of shape (batch,). The ingestion program applies a 0.5 threshold. 
+""" + +import torch +import torch.nn as nn + + +# ── Hyper-parameters (feel free to tune) ───────────────────────────────────── +HIDDEN_SIZE = 128 +NUM_LAYERS = 3 +DROPOUT = 0.1 +N_EPOCHS = 3 +LEARNING_RATE = 1e-4 +# ───────────────────────────────────────────────────────────────────────────── + + +class LSTMClassifier(nn.Module): + """Sequence-to-one LSTM for binary direction prediction. + + Takes a window of shape (batch, seq_len, input_size) and returns + a scalar logit per sample (shape: (batch,)). + + Architecture + ------------ + LSTM (num_layers, hidden_size, dropout) → hidden state of last timestep + → Linear(hidden_size → 1) → squeeze → Sigmoid → probability in [0, 1] + """ + + def __init__( + self, + input_size: int, + hidden_size: int = HIDDEN_SIZE, + num_layers: int = NUM_LAYERS, + dropout: float = DROPOUT, + ): + super().__init__() + self.lstm = nn.LSTM( + input_size=input_size, + hidden_size=hidden_size, + num_layers=num_layers, + batch_first=True, + dropout=dropout if num_layers > 1 else 0.0, + ) + self.head = nn.Linear(hidden_size, 1) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + # x: (batch, seq_len, input_size) + out, _ = self.lstm(x) # (batch, seq_len, hidden_size) + last = out[:, -1, :] # (batch, hidden_size) — last timestep + logit = self.head(last).squeeze(-1) # (batch,) + return torch.sigmoid(logit) # (batch,) — probability in [0, 1] + + +def get_model(train_loader: torch.utils.data.DataLoader) -> nn.Module: + """Train an LSTM on the provided DataLoader and return the trained model. + + Parameters + ---------- + train_loader : DataLoader + Yields (x, y) batches where x has shape (batch, WINDOW_SIZE, n_features) + and y has shape (batch,) with values in {0, 1}. + + Returns + ------- + model : nn.Module (in eval mode) + Trained LSTMClassifier whose forward pass returns probabilities in [0, 1]. 
+ """ + # Infer input size from the first batch + x_sample, _ = next(iter(train_loader)) + input_size = x_sample.shape[-1] # n_features + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + print(f"Training on: {device}") + + model = LSTMClassifier(input_size=input_size).to(device) + optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE) + criterion = nn.BCELoss() # model already applies sigmoid + + model.train() + for epoch in range(N_EPOCHS): + total_loss = 0.0 + for x, y in train_loader: + x, y = x.to(device), y.to(device) + optimizer.zero_grad() + probs = model(x) # (batch,) — probabilities in [0, 1] + loss = criterion(probs, y) + loss.backward() + optimizer.step() + total_loss += loss.item() + avg_loss = total_loss / len(train_loader) + print(f" Epoch {epoch + 1:>2}/{N_EPOCHS} loss={avg_loss:.4f}") + + model.eval() + return model diff --git a/tools/Dockerfile b/tools/Dockerfile index 8cb1eca..e3fa61a 100644 --- a/tools/Dockerfile +++ b/tools/Dockerfile @@ -1,20 +1,45 @@ -# Step 1: Start from an official Docker image with desired base environment -# Good starting points are the official codalab images or -# pytorch images with CUDA support: -# - Codalab: codalab/codalab-legacy:py39 -# - Codalab GPU: codalab/codalab-legacy:gpu310 -# - Pytorch: pytorch/pytorch:2.8.0-cuda12.6-cudnn9-runtime -FROM codalab/codalab-legacy:py39 +# ───────────────────────────────────────────────────────────────────────────── +# Codabench Docker image — S&P 500 Autoregressive Forecasting Challenge +# +# Codabench mounts the following directories at runtime: +# /app/input_data ← input splits (train/, test/, private_test/) +# /app/ingested_program ← participant's submission.py +# /app/ingestion_program ← ingestion.py (organiser code) +# /app/scoring_program ← scoring.py (organiser code) +# /app/output ← ingestion writes predictions here +# /app/input/ref ← reference labels (scoring) +# /app/input/res ← predictions to score (scoring) +# +# Build from the 
PROJECT ROOT, not from tools/: +# docker build -t sp500-challenge -f tools/Dockerfile . +# +# The build context must be the project root so that requirements.txt is +# accessible via COPY. +# ───────────────────────────────────────────────────────────────────────────── -# Set environment variables to prevent interactive prompts -ENV DEBIAN_FRONTEND=noninteractive +FROM python:3.11-slim -# Step 2: Install system-level dependencies (if any) -# e.g., git, wget, or common libraries for OpenCV like libgl1 -RUN pip install -U pip +ENV DEBIAN_FRONTEND=noninteractive \ + PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 -# Step 3: Copy and pre-install all Python dependencies -# This 'requirements.txt' file should list pandas, scikit-learn, timm, etc. -# Place it in the same directory as this Dockerfile. +# Minimal system deps: gcc is needed to compile some numpy/pandas C extensions +RUN apt-get update && apt-get install -y --no-install-recommends \ + gcc \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +# Pre-install Python dependencies. +# The 'docker' package is a local dev tool — skip it inside the container. +# torch is installed from the official CPU wheel index to avoid pulling the +# full CUDA build (~2 GB saved). 
COPY requirements.txt /tmp/requirements.txt -RUN pip install --no-cache-dir -r /tmp/requirements.txt +RUN pip install --no-cache-dir --upgrade pip \ + && grep -v "^docker" /tmp/requirements.txt \ + | grep -v "^#" \ + | grep -v "^$" \ + | grep -v "torch" \ + | pip install --no-cache-dir -r /dev/stdin \ + && pip install --no-cache-dir \ + torch==2.8.0 --index-url https://download.pytorch.org/whl/cpu diff --git a/tools/setup_data.py b/tools/setup_data.py index 5bdc3a9..125549b 100644 --- a/tools/setup_data.py +++ b/tools/setup_data.py @@ -1,53 +1,72 @@ -# Script to download the data from a given source and create the splits -# This is a mock version that generate fake problems +# Script to load the S&P500 data and create the splits for the benchmark from pathlib import Path -import numpy as np import pandas as pd -from sklearn.datasets import make_classification -from sklearn.model_selection import train_test_split -PHASE = 'dev_phase' +PHASE = "dev_phase" -DATA_DIR = Path(PHASE) / 'input_data' -REF_DIR = Path(PHASE) / 'reference_data' +DATA_DIR = Path(PHASE) / "input_data" +REF_DIR = Path(PHASE) / "reference_data" + +RAW_DATA_PATH = Path("raw_data") / "sp500_raw.csv" +TARGET_COL = "Target" def make_csv(data, filepath): filepath.parent.mkdir(parents=True, exist_ok=True) - pd.DataFrame(data).to_csv(filepath, index=False) + data.to_csv(filepath, index=True) # integer row index saved as first column if __name__ == "__main__": - import argparse - parser = argparse.ArgumentParser( - description='Load or generate data for the benchmark' - ) - parser.add_argument('--seed', type=int, default=42, - help='Random seed for data generation') - args = parser.parse_args() - - # Generate and split the data - rng = np.random.RandomState(args.seed) - X, y = make_classification(n_samples=500, n_features=5, random_state=rng) - X_train, X_test, y_train, y_test = train_test_split( - X, y, test_size=0.4, random_state=rng - ) - X_test, X_private_test, y_test, y_private_test = 
train_test_split( - X_test, y_test, test_size=0.5, random_state=rng - ) + # Load the S&P500 data + print(f"Loading data from {RAW_DATA_PATH}") + df = pd.read_csv(RAW_DATA_PATH) + + # Separate features and target; drop Date (not a model input) + y = df[TARGET_COL] + X = df.drop(columns=[TARGET_COL, "Date"]).reset_index(drop=True) + + n = len(df) + train_end = int(n * 0.6) + test_end = int(n * 0.8) + + # Split chronologically: 60% train, 20% test, 20% private_test + X_train, y_train = X.iloc[:train_end], y.iloc[:train_end] + X_test, y_test = X.iloc[train_end:test_end], y.iloc[train_end:test_end] + X_private_test, y_private_test = X.iloc[test_end:], y.iloc[test_end:] + + print(f"Dataset shape: {df.shape}") + print(f"Features: {X.shape[1]}, Samples: {n}") + print(f"Target distribution:\n{y.value_counts()}") # Store the data in the correct folders: # - input_data contains train data (both features and labels) and only # test features so the test labels are kept secret # - reference_data contains the test labels for scoring for split, X_split, y_split in [ - ('train', X_train, y_train), - ('test', X_test, y_test), - ('private_test', X_private_test, y_private_test), + ("train", X_train, y_train), + ("test", X_test, y_test), + ("private_test", X_private_test, y_private_test), ]: split_dir = DATA_DIR / split - make_csv(X_split, split_dir / f'{split}_features.csv') + make_csv(X_split, split_dir / f"{split}_features.csv") label_dir = split_dir if split == "train" else REF_DIR - make_csv(y_split, label_dir / f'{split}_labels.csv') \ No newline at end of file + make_csv( + pd.DataFrame({TARGET_COL: y_split}), + label_dir / f"{split}_labels.csv", + ) + + print("\nData splits created successfully!") + print( + f"{'Split':<15} {'Samples':<10} {'Index start':<15} {'Index end':<15}" + ) + print("-" * 55) + for split, X_split in [ + ("train", X_train), + ("test", X_test), + ("private_test", X_private_test), + ]: + print( + f"{split:<15} {len(X_split):<10} {X_split.index[0]:<15} 
{X_split.index[-1]:<15}" + )