diff --git a/.gitignore b/.gitignore index 9728749..4c2f2aa 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,4 @@ ingestion_res/* scoring_res/* dev_phase/* +*.pth diff --git a/competition.yaml b/competition.yaml index 3a1e4de..451fbf8 100755 --- a/competition.yaml +++ b/competition.yaml @@ -1,8 +1,34 @@ version: 2 -title: Templat competition - Dummy classification -description: Dummy classification task +title: "Directional Forecasting of the S&P 500 Index" +# Docker image used by Codabench to run ingestion and scoring. +# Build and push with: +# docker build -t nicolasnoya2001/sp500-challenge:v2 -f tools/Dockerfile . +# docker push nicolasnoya2001/sp500-challenge:v2 +docker_image: nicolasnoya2001/sp500-challenge:v2 +description: > + Can you predict whether the S&P 500 will close UP or DOWN tomorrow? + + Each trading day, participants receive a historical feature vector built from + past daily OHLCV data (Open, High, Low, Close, Volume) of the S&P 500 index. + + The target label is binary: **1** if the next trading day's close is strictly + above the current day's close, **0** otherwise. Participants are encouraged to + engineer their own historical context features (e.g., rolling volatility, moving averages) + using the provided sequential data. + + Participants submit a PyTorch model via a `submission.py` file exposing a `get_model(train_loader)` + function. The ingestion program passes a `DataLoader` yielding `(x, y)` batches where: + - `x` is a `FloatTensor` of shape `(batch, WINDOW_SIZE, n_features)` — a sliding window of historical daily features + - `y` is a `FloatTensor` of shape `(batch,)` — binary labels (1 = up, 0 = down) + + `get_model` must return a trained `torch.nn.Module` whose forward pass accepts a tensor of + shape `(batch, WINDOW_SIZE, n_features)` and returns **probabilities in [0, 1]** of shape `(batch,)`. + + Submissions are ranked by their **ROC-AUC** score computed from the predicted probabilities. 
+ + This is a DataCamp challenge organised at École Polytechnique (INF554 / MAP583). image: logo.png -registration_auto_approve: False # if True, do not require approval from admin to join the comp +registration_auto_approve: False terms: pages/terms.md pages: @@ -15,23 +41,30 @@ pages: tasks: - index: 0 - name: Developement Task - description: 'Tune models with training data, test against examples contained in public test data' + name: Development Task + description: > + Next-day close direction forecasting of the S&P 500 using sliding windows of daily OHLCV data. + Models must be PyTorch modules trained via `get_model(train_loader)` and must output + probabilities (not hard 0s and 1s) to be properly scored via ROC-AUC over a public held-out test window. input_data: dev_phase/input_data/ reference_data: dev_phase/reference_data/ ingestion_program: ingestion_program/ scoring_program: scoring_program/ + public_data: dev_phase/input_data/train + starting_kit: template_starting_kit.ipynb solutions: - index: 0 tasks: - - 0 + - 0 path: solution/ - phases: - name: Development Phase - description: 'Development phase: tune your models.' + description: > + Tune and validate your forecasting model using the provided historical + S&P 500 training data. Your predictions are scored against a public test set + so you can iterate quickly. Unlimited submissions are allowed in this phase. 
start: 10-07-2025 end: 03-31-2026 tasks: @@ -41,20 +74,20 @@ leaderboards: - title: Results key: main columns: - - title: Test Accuracy + - title: ROC-AUC (public test) key: test index: 0 - sorting: asc - - title: Private Test Accuracy + sorting: desc # higher is better + - title: ROC-AUC (private test) key: private_test index: 1 - sorting: asc - hidden: True - - title: Train time + sorting: desc + hidden: True # revealed only after the phase ends + - title: Train Time (s) key: train_time index: 2 - sorting: desc - - title: Test time + sorting: asc # lower is better + - title: Predict Time (s) key: test_time index: 3 - sorting: desc + sorting: asc \ No newline at end of file diff --git a/ingestion_program/ingestion.py b/ingestion_program/ingestion.py index f150b05..3c3143f 100755 --- a/ingestion_program/ingestion.py +++ b/ingestion_program/ingestion.py @@ -3,53 +3,156 @@ import time from pathlib import Path +import numpy as np import pandas as pd +import torch +# Number of past trading days fed as a sequence to the model. +# Must be consistent between training and inference. +WINDOW_SIZE = 50 EVAL_SETS = ["test", "private_test"] -def evaluate_model(model, X_test): - - y_pred = model.predict(X_test) - return pd.DataFrame(y_pred) +class SP500Dataset(torch.utils.data.Dataset): + """PyTorch Dataset for the S&P 500 direction-forecasting challenge. + + Each sample is a sliding window of shape (WINDOW_SIZE, n_features) + ending at day `idx`. The target is the binary label of that last day + (1 = close > prev_close, 0 otherwise). + + For the first WINDOW_SIZE-1 days, the window is left-padded with zeros. + + Parameters + ---------- + features_path : Path + Path to the features CSV (columns = feature names, rows = trading days + in chronological order). + labels_path : Path or None + Path to the labels CSV (single column, same row order as features). + Pass None for test sets where labels are withheld. 
+ window_size : int + Number of past days (inclusive of the current day) in each sequence. + """ + + def __init__( + self, features_path, labels_path=None, window_size=WINDOW_SIZE + ): + self.window_size = window_size + # index_col=0: the first column is the row index saved by setup_data.py, + # not a feature — must be excluded from the data arrays. + self.X = pd.read_csv(features_path, index_col=0).values.astype( + np.float32 + ) + self.n_features = self.X.shape[1] + if labels_path is not None: + self.y = ( + pd.read_csv(labels_path, index_col=0) + .values.astype(np.float32) + .ravel() + ) + else: + self.y = None # test mode — labels are unknown + + def __len__(self): + return len(self.X) + + def __getitem__(self, idx): + """Return (window, label) where window has shape (window_size, n_features). + + The label is the binary target for day `idx` (the last day of the window). + During test mode (no labels), only the window tensor is returned. + """ + window_start = max(0, idx - self.window_size + 1) + window = self.X[window_start : idx + 1] # (<=window_size, n_features) + + # Left-pad with zeros if we are at the beginning of the series + if len(window) < self.window_size: + padding = np.zeros( + (self.window_size - len(window), self.n_features), + dtype=np.float32, + ) + window = np.concatenate([padding, window], axis=0) + + x = torch.tensor( + window, dtype=torch.float32 + ) # (window_size, n_features) + + if self.y is not None: + y = torch.tensor(self.y[idx], dtype=torch.float32) # scalar + return x, y + return x # test mode + + +def get_train_dataset(data_dir): + """Build the training Dataset from separate features and labels CSVs.""" + data_dir = Path(data_dir) + features_path = data_dir / "train" / "train_features.csv" + labels_path = data_dir / "train" / "train_labels.csv" + return SP500Dataset(features_path, labels_path) -def get_train_data(data_dir): +def get_test_dataset(data_dir, eval_set): + """Build a test Dataset (no labels) for a given evaluation 
split.""" data_dir = Path(data_dir) - training_dir = data_dir / "train" - X_train = pd.read_csv(training_dir / "train_features.csv") - y_train = pd.read_csv(training_dir / "train_labels.csv") - return X_train, y_train + features_path = data_dir / eval_set / f"{eval_set}_features.csv" + return SP500Dataset(features_path, labels_path=None) -def main(data_dir, output_dir): - # Here, you can import info from the submission module, to evaluate the - # submission - from submission import get_model +def evaluate_model(model, test_dataset): + """Run inference over a test Dataset and return a DataFrame of probabilities. - X_train, y_train = get_train_data(data_dir) + The model outputs probabilities in [0, 1] (sigmoid already applied). + The scoring program is responsible for applying the decision threshold. + """ + device = next(model.parameters()).device + loader = torch.utils.data.DataLoader( + test_dataset, batch_size=64, shuffle=False + ) + probs = [] + model.eval() + with torch.no_grad(): + for x in loader: + # test_dataset returns bare tensors (no label) — x is already the input + x = x.to(device) + batch_probs = model(x).cpu().numpy().tolist() # floats in [0, 1] + probs.extend(batch_probs) + return pd.DataFrame({"Probability": probs}) - print("Training the model") - model = get_model() +def main(data_dir, output_dir): + from submission import ( + get_model, + ) # imported here so sys.path is set first + data_dir = Path(data_dir) + output_dir = Path(output_dir) + + # ── Training ────────────────────────────────────────────────────────────── + train_dataset = get_train_dataset(data_dir) + train_loader = torch.utils.data.DataLoader( + train_dataset, batch_size=32, shuffle=True + ) + + print("Training the model") start = time.time() - model.fit(X_train, y_train) + model = get_model(train_loader) # participant trains and returns the model train_time = time.time() - start - print("-" * 10) + + # ── Evaluation ──────────────────────────────────────────────────────────── + 
print("=" * 40) print("Evaluate the model") start = time.time() res = {} for eval_set in EVAL_SETS: - X_test = pd.read_csv(data_dir / eval_set / f"{eval_set}_features.csv") - res[eval_set] = evaluate_model(model, X_test) + test_dataset = get_test_dataset(data_dir, eval_set) + res[eval_set] = evaluate_model(model, test_dataset) test_time = time.time() - start - print("-" * 10) - duration = train_time + test_time - print(f"Completed Prediction. Total duration: {duration}") + print( + f"Completed Prediction. Total duration: {train_time + test_time:.1f}s" + ) - # Write output files + # ── Write outputs ───────────────────────────────────────────────────────── output_dir.mkdir(parents=True, exist_ok=True) with open(output_dir / "metadata.json", "w+") as f: json.dump(dict(train_time=train_time, test_time=test_time), f) @@ -70,19 +173,25 @@ def main(data_dir, output_dir): "--data-dir", type=str, default="/app/input_data", - help="", + help="Root folder containing train/, test/, and private_test/ splits. " + "Codabench mounts data at /app/input_data. " + "For local testing pass: --data-dir dev_phase/input_data", ) parser.add_argument( "--output-dir", type=str, default="/app/output", - help="", + help="Folder where prediction CSVs and metadata.json will be written. " + "Codabench expects output at /app/output. " + "For local testing pass: --output-dir ingestion_res", ) parser.add_argument( "--submission-dir", type=str, default="/app/ingested_program", - help="", + help="Directory containing submission.py. " + "Codabench mounts participant code at /app/ingested_program. 
" + "For local testing pass: --submission-dir solution", ) args = parser.parse_args() diff --git a/ingestion_program/metadata.yaml b/ingestion_program/metadata.yaml index bcf0d24..8a64ce5 100755 --- a/ingestion_program/metadata.yaml +++ b/ingestion_program/metadata.yaml @@ -1 +1,2 @@ -command: python3 ingestion.py \ No newline at end of file +command: python3 ingestion.py +image: nicolasnoya2001/sp500-challenge:v2 diff --git a/logo.png b/logo.png index 9616456..5255f04 100644 Binary files a/logo.png and b/logo.png differ diff --git a/pages/data.md b/pages/data.md new file mode 100644 index 0000000..22ee78c --- /dev/null +++ b/pages/data.md @@ -0,0 +1,4 @@ +You can download the data for this challenge from here: + +- Training Features: https://nicolas-public-images.s3.us-east-1.amazonaws.com/train/train_features.csv +- True Labels: https://nicolas-public-images.s3.us-east-1.amazonaws.com/train/train_labels.csv \ No newline at end of file diff --git a/pages/participate.md b/pages/participate.md index 4d5427b..1cfc38d 100755 --- a/pages/participate.md +++ b/pages/participate.md @@ -1,10 +1,61 @@ -# How to participate +# How to Participate -You should submit an untrained model in a python file `model.py` which contains -your `class Model`, which will be imported, trained, and tested on Codalab. +## Objective -See the "Seed" page for the outline of a `Model` class, with the expected -function names. +Build a model that predicts whether the S&P 500 index will **close strictly above** the current day's close on the **next trading day**, +using only the provided historical OHLCV features. 
-See the "Timeline" page for additional information about the phases of this -competition +## Input Features + +Each sample in the dataset is a row in a CSV with the following columns (all values are for the **current trading day** or computed from past days only): + +| Column | Description | +|--------|-------------| +| `Open` | Opening price of the trading day | +| `High` | Intraday high | +| `Low` | Intraday low | +| `Close` | Closing price of the trading day | +| `Volume` | Total trading volume | + +The ingestion program constructs **sliding windows** of the last **50 trading days** for each sample and feeds them to your model as tensors of shape `(batch, 50, n_features)`. + +## Target Label + +- **1** — today's close will be **strictly above** the previous close +- **0** — today's close will be **at or below** the previous close + +## What to Submit + +Submit a single file named **`submission.py`** containing a function: + +```python +def get_model(train_loader): + ... + return model +``` + +`train_loader` is a `torch.utils.data.DataLoader` yielding `(x, y)` batches where: +- `x` has shape `(batch, 50, n_features)` — a sliding window of the last 50 daily feature vectors +- `y` has shape `(batch,)` — binary labels `{0, 1}` + +Your `get_model` function must **train the model** using the provided loader and return a trained `torch.nn.Module` whose `forward(x)` outputs **probabilities in [0, 1]** of shape `(batch,)` — i.e. sigmoid must already be applied inside `forward`. + +See the **Seed** page for a working skeleton to get started. + +## Evaluation Metric + +Submissions are ranked by **ROC-AUC score** on the held-out test set. +A perfect model scores 1.0; random guessing scores ~0.5. + +## How to Submit + +1. Write your `submission.py` with a `get_model(train_loader)` function. +2. Zip it: `zip submission.zip submission.py` +3. Upload the zip on the **My Submissions** page. 
+ +## Rules + +- Your model may only use information in the provided feature set — no external data sources. +- External Python libraries (e.g. `torch`, `sklearn`, `numpy`) are allowed. +- You may submit as many times as you like during the Development Phase. +- The private test set is only revealed after the phase ends. diff --git a/pages/seed.md b/pages/seed.md index 9b15f6a..6853dfe 100644 --- a/pages/seed.md +++ b/pages/seed.md @@ -1,21 +1,85 @@ -# Seed: +# Seed — Starter Template +Copy this file as `submission.py` and implement your model inside `get_model`. + +The ingestion program will call `get_model(train_loader)` and expect back a trained +`torch.nn.Module` whose `forward(x)` returns probabilities in **[0, 1]**. + +```python +import torch +import torch.nn as nn + + +def get_model(train_loader): + """ + Train a model on the S&P 500 direction-forecasting task and return it. + + Parameters + ---------- + train_loader : torch.utils.data.DataLoader + Yields (x, y) batches where: + x — FloatTensor of shape (batch, 50, n_features) + A sliding window of the last 50 daily feature vectors. + Features: Open, High, Low, Close, Volume (current and past days). + y — FloatTensor of shape (batch,) + Binary label: 1 if today's close > previous close, else 0. + + Returns + ------- + model : torch.nn.Module + Trained model in eval() mode. + forward(x) must accept shape (batch, 50, n_features) + and return probabilities in [0, 1] of shape (batch,). + Probabilities are used directly by the scoring program to compute ROC-AUC. 
+ """ + + # --- Infer input size from the first batch --- + x_sample, _ = next(iter(train_loader)) + input_size = x_sample.shape[-1] # number of features per timestep + seq_len = x_sample.shape[1] # window size (50) + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + # --- Define your model here --- + # Example: single-layer LSTM + linear head + sigmoid + class MyModel(nn.Module): + def __init__(self): + super().__init__() + self.lstm = nn.LSTM(input_size, hidden_size=64, + num_layers=1, batch_first=True) + self.head = nn.Linear(64, 1) + + def forward(self, x): + out, _ = self.lstm(x) # (batch, seq_len, 64) + last = out[:, -1, :] # (batch, 64) — last timestep + return torch.sigmoid(self.head(last).squeeze(-1)) # (batch,) + + model = MyModel().to(device) + optimizer = torch.optim.Adam(model.parameters(), lr=1e-3) + criterion = nn.BCELoss() # BCELoss because sigmoid is already applied + + # --- Training loop --- + N_EPOCHS = 10 + model.train() + for epoch in range(N_EPOCHS): + total_loss = 0.0 + for x, y in train_loader: + x, y = x.to(device), y.to(device) + optimizer.zero_grad() + probs = model(x) # (batch,) + loss = criterion(probs, y) + loss.backward() + optimizer.step() + total_loss += loss.item() + print(f"Epoch {epoch+1}/{N_EPOCHS} loss={total_loss/len(train_loader):.4f}") + + model.eval() + return model ``` -class Model: - def fit(self, X_train, y_train): - """ - This should handle the logic of training your model - :param X_train: np.array of training data - :param y_train: np.array of the same length as X_train. 
Contains classifications of X_train - """ - pass - - def predict(self, X_test): - """ - This should handle making predictions with a trained model - :param X_test: np.array of testing data - :return: np.array of the same length as X_test containing predictions to each point in X_test - """ - pass - -``` \ No newline at end of file + +## Tips + +- You can replace the LSTM with a GRU (`nn.GRU`), Transformer (`nn.TransformerEncoder`), or any other architecture. +- The window size is fixed at **50** timesteps by the ingestion program. +- Keep training time reasonable — the Codabench environment has limited CPU resources. +- You are free to add dropout, batch normalisation, learning rate schedulers, etc. diff --git a/pages/terms.md b/pages/terms.md index 0d69023..e6135f0 100755 --- a/pages/terms.md +++ b/pages/terms.md @@ -1,18 +1,73 @@ # Terms and Conditions -## Lorem Ipsum - -Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. -Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure -dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non -proident, sunt in culpa qui officia deserunt mollit anim id est laborum. - -## Sed ut perspiciatis - -Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae -ab illo inventore veritatis et quasi architecto beatae vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit -aspernatur aut odit aut fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. Neque porro quisquam -est, qui dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore -et dolore magnam aliquam quaerat voluptatem. 
Ut enim ad minima veniam, quis nostrum exercitationem ullam corporis suscipit laboriosam, -nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure reprehenderit qui in ea voluptate velit esse quam nihil molestiae -consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla pariatur? +## 1. Eligibility + +This challenge is open to students enrolled in the DataCamp courses at École Polytechnique (INF554 / MAP583). Participation is voluntary and free of charge. + +Participants may compete individually or in teams, subject to the rules specified by the course instructors. +## 2. Data Usage + +The dataset provided for this challenge (historical S&P 500 daily OHLCV data sourced from public market data) is intended solely for educational and research purposes within the scope of this course. + +Participants agree to: + + Use the data only for the purpose of this challenge. + Not redistribute the data outside the course or upload it to public repositories. + Not attempt to identify, reverse-engineer, or misuse the data beyond its intended scientific context. + +## 3. Training and Evaluation Restrictions + +Participants must comply with the following rules: + + Only the provided training split may be used for training and validation. + The test and private test splits are reserved for evaluation only and must not be used in the training loop. + Any attempt to directly or indirectly train on evaluation data will result in disqualification. + +## 4. Submission Rules + +Participants must submit: + + A single file named `submission.py` containing a `get_model(train_loader)` function, zipped as `submission.zip` (see the **Seed** page for a full working template). + Any accompanying code or documentation as specified by the instructors. + +Submissions must be the original work of the participants. +## 5. Academic Integrity + +Participants are expected to adhere to the École Polytechnique’s academic integrity policies. 
+ +Specifically: + + Plagiarism, including copying code or solutions without proper attribution, is prohibited. + The use of external libraries and pre-trained models is allowed unless otherwise stated, provided their use is clearly documented. + Collaboration between teams is not allowed unless explicitly permitted by the instructors. + +Violations of academic integrity rules may result in penalties, including disqualification or academic sanctions. +## 6. Intellectual Property + +Participants retain ownership of the code and models they develop as part of this challenge. + +By submitting their results, participants grant the course instructors and the hosting institution a non-exclusive, royalty-free right to: + + Use the submissions for grading and evaluation. + Use anonymized results or visualizations for teaching, presentations, or future course materials. + +## 7. Liability + +The organizers provide the data and evaluation infrastructure “as is” and make no guarantees regarding accuracy, completeness, or fitness for a particular purpose. + +The organizers are not responsible for: + + Technical issues, data loss, or submission failures. + Any damages or losses arising from participation in the challenge. + +## 8. Modification and Termination + +The organizers reserve the right to: + + Modify the challenge rules, datasets, or evaluation criteria if necessary. + Terminate the challenge in case of technical issues or unforeseen circumstances. + +Any changes will be communicated to participants in a timely manner. +## 9. Acceptance of Terms + +By participating in the challenge, participants acknowledge that they have read, understood, and agreed to these Terms and Conditions. 
diff --git a/pages/timeline.md b/pages/timeline.md index 4e613bf..5ef189a 100644 --- a/pages/timeline.md +++ b/pages/timeline.md @@ -1,11 +1,27 @@ # Timeline -## Development phase -This phase should be used to tune your models, testing against a small -set of testing data - -## Final phase -Resubmit your preferred submission from the development phase to test -against a new set of testing data. Your model will be trained against -the same set of training data as in the development phase. You may only -make one submission to this phase, so choose wisely. +## Development Phase — October 7, 2025 → March 31, 2026 + +The development phase is open for the full duration of the course. + +- **Training data**: historical S&P 500 daily OHLCV data (roughly 2006–2022). +- **Public test set**: a held-out window of ~250 trading days (~2022–2023). + Your submission is scored against this set after every submission. +- **Submissions**: unlimited — iterate freely and track your progress on the leaderboard. +- **Goal**: tune your model architecture, features, and hyper-parameters to maximise ROC-AUC on the public test set. + +## Private Leaderboard — revealed at end of Development Phase + +Once the development phase closes on **March 31, 2026**, the private test set +(a further ~250 trading days, ~2023–2024) is scored for all submissions. +Final rankings are based on the **private test ROC-AUC**. + +The private test set is completely hidden during the development phase — optimising solely for the public leaderboard may not generalise. 
+ +## Key Dates + +| Date | Event | +|------|-------| +| October 7, 2025 | Competition opens, development phase begins | +| March 31, 2026 | Development phase closes, no further submissions accepted | +| Early April 2026 | Private leaderboard revealed, final rankings published | diff --git a/raw_data/.gitkeep b/raw_data/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/requirements.txt b/requirements.txt index fda4dd6..2956b1e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,11 @@ -pandas +# ── Core runtime (ingestion + scoring + submission) ─────────────────────────── +numpy==2.2.6 +pandas==2.3.3 scikit-learn + +# PyTorch CPU build — participants may swap for a GPU wheel if needed +torch + +# ── Local development tools ─────────────────────────────────────────────────── +# Required only for tools/run_docker.py (not installed inside the Docker image) +docker diff --git a/scoring_program/metadata.yaml b/scoring_program/metadata.yaml index 1dfabf8..fabbca7 100755 --- a/scoring_program/metadata.yaml +++ b/scoring_program/metadata.yaml @@ -1 +1,2 @@ -command: python3 scoring.py \ No newline at end of file +command: python3 scoring.py +image: nicolasnoya2001/sp500-challenge:v2 diff --git a/scoring_program/scoring.py b/scoring_program/scoring.py index 701f78f..27b8f53 100755 --- a/scoring_program/scoring.py +++ b/scoring_program/scoring.py @@ -2,40 +2,41 @@ from pathlib import Path import pandas as pd +from sklearn.metrics import roc_auc_score EVAL_SETS = ["test", "private_test"] -def compute_accuracy(predictions, targets): - # Make sure there is no NaN, as pandas ignores them in mean computation - predictions = predictions.fillna(-10).values - # Return mean of correct predictions - return (predictions == targets.values).mean() +def compute_roc_auc(predictions, targets): + # Flatten to 1D — .values on a single-column DataFrame gives shape (n, 1) + preds = predictions.iloc[:, 0].fillna(0.5).to_numpy() + # Return ROC AUC score + return 
roc_auc_score(targets, preds) def main(reference_dir, prediction_dir, output_dir): scores = {} for eval_set in EVAL_SETS: - print(f'Scoring {eval_set}') + print(f"Scoring {eval_set}") predictions = pd.read_csv( - prediction_dir / f'{eval_set}_predictions.csv' - ) - targets = pd.read_csv( - reference_dir / f'{eval_set}_labels.csv' + prediction_dir / f"{eval_set}_predictions.csv" ) + targets = pd.read_csv(reference_dir / f"{eval_set}_labels.csv") - scores[eval_set] = float(compute_accuracy(predictions, targets)) + scores[eval_set] = float( + compute_roc_auc(predictions, targets["Target"].values) + ) # Add train and test times in the score - json_durations = (prediction_dir / 'metadata.json').read_text() + json_durations = (prediction_dir / "metadata.json").read_text() durations = json.loads(json_durations) scores.update(**durations) print(scores) # Write output scores output_dir.mkdir(parents=True, exist_ok=True) - (output_dir / 'scores.json').write_text(json.dumps(scores)) + (output_dir / "scores.json").write_text(json.dumps(scores)) if __name__ == "__main__": @@ -68,5 +69,5 @@ def main(reference_dir, prediction_dir, output_dir): main( Path(args.reference_dir), Path(args.prediction_dir), - Path(args.output_dir) + Path(args.output_dir), ) diff --git a/solution/submission.py b/solution/submission.py index a8076b0..436486b 100644 --- a/solution/submission.py +++ b/solution/submission.py @@ -1,7 +1,108 @@ -from sklearn.ensemble import RandomForestClassifier +""" +Reference LSTM baseline for the S&P 500 direction-forecasting challenge. 
+The ingestion program will call: - -# The submission here should simply be a function that returns a model -# compatible with scikit-learn API -def get_model(): - return RandomForestClassifier() + model = get_model(train_loader) + +where `train_loader` is a torch.utils.data.DataLoader that yields +(x, y) batches with: + x : FloatTensor of shape (batch, WINDOW_SIZE, n_features) + y : FloatTensor of shape (batch,) — binary labels (1 = up, 0 = down) + +`get_model` must return a trained torch.nn.Module whose forward pass accepts +a tensor of shape (batch, WINDOW_SIZE, n_features) and returns probabilities +in [0, 1] of shape (batch,). Probabilities are used directly by the scoring program to compute ROC-AUC. +""" + +import torch +import torch.nn as nn + + +# ── Hyper-parameters (feel free to tune) ───────────────────────────────────── +HIDDEN_SIZE = 128 +NUM_LAYERS = 3 +DROPOUT = 0.1 +N_EPOCHS = 3 +LEARNING_RATE = 1e-4 +# ───────────────────────────────────────────────────────────────────────────── + + +class LSTMClassifier(nn.Module): + """Sequence-to-one LSTM for binary direction prediction. + + Takes a window of shape (batch, seq_len, input_size) and returns + a scalar logit per sample (shape: (batch,)). 
+ + Architecture + ------------ + LSTM (num_layers, hidden_size, dropout) → hidden state of last timestep + → Linear(hidden_size → 1) → squeeze → Sigmoid → probability in [0, 1] + """ + + def __init__( + self, + input_size: int, + hidden_size: int = HIDDEN_SIZE, + num_layers: int = NUM_LAYERS, + dropout: float = DROPOUT, + ): + super().__init__() + self.lstm = nn.LSTM( + input_size=input_size, + hidden_size=hidden_size, + num_layers=num_layers, + batch_first=True, + dropout=dropout if num_layers > 1 else 0.0, + ) + self.head = nn.Linear(hidden_size, 1) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + # x: (batch, seq_len, input_size) + out, _ = self.lstm(x) # (batch, seq_len, hidden_size) + last = out[:, -1, :] # (batch, hidden_size) — last timestep + logit = self.head(last).squeeze(-1) # (batch,) + return torch.sigmoid(logit) # (batch,) — probability in [0, 1] + + +def get_model(train_loader: torch.utils.data.DataLoader) -> nn.Module: + """Train an LSTM on the provided DataLoader and return the trained model. + + Parameters + ---------- + train_loader : DataLoader + Yields (x, y) batches where x has shape (batch, WINDOW_SIZE, n_features) + and y has shape (batch,) with values in {0, 1}. + + Returns + ------- + model : nn.Module (in eval mode) + Trained LSTMClassifier whose forward pass returns probabilities in [0, 1]. 
+ """ + # Infer input size from the first batch + x_sample, _ = next(iter(train_loader)) + input_size = x_sample.shape[-1] # n_features + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + print(f"Training on: {device}") + + model = LSTMClassifier(input_size=input_size).to(device) + optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE) + criterion = nn.BCELoss() # model already applies sigmoid + + model.train() + for epoch in range(N_EPOCHS): + total_loss = 0.0 + for x, y in train_loader: + x, y = x.to(device), y.to(device) + optimizer.zero_grad() + probs = model(x) # (batch,) — probabilities in [0, 1] + loss = criterion(probs, y) + loss.backward() + optimizer.step() + total_loss += loss.item() + avg_loss = total_loss / len(train_loader) + print(f" Epoch {epoch + 1:>2}/{N_EPOCHS} loss={avg_loss:.4f}") + + model.eval() + return model diff --git a/template_starting_kit.ipynb b/template_starting_kit.ipynb index 7167a3a..2434ef9 100644 --- a/template_starting_kit.ipynb +++ b/template_starting_kit.ipynb @@ -4,17 +4,15 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "
\n", - " \n", - " \n", - " \n", - " \n", - "
\n", + "# Directional Forecasting of the S&P 500 Index\n", "\n", - "# Template Kit for Cadabench challenge in the Datacamp\n", + "*DataCamp Challenge — École Polytechnique (INF554 / MAP583)*\n", "\n", - " Thomas Moreau (Inria)
\n", - " Pedro Rodrigues (Inria) " + "---\n", + "\n", + "Can you predict whether the S&P 500 will close **UP** or **DOWN** tomorrow?\n", + "\n", + "This notebook walks you through the data, the evaluation metric, and how to build and test a submission locally before uploading it to Codabench.\n" ] }, { @@ -23,28 +21,41 @@ "source": [ "## Introduction\n", "\n", - "Describe the challenge, in particular:\n", + "### The Task\n", + "\n", + "This is a **binary classification** challenge: given the recent history of the S&P 500 index, predict whether the next trading day's closing price will be **strictly above** (`1`) or **at or below** (`0`) the current day's closing price.\n", + "\n", + "### The Data\n", + "\n", + "Each row in the dataset represents one **trading day** and contains the following raw OHLCV features:\n", + "\n", + "| Column | Description |\n", + "|----------|-------------|\n", + "| `Open` | Opening price of the day |\n", + "| `High` | Intraday high |\n", + "| `Low` | Intraday low |\n", + "| `Close` | Closing price of the day |\n", + "| `Volume` | Total trading volume |\n", + "\n", + "The ingestion program wraps these rows into **sliding windows of 50 consecutive trading days**, so your model receives sequences of shape `(batch, 50, 5)`.\n", "\n", - "- Where the data comes from?\n", - "- What is the task this challenge aims to solve?\n", - "- Why does it matter?" + "### Why It Matters\n", + "\n", + "Predicting market direction is a canonical and challenging time-series problem — the signal-to-noise ratio is very low, and models that generalise beyond the training period are rare. 
The challenge rewards robust, well-regularised approaches over overfit ones.\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# Exploratory data analysis\n", - "\n", - "The goal of this section is to show what's in the data, and how to play with it.\n", - "This is the first set in any data science project, and here, you should give a sense of the data the participants will be working with.\n", + "## Exploratory Data Analysis\n", "\n", - "You can first load and describe the data, and then show some interesting properties of it." + "Let's load the raw training data and get a feel for what we're working with.\n" ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -52,68 +63,184 @@ "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", - "pd.set_option('display.max_columns', None)\n", "\n", - "# Load the data\n", - "from ingestion_program.ingestion import get_train_data\n", - "X_df, y = get_train_data(\"dev_phase/input_data\")" + "pd.set_option(\"display.max_columns\", None)\n", + "\n", + "DATA_DIR = \"dev_phase/input_data\"\n", + "\n", + "# Load raw CSV files for exploration\n", + "features = pd.read_csv(f\"{DATA_DIR}/train/train_features.csv\", index_col=0)\n", + "labels = pd.read_csv(f\"{DATA_DIR}/train/train_labels.csv\", index_col=0)\n", + "\n", + "print(f\"Training samples: {len(features)}\")\n", + "print(f\"Features: {list(features.columns)}\")\n", + "print(f\"\\nLabel distribution:\\n{labels['Target'].value_counts().rename({1: 'UP (1)', 0: 'DOWN (0)'})}\")\n", + "features.head()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Plot the Close price series with UP/DOWN days coloured\n", + "fig, axes = plt.subplots(2, 1, figsize=(14, 7), sharex=True)\n", + "\n", + "up = labels[\"Target\"] == 1\n", + "axes[0].plot(features.index, features[\"Close\"], color=\"steelblue\", 
linewidth=0.8)\n", + "axes[0].set_title(\"S&P 500 Close Price (training period)\")\n", + "axes[0].set_ylabel(\"Close price\")\n", + "\n", + "axes[1].bar(features.index[up], 1, color=\"green\", width=1, label=\"UP (1)\")\n", + "axes[1].bar(features.index[~up], 1, color=\"red\", width=1, label=\"DOWN (0)\")\n", + "axes[1].set_title(\"Daily direction label\")\n", + "axes[1].set_ylabel(\"Label\")\n", + "axes[1].legend(loc=\"upper right\")\n", + "axes[1].set_xlabel(\"Trading day index\")\n", + "\n", + "plt.tight_layout()\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Feature statistics\n", + "features.describe().round(2)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# Challenge evaluation\n", + "## Challenge Evaluation\n", + "\n", + "Submissions are ranked by **ROC-AUC** (Area Under the ROC Curve) computed on a held-out test window.\n", + "\n", + "- A **perfect** model scores **1.0**\n", + "- **Random guessing** (outputting 0.5 for every sample) scores **≈ 0.5**\n", + "- Predicting hard 0/1 labels instead of probabilities will likely score around 0.5 — always output **sigmoid probabilities**.\n", "\n", - "A particularly important point in a challenge is to describe how it is evaluated. This is the section where you should describe the metric that will be used to evaluate the participants' submissions, as well as your evaluation strategy, in particular if there is some complexity in the way the data should be split to ensure valid results." 
+ "The key advantage of ROC-AUC is that it is **threshold-independent**: it rewards models that rank UP days above DOWN days regardless of the absolute probability values they produce.\n", + "\n", + "There are two splits:\n", + "- **Public test** — visible on the leaderboard during the development phase \n", + "- **Private test** — revealed only after the phase ends (final ranking)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# Submission format\n", + "## Submission Format\n", + "\n", + "You must submit a **single file** named `submission.py` that exposes one function:\n", + "\n", + "```python\n", + "def get_model(train_loader: torch.utils.data.DataLoader) -> torch.nn.Module:\n", + " ...\n", + " return model # already in eval mode\n", + "```\n", "\n", - "Here, you should describe the submission format. This is the format the participants should follow to submit their predictions on the codabench platform." + "**Contract:**\n", + "- `train_loader` yields `(x, y)` batches where `x` has shape `(batch, 50, 5)` and `y` has shape `(batch,)` with values in `{0, 1}`\n", + "- The returned model's `forward(x)` must accept `(batch, 50, 5)` tensors and return **probabilities in [0, 1]** of shape `(batch,)` — i.e. apply `sigmoid` inside `forward`\n", + "\n", + "The cell below shows the reference LSTM baseline included in the challenge.\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## The submission file\n", + "### Baseline: LSTM Classifier\n", "\n", - "The input data are stored in a dataframe. To go from a dataframe to a numpy array we will use a scikit-learn column transformer. The first example we will write will just consist in selecting a subset of columns we want to work with." 
+ "The baseline uses a multi-layer LSTM that reads the 50-day window and outputs a direction probability from the last hidden state.\n", + "Feel free to replace this architecture entirely — a Transformer, a 1-D CNN, or even a simple MLP over flattened windows are all valid approaches.\n" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# %load solution/submission.py\n", - "from sklearn.ensemble import RandomForestClassifier\n", + "import torch\n", + "import torch.nn as nn\n", + "\n", + "# ── Hyper-parameters ──────────────────────────────────────────────────────────\n", + "HIDDEN_SIZE = 128\n", + "NUM_LAYERS = 3\n", + "DROPOUT = 0.1\n", + "N_EPOCHS = 3\n", + "LEARNING_RATE = 1e-4\n", + "# ─────────────────────────────────────────────────────────────────────────────\n", + "\n", + "\n", + "class LSTMClassifier(nn.Module):\n", + " \"\"\"Sequence-to-one LSTM: (batch, seq_len, n_features) → (batch,) probability.\"\"\"\n", + "\n", + " def __init__(self, input_size, hidden_size=HIDDEN_SIZE, num_layers=NUM_LAYERS, dropout=DROPOUT):\n", + " super().__init__()\n", + " self.lstm = nn.LSTM(\n", + " input_size=input_size,\n", + " hidden_size=hidden_size,\n", + " num_layers=num_layers,\n", + " batch_first=True,\n", + " dropout=dropout if num_layers > 1 else 0.0,\n", + " )\n", + " self.head = nn.Linear(hidden_size, 1)\n", + "\n", + " def forward(self, x):\n", + " out, _ = self.lstm(x) # (batch, seq_len, hidden_size)\n", + " last = out[:, -1, :] # (batch, hidden_size) — last timestep\n", + " logit = self.head(last).squeeze(-1) # (batch,)\n", + " return torch.sigmoid(logit) # probability in [0, 1]\n", "\n", "\n", - "# The submission here should simply be a function that returns a model\n", - "# compatible with scikit-learn API\n", - "def get_model():\n", - " return RandomForestClassifier()\n" + "def get_model(train_loader):\n", + " x_sample, _ = next(iter(train_loader))\n", + " input_size = x_sample.shape[-1] 
# n_features (5 for raw OHLCV)\n", + "\n", + " device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", + " model = LSTMClassifier(input_size=input_size).to(device)\n", + " optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)\n", + " criterion = nn.BCELoss()\n", + "\n", + " model.train()\n", + " for epoch in range(N_EPOCHS):\n", + " total_loss = 0.0\n", + " for x, y in train_loader:\n", + " x, y = x.to(device), y.to(device)\n", + " optimizer.zero_grad()\n", + " loss = criterion(model(x), y)\n", + " loss.backward()\n", + " optimizer.step()\n", + " total_loss += loss.item()\n", + " print(f\"Epoch {epoch + 1}/{N_EPOCHS} loss={total_loss / len(train_loader):.4f}\")\n", + "\n", + " model.eval()\n", + " return model\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Local testing pipeline\n", + "## Local Testing Pipeline\n", "\n", - "Here you can show how the model will be used to generate predictions on the test set, and how the evaluation will be performed." + "Before submitting to Codabench you can run the full ingestion + scoring pipeline locally.\n", + "This mirrors exactly what happens on the platform.\n" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -133,31 +260,49 @@ } ], "source": [ - "model = get_model()\n", - "X_train, y_train = get_train_data(\"dev_phase/input_data\")\n", - "model.fit(X_train, y_train)\n", + "import sys\n", + "sys.path.insert(0, \".\") # make sure ingestion_program/ and solution/ are importable\n", + "\n", + "import torch\n", + "from ingestion_program.ingestion import get_train_dataset, get_test_dataset, evaluate_model\n", + "from scoring_program.scoring import compute_roc_auc\n", + "\n", + "DATA_DIR = \"dev_phase/input_data\"\n", + "\n", + "# ── 1. 
Build training DataLoader ──────────────────────────────────────────────\n", + "train_dataset = get_train_dataset(DATA_DIR)\n", + "train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)\n", + "print(f\"Training samples : {len(train_dataset)}\")\n", + "\n", + "# ── 2. Train the model ────────────────────────────────────────────────────────\n", + "model = get_model(train_loader)\n", "\n", - "X_test = pd.read_csv(\"dev_phase/input_data/test/test_features.csv\")\n", - "from ingestion_program.ingestion import evaluate_model\n", - "y_test = evaluate_model(model, X_test)\n", + "# ── 3. Predict on the public test set ────────────────────────────────────────\n", + "test_dataset = get_test_dataset(DATA_DIR, \"test\")\n", + "predictions = evaluate_model(model, test_dataset) # DataFrame with \"Probability\" column\n", "\n", - "from scoring_program.scoring import compute_accuracy\n", - "print(\"Accuracy on test set:\", compute_accuracy(y_test, pd.read_csv(\"dev_phase/input_data/test/test_labels.csv\")))" + "# ── 4. Score against the reference labels ────────────────────────────────────\n", + "import pandas as pd\n", + "test_labels = pd.read_csv(\"dev_phase/reference_data/test_labels.csv\")\n", + "auc = compute_roc_auc(predictions, test_labels[\"Target\"].values)\n", + "print(f\"\\nROC-AUC on public test set: {auc:.4f}\")\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Submission\n", + "## Submitting to Codabench\n", + "\n", + "1. Write your `submission.py` — it must define `get_model(train_loader)` returning a trained `nn.Module`.\n", + "2. Create a zip containing only that file:\n", + " ```bash\n", + " zip submission.zip submission.py\n", + " ```\n", + "3. Go to the challenge page on Codabench → **My Submissions** → **Upload**.\n", "\n", - "To submit your code, you can refer to the actual challenge." 
+    "That's it — the platform will run your code, output predictions, compute the ROC-AUC, and update the leaderboard automatically.\n"
    ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": []
   }
  ],
  "metadata": {
diff --git a/tools/Dockerfile b/tools/Dockerfile
index 8cb1eca..3724efa 100644
--- a/tools/Dockerfile
+++ b/tools/Dockerfile
@@ -1,20 +1,43 @@
-# Step 1: Start from an official Docker image with desired base environment
-# Good starting points are the official codalab images or
-# pytorch images with CUDA support:
-# - Codalab: codalab/codalab-legacy:py39
-# - Codalab GPU: codalab/codalab-legacy:gpu310
-# - Pytorch: pytorch/pytorch:2.8.0-cuda12.6-cudnn9-runtime
-FROM codalab/codalab-legacy:py39
+# ─────────────────────────────────────────────────────────────────────────────
+# Codabench Docker image — S&P 500 Directional Forecasting Challenge
+#
+# Codabench mounts the following directories at runtime:
+#   /app/input_data        ← input splits (train/, test/, private_test/)
+#   /app/ingested_program  ← participant's submission.py
+#   /app/ingestion_program ← ingestion.py (organiser code)
+#   /app/scoring_program   ← scoring.py (organiser code)
+#   /app/output            ← ingestion writes predictions here
+#   /app/input/ref         ← reference labels (scoring)
+#   /app/input/res         ← predictions to score (scoring)
+#
+# Build from the PROJECT ROOT, not from tools/:
+#   docker build -t sp500-challenge -f tools/Dockerfile .
+#
+# Dependencies are installed inline below (see the pip RUN steps), so no
+# requirements.txt COPY is needed.
+# ───────────────────────────────────────────────────────────────────────────── -# Set environment variables to prevent interactive prompts -ENV DEBIAN_FRONTEND=noninteractive +FROM python:3.11-slim -# Step 2: Install system-level dependencies (if any) -# e.g., git, wget, or common libraries for OpenCV like libgl1 -RUN pip install -U pip +ENV DEBIAN_FRONTEND=noninteractive \ + PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 -# Step 3: Copy and pre-install all Python dependencies -# This 'requirements.txt' file should list pandas, scikit-learn, timm, etc. -# Place it in the same directory as this Dockerfile. -COPY requirements.txt /tmp/requirements.txt -RUN pip install --no-cache-dir -r /tmp/requirements.txt +# Minimal system deps: gcc is needed to compile some numpy/pandas C extensions +RUN apt-get update && apt-get install -y --no-install-recommends \ + gcc \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +# Pre-install Python dependencies. +# torch is installed explicitly with the CPU-only wheel to keep the image small. +# The 'docker' package is a local dev tool and is excluded from the container. 
+RUN pip install --no-cache-dir --upgrade pip
+
+# Install torch CPU wheel first (separate layer so it's cached independently)
+# RUN pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu
+RUN pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu
+
+# Install remaining runtime deps (excluding torch and docker which are handled separately)
+RUN pip install --no-cache-dir numpy pandas scikit-learn
diff --git a/tools/create_bundle.py b/tools/create_bundle.py
index 9f612ec..01e1398 100644
--- a/tools/create_bundle.py
+++ b/tools/create_bundle.py
@@ -13,6 +13,7 @@
     ROOT_DIR / "competition.yaml",
     ROOT_DIR / "logo.png",
     ROOT_DIR / "solution/submission.py",
+    ROOT_DIR / "template_starting_kit.ipynb",
 ]
diff --git a/tools/run_docker.py b/tools/run_docker.py
index d888b76..0413a40 100644
--- a/tools/run_docker.py
+++ b/tools/run_docker.py
@@ -1,4 +1,5 @@
 from pathlib import Path
+
 try:
     import docker
 except ImportError:
@@ -14,12 +15,18 @@
     print("Docker client initialized successfully.")
 
     print("Building Docker image...")
-    client.images.build(path=".", tag="tommoral/template:v1")
-    print("Docker image built successfully with tag 'tommoral/template:v1'.")
+    client.images.build(
+        path=str(REPO),
+        dockerfile=str(REPO / "tools" / "Dockerfile"),
+        tag="nicolasnoya2001/sp500-challenge:v2",
+    )
+    print(
+        "Docker image built successfully with tag 'nicolasnoya2001/sp500-challenge:v2'."
+ ) print("Running Docker container...") logs = client.containers.run( - image="tommoral/template:v1", + image="nicolasnoya2001/sp500-challenge:v2", command="python3 /app/ingestion_program/ingestion.py", remove=True, name="ingestion", @@ -29,11 +36,11 @@ f"{REPO}/dev_phase/input_data:/app/input_data", f"{REPO}/ingestion_res:/app/output", f"{REPO}/solution:/app/ingested_program", - ] + ], ) print(logs.decode("utf-8")) logs = client.containers.run( - image="tommoral/template:v1", + image="nicolasnoya2001/sp500-challenge:v2", command="python3 /app/scoring_program/scoring.py", remove=True, name="scoring", @@ -43,7 +50,7 @@ f"{REPO}/dev_phase/reference_data:/app/input/ref", f"{REPO}/ingestion_res:/app/input/res", f"{REPO}/scoring_res:/app/", - ] + ], ) print(logs.decode("utf-8")) print("Docker container ran successfully.") diff --git a/tools/setup_data.py b/tools/setup_data.py index 5bdc3a9..125549b 100644 --- a/tools/setup_data.py +++ b/tools/setup_data.py @@ -1,53 +1,72 @@ -# Script to download the data from a given source and create the splits -# This is a mock version that generate fake problems +# Script to load the S&P500 data and create the splits for the benchmark from pathlib import Path -import numpy as np import pandas as pd -from sklearn.datasets import make_classification -from sklearn.model_selection import train_test_split -PHASE = 'dev_phase' +PHASE = "dev_phase" -DATA_DIR = Path(PHASE) / 'input_data' -REF_DIR = Path(PHASE) / 'reference_data' +DATA_DIR = Path(PHASE) / "input_data" +REF_DIR = Path(PHASE) / "reference_data" + +RAW_DATA_PATH = Path("raw_data") / "sp500_raw.csv" +TARGET_COL = "Target" def make_csv(data, filepath): filepath.parent.mkdir(parents=True, exist_ok=True) - pd.DataFrame(data).to_csv(filepath, index=False) + data.to_csv(filepath, index=True) # integer row index saved as first column if __name__ == "__main__": - import argparse - parser = argparse.ArgumentParser( - description='Load or generate data for the benchmark' - ) - 
parser.add_argument('--seed', type=int, default=42, - help='Random seed for data generation') - args = parser.parse_args() - - # Generate and split the data - rng = np.random.RandomState(args.seed) - X, y = make_classification(n_samples=500, n_features=5, random_state=rng) - X_train, X_test, y_train, y_test = train_test_split( - X, y, test_size=0.4, random_state=rng - ) - X_test, X_private_test, y_test, y_private_test = train_test_split( - X_test, y_test, test_size=0.5, random_state=rng - ) + # Load the S&P500 data + print(f"Loading data from {RAW_DATA_PATH}") + df = pd.read_csv(RAW_DATA_PATH) + + # Separate features and target; drop Date (not a model input) + y = df[TARGET_COL] + X = df.drop(columns=[TARGET_COL, "Date"]).reset_index(drop=True) + + n = len(df) + train_end = int(n * 0.6) + test_end = int(n * 0.8) + + # Split chronologically: 60% train, 20% test, 20% private_test + X_train, y_train = X.iloc[:train_end], y.iloc[:train_end] + X_test, y_test = X.iloc[train_end:test_end], y.iloc[train_end:test_end] + X_private_test, y_private_test = X.iloc[test_end:], y.iloc[test_end:] + + print(f"Dataset shape: {df.shape}") + print(f"Features: {X.shape[1]}, Samples: {n}") + print(f"Target distribution:\n{y.value_counts()}") # Store the data in the correct folders: # - input_data contains train data (both features and labels) and only # test features so the test labels are kept secret # - reference_data contains the test labels for scoring for split, X_split, y_split in [ - ('train', X_train, y_train), - ('test', X_test, y_test), - ('private_test', X_private_test, y_private_test), + ("train", X_train, y_train), + ("test", X_test, y_test), + ("private_test", X_private_test, y_private_test), ]: split_dir = DATA_DIR / split - make_csv(X_split, split_dir / f'{split}_features.csv') + make_csv(X_split, split_dir / f"{split}_features.csv") label_dir = split_dir if split == "train" else REF_DIR - make_csv(y_split, label_dir / f'{split}_labels.csv') \ No newline at end of file + 
make_csv( + pd.DataFrame({TARGET_COL: y_split}), + label_dir / f"{split}_labels.csv", + ) + + print("\nData splits created successfully!") + print( + f"{'Split':<15} {'Samples':<10} {'Index start':<15} {'Index end':<15}" + ) + print("-" * 55) + for split, X_split in [ + ("train", X_train), + ("test", X_test), + ("private_test", X_private_test), + ]: + print( + f"{split:<15} {len(X_split):<10} {X_split.index[0]:<15} {X_split.index[-1]:<15}" + )