diff --git a/.gitignore b/.gitignore index 9728749..4c2f2aa 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,4 @@ ingestion_res/* scoring_res/* dev_phase/* +*.pth diff --git a/competition.yaml b/competition.yaml index 3a1e4de..44a4a74 100755 --- a/competition.yaml +++ b/competition.yaml @@ -1,8 +1,26 @@ version: 2 -title: Templat competition - Dummy classification -description: Dummy classification task +title: "Autoregressive Forecasting of the S&P 500 Index" +description: > + Can you predict whether the S&P 500 will close up or down — using only what you know by mid-morning? + + Each trading day, participants receive a feature vector built from: + - Intraday morning signals: the day's open price and early price action + (e.g. open-to-first-hour return, morning high/low range, opening gap vs previous close). + - Historical context: past N days of daily OHLCV data, log-returns, and + rolling statistics (volatility, momentum) up to and including the previous close. + + The target label is binary: **1** if the day's close is strictly above the previous close, + **0** otherwise. No look-ahead is permitted — only information available before noon (ET) + may be used as features for the current day. + + Participants submit a scikit-learn–compatible model via a `submission.py` file + exposing a `get_model()` function. The model is trained server-side on historical + data and evaluated on a held-out test window using **directional accuracy** + (fraction of days where the predicted direction matches the actual close direction). + + This is a DataCamp challenge organised at École Polytechnique (INF554 / MAP583). 
image: logo.png -registration_auto_approve: False # if True, do not require approval from admin to join the comp +registration_auto_approve: False # set to True to skip manual approval terms: pages/terms.md pages: @@ -15,8 +33,15 @@ pages: tasks: - index: 0 - name: Developement Task - description: 'Tune models with training data, test against examples contained in public test data' + name: Development Task + description: > + Same-day close direction forecasting of the S&P 500 using morning information. + Each sample consists of: (i) intraday morning features for the current trading day + (opening gap, open price, early price action) and (ii) historical daily features + from the past N sessions (log-returns, OHLCV, rolling volatility, momentum). + The label is 1 if today's close > previous close, 0 otherwise. + No information after the morning window may be used; models are scored on + directional accuracy over a public held-out test window. input_data: dev_phase/input_data/ reference_data: dev_phase/reference_data/ ingestion_program: ingestion_program/ @@ -25,13 +50,15 @@ tasks: solutions: - index: 0 tasks: - - 0 + - 0 path: solution/ - phases: - name: Development Phase - description: 'Development phase: tune your models.' + description: > + Tune and validate your autoregressive model using the provided historical + S&P 500 training data. Your predictions are scored against a public test set + so you can iterate quickly. Unlimited submissions are allowed in this phase. 
start: 10-07-2025 end: 03-31-2026 tasks: @@ -41,20 +68,20 @@ leaderboards: - title: Results key: main columns: - - title: Test Accuracy + - title: Directional Accuracy (public test) key: test index: 0 - sorting: asc - - title: Private Test Accuracy + sorting: desc # higher is better + - title: Directional Accuracy (private test) key: private_test index: 1 - sorting: asc - hidden: True - - title: Train time + sorting: desc + hidden: True # revealed only after the phase ends + - title: Train Time (s) key: train_time index: 2 - sorting: desc - - title: Test time + sorting: asc # lower is better + - title: Predict Time (s) key: test_time index: 3 - sorting: desc + sorting: asc diff --git a/ingestion_program/ingestion.py b/ingestion_program/ingestion.py index f150b05..3c3143f 100755 --- a/ingestion_program/ingestion.py +++ b/ingestion_program/ingestion.py @@ -3,53 +3,156 @@ import time from pathlib import Path +import numpy as np import pandas as pd +import torch +# Number of past trading days fed as a sequence to the model. +# Must be consistent between training and inference. +WINDOW_SIZE = 50 EVAL_SETS = ["test", "private_test"] -def evaluate_model(model, X_test): - - y_pred = model.predict(X_test) - return pd.DataFrame(y_pred) +class SP500Dataset(torch.utils.data.Dataset): + """PyTorch Dataset for the S&P 500 direction-forecasting challenge. + + Each sample is a sliding window of shape (WINDOW_SIZE, n_features) + ending at day `idx`. The target is the binary label of that last day + (1 = close > prev_close, 0 otherwise). + + For the first WINDOW_SIZE-1 days, the window is left-padded with zeros. + + Parameters + ---------- + features_path : Path + Path to the features CSV (columns = feature names, rows = trading days + in chronological order). + labels_path : Path or None + Path to the labels CSV (single column, same row order as features). + Pass None for test sets where labels are withheld. 
+ window_size : int + Number of past days (inclusive of the current day) in each sequence. + """ + + def __init__( + self, features_path, labels_path=None, window_size=WINDOW_SIZE + ): + self.window_size = window_size + # index_col=0: the first column is the row index saved by setup_data.py, + # not a feature — must be excluded from the data arrays. + self.X = pd.read_csv(features_path, index_col=0).values.astype( + np.float32 + ) + self.n_features = self.X.shape[1] + if labels_path is not None: + self.y = ( + pd.read_csv(labels_path, index_col=0) + .values.astype(np.float32) + .ravel() + ) + else: + self.y = None # test mode — labels are unknown + + def __len__(self): + return len(self.X) + + def __getitem__(self, idx): + """Return (window, label) where window has shape (window_size, n_features). + + The label is the binary target for day `idx` (the last day of the window). + During test mode (no labels), only the window tensor is returned. + """ + window_start = max(0, idx - self.window_size + 1) + window = self.X[window_start : idx + 1] # (<=window_size, n_features) + + # Left-pad with zeros if we are at the beginning of the series + if len(window) < self.window_size: + padding = np.zeros( + (self.window_size - len(window), self.n_features), + dtype=np.float32, + ) + window = np.concatenate([padding, window], axis=0) + + x = torch.tensor( + window, dtype=torch.float32 + ) # (window_size, n_features) + + if self.y is not None: + y = torch.tensor(self.y[idx], dtype=torch.float32) # scalar + return x, y + return x # test mode + + +def get_train_dataset(data_dir): + """Build the training Dataset from separate features and labels CSVs.""" + data_dir = Path(data_dir) + features_path = data_dir / "train" / "train_features.csv" + labels_path = data_dir / "train" / "train_labels.csv" + return SP500Dataset(features_path, labels_path) -def get_train_data(data_dir): +def get_test_dataset(data_dir, eval_set): + """Build a test Dataset (no labels) for a given evaluation 
split.""" data_dir = Path(data_dir) - training_dir = data_dir / "train" - X_train = pd.read_csv(training_dir / "train_features.csv") - y_train = pd.read_csv(training_dir / "train_labels.csv") - return X_train, y_train + features_path = data_dir / eval_set / f"{eval_set}_features.csv" + return SP500Dataset(features_path, labels_path=None) -def main(data_dir, output_dir): - # Here, you can import info from the submission module, to evaluate the - # submission - from submission import get_model +def evaluate_model(model, test_dataset): + """Run inference over a test Dataset and return a DataFrame of probabilities. - X_train, y_train = get_train_data(data_dir) + The model outputs probabilities in [0, 1] (sigmoid already applied). + The scoring program is responsible for applying the decision threshold. + """ + device = next(model.parameters()).device + loader = torch.utils.data.DataLoader( + test_dataset, batch_size=64, shuffle=False + ) + probs = [] + model.eval() + with torch.no_grad(): + for x in loader: + # test_dataset returns bare tensors (no label) — x is already the input + x = x.to(device) + batch_probs = model(x).cpu().numpy().tolist() # floats in [0, 1] + probs.extend(batch_probs) + return pd.DataFrame({"Probability": probs}) - print("Training the model") - model = get_model() +def main(data_dir, output_dir): + from submission import ( + get_model, + ) # imported here so sys.path is set first + data_dir = Path(data_dir) + output_dir = Path(output_dir) + + # ── Training ────────────────────────────────────────────────────────────── + train_dataset = get_train_dataset(data_dir) + train_loader = torch.utils.data.DataLoader( + train_dataset, batch_size=32, shuffle=True + ) + + print("Training the model") start = time.time() - model.fit(X_train, y_train) + model = get_model(train_loader) # participant trains and returns the model train_time = time.time() - start - print("-" * 10) + + # ── Evaluation ──────────────────────────────────────────────────────────── + 
print("=" * 40) print("Evaluate the model") start = time.time() res = {} for eval_set in EVAL_SETS: - X_test = pd.read_csv(data_dir / eval_set / f"{eval_set}_features.csv") - res[eval_set] = evaluate_model(model, X_test) + test_dataset = get_test_dataset(data_dir, eval_set) + res[eval_set] = evaluate_model(model, test_dataset) test_time = time.time() - start - print("-" * 10) - duration = train_time + test_time - print(f"Completed Prediction. Total duration: {duration}") + print( + f"Completed Prediction. Total duration: {train_time + test_time:.1f}s" + ) - # Write output files + # ── Write outputs ───────────────────────────────────────────────────────── output_dir.mkdir(parents=True, exist_ok=True) with open(output_dir / "metadata.json", "w+") as f: json.dump(dict(train_time=train_time, test_time=test_time), f) @@ -70,19 +173,25 @@ def main(data_dir, output_dir): "--data-dir", type=str, default="/app/input_data", - help="", + help="Root folder containing train/, test/, and private_test/ splits. " + "Codabench mounts data at /app/input_data. " + "For local testing pass: --data-dir dev_phase/input_data", ) parser.add_argument( "--output-dir", type=str, default="/app/output", - help="", + help="Folder where prediction CSVs and metadata.json will be written. " + "Codabench expects output at /app/output. " + "For local testing pass: --output-dir ingestion_res", ) parser.add_argument( "--submission-dir", type=str, default="/app/ingested_program", - help="", + help="Directory containing submission.py. " + "Codabench mounts participant code at /app/ingested_program. 
" + "For local testing pass: --submission-dir solution", ) args = parser.parse_args() diff --git a/logo.png b/logo.png index 9616456..5255f04 100644 Binary files a/logo.png and b/logo.png differ diff --git a/pages/participate.md b/pages/participate.md index 4d5427b..c44336d 100755 --- a/pages/participate.md +++ b/pages/participate.md @@ -1,10 +1,55 @@ -# How to participate +# How to Participate -You should submit an untrained model in a python file `model.py` which contains -your `class Model`, which will be imported, trained, and tested on Codalab. +## Objective -See the "Seed" page for the outline of a `Model` class, with the expected -function names. +Build a model that predicts whether the S&P 500 index will **close higher or lower** than the previous day, +using only information available before noon (ET) on the trading day in question. -See the "Timeline" page for additional information about the phases of this -competition +## Input Features + +Each sample in the dataset is a row in a CSV with the following columns (all values are for the **current trading day** or computed from past days only): + +| Column | Description | +|--------|-------------| +| `Open` | Opening price of the current trading day | +| `High` | Intraday high up to the morning window | +| `Low` | Intraday low up to the morning window | +| `Close` | Previous day's closing price | +| `Volume` | Trading volume up to the morning window | + +The ingestion program constructs **sliding windows** of the last 20 trading days for each sample and feeds them to your model as tensors of shape `(batch, 20, n_features)`. + +## Target Label + +- **1** — today's close will be **strictly above** the previous close +- **0** — today's close will be **at or below** the previous close + +## What to Submit + +Submit a single file named **`submission.py`** containing a function: + +```python +def get_model(train_loader): + ... 
+ return model +``` + +`train_loader` is a `torch.utils.data.DataLoader` yielding `(x, y)` batches where: +- `x` has shape `(batch, 20, n_features)` — a sequence of 20 daily feature vectors +- `y` has shape `(batch,)` — binary labels `{0, 1}` + +Your `get_model` function must **train the model** using the provided loader and return a trained `torch.nn.Module` whose `forward(x)` outputs **probabilities in [0, 1]** of shape `(batch,)` — i.e. sigmoid must already be applied. + +See the **Seed** page for a working skeleton to get started. + +## Evaluation Metric + +Submissions are ranked by **ROC-AUC score** on the held-out test set. +A perfect model scores 1.0; random guessing scores ~0.5. + +## Rules + +- Your model may only use information in the provided feature set — no external data sources. +- External Python libraries (e.g. `torch`, `sklearn`, `numpy`) are allowed. +- You may submit as many times as you like during the Development Phase. +- The private test set is only revealed after the phase ends. diff --git a/pages/seed.md b/pages/seed.md index 9b15f6a..0610971 100644 --- a/pages/seed.md +++ b/pages/seed.md @@ -1,21 +1,85 @@ -# Seed: +# Seed — Starter Template +Copy this file as `submission.py` and implement your model inside `get_model`. + +The ingestion program will call `get_model(train_loader)` and expect back a trained +`torch.nn.Module` whose `forward(x)` returns probabilities in **[0, 1]**. + +```python +import torch +import torch.nn as nn + + +def get_model(train_loader): + """ + Train a model on the S&P 500 direction-forecasting task and return it. + + Parameters + ---------- + train_loader : torch.utils.data.DataLoader + Yields (x, y) batches where: + x — FloatTensor of shape (batch, 20, n_features) + A sliding window of the last 20 daily feature vectors. + Features: Open, High, Low, Close, Volume (current and past days). + y — FloatTensor of shape (batch,) + Binary label: 1 if today's close > previous close, else 0. 
+ + Returns + ------- + model : torch.nn.Module + Trained model in eval() mode. + forward(x) must accept shape (batch, 20, n_features) + and return probabilities in [0, 1] of shape (batch,). + The ingestion program applies a 0.5 threshold to produce 0/1 predictions. + """ + + # --- Infer input size from the first batch --- + x_sample, _ = next(iter(train_loader)) + input_size = x_sample.shape[-1] # number of features per timestep + seq_len = x_sample.shape[1] # window size (20) + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + # --- Define your model here --- + # Example: single-layer LSTM + linear head + sigmoid + class MyModel(nn.Module): + def __init__(self): + super().__init__() + self.lstm = nn.LSTM(input_size, hidden_size=64, + num_layers=1, batch_first=True) + self.head = nn.Linear(64, 1) + + def forward(self, x): + out, _ = self.lstm(x) # (batch, seq_len, 64) + last = out[:, -1, :] # (batch, 64) — last timestep + return torch.sigmoid(self.head(last).squeeze(-1)) # (batch,) + + model = MyModel().to(device) + optimizer = torch.optim.Adam(model.parameters(), lr=1e-3) + criterion = nn.BCELoss() # BCELoss because sigmoid is already applied + + # --- Training loop --- + N_EPOCHS = 10 + model.train() + for epoch in range(N_EPOCHS): + total_loss = 0.0 + for x, y in train_loader: + x, y = x.to(device), y.to(device) + optimizer.zero_grad() + probs = model(x) # (batch,) + loss = criterion(probs, y) + loss.backward() + optimizer.step() + total_loss += loss.item() + print(f"Epoch {epoch+1}/{N_EPOCHS} loss={total_loss/len(train_loader):.4f}") + + model.eval() + return model ``` -class Model: - def fit(self, X_train, y_train): - """ - This should handle the logic of training your model - :param X_train: np.array of training data - :param y_train: np.array of the same length as X_train. 
Contains classifications of X_train - """ - pass - - def predict(self, X_test): - """ - This should handle making predictions with a trained model - :param X_test: np.array of testing data - :return: np.array of the same length as X_test containing predictions to each point in X_test - """ - pass - -``` \ No newline at end of file + +## Tips + +- You can replace the LSTM with a GRU (`nn.GRU`), Transformer (`nn.TransformerEncoder`), or any other architecture. +- The window size is fixed at **20** timesteps by the ingestion program. +- Keep training time reasonable — the Codabench environment has limited CPU resources. +- You are free to add dropout, batch normalisation, learning rate schedulers, etc. diff --git a/pages/terms.md b/pages/terms.md index 0d69023..ece5c3b 100755 --- a/pages/terms.md +++ b/pages/terms.md @@ -1,18 +1,73 @@ # Terms and Conditions -## Lorem Ipsum - -Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. -Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure -dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non -proident, sunt in culpa qui officia deserunt mollit anim id est laborum. - -## Sed ut perspiciatis - -Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae -ab illo inventore veritatis et quasi architecto beatae vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit -aspernatur aut odit aut fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. Neque porro quisquam -est, qui dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore -et dolore magnam aliquam quaerat voluptatem. 
Ut enim ad minima veniam, quis nostrum exercitationem ullam corporis suscipit laboriosam, -nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure reprehenderit qui in ea voluptate velit esse quam nihil molestiae -consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla pariatur? +## 1. Eligibility + +This challenge is open to students enrolled in the DataCamp courses at École Polytechnique (INF554 / MAP583). Participation is voluntary and free of charge. + +Participants may compete individually or in teams, subject to the rules specified by the course instructors. +2. Data Usage + +The dataset provided for this challenge (historical S&P 500 daily OHLCV data sourced from public market data) is intended solely for educational and research purposes within the scope of this course. + +Participants agree to: + + Use the data only for the purpose of this challenge. + Not redistribute the data outside the course or upload it to public repositories. + Not attempt to identify, reverse-engineer, or misuse the data beyond its intended scientific context. + +3. Training and Evaluation Restrictions + +Participants must comply with the following rules: + + Only the provided training split may be used for training and validation. + The test and private test splits are reserved for evaluation only and must not be used in the training loop. + Any attempt to directly or indirectly train on evaluation data will result in disqualification. + +4. Submission Rules + +Participants must submit: + + A compressed folder containing the code necessary to train a model (see page seed.md). + Any accompanying code or documentation as specified by the instructors. + +Submissions must be the original work of the participants. +5. Academic Integrity + +Participants are expected to adhere to the École Polytechnique’s academic integrity policies. + +Specifically: + + Plagiarism, including copying code or solutions without proper attribution, is prohibited. 
+ The use of external libraries and pre-trained models is allowed unless otherwise stated, provided their use is clearly documented. + Collaboration between teams is not allowed unless explicitly permitted by the instructors. + +Violations of academic integrity rules may result in penalties, including disqualification or academic sanctions. +6. Intellectual Property + +Participants retain ownership of the code and models they develop as part of this challenge. + +By submitting their results, participants grant the course instructors and the hosting institution a non-exclusive, royalty-free right to: + + Use the submissions for grading and evaluation. + Use anonymized results or visualizations for teaching, presentations, or future course materials. + +7. Liability + +The organizers provide the data and evaluation infrastructure “as is” and make no guarantees regarding accuracy, completeness, or fitness for a particular purpose. + +The organizers are not responsible for: + + Technical issues, data loss, or submission failures. + Any damages or losses arising from participation in the challenge. + +8. Modification and Termination + +The organizers reserve the right to: + + Modify the challenge rules, datasets, or evaluation criteria if necessary. + Terminate the challenge in case of technical issues or unforeseen circumstances. + +Any changes will be communicated to participants in a timely manner. +9. Acceptance of Terms + +By participating in the challenge, participants acknowledge that they have read, understood, and agreed to these Terms and Conditions. diff --git a/pages/timeline.md b/pages/timeline.md index 4e613bf..015ea39 100644 --- a/pages/timeline.md +++ b/pages/timeline.md @@ -1,11 +1,28 @@ # Timeline -## Development phase -This phase should be used to tune your models, testing against a small -set of testing data - -## Final phase -Resubmit your preferred submission from the development phase to test -against a new set of testing data. 
Your model will be trained against -the same set of training data as in the development phase. You may only -make one submission to this phase, so choose wisely. +## Development Phase — October 7, 2025 → March 31, 2026 + +The development phase is open for the full duration of the course. + +- **Training data**: historical S&P 500 daily OHLCV data (roughly 2006–2022). +- **Public test set**: a held-out window of ~250 trading days (~2022–2023). + Your submission is scored against this set after every submission. +- **Submissions**: unlimited — iterate freely and track your progress on the leaderboard. +- **Goal**: tune your model architecture, features, and hyper-parameters to maximise ROC-AUC on the public test set. + +## Private Leaderboard — revealed at end of Development Phase + +Once the development phase closes on **March 31, 2026**, the private test set +(a further ~250 trading days, ~2023–2024) is scored for all submissions. +Final rankings are based on the **private test ROC-AUC**. + +The private test set is completely hidden during the development phase — optimising +solely for the public leaderboard may not generalise. 
+ +## Key Dates + +| Date | Event | +|------|-------| +| October 7, 2025 | Competition opens, development phase begins | +| March 31, 2026 | Development phase closes, no further submissions accepted | +| Early April 2026 | Private leaderboard revealed, final rankings published | diff --git a/raw_data/.gitkeep b/raw_data/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/requirements.txt b/requirements.txt index fda4dd6..41980e6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,11 @@ -pandas -scikit-learn +# ── Core runtime (ingestion + scoring + submission) ─────────────────────────── +numpy==2.2.6 +pandas==2.3.3 +scikit-learn==1.7.2 + +# PyTorch CPU build — participants may swap for a GPU wheel if needed +torch==2.8.0 + +# ── Local development tools ─────────────────────────────────────────────────── +# Required only for tools/run_docker.py (not installed inside the Docker image) +docker diff --git a/scoring_program/scoring.py b/scoring_program/scoring.py index 701f78f..971e3ec 100755 --- a/scoring_program/scoring.py +++ b/scoring_program/scoring.py @@ -2,40 +2,41 @@ from pathlib import Path import pandas as pd +from sklearn.metrics import roc_auc_score EVAL_SETS = ["test", "private_test"] -def compute_accuracy(predictions, targets): - # Make sure there is no NaN, as pandas ignores them in mean computation - predictions = predictions.fillna(-10).values - # Return mean of correct predictions - return (predictions == targets.values).mean() +def compute_roc_auc(predictions, targets): + # Make sure there is no NaN + predictions = predictions.fillna(0.5).values + # Return ROC AUC score + return roc_auc_score(targets, predictions) def main(reference_dir, prediction_dir, output_dir): scores = {} for eval_set in EVAL_SETS: - print(f'Scoring {eval_set}') + print(f"Scoring {eval_set}") predictions = pd.read_csv( - prediction_dir / f'{eval_set}_predictions.csv' - ) - targets = pd.read_csv( - reference_dir / f'{eval_set}_labels.csv' + 
prediction_dir / f"{eval_set}_predictions.csv" ) + targets = pd.read_csv(reference_dir / f"{eval_set}_labels.csv") - scores[eval_set] = float(compute_accuracy(predictions, targets)) + scores[eval_set] = float( + compute_roc_auc(predictions, targets["Target"].values) + ) # Add train and test times in the score - json_durations = (prediction_dir / 'metadata.json').read_text() + json_durations = (prediction_dir / "metadata.json").read_text() durations = json.loads(json_durations) scores.update(**durations) print(scores) # Write output scores output_dir.mkdir(parents=True, exist_ok=True) - (output_dir / 'scores.json').write_text(json.dumps(scores)) + (output_dir / "scores.json").write_text(json.dumps(scores)) if __name__ == "__main__": @@ -68,5 +69,5 @@ def main(reference_dir, prediction_dir, output_dir): main( Path(args.reference_dir), Path(args.prediction_dir), - Path(args.output_dir) + Path(args.output_dir), ) diff --git a/solution/submission.py b/solution/submission.py index a8076b0..436486b 100644 --- a/solution/submission.py +++ b/solution/submission.py @@ -1,7 +1,108 @@ -from sklearn.ensemble import RandomForestClassifier +""" +Reference LSTM baseline for the S&P 500 direction-forecasting challenge. +The ingestion program will call: -# The submission here should simply be a function that returns a model -# compatible with scikit-learn API -def get_model(): - return RandomForestClassifier() + model = get_model(train_loader) + +where `train_loader` is a torch.utils.data.DataLoader that yields +(x, y) batches with: + x : FloatTensor of shape (batch, WINDOW_SIZE, n_features) + y : FloatTensor of shape (batch,) — binary labels (1 = up, 0 = down) + +`get_model` must return a trained torch.nn.Module whose forward pass accepts +a tensor of shape (batch, WINDOW_SIZE, n_features) and returns probabilities +in [0, 1] of shape (batch,). The ingestion program applies a 0.5 threshold. 
+""" + +import torch +import torch.nn as nn + + +# ── Hyper-parameters (feel free to tune) ───────────────────────────────────── +HIDDEN_SIZE = 128 +NUM_LAYERS = 3 +DROPOUT = 0.1 +N_EPOCHS = 3 +LEARNING_RATE = 1e-4 +# ───────────────────────────────────────────────────────────────────────────── + + +class LSTMClassifier(nn.Module): + """Sequence-to-one LSTM for binary direction prediction. + + Takes a window of shape (batch, seq_len, input_size) and returns + a scalar logit per sample (shape: (batch,)). + + Architecture + ------------ + LSTM (num_layers, hidden_size, dropout) → hidden state of last timestep + → Linear(hidden_size → 1) → squeeze → Sigmoid → probability in [0, 1] + """ + + def __init__( + self, + input_size: int, + hidden_size: int = HIDDEN_SIZE, + num_layers: int = NUM_LAYERS, + dropout: float = DROPOUT, + ): + super().__init__() + self.lstm = nn.LSTM( + input_size=input_size, + hidden_size=hidden_size, + num_layers=num_layers, + batch_first=True, + dropout=dropout if num_layers > 1 else 0.0, + ) + self.head = nn.Linear(hidden_size, 1) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + # x: (batch, seq_len, input_size) + out, _ = self.lstm(x) # (batch, seq_len, hidden_size) + last = out[:, -1, :] # (batch, hidden_size) — last timestep + logit = self.head(last).squeeze(-1) # (batch,) + return torch.sigmoid(logit) # (batch,) — probability in [0, 1] + + +def get_model(train_loader: torch.utils.data.DataLoader) -> nn.Module: + """Train an LSTM on the provided DataLoader and return the trained model. + + Parameters + ---------- + train_loader : DataLoader + Yields (x, y) batches where x has shape (batch, WINDOW_SIZE, n_features) + and y has shape (batch,) with values in {0, 1}. + + Returns + ------- + model : nn.Module (in eval mode) + Trained LSTMClassifier whose forward pass returns probabilities in [0, 1]. 
+ """ + # Infer input size from the first batch + x_sample, _ = next(iter(train_loader)) + input_size = x_sample.shape[-1] # n_features + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + print(f"Training on: {device}") + + model = LSTMClassifier(input_size=input_size).to(device) + optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE) + criterion = nn.BCELoss() # model already applies sigmoid + + model.train() + for epoch in range(N_EPOCHS): + total_loss = 0.0 + for x, y in train_loader: + x, y = x.to(device), y.to(device) + optimizer.zero_grad() + probs = model(x) # (batch,) — probabilities in [0, 1] + loss = criterion(probs, y) + loss.backward() + optimizer.step() + total_loss += loss.item() + avg_loss = total_loss / len(train_loader) + print(f" Epoch {epoch + 1:>2}/{N_EPOCHS} loss={avg_loss:.4f}") + + model.eval() + return model diff --git a/tools/Dockerfile b/tools/Dockerfile index 8cb1eca..e3fa61a 100644 --- a/tools/Dockerfile +++ b/tools/Dockerfile @@ -1,20 +1,45 @@ -# Step 1: Start from an official Docker image with desired base environment -# Good starting points are the official codalab images or -# pytorch images with CUDA support: -# - Codalab: codalab/codalab-legacy:py39 -# - Codalab GPU: codalab/codalab-legacy:gpu310 -# - Pytorch: pytorch/pytorch:2.8.0-cuda12.6-cudnn9-runtime -FROM codalab/codalab-legacy:py39 +# ───────────────────────────────────────────────────────────────────────────── +# Codabench Docker image — S&P 500 Autoregressive Forecasting Challenge +# +# Codabench mounts the following directories at runtime: +# /app/input_data ← input splits (train/, test/, private_test/) +# /app/ingested_program ← participant's submission.py +# /app/ingestion_program ← ingestion.py (organiser code) +# /app/scoring_program ← scoring.py (organiser code) +# /app/output ← ingestion writes predictions here +# /app/input/ref ← reference labels (scoring) +# /app/input/res ← predictions to score (scoring) +# +# Build from the 
PROJECT ROOT, not from tools/: +# docker build -t sp500-challenge -f tools/Dockerfile . +# +# The build context must be the project root so that requirements.txt is +# accessible via COPY. +# ───────────────────────────────────────────────────────────────────────────── -# Set environment variables to prevent interactive prompts -ENV DEBIAN_FRONTEND=noninteractive +FROM python:3.11-slim -# Step 2: Install system-level dependencies (if any) -# e.g., git, wget, or common libraries for OpenCV like libgl1 -RUN pip install -U pip +ENV DEBIAN_FRONTEND=noninteractive \ + PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 -# Step 3: Copy and pre-install all Python dependencies -# This 'requirements.txt' file should list pandas, scikit-learn, timm, etc. -# Place it in the same directory as this Dockerfile. +# Minimal system deps: gcc is needed to compile some numpy/pandas C extensions +RUN apt-get update && apt-get install -y --no-install-recommends \ + gcc \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +# Pre-install Python dependencies. +# The 'docker' package is a local dev tool — skip it inside the container. +# torch is installed from the official CPU wheel index to avoid pulling the +# full CUDA build (~2 GB saved). 
COPY requirements.txt /tmp/requirements.txt -RUN pip install --no-cache-dir -r /tmp/requirements.txt +RUN pip install --no-cache-dir --upgrade pip \ + && grep -v "^docker" /tmp/requirements.txt \ + | grep -v "^#" \ + | grep -v "^$" \ + | grep -v "torch" \ + | pip install --no-cache-dir -r /dev/stdin \ + && pip install --no-cache-dir \ + torch==2.8.0 --index-url https://download.pytorch.org/whl/cpu diff --git a/tools/setup_data.py b/tools/setup_data.py index 5bdc3a9..125549b 100644 --- a/tools/setup_data.py +++ b/tools/setup_data.py @@ -1,53 +1,72 @@ -# Script to download the data from a given source and create the splits -# This is a mock version that generate fake problems +# Script to load the S&P500 data and create the splits for the benchmark from pathlib import Path -import numpy as np import pandas as pd -from sklearn.datasets import make_classification -from sklearn.model_selection import train_test_split -PHASE = 'dev_phase' +PHASE = "dev_phase" -DATA_DIR = Path(PHASE) / 'input_data' -REF_DIR = Path(PHASE) / 'reference_data' +DATA_DIR = Path(PHASE) / "input_data" +REF_DIR = Path(PHASE) / "reference_data" + +RAW_DATA_PATH = Path("raw_data") / "sp500_raw.csv" +TARGET_COL = "Target" def make_csv(data, filepath): filepath.parent.mkdir(parents=True, exist_ok=True) - pd.DataFrame(data).to_csv(filepath, index=False) + data.to_csv(filepath, index=True) # integer row index saved as first column if __name__ == "__main__": - import argparse - parser = argparse.ArgumentParser( - description='Load or generate data for the benchmark' - ) - parser.add_argument('--seed', type=int, default=42, - help='Random seed for data generation') - args = parser.parse_args() - - # Generate and split the data - rng = np.random.RandomState(args.seed) - X, y = make_classification(n_samples=500, n_features=5, random_state=rng) - X_train, X_test, y_train, y_test = train_test_split( - X, y, test_size=0.4, random_state=rng - ) - X_test, X_private_test, y_test, y_private_test = 
train_test_split( - X_test, y_test, test_size=0.5, random_state=rng - ) + # Load the S&P500 data + print(f"Loading data from {RAW_DATA_PATH}") + df = pd.read_csv(RAW_DATA_PATH) + + # Separate features and target; drop Date (not a model input) + y = df[TARGET_COL] + X = df.drop(columns=[TARGET_COL, "Date"]).reset_index(drop=True) + + n = len(df) + train_end = int(n * 0.6) + test_end = int(n * 0.8) + + # Split chronologically: 60% train, 20% test, 20% private_test + X_train, y_train = X.iloc[:train_end], y.iloc[:train_end] + X_test, y_test = X.iloc[train_end:test_end], y.iloc[train_end:test_end] + X_private_test, y_private_test = X.iloc[test_end:], y.iloc[test_end:] + + print(f"Dataset shape: {df.shape}") + print(f"Features: {X.shape[1]}, Samples: {n}") + print(f"Target distribution:\n{y.value_counts()}") # Store the data in the correct folders: # - input_data contains train data (both features and labels) and only # test features so the test labels are kept secret # - reference_data contains the test labels for scoring for split, X_split, y_split in [ - ('train', X_train, y_train), - ('test', X_test, y_test), - ('private_test', X_private_test, y_private_test), + ("train", X_train, y_train), + ("test", X_test, y_test), + ("private_test", X_private_test, y_private_test), ]: split_dir = DATA_DIR / split - make_csv(X_split, split_dir / f'{split}_features.csv') + make_csv(X_split, split_dir / f"{split}_features.csv") label_dir = split_dir if split == "train" else REF_DIR - make_csv(y_split, label_dir / f'{split}_labels.csv') \ No newline at end of file + make_csv( + pd.DataFrame({TARGET_COL: y_split}), + label_dir / f"{split}_labels.csv", + ) + + print("\nData splits created successfully!") + print( + f"{'Split':<15} {'Samples':<10} {'Index start':<15} {'Index end':<15}" + ) + print("-" * 55) + for split, X_split in [ + ("train", X_train), + ("test", X_test), + ("private_test", X_private_test), + ]: + print( + f"{split:<15} {len(X_split):<10} {X_split.index[0]:<15} 
{X_split.index[-1]:<15}" + )