diff --git a/.gitignore b/.gitignore index 9728749..4c2f2aa 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,4 @@ ingestion_res/* scoring_res/* dev_phase/* +*.pth diff --git a/competition.yaml b/competition.yaml index 3a1e4de..44a4a74 100755 --- a/competition.yaml +++ b/competition.yaml @@ -1,8 +1,26 @@ version: 2 -title: Templat competition - Dummy classification -description: Dummy classification task +title: "Autoregressive Forecasting of the S&P 500 Index" +description: > + Can you predict whether the S&P 500 will close up or down — using only what you know by mid-morning? + + Each trading day, participants receive a feature vector built from: + - Intraday morning signals: the day's open price and early price action + (e.g. open-to-first-hour return, morning high/low range, opening gap vs previous close). + - Historical context: past N days of daily OHLCV data, log-returns, and + rolling statistics (volatility, momentum) up to and including the previous close. + + The target label is binary: **1** if the day's close is strictly above the previous close, + **0** otherwise. No look-ahead is permitted — only information available before noon (ET) + may be used as features for the current day. + + Participants submit a PyTorch model via a `submission.py` file + exposing a `get_model(train_loader)` function. The model is trained server-side on historical + data and evaluated on a held-out test window using **ROC AUC** + (area under the ROC curve of the predicted up-probabilities). + + This is a DataCamp challenge organised at École Polytechnique (INF554 / MAP583). 
image: logo.png -registration_auto_approve: False # if True, do not require approval from admin to join the comp +registration_auto_approve: False # set to True to skip manual approval terms: pages/terms.md pages: @@ -15,8 +33,15 @@ pages: tasks: - index: 0 - name: Developement Task - description: 'Tune models with training data, test against examples contained in public test data' + name: Development Task + description: > + Same-day close direction forecasting of the S&P 500 using morning information. + Each sample consists of: (i) intraday morning features for the current trading day + (opening gap, open price, early price action) and (ii) historical daily features + from the past N sessions (log-returns, OHLCV, rolling volatility, momentum). + The label is 1 if today's close > previous close, 0 otherwise. + No information after the morning window may be used; models are scored on + ROC AUC over a public held-out test window. input_data: dev_phase/input_data/ reference_data: dev_phase/reference_data/ ingestion_program: ingestion_program/ @@ -25,13 +50,15 @@ tasks: solutions: - index: 0 tasks: - - 0 + - 0 path: solution/ - phases: - name: Development Phase - description: 'Development phase: tune your models.' + description: > + Tune and validate your autoregressive model using the provided historical + S&P 500 training data. Your predictions are scored against a public test set + so you can iterate quickly. Unlimited submissions are allowed in this phase. 
import time
from pathlib import Path

import numpy as np
import pandas as pd
import torch

# Number of past trading days fed as a sequence to the model.
# Must be consistent between training and inference.
WINDOW_SIZE = 50

EVAL_SETS = ["test", "private_test"]


class SP500Dataset(torch.utils.data.Dataset):
    """PyTorch Dataset for the S&P 500 direction-forecasting challenge.

    Each sample is a sliding window of shape (WINDOW_SIZE, n_features)
    ending at day `idx`. The target is the binary label of that last day
    (1 = close > prev_close, 0 otherwise).

    For the first WINDOW_SIZE-1 days, the window is left-padded with zeros.

    Parameters
    ----------
    features_path : Path
        Path to the features CSV (columns = feature names, rows = trading days
        in chronological order).
    labels_path : Path or None
        Path to the labels CSV (single column, same row order as features).
        Pass None for test sets where labels are withheld.
    window_size : int
        Number of past days (inclusive of the current day) in each sequence.
    """

    def __init__(self, features_path, labels_path=None, window_size=WINDOW_SIZE):
        self.window_size = window_size
        # index_col=0: the first column is the row index saved by setup_data.py,
        # not a feature — must be excluded from the data arrays.
        self.X = pd.read_csv(features_path, index_col=0).values.astype(np.float32)
        self.n_features = self.X.shape[1]
        if labels_path is not None:
            # Labels CSV also carries the saved row index as its first column.
            self.y = (
                pd.read_csv(labels_path, index_col=0)
                .values.astype(np.float32)
                .ravel()
            )
        else:
            self.y = None  # test mode — labels are unknown

    def __len__(self):
        # One sample per trading day (early days get zero-padded windows).
        return len(self.X)

    def __getitem__(self, idx):
        """Return (window, label) where window has shape (window_size, n_features).

        The label is the binary target for day `idx` (the last day of the window).
        During test mode (no labels), only the window tensor is returned.
        """
        window_start = max(0, idx - self.window_size + 1)
        window = self.X[window_start : idx + 1]  # (<=window_size, n_features)

        # Left-pad with zeros if we are at the beginning of the series
        if len(window) < self.window_size:
            padding = np.zeros(
                (self.window_size - len(window), self.n_features),
                dtype=np.float32,
            )
            window = np.concatenate([padding, window], axis=0)

        x = torch.tensor(window, dtype=torch.float32)  # (window_size, n_features)

        if self.y is not None:
            y = torch.tensor(self.y[idx], dtype=torch.float32)  # scalar
            return x, y
        return x  # test mode


def get_train_dataset(data_dir):
    """Build the training Dataset from separate features and labels CSVs."""
    data_dir = Path(data_dir)
    features_path = data_dir / "train" / "train_features.csv"
    labels_path = data_dir / "train" / "train_labels.csv"
    return SP500Dataset(features_path, labels_path)


def get_test_dataset(data_dir, eval_set):
    """Build a test Dataset (no labels) for a given evaluation split."""
    data_dir = Path(data_dir)
    features_path = data_dir / eval_set / f"{eval_set}_features.csv"
    return SP500Dataset(features_path, labels_path=None)


def evaluate_model(model, test_dataset):
    """Run inference over a test Dataset and return a DataFrame of probabilities.

    The model outputs probabilities in [0, 1] (sigmoid already applied).
    The scoring program computes ROC AUC directly from these probabilities;
    no decision threshold is applied anywhere in the pipeline.
    """
    device = next(model.parameters()).device
    loader = torch.utils.data.DataLoader(
        test_dataset, batch_size=64, shuffle=False
    )
    probs = []
    model.eval()
    with torch.no_grad():
        for x in loader:
            # test_dataset returns bare tensors (no label) — x is already the input
            x = x.to(device)
            batch_probs = model(x).cpu().numpy().tolist()  # floats in [0, 1]
            probs.extend(batch_probs)
    return pd.DataFrame({"Probability": probs})
print("=" * 40) print("Evaluate the model") start = time.time() res = {} for eval_set in EVAL_SETS: - X_test = pd.read_csv(data_dir / eval_set / f"{eval_set}_features.csv") - res[eval_set] = evaluate_model(model, X_test) + test_dataset = get_test_dataset(data_dir, eval_set) + res[eval_set] = evaluate_model(model, test_dataset) test_time = time.time() - start - print("-" * 10) - duration = train_time + test_time - print(f"Completed Prediction. Total duration: {duration}") + print( + f"Completed Prediction. Total duration: {train_time + test_time:.1f}s" + ) - # Write output files + # ── Write outputs ───────────────────────────────────────────────────────── output_dir.mkdir(parents=True, exist_ok=True) with open(output_dir / "metadata.json", "w+") as f: json.dump(dict(train_time=train_time, test_time=test_time), f) @@ -69,19 +172,19 @@ def main(data_dir, output_dir): parser.add_argument( "--data-dir", type=str, - default="/app/input_data", - help="", + default="dev_phase/input_data", + help="Root folder containing train/, test/, and private_test/ splits.", ) parser.add_argument( "--output-dir", type=str, - default="/app/output", - help="", + default="ingestion_res", + help="Folder where prediction CSVs and metadata.json will be written.", ) parser.add_argument( "--submission-dir", type=str, - default="/app/ingested_program", + default="solution", help="", ) diff --git a/logo.png b/logo.png index 9616456..5255f04 100644 Binary files a/logo.png and b/logo.png differ diff --git a/raw_data/.gitkeep b/raw_data/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/requirements.txt b/requirements.txt index fda4dd6..41980e6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,11 @@ -pandas -scikit-learn +# ── Core runtime (ingestion + scoring + submission) ─────────────────────────── +numpy==2.2.6 +pandas==2.3.3 +scikit-learn==1.7.2 + +# PyTorch CPU build — participants may swap for a GPU wheel if needed +torch==2.8.0 + +# ── Local development tools 
from sklearn.metrics import roc_auc_score

EVAL_SETS = ["test", "private_test"]


def compute_roc_auc(predictions, targets):
    """Return the ROC AUC of predicted probabilities against binary labels.

    Parameters
    ----------
    predictions : pd.DataFrame
        Predictions CSV as read from disk. The probabilities are taken from
        the LAST column, so a leading saved row-index column (if present)
        is ignored.
    targets : pd.DataFrame
        Labels CSV as read from disk. The labels are taken from the LAST
        column for the same reason — setup_data.py writes the integer row
        index as the first CSV column.

    Returns
    -------
    float
        ROC AUC in [0, 1]; higher is better.
    """
    # NaN probabilities are mapped to the non-informative value 0.5.
    y_score = predictions.iloc[:, -1].fillna(0.5).to_numpy()
    y_true = targets.iloc[:, -1].to_numpy()
    # Pass 1-D arrays: feeding raw 2-D .values (as before) could include the
    # saved index column as data and trip sklearn's input validation.
    return roc_auc_score(y_true, y_score)


def main(reference_dir, prediction_dir, output_dir):
    """Score both evaluation splits and write scores.json.

    Reads `{split}_predictions.csv` from prediction_dir and
    `{split}_labels.csv` from reference_dir for each split in EVAL_SETS,
    merges in the train/test durations from metadata.json, and writes the
    combined dict to output_dir/scores.json.
    """
    scores = {}
    for eval_set in EVAL_SETS:
        print(f"Scoring {eval_set}")
        predictions = pd.read_csv(prediction_dir / f"{eval_set}_predictions.csv")
        targets = pd.read_csv(reference_dir / f"{eval_set}_labels.csv")

        scores[eval_set] = float(compute_roc_auc(predictions, targets))

    # Add train and test times in the score
    json_durations = (prediction_dir / "metadata.json").read_text()
    durations = json.loads(json_durations)
    scores.update(**durations)
    print(scores)

    # Write output scores
    output_dir.mkdir(parents=True, exist_ok=True)
    (output_dir / "scores.json").write_text(json.dumps(scores))
"""
Reference LSTM baseline for the S&P 500 direction-forecasting challenge.

The ingestion program will call:

    model = get_model(train_loader)

where `train_loader` is a torch.utils.data.DataLoader that yields
(x, y) batches with:

    x : FloatTensor of shape (batch, WINDOW_SIZE, n_features)
    y : FloatTensor of shape (batch,) — binary labels (1 = up, 0 = down)

`get_model` must return a trained torch.nn.Module whose forward pass accepts
a tensor of shape (batch, WINDOW_SIZE, n_features) and returns probabilities
in [0, 1] of shape (batch,). The scoring program computes ROC AUC directly
from these probabilities — no 0.5 decision threshold is applied anywhere in
the pipeline.
"""

import torch
import torch.nn as nn


# ── Hyper-parameters (feel free to tune) ─────────────────────────────────────
HIDDEN_SIZE = 128
NUM_LAYERS = 3
DROPOUT = 0.1
N_EPOCHS = 3
LEARNING_RATE = 1e-4
# ─────────────────────────────────────────────────────────────────────────────


class LSTMClassifier(nn.Module):
    """Sequence-to-one LSTM for binary direction prediction.

    Takes a window of shape (batch, seq_len, input_size) and returns
    a probability per sample (shape: (batch,)).

    Architecture
    ------------
    LSTM (num_layers, hidden_size, dropout) → hidden state of last timestep
    → Linear(hidden_size → 1) → squeeze → Sigmoid → probability in [0, 1]
    """

    def __init__(
        self,
        input_size: int,
        hidden_size: int = HIDDEN_SIZE,
        num_layers: int = NUM_LAYERS,
        dropout: float = DROPOUT,
    ):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            # nn.LSTM warns if dropout is set with a single layer.
            dropout=dropout if num_layers > 1 else 0.0,
        )
        self.head = nn.Linear(hidden_size, 1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (batch, seq_len, input_size)
        out, _ = self.lstm(x)  # (batch, seq_len, hidden_size)
        last = out[:, -1, :]  # (batch, hidden_size) — last timestep
        logit = self.head(last).squeeze(-1)  # (batch,)
        return torch.sigmoid(logit)  # (batch,) — probability in [0, 1]


def get_model(train_loader: torch.utils.data.DataLoader) -> nn.Module:
    """Train an LSTM on the provided DataLoader and return the trained model.

    Parameters
    ----------
    train_loader : DataLoader
        Yields (x, y) batches where x has shape (batch, WINDOW_SIZE, n_features)
        and y has shape (batch,) with values in {0, 1}.

    Returns
    -------
    model : nn.Module (in eval mode)
        Trained LSTMClassifier whose forward pass returns probabilities in [0, 1].
    """
    # Infer input size from the first batch
    x_sample, _ = next(iter(train_loader))
    input_size = x_sample.shape[-1]  # n_features

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Training on: {device}")

    model = LSTMClassifier(input_size=input_size).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
    criterion = nn.BCELoss()  # model already applies sigmoid

    model.train()
    for epoch in range(N_EPOCHS):
        total_loss = 0.0
        for x, y in train_loader:
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            probs = model(x)  # (batch,) — probabilities in [0, 1]
            loss = criterion(probs, y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        avg_loss = total_loss / len(train_loader)
        print(f" Epoch {epoch + 1:>2}/{N_EPOCHS} loss={avg_loss:.4f}")

    model.eval()
    return model
# Script to load the S&P500 data and create the splits for the benchmark
from pathlib import Path

import pandas as pd

PHASE = "dev_phase"

DATA_DIR = Path(PHASE) / "input_data"
REF_DIR = Path(PHASE) / "reference_data"

RAW_DATA_PATH = Path("raw_data") / "sp500_raw.csv"
TARGET_COL = "Target"


def make_csv(data, filepath):
    """Write *data* to *filepath* as CSV, creating parent folders as needed.

    The integer row index is kept (saved as the first CSV column) so that
    downstream readers can restore each row's chronological position with
    index_col=0.
    """
    out_path = Path(filepath)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    data.to_csv(out_path, index=True)


if __name__ == "__main__":
    # Load the raw S&P500 table.
    print(f"Loading data from {RAW_DATA_PATH}")
    df = pd.read_csv(RAW_DATA_PATH)

    # Target column out; Date is metadata, not a model input.
    y = df[TARGET_COL]
    X = df.drop(columns=[TARGET_COL, "Date"]).reset_index(drop=True)

    n = len(df)
    cut_train = int(n * 0.6)
    cut_test = int(n * 0.8)

    # Chronological split: 60% train, 20% public test, 20% private test
    # (insertion order of this dict fixes the processing order below).
    splits = {
        "train": (X.iloc[:cut_train], y.iloc[:cut_train]),
        "test": (X.iloc[cut_train:cut_test], y.iloc[cut_train:cut_test]),
        "private_test": (X.iloc[cut_test:], y.iloc[cut_test:]),
    }

    print(f"Dataset shape: {df.shape}")
    print(f"Features: {X.shape[1]}, Samples: {n}")
    print(f"Target distribution:\n{y.value_counts()}")

    # Store the data in the correct folders:
    # - input_data contains train data (both features and labels) and only
    #   test features so the test labels are kept secret
    # - reference_data contains the test labels for scoring
    for split, (X_split, y_split) in splits.items():
        split_dir = DATA_DIR / split
        make_csv(X_split, split_dir / f"{split}_features.csv")
        label_dir = split_dir if split == "train" else REF_DIR
        make_csv(
            pd.DataFrame({TARGET_COL: y_split}),
            label_dir / f"{split}_labels.csv",
        )

    print("\nData splits created successfully!")
    print(
        f"{'Split':<15} {'Samples':<10} {'Index start':<15} {'Index end':<15}"
    )
    print("-" * 55)
    for split, (X_split, _) in splits.items():
        print(
            f"{split:<15} {len(X_split):<10} {X_split.index[0]:<15} {X_split.index[-1]:<15}"
        )