2 changes: 2 additions & 0 deletions .gitignore
@@ -6,3 +6,5 @@
ingestion_res/*
scoring_res/*
dev_phase/*
test_notebook/*
raw_data/*
48 changes: 3 additions & 45 deletions README.md
@@ -67,7 +67,9 @@ public and private test set.
Make sure that the `setup_data.py` script has been run to generate the data for
the competition.

Then, run the `create_bundle.py` script to create the codabench bundle archive:
First, run the `tools/create_starting_kit.py` script

Then, run the `tools/create_bundle.py` script to create the codabench bundle archive:

```bash
python tools/create_bundle.py
```

@@ -89,47 +91,3 @@

To test the scoring program, run:

```bash
python scoring_program/scoring.py --reference-dir dev_phase/reference_data/ --output-dir scoring_res --prediction-dir ingestion_res/
```


### Setting up and testing the docker image

For convenience, a python script `tools/run_docker.py` is provided to build
the docker image, and run the ingestion and scoring programs inside the docker
container.
This script requires installing the `docker` python package, which can be done via pip:

```bash
pip install docker
python tools/run_docker.py
```

You can also perform these steps manually.
You first need to build the docker image locally from the `Dockerfile` with:

```bash
docker build -t docker-image tools
```

To test the docker image locally, run:

```bash
docker run --rm -u root \
-v "./ingestion_program":"/app/ingestion_program" \
-v "./dev_phase/input_data":/app/input_data \
-v "./ingestion_res":/app/output \
-v "./solution":/app/ingested_program \
--name ingestion docker-image \
python /app/ingestion_program/ingestion.py

docker run --rm -u root \
-v "./scoring_program":"/app/scoring_program" \
-v "./dev_phase/reference_data":/app/input/ref \
-v "./ingestion_res":/app/input/res \
-v "./scoring_res":/app/output \
--name scoring docker-image \
python /app/scoring_program/scoring.py
```

### CI for the bundle

This repo defines a CI for the bundle, which builds a docker image from the `tools/Dockerfile`
and tries to run `tools/setup_data.py` and then the ingestion/scoring programs.
70 changes: 38 additions & 32 deletions competition.yaml
@@ -1,60 +1,66 @@
version: 2
title: Templat competition - Dummy classification
description: Dummy classification task
title: "GEMS-GER Groundwater Level Prediction Challenge"
description: "Predict groundwater levels (GWL) across Germany."
image: logo.png
registration_auto_approve: False # if True, do not require approval from admin to join the comp

registration_auto_approve: True
terms: pages/terms.md

pages:
- title: Overview
file: pages/overview.md
- title: Data Description
file: pages/data.md
- title: Participation
file: pages/participate.md
- title: Seed
file: pages/seed.md
- title: Timeline
file: pages/timeline.md

tasks:
- index: 0
name: Developement Task
description: 'Tune models with training data, test against examples contained in public test data'
input_data: dev_phase/input_data/
reference_data: dev_phase/reference_data/
name: "Development Task"
# Notice we left the data out! We will add it in the UI later.
scoring_program: scoring_program/
ingestion_program: ingestion_program/
- index: 1
name: "Final Task"
scoring_program: scoring_program/
ingestion_program: ingestion_program/

solutions:
phases:
- index: 0
tasks:
- 0
path: solution/

name: "Development Phase"
start: 2026-02-13 12:00:00
end: 2026-04-14 00:00:00
max_submissions: 100
max_submissions_per_day: 5
starting_kit: starting_kit.zip
# public_data removed to prevent server crashes
tasks: [0]

phases:
- name: Development Phase
description: 'Development phase: tune your models.'
start: 10-07-2025
end: 03-31-2026
tasks:
- 0
- index: 1
name: "Final Phase"
start: 2026-04-15 00:00:00 # Added the required timestamp!
max_submissions: 1
auto_migrate_to_this_phase: True
tasks: [1]

leaderboards:
- title: Results
key: main
columns:
- title: Test Accuracy
key: test
- title: RMSE
key: rmse
index: 0
sorting: asc
- title: Private Test Accuracy
key: private_test
index: 1
sorting: asc
hidden: True
- title: Train time
- title: Train Time (s)
key: train_time
index: 2
index: 1
sorting: desc
- title: Test time
- title: Test Time (s)
key: test_time
index: 2
sorting: desc
- title: Total Duration
key: duration
index: 3
sorting: desc
133 changes: 69 additions & 64 deletions ingestion_program/ingestion.py
@@ -2,91 +2,96 @@
import sys
import time
from pathlib import Path

import pandas as pd


EVAL_SETS = ["test", "private_test"]

# We only have one test set now
EVAL_SETS = ["test"]

def evaluate_model(model, X_test):

"""
Generate predictions.
Note: We pass the RAW X_test to the model.
The participant's pipeline must handle dropping strings!
"""
y_pred = model.predict(X_test)
return pd.DataFrame(y_pred)


def get_train_data(data_dir):
data_dir = Path(data_dir)
training_dir = data_dir / "train"
X_train = pd.read_csv(training_dir / "train_features.csv")
y_train = pd.read_csv(training_dir / "train_labels.csv")
return pd.DataFrame({'GWL': y_pred})

def get_train_data(data_dir, chunksize=200000):
train_path = Path(data_dir) / "train" / "train.csv"
chunks = []

print(f"Reading training data in chunks of {chunksize}...")
# Use chunksize to keep RAM usage low during load
for chunk in pd.read_csv(train_path, chunksize=chunksize):
# Optional: Optimize float precision to save more RAM
for col in chunk.select_dtypes(include=['float64']).columns:
chunk[col] = chunk[col].astype('float32')
chunks.append(chunk)

full_df = pd.concat(chunks, axis=0)
y_train = full_df['GWL']
X_train = full_df.drop(columns=['GWL'])

return X_train, y_train


def main(data_dir, output_dir):
# Here, you can import info from the submission module, to evaluate the
# submission
# Import the participant's model
from submission import get_model

print("--- 1. Loading Training Data ---")
X_train, y_train = get_train_data(data_dir)

print("Training the model")

print("--- 2. Training the Model ---")
model = get_model()

start = time.time()
start_train = time.time()
model.fit(X_train, y_train)
train_time = time.time() - start
print("-" * 10)
print("Evaluate the model")
start = time.time()
res = {}
for eval_set in EVAL_SETS:
X_test = pd.read_csv(data_dir / eval_set / f"{eval_set}_features.csv")
res[eval_set] = evaluate_model(model, X_test)
test_time = time.time() - start
print("-" * 10)
duration = train_time + test_time
print(f"Completed Prediction. Total duration: {duration}")

# Write output files
train_time = time.time() - start_train
print(f"Training completed in {train_time:.2f}s")

print("--- 3. Evaluating on Test Set ---")
start_test = time.time()

# Load test features (Matches setup_data.py output)
X_test_path = data_dir / "test" / "test_features.csv"
X_test = pd.read_csv(X_test_path)

y_test_pred = evaluate_model(model, X_test)

test_time = time.time() - start_test
print(f"Testing completed in {test_time:.2f}s")

# --- 4. Write Output Files ---
output_dir.mkdir(parents=True, exist_ok=True)
with open(output_dir / "metadata.json", "w+") as f:
json.dump(dict(train_time=train_time, test_time=test_time), f)
for eval_set in EVAL_SETS:
filepath = output_dir / f"{eval_set}_predictions.csv"
res[eval_set].to_csv(filepath, index=False)
print()
print("Ingestion Program finished. Moving on to scoring")


# Save metadata for the leaderboard (Runtime)
with open(output_dir / "metadata.json", "w") as f:
json.dump({
"train_time": train_time,
"test_time": test_time,
"duration": train_time + test_time
}, f)

# Save predictions (Matches what scoring.py expects)
y_test_pred.to_csv(output_dir / "test_predictions.csv", index=False)

print(f"Ingestion finished. Total duration: {train_time + test_time:.2f}s")

if __name__ == "__main__":
import argparse

parser = argparse.ArgumentParser(
description="Ingestion program for codabench"
)
parser.add_argument(
"--data-dir",
type=str,
default="/app/input_data",
help="",
)
parser.add_argument(
"--output-dir",
type=str,
default="/app/output",
help="",
)
parser.add_argument(
"--submission-dir",
type=str,
default="/app/ingested_program",
help="",
)
parser = argparse.ArgumentParser(description="Ingestion program")

# Changed from optional flags (--flag) to positional arguments
# to match the Codabench backend command format
parser.add_argument("data_dir", type=str, help="Path to input data")
parser.add_argument("output_dir", type=str, help="Path to output results")
parser.add_argument("program_dir", type=str, help="Path to the ingestion program")
parser.add_argument("submission_dir", type=str, help="Path to the participant's submission")

args = parser.parse_args()

# Add submission and current folder to path so we can find 'submission.py'
sys.path.append(args.submission_dir)
sys.path.append(str(Path(__file__).parent.resolve()))  # sys.path entries must be str, not Path
sys.path.append(args.program_dir)

main(Path(args.data_dir), Path(args.output_dir))
2 changes: 1 addition & 1 deletion ingestion_program/metadata.yaml
@@ -1 +1 @@
command: python3 ingestion.py
command: python3 ingestion.py /app/input_data/ /app/output/ /app/program /app/ingested_program
34 changes: 34 additions & 0 deletions pages/data.md
@@ -0,0 +1,34 @@
# Data Description

This page provides details about the GEMS-GER dataset and the submission format.

## Dataset Overview
- **Total Samples:** 1,000,000 historical records.
- **Goal:** Predict groundwater depth (**GWL**) based on environmental factors.

## File Structure
The competition data is organized as follows:

- **Training Data:** `dev_phase/input_data/train/train.csv`
- Contains features and the target variable `GWL`.
- **Test Features:** `dev_phase/input_data/test/test_features.csv`
- Contains features for which you must provide predictions.
- **Sample Submission:** `dev_phase/input_data/test/sample_submission.csv`
- A template file showing the exact format required for your results.

## Features and Target Variable
### Input Features (X)
Includes daily precipitation, temperature, soil moisture, and static well characteristics.

### Target Variable (y)
- **GWL:** Depth to groundwater in meters.

## 📋 Submission Format
Your model must output a CSV file that matches the structure of the sample found in the `test` folder.
- **Columns:** `Id` (matching the test feature index) and `GWL` (your prediction).
- **Header:** The file must include the header row `Id,GWL`.
- **Example:**
```csv
Id,GWL
0,5.23
1,5.45
```
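A compliant submission file can be written with pandas. A minimal sketch, assuming predictions are already in an array (the `preds` values and the output filename here are placeholders, not mandated by the competition):

```python
import numpy as np
import pandas as pd

# Placeholder predictions; replace with your model's output on the test set.
preds = np.round(np.random.uniform(4.0, 7.0, size=3), 2)

# Id must match the test feature index; GWL holds the predicted depth.
submission = pd.DataFrame({"Id": range(len(preds)), "GWL": preds})

# index=False keeps the header row exactly "Id,GWL" with no extra column.
submission.to_csv("test_predictions.csv", index=False)
```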