From 1e40be17ccd9baf07dd5ed6ff070afe852770a40 Mon Sep 17 00:00:00 2001
From: Abhishek <abhishekup082gmail.com@Abhisheks-MacBook-Air.local>
Date: Thu, 26 Feb 2026 23:02:55 +0530
Subject: [PATCH 1/6] [MNT] Diagnose and address long test runtimes (#1633)

- Add global per-test timeout (600s) to pytest config
- CI: report all test durations (--durations=0) for diagnosis
- CI: add explicit --timeout=600 to prevent hanging tests
- Optimize verify_cache_state fixture: scope function -> module
- Add scripts/profile_tests.sh for local duration profiling
---
 .github/workflows/test.yml |  6 +++---
 pyproject.toml             |  1 +
 scripts/profile_tests.sh   | 27 +++++++++++++++++++++++++++
 tests/conftest.py          | 10 ++++++----
 4 files changed, 37 insertions(+), 7 deletions(-)
 create mode 100755 scripts/profile_tests.sh

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index dc0995fc6..44fccc2e7 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -153,7 +153,7 @@ jobs:
           marks="not production_server"
         fi
 
-        pytest -n 4 --durations=20 --dist load -sv $codecov -o log_cli=true -m "$marks"
+        pytest -n 4 --durations=0 --timeout=600 --dist load -sv $codecov -o log_cli=true -m "$marks"
 
     - name: Run tests on Ubuntu Production
       if: matrix.os == 'ubuntu-latest'
@@ -171,14 +171,14 @@ jobs:
           marks="production_server"
         fi
 
-        pytest -n 4 --durations=20 --dist load -sv $codecov -o log_cli=true -m "$marks"
+        pytest -n 4 --durations=0 --timeout=600 --dist load -sv $codecov -o log_cli=true -m "$marks"
 
     - name: Run tests on Windows
       if: matrix.os == 'windows-latest'
       env:
         OPENML_TEST_SERVER_ADMIN_KEY: ${{ secrets.OPENML_TEST_SERVER_ADMIN_KEY }}
       run: |  # we need a separate step because of the bash-specific if-statement in the previous one.
-        pytest -n 4 --durations=20 --dist load -sv --reruns 5 --reruns-delay 1 -m "not test_server"
+        pytest -n 4 --durations=0 --timeout=600 --dist load -sv --reruns 5 --reruns-delay 1 -m "not test_server"
 
     - name: Upload coverage
       if: matrix.code-cov && always()
diff --git a/pyproject.toml b/pyproject.toml
index 8c463968b..91235ba04 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -130,6 +130,7 @@ log_level="DEBUG"
 testpaths = ["tests"]
 minversion = "7.0"
 xfail_strict = true
+timeout = 600
 filterwarnings=[
     "ignore:the matrix subclass:PendingDeprecationWarning"
 ]
diff --git a/scripts/profile_tests.sh b/scripts/profile_tests.sh
new file mode 100755
index 000000000..593700cff
--- /dev/null
+++ b/scripts/profile_tests.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+# Profile test durations to diagnose slow tests (Issue #1633)
+# Usage: ./scripts/profile_tests.sh [marker_filter]
+#
+# Examples:
+#   ./scripts/profile_tests.sh                               # non-server tests
+#   ./scripts/profile_tests.sh "production_server"            # production server tests only
+#   ./scripts/profile_tests.sh "sklearn"                      # sklearn tests only
+
+set -euo pipefail
+
+MARKER_FILTER="${1:-not production_server and not test_server}"
+
+echo "=== OpenML Test Duration Profiler ==="
+echo "Marker filter: $MARKER_FILTER"
+echo "Timeout per test: 300s"
+echo ""
+
+pytest \
+  --durations=0 \
+  --timeout=300 \
+  -q \
+  -m "$MARKER_FILTER" \
+  2>&1 | tee test_durations_report.txt
+
+echo ""
+echo "=== Report saved to test_durations_report.txt ==="
diff --git a/tests/conftest.py b/tests/conftest.py
index 1359e6247..bbb486b3d 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -106,7 +106,8 @@ def delete_remote_files(tracker, flow_names) -> None:
     if "flow" in tracker:
         to_sort = list(zip(tracker["flow"], flow_names))
         flow_deletion_order = [
-            entity_id for entity_id, _ in sorted(to_sort, key=lambda x: len(x[1]), reverse=True)
+            entity_id
+            for entity_id, _ in sorted(to_sort, key=lambda x: len(x[1]), reverse=True)
         ]
         tracker["flow"] = [flow_deletion_order[1] for flow_id, _ in flow_deletion_order]
 
@@ -275,7 +276,7 @@ def test_apikey_v2() -> str:
     return openml.config.get_test_servers()[APIVersion.V2]["apikey"]
 
 
-@pytest.fixture(autouse=True, scope="function")
+@pytest.fixture(autouse=True, scope="module")
 def verify_cache_state(test_files_directory) -> Iterator[None]:
     assert_static_test_cache_correct(test_files_directory)
     yield
@@ -324,11 +325,12 @@ def with_test_cache(test_files_directory, request):
     openml.config.set_root_cache_directory(_root_cache_directory)
     if tmp_cache.exists():
         shutil.rmtree(tmp_cache)
-        
+
 
 @pytest.fixture
 def static_cache_dir():
-    return Path(__file__).parent / "files" 
+    return Path(__file__).parent / "files"
+
 
 @pytest.fixture
 def workdir(tmp_path):

From 0644d2c1f901ea5e264a1b2ef91a519c7ff02794 Mon Sep 17 00:00:00 2001
From: Abhishek <abhishekup082gmail.com@Abhisheks-MacBook-Air.local>
Date: Sun, 1 Mar 2026 21:36:51 +0530
Subject: [PATCH 2/6] Address review feedback: revert CI/conftest changes,
 improve profile script

- Revert CI workflow to original --durations=20 (no timeout)
- Remove global timeout from pyproject.toml
- Revert conftest.py verify_cache_state scope to function
- Update profile_tests.sh: accept CLI args (-m, -d, -t, -o) with defaults
---
 .github/workflows/test.yml |  6 +++---
 pyproject.toml             |  1 -
 scripts/profile_tests.sh   | 44 +++++++++++++++++++++++++++++---------
 tests/conftest.py          |  2 +-
 4 files changed, 38 insertions(+), 15 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 44fccc2e7..dc0995fc6 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -153,7 +153,7 @@ jobs:
           marks="not production_server"
         fi
 
-        pytest -n 4 --durations=0 --timeout=600 --dist load -sv $codecov -o log_cli=true -m "$marks"
+        pytest -n 4 --durations=20 --dist load -sv $codecov -o log_cli=true -m "$marks"
 
     - name: Run tests on Ubuntu Production
       if: matrix.os == 'ubuntu-latest'
@@ -171,14 +171,14 @@ jobs:
           marks="production_server"
         fi
 
-        pytest -n 4 --durations=0 --timeout=600 --dist load -sv $codecov -o log_cli=true -m "$marks"
+        pytest -n 4 --durations=20 --dist load -sv $codecov -o log_cli=true -m "$marks"
 
     - name: Run tests on Windows
       if: matrix.os == 'windows-latest'
       env:
         OPENML_TEST_SERVER_ADMIN_KEY: ${{ secrets.OPENML_TEST_SERVER_ADMIN_KEY }}
       run: |  # we need a separate step because of the bash-specific if-statement in the previous one.
-        pytest -n 4 --durations=0 --timeout=600 --dist load -sv --reruns 5 --reruns-delay 1 -m "not test_server"
+        pytest -n 4 --durations=20 --dist load -sv --reruns 5 --reruns-delay 1 -m "not test_server"
 
     - name: Upload coverage
       if: matrix.code-cov && always()
diff --git a/pyproject.toml b/pyproject.toml
index 91235ba04..8c463968b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -130,7 +130,6 @@ log_level="DEBUG"
 testpaths = ["tests"]
 minversion = "7.0"
 xfail_strict = true
-timeout = 600
 filterwarnings=[
     "ignore:the matrix subclass:PendingDeprecationWarning"
 ]
diff --git a/scripts/profile_tests.sh b/scripts/profile_tests.sh
index 593700cff..88e6f0ad7 100755
--- a/scripts/profile_tests.sh
+++ b/scripts/profile_tests.sh
@@ -1,27 +1,51 @@
 #!/bin/bash
 # Profile test durations to diagnose slow tests (Issue #1633)
-# Usage: ./scripts/profile_tests.sh [marker_filter]
+#
+# Usage: ./scripts/profile_tests.sh [options]
+#
+# Options:
+#   -m MARKER    Pytest marker filter (default: "not production_server and not test_server")
+#   -d DURATION  Number of slowest durations to show, 0 for all (default: 20)
+#   -t TIMEOUT   Per-test timeout in seconds (default: 300)
+#   -o OUTPUT    Output file path for the report (default: test_durations_report.txt)
 #
 # Examples:
-#   ./scripts/profile_tests.sh                               # non-server tests
-#   ./scripts/profile_tests.sh "production_server"            # production server tests only
-#   ./scripts/profile_tests.sh "sklearn"                      # sklearn tests only
+#   ./scripts/profile_tests.sh
+#   ./scripts/profile_tests.sh -m "production_server" -d 0 -t 600
+#   ./scripts/profile_tests.sh -m "sklearn" -o sklearn_report.txt
 
 set -euo pipefail
 
-MARKER_FILTER="${1:-not production_server and not test_server}"
+# Default values
+MARKER_FILTER="not production_server and not test_server"
+DURATIONS=20
+TIMEOUT=300
+OUTPUT_FILE="test_durations_report.txt"
+
+# Parse command line arguments
+while getopts "m:d:t:o:" opt; do
+  case $opt in
+    m) MARKER_FILTER="$OPTARG" ;;
+    d) DURATIONS="$OPTARG" ;;
+    t) TIMEOUT="$OPTARG" ;;
+    o) OUTPUT_FILE="$OPTARG" ;;
+    *) echo "Usage: $0 [-m marker] [-d durations] [-t timeout] [-o output_file]" && exit 1 ;;
+  esac
+done
 
 echo "=== OpenML Test Duration Profiler ==="
 echo "Marker filter: $MARKER_FILTER"
-echo "Timeout per test: 300s"
+echo "Durations to show: $DURATIONS"
+echo "Timeout per test: ${TIMEOUT}s"
+echo "Output file: $OUTPUT_FILE"
 echo ""
 
 pytest \
-  --durations=0 \
-  --timeout=300 \
+  --durations="$DURATIONS" \
+  --timeout="$TIMEOUT" \
   -q \
   -m "$MARKER_FILTER" \
-  2>&1 | tee test_durations_report.txt
+  2>&1 | tee "$OUTPUT_FILE"
 
 echo ""
-echo "=== Report saved to test_durations_report.txt ==="
+echo "=== Report saved to $OUTPUT_FILE ==="
diff --git a/tests/conftest.py b/tests/conftest.py
index bbb486b3d..03aaafe2d 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -276,7 +276,7 @@ def test_apikey_v2() -> str:
     return openml.config.get_test_servers()[APIVersion.V2]["apikey"]
 
 
-@pytest.fixture(autouse=True, scope="module")
+@pytest.fixture(autouse=True, scope="function")
 def verify_cache_state(test_files_directory) -> Iterator[None]:
     assert_static_test_cache_correct(test_files_directory)
     yield

From 37d605c9641d9b881d15ab1e2c18b3f8d5c20b2e Mon Sep 17 00:00:00 2001
From: Abhishek <abhishekup082gmail.com@Abhisheks-MacBook-Air.local>
Date: Sun, 1 Mar 2026 22:11:47 +0530
Subject: [PATCH 3/6] Update profile_tests.sh: add -n workers, --dist=load,
 remove -q

- Add -n flag for parallel workers (default: 4)
- Add --dist=load to distribute tests across workers
- Remove -q flag for full pytest output
- Mimics exact pytest command used in CI
---
 scripts/profile_tests.sh | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/scripts/profile_tests.sh b/scripts/profile_tests.sh
index 88e6f0ad7..05a8cd2fe 100755
--- a/scripts/profile_tests.sh
+++ b/scripts/profile_tests.sh
@@ -7,12 +7,13 @@
 #   -m MARKER    Pytest marker filter (default: "not production_server and not test_server")
 #   -d DURATION  Number of slowest durations to show, 0 for all (default: 20)
 #   -t TIMEOUT   Per-test timeout in seconds (default: 300)
+#   -n WORKERS   Number of parallel workers (default: 4)
 #   -o OUTPUT    Output file path for the report (default: test_durations_report.txt)
 #
 # Examples:
 #   ./scripts/profile_tests.sh
 #   ./scripts/profile_tests.sh -m "production_server" -d 0 -t 600
-#   ./scripts/profile_tests.sh -m "sklearn" -o sklearn_report.txt
+#   ./scripts/profile_tests.sh -m "sklearn" -n 2 -o sklearn_report.txt
 
 set -euo pipefail
 
@@ -20,16 +21,18 @@ set -euo pipefail
 MARKER_FILTER="not production_server and not test_server"
 DURATIONS=20
 TIMEOUT=300
+NUM_WORKERS=4
 OUTPUT_FILE="test_durations_report.txt"
 
 # Parse command line arguments
-while getopts "m:d:t:o:" opt; do
+while getopts "m:d:t:n:o:" opt; do
   case $opt in
     m) MARKER_FILTER="$OPTARG" ;;
     d) DURATIONS="$OPTARG" ;;
     t) TIMEOUT="$OPTARG" ;;
+    n) NUM_WORKERS="$OPTARG" ;;
     o) OUTPUT_FILE="$OPTARG" ;;
-    *) echo "Usage: $0 [-m marker] [-d durations] [-t timeout] [-o output_file]" && exit 1 ;;
+    *) echo "Usage: $0 [-m marker] [-d durations] [-t timeout] [-n workers] [-o output_file]" && exit 1 ;;
   esac
 done
 
@@ -37,13 +40,15 @@ echo "=== OpenML Test Duration Profiler ==="
 echo "Marker filter: $MARKER_FILTER"
 echo "Durations to show: $DURATIONS"
 echo "Timeout per test: ${TIMEOUT}s"
+echo "Workers: $NUM_WORKERS"
 echo "Output file: $OUTPUT_FILE"
 echo ""
 
 pytest \
+  --dist=load \
+  -n="$NUM_WORKERS" \
   --durations="$DURATIONS" \
   --timeout="$TIMEOUT" \
-  -q \
   -m "$MARKER_FILTER" \
   2>&1 | tee "$OUTPUT_FILE"
 

From 30e62827faee82e4b82abc93aba627dd71cd6f99 Mon Sep 17 00:00:00 2001
From: Abhishek <abhishekup082@gmail.com>
Date: Wed, 20 May 2026 20:14:51 +0530
Subject: [PATCH 4/6] Remove profile_tests.sh script

Per review feedback from @PGijsbers: drop the brittle wrapper script in favor of documenting example pytest invocations directly in CONTRIBUTING.md, which keeps things flexible (e.g. --setup-only, running specific files/classes).
---
 scripts/profile_tests.sh | 56 ----------------------------------------
 1 file changed, 56 deletions(-)
 delete mode 100755 scripts/profile_tests.sh

diff --git a/scripts/profile_tests.sh b/scripts/profile_tests.sh
deleted file mode 100755
index 05a8cd2fe..000000000
--- a/scripts/profile_tests.sh
+++ /dev/null
@@ -1,56 +0,0 @@
-#!/bin/bash
-# Profile test durations to diagnose slow tests (Issue #1633)
-#
-# Usage: ./scripts/profile_tests.sh [options]
-#
-# Options:
-#   -m MARKER    Pytest marker filter (default: "not production_server and not test_server")
-#   -d DURATION  Number of slowest durations to show, 0 for all (default: 20)
-#   -t TIMEOUT   Per-test timeout in seconds (default: 300)
-#   -n WORKERS   Number of parallel workers (default: 4)
-#   -o OUTPUT    Output file path for the report (default: test_durations_report.txt)
-#
-# Examples:
-#   ./scripts/profile_tests.sh
-#   ./scripts/profile_tests.sh -m "production_server" -d 0 -t 600
-#   ./scripts/profile_tests.sh -m "sklearn" -n 2 -o sklearn_report.txt
-
-set -euo pipefail
-
-# Default values
-MARKER_FILTER="not production_server and not test_server"
-DURATIONS=20
-TIMEOUT=300
-NUM_WORKERS=4
-OUTPUT_FILE="test_durations_report.txt"
-
-# Parse command line arguments
-while getopts "m:d:t:n:o:" opt; do
-  case $opt in
-    m) MARKER_FILTER="$OPTARG" ;;
-    d) DURATIONS="$OPTARG" ;;
-    t) TIMEOUT="$OPTARG" ;;
-    n) NUM_WORKERS="$OPTARG" ;;
-    o) OUTPUT_FILE="$OPTARG" ;;
-    *) echo "Usage: $0 [-m marker] [-d durations] [-t timeout] [-n workers] [-o output_file]" && exit 1 ;;
-  esac
-done
-
-echo "=== OpenML Test Duration Profiler ==="
-echo "Marker filter: $MARKER_FILTER"
-echo "Durations to show: $DURATIONS"
-echo "Timeout per test: ${TIMEOUT}s"
-echo "Workers: $NUM_WORKERS"
-echo "Output file: $OUTPUT_FILE"
-echo ""
-
-pytest \
-  --dist=load \
-  -n="$NUM_WORKERS" \
-  --durations="$DURATIONS" \
-  --timeout="$TIMEOUT" \
-  -m "$MARKER_FILTER" \
-  2>&1 | tee "$OUTPUT_FILE"
-
-echo ""
-echo "=== Report saved to $OUTPUT_FILE ==="

From 7412079db8e19bfff7ff1d2a260a6d88be25c49c Mon Sep 17 00:00:00 2001
From: Abhishek <abhishekup082@gmail.com>
Date: Wed, 20 May 2026 20:17:41 +0530
Subject: [PATCH 5/6] Refactor flow deletion order sorting in conftest.py: revert
 formatting-only changes in tests/conftest.py

Per @PGijsbers review: this PR shouldn't touch conftest.py since the changes were purely formatting. Restoring the file to match main.
---
 tests/conftest.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index 03aaafe2d..1359e6247 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -106,8 +106,7 @@ def delete_remote_files(tracker, flow_names) -> None:
     if "flow" in tracker:
         to_sort = list(zip(tracker["flow"], flow_names))
         flow_deletion_order = [
-            entity_id
-            for entity_id, _ in sorted(to_sort, key=lambda x: len(x[1]), reverse=True)
+            entity_id for entity_id, _ in sorted(to_sort, key=lambda x: len(x[1]), reverse=True)
         ]
         tracker["flow"] = [flow_deletion_order[1] for flow_id, _ in flow_deletion_order]
 
@@ -325,12 +324,11 @@ def with_test_cache(test_files_directory, request):
     openml.config.set_root_cache_directory(_root_cache_directory)
     if tmp_cache.exists():
         shutil.rmtree(tmp_cache)
-
+        
 
 @pytest.fixture
 def static_cache_dir():
-    return Path(__file__).parent / "files"
-
+    return Path(__file__).parent / "files" 
 
 @pytest.fixture
 def workdir(tmp_path):

From c5b905df4cb20b65163c8a94b20a10b4d1a299e8 Mon Sep 17 00:00:00 2001
From: Abhishek <abhishekup082@gmail.com>
Date: Wed, 20 May 2026 20:20:59 +0530
Subject: [PATCH 6/6] Add diagnostics for slow tests in CONTRIBUTING.md: document
 how to diagnose slow tests

Added section on diagnosing slow tests with pytest. Following @PGijsbers's suggestion, add a 'Diagnosing Slow Tests' subsection to the testing documentation with example pytest invocations (--durations, --timeout, --setup-only, marker filters, xdist) instead of shipping a dedicated wrapper script. This keeps the setup flexible: contributors can scope their investigation to specific files/classes or use any pytest argument they need.
---
 CONTRIBUTING.md | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index d194525ef..38b13825b 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -107,6 +107,32 @@ $env:OPENML_TEST_SERVER_ADMIN_KEY = "admin-key"
 export OPENML_TEST_SERVER_ADMIN_KEY="admin-key"
 ```
 
+#### Diagnosing Slow Tests
+
+If you suspect a test (or the suite as a whole) is running too slowly, `pytest` already exposes everything you need to investigate it. A few invocations that are useful when looking into test runtimes:
+
+```bash
+# Show the 20 slowest tests (use 0 to list every test's duration)
+pytest tests --durations=20
+
+# Fail any test that exceeds the given timeout (requires pytest-timeout)
+pytest tests --timeout=600
+
+# Time only fixture setup/teardown without running the test bodies (--durations reports the setup phases)
+pytest tests --setup-only --durations=0
+
+# Profile a specific module, class, or test
+pytest tests/test_datasets/test_dataset.py --durations=0
+
+# Skip the slow live-server tests while profiling locally
+pytest tests --durations=0 -m "not production_server and not test_server"
+
+# Run the suite in parallel to reproduce CI behaviour (requires pytest-xdist)
+pytest tests -n 4 --dist=load --durations=0
+```
+
+Combining these with the marker filters (`production_server`, `test_server`, `sklearn`) makes it straightforward to narrow the investigation down to the slow tests without changing project configuration.
+
 ### Pull Request Checklist
 
 You can go to the `openml-python` GitHub repository to create the pull request by [comparing the branch](https://github.com/openml/openml-python/compare) from your fork with the `main` branch of the `openml-python` repository. When creating a pull request, make sure to follow the comments and structured provided by the template on GitHub.
@@ -214,4 +240,4 @@ When dependencies are installed, run
 ```bash
 mkdocs serve
 ```
-This will open a preview of the website.
\ No newline at end of file
+This will open a preview of the website.