Skip to content

Commit 3e7c415

Browse files
committed
Download methods
1 parent 3ad7268 commit 3e7c415

3 files changed

Lines changed: 226 additions & 46 deletions

File tree

openml/_api/http/client.py

Lines changed: 151 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,18 +1,29 @@
11
from __future__ import annotations
22

3+
import contextlib
4+
import shutil
5+
import urllib
6+
import urllib.parse
7+
import zipfile
8+
from collections.abc import Callable
39
from pathlib import Path
410
from typing import TYPE_CHECKING, Any
511
from urllib.parse import urlencode, urljoin, urlparse
612

13+
import minio
714
import requests
815
from requests import Response
16+
from urllib3 import ProxyManager
917

1018
from openml.__version__ import __version__
1119
from openml._api.config import settings
1220

1321
if TYPE_CHECKING:
1422
from openml._api.config import APIConfig
1523

24+
import openml.config
25+
from openml.utils import ProgressBar
26+
1627

1728
class CacheMixin:
1829
@property
@@ -149,3 +160,143 @@ def delete(
149160
use_api_key=True,
150161
**request_kwargs,
151162
)
163+
164+
def download(
    self,
    url: str,
    handler: Callable[[Response, Path, str], Path] | None = None,
    encoding: str = "utf-8",
) -> Path:
    """Fetch ``url`` and hand the response to a handler that stores it.

    Parameters
    ----------
    url : str
        Address requested through ``self.get``; also used to derive the
        cache directory via ``self._get_cache_dir``.
    handler : Callable[[Response, Path, str], Path] | None, optional
        Invoked as ``handler(response, dir_path, encoding)`` and expected to
        return the path it wrote. When None, ``self._text_handler`` is used.
    encoding : str, optional (default="utf-8")
        Encoding forwarded unchanged to the handler.

    Returns
    -------
    Path
        The path reported by the handler.
    """
    response = self.get(url)
    dir_path = self._get_cache_dir(url, {}).expanduser()
    if handler is None:
        # ``_text_handler`` accepts (response, path, encoding) only; the
        # previous fallback also passed ``url`` and failed with TypeError.
        handler = self._text_handler
    return handler(response, dir_path, encoding)
177+
178+
def _text_handler(self, response: Response, path: Path, encoding: str) -> Path:
    """Write ``response.text`` to disk and return the file that was written.

    When ``path`` is an existing directory the text goes into a
    ``response.txt`` file inside it; otherwise ``path`` itself is used as the
    target file. Missing parent directories of the target are created.
    """
    target = path / "response.txt" if path.is_dir() else path
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(response.text, encoding=encoding)
    return target
185+
186+
187+
class MinIOClient(CacheMixin):
    """Download single objects, or every object under a prefix, from MinIO."""

    def __init__(self) -> None:
        # Sent as ``request_headers`` with every object download.
        self.headers: dict[str, str] = {"user-agent": f"openml-python/{__version__}"}

    def download_minio_file(
        self,
        source: str,
        destination: str | Path | None = None,
        exists_ok: bool = True,  # noqa: FBT002
        proxy: str | None = "auto",
    ) -> str:
        """Download file ``source`` from a MinIO Bucket and store it at ``destination``.

        Parameters
        ----------
        source : str
            URL to a file in a MinIO bucket.
        destination : str | Path | None
            Path to store the file to, if a directory is provided the original filename is
            used. When None, the cache directory derived from ``source`` is used.
        exists_ok : bool, optional (default=True)
            If False, raise FileExistsError if a file already exists in ``destination``.
        proxy: str, optional (default = "auto")
            The proxy server to use. By default it's "auto" which uses ``requests`` to
            automatically find the proxy to use. Pass None or the environment variable
            ``no_proxy="*"`` to disable proxies.

        Returns
        -------
        str
            The path the object was written to.

        Raises
        ------
        FileExistsError
            If ``destination`` is an existing file and ``exists_ok`` is False.
        FileNotFoundError
            If the object does not exist, or for any other S3 error (e.g. a missing
            or private bucket).
        """
        # Expand ``~`` first so the directory/file checks below look at the real location.
        destination = (
            self._get_cache_dir(source, {}) if destination is None else Path(destination)
        ).expanduser()
        parsed_url = urllib.parse.urlparse(source)

        # expect path format: /BUCKET/path/to/file.ext
        bucket, object_name = parsed_url.path[1:].split("/", maxsplit=1)
        if destination.is_dir():
            destination = Path(destination, object_name)
        if destination.is_file() and not exists_ok:
            raise FileExistsError(f"File already exists in {destination}.")

        destination.parent.mkdir(parents=True, exist_ok=True)

        if proxy == "auto":
            resolved_proxies = requests.utils.get_environ_proxies(parsed_url.geturl())
            proxy = requests.utils.select_proxy(parsed_url.geturl(), resolved_proxies)  # type: ignore

        proxy_client = ProxyManager(proxy) if proxy else None

        client = minio.Minio(endpoint=parsed_url.netloc, secure=False, http_client=proxy_client)
        try:
            client.fget_object(
                bucket_name=bucket,
                object_name=object_name,
                file_path=str(destination),
                progress=ProgressBar() if openml.config.show_progress else None,
                request_headers=self.headers,
            )
            # Archives are unpacked next to the downloaded file; the archive itself is kept.
            if destination.is_file() and destination.suffix == ".zip":
                with zipfile.ZipFile(destination, "r") as zip_ref:
                    zip_ref.extractall(destination.parent)

        except minio.error.S3Error as e:
            if e.message is not None and e.message.startswith("Object does not exist"):
                raise FileNotFoundError(f"Object at '{source}' does not exist.") from e
            # e.g. permission error, or a bucket does not exist (which is also interpreted as a
            # permission error on minio level).
            raise FileNotFoundError("Bucket does not exist or is private.") from e

        return str(destination)

    def download_minio_bucket(self, source: str, destination: str | Path | None = None) -> None:
        """Download every object listed under the prefix of ``source`` into ``destination``.

        Does not redownload files which already exist: after each object is handled an
        empty marker file named after the object's etag is created in ``destination``,
        and objects whose marker is present are skipped.

        Parameters
        ----------
        source : str
            URL to a MinIO bucket, in the form ``/BUCKET/path/to/file.ext``; the last path
            segment is dropped and the remainder is used as the listing prefix.
        destination : str | Path | None
            Path to a directory to store the bucket content in. When None, the cache
            directory derived from ``source`` is used.

        Raises
        ------
        ValueError
            If a listed object has no name or no etag.
        """
        # Expand ``~`` here as well so markers and downloaded files share one directory.
        destination = (
            self._get_cache_dir(source, {}) if destination is None else Path(destination)
        ).expanduser()
        parsed_url = urllib.parse.urlparse(source)

        # expect path format: /BUCKET/path/to/file.ext
        _, bucket, *prefixes, _file = parsed_url.path.split("/")
        prefix = "/".join(prefixes)
        base_url = source.rsplit("/", 1)[0]

        client = minio.Minio(endpoint=parsed_url.netloc, secure=False)

        for file_object in client.list_objects(bucket, prefix=prefix, recursive=True):
            if file_object.object_name is None:
                raise ValueError(f"Object name is None for object {file_object!r}")
            if file_object.etag is None:
                raise ValueError(f"Object etag is None for object {file_object!r}")

            marker = destination / file_object.etag
            if marker.exists():
                continue

            # ``[-1]`` also covers object names without any "/" (empty prefix), where
            # indexing ``[1]`` raised IndexError.
            file_name = file_object.object_name.rsplit("/", 1)[-1]
            file_destination = destination / file_name
            extracted = file_destination.parent / file_destination.stem
            if extracted.exists():
                # Marker is missing but archive exists means the server archive changed
                # force a refresh
                shutil.rmtree(extracted)

            with contextlib.suppress(FileExistsError):
                self.download_minio_file(
                    source=f"{base_url}/{file_name}",
                    destination=file_destination,
                    exists_ok=False,
                )

            # The archive was already extracted by ``download_minio_file``; drop the zip.
            if file_destination.is_file() and file_destination.suffix == ".zip":
                file_destination.unlink()
            marker.touch()

openml/_api/resources/base.py

Lines changed: 11 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -2,8 +2,7 @@
22

33
from abc import ABC, abstractmethod
44
from pathlib import Path
5-
from typing import TYPE_CHECKING, Any
6-
from typing_extensions import Literal
5+
from typing import TYPE_CHECKING, Any, Literal
76

87
if TYPE_CHECKING:
98
import pandas as pd
@@ -87,7 +86,16 @@ def parse_features_file(
8786
@abstractmethod
8887
def parse_qualities_file(
8988
self, qualities_file: Path, qualities_pickle_file: Path
90-
) -> dict[str, float] | None: ...
89+
) -> dict[str, float]: ...
90+
91+
@abstractmethod
def download_file(self, url_ext: str, encoding: str = "utf-8") -> Path:
    """Abstract hook returning a ``Path`` for the file addressed by ``url_ext``.

    ``encoding`` defaults to ``"utf-8"``.
    NOTE(review): how the file is fetched and stored is defined by concrete
    subclasses, which are not visible here.
    """
93+
94+
@abstractmethod
def download_features_file(self, dataset_id: int) -> Path:
    """Abstract hook returning a ``Path`` for the features file of ``dataset_id``.

    NOTE(review): download location and format are decided by concrete
    subclasses, which are not visible here.
    """
96+
97+
@abstractmethod
def download_qualities_file(self, dataset_id: int) -> Path:
    """Abstract hook returning a ``Path`` for the qualities file of ``dataset_id``.

    NOTE(review): download location and format are decided by concrete
    subclasses, which are not visible here.
    """
9199

92100

93101
class TasksAPI(ResourceAPI, ABC):

0 commit comments

Comments
 (0)