diff --git a/.gitignore b/.gitignore index 3cf8b28..f6241cb 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ *.egg-info *.pyc .DS_Store +.envrc /.idea/ /venv/ /.venv/ diff --git a/README.md b/README.md index 62dbf03..fdb3434 100644 --- a/README.md +++ b/README.md @@ -62,6 +62,7 @@ print(*items, sep="\n") * Cudo Compute * Verda * GCP +* JarvisLabs * LambdaLabs * Nebius * OCI diff --git a/src/gpuhunt/__main__.py b/src/gpuhunt/__main__.py index 526b505..3556e76 100644 --- a/src/gpuhunt/__main__.py +++ b/src/gpuhunt/__main__.py @@ -20,6 +20,7 @@ def main(): "digitalocean", "gcp", "hotaisle", + "jarvislabs", "lambdalabs", "nebius", "oci", @@ -77,6 +78,12 @@ def main(): provider = HotAisleProvider( api_key=os.getenv("HOTAISLE_API_KEY"), team_handle=os.getenv("HOTAISLE_TEAM_HANDLE") ) + elif args.provider == "jarvislabs": + from gpuhunt.providers.jarvislabs import JarvisLabsProvider + + provider = JarvisLabsProvider( + api_key=os.getenv("JL_API_KEY"), api_url=os.getenv("JARVISLABS_API_URL") + ) elif args.provider == "lambdalabs": from gpuhunt.providers.lambdalabs import LambdaLabsProvider diff --git a/src/gpuhunt/_internal/catalog.py b/src/gpuhunt/_internal/catalog.py index 8c7d682..b2230cb 100644 --- a/src/gpuhunt/_internal/catalog.py +++ b/src/gpuhunt/_internal/catalog.py @@ -32,7 +32,15 @@ "runpod", "cloudrift", ] -ONLINE_PROVIDERS = ["crusoe", "cudo", "digitalocean", "hotaisle", "vastai", "vultr"] +ONLINE_PROVIDERS = [ + "crusoe", + "cudo", + "digitalocean", + "hotaisle", + "jarvislabs", + "vastai", + "vultr", +] RELOAD_INTERVAL = 15 * 60 # 15 minutes diff --git a/src/gpuhunt/_internal/default.py b/src/gpuhunt/_internal/default.py index a102b78..8231993 100644 --- a/src/gpuhunt/_internal/default.py +++ b/src/gpuhunt/_internal/default.py @@ -25,6 +25,7 @@ def default_catalog() -> Catalog: ("gpuhunt.providers.crusoe", "CrusoeProvider"), ("gpuhunt.providers.vultr", "VultrProvider"), ("gpuhunt.providers.hotaisle", "HotAisleProvider"), + 
("gpuhunt.providers.jarvislabs", "JarvisLabsProvider"), ("gpuhunt.providers.digitalocean", "DigitalOceanProvider"), ]: try: diff --git a/src/gpuhunt/providers/jarvislabs.py b/src/gpuhunt/providers/jarvislabs.py new file mode 100644 index 0000000..b4bf09c --- /dev/null +++ b/src/gpuhunt/providers/jarvislabs.py @@ -0,0 +1,265 @@ +import logging +import os + +import requests +from requests import Response + +from gpuhunt._internal.models import AcceleratorVendor, QueryFilter, RawCatalogItem +from gpuhunt.providers import AbstractProvider + +logger = logging.getLogger(__name__) + +API_URL = "https://backendprod.jarvislabs.net" +SERVER_META_PATH = "/misc/server_meta" +TIMEOUT = 30 +# JarvisLabs exposes offer regions in server_meta, but VM provisioning calls must be sent +# to region-specific API hosts and server_meta does not include those hosts. Keep this +# allowlist in sync with the known provisioning hosts and do not advertise offers for +# unknown regions, otherwise dstack may select capacity it cannot create. +JARVISLABS_REGION_URLS = { + "india-01": "https://backendprod.jarvislabs.net", + "india-noida-01": "https://backendn.jarvislabs.net", + "europe-01": "https://backendeu.jarvislabs.net", +} +# dstack provisions JarvisLabs GPU VMs by passing a GPU type back to the API. +# Keep ambiguous API names with spaces out of the catalog; otherwise the +# normalized gpuhunt name cannot be converted back safely without provider_data. 
# Maps a JarvisLabs `gpu_type` to the (catalog GPU name, fallback VRAM) pair. The fallback
# VRAM is only used when the API row carries no usable `vram` value.
JARVISLABS_GPU_NAME_OVERRIDES = {
    "A100-80GB": ("A100", 80.0),
}


class JarvisLabsProvider(AbstractProvider):
    """Online provider that builds catalog items from the JarvisLabs `server_meta` endpoint.

    Args:
        api_key: JarvisLabs API token. Falls back to the `JL_API_KEY` environment variable.
        api_url: Base URL of the JarvisLabs API. Falls back to the `JARVISLABS_API_URL`
            environment variable and then to the module-level `API_URL`.

    Raises:
        ValueError: If no API key is passed and `JL_API_KEY` is unset or empty.
    """

    NAME = "jarvislabs"

    def __init__(self, api_key: str | None = None, api_url: str | None = None):
        self.api_key = api_key or os.getenv("JL_API_KEY")
        if not self.api_key:
            raise ValueError("Set the JL_API_KEY environment variable.")

        # `os.getenv(name, default)` returns the default only when the variable is unset.
        # An exported-but-empty JARVISLABS_API_URL would otherwise produce "" here and a
        # schemeless request URL later, so chain with `or` to treat empty like missing.
        base_url = api_url or os.getenv("JARVISLABS_API_URL") or API_URL
        self.api_url = base_url.rstrip("/")

    def get(
        self, query_filter: QueryFilter | None = None, balance_resources: bool = True
    ) -> list[RawCatalogItem]:
        """Return all offers sorted by ascending price.

        `balance_resources` is accepted for signature compatibility and is not used here.
        """
        offers = self.fetch_offers(query_filter=query_filter)
        return sorted(offers, key=lambda item: item.price)

    def fetch_offers(self, query_filter: QueryFilter | None = None) -> list[RawCatalogItem]:
        """Fetch `server_meta` and convert it to catalog items.

        `query_filter` is not applied at this level; every convertible offer is returned.
        """
        response = self._make_request("GET", SERVER_META_PATH)
        return convert_response_to_raw_catalog_items(response.json())

    def _make_request(self, method: str, path: str) -> Response:
        """Send an authenticated request to `path` and raise on HTTP error statuses."""
        response = requests.request(
            method=method,
            url=f"{self.api_url}{path}",
            headers={"Authorization": f"Bearer {self.api_key}"},
            timeout=TIMEOUT,
        )
        response.raise_for_status()
        return response


def convert_response_to_raw_catalog_items(data: dict) -> list[RawCatalogItem]:
    """Convert a decoded `server_meta` payload into GPU and CPU catalog items.

    GPU rows are read from `data["server_meta"]` and CPU combinations from
    `data["cpu_meta"]`. A payload that is not a JSON object yields no offers (with a
    warning) instead of raising, in line with how malformed rows are handled.
    """
    if not isinstance(data, dict):
        logger.warning(
            "Ignoring JarvisLabs server_meta payload of unexpected type %s",
            type(data).__name__,
        )
        return []

    offers: list[RawCatalogItem] = []
    for gpu in data.get("server_meta") or []:
        offers.extend(_make_gpu_catalog_items(gpu))
    offers.extend(_make_cpu_catalog_items(data.get("cpu_meta") or {}))
    return offers


def _make_gpu_catalog_items(gpu: dict) -> list[RawCatalogItem]:
    """Build on-demand (and, when priced, spot) offers for one `server_meta` GPU row.

    Returns an empty list when the row is not a VM row, lies in a region without a known
    provisioning host, or lacks any spec needed to describe the offer.
    """
    if not isinstance(gpu, dict):
        logger.warning("Skipping malformed JarvisLabs GPU row: %r", gpu)
        return []

    region = gpu.get("region")
    if not region:
        return []
    workload_type = gpu.get("workload_type")
    # JarvisLabs returns `None` for older VM-capable rows, e.g. EU H100/H200.
    # Confirmed by provisioning an H100 VM from a `None` row.
    if workload_type not in ("vm", None):
        return []
    if region not in JARVISLABS_REGION_URLS:
        logger.warning(
            "Skipping JarvisLabs GPU VM offer in unsupported region %s; "
            "JarvisLabs does not expose provisioning endpoint discovery",
            region,
        )
        return []

    gpu_type = gpu.get("gpu_type")
    if not gpu_type:
        logger.warning("Skipping JarvisLabs GPU offer without gpu_type: %s", gpu)
        return []

    price = _as_float(gpu.get("price_per_hour"))
    if price is None:
        logger.warning("Skipping JarvisLabs GPU offer without price: %s", gpu_type)
        return []

    gpu_spec = _gpu_name_and_memory(gpu_type, gpu.get("vram"))
    if gpu_spec is None:
        logger.warning("Skipping JarvisLabs GPU offer with ambiguous gpu_type: %s", gpu_type)
        return []
    gpu_name, gpu_memory = gpu_spec
    if gpu_memory is None:
        logger.warning("Skipping JarvisLabs GPU offer with unknown VRAM: %s", gpu_type)
        return []

    cpu_per_gpu = _as_int(gpu.get("cpus_per_gpu"))
    ram_per_gpu = _as_float(gpu.get("ram_per_gpu"))
    if cpu_per_gpu is None or ram_per_gpu is None:
        logger.warning("Skipping JarvisLabs GPU offer without CPU/RAM: %s", gpu_type)
        return []

    # The per-instance GPU cap is the same for on-demand and spot; compute it once.
    max_gpus = _max_gpus_per_instance(gpu)

    items = _make_gpu_catalog_items_for_price(
        region=region,
        gpu_name=gpu_name,
        gpu_memory=gpu_memory,
        price=price,
        cpu_per_gpu=cpu_per_gpu,
        ram_per_gpu=ram_per_gpu,
        available_devices=_available_devices(gpu),
        max_gpus_per_instance=max_gpus,
        spot=False,
    )

    # Spot offers exist only when the row carries a parseable spot price.
    spot_price = _as_float(gpu.get("spot_price"))
    if spot_price is not None:
        items.extend(
            _make_gpu_catalog_items_for_price(
                region=region,
                gpu_name=gpu_name,
                gpu_memory=gpu_memory,
                price=spot_price,
                cpu_per_gpu=cpu_per_gpu,
                ram_per_gpu=ram_per_gpu,
                available_devices=_spot_available_devices(gpu),
                max_gpus_per_instance=max_gpus,
                spot=True,
            )
        )
    return items


def _make_gpu_catalog_items_for_price(
    *,
    region: str,
    gpu_name: str,
    gpu_memory: float,
    price: float,
    cpu_per_gpu: int,
    ram_per_gpu: float,
    available_devices: int,
    max_gpus_per_instance: int,
    spot: bool,
) -> list[RawCatalogItem]:
    """Expand one per-GPU price into one offer per supported GPU count.

    Price, CPU count and RAM are scaled linearly with the GPU count; the total price is
    rounded to 5 decimal places.
    """
    gpu_counts = _supported_gpu_counts(
        available_devices=available_devices,
        max_gpus_per_instance=max_gpus_per_instance,
    )
    return [
        RawCatalogItem(
            instance_name=_gpu_instance_name(gpu_name, gpu_count),
            location=region,
            price=round(price * gpu_count, 5),
            cpu=cpu_per_gpu * gpu_count,
            memory=ram_per_gpu * gpu_count,
            gpu_vendor=AcceleratorVendor.NVIDIA.value,
            gpu_count=gpu_count,
            gpu_name=gpu_name,
            gpu_memory=gpu_memory,
            spot=spot,
            disk_size=None,
        )
        for gpu_count in gpu_counts
    ]


def _make_cpu_catalog_items(cpu_meta: dict) -> list[RawCatalogItem]:
    """Build CPU-only offers from `cpu_meta["combinations"]`.

    One offer is produced per available combination and per region that is both flagged
    available and present in `JARVISLABS_REGION_URLS`.
    """
    if not isinstance(cpu_meta, dict):
        logger.warning("Skipping malformed JarvisLabs cpu_meta: %r", cpu_meta)
        return []

    offers = []
    # The JarvisLabs SDK resolves CPU VMs from cpu_meta.combinations and creates them via
    # templates/vm/cpu/create; cpu_meta.workload_type is not the GPU workload selector.
    for combo in cpu_meta.get("combinations") or []:
        if not isinstance(combo, dict):
            logger.warning("Skipping malformed JarvisLabs CPU combination: %r", combo)
            continue
        if not combo.get("available"):
            continue
        vcpus = _as_int(combo.get("vcpus"))
        ram_gb = _as_float(combo.get("ram_gb"))
        price = _as_float(combo.get("price"))
        if vcpus is None or ram_gb is None or price is None:
            logger.warning("Skipping JarvisLabs CPU offer with incomplete specs: %s", combo)
            continue
        for region, available in (combo.get("regions") or {}).items():
            if not available:
                continue
            if region not in JARVISLABS_REGION_URLS:
                logger.warning(
                    "Skipping JarvisLabs CPU VM offer in unsupported region %s; "
                    "JarvisLabs does not expose provisioning endpoint discovery",
                    region,
                )
                continue
            offers.append(
                RawCatalogItem(
                    # NOTE(review): int() truncates fractional RAM in the name only; the
                    # `memory` field keeps the exact value.
                    instance_name=f"cpu-{vcpus}x{int(ram_gb)}",
                    location=region,
                    price=price,
                    cpu=vcpus,
                    memory=ram_gb,
                    gpu_vendor=None,
                    gpu_count=0,
                    gpu_name=None,
                    gpu_memory=None,
                    spot=False,
                    disk_size=None,
                )
            )
    return offers
max_gpus_per_instance <= 0: + return [] + return list(range(1, min(available_devices, max_gpus_per_instance) + 1)) + + +def _available_devices(gpu: dict) -> int: + return ( + _as_int(gpu.get("effective_num_free_devices")) or _as_int(gpu.get("num_free_devices")) or 0 + ) + + +def _spot_available_devices(gpu: dict) -> int: + return _as_int(gpu.get("spot_num_free_devices")) or 0 + + +def _max_gpus_per_instance(gpu: dict) -> int: + return _as_int(gpu.get("num_gpus")) or 1 + + +def _gpu_name_and_memory(gpu_type: str, vram: object) -> tuple[str, float | None] | None: + if any(c.isspace() for c in gpu_type): + return None + gpu_name, default_memory = JARVISLABS_GPU_NAME_OVERRIDES.get(gpu_type, (gpu_type, None)) + return gpu_name, _as_float(vram) or default_memory + + +def _gpu_instance_name(gpu_name: str, gpu_count: int) -> str: + return f"{gpu_name}-{gpu_count}x" + + +def _as_int(value: object) -> int | None: + if value is None or value == "": + return None + try: + return int(value) + except (TypeError, ValueError): + return None + + +def _as_float(value: object) -> float | None: + if value is None or value == "": + return None + try: + return float(value) + except (TypeError, ValueError): + return None diff --git a/src/tests/providers/test_jarvislabs.py b/src/tests/providers/test_jarvislabs.py new file mode 100644 index 0000000..d39e3ef --- /dev/null +++ b/src/tests/providers/test_jarvislabs.py @@ -0,0 +1,248 @@ +import gpuhunt._internal.catalog as internal_catalog +from gpuhunt import Catalog +from gpuhunt._internal.models import QueryFilter +from gpuhunt.providers.jarvislabs import ( + JarvisLabsProvider, + convert_response_to_raw_catalog_items, +) + +SERVER_META_RESPONSE = { + "server_meta": [ + { + "gpu_type": "L4", + "region": "india-noida-01", + "num_free_devices": 3, + "effective_num_free_devices": 3, + "spot_num_free_devices": 2, + "price_per_hour": 0.44, + "spot_price": 0.29, + "vram": "24", + "cpus_per_gpu": 28, + "ram_per_gpu": 124, + "workload_type": 
import gpuhunt._internal.catalog as internal_catalog
from gpuhunt import Catalog
from gpuhunt._internal.models import QueryFilter
from gpuhunt.providers.jarvislabs import (
    JarvisLabsProvider,
    convert_response_to_raw_catalog_items,
)

# Shared `server_meta` payload used by the tests below. The rows that are expected to
# convert yield 9 offers in total: L4 vm (3 on-demand + 2 spot), A100-80GB (1 on-demand +
# 1 spot), H100 in europe-01 (1 on-demand) and one CPU combination.
SERVER_META_RESPONSE = {
    "server_meta": [
        # Container-only row: expected to produce no offers.
        {
            "gpu_type": "L4",
            "region": "india-noida-01",
            "num_free_devices": 3,
            "effective_num_free_devices": 3,
            "spot_num_free_devices": 2,
            "price_per_hour": 0.44,
            "spot_price": 0.29,
            "vram": "24",
            "cpus_per_gpu": 28,
            "ram_per_gpu": 124,
            "workload_type": "container",
            "num_gpus": "8",
        },
        # VM row with both on-demand and spot pricing.
        {
            "gpu_type": "L4",
            "region": "india-noida-01",
            "num_free_devices": 3,
            "effective_num_free_devices": 3,
            "spot_num_free_devices": 2,
            "price_per_hour": 0.44,
            "spot_price": 0.29,
            "vram": "24",
            "cpus_per_gpu": 28,
            "ram_per_gpu": 124,
            "workload_type": "vm",
            "num_gpus": "8",
        },
        # gpu_type covered by the name override (catalog name "A100").
        {
            "gpu_type": "A100-80GB",
            "region": "india-noida-01",
            "num_free_devices": 1,
            "effective_num_free_devices": 1,
            "spot_num_free_devices": 1,
            "price_per_hour": 1.49,
            "spot_price": 0.89,
            "vram": "80",
            "cpus_per_gpu": 28,
            "ram_per_gpu": 112,
            "workload_type": "vm",
            "num_gpus": "4",
        },
        # `workload_type` None row without spot price and without `num_gpus`.
        {
            "gpu_type": "H100",
            "region": "europe-01",
            "num_free_devices": 25,
            "effective_num_free_devices": 25,
            "spot_num_free_devices": 25,
            "price_per_hour": 2.99,
            "spot_price": None,
            "vram": "80",
            "cpus_per_gpu": 16,
            "ram_per_gpu": 200,
            "workload_type": None,
        },
        # Region that is not in the provider's region allowlist.
        {
            "gpu_type": "H100",
            "region": "unknown-region",
            "num_free_devices": 1,
            "effective_num_free_devices": 1,
            "spot_num_free_devices": 1,
            "price_per_hour": 2.99,
            "spot_price": None,
            "vram": "80",
            "cpus_per_gpu": 16,
            "ram_per_gpu": 200,
            "workload_type": "vm",
        },
    ],
    "cpu_meta": {
        "workload_type": "container",
        "combinations": [
            {
                "vcpus": 4,
                "ram_gb": 16,
                "price": 0.0992,
                "available": True,
                "regions": {
                    "india-noida-01": True,
                    "europe-01": False,
                    "unknown-region": True,
                },
            }
        ],
    },
}


def test_convert_response_to_raw_catalog_items():
    """Check the offers produced from the shared fixture, row by row."""
    offers = convert_response_to_raw_catalog_items(SERVER_META_RESPONSE)

    assert all(o.provider_data == {} for o in offers)

    # On-demand L4 offers: one per GPU count up to the 3 free devices.
    l4_vm = [o for o in offers if o.gpu_name == "L4" and not o.spot]
    assert [o.gpu_count for o in l4_vm] == [1, 2, 3]
    assert [o.price for o in l4_vm] == [0.44, 0.88, 1.32]
    assert [o.instance_name for o in l4_vm] == ["L4-1x", "L4-2x", "L4-3x"]

    # Spot L4 offers are bounded by the 2 free spot devices.
    l4_spot = [o for o in offers if o.gpu_name == "L4" and o.spot]
    assert [o.gpu_count for o in l4_spot] == [1, 2]
    assert [o.price for o in l4_spot] == [0.29, 0.58]
    assert [o.instance_name for o in l4_spot] == ["L4-1x", "L4-2x"]

    a100 = next(o for o in offers if o.instance_name == "A100-1x" and not o.spot)
    assert a100.gpu_name == "A100"
    assert a100.gpu_memory == 80
    assert a100.location == "india-noida-01"
    assert a100.disk_size is None
    assert a100.provider_data == {}

    a100_spot = next(o for o in offers if o.instance_name == "A100-1x" and o.spot)
    assert a100_spot.price == 0.89

    h100 = next(o for o in offers if o.gpu_name == "H100")
    assert h100.gpu_count == 1
    assert h100.location == "europe-01"
    assert h100.provider_data == {}
    assert h100.disk_size is None

    cpu = next(o for o in offers if o.gpu_count == 0)
    assert cpu.instance_name == "cpu-4x16"
    assert cpu.location == "india-noida-01"
    assert cpu.cpu == 4
    assert cpu.memory == 16
    assert cpu.provider_data == {}
    assert cpu.disk_size is None

    assert not any(o.location == "unknown-region" for o in offers)


def test_convert_response_warns_and_skips_unsupported_regions(caplog):
    """Rows in an unknown region must be reported in the log for both GPU and CPU offers."""
    convert_response_to_raw_catalog_items(SERVER_META_RESPONSE)

    assert "Skipping JarvisLabs GPU VM offer in unsupported region unknown-region" in caplog.text
    assert "Skipping JarvisLabs CPU VM offer in unsupported region unknown-region" in caplog.text


def test_convert_response_skips_ambiguous_gpu_types_with_spaces(caplog):
    """A gpu_type containing whitespace yields no offers and a warning."""
    response = {
        "server_meta": [
            {
                "gpu_type": "H100 NVL",
                "region": "india-noida-01",
                "num_free_devices": 1,
                "price_per_hour": 2.99,
                "vram": "94",
                "cpus_per_gpu": 16,
                "ram_per_gpu": 200,
                "workload_type": "vm",
            },
        ],
    }

    assert convert_response_to_raw_catalog_items(response) == []
    assert "Skipping JarvisLabs GPU offer with ambiguous gpu_type: H100 NVL" in caplog.text


def test_convert_response_skips_malformed_specs(caplog):
    """Unparseable price, VRAM or vCPU values cause the row to be skipped with a warning."""
    response = {
        "server_meta": [
            # Unparseable price.
            {
                "gpu_type": "L4",
                "region": "india-noida-01",
                "num_free_devices": "bad",
                "price_per_hour": "bad",
                "vram": "24",
                "cpus_per_gpu": 28,
                "ram_per_gpu": 124,
                "workload_type": "vm",
            },
            # Unparseable VRAM for a gpu_type that has no override entry.
            {
                "gpu_type": "H100",
                "region": "india-noida-01",
                "num_free_devices": 1,
                "price_per_hour": 2.69,
                "vram": "bad",
                "cpus_per_gpu": 16,
                "ram_per_gpu": 200,
                "workload_type": "vm",
            },
        ],
        "cpu_meta": {
            "combinations": [
                {
                    "vcpus": "bad",
                    "ram_gb": 16,
                    "price": 0.0992,
                    "available": True,
                    "regions": {"india-noida-01": True},
                }
            ]
        },
    }

    offers = convert_response_to_raw_catalog_items(response)

    assert offers == []
    assert "Skipping JarvisLabs GPU offer without price: L4" in caplog.text
    assert "Skipping JarvisLabs GPU offer with unknown VRAM: H100" in caplog.text
    assert "Skipping JarvisLabs CPU offer with incomplete specs" in caplog.text


def test_fetch_offers(requests_mock):
    """`fetch_offers` sends the bearer token and returns all 9 offers for any disk filter."""
    requests_mock.get("https://api.jarvislabs.test/misc/server_meta", json=SERVER_META_RESPONSE)

    provider = JarvisLabsProvider(api_key="test-token", api_url="https://api.jarvislabs.test")
    offers = provider.fetch_offers()

    assert requests_mock.last_request.headers["Authorization"] == "Bearer test-token"
    assert len(offers) == 9
    assert all(o.disk_size is None for o in offers)

    # The query filter does not change what the provider returns.
    offers = provider.fetch_offers(query_filter=QueryFilter(min_disk_size=250))
    assert len(offers) == 9
    assert all(o.disk_size is None for o in offers)

    offers = provider.fetch_offers(query_filter=QueryFilter(min_disk_size=50))
    assert len(offers) == 9
    assert all(o.disk_size is None for o in offers)


def test_catalog_query(requests_mock, monkeypatch):
    """Query the provider through `Catalog` with JarvisLabs as the only provider."""
    requests_mock.get("https://api.jarvislabs.test/misc/server_meta", json=SERVER_META_RESPONSE)
    # Restrict the catalog's provider lists so only the provider added below is queried.
    monkeypatch.setattr(internal_catalog, "ONLINE_PROVIDERS", ["jarvislabs"])
    monkeypatch.setattr(internal_catalog, "OFFLINE_PROVIDERS", [])

    catalog = Catalog(balance_resources=False, auto_reload=False)
    catalog.add_provider(
        JarvisLabsProvider(api_key="test-token", api_url="https://api.jarvislabs.test")
    )

    assert len(catalog.query(provider=["jarvislabs"], min_gpu_count=2, gpu_name="L4")) == 3
    assert len(catalog.query(provider=["jarvislabs"], gpu_name="A100", min_gpu_memory=80)) == 2
    assert len(catalog.query(provider=["jarvislabs"], max_gpu_count=0)) == 1
    assert len(catalog.query(provider=["jarvislabs"], min_disk_size=250)) == 9
    assert len(catalog.query(provider=["jarvislabs"], max_disk_size=50)) == 9
    assert len(catalog.query(provider=["jarvislabs"], gpu_name="L4", spot=False)) == 3
    assert len(catalog.query(provider=["jarvislabs"], gpu_name="L4", spot=True)) == 2