Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/gpuhunt/_internal/constraints.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,7 @@ def is_nvidia_superchip(gpu_name: str) -> bool:
NvidiaGPUInfo(name="RTX2000Ada", memory=16, compute_capability=(8, 9)),
NvidiaGPUInfo(name="RTX4000Ada", memory=20, compute_capability=(8, 9)),
NvidiaGPUInfo(name="RTX6000Ada", memory=48, compute_capability=(8, 9)),
NvidiaGPUInfo(name="RTXPRO6000", memory=96, compute_capability=(12, 0)),
NvidiaGPUInfo(name="T4", memory=16, compute_capability=(7, 5)),
NvidiaGPUInfo(name="V100", memory=16, compute_capability=(7, 0)),
NvidiaGPUInfo(name="V100", memory=32, compute_capability=(7, 0)),
Expand Down
58 changes: 31 additions & 27 deletions src/gpuhunt/providers/jarvislabs.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,17 @@
import logging
import os
from typing import cast

import requests
from requests import Response
from typing_extensions import NotRequired, TypedDict

from gpuhunt._internal.models import AcceleratorVendor, QueryFilter, RawCatalogItem
from gpuhunt._internal.models import AcceleratorVendor, JSONObject, QueryFilter, RawCatalogItem
from gpuhunt.providers import AbstractProvider

logger = logging.getLogger(__name__)

API_URL = "https://backendprod.jarvislabs.net"
API_URL = "https://backendn.jarvislabs.net"
SERVER_META_PATH = "/misc/server_meta"
TIMEOUT = 30
# JarvisLabs exposes offer regions in server_meta, but VM provisioning calls must be sent
Expand All @@ -18,17 +20,27 @@
# unknown regions, otherwise dstack may select capacity it cannot create.
JARVISLABS_REGION_URLS = {
"india-01": "https://backendprod.jarvislabs.net",
"india-chennai-01": "https://backendc.jarvislabs.net",
"india-noida-01": "https://backendn.jarvislabs.net",
"europe-01": "https://backendeu.jarvislabs.net",
}
# dstack provisions JarvisLabs GPU VMs by passing a GPU type back to the API.
# Keep ambiguous API names with spaces out of the catalog; otherwise the
# normalized gpuhunt name cannot be converted back safely without provider_data.
# Explicit mappings for human-reviewed JarvisLabs GPU tokens that differ from
# gpuhunt canonical GPU names. Keep unmapped spaced names out of the catalog so
# new provider tokens do not get normalized incorrectly and silently.
JARVISLABS_GPU_NAME_OVERRIDES = {
"A100-80GB": ("A100", 80.0),
"RTX-PRO6000": ("RTXPRO6000", 96.0),
"RTX PRO 6000": ("RTXPRO6000", 96.0),
}


class JarvisLabsCatalogItemProviderData(TypedDict):
    """Typed shape of the ``provider_data`` payload built for JarvisLabs GPU offers.

    The only key, ``gpu_type``, is optional (``NotRequired``).
    """

    # Original JarvisLabs API GPU type, set only when gpuhunt normalization loses
    # the create-time token, e.g. A100-80GB -> A100 or RTX-PRO6000 -> RTXPRO6000.
    # dstack uses this value for VM creation.
    gpu_type: NotRequired[str]


class JarvisLabsProvider(AbstractProvider):
NAME = "jarvislabs"

Expand Down Expand Up @@ -97,7 +109,7 @@ def _make_gpu_catalog_items(gpu: dict) -> list[RawCatalogItem]:

gpu_spec = _gpu_name_and_memory(gpu_type, gpu.get("vram"))
if gpu_spec is None:
logger.warning("Skipping JarvisLabs GPU offer with ambiguous gpu_type: %s", gpu_type)
logger.warning("Skipping JarvisLabs GPU offer with unmapped gpu_type: %s", gpu_type)
return []
gpu_name, gpu_memory = gpu_spec
if gpu_memory is None:
Expand All @@ -119,24 +131,12 @@ def _make_gpu_catalog_items(gpu: dict) -> list[RawCatalogItem]:
ram_per_gpu=ram_per_gpu,
available_devices=_available_devices(gpu),
max_gpus_per_instance=_max_gpus_per_instance(gpu),
provider_data=_gpu_provider_data(gpu_type, gpu_name),
spot=False,
)

spot_price = _as_float(gpu.get("spot_price"))
if spot_price is not None:
items.extend(
_make_gpu_catalog_items_for_price(
region=region,
gpu_name=gpu_name,
gpu_memory=gpu_memory,
price=spot_price,
cpu_per_gpu=cpu_per_gpu,
ram_per_gpu=ram_per_gpu,
available_devices=_spot_available_devices(gpu),
max_gpus_per_instance=_max_gpus_per_instance(gpu),
spot=True,
)
)
# JarvisLabs supports spot for containers/templates, not VMs. This provider
# only publishes VM-capable offers because dstack provisions JarvisLabs VMs.
return items


Expand All @@ -150,6 +150,7 @@ def _make_gpu_catalog_items_for_price(
ram_per_gpu: float,
available_devices: int,
max_gpus_per_instance: int,
provider_data: JSONObject,
spot: bool,
) -> list[RawCatalogItem]:
items = []
Expand All @@ -170,6 +171,7 @@ def _make_gpu_catalog_items_for_price(
gpu_memory=gpu_memory,
spot=spot,
disk_size=None,
provider_data=provider_data,
)
)
return items
Expand Down Expand Up @@ -216,6 +218,12 @@ def _make_cpu_catalog_items(cpu_meta: dict) -> list[RawCatalogItem]:
return offers


def _gpu_provider_data(gpu_type: str, gpu_name: str) -> JSONObject:
    """Build the ``provider_data`` mapping for a JarvisLabs GPU offer.

    Args:
        gpu_type: GPU type token as reported by the JarvisLabs API.
        gpu_name: GPU name used for the catalog item.

    Returns:
        An empty mapping when ``gpu_type`` and ``gpu_name`` are equal;
        otherwise a mapping whose ``gpu_type`` key holds the original token.
    """
    if gpu_type != gpu_name:
        # The names differ, so record the original token alongside the offer.
        payload = JarvisLabsCatalogItemProviderData(gpu_type=gpu_type)
        return cast(JSONObject, payload)
    return {}


def _supported_gpu_counts(*, available_devices: int, max_gpus_per_instance: int) -> list[int]:
if available_devices <= 0 or max_gpus_per_instance <= 0:
return []
Expand All @@ -228,18 +236,14 @@ def _available_devices(gpu: dict) -> int:
)


def _spot_available_devices(gpu: dict) -> int:
return _as_int(gpu.get("spot_num_free_devices")) or 0


def _max_gpus_per_instance(gpu: dict) -> int:
    """Return the ``num_gpus`` value of a GPU entry, defaulting to 1.

    The raw value is passed through ``_as_int``; any falsy result (for
    example ``None`` or ``0``) falls back to 1.
    """
    parsed_count = _as_int(gpu.get("num_gpus"))
    return parsed_count or 1


def _gpu_name_and_memory(gpu_type: str, vram: object) -> tuple[str, float | None] | None:
if any(c.isspace() for c in gpu_type):
return None
gpu_name, default_memory = JARVISLABS_GPU_NAME_OVERRIDES.get(gpu_type, (gpu_type, None))
if gpu_name == gpu_type and any(c.isspace() for c in gpu_type):
return None
Comment on lines +245 to +246

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(nit) Now that you pass gpu_type in provider_data, it should be possible to normalize GPU names with spaces instead of skipping them (like you suggested in the initial JarvisLabs PR version).

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I’d prefer to keep this fail-closed. provider_data["gpu_type"] solves the create-time token issue once we have a reviewed mapping, but it does not make arbitrary spaced GPU names safe to canonicalize. For example, blindly stripping spaces would map RTX A6000 to RTXA6000, while gpuhunt uses A6000.

So I kept unmapped spaced JarvisLabs GPU types skipped with a warning and renamed the warning from “ambiguous” to “unmapped”. For any spaced JarvisLabs token we verify, we can add an explicit mapping.

return gpu_name, _as_float(vram) or default_memory


Expand Down
12 changes: 12 additions & 0 deletions src/tests/_internal/test_constraints.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from gpuhunt._internal.constraints import (
correct_gpu_memory_gib,
find_accelerators,
get_compute_capability,
get_gpu_vendor,
matches,
)
Expand Down Expand Up @@ -250,3 +251,14 @@ def test_tenstorrent_accelerators(gpu_name: str, expected_memories_gib: set[int]
assert {accelerator.name for accelerator in accelerators} == {gpu_name}
assert {accelerator.memory for accelerator in accelerators} == expected_memories_gib
assert get_gpu_vendor(gpu_name.upper()) == AcceleratorVendor.TENSTORRENT


def test_rtx_pro_6000_accelerator() -> None:
    """RTXPRO6000 is a known NVIDIA accelerator with memory 96 and capability (12, 0)."""
    gpu_name = "RTXPRO6000"
    found = find_accelerators(names=[gpu_name], vendors=[AcceleratorVendor.NVIDIA])
    memories = [item.memory for item in found]

    assert memories == [96]
    assert get_compute_capability(gpu_name) == (12, 0)
    assert get_gpu_vendor(gpu_name) == AcceleratorVendor.NVIDIA
74 changes: 55 additions & 19 deletions src/tests/providers/test_jarvislabs.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,34 @@
"workload_type": "vm",
"num_gpus": "4",
},
{
"gpu_type": "RTX-PRO6000",
"region": "india-chennai-01",
"num_free_devices": 2,
"effective_num_free_devices": 2,
"spot_num_free_devices": 1,
"price_per_hour": 1.89,
"spot_price": 1.19,
"vram": "96",
"cpus_per_gpu": 28,
"ram_per_gpu": 160,
"workload_type": "vm",
"num_gpus": "8",
},
{
"gpu_type": "RTX PRO 6000",
"region": "india-noida-01",
"num_free_devices": 1,
"effective_num_free_devices": 1,
"spot_num_free_devices": 0,
"price_per_hour": 1.89,
"spot_price": None,
"vram": "96",
"cpus_per_gpu": 28,
"ram_per_gpu": 160,
"workload_type": "vm",
"num_gpus": "8",
},
{
"gpu_type": "H100",
"region": "europe-01",
Expand Down Expand Up @@ -98,28 +126,35 @@

def test_convert_response_to_raw_catalog_items():
offers = convert_response_to_raw_catalog_items(SERVER_META_RESPONSE)

assert all(o.provider_data == {} for o in offers)
assert not any(o.spot for o in offers)

l4_vm = [o for o in offers if o.gpu_name == "L4" and not o.spot]
assert [o.gpu_count for o in l4_vm] == [1, 2, 3]
assert [o.price for o in l4_vm] == [0.44, 0.88, 1.32]
assert [o.instance_name for o in l4_vm] == ["L4-1x", "L4-2x", "L4-3x"]

l4_spot = [o for o in offers if o.gpu_name == "L4" and o.spot]
assert [o.gpu_count for o in l4_spot] == [1, 2]
assert [o.price for o in l4_spot] == [0.29, 0.58]
assert [o.instance_name for o in l4_spot] == ["L4-1x", "L4-2x"]
assert all(o.provider_data == {} for o in l4_vm)

a100 = next(o for o in offers if o.instance_name == "A100-1x" and not o.spot)
assert a100.gpu_name == "A100"
assert a100.gpu_memory == 80
assert a100.location == "india-noida-01"
assert a100.disk_size is None
assert a100.provider_data == {}
assert a100.provider_data == {"gpu_type": "A100-80GB"}

a100_spot = next(o for o in offers if o.instance_name == "A100-1x" and o.spot)
assert a100_spot.price == 0.89
rtx_pro_6000 = [o for o in offers if o.gpu_name == "RTXPRO6000" and not o.spot]
assert [o.gpu_count for o in rtx_pro_6000] == [1, 2, 1]
assert [o.instance_name for o in rtx_pro_6000] == [
"RTXPRO6000-1x",
"RTXPRO6000-2x",
"RTXPRO6000-1x",
]
assert [o.provider_data for o in rtx_pro_6000] == [
{"gpu_type": "RTX-PRO6000"},
{"gpu_type": "RTX-PRO6000"},
{"gpu_type": "RTX PRO 6000"},
]
assert rtx_pro_6000[0].location == "india-chennai-01"
assert all(o.gpu_memory == 96 for o in rtx_pro_6000)

h100 = next(o for o in offers if o.gpu_name == "H100")
assert h100.gpu_count == 1
Expand All @@ -145,24 +180,24 @@ def test_convert_response_warns_and_skips_unsupported_regions(caplog):
assert "Skipping JarvisLabs CPU VM offer in unsupported region unknown-region" in caplog.text


def test_convert_response_skips_ambiguous_gpu_types_with_spaces(caplog):
def test_convert_response_skips_unmapped_gpu_types_with_spaces(caplog):
response = {
"server_meta": [
{
"gpu_type": "H100 NVL",
"gpu_type": "RTX A6000",
"region": "india-noida-01",
"num_free_devices": 1,
"price_per_hour": 2.99,
"vram": "94",
"price_per_hour": 0.79,
"vram": "48",
"cpus_per_gpu": 16,
"ram_per_gpu": 200,
"ram_per_gpu": 100,
"workload_type": "vm",
},
],
}

assert convert_response_to_raw_catalog_items(response) == []
assert "Skipping JarvisLabs GPU offer with ambiguous gpu_type: H100 NVL" in caplog.text
assert "Skipping JarvisLabs GPU offer with unmapped gpu_type: RTX A6000" in caplog.text


def test_convert_response_skips_malformed_specs(caplog):
Expand Down Expand Up @@ -239,10 +274,11 @@ def test_catalog_query(requests_mock, monkeypatch):
JarvisLabsProvider(api_key="test-token", api_url="https://api.jarvislabs.test")
)

assert len(catalog.query(provider=["jarvislabs"], min_gpu_count=2, gpu_name="L4")) == 3
assert len(catalog.query(provider=["jarvislabs"], gpu_name="A100", min_gpu_memory=80)) == 2
assert len(catalog.query(provider=["jarvislabs"], min_gpu_count=2, gpu_name="L4")) == 2
assert len(catalog.query(provider=["jarvislabs"], gpu_name="A100", min_gpu_memory=80)) == 1
assert len(catalog.query(provider=["jarvislabs"], gpu_name="RTXPRO6000")) == 3
assert len(catalog.query(provider=["jarvislabs"], max_gpu_count=0)) == 1
assert len(catalog.query(provider=["jarvislabs"], min_disk_size=250)) == 9
assert len(catalog.query(provider=["jarvislabs"], max_disk_size=50)) == 9
assert len(catalog.query(provider=["jarvislabs"], gpu_name="L4", spot=False)) == 3
assert len(catalog.query(provider=["jarvislabs"], gpu_name="L4", spot=True)) == 2
assert len(catalog.query(provider=["jarvislabs"], gpu_name="L4", spot=True)) == 0
Loading