2 changes: 1 addition & 1 deletion scripts/regenerate_templates.py
@@ -92,7 +92,7 @@
     },
 }

-PLACEHOLDER_MODEL = "<MODEL_NAME eg: meta-llama/Llama-3.1-8B-Instruct>"
+PLACEHOLDER_MODEL = "<MODEL_NAME eg: TinyLlama/TinyLlama-1.1B-Chat-v1.0>"
 PLACEHOLDER_ENDPOINT = "<ENDPOINT_URL eg: http://localhost:8000>"

@@ -17,9 +17,11 @@

 import argparse
 import asyncio
+import logging
 from contextlib import AbstractContextManager, nullcontext
 from pathlib import Path

+from huggingface_hub.errors import GatedRepoError
 from inference_endpoint.async_utils.loop_manager import LoopManager
 from inference_endpoint.async_utils.transport.zmq.context import ManagedZMQContext
 from inference_endpoint.async_utils.transport.zmq.ready_check import send_ready_signal
@@ -29,6 +31,8 @@
 from .kv_store import BasicKVStore
 from .token_metrics import TokenizePool

+logger = logging.getLogger(__name__)
+


 async def main() -> None:
     parser = argparse.ArgumentParser(
@@ -91,10 +95,28 @@ async def main() -> None:

# Using ternary operator causes errors in MyPy object type coalescing
# (coalesces to 'object' not 'AbstractContextManager[TokenizePool | None]')
pool_cm: AbstractContextManager[TokenizePool | None]
if args.tokenizer:
pool_cm: AbstractContextManager[TokenizePool | None] = TokenizePool(
args.tokenizer, n_workers=args.tokenizer_workers
)
try:
pool_cm = TokenizePool(args.tokenizer, n_workers=args.tokenizer_workers)
except (GatedRepoError, OSError) as e:
# transformers re-raises GatedRepoError as OSError containing "gated"
if isinstance(e, GatedRepoError) or "gated" in str(e).lower():
logger.warning(
"Tokenizer '%s' is a gated HuggingFace repo and this "
"environment has no access. Set HF_TOKEN or use an "
"ungated model to enable token metrics. Continuing "
"without token metrics (ISL/OSL/TPOT unavailable).",
args.tokenizer,
)
else:
logger.warning(
"Failed to load tokenizer '%s': %s. Continuing without "
"token metrics (ISL/OSL/TPOT unavailable).",
args.tokenizer,
e,
)
pool_cm = nullcontext()
Comment on lines +102 to +119
medium

The error handling for TokenizePool initialization is robust, but the string matching on OSError (line 104) is fragile: it depends on the internal error-message format of the transformers library. While this is a known pattern for detecting gated-repo errors in transformers, consider whether there is a more reliable way to detect this, and ensure that any other OSError raised during initialization is also handled correctly (which the else block currently does).
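A less brittle alternative (a sketch only, not part of this PR) is to walk the exception chain for the original GatedRepoError, since transformers typically wraps hub errors with `raise ... from e`, keeping the string match only as a last resort:

```python
from huggingface_hub.errors import GatedRepoError


def _is_gated_repo_error(exc: BaseException) -> bool:
    """Return True if a GatedRepoError appears anywhere in the exception chain.

    Sketch: assumes transformers chains the original hub error via
    __cause__/__context__; falls back to the string match when it does not.
    """
    seen: set[int] = set()
    current: BaseException | None = exc
    while current is not None and id(current) not in seen:
        if isinstance(current, GatedRepoError):
            return True
        seen.add(id(current))
        current = current.__cause__ or current.__context__
    return "gated" in str(exc).lower()
```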

     else:
         pool_cm = nullcontext()
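For readers puzzled by the annotation: `nullcontext()` returns None from `__enter__`, so a single `AbstractContextManager[TokenizePool | None]` annotation covers both branches. A minimal self-contained sketch, using `str` as a stand-in for TokenizePool:

```python
from contextlib import AbstractContextManager, nullcontext

# Declare once, assign in branches -- mirrors the pattern in the diff above.
pool_cm: AbstractContextManager[str | None]
pool_cm = nullcontext()  # the fallback branch; __enter__ yields None

with pool_cm as pool:
    if pool is None:
        print("token metrics disabled (ISL/OSL/TPOT unavailable)")
    else:
        print(f"tokenizing with {pool}")
```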

41 changes: 27 additions & 14 deletions src/inference_endpoint/commands/benchmark/execute.py
@@ -38,7 +38,8 @@
 from urllib.parse import urljoin

 import msgspec.json
-from huggingface_hub import model_info
+from huggingface_hub import HfApi, model_info
+from huggingface_hub.errors import GatedRepoError
 from tqdm import tqdm
 from transformers.utils import logging as transformers_logging

@@ -179,27 +180,39 @@ def enable_streaming(self) -> bool:


 def _check_tokenizer_exists(model_name: str) -> bool:
-    """Check if a HuggingFace tokenizer exists for the model (API only, no download).
-
-    Returns True if the model repo exists and has tokenizer files, False otherwise.
-    This function is a probe — it never loads or downloads the tokenizer itself.
-    Downstream consumers that need tokenization (e.g. the MetricsAggregator
-    subprocess for ISL/OSL/TPOT, Harmony transforms for prompt preprocessing,
-    and any future plugin with its own tokenization need) each load their own
-    instance as required.
+    """Check if a HuggingFace tokenizer exists and is accessible for the model.
+
+    Returns True if the model repo exists, has tokenizer files, and the current
+    environment has read access. Returns False otherwise. This function is a
+    probe — it does not download the tokenizer itself, but for gated repos it
+    issues a HEAD request (via HfApi.auth_check) to verify that downstream
+    consumers (e.g. the MetricsAggregator subprocess for ISL/OSL/TPOT) will be
+    able to load it without auth errors.
     """
     try:
         info = model_info(model_name)
         # Check for tokenizer files in the repo
         siblings = {s.rfilename for s in (info.siblings or [])}
         has_tokenizer = (
             "tokenizer_config.json" in siblings or "tokenizer.json" in siblings
         )
-        if has_tokenizer:
-            logger.info(f"Tokenizer available for model: {model_name}")
-        else:
+        if not has_tokenizer:
             logger.warning(f"Model {model_name} found but has no tokenizer files")
-        return has_tokenizer
+            return False
+        # The metadata API returns 200 for gated repos even without auth; we
+        # must explicitly check access or the subprocess will 401 on download.
+        if info.gated:
+            try:
+                HfApi().auth_check(repo_id=model_name)
+            except GatedRepoError:
+                logger.warning(
+                    f"Model '{model_name}' is a gated HuggingFace repo and "
+                    "this environment has no access. Set HF_TOKEN or use an "
+                    "ungated model to enable token metrics. Continuing "
+                    "without token metrics (ISL/OSL/TPOT unavailable)."
+                )
+                return False
Comment on lines +203 to +213
medium

The addition of HfApi().auth_check is a good proactive measure to verify access to gated repositories before launching the aggregator subprocess. Note, however, that auth_check was introduced in huggingface_hub version 0.23.0; in an environment with an older version, this call raises AttributeError. The general except Exception block at line 222 will catch this and allow the benchmark to continue without token metrics, but it would be worth updating the project's dependency pins to reflect this requirement.
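One defensive variant (a sketch, not in this PR) probes for the method before calling it, so older huggingface_hub releases degrade gracefully instead of raising AttributeError:

```python
from huggingface_hub import HfApi
from huggingface_hub.errors import GatedRepoError


def _can_access_gated_repo(model_name: str) -> bool:
    """Sketch: return False only when access is provably denied.

    HfApi.auth_check is absent on older huggingface_hub releases; in that
    case, optimistically return True and let the downstream consumer
    surface any auth failure itself.
    """
    auth_check = getattr(HfApi(), "auth_check", None)
    if auth_check is None:  # huggingface_hub too old to probe access
        return True
    try:
        auth_check(repo_id=model_name)
    except GatedRepoError:
        return False
    return True
```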

+        logger.info(f"Tokenizer available for model: {model_name}")
+        return True
     except ImportError:
         # huggingface_hub not installed — fall back to assuming it works
         logger.info(
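To see why the explicit check matters: Hub metadata for gated repos is public, so `model_info` succeeds without a token, while an access check fails just as a file download would. An illustrative probe (assumes no HF_TOKEN is set; the repo name is just a well-known gated example):

```python
from huggingface_hub import HfApi, model_info
from huggingface_hub.errors import GatedRepoError

info = model_info("meta-llama/Llama-3.1-8B-Instruct")  # metadata is public
print(info.gated)  # "auto" or "manual" for gated repos, False otherwise

try:
    HfApi().auth_check(repo_id="meta-llama/Llama-3.1-8B-Instruct")
except GatedRepoError:
    print("no read access; downstream downloads would 401")
```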
@@ -1,7 +1,7 @@
 name: concurrency_benchmark
 type: online # Test type: offline, online, eval, submission | options: offline, online, eval, submission
 model_params:
-  name: '<MODEL_NAME eg: meta-llama/Llama-3.1-8B-Instruct>'
+  name: '<MODEL_NAME eg: TinyLlama/TinyLlama-1.1B-Chat-v1.0>'
 datasets: # Dataset configs
   - name: perf
     type: performance # Dataset purpose: performance or accuracy | options: performance, accuracy
@@ -4,7 +4,7 @@ type: online # Test type: offline, online, eval, submission | options: offline,
 submission_ref: null
 benchmark_mode: null # options: offline, online
 model_params:
-  name: '<MODEL_NAME eg: meta-llama/Llama-3.1-8B-Instruct>'
+  name: '<MODEL_NAME eg: TinyLlama/TinyLlama-1.1B-Chat-v1.0>'
   temperature: null # Sampling temperature
   top_k: null # Top-K sampling
   top_p: null # Top-P (nucleus) sampling
@@ -4,7 +4,7 @@ version: "1.0"
 type: "eval"

 model_params:
-  name: "meta-llama/Llama-3.1-8B-Instruct"
+  name: "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
   temperature: 0.0
   max_new_tokens: 2048
@@ -1,7 +1,7 @@
 name: offline_benchmark
 type: offline # Test type: offline, online, eval, submission | options: offline, online, eval, submission
 model_params:
-  name: '<MODEL_NAME eg: meta-llama/Llama-3.1-8B-Instruct>'
+  name: '<MODEL_NAME eg: TinyLlama/TinyLlama-1.1B-Chat-v1.0>'
 datasets: # Dataset configs
   - name: perf
     type: performance # Dataset purpose: performance or accuracy | options: performance, accuracy
@@ -4,7 +4,7 @@ type: offline # Test type: offline, online, eval, submission | options: offline
 submission_ref: null
 benchmark_mode: null # options: offline, online
 model_params:
-  name: '<MODEL_NAME eg: meta-llama/Llama-3.1-8B-Instruct>'
+  name: '<MODEL_NAME eg: TinyLlama/TinyLlama-1.1B-Chat-v1.0>'
   temperature: null # Sampling temperature
   top_k: null # Top-K sampling
   top_p: null # Top-P (nucleus) sampling
@@ -1,7 +1,7 @@
 name: online_benchmark
 type: online # Test type: offline, online, eval, submission | options: offline, online, eval, submission
 model_params:
-  name: '<MODEL_NAME eg: meta-llama/Llama-3.1-8B-Instruct>'
+  name: '<MODEL_NAME eg: TinyLlama/TinyLlama-1.1B-Chat-v1.0>'
 datasets: # Dataset configs
   - name: perf
     type: performance # Dataset purpose: performance or accuracy | options: performance, accuracy
@@ -4,7 +4,7 @@ type: online # Test type: offline, online, eval, submission | options: offline,
 submission_ref: null
 benchmark_mode: null # options: offline, online
 model_params:
-  name: '<MODEL_NAME eg: meta-llama/Llama-3.1-8B-Instruct>'
+  name: '<MODEL_NAME eg: TinyLlama/TinyLlama-1.1B-Chat-v1.0>'
   temperature: null # Sampling temperature
   top_k: null # Top-K sampling
   top_p: null # Top-P (nucleus) sampling