diff --git a/scripts/regenerate_templates.py b/scripts/regenerate_templates.py
index 30d0bb0d..92c470f1 100644
--- a/scripts/regenerate_templates.py
+++ b/scripts/regenerate_templates.py
@@ -92,7 +92,7 @@
     },
 }
 
-PLACEHOLDER_MODEL = ""
+PLACEHOLDER_MODEL = ""
 
 PLACEHOLDER_ENDPOINT = ""
diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py
index 50a3163d..bf62c4c1 100644
--- a/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py
+++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py
@@ -17,9 +17,11 @@
 import argparse
 import asyncio
+import logging
 from contextlib import AbstractContextManager, nullcontext
 from pathlib import Path
 
+from huggingface_hub.errors import GatedRepoError
 from inference_endpoint.async_utils.loop_manager import LoopManager
 from inference_endpoint.async_utils.transport.zmq.context import ManagedZMQContext
 from inference_endpoint.async_utils.transport.zmq.ready_check import send_ready_signal
@@ -29,6 +31,8 @@
 from .kv_store import BasicKVStore
 from .token_metrics import TokenizePool
 
+logger = logging.getLogger(__name__)
+
 
 async def main() -> None:
     parser = argparse.ArgumentParser(
@@ -91,10 +95,28 @@ async def main() -> None:
     # Using ternary operator causes errors in MyPy object type coalescing
     # (coalesces to 'object' not 'AbstractContextManager[TokenizePool | None]')
+    pool_cm: AbstractContextManager[TokenizePool | None]
     if args.tokenizer:
-        pool_cm: AbstractContextManager[TokenizePool | None] = TokenizePool(
-            args.tokenizer, n_workers=args.tokenizer_workers
-        )
+        try:
+            pool_cm = TokenizePool(args.tokenizer, n_workers=args.tokenizer_workers)
+        except (GatedRepoError, OSError) as e:
+            # transformers re-raises GatedRepoError as OSError containing "gated"
+            if isinstance(e, GatedRepoError) or "gated" in str(e).lower():
+                logger.warning(
+                    "Tokenizer '%s' is a gated HuggingFace repo and this "
+                    "environment has no access. Set HF_TOKEN or use an "
+                    "ungated model to enable token metrics. Continuing "
+                    "without token metrics (ISL/OSL/TPOT unavailable).",
+                    args.tokenizer,
+                )
+            else:
+                logger.warning(
+                    "Failed to load tokenizer '%s': %s. Continuing without "
+                    "token metrics (ISL/OSL/TPOT unavailable).",
+                    args.tokenizer,
+                    e,
+                )
+            pool_cm = nullcontext()
     else:
         pool_cm = nullcontext()
diff --git a/src/inference_endpoint/commands/benchmark/execute.py b/src/inference_endpoint/commands/benchmark/execute.py
index 35bd23b7..4ac5e8f6 100644
--- a/src/inference_endpoint/commands/benchmark/execute.py
+++ b/src/inference_endpoint/commands/benchmark/execute.py
@@ -38,7 +38,8 @@
 from urllib.parse import urljoin
 
 import msgspec.json
-from huggingface_hub import model_info
+from huggingface_hub import HfApi, model_info
+from huggingface_hub.errors import GatedRepoError
 from tqdm import tqdm
 from transformers.utils import logging as transformers_logging
@@ -179,27 +180,39 @@ def enable_streaming(self) -> bool:
 
 def _check_tokenizer_exists(model_name: str) -> bool:
-    """Check if a HuggingFace tokenizer exists for the model (API only, no download).
-
-    Returns True if the model repo exists and has tokenizer files, False otherwise.
-    This function is a probe — it never loads or downloads the tokenizer itself.
-    Downstream consumers that need tokenization (e.g. the MetricsAggregator
-    subprocess for ISL/OSL/TPOT, Harmony transforms for prompt preprocessing,
-    and any future plugin with its own tokenization need) each load their own
-    instance as required.
+    """Check if a HuggingFace tokenizer exists and is accessible for the model.
+
+    Returns True if the model repo exists, has tokenizer files, and the current
+    environment has read access. Returns False otherwise. This function is a
+    probe — it does not download the tokenizer itself, but for gated repos it
+    issues a lightweight access check (via HfApi.auth_check) to verify that
+    downstream consumers (e.g. the MetricsAggregator subprocess for
+    ISL/OSL/TPOT) will be able to load it without auth errors.
     """
     try:
         info = model_info(model_name)
-        # Check for tokenizer files in the repo
         siblings = {s.rfilename for s in (info.siblings or [])}
         has_tokenizer = (
             "tokenizer_config.json" in siblings or "tokenizer.json" in siblings
         )
-        if has_tokenizer:
-            logger.info(f"Tokenizer available for model: {model_name}")
-        else:
+        if not has_tokenizer:
             logger.warning(f"Model {model_name} found but has no tokenizer files")
-        return has_tokenizer
+            return False
+        # The metadata API returns 200 for gated repos even without auth; we
+        # must explicitly check access or the subprocess will 401 on download.
+        if info.gated:
+            try:
+                HfApi().auth_check(repo_id=model_name)
+            except GatedRepoError:
+                logger.warning(
+                    f"Model '{model_name}' is a gated HuggingFace repo and "
+                    "this environment has no access. Set HF_TOKEN or use an "
+                    "ungated model to enable token metrics. Continuing "
+                    "without token metrics (ISL/OSL/TPOT unavailable)."
+                )
+                return False
+        logger.info(f"Tokenizer available for model: {model_name}")
+        return True
     except ImportError:
         # huggingface_hub not installed — fall back to assuming it works
         logger.info(
diff --git a/src/inference_endpoint/config/templates/concurrency_template.yaml b/src/inference_endpoint/config/templates/concurrency_template.yaml
index e3f2bc2e..a6490b18 100644
--- a/src/inference_endpoint/config/templates/concurrency_template.yaml
+++ b/src/inference_endpoint/config/templates/concurrency_template.yaml
@@ -1,7 +1,7 @@
 name: concurrency_benchmark
 type: online # Test type: offline, online, eval, submission | options: offline, online, eval, submission
 model_params:
-  name: ''
+  name: ''
 datasets: # Dataset configs
 - name: perf
   type: performance # Dataset purpose: performance or accuracy | options: performance, accuracy
diff --git a/src/inference_endpoint/config/templates/concurrency_template_full.yaml b/src/inference_endpoint/config/templates/concurrency_template_full.yaml
index bf233cec..a766315b 100644
--- a/src/inference_endpoint/config/templates/concurrency_template_full.yaml
+++ b/src/inference_endpoint/config/templates/concurrency_template_full.yaml
@@ -4,7 +4,7 @@ type: online # Test type: offline, online, eval, submission | options: offline,
 submission_ref: null
 benchmark_mode: null # options: offline, online
 model_params:
-  name: ''
+  name: ''
   temperature: null # Sampling temperature
   top_k: null # Top-K sampling
   top_p: null # Top-P (nucleus) sampling
diff --git a/src/inference_endpoint/config/templates/eval_template.yaml b/src/inference_endpoint/config/templates/eval_template.yaml
index 3213efd1..42bd81df 100644
--- a/src/inference_endpoint/config/templates/eval_template.yaml
+++ b/src/inference_endpoint/config/templates/eval_template.yaml
@@ -4,7 +4,7 @@
 version: "1.0"
 type: "eval"
 
 model_params:
-  name: "meta-llama/Llama-3.1-8B-Instruct"
+  name: "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
   temperature: 0.0
   max_new_tokens: 2048
diff --git a/src/inference_endpoint/config/templates/offline_template.yaml b/src/inference_endpoint/config/templates/offline_template.yaml
index 97ec2769..9bba04bc 100644
--- a/src/inference_endpoint/config/templates/offline_template.yaml
+++ b/src/inference_endpoint/config/templates/offline_template.yaml
@@ -1,7 +1,7 @@
 name: offline_benchmark
 type: offline # Test type: offline, online, eval, submission | options: offline, online, eval, submission
 model_params:
-  name: ''
+  name: ''
 datasets: # Dataset configs
 - name: perf
   type: performance # Dataset purpose: performance or accuracy | options: performance, accuracy
diff --git a/src/inference_endpoint/config/templates/offline_template_full.yaml b/src/inference_endpoint/config/templates/offline_template_full.yaml
index b716ed6c..f4dc44a3 100644
--- a/src/inference_endpoint/config/templates/offline_template_full.yaml
+++ b/src/inference_endpoint/config/templates/offline_template_full.yaml
@@ -4,7 +4,7 @@ type: offline # Test type: offline, online, eval, submission | options: offline
 submission_ref: null
 benchmark_mode: null # options: offline, online
 model_params:
-  name: ''
+  name: ''
   temperature: null # Sampling temperature
   top_k: null # Top-K sampling
   top_p: null # Top-P (nucleus) sampling
diff --git a/src/inference_endpoint/config/templates/online_template.yaml b/src/inference_endpoint/config/templates/online_template.yaml
index e321daa1..9e1eec14 100644
--- a/src/inference_endpoint/config/templates/online_template.yaml
+++ b/src/inference_endpoint/config/templates/online_template.yaml
@@ -1,7 +1,7 @@
 name: online_benchmark
 type: online # Test type: offline, online, eval, submission | options: offline, online, eval, submission
 model_params:
-  name: ''
+  name: ''
 datasets: # Dataset configs
 - name: perf
   type: performance # Dataset purpose: performance or accuracy | options: performance, accuracy
diff --git a/src/inference_endpoint/config/templates/online_template_full.yaml b/src/inference_endpoint/config/templates/online_template_full.yaml
index 31e49d73..35dfa2f3 100644
--- a/src/inference_endpoint/config/templates/online_template_full.yaml
+++ b/src/inference_endpoint/config/templates/online_template_full.yaml
@@ -4,7 +4,7 @@ type: online # Test type: offline, online, eval, submission | options: offline,
 submission_ref: null
 benchmark_mode: null # options: offline, online
 model_params:
-  name: ''
+  name: ''
   temperature: null # Sampling temperature
   top_k: null # Top-K sampling
   top_p: null # Top-P (nucleus) sampling
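
Reviewer note: the access probe is easier to read outside the hunk context, so below is a
minimal self-contained sketch of the two-step check that execute.py now performs.
tokenizer_is_usable is an illustrative name, not part of this patch; model_info,
HfApi.auth_check (available in huggingface_hub >= 0.25), and the error classes are the
real APIs the patch uses.

    # Step 1: public metadata lookup — returns 200 even for gated repos.
    # Step 2: explicit access check — only needed when the repo is gated.
    from huggingface_hub import HfApi, model_info
    from huggingface_hub.errors import GatedRepoError, RepositoryNotFoundError

    def tokenizer_is_usable(model_name: str) -> bool:
        try:
            info = model_info(model_name)
        except RepositoryNotFoundError:
            return False
        siblings = {s.rfilename for s in (info.siblings or [])}
        if not ({"tokenizer.json", "tokenizer_config.json"} & siblings):
            return False  # repo exists but ships no tokenizer files
        if info.gated:
            try:
                HfApi().auth_check(repo_id=model_name)  # cheap; downloads nothing
            except GatedRepoError:
                return False  # a downstream tokenizer load would hit an auth error
        return True

    print(tokenizer_is_usable("TinyLlama/TinyLlama-1.1B-Chat-v1.0"))  # ungated -> True
    print(tokenizer_is_usable("meta-llama/Llama-3.1-8B-Instruct"))    # gated -> False without a valid HF_TOKEN

Doing the auth check in the parent process means a missing token is reported once, up
front, instead of surfacing as a 401 deep inside the MetricsAggregator subprocess; the
try/except added to __main__.py remains as a second line of defense for tokenizers that
fail to load for other reasons.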