2 changes: 1 addition & 1 deletion scripts/regenerate_templates.py
@@ -92,7 +92,7 @@
     },
 }

-PLACEHOLDER_MODEL = "<MODEL_NAME eg: meta-llama/Llama-3.1-8B-Instruct>"
+PLACEHOLDER_MODEL = "<MODEL_NAME eg: TinyLlama/TinyLlama-1.1B-Chat-v1.0>"
 PLACEHOLDER_ENDPOINT = "<ENDPOINT_URL eg: http://localhost:8000>"

@@ -17,9 +17,11 @@

 import argparse
 import asyncio
+import logging
 from contextlib import AbstractContextManager, nullcontext
 from pathlib import Path

+from huggingface_hub.errors import GatedRepoError
 from inference_endpoint.async_utils.loop_manager import LoopManager
 from inference_endpoint.async_utils.transport.zmq.context import ManagedZMQContext
 from inference_endpoint.async_utils.transport.zmq.ready_check import send_ready_signal
@@ -29,6 +31,8 @@
 from .kv_store import BasicKVStore
 from .token_metrics import TokenizePool

+logger = logging.getLogger(__name__)
+


 async def main() -> None:
     parser = argparse.ArgumentParser(
@@ -91,10 +95,28 @@ async def main() -> None:

# Using ternary operator causes errors in MyPy object type coalescing
# (coalesces to 'object' not 'AbstractContextManager[TokenizePool | None]')
pool_cm: AbstractContextManager[TokenizePool | None]
if args.tokenizer:
pool_cm: AbstractContextManager[TokenizePool | None] = TokenizePool(
args.tokenizer, n_workers=args.tokenizer_workers
)
try:
pool_cm = TokenizePool(args.tokenizer, n_workers=args.tokenizer_workers)
except (GatedRepoError, OSError) as e:
# transformers re-raises GatedRepoError as OSError containing "gated"
if isinstance(e, GatedRepoError) or "gated" in str(e).lower():
logger.warning(
"Tokenizer '%s' is a gated HuggingFace repo and this "
"environment has no access. Set HF_TOKEN or use an "
"ungated model to enable token metrics. Continuing "
"without token metrics (ISL/OSL/TPOT unavailable).",
args.tokenizer,
)
else:
logger.warning(
"Failed to load tokenizer '%s': %s. Continuing without "
"token metrics (ISL/OSL/TPOT unavailable).",
args.tokenizer,
e,
)
pool_cm = nullcontext()
Comment on lines +102 to +119
medium

The error handling for TokenizePool initialization is robust, but the string matching on OSError (line 104) is fragile: it depends on the internal error-message format of the transformers library. While this is a known pattern for detecting gated-repo errors in transformers, consider whether there is a more reliable way to detect this, and ensure that any other OSError raised during initialization is also handled correctly (which the else block currently does).
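A less brittle alternative (a sketch only, not part of this PR) is to walk the exception chain for the original GatedRepoError, since transformers typically wraps hub errors with `raise ... from e`, keeping the string match only as a last resort:

```python
from huggingface_hub.errors import GatedRepoError


def _is_gated_repo_error(exc: BaseException) -> bool:
    """Return True if a GatedRepoError appears anywhere in the exception chain.

    Sketch: assumes transformers chains the original hub error via
    __cause__/__context__; falls back to the string match when it does not.
    """
    seen: set[int] = set()
    current: BaseException | None = exc
    while current is not None and id(current) not in seen:
        if isinstance(current, GatedRepoError):
            return True
        seen.add(id(current))
        current = current.__cause__ or current.__context__
    return "gated" in str(exc).lower()
```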

     else:
         pool_cm = nullcontext()
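For readers puzzled by the annotation: `nullcontext()` returns None from `__enter__`, so a single `AbstractContextManager[TokenizePool | None]` annotation covers both branches. A minimal self-contained sketch, using `str` as a stand-in for TokenizePool:

```python
from contextlib import AbstractContextManager, nullcontext

# Declare once, assign in branches -- mirrors the pattern in the diff above.
pool_cm: AbstractContextManager[str | None]
pool_cm = nullcontext()  # the fallback branch; __enter__ yields None

with pool_cm as pool:
    if pool is None:
        print("token metrics disabled (ISL/OSL/TPOT unavailable)")
    else:
        print(f"tokenizing with {pool}")
```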

41 changes: 27 additions & 14 deletions src/inference_endpoint/commands/benchmark/execute.py
@@ -38,7 +38,8 @@
 from urllib.parse import urljoin

 import msgspec.json
-from huggingface_hub import model_info
+from huggingface_hub import HfApi, model_info
+from huggingface_hub.errors import GatedRepoError
 from tqdm import tqdm
 from transformers.utils import logging as transformers_logging

@@ -179,27 +180,39 @@ def enable_streaming(self) -> bool:


 def _check_tokenizer_exists(model_name: str) -> bool:
-    """Check if a HuggingFace tokenizer exists for the model (API only, no download).
-
-    Returns True if the model repo exists and has tokenizer files, False otherwise.
-    This function is a probe — it never loads or downloads the tokenizer itself.
-    Downstream consumers that need tokenization (e.g. the MetricsAggregator
-    subprocess for ISL/OSL/TPOT, Harmony transforms for prompt preprocessing,
-    and any future plugin with its own tokenization need) each load their own
-    instance as required.
+    """Check if a HuggingFace tokenizer exists and is accessible for the model.
+
+    Returns True if the model repo exists, has tokenizer files, and the current
+    environment has read access. Returns False otherwise. This function is a
+    probe — it does not download the tokenizer itself, but for gated repos it
+    issues a HEAD request (via HfApi.auth_check) to verify that downstream
+    consumers (e.g. the MetricsAggregator subprocess for ISL/OSL/TPOT) will be
+    able to load it without auth errors.
     """
     try:
         info = model_info(model_name)
         # Check for tokenizer files in the repo
         siblings = {s.rfilename for s in (info.siblings or [])}
         has_tokenizer = (
             "tokenizer_config.json" in siblings or "tokenizer.json" in siblings
         )
-        if has_tokenizer:
-            logger.info(f"Tokenizer available for model: {model_name}")
-        else:
+        if not has_tokenizer:
             logger.warning(f"Model {model_name} found but has no tokenizer files")
-        return has_tokenizer
+            return False
+        # The metadata API returns 200 for gated repos even without auth; we
+        # must explicitly check access or the subprocess will 401 on download.
+        if info.gated:
+            try:
+                HfApi().auth_check(repo_id=model_name)
+            except GatedRepoError:
+                logger.warning(
+                    f"Model '{model_name}' is a gated HuggingFace repo and "
+                    "this environment has no access. Set HF_TOKEN or use an "
+                    "ungated model to enable token metrics. Continuing "
+                    "without token metrics (ISL/OSL/TPOT unavailable)."
+                )
+                return False
Comment on lines +203 to +213
medium

The addition of HfApi().auth_check is a good proactive measure to verify access to gated repositories before launching the aggregator subprocess. Note, however, that auth_check was introduced in huggingface_hub version 0.23.0; in an environment with an older version, this call raises AttributeError. The general except Exception block at line 222 will catch this and allow the benchmark to continue without token metrics, but it would be worth updating the project's dependency pins to reflect this requirement.
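One defensive variant (a sketch, not in this PR) probes for the method before calling it, so older huggingface_hub releases degrade gracefully instead of raising AttributeError:

```python
from huggingface_hub import HfApi
from huggingface_hub.errors import GatedRepoError


def _can_access_gated_repo(model_name: str) -> bool:
    """Sketch: return False only when access is provably denied.

    HfApi.auth_check is absent on older huggingface_hub releases; in that
    case, optimistically return True and let the downstream consumer
    surface any auth failure itself.
    """
    auth_check = getattr(HfApi(), "auth_check", None)
    if auth_check is None:  # huggingface_hub too old to probe access
        return True
    try:
        auth_check(repo_id=model_name)
    except GatedRepoError:
        return False
    return True
```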

+        logger.info(f"Tokenizer available for model: {model_name}")
+        return True
     except ImportError:
         # huggingface_hub not installed — fall back to assuming it works
         logger.info(
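To see why the explicit check matters: Hub metadata for gated repos is public, so `model_info` succeeds without a token, while an access check fails just as a file download would. An illustrative probe (assumes no HF_TOKEN is set; the repo name is just a well-known gated example):

```python
from huggingface_hub import HfApi, model_info
from huggingface_hub.errors import GatedRepoError

info = model_info("meta-llama/Llama-3.1-8B-Instruct")  # metadata is public
print(info.gated)  # "auto" or "manual" for gated repos, False otherwise

try:
    HfApi().auth_check(repo_id="meta-llama/Llama-3.1-8B-Instruct")
except GatedRepoError:
    print("no read access; downstream downloads would 401")
```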
@@ -1,7 +1,7 @@
 name: concurrency_benchmark
 type: online # Test type: offline, online, eval, submission | options: offline, online, eval, submission
 model_params:
-  name: '<MODEL_NAME eg: meta-llama/Llama-3.1-8B-Instruct>'
+  name: '<MODEL_NAME eg: TinyLlama/TinyLlama-1.1B-Chat-v1.0>'
 datasets: # Dataset configs
   - name: perf
     type: performance # Dataset purpose: performance or accuracy | options: performance, accuracy
@@ -4,7 +4,7 @@ type: online # Test type: offline, online, eval, submission | options: offline,
 submission_ref: null
 benchmark_mode: null # options: offline, online
 model_params:
-  name: '<MODEL_NAME eg: meta-llama/Llama-3.1-8B-Instruct>'
+  name: '<MODEL_NAME eg: TinyLlama/TinyLlama-1.1B-Chat-v1.0>'
   temperature: null # Sampling temperature
   top_k: null # Top-K sampling
   top_p: null # Top-P (nucleus) sampling
@@ -4,7 +4,7 @@ version: "1.0"
 type: "eval"

 model_params:
-  name: "meta-llama/Llama-3.1-8B-Instruct"
+  name: "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
   temperature: 0.0
   max_new_tokens: 2048
@@ -1,7 +1,7 @@
 name: offline_benchmark
 type: offline # Test type: offline, online, eval, submission | options: offline, online, eval, submission
 model_params:
-  name: '<MODEL_NAME eg: meta-llama/Llama-3.1-8B-Instruct>'
+  name: '<MODEL_NAME eg: TinyLlama/TinyLlama-1.1B-Chat-v1.0>'
 datasets: # Dataset configs
   - name: perf
     type: performance # Dataset purpose: performance or accuracy | options: performance, accuracy
@@ -4,7 +4,7 @@ type: offline # Test type: offline, online, eval, submission | options: offline
 submission_ref: null
 benchmark_mode: null # options: offline, online
 model_params:
-  name: '<MODEL_NAME eg: meta-llama/Llama-3.1-8B-Instruct>'
+  name: '<MODEL_NAME eg: TinyLlama/TinyLlama-1.1B-Chat-v1.0>'
   temperature: null # Sampling temperature
   top_k: null # Top-K sampling
   top_p: null # Top-P (nucleus) sampling
@@ -1,7 +1,7 @@
 name: online_benchmark
 type: online # Test type: offline, online, eval, submission | options: offline, online, eval, submission
 model_params:
-  name: '<MODEL_NAME eg: meta-llama/Llama-3.1-8B-Instruct>'
+  name: '<MODEL_NAME eg: TinyLlama/TinyLlama-1.1B-Chat-v1.0>'
 datasets: # Dataset configs
   - name: perf
     type: performance # Dataset purpose: performance or accuracy | options: performance, accuracy
@@ -4,7 +4,7 @@ type: online # Test type: offline, online, eval, submission | options: offline,
 submission_ref: null
 benchmark_mode: null # options: offline, online
 model_params:
-  name: '<MODEL_NAME eg: meta-llama/Llama-3.1-8B-Instruct>'
+  name: '<MODEL_NAME eg: TinyLlama/TinyLlama-1.1B-Chat-v1.0>'
   temperature: null # Sampling temperature
   top_k: null # Top-K sampling
   top_p: null # Top-P (nucleus) sampling