From c202f299abc5e3e408760cccd7960fa1fa7b019e Mon Sep 17 00:00:00 2001
From: Damian Kalinowski
Date: Tue, 10 Feb 2026 11:59:21 +0100
Subject: [PATCH 1/2] save

---
 demos/embeddings/README.md    |  4 ++-
 demos/embeddings/ovms_mteb.py | 57 ++++++++++++++++++++++++++++++++---
 2 files changed, 56 insertions(+), 5 deletions(-)

diff --git a/demos/embeddings/README.md b/demos/embeddings/README.md
index b74042f754..7e062da99c 100644
--- a/demos/embeddings/README.md
+++ b/demos/embeddings/README.md
@@ -572,7 +572,9 @@ Results will be stored in `results` folder:
 
 Compare against local HuggingFace execution for reference:
 ```console
 mteb run -m thenlper/gte-small -t Banking77Classification --output_folder results
-```
+```
+
+> **Note**: To run a limited number of samples, add `--eval_splits test --max_samples <N>` to the command.
 
 # Usage of tokenize endpoint
 
diff --git a/demos/embeddings/ovms_mteb.py b/demos/embeddings/ovms_mteb.py
index ce1bb7af70..9ef2ef91de 100644
--- a/demos/embeddings/ovms_mteb.py
+++ b/demos/embeddings/ovms_mteb.py
@@ -17,11 +17,12 @@
 from __future__ import annotations
 
 import logging
-from functools import partial
+import random
 from typing import Any
 
 import numpy as np
 import mteb
+from datasets import DatasetDict
 
 logger = logging.getLogger(__name__)
 import argparse
@@ -32,9 +33,48 @@
                     dest='model_name')
 parser.add_argument('--dataset', default='Banking77Classification',
                     help='Dataset to benchmark. default: Banking77Classification',
                     dest='dataset')
+parser.add_argument('--eval_splits', nargs='*', default=None,
+                    help='Evaluation splits to use, e.g. --eval_splits test dev. If not set, all splits defined in the task are used.',
+                    dest='eval_splits')
+parser.add_argument('--hf_subsets', nargs='*', default=None,
+                    help='HuggingFace dataset subsets to evaluate on, e.g. --hf_subsets en fr. '
+                         'Useful for multilingual datasets to test only selected language subsets.',
+                    dest='hf_subsets')
+parser.add_argument('--max_samples', type=int, default=None,
+                    help='Maximum number of samples to use per split. '
+                         'When set, each evaluation split is truncated to at most this many samples, '
+                         'allowing quick smoke-test runs on large datasets.',
+                    dest='max_samples')
 args = vars(parser.parse_args())
 
+
+def truncate_task_datasets(task, max_samples: int, seed: int = 42) -> None:
+    """Truncate every split of every subset in a loaded task to at most *max_samples* rows.
+
+    Works on the task.dataset object in-place after task.load_data() has been called.
+    Handles both multilingual layout (subset -> DatasetDict) and flat layout (DatasetDict).
+    """
+    rng = random.Random(seed)
+
+    def _truncate_split(dataset, n):
+        if len(dataset) <= n:
+            return dataset
+        indices = list(range(len(dataset)))
+        rng.shuffle(indices)
+        return dataset.select(sorted(indices[:n]))
+
+    if isinstance(task.dataset, dict):
+        for key in task.dataset:
+            value = task.dataset[key]
+            if isinstance(value, DatasetDict):
+                # Multilingual: subset_name -> DatasetDict(split -> Dataset)
+                for split in value:
+                    value[split] = _truncate_split(value[split], max_samples)
+            else:
+                # Flat: split -> Dataset
+                task.dataset[key] = _truncate_split(value, max_samples)
+
 class OVMSModel:
     def __init__(self, model_name: str, base_url:str, embed_dim: int | None = None, **kwargs) -> None:
         from openai import OpenAI
@@ -45,7 +85,7 @@ def __init__(self, model_name: str, base_url:str, embed_dim: int | None = None,
 
     def encode(
         self, sentences: list[str], **kwargs: Any
-    ) -> torch.Tensor | np.ndarray:
+    ) -> np.ndarray:
         max_batch_size = 32
         sublists = [
             sentences[i : i + max_batch_size]
@@ -70,8 +110,17 @@ def _to_numpy(self, embedding_response) -> np.ndarray:
         return np.array([e.embedding for e in embedding_response.data])
 
 model = OVMSModel(args['model_name'], args['service_url'] ,1)
-tasks = mteb.get_task(args['dataset'])
-evaluation = mteb.MTEB(tasks=[tasks])
+task = mteb.get_task(args['dataset'],
+                     eval_splits=args['eval_splits'],
+                     hf_subsets=args['hf_subsets'])
+
+# If --max_samples is set, load the data early and truncate before evaluation
+if args['max_samples'] is not None:
+    task.load_data()
+    truncate_task_datasets(task, args['max_samples'])
+    logger.info("Truncated dataset splits to at most %d samples", args['max_samples'])
+
+evaluation = mteb.MTEB(tasks=[task])
 evaluation.run(model,verbosity=3,overwrite_results=True,output_folder='results')
 # For the full leaderboard test set, run:
 # benchmark = mteb.get_benchmark("MTEB(eng)")
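The `truncate_task_datasets` helper introduced in PATCH 1/2 relies only on `datasets.Dataset.select`. The same sampling idea can be exercised standalone on a toy flat `DatasetDict`; the sketch below uses hypothetical data, assumes only the `datasets` library, and is not part of either commit:

```python
import random

from datasets import Dataset, DatasetDict

# Toy flat layout, mirroring the "split -> Dataset" case handled by the patch
ds = DatasetDict({"test": Dataset.from_dict({"text": [f"sample {i}" for i in range(100)]})})

max_samples = 10
rng = random.Random(42)  # fixed seed keeps smoke-test runs reproducible
indices = list(range(len(ds["test"])))
rng.shuffle(indices)

# select() keeps only the given rows; sorting the sample preserves original row order
ds["test"] = ds["test"].select(sorted(indices[:max_samples]))
assert len(ds["test"]) == max_samples
```

PATCH 2/2 below removes this helper again after review, leaving only the `--eval_splits` flag.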
From c77482de74368c97882ad4bd7f1707d4b1528e17 Mon Sep 17 00:00:00 2001
From: Damian Kalinowski
Date: Tue, 10 Feb 2026 12:24:55 +0100
Subject: [PATCH 2/2] after review

---
 demos/embeddings/README.md    |  2 +-
 demos/embeddings/ovms_mteb.py | 55 ++++------------------------------
 2 files changed, 6 insertions(+), 51 deletions(-)

diff --git a/demos/embeddings/README.md b/demos/embeddings/README.md
index 7e062da99c..371b3c7fea 100644
--- a/demos/embeddings/README.md
+++ b/demos/embeddings/README.md
@@ -574,7 +574,7 @@ Compare against local HuggingFace execution for reference:
 mteb run -m thenlper/gte-small -t Banking77Classification --output_folder results
 ```
 
-> **Note**: To run a limited number of samples, add `--eval_splits test --max_samples <N>` to the command.
+> **Note**: To limit evaluation to selected dataset splits, add `--eval_splits test` to the command.
 
 # Usage of tokenize endpoint
 
diff --git a/demos/embeddings/ovms_mteb.py b/demos/embeddings/ovms_mteb.py
index 9ef2ef91de..07c9158a54 100644
--- a/demos/embeddings/ovms_mteb.py
+++ b/demos/embeddings/ovms_mteb.py
@@ -17,12 +17,11 @@
 from __future__ import annotations
 
 import logging
-import random
+from functools import partial
 from typing import Any
 
 import numpy as np
 import mteb
-from datasets import DatasetDict
 
 logger = logging.getLogger(__name__)
 import argparse
@@ -36,45 +35,9 @@
 parser.add_argument('--eval_splits', nargs='*', default=None,
                     help='Evaluation splits to use, e.g. --eval_splits test dev. If not set, all splits defined in the task are used.',
                     dest='eval_splits')
-parser.add_argument('--hf_subsets', nargs='*', default=None,
-                    help='HuggingFace dataset subsets to evaluate on, e.g. --hf_subsets en fr. '
-                         'Useful for multilingual datasets to test only selected language subsets.',
-                    dest='hf_subsets')
-parser.add_argument('--max_samples', type=int, default=None,
-                    help='Maximum number of samples to use per split. '
-                         'When set, each evaluation split is truncated to at most this many samples, '
-                         'allowing quick smoke-test runs on large datasets.',
-                    dest='max_samples')
 args = vars(parser.parse_args())
 
-
-def truncate_task_datasets(task, max_samples: int, seed: int = 42) -> None:
-    """Truncate every split of every subset in a loaded task to at most *max_samples* rows.
-
-    Works on the task.dataset object in-place after task.load_data() has been called.
-    Handles both multilingual layout (subset -> DatasetDict) and flat layout (DatasetDict).
-    """
-    rng = random.Random(seed)
-
-    def _truncate_split(dataset, n):
-        if len(dataset) <= n:
-            return dataset
-        indices = list(range(len(dataset)))
-        rng.shuffle(indices)
-        return dataset.select(sorted(indices[:n]))
-
-    if isinstance(task.dataset, dict):
-        for key in task.dataset:
-            value = task.dataset[key]
-            if isinstance(value, DatasetDict):
-                # Multilingual: subset_name -> DatasetDict(split -> Dataset)
-                for split in value:
-                    value[split] = _truncate_split(value[split], max_samples)
-            else:
-                # Flat: split -> Dataset
-                task.dataset[key] = _truncate_split(value, max_samples)
-
 class OVMSModel:
     def __init__(self, model_name: str, base_url:str, embed_dim: int | None = None, **kwargs) -> None:
         from openai import OpenAI
@@ -85,7 +48,7 @@ def __init__(self, model_name: str, base_url:str, embed_dim: int | None = None,
 
     def encode(
         self, sentences: list[str], **kwargs: Any
-    ) -> np.ndarray:
+    ) -> torch.Tensor | np.ndarray:
         max_batch_size = 32
         sublists = [
             sentences[i : i + max_batch_size]
@@ -110,17 +73,9 @@ def _to_numpy(self, embedding_response) -> np.ndarray:
         return np.array([e.embedding for e in embedding_response.data])
 
 model = OVMSModel(args['model_name'], args['service_url'] ,1)
-task = mteb.get_task(args['dataset'],
-                     eval_splits=args['eval_splits'],
-                     hf_subsets=args['hf_subsets'])
-
-# If --max_samples is set, load the data early and truncate before evaluation
-if args['max_samples'] is not None:
-    task.load_data()
-    truncate_task_datasets(task, args['max_samples'])
-    logger.info("Truncated dataset splits to at most %d samples", args['max_samples'])
-
-evaluation = mteb.MTEB(tasks=[task])
+tasks = mteb.get_task(args['dataset'],
+                      eval_splits=args['eval_splits'])
+evaluation = mteb.MTEB(tasks=[tasks])
 evaluation.run(model,verbosity=3,overwrite_results=True,output_folder='results')
 # For the full leaderboard test set, run:
 # benchmark = mteb.get_benchmark("MTEB(eng)")
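With both patches applied, a quick smoke-test invocation of the demo script could look like the sketch below. The `--service_url` flag name is an assumption inferred from `args['service_url']` in the script; the endpoint URL and served model name are likewise placeholders:

```console
python demos/embeddings/ovms_mteb.py \
    --service_url http://localhost:8000/v3 \
    --model_name thenlper/gte-small \
    --dataset Banking77Classification \
    --eval_splits test
```

As in the README's `mteb run` example, results are written to the `results` folder.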