From 6aacd05e898c07f275f0567e1ca1d61a7e9ce712 Mon Sep 17 00:00:00 2001 From: vedjaw Date: Thu, 18 Sep 2025 18:58:58 +0530 Subject: [PATCH 01/23] nanobier testbench --- inference/Makefile.txt | 18 ++ inference/evaluate_nanobeir.py | 323 +++++++++++++++++++++++++++++++++ inference/nanobeir_config.yaml | 13 ++ inference/requirements.txt | 6 + 4 files changed, 360 insertions(+) create mode 100644 inference/Makefile.txt create mode 100644 inference/evaluate_nanobeir.py create mode 100644 inference/nanobeir_config.yaml create mode 100644 inference/requirements.txt diff --git a/inference/Makefile.txt b/inference/Makefile.txt new file mode 100644 index 0000000..b884a73 --- /dev/null +++ b/inference/Makefile.txt @@ -0,0 +1,18 @@ +PY=python3 +CFG=nanobeir_config.yaml +OUT=./outputs/nanobeir_leaderboard.json + +.PHONY: setup eval show clean + +setup: + $(PY) -m pip install -r requirements.txt + +eval: + $(PY) evaluate_nanobeir.py --config $(CFG) + +show: + @echo "== Leaderboard ==" + @cat $(OUT) || true + +clean: + rm -rf outputs diff --git a/inference/evaluate_nanobeir.py b/inference/evaluate_nanobeir.py new file mode 100644 index 0000000..d32e1a5 --- /dev/null +++ b/inference/evaluate_nanobeir.py @@ -0,0 +1,323 @@ +#!/usr/bin/env python3 +""" +NanoBEIR evaluator with one-query debug peek. + +- Optional baseline via Sentence-Transformers +- llama.cpp --embedding HTTP server (/embedding) +- Prints a sanity check for one query so you can see relevant IDs vs retrieved IDs +""" + +import os, json, math, time, statistics, argparse, random +from typing import List, Dict, Any, Iterable, Union + +# ----------------- Tunables ----------------- +MAX_CHARS = 2000 +RETRY_BACKOFF = [0.0, 0.2, 0.6] +RETRY_SHRINK = [1.00, 0.60, 0.35] + +# ----------------- Small utilities ----------------- +def p95(latencies_ms: List[float]) -> float: + if not latencies_ms: + return 0.0 + return float(statistics.quantiles(latencies_ms, n=100)[94]) + +def time_calls(fn, payloads: List[Any], warmup: int = 3) -> float: + if not payloads: + return 0.0 + for _ in range(max(0, warmup)): + _ = fn([payloads[0]]) + lat = [] + for p in payloads: + t0 = time.time() + _ = fn([p]) + t1 = time.time() + lat.append((t1 - t0) * 1000.0) + return p95(lat) + +def median_rank(results, qrels) -> float: + ranks = [] + for qid, doc_scores in results.items(): + if qid not in qrels: + continue + relevant = {did for did, rel in qrels[qid].items() if rel > 0} + ranked = sorted(doc_scores.items(), key=lambda kv: kv[1], reverse=True) + for idx, (did, _) in enumerate(ranked, start=1): + if did in relevant: + ranks.append(idx) + break + if not ranks: + return 0.0 + ranks.sort() + m = len(ranks) + return float(ranks[m//2] if m % 2 else (ranks[m//2 - 1] + ranks[m//2]) / 2) + +# ----------------- Minimal BEIR imports ----------------- +def _imports(): + global util, GenericDataLoader, EvaluateRetrieval, DRES + from beir import util + from beir.datasets.data_loader import GenericDataLoader + from beir.retrieval.evaluation import EvaluateRetrieval + from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES + +# ----------------- Dataset URLs ----------------- +_BEIR_URLS = { + "scifact": "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/scifact.zip", + "fiqa": "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/fiqa.zip", + "scidocs": "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/scidocs.zip", +} + +def load_dataset(name: str, base_dir: str = "./datasets"): + _imports() + url_or_id = 
_BEIR_URLS.get(name, name) + data_path = util.download_and_unzip(url_or_id, base_dir) + corpus, queries, qrels = GenericDataLoader(data_folder=data_path).load(split="test") + return corpus, queries, qrels + +# ----------------- Model wrappers ----------------- +def _doc_text(doc: Dict[str, str]) -> str: + title = (doc.get("title") or "").strip() + text = (doc.get("text") or "").strip() + merged = (title + " " + text).strip() + return merged[:MAX_CHARS] if len(merged) > MAX_CHARS else merged + +class STModel: + def __init__(self, hf_id: str): + from sentence_transformers import SentenceTransformer + self.model = SentenceTransformer(hf_id) + + def encode_queries(self, queries, batch_size=32, **kwargs): + return self.model.encode( + queries, convert_to_numpy=True, batch_size=batch_size, show_progress_bar=False + ) + + def encode_corpus(self, corpus, batch_size=32, **kwargs): + docs_iter = corpus.values() if isinstance(corpus, dict) else corpus + texts = [((d.get("title") or "") + " " + (d.get("text") or "")).strip()[:MAX_CHARS] + for d in docs_iter] + return self.model.encode( + texts, convert_to_numpy=True, batch_size=batch_size, show_progress_bar=True + ) + +class LlamaServerEncoder: + """Client for llama.cpp --embedding server (/embedding). Ensures fixed-dim vectors.""" + def __init__(self, endpoint: str): + import requests + from requests.adapters import HTTPAdapter + from urllib3.util.retry import Retry + self.session = requests.Session() + retry = Retry(total=3, backoff_factor=0.2, status_forcelist=(502, 503, 504)) + self.session.mount("http://", HTTPAdapter(max_retries=retry)) + self.endpoint = endpoint.rstrip("/") + self._dim = None # lock the embedding size after first good response + + def _parse_embedding_json(self, js): + # try to dig out the first numeric list from many shapes + def first_vector(obj): + if isinstance(obj, dict): + # common fields + for k in ("embedding", "vector", "values", "data"): + if k in obj: + return first_vector(obj[k]) + # otherwise try the first value + if obj: + return first_vector(next(iter(obj.values()))) + return [] + if isinstance(obj, (list, tuple)): + if not obj: + return [] + if isinstance(obj[0], dict): + return first_vector(obj[0]) + if isinstance(obj[0], (list, tuple)): + return first_vector(obj[0]) + return obj + return [obj] + return first_vector(js) + + def _to_1d_numeric(self, obj): + def flatten(xs): + for x in xs: + if isinstance(x, (list, tuple)): + yield from flatten(x) + elif isinstance(x, dict): + for k in ("vector", "embedding", "values", "data"): + if k in x: + yield from flatten(x[k]); break + else: + for v in x.values(): + yield from flatten(v) + else: + yield x + vec = list(flatten(self._parse_embedding_json(obj))) + vec = [float(x) for x in vec] + return vec + + def _normalize_vec(self, vec): + vec = self._to_1d_numeric(vec) + if self._dim is None: + self._dim = len(vec) + print(f"[llama] locked embedding dim = {self._dim}", flush=True) + if len(vec) < self._dim: + vec = vec + [0.0] * (self._dim - len(vec)) + elif len(vec) > self._dim: + vec = vec[:self._dim] + return vec + + def _embed_one(self, text: str, timeout=60): + orig = text + for backoff, shrink in zip(RETRY_BACKOFF, RETRY_SHRINK): + if backoff: time.sleep(backoff) + t = orig[: int(MAX_CHARS * shrink)] + r = self.session.post(self.endpoint, json={"content": t}, timeout=timeout) + if r.status_code >= 500: + continue + r.raise_for_status() + return self._normalize_vec(r.json()) + # final tiny attempt + t = orig[: min(512, len(orig))] + r = 
self.session.post(self.endpoint, json={"content": t}, timeout=timeout) + r.raise_for_status() + return self._normalize_vec(r.json()) + + def encode_queries(self, queries, batch_size=1, **kwargs): + return [self._embed_one(q) for q in queries] + + def encode_corpus(self, corpus, batch_size=1, **kwargs): + docs_iter = corpus.values() if isinstance(corpus, dict) else corpus + texts = [_doc_text(d) for d in docs_iter] + return [self._embed_one(t) for t in texts] + +# ----------------- Evaluation + Debug Peek ----------------- +def debug_one_query(corpus, queries, qrels, results, k=5): + """Print one query's relevant IDs vs. top-k retrieved IDs.""" + # pick a query that has ground-truth and results + candidate_qids = [qid for qid in qrels.keys() if qid in results] + if not candidate_qids: + print("[debug] No overlapping queries between qrels and results.") + return + qid = random.choice(candidate_qids) + q_text = queries.get(qid, "(query text missing)") + relevant = {did for did, rel in qrels[qid].items() if rel > 0} + + ranked = sorted(results[qid].items(), key=lambda kv: kv[1], reverse=True)[:k] + retrieved_ids = [did for did, _ in ranked] + overlap = relevant.intersection(retrieved_ids) + + print("\n============= DEBUG PEEK =============") + print(f"Query ID: {qid}") + print(f"Query: {q_text}") + print(f"Relevant doc IDs ({len(relevant)}): {sorted(list(relevant))[:10]}{' ...' if len(relevant)>10 else ''}") + print("\nTop-{} retrieved:".format(k)) + for i, (did, score) in enumerate(ranked, 1): + title = (corpus.get(did, {}).get('title') if isinstance(corpus, dict) else None) or "" + print(f"{i:>2}. {did} score={score:.4f} title={title[:80]}") + print(f"\nOverlap@{k}: {sorted(list(overlap)) if overlap else 'None'}") + print("=====================================\n", flush=True) + +def evaluate_model(beir_model, corpus, queries, qrels, cost_per_1k: float): + _imports() + retriever_model = DRES(beir_model, batch_size=32) + retriever = EvaluateRetrieval(retriever_model, score_function="cos_sim") + + # Run retrieval + results = retriever.retrieve(corpus, queries) + + # --- Manual Recall@K (hit rate) --- + def recall_at_k(k: int) -> float: + hits = 0 + total = 0 + for qid, rels in qrels.items(): + if qid not in results: + continue + total += 1 + relevant = {did for did, rel in rels.items() if rel > 0} + ranked = sorted(results[qid].items(), key=lambda kv: kv[1], reverse=True)[:k] + retrieved_ids = {did for did, _ in ranked} + if relevant & retrieved_ids: + hits += 1 + return (hits / total) if total > 0 else 0.0 + + R1 = recall_at_k(1) + R5 = recall_at_k(5) + R10 = recall_at_k(10) + + print(f"[debug] R@1={R1:.3f} R@5={R5:.3f} R@10={R10:.3f}", flush=True) + + # --- NDCG@10 from BEIR (robust pick of the key) --- + ndcg, _map, recall_dict, precision = retriever.evaluate(qrels, results, retriever.k_values) + def _pick_ndcg10(d): + for key in (10, "10", "ndcg@10", "NDCG@10"): + if key in d: + return float(d[key]) + # sometimes BEIR nests like {'NDCG': {10: val, ...}} + if isinstance(d, dict): + for v in d.values(): + try: + return _pick_ndcg10(v) + except Exception: + pass + return 0.0 + N10 = _pick_ndcg10(ndcg) + + # --- Median rank of first relevant (as before) --- + medr = float(median_rank(results, qrels)) + + # --- Latency p95 on first ~50 queries --- + qids = list(queries.keys()) + probe = [queries[q] for q in qids[:min(50, len(qids))]] + lat = time_calls(lambda batch: beir_model.encode_queries(batch, batch_size=1), probe, warmup=3) + + return { + "R@1": float(R1), + "R@5": float(R5), + "R@10": 
float(R10), + "NDCG@10": float(N10), + "MedR": float(medr), + "Latency_ms_p95": float(lat), + "Cost_per_1k_queries": float(cost_per_1k), + } + + +# ----------------- Main ----------------- +def main(): + import yaml + ap = argparse.ArgumentParser() + ap.add_argument("--config", required=True) + args = ap.parse_args() + + with open(args.config, "r") as f: + cfg = yaml.safe_load(f) + + out_dir = cfg.get("output_dir", "./outputs") + os.makedirs(out_dir, exist_ok=True) + leaderboard_path = cfg.get("leaderboard_json", f"{out_dir}/nanobeir_leaderboard.json") + + datasets = cfg.get("datasets", ["scifact"]) + cost = cfg.get("cost_per_1k", {}) + models_cfg = cfg.get("models", {}) + + all_out: Dict[str, Dict[str, Any]] = {} + + for ds in datasets: + corpus, queries, qrels = load_dataset(ds) + all_out.setdefault(ds, {}) + + if "baseline" in models_cfg: + m_id = models_cfg["baseline"]["hf_id"] + print(f"[baseline] {m_id}") + out = evaluate_model(STModel(m_id), corpus, queries, qrels, cost.get("baseline", 0.0)) + all_out[ds]["baseline"] = out + + if "qwen_edge_llamaserver" in models_cfg: + ep = models_cfg["qwen_edge_llamaserver"]["endpoint"] + print(f"[qwen_edge_llamaserver] {ep}") + out = evaluate_model(LlamaServerEncoder(ep), corpus, queries, qrels, + cost.get("qwen_edge_llamaserver", 0.0)) + all_out[ds]["qwen_edge_llamaserver"] = out + + with open(leaderboard_path, "w") as f: + json.dump(all_out, f, indent=2) + print(f"[saved] {leaderboard_path}") + print(json.dumps(all_out, indent=2)) + +if __name__ == "__main__": + main() diff --git a/inference/nanobeir_config.yaml b/inference/nanobeir_config.yaml new file mode 100644 index 0000000..0929cc9 --- /dev/null +++ b/inference/nanobeir_config.yaml @@ -0,0 +1,13 @@ +output_dir: ./outputs +leaderboard_json: ./outputs/nanobeir_leaderboard.json +datasets: ["scifact"] + +models: + baseline: + hf_id: sentence-transformers/all-MiniLM-L6-v2 + # qwen_edge_llamaserver: + # endpoint: http://127.0.0.1:8080/embedding + +cost_per_1k: + baseline: 0.10 # <-- whatever estimate you want (e.g., $0.10 / 1k queries) + qwen_edge_llamaserver: 0.02 # <-- set this too if you run the llama server model diff --git a/inference/requirements.txt b/inference/requirements.txt new file mode 100644 index 0000000..716bd8e --- /dev/null +++ b/inference/requirements.txt @@ -0,0 +1,6 @@ +beir==2.0.0 +git+https://github.com/UKPLab/beir.git@main +sentence-transformers==2.7.0 +pyyaml +requests +faiss-cpu From b9531bd9c73d774c91250fafe287684fb405340e Mon Sep 17 00:00:00 2001 From: vedjaw Date: Sat, 20 Sep 2025 06:15:20 +0530 Subject: [PATCH 02/23] Change the embedding model to Qwen/qwen3-embedding-0.6b --- inference/evaluate_nanobeir.py | 197 +++++++++++++++++---------------- inference/nanobeir_config.yaml | 17 +-- 2 files changed, 112 insertions(+), 102 deletions(-) diff --git a/inference/evaluate_nanobeir.py b/inference/evaluate_nanobeir.py index d32e1a5..4e78b1b 100644 --- a/inference/evaluate_nanobeir.py +++ b/inference/evaluate_nanobeir.py @@ -14,7 +14,7 @@ MAX_CHARS = 2000 RETRY_BACKOFF = [0.0, 0.2, 0.6] RETRY_SHRINK = [1.00, 0.60, 0.35] - +import torch # ----------------- Small utilities ----------------- def p95(latencies_ms: List[float]) -> float: if not latencies_ms: @@ -98,93 +98,96 @@ def encode_corpus(self, corpus, batch_size=32, **kwargs): texts, convert_to_numpy=True, batch_size=batch_size, show_progress_bar=True ) -class LlamaServerEncoder: - """Client for llama.cpp --embedding server (/embedding). 
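Retries with backoff and progressively shorter inputs on 5xx errors.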
Ensures fixed-dim vectors.""" - def __init__(self, endpoint: str): - import requests - from requests.adapters import HTTPAdapter - from urllib3.util.retry import Retry - self.session = requests.Session() - retry = Retry(total=3, backoff_factor=0.2, status_forcelist=(502, 503, 504)) - self.session.mount("http://", HTTPAdapter(max_retries=retry)) - self.endpoint = endpoint.rstrip("/") - self._dim = None # lock the embedding size after first good response - - def _parse_embedding_json(self, js): - # try to dig out the first numeric list from many shapes - def first_vector(obj): - if isinstance(obj, dict): - # common fields - for k in ("embedding", "vector", "values", "data"): - if k in obj: - return first_vector(obj[k]) - # otherwise try the first value - if obj: - return first_vector(next(iter(obj.values()))) - return [] - if isinstance(obj, (list, tuple)): - if not obj: - return [] - if isinstance(obj[0], dict): - return first_vector(obj[0]) - if isinstance(obj[0], (list, tuple)): - return first_vector(obj[0]) - return obj - return [obj] - return first_vector(js) - - def _to_1d_numeric(self, obj): - def flatten(xs): - for x in xs: - if isinstance(x, (list, tuple)): - yield from flatten(x) - elif isinstance(x, dict): - for k in ("vector", "embedding", "values", "data"): - if k in x: - yield from flatten(x[k]); break - else: - for v in x.values(): - yield from flatten(v) - else: - yield x - vec = list(flatten(self._parse_embedding_json(obj))) - vec = [float(x) for x in vec] - return vec - - def _normalize_vec(self, vec): - vec = self._to_1d_numeric(vec) - if self._dim is None: - self._dim = len(vec) - print(f"[llama] locked embedding dim = {self._dim}", flush=True) - if len(vec) < self._dim: - vec = vec + [0.0] * (self._dim - len(vec)) - elif len(vec) > self._dim: - vec = vec[:self._dim] - return vec - - def _embed_one(self, text: str, timeout=60): - orig = text - for backoff, shrink in zip(RETRY_BACKOFF, RETRY_SHRINK): - if backoff: time.sleep(backoff) - t = orig[: int(MAX_CHARS * shrink)] - r = self.session.post(self.endpoint, json={"content": t}, timeout=timeout) - if r.status_code >= 500: - continue - r.raise_for_status() - return self._normalize_vec(r.json()) - # final tiny attempt - t = orig[: min(512, len(orig))] - r = self.session.post(self.endpoint, json={"content": t}, timeout=timeout) - r.raise_for_status() - return self._normalize_vec(r.json()) - - def encode_queries(self, queries, batch_size=1, **kwargs): - return [self._embed_one(q) for q in queries] - - def encode_corpus(self, corpus, batch_size=1, **kwargs): - docs_iter = corpus.values() if isinstance(corpus, dict) else corpus - texts = [_doc_text(d) for d in docs_iter] - return [self._embed_one(t) for t in texts] + +class QwenHFEncoder: + """ + HF encoder for Qwen/qwen3-embedding-0.6b (or compatible). 
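+    - Runs on CUDA when available, otherwise on CPU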
+ - Handles dict or list corpora (BEIR can pass either) + - Mean-pools token embeddings + - Optional L2 normalization + """ + def __init__(self, hf_id: str, pooling: str = "mean", normalize: bool = True, max_len: int = 512): + import torch + from transformers import AutoModel, AutoTokenizer + + self.torch = torch + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + self.pooling = pooling + self.normalize = normalize + self.max_len = max_len + + # Use slow tokenizer + trust_remote_code to avoid the fast-tokenizer JSON error + self.tokenizer = AutoTokenizer.from_pretrained( + hf_id, use_fast=False, trust_remote_code=True + ) + self.model = AutoModel.from_pretrained( + hf_id, trust_remote_code=True + ).to(self.device) + self.model.eval() + + def _encode_batch(self, texts, batch_size=32): + embs = [] + for i in range(0, len(texts), batch_size): + batch = texts[i:i + batch_size] + toks = self.tokenizer( + batch, + padding=True, + truncation=True, + max_length=self.max_len, + return_tensors="pt", + ).to(self.device) + + with self.torch.no_grad(): + out = self.model(**toks) + + # Prefer last_hidden_state, fall back to pooler_output if present + if hasattr(out, "last_hidden_state") and out.last_hidden_state is not None: + x = out.last_hidden_state # [B, T, H] + # attention mask-based mean pooling + mask = toks.attention_mask.unsqueeze(-1).type_as(x) # [B, T, 1] + x = (x * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9) # [B, H] + elif hasattr(out, "pooler_output") and out.pooler_output is not None: + x = out.pooler_output # [B, H] + else: + # very defensive fallback + x = out[0] if isinstance(out, (list, tuple)) else out + + if self.normalize: + x = self.torch.nn.functional.normalize(x, p=2, dim=-1) + + embs.extend(x.detach().cpu().tolist()) + return embs + + def encode_queries(self, queries, batch_size=32, **kwargs): + # queries is a list[str] + return self._encode_batch(list(queries), batch_size=batch_size) + + def encode_corpus(self, corpus, batch_size=32, **kwargs): + """ + corpus can be: + - dict: {doc_id: {"title":..., "text":...}, ...} + - list: [{"title":..., "text":...}, ...] + Return must be a list of embeddings aligned to the order BEIR provides. 
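+        Title and text are merged, then truncated to 2000 characters.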
+        """
+        if isinstance(corpus, dict):
+            docs_iter = corpus.values()
+        elif isinstance(corpus, list):
+            docs_iter = corpus
+        else:
+            raise TypeError(f"Unsupported corpus type: {type(corpus)}")
+
+        def _merge(doc):
+            title = (doc.get("title") or "").strip()
+            text = (doc.get("text") or "").strip()
+            s = (title + " " + text).strip()
+            return s[:2000] if len(s) > 2000 else s
+
+        texts = [_merge(d) for d in docs_iter]
+        return self._encode_batch(texts, batch_size=batch_size)
+
+
+
+
 # ----------------- Evaluation + Debug Peek -----------------
 def debug_one_query(corpus, queries, qrels, results, k=5):
@@ -307,12 +310,18 @@ def main():
         out = evaluate_model(STModel(m_id), corpus, queries, qrels, cost.get("baseline", 0.0))
         all_out[ds]["baseline"] = out
 
-        if "qwen_edge_llamaserver" in models_cfg:
-            ep = models_cfg["qwen_edge_llamaserver"]["endpoint"]
-            print(f"[qwen_edge_llamaserver] {ep}")
-            out = evaluate_model(LlamaServerEncoder(ep), corpus, queries, qrels,
-                                 cost.get("qwen_edge_llamaserver", 0.0))
-            all_out[ds]["qwen_edge_llamaserver"] = out
+        if "qwen_embedding" in models_cfg:
+            m = models_cfg["qwen_embedding"]
+            hf_id = m.get("hf_id", "Qwen/qwen3-embedding-0.6b")
+            pooling = m.get("pooling", "mean")
+            normalize = m.get("normalize", True)
+            max_len = m.get("max_len", 512)
+            print(f"[qwen_embedding] {hf_id} | pooling={pooling} normalize={normalize} max_len={max_len}")
+            encoder = QwenHFEncoder(hf_id, pooling=pooling, normalize=normalize, max_len=max_len)
+            out = evaluate_model(encoder, corpus, queries, qrels, cost.get("qwen_embedding", 0.0))
+            all_out[ds]["qwen_embedding"] = out
+
+
 
     with open(leaderboard_path, "w") as f:
         json.dump(all_out, f, indent=2)
diff --git a/inference/nanobeir_config.yaml b/inference/nanobeir_config.yaml
index 0929cc9..b4eaf51 100644
--- a/inference/nanobeir_config.yaml
+++ b/inference/nanobeir_config.yaml
@@ -1,13 +1,14 @@
 output_dir: ./outputs
 leaderboard_json: ./outputs/nanobeir_leaderboard.json
-datasets: ["scifact"]
 
-models:
-  baseline:
-    hf_id: sentence-transformers/all-MiniLM-L6-v2
-  # qwen_edge_llamaserver:
-  #   endpoint: http://127.0.0.1:8080/embedding
+datasets: ["scifact"]
 
 cost_per_1k:
-  baseline: 0.10   # <-- whatever estimate you want (e.g., $0.10 / 1k queries)
-  qwen_edge_llamaserver: 0.02  # <-- set this too if you run the llama server model
+  qwen_embedding: 0.10 # <- set this to whatever your team uses
+
+models:
+  qwen_embedding:
+    hf_id: Qwen/qwen3-embedding-0.6b
+    pooling: mean # mean | cls | last_token (we’ll implement mean)
+    normalize: true # L2-normalize embeddings
+    max_len: 512 # truncate to fit model context

From 2a48557481cc724bd715ceb6e3f20c0c5d9e8269 Mon Sep 17 00:00:00 2001
From: vedjaw
Date: Mon, 22 Sep 2025 00:14:39 +0530
Subject: [PATCH 03/23] Update nanobeir_config.yaml

made the required changes (qwen/qwen3-embedding-0.6b)
---
 inference/nanobeir_config.yaml | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/inference/nanobeir_config.yaml b/inference/nanobeir_config.yaml
index 0929cc9..b4eaf51 100644
--- a/inference/nanobeir_config.yaml
+++ b/inference/nanobeir_config.yaml
@@ -1,13 +1,14 @@
 output_dir: ./outputs
 leaderboard_json: ./outputs/nanobeir_leaderboard.json
-datasets: ["scifact"]
 
-models:
-  baseline:
-    hf_id: sentence-transformers/all-MiniLM-L6-v2
-  # qwen_edge_llamaserver:
-  #   endpoint: http://127.0.0.1:8080/embedding
+datasets: ["scifact"]
 
 cost_per_1k:
-  baseline: 0.10   # <-- whatever estimate you want (e.g., $0.10 / 1k queries)
-  qwen_edge_llamaserver: 0.02  # <-- set this too if you run the llama
server model + qwen_embedding: 0.10 # <- set this to whatever your team uses + +models: + qwen_embedding: + hf_id: Qwen/qwen3-embedding-0.6b + pooling: mean # mean | cls | last_token (we’ll implement mean) + normalize: true # L2-normalize embeddings + max_len: 512 # truncate to fit model context From 1ca83396e50e260c2675f8f25f908a72463c7abe Mon Sep 17 00:00:00 2001 From: vedjaw Date: Mon, 22 Sep 2025 00:15:21 +0530 Subject: [PATCH 04/23] Update evaluate_nanobeir.py did the required changes --- inference/evaluate_nanobeir.py | 197 +++++++++++++++++---------------- 1 file changed, 103 insertions(+), 94 deletions(-) diff --git a/inference/evaluate_nanobeir.py b/inference/evaluate_nanobeir.py index d32e1a5..4e78b1b 100644 --- a/inference/evaluate_nanobeir.py +++ b/inference/evaluate_nanobeir.py @@ -14,7 +14,7 @@ MAX_CHARS = 2000 RETRY_BACKOFF = [0.0, 0.2, 0.6] RETRY_SHRINK = [1.00, 0.60, 0.35] - +import torch # ----------------- Small utilities ----------------- def p95(latencies_ms: List[float]) -> float: if not latencies_ms: @@ -98,93 +98,96 @@ def encode_corpus(self, corpus, batch_size=32, **kwargs): texts, convert_to_numpy=True, batch_size=batch_size, show_progress_bar=True ) -class LlamaServerEncoder: - """Client for llama.cpp --embedding server (/embedding). Ensures fixed-dim vectors.""" - def __init__(self, endpoint: str): - import requests - from requests.adapters import HTTPAdapter - from urllib3.util.retry import Retry - self.session = requests.Session() - retry = Retry(total=3, backoff_factor=0.2, status_forcelist=(502, 503, 504)) - self.session.mount("http://", HTTPAdapter(max_retries=retry)) - self.endpoint = endpoint.rstrip("/") - self._dim = None # lock the embedding size after first good response - - def _parse_embedding_json(self, js): - # try to dig out the first numeric list from many shapes - def first_vector(obj): - if isinstance(obj, dict): - # common fields - for k in ("embedding", "vector", "values", "data"): - if k in obj: - return first_vector(obj[k]) - # otherwise try the first value - if obj: - return first_vector(next(iter(obj.values()))) - return [] - if isinstance(obj, (list, tuple)): - if not obj: - return [] - if isinstance(obj[0], dict): - return first_vector(obj[0]) - if isinstance(obj[0], (list, tuple)): - return first_vector(obj[0]) - return obj - return [obj] - return first_vector(js) - - def _to_1d_numeric(self, obj): - def flatten(xs): - for x in xs: - if isinstance(x, (list, tuple)): - yield from flatten(x) - elif isinstance(x, dict): - for k in ("vector", "embedding", "values", "data"): - if k in x: - yield from flatten(x[k]); break - else: - for v in x.values(): - yield from flatten(v) - else: - yield x - vec = list(flatten(self._parse_embedding_json(obj))) - vec = [float(x) for x in vec] - return vec - - def _normalize_vec(self, vec): - vec = self._to_1d_numeric(vec) - if self._dim is None: - self._dim = len(vec) - print(f"[llama] locked embedding dim = {self._dim}", flush=True) - if len(vec) < self._dim: - vec = vec + [0.0] * (self._dim - len(vec)) - elif len(vec) > self._dim: - vec = vec[:self._dim] - return vec - - def _embed_one(self, text: str, timeout=60): - orig = text - for backoff, shrink in zip(RETRY_BACKOFF, RETRY_SHRINK): - if backoff: time.sleep(backoff) - t = orig[: int(MAX_CHARS * shrink)] - r = self.session.post(self.endpoint, json={"content": t}, timeout=timeout) - if r.status_code >= 500: - continue - r.raise_for_status() - return self._normalize_vec(r.json()) - # final tiny attempt - t = orig[: min(512, len(orig))] - r = 
self.session.post(self.endpoint, json={"content": t}, timeout=timeout) - r.raise_for_status() - return self._normalize_vec(r.json()) - - def encode_queries(self, queries, batch_size=1, **kwargs): - return [self._embed_one(q) for q in queries] - - def encode_corpus(self, corpus, batch_size=1, **kwargs): - docs_iter = corpus.values() if isinstance(corpus, dict) else corpus - texts = [_doc_text(d) for d in docs_iter] - return [self._embed_one(t) for t in texts] + +class QwenHFEncoder: + """ + HF encoder for Qwen/qwen3-embedding-0.6b (or compatible). + - Handles dict or list corpora (BEIR can pass either) + - Mean-pools token embeddings + - Optional L2 normalization + """ + def __init__(self, hf_id: str, pooling: str = "mean", normalize: bool = True, max_len: int = 512): + import torch + from transformers import AutoModel, AutoTokenizer + + self.torch = torch + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + self.pooling = pooling + self.normalize = normalize + self.max_len = max_len + + # Use slow tokenizer + trust_remote_code to avoid the fast-tokenizer JSON error + self.tokenizer = AutoTokenizer.from_pretrained( + hf_id, use_fast=False, trust_remote_code=True + ) + self.model = AutoModel.from_pretrained( + hf_id, trust_remote_code=True + ).to(self.device) + self.model.eval() + + def _encode_batch(self, texts, batch_size=32): + embs = [] + for i in range(0, len(texts), batch_size): + batch = texts[i:i + batch_size] + toks = self.tokenizer( + batch, + padding=True, + truncation=True, + max_length=self.max_len, + return_tensors="pt", + ).to(self.device) + + with self.torch.no_grad(): + out = self.model(**toks) + + # Prefer last_hidden_state, fall back to pooler_output if present + if hasattr(out, "last_hidden_state") and out.last_hidden_state is not None: + x = out.last_hidden_state # [B, T, H] + # attention mask-based mean pooling + mask = toks.attention_mask.unsqueeze(-1).type_as(x) # [B, T, 1] + x = (x * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9) # [B, H] + elif hasattr(out, "pooler_output") and out.pooler_output is not None: + x = out.pooler_output # [B, H] + else: + # very defensive fallback + x = out[0] if isinstance(out, (list, tuple)) else out + + if self.normalize: + x = self.torch.nn.functional.normalize(x, p=2, dim=-1) + + embs.extend(x.detach().cpu().tolist()) + return embs + + def encode_queries(self, queries, batch_size=32, **kwargs): + # queries is a list[str] + return self._encode_batch(list(queries), batch_size=batch_size) + + def encode_corpus(self, corpus, batch_size=32, **kwargs): + """ + corpus can be: + - dict: {doc_id: {"title":..., "text":...}, ...} + - list: [{"title":..., "text":...}, ...] + Return must be a list of embeddings aligned to the order BEIR provides. 
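+        Dict corpora are iterated via corpus.values(), in insertion order.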
+ """ + if isinstance(corpus, dict): + docs_iter = corpus.values() + elif isinstance(corpus, list): + docs_iter = corpus + else: + raise TypeError(f"Unsupported corpus type: {type(corpus)}") + + def _merge(doc): + title = (doc.get("title") or "").strip() + text = (doc.get("text") or "").strip() + s = (title + " " + text).strip() + return s[:2000] if len(s) > 2000 else s + + texts = [_merge(d) for d in docs_iter] + return self._encode_batch(texts, batch_size=batch_size) + + + + # ----------------- Evaluation + Debug Peek ----------------- def debug_one_query(corpus, queries, qrels, results, k=5): @@ -307,12 +310,18 @@ def main(): out = evaluate_model(STModel(m_id), corpus, queries, qrels, cost.get("baseline", 0.0)) all_out[ds]["baseline"] = out - if "qwen_edge_llamaserver" in models_cfg: - ep = models_cfg["qwen_edge_llamaserver"]["endpoint"] - print(f"[qwen_edge_llamaserver] {ep}") - out = evaluate_model(LlamaServerEncoder(ep), corpus, queries, qrels, - cost.get("qwen_edge_llamaserver", 0.0)) - all_out[ds]["qwen_edge_llamaserver"] = out + if "qwen_embedding" in models_cfg: + m = models_cfg["qwen_embedding"] + hf_id = m.get("hf_id", "Qwen/qwen3-embedding-0.6b") + pooling = m.get("pooling", "mean") + normalize = m.get("normalize", True) + max_len = m.get("max_len", 512) + print(f"[qwen_embedding] {hf_id} | pooling={pooling} normalize={normalize} max_len={max_len}") + encoder = QwenHFEncoder(hf_id, pooling=pooling, normalize=normalize, max_len=max_len) + out = evaluate_model(encoder, corpus, queries, qrels, cost.get("qwen_embedding", 0.0)) + all_out[ds]["qwen_embedding"] = out + + with open(leaderboard_path, "w") as f: json.dump(all_out, f, indent=2) From 2f49e1197788792daf610acc5d00999c34e28ab8 Mon Sep 17 00:00:00 2001 From: vedjaw Date: Mon, 22 Sep 2025 00:32:25 +0530 Subject: [PATCH 05/23] Update nanobeir_config.yaml fixed lint error --- inference/nanobeir_config.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/inference/nanobeir_config.yaml b/inference/nanobeir_config.yaml index b4eaf51..bc43be7 100644 --- a/inference/nanobeir_config.yaml +++ b/inference/nanobeir_config.yaml @@ -4,11 +4,11 @@ leaderboard_json: ./outputs/nanobeir_leaderboard.json datasets: ["scifact"] cost_per_1k: - qwen_embedding: 0.10 # <- set this to whatever your team uses + qwen_embedding: 0.10 # <- set this to whatever your team uses models: qwen_embedding: hf_id: Qwen/qwen3-embedding-0.6b - pooling: mean # mean | cls | last_token (we’ll implement mean) - normalize: true # L2-normalize embeddings - max_len: 512 # truncate to fit model context + pooling: mean # mean | cls | last_token (we’ll implement mean) + normalize: true # L2-normalize embeddings + max_len: 512 # truncate to fit model context From fa12019fe0153756c955d29a66dff75490bbc786 Mon Sep 17 00:00:00 2001 From: vedjaw Date: Mon, 22 Sep 2025 00:33:28 +0530 Subject: [PATCH 06/23] Update evaluate_nanobeir.py fixing lint --- inference/evaluate_nanobeir.py | 147 ++++++++++++++++++++------------- 1 file changed, 89 insertions(+), 58 deletions(-) diff --git a/inference/evaluate_nanobeir.py b/inference/evaluate_nanobeir.py index 4e78b1b..450fb06 100644 --- a/inference/evaluate_nanobeir.py +++ b/inference/evaluate_nanobeir.py @@ -7,20 +7,38 @@ - Prints a sanity check for one query so you can see relevant IDs vs retrieved IDs """ -import os, json, math, time, statistics, argparse, random -from typing import List, Dict, Any, Iterable, Union +# --- Python Standard Library Imports --- +import argparse +import json +import os +import random 
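+# (random drives the one-query debug peek; statistics backs the p95 helper)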
+import statistics +import time +from typing import Any, Dict, List + +# --- Third-Party Imports --- +import torch +import yaml +from beir import util +from beir.datasets.data_loader import GenericDataLoader +from beir.retrieval.evaluation import EvaluateRetrieval +from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES +from sentence_transformers import SentenceTransformer +from transformers import AutoModel, AutoTokenizer # ----------------- Tunables ----------------- MAX_CHARS = 2000 RETRY_BACKOFF = [0.0, 0.2, 0.6] -RETRY_SHRINK = [1.00, 0.60, 0.35] -import torch +RETRY_SHRINK = [1.00, 0.60, 0.35] + + # ----------------- Small utilities ----------------- def p95(latencies_ms: List[float]) -> float: if not latencies_ms: return 0.0 return float(statistics.quantiles(latencies_ms, n=100)[94]) + def time_calls(fn, payloads: List[Any], warmup: int = 3) -> float: if not payloads: return 0.0 @@ -34,6 +52,7 @@ def time_calls(fn, payloads: List[Any], warmup: int = 3) -> float: lat.append((t1 - t0) * 1000.0) return p95(lat) + def median_rank(results, qrels) -> float: ranks = [] for qid, doc_scores in results.items(): @@ -49,40 +68,34 @@ def median_rank(results, qrels) -> float: return 0.0 ranks.sort() m = len(ranks) - return float(ranks[m//2] if m % 2 else (ranks[m//2 - 1] + ranks[m//2]) / 2) + return float(ranks[m // 2] if m % 2 else (ranks[m // 2 - 1] + ranks[m // 2]) / 2) -# ----------------- Minimal BEIR imports ----------------- -def _imports(): - global util, GenericDataLoader, EvaluateRetrieval, DRES - from beir import util - from beir.datasets.data_loader import GenericDataLoader - from beir.retrieval.evaluation import EvaluateRetrieval - from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES # ----------------- Dataset URLs ----------------- _BEIR_URLS = { "scifact": "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/scifact.zip", - "fiqa": "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/fiqa.zip", + "fiqa": "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/fiqa.zip", "scidocs": "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/scidocs.zip", } + def load_dataset(name: str, base_dir: str = "./datasets"): - _imports() url_or_id = _BEIR_URLS.get(name, name) data_path = util.download_and_unzip(url_or_id, base_dir) corpus, queries, qrels = GenericDataLoader(data_folder=data_path).load(split="test") return corpus, queries, qrels + # ----------------- Model wrappers ----------------- def _doc_text(doc: Dict[str, str]) -> str: title = (doc.get("title") or "").strip() - text = (doc.get("text") or "").strip() + text = (doc.get("text") or "").strip() merged = (title + " " + text).strip() return merged[:MAX_CHARS] if len(merged) > MAX_CHARS else merged + class STModel: def __init__(self, hf_id: str): - from sentence_transformers import SentenceTransformer self.model = SentenceTransformer(hf_id) def encode_queries(self, queries, batch_size=32, **kwargs): @@ -92,8 +105,10 @@ def encode_queries(self, queries, batch_size=32, **kwargs): def encode_corpus(self, corpus, batch_size=32, **kwargs): docs_iter = corpus.values() if isinstance(corpus, dict) else corpus - texts = [((d.get("title") or "") + " " + (d.get("text") or "")).strip()[:MAX_CHARS] - for d in docs_iter] + texts = [ + ((d.get("title") or "") + " " + (d.get("text") or "")).strip()[:MAX_CHARS] + for d in docs_iter + ] return self.model.encode( texts, convert_to_numpy=True, batch_size=batch_size, show_progress_bar=True ) @@ -106,10 +121,8 
@@ class QwenHFEncoder: - Mean-pools token embeddings - Optional L2 normalization """ - def __init__(self, hf_id: str, pooling: str = "mean", normalize: bool = True, max_len: int = 512): - import torch - from transformers import AutoModel, AutoTokenizer + def __init__(self, hf_id: str, pooling: str = "mean", normalize: bool = True, max_len: int = 512): self.torch = torch self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") self.pooling = pooling @@ -120,15 +133,15 @@ def __init__(self, hf_id: str, pooling: str = "mean", normalize: bool = True, ma self.tokenizer = AutoTokenizer.from_pretrained( hf_id, use_fast=False, trust_remote_code=True ) - self.model = AutoModel.from_pretrained( - hf_id, trust_remote_code=True - ).to(self.device) + self.model = AutoModel.from_pretrained(hf_id, trust_remote_code=True).to( + self.device + ) self.model.eval() def _encode_batch(self, texts, batch_size=32): embs = [] for i in range(0, len(texts), batch_size): - batch = texts[i:i + batch_size] + batch = texts[i : i + batch_size] toks = self.tokenizer( batch, padding=True, @@ -178,7 +191,7 @@ def encode_corpus(self, corpus, batch_size=32, **kwargs): def _merge(doc): title = (doc.get("title") or "").strip() - text = (doc.get("text") or "").strip() + text = (doc.get("text") or "").strip() s = (title + " " + text).strip() return s[:2000] if len(s) > 2000 else s @@ -186,9 +199,6 @@ def _merge(doc): return self._encode_batch(texts, batch_size=batch_size) - - - # ----------------- Evaluation + Debug Peek ----------------- def debug_one_query(corpus, queries, qrels, results, k=5): """Print one query's relevant IDs vs. top-k retrieved IDs.""" @@ -208,16 +218,22 @@ def debug_one_query(corpus, queries, qrels, results, k=5): print("\n============= DEBUG PEEK =============") print(f"Query ID: {qid}") print(f"Query: {q_text}") - print(f"Relevant doc IDs ({len(relevant)}): {sorted(list(relevant))[:10]}{' ...' if len(relevant)>10 else ''}") - print("\nTop-{} retrieved:".format(k)) + print( + f"Relevant doc IDs ({len(relevant)}): {sorted(list(relevant))[:10]}" + f"{' ...' if len(relevant)>10 else ''}" + ) + print(f"\nTop-{k} retrieved:") for i, (did, score) in enumerate(ranked, 1): - title = (corpus.get(did, {}).get('title') if isinstance(corpus, dict) else None) or "" + title = ( + (corpus.get(did, {}).get("title") if isinstance(corpus, dict) else None) + or "" + ) print(f"{i:>2}. 
{did} score={score:.4f} title={title[:80]}") print(f"\nOverlap@{k}: {sorted(list(overlap)) if overlap else 'None'}") print("=====================================\n", flush=True) + def evaluate_model(beir_model, corpus, queries, qrels, cost_per_1k: float): - _imports() retriever_model = DRES(beir_model, batch_size=32) retriever = EvaluateRetrieval(retriever_model, score_function="cos_sim") @@ -233,20 +249,23 @@ def recall_at_k(k: int) -> float: continue total += 1 relevant = {did for did, rel in rels.items() if rel > 0} - ranked = sorted(results[qid].items(), key=lambda kv: kv[1], reverse=True)[:k] + ranked = sorted(results[qid].items(), key=lambda kv: kv[1], reverse=True)[ + :k + ] retrieved_ids = {did for did, _ in ranked} if relevant & retrieved_ids: hits += 1 return (hits / total) if total > 0 else 0.0 - R1 = recall_at_k(1) - R5 = recall_at_k(5) - R10 = recall_at_k(10) + r1 = recall_at_k(1) + r5 = recall_at_k(5) + r10 = recall_at_k(10) - print(f"[debug] R@1={R1:.3f} R@5={R5:.3f} R@10={R10:.3f}", flush=True) + print(f"[debug] R@1={r1:.3f} R@5={r5:.3f} R@10={r10:.3f}", flush=True) # --- NDCG@10 from BEIR (robust pick of the key) --- - ndcg, _map, recall_dict, precision = retriever.evaluate(qrels, results, retriever.k_values) + ndcg, _, _, _ = retriever.evaluate(qrels, results, retriever.k_values) + def _pick_ndcg10(d): for key in (10, "10", "ndcg@10", "NDCG@10"): if key in d: @@ -256,24 +275,27 @@ def _pick_ndcg10(d): for v in d.values(): try: return _pick_ndcg10(v) - except Exception: + except (TypeError, KeyError): pass return 0.0 - N10 = _pick_ndcg10(ndcg) + + n10 = _pick_ndcg10(ndcg) # --- Median rank of first relevant (as before) --- medr = float(median_rank(results, qrels)) # --- Latency p95 on first ~50 queries --- qids = list(queries.keys()) - probe = [queries[q] for q in qids[:min(50, len(qids))]] - lat = time_calls(lambda batch: beir_model.encode_queries(batch, batch_size=1), probe, warmup=3) + probe = [queries[q] for q in qids[: min(50, len(qids))]] + lat = time_calls( + lambda batch: beir_model.encode_queries(batch, batch_size=1), probe, warmup=3 + ) return { - "R@1": float(R1), - "R@5": float(R5), - "R@10": float(R10), - "NDCG@10": float(N10), + "R@1": float(r1), + "R@5": float(r5), + "R@10": float(r10), + "NDCG@10": float(n10), "MedR": float(medr), "Latency_ms_p95": float(lat), "Cost_per_1k_queries": float(cost_per_1k), @@ -282,7 +304,6 @@ def _pick_ndcg10(d): # ----------------- Main ----------------- def main(): - import yaml ap = argparse.ArgumentParser() ap.add_argument("--config", required=True) args = ap.parse_args() @@ -292,7 +313,9 @@ def main(): out_dir = cfg.get("output_dir", "./outputs") os.makedirs(out_dir, exist_ok=True) - leaderboard_path = cfg.get("leaderboard_json", f"{out_dir}/nanobeir_leaderboard.json") + leaderboard_path = cfg.get( + "leaderboard_json", f"{out_dir}/nanobeir_leaderboard.json" + ) datasets = cfg.get("datasets", ["scifact"]) cost = cfg.get("cost_per_1k", {}) @@ -307,26 +330,34 @@ def main(): if "baseline" in models_cfg: m_id = models_cfg["baseline"]["hf_id"] print(f"[baseline] {m_id}") - out = evaluate_model(STModel(m_id), corpus, queries, qrels, cost.get("baseline", 0.0)) + out = evaluate_model( + STModel(m_id), corpus, queries, qrels, cost.get("baseline", 0.0) + ) all_out[ds]["baseline"] = out if "qwen_embedding" in models_cfg: m = models_cfg["qwen_embedding"] - hf_id = m.get("hf_id", "Qwen/qwen3-embedding-0.6b") - pooling = m.get("pooling", "mean") + hf_id = m.get("hf_id", "Qwen/qwen3-embedding-0.6b") + pooling = m.get("pooling", "mean") 
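            # NOTE: the log line below hardcodes "max_len=128,196" instead of the configured max_len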
normalize = m.get("normalize", True) - max_len = m.get("max_len", 512) - print(f"[qwen_embedding] {hf_id} | pooling={pooling} normalize={normalize} max_len={max_len}") - encoder = QwenHFEncoder(hf_id, pooling=pooling, normalize=normalize, max_len=max_len) - out = evaluate_model(encoder, corpus, queries, qrels, cost.get("qwen_embedding", 0.0)) + max_len = m.get("max_len", 512) + print( + f"[qwen_embedding] {hf_id} | pooling={pooling} " + f"normalize={normalize} max_len={max_len}" + ) + encoder = QwenHFEncoder( + hf_id, pooling=pooling, normalize=normalize, max_len=max_len + ) + out = evaluate_model( + encoder, corpus, queries, qrels, cost.get("qwen_embedding", 0.0) + ) all_out[ds]["qwen_embedding"] = out - - with open(leaderboard_path, "w") as f: json.dump(all_out, f, indent=2) print(f"[saved] {leaderboard_path}") print(json.dumps(all_out, indent=2)) + if __name__ == "__main__": main() From 4c94e54c8141b1a593ec554a75d5f702d82aad6c Mon Sep 17 00:00:00 2001 From: vedjaw Date: Wed, 24 Sep 2025 14:54:10 +0530 Subject: [PATCH 07/23] tried to fixing linter error --- inference/evaluate_nanobeir.py | 149 ++++++++++++++++++++------------- inference/nanobeir_config.yaml | 8 +- 2 files changed, 94 insertions(+), 63 deletions(-) diff --git a/inference/evaluate_nanobeir.py b/inference/evaluate_nanobeir.py index 4e78b1b..a7ac433 100644 --- a/inference/evaluate_nanobeir.py +++ b/inference/evaluate_nanobeir.py @@ -7,20 +7,38 @@ - Prints a sanity check for one query so you can see relevant IDs vs retrieved IDs """ -import os, json, math, time, statistics, argparse, random -from typing import List, Dict, Any, Iterable, Union +# --- Python Standard Library Imports --- +import argparse +import json +import os +import random +import statistics +import time +from typing import Any, Dict, List + +# --- Third-Party Imports --- +import torch +import yaml +from beir import util +from beir.datasets.data_loader import GenericDataLoader +from beir.retrieval.evaluation import EvaluateRetrieval +from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES +from sentence_transformers import SentenceTransformer +from transformers import AutoModel, AutoTokenizer # ----------------- Tunables ----------------- MAX_CHARS = 2000 RETRY_BACKOFF = [0.0, 0.2, 0.6] -RETRY_SHRINK = [1.00, 0.60, 0.35] -import torch +RETRY_SHRINK = [1.00, 0.60, 0.35] + + # ----------------- Small utilities ----------------- def p95(latencies_ms: List[float]) -> float: if not latencies_ms: return 0.0 return float(statistics.quantiles(latencies_ms, n=100)[94]) + def time_calls(fn, payloads: List[Any], warmup: int = 3) -> float: if not payloads: return 0.0 @@ -34,6 +52,7 @@ def time_calls(fn, payloads: List[Any], warmup: int = 3) -> float: lat.append((t1 - t0) * 1000.0) return p95(lat) + def median_rank(results, qrels) -> float: ranks = [] for qid, doc_scores in results.items(): @@ -49,40 +68,34 @@ def median_rank(results, qrels) -> float: return 0.0 ranks.sort() m = len(ranks) - return float(ranks[m//2] if m % 2 else (ranks[m//2 - 1] + ranks[m//2]) / 2) + return float(ranks[m // 2] if m % 2 else (ranks[m // 2 - 1] + ranks[m // 2]) / 2) -# ----------------- Minimal BEIR imports ----------------- -def _imports(): - global util, GenericDataLoader, EvaluateRetrieval, DRES - from beir import util - from beir.datasets.data_loader import GenericDataLoader - from beir.retrieval.evaluation import EvaluateRetrieval - from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES # ----------------- Dataset URLs ----------------- _BEIR_URLS 
= { "scifact": "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/scifact.zip", - "fiqa": "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/fiqa.zip", + "fiqa": "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/fiqa.zip", "scidocs": "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/scidocs.zip", } + def load_dataset(name: str, base_dir: str = "./datasets"): - _imports() url_or_id = _BEIR_URLS.get(name, name) data_path = util.download_and_unzip(url_or_id, base_dir) corpus, queries, qrels = GenericDataLoader(data_folder=data_path).load(split="test") return corpus, queries, qrels + # ----------------- Model wrappers ----------------- def _doc_text(doc: Dict[str, str]) -> str: title = (doc.get("title") or "").strip() - text = (doc.get("text") or "").strip() + text = (doc.get("text") or "").strip() merged = (title + " " + text).strip() return merged[:MAX_CHARS] if len(merged) > MAX_CHARS else merged + class STModel: def __init__(self, hf_id: str): - from sentence_transformers import SentenceTransformer self.model = SentenceTransformer(hf_id) def encode_queries(self, queries, batch_size=32, **kwargs): @@ -92,8 +105,10 @@ def encode_queries(self, queries, batch_size=32, **kwargs): def encode_corpus(self, corpus, batch_size=32, **kwargs): docs_iter = corpus.values() if isinstance(corpus, dict) else corpus - texts = [((d.get("title") or "") + " " + (d.get("text") or "")).strip()[:MAX_CHARS] - for d in docs_iter] + texts = [ + ((d.get("title") or "") + " " + (d.get("text") or "")).strip()[:MAX_CHARS] + for d in docs_iter + ] return self.model.encode( texts, convert_to_numpy=True, batch_size=batch_size, show_progress_bar=True ) @@ -106,10 +121,8 @@ class QwenHFEncoder: - Mean-pools token embeddings - Optional L2 normalization """ - def __init__(self, hf_id: str, pooling: str = "mean", normalize: bool = True, max_len: int = 512): - import torch - from transformers import AutoModel, AutoTokenizer + def __init__(self, hf_id: str, pooling: str = "mean", normalize: bool = True, max_len: int = 512): self.torch = torch self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") self.pooling = pooling @@ -120,15 +133,15 @@ def __init__(self, hf_id: str, pooling: str = "mean", normalize: bool = True, ma self.tokenizer = AutoTokenizer.from_pretrained( hf_id, use_fast=False, trust_remote_code=True ) - self.model = AutoModel.from_pretrained( - hf_id, trust_remote_code=True - ).to(self.device) + self.model = AutoModel.from_pretrained(hf_id, trust_remote_code=True).to( + self.device + ) self.model.eval() def _encode_batch(self, texts, batch_size=32): embs = [] for i in range(0, len(texts), batch_size): - batch = texts[i:i + batch_size] + batch = texts[i : i + batch_size] toks = self.tokenizer( batch, padding=True, @@ -178,7 +191,7 @@ def encode_corpus(self, corpus, batch_size=32, **kwargs): def _merge(doc): title = (doc.get("title") or "").strip() - text = (doc.get("text") or "").strip() + text = (doc.get("text") or "").strip() s = (title + " " + text).strip() return s[:2000] if len(s) > 2000 else s @@ -186,9 +199,6 @@ def _merge(doc): return self._encode_batch(texts, batch_size=batch_size) - - - # ----------------- Evaluation + Debug Peek ----------------- def debug_one_query(corpus, queries, qrels, results, k=5): """Print one query's relevant IDs vs. 
top-k retrieved IDs.""" @@ -208,16 +218,22 @@ def debug_one_query(corpus, queries, qrels, results, k=5): print("\n============= DEBUG PEEK =============") print(f"Query ID: {qid}") print(f"Query: {q_text}") - print(f"Relevant doc IDs ({len(relevant)}): {sorted(list(relevant))[:10]}{' ...' if len(relevant)>10 else ''}") - print("\nTop-{} retrieved:".format(k)) + print( + f"Relevant doc IDs ({len(relevant)}): {sorted(list(relevant))[:10]}" + f"{' ...' if len(relevant)>10 else ''}" + ) + print(f"\nTop-{k} retrieved:") for i, (did, score) in enumerate(ranked, 1): - title = (corpus.get(did, {}).get('title') if isinstance(corpus, dict) else None) or "" + title = ( + (corpus.get(did, {}).get("title") if isinstance(corpus, dict) else None) + or "" + ) print(f"{i:>2}. {did} score={score:.4f} title={title[:80]}") print(f"\nOverlap@{k}: {sorted(list(overlap)) if overlap else 'None'}") print("=====================================\n", flush=True) + def evaluate_model(beir_model, corpus, queries, qrels, cost_per_1k: float): - _imports() retriever_model = DRES(beir_model, batch_size=32) retriever = EvaluateRetrieval(retriever_model, score_function="cos_sim") @@ -233,20 +249,23 @@ def recall_at_k(k: int) -> float: continue total += 1 relevant = {did for did, rel in rels.items() if rel > 0} - ranked = sorted(results[qid].items(), key=lambda kv: kv[1], reverse=True)[:k] + ranked = sorted(results[qid].items(), key=lambda kv: kv[1], reverse=True)[ + :k + ] retrieved_ids = {did for did, _ in ranked} if relevant & retrieved_ids: hits += 1 return (hits / total) if total > 0 else 0.0 - R1 = recall_at_k(1) - R5 = recall_at_k(5) - R10 = recall_at_k(10) + r1 = recall_at_k(1) + r5 = recall_at_k(5) + r10 = recall_at_k(10) - print(f"[debug] R@1={R1:.3f} R@5={R5:.3f} R@10={R10:.3f}", flush=True) + print(f"[debug] R@1={r1:.3f} R@5={r5:.3f} R@10={r10:.3f}", flush=True) # --- NDCG@10 from BEIR (robust pick of the key) --- - ndcg, _map, recall_dict, precision = retriever.evaluate(qrels, results, retriever.k_values) + ndcg, _, _, _ = retriever.evaluate(qrels, results, retriever.k_values) + def _pick_ndcg10(d): for key in (10, "10", "ndcg@10", "NDCG@10"): if key in d: @@ -256,24 +275,27 @@ def _pick_ndcg10(d): for v in d.values(): try: return _pick_ndcg10(v) - except Exception: + except (TypeError, KeyError): pass return 0.0 - N10 = _pick_ndcg10(ndcg) + + n10 = _pick_ndcg10(ndcg) # --- Median rank of first relevant (as before) --- medr = float(median_rank(results, qrels)) # --- Latency p95 on first ~50 queries --- qids = list(queries.keys()) - probe = [queries[q] for q in qids[:min(50, len(qids))]] - lat = time_calls(lambda batch: beir_model.encode_queries(batch, batch_size=1), probe, warmup=3) + probe = [queries[q] for q in qids[: min(50, len(qids))]] + lat = time_calls( + lambda batch: beir_model.encode_queries(batch, batch_size=1), probe, warmup=3 + ) return { - "R@1": float(R1), - "R@5": float(R5), - "R@10": float(R10), - "NDCG@10": float(N10), + "R@1": float(r1), + "R@5": float(r5), + "R@10": float(r10), + "NDCG@10": float(n10), "MedR": float(medr), "Latency_ms_p95": float(lat), "Cost_per_1k_queries": float(cost_per_1k), @@ -282,7 +304,6 @@ def _pick_ndcg10(d): # ----------------- Main ----------------- def main(): - import yaml ap = argparse.ArgumentParser() ap.add_argument("--config", required=True) args = ap.parse_args() @@ -292,7 +313,9 @@ def main(): out_dir = cfg.get("output_dir", "./outputs") os.makedirs(out_dir, exist_ok=True) - leaderboard_path = cfg.get("leaderboard_json", 
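+        # default used when leaderboard_json is not set in the config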
f"{out_dir}/nanobeir_leaderboard.json") + leaderboard_path = cfg.get( + "leaderboard_json", f"{out_dir}/nanobeir_leaderboard.json" + ) datasets = cfg.get("datasets", ["scifact"]) cost = cfg.get("cost_per_1k", {}) @@ -307,26 +330,34 @@ def main(): if "baseline" in models_cfg: m_id = models_cfg["baseline"]["hf_id"] print(f"[baseline] {m_id}") - out = evaluate_model(STModel(m_id), corpus, queries, qrels, cost.get("baseline", 0.0)) + out = evaluate_model( + STModel(m_id), corpus, queries, qrels, cost.get("baseline", 0.0) + ) all_out[ds]["baseline"] = out if "qwen_embedding" in models_cfg: m = models_cfg["qwen_embedding"] - hf_id = m.get("hf_id", "Qwen/qwen3-embedding-0.6b") - pooling = m.get("pooling", "mean") + hf_id = m.get("hf_id", "Qwen/qwen3-embedding-0.6b") + pooling = m.get("pooling", "mean") normalize = m.get("normalize", True) - max_len = m.get("max_len", 512) - print(f"[qwen_embedding] {hf_id} | pooling={pooling} normalize={normalize} max_len={max_len}") - encoder = QwenHFEncoder(hf_id, pooling=pooling, normalize=normalize, max_len=max_len) - out = evaluate_model(encoder, corpus, queries, qrels, cost.get("qwen_embedding", 0.0)) + max_len = m.get("max_len", 512) + print( + f"[qwen_embedding] {hf_id} | pooling={pooling} " + f"normalize={normalize} max_len={max_len}" + ) + encoder = QwenHFEncoder( + hf_id, pooling=pooling, normalize=normalize, max_len=max_len + ) + out = evaluate_model( + encoder, corpus, queries, qrels, cost.get("qwen_embedding", 0.0) + ) all_out[ds]["qwen_embedding"] = out - - with open(leaderboard_path, "w") as f: json.dump(all_out, f, indent=2) print(f"[saved] {leaderboard_path}") print(json.dumps(all_out, indent=2)) + if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/inference/nanobeir_config.yaml b/inference/nanobeir_config.yaml index b4eaf51..c65df13 100644 --- a/inference/nanobeir_config.yaml +++ b/inference/nanobeir_config.yaml @@ -4,11 +4,11 @@ leaderboard_json: ./outputs/nanobeir_leaderboard.json datasets: ["scifact"] cost_per_1k: - qwen_embedding: 0.10 # <- set this to whatever your team uses + qwen_embedding: 0.10 # <- set this to whatever your team uses models: qwen_embedding: hf_id: Qwen/qwen3-embedding-0.6b - pooling: mean # mean | cls | last_token (we’ll implement mean) - normalize: true # L2-normalize embeddings - max_len: 512 # truncate to fit model context + pooling: mean # mean | cls | last_token (we’ll implement mean) + normalize: true # L2-normalize embeddings + max_len: 512 # truncate to fit model context \ No newline at end of file From 8b63156caff5ccb728128e50a37cb4a0b2a7a9a8 Mon Sep 17 00:00:00 2001 From: vedjaw Date: Wed, 24 Sep 2025 16:22:02 +0530 Subject: [PATCH 08/23] lint error fixing --- inference/evaluate_nanobeir.py | 22 +++++++++++++++------- pyproject.toml | 2 +- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/inference/evaluate_nanobeir.py b/inference/evaluate_nanobeir.py index a7ac433..63dafcb 100644 --- a/inference/evaluate_nanobeir.py +++ b/inference/evaluate_nanobeir.py @@ -100,7 +100,10 @@ def __init__(self, hf_id: str): def encode_queries(self, queries, batch_size=32, **kwargs): return self.model.encode( - queries, convert_to_numpy=True, batch_size=batch_size, show_progress_bar=False + queries, + convert_to_numpy=True, + batch_size=batch_size, + show_progress_bar=False, ) def encode_corpus(self, corpus, batch_size=32, **kwargs): @@ -122,7 +125,13 @@ class QwenHFEncoder: - Optional L2 normalization """ - def __init__(self, hf_id: str, pooling: str = "mean", normalize: bool = 
True, max_len: int = 512): + def __init__( + self, + hf_id: str, + pooling: str = "mean", + normalize: bool = True, + max_len: int = 512, + ): self.torch = torch self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") self.pooling = pooling @@ -220,14 +229,13 @@ def debug_one_query(corpus, queries, qrels, results, k=5): print(f"Query: {q_text}") print( f"Relevant doc IDs ({len(relevant)}): {sorted(list(relevant))[:10]}" - f"{' ...' if len(relevant)>10 else ''}" + f"{' ...' if len(relevant) > 10 else ''}" ) print(f"\nTop-{k} retrieved:") for i, (did, score) in enumerate(ranked, 1): title = ( - (corpus.get(did, {}).get("title") if isinstance(corpus, dict) else None) - or "" - ) + corpus.get(did, {}).get("title") if isinstance(corpus, dict) else None + ) or "" print(f"{i:>2}. {did} score={score:.4f} title={title[:80]}") print(f"\nOverlap@{k}: {sorted(list(overlap)) if overlap else 'None'}") print("=====================================\n", flush=True) @@ -360,4 +368,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/pyproject.toml b/pyproject.toml index 5f29a80..b4943ba 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,4 +21,4 @@ ignore = [] [tool.ruff.format] quote-style = "double" -indent-style = "space" +indent-style = "space" \ No newline at end of file From f10b33a3c452873a2c01c1808c4e124183ec55f0 Mon Sep 17 00:00:00 2001 From: vedjaw Date: Wed, 24 Sep 2025 16:35:22 +0530 Subject: [PATCH 09/23] lint error fixing --- inference/evaluate_nanobeir.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/inference/evaluate_nanobeir.py b/inference/evaluate_nanobeir.py index d75f5c6..c331bc4 100644 --- a/inference/evaluate_nanobeir.py +++ b/inference/evaluate_nanobeir.py @@ -116,12 +116,15 @@ def encode_corpus(self, corpus, batch_size=32, **kwargs): texts, convert_to_numpy=True, batch_size=batch_size, show_progress_bar=True ) + class LlamaServerEncoder: """Client for llama.cpp --embedding server (/embedding). 
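Falls back to a 512-character snippet if all retries fail.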
Ensures fixed-dim vectors.""" + def __init__(self, endpoint: str): import requests from requests.adapters import HTTPAdapter from urllib3.util.retry import Retry + self.session = requests.Session() retry = Retry(total=3, backoff_factor=0.2, status_forcelist=(502, 503, 504)) self.session.mount("http://", HTTPAdapter(max_retries=retry)) @@ -149,6 +152,7 @@ def first_vector(obj): return first_vector(obj[0]) return obj return [obj] + return first_vector(js) def _to_1d_numeric(self, obj): @@ -159,12 +163,14 @@ def flatten(xs): elif isinstance(x, dict): for k in ("vector", "embedding", "values", "data"): if k in x: - yield from flatten(x[k]); break + yield from flatten(x[k]) + break else: for v in x.values(): yield from flatten(v) else: yield x + vec = list(flatten(self._parse_embedding_json(obj))) vec = [float(x) for x in vec] return vec @@ -177,13 +183,14 @@ def _normalize_vec(self, vec): if len(vec) < self._dim: vec = vec + [0.0] * (self._dim - len(vec)) elif len(vec) > self._dim: - vec = vec[:self._dim] + vec = vec[: self._dim] return vec def _embed_one(self, text: str, timeout=60): orig = text for backoff, shrink in zip(RETRY_BACKOFF, RETRY_SHRINK): - if backoff: time.sleep(backoff) + if backoff: + time.sleep(backoff) t = orig[: int(MAX_CHARS * shrink)] r = self.session.post(self.endpoint, json={"content": t}, timeout=timeout) if r.status_code >= 500: @@ -204,6 +211,7 @@ def encode_corpus(self, corpus, batch_size=1, **kwargs): texts = [_doc_text(d) for d in docs_iter] return [self._embed_one(t) for t in texts] + # ----------------- Evaluation + Debug Peek ----------------- def debug_one_query(corpus, queries, qrels, results, k=5): """Print one query's relevant IDs vs. top-k retrieved IDs.""" @@ -223,10 +231,14 @@ def debug_one_query(corpus, queries, qrels, results, k=5): print("\n============= DEBUG PEEK =============") print(f"Query ID: {qid}") print(f"Query: {q_text}") - print(f"Relevant doc IDs ({len(relevant)}): {sorted(list(relevant))[:10]}{' ...' if len(relevant)>10 else ''}") + print( + f"Relevant doc IDs ({len(relevant)}): {sorted(list(relevant))[:10]}{' ...' if len(relevant) > 10 else ''}" + ) print("\nTop-{} retrieved:".format(k)) for i, (did, score) in enumerate(ranked, 1): - title = (corpus.get(did, {}).get('title') if isinstance(corpus, dict) else None) or "" + title = ( + corpus.get(did, {}).get("title") if isinstance(corpus, dict) else None + ) or "" print(f"{i:>2}. 
{did} score={score:.4f} title={title[:80]}") print(f"\nOverlap@{k}: {sorted(list(overlap)) if overlap else 'None'}") print("=====================================\n", flush=True) From 466985c4a05cb5dbc6bb1849ea3a07e1ab89d636 Mon Sep 17 00:00:00 2001 From: vedjaw Date: Sat, 27 Sep 2025 04:35:43 +0530 Subject: [PATCH 10/23] changed few details --- inference/evaluate_nanobeir.py | 193 ++++++++++++++++----------------- inference/nanobeir_config.yaml | 19 +++- 2 files changed, 108 insertions(+), 104 deletions(-) diff --git a/inference/evaluate_nanobeir.py b/inference/evaluate_nanobeir.py index c331bc4..a7ac433 100644 --- a/inference/evaluate_nanobeir.py +++ b/inference/evaluate_nanobeir.py @@ -100,10 +100,7 @@ def __init__(self, hf_id: str): def encode_queries(self, queries, batch_size=32, **kwargs): return self.model.encode( - queries, - convert_to_numpy=True, - batch_size=batch_size, - show_progress_bar=False, + queries, convert_to_numpy=True, batch_size=batch_size, show_progress_bar=False ) def encode_corpus(self, corpus, batch_size=32, **kwargs): @@ -117,99 +114,89 @@ def encode_corpus(self, corpus, batch_size=32, **kwargs): ) -class LlamaServerEncoder: - """Client for llama.cpp --embedding server (/embedding). Ensures fixed-dim vectors.""" - - def __init__(self, endpoint: str): - import requests - from requests.adapters import HTTPAdapter - from urllib3.util.retry import Retry - - self.session = requests.Session() - retry = Retry(total=3, backoff_factor=0.2, status_forcelist=(502, 503, 504)) - self.session.mount("http://", HTTPAdapter(max_retries=retry)) - self.endpoint = endpoint.rstrip("/") - self._dim = None # lock the embedding size after first good response - - def _parse_embedding_json(self, js): - # try to dig out the first numeric list from many shapes - def first_vector(obj): - if isinstance(obj, dict): - # common fields - for k in ("embedding", "vector", "values", "data"): - if k in obj: - return first_vector(obj[k]) - # otherwise try the first value - if obj: - return first_vector(next(iter(obj.values()))) - return [] - if isinstance(obj, (list, tuple)): - if not obj: - return [] - if isinstance(obj[0], dict): - return first_vector(obj[0]) - if isinstance(obj[0], (list, tuple)): - return first_vector(obj[0]) - return obj - return [obj] - - return first_vector(js) - - def _to_1d_numeric(self, obj): - def flatten(xs): - for x in xs: - if isinstance(x, (list, tuple)): - yield from flatten(x) - elif isinstance(x, dict): - for k in ("vector", "embedding", "values", "data"): - if k in x: - yield from flatten(x[k]) - break - else: - for v in x.values(): - yield from flatten(v) - else: - yield x - - vec = list(flatten(self._parse_embedding_json(obj))) - vec = [float(x) for x in vec] - return vec - - def _normalize_vec(self, vec): - vec = self._to_1d_numeric(vec) - if self._dim is None: - self._dim = len(vec) - print(f"[llama] locked embedding dim = {self._dim}", flush=True) - if len(vec) < self._dim: - vec = vec + [0.0] * (self._dim - len(vec)) - elif len(vec) > self._dim: - vec = vec[: self._dim] - return vec - - def _embed_one(self, text: str, timeout=60): - orig = text - for backoff, shrink in zip(RETRY_BACKOFF, RETRY_SHRINK): - if backoff: - time.sleep(backoff) - t = orig[: int(MAX_CHARS * shrink)] - r = self.session.post(self.endpoint, json={"content": t}, timeout=timeout) - if r.status_code >= 500: - continue - r.raise_for_status() - return self._normalize_vec(r.json()) - # final tiny attempt - t = orig[: min(512, len(orig))] - r = self.session.post(self.endpoint, json={"content": 
t}, timeout=timeout) - r.raise_for_status() - return self._normalize_vec(r.json()) - - def encode_queries(self, queries, batch_size=1, **kwargs): - return [self._embed_one(q) for q in queries] - - def encode_corpus(self, corpus, batch_size=1, **kwargs): - docs_iter = corpus.values() if isinstance(corpus, dict) else corpus - texts = [_doc_text(d) for d in docs_iter] - return [self._embed_one(t) for t in texts] +class QwenHFEncoder: + """ + HF encoder for Qwen/qwen3-embedding-0.6b (or compatible). + - Handles dict or list corpora (BEIR can pass either) + - Mean-pools token embeddings + - Optional L2 normalization + """ + + def __init__(self, hf_id: str, pooling: str = "mean", normalize: bool = True, max_len: int = 512): + self.torch = torch + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + self.pooling = pooling + self.normalize = normalize + self.max_len = max_len + + # Use slow tokenizer + trust_remote_code to avoid the fast-tokenizer JSON error + self.tokenizer = AutoTokenizer.from_pretrained( + hf_id, use_fast=False, trust_remote_code=True + ) + self.model = AutoModel.from_pretrained(hf_id, trust_remote_code=True).to( + self.device + ) + self.model.eval() + + def _encode_batch(self, texts, batch_size=32): + embs = [] + for i in range(0, len(texts), batch_size): + batch = texts[i : i + batch_size] + toks = self.tokenizer( + batch, + padding=True, + truncation=True, + max_length=self.max_len, + return_tensors="pt", + ).to(self.device) + + with self.torch.no_grad(): + out = self.model(**toks) + + # Prefer last_hidden_state, fall back to pooler_output if present + if hasattr(out, "last_hidden_state") and out.last_hidden_state is not None: + x = out.last_hidden_state # [B, T, H] + # attention mask-based mean pooling + mask = toks.attention_mask.unsqueeze(-1).type_as(x) # [B, T, 1] + x = (x * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9) # [B, H] + elif hasattr(out, "pooler_output") and out.pooler_output is not None: + x = out.pooler_output # [B, H] + else: + # very defensive fallback + x = out[0] if isinstance(out, (list, tuple)) else out + + if self.normalize: + x = self.torch.nn.functional.normalize(x, p=2, dim=-1) + + embs.extend(x.detach().cpu().tolist()) + return embs + + def encode_queries(self, queries, batch_size=32, **kwargs): + # queries is a list[str] + return self._encode_batch(list(queries), batch_size=batch_size) + + def encode_corpus(self, corpus, batch_size=32, **kwargs): + """ + corpus can be: + - dict: {doc_id: {"title":..., "text":...}, ...} + - list: [{"title":..., "text":...}, ...] + Return must be a list of embeddings aligned to the order BEIR provides. + """ + if isinstance(corpus, dict): + docs_iter = corpus.values() + elif isinstance(corpus, list): + docs_iter = corpus + else: + raise TypeError(f"Unsupported corpus type: {type(corpus)}") + + def _merge(doc): + title = (doc.get("title") or "").strip() + text = (doc.get("text") or "").strip() + s = (title + " " + text).strip() + return s[:2000] if len(s) > 2000 else s + + texts = [_merge(d) for d in docs_iter] + return self._encode_batch(texts, batch_size=batch_size) # ----------------- Evaluation + Debug Peek ----------------- @@ -232,13 +219,15 @@ def debug_one_query(corpus, queries, qrels, results, k=5): print(f"Query ID: {qid}") print(f"Query: {q_text}") print( - f"Relevant doc IDs ({len(relevant)}): {sorted(list(relevant))[:10]}{' ...' if len(relevant) > 10 else ''}" + f"Relevant doc IDs ({len(relevant)}): {sorted(list(relevant))[:10]}" + f"{' ...' 
if len(relevant)>10 else ''}" ) - print("\nTop-{} retrieved:".format(k)) + print(f"\nTop-{k} retrieved:") for i, (did, score) in enumerate(ranked, 1): title = ( - corpus.get(did, {}).get("title") if isinstance(corpus, dict) else None - ) or "" + (corpus.get(did, {}).get("title") if isinstance(corpus, dict) else None) + or "" + ) print(f"{i:>2}. {did} score={score:.4f} title={title[:80]}") print(f"\nOverlap@{k}: {sorted(list(overlap)) if overlap else 'None'}") print("=====================================\n", flush=True) @@ -371,4 +360,4 @@ def main(): if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/inference/nanobeir_config.yaml b/inference/nanobeir_config.yaml index bc43be7..3c9ea92 100644 --- a/inference/nanobeir_config.yaml +++ b/inference/nanobeir_config.yaml @@ -1,3 +1,18 @@ +# output_dir: ./outputs +# leaderboard_json: ./outputs/nanobeir_leaderboard.json + +# datasets: ["scifact"] + +# cost_per_1k: +# qwen_embedding: 0.10 # <- set this to whatever your team uses + +# models: +# qwen_embedding: +# hf_id: Qwen/qwen3-embedding-0.6b +# pooling: mean # mean | cls | last_token (we’ll implement mean) +# normalize: true # L2-normalize embeddings +# max_len: 512 # truncate to fit model context + output_dir: ./outputs leaderboard_json: ./outputs/nanobeir_leaderboard.json @@ -9,6 +24,6 @@ cost_per_1k: models: qwen_embedding: hf_id: Qwen/qwen3-embedding-0.6b - pooling: mean # mean | cls | last_token (we’ll implement mean) + pooling: mean # mean | cls | last_token (we’ll implement mean) normalize: true # L2-normalize embeddings - max_len: 512 # truncate to fit model context + max_len: 512 # truncate to fit model context \ No newline at end of file From d5fb84dec0151ec15eb2b72c857efedc6f103ad9 Mon Sep 17 00:00:00 2001 From: vedjaw Date: Sat, 27 Sep 2025 05:02:17 +0530 Subject: [PATCH 11/23] fixed linter --- inference/evaluate_nanobeir.py | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/inference/evaluate_nanobeir.py b/inference/evaluate_nanobeir.py index a7ac433..b9a6def 100644 --- a/inference/evaluate_nanobeir.py +++ b/inference/evaluate_nanobeir.py @@ -14,7 +14,7 @@ import random import statistics import time -from typing import Any, Dict, List +from typing import Any # --- Third-Party Imports --- import torch @@ -33,13 +33,13 @@ # ----------------- Small utilities ----------------- -def p95(latencies_ms: List[float]) -> float: +def p95(latencies_ms: list[float]) -> float: if not latencies_ms: return 0.0 return float(statistics.quantiles(latencies_ms, n=100)[94]) -def time_calls(fn, payloads: List[Any], warmup: int = 3) -> float: +def time_calls(fn, payloads: list[Any], warmup: int = 3) -> float: if not payloads: return 0.0 for _ in range(max(0, warmup)): @@ -87,7 +87,7 @@ def load_dataset(name: str, base_dir: str = "./datasets"): # ----------------- Model wrappers ----------------- -def _doc_text(doc: Dict[str, str]) -> str: +def _doc_text(doc: dict[str, str]) -> str: title = (doc.get("title") or "").strip() text = (doc.get("text") or "").strip() merged = (title + " " + text).strip() @@ -100,7 +100,10 @@ def __init__(self, hf_id: str): def encode_queries(self, queries, batch_size=32, **kwargs): return self.model.encode( - queries, convert_to_numpy=True, batch_size=batch_size, show_progress_bar=False + queries, + convert_to_numpy=True, + batch_size=batch_size, + show_progress_bar=False, ) def encode_corpus(self, corpus, batch_size=32, **kwargs): @@ -122,7 +125,13 @@ class QwenHFEncoder: - Optional L2 
normalization """ - def __init__(self, hf_id: str, pooling: str = "mean", normalize: bool = True, max_len: int = 512): + def __init__( + self, + hf_id: str, + pooling: str = "mean", + normalize: bool = True, + max_len: int = 512 + ): self.torch = torch self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") self.pooling = pooling @@ -163,7 +172,7 @@ def _encode_batch(self, texts, batch_size=32): x = out.pooler_output # [B, H] else: # very defensive fallback - x = out[0] if isinstance(out, (list, tuple)) else out + x = out[0] if isinstance(out, (list | tuple)) else out if self.normalize: x = self.torch.nn.functional.normalize(x, p=2, dim=-1) @@ -308,7 +317,7 @@ def main(): ap.add_argument("--config", required=True) args = ap.parse_args() - with open(args.config, "r") as f: + with open(args.config) as f: cfg = yaml.safe_load(f) out_dir = cfg.get("output_dir", "./outputs") @@ -321,7 +330,7 @@ def main(): cost = cfg.get("cost_per_1k", {}) models_cfg = cfg.get("models", {}) - all_out: Dict[str, Dict[str, Any]] = {} + all_out: dict[str, dict[str, Any]] = {} for ds in datasets: corpus, queries, qrels = load_dataset(ds) @@ -360,4 +369,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() From ebd910e0b9bab15413aa8c02b98feb0abcc3121a Mon Sep 17 00:00:00 2001 From: vedjaw Date: Sat, 27 Sep 2025 05:11:21 +0530 Subject: [PATCH 12/23] style: format code with ruff --- inference/evaluate_nanobeir.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/inference/evaluate_nanobeir.py b/inference/evaluate_nanobeir.py index b9a6def..67d169a 100644 --- a/inference/evaluate_nanobeir.py +++ b/inference/evaluate_nanobeir.py @@ -130,7 +130,7 @@ def __init__( hf_id: str, pooling: str = "mean", normalize: bool = True, - max_len: int = 512 + max_len: int = 512, ): self.torch = torch self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") @@ -229,14 +229,13 @@ def debug_one_query(corpus, queries, qrels, results, k=5): print(f"Query: {q_text}") print( f"Relevant doc IDs ({len(relevant)}): {sorted(list(relevant))[:10]}" - f"{' ...' if len(relevant)>10 else ''}" + f"{' ...' if len(relevant) > 10 else ''}" ) print(f"\nTop-{k} retrieved:") for i, (did, score) in enumerate(ranked, 1): title = ( - (corpus.get(did, {}).get("title") if isinstance(corpus, dict) else None) - or "" - ) + corpus.get(did, {}).get("title") if isinstance(corpus, dict) else None + ) or "" print(f"{i:>2}. 
{did} score={score:.4f} title={title[:80]}")
     print(f"\nOverlap@{k}: {sorted(list(overlap)) if overlap else 'None'}")
     print("=====================================\n", flush=True)
 

From 08ab7decb606255a665f131b5da3e5dd44447544 Mon Sep 17 00:00:00 2001
From: vedjaw
Date: Sat, 27 Sep 2025 22:45:50 +0530
Subject: [PATCH 13/23] integrate vLLM API encoder

---
 inference/evaluate_nanobeir.py | 85 ++++++++++++++++++++++++++++++++++
 inference/nanobeir_config.yaml | 35 ++++++--------
 2 files changed, 99 insertions(+), 21 deletions(-)

diff --git a/inference/evaluate_nanobeir.py b/inference/evaluate_nanobeir.py
index 67d169a..031fca2 100644
--- a/inference/evaluate_nanobeir.py
+++ b/inference/evaluate_nanobeir.py
@@ -23,6 +23,7 @@
 from beir.datasets.data_loader import GenericDataLoader
 from beir.retrieval.evaluation import EvaluateRetrieval
 from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES
+from openai import OpenAI
 from sentence_transformers import SentenceTransformer
 from transformers import AutoModel, AutoTokenizer
 
@@ -309,6 +310,73 @@ def _pick_ndcg10(d):
         "Cost_per_1k_queries": float(cost_per_1k),
     }
 
+# ... (inside evaluate_nanobeir.py)
+
+class VLLMAPIEncoder:
+    """
+    Client for a vLLM OpenAI-compatible server that provides embeddings.
+    - Handles retry logic for robustness.
+    """
+    def __init__(self, base_url: str, model_name: str, max_len: int = 512):
+        # NOTE: vLLM embed API uses the client, but is simpler than chat/completion.
+        self.client = OpenAI(base_url=base_url, api_key="sk-not-used-by-vllm-server")
+        self.model_name = model_name
+        self.max_len = max_len
+        self.normalize = True
+
+    def _merge_and_truncate(self, doc_list):
+        # Helper function to preprocess text the same way
+        texts = []
+        for d in doc_list:
+            title = (d.get("title") or "").strip()
+            text = (d.get("text") or "").strip()
+            s = (title + " " + text).strip()
+            texts.append(s[:self.max_len] if len(s) > self.max_len else s)
+        return texts
+
+    def _call_api(self, texts: list[str]) -> list[list[float]]:
+        # Using the OpenAI-compatible embedding endpoint
+        # Add retry logic to handle transient network/server issues
+        for delay in RETRY_BACKOFF:
+            time.sleep(delay)
+            try:
+                # The input list of strings is sent to the /v1/embeddings endpoint
+                response = self.client.embeddings.create(
+                    model=self.model_name,
+                    input=texts,
+                )
+
+                # The response is typically a list of dicts with 'embedding' key
+                # We extract the embeddings and return them as a list of lists (vectors)
+                embeddings = [d.embedding for d in response.data]
+                return embeddings
+
+            except Exception as e:
+                print(f"[vLLM-API-ERROR] Failed to get embeddings: {e}. Retrying...")
+                continue
+
+        # If all retries fail, raise the last exception or return an error state
+        raise RuntimeError("vLLM API failed after all retries.")
+
+    def encode_queries(self, queries, batch_size=32, **kwargs):
+        # queries is a list[str]. Note: vLLM handles the batching internally
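+        # Assumption behind the note above: the whole query list goes out in a
+        # single request and the server does its own internal batching; per-call
+        # chunking only becomes necessary on the much larger corpus side, as
+        # encode_corpus notes below.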
+        return self._call_api(list(queries))
+
+    def encode_corpus(self, corpus, batch_size=32, **kwargs):
+        # corpus is dict or list (BEIR-compatible format)
+        if isinstance(corpus, dict):
+            docs_iter = corpus.values()
+        elif isinstance(corpus, list):
+            docs_iter = corpus
+        else:
+            raise TypeError(f"Unsupported corpus type: {type(corpus)}")
+
+        texts = self._merge_and_truncate(docs_iter)
+        # Note: vLLM API server is better at handling large corpus input than
+        # the local HF encoder, but for robustness with huge datasets, you
+        # might need to add chunking/batching around _call_api if the number
+        # of documents exceeds vLLM's max batch size (which is usually very large).
+        return self._call_api(texts)
 
 # ----------------- Main -----------------
 def main():
@@ -343,6 +411,23 @@ def main():
         )
         all_out[ds]["baseline"] = out
 
+        if "vllm_api_encoder" in models_cfg:
+            m = models_cfg["vllm_api_encoder"]
+            base_url = m.get("base_url", "http://localhost:8000/v1")
+            model_name = m.get("model_name", "Qwen/qwen3-embedding-0.6b")
+            max_len = m.get("max_len", 512)
+            print(
+                f"[vllm_api_encoder] {model_name} | base_url={base_url} "
+                f"max_len={max_len}"
+            )
+            encoder = VLLMAPIEncoder(
+                base_url=base_url, model_name=model_name, max_len=max_len
+            )
+            out = evaluate_model(
+                encoder, corpus, queries, qrels, cost.get("vllm_api_encoder", 0.0)
+            )
+            all_out[ds]["vllm_api_encoder"] = out
+
         if "qwen_embedding" in models_cfg:
             m = models_cfg["qwen_embedding"]
             hf_id = m.get("hf_id", "Qwen/qwen3-embedding-0.6b")
diff --git a/inference/nanobeir_config.yaml b/inference/nanobeir_config.yaml
index 3c9ea92..c718355 100644
--- a/inference/nanobeir_config.yaml
+++ b/inference/nanobeir_config.yaml
@@ -1,29 +1,22 @@
-# output_dir: ./outputs
-# leaderboard_json: ./outputs/nanobeir_leaderboard.json
-
-# datasets: ["scifact"]
-
-# cost_per_1k:
-#   qwen_embedding: 0.10 # <- set this to whatever your team uses
-
-# models:
-#   qwen_embedding:
-#     hf_id: Qwen/qwen3-embedding-0.6b
-#     pooling: mean # mean | cls | last_token (we’ll implement mean)
-#     normalize: true # L2-normalize embeddings
-#     max_len: 512 # truncate to fit model context
-
 output_dir: ./outputs
 leaderboard_json: ./outputs/nanobeir_leaderboard.json
 
 datasets: ["scifact"]
 
 cost_per_1k:
-  qwen_embedding: 0.10 # <- set this to whatever your team uses
+  #qwen_embedding: 0.10 # <- set this to whatever your team uses
+  vllm_api_encoder: 0.05
 
 models:
-  qwen_embedding:
-    hf_id: Qwen/qwen3-embedding-0.6b
-    pooling: mean # mean | cls | last_token (we’ll implement mean)
-    normalize: true # L2-normalize embeddings
-    max_len: 512 # truncate to fit model context
+  # qwen_embedding:
+  #   hf_id: Qwen/qwen3-embedding-0.6b
+  #   pooling: mean # mean | cls | last_token (we’ll implement mean)
+  #   normalize: true # L2-normalize embeddings
+  #   max_len: 512 # truncate to fit model context
+
+  vllm_api_encoder:
+    # This should match the model ID you start the vLLM server with
+    model_name: Qwen/qwen3-embedding-0.6b
+    # vLLM's default OpenAI-compatible API endpoint
+    base_url: http://172.24.16.155:8000/v1
+    max_len: 512 # Truncate to match the model's context
\ No newline at end of file

From 2633666ae3dc111d6c449317c76761136ecf5c03 Mon Sep 17 00:00:00 2001
From: vedjaw
Date: Sat, 27 Sep 2025 22:51:54 +0530
Subject: [PATCH 14/23] fixed lint error

---
 inference/evaluate_nanobeir.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/inference/evaluate_nanobeir.py b/inference/evaluate_nanobeir.py
index 031fca2..1ce90f9 100644
--- a/inference/evaluate_nanobeir.py
+++ b/inference/evaluate_nanobeir.py
@@ -310,13 +310,16 @@ def _pick_ndcg10(d):
         "Cost_per_1k_queries": float(cost_per_1k),
     }
 
+
 # ... (inside evaluate_nanobeir.py)
 
+
 class VLLMAPIEncoder:
     """
     Client for a vLLM OpenAI-compatible server that provides embeddings.
     - Handles retry logic for robustness.
     """
+
     def __init__(self, base_url: str, model_name: str, max_len: int = 512):
         # NOTE: vLLM embed API uses the client, but is simpler than chat/completion.
         self.client = OpenAI(base_url=base_url, api_key="sk-not-used-by-vllm-server")
@@ -331,7 +334,7 @@ def _merge_and_truncate(self, doc_list):
             title = (d.get("title") or "").strip()
             text = (d.get("text") or "").strip()
             s = (title + " " + text).strip()
-            texts.append(s[:self.max_len] if len(s) > self.max_len else s)
+            texts.append(s[: self.max_len] if len(s) > self.max_len else s)
         return texts
 
     def _call_api(self, texts: list[str]) -> list[list[float]]:
@@ -378,6 +381,7 @@ def encode_corpus(self, corpus, batch_size=32, **kwargs):
         # of documents exceeds vLLM's max batch size (which is usually very large).
         return self._call_api(texts)
 
+
 # ----------------- Main -----------------
 def main():

From ede13c010406dfa3de689234968e2ae60171eb77 Mon Sep 17 00:00:00 2001
From: vedjaw
Date: Tue, 30 Sep 2025 22:11:58 +0530
Subject: [PATCH 15/23] implemented on zeta-alpha-ai/NanoDBPedia dataset

---
 inference/evaluate_nanobeir.py | 88 ++++++++++++++++++++++++++++++++--
 inference/nanobeir_config.yaml |  5 +-
 2 files changed, 87 insertions(+), 6 deletions(-)

diff --git a/inference/evaluate_nanobeir.py b/inference/evaluate_nanobeir.py
index 1ce90f9..8c580b0 100644
--- a/inference/evaluate_nanobeir.py
+++ b/inference/evaluate_nanobeir.py
@@ -16,13 +16,15 @@
 import time
 from typing import Any
 
+import pandas as pd
+
 # --- Third-Party Imports ---
 import torch
 import yaml
-from beir import util
 from beir.datasets.data_loader import GenericDataLoader
 from beir.retrieval.evaluation import EvaluateRetrieval
 from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES
+from huggingface_hub import snapshot_download
 from openai import OpenAI
 from sentence_transformers import SentenceTransformer
 from transformers import AutoModel, AutoTokenizer
@@ -81,9 +83,87 @@ def median_rank(results, qrels) -> float:
 
 
 def load_dataset(name: str, base_dir: str = "./datasets"):
-    url_or_id = _BEIR_URLS.get(name, name)
-    data_path = util.download_and_unzip(url_or_id, base_dir)
-    corpus, queries, qrels = GenericDataLoader(data_folder=data_path).load(split="test")
+    print(f"[dataset] Attempting to load dataset: '{name}'")
+
+    if "/" not in name:
+        raise ValueError(
+            f"'{name}' must be a Hugging Face dataset ID like 'user/repo'."
+        )
+
+    # --- Step 1: Download the raw files from the Hub ---
+    print(f"-> Downloading '{name}' from the Hub...")
+    try:
+        local_path = snapshot_download(repo_id=name, repo_type="dataset")
+        print(f"--> Dataset downloaded to cache: {local_path}")
+    except Exception as e:
+        error_message = (
+            f"\n[ERROR] Failed to download repository '{name}'. "
+            "Please double-check the name in your YAML config."
+        )
+        print(error_message)
+        raise e
+
+    # --- Step 2: Convert and Sanitize files from Parquet to BEIR format ---
+    print("--> Downloaded Parquet files. 
Converting & sanitizing to BEIR format...") + + # Convert Corpus + corpus_parquet_path = os.path.join( + local_path, "corpus", "train-00000-of-00001.parquet" + ) + corpus_jsonl_path = os.path.join(local_path, "corpus.jsonl") + if not os.path.exists(corpus_jsonl_path): + corpus_df = pd.read_parquet(corpus_parquet_path) + + # --- THE BULLETPROOF FIX --- + print("--> Applying final sanitization to corpus...") + # 1. Ensure a 'title' column exists. If not, create it with empty strings. + if "title" not in corpus_df.columns: + print("--> 'title' column not found. Creating empty 'title' column.") + corpus_df["title"] = "" + + # 2. Sanitize both 'title' and 'text' columns to replace any None values. + corpus_df["title"] = corpus_df["title"].fillna("") + corpus_df["text"] = corpus_df["text"].fillna("") + + corpus_df.to_json(corpus_jsonl_path, orient="records", lines=True) + print(f"--> Converted and saved {corpus_jsonl_path}") + + # Convert Queries (Sanitization for safety) + queries_parquet_path = os.path.join( + local_path, "queries", "train-00000-of-00001.parquet" + ) + queries_jsonl_path = os.path.join(local_path, "queries.jsonl") + if not os.path.exists(queries_jsonl_path): + queries_df = pd.read_parquet(queries_parquet_path) + queries_df["text"] = queries_df["text"].fillna("") + queries_df.to_json(queries_jsonl_path, orient="records", lines=True) + print(f"--> Converted and saved {queries_jsonl_path}") + + # Convert Qrels + qrels_parquet_path = os.path.join( + local_path, "qrels", "train-00000-of-00001.parquet" + ) + qrels_dir = os.path.join(local_path, "qrels") + qrels_tsv_path = os.path.join(qrels_dir, "test.tsv") + if not os.path.exists(qrels_tsv_path): + os.makedirs(qrels_dir, exist_ok=True) + qrels_df = pd.read_parquet(qrels_parquet_path) + if "score" not in qrels_df.columns: + qrels_df["score"] = 1 + qrels_df = qrels_df.rename( + columns={ + "query_id": "query-id", + "corpus_id": "corpus-id", + } + ) + qrels_df.to_csv(qrels_tsv_path, sep="\t", index=False) + print(f"--> Converted and saved {qrels_tsv_path}") + + # --- Step 3: Load the newly converted, BEIR-compatible files --- + print("--> Conversion complete. Loading formatted data...") + corpus, queries, qrels = GenericDataLoader(data_folder=local_path).load( + split="test" + ) return corpus, queries, qrels diff --git a/inference/nanobeir_config.yaml b/inference/nanobeir_config.yaml index c718355..a364e64 100644 --- a/inference/nanobeir_config.yaml +++ b/inference/nanobeir_config.yaml @@ -1,8 +1,9 @@ output_dir: ./outputs leaderboard_json: ./outputs/nanobeir_leaderboard.json -datasets: ["scifact"] - +#datasets: ["scifact"] +datasets: ["zeta-alpha-ai/NanoDBPedia"] +#datasets: ["scifact-nano"] cost_per_1k: #qwen_embedding: 0.10 # <- set this to whatever your team uses vllm_api_encoder: 0.05 From ca575d05c8b142379b191f3878a1f387e237c03d Mon Sep 17 00:00:00 2001 From: vedjaw Date: Wed, 1 Oct 2025 13:40:27 +0530 Subject: [PATCH 16/23] did a minor change in json output by removing the cost output. 
--- inference/evaluate_nanobeir.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inference/evaluate_nanobeir.py b/inference/evaluate_nanobeir.py index 8c580b0..a106238 100644 --- a/inference/evaluate_nanobeir.py +++ b/inference/evaluate_nanobeir.py @@ -387,7 +387,7 @@ def _pick_ndcg10(d): "NDCG@10": float(n10), "MedR": float(medr), "Latency_ms_p95": float(lat), - "Cost_per_1k_queries": float(cost_per_1k), + # "Cost_per_1k_queries": float(cost_per_1k), } From 7a688e414443f684f53d06925e99ed426751c438 Mon Sep 17 00:00:00 2001 From: vedjaw Date: Thu, 2 Oct 2025 17:55:14 +0530 Subject: [PATCH 17/23] throughput calculation done --- inference/evaluate_nanobeir.py | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/inference/evaluate_nanobeir.py b/inference/evaluate_nanobeir.py index a106238..018d683 100644 --- a/inference/evaluate_nanobeir.py +++ b/inference/evaluate_nanobeir.py @@ -42,6 +42,28 @@ def p95(latencies_ms: list[float]) -> float: return float(statistics.quantiles(latencies_ms, n=100)[94]) +def measure_throughput(model, queries: list[str], batch_size: int = 32) -> float: + """Measures true batch throughput in Queries Per Second.""" + if not queries: + return 0.0 + + # Warmup run + _ = model.encode_queries(queries[:batch_size], batch_size=batch_size) + + print(f"[debug] Measuring throughput on {len(queries)} queries...") + t_start = time.time() + _ = model.encode_queries(queries, batch_size=batch_size) + t_end = time.time() + + total_time = t_end - t_start + if total_time == 0: + return 0.0 + + qps = len(queries) / total_time + print(f"[debug] Throughput: {qps:.2f} QPS") + return qps + + def time_calls(fn, payloads: list[Any], warmup: int = 3) -> float: if not payloads: return 0.0 @@ -379,7 +401,10 @@ def _pick_ndcg10(d): lat = time_calls( lambda batch: beir_model.encode_queries(batch, batch_size=1), probe, warmup=3 ) - + #lat_p95 = float(lat) + # est_qps = 1000.0 / lat_p95 if lat_p95 > 0 else 0.0 + all_query_texts = list(queries.values()) + true_qps = measure_throughput(beir_model, all_query_texts) return { "R@1": float(r1), "R@5": float(r5), @@ -387,7 +412,8 @@ def _pick_ndcg10(d): "NDCG@10": float(n10), "MedR": float(medr), "Latency_ms_p95": float(lat), - # "Cost_per_1k_queries": float(cost_per_1k), + # "Est_Throughput_QPS": float(est_qps), + "True_Throughput_QPS": float(true_qps), } From 249909a8454fab754aa0b72901542773f9074c5d Mon Sep 17 00:00:00 2001 From: vedjaw Date: Thu, 2 Oct 2025 17:59:17 +0530 Subject: [PATCH 18/23] lint error fix --- inference/evaluate_nanobeir.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inference/evaluate_nanobeir.py b/inference/evaluate_nanobeir.py index 018d683..0a1ac72 100644 --- a/inference/evaluate_nanobeir.py +++ b/inference/evaluate_nanobeir.py @@ -401,7 +401,7 @@ def _pick_ndcg10(d): lat = time_calls( lambda batch: beir_model.encode_queries(batch, batch_size=1), probe, warmup=3 ) - #lat_p95 = float(lat) + # lat_p95 = float(lat) # est_qps = 1000.0 / lat_p95 if lat_p95 > 0 else 0.0 all_query_texts = list(queries.values()) true_qps = measure_throughput(beir_model, all_query_texts) From ea6dc1b6fd1fa656476f4a519bc910ac2188c361 Mon Sep 17 00:00:00 2001 From: vedjaw Date: Thu, 2 Oct 2025 18:44:45 +0530 Subject: [PATCH 19/23] changed model name --- inference/nanobeir_config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inference/nanobeir_config.yaml b/inference/nanobeir_config.yaml index a364e64..76ea8b2 100644 --- 
a/inference/nanobeir_config.yaml +++ b/inference/nanobeir_config.yaml @@ -17,7 +17,7 @@ models: vllm_api_encoder: # This should match the model ID you start the vLLM server with - model_name: Qwen/qwen3-embedding-0.6b + model_name: Qwen/Qwen3-embedding-0.6B # vLLM's default OpenAI-compatible API endpoint base_url: http://172.24.16.155:8000/v1 max_len: 512 # Truncate to match the model's context \ No newline at end of file From eff4408266fb35116b4304eea4845f6aae8f0681 Mon Sep 17 00:00:00 2001 From: vedjaw Date: Thu, 2 Oct 2025 22:12:22 +0530 Subject: [PATCH 20/23] fixed model name --- inference/nanobeir_config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inference/nanobeir_config.yaml b/inference/nanobeir_config.yaml index 76ea8b2..d93a204 100644 --- a/inference/nanobeir_config.yaml +++ b/inference/nanobeir_config.yaml @@ -17,7 +17,7 @@ models: vllm_api_encoder: # This should match the model ID you start the vLLM server with - model_name: Qwen/Qwen3-embedding-0.6B + model_name: "Qwen/Qwen3-Embedding-0.6b" # vLLM's default OpenAI-compatible API endpoint base_url: http://172.24.16.155:8000/v1 max_len: 512 # Truncate to match the model's context \ No newline at end of file From af6e1c55aa92829b079c5db8f049083eecbe6178 Mon Sep 17 00:00:00 2001 From: vedjaw Date: Fri, 3 Oct 2025 15:33:34 +0530 Subject: [PATCH 21/23] implementing two new models for benchmarking and used two new datasets for comparison. --- inference/evaluate_nanobeir.py | 58 ++++++++++++++++++++++++++++++++++ inference/nanobeir_config.yaml | 24 +++++++------- 2 files changed, 69 insertions(+), 13 deletions(-) diff --git a/inference/evaluate_nanobeir.py b/inference/evaluate_nanobeir.py index 0a1ac72..498aa6e 100644 --- a/inference/evaluate_nanobeir.py +++ b/inference/evaluate_nanobeir.py @@ -219,6 +219,56 @@ def encode_corpus(self, corpus, batch_size=32, **kwargs): texts, convert_to_numpy=True, batch_size=batch_size, show_progress_bar=True ) +# Add this entire class to your evaluate_nanobeir.py script + +class QwenVLModel: + """ + Custom model loader for a Qwen Vision-Language model to extract text embeddings. + This uses the main transformers library instead of sentence-transformers. + """ + def __init__(self, hf_id: str): + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + self.tokenizer = AutoTokenizer.from_pretrained(hf_id, trust_remote_code=True) + self.model = AutoModel.from_pretrained( + hf_id, + trust_remote_code=True, + torch_dtype=torch.bfloat16 + ).to(self.device) + self.model.eval() + print(f"Loaded {hf_id} on {self.device} with dtype {self.model.dtype}") + + def _embed(self, texts: list[str]) -> list[list[float]]: + # This model doesn't use a simple tokenizer, it uses a processor + # that handles both text and images. We are only providing text. 
+ inputs = self.tokenizer(texts, return_tensors='pt', padding=True, truncation=True) + inputs = {key: val.to(self.device) for key, val in inputs.items()} + + with torch.no_grad(): + # Get the model's hidden states + outputs = self.model(**inputs, output_hidden_states=True) + + # Use the last hidden state + last_hidden_state = outputs.hidden_states[-1] + + # Perform mean pooling to get a single vector per text + # (This is a standard way to get sentence embeddings) + mask = inputs['attention_mask'].unsqueeze(-1).expand(last_hidden_state.size()).float() + sum_embeddings = torch.sum(last_hidden_state * mask, 1) + sum_mask = torch.clamp(mask.sum(1), min=1e-9) + pooled_embeddings = sum_embeddings / sum_mask + + # Normalize the embeddings + normalized_embeddings = torch.nn.functional.normalize(pooled_embeddings, p=2, dim=1) + + return normalized_embeddings.cpu().tolist() + + def encode_queries(self, queries: list[str], batch_size: int = 32, **kwargs): + return self._embed(queries) + + def encode_corpus(self, corpus: list[dict], batch_size: int = 32, **kwargs): + # The BEIR library passes the corpus as a list of dictionaries + texts = [doc.get("title", "") + " " + doc.get("text", "") for doc in corpus] + return self._embed(texts) class QwenHFEncoder: """ @@ -521,6 +571,14 @@ def main(): ) all_out[ds]["baseline"] = out + if "colqwen_baseline" in models_cfg: + m_id = models_cfg["colqwen_baseline"]["hf_id"] + print(f"[colqwen_baseline] {m_id}") + out = evaluate_model( + QwenVLModel(m_id), corpus, queries, qrels, cost.get("colqwen_baseline", 0.0) + ) + all_out[ds]["colqwen_baseline"] = out + if "vllm_api_encoder" in models_cfg: m = models_cfg["vllm_api_encoder"] base_url = m.get("base_url", "http://localhost:8000/v1") diff --git a/inference/nanobeir_config.yaml b/inference/nanobeir_config.yaml index d93a204..d6f7ae1 100644 --- a/inference/nanobeir_config.yaml +++ b/inference/nanobeir_config.yaml @@ -2,22 +2,20 @@ output_dir: ./outputs leaderboard_json: ./outputs/nanobeir_leaderboard.json #datasets: ["scifact"] -datasets: ["zeta-alpha-ai/NanoDBPedia"] +datasets: ["zeta-alpha-ai/NanoDBPedia","zeta-alpha-ai/NanoClimateFEVER","zeta-alpha-ai/NanoFiQA2018"] #datasets: ["scifact-nano"] cost_per_1k: #qwen_embedding: 0.10 # <- set this to whatever your team uses vllm_api_encoder: 0.05 models: - # qwen_embedding: - # hf_id: Qwen/qwen3-embedding-0.6b - # pooling: mean # mean | cls | last_token (we’ll implement mean) - # normalize: true # L2-normalize embeddings - # max_len: 512 # truncate to fit model context - - vllm_api_encoder: - # This should match the model ID you start the vLLM server with - model_name: "Qwen/Qwen3-Embedding-0.6b" - # vLLM's default OpenAI-compatible API endpoint - base_url: http://172.24.16.155:8000/v1 - max_len: 512 # Truncate to match the model's context \ No newline at end of file + # baseline: + # hf_id: "BAAI/bge-base-en-v1.5" + + colqwen_baseline: + hf_id: "vidore/colqwen2-v0.1" + + # vllm_api_encoder: + # model_name: Qwen/Qwen3-Embedding-0.6b + # base_url: http://172.24.16.155:8000/v1 + # max_len: 512 \ No newline at end of file From 12268c06281a9f75c97eb0edffe2183ea0e2e294 Mon Sep 17 00:00:00 2001 From: vedjaw Date: Fri, 3 Oct 2025 15:35:28 +0530 Subject: [PATCH 22/23] changes formatting --- inference/nanobeir_config.yaml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/inference/nanobeir_config.yaml b/inference/nanobeir_config.yaml index d6f7ae1..ae8e1eb 100644 --- a/inference/nanobeir_config.yaml +++ b/inference/nanobeir_config.yaml @@ -2,7 +2,10 @@ 
output_dir: ./outputs leaderboard_json: ./outputs/nanobeir_leaderboard.json #datasets: ["scifact"] -datasets: ["zeta-alpha-ai/NanoDBPedia","zeta-alpha-ai/NanoClimateFEVER","zeta-alpha-ai/NanoFiQA2018"] +datasets: + - "zeta-alpha-ai/NanoDBPedia" + - "zeta-alpha-ai/NanoClimateFEVER" + - "zeta-alpha-ai/NanoFiQA2018" #datasets: ["scifact-nano"] cost_per_1k: #qwen_embedding: 0.10 # <- set this to whatever your team uses From 1ef8bcfc49b00ab50595750b286e70d5a5248eb8 Mon Sep 17 00:00:00 2001 From: vedjaw Date: Fri, 3 Oct 2025 15:37:13 +0530 Subject: [PATCH 23/23] lint error --- inference/evaluate_nanobeir.py | 35 ++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/inference/evaluate_nanobeir.py b/inference/evaluate_nanobeir.py index 498aa6e..7abdf03 100644 --- a/inference/evaluate_nanobeir.py +++ b/inference/evaluate_nanobeir.py @@ -219,20 +219,21 @@ def encode_corpus(self, corpus, batch_size=32, **kwargs): texts, convert_to_numpy=True, batch_size=batch_size, show_progress_bar=True ) + # Add this entire class to your evaluate_nanobeir.py script + class QwenVLModel: """ Custom model loader for a Qwen Vision-Language model to extract text embeddings. This uses the main transformers library instead of sentence-transformers. """ + def __init__(self, hf_id: str): self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") self.tokenizer = AutoTokenizer.from_pretrained(hf_id, trust_remote_code=True) self.model = AutoModel.from_pretrained( - hf_id, - trust_remote_code=True, - torch_dtype=torch.bfloat16 + hf_id, trust_remote_code=True, torch_dtype=torch.bfloat16 ).to(self.device) self.model.eval() print(f"Loaded {hf_id} on {self.device} with dtype {self.model.dtype}") @@ -240,26 +241,35 @@ def __init__(self, hf_id: str): def _embed(self, texts: list[str]) -> list[list[float]]: # This model doesn't use a simple tokenizer, it uses a processor # that handles both text and images. We are only providing text. - inputs = self.tokenizer(texts, return_tensors='pt', padding=True, truncation=True) + inputs = self.tokenizer( + texts, return_tensors="pt", padding=True, truncation=True + ) inputs = {key: val.to(self.device) for key, val in inputs.items()} with torch.no_grad(): # Get the model's hidden states outputs = self.model(**inputs, output_hidden_states=True) - + # Use the last hidden state last_hidden_state = outputs.hidden_states[-1] - + # Perform mean pooling to get a single vector per text # (This is a standard way to get sentence embeddings) - mask = inputs['attention_mask'].unsqueeze(-1).expand(last_hidden_state.size()).float() + mask = ( + inputs["attention_mask"] + .unsqueeze(-1) + .expand(last_hidden_state.size()) + .float() + ) sum_embeddings = torch.sum(last_hidden_state * mask, 1) sum_mask = torch.clamp(mask.sum(1), min=1e-9) pooled_embeddings = sum_embeddings / sum_mask # Normalize the embeddings - normalized_embeddings = torch.nn.functional.normalize(pooled_embeddings, p=2, dim=1) - + normalized_embeddings = torch.nn.functional.normalize( + pooled_embeddings, p=2, dim=1 + ) + return normalized_embeddings.cpu().tolist() def encode_queries(self, queries: list[str], batch_size: int = 32, **kwargs): @@ -270,6 +280,7 @@ def encode_corpus(self, corpus: list[dict], batch_size: int = 32, **kwargs): texts = [doc.get("title", "") + " " + doc.get("text", "") for doc in corpus] return self._embed(texts) + class QwenHFEncoder: """ HF encoder for Qwen/qwen3-embedding-0.6b (or compatible). 
@@ -575,7 +586,11 @@ def main(): m_id = models_cfg["colqwen_baseline"]["hf_id"] print(f"[colqwen_baseline] {m_id}") out = evaluate_model( - QwenVLModel(m_id), corpus, queries, qrels, cost.get("colqwen_baseline", 0.0) + QwenVLModel(m_id), + corpus, + queries, + qrels, + cost.get("colqwen_baseline", 0.0), ) all_out[ds]["colqwen_baseline"] = out
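
--
A closing note on two things these patches leave open. First, the
QwenVLModel added in PATCH 21 mean-pools a text-only forward pass of
vidore/colqwen2-v0.1; upstream, that checkpoint is trained for
ColBERT-style multi-vector late interaction through its processor, so its
scores here are best read as a rough single-vector baseline rather than
the model's intended retrieval quality. Second, the encode_corpus comment
in PATCH 13 suggests client-side chunking for very large corpora. Below is
a minimal sketch of that chunking, assuming the same OpenAI-compatible
/v1/embeddings endpoint; the batch size of 64, the localhost URL, and the
helper name are illustrative, not taken from the patches:

    from openai import OpenAI

    # Assumed endpoint; vLLM ignores the API key but the client requires one.
    client = OpenAI(base_url="http://localhost:8000/v1", api_key="unused")

    def embed_in_chunks(
        texts: list[str], model: str, batch_size: int = 64
    ) -> list[list[float]]:
        """Embed texts in fixed-size chunks so a single oversized request
        cannot fail or stall the whole corpus."""
        vectors: list[list[float]] = []
        for i in range(0, len(texts), batch_size):
            chunk = texts[i : i + batch_size]
            resp = client.embeddings.create(model=model, input=chunk)
            # resp.data preserves input order; collect the raw vectors.
            vectors.extend(d.embedding for d in resp.data)
        return vectors

Dropping such a helper into VLLMAPIEncoder._call_api would keep retry
handling per chunk instead of per corpus, which is the failure mode the
original comment warns about.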