Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 4 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -214,11 +214,10 @@ data/datasets/
│ ├── documents.jsonl # Processed documents
│ ├── qrels.jsonl # Relevance judgments
│ └── {retrieval_method}[+include_relevant_docs]/ # suffix present when include_relevant_docs is enabled
│ └── {merged|unmerged}/
│ ├── ze_results.jsonl # Initial retrieval results
│ ├── embeddings_cache.db # Embedding cache
│ └── {reranker}/
│ └── ze_scores.jsonl # Reranked results
│ ├── ze_results.jsonl # Initial retrieval results
│ ├── embeddings_cache.db # Embedding cache
│ └── {reranker}/
│ └── ze_scores.jsonl # Reranked results
```

## Performance Tips
Expand Down
70 changes: 31 additions & 39 deletions evals/common.py
Original file line number Diff line number Diff line change
@@ -1,31 +1,9 @@
from typing import Any, Literal
from typing import Any

from pydantic import AliasChoices, BaseModel, Field, computed_field

from evals.utils import ROOT

# Closed set of string identifiers accepted as a retrieval method name.
RetrievalMethod = Literal[
"qwen3_4b", "qwen3_0.6b", "voyageai", "openai_small", "bm25", "hybrid"
]
# Directory label chosen from `include_relevant_docs` by the path helpers
# that annotate `merge_status`: "merged" when it is true, "unmerged" otherwise.
MergeStatus = Literal["merged", "unmerged"]
# Closed set of string identifiers accepted as a reranker name.
RerankerName = Literal[
"cohere",
"salesforce",
"zeroentropy-large",
"zeroentropy-small",
"zeroentropy-small-modal",
"zeroentropy-large-modal",
"zeroentropy-baseten",
"mixedbread",
"jina",
"qwen",
"openai-large-embedding",
"gpt-4o-mini",
"gpt-4.1-mini",
"gpt-5-mini",
"gpt-5-nano",
]


class ZEDataset(BaseModel):
id: str
Expand Down Expand Up @@ -57,38 +35,52 @@ def documents_path(self) -> str:
def qrels_path(self) -> str:
return self.file_path("qrels.jsonl")

def retrieval_method_path(
    self,
    retrieval_method: str,
    include_relevant_docs: bool,
    path: str,
) -> str:
    """Return the dataset path of `path` inside a retrieval-method directory.

    Args:
        retrieval_method: Name of the retrieval method; used as the
            directory name.
        include_relevant_docs: When true, ``+include_relevant_docs`` is
            appended to the directory name, so the two settings resolve to
            different directories.
        path: Path relative to the retrieval-method directory.

    Returns:
        The result of ``self.file_path("<directory>/<path>")``.
    """
    # Build the directory name in a local instead of rebinding the parameter.
    directory = retrieval_method
    if include_relevant_docs:
        directory = f"{retrieval_method}+include_relevant_docs"
    return self.file_path(f"{directory}/{path}")

def ze_results_path(
    self,
    retrieval_method: str,
    include_relevant_docs: bool,
) -> str:
    """Return the path of ``ze_results.jsonl`` for a retrieval method.

    Args:
        retrieval_method: Name of the retrieval method.
        include_relevant_docs: Forwarded to ``retrieval_method_path``,
            which picks the directory for this setting.

    Returns:
        The path produced by ``self.retrieval_method_path`` for the file
        ``ze_results.jsonl``.
    """
    return self.retrieval_method_path(
        retrieval_method, include_relevant_docs, "ze_results.jsonl"
    )

def embeddings_cache_path(
    self,
    retrieval_method: str,
    include_relevant_docs: bool,
) -> str:
    """Return the path of ``embeddings_cache.db`` for a retrieval method.

    Args:
        retrieval_method: Name of the retrieval method.
        include_relevant_docs: Forwarded to ``retrieval_method_path``,
            which picks the directory for this setting.

    Returns:
        The path produced by ``self.retrieval_method_path`` for the file
        ``embeddings_cache.db``.
    """
    return self.retrieval_method_path(
        retrieval_method, include_relevant_docs, "embeddings_cache.db"
    )

def reranker_cache_path(
    self,
    retrieval_method: str,
    include_relevant_docs: bool,
    reranker: str,
) -> str:
    """Return the path of a reranker's ``reranker_cache.db``.

    Args:
        retrieval_method: Name of the retrieval method.
        include_relevant_docs: Forwarded to ``retrieval_method_path``,
            which picks the directory for this setting.
        reranker: Name of the reranker; used as a sub-directory of the
            retrieval-method directory.

    Returns:
        The path produced by ``self.retrieval_method_path`` for
        ``"<reranker>/reranker_cache.db"``.
    """
    return self.retrieval_method_path(
        retrieval_method, include_relevant_docs, f"{reranker}/reranker_cache.db"
    )

def ze_scores_path(
    self,
    retrieval_method: str,
    include_relevant_docs: bool,
    reranker: str,
) -> str:
    """Return the path of a reranker's ``ze_scores.jsonl``.

    Args:
        retrieval_method: Name of the retrieval method.
        include_relevant_docs: Forwarded to ``retrieval_method_path``,
            which picks the directory for this setting.
        reranker: Name of the reranker; used as a sub-directory of the
            retrieval-method directory.

    Returns:
        The path produced by ``self.retrieval_method_path`` for
        ``"<reranker>/ze_scores.jsonl"``.
    """
    return self.retrieval_method_path(
        retrieval_method, include_relevant_docs, f"{reranker}/ze_scores.jsonl"
    )


Expand Down
3 changes: 2 additions & 1 deletion evals/run_embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,13 @@
ai_embedding,
tiktoken_truncate_by_num_tokens,
)
from evals.common import Document, QRel, Query, RetrievalMethod, ZEDataset, ZEResults
from evals.common import Document, QRel, Query, ZEDataset, ZEResults
from evals.ingestors.common import BaseIngestor
from evals.types import (
DEFAULT_INCLUDE_RELEVANT_DOCS,
DEFAULT_INGESTORS,
DEFAULT_RETRIEVAL_METHOD,
RetrievalMethod,
)
from evals.utils import ROOT

Expand Down
4 changes: 2 additions & 2 deletions evals/run_ndcg.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,6 @@

from evals.common import (
QueryScores,
RerankerName,
RetrievalMethod,
ZEDataset,
ZEResults,
)
Expand All @@ -15,6 +13,8 @@
DEFAULT_INGESTORS,
DEFAULT_RERANKERS,
DEFAULT_RETRIEVAL_METHOD,
RerankerName,
RetrievalMethod,
)
from evals.utils import argsort, avg

Expand Down
3 changes: 2 additions & 1 deletion evals/run_pipeline.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import asyncio
from typing import Literal

from evals.common import RerankerName, RetrievalMethod
from evals.ingestors.common import BaseIngestor
from evals.run_embeddings import run_embeddings
from evals.run_ingestors import run_ingestors
Expand All @@ -13,6 +12,8 @@
DEFAULT_MAX_QUERIES,
DEFAULT_RERANKERS,
DEFAULT_RETRIEVAL_METHOD,
RerankerName,
RetrievalMethod,
)

INGESTORS: list[BaseIngestor] = DEFAULT_INGESTORS
Expand Down
4 changes: 2 additions & 2 deletions evals/run_recall.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,6 @@
from evals.common import (
QRel,
QueryScores,
RerankerName,
RetrievalMethod,
ZEDataset,
ZEResults,
)
Expand All @@ -15,6 +13,8 @@
DEFAULT_INGESTORS,
DEFAULT_RERANKERS,
DEFAULT_RETRIEVAL_METHOD,
RerankerName,
RetrievalMethod,
)
from evals.utils import argsort, avg

Expand Down
4 changes: 2 additions & 2 deletions evals/run_rerankers.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,6 @@
from evals.common import (
DocumentScores,
QueryScores,
RerankerName,
RetrievalMethod,
ZEDataset,
ZEResults,
)
Expand All @@ -30,6 +28,8 @@
DEFAULT_INGESTORS,
DEFAULT_RERANKERS,
DEFAULT_RETRIEVAL_METHOD,
RerankerName,
RetrievalMethod,
)
from evals.utils import flatten, read_num_lines_pbar

Expand Down
37 changes: 34 additions & 3 deletions evals/types.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from typing import Literal

from evals.ai import AIEmbeddingModel, AIModel, AIRerankModel
from evals.ai_rerank import AIModelAsReranker
from evals.common import RerankerName, RetrievalMethod
from evals.ingestors.bioasq import BioasqIngestor
from evals.ingestors.common import BaseIngestor
from evals.ingestors.cosqa import CosqaIngestor
Expand All @@ -23,8 +24,36 @@
from evals.ingestors.quora_swedish import QuoraSwedishIngestor
from evals.ingestors.stackoverflowqa import StackoverflowqaIngestor

# Closed set of string identifiers accepted as a retrieval method name;
# DEFAULT_RETRIEVAL_METHOD below is annotated with this type.
RetrievalMethod = Literal[
"qwen3_4b",
"qwen3_0.6b",
"voyageai",
"openai_small",
"bm25",
"hybrid",
]

# Closed set of string identifiers accepted as a reranker name; used as the
# key type of ALL_RERANKERS and the element type of DEFAULT_RERANKERS.
RerankerName = Literal[
"cohere",
"salesforce",
"zeroentropy-large",
"zeroentropy-small",
"zeroentropy-small-modal",
"zeroentropy-large-modal",
"zeroentropy-baseten",
"mixedbread",
"jina",
"qwen",
"openai-large-embedding",
"gpt-4o-mini",
"gpt-4.1-mini",
"gpt-5-mini",
"gpt-5-nano",
]

ALL_RERANKERS: dict[
RerankerName, AIRerankModel | AIEmbeddingModel | AIModelAsReranker
RerankerName,
AIRerankModel | AIEmbeddingModel | AIModelAsReranker,
] = {
"cohere": AIRerankModel(company="cohere", model="rerank-v3.5"),
"salesforce": AIRerankModel(company="together", model="Salesforce/Llama-Rank-V1"),
Expand Down Expand Up @@ -246,10 +275,12 @@
),
]
ALL_INGESTORS: list[BaseIngestor] = MTEB_INGESTORS + NEW_INGESTORS + ORIGINAL_INGESTORS

# Defaults
# Ingestors used when none are specified.
DEFAULT_INGESTORS: list[BaseIngestor] = ORIGINAL_INGESTORS
DEFAULT_MAX_QUERIES: int = 100
DEFAULT_RETRIEVAL_METHOD: RetrievalMethod = "openai_small"
# Single assignment: the value previously assigned first (True) was
# immediately overwritten, so the effective default is unchanged (False).
DEFAULT_INCLUDE_RELEVANT_DOCS: bool = False
DEFAULT_RERANKERS: list[RerankerName] = [
# "cohere",
# "gpt-4o-mini",
Expand Down