28 changes: 28 additions & 0 deletions .github/workflows/lint.yml
@@ -0,0 +1,28 @@
name: Lint

on:
  push:
    branches: ["master"]
  pull_request:
    branches: ["master"]

concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

jobs:
  build:
    runs-on: ubuntu-24.04
    name: Lint
    steps:
      - uses: actions/checkout@v3
      - name: Install the latest version of uv
        uses: astral-sh/setup-uv@v6
        with:
          enable-cache: true
      - name: Install python dependencies with uv
        run: |
          uv sync
      - name: Run lint.sh
        run: |
          ./lint.sh --check
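Note on the workflow above: the concurrency group is keyed on the workflow name plus the PR number (falling back to github.ref for direct pushes to master), and cancel-in-progress: true aborts any still-running lint job in the same group when a new commit arrives, so only the latest push gets linted.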
1 change: 1 addition & 0 deletions evals/ai.py
@@ -795,6 +795,7 @@ def get_embeddings_cache_key(
     key = f"{model.company}||||{model.model}||||{embedding_type.name}||||{hashlib.md5(text.encode()).hexdigest()}"
     return key
 
+
 def cosine_similarity(vec1: AIEmbedding, vec2: AIEmbedding) -> float:
     return np.dot(vec1, vec2)
 
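A side note on cosine_similarity above: a bare np.dot equals true cosine similarity only when both vectors are unit-normalized, which the code evidently assumes of AIEmbedding values. A minimal sketch of that equivalence in plain NumPy (the names below are illustrative, not from the repo):

import numpy as np

def full_cosine_similarity(vec1: np.ndarray, vec2: np.ndarray) -> float:
    # Full formula: dot product divided by the product of the magnitudes.
    return float(np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2)))

v1 = np.array([3.0, 4.0])
v2 = np.array([4.0, 3.0])

# After unit-normalization, the bare dot product matches the full formula (0.96 here).
u1 = v1 / np.linalg.norm(v1)
u2 = v2 / np.linalg.norm(v2)
assert abs(float(np.dot(u1, u2)) - full_cosine_similarity(v1, v2)) < 1e-12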
5 changes: 4 additions & 1 deletion evals/ai_rerank.py
@@ -1,19 +1,22 @@
 from typing import Literal
 
+import diskcache as dc  # pyright: ignore[reportMissingTypeStubs]
 from pydantic import BaseModel
-import diskcache as dc
 
 from evals.ai import AIMessage, AIModel, ai_call
 from evals.utils import clamp
 
+
 class AIModelAsReranker(BaseModel):
     model: AIModel
     rerank_type: Literal["listwise"]
 
+
 class RerankResult(BaseModel):
     index: int
     score: float
 
+
 class RerankOutput(BaseModel):
     results: list[RerankResult]
33 changes: 17 additions & 16 deletions evals/ingestors/enronqa.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from evals.common import Document, QRel, Query
from evals.ingestors.common import BaseIngestor, clean_dataset


class EnronQaIngestor(BaseIngestor):
@override
def dataset_id(self) -> str:
@@ -21,36 +22,36 @@ def ingest(self) -> tuple[list[Query], list[Document], list[QRel]]:
         corpus_dataset = cast(Any, load_dataset(dataset_name, "default"))["train"]
         queries_dataset = cast(Any, load_dataset(queries_name, "default"))["train"]
 
-
         # Create documents
         documents: list[Document] = []
         message_id_to_doc_id: dict[str, int] = {}
         for index, document in enumerate(tqdm(corpus_dataset, desc="Documents")):
             doc_id = index
-            email_content = \
-                f"Subject: {document["subject"]}\n" + \
-                f"Date: {document["date"].strftime("%Y-%m-%d %H:%M:%S")}\n\n" + \
-                f"From: {document["from"]}\n" + \
-                f"To: {', '.join(document["to"])}\n" + \
-                f"Cc: {', '.join(document["cc"])}\n" + \
-                f"Bcc: {', '.join(document["bcc"])}\n" + \
-                f"{document["body"]}"
+            email_content = (
+                f"Subject: {document['subject']}\n"
+                + f"Date: {document['date'].strftime('%Y-%m-%d %H:%M:%S')}\n\n"
+                + f"From: {document['from']}\n"
+                + f"To: {', '.join(document['to'])}\n"
+                + f"Cc: {', '.join(document['cc'])}\n"
+                + f"Bcc: {', '.join(document['bcc'])}\n"
+                + f"{document['body']}"
+            )
 
             message_id_to_doc_id[document["message_id"]] = doc_id
             documents.append(
-                Document(
-                    id=str(doc_id),
-                    content=email_content,
-                    metadata={}
-                )
+                Document(id=str(doc_id), content=email_content, metadata={})
             )
 
         # Create QRel objects
         qrels: list[QRel] = []
         valid_query_ids: set[str] = set()
         for index, question in enumerate(tqdm(queries_dataset, "QRels")):
             related_message_ids = question["message_ids"]
-            related_message_indices = [message_id_to_doc_id[mid] for mid in related_message_ids if mid in message_id_to_doc_id]
+            related_message_indices = [
+                message_id_to_doc_id[mid]
+                for mid in related_message_ids
+                if mid in message_id_to_doc_id
+            ]
             if len(related_message_indices) > 0:
                 for doc_index in related_message_indices:
                     qrels.append(
@@ -75,4 +76,4 @@ def ingest(self) -> tuple[list[Query], list[Document], list[QRel]]:
                 )
             )
 
-        return clean_dataset(queries, documents, qrels)
\ No newline at end of file
+        return clean_dataset(queries, documents, qrels)
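One detail worth flagging in the enronqa.py hunk: the removed block reused double quotes inside double-quoted f-strings (f"Subject: {document["subject"]}\n"), which only parses on Python 3.12+ under PEP 701. The replacement switches to single quotes inside the f-strings, which is portable across interpreter versions:

document = {"subject": "Q3 forecast"}

# SyntaxError before Python 3.12 (PEP 701 lifted the quote-reuse restriction):
# line = f"Subject: {document["subject"]}"

# Portable form, as used in the reformatted code:
line = f"Subject: {document['subject']}"
print(line)  # Subject: Q3 forecast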
18 changes: 9 additions & 9 deletions evals/run_rerankers.py
@@ -2,8 +2,7 @@
 from contextlib import ExitStack
 from typing import TextIO
 
-import diskcache as dc
-
+import diskcache as dc  # pyright: ignore[reportMissingTypeStubs]
 from tqdm import tqdm
 
 from evals.ai import (
@@ -13,8 +12,8 @@
     tiktoken_truncate_by_num_tokens,
 )
 from evals.ai_rerank import (
-    ai_rerank_by_ai_model,
     AIModelAsReranker,
+    ai_rerank_by_ai_model,
 )
 from evals.common import (
     DocumentScores,
@@ -109,7 +108,7 @@ async def rerank_dataset(
             eviction_policy="none",
         )
         if USE_RERANKER_CACHE
-            else None
+        else None
     )
 
     num_lines = read_num_lines_pbar(ze_results_path, display_name=dataset.id)
@@ -143,23 +142,24 @@ async def wrapped_process_line(line: str) -> None:
             for document in ze_results.documents
         ]
         ground_truth_exists = any(
-            document.scores.get("human", 0) > 0
-            for document in ze_results.documents
+            document.scores.get("human", 0) > 0 for document in ze_results.documents
         )
         if ground_truth_exists:
             all_results = await asyncio.gather(
                 *[
                     process_query(
-                        ALL_RERANKERS[reranker], query_text, document_texts, reranker_caches[reranker]
+                        ALL_RERANKERS[reranker],
+                        query_text,
+                        document_texts,
+                        reranker_caches[reranker],
                     )
                     for reranker in rerankers
                 ]
             )
         else:
             # NOTE: Skip reranker calls when there's no ground truth in the top
             all_results = [
-                [-1 for _document_text in document_texts]
-                for _reranker in rerankers
+                [-1 for _document_text in document_texts] for _reranker in rerankers
             ]
         for reranker, results in zip(rerankers, all_results, strict=False):
             reranker_scores: QueryScores = QueryScores(
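The rerank_dataset hunk above builds a diskcache.Cache with eviction_policy="none" only when USE_RERANKER_CACHE is set, and otherwise falls back to None. A minimal sketch of that gating pattern, with a hypothetical cache key and a placeholder where the real reranker call would go:

import hashlib

import diskcache as dc  # pyright: ignore[reportMissingTypeStubs]

USE_RERANKER_CACHE = True

# eviction_policy="none" disables eviction entirely: entries persist as the cache grows.
cache = dc.Cache("./reranker_cache", eviction_policy="none") if USE_RERANKER_CACHE else None

def rerank_scores_cached(query: str, documents: list[str]) -> list[float]:
    key = hashlib.md5((query + "||" + "||".join(documents)).encode()).hexdigest()
    if cache is not None and key in cache:
        return cache[key]  # cache hit: skip the reranker call entirely
    scores = [0.0] * len(documents)  # placeholder for the real reranker call
    if cache is not None:
        cache[key] = scores
    return scores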
14 changes: 7 additions & 7 deletions evals/types.py
@@ -5,7 +5,7 @@
 from evals.ingestors.common import BaseIngestor
 from evals.ingestors.cosqa import CosqaIngestor
 from evals.ingestors.curev1 import CureV1Ingestor
-from evals.ingestors.enronqa import EnronQa
+from evals.ingestors.enronqa import EnronQaIngestor
 from evals.ingestors.financebench import FinancebenchIngestor
 from evals.ingestors.finqabench import FinqabenchIngestor
 from evals.ingestors.fiqa import FiqaIngestor
@@ -144,7 +144,7 @@
     LeetcodeMultiLanguageIngestor(language="java"),
     LeetcodeMultiLanguageIngestor(language="javascript"),
     LeetcodeMultiLanguageIngestor(language="c++"),
-    EnronQa(),
+    EnronQaIngestor(),
     QuoraIngestor(),
     QuoraSwedishIngestor(),
     MeetingIngestor(),
@@ -251,10 +251,10 @@
 DEFAULT_RETRIEVAL_METHOD: RetrievalMethod = "openai_small"
 DEFAULT_INCLUDE_RELEVANT_DOCS: bool = True
 DEFAULT_RERANKERS: list[RerankerName] = [
-    "cohere",
-    "gpt-4o-mini",
-    "gpt-4.1-mini",
-    "gpt-5-mini",
-    "gpt-5-nano",
+    # "cohere",
+    # "gpt-4o-mini",
+    # "gpt-4.1-mini",
+    # "gpt-5-mini",
+    # "gpt-5-nano",
     "zeroentropy-large",
 ]