diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 0000000..85d1497 --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,28 @@ +name: Lint + +on: + push: + branches: ["master"] + pull_request: + branches: ["master"] + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +jobs: + build: + runs-on: ubuntu-24.04 + name: Lint + steps: + - uses: actions/checkout@v3 + - name: Install the latest version of uv + uses: astral-sh/setup-uv@v6 + with: + enable-cache: true + - name: Install python dependencies with uv + run: | + uv sync + - name: Run lint.sh + run: | + ./lint.sh --check diff --git a/evals/ai.py b/evals/ai.py index 64e803e..4b2863e 100644 --- a/evals/ai.py +++ b/evals/ai.py @@ -795,6 +795,7 @@ def get_embeddings_cache_key( key = f"{model.company}||||{model.model}||||{embedding_type.name}||||{hashlib.md5(text.encode()).hexdigest()}" return key + def cosine_similarity(vec1: AIEmbedding, vec2: AIEmbedding) -> float: return np.dot(vec1, vec2) diff --git a/evals/ai_rerank.py b/evals/ai_rerank.py index 2624a70..60c29b1 100644 --- a/evals/ai_rerank.py +++ b/evals/ai_rerank.py @@ -1,19 +1,22 @@ from typing import Literal +import diskcache as dc # pyright: ignore[reportMissingTypeStubs] from pydantic import BaseModel -import diskcache as dc from evals.ai import AIMessage, AIModel, ai_call from evals.utils import clamp + class AIModelAsReranker(BaseModel): model: AIModel rerank_type: Literal["listwise"] + class RerankResult(BaseModel): index: int score: float + class RerankOutput(BaseModel): results: list[RerankResult] diff --git a/evals/ingestors/enronqa.py b/evals/ingestors/enronqa.py index 3887c05..85533b8 100644 --- a/evals/ingestors/enronqa.py +++ b/evals/ingestors/enronqa.py @@ -8,6 +8,7 @@ from evals.common import Document, QRel, Query from evals.ingestors.common import BaseIngestor, clean_dataset + class EnronQaIngestor(BaseIngestor): @override def dataset_id(self) -> str: @@ -21,28 +22,24 @@ def ingest(self) -> tuple[list[Query], list[Document], list[QRel]]: corpus_dataset = cast(Any, load_dataset(dataset_name, "default"))["train"] queries_dataset = cast(Any, load_dataset(queries_name, "default"))["train"] - # Create documents documents: list[Document] = [] message_id_to_doc_id: dict[str, int] = {} for index, document in enumerate(tqdm(corpus_dataset, desc="Documents")): doc_id = index - email_content = \ - f"Subject: {document["subject"]}\n" + \ - f"Date: {document["date"].strftime("%Y-%m-%d %H:%M:%S")}\n\n" + \ - f"From: {document["from"]}\n" + \ - f"To: {', '.join(document["to"])}\n" + \ - f"Cc: {', '.join(document["cc"])}\n" + \ - f"Bcc: {', '.join(document["bcc"])}\n" + \ - f"{document["body"]}" + email_content = ( + f"Subject: {document['subject']}\n" + + f"Date: {document['date'].strftime('%Y-%m-%d %H:%M:%S')}\n\n" + + f"From: {document['from']}\n" + + f"To: {', '.join(document['to'])}\n" + + f"Cc: {', '.join(document['cc'])}\n" + + f"Bcc: {', '.join(document['bcc'])}\n" + + f"{document['body']}" + ) message_id_to_doc_id[document["message_id"]] = doc_id documents.append( - Document( - id=str(doc_id), - content=email_content, - metadata={} - ) + Document(id=str(doc_id), content=email_content, metadata={}) ) # Create QRel objects @@ -50,7 +47,11 @@ def ingest(self) -> tuple[list[Query], list[Document], list[QRel]]: valid_query_ids: set[str] = set() for index, question in enumerate(tqdm(queries_dataset, "QRels")): related_message_ids = question["message_ids"] - related_message_indices = [message_id_to_doc_id[mid] for mid in related_message_ids if mid in message_id_to_doc_id] + related_message_indices = [ + message_id_to_doc_id[mid] + for mid in related_message_ids + if mid in message_id_to_doc_id + ] if len(related_message_indices) > 0: for doc_index in related_message_indices: qrels.append( @@ -75,4 +76,4 @@ def ingest(self) -> tuple[list[Query], list[Document], list[QRel]]: ) ) - return clean_dataset(queries, documents, qrels) \ No newline at end of file + return clean_dataset(queries, documents, qrels) diff --git a/evals/run_rerankers.py b/evals/run_rerankers.py index c88d9f6..2779291 100644 --- a/evals/run_rerankers.py +++ b/evals/run_rerankers.py @@ -2,8 +2,7 @@ from contextlib import ExitStack from typing import TextIO -import diskcache as dc - +import diskcache as dc # pyright: ignore[reportMissingTypeStubs] from tqdm import tqdm from evals.ai import ( @@ -13,8 +12,8 @@ tiktoken_truncate_by_num_tokens, ) from evals.ai_rerank import ( - ai_rerank_by_ai_model, AIModelAsReranker, + ai_rerank_by_ai_model, ) from evals.common import ( DocumentScores, @@ -109,7 +108,7 @@ async def rerank_dataset( eviction_policy="none", ) if USE_RERANKER_CACHE - else None + else None ) num_lines = read_num_lines_pbar(ze_results_path, display_name=dataset.id) @@ -143,14 +142,16 @@ async def wrapped_process_line(line: str) -> None: for document in ze_results.documents ] ground_truth_exists = any( - document.scores.get("human", 0) > 0 - for document in ze_results.documents + document.scores.get("human", 0) > 0 for document in ze_results.documents ) if ground_truth_exists: all_results = await asyncio.gather( *[ process_query( - ALL_RERANKERS[reranker], query_text, document_texts, reranker_caches[reranker] + ALL_RERANKERS[reranker], + query_text, + document_texts, + reranker_caches[reranker], ) for reranker in rerankers ] @@ -158,8 +159,7 @@ async def wrapped_process_line(line: str) -> None: else: # NOTE: Skip reranker calls when there's no ground truth in the top all_results = [ - [-1 for _document_text in document_texts] - for _reranker in rerankers + [-1 for _document_text in document_texts] for _reranker in rerankers ] for reranker, results in zip(rerankers, all_results, strict=False): reranker_scores: QueryScores = QueryScores( diff --git a/evals/types.py b/evals/types.py index a80d79a..ef58143 100644 --- a/evals/types.py +++ b/evals/types.py @@ -5,7 +5,7 @@ from evals.ingestors.common import BaseIngestor from evals.ingestors.cosqa import CosqaIngestor from evals.ingestors.curev1 import CureV1Ingestor -from evals.ingestors.enronqa import EnronQa +from evals.ingestors.enronqa import EnronQaIngestor from evals.ingestors.financebench import FinancebenchIngestor from evals.ingestors.finqabench import FinqabenchIngestor from evals.ingestors.fiqa import FiqaIngestor @@ -144,7 +144,7 @@ LeetcodeMultiLanguageIngestor(language="java"), LeetcodeMultiLanguageIngestor(language="javascript"), LeetcodeMultiLanguageIngestor(language="c++"), - EnronQa(), + EnronQaIngestor(), QuoraIngestor(), QuoraSwedishIngestor(), MeetingIngestor(), @@ -251,10 +251,10 @@ DEFAULT_RETRIEVAL_METHOD: RetrievalMethod = "openai_small" DEFAULT_INCLUDE_RELEVANT_DOCS: bool = True DEFAULT_RERANKERS: list[RerankerName] = [ - "cohere", - "gpt-4o-mini", - "gpt-4.1-mini", - "gpt-5-mini", - "gpt-5-nano", + # "cohere", + # "gpt-4o-mini", + # "gpt-4.1-mini", + # "gpt-5-mini", + # "gpt-5-nano", "zeroentropy-large", ]