diff --git a/common/requirements.txt b/common/requirements.txt index f4d5ac6..a52f992 100644 --- a/common/requirements.txt +++ b/common/requirements.txt @@ -140,7 +140,7 @@ python-multipart==0.0.20 python-iso639==2025.2.18 python-magic==0.4.27 pyTigerDriver==1.0.15 -pyTigerGraph>=2.0.3 +pyTigerGraph>=2.0.4 pytz==2025.2 PyYAML==6.0.2 rapidfuzz==3.13.0 diff --git a/docker-compose.yml b/docker-compose.yml index 1034ead..80de0c0 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -18,6 +18,8 @@ services: USE_CYPHER: "true" volumes: - ./configs/:/code/configs + - ./graphrag/tests/regression:/code/tests/regression + - ./graphrag/tests/test_questions:/code/tests/test_questions graphrag-ecc: image: tigergraph/graphrag-ecc:latest diff --git a/graphrag/app/agent/agent_hallucination_check.py b/graphrag/app/agent/agent_hallucination_check.py index c51d2b4..d861bc7 100644 --- a/graphrag/app/agent/agent_hallucination_check.py +++ b/graphrag/app/agent/agent_hallucination_check.py @@ -8,45 +8,117 @@ logger = logging.getLogger(__name__) + class HallucinationCheckResponse(BaseModel): - score: str = Field(description="The score of the hallucination check. Either 'yes' or 'no', indicating if the answer is hallucinated.") + confidence: float = Field( + description=( + "Confidence score between 0.0 and 1.0 that the answer is hallucinated. " + "0.0 = fully grounded in the context, 1.0 = completely hallucinated. " + "Use the full range — e.g. 0.2 for mostly grounded with minor gaps, " + "0.8 for mostly unsupported claims." + ) + ) + reason: str = Field( + description=( + "A concise one-to-two sentence explanation of why you assigned this " + "confidence score, citing specific claims from the answer and context." + ) + ) + class TigerGraphAgentHallucinationCheck: def __init__(self, llm_model): self.llm = llm_model - def check_hallucination(self, generation: str, context: str) -> dict: - """Check if the answer is hallucinated based on the question and context. 
+ def check_hallucination( + self, + generation: str, + context: str, + question: str = "", + ) -> HallucinationCheckResponse: + """Score how likely the generated answer is hallucinated. + Args: - question: str: The question to generate an answer for. - context: str: The context to generate an answer from. + generation: The answer produced by the RAG pipeline. + context: The retrieved context chunks passed to the LLM. + question: The original user question (improves grounding judgement). + Returns: - dict: The answer to the question and a boolean indicating if the answer is hallucinated. + HallucinationCheckResponse with confidence (0–1) and reason. """ LogWriter.info(f"request_id={req_id_cv.get()} ENTRY check_hallucination") - - hallucination_parser = PydanticOutputParser(pydantic_object=HallucinationCheckResponse) + + hallucination_parser = PydanticOutputParser( + pydantic_object=HallucinationCheckResponse + ) + + question_block = ( + f"\nORIGINAL QUESTION:\n{question}\n" if question.strip() else "" + ) prompt = PromptTemplate( - template="""You are a grader assessing whether an answer is grounded in / supported by a set of facts. \n - Here are the facts: - \n ------- \n - {context} - \n ------- \n - Here is the answer: {generation} - Provide a binary score 'yes' or 'no' score to indicate whether the answer is grounded in / supported by a set of facts. - The score should be 'yes' if the information in the answer is found in the context. otherwise, the score should be 'no'. - Provide the binary score as a JSON with a single key 'score' and no preamble or explanation. - Format: {format_instructions}""", - input_variables=["generation", "context"], + template="""You are an expert grader evaluating whether a GraphRAG \ +pipeline answer is grounded in the retrieved context passages. + +GraphRAG answers are SYNTHESIZED — they combine, rephrase, and connect \ +information from multiple passages. Reasonable synthesis and inference are \ +acceptable. 
However, specific facts (numbers, percentages, names, dates) \ +must be correctly attributed — using a figure from context but applying it \ +to the wrong category or time period IS a hallucination. +{question_block} +RETRIEVED CONTEXT PASSAGES: +------- +{context} +------- + +ANSWER TO EVALUATE: +{generation} + +SCORING GUIDE — confidence that the answer is hallucinated (0.0 = fully \ +grounded, 1.0 = fully hallucinated): + +0.0–0.2 Fully grounded: all key claims are directly stated in or correctly \ +inferred from the context. Paraphrasing and synthesis are expected. + +0.2–0.4 Mostly grounded: core ideas are supported; minor details extend \ +beyond the context but are logically consistent and do not contradict anything. + +0.4–0.6 Partially grounded: some claims are only loosely supported OR \ +specific facts are misattributed (e.g. a correct number applied to the wrong \ +category, time period, or entity). + +0.6–0.8 Mostly hallucinated: several key claims are unsupported or specific \ +facts clearly contradict the context. + +0.8–1.0 Fully hallucinated: the answer is largely unrelated to the context \ +or fabricates significant facts. + +RULES: +- Accept synthesis and logical combination across multiple passages. +- Accept domain knowledge that logically follows from the context topic. +- Context may be incomplete. Judge consistency, not exhaustiveness. +- Use semantic alignment, NOT keyword matching. +- Ignore references to diagrams, figures, or images — context is text only. +- PENALISE when: (1) a specific number/percentage/date is used but applied \ +to the wrong subject; (2) a claim directly contradicts the context; OR \ +(3) the answer introduces topics entirely unrelated to the context. + +Provide your verdict as JSON. 
+Format: {format_instructions}""", + input_variables=["generation", "context", "question_block"], partial_variables={ "format_instructions": hallucination_parser.get_format_instructions() - } + }, ) prediction = self.llm.invoke_with_parser( - prompt, hallucination_parser, - {"context": context, "generation": generation}, + prompt, + hallucination_parser, + { + "context": context, + "generation": generation, + "question_block": question_block, + }, caller_name="check_hallucination", ) LogWriter.info(f"request_id={req_id_cv.get()} EXIT check_hallucination") diff --git a/graphrag/tests/regression/evaluator.py b/graphrag/tests/regression/evaluator.py new file mode 100644 index 0000000..e6946eb --- /dev/null +++ b/graphrag/tests/regression/evaluator.py @@ -0,0 +1,748 @@ +"""GraphRAG Regression Evaluator + +Scores a live GraphRAG graph against a labelled dataset using two metrics: + 1. Hallucination — TigerGraphAgentHallucinationCheck (confidence 0–1) + 2. Answer Correctness — DeepEval GEval (requires answers.csv) + +Dataset layout expected: + test_questions// + ├── data/ documents for ingest + ├── questions.csv single column: question + ├── answers.csv single column: ground_truth (optional) + └── ExportedGraph/ pre-built graph for --load-exported mode + +Run via: + ./graphrag/tests/regression/run_eval.sh --dataset --graphname + ./graphrag/tests/regression/run_eval.sh ... --detailed (shows per-question reasons) +""" + +from __future__ import annotations + +import argparse +import csv +import json +import logging +import os +import sys +import time +import warnings +from concurrent.futures import ThreadPoolExecutor, as_completed +from dataclasses import asdict, dataclass, field +from datetime import datetime +from typing import List, Optional, Tuple + +# Silence DeepEval's progress banner + telemetry. Must be set before deepeval +# is imported anywhere (it is imported lazily inside answer_correctness). 
+os.environ.setdefault("DEEPEVAL_DISABLE_INDICATOR", "YES") +os.environ.setdefault("DEEPEVAL_TELEMETRY_OPT_OUT", "YES") + +# Some libraries (langchain, opentelemetry, pyTigerGraph …) re-arm the warnings +# filter on import, so PYTHONWARNINGS / simplefilter alone don't hold. Replacing +# showwarning with a no-op drops every warning no matter what filter is active. +warnings.filterwarnings("ignore") +warnings.showwarning = lambda *a, **k: None + +import httpx + +# Python sets sys.path[0] to the script's directory, not /code, so `import common` +# and `import agent` fail without this. /code is where the GraphRAG app lives in +# the container (see Dockerfile: COPY graphrag/app /code && COPY common /code/common). +if "/code" not in sys.path: + sys.path.insert(0, "/code") + +logging.basicConfig(level=logging.WARNING, format="%(levelname)s %(name)s: %(message)s") + +# ANSI colours for clean CLI output. NO_COLOR disables; FORCE_COLOR forces on +# (needed under `docker exec` where stdout is often not a detected TTY). 
+if os.environ.get("NO_COLOR"): + _USE_COLOR = False +elif os.environ.get("FORCE_COLOR"): + _USE_COLOR = True +else: + _USE_COLOR = sys.stdout.isatty() +_G = "\033[32m" if _USE_COLOR else "" # green +_R = "\033[31m" if _USE_COLOR else "" # red +_Y = "\033[33m" if _USE_COLOR else "" # yellow +_C = "\033[36m" if _USE_COLOR else "" # cyan +_B = "\033[1m" if _USE_COLOR else "" # bold +_X = "\033[0m" if _USE_COLOR else "" # reset + +_HALLUCINATION_THRESHOLD = 0.5 # confidence above which an answer is flagged hallucinated +_MAX_RETRIES = 3 # LLM metric call retries on transient errors +_RETRY_DELAY = 5.0 # base back-off seconds between retries (doubles each attempt) +_RETRY_ERRORS = ( + "APIConnectionError", "ConnectionError", "RateLimitError", + "Timeout", "TimeoutError", "ServiceUnavailableError", +) +_MAX_CHUNKS = 50 # max context passages passed to the hallucination checker +_MAX_CHUNK_CHARS = 3000 # each passage is truncated to this many characters + + +# ─── Data types ─────────────────────────────────────────────────────────────── + +@dataclass +class EvalQuestion: + question: str + ground_truth: Optional[str] + + +@dataclass +class EvalResult: + # ── written to CSV ──────────────────────────────────────────────────────── + index: int + question: str + ground_truth: Optional[str] = None + generated_answer: str = "" + search_type_used: Optional[str] = None + answer_correctness: Optional[float] = None + answer_correctness_reason: Optional[str] = None + hallucination: Optional[str] = None + hallucination_confidence: Optional[float]= None + hallucination_reason: Optional[str] = None + response_time_seconds: float = 0.0 + # ── CLI-only (not written to CSV) ───────────────────────────────────────── + answered_question: bool = field(default=False, repr=False) + error: Optional[str] = field(default=None, repr=False) + metric_errors: dict = field(default_factory=dict, repr=False) + + +# ─── Question loader ────────────────────────────────────────────────────────── + +def 
def _detect_encoding(path: str) -> str:
    """Guess the text encoding of the CSV file at *path*.

    A leading UTF-8 byte-order mark (Excel's "CSV UTF-8" export) maps to
    ``utf-8-sig``. Otherwise ``chardet`` is consulted when it is installed and
    its guess is accepted only when the reported confidence is at least 0.5.
    Every other case falls back to plain ``utf-8``.

    Args:
        path: Path of the file to inspect.

    Returns:
        A codec name suitable for ``open(..., encoding=...)``.
    """
    with open(path, "rb") as f:
        if f.read(3) == b"\xef\xbb\xbf":
            return "utf-8-sig"

        # Keep the try body to the import alone so that only a missing
        # optional dependency triggers the fallback.
        try:
            import chardet
        except ImportError:
            return "utf-8"

        f.seek(0)
        detected = chardet.detect(f.read())

    enc = detected.get("encoding") or "utf-8"
    # ``or 0`` also covers a confidence that is present but None.
    confidence = detected.get("confidence") or 0
    return enc if confidence >= 0.5 else "utf-8"


def _read_single_column(path: str, column: str) -> List[str]:
    """Read one column from a single-column CSV.

    The file is parsed with ``csv.reader`` so quoted multi-line fields are
    handled. When a value contained unquoted commas (e.g. ``2,693``) the reader
    splits it into several fields; everything from the target column onward is
    re-joined with ``","`` to restore the original string.

    Header cells are compared after stripping surrounding whitespace, so a
    header typed as ``"question "`` is still recognised.

    Args:
        path: Path of the CSV file.
        column: Name of the header cell that identifies the column.

    Returns:
        The non-blank values of the column, in file order, each stripped.

    Raises:
        SystemExit: If the file has no rows or the header lacks *column*.
    """
    encoding = _detect_encoding(path)
    with open(path, newline="", encoding=encoding) as f:
        rows = list(csv.reader(f))

    filename = os.path.basename(path)
    if not rows:
        sys.exit(f"ERROR: {filename} is empty.")

    header = rows[0]
    normalised = [cell.strip() for cell in header]
    try:
        col_idx = normalised.index(column)
    except ValueError:
        sys.exit(
            f"ERROR: {filename} must have a '{column}' column. "
            f"Found: {header}"
        )

    # Blank lines come back from csv.reader as empty lists; skip them, and
    # skip rows whose re-joined value is empty after stripping.
    values = (",".join(row[col_idx:]).strip() for row in rows[1:] if row)
    return [value for value in values if value]
_EVALUATION_STEPS = [
    "Check whether the facts in the actual output contradict any facts in the expected output.",
    "Lightly penalize omissions of detail — focus on whether the main idea is correct.",
    "Vague language or differing opinions (without factual contradiction) are acceptable.",
]


def _parse_judge_output(text, schema):
    """Best-effort conversion of raw judge-LLM text into a *schema* instance.

    The first JSON object that decodes cleanly is used. Scanning from each
    ``{`` with ``raw_decode`` (rather than a greedy first-brace-to-last-brace
    regex) means prose containing braces before or after the JSON no longer
    spoils the parse.

    Args:
        text: Raw model output. Non-string values are returned unchanged.
        schema: Callable invoked as ``schema(**decoded_object)``.

    Returns:
        The ``schema`` instance, or *text* unchanged when no JSON object is
        found or the schema rejects the decoded object.
    """
    if not isinstance(text, str):
        return text

    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            payload, _ = decoder.raw_decode(text, start)
        except (ValueError, RecursionError):
            # Not valid JSON from this brace; try the next candidate.
            start = text.find("{", start + 1)
            continue
        try:
            return schema(**payload)
        except Exception:  # noqa: BLE001 - schema is an arbitrary callable
            # Deliberate best-effort: hand the raw text back to the caller
            # instead of failing the whole metric on a schema mismatch.
            return text
    return text


def _build_deepeval_llm():
    """Wrap the configured chat model in a DeepEval judge.

    The model is obtained through ``common.config`` (``get_chat_config`` /
    ``get_llm_service``); if the returned service exposes an ``llm`` attribute
    that object is used, otherwise the service itself.

    Returns:
        An instance of a ``DeepEvalBaseLLM`` subclass whose ``generate`` /
        ``a_generate`` send the prompt as a single ``HumanMessage`` and, when
        a schema is supplied, try to parse the reply into it.
    """
    from deepeval.models.base_model import DeepEvalBaseLLM
    from langchain_core.messages import HumanMessage
    from common.config import get_chat_config, get_llm_service

    cfg = get_chat_config()
    llm_provider = get_llm_service(cfg)
    model_name = cfg.get("llm_model", "unknown")
    lc_model = getattr(llm_provider, "llm", None) or llm_provider

    def _finish(resp, schema):
        # Shared tail of generate / a_generate: extract text, then parse it.
        text = resp.content if hasattr(resp, "content") else str(resp)
        return _parse_judge_output(text, schema) if schema else text

    class _Judge(DeepEvalBaseLLM):
        def __init__(self):
            super().__init__(model_name)

        def load_model(self):
            return lc_model

        def generate(self, prompt: str, schema=None):
            resp = lc_model.invoke([HumanMessage(content=prompt)])
            return _finish(resp, schema)

        async def a_generate(self, prompt: str, schema=None):
            resp = await lc_model.ainvoke([HumanMessage(content=prompt)])
            return _finish(resp, schema)

        def get_model_name(self) -> str:
            return model_name

    return _Judge()
def hallucination_check(
    question: str,
    generation: str,
    context: str,
) -> Tuple[Optional[str], Optional[float], Optional[str], Optional[str]]:
    """Run the hallucination grader on one generated answer.

    Args:
        question: The original user question.
        generation: The answer to grade.
        context: Retrieved context text the answer is checked against.

    Returns:
        ``(verdict, confidence, reason, error)``. ``verdict`` is ``'yes'``
        (hallucinated) when the clamped confidence is at or above
        ``_HALLUCINATION_THRESHOLD``, else ``'no'`` (grounded). On any failure
        the first three items are ``None`` and ``error`` describes the problem.
    """
    if not context:
        return None, None, None, "no retrieved context to check against"

    try:
        from common.config import get_chat_config, get_llm_service
        from agent.agent_hallucination_check import (
            TigerGraphAgentHallucinationCheck,
        )

        llm = get_llm_service(get_chat_config())
        checker = TigerGraphAgentHallucinationCheck(llm)
        result = checker.check_hallucination(
            generation=generation,
            context=context,
            question=question,
        )
        # Clamp to [0, 1] in case the model returns an out-of-range value.
        confidence = max(0.0, min(1.0, float(result.confidence)))
        verdict = "yes" if confidence >= _HALLUCINATION_THRESHOLD else "no"
        reason = str(result.reason).strip() if result.reason else None
        return verdict, confidence, reason, None
    except Exception as e:
        # Metric failures are reported to the caller, never raised.
        return None, None, None, f"{type(e).__name__}: {e}"


def _dedupe_chunks(chunks, max_chunks: int, max_chunk_chars: int) -> List[str]:
    """Strip, drop short (< 5 chars) and duplicate chunks, truncate and cap."""
    seen: set = set()
    kept: List[str] = []
    for chunk in chunks:
        text = chunk.strip()
        if len(text) < 5 or text in seen:
            continue
        # De-duplication is on the full stripped text, before truncation.
        seen.add(text)
        kept.append(text[:max_chunk_chars])
        if len(kept) >= max_chunks:
            break
    return kept


def _collect_strings(node) -> List[str]:
    """Depth-first collect non-blank strings from nested lists and dicts.

    For a dict, the well-known context keys are followed first; only when none
    of them yields anything are all values walked.
    """
    found: List[str] = []
    if isinstance(node, str):
        if node.strip():
            found.append(node)
    elif isinstance(node, list):
        for item in node:
            found.extend(_collect_strings(item))
    elif isinstance(node, dict):
        for key in ("final_retrieval", "context", "contexts", "chunks",
                    "documents", "passages", "text", "result"):
            if key in node:
                found.extend(_collect_strings(node[key]))
        if not found:
            for value in node.values():
                found.extend(_collect_strings(value))
    return found


def _flatten_context(
    query_sources: dict,
    max_chunks: Optional[int] = None,
    max_chunk_chars: Optional[int] = None,
) -> List[str]:
    """Extract text chunks from GraphRAG query_sources response.

    ``query_sources["result"]["final_retrieval"]`` is preferred when it is a
    dict that yields at least one chunk; otherwise the whole structure is
    walked for strings.

    Args:
        query_sources: The ``query_sources`` object of a /query response.
        max_chunks: Maximum number of chunks returned; defaults to
            ``_MAX_CHUNKS``.
        max_chunk_chars: Each chunk is truncated to this many characters;
            defaults to ``_MAX_CHUNK_CHARS``.

    Returns:
        Stripped, de-duplicated, truncated chunks in first-seen order.
    """
    if not query_sources:
        return []

    limit = _MAX_CHUNKS if max_chunks is None else max_chunks
    width = _MAX_CHUNK_CHARS if max_chunk_chars is None else max_chunk_chars

    try:
        final_retrieval = query_sources["result"]["final_retrieval"]
    except (KeyError, TypeError):
        final_retrieval = None

    if isinstance(final_retrieval, dict):
        raw: List[str] = []
        for chunks in final_retrieval.values():
            if isinstance(chunks, list):
                raw.extend(str(c) for c in chunks if c)
            elif isinstance(chunks, str) and chunks.strip():
                raw.append(chunks)
        if raw:
            return _dedupe_chunks(raw, limit, width)

    return _dedupe_chunks(_collect_strings(query_sources), limit, width)


def _query_graphrag(
    url: str,
    graphname: str,
    username: str,
    password: str,
    question: str,
    search_type: str,
) -> dict:
    """POST one question to ``{url}/{graphname}/query`` and return the JSON body.

    The request uses basic auth and a 120-second timeout. A non-2xx response
    raises through ``raise_for_status()``.
    """
    response = httpx.post(
        f"{url}/{graphname}/query",
        json={"query": question, "rag_method": search_type},
        auth=(username, password),
        timeout=120.0,
    )
    response.raise_for_status()
    return response.json()
# the UI displays (matches CustomChatMessage.tsx). Falls back to response_type. + _RETRIEVER_LABELS = { + "similaritysearch": "Similarity Search", + "contextualsearch": "Contextual Search", + "hybridsearch": "Hybrid Search", + "communitysearch": "Community Search", + } + _chosen = (query_sources.get("chosen_retriever") or "").lower().replace(" ", "") + r.search_type_used = _RETRIEVER_LABELS.get(_chosen) or data.get("response_type") or None + contexts = _flatten_context(query_sources) + except httpx.HTTPStatusError as e: + r.error = f"HTTP {e.response.status_code}: {e.response.text[:200]}" + r.response_time_seconds = time.monotonic() - t0 + return r + except Exception as e: + r.error = f"{type(e).__name__}: {e}" + r.response_time_seconds = time.monotonic() - t0 + return r + finally: + r.response_time_seconds = time.monotonic() - t0 + + if not r.answered_question or not r.generated_answer: + r.metric_errors["all"] = "GraphRAG did not produce an answer" + return r + + ctx_str = "\n\n".join(f"[Passage {i + 1}]\n{c}" for i, c in enumerate(contexts)) + + tasks = {} + if q.ground_truth: + _q, _ans, _gt = q.question, r.generated_answer, q.ground_truth + tasks["correctness"] = lambda _q=_q, _ans=_ans, _gt=_gt: answer_correctness(_q, _ans, _gt) + + _q, _gen, _ctx = q.question, r.generated_answer, ctx_str + tasks["hallucination"] = lambda _q=_q, _gen=_gen, _ctx=_ctx: hallucination_check(_q, _gen, _ctx) + + with ThreadPoolExecutor(max_workers=len(tasks)) as pool: + futures = {m: pool.submit(fn) for m, fn in tasks.items()} + for metric, future in futures.items(): + try: + val = future.result() + if metric == "correctness": + r.answer_correctness, r.answer_correctness_reason, err = val + if err: + r.metric_errors["answer_correctness"] = err + elif metric == "hallucination": + r.hallucination, r.hallucination_confidence, r.hallucination_reason, err = val + if err: + r.metric_errors["hallucination"] = err + if r.hallucination_confidence is not None: + r.hallucination_confidence = 
def _ac_cell(r: EvalResult) -> str:
    """Render the answer-correctness cell for one result.

    Scored results are shown as a percentage coloured by band (>= 0.75,
    >= 0.5, below). Without a score the cell shows "n/a" when the metric
    reported an error and a dash otherwise.
    """
    score = r.answer_correctness
    if score is None:
        if r.metric_errors.get("answer_correctness"):
            return f"{_Y} n/a{_X}"
        return " —"

    if score >= 0.75:
        colour = _G
    elif score >= 0.5:
        colour = _Y
    else:
        colour = _R
    return f"{colour}{score * 100:5.1f}%{_X}"


def _hal_cell(r: EvalResult) -> str:
    """Render the hallucination cell for one result.

    A verdict is shown as "Halluc." or "Ground.", followed by the confidence
    percentage when one is available. Without a verdict the cell shows "n/a"
    when the metric reported an error and a dash otherwise.
    """
    verdict = r.hallucination
    if verdict is None:
        return f"{_Y}n/a{_X}" if r.metric_errors.get("hallucination") else "—"

    if verdict == "yes":
        colour, label = _R, "Halluc."
    else:
        colour, label = _G, "Ground."

    confidence = r.hallucination_confidence
    suffix = "" if confidence is None else f" {confidence * 100:3.0f}%"
    return f"{colour}{label}{suffix}{_X}"


def _print_compact(r: EvalResult, total: int) -> None:
    """Print a single colourised status line for one question (default mode)."""
    prefix = f" Q{r.index + 1:>3}/{total} [{r.response_time_seconds:5.1f}s] "

    if r.error:
        body = f"{_R}error: {r.error[:100]}{_X}"
    elif not r.answered_question:
        body = f"{_Y}no answer from GraphRAG{_X}"
    else:
        warning = ""
        if r.metric_errors:
            # Each metric error message is clipped to 60 characters.
            joined = ", ".join(
                f"{name}: {message[:60]}"
                for name, message in r.metric_errors.items()
            )
            warning = f" {_Y}metric warn: {joined}{_X}"
        body = f"AC={_ac_cell(r)} Hal={_hal_cell(r)}{warning}"

    print(prefix + body, flush=True)
def run_eval(
    questions: List[EvalQuestion],
    graphname: str,
    url: str,
    username: str,
    password: str,
    search_type: str = "Auto",
    detailed: bool = False,
) -> List[EvalResult]:
    """Evaluate every question concurrently and print each result as it lands.

    Args:
        questions: Questions to send to GraphRAG.
        graphname: Graph to query.
        url: GraphRAG base URL.
        username: Basic-auth user name.
        password: Basic-auth password.
        search_type: Value passed through to ``_eval_one``.
        detailed: Use the detailed per-question printer instead of the
            compact one.

    Returns:
        Results ordered by question index (printing happens in completion
        order). An empty *questions* list yields an empty list.
    """
    total = len(questions)
    if total == 0:
        # ThreadPoolExecutor rejects max_workers=0 with ValueError.
        return []

    workers = min(8, total)
    results: List[Optional[EvalResult]] = [None] * total
    printer = _print_detailed if detailed else _print_compact

    with ThreadPoolExecutor(max_workers=workers, thread_name_prefix="eval") as pool:
        futures = {
            pool.submit(_eval_one, i, q, graphname, url, username, password, search_type): i
            for i, q in enumerate(questions)
        }
        for future in as_completed(futures):
            r = future.result()
            # Slot by index so the returned list follows question order.
            results[r.index] = r
            printer(r, total)

    return [r for r in results if r is not None]


# ─── Output ───────────────────────────────────────────────────────────────────

_CSV_FIELDS = [
    "index", "question", "ground_truth", "generated_answer",
    "search_type_used",
    "answer_correctness", "answer_correctness_reason",
    "hallucination", "hallucination_confidence", "hallucination_reason",
    "response_time_seconds",
]
def _non_negative_int(value: str) -> int:
    """argparse ``type=`` callback: parse *value* as an integer >= 0.

    Raises:
        argparse.ArgumentTypeError: If *value* is not an integer or is
            negative (a negative slice bound would silently drop questions
            from the end of the list).
    """
    try:
        number = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError(f"expected an integer, got {value!r}") from None
    if number < 0:
        raise argparse.ArgumentTypeError(f"must be >= 0, got {number}")
    return number


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="GraphRAG Regression Evaluator")
    parser.add_argument("--dataset", required=True,
                        help="Dataset name (folder under graphrag/tests/test_questions/)")
    parser.add_argument("--graphname", required=True,
                        help="GraphRAG graph name to query")
    parser.add_argument("--config", default="configs/server_config.json",
                        help="Path to server_config.json")
    parser.add_argument("--url", default=None,
                        help="GraphRAG base URL (overrides server_config.json graphrag_config.query_url)")
    parser.add_argument("--search-type", default="Auto",
                        help="RAG search type sent to /query (default: Auto)")
    parser.add_argument("--output", default=None,
                        help="Output directory for the result CSV")
    parser.add_argument("--detailed", action="store_true",
                        help="Print per-question breakdown in addition to the summary")
    parser.add_argument("--limit", type=_non_negative_int, default=None,
                        help="Only run the first N questions (useful for quick smoke tests); "
                             "0 or omitted means no limit")
    args = parser.parse_args()

    # ── Silence the noise: deprecation warnings + the app's INFO logging ───────
    # The GraphRAG app modules (openai_service, agent_hallucination_check, …)
    # log at INFO on every call. logging.disable(INFO) hard-suppresses INFO/DEBUG
    # from every logger regardless of its own level, so the only thing on screen
    # is our clean per-question output.
    warnings.simplefilter("ignore")
    logging.disable(logging.INFO)

    config_path = os.path.abspath(args.config)
    if not os.path.exists(config_path):
        sys.exit(f"ERROR: config file not found: {config_path}")
    # setdefault: an already-exported SERVER_CONFIG wins over --config.
    os.environ.setdefault("SERVER_CONFIG", config_path)

    with open(config_path, encoding="utf-8") as f:
        server_cfg = json.load(f)

    db = server_cfg.get("db_config", {})
    username = db.get("username", "")
    password = db.get("password", "")
    if not username or not password:
        sys.exit(f"ERROR: db_config.username / password not set in {config_path}")

    graphrag_url = (
        args.url
        or server_cfg.get("graphrag_config", {}).get("query_url", "http://localhost:8000")
    ).rstrip("/")

    dataset_dir = os.path.join("/code/tests/test_questions", args.dataset)
    if not os.path.isdir(dataset_dir):
        sys.exit(f"ERROR: dataset directory not found: {dataset_dir}")

    output_dir = args.output or os.path.join(os.path.dirname(__file__), "results")

    questions = load_questions(dataset_dir)
    if not questions:
        # Fail with a clear message instead of starting an evaluation with
        # nothing to evaluate.
        sys.exit(f"ERROR: no questions found in {dataset_dir}")
    if args.limit:
        questions = questions[:args.limit]

    print(f"\n{_B}GraphRAG Evaluation{_X} dataset={args.dataset} "
          f"graph={args.graphname} search={args.search_type} "
          f"questions={len(questions)}"
          + (f" {_Y}(limit={args.limit}){_X}" if args.limit else "")
          + "\n", flush=True)

    results = run_eval(
        questions=questions,
        graphname=args.graphname,
        url=graphrag_url,
        username=username,
        password=password,
        search_type=args.search_type,
        detailed=args.detailed,
    )
    write_results(results, output_dir, detailed=args.detailed)

    # os._exit skips the interpreter shutdown phase, which otherwise emits
    # destructor/asyncio noise ("ImportError: sys.meta_path is None …") from
    # the GraphRAG embedding-store and LLM clients after our output is done.
    # Exit status is 1 when any question hit a request-level error.
    sys.stdout.flush()
    sys.stderr.flush()
    os._exit(1 if any(r.error for r in results) else 0)
#!/usr/bin/env bash
# GraphRAG Regression — Load Exported Graph + Evaluate
#
# Loads a pre-built graph from test_questions/<dataset>/ExportedGraph/ and
# then runs the full evaluation against it — both steps execute INSIDE the
# graphrag container via docker exec (no local deps required).
#
# Usage (from repo root, on the host):
#
#   ./graphrag/tests/regression/run_load_eval.sh --dataset Toppan --graphname toppan_2edc7217
#   ./graphrag/tests/regression/run_load_eval.sh --dataset Toppan --graphname toppan_2edc7217 --detailed
#
# --graphname is the name the graph is loaded/restored as. If omitted, a new
# UUID-based name is generated by the load step and reused for evaluation.
#
# Forwarded to both steps:       --dataset, --graphname, --config, --url
# Forwarded to evaluation only:  --search-type, --output, --limit, --detailed
#
# Override the container name with GRAPHRAG_CONTAINER if needed.

set -euo pipefail

# Git Bash / MSYS on Windows rewrites "/code/..." into a Windows path
# (e.g. C:/Program Files/Git/code/...) before passing it to docker.
# Disable that POSIX path conversion so the container path is preserved.
export MSYS_NO_PATHCONV=1
export MSYS2_ARG_CONV_EXCL="*"

CONTAINER="${GRAPHRAG_CONTAINER:-graphrag}"

# require_value FLAG ARGC — abort with a readable message when FLAG is the last
# word on the command line. Without this check, reading "$2" under `set -u`
# dies with a cryptic "unbound variable" error.
require_value() {
  if [[ "$2" -lt 2 ]]; then
    echo "ERROR: $1 requires a value." >&2
    exit 1
  fi
}

# ── Parse args ────────────────────────────────────────────────────────────────
DATASET=""
GRAPHNAME=""
SETUP_ARGS=()
EVAL_ARGS=()

while [[ $# -gt 0 ]]; do
  case "$1" in
    --dataset)
      require_value "$1" "$#"
      DATASET="$2"; SETUP_ARGS+=(--dataset "$2"); EVAL_ARGS+=(--dataset "$2"); shift 2 ;;
    --graphname)
      require_value "$1" "$#"
      GRAPHNAME="$2"; SETUP_ARGS+=(--graphname "$2"); EVAL_ARGS+=(--graphname "$2"); shift 2 ;;
    --config|--url)
      require_value "$1" "$#"
      SETUP_ARGS+=("$1" "$2"); EVAL_ARGS+=("$1" "$2"); shift 2 ;;
    --search-type|--output|--limit)
      # NOTE(review): --limit is assumed to be the evaluator flag behind
      # args.limit — confirm against evaluator.py's argument parser.
      require_value "$1" "$#"
      EVAL_ARGS+=("$1" "$2"); shift 2 ;;
    --detailed)
      EVAL_ARGS+=(--detailed); shift ;;
    *)
      echo "Unknown argument: $1" >&2; exit 1 ;;
  esac
done

if [ -z "${DATASET}" ]; then
  echo "ERROR: --dataset is required." >&2
  exit 1
fi

# ── Step 1: Load exported graph ───────────────────────────────────────────────
echo ""
echo "Step 1/2: Loading exported graph ..."
echo "─────────────────────────────────────────────────────────────────"

# Capture stdout (its last line is the graphname) while still showing progress.
# The copy goes to stderr rather than /dev/tty: /dev/tty cannot be opened when
# there is no controlling terminal (CI, cron, nohup), and with pipefail that
# tee failure would abort the whole run.
LOAD_OUTPUT=$(docker exec -e PYTHONUNBUFFERED=1 "${CONTAINER}" \
  python /code/tests/regression/setup_graph.py --load-exported "${SETUP_ARGS[@]}" | tee /dev/stderr)

# If --graphname was not passed, capture the auto-generated one (last line).
if [ -z "${GRAPHNAME}" ]; then
  GRAPHNAME=$(echo "${LOAD_OUTPUT}" | grep -v '^$' | tail -n 1 | tr -d '[:space:]')
  if [ -z "${GRAPHNAME}" ]; then
    echo "ERROR: Could not determine graphname from setup output." >&2
    exit 1
  fi
  EVAL_ARGS+=(--graphname "${GRAPHNAME}")
fi

# ── Step 2: Run evaluation ────────────────────────────────────────────────────
echo ""
echo "Step 2/2: Running evaluation against graph '${GRAPHNAME}' ..."
echo "─────────────────────────────────────────────────────────────────"

# FORCE_COLOR / NO_COLOR are passed through so the evaluator's colour handling
# follows the caller's shell settings even though docker exec has no TTY.
docker exec \
  -e PYTHONUNBUFFERED=1 \
  -e PYTHONWARNINGS=ignore \
  -e FORCE_COLOR="${FORCE_COLOR:-1}" \
  -e NO_COLOR="${NO_COLOR:-}" \
  "${CONTAINER}" \
  python /code/tests/regression/evaluator.py "${EVAL_ARGS[@]}"
+export MSYS_NO_PATHCONV=1 +export MSYS2_ARG_CONV_EXCL="*" + +CONTAINER="${GRAPHRAG_CONTAINER:-graphrag}" + +docker exec -e PYTHONUNBUFFERED=1 "${CONTAINER}" \ + python /code/tests/regression/setup_graph.py "$@" diff --git a/graphrag/tests/regression/setup_graph.py b/graphrag/tests/regression/setup_graph.py new file mode 100644 index 0000000..15827fd --- /dev/null +++ b/graphrag/tests/regression/setup_graph.py @@ -0,0 +1,349 @@ +"""GraphRAG Regression — Graph Setup + +Creates and populates a GraphRAG knowledge graph from a dataset folder. + +Two modes: + 1. Fresh setup (default) — creates graph, initialises schema, uploads & + ingests documents, triggers ECC rebuild. + 2. Load exported graph (--load-exported) — restore from ExportedGraph/ + folder (stub — implement load_exported() when confirmed). + +The generated graphname is printed last so run_setup.sh can capture it. + +Run via: + ./graphrag/tests/regression/run_setup.sh --dataset + ./graphrag/tests/regression/run_setup.sh --dataset --graphname +""" + +from __future__ import annotations + +import argparse +import json +import logging +import os +import re +import sys +import time +import uuid +from typing import List, Optional + +import httpx + +logging.basicConfig(level=logging.WARNING, format="%(levelname)s %(name)s: %(message)s") + +_REBUILD_POLL_SECS = 30 + + +class SetupError(Exception): + pass + + +# ─── HTTP helpers ───────────────────────────────────────────────────────────── + +class _Client: + def __init__(self, base_url: str, username: str, password: str) -> None: + self.base_url = base_url.rstrip("/") + self.auth = (username, password) + + def post(self, path: str, json_body=None) -> dict: + resp = httpx.post( + f"{self.base_url}{path}", + json=json_body, + auth=self.auth, + timeout=None, + ) + resp.raise_for_status() + return resp.json() + + def get(self, path: str) -> dict: + resp = httpx.get( + f"{self.base_url}{path}", + auth=self.auth, + timeout=None, + ) + resp.raise_for_status() + return 
resp.json() + + def upload_file(self, path: str, filepath: str) -> dict: + fname = os.path.basename(filepath) + with open(filepath, "rb") as fh: + resp = httpx.post( + f"{self.base_url}{path}", + auth=self.auth, + params={"overwrite": "true"}, + files={"files": (fname, fh)}, + timeout=None, + ) + resp.raise_for_status() + return resp.json() + + +# ─── Setup steps ────────────────────────────────────────────────────────────── + +def _create_graph(client: _Client, graphname: str) -> None: + result = client.post(f"/ui/{graphname}/create_graph") + status = result.get("status", "") + if status == "error": + msg = result.get("message", "") + if "already exists" in msg.lower(): + print(f" Graph already exists — continuing.", flush=True) + else: + raise SetupError(f"create_graph: {msg}") + + +def _initialize(client: _Client, graphname: str) -> None: + try: + result = client.post(f"/ui/{graphname}/initialize_graph") + except httpx.HTTPStatusError as e: + if e.response.status_code == 409: + try: + body = e.response.json() + detail = body.get("detail", {}) + if isinstance(detail, dict) and detail.get("reason") == "structural_present": + print(f" Schema already installed — skipping init.", flush=True) + return + except Exception: + pass + raise SetupError(f"initialize ({e.response.status_code}): {e.response.text[:300]}") from e + + state = str(result.get("status", result.get("state", ""))).lower() + if state in ("submitted", "running", "in_progress"): + _poll_initialize(client, graphname) + + +def _poll_initialize(client: _Client, graphname: str, poll_secs: int = 5) -> None: + print(f" Polling initialization status ...", flush=True) + last_msg = None + while True: + status = client.get(f"/ui/{graphname}/initialize_status") + state = str(status.get("state", "")).lower() + message = str(status.get("message") or "") + error = status.get("error") + + if message and message != last_msg: + print(f" [{graphname}] {message}", flush=True) + last_msg = message + + if state == 
"completed": + return + if state == "failed" or error: + raise SetupError(f"Initialization failed: {error or message}") + time.sleep(poll_secs) + + +def _ingest_documents(client: _Client, graphname: str, doc_files: List[str]) -> None: + # Upload + print(f" Uploading {len(doc_files)} file(s) ...", flush=True) + for fp in doc_files: + result = client.upload_file(f"/ui/{graphname}/uploads", fp) + if result.get("status") not in ("success", "conflict"): + raise SetupError(f"Upload returned unexpected status: {result}") + print(f" + {os.path.basename(fp)}", flush=True) + + # Create ingest job + folder_path = f"uploads/{graphname}" + ingest_info = client.post(f"/ui/{graphname}/create_ingest", json_body={ + "data_source": "server", + "data_source_config": {"data_path": folder_path}, + "loader_config": {}, + "file_format": "json", + }) + load_job_id = ingest_info.get("load_job_id") or ingest_info.get("jobId") + data_source_id = ingest_info.get("data_source_id") or ingest_info.get("dataSourceId") + file_path = ingest_info.get("data_path") or folder_path + + if not load_job_id or data_source_id is None: + raise SetupError(f"create_ingest did not return expected fields: {ingest_info}") + + # Run ingest + client.post(f"/ui/{graphname}/ingest", json_body={ + "load_job_id": load_job_id, + "data_source_id": data_source_id, + "file_path": file_path, + }) + + +def _rebuild(client: _Client, graphname: str) -> None: + client.post(f"/ui/{graphname}/rebuild_graph") + print(f" Polling rebuild status every {_REBUILD_POLL_SECS}s ...", flush=True) + while True: + status = client.get(f"/ui/{graphname}/rebuild_status") + stage = str(status.get("stage") or status.get("status", "")).lower() + if stage in ("complete", "completed", "done", "success"): + return + if stage in ("failed", "error"): + raise SetupError(f"Rebuild failed: {status}") + time.sleep(_REBUILD_POLL_SECS) + + +# ─── Main setup pipeline ────────────────────────────────────────────────────── + +def fresh_setup( + client: _Client, 
+ graphname: str, + doc_files: List[str], + skip_rebuild: bool = False, +) -> None: + errors: List[str] = [] + init_ok = True + + print(f"\n [1/4] Create Graph", flush=True) + t = time.monotonic() + try: + _create_graph(client, graphname) + print(f" Done ({time.monotonic() - t:.1f}s)", flush=True) + except SetupError as e: + print(f" Failed: {e}", flush=True) + errors.append(f"Create Graph: {e}") + init_ok = False + + if init_ok: + print(f"\n [2/4] Initialize Schema (1-3 min ...)", flush=True) + print(f" Waiting 15s for graph to be ready ...", flush=True) + time.sleep(15) + t = time.monotonic() + try: + _initialize(client, graphname) + print(f" Done ({time.monotonic() - t:.1f}s)", flush=True) + except SetupError as e: + print(f" Failed: {e}", flush=True) + errors.append(f"Initialize: {e}") + init_ok = False + + if init_ok and doc_files: + print(f"\n [3/4] Upload & Ingest Documents", flush=True) + print(f" Waiting 30s for schema to settle ...", flush=True) + time.sleep(30) + t = time.monotonic() + try: + _ingest_documents(client, graphname, doc_files) + print(f" Done ({time.monotonic() - t:.1f}s)", flush=True) + except SetupError as e: + print(f" Failed: {e}", flush=True) + errors.append(f"Ingest: {e}") + elif init_ok: + print(f"\n [3/4] Upload & Ingest - skipped (no data/ files found)", flush=True) + + if not skip_rebuild: + print(f"\n [4/4] Rebuild Knowledge Graph (ECC)", flush=True) + t = time.monotonic() + try: + _rebuild(client, graphname) + print(f" Done ({time.monotonic() - t:.1f}s)", flush=True) + except SetupError as e: + print(f" Failed: {e}", flush=True) + errors.append(f"Rebuild: {e}") + else: + print(f"\n [4/4] Rebuild - skipped", flush=True) + + if errors: + print(f"\n Setup completed with errors:") + for err in errors: + print(f" - {err}") + sys.exit(1) + + +def load_exported(graphname: str, exported_dir: str) -> None: + """TODO: Restore graph from ExportedGraph/ folder. + + Implement using TigerGraph's export/import mechanism once confirmed. 
+ Options: + - GSQL: DROP GRAPH + CREATE GRAPH + load GBAR + - GraphRAG restore API endpoint (if available) + - pyTigerGraph backup restore + """ + if not os.path.isdir(exported_dir): + sys.exit( + f"ERROR: ExportedGraph directory not found: {exported_dir}\n" + f"Run setup first or add exported graph files to this directory." + ) + print(f"\n --load-exported is not yet implemented.") + print(f" Edit load_exported() in setup_graph.py with the TigerGraph") + print(f" import command for your environment, then rerun.\n") + sys.exit(1) + + +# ─── CLI entry point ────────────────────────────────────────────────────────── + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="GraphRAG Regression — Graph Setup") + parser.add_argument("--dataset", required=True, + help="Dataset name (folder under graphrag/tests/test_questions/)") + parser.add_argument("--config", default="configs/server_config.json", + help="Path to server_config.json") + parser.add_argument("--url", default=None, + help="GraphRAG base URL (overrides server_config.json)") + parser.add_argument("--graphname", default=None, + help="Override generated graphname (also required with --load-exported)") + parser.add_argument("--load-exported", action="store_true", + help="Load from ExportedGraph/ instead of running fresh setup") + parser.add_argument("--skip-rebuild", action="store_true", + help="Skip ECC rebuild step (useful when testing setup only)") + args = parser.parse_args() + + config_path = os.path.abspath(args.config) + if not os.path.exists(config_path): + sys.exit(f"ERROR: config not found: {config_path}") + + with open(config_path) as f: + server_cfg = json.load(f) + + db = server_cfg.get("db_config", {}) + username = db.get("username", "") + password = db.get("password", "") + if not username or not password: + sys.exit(f"ERROR: db_config.username / password not set in {config_path}") + + graphrag_url = ( + args.url + or server_cfg.get("graphrag_config", {}).get("query_url", 
"http://localhost:8000") + ).rstrip("/") + + dataset_dir = os.path.join("/code/tests/test_questions", args.dataset) + if not os.path.isdir(dataset_dir): + sys.exit(f"ERROR: dataset directory not found: {dataset_dir}") + + # Auto-generate graphname: _<8-char uuid> + uid = uuid.uuid4().hex[:8] + graphname = args.graphname or f"{args.dataset.lower()}_{uid}" + graphname = re.sub(r"[^A-Za-z0-9_]", "_", graphname) + if graphname[0].isdigit(): + graphname = f"g_{graphname}" + + print(f"\nGraphRAG Regression - Graph Setup") + print("─" * 60) + print(f" Dataset : {args.dataset}") + print(f" Graph : {graphname}") + print(f" URL : {graphrag_url}") + print(f" Mode : {'load-exported' if args.load_exported else 'fresh-setup'}") + print("─" * 60) + + if args.load_exported: + exported_dir = os.path.join(dataset_dir, "ExportedGraph") + load_exported(graphname, exported_dir) + else: + data_dir = os.path.join(dataset_dir, "data") + doc_files = [] + if os.path.isdir(data_dir): + doc_files = [ + os.path.join(data_dir, f) + for f in sorted(os.listdir(data_dir)) + if os.path.isfile(os.path.join(data_dir, f)) + ] + print(f" Documents : {len(doc_files)} file(s)") + for fp in doc_files: + print(f" - {os.path.basename(fp)}") + else: + print(f" Warning: no data/ directory found - skipping ingest") + print("─" * 60) + + client = _Client(graphrag_url, username, password) + fresh_setup(client, graphname, doc_files, skip_rebuild=args.skip_rebuild) + + print(f"\n Setup complete! 
Graph '{graphname}' is ready.") + print(f"\n Next - run evaluation:") + print(f" ./graphrag/tests/regression/run_eval.sh --dataset {args.dataset} --graphname {graphname}") + print("─" * 60) + + # Print graphname last so run_setup.sh can capture it cleanly + print(graphname) diff --git a/graphrag/tests/test_questions/Apple_SEQ_10/answers.csv b/graphrag/tests/test_questions/Apple_SEQ_10/answers.csv new file mode 100644 index 0000000..d97aaae --- /dev/null +++ b/graphrag/tests/test_questions/Apple_SEQ_10/answers.csv @@ -0,0 +1,13 @@ +ground_truth +"Apple's total net sales have changed over time as follows: - For the quarterly period ended June 25, 2022, the total net sales were $82,959 million. (SOURCE: 2022 Q3 AAPL.pdf) - For the quarterly period ended December 31, 2022, the total net sales were $117,154 million. (SOURCE: 2023 Q1 AAPL.pdf) - For the quarterly period ended April 1, 2023, the total net sales were $94,836 million. (SOURCE: 2023 Q2 AAPL.pdf) - For the quarterly period ended July 1, 2023, the total net sales were $81,797 million. (SOURCE: 2023 Q3 AAPL.pdf) From these figures, it can be observed that there was an increase in total net sales from the quarter ended June 25, 2022, to the quarter ended December 31, 2022. However, there was a subsequent decrease in total net sales in the quarters ended April 1, 2023, and July 1, 2023." +"In the most recent 10-Q for the quarter ended July 1, 2023, the factors contributing to the change in Apple's gross margin compared to previous quarters include: 1. Weakness in foreign currencies relative to the U.S. dollar, which had an unfavorable impact on gross margin. 2. Lower Products volume, which decreased gross margin. 3. Cost savings and a different Products mix, which partially offset the decrease in gross margin. 
For the third quarter of 2023, the Products gross margin percentage increased compared to the same quarter in 2022 due to cost savings and a different Products mix, despite the negative impact of foreign currency weakness and decreased leverage. However, the year-over-year Products gross margin percentage for the first nine months of 2023 decreased due to the weakness in foreign currencies and decreased leverage, despite cost savings and a different Products mix. The Services gross margin increased due to higher Services net sales but was partially offset by the weakness in foreign currencies and higher Services costs. The Services gross margin percentage decreased due to higher Services costs, partially offset by improved leverage." +"Yes, there has been a change in Apple's operating expenses over the reported quarters. The key drivers for this change are increases in research and development (""R&D"") expense and selling, general and administrative expense. The growth in R&D expense is primarily driven by increases in headcount-related expenses. For the third quarter of 2022, the total operating expenses were $12,809 million, with R&D expenses of $6,797 million and selling, general and administrative expenses of $6,012 million. For the first quarter of 2023, the total operating expenses increased to $14,316 million, with R&D expenses of $7,709 million and selling, general and administrative expenses of $6,607 million. For the second quarter of 2023, the total operating expenses were $13,658 million, with R&D expenses of $7,457 million and selling, general and administrative expenses of $6,201 million. For the third quarter of 2023, the total operating expenses were $13,415 million, with R&D expenses of $7,442 million and selling, general and administrative expenses of $5,973 million. The consistent increase in R&D expenses indicates the company's continued investment in innovation and product development. 
The selling, general and administrative expenses have also increased, although they saw a slight decrease in the third quarter of 2023 compared to the second quarter of 2023. " +"The revenue from iPhone sales for Apple has fluctuated across the quarters as follows: - In the quarter ended June 25, 2022, the revenue from iPhone sales was $40,665 million. (""2022 Q3 AAPL.pdf"") - In the quarter ended December 31, 2022, the revenue from iPhone sales was $65,775 million. (""2023 Q1 AAPL.pdf"") - In the quarter ended April 1, 2023, the revenue from iPhone sales was $51,334 million. (""2023 Q2 AAPL.pdf"") - In the quarter ended July 1, 2023, the revenue from iPhone sales was $39,669 million" +"- For the quarterly period ended June 25, 2022, weakness in foreign currencies relative to the U.S. dollar had an unfavorable impact on the Company’s total net sales. SOURCE(S): 2023 Q1 AAPL.pdf, Item 2. Management’s Discussion and Analysis of Financial Condition and Results of Operations - For the quarterly period ended December 31, 2022, weakness in foreign currencies relative to the U.S. dollar had an unfavorable impact on the Company’s total net sales, which decreased 5% or $6.8 billion during the first quarter of 2023 compared to the same quarter in 2022. SOURCE(S): 2023 Q2 AAPL.pdf, Item 2. Management’s Discussion and Analysis of Financial Condition and Results of Operations - For the quarterly period ended April 1, 2023, weakness in foreign currencies relative to the U.S. dollar had an unfavorable impact on the Company’s total net sales, which decreased 3% or $2.4 billion during the second quarter of 2023 compared to the same quarter in 2022. SOURCE(S): 2023 Q3 AAPL.pdf, Item 2. Management’s Discussion and Analysis of Financial Condition and Results of Operations - For the quarterly period ended July 1, 2023, weakness in foreign currencies relative to the U.S. 
dollar had an unfavorable impact on the Company’s total net sales, which decreased 1% or $1.2 billion during the third quarter of 2023 compared to the same quarter in 2022. SOURCE(S): 2023 Q3 AAPL.pdf, Item 2. Management’s Discussion and Analysis of Financial Condition and Results of Operations" +"The legal proceedings disclosed in the provided 10-Q documents include the following: 1. Epic Games Lawsuit: Epic Games, Inc. filed a lawsuit against Apple Inc. in the U.S. District Court for the Northern District of California, alleging violations of federal and state antitrust laws and California's unfair competition law based on Apple's operation of its App Store. The District Court ruled in favor of Apple on most counts but found that certain provisions of Apple's App Store Review Guidelines violated California's unfair competition law and issued an injunction. The case was appealed to the U.S. Court of Appeals for the Ninth Circuit, which affirmed the District Court's ruling. Further appeals and requests for review are mentioned, including a potential appeal to the U.S. Supreme Court and the Circuit Court's stay of the injunction pending such appeal. 2. European Commission State Aid Decision: The European Commission issued a decision that Ireland granted state aid to Apple by providing tax opinions in 1991 and 2007 concerning the tax allocation of profits of the Irish branches of two Apple subsidiaries. The decision ordered Ireland to recover additional taxes from Apple for the period June 2003 through December 2014. Both Apple and Ireland appealed the decision to the General Court of the Court of Justice of the European Union, which annulled the decision. The European Commission appealed the annulment to the European Court of Justice, and a hearing was held with a decision expected in calendar year 2024. 
These legal proceedings and contingencies could potentially impact Apple in several ways: - The Epic Games lawsuit could lead to changes in Apple's App Store policies and practices, particularly regarding the prohibition of developers from including external links to purchasing mechanisms other than Apple in-app purchasing. If the injunction takes effect, it could impact Apple's App Store revenue and operating model. - The European Commission State Aid Decision could result in Apple being required to pay additional taxes for the specified period if the appeal by the European Commission is successful. Apple believes it would be eligible to claim a U.S. foreign tax credit for a portion of any incremental Irish corporate income taxes potentially due related to the decision. Both matters could have financial implications for Apple, including potential monetary liabilities and changes to business practices that could affect future revenue and profitability." +"In the "Quarterly Highlights" section of the 2023 Q1 AAPL.pdf document, it is mentioned that during the first quarter of 2023, Apple announced a new iPad, a new iPad Pro powered by the Apple M2 chip, and a new Apple TV 4K. Additionally, in the "Operating Expenses" section, it is stated that the growth in research and development (R&D) expense during the first quarter of 2023 compared to the same quarter in 2022 was driven primarily by increases in headcount-related expenses. While the document does not explicitly state a direct correlation between R&D expenditure and the introduction of new products or services, the mention of new product announcements in the same period as an increase in R&D expenses suggests that the company's investment in R&D may be related to its product development efforts." +"Apple's Q2 2023 report indicates that international markets are a significant component of the company's revenue. 
The report shows that net sales in various regions outside the United States, such as Europe, Greater China, Japan, and the Rest of Asia Pacific, collectively contribute to the company's total net sales. However, the report also highlights that the company's revenue is impacted by currency fluctuations. Specifically, it mentions that the weakness in foreign currencies relative to the U.S. dollar had an unfavorable impact on the company's total net sales, which decreased by 3% during the second quarter of 2023 compared to the same quarter in 2022. The segment on foreign currency risk within the report discusses how Apple uses derivative instruments to partially offset its business exposure to foreign exchange risk. The company may enter into forward contracts, option contracts, or other instruments to protect its gross margins from fluctuations in foreign currency exchange rates. Apple hedges portions of its forecasted foreign currency exposure associated with revenue and inventory purchases, typically for up to 12 months. Additionally, the company may enter into derivative instruments to protect its foreign currency–denominated term debt or marketable securities from fluctuations in foreign currency exchange rates. In summary, Apple's revenue from international markets is subject to foreign currency risk, and the company actively manages this exposure through the use of derivative instruments to mitigate the impact of currency fluctuations on its financial results." +"The "Liquidity and Capital Resources" section of Apple's most recent 10-Q report indicates that the company believes its balances of cash, cash equivalents, and unrestricted marketable securities, along with cash generated by ongoing operations and continued access to debt markets, will be sufficient to satisfy its cash requirements and capital return program over the next 12 months and beyond. This suggests that the company expects to maintain a strong liquidity position. 
The "CONDENSED CONSOLIDATED STATEMENTS OF CASH FLOWS" section shows that Apple's cash generated by operating activities was $88.9 billion for the nine months ended July 1, 2023. This indicates that the company's operations are generating a significant amount of cash, which supports the company's assessment in the "Liquidity and Capital Resources" section that the cash generated from operations will contribute to meeting its cash requirements. In summary, the insights from the "Liquidity and Capital Resources" section correlate with the reported changes in Apple's cash flow from operations by confirming that the company's operations continue to generate substantial cash flows, which, along with its other liquid assets, are expected to cover its financial needs." +"The reported changes in Apple's inventory levels for Q3 2023 show an increase in total inventories from $4,946 million as of September 24, 2022, to $7,351 million as of July 1, 2023. This increase is reflected in both components and finished goods, with components increasing from $1,637 million to $3,788 million and finished goods increasing from $3,309 million to $3,563 million. The discussion in the supply chain and logistics section, specifically regarding manufacturing purchase obligations, indicates that Apple has entered into noncancelable manufacturing purchase obligations primarily with its outsourcing partners who manufacture subassemblies or assemble final products for the company. As of July 1, 2023, Apple had manufacturing purchase obligations of $38.4 billion, with $38.1 billion payable within 12 months. These obligations are for the acquisition of components and building products based on demand information supplied by Apple, which typically covers periods up to 150 days. 
The increase in inventory levels could be associated with Apple's strategy to ensure the availability of components and finished products to meet anticipated customer demand, considering the lead times and obligations with its manufacturing partners. The noncancelable nature of these obligations suggests that Apple is committed to these inventory levels to maintain its supply chain operations." +"The discussion on market risk in the financial section of the latest 10-Q states that there have been no material changes to the Company’s market risk during the first nine months of 2023. This aligns with the risk factors mentioned in the company's business overview, where it is stated that there have been no material changes to the Company’s risk factors since the 2022 Form 10-K." +"Apple's Q1 2023 10-Q report provides information on the company's debt instruments and management's discussion on their debt management strategy. From the financial statements, we can see that Apple has various notes with different maturity dates and interest rates. For instance, the report lists instruments such as 1.375% Notes due 2024, 0.000% Notes due 2025, and 3.600% Notes due 2042, among others, all registered with The Nasdaq Stock Market LLC. In the management discussion section, specifically under "Liquidity and Capital Resources," it is mentioned that Apple believes its balances of cash, cash equivalents, and unrestricted marketable securities, along with cash generated by ongoing operations and continued access to debt markets, will be sufficient to satisfy its cash requirements and capital return program over the next 12 months and beyond. The report also mentions that Apple's contractual cash requirements have not changed materially since the 2022 Form 10-K, except for commercial paper and manufacturing purchase obligations. 
The report details that Apple issues unsecured short-term promissory notes ("Commercial Paper") under a commercial paper program, which is used for general corporate purposes, including dividends and share repurchases. As of December 31, 2022, Apple had $1.7 billion of Commercial Paper outstanding, all of which was payable within 12 months. Combining this information, we can infer that Apple's debt management strategy involves maintaining a mix of short-term and long-term debt instruments, ensuring sufficient liquidity through a combination of cash reserves, ongoing operations, and access to debt markets, and using instruments like commercial paper for immediate corporate needs, including shareholder returns." diff --git a/graphrag/tests/test_questions/Apple_SEQ_10/data/2022 Q3 AAPL.pdf b/graphrag/tests/test_questions/Apple_SEQ_10/data/2022 Q3 AAPL.pdf new file mode 100644 index 0000000..f8af3db Binary files /dev/null and b/graphrag/tests/test_questions/Apple_SEQ_10/data/2022 Q3 AAPL.pdf differ diff --git a/graphrag/tests/test_questions/Apple_SEQ_10/data/2023 Q1 AAPL.pdf b/graphrag/tests/test_questions/Apple_SEQ_10/data/2023 Q1 AAPL.pdf new file mode 100644 index 0000000..4620269 Binary files /dev/null and b/graphrag/tests/test_questions/Apple_SEQ_10/data/2023 Q1 AAPL.pdf differ diff --git a/graphrag/tests/test_questions/Apple_SEQ_10/data/2023 Q2 AAPL.pdf b/graphrag/tests/test_questions/Apple_SEQ_10/data/2023 Q2 AAPL.pdf new file mode 100644 index 0000000..7b8f1a6 Binary files /dev/null and b/graphrag/tests/test_questions/Apple_SEQ_10/data/2023 Q2 AAPL.pdf differ diff --git a/graphrag/tests/test_questions/Apple_SEQ_10/data/2023 Q3 AAPL.pdf b/graphrag/tests/test_questions/Apple_SEQ_10/data/2023 Q3 AAPL.pdf new file mode 100644 index 0000000..4fdfdcb Binary files /dev/null and b/graphrag/tests/test_questions/Apple_SEQ_10/data/2023 Q3 AAPL.pdf differ diff --git a/graphrag/tests/test_questions/Apple_SEQ_10/questions.csv 
b/graphrag/tests/test_questions/Apple_SEQ_10/questions.csv new file mode 100644 index 0000000..374c0e6 --- /dev/null +++ b/graphrag/tests/test_questions/Apple_SEQ_10/questions.csv @@ -0,0 +1,13 @@ +question +"How has Apple's total net sales changed over time?" +"What are the major factors contributing to the change in Apple's gross margin in the most recent 10-Q compared to the previous quarters?" +"Has there been any significant change in Apple's operating expenses over the reported quarters? If so, what are the key drivers for this change?" +"How has Apple's revenue from iPhone sales fluctuated across quarters?" +"What is the impact of foreign exchange rates on Apple's financial performance? List this out separately for each reported period." +"What legal proceedings or contingencies are disclosed in these 10-Qs and how might they potentially impact Apple?" +"In Apple's Q1 2023 10-Q, what is the correlation between Apple's R&D expenditure and the new products or services introduced by the company?" +"For Apple's Q2 2023 report, analyze the relationship between Apple's revenue from international markets and its segment on foreign currency risk to understand the company's exposure to currency fluctuations." +"From Apple's most recent 10-Q, how do the insights from the liquidity and capital resources section correlate with the reported changes in Apple's cash flow from operations?" +"In Apple's Q3 2023 10-Q, analyze how the reported changes in Apple's inventory levels relate to the discussion in their supply chain and logistics section." +"From the latest 10-Q, how does Apple's discussion on market risk in the financial section align with the risk factors mentioned in the company's business overview?" +"In Apple's Q1 2023 10-Q, combine the data on Apple's debt instruments from the financial statements with the management discussion to understand their debt management strategy." 
\ No newline at end of file diff --git a/graphrag/tests/test_questions/Systemdesign/answers.csv b/graphrag/tests/test_questions/Systemdesign/answers.csv new file mode 100644 index 0000000..795634a --- /dev/null +++ b/graphrag/tests/test_questions/Systemdesign/answers.csv @@ -0,0 +1,10 @@ +ground_truth +"Users access websites through domain names, such as api.mysite.com. The Domain Name System (DNS) is typically a paid service provided by third parties and is not hosted by application servers. When a user attempts to access a website, DNS translates the domain name into an Internet Protocol (IP) address and returns this IP address to the browser or mobile application. Once the IP address is obtained, the client sends Hypertext Transfer Protocol (HTTP) requests directly to the web server. The web server then processes the request and returns HTML pages or JSON responses for rendering. The traffic to the web server can originate from web applications or mobile applications, where web applications use server-side languages for business logic and client-side technologies for presentation, while mobile applications communicate with the server using HTTP and typically exchange data in JSON format. This sequence of resolving a domain name to an IP address and enabling communication with the web server defines the role of DNS in the overall request flow of system architecture." +"Once the IP address is obtained, Hypertext Transfer Protocol (HTTP) requests are sent directly to the web server. The web server processes these requests and returns HTML pages or JSON responses for rendering. The traffic to the web server comes from two sources: web applications and mobile applications. Web applications use a combination of server-side languages such as Java or Python to handle business logic and client-side technologies like HTML and JavaScript for presentation. 
Mobile applications communicate with the web server using HTTP, and data is typically transferred in JavaScript Object Notation (JSON) format. For example, an API request such as GET /users/12 retrieves user data in JSON format from the server." +"Vertical scaling, referred to as scale up, means the process of adding more power such as CPU and RAM to your servers. When traffic is low, vertical scaling is a great option, and the simplicity of vertical scaling is its main advantage. However, it comes with serious limitations. Vertical scaling has a hard limit because it is impossible to add unlimited CPU and memory to a single server. Vertical scaling does not have failover and redundancy, so if one server goes down, the website or application goes down with it completely." +"Horizontal scaling, also known as scale-out, allows systems to handle increased load by adding more servers to the resource pool. It is preferred for large-scale systems because it improves scalability, availability, and fault tolerance, and avoids the limitations of vertical scaling." +"A load balancer evenly distributes incoming traffic among web servers that are defined in a load-balanced set. Users connect to the public IP of the load balancer directly. With this setup, web servers are unreachable directly by clients anymore. For better security, private IPs are used for communication between servers, and the load balancer communicates with web servers through private IPs. If server 1 goes offline, all the traffic will be routed to server 2, preventing the website from going offline. If the website traffic grows and two servers are not enough to handle the traffic, more servers can be added to the pool, and the load balancer automatically starts to send requests to them." +"Database replication can be used in many database management systems, usually with a master/slave relationship between the original (master) and the copies (slaves). 
A master database generally only supports write operations, while a slave database gets copies of the data from the master database and only supports read operations. All data-modifying commands such as insert, delete, or update must be sent to the master database. Most applications require a much higher ratio of reads to writes, so the number of slave databases is usually larger than the number of master databases. This model improves performance because read operations are distributed across slave nodes and queries can be processed in parallel. It improves reliability because data is replicated across multiple database servers and remains preserved even if one server is destroyed. It also improves high availability because the system can continue operating even if a database server is offline. If a slave database goes offline, read operations are redirected to other available databases or temporarily to the master database. If the master database goes offline, a slave database can be promoted to become the new master, and operations continue on the new master." +"A cache is a temporary storage area that stores the result of expensive responses or frequently accessed data in memory so that subsequent requests are served more quickly. Every time a new web page loads, one or more database calls are executed to fetch data, and calling the database repeatedly affects application performance. The cache mitigates this problem by storing data in memory. The cache tier is a temporary data store layer that is much faster than the database and can reduce database workloads while improving system performance. When a request is received, the web server first checks if the cache contains the required data. If the data is available, it is returned immediately. If not, the server queries the database, stores the response in the cache, and returns it to the client. This is known as a read-through cache strategy. 
Cache is useful when data is read frequently but modified infrequently. It is important to define an expiration policy so that cached data is removed after a certain time and does not become stale. Consistency between the cache and the data store must also be maintained because updates to the database and cache may not occur in a single transaction." +"A Content Delivery Network (CDN) is a network of geographically dispersed servers used to deliver static content such as images, videos, CSS, and JavaScript files. When a user visits a website, a CDN server closest to the user delivers the static content, and the further users are from CDN servers, the slower the website loads. When a user requests a resource, if the CDN server does not have the content cached, it requests the file from the origin server, which can be a web server or storage system. The origin server returns the content along with a time-to-live (TTL) value that specifies how long the content should be cached. The CDN caches the content and returns it to the user. Subsequent requests for the same content are served directly from the CDN cache until the TTL expires. CDNs are run by third-party providers and incur costs based on data transfer. It is important to set an appropriate cache expiry time so that content is not too stale or frequently reloaded. Systems should also handle CDN failures by falling back to the origin server. Cached content can be invalidated before expiration using APIs or versioning techniques." +"A stateful server remembers client data from one request to the next, while a stateless server keeps no state information. In a stateful architecture, user session data is stored in specific servers, and requests from the same client must always be routed to the same server. This requires mechanisms such as sticky sessions and makes it difficult to add or remove servers and handle failures. In a stateless web tier, servers do not store client state information such as session data. 
Instead, state data is stored in a shared data store such as a database, cache, or NoSQL system. HTTP requests from users can be sent to any web server, and each server retrieves the required state data from the shared data store. This makes the system simpler, more robust, and more scalable. Auto-scaling is easily achieved because servers can be added or removed based on traffic load without affecting user sessions." diff --git a/graphrag/tests/test_questions/Systemdesign/data/SystemDesignInterview.pdf b/graphrag/tests/test_questions/Systemdesign/data/SystemDesignInterview.pdf new file mode 100644 index 0000000..81b0cfa Binary files /dev/null and b/graphrag/tests/test_questions/Systemdesign/data/SystemDesignInterview.pdf differ diff --git a/graphrag/tests/test_questions/Systemdesign/questions.csv b/graphrag/tests/test_questions/Systemdesign/questions.csv new file mode 100644 index 0000000..4d73b49 --- /dev/null +++ b/graphrag/tests/test_questions/Systemdesign/questions.csv @@ -0,0 +1,10 @@ +question +What is the role of DNS in system architecture? +What happens after a client obtains the IP address? +What is vertical scaling and what are its limitations? +What is horizontal scaling and why is it preferred? +What is the purpose of a load balancer? +What is database replication and why is it used? +What is caching and how does it improve system performance? +What is a Content Delivery Network (CDN) and how does it work? +What is a stateless web tier and why is it important?