tigergraph · prinskumar-tigergraph · Jun 15, 2026 · Jun 16, 2026
diff --git a/common/requirements.txt b/common/requirements.txt
@@ -140,7 +140,7 @@ python-multipart==0.0.20
 python-iso639==2025.2.18
 python-magic==0.4.27
 pyTigerDriver==1.0.15
-pyTigerGraph>=2.0.3
+pyTigerGraph>=2.0.4
 pytz==2025.2
 PyYAML==6.0.2
 rapidfuzz==3.13.0

diff --git a/docker-compose.yml b/docker-compose.yml
@@ -18,6 +18,8 @@ services:
       USE_CYPHER: "true"
     volumes:
       - ./configs/:/code/configs
+      - ./graphrag/tests/regression:/code/tests/regression
+      - ./graphrag/tests/test_questions:/code/tests/test_questions
 
   graphrag-ecc:
     image: tigergraph/graphrag-ecc:latest

diff --git a/graphrag/app/agent/agent_hallucination_check.py b/graphrag/app/agent/agent_hallucination_check.py
@@ -8,45 +8,117 @@
 
 logger = logging.getLogger(__name__)
 
+
 class HallucinationCheckResponse(BaseModel):
-    score: str = Field(description="The score of the hallucination check. Either 'yes' or 'no', indicating if the answer is hallucinated.")
+    # LLM verdict schema, parsed by PydanticOutputParser in check_hallucination.
+    # ge/le reject off-scale scores (e.g. 85) instead of passing them through.
+    confidence: float = Field(
+        ge=0.0,
+        le=1.0,
+        description="Confidence score between 0.0 and 1.0 that the answer is hallucinated. "
+        "0.0 = fully grounded in the context, 1.0 = completely hallucinated. "
+        "Use the full range — e.g. 0.2 for mostly grounded with minor gaps, "
+        "0.8 for mostly unsupported claims.",
+    )
+    reason: str = Field(
+        description="A concise one-to-two sentence explanation of why you assigned this "
+        "confidence score, citing specific claims from the answer and context.",
+    )
+
 
 class TigerGraphAgentHallucinationCheck:
     def __init__(self, llm_model):
         self.llm = llm_model
 
-    def check_hallucination(self, generation: str, context: str) -> dict:
-        """Check if the answer is hallucinated based on the question and context.
+    def check_hallucination(
+        self,
+        generation: str,
+        context: str,
+        question: str = "",
+    ) -> HallucinationCheckResponse:
+        """Score how likely the generated answer is hallucinated.
+
         Args:
-            question: str: The question to generate an answer for.
-            context: str: The context to generate an answer from.
+            generation: The answer produced by the RAG pipeline.
+            context:    The retrieved context chunks passed to the LLM.
+            question:   The original user question (improves grounding judgement).
+
         Returns:
-            dict: The answer to the question and a boolean indicating if the answer is hallucinated.
+            HallucinationCheckResponse with confidence (0–1) and reason.
         """
         LogWriter.info(f"request_id={req_id_cv.get()} ENTRY check_hallucination")
-
-        hallucination_parser = PydanticOutputParser(pydantic_object=HallucinationCheckResponse)        
+
+        hallucination_parser = PydanticOutputParser(
+            pydantic_object=HallucinationCheckResponse
+        )
+
+        question_block = (
+            f"\nORIGINAL QUESTION:\n{question}\n" if question.strip() else ""
+        )
 
         prompt = PromptTemplate(
-            template="""You are a grader assessing whether an answer is grounded in / supported by a set of facts. \n
-            Here are the facts:
-            \n ------- \n
-            {context} 
-            \n ------- \n
-            Here is the answer: {generation}
-            Provide a binary score 'yes' or 'no' score to indicate whether the answer is grounded in / supported by a set of facts.
-            The score should be 'yes' if the information in the answer is found in the context. otherwise, the score should be 'no'. 
-            Provide the binary score as a JSON with a single key 'score' and no preamble or explanation.
-            Format: {format_instructions}""",
-            input_variables=["generation", "context"],
+            template="""You are an expert grader evaluating whether a GraphRAG \
+pipeline answer is grounded in the retrieved context passages.
+
+GraphRAG answers are SYNTHESIZED — they combine, rephrase, and connect \
+information from multiple passages. Reasonable synthesis and inference are \
+acceptable. However, specific facts (numbers, percentages, names, dates) \
+must be correctly attributed — using a figure from context but applying it \
+to the wrong category or time period IS a hallucination.
+{question_block}
+RETRIEVED CONTEXT PASSAGES:
+-------
+{context}
+-------
+
+ANSWER TO EVALUATE:
+{generation}
+
+SCORING GUIDE — confidence that the answer is hallucinated (0.0 = fully \
+grounded, 1.0 = fully hallucinated):
+
+0.0–0.2  Fully grounded: all key claims are directly stated in or correctly \
+inferred from the context. Paraphrasing and synthesis are expected.
+
+0.2–0.4  Mostly grounded: core ideas are supported; minor details extend \
+beyond the context but are logically consistent and do not contradict anything.
+
+0.4–0.6  Partially grounded: some claims are only loosely supported OR \
+specific facts are misattributed (e.g. a correct number applied to the wrong \
+category, time period, or entity).
+
+0.6–0.8  Mostly hallucinated: several key claims are unsupported or specific \
+facts clearly contradict the context.
+
+0.8–1.0  Fully hallucinated: the answer is largely unrelated to the context \
+or fabricates significant facts.
+
+RULES:
+- Accept synthesis and logical combination across multiple passages.
+- Accept domain knowledge that logically follows from the context topic.
+- Context may be incomplete. Judge consistency, not exhaustiveness.
+- Use semantic alignment, NOT keyword matching.
+- Ignore references to diagrams, figures, or images — context is text only.
+- PENALISE when: (1) a specific number/percentage/date is used but applied \
+to the wrong subject; (2) a claim directly contradicts the context; OR \
+(3) the answer introduces topics entirely unrelated to the context.
+
+Provide your verdict as JSON.
+Format: {format_instructions}""",
+            input_variables=["generation", "context", "question_block"],
             partial_variables={
                 "format_instructions": hallucination_parser.get_format_instructions()
-            }
+            },
         )
 
         prediction = self.llm.invoke_with_parser(
-            prompt, hallucination_parser,
-            {"context": context, "generation": generation},
+            prompt,
+            hallucination_parser,
+            {
+                "context":        context,
+                "generation":     generation,
+                "question_block": question_block,
+            },
             caller_name="check_hallucination",
         )
         LogWriter.info(f"request_id={req_id_cv.get()} EXIT check_hallucination")