From eb8ef8afb3e3155a63f5e6ff87c4f30bfb7bd89d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Louis-F=C3=A9lix=20Nothias?= <louis-felix.nothias@cnrs.fr>
Date: Tue, 5 May 2026 16:27:34 +0200
Subject: [PATCH 1/6] fix: update LLM IDs in params.ini and refactor evaluation
 logic in main.py

---
 app/config/params.ini        |  12 +-
 app/core/main.py             |  19 +--
 app/core/tests/evaluation.py | 219 +++++++++++++++++++----------------
 app/core/tests/test_main.py  |  14 ++-
 4 files changed, 144 insertions(+), 120 deletions(-)

diff --git a/app/config/params.ini b/app/config/params.ini
index 8812aae..d01f5cb 100644
--- a/app/config/params.ini
+++ b/app/config/params.ini
@@ -1,10 +1,10 @@
 [llm_preview]
-id = gpt-5.4
+id = gpt-5.5
 temperature = 0.3
 max_retries = 3
 
 [llm_o]
-id = gpt-5.4
+id = gpt-5.5
 temperature = 0
 max_retries = 3
 
@@ -15,8 +15,8 @@ temperature = 0
 max_retries = 3
 
 [llm_o3_mini]
-id = o3-mini-2025-01-31
-temperature = 1
+id = gpt-5.4-mini
+temperature = 0
 max_retries = 3
 
 [llm_gpt_5]
@@ -48,7 +48,7 @@ max_retries = 3
 base_url = https://llama-3-1-70b-instruct.endpoints.kepler.ai.cloud.ovh.net/api/openai_compat/v1
 
 [llm_litellm_openai]
-id = gpt-5.4
+id = gpt-5.5
 temperature = 0
 
 
@@ -63,4 +63,4 @@ id = claude-3-opus-20240229
 id = gemini/gemini-1.5-pro
 
 [llm_litellm_mistral]
-id = mistral/mistral-small-latest
\ No newline at end of file
+id = mistral/mistral-small-latest
diff --git a/app/core/main.py b/app/core/main.py
index 4293b2b..4acb783 100644
--- a/app/core/main.py
+++ b/app/core/main.py
@@ -334,6 +334,16 @@ def main():
     global logger
     logger = setup_logger(__name__)
 
+    # Stage user-provided files into the session's input directory before
+    # initializing external services or language model clients.
+    if args.file:
+        try:
+            _prepare_session_files(session_id, args.file)
+        except SessionFilePreparationError as exc:
+            logger.error(str(exc))
+            print(f"Error: {exc}")
+            return
+
     # Initialize LangSmith if available
     langsmith_setup()
 
@@ -347,15 +357,6 @@ def main():
     # Initialize language models
     models = llm_creation(api_key=args.api_key)
 
-    # Stage user-provided files into the session's input directory
-    if args.file:
-        try:
-            _prepare_session_files(session_id, args.file)
-        except SessionFilePreparationError as exc:
-            logger.error(str(exc))
-            print(f"Error: {exc}")
-            return
-
     try:
         workflow = create_workflow(
             models=models,
diff --git a/app/core/tests/evaluation.py b/app/core/tests/evaluation.py
index ccd171e..d6c6c25 100644
--- a/app/core/tests/evaluation.py
+++ b/app/core/tests/evaluation.py
@@ -6,141 +6,154 @@
 from dotenv import load_dotenv
 
 from langsmith import Client
+from langsmith.utils import LangSmithNotFoundError
 from langchain_core.messages import HumanMessage
 from langsmith.evaluation import EvaluationResult, run_evaluator
 from langchain.evaluation import EvaluatorType, load_evaluator
 from langchain.smith import run_on_dataset, RunEvalConfig
 
 from app.core.workflow.langraph_workflow import create_workflow
-from app.core.main import llm_creation
 from app.core.utils import setup_logger
 
-# 1. Load environment variables
-load_dotenv()
-logger = setup_logger(__name__)
-
-# 2. Check for required API keys safely
-api_key = os.getenv("LANGCHAIN_API_KEY") or os.environ.get("LANGSMITH_API_KEY")
-openai_key = os.getenv("OPENAI_API_KEY")
-
-if not api_key:
-    raise ValueError("Missing LANGCHAIN_API_KEY (or LANGSMITH_API_KEY). Please add it to your repo-root .env (see docs/user-guide/configuration.md).")
-if not openai_key:
-    raise ValueError("Missing OPENAI_API_KEY. Please copy .env.template to .env and add your key.")
-
-# Set environment variables for LangSmith
-os.environ["LANGCHAIN_TRACING_V2"] = "true"
-os.environ["LANGCHAIN_PROJECT"] = os.environ.get("LANGCHAIN_PROJECT", "MetaboT evaluation")
-os.environ["LANGCHAIN_ENDPOINT"] = os.environ.get("LANGCHAIN_ENDPOINT", "https://api.smith.langchain.com")
-
 current_dir = os.path.dirname(os.path.abspath(__file__))
 core_dir = os.path.dirname(current_dir)
 app_dir = os.path.dirname(core_dir)
 local_data_path = os.path.join(app_dir, "data", "big_benchmark.csv")
-client = Client(api_key=api_key)
-
 dataset_name = "test_metabot"
-# Check if the dataset exists in the current user's workspace
-try:
-    client.read_dataset(dataset_name=dataset_name)
-    logger.info(f"Dataset '{dataset_name}' found in LangSmith workspace.")
-except Exception:
-    logger.info(f"Dataset '{dataset_name}' not found. Attempting to create it from local file...")
 
+# Custom criteria for SPARQL query evaluation
+custom_criteria = {
+    "structural similarity of SPARQL queries":
+        "How similar is the structure of the generated SPARQL query to the reference SPARQL query? Does the generated query correctly match subjects to their corresponding objects as in the reference query"
+}
 
-    if not os.path.exists(local_data_path):
-        raise FileNotFoundError(
-            f"Could not find '{local_data_path}'. Ensure the local dataset file is shared alongside this script.")
 
+def create_evaluate_result(app):
+    """Evaluate the result based on input."""
 
-    # Load local data and create the dataset in the user's LangSmith account
-    df = pd.read_csv(local_data_path)
-    dataset = client.create_dataset(dataset_name=dataset_name, description="MetaboT  Benchmark")
+    def evaluate_result(_input, thread_id: int = 1):
+        # Note: Adjust the 'question' key below if your dataset uses a different input key
+        input_text = _input.get("question") or _input["messages"][0]["content"]
 
-    inputs = []
-    outputs = []
+        message = {
+            "messages": [HumanMessage(content=input_text)]
+        }
 
-    for _, row in df.iterrows():
-        try:
-            # The CSV stores the JSON arrays as strings, so we need to parse them back into Python objects
-            parsed_messages = json.loads(row["messages"])
-            parsed_end_state = json.loads(row["__end__"])
+        response = app.invoke(message, {"configurable": {"thread_id": thread_id}})
+        return {"output": response}
 
-            inputs.append({"messages": parsed_messages})
-            outputs.append({"__end__": parsed_end_state})
+    return evaluate_result
 
-        except json.JSONDecodeError as e:
-            logger.warning(f"Skipping a row due to JSON parsing error: {e}")
-            continue
 
-    client.create_examples(inputs=inputs, outputs=outputs, dataset_id=dataset.id)
-    logger.info(f"Successfully created dataset '{dataset_name}' in LangSmith.")
-# Custom criteria for SPARQL query evaluation
-custom_criteria = {
-    "structural similarity of SPARQL queries":
-        "How similar is the structure of the generated SPARQL query to the reference SPARQL query? Does the generated query correctly match subjects to their corresponding objects as in the reference query"
-}
+def main():
+    # 1. Load environment variables
+    load_dotenv()
+    logger = setup_logger(__name__)
 
-eval_chain_new = load_evaluator(EvaluatorType.LABELED_CRITERIA, criteria=custom_criteria)
+    # 2. Check for required API keys safely
+    api_key = os.getenv("LANGCHAIN_API_KEY") or os.environ.get("LANGSMITH_API_KEY")
+    openai_key = os.getenv("OPENAI_API_KEY")
 
-# Define the evaluation configuration
-evaluation_config = RunEvalConfig(
-    evaluators=[
-        EvaluatorType.QA,
-        RunEvalConfig.LabeledScoreString(
-            {
-                "accuracy": """
-Score 1: The answer is completely unrelated to the reference.
-Score 3: The answer has minor relevance but does not align with the reference.
-Score 5: The answer has moderate relevance but contains inaccuracies.
-Score 7: The answer aligns with the reference but has minor errors or omissions.
-Score 10: The answer is completely accurate and aligns perfectly with the reference."""
-            },
-            normalize_by=10,
-        ),
-    ],
-    custom_evaluators=[eval_chain_new],
-)
+    if not api_key:
+        raise ValueError("Missing LANGCHAIN_API_KEY (or LANGSMITH_API_KEY). Please add it to your repo-root .env (see docs/user-guide/configuration.md).")
+    if not openai_key:
+        raise ValueError("Missing OPENAI_API_KEY. Please copy .env.template to .env and add your key.")
 
-endpoint_url = os.environ.get("KG_ENDPOINT_URL", "https://enpkg.commons-lab.org/graphdb/repositories/ENPKG")
+    # Set environment variables for LangSmith
+    os.environ["LANGCHAIN_TRACING_V2"] = "true"
+    os.environ["LANGCHAIN_PROJECT"] = os.environ.get("LANGCHAIN_PROJECT", "MetaboT evaluation")
+    os.environ["LANGCHAIN_ENDPOINT"] = os.environ.get("LANGCHAIN_ENDPOINT", "https://api.smith.langchain.com")
 
-models = llm_creation()
+    client = Client(api_key=api_key)
 
-# Create workflow in evaluation mode
-app = create_workflow(
-    models=models,
-    endpoint_url=endpoint_url,
-    evaluation=True,
-    api_key=openai_key
-)
+    # Check if the dataset exists in the current user's workspace
+    try:
+        client.read_dataset(dataset_name=dataset_name)
+        logger.info(f"Dataset '{dataset_name}' found in LangSmith workspace.")
+    except LangSmithNotFoundError:
+        logger.info(f"Dataset '{dataset_name}' not found. Attempting to create it from local file...")
 
+        if not os.path.exists(local_data_path):
+            raise FileNotFoundError(
+                f"Could not find '{local_data_path}'. Ensure the local dataset file is shared alongside this script."
+            ) from None
 
-def evaluate_result(_input, thread_id: int = 1):
-    """Evaluate the result based on input."""
-    # Note: Adjust the 'question' key below if your dataset uses a different input key
-    input_text = _input.get("question") or _input["messages"][0]["content"]
+        # Load local data and create the dataset in the user's LangSmith account
+        df = pd.read_csv(local_data_path)
+        dataset = client.create_dataset(dataset_name=dataset_name, description="MetaboT  Benchmark")
+
+        inputs = []
+        outputs = []
 
-    message = {
-        "messages": [HumanMessage(content=input_text)]
-    }
+        for _, row in df.iterrows():
+            try:
+                # The CSV stores the JSON arrays as strings, so we need to parse them back into Python objects
+                parsed_messages = json.loads(row["messages"])
+                parsed_end_state = json.loads(row["__end__"])
 
-    response = app.invoke(message, {"configurable": {"thread_id": thread_id}})
-    return {"output": response}
+                inputs.append({"messages": parsed_messages})
+                outputs.append({"__end__": parsed_end_state})
 
+            except json.JSONDecodeError as e:
+                logger.warning(f"Skipping a row due to JSON parsing error: {e}")
+                continue
 
-unique_id = uuid4().hex[0:8]
+        client.create_examples(inputs=inputs, outputs=outputs, dataset_id=dataset.id)
+        logger.info(f"Successfully created dataset '{dataset_name}' in LangSmith.")
 
-# Run evaluation
-chain_results = run_on_dataset(
-    dataset_name=dataset_name,
-    llm_or_chain_factory=evaluate_result,
-    evaluation=evaluation_config,
-    verbose=True,
-    project_name=f"Testing the app-{unique_id}",
-    client=client,
-    project_metadata={
-        "model": "gpt-4o",
-    },
-)
+    eval_chain_new = load_evaluator(EvaluatorType.LABELED_CRITERIA, criteria=custom_criteria)
 
-logger.info(f"Evaluation complete. View results in LangSmith under project: Testing the app-{unique_id}")
\ No newline at end of file
+    # Define the evaluation configuration
+    evaluation_config = RunEvalConfig(
+        evaluators=[
+            EvaluatorType.QA,
+            RunEvalConfig.LabeledScoreString(
+                {
+                    "accuracy": """
+Score 1: The answer is completely unrelated to the reference.
+Score 3: The answer has minor relevance but does not align with the reference.
+Score 5: The answer has moderate relevance but contains inaccuracies.
+Score 7: The answer aligns with the reference but has minor errors or omissions.
+Score 10: The answer is completely accurate and aligns perfectly with the reference."""
+                },
+                normalize_by=10,
+            ),
+        ],
+        custom_evaluators=[eval_chain_new],
+    )
+
+    endpoint_url = os.environ.get("KG_ENDPOINT_URL", "https://enpkg.commons-lab.org/graphdb/repositories/ENPKG")
+
+    from app.core.main import llm_creation
+
+    models = llm_creation()
+
+    # Create workflow in evaluation mode
+    app = create_workflow(
+        models=models,
+        endpoint_url=endpoint_url,
+        evaluation=True,
+        api_key=openai_key
+    )
+
+    evaluate_result = create_evaluate_result(app)
+    unique_id = uuid4().hex[0:8]
+
+    # Run evaluation
+    run_on_dataset(
+        dataset_name=dataset_name,
+        llm_or_chain_factory=evaluate_result,
+        evaluation=evaluation_config,
+        verbose=True,
+        project_name=f"Testing the app-{unique_id}",
+        client=client,
+        project_metadata={
+            "model": "gpt-4o",
+        },
+    )
+
+    logger.info(f"Evaluation complete. View results in LangSmith under project: Testing the app-{unique_id}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/app/core/tests/test_main.py b/app/core/tests/test_main.py
index 4af8b90..5529288 100644
--- a/app/core/tests/test_main.py
+++ b/app/core/tests/test_main.py
@@ -138,8 +138,18 @@ def test_main_prints_user_friendly_error_for_bad_staged_file(monkeypatch, capsys
     monkeypatch.setattr(main_module, "setup_logger", lambda name: DummyLogger())
     monkeypatch.setattr(main_module, "initialize_session_context", lambda session_id: None)
     monkeypatch.setattr(main_module, "create_user_session", lambda session_id=None, user_session_dir=False, input_dir=False: "session-123")
-    monkeypatch.setattr(main_module, "langsmith_setup", lambda: None)
-    monkeypatch.setattr(main_module, "llm_creation", lambda api_key=None, params_file=None: {"llm_o": object()})
+    monkeypatch.setattr(
+        main_module,
+        "langsmith_setup",
+        lambda: (_ for _ in ()).throw(AssertionError("langsmith should not run")),
+    )
+    monkeypatch.setattr(
+        main_module,
+        "llm_creation",
+        lambda api_key=None, params_file=None: (_ for _ in ()).throw(
+            AssertionError("llm creation should not run")
+        ),
+    )
     monkeypatch.setattr(
         main_module,
         "_prepare_session_files",

From 531b0925892f26fc2dc3bcb70d7c20c3d1c3a735 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Louis-F=C3=A9lix=20Nothias?= <louis-felix.nothias@cnrs.fr>
Date: Tue, 5 May 2026 16:40:35 +0200
Subject: [PATCH 2/6] Revert "fix: update LLM IDs in params.ini and refactor
 evaluation logic in main.py"

This reverts commit eb8ef8afb3e3155a63f5e6ff87c4f30bfb7bd89d.
---
 app/config/params.ini        |  12 +-
 app/core/main.py             |  19 ++-
 app/core/tests/evaluation.py | 219 ++++++++++++++++-------------------
 app/core/tests/test_main.py  |  14 +--
 4 files changed, 120 insertions(+), 144 deletions(-)

diff --git a/app/config/params.ini b/app/config/params.ini
index d01f5cb..8812aae 100644
--- a/app/config/params.ini
+++ b/app/config/params.ini
@@ -1,10 +1,10 @@
 [llm_preview]
-id = gpt-5.5
+id = gpt-5.4
 temperature = 0.3
 max_retries = 3
 
 [llm_o]
-id = gpt-5.5
+id = gpt-5.4
 temperature = 0
 max_retries = 3
 
@@ -15,8 +15,8 @@ temperature = 0
 max_retries = 3
 
 [llm_o3_mini]
-id = gpt-5.4-mini
-temperature = 0
+id = o3-mini-2025-01-31
+temperature = 1
 max_retries = 3
 
 [llm_gpt_5]
@@ -48,7 +48,7 @@ max_retries = 3
 base_url = https://llama-3-1-70b-instruct.endpoints.kepler.ai.cloud.ovh.net/api/openai_compat/v1
 
 [llm_litellm_openai]
-id = gpt-5.5
+id = gpt-5.4
 temperature = 0
 
 
@@ -63,4 +63,4 @@ id = claude-3-opus-20240229
 id = gemini/gemini-1.5-pro
 
 [llm_litellm_mistral]
-id = mistral/mistral-small-latest
+id = mistral/mistral-small-latest
\ No newline at end of file
diff --git a/app/core/main.py b/app/core/main.py
index 4acb783..4293b2b 100644
--- a/app/core/main.py
+++ b/app/core/main.py
@@ -334,16 +334,6 @@ def main():
     global logger
     logger = setup_logger(__name__)
 
-    # Stage user-provided files into the session's input directory before
-    # initializing external services or language model clients.
-    if args.file:
-        try:
-            _prepare_session_files(session_id, args.file)
-        except SessionFilePreparationError as exc:
-            logger.error(str(exc))
-            print(f"Error: {exc}")
-            return
-
     # Initialize LangSmith if available
     langsmith_setup()
 
@@ -357,6 +347,15 @@ def main():
     # Initialize language models
     models = llm_creation(api_key=args.api_key)
 
+    # Stage user-provided files into the session's input directory
+    if args.file:
+        try:
+            _prepare_session_files(session_id, args.file)
+        except SessionFilePreparationError as exc:
+            logger.error(str(exc))
+            print(f"Error: {exc}")
+            return
+
     try:
         workflow = create_workflow(
             models=models,
diff --git a/app/core/tests/evaluation.py b/app/core/tests/evaluation.py
index d6c6c25..ccd171e 100644
--- a/app/core/tests/evaluation.py
+++ b/app/core/tests/evaluation.py
@@ -6,154 +6,141 @@
 from dotenv import load_dotenv
 
 from langsmith import Client
-from langsmith.utils import LangSmithNotFoundError
 from langchain_core.messages import HumanMessage
 from langsmith.evaluation import EvaluationResult, run_evaluator
 from langchain.evaluation import EvaluatorType, load_evaluator
 from langchain.smith import run_on_dataset, RunEvalConfig
 
 from app.core.workflow.langraph_workflow import create_workflow
+from app.core.main import llm_creation
 from app.core.utils import setup_logger
 
+# 1. Load environment variables
+load_dotenv()
+logger = setup_logger(__name__)
+
+# 2. Check for required API keys safely
+api_key = os.getenv("LANGCHAIN_API_KEY") or os.environ.get("LANGSMITH_API_KEY")
+openai_key = os.getenv("OPENAI_API_KEY")
+
+if not api_key:
+    raise ValueError("Missing LANGCHAIN_API_KEY (or LANGSMITH_API_KEY). Please add it to your repo-root .env (see docs/user-guide/configuration.md).")
+if not openai_key:
+    raise ValueError("Missing OPENAI_API_KEY. Please copy .env.template to .env and add your key.")
+
+# Set environment variables for LangSmith
+os.environ["LANGCHAIN_TRACING_V2"] = "true"
+os.environ["LANGCHAIN_PROJECT"] = os.environ.get("LANGCHAIN_PROJECT", "MetaboT evaluation")
+os.environ["LANGCHAIN_ENDPOINT"] = os.environ.get("LANGCHAIN_ENDPOINT", "https://api.smith.langchain.com")
+
 current_dir = os.path.dirname(os.path.abspath(__file__))
 core_dir = os.path.dirname(current_dir)
 app_dir = os.path.dirname(core_dir)
 local_data_path = os.path.join(app_dir, "data", "big_benchmark.csv")
-dataset_name = "test_metabot"
+client = Client(api_key=api_key)
 
-# Custom criteria for SPARQL query evaluation
-custom_criteria = {
-    "structural similarity of SPARQL queries":
-        "How similar is the structure of the generated SPARQL query to the reference SPARQL query? Does the generated query correctly match subjects to their corresponding objects as in the reference query"
-}
+dataset_name = "test_metabot"
+# Check if the dataset exists in the current user's workspace
+try:
+    client.read_dataset(dataset_name=dataset_name)
+    logger.info(f"Dataset '{dataset_name}' found in LangSmith workspace.")
+except Exception:
+    logger.info(f"Dataset '{dataset_name}' not found. Attempting to create it from local file...")
 
 
-def create_evaluate_result(app):
-    """Evaluate the result based on input."""
+    if not os.path.exists(local_data_path):
+        raise FileNotFoundError(
+            f"Could not find '{local_data_path}'. Ensure the local dataset file is shared alongside this script.")
 
-    def evaluate_result(_input, thread_id: int = 1):
-        # Note: Adjust the 'question' key below if your dataset uses a different input key
-        input_text = _input.get("question") or _input["messages"][0]["content"]
 
-        message = {
-            "messages": [HumanMessage(content=input_text)]
-        }
+    # Load local data and create the dataset in the user's LangSmith account
+    df = pd.read_csv(local_data_path)
+    dataset = client.create_dataset(dataset_name=dataset_name, description="MetaboT  Benchmark")
 
-        response = app.invoke(message, {"configurable": {"thread_id": thread_id}})
-        return {"output": response}
+    inputs = []
+    outputs = []
 
-    return evaluate_result
+    for _, row in df.iterrows():
+        try:
+            # The CSV stores the JSON arrays as strings, so we need to parse them back into Python objects
+            parsed_messages = json.loads(row["messages"])
+            parsed_end_state = json.loads(row["__end__"])
 
+            inputs.append({"messages": parsed_messages})
+            outputs.append({"__end__": parsed_end_state})
 
-def main():
-    # 1. Load environment variables
-    load_dotenv()
-    logger = setup_logger(__name__)
+        except json.JSONDecodeError as e:
+            logger.warning(f"Skipping a row due to JSON parsing error: {e}")
+            continue
 
-    # 2. Check for required API keys safely
-    api_key = os.getenv("LANGCHAIN_API_KEY") or os.environ.get("LANGSMITH_API_KEY")
-    openai_key = os.getenv("OPENAI_API_KEY")
+    client.create_examples(inputs=inputs, outputs=outputs, dataset_id=dataset.id)
+    logger.info(f"Successfully created dataset '{dataset_name}' in LangSmith.")
+# Custom criteria for SPARQL query evaluation
+custom_criteria = {
+    "structural similarity of SPARQL queries":
+        "How similar is the structure of the generated SPARQL query to the reference SPARQL query? Does the generated query correctly match subjects to their corresponding objects as in the reference query"
+}
 
-    if not api_key:
-        raise ValueError("Missing LANGCHAIN_API_KEY (or LANGSMITH_API_KEY). Please add it to your repo-root .env (see docs/user-guide/configuration.md).")
-    if not openai_key:
-        raise ValueError("Missing OPENAI_API_KEY. Please copy .env.template to .env and add your key.")
+eval_chain_new = load_evaluator(EvaluatorType.LABELED_CRITERIA, criteria=custom_criteria)
 
-    # Set environment variables for LangSmith
-    os.environ["LANGCHAIN_TRACING_V2"] = "true"
-    os.environ["LANGCHAIN_PROJECT"] = os.environ.get("LANGCHAIN_PROJECT", "MetaboT evaluation")
-    os.environ["LANGCHAIN_ENDPOINT"] = os.environ.get("LANGCHAIN_ENDPOINT", "https://api.smith.langchain.com")
+# Define the evaluation configuration
+evaluation_config = RunEvalConfig(
+    evaluators=[
+        EvaluatorType.QA,
+        RunEvalConfig.LabeledScoreString(
+            {
+                "accuracy": """
+Score 1: The answer is completely unrelated to the reference.
+Score 3: The answer has minor relevance but does not align with the reference.
+Score 5: The answer has moderate relevance but contains inaccuracies.
+Score 7: The answer aligns with the reference but has minor errors or omissions.
+Score 10: The answer is completely accurate and aligns perfectly with the reference."""
+            },
+            normalize_by=10,
+        ),
+    ],
+    custom_evaluators=[eval_chain_new],
+)
 
-    client = Client(api_key=api_key)
+endpoint_url = os.environ.get("KG_ENDPOINT_URL", "https://enpkg.commons-lab.org/graphdb/repositories/ENPKG")
 
-    # Check if the dataset exists in the current user's workspace
-    try:
-        client.read_dataset(dataset_name=dataset_name)
-        logger.info(f"Dataset '{dataset_name}' found in LangSmith workspace.")
-    except LangSmithNotFoundError:
-        logger.info(f"Dataset '{dataset_name}' not found. Attempting to create it from local file...")
+models = llm_creation()
 
-        if not os.path.exists(local_data_path):
-            raise FileNotFoundError(
-                f"Could not find '{local_data_path}'. Ensure the local dataset file is shared alongside this script."
-            ) from None
+# Create workflow in evaluation mode
+app = create_workflow(
+    models=models,
+    endpoint_url=endpoint_url,
+    evaluation=True,
+    api_key=openai_key
+)
 
-        # Load local data and create the dataset in the user's LangSmith account
-        df = pd.read_csv(local_data_path)
-        dataset = client.create_dataset(dataset_name=dataset_name, description="MetaboT  Benchmark")
 
-        inputs = []
-        outputs = []
+def evaluate_result(_input, thread_id: int = 1):
+    """Evaluate the result based on input."""
+    # Note: Adjust the 'question' key below if your dataset uses a different input key
+    input_text = _input.get("question") or _input["messages"][0]["content"]
 
-        for _, row in df.iterrows():
-            try:
-                # The CSV stores the JSON arrays as strings, so we need to parse them back into Python objects
-                parsed_messages = json.loads(row["messages"])
-                parsed_end_state = json.loads(row["__end__"])
+    message = {
+        "messages": [HumanMessage(content=input_text)]
+    }
 
-                inputs.append({"messages": parsed_messages})
-                outputs.append({"__end__": parsed_end_state})
+    response = app.invoke(message, {"configurable": {"thread_id": thread_id}})
+    return {"output": response}
 
-            except json.JSONDecodeError as e:
-                logger.warning(f"Skipping a row due to JSON parsing error: {e}")
-                continue
 
-        client.create_examples(inputs=inputs, outputs=outputs, dataset_id=dataset.id)
-        logger.info(f"Successfully created dataset '{dataset_name}' in LangSmith.")
+unique_id = uuid4().hex[0:8]
 
-    eval_chain_new = load_evaluator(EvaluatorType.LABELED_CRITERIA, criteria=custom_criteria)
+# Run evaluation
+chain_results = run_on_dataset(
+    dataset_name=dataset_name,
+    llm_or_chain_factory=evaluate_result,
+    evaluation=evaluation_config,
+    verbose=True,
+    project_name=f"Testing the app-{unique_id}",
+    client=client,
+    project_metadata={
+        "model": "gpt-4o",
+    },
+)
 
-    # Define the evaluation configuration
-    evaluation_config = RunEvalConfig(
-        evaluators=[
-            EvaluatorType.QA,
-            RunEvalConfig.LabeledScoreString(
-                {
-                    "accuracy": """
-Score 1: The answer is completely unrelated to the reference.
-Score 3: The answer has minor relevance but does not align with the reference.
-Score 5: The answer has moderate relevance but contains inaccuracies.
-Score 7: The answer aligns with the reference but has minor errors or omissions.
-Score 10: The answer is completely accurate and aligns perfectly with the reference."""
-                },
-                normalize_by=10,
-            ),
-        ],
-        custom_evaluators=[eval_chain_new],
-    )
-
-    endpoint_url = os.environ.get("KG_ENDPOINT_URL", "https://enpkg.commons-lab.org/graphdb/repositories/ENPKG")
-
-    from app.core.main import llm_creation
-
-    models = llm_creation()
-
-    # Create workflow in evaluation mode
-    app = create_workflow(
-        models=models,
-        endpoint_url=endpoint_url,
-        evaluation=True,
-        api_key=openai_key
-    )
-
-    evaluate_result = create_evaluate_result(app)
-    unique_id = uuid4().hex[0:8]
-
-    # Run evaluation
-    run_on_dataset(
-        dataset_name=dataset_name,
-        llm_or_chain_factory=evaluate_result,
-        evaluation=evaluation_config,
-        verbose=True,
-        project_name=f"Testing the app-{unique_id}",
-        client=client,
-        project_metadata={
-            "model": "gpt-4o",
-        },
-    )
-
-    logger.info(f"Evaluation complete. View results in LangSmith under project: Testing the app-{unique_id}")
-
-
-if __name__ == "__main__":
-    main()
+logger.info(f"Evaluation complete. View results in LangSmith under project: Testing the app-{unique_id}")
\ No newline at end of file
diff --git a/app/core/tests/test_main.py b/app/core/tests/test_main.py
index 5529288..4af8b90 100644
--- a/app/core/tests/test_main.py
+++ b/app/core/tests/test_main.py
@@ -138,18 +138,8 @@ def test_main_prints_user_friendly_error_for_bad_staged_file(monkeypatch, capsys
     monkeypatch.setattr(main_module, "setup_logger", lambda name: DummyLogger())
     monkeypatch.setattr(main_module, "initialize_session_context", lambda session_id: None)
     monkeypatch.setattr(main_module, "create_user_session", lambda session_id=None, user_session_dir=False, input_dir=False: "session-123")
-    monkeypatch.setattr(
-        main_module,
-        "langsmith_setup",
-        lambda: (_ for _ in ()).throw(AssertionError("langsmith should not run")),
-    )
-    monkeypatch.setattr(
-        main_module,
-        "llm_creation",
-        lambda api_key=None, params_file=None: (_ for _ in ()).throw(
-            AssertionError("llm creation should not run")
-        ),
-    )
+    monkeypatch.setattr(main_module, "langsmith_setup", lambda: None)
+    monkeypatch.setattr(main_module, "llm_creation", lambda api_key=None, params_file=None: {"llm_o": object()})
     monkeypatch.setattr(
         main_module,
         "_prepare_session_files",

From c6d2d60451cee2077bbed367484cc5676a0b599d Mon Sep 17 00:00:00 2001
From: madina1203 <madina.bekbergenova@etu.univ-cotedazur.fr>
Date: Tue, 5 May 2026 16:49:44 +0200
Subject: [PATCH 3/6] Add LangSmith Evaluation to docs

---
 mkdocs.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mkdocs.yml b/mkdocs.yml
index 0d06b7a..5bd1239 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -170,6 +170,7 @@ nav:
     - Graph Management: api-reference/graph-management.md
   - Examples:
     - Basic Usage: examples/basic-usage.md
+    - LangSmith Evaluation: examples/langsmith-evaluation.md
     
   - Contributing: contributing.md
 

From 977a1a0cbe9c7ea14601f06053fa54012c22b361 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Louis-F=C3=A9lix=20Nothias?= <louis-felix.nothias@cnrs.fr>
Date: Wed, 6 May 2026 16:09:31 +0200
Subject: [PATCH 4/6] Update model IDs in params.ini for consistency

---
 app/config/params.ini | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/app/config/params.ini b/app/config/params.ini
index 2f706a6..e082cce 100644
--- a/app/config/params.ini
+++ b/app/config/params.ini
@@ -1,5 +1,5 @@
 [llm_preview]
-id = gpt-5.4
+id = gpt-5.5
 temperature = 0.3
 max_retries = 3
 
@@ -48,7 +48,7 @@ max_retries = 3
 base_url = https://oai.endpoints.kepler.ai.cloud.ovh.net/v1
 
 [llm_litellm_openai]
-id = gpt-5.4
+id = gpt-5.5
 temperature = 0
 
 
@@ -60,7 +60,7 @@ id = deepseek/deepseek-v4-flash
 id = claude-opus-4-7
 
 [llm_litellm_gemini]
-id = gemini/gemini-2.5-pro
+id = gemini/gemini-3-pro-preview
 
 [llm_litellm_mistral]
-id = mistral/mistral-small-latest
+id = mistral/mistral-large-latest

From cc33d2b4b0e70bbd38185a0f99ccfa84ab3e599b Mon Sep 17 00:00:00 2001
From: madina1203 <madina.bekbergenova@etu.univ-cotedazur.fr>
Date: Wed, 6 May 2026 18:44:25 +0200
Subject: [PATCH 5/6] Clean up params.ini LLM sections and fix gpt-5.5
 temperature

- Remove unused sections (llm_preview, llm_gpt_5, llm_o1) whose ids
  duplicated other sections and were never retrieved at runtime.
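- Rename [llm_o3_mini] -> [llm_mini] so the section name matches its
  actual model id (gpt-5.4-mini). This replaces the previous
  [llm_mini] section (temperature = 0), so llm_mini now runs with
  temperature = 1. Update the SPARQL improvement chain, test fixture,
  and docs accordingly.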
- Set temperature=1 for gpt-5.5 sections (llm_o, llm_litellm_openai);
  the API rejects temperature=0 for that model, which was crashing
  agent runs with a 400 unsupported_value error.
- Trim llm_handler.py section list to ["llm_o", "llm_mini"] so the
  module no longer references removed sections.
---
 app/config/params.ini                 | 25 ++-----------------------
 app/core/agents/sparql/tool_sparql.py |  2 +-
 app/core/llm_handler.py               |  2 +-
 app/core/tests/test_sparql_tool.py    |  2 +-
 docs/api-reference/agents.md          |  2 +-
 docs/user-guide/configuration.md      |  6 +-----
 6 files changed, 7 insertions(+), 32 deletions(-)

diff --git a/app/config/params.ini b/app/config/params.ini
index e082cce..c8f4f03 100644
--- a/app/config/params.ini
+++ b/app/config/params.ini
@@ -1,31 +1,10 @@
-[llm_preview]
-id = gpt-5.5
-temperature = 0.3
-max_retries = 3
-
 [llm_o]
 id = gpt-5.5
-temperature = 0
+temperature = 1
 max_retries = 3
 
-
 [llm_mini]
 id = gpt-5.4-mini
-temperature = 0
-max_retries = 3
-
-[llm_o3_mini]
-id = gpt-5.4-mini
-temperature = 1
-max_retries = 3
-
-[llm_gpt_5]
-id = gpt-5.5
-temperature = 1
-max_retries = 3
-
-[llm_o1]
-id = gpt-5.5
 temperature = 1
 max_retries = 3
 
@@ -49,7 +28,7 @@ base_url = https://oai.endpoints.kepler.ai.cloud.ovh.net/v1
 
 [llm_litellm_openai]
 id = gpt-5.5
-temperature = 0
+temperature = 1
 
 
 [llm_litellm_deepseek]
diff --git a/app/core/agents/sparql/tool_sparql.py b/app/core/agents/sparql/tool_sparql.py
index 8721f67..214641b 100644
--- a/app/core/agents/sparql/tool_sparql.py
+++ b/app/core/agents/sparql/tool_sparql.py
@@ -250,7 +250,7 @@ def __init__(
                 # verbose=True,  #### FOR debugging
             )
             self.sparql_improvement_chain = LLMChain(
-                llm=llm["llm_o3_mini"],
+                llm=llm["llm_mini"],
                 prompt=SPARQL_IMPROVEMENT_PROMPT,
             )
         except KeyError as e:
diff --git a/app/core/llm_handler.py b/app/core/llm_handler.py
index 77f69af..5ec2af6 100644
--- a/app/core/llm_handler.py
+++ b/app/core/llm_handler.py
@@ -22,7 +22,7 @@ def llm_creation(api_key: Optional[str] = None) -> Dict[str, ChatOpenAI]:
     logger.info(f"Loading configuration from {config_path}")
     config.read(config_path)
 
-    sections = ["llm", "llm_preview", "llm_o", "llm_mini", "llm_o3_mini", "llm_o1"]
+    sections = ["llm_o", "llm_mini"]
     models = {}
 
     # Get the OpenAI API key from the argument or environment variables
diff --git a/app/core/tests/test_sparql_tool.py b/app/core/tests/test_sparql_tool.py
index 6b95ae0..bdea755 100644
--- a/app/core/tests/test_sparql_tool.py
+++ b/app/core/tests/test_sparql_tool.py
@@ -112,7 +112,7 @@ def fake_load_local(path, embeddings, allow_dangerous_deserialization=False):
     monkeypatch.setattr(tool_sparql.FAISS, "load_local", fake_load_local)
 
     tool = tool_sparql.GraphSparqlQAChain(
-        llm={"llm_o": object(), "llm_o3_mini": object()},
+        llm={"llm_o": object(), "llm_mini": object()},
         graph=object(),
         session_id="session-123",
         openai_key=None,
diff --git a/docs/api-reference/agents.md b/docs/api-reference/agents.md
index 7f517c6..f77af05 100644
--- a/docs/api-reference/agents.md
+++ b/docs/api-reference/agents.md
@@ -327,7 +327,7 @@ Modify the supervisor prompt (see [supervisor prompt](https://github.com/holobio
 ### Configuration Updates
 Update [`app/config/langgraph.json`](https://github.com/HolobiomicsLab/MetaboT/blob/main/app/config/langgraph.json) to include your agent in the workflow and specify `llm_choice` based on the models defined in [`app/config/params.ini`](https://github.com/HolobiomicsLab/MetaboT/blob/main/app/config/params.ini). Available models include:
 
-- OpenAI models: `llm_preview`, `llm_o`, `llm_mini`
+- OpenAI models: `llm_o`, `llm_mini`
 - OVH models: `ovh_Meta-Llama-3_1-70B-Instruct`
 - Deepseek models: `deepseek_deepseek-chat`, `deepseek_deepseek-reasoner`
 - LiteLLM compatible models: `llm_litellm_openai`, `llm_litellm_deepseek`, `llm_litellm_claude`, `llm_litellm_gemini`
diff --git a/docs/user-guide/configuration.md b/docs/user-guide/configuration.md
index f225e74..07193e5 100644
--- a/docs/user-guide/configuration.md
+++ b/docs/user-guide/configuration.md
@@ -21,12 +21,8 @@ MetaboT reads language model definitions from `app/config/params.ini`.
 
 | Section | Purpose |
 | --- | --- |
-| `llm_preview` | GPT-4o preview-style configuration |
 | `llm_o` | primary OpenAI model configuration |
-| `llm_mini` | lower-cost GPT-4o mini option |
-| `llm_o3_mini` | reasoning-oriented OpenAI configuration |
-| `llm_gpt_5` | GPT-5 configuration example |
-| `llm_o1` | O1 configuration example |
+| `llm_mini` | secondary OpenAI model used for the SPARQL improvement chain |
 | `deepseek_deepseek-chat` | DeepSeek chat endpoint |
 | `deepseek_deepseek-reasoner` | DeepSeek reasoner endpoint |
 | `ovh_Meta-Llama-3_1-70B-Instruct` | OVH-hosted Llama endpoint |

From 39b19526789e22aa1edb7bb3e03db777068e5b44 Mon Sep 17 00:00:00 2001
From: madina1203 <madina.bekbergenova@etu.univ-cotedazur.fr>
Date: Thu, 7 May 2026 09:46:49 +0200
Subject: [PATCH 6/6] Change temperature to 0 in params.ini

---
 app/config/params.ini | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/app/config/params.ini b/app/config/params.ini
index c8f4f03..322ae6d 100644
--- a/app/config/params.ini
+++ b/app/config/params.ini
@@ -1,11 +1,11 @@
 [llm_o]
 id = gpt-5.5
-temperature = 1
+temperature = 0
 max_retries = 3
 
 [llm_mini]
 id = gpt-5.4-mini
-temperature = 1
+temperature = 0
 max_retries = 3
 
 [deepseek_deepseek-chat]