From 4215cb3a3e4d37f206120f154c08e10158874a15 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 5 Jun 2026 21:18:17 +0000 Subject: [PATCH 1/2] Upgrade to DSPy 3.x (latest: 3.2.1) - Install 'dspy' instead of the deprecated 'dspy-ai' package, pinned to >=3.2.1, and uninstall any stale 'dspy-ai' from existing virtualenvs - Build custom signatures with dspy.Signature(fields_dict, instructions) instead of the pydantic create_model + multiple-inheritance workaround - Remove the MIPRO optimizer (removed in DSPy 3.x) from core.py and the UI dropdown - Update MIPROv2 to the new API: auto="medium" sizing instead of num_candidates/ num_batches, and drop the removed eval_kwargs/requires_permission_to_run args - Pass prompt_model to COPRO instead of the unused teacher_settings - Add the hint as a real signature input field for ChainOfThoughtWithHint, since dspy.ChainOfThoughtWithHint was removed and extra kwargs are now ignored - Read .score from Evaluate results, which now return EvaluationResult objects - Use the 'gemini/' LiteLLM prefix for Google models https://claude.ai/code/session_01LNCkRPkq52iz9gCgDnEpLH --- core.py | 63 +++++++++++++++++++--------------------------------- interface.py | 4 ++-- webui.sh | 7 +++--- 3 files changed, 29 insertions(+), 45 deletions(-) diff --git a/core.py b/core.py index 654780a..7ef3f8b 100644 --- a/core.py +++ b/core.py @@ -9,8 +9,7 @@ from typing import List, Dict, Any from dspy.evaluate import Evaluate -from dspy.teleprompt import BootstrapFewShot, BootstrapFewShotWithRandomSearch, MIPRO, MIPROv2, COPRO, BootstrapFinetune -from pydantic import create_model +from dspy.teleprompt import BootstrapFewShot, BootstrapFewShotWithRandomSearch, MIPROv2, COPRO, BootstrapFinetune # List of supported Groq models SUPPORTED_GROQ_MODELS = [ @@ -35,26 +34,18 @@ def create_custom_signature(input_fields: List[str], output_fields: List[str], i fields = {} for i, field in enumerate(input_fields): if i < len(input_descs) and input_descs[i]: - fields[field] = (str, dspy.InputField(default=..., desc=input_descs[i], json_schema_extra={"__dspy_field_type": "input"})) + fields[field] = (str, dspy.InputField(desc=input_descs[i])) else: - fields[field] = (str, dspy.InputField(default=..., json_schema_extra={"__dspy_field_type": "input"})) - + fields[field] = (str, dspy.InputField()) + for i, field in enumerate(output_fields): if i < len(output_descs) and output_descs[i]: - fields[field] = (str, dspy.OutputField(default=..., desc=output_descs[i], json_schema_extra={"__dspy_field_type": "output"})) + fields[field] = (str, dspy.OutputField(desc=output_descs[i])) else: - fields[field] = (str, dspy.OutputField(default=..., json_schema_extra={"__dspy_field_type": "output"})) - - CustomSignatureModel = create_model('CustomSignatureModel', **fields) - - class CustomSignature(dspy.Signature, CustomSignatureModel): - """ - {instructions} - """ - - CustomSignature.__doc__ = CustomSignature.__doc__.format(instructions=instructions) - - return CustomSignature + fields[field] = (str, dspy.OutputField()) + + # DSPy supports building signatures directly from a field dict + return dspy.Signature(fields, instructions) def generate_human_readable_id(input_fields: List[str], output_fields: List[str], dspy_module: str, llm_model: str, teacher_model: str, optimizer: str, instructions: str) -> str: # Create a signature-based name @@ -100,9 +91,12 @@ def forward(self, **kwargs): class CustomChainOfThoughtWithHintModule(dspy.Module): def __init__(self): super().__init__() - self.cot_with_hint = dspy.ChainOfThought(CustomSignature) + # dspy.ChainOfThoughtWithHint was removed in DSPy 3.x, and extra kwargs not in the + # signature are now ignored, so add the hint to the signature as an input field + signature_with_hint = CustomSignature.append("hint", dspy.InputField(desc="A hint to guide the reasoning"), str) + self.cot_with_hint = dspy.ChainOfThought(signature_with_hint) self.hint = hint - + def forward(self, **kwargs): # Inject the hint into the kwargs kwargs['hint'] = self.hint @@ -121,7 +115,7 @@ def compile_program(input_fields: List[str], output_fields: List[str], dspy_modu elif llm_model in SUPPORTED_GROQ_MODELS: lm = dspy.LM(f'groq/{llm_model}', api_key=os.environ.get("GROQ_API_KEY")) elif llm_model in SUPPORTED_GOOGLE_MODELS: - lm = dspy.LM(f'google/{llm_model}', api_key=os.environ.get("GOOGLE_API_KEY")) + lm = dspy.LM(f'gemini/{llm_model}', api_key=os.environ.get("GOOGLE_API_KEY")) else: raise ValueError(f"Unsupported LLM model: {llm_model}") @@ -139,7 +133,7 @@ def compile_program(input_fields: List[str], output_fields: List[str], dspy_modu elif teacher_model in SUPPORTED_GROQ_MODELS: teacher_lm = dspy.LM(f'groq/{teacher_model}', api_key=os.environ.get("GROQ_API_KEY")) elif teacher_model in SUPPORTED_GOOGLE_MODELS: - teacher_lm = dspy.LM(f'google/{teacher_model}', api_key=os.environ.get("GOOGLE_API_KEY")) + teacher_lm = dspy.LM(f'gemini/{teacher_model}', api_key=os.environ.get("GOOGLE_API_KEY")) else: raise ValueError(f"Unsupported teacher model: {teacher_model}") @@ -326,8 +320,9 @@ def metric(gold, pred, trace=None): kwargs = dict(num_threads=1, display_progress=True, display_table=1) # Evaluate the module to establish a baseline + # Evaluate now returns an EvaluationResult object, so extract the score baseline_evaluate = Evaluate(metric=metric, devset=devset, num_threads=1) - baseline_score = baseline_evaluate(module) + baseline_score = baseline_evaluate(module).score # Set up the optimizer if optimizer == "BootstrapFewShot": @@ -337,39 +332,27 @@ def metric(gold, pred, trace=None): teleprompter = BootstrapFewShotWithRandomSearch(metric=metric, teacher_settings=dict(lm=teacher_lm), num_threads=1) compiled_program = teleprompter.compile(module, trainset=trainset, valset=devset) elif optimizer == "COPRO": - teleprompter = COPRO(metric=metric, teacher_settings=dict(lm=teacher_lm)) + teleprompter = COPRO(metric=metric, prompt_model=teacher_lm) compiled_program = teleprompter.compile(module, trainset=trainset, eval_kwargs=kwargs) - elif optimizer == "MIPRO": - teleprompter = MIPRO(metric=metric, teacher_settings=dict(lm=teacher_lm), prompt_model=teacher_lm, task_model=lm) - num_trials = 10 # Adjust this value as needed - max_bootstrapped_demos = 5 # Adjust this value as needed - max_labeled_demos = 5 # Adjust this value as needed - compiled_program = teleprompter.compile(module, trainset=trainset, num_trials=num_trials, - max_bootstrapped_demos=max_bootstrapped_demos, - max_labeled_demos=max_labeled_demos, - eval_kwargs=kwargs, requires_permission_to_run=False) elif optimizer == "MIPROv2": - teleprompter = MIPROv2(metric=metric, prompt_model=lm, task_model=teacher_lm, num_candidates=10, init_temperature=1.0) + # MIPRO was removed in DSPy 3.x; MIPROv2 now sizes its search via the 'auto' setting + teleprompter = MIPROv2(metric=metric, prompt_model=teacher_lm, task_model=lm, teacher_settings=dict(lm=teacher_lm), auto="medium", init_temperature=1.0) - num_batches = 30 max_bootstrapped_demos = 8 max_labeled_demos = 16 compiled_program = teleprompter.compile( module, trainset=trainset, valset=devset, - num_batches=num_batches, max_bootstrapped_demos=max_bootstrapped_demos, - max_labeled_demos=max_labeled_demos, - eval_kwargs=kwargs, - requires_permission_to_run=False + max_labeled_demos=max_labeled_demos ) else: raise ValueError(f"Unsupported optimizer: {optimizer}") # Evaluate the compiled program evaluate = Evaluate(metric=metric, devset=devset, num_threads=1) - score = evaluate(compiled_program) + score = evaluate(compiled_program).score print("Evaluation Score:") print(score) diff --git a/interface.py b/interface.py index b4347fd..86642ce 100644 --- a/interface.py +++ b/interface.py @@ -461,10 +461,10 @@ def generate_response(human_readable_id, row_selector, df): with gr.Row(): optimizer = gr.Dropdown( - ["BootstrapFewShot", "BootstrapFewShotWithRandomSearch", "MIPRO", "MIPROv2", "COPRO"], + ["BootstrapFewShot", "BootstrapFewShotWithRandomSearch", "MIPROv2", "COPRO"], label="Optimizer", value="BootstrapFewShot", - info="Choose optimization strategy: None (no optimization), BootstrapFewShot (small datasets, ~10 examples) uses few-shot learning; BootstrapFewShotWithRandomSearch (medium, ~50) adds randomized search; MIPRO, MIPROv2, and COPRO (large, 300+) also optimize the prompt instructions.", + info="Choose optimization strategy: None (no optimization), BootstrapFewShot (small datasets, ~10 examples) uses few-shot learning; BootstrapFewShotWithRandomSearch (medium, ~50) adds randomized search; MIPROv2 and COPRO (large, 300+) also optimize the prompt instructions.", interactive=True # Add this line ) with gr.Column(): diff --git a/webui.sh b/webui.sh index c3e2d56..19ad15f 100644 --- a/webui.sh +++ b/webui.sh @@ -42,10 +42,11 @@ else echo "No .env file found. Make sure to set any necessary environment variables manually." fi -# Install required packages if not already installed -if ! pip freeze | grep -q "gradio\|dspy\|pandas\|openai\|anthropic\|groq\|sklearn|google-generativeai"; then +# Install required packages if not already installed (the 'dspy-ai' package was renamed to 'dspy' in 3.x) +if ! pip freeze | grep -q "^dspy==3"; then echo "Installing required packages..." - pip install gradio dspy-ai pandas openai anthropic groq scikit-learn google-generativeai + pip uninstall -y dspy-ai 2>/dev/null || true + pip install gradio "dspy>=3.2.1" pandas openai anthropic groq scikit-learn google-generativeai else echo "Required packages are already installed." fi From b382ef019a4946c09e854506c87eaa47a9b676e3 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 6 Jun 2026 02:04:45 +0000 Subject: [PATCH 2/2] Add GEPA optimizer GEPA (Genetic-Pareto) is the reflective prompt-evolution optimizer introduced in DSPy 3.x. Wraps the selected metric to GEPA's feedback-metric contract (extra pred_name/pred_trace args, score + textual feedback) and uses the teacher model as the reflection LM. Added to the optimizer dropdown. https://claude.ai/code/session_01LNCkRPkq52iz9gCgDnEpLH --- core.py | 12 +++++++++++- interface.py | 4 ++-- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/core.py b/core.py index 7ef3f8b..4399c5d 100644 --- a/core.py +++ b/core.py @@ -9,7 +9,7 @@ from typing import List, Dict, Any from dspy.evaluate import Evaluate -from dspy.teleprompt import BootstrapFewShot, BootstrapFewShotWithRandomSearch, MIPROv2, COPRO, BootstrapFinetune +from dspy.teleprompt import BootstrapFewShot, BootstrapFewShotWithRandomSearch, MIPROv2, COPRO, GEPA, BootstrapFinetune # List of supported Groq models SUPPORTED_GROQ_MODELS = [ @@ -347,6 +347,16 @@ def metric(gold, pred, trace=None): max_bootstrapped_demos=max_bootstrapped_demos, max_labeled_demos=max_labeled_demos ) + elif optimizer == "GEPA": + # GEPA metrics receive extra arguments (pred_name, pred_trace) and can return + # textual feedback alongside the score, so wrap the selected metric accordingly + def gepa_metric(gold, pred, trace=None, pred_name=None, pred_trace=None): + score = metric(gold, pred, trace) + return dspy.Prediction(score=float(score), feedback=f"The score for this prediction was {float(score)}.") + + # GEPA reflects on prompts in natural language, so use the stronger teacher model as the reflection LM + teleprompter = GEPA(metric=gepa_metric, auto="light", reflection_lm=teacher_lm, num_threads=1) + compiled_program = teleprompter.compile(module, trainset=trainset, valset=devset) else: raise ValueError(f"Unsupported optimizer: {optimizer}") diff --git a/interface.py b/interface.py index 86642ce..3972f36 100644 --- a/interface.py +++ b/interface.py @@ -461,10 +461,10 @@ def generate_response(human_readable_id, row_selector, df): with gr.Row(): optimizer = gr.Dropdown( - ["BootstrapFewShot", "BootstrapFewShotWithRandomSearch", "MIPROv2", "COPRO"], + ["BootstrapFewShot", "BootstrapFewShotWithRandomSearch", "MIPROv2", "COPRO", "GEPA"], label="Optimizer", value="BootstrapFewShot", - info="Choose optimization strategy: None (no optimization), BootstrapFewShot (small datasets, ~10 examples) uses few-shot learning; BootstrapFewShotWithRandomSearch (medium, ~50) adds randomized search; MIPROv2 and COPRO (large, 300+) also optimize the prompt instructions.", + info="Choose optimization strategy: None (no optimization), BootstrapFewShot (small datasets, ~10 examples) uses few-shot learning; BootstrapFewShotWithRandomSearch (medium, ~50) adds randomized search; MIPROv2 and COPRO (large, 300+) also optimize the prompt instructions; GEPA uses the teacher model to reflectively evolve the prompt instructions.", interactive=True # Add this line ) with gr.Column():