def _parse_methods(raw: str) -> tuple[str, str]:
    """Extract the load() and predict() method sources from raw LLM output.

    Enforces the system prompt contract:
    - Return ONLY the two method bodies as plain Python
    - No class wrapper, no markdown fences
    - Methods must start at column 0 (not indented)

    Args:
        raw: Text returned by the model. Markdown code fences are tolerated
            and removed before parsing.

    Returns:
        A ``(load_source, predict_source)`` tuple; each element is the full
        ``def ...`` block with surrounding whitespace stripped.

    Raises:
        ValueError: If the output contains anything besides exactly one
            ``def load(self)`` block and one ``def predict(self, ...)`` block
            at column 0 (class wrapper, leading or trailing text, duplicate
            definitions), or if either method is missing.
    """
    # Strip markdown fences if the model added them anyway
    cleaned = re.sub(r"```(?:python)?", "", raw).replace("```", "").strip()

    unexpected_msg = (
        "Unexpected content in LLM output (must be exactly two method bodies, "
        f"no class wrapper or trailing text):\n{cleaned}"
    )
    prefixes = {"load": "def load(self)", "predict": "def predict(self,"}
    methods: dict[str, str] = {}

    # Zero-width split on 'def ' at column 0 keeps each header with its body.
    # If the LLM indents the methods (class wrapper), nothing splits and the
    # whole text is rejected below -- as intended.
    for block in re.split(r"(?=^def )", cleaned, flags=re.MULTILINE):
        block = block.strip()
        if not block:
            continue

        name = next(
            (key for key, prefix in prefixes.items() if block.startswith(prefix)),
            None,
        )

        # The split only cuts at 'def ', so column-0 text that follows a
        # method (comments, prose, stray statements) is glued onto the end of
        # that method's block. A real method body always ends on an indented
        # line, so a multi-line block whose final line starts at column 0 is
        # carrying trailing content. (Heuristic: a body whose last line is a
        # column-0 string terminator would also be rejected.)
        last_line = block.rsplit("\n", 1)[-1]
        has_trailing_text = "\n" in block and not last_line[0].isspace()

        # Unknown block, duplicate definition, or trailing text all violate
        # the "exactly two method bodies" contract.
        if name is None or name in methods or has_trailing_text:
            raise ValueError(unexpected_msg)

        methods[name] = block

    if "load" not in methods or "predict" not in methods:
        raise ValueError(
            f"Could not parse load() and predict() from LLM output:\n{cleaned}"
        )

    return methods["load"], methods["predict"]
def test_parse_methods_no_blank_line_between():
    """Accept two column-0 methods even with no blank line separating them."""
    from app.cli.core.agent import _parse_methods as parse_methods

    llm_output = "".join(
        [
            "def load(self) -> None:\n",
            " import joblib\n",
            " self._model = joblib.load('x')\n",
            "def predict(self, x):\n",
            " return self._model.predict([x])[0]\n",
        ]
    )

    load_src, predict_src = parse_methods(llm_output)

    # load() must carry both its header and its body line.
    for fragment in ("def load(self)", "self._model"):
        assert fragment in load_src
    assert "def predict(self," in predict_src


def test_parse_methods_class_wrapped_raises():
    """Reject output where the methods sit inside a class wrapper."""
    from app.cli.core.agent import _parse_methods as parse_methods

    llm_output = "".join(
        [
            "class GeneratedModel:\n",
            " def load(self) -> None:\n",
            " import joblib\n",
            " self._model = joblib.load('x')\n",
            "\n",
            " def predict(self, x):\n",
            " return self._model.predict([x])[0]\n",
        ]
    )

    with pytest.raises(ValueError, match="no class wrapper"):
        parse_methods(llm_output)


def test_parse_methods_trailing_text_raises():
    """Reject output that appends extra text after the two methods."""
    from app.cli.core.agent import _parse_methods as parse_methods

    llm_output = "".join(
        [
            "def load(self) -> None:\n",
            " import joblib\n",
            " self._model = joblib.load('x')\n",
            "\n",
            "def predict(self, x):\n",
            " return self._model.predict([x])[0]\n",
            "\n",
            "# This model works great for sentiment analysis.\n",
        ]
    )

    with pytest.raises(ValueError, match="Unexpected content"):
        parse_methods(llm_output)