From f0412c525ba856eb83ed41945f36ad663d4bfbde Mon Sep 17 00:00:00 2001 From: James Yoo Date: Thu, 23 Apr 2026 15:19:35 -0700 Subject: [PATCH 1/3] Implementing work to default to `claude-sonnet-4-6` for Avocado --- README.md | 6 +++++- main.py | 2 +- models/default_llm_backend.py | 3 ++- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 4f9a1577..3e344127 100644 --- a/README.md +++ b/README.md @@ -17,12 +17,16 @@ make docker-build ## Step 1: Generating C Specifications -To generate specs with an LLM, you first need to put your API key in a `.env` file. +To generate specs with an LLM, you first need to put your API key(s) in a `.env` file. ```sh echo "LLM_API_KEY=" > models/.env +echo "ANTHROPIC_API_KEY=" >> models/.env ``` +The `ANTHROPIC_API_KEY` is required for specification generation and repair via Anthropic's + Claude models. + Then run the Python script ```sh diff --git a/main.py b/main.py index 887c1975..f1beb62d 100755 --- a/main.py +++ b/main.py @@ -40,7 +40,7 @@ VALID_LOG_LEVELS = ("DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL") -DEFAULT_MODEL = "gpt-4o" +DEFAULT_MODEL = "claude-sonnet-4-6" DEFAULT_HEADERS_FOR_VERIFICATION: Sequence[str] = ( "#include ", "#include ", diff --git a/models/default_llm_backend.py b/models/default_llm_backend.py index 913a24cd..9f4f24a9 100644 --- a/models/default_llm_backend.py +++ b/models/default_llm_backend.py @@ -33,7 +33,8 @@ def __init__(self, model: str): else: self.vertex_credentials = None self.model = model - self.api_key = os.environ["LLM_API_KEY"] + api_key_for_model = "ANTHROPIC_API_KEY" if model.startswith("claude") else "LLM_API_KEY" + self.api_key = os.environ[api_key_for_model] if "claude" in model: self.max_tokens = 64000 From 121dafc875245ffcc8cfe841a94d4dec98858a07 Mon Sep 17 00:00:00 2001 From: James Yoo Date: Thu, 23 Apr 2026 15:45:34 -0700 Subject: [PATCH 2/3] Implementing parallel request logic --- models/default_llm_backend.py | 98 +++++++++++++++++++++++++++++++---- 1 file changed, 89 insertions(+), 9 deletions(-) diff --git a/models/default_llm_backend.py b/models/default_llm_backend.py index 9f4f24a9..64100fda 100644 --- a/models/default_llm_backend.py +++ b/models/default_llm_backend.py @@ -12,6 +12,7 @@ import os import pathlib import time +from concurrent.futures import ThreadPoolExecutor import litellm from litellm import completion @@ -47,21 +48,25 @@ def __init__(self, model: str): def send_messages( self, messages: tuple[ConversationMessage, ...], temperature: float = 0, top_k: int = 1 ) -> list[str]: - """messages: [{'role': 'system', 'content': 'You are an intelligent code assistant'}, - {'role': 'user', 'content': 'Translate this program...'}, - {'role': 'assistant', 'content': 'Here is the translation...'}, - {'role': 'user', 'content': 'Do something else...'}] - - : ['Sure, here is...', - 'Okay, let me see...', - ...] - len() == top_k + """Return `top_k` sampled responses from the LLM for the given messages. + + Args: + messages (tuple[ConversationMessage, ...]): The conversation to send to the LLM. + temperature (float): The sampling temperature. Must be non-zero when `top_k > 1`. + top_k (int): The number of responses to sample. + + Returns: + list[str]: The sampled responses. `len(returned) == top_k`. """ if top_k < 1: raise GenerationError("top_k must be >= 1") if top_k != 1 and temperature == 0: raise GenerationError("Top k sampling requires a non-zero temperature") + # Claude models do not support the `n` parameter; issue parallel requests instead. + if "claude" in self.model: + return self._send_parallel(messages, temperature, top_k) + count = 0 while True: try: @@ -104,6 +109,81 @@ def send_messages( return [choice["message"]["content"] for choice in response["choices"]] + def _send_parallel( + self, messages: tuple[ConversationMessage, ...], temperature: float, top_k: int + ) -> list[str]: + """Return `top_k` responses by issuing parallel single requests. + + Used for models that do not support the `n` parameter (e.g. Claude). Each request runs + its own retry loop independently. + + Args: + messages (tuple[ConversationMessage, ...]): The conversation to send. + temperature (float): The sampling temperature. + top_k (int): The number of parallel requests to make. + + Returns: + list[str]: The `top_k` sampled responses. + """ + with ThreadPoolExecutor(max_workers=top_k) as executor: + futures = [ + executor.submit(self._send_one_message, messages, temperature) for _ in range(top_k) + ] + return [f.result() for f in futures] + + def _send_one_message( + self, messages: tuple[ConversationMessage, ...], temperature: float + ) -> str: + """Return a single response from the LLM with retry and compaction logic. + + Args: + messages (tuple[ConversationMessage, ...]): The conversation to send. + temperature (float): The sampling temperature. + + Returns: + str: The model's response text. + """ + count = 0 + while True: + try: + response = completion( + model=self.model, + messages=[message.to_dict() for message in messages], + temperature=temperature, + api_key=self.api_key, + vertex_credentials=self.vertex_credentials, + max_tokens=self.max_tokens, + ) + break + except litellm.ContextWindowExceededError as e: + compacted = self._compact_conversation(messages) + if compacted is None: + msg = "Context window exceeded and conversation is too short to compact" + raise ContextWindowExceededError(msg) from e + logger.warning("Context window exceeded; compacting conversation and retrying") + messages = compacted + except ( + litellm.BadRequestError, + litellm.AuthenticationError, + litellm.NotFoundError, + litellm.UnprocessableEntityError, + ) as e: + raise GenerationError(f"Encountered an error with LLM call {e}") + except ( + litellm.RateLimitError, + litellm.InternalServerError, + litellm.APIConnectionError, + ) as e: + count += 1 + if count >= 5: + raise ModelError("Vertex AI API: Too many retries") + logger.warning(f"LLM Error {e}. Waiting 10 seconds and retrying") + time.sleep(10) + except Exception as e: + raise GenerationError(f"LLM Error: {e}") + + return response["choices"][0]["message"]["content"] + @staticmethod def get_instance(model_name: str) -> LlmBackend: """Return an instance of LlmBackend for the given model. From 523c4ce20a493a6ea21642e1880c2e1a3057839a Mon Sep 17 00:00:00 2001 From: James Yoo Date: Fri, 24 Apr 2026 13:23:20 -0700 Subject: [PATCH 3/3] Addressing CodeRabbit comments --- models/default_llm_backend.py | 84 ++++++++++++++++------------------- 1 file changed, 39 insertions(+), 45 deletions(-) diff --git a/models/default_llm_backend.py b/models/default_llm_backend.py index b5786def..fc0cb11b 100644 --- a/models/default_llm_backend.py +++ b/models/default_llm_backend.py @@ -13,6 +13,7 @@ import pathlib import time from concurrent.futures import ThreadPoolExecutor +from typing import Any import litellm from litellm import completion @@ -34,7 +35,9 @@ def __init__(self, model: str, use_vertex_api: bool): else: self.vertex_credentials = None self.model = model - api_key_for_model = "ANTHROPIC_API_KEY" if model.startswith("claude") else "LLM_API_KEY" + api_key_for_model = ( + "ANTHROPIC_API_KEY" if self._is_claude_model(model) else "LLM_API_KEY" + ) self.api_key = os.environ[api_key_for_model] if "claude" in model: @@ -67,46 +70,7 @@ def send_messages( if "claude" in self.model: return self._send_parallel(messages, temperature, top_k) - count = 0 - while True: - try: - response = completion( - model=self.model, - messages=[message.to_dict() for message in messages], - temperature=temperature, - n=top_k, - api_key=self.api_key, - vertex_credentials=self.vertex_credentials, - max_tokens=self.max_tokens, - ) - break - except litellm.ContextWindowExceededError as e: - compacted = self._compact_conversation(messages) - if compacted is None: - msg = "Context window exceeded and conversation is too short to compact" - raise ContextWindowExceededError(msg) from e - logger.warning("Context window exceeded; compacting conversation and retrying") - messages = compacted - except ( - litellm.BadRequestError, - litellm.AuthenticationError, - litellm.NotFoundError, - litellm.UnprocessableEntityError, - ) as e: - raise GenerationError(f"Encountered an error with LLM call {e}") - except ( - litellm.RateLimitError, - litellm.InternalServerError, - litellm.APIConnectionError, - ) as e: - count += 1 - if count >= 5: - raise ModelError("Vertex AI API: Too many retries") - logger.warning(f"LLM Error {e}. Waiting 10 seconds and retrying") - time.sleep(10) - except Exception as e: - raise GenerationError(f"LLM Error: {e}") - + response = self._send_with_retry(messages, temperature, n=top_k) return [choice["message"]["content"] for choice in response["choices"]] def _send_parallel( @@ -143,6 +107,25 @@ def _send_one_message( Returns: str: The model's response text. """ + response = self._send_with_retry(messages, temperature) + return response["choices"][0]["message"]["content"] + + def _send_with_retry( + self, + messages: tuple[ConversationMessage, ...], + temperature: float, + **kwargs: Any, + ) -> Any: + """Return the raw LLM response, retrying on transient errors and compacting on overflow. + + Args: + messages (tuple[ConversationMessage, ...]): The conversation to send. + temperature (float): The sampling temperature. + **kwargs: Extra keyword arguments forwarded to `completion` (e.g. `n=top_k`). + + Returns: + Any: The raw litellm response object. + """ count = 0 while True: try: @@ -153,8 +136,9 @@ def _send_one_message( api_key=self.api_key, vertex_credentials=self.vertex_credentials, max_tokens=self.max_tokens, + **kwargs, ) - break + return response except litellm.ContextWindowExceededError as e: compacted = self._compact_conversation(messages) if compacted is None: @@ -176,14 +160,13 @@ def _send_one_message( ) as e: count += 1 if count >= 5: - raise ModelError("Vertex AI API: Too many retries") + msg = f"LLM API retries exceeded with model {self.model}" + raise ModelError(msg) logger.warning(f"LLM Error {e}. Waiting 10 seconds and retrying") time.sleep(10) except Exception as e: raise GenerationError(f"LLM Error: {e}") - return response["choices"][0]["message"]["content"] - @staticmethod def get_instance(model_name: str, use_vertex_api: bool) -> LlmBackend: """Return an instance of LlmBackend for the given model. @@ -255,3 +238,14 @@ def _compact_conversation( ): return None return (system_message, initial_user_message, latest_llm_response, last_user_message) + + def _is_claude_model(self, model_name: str) -> bool: + """Return True iff the model name corresponds to an Anthropic Claude model. + + Args: + model_name (str): The model name to check. + + Returns: + bool: True iff the model name corresponds to an Anthropic Claude model. + """ + return model_name.strip().startswith("claude")