From 19eb9daab0b7528964a76c379e08b9fa8ef998b3 Mon Sep 17 00:00:00 2001 From: bradjin8 Date: Wed, 3 Jun 2026 18:36:41 -0400 Subject: [PATCH 1/4] feat: initial implementation for typed accessor methods --- models/conversation.py | 219 +++++++++++++++++++++++++++++ models/raw_access.py | 242 +++++++++++++++++++++++++++++++++ services/workspace_resolver.py | 45 ++++-- services/workspace_tabs.py | 109 +++++++++------ tests/test_raw_accessors.py | 88 ++++++++++++ utils/text_extract.py | 4 +- 6 files changed, 647 insertions(+), 60 deletions(-) create mode 100644 models/raw_access.py create mode 100644 tests/test_raw_accessors.py diff --git a/models/conversation.py b/models/conversation.py index a3d5e2f..04f55c3 100644 --- a/models/conversation.py +++ b/models/conversation.py @@ -1,8 +1,11 @@ from __future__ import annotations +import logging from dataclasses import dataclass, field from typing import Any +_logger = logging.getLogger(__name__) + from models.errors import SchemaError from models.from_dict_validation import ( require_dict, @@ -67,6 +70,84 @@ def from_dict(cls, raw: dict[str, Any], *, composer_id: str) -> "Composer": raw=raw, ) + @property + def newly_created_files(self) -> list[Any]: + value = self.raw.get("newlyCreatedFiles") + if value is None: + return [] + if not isinstance(value, list): + _logger.warning( + "Schema drift in Composer %s: invalid type for newlyCreatedFiles (expected list, got %s)", + self.composer_id, + type(value).__name__, + ) + return [] + return value + + @property + def code_block_data(self) -> dict[str, Any] | None: + value = self.raw.get("codeBlockData") + if value is None: + return None + if not isinstance(value, dict): + _logger.warning( + "Schema drift in Composer %s: invalid type for codeBlockData (expected dict, got %s)", + self.composer_id, + type(value).__name__, + ) + return None + return value + + @property + def usage_data(self) -> dict[str, Any]: + """Composer cost rollup; empty dict when absent (common).""" + value = self.raw.get("usageData") + if value is None: + return {} + if not isinstance(value, dict): + suffix = f" {self.composer_id}" if self.composer_id else "" + _logger.warning( + "Schema drift in Composer%s: invalid type for usageData (expected dict, got %s)", + suffix, + type(value).__name__, + ) + return {} + return value + + def _optional_counter(self, key: str) -> int | float: + value = self.raw.get(key, 0) + if isinstance(value, bool) or not isinstance(value, (int, float)): + if key in self.raw: + suffix = f" {self.composer_id}" if self.composer_id else "" + _logger.warning( + "Schema drift in Composer%s: invalid type for %s (expected number, got %s)", + suffix, + key, + type(value).__name__, + ) + return 0 + return value + + @property + def total_lines_added(self) -> int | float: + return self._optional_counter("totalLinesAdded") + + @property + def total_lines_removed(self) -> int | float: + return self._optional_counter("totalLinesRemoved") + + @property + def added_files(self) -> int | float: + return self._optional_counter("addedFiles") + + @property + def removed_files(self) -> int | float: + return self._optional_counter("removedFiles") + + def model_name_from_config(self) -> str | None: + name = self.model_config.get("modelName") + return name if isinstance(name, str) and name else None + @dataclass(frozen=True) class WorkspaceLocalComposer: @@ -101,3 +182,141 @@ def from_dict(cls, raw: dict[str, Any], *, bubble_id: str) -> "Bubble": raw = require_dict(raw, model="Bubble", field="bubble") require_non_empty_str(bubble_id, model="Bubble", field="bubbleId") return cls(bubble_id=bubble_id, raw=raw) + + @property + def text(self) -> str | None: + """Plain ``text`` field; richText is handled by :func:`extract_text_from_bubble`.""" + value = self.raw.get("text") + return value if isinstance(value, str) else None + + @property + def metadata(self) -> dict[str, Any]: + value = self.raw.get("metadata") + if value is None: + return {} + if not isinstance(value, dict): + _logger.warning( + "Schema drift in Bubble %s: invalid type for metadata (expected dict, got %s)", + self.bubble_id, + type(value).__name__, + ) + return {} + return value + + @property + def relevant_files(self) -> list[Any]: + value = self.raw.get("relevantFiles") + if value is None: + return [] + if not isinstance(value, list): + _logger.warning( + "Schema drift in Bubble %s: invalid type for relevantFiles (expected list, got %s)", + self.bubble_id, + type(value).__name__, + ) + return [] + return value + + @property + def attached_file_code_chunks_uris(self) -> list[Any]: + value = self.raw.get("attachedFileCodeChunksUris") + if value is None: + return [] + if not isinstance(value, list): + _logger.warning( + "Schema drift in Bubble %s: invalid type for attachedFileCodeChunksUris (expected list, got %s)", + self.bubble_id, + type(value).__name__, + ) + return [] + return value + + @property + def context(self) -> dict[str, Any]: + value = self.raw.get("context") + if value is None: + return {} + if not isinstance(value, dict): + _logger.warning( + "Schema drift in Bubble %s: invalid type for context (expected dict, got %s)", + self.bubble_id, + type(value).__name__, + ) + return {} + return value + + @property + def token_count(self) -> Any | None: + return self.raw.get("tokenCount") + + @property + def tool_former_data(self) -> dict[str, Any] | None: + value = self.raw.get("toolFormerData") + if value is None: + return None + if not isinstance(value, dict): + _logger.warning( + "Schema drift in Bubble %s: invalid type for toolFormerData (expected dict, got %s)", + self.bubble_id, + type(value).__name__, + ) + return None + return value + + @property + def model_info(self) -> dict[str, Any]: + value = self.raw.get("modelInfo") + if value is None: + return {} + if not isinstance(value, dict): + _logger.warning( + "Schema drift in Bubble %s: invalid type for modelInfo (expected dict, got %s)", + self.bubble_id, + type(value).__name__, + ) + return {} + return value + + @property + def thinking(self) -> Any | None: + return self.raw.get("thinking") + + @property + def thinking_duration_ms(self) -> Any | None: + return self.raw.get("thinkingDurationMs") + + @property + def context_window_status_at_creation(self) -> dict[str, Any]: + value = self.raw.get("contextWindowStatusAtCreation") + if value is None: + return {} + if not isinstance(value, dict): + _logger.warning( + "Schema drift in Bubble %s: invalid type for contextWindowStatusAtCreation (expected dict, got %s)", + self.bubble_id, + type(value).__name__, + ) + return {} + return value + + @property + def tool_results(self) -> list[Any] | None: + value = self.raw.get("toolResults") + if value is None: + return None + if not isinstance(value, list): + _logger.warning( + "Schema drift in Bubble %s: invalid type for toolResults (expected list, got %s)", + self.bubble_id, + type(value).__name__, + ) + return None + return value + + def bubble_timestamp_ms(self) -> int | float | None: + """``createdAt`` or ``timestamp`` in milliseconds when present.""" + for key in ("createdAt", "timestamp"): + value = self.raw.get(key) + if isinstance(value, (int, float)) and not isinstance(value, bool): + return value + return None diff --git a/models/raw_access.py b/models/raw_access.py new file mode 100644 index 0000000..911cf97 --- /dev/null +++ b/models/raw_access.py @@ -0,0 +1,242 @@ +"""Optional-key reads from Cursor JSON blobs with schema-drift logging.""" + +from __future__ import annotations + +import logging +from typing import Any + +_logger = logging.getLogger(__name__) + + +def warn_missing_raw_key( + raw: dict[str, Any], + key: str, + *, + model: str, + entity_id: str = "", +) -> None: + """Log when a frequently-used optional field is absent (likely key rename).""" + suffix = f" {entity_id}" if entity_id else "" + _logger.warning( + "Schema drift in %s%s: missing optional field %s", + model, + suffix, + key, + ) + + +def optional_raw_value( + raw: dict[str, Any], + key: str, + *, + model: str, + entity_id: str = "", + expected_type: type[Any] | tuple[type[Any], ...] | None = None, +) -> Any | None: + """Return ``raw[key]`` when present and typed; log drift and return ``None`` otherwise.""" + if key not in raw: + warn_missing_raw_key(raw, key, model=model, entity_id=entity_id) + return None + value = raw[key] + if expected_type is not None and not isinstance(value, expected_type): + suffix = f" {entity_id}" if entity_id else "" + _logger.warning( + "Schema drift in %s%s: invalid type for %s (expected %s, got %s)", + model, + suffix, + key, + expected_type, + type(value).__name__, + ) + return None + return value + + +def optional_raw_list( + raw: dict[str, Any], + key: str, + *, + model: str, + entity_id: str = "", +) -> list[Any] | None: + return optional_raw_value( + raw, + key, + model=model, + entity_id=entity_id, + expected_type=list, + ) + + +def optional_raw_dict( + raw: dict[str, Any], + key: str, + *, + model: str, + entity_id: str = "", +) -> dict[str, Any] | None: + return optional_raw_value( + raw, + key, + model=model, + entity_id=entity_id, + expected_type=dict, + ) + + +def optional_raw_str( + raw: dict[str, Any], + key: str, + *, + model: str, + entity_id: str = "", +) -> str | None: + return optional_raw_value( + raw, + key, + model=model, + entity_id=entity_id, + expected_type=str, + ) + + +def optional_raw_number( + raw: dict[str, Any], + key: str, + *, + model: str, + entity_id: str = "", + default: int | float = 0, +) -> int | float: + """Numeric composer counters; warn on missing key, return *default* when absent.""" + if key not in raw: + warn_missing_raw_key(raw, key, model=model, entity_id=entity_id) + return default + value = raw[key] + if isinstance(value, bool) or not isinstance(value, (int, float)): + suffix = f" {entity_id}" if entity_id else "" + _logger.warning( + "Schema drift in %s%s: invalid type for %s (expected number, got %s)", + model, + suffix, + key, + type(value).__name__, + ) + return default + return value + + +def conversation_header_bubble_id( + header: dict[str, Any], + *, + composer_id: str = "", +) -> str | None: + """``bubbleId`` from a ``fullConversationHeadersOnly`` entry.""" + value = optional_raw_str( + header, + "bubbleId", + model="ConversationHeader", + entity_id=composer_id, + ) + return value if value else None + + +def message_request_context_project_layouts( + ctx: dict[str, Any], + *, + composer_id: str = "", +) -> list[Any] | None: + """``projectLayouts`` from a messageRequestContext blob.""" + return optional_raw_list( + ctx, + "projectLayouts", + model="MessageRequestContext", + entity_id=composer_id, + ) + + +def composer_headers( + data: Any, + composer_id: str, +) -> list[dict[str, Any]]: + from models.conversation import Composer + + if isinstance(data, Composer): + return data.full_conversation_headers_only + headers = optional_raw_list( + data, + "fullConversationHeadersOnly", + model="Composer", + entity_id=composer_id, + ) + return headers if headers is not None else [] + + +def composer_newly_created_files(data: Any, composer_id: str) -> list[Any]: + from models.conversation import Composer + + if isinstance(data, Composer): + return data.newly_created_files + value = data.get("newlyCreatedFiles") if isinstance(data, dict) else None + if value is None: + return [] + if not isinstance(value, list): + _logger.warning( + "Schema drift in Composer %s: invalid type for newlyCreatedFiles (expected list, got %s)", + composer_id, + type(value).__name__, + ) + return [] + return value + + +def composer_code_block_data(data: Any, composer_id: str) -> dict[str, Any] | None: + from models.conversation import Composer + + if isinstance(data, Composer): + return data.code_block_data + return optional_raw_dict( + data, "codeBlockData", model="Composer", entity_id=composer_id + ) + + +def bubble_relevant_files(bubble: Any, bubble_id: str = "") -> list[Any]: + from models.conversation import Bubble + + if isinstance(bubble, Bubble): + return bubble.relevant_files + if isinstance(bubble, dict): + value = bubble.get("relevantFiles") + if value is None: + return [] + if isinstance(value, list): + return value + return [] + + +def bubble_attached_file_uris(bubble: Any, bubble_id: str = "") -> list[Any]: + from models.conversation import Bubble + + if isinstance(bubble, Bubble): + return bubble.attached_file_code_chunks_uris + if isinstance(bubble, dict): + value = bubble.get("attachedFileCodeChunksUris") + if value is None: + return [] + if isinstance(value, list): + return value + return [] + + +def bubble_context(bubble: Any, bubble_id: str = "") -> dict[str, Any]: + from models.conversation import Bubble + + if isinstance(bubble, Bubble): + return bubble.context or {} + if isinstance(bubble, dict): + ctx = bubble.get("context") + if ctx is None: + return {} + if isinstance(ctx, dict): + return ctx + return {} diff --git a/services/workspace_resolver.py b/services/workspace_resolver.py index faf5a8b..cc089ea 100644 --- a/services/workspace_resolver.py +++ b/services/workspace_resolver.py @@ -20,6 +20,17 @@ from utils.workspace_descriptor import basename_from_pathish, read_json_file from services.workspace_db import open_global_db from models import SchemaError, Workspace +from models.conversation import Composer +from models.raw_access import ( + bubble_attached_file_uris, + bubble_context, + bubble_relevant_files, + composer_code_block_data, + composer_headers, + composer_newly_created_files, + conversation_header_bubble_id, + message_request_context_project_layouts, +) def lookup_workspace_display_name(workspace_path: str, workspace_id: str) -> str: @@ -118,8 +129,8 @@ def infer_workspace_name_from_context(workspace_path: str, workspace_id: str) -> e, ) continue - layouts = ctx.get("projectLayouts") - if not isinstance(layouts, list): + layouts = message_request_context_project_layouts(ctx, composer_id=cid) + if not layouts: continue for layout in layouts: obj = None @@ -225,7 +236,7 @@ def create_workspace_path_to_id_map(workspace_entries): def determine_project_for_conversation( - composer_data: dict, + composer_data: Composer | dict, composer_id: str, project_layouts_map: dict, project_name_to_workspace_id: dict, @@ -272,7 +283,7 @@ def determine_project_for_conversation( return workspace_id # Fallback: newlyCreatedFiles - newly = composer_data.get("newlyCreatedFiles") or [] + newly = composer_newly_created_files(composer_data, composer_id) for file_entry in newly: uri = file_entry.get("uri") if isinstance(file_entry, dict) else None if isinstance(uri, dict) and uri.get("path"): @@ -281,7 +292,7 @@ def determine_project_for_conversation( return pid # Fallback: codeBlockData - cbd = composer_data.get("codeBlockData") + cbd = composer_code_block_data(composer_data, composer_id) if isinstance(cbd, dict): for fp in cbd.keys(): pid = get_project_from_file_path(re.sub(r"^file://", "", fp), workspace_entries) @@ -289,24 +300,27 @@ def determine_project_for_conversation( return pid # Fallback: conversation headers -> bubble references - headers = composer_data.get("fullConversationHeadersOnly") or [] + headers = composer_headers(composer_data, composer_id) for header in headers: if not isinstance(header, dict): continue - bubble = bubble_map.get(header.get("bubbleId")) + bubble_id = conversation_header_bubble_id(header, composer_id=composer_id) + if not bubble_id: + continue + bubble = bubble_map.get(bubble_id) if not bubble: continue - for fp in (bubble.get("relevantFiles") or []): + for fp in bubble_relevant_files(bubble, bubble_id): if fp: pid = get_project_from_file_path(fp, workspace_entries) if pid: return pid - for uri in (bubble.get("attachedFileCodeChunksUris") or []): + for uri in bubble_attached_file_uris(bubble, bubble_id): if isinstance(uri, dict) and uri.get("path"): pid = get_project_from_file_path(uri["path"], workspace_entries) if pid: return pid - for fs_entry in (bubble.get("context", {}).get("fileSelections") or []): + for fs_entry in (bubble_context(bubble, bubble_id).get("fileSelections") or []): if isinstance(fs_entry, dict): uri = fs_entry.get("uri") if isinstance(uri, dict) and uri.get("path"): @@ -327,16 +341,19 @@ def determine_project_for_conversation( for header in headers: if not isinstance(header, dict): continue - bubble = bubble_map.get(header.get("bubbleId")) + bubble_id = conversation_header_bubble_id(header, composer_id=composer_id) + if not bubble_id: + continue + bubble = bubble_map.get(bubble_id) if not bubble: continue - for fp in (bubble.get("relevantFiles") or []): + for fp in bubble_relevant_files(bubble, bubble_id): if fp: path_segments.append(normalize_file_path(fp)) - for uri in (bubble.get("attachedFileCodeChunksUris") or []): + for uri in bubble_attached_file_uris(bubble, bubble_id): if isinstance(uri, dict) and uri.get("path"): path_segments.append(normalize_file_path(uri["path"])) - for fs_entry in (bubble.get("context", {}).get("fileSelections") or []): + for fs_entry in (bubble_context(bubble, bubble_id).get("fileSelections") or []): if isinstance(fs_entry, dict): uri = fs_entry.get("uri") if isinstance(uri, dict) and uri.get("path"): diff --git a/services/workspace_tabs.py b/services/workspace_tabs.py index b6073d3..c11946a 100644 --- a/services/workspace_tabs.py +++ b/services/workspace_tabs.py @@ -21,6 +21,10 @@ from utils.tool_parser import parse_tool_call from utils.workspace_descriptor import read_json_file from models import Bubble, Composer, ParseWarningCollector, SchemaError +from models.raw_access import ( + conversation_header_bubble_id, + message_request_context_project_layouts, +) from services.summary_cache import ( fingerprint_workspace_storage, get_cached_tab_summaries, @@ -93,20 +97,20 @@ def _kv_payload_log_meta(value: object | None) -> tuple[int, str | None]: def _assemble_tab_from_composer_data( composer_id: str, - cd: dict, - bubble_map: dict[str, dict], + composer: Composer, + bubble_map: dict[str, Bubble | dict[str, Any]], contexts: list[dict], code_block_diffs: list[dict], workspace_display_name: str, rules: list, parse_warnings: ParseWarningCollector, ) -> dict | None: - """Assemble a single tab dict from an already-parsed composer dict. + """Assemble a single tab dict from a validated :class:`Composer`. Args: composer_id: Composer UUID. - cd: Raw ``composerData`` dict (``composer.raw``). - bubble_map: ``{bubble_id: bubble_dict}`` — may be global or scoped. + composer: Validated composer model (typed field access on ``.raw``). + bubble_map: ``{bubble_id: Bubble | bubble_dict}`` — global or scoped. contexts: ``messageRequestContext`` entries for *this* composer (list of dicts, each with an injected ``contextId`` key and a ``bubbleId`` field from the JSON value). @@ -119,18 +123,31 @@ def _assemble_tab_from_composer_data( A tab dict on success, or ``None`` when the tab should be omitted (no renderable bubbles or excluded by rules). """ - headers = cd.get("fullConversationHeadersOnly") or [] + headers = composer.full_conversation_headers_only bubbles: list[dict[str, Any]] = [] for header in headers: if not isinstance(header, dict): continue - bubble_id = header.get("bubbleId") - if not isinstance(bubble_id, str): + bubble_id = conversation_header_bubble_id(header, composer_id=composer_id) + if not bubble_id: continue - bubble = bubble_map.get(bubble_id) - if not bubble: + bubble_entry = bubble_map.get(bubble_id) + if not bubble_entry: continue + if isinstance(bubble_entry, Bubble): + bubble = bubble_entry + else: + try: + bubble = Bubble.from_dict(bubble_entry, bubble_id=bubble_id) + except SchemaError as e: + _logger.warning( + "Failed to parse Bubble from bubbleId:%s: %s", + bubble_id, + e, + ) + parse_warnings.record_bubble_skipped() + continue is_user = header.get("type") == 1 msg_type = "user" if is_user else "ai" @@ -177,11 +194,10 @@ def _assemble_tab_from_composer_data( context_text += f"\n- {comp.get('name') or comp.get('composerId') or 'Conversation'}" full_text = text + context_text - raw = bubble - token_count = raw.get("tokenCount") + token_count = bubble.token_count tool_calls = None - tfd = raw.get("toolFormerData") + tfd = bubble.tool_former_data if isinstance(tfd, dict): tool_call = parse_tool_call(tfd) if isinstance(tool_call, dict): @@ -189,15 +205,24 @@ def _assemble_tab_from_composer_data( thinking = None thinking_duration_ms = None - if raw.get("thinking"): - thinking = raw["thinking"] if isinstance(raw["thinking"], str) else (raw["thinking"].get("text") if isinstance(raw["thinking"], dict) else None) - thinking_duration_ms = raw.get("thinkingDurationMs") + thinking_raw = bubble.thinking + if thinking_raw: + thinking = ( + thinking_raw + if isinstance(thinking_raw, str) + else ( + thinking_raw.get("text") + if isinstance(thinking_raw, dict) + else None + ) + ) + thinking_duration_ms = bubble.thinking_duration_ms has_content = full_text.strip() or tool_calls or thinking if not has_content: continue - ctx_window = raw.get("contextWindowStatusAtCreation") or {} + ctx_window = bubble.context_window_status_at_creation ctx_pct = None if isinstance(ctx_window, dict): if ctx_window.get("percentageRemainingFloat") is not None: @@ -216,7 +241,7 @@ def _assemble_tab_from_composer_data( display_text = thinking bubble_meta = None - model_info = raw.get("modelInfo") or {} + model_info = bubble.model_info model_name = model_info.get("modelName") if model_name == "default": model_name = None @@ -231,8 +256,8 @@ def _assemble_tab_from_composer_data( "inputTokens": in_tok if in_tok > 0 else None, "outputTokens": out_tok if out_tok > 0 else None, "cachedTokens": cached_tok if cached_tok > 0 else None, - "toolResultsCount": (len(tool_calls) if tool_calls else None) or (len(raw["toolResults"]) if isinstance(raw.get("toolResults"), list) and raw["toolResults"] else None), - "toolResults": raw.get("toolResults") if isinstance(raw.get("toolResults"), list) and raw["toolResults"] else None, + "toolResultsCount": (len(tool_calls) if tool_calls else None) or (len(bubble.tool_results) if bubble.tool_results else None), + "toolResults": bubble.tool_results if bubble.tool_results else None, "toolCalls": tool_calls, "thinking": thinking, "thinkingDurationMs": thinking_duration_ms, @@ -259,7 +284,7 @@ def _assemble_tab_from_composer_data( b_entry = { "type": msg_type, "text": display_text, - "timestamp": to_epoch_ms(bubble.get("createdAt")) or to_epoch_ms(bubble.get("timestamp")) or int(datetime.now().timestamp() * 1000), + "timestamp": to_epoch_ms(bubble.bubble_timestamp_ms()) or int(datetime.now().timestamp() * 1000), } if bubble_meta: b_entry["metadata"] = bubble_meta @@ -268,8 +293,8 @@ def _assemble_tab_from_composer_data( if not bubbles: return None - title = cd.get("name") or f"Conversation {composer_id[:8]}" - if not cd.get("name") and bubbles: + title = composer.name or f"Conversation {composer_id[:8]}" + if not composer.name and bubbles: first_msg = bubbles[0].get("text", "") if first_msg: first_lines = [ln for ln in first_msg.split("\n") if ln.strip()] @@ -278,8 +303,7 @@ def _assemble_tab_from_composer_data( if len(title) == 100: title += "..." - _early_model_config = cd.get("modelConfig") or {} - _early_model_name = _early_model_config.get("modelName") + _early_model_name = composer.model_name_from_config() _early_model_names = [_early_model_name] if _early_model_name and _early_model_name != "default" else None if is_excluded_by_rules(rules, build_searchable_text( project_name=workspace_display_name, @@ -327,15 +351,15 @@ def _assemble_tab_from_composer_data( if m.get("thinkingDurationMs"): total_thinking_ms += m["thinkingDurationMs"] - usage = cd.get("usageData") or {} + usage = composer.usage_data composer_cost = usage.get("cost") or usage.get("estimatedCost") if isinstance(composer_cost, (int, float)) and total_cost == 0: total_cost = composer_cost - lines_added = cd.get("totalLinesAdded", 0) - lines_removed = cd.get("totalLinesRemoved", 0) - files_added = cd.get("addedFiles", 0) - files_removed = cd.get("removedFiles", 0) + lines_added = composer.total_lines_added + lines_removed = composer.total_lines_removed + files_added = composer.added_files + files_removed = composer.removed_files max_ctx_tokens = 0 ctx_token_limit = 0 @@ -370,8 +394,7 @@ def _assemble_tab_from_composer_data( } tab_meta = {k: v for k, v in tab_meta_raw.items() if v is not None} - model_config = cd.get("modelConfig") or {} - model_name_from_config = model_config.get("modelName") + model_name_from_config = composer.model_name_from_config() if model_name_from_config and model_name_from_config != "default": if not tab_meta: tab_meta = {} @@ -384,7 +407,7 @@ def _assemble_tab_from_composer_data( tab: dict[str, Any] = { "id": composer_id, "title": title, - "timestamp": to_epoch_ms(cd.get("lastUpdatedAt")) or to_epoch_ms(cd.get("createdAt")) or int(datetime.now().timestamp() * 1000), + "timestamp": to_epoch_ms(composer.last_updated_at) or to_epoch_ms(composer.created_at) or int(datetime.now().timestamp() * 1000), "bubbles": [{ "type": b["type"], "text": b.get("text", ""), @@ -686,8 +709,6 @@ def _safe_fetchall(query: str, params: tuple = ()) -> list: ) return {"error": "Failed to parse conversation"}, 500 - cd = composer.raw - # Verify the conversation belongs to the requested workspace. # Always scoped: only load messageRequestContext rows for this composer. project_layouts_map: dict[str, list] = {} @@ -711,7 +732,7 @@ def _safe_fetchall(query: str, params: tuple = ()) -> list: ) pid = determine_project_for_conversation( - cd, composer_id, project_layouts_map, + composer, composer_id, project_layouts_map, project_name_map, workspace_path_map, workspace_entries, {}, composer_id_to_ws, invalid_workspace_ids, ) @@ -730,7 +751,7 @@ def _safe_fetchall(query: str, params: tuple = ()) -> list: tab = _assemble_tab_from_composer_data( composer_id=composer_id, - cd=cd, + composer=composer, bubble_map=bubble_map, contexts=contexts, code_block_diffs=code_block_diffs, @@ -777,7 +798,7 @@ def assemble_workspace_tabs( ) matching_ws_ids = _build_matching_ws_ids(workspace_id, workspace_path, workspace_entries) - bubble_map: dict[str, dict] = {} + bubble_map: dict[str, Bubble] = {} code_block_diff_map: dict[str, list] = {} message_request_context_map: dict[str, list] = {} @@ -817,7 +838,7 @@ def _safe_fetchall(query: str, params: tuple = ()) -> list: continue try: bubble_obj = Bubble.from_dict(parsed, bubble_id=bid) - bubble_map[bid] = bubble_obj.raw + bubble_map[bid] = bubble_obj except SchemaError as e: # Drift logged so the operator can chase disappearing # bubbles instead of guessing. Bad row still skipped so the @@ -853,8 +874,8 @@ def _safe_fetchall(query: str, params: tuple = ()) -> list: }) # Project-layout map (root paths used by the resolver) - layouts = ctx.get("projectLayouts") - if isinstance(layouts, list): + layouts = message_request_context_project_layouts(ctx, composer_id=chat_id) + if layouts: project_layouts_map.setdefault(chat_id, []) for layout in layouts: if isinstance(layout, str): @@ -916,11 +937,9 @@ def _safe_fetchall(query: str, params: tuple = ()) -> list: parse_warnings.record_composer_skipped() continue try: - cd = composer.raw - # Determine project pid = determine_project_for_conversation( - cd, composer_id, project_layouts_map, + composer, composer_id, project_layouts_map, project_name_map, workspace_path_map, workspace_entries, bubble_map, composer_id_to_ws, invalid_workspace_ids, ) @@ -934,7 +953,7 @@ def _safe_fetchall(query: str, params: tuple = ()) -> list: tab = _assemble_tab_from_composer_data( composer_id=composer_id, - cd=cd, + composer=composer, bubble_map=bubble_map, contexts=message_request_context_map.get(composer_id, []), code_block_diffs=code_block_diff_map.get(composer_id, []), diff --git a/tests/test_raw_accessors.py b/tests/test_raw_accessors.py new file mode 100644 index 0000000..4874231 --- /dev/null +++ b/tests/test_raw_accessors.py @@ -0,0 +1,88 @@ +"""Tests for typed raw accessors and schema-drift logging.""" + +from __future__ import annotations + +import logging +import os +import sys +import unittest + +REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +if REPO_ROOT not in sys.path: + sys.path.insert(0, REPO_ROOT) + +from models.conversation import Bubble, Composer +from models.raw_access import ( + composer_newly_created_files, + conversation_header_bubble_id, + message_request_context_project_layouts, + optional_raw_list, + warn_missing_raw_key, +) +from tests.test_models import GOOD_COMPOSER_RAW + + +class TestRawAccessorDriftLogging(unittest.TestCase): + def test_composer_newly_created_files_empty_when_key_missing(self) -> None: + bare = Composer.from_dict(GOOD_COMPOSER_RAW, composer_id="cid-2") + with self.assertNoLogs("models.conversation", level="WARNING"): + self.assertEqual(bare.newly_created_files, []) + + def test_composer_newly_created_files_warns_on_wrong_type(self) -> None: + bad = Composer.from_dict( + {**GOOD_COMPOSER_RAW, "newlyCreatedFiles": "not-a-list"}, + composer_id="cid-bad", + ) + with self.assertLogs("models.conversation", level="WARNING") as logs: + self.assertEqual(bad.newly_created_files, []) + self.assertTrue(any("newlyCreatedFiles" in m for m in logs.output), logs.output) + + def test_bubble_relevant_files_empty_when_key_missing(self) -> None: + bubble = Bubble.from_dict({"type": "user", "text": "hi"}, bubble_id="b-1") + with self.assertNoLogs("models.conversation", level="WARNING"): + self.assertEqual(bubble.relevant_files, []) + + def test_project_layouts_warns_when_key_missing(self) -> None: + with self.assertLogs("models.raw_access", level="WARNING") as logs: + layouts = message_request_context_project_layouts({}, composer_id="cmp-1") + self.assertIsNone(layouts) + self.assertTrue(any("projectLayouts" in m for m in logs.output), logs.output) + + def test_conversation_header_bubble_id_warns_when_missing(self) -> None: + with self.assertLogs("models.raw_access", level="WARNING") as logs: + bid = conversation_header_bubble_id({"type": 1}, composer_id="cmp-1") + self.assertIsNone(bid) + self.assertTrue(any("bubbleId" in m for m in logs.output), logs.output) + + def test_dict_bridge_newly_created_files_matches_composer_property(self) -> None: + data = {**GOOD_COMPOSER_RAW, "newlyCreatedFiles": [{"uri": {"path": "/a"}}]} + composer = Composer.from_dict(data, composer_id="cid-bridge") + self.assertEqual( + composer.newly_created_files, + composer_newly_created_files(composer, "cid-bridge"), + ) + + def test_optional_raw_list_no_warning_when_present(self) -> None: + with self.assertNoLogs("models.raw_access", level="WARNING"): + value = optional_raw_list( + {"items": [1]}, + "items", + model="Test", + entity_id="e1", + ) + self.assertEqual(value, [1]) + + def test_warn_missing_raw_key_message_format(self) -> None: + with self.assertLogs("models.raw_access", level="WARNING") as logs: + warn_missing_raw_key( + {}, + "sampleKey", + model="SampleModel", + entity_id="ent-9", + ) + self.assertIn("SampleModel ent-9", logs.output[0]) + self.assertIn("sampleKey", logs.output[0]) + + +if __name__ == "__main__": + unittest.main() diff --git a/utils/text_extract.py b/utils/text_extract.py index 644ec10..6a02745 100644 --- a/utils/text_extract.py +++ b/utils/text_extract.py @@ -21,8 +21,10 @@ def extract_text_from_rich_text(children: list) -> str: return text -def extract_text_from_bubble(bubble: dict) -> str: +def extract_text_from_bubble(bubble: dict | object) -> str: """Extract displayable text from a bubble object (text, richText, codeBlocks).""" + if hasattr(bubble, "raw"): + bubble = bubble.raw # type: ignore[union-attr] if not bubble or not isinstance(bubble, dict): return "" From 8d45e95b29913dd2be764ee2a0c2686af429e919 Mon Sep 17 00:00:00 2001 From: bradjin8 Date: Wed, 3 Jun 2026 20:17:53 -0400 Subject: [PATCH 2/4] fix: typecheck fail error --- services/workspace_tabs.py | 3 ++- tests/test_blob_parsing_fuzz.py | 5 ++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/services/workspace_tabs.py b/services/workspace_tabs.py index c11946a..7e96eb8 100644 --- a/services/workspace_tabs.py +++ b/services/workspace_tabs.py @@ -5,6 +5,7 @@ import logging import os import sqlite3 +from collections.abc import Mapping from datetime import datetime from typing import Any @@ -98,7 +99,7 @@ def _kv_payload_log_meta(value: object | None) -> tuple[int, str | None]: def _assemble_tab_from_composer_data( composer_id: str, composer: Composer, - bubble_map: dict[str, Bubble | dict[str, Any]], + bubble_map: Mapping[str, Bubble | dict[str, Any]], contexts: list[dict], code_block_diffs: list[dict], workspace_display_name: str, diff --git a/tests/test_blob_parsing_fuzz.py b/tests/test_blob_parsing_fuzz.py index 36fd753..e7b5f1d 100644 --- a/tests/test_blob_parsing_fuzz.py +++ b/tests/test_blob_parsing_fuzz.py @@ -55,7 +55,10 @@ ) _BUBBLE_ID = st.text( - alphabet=st.characters(blacklist_categories=("Cs",), blacklist_characters="\x00"), + alphabet=st.characters( + blacklist_categories=("Cs",), # type: ignore[arg-type] + blacklist_characters="\x00", + ), min_size=1, max_size=80, ) From 768c31d6772b668d3e0d1e26920a337a4c963ec8 Mon Sep 17 00:00:00 2001 From: bradjin8 Date: Wed, 3 Jun 2026 21:44:46 -0400 Subject: [PATCH 3/4] fix: review comments by ai --- models/conversation.py | 4 +-- models/raw_access.py | 58 ++++++++++++++++++++++--------------- tests/test_raw_accessors.py | 4 ++- utils/text_extract.py | 18 +++++++++--- 4 files changed, 54 insertions(+), 30 deletions(-) diff --git a/models/conversation.py b/models/conversation.py index 04f55c3..57a2016 100644 --- a/models/conversation.py +++ b/models/conversation.py @@ -4,8 +4,6 @@ from dataclasses import dataclass, field from typing import Any -_logger = logging.getLogger(__name__) - from models.errors import SchemaError from models.from_dict_validation import ( require_dict, @@ -15,6 +13,8 @@ require_type, ) +_logger = logging.getLogger(__name__) + @dataclass(frozen=True) class Composer: diff --git a/models/raw_access.py b/models/raw_access.py index 911cf97..61830f5 100644 --- a/models/raw_access.py +++ b/models/raw_access.py @@ -163,13 +163,19 @@ def composer_headers( if isinstance(data, Composer): return data.full_conversation_headers_only - headers = optional_raw_list( - data, - "fullConversationHeadersOnly", - model="Composer", - entity_id=composer_id, - ) - return headers if headers is not None else [] + if not isinstance(data, dict): + return [] + value = data.get("fullConversationHeadersOnly") + if value is None: + return [] + if not isinstance(value, list): + _logger.warning( + "Schema drift in Composer %s: invalid type for fullConversationHeadersOnly (expected list, got %s)", + composer_id, + type(value).__name__, + ) + return [] + return value def composer_newly_created_files(data: Any, composer_id: str) -> list[Any]: @@ -206,11 +212,13 @@ def bubble_relevant_files(bubble: Any, bubble_id: str = "") -> list[Any]: if isinstance(bubble, Bubble): return bubble.relevant_files if isinstance(bubble, dict): - value = bubble.get("relevantFiles") - if value is None: - return [] - if isinstance(value, list): - return value + files = optional_raw_list( + bubble, + "relevantFiles", + model="Bubble", + entity_id=bubble_id, + ) + return files if files is not None else [] return [] @@ -220,11 +228,13 @@ def bubble_attached_file_uris(bubble: Any, bubble_id: str = "") -> list[Any]: if isinstance(bubble, Bubble): return bubble.attached_file_code_chunks_uris if isinstance(bubble, dict): - value = bubble.get("attachedFileCodeChunksUris") - if value is None: - return [] - if isinstance(value, list): - return value + uris = optional_raw_list( + bubble, + "attachedFileCodeChunksUris", + model="Bubble", + entity_id=bubble_id, + ) + return uris if uris is not None else [] return [] @@ -232,11 +242,13 @@ def bubble_context(bubble: Any, bubble_id: str = "") -> dict[str, Any]: from models.conversation import Bubble if isinstance(bubble, Bubble): - return bubble.context or {} + return bubble.context if isinstance(bubble, dict): - ctx = bubble.get("context") - if ctx is None: - return {} - if isinstance(ctx, dict): - return ctx + ctx = optional_raw_dict( + bubble, + "context", + model="Bubble", + entity_id=bubble_id, + ) + return ctx if ctx is not None else {} return {} diff --git a/tests/test_raw_accessors.py b/tests/test_raw_accessors.py index 4874231..c3f04a7 100644 --- a/tests/test_raw_accessors.py +++ b/tests/test_raw_accessors.py @@ -24,7 +24,9 @@ class TestRawAccessorDriftLogging(unittest.TestCase): def test_composer_newly_created_files_empty_when_key_missing(self) -> None: - bare = Composer.from_dict(GOOD_COMPOSER_RAW, composer_id="cid-2") + raw = dict(GOOD_COMPOSER_RAW) + raw.pop("newlyCreatedFiles", None) + bare = Composer.from_dict(raw, composer_id="cid-2") with self.assertNoLogs("models.conversation", level="WARNING"): self.assertEqual(bare.newly_created_files, []) diff --git a/utils/text_extract.py b/utils/text_extract.py index 6a02745..ed5ec1f 100644 --- a/utils/text_extract.py +++ b/utils/text_extract.py @@ -1,7 +1,17 @@ """Text extraction helpers mirroring the bubble/richText parsing in the Node.js codebase.""" +from __future__ import annotations + import json import re +from typing import Any, Protocol + + +class HasBubbleRaw(Protocol): + """Bubble model or any object exposing a Cursor JSON ``raw`` dict.""" + + @property + def raw(self) -> dict[str, Any]: ... def extract_text_from_rich_text(children: list) -> str: @@ -21,12 +31,12 @@ def extract_text_from_rich_text(children: list) -> str: return text -def extract_text_from_bubble(bubble: dict | object) -> str: +def extract_text_from_bubble(bubble: HasBubbleRaw | dict[str, Any]) -> str: """Extract displayable text from a bubble object (text, richText, codeBlocks).""" - if hasattr(bubble, "raw"): - bubble = bubble.raw # type: ignore[union-attr] - if not bubble or not isinstance(bubble, dict): + raw: dict[str, Any] = bubble if isinstance(bubble, dict) else bubble.raw + if not raw: return "" + bubble = raw text = "" From 838ff0f44d6190084d9e2a648e0d3ac0a5ee85d8 Mon Sep 17 00:00:00 2001 From: bradjin8 Date: Thu, 4 Jun 2026 00:38:55 -0400 Subject: [PATCH 4/4] fix: outside diff range comments --- services/workspace_tabs.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/services/workspace_tabs.py b/services/workspace_tabs.py index 7e96eb8..3d098bc 100644 --- a/services/workspace_tabs.py +++ b/services/workspace_tabs.py @@ -134,7 +134,7 @@ def _assemble_tab_from_composer_data( if not bubble_id: continue bubble_entry = bubble_map.get(bubble_id) - if not bubble_entry: + if bubble_entry is None: continue if isinstance(bubble_entry, Bubble): bubble = bubble_entry @@ -570,6 +570,16 @@ def _safe_fetchall(query: str, params: tuple = ()) -> list: if not isinstance(cd, dict): parse_warnings.record_composer_skipped() continue + try: + composer = Composer.from_dict(cd, composer_id=composer_id) + except SchemaError as e: + _logger.warning( + "Failed to parse Composer from composerData:%s: %s", + composer_id, + e, + ) + parse_warnings.record_composer_skipped() + continue try: if ( composer_id not in composer_id_to_ws @@ -579,7 +589,7 @@ def _safe_fetchall(query: str, params: tuple = ()) -> list: global_db, composer_id, ) pid = determine_project_for_conversation( - cd, composer_id, project_layouts_map, + composer, composer_id, project_layouts_map, project_name_map, workspace_path_map, workspace_entries, {}, composer_id_to_ws, invalid_workspace_ids, ) @@ -591,14 +601,13 @@ def _safe_fetchall(query: str, params: tuple = ()) -> list: if assigned not in matching_ws_ids: continue - headers = cd.get("fullConversationHeadersOnly") or [] + headers = composer.full_conversation_headers_only if not headers: continue - title = cd.get("name") or f"Conversation {composer_id[:8]}" + title = composer.name or f"Conversation {composer_id[:8]}" - _early_model_config = cd.get("modelConfig") or {} - _early_model_name = _early_model_config.get("modelName") + _early_model_name = composer.model_name_from_config() _early_model_names = [_early_model_name] if _early_model_name and _early_model_name != "default" else None if is_excluded_by_rules(rules, build_searchable_text( project_name=workspace_display_name, @@ -614,7 +623,7 @@ def _safe_fetchall(query: str, params: tuple = ()) -> list: tab_entry: dict = { "id": composer_id, "title": title, - "timestamp": to_epoch_ms(cd.get("lastUpdatedAt")) or to_epoch_ms(cd.get("createdAt")) or int(datetime.now().timestamp() * 1000), + "timestamp": to_epoch_ms(composer.last_updated_at) or to_epoch_ms(composer.created_at) or int(datetime.now().timestamp() * 1000), "messageCount": len(headers), } if tab_meta: