From 2e949ea38dedbb2a505d0a61a5f9d05096cde55e Mon Sep 17 00:00:00 2001 From: Exploreunive Date: Sat, 21 Mar 2026 22:14:33 +0800 Subject: [PATCH] fix: normalize plugin-wrapped conversation content before memory extraction --- .../conv_memcell_extractor.py | 63 +++++++++++++++++++ tests/test_conv_memcell_extractor.py | 49 +++++++++++++++ 2 files changed, 112 insertions(+) diff --git a/src/memory_layer/memcell_extractor/conv_memcell_extractor.py b/src/memory_layer/memcell_extractor/conv_memcell_extractor.py index 881628e4..7337da27 100644 --- a/src/memory_layer/memcell_extractor/conv_memcell_extractor.py +++ b/src/memory_layer/memcell_extractor/conv_memcell_extractor.py @@ -36,6 +36,57 @@ logger = get_logger(__name__) +def _normalize_message_content_value(value: Any) -> str: + """Normalize message content into plain text for downstream memory extraction. + + Handles plugin-wrapped/nested content structures (e.g. OpenClaw/Feishu payloads) + by recursively extracting textual fields and flattening them into a readable string. 
+ """ + if value is None: + return "" + if isinstance(value, str): + return value + if isinstance(value, (int, float, bool)): + return str(value) + + if isinstance(value, list): + parts = [_normalize_message_content_value(item) for item in value] + return "\n".join(part for part in parts if part) + + if isinstance(value, dict): + preferred_keys = [ + 'text', 'content', 'message', 'body', 'value', + 'output_text', 'input_text', 'title', 'description' + ] + + collected = [] + seen = set() + for key in preferred_keys: + if key in value: + normalized = _normalize_message_content_value(value.get(key)) + if normalized and normalized not in seen: + collected.append(normalized) + seen.add(normalized) + + if collected: + return "\n".join(collected) + + skip_keys = { + 'type', 'role', 'id', '_id', 'msgType', 'timestamp', 'time', + 'speaker_id', 'speaker_name', 'sender', 'sender_name', 'referList', + 'metadata', 'extra', 'tool_calls', 'tool_call_id', 'arguments', 'name' + } + for key, nested in value.items(): + if key in skip_keys: + continue + normalized = _normalize_message_content_value(nested) + if normalized: + return normalized + return "" + + return str(value) + + @dataclass class BoundaryDetectionResult: """Boundary detection result.""" @@ -580,4 +631,16 @@ def _data_process(self, raw_data: RawData) -> Dict[str, Any]: f"[ConvMemCellExtractor] Message type {msg_type} converted to placeholder: {placeholder}" ) + if isinstance(content, dict) and 'content' in content: + normalized_text = _normalize_message_content_value(content.get('content')) + if normalized_text != content.get('content'): + content = content.copy() + content['content'] = normalized_text + + if isinstance(content, dict) and 'content' in content: + normalized_text = _normalize_message_content_value(content.get('content')) + if normalized_text != content.get('content'): + content = content.copy() + content['content'] = normalized_text + return content diff --git a/tests/test_conv_memcell_extractor.py 
def test_data_process_normalizes_plugin_wrapped_content(self):
    """Nested plugin-wrapped content is flattened to newline-joined text."""
    # Inner fragments as a plugin would wrap them: one direct text part and
    # one tool result whose text sits one level deeper.
    fragments = [
        {"type": "text", "text": "今天讨论 EverMemOS 的修复方案"},
        {"type": "tool_result", "content": {"text": "需要过滤插件包裹结构"}},
    ]
    wrapped_content = {
        "type": "message",
        "content": fragments,
        "metadata": {"plugin": "openclaw-feishu"},
    }
    message = {
        "speaker_id": "user_1",
        "speaker_name": "Alice",
        "content": wrapped_content,
        "timestamp": self.base_time.isoformat(),
        "msgType": 1,
    }
    raw_data = RawData(content=message, data_id="wrapped_1", metadata={})

    result = ConvMemCellExtractor(None)._data_process(raw_data)

    assert result is not None
    expected_text = "今天讨论 EverMemOS 的修复方案\n需要过滤插件包裹结构"
    assert result["content"] == expected_text

def test_data_process_preserves_plain_text_content(self):
    """Plain string content passes through unchanged."""
    message = {
        "speaker_id": "user_1",
        "speaker_name": "Alice",
        "content": "普通文本消息",
        "timestamp": self.base_time.isoformat(),
        "msgType": 1,
    }
    raw_data = RawData(content=message, data_id="plain_1", metadata={})

    result = ConvMemCellExtractor(None)._data_process(raw_data)

    assert result is not None
    assert result["content"] == "普通文本消息"