From 2e949ea38dedbb2a505d0a61a5f9d05096cde55e Mon Sep 17 00:00:00 2001
From: Exploreunive <Exploreunive@users.noreply.github.com>
Date: Sat, 21 Mar 2026 22:14:33 +0800
Subject: [PATCH] fix: normalize plugin-wrapped conversation content before
 memory extraction

---
 .../conv_memcell_extractor.py                 | 63 +++++++++++++++++++
 tests/test_conv_memcell_extractor.py          | 49 +++++++++++++++
 2 files changed, 112 insertions(+)

diff --git a/src/memory_layer/memcell_extractor/conv_memcell_extractor.py b/src/memory_layer/memcell_extractor/conv_memcell_extractor.py
index 881628e4..7337da27 100644
--- a/src/memory_layer/memcell_extractor/conv_memcell_extractor.py
+++ b/src/memory_layer/memcell_extractor/conv_memcell_extractor.py
@@ -36,6 +36,57 @@
 logger = get_logger(__name__)
 
 
+_PREFERRED_TEXT_KEYS = (
+    'text', 'content', 'message', 'body', 'value',
+    'output_text', 'input_text', 'title', 'description',
+)
+_NON_TEXT_KEYS = frozenset({
+    'type', 'role', 'id', '_id', 'msgType', 'timestamp', 'time',
+    'speaker_id', 'speaker_name', 'sender', 'sender_name', 'referList',
+    'metadata', 'extra', 'tool_calls', 'tool_call_id', 'arguments', 'name',
+})
+
+
+def _normalize_message_content_value(value: Any) -> str:
+    """Flatten plugin-wrapped/nested message content into plain text.
+
+    ``None`` -> ``""``; ``str`` is returned as-is; int/float/bool -> ``str()``.
+    Lists/tuples: items are flattened and non-empty parts newline-joined.
+    Dicts: texts under the preferred keys are joined in priority order
+    (duplicates dropped); if none has text, the first non-empty value under
+    a key outside ``_NON_TEXT_KEYS`` is used. Other types -> ``str(value)``.
+    """
+    if value is None:
+        return ""
+    if isinstance(value, str):
+        return value
+    if isinstance(value, (int, float, bool)):
+        return str(value)
+
+    # Tuples are handled like lists so they are flattened, not repr()-ed.
+    if isinstance(value, (list, tuple)):
+        parts = (_normalize_message_content_value(item) for item in value)
+        return "\n".join(part for part in parts if part)
+
+    if isinstance(value, dict):
+        found = (value[key] for key in _PREFERRED_TEXT_KEYS if key in value)
+        # dict.fromkeys de-duplicates while keeping first-seen order.
+        texts = dict.fromkeys(map(_normalize_message_content_value, found))
+        texts.pop("", None)
+        if texts:
+            return "\n".join(texts)
+
+        # Fallback: first non-empty text under a non-bookkeeping key.
+        for key, nested in value.items():
+            if key not in _NON_TEXT_KEYS:
+                normalized = _normalize_message_content_value(nested)
+                if normalized:
+                    return normalized
+        return ""
+
+    return str(value)
+
+
 @dataclass
 class BoundaryDetectionResult:
     """Boundary detection result."""
@@ -580,4 +631,16 @@ def _data_process(self, raw_data: RawData) -> Dict[str, Any]:
                     f"[ConvMemCellExtractor] Message type {msg_type} converted to placeholder: {placeholder}"
                 )
 
+        # Flatten plugin-wrapped/nested payloads stored under 'content' into
+        # plain text. One pass is enough: normalizing a str returns it as-is,
+        # so repeating this step could never change the result.
+        if isinstance(content, dict) and 'content' in content:
+            raw_value = content['content']
+            normalized_text = _normalize_message_content_value(raw_value)
+            if normalized_text != raw_value:
+                # Rebind to a shallow copy so the dict object that was bound
+                # to `content` up to here is not modified in place.
+                content = content.copy()
+                content['content'] = normalized_text
+
         return content
diff --git a/tests/test_conv_memcell_extractor.py b/tests/test_conv_memcell_extractor.py
index db24dbf8..68d89597 100644
--- a/tests/test_conv_memcell_extractor.py
+++ b/tests/test_conv_memcell_extractor.py
@@ -82,6 +82,55 @@ def create_raw_data_list(self, messages: List[Dict[str, Any]]) -> List[RawData]:
             raw_data_list.append(raw_data)
         return raw_data_list
 
+    def test_data_process_normalizes_plugin_wrapped_content(self):
+        """Nested plugin payloads under ``content`` are flattened to text."""
+        memcell_extractor = ConvMemCellExtractor(None)
+
+        # Outer message whose "content" is itself a wrapped plugin payload.
+        message = {
+            "speaker_id": "user_1",
+            "speaker_name": "Alice",
+            "content": {
+                "type": "message",
+                "content": [
+                    {"type": "text", "text": "今天讨论 EverMemOS 的修复方案"},
+                    {"type": "tool_result", "content": {"text": "需要过滤插件包裹结构"}},
+                ],
+                "metadata": {"plugin": "openclaw-feishu"},
+            },
+            "timestamp": self.base_time.isoformat(),
+            "msgType": 1,
+        }
+        wrapped_raw = RawData(content=message, data_id="wrapped_1", metadata={})
+
+        result = memcell_extractor._data_process(wrapped_raw)
+
+        # Both text fragments are expected, separated by a single newline.
+        expected_text = "今天讨论 EverMemOS 的修复方案\n需要过滤插件包裹结构"
+        assert result is not None
+        assert result["content"] == expected_text
+
+    def test_data_process_preserves_plain_text_content(self):
+        """A plain string under ``content`` passes through unmodified."""
+        memcell_extractor = ConvMemCellExtractor(None)
+        plain_text = "普通文本消息"
+
+        message = {
+            "speaker_id": "user_1",
+            "speaker_name": "Alice",
+            "content": plain_text,
+            "timestamp": self.base_time.isoformat(),
+            "msgType": 1,
+        }
+        plain_raw = RawData(content=message, data_id="plain_1", metadata={})
+
+        result = memcell_extractor._data_process(plain_raw)
+
+        # The processed message must exist and still carry exactly the
+        # string that was supplied.
+        assert result is not None
+        assert result["content"] == plain_text
+
     def create_realistic_conversation(self) -> tuple[List[RawData], List[RawData]]:
         """Create realistic conversation scenario"""
         # Historical conversation - Project discussion