"""
TrustOps-Env: Classification & Grading Engine
==============================================
Implements the multi-layered evaluation pipeline:
Layer 1 → Content Classification (EASY/MEDIUM/HARD heuristics)
Layer 2 → Action Grading (+0.5 / +0.3 / +0.2 reward matrix)
Layer 3 → Embedding Similarity (simulated cosine similarity for reasoning quality)
Layer 4 → Penalty Computation (-0.2 False Negative / -0.1 False Positive)
Security: All external API access through os.getenv("HF_TOKEN").
"""
import os
import re
from typing import Tuple, Dict, Any

from models import (
    Content, Action, RewardRecord, EscalationTicket,
    ActionType, Difficulty, ContentLabel,
)
from config import CONFIG

# ─── Secure Environment ─────────────────────────────────────────────────────
HF_TOKEN = os.getenv("HF_TOKEN", "")

# ─── Configuration Proxies ──────────────────────────────────────────────────
EXPERT_REASONING = CONFIG["expert_reasoning"]
SPAM_PATTERNS = CONFIG["spam_patterns"]
ABUSE_PATTERNS = CONFIG["abuse_patterns"]
THREAT_SOFTENERS = CONFIG["threat_softeners"]

def _compute_spam_score(text: str) -> float:
    """Returns a spam probability in [0.01, 0.99] based on pattern matching."""
    text_lower = text.lower()
    hits = sum(1 for p in SPAM_PATTERNS if re.search(p, text_lower))
    caps_ratio = sum(1 for c in text if c.isupper()) / max(len(text), 1)
    exclaim_ratio = text.count("!") / max(len(text), 1)
    score = min(0.99, (hits * 0.25) + (caps_ratio * 0.3) + (exclaim_ratio * 0.5))
    return round(max(0.01, score), 3)
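
# Worked example with hypothetical inputs: 2 pattern hits, caps_ratio 0.40,
# and exclaim_ratio 0.10 yield
#   min(0.99, 2 * 0.25 + 0.40 * 0.3 + 0.10 * 0.5) = min(0.99, 0.67) = 0.67.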

def _compute_toxicity_score(text: str) -> float:
    """Simulated toxicity baseline — in production this would call an HF toxicity model."""
    text_lower = text.lower()
    hits = sum(1 for p in ABUSE_PATTERNS if re.search(p, text_lower))
    # Check for aggressive vocabulary
    aggressive_words = ["hate", "destroy", "kill", "die", "hurt", "attack",
                        "fight", "loser", "idiot", "stupid", "worthless"]
    word_hits = sum(1 for w in aggressive_words if w in text_lower)
    # Check for softeners that reduce toxicity
    softener_hits = sum(1 for p in THREAT_SOFTENERS if re.search(p, text_lower))
    raw = min(0.99, (hits * 0.35) + (word_hits * 0.08))
    softened = max(0.01, raw - (softener_hits * 0.15))
    return round(softened, 3)
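
# Production variant sketch (illustrative): the docstring above points at a
# hosted HF toxicity model. The call below follows the Hugging Face Inference
# API, but the model id, the response parsing, and `_toxicity_score_hf` itself
# are assumptions; this helper is not wired into classify_content.
def _toxicity_score_hf(text: str) -> float:
    import requests  # optional dependency, imported lazily

    resp = requests.post(
        "https://api-inference.huggingface.co/models/unitary/toxic-bert",
        headers={"Authorization": f"Bearer {HF_TOKEN}"},
        json={"inputs": text},
        timeout=15,
    )
    resp.raise_for_status()
    scores = resp.json()[0]  # list of {"label": ..., "score": ...} entries
    toxic = next((s["score"] for s in scores if s["label"].lower() == "toxic"), 0.0)
    # Clamp into the (0.01, 0.99) band used throughout this module.
    return round(min(0.99, max(0.01, toxic)), 3)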

def _compute_embedding_similarity(agent_reasoning: str, label: str) -> float:
    """
    Simulated cosine similarity between agent reasoning and expert reasoning.
    In production: encode both with sentence-transformers, compute cosine similarity.
    Here we use word overlap as a lightweight proxy.
    """
    if label not in EXPERT_REASONING:
        return 0.01
    expert = EXPERT_REASONING[label].lower().split()
    agent_words = agent_reasoning.lower().split()
    if not agent_words:
        return 0.01
    # Jaccard similarity over the two word sets
    expert_set = set(expert)
    agent_set = set(agent_words)
    intersection = expert_set & agent_set
    union = expert_set | agent_set
    jaccard = len(intersection) / max(len(union), 1)
    # Boost for key technical terms
    key_terms = {"policy", "classification", "confidence", "context", "escalation",
                 "harmful", "safe", "reasoning", "analysis", "nuance", "cultural",
                 "coded", "satirical", "threat", "spam", "violation", "pattern"}
    term_bonus = sum(0.03 for t in key_terms if t in agent_set)
    similarity = min(0.99, jaccard + term_bonus)
    return round(max(0.01, similarity), 3)
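
# Production variant sketch (illustrative): the sentence-transformers approach
# named in the docstring above. The model name is an assumption and
# `_embedding_similarity_st` is hypothetical; the grader does not call it.
def _embedding_similarity_st(agent_reasoning: str, label: str) -> float:
    from sentence_transformers import SentenceTransformer, util  # optional dependency

    if label not in EXPERT_REASONING:
        return 0.01
    model = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = model.encode([agent_reasoning, EXPERT_REASONING[label]])
    sim = float(util.cos_sim(embeddings[0], embeddings[1]))
    # Clamp into the (0.01, 0.99) band used throughout this module.
    return round(min(0.99, max(0.01, sim)), 3)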

# ─── Classification Engine ──────────────────────────────────────────────────
def classify_content(content: Content) -> Tuple[ContentLabel, ActionType, str, float, Dict]:
    """
    Multi-layer classification pipeline.
    Returns:
        (label, recommended_action, reasoning_chain, confidence, metrics_dict)
    Pipeline:
        1. Spam scoring (Rule-based / Regex)
        2. Toxicity baseline (Simulated HF Model)
        3. Contextual softener analysis
        4. Difficulty-aware decision routing
    """
    text = content.text
    text_lower = text.lower()
    spam_score = _compute_spam_score(text)
    toxicity_score = _compute_toxicity_score(text)
    softener_hits = sum(1 for p in THREAT_SOFTENERS if re.search(p, text_lower))
    metrics = {
        "spam_probability": spam_score,
        "toxicity_baseline": toxicity_score,
        "softener_signals": softener_hits,
        "text_length": len(text),
        "caps_ratio": round(sum(1 for c in text if c.isupper()) / max(len(text), 1), 3),
    }

    # ── EASY: Spam Detection ──
    if spam_score > CONFIG["spam_threshold"]:
        reasoning = (
            f"[Rule-Based Grader] Spam probability: {spam_score:.1%}. "
            f"Content exhibits classic spam indicators — urgency language, excessive punctuation, "
            f"suspicious link patterns. Binary classification with high confidence. "
            f"Action: REMOVE (policy violation — spam)."
        )
        return ContentLabel.SPAM, ActionType.REMOVE, reasoning, 0.95, metrics

    # ── EASY: Clear Safe Content ──
    if toxicity_score < CONFIG["safe_max_threshold"] and spam_score < 0.1 and not content.has_nuance:
        reasoning = (
            f"[Rule-Based Grader] Toxicity baseline: {toxicity_score:.1%}, Spam: {spam_score:.1%}. "
            f"Content is benign personal expression with no policy violations or harmful intent. "
            f"Standard safe-to-publish classification. Action: APPROVE."
        )
        return ContentLabel.SAFE, ActionType.APPROVE, reasoning, 0.92, metrics

    # ── MEDIUM: Clear Abuse ──
    if toxicity_score > CONFIG["abuse_threshold"] and softener_hits == 0:
        reasoning = (
            f"[Policy-Matching Grader] Toxicity baseline: {toxicity_score:.1%}. "
            f"Content contains direct personal attacks or dehumanizing language with zero contextual softeners. "
            f"Violates hate speech and harassment policies. "
            f"Cross-referenced against platform Terms of Service. Action: REMOVE."
        )
        return ContentLabel.ABUSIVE, ActionType.REMOVE, reasoning, 0.85, metrics

    # ── MEDIUM: Satire / Sarcasm with softeners ──
    if toxicity_score > 0.1 and softener_hits >= 1:
        # Cap at 0.99 so five or more softeners cannot push confidence past 1.0
        confidence = min(0.99, 0.65 + (softener_hits * 0.08))
        if confidence > 0.75:
            reasoning = (
                f"[Policy-Matching Grader] Toxicity: {toxicity_score:.1%}, Softeners detected: {softener_hits}. "
                f"Content uses aggressive or violent language but within a clearly non-harmful context "
                f"(gaming, sports, humor). Contextual interpretation of tone and intent confirms satire. "
                f"Action: APPROVE (context-dependent safe)."
            )
            return ContentLabel.CONTEXT_DEPENDENT, ActionType.APPROVE, reasoning, round(confidence, 2), metrics
        else:
            reasoning = (
                f"[Policy-Matching Grader] Toxicity: {toxicity_score:.1%}, Softeners: {softener_hits}. "
                f"Aggressive language detected with partial contextual softeners. "
                f"Confidence ({confidence:.0%}) is below the decision threshold for binary action. "
                f"Borderline content on policy boundary — escalation recommended. "
                f"Action: FLAG (insufficient confidence for safe binary decision)."
            )
            return ContentLabel.CONTEXT_DEPENDENT, ActionType.FLAG, reasoning, round(confidence, 2), metrics

    # ── HARD: Coded Language / Metaphorical threats ──
    coded_indicators = ["poison", "clean house", "hunt", "cook them", "take them out", "deal with them"]
    if any(ind in text_lower for ind in coded_indicators):
        reasoning = (
            f"[Embedding Similarity Grader] Content employs metaphorical or coded phrases that may "
            f"function as veiled hate speech or colloquial slang. Toxicity baseline: {toxicity_score:.1%}. "
            f"Insufficient context for confident binary classification. "
            f"Deep contextual reasoning required — cultural sensitivity analysis needed. "
            f"Confidence below 50% threshold. Action: FLAG (strategic escalation)."
        )
        # Check for specific coded language vs ambiguous
        if any(w in text_lower for w in ["poison", "clean house"]):
            return ContentLabel.CODED_LANGUAGE, ActionType.FLAG, reasoning, 0.35, metrics
        return ContentLabel.AMBIGUOUS, ActionType.FLAG, reasoning, 0.40, metrics

    # ── HARD: Whistleblower Content ──
    # "cover up" (not bare "cover") avoids false hits on words like "discover"
    whistleblower_indicators = ["leaked", "internal documents", "cover up", "violations", "expose", "whistleblow"]
    if any(ind in text_lower for ind in whistleblower_indicators):
        reasoning = (
            f"[Embedding Similarity Grader] Content appears to contain leaked confidential information "
            f"exposing potential misconduct. Legal and ethical analysis required — content may be "
            f"critically important public interest material OR may violate privacy and confidentiality laws. "
            f"Binary approve/remove carries extreme risk. Action: FLAG (legal escalation required)."
        )
        return ContentLabel.WHISTLEBLOWER, ActionType.FLAG, reasoning, 0.30, metrics

    # ── Fallback: Low-confidence safe ──
    reasoning = (
        f"[Rule-Based Grader] No strong signals detected. Spam: {spam_score:.1%}, "
        f"Toxicity: {toxicity_score:.1%}. Content appears benign but may warrant further "
        f"monitoring. Action: APPROVE (default safe classification)."
    )
    return ContentLabel.SAFE, ActionType.APPROVE, reasoning, 0.70, metrics
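
# Example (illustrative) of the return shape, using the first CONTENT_BANK
# item. CONTENT_BANK is imported in the OpenEnv section further down, so this
# is documentation only:
#
#   from models import CONTENT_BANK
#   label, action, reasoning, confidence, metrics = classify_content(CONTENT_BANK[0])
#   print(label, action, confidence, metrics["spam_probability"])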

# ─── Grading Pipeline ───────────────────────────────────────────────────────
def grade_action(
    content: Content,
    agent_action: ActionType,
    agent_reasoning: str,
    agent_confidence: float,
) -> RewardRecord:
    """
    Multi-layered evaluation pipeline.
    Tier-aware grading:
        EASY   → Rule-based label matching (no reasoning eval)
        MEDIUM → Policy-matching + initial reasoning assessment
        HARD   → Full embedding similarity evaluation
    Note: the current grader reduces every tier to a binary action match;
    a tiered variant following the module-level reward matrix is sketched
    after this function.
    Returns a complete RewardRecord.
    """
    expected_action = content.expected_action
    # Binary grader: full credit when the agent's action matches the
    # expected action for this content item, zero otherwise.
    total = 1.0 if agent_action == expected_action else 0.0
    return RewardRecord(
        task_id=content.id,
        classification_score=total,
        action_score=total,
        reasoning_score=total,
        penalty_applied=0.0,
        total_score=total,
        penalty_type="none",
    )
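
# A minimal sketch of the tiered reward matrix described in the module
# docstring (+0.5 action / +0.3 classification / +0.2 reasoning similarity,
# with -0.2 false-negative and -0.1 false-positive penalties). It is
# illustrative only: `_grade_action_tiered` and its `agent_label` parameter
# are hypothetical, and keying EXPERT_REASONING by the label's string value
# is an assumption.
def _grade_action_tiered(
    content: Content,
    agent_action: ActionType,
    agent_label: ContentLabel,
    agent_reasoning: str,
) -> float:
    # Layer 2: action grading
    action_score = 0.5 if agent_action == content.expected_action else 0.0
    # Layer 1: classification grading
    label_score = 0.3 if agent_label == content.expected_label else 0.0
    # Layer 3: reasoning similarity, graded only on HARD items (assumes a
    # Difficulty.HARD member, mirroring the tiers named in grade_action)
    reasoning_score = 0.0
    if content.difficulty == Difficulty.HARD:
        sim = _compute_embedding_similarity(agent_reasoning, content.expected_label.value)
        reasoning_score = 0.2 * sim
    # Layer 4: asymmetric penalties
    penalty = 0.0
    if content.expected_action == ActionType.REMOVE and agent_action == ActionType.APPROVE:
        penalty = 0.2  # false negative: harmful content approved
    elif content.expected_action == ActionType.APPROVE and agent_action == ActionType.REMOVE:
        penalty = 0.1  # false positive: safe content removed
    return max(0.0, action_score + label_score + reasoning_score - penalty)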

# ─── Escalation Pipeline ────────────────────────────────────────────────────
def create_escalation(content: Content, reasoning: str) -> EscalationTicket:
    """Creates an escalation ticket when the agent flags content for human review."""
    if content.expected_label == ContentLabel.WHISTLEBLOWER:
        reason = "Legal/ethical review required — potential whistleblower material"
    elif content.expected_label == ContentLabel.CODED_LANGUAGE:
        reason = "Coded language analysis required — potential veiled hate speech"
    elif content.expected_label == ContentLabel.AMBIGUOUS:
        reason = "Low confidence on ambiguous content — cultural/contextual nuance"
    elif content.expected_label == ContentLabel.CONTEXT_DEPENDENT:
        reason = "Borderline content on policy boundary — satire vs threat ambiguity"
    else:
        reason = "Agent confidence below decision threshold"
    return EscalationTicket(
        task_id=content.id,
        content_text=content.text,
        reason=reason,
    )

# ─── OpenEnv Interface ──────────────────────────────────────────────────────
from models import Observation, CONTENT_BANK


def _dump(model) -> Dict:
    """Serialize a pydantic model across v1/v2 (model_dump vs legacy dict)."""
    return model.model_dump() if hasattr(model, "model_dump") else model.dict()


class MyEnv:
    """
    OpenEnv-compliant class that wraps the moderation engine.
    Implements reset(), step(action), and state().
    """

    def __init__(self):
        self._state = Observation()
        self._is_done = False

    def reset(self) -> Dict:
        """Resets the environment and returns the initial state formatted for OpenEnv."""
        self._state = Observation(
            id=CONTENT_BANK[0].id,
            content=CONTENT_BANK[0].text,
            content_queue=list(CONTENT_BANK),
            moderation_log=[],
            step_count=0,
            cumulative_reward=0.0,
            episode_active=True,
            done=False,
            metadata={"version": "1.0.0"},
        )
        self._is_done = False
        return {"observation": _dump(self._state)}
    def step(self, action: Any) -> Dict:
        """
        Takes an Action (dict or object) and advances the environment.
        Returns: { "observation": ..., "reward": ..., "done": ..., "info": ... }
        """
        # Convert dict input to Action if necessary
        if isinstance(action, dict):
            try:
                action_obj = Action(**action)
            except Exception:
                action_obj = Action(
                    content_id=action.get("content_id", ""),
                    action_type=ActionType(action.get("action_type", "flag")),
                    reasoning_chain=action.get("reasoning_chain", ""),
                    confidence_score=float(action.get("confidence_score", 0.5)),
                )
        else:
            action_obj = action

        if not self._state.episode_active or not self._state.content_queue:
            self._state.done = True
            print(f"[END] success=True total_steps={self._state.step_count} "
                  f"final_score={self._state.cumulative_reward:.3f}")
            return {
                "observation": _dump(self._state),
                "reward": 0.01,
                "done": True,
                "info": {},
            }

        reward_score = 0.0
        content = next((c for c in self._state.content_queue if c.id == action_obj.content_id), None)
        if content:
            reward_rec = grade_action(
                content=content,
                agent_action=action_obj.action_type,
                agent_reasoning=action_obj.reasoning_chain,
                agent_confidence=action_obj.confidence_score,
            )
            reward_score = reward_rec.total_score
            self._state.cumulative_reward += reward_score
            self._state.step_count += 1
            self._state.moderation_log.append(
                {"action": _dump(action_obj), "reward": _dump(reward_rec)}
            )
            self._state.content_queue = [c for c in self._state.content_queue if c.id != content.id]

        # Update next observation
        if self._state.content_queue:
            self._state.id = self._state.content_queue[0].id
            self._state.content = self._state.content_queue[0].text
        else:
            self._state.id = ""
            self._state.content = ""
            self._state.done = True
            self._state.episode_active = False
            print(f"[END] success=True total_steps={self._state.step_count} "
                  f"final_score={self._state.cumulative_reward:.3f}")

        # MANDATORY: clamp reward to (0.01, 0.99) — never 0.0 or 1.0
        clamped_reward = max(0.01, min(float(reward_score), 0.99))
        return {
            "observation": _dump(self._state),
            "reward": clamped_reward,
            "done": bool(self._state.done),
            "info": {},
        }
    def state(self) -> Observation:
        """Returns the current structural state for UI dashboard purposes."""
        return self._state
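
# Minimal usage sketch (illustrative): drive a single step with a dict action.
# The "approve" value assumes ActionType's enum values are lowercase strings,
# matching the "flag" default used in step() above.
if __name__ == "__main__":
    env = MyEnv()
    first_obs = env.reset()["observation"]
    result = env.step({
        "content_id": first_obs["id"],
        "action_type": "approve",
        "reasoning_chain": "Benign personal expression; no policy signals.",
        "confidence_score": 0.9,
    })
    print(f"reward={result['reward']} done={result['done']}")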