SentiBot/analyzer.py at master · kpluas21/SentiBot · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
"""Core sentiment analysis using the Anthropic API."""
from __future__ import annotations

import csv
import json
import logging
import os
from typing import Any

import anthropic
from dotenv import load_dotenv

# Load variables from a .env file as an import-time side effect.
# NOTE(review): presumably this is how the Anthropic API key reaches
# anthropic.Anthropic(), and override=True presumably lets .env values replace
# variables already set in the environment — confirm against python-dotenv.
load_dotenv(override=True)

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Model identifier sent with every request and echoed back in each result dict.
MODEL = "claude-sonnet-4-6"

# Shared client instance; stays None until _get_client() creates it.
_client: anthropic.Anthropic | None = None

# System prompt passed with every analysis request.
_SYSTEM_PROMPT = (
    "You are a sentiment analysis expert. "
    "Analyze the sentiment of the given text and respond with a JSON object."
)

# JSON schema handed to the API as the requested output format: an object with
# exactly two required fields, "label" (one of three fixed strings) and
# "confidence" (a number). The 0.0-1.0 range is stated only in the field
# description; the schema itself carries no minimum/maximum constraint.
_OUTPUT_SCHEMA = {
    "type": "object",
    "properties": {
        "label": {
            "type": "string",
            "enum": ["positive", "negative", "neutral"],
            "description": "The overall sentiment label.",
        },
        "confidence": {
            "type": "number",
            "description": "Confidence score between 0.0 and 1.0.",
        },
    },
    "required": ["label", "confidence"],
    "additionalProperties": False,
}


def _get_client() -> anthropic.Anthropic:
    """Hand back the module-wide Anthropic client, building it if needed.

    The first call constructs ``anthropic.Anthropic()`` with default arguments
    and stores it in the module-level ``_client``; every later call returns
    that stored instance.
    """
    global _client
    # Fast path: a client was already built by an earlier call.
    if _client is not None:
        return _client
    _client = anthropic.Anthropic()
    return _client


def analyze_sentiment(text: str) -> dict[str, Any]:
    """Analyze sentiment of a single text string.

    Args:
        text: The text to analyze. Empty and whitespace-only strings are
            accepted: a "(empty)" placeholder is sent to the model in their
            place, while the returned ``text`` field keeps the original value.

    Returns:
        A dict with keys: text, label, confidence, tokens_used, model.
        ``tokens_used`` is the sum of the input and output token counts
        reported in the response's usage data.

    Raises:
        anthropic.APIError: Propagated on any API failure after logging.
        ValueError: If the response text is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError``) or the confidence
            value cannot be converted to ``float``. Logged, then re-raised.
        KeyError, IndexError, AttributeError, TypeError: If the response does
            not have the expected shape (no content block, first block without
            ``text``, missing ``label``/``confidence``). Logged, then
            re-raised unchanged.
    """
    client = _get_client()
    # Treat whitespace-only input like empty input. NOTE(review): the Messages
    # API is expected to reject blank text content just as it does empty
    # content; sending the placeholder keeps one blank row from failing a batch.
    user_content = text if text and text.strip() else "(empty)"

    try:
        response = client.messages.create(
            model=MODEL,
            max_tokens=256,
            system=_SYSTEM_PROMPT,
            messages=[{"role": "user", "content": user_content}],
            output_config={
                "format": {
                    "type": "json_schema",
                    "schema": _OUTPUT_SCHEMA,
                }
            },
        )
    except anthropic.APIError as exc:
        logger.error("API error analyzing text %r: %s", text[:50], exc)
        raise

    tokens_used = response.usage.input_tokens + response.usage.output_tokens
    logger.info(
        "analyze_sentiment: input_tokens=%d output_tokens=%d total=%d",
        response.usage.input_tokens,
        response.usage.output_tokens,
        tokens_used,
    )

    # Parse defensively: a truncated or otherwise unexpected reply would fail
    # here. Mirror the API-error path (log with context, then re-raise) so the
    # exception types callers may already handle stay exactly the same.
    try:
        raw = json.loads(response.content[0].text)
        label = raw["label"]
        confidence = float(raw["confidence"])
    except (AttributeError, IndexError, KeyError, TypeError, ValueError) as exc:
        logger.error(
            "Malformed response analyzing text %r (stop_reason=%r): %s",
            text[:50],
            getattr(response, "stop_reason", None),
            exc,
        )
        raise

    return {
        "text": text,
        "label": label,
        "confidence": confidence,
        "tokens_used": tokens_used,
        "model": MODEL,
    }


def batch_analyze(texts: list[str]) -> list[dict[str, Any]]:
    """Run ``analyze_sentiment`` over every text, one call per item.

    Args:
        texts: List of text strings to analyze.

    Returns:
        One result dict per input text, in input order. An empty input
        yields an empty list.

    Raises:
        anthropic.APIError: On the first API failure encountered; results
            gathered before the failure are not returned.
    """
    # Items are processed sequentially, so the first exception stops the run.
    return [analyze_sentiment(item) for item in texts]


def load_csv(filepath: str) -> list[str]:
    """Read the first column of a CSV file, ignoring its header row.

    The file is opened as UTF-8. The first row is always discarded as the
    header, and rows that parse as empty (e.g. blank lines) are skipped. A
    non-empty row whose first cell is blank still contributes an empty string.

    Args:
        filepath: Path to the CSV file.

    Returns:
        List of text strings from the first column.

    Raises:
        FileNotFoundError: If the file does not exist.
        ValueError: If the file is empty or has no data rows.
    """
    with open(filepath, newline="", encoding="utf-8") as handle:
        rows = csv.reader(handle)
        # Drop the header; the None default keeps a completely empty file
        # from raising StopIteration here.
        next(rows, None)
        first_column = [record[0] for record in rows if record]
    if first_column:
        return first_column
    raise ValueError(f"No data rows found in {filepath!r}")


def save_results(results: list[dict[str, Any]], filepath: str) -> None:
    """Write result dicts to a UTF-8 CSV file with a fixed set of columns.

    The header row is always written, even when ``results`` is empty (a
    warning is logged in that case). Keys missing from a result are written
    as empty cells; keys outside the fixed column set are dropped.

    Args:
        results: List of result dicts as returned by analyze_sentiment.
        filepath: Destination path for the output CSV; an existing file is
            overwritten.
    """
    columns = ["text", "label", "confidence", "tokens_used", "model"]

    if not results:
        logger.warning("save_results called with empty results list; writing empty file.")

    with open(filepath, "w", newline="", encoding="utf-8") as out:
        writer = csv.DictWriter(out, fieldnames=columns)
        writer.writeheader()
        # Project each result onto the known columns before writing.
        writer.writerows(
            {column: entry.get(column, "") for column in columns}
            for entry in results
        )
    logger.info("save_results: wrote %d rows to %r", len(results), filepath)