-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathplotter.py
More file actions
206 lines (178 loc) · 8.08 KB
/
plotter.py
File metadata and controls
206 lines (178 loc) · 8.08 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
# ── Imports ───────────────────────────────────────────────────────────────────
import os
import re
import json
from openai import OpenAI # OpenAI-compatible client — used to call DeepSeek's API
from json_repair import repair_json # fixes malformed JSON from LLM responses
from pydantic import BaseModel, ValidationError # schema validation for structured output
# Previous import (Groq-hosted models — now decommissioned):
# from groq import Groq
# ── Client Initialisation — Previous (Groq) ───────────────────────────────────
# client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
# ── Client Initialisation — DeepSeek via OpenAI-compatible API ───────────────
# DeepSeek's API is OpenAI-compatible: the standard openai.OpenAI client works
# by pointing base_url at DeepSeek's endpoint and passing a DeepSeek API key.
# Instantiated once at module level so the client is reused across all calls.
# NOTE(review): if DEEPSEEK_API_KEY is unset, os.environ.get returns None and
# the failure surfaces only on the first API call, not at import time — verify
# the key is loaded (e.g. via dotenv) before this module is imported.
client = OpenAI(
    api_key=os.environ.get("DEEPSEEK_API_KEY"),  # DeepSeek API key from .env
    base_url="https://api.deepseek.com"          # DeepSeek's OpenAI-compatible endpoint
)
# ── Model Selection ───────────────────────────────────────────────────────────
# deepseek-chat = DeepSeek-V3 in standard mode — strong at structured JSON
# output and HTML/JS chart generation. No <think> tokens in this mode.
model = "deepseek-chat"
# Alternative: deepseek-reasoner — thinking mode, stronger reasoning, produces <think> tokens
# strip_thinking() below handles them automatically if you switch to this.
# model = "deepseek-reasoner"
# Previous models on Groq (all decommissioned):
# model = "deepseek-r1-distill-qwen-32b"
# model = "deepseek-r1-distill-llama-70b"
# model = "llama-3.1-8b-instant"
# ── Output Schema ─────────────────────────────────────────────────────────────
# Pydantic model that validates the structured JSON returned by the LLM.
# Instantiated as Product(**json_data) in output_formatter(); a missing or
# wrongly-typed field raises ValidationError, which triggers the fallback path.
class Product(BaseModel):
    # Full HTML document string for charts/tables; "" when the data is not
    # plottable (required — the LLM must always supply it).
    html_content: str
    # Plain-text report summarising the data insight; defaults to "" so a
    # response that omits the field still validates.
    summary: str = ""
# ── System Prompt ─────────────────────────────────────────────────────────────
# Instructs the LLM to act as a data visualisation expert and return a strict
# JSON object with two fields: html_content and summary.
# The embedded HTML example teaches the model the exact output format expected.
# NOTE(review): this text is runtime behavior (it is sent verbatim to the
# model) — do not edit casually. The example JSON is deliberately non-strict
# (raw newlines inside string values); client-side repair_json() is what makes
# the model's imitation of it parseable.
system_prompt = """
You are a data summary & data visualizing expert. You will be shown the Users question & the csv agent's output, \
You should summarize the agents results and also plot the data with a suitable plot technique via HTML format doc content.
html_content --> used for either plots or tables, default value is ""
summary --> used for summarizing the result in a report fashion
Note: always respond with valid JSON objects that match this complete HTML structure while plotting
If the data is Not plottable, return default empty str as output in place of html_content.
Example:
{
"html_content": "
<!DOCTYPE html>
<html>
<head>
<title>Category Data Visualization</title>
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
<style>
body {
font-family: Arial, sans-serif;
margin: 40px;
}
.container {
display: flex;
gap: 40px;
align-items: flex-start;
}
table {
border-collapse: collapse;
width: 300px;
}
th, td {
border: 1px solid #ccc;
padding: 12px;
text-align: center;
}
th {
background-color: #f4f4f4;
}
.chart-container {
width: 400px;
}
</style>
</head>
<body>
<h2>Category Distribution</h2>
<div class="container">
<!-- Table -->
<table>
<thead>
<tr>
<th>Category</th>
<th>Percentage</th>
</tr>
</thead>
<tbody>
<tr>
<td>Category 1</td>
<td>20%</td>
</tr>
<tr>
<td>Category 2</td>
<td>80%</td>
</tr>
</tbody>
</table>
<!-- Pie Chart -->
<div class="chart-container">
<canvas id="categoryChart"></canvas>
</div>
</div>
<script>
const ctx = document.getElementById('categoryChart').getContext('2d');
const categoryChart = new Chart(ctx, {
type: 'pie',
data: {
labels: ['Category 1', 'Category 2'],
datasets: [{
label: 'Category Distribution',
data: [20, 80],
backgroundColor: ['#4e79a7', '#f28e2b']
}]
},
options: {
responsive: true,
plugins: {
legend: {
position: 'bottom'
}
}
}
});
</script>
</body>
</html>
",
"summary":
"Category 1 accounts for 20% of the total.
Category 2 dominates with 80% of the total."
}
Your response should ONLY contain the JSON object and nothing else.
"""
# ── Thinking Token Stripper ───────────────────────────────────────────────────
# deepseek-reasoner prepends <think>...</think> blocks containing internal
# reasoning before the actual answer. These must be removed before JSON parsing.
# deepseek-chat does NOT produce thinking tokens — this is a no-op for that model.
def strip_thinking(text):
    """Remove <think>...</think> reasoning blocks from an LLM response.

    deepseek-reasoner prepends internal reasoning wrapped in <think> tags;
    deepseek-chat emits none, so this is a no-op for it. Beyond complete
    pairs, this also handles an orphan closing tag — some reasoning models
    omit (or truncate) the opening <think>, leaving
    'reasoning...</think>answer' — by dropping everything up to and
    including the last stray '</think>'.

    Parameters:
        text: raw model output, possibly containing thinking tokens.

    Returns:
        The text with all reasoning removed and surrounding whitespace
        stripped.
    """
    # Remove well-formed <think>...</think> pairs; non-greedy so multiple
    # blocks are each removed, DOTALL so blocks may span newlines.
    cleaned = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL)
    # Orphan closing tag (opening tag omitted/truncated): keep only the
    # text after the last one, since everything before it is reasoning.
    if '</think>' in cleaned:
        cleaned = cleaned.rsplit('</think>', 1)[1]
    return cleaned.strip()
# ── Output Formatter ──────────────────────────────────────────────────────────
# Takes the user's original question and the CSV agent's raw text response,
# then asks the LLM to produce a structured JSON with an HTML visualisation
# and a plain-text summary. Returns a (html_content, summary) tuple.
def output_formatter(user_question, csv_agent_response):
    """Turn a CSV agent's raw answer into an HTML visualisation + summary.

    Sends the user's question and the agent's output to the LLM, which
    returns a JSON object matching the Product schema.

    Parameters:
        user_question: the user's original natural-language question.
        csv_agent_response: raw text answer produced by the CSV agent.

    Returns:
        (html_content, summary) tuple. On any parse/validation failure the
        think-token-stripped model text is returned as the summary so the
        caller still gets a usable answer.
    """
    # ── LLM Call ─────────────────────────────────────────────────────────
    # No response_format="json_object" — strict server-side JSON validation
    # rejects multi-line HTML embedded in JSON strings.
    # Instead we parse client-side with repair_json() for resilience.
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": f"user question: {user_question},\ncsv agent output: {csv_agent_response}"}
        ]
    )
    # Extract and strip outside the try: these lines cannot raise the caught
    # exceptions, and doing it here guarantees response_content is bound in
    # the except handler (the old fallback re-read the raw content, leaking
    # <think> tokens back to the user when deepseek-reasoner is used).
    raw_content = completion.choices[0].message.content
    response_content = strip_thinking(raw_content)
    # ── Parse & Validate Response ─────────────────────────────────────────
    try:
        # repair_json fixes common LLM JSON issues (unescaped newlines,
        # trailing commas) before strict parsing.
        json_data = json.loads(repair_json(response_content))
        # Validate against the Product schema — raises ValidationError on
        # wrong fields. TypeError covers repaired JSON that is a list or
        # scalar, which would otherwise crash Product(**json_data) uncaught.
        product = Product(**json_data)
    except (json.JSONDecodeError, ValidationError, TypeError) as e:
        # Fallback: return the stripped text as the summary so the user
        # still gets an answer even when structured parsing fails.
        print(f"Error parsing response: {e}")
        return "", response_content
    return product.html_content, product.summary