-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathattention.html
More file actions
709 lines (620 loc) · 37.8 KB
/
attention.html
File metadata and controls
709 lines (620 loc) · 37.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>The Anatomy of Multi-Head Attention</title>
<!-- Tailwind CSS -->
<script src="https://cdn.tailwindcss.com"></script>
<!-- Chart.js -->
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
<!-- MathJax Configuration & Loader -->
<script>
window.MathJax = {
tex: {
inlineMath: [['$', '$'], ['\\(', '\\)']]
}
};
</script>
<script id="MathJax-script" async src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
<!-- Google Fonts -->
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&family=JetBrains+Mono:wght@400;700&display=swap" rel="stylesheet">
<style>
/* Chosen Palette: Warm Neutrals & Academic Clarity */
:root {
--bg-canvas: #0f1115; /* Deep Charcoal */
--text-main: #e6eef7; /* Cool White */
--text-muted: #9aa6b2;
--accent-1: #61dafb; /* Cyan */
--accent-2: #e9c46a; /* Mustard */
--accent-3: #f4a261; /* Muted Orange */
--accent-4: #e76f51; /* Burnt Sienna */
--accent-5: #264653; /* Dark Slate */
--card-bg: #151a21;
}
body {
background-color: var(--bg-canvas);
color: var(--text-main);
font-family: 'Inter', sans-serif;
}
.mono-font {
font-family: 'JetBrains Mono', monospace;
}
/* Custom Scrollbar */
::-webkit-scrollbar {
width: 8px;
}
::-webkit-scrollbar-track {
background: transparent;
}
::-webkit-scrollbar-thumb {
background: #d1d5db;
border-radius: 4px;
}
/* Transitions */
.interactive-card {
transition: all 0.3s ease;
}
.interactive-card:hover {
transform: translateY(-2px);
box-shadow: 0 10px 15px -3px rgba(0, 0, 0, 0.1), 0 4px 6px -2px rgba(0, 0, 0, 0.05);
}
.word-token {
transition: background-color 0.2s;
cursor: pointer;
}
/* Chart Container Utilities */
.chart-wrapper {
position: relative;
width: 100%;
height: 300px;
max-height: 400px;
}
/* Theme alignment with main site */
.attention-post .bg-white { background-color: #151a21 !important; }
.attention-post .bg-gray-50 { background-color: #11161c !important; }
.attention-post .bg-gray-100 { background-color: #121821 !important; }
.attention-post .bg-gray-200 { background-color: #1a2230 !important; }
.attention-post .bg-gray-900 { background-color: #0b0f14 !important; }
.attention-post .text-gray-900 { color: #e6eef7 !important; }
.attention-post .text-gray-800 { color: #d7e1ee !important; }
.attention-post .text-gray-700 { color: #c2ccda !important; }
.attention-post .text-gray-600 { color: #a4b0c2 !important; }
.attention-post .text-gray-500 { color: #8a96a8 !important; }
.attention-post .text-gray-400 { color: #6f7b8c !important; }
.attention-post .border-gray-100 { border-color: #1f2833 !important; }
.attention-post .border-gray-200 { border-color: #263141 !important; }
.attention-post .border-gray-700 { border-color: #38465b !important; }
.attention-post .text-teal-600 { color: #61dafb !important; }
.attention-post .text-teal-700 { color: #61dafb !important; }
.attention-post .text-teal-300 { color: #7fe3ff !important; }
.attention-post .bg-teal-50 { background-color: #0d2a33 !important; }
.attention-post .bg-teal-100 { background-color: #0f3440 !important; }
.attention-post .bg-teal-500 { background-color: #61dafb !important; }
.attention-post .bg-teal-900 { background-color: #0b2a33 !important; }
.attention-post .ring-teal-500 { --tw-ring-color: #61dafb !important; }
/* Tailwind escapes the ':' in class names as '\:' — a SINGLE backslash in a
   stylesheet. The previous '\\:' selectors matched a literal backslash in the
   class name and therefore never applied. */
.attention-post .hover\:bg-teal-50:hover { background-color: #0d2a33 !important; }
.attention-post .hover\:text-teal-600:hover { color: #61dafb !important; }
</style>
<!--
Application Structure Plan:
1. Introduction: Hook the user with the "Paradox of Reduction" (Why less dimension = more context).
2. The Arithmetic Engine: Interactive inputs to demonstrate the mathematical split (d_model -> d_k).
3. The Subspace Visualization: An interactive analogy showing how splitting dimensions allows for "Specialized" views (Syntax, Semantic, Position) vs one "Blurry" view.
4. Practical Lab (Winograd Schema): A text interaction where users toggle "Heads" to see how they resolve ambiguous pronouns differently.
5. Mathematical Deep Dive: Visualizing Q, K, V matrices and the dot product stability.
-->
<!--
Visualization & Content Choices:
1. Info -> Dimensionality Split -> Interactive Calculator -> Inputs change text/math -> Chart.js not needed here, pure DOM manipulation is clearer.
2. Comparison -> Generalist vs Specialist -> Radar Chart -> Chart.js -> Shows how 1 head averages skills, while 8 heads spike in specific skills.
3. Relationships -> Text Attention -> Interactive Heatmap-style Text -> JS Logic -> User clicks "Head 1", "Head 2" to see highlighting changes on a sentence.
NOTE: One small decorative inline SVG (the downward arrow icon in the vector visualization) is used. NO Mermaid JS used.
-->
</head>
<body class="antialiased attention-post">
<!-- Navigation / Header -->
<nav class="sticky top-0 z-50 bg-white/80 backdrop-blur-md border-b border-gray-200">
<div class="max-w-7xl mx-auto px-4 sm:px-6 lg:px-8">
<div class="flex justify-between h-16 items-center">
<div class="flex items-center gap-2">
<span class="text-2xl font-bold tracking-tight text-gray-900">Attention<span class="text-teal-600">Lab</span></span>
</div>
<div class="hidden md:flex space-x-8 text-sm font-medium text-gray-500">
<a href="#intro" class="hover:text-teal-600 transition">Concept</a>
<a href="#arithmetic" class="hover:text-teal-600 transition">Math</a>
<a href="#specialization" class="hover:text-teal-600 transition">Specialization</a>
<a href="#lab" class="hover:text-teal-600 transition">Interactive Lab</a>
</div>
</div>
</div>
</nav>
<!-- Hero Section -->
<section id="intro" class="py-20 px-4 sm:px-6 lg:px-8 max-w-5xl mx-auto text-center">
<h1 class="text-4xl sm:text-5xl font-extrabold text-gray-900 mb-6">
Why does <span class="text-teal-600">Splitting</span> Dimensions Create <br class="hidden sm:block" />Better Understanding?
</h1>
<p class="text-xl text-gray-600 max-w-3xl mx-auto mb-10 leading-relaxed">
In Transformer models like BERT or GPT, we often take a massive vector (e.g., 256 dimensions) and slice it into smaller chunks (e.g., 8 heads × 32 dimensions).
<br/><br/>
It seems counterintuitive: <strong>How can "dumbing down" the resolution from 256 to 32 help the model see more?</strong>
</p>
<div class="grid grid-cols-1 md:grid-cols-2 gap-6 text-left">
<div class="bg-white p-6 rounded-xl shadow-sm border border-gray-100">
<div class="flex items-center gap-3 mb-3">
<div class="w-8 h-8 rounded-full bg-gray-100 flex items-center justify-center font-bold text-gray-600">1</div>
<h3 class="font-bold text-gray-800">The "Blurry Generalist" Problem</h3>
</div>
<p class="text-gray-600 text-sm">A single large head tries to capture grammar, tone, references, and logic all in one dot product. The signals get muddled.</p>
</div>
<div class="bg-white p-6 rounded-xl shadow-sm border border-gray-100">
<div class="flex items-center gap-3 mb-3">
<div class="w-8 h-8 rounded-full bg-teal-100 flex items-center justify-center font-bold text-teal-600">8</div>
<h3 class="font-bold text-gray-800">The "Team of Specialists" Solution</h3>
</div>
<p class="text-gray-600 text-sm">Splitting dimensions allows each "Head" to project the input into a unique <strong>subspace</strong>. One looks for pronouns, another for prepositions.</p>
</div>
</div>
</section>
<!-- Section 2: The Arithmetic of the Split -->
<section id="arithmetic" class="bg-gray-50 py-16 border-y border-gray-200">
<div class="max-w-6xl mx-auto px-4 sm:px-6 lg:px-8">
<div class="mb-10 max-w-3xl">
<h2 class="text-3xl font-bold text-gray-900 mb-4">1. The Arithmetic of the Split</h2>
<p class="text-gray-600">
Let's look at the numbers. The total parameter count remains roughly the same because we divide the dimension size ($d_{model}$) by the number of heads ($h$).
Adjust the sliders to see how the per-head dimension ($d_k$) changes.
</p>
</div>
<div class="grid grid-cols-1 lg:grid-cols-3 gap-8">
<!-- Controls -->
<div class="bg-white p-6 rounded-xl shadow-sm border border-gray-200 h-fit">
<h3 class="font-semibold text-gray-900 mb-6 border-b pb-2">Configuration</h3>
<div class="mb-6">
<label class="block text-sm font-medium text-gray-700 mb-2">Embedding Dimension ($d_{model}$)</label>
<input type="range" id="dModelInput" min="64" max="1024" step="64" value="256" class="w-full h-2 bg-gray-200 rounded-lg appearance-none cursor-pointer accent-teal-600">
<div class="flex justify-between text-xs text-gray-500 mt-1">
<span>64</span>
<span id="dModelVal" class="font-bold text-teal-700">256</span>
<span>1024</span>
</div>
</div>
<div class="mb-6">
<label class="block text-sm font-medium text-gray-700 mb-2">Number of Heads ($h$)</label>
<input type="range" id="headsInput" min="1" max="16" step="1" value="8" class="w-full h-2 bg-gray-200 rounded-lg appearance-none cursor-pointer accent-teal-600">
<div class="flex justify-between text-xs text-gray-500 mt-1">
<span>1</span>
<span id="headsVal" class="font-bold text-teal-700">8</span>
<span>16</span>
</div>
</div>
<div class="bg-teal-50 p-4 rounded-lg border border-teal-100">
<p class="text-sm text-teal-800 font-medium text-center">
Dimension per Head ($d_k$): <br>
<span class="text-3xl font-bold mono-font" id="dkDisplay">32</span>
</p>
</div>
</div>
<!-- Visualizer -->
<div class="lg:col-span-2 bg-white p-8 rounded-xl shadow-sm border border-gray-200 flex flex-col justify-center items-center relative overflow-hidden">
<h3 class="absolute top-6 left-6 text-sm font-semibold text-gray-400 uppercase tracking-wider">Vector Visualization</h3>
<!-- Dynamic Visual Blocks -->
<div class="w-full max-w-lg mt-8">
<!-- Parent Vector -->
<div class="flex flex-col items-center mb-8">
<div class="text-xs text-gray-500 mb-1">Original Vector ($d_{model}$)</div>
<div id="parentVector" class="h-12 w-full bg-gray-800 rounded-md shadow-lg flex overflow-hidden transition-all duration-500">
<!-- JS will fill this -->
</div>
</div>
<!-- Arrow -->
<div class="flex justify-center mb-8 text-gray-300">
<svg class="w-8 h-8 animate-bounce" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M19 14l-7 7m0 0l-7-7m7 7V3"></path></svg>
</div>
<!-- Split Heads -->
<div class="text-xs text-gray-500 mb-1 text-center">Projected Heads ($h \times d_k$)</div>
<div id="headsContainer" class="grid grid-cols-4 gap-2 transition-all duration-500">
<!-- JS will fill this -->
</div>
</div>
<div class="mt-8 bg-yellow-50 p-4 rounded text-sm text-yellow-800 border border-yellow-200 max-w-lg">
<strong>The Mathematical Insight:</strong><br>
We aren't losing data. We are reshaping it. Instead of one long vector of size <span id="mathDModel">256</span>, we now have <span id="mathHeads">8</span> independent vectors of size <span id="mathDk">32</span>. Each head can now learn a different linear projection matrix ($W^Q_i, W^K_i, W^V_i$).
</div>
</div>
</div>
</div>
</section>
<!-- Section 3: The Subspace Theory (Chart.js) -->
<section id="specialization" class="py-16 max-w-7xl mx-auto px-4 sm:px-6 lg:px-8">
<div class="grid grid-cols-1 md:grid-cols-2 gap-12 items-center">
<div>
<h2 class="text-3xl font-bold text-gray-900 mb-4">2. The "Context Subspace" Theory</h2>
<p class="text-lg text-gray-600 mb-6">
Why does this help? Imagine a single attention head is a "General Manager". They try to oversee everything—grammar, facts, tone—but miss the details.
</p>
<p class="text-lg text-gray-600 mb-6">
Multi-head attention creates a "Team of Specialists". By projecting the 256 dimensions down to 32, each head is forced to focus on a specific <strong>subspace</strong> of the language features.
</p>
<div class="space-y-4">
<button onclick="updateRadar('single')" class="w-full text-left p-4 rounded-lg border border-gray-200 hover:border-teal-500 hover:bg-teal-50 transition group flex items-center justify-between">
<div>
<span class="font-bold text-gray-800 block">Single Head (256 dims)</span>
<span class="text-sm text-gray-500">Tries to do it all, results in averaged, blurry attention.</span>
</div>
<div class="w-4 h-4 rounded-full border border-gray-300 group-hover:bg-teal-500"></div>
</button>
<button onclick="updateRadar('multi')" class="w-full text-left p-4 rounded-lg border border-gray-200 hover:border-teal-500 hover:bg-teal-50 transition group flex items-center justify-between ring-2 ring-teal-500 bg-teal-50">
<div>
<span class="font-bold text-gray-800 block">Multi-Head (8 x 32 dims)</span>
<span class="text-sm text-gray-500">Each head specializes in one linguistic feature.</span>
</div>
<div class="w-4 h-4 rounded-full bg-teal-500 border border-teal-500"></div>
</button>
</div>
</div>
<div class="bg-white p-4 rounded-xl shadow-lg border border-gray-100">
<div class="text-center mb-2 font-bold text-gray-700">Head Competency Map</div>
<div class="chart-wrapper mx-auto">
<canvas id="specializationChart"></canvas>
</div>
<p class="text-xs text-center text-gray-400 mt-2">
*Radial axis represents attention strength/clarity on a specific feature.
</p>
</div>
</div>
</section>
<!-- Section 4: Interactive Lab (The Core Example) -->
<section id="lab" class="bg-gray-900 text-white py-20">
<div class="max-w-6xl mx-auto px-4 sm:px-6 lg:px-8">
<div class="text-center mb-12">
<span class="bg-teal-900 text-teal-300 text-xs font-bold px-3 py-1 rounded-full uppercase tracking-wider">Interactive Experiment</span>
<h2 class="text-3xl md:text-4xl font-bold mt-4 mb-4">Resolving Ambiguity</h2>
<p class="text-gray-400 max-w-2xl mx-auto">
The classic "Winograd Schema" challenge. Consider the sentence below. The word <strong>"it"</strong> is ambiguous.
Depending on the context ("tired" vs "wide"), "it" refers to a different noun.
<br>Click the <span class="text-teal-400">Heads</span> below to see how they attend differently.
</p>
</div>
<!-- The Interface -->
<div class="grid grid-cols-1 lg:grid-cols-12 gap-8">
<!-- Head Selector Panel -->
<div class="lg:col-span-4 bg-gray-800 rounded-xl p-6 border border-gray-700">
<h3 class="text-sm font-semibold text-gray-400 uppercase tracking-wider mb-4">Select Attention Head</h3>
<div class="grid grid-cols-2 gap-3">
<button onclick="setHead(0)" class="head-btn p-3 rounded bg-gray-700 hover:bg-gray-600 border border-transparent focus:ring-2 focus:ring-teal-500 transition text-left active-head" data-head="0">
<div class="text-xs text-gray-400">Head 1</div>
<div class="font-bold text-teal-300">Syntax/Grammar</div>
</button>
<button onclick="setHead(1)" class="head-btn p-3 rounded bg-gray-700 hover:bg-gray-600 border border-transparent focus:ring-2 focus:ring-teal-500 transition text-left" data-head="1">
<div class="text-xs text-gray-400">Head 2</div>
<div class="font-bold text-yellow-300">Coreference (Tired)</div>
</button>
<button onclick="setHead(2)" class="head-btn p-3 rounded bg-gray-700 hover:bg-gray-600 border border-transparent focus:ring-2 focus:ring-teal-500 transition text-left" data-head="2">
<div class="text-xs text-gray-400">Head 3</div>
<div class="font-bold text-purple-300">Coreference (Wide)</div>
</button>
<button onclick="setHead(3)" class="head-btn p-3 rounded bg-gray-700 hover:bg-gray-600 border border-transparent focus:ring-2 focus:ring-teal-500 transition text-left" data-head="3">
<div class="text-xs text-gray-400">Head 4</div>
<div class="font-bold text-pink-300">Punctuation</div>
</button>
</div>
<div class="mt-8 p-4 bg-gray-900/50 rounded border border-gray-700">
<h4 class="font-bold text-white mb-2" id="explanationTitle">Syntax/Grammar Head</h4>
<p class="text-sm text-gray-400" id="explanationText">
This head focuses on the immediate syntactic structure. It likely links "it" to the nearest preceding noun or verb, purely based on sentence position, without deep semantic understanding.
</p>
</div>
</div>
<!-- Visualization Panel -->
<div class="lg:col-span-8 bg-white rounded-xl p-8 text-gray-900 shadow-2xl relative">
<div class="absolute top-4 right-4 flex space-x-2">
<button onclick="changeSentence('tired')" class="px-3 py-1 text-xs font-bold rounded bg-gray-200 hover:bg-gray-300 transition" id="btn-tired">Version A: Tired</button>
<button onclick="changeSentence('wide')" class="px-3 py-1 text-xs font-bold rounded bg-white border border-gray-200 hover:bg-gray-100 transition text-gray-500" id="btn-wide">Version B: Wide</button>
</div>
<div class="mt-12 mb-8">
<div class="flex flex-wrap gap-2 text-xl md:text-2xl font-serif leading-relaxed" id="sentenceContainer">
<!-- JS Injects Spans Here -->
</div>
</div>
<div class="h-1 w-full bg-gray-100 rounded overflow-hidden">
<div id="attentionBar" class="h-full bg-teal-500 transition-all duration-300" style="width: 0%"></div>
</div>
<!-- Matrix View (Simplified Heatmap) -->
<div class="mt-8">
<h4 class="text-xs font-bold text-gray-400 uppercase mb-2">Attention Weights (From word "it")</h4>
<div class="flex gap-1 h-16 items-end" id="histogramContainer">
<!-- JS Injects Bars Here -->
</div>
</div>
</div>
</div>
</div>
</section>
<!-- Section 5: Math Context -->
<section class="py-16 bg-white border-t border-gray-200">
<div class="max-w-4xl mx-auto px-4 sm:px-6 lg:px-8 text-center">
<h2 class="text-2xl font-bold text-gray-900 mb-6">The Mathematical Mechanism</h2>
<div class="bg-gray-50 p-6 rounded-xl border border-gray-200 font-mono text-sm md:text-base overflow-x-auto">
<p class="mb-4 text-gray-500">// The Scaled Dot-Product Attention</p>
<div class="flex justify-center items-center gap-4">
<span>Attention(Q, K, V) = softmax</span>
<div class="flex flex-col items-center px-2 border-l border-r border-gray-400">
<span class="border-b border-gray-400 w-full mb-1 pb-1">Q K<sup>T</sup></span>
<span>√d<sub>k</sub></span>
</div>
<span>V</span>
</div>
</div>
<p class="mt-6 text-gray-600 leading-relaxed text-left">
<strong>Why divide by $\sqrt{d_k}$?</strong> When dimensions ($d_k$) are large (e.g., 256), the dot products can grow huge. This pushes the Softmax function into regions where gradients are extremely small (vanishing gradients). By splitting 256 into 8 heads of 32, we keep $d_k$ small (32). The dot products stay in a manageable range, making training more stable and allowing the model to distinguish subtle differences in meaning.
</p>
</div>
</section>
<!-- Footer -->
<footer class="bg-gray-100 py-12 border-t border-gray-200">
<div class="max-w-7xl mx-auto px-4 text-center text-gray-500 text-sm">
<p>© 2025 Multi-Head Attention Explanation. Built for clarity.</p>
</div>
</footer>
<!-- LOGIC SCRIPT -->
<script>
// --- 1. Arithmetic Logic ---
// Cached DOM references for the d_model / heads slider widget and the text
// elements that echo the computed values. Looked up once at script load.
const dModelInput = document.getElementById('dModelInput'); // slider: embedding dimension d_model
const headsInput = document.getElementById('headsInput'); // slider: number of heads h
const dModelVal = document.getElementById('dModelVal'); // label under the d_model slider
const headsVal = document.getElementById('headsVal'); // label under the heads slider
const dkDisplay = document.getElementById('dkDisplay'); // large d_k readout in the config card
const mathDModel = document.getElementById('mathDModel'); // inline d_model in the insight box
const mathHeads = document.getElementById('mathHeads'); // inline h in the insight box
const mathDk = document.getElementById('mathDk'); // inline d_k in the insight box
const parentVector = document.getElementById('parentVector'); // "original vector" visual bar
const headsContainer = document.getElementById('headsContainer'); // grid of per-head blocks
// Read both sliders, derive the per-head dimension d_k = floor(d_model / h),
// push the numbers into every element that displays them, and redraw the
// per-head visualization grid.
function updateMath() {
    const modelDim = parseInt(dModelInput.value);
    const headCount = parseInt(headsInput.value);
    const perHeadDim = Math.floor(modelDim / headCount);
    // Each display element paired with the value it mirrors.
    const bindings = [
        [dModelVal, modelDim],
        [headsVal, headCount],
        [dkDisplay, perHeadDim],
        [mathDModel, modelDim],
        [mathHeads, headCount],
        [mathDk, perHeadDim]
    ];
    for (const [el, value] of bindings) {
        el.innerText = value;
    }
    renderVectorVisuals(headCount);
}
// Rebuild the "original vector" bar and the grid of per-head blocks.
// `count` is the current number of heads (h). The visuals are purely
// decorative: the random line widths inside each head block suggest
// "different learned weights" per head rather than encoding real data.
function renderVectorVisuals(count) {
// Parent: Just a visual block
parentVector.innerHTML = '';
// We'll use gradients to simulate "complexity"
parentVector.style.background = `linear-gradient(90deg, #1f2937 0%, #4b5563 100%)`;
// Heads
headsContainer.innerHTML = '';
// Cap the grid at 8 columns so head counts of 9-16 wrap onto a second row.
headsContainer.style.gridTemplateColumns = `repeat(${Math.min(count, 8)}, minmax(0, 1fr))`;
// Palette cycles via `i % colors.length`, so the extra repeats are harmless.
const colors = ['#2a9d8f', '#e9c46a', '#f4a261', '#e76f51', '#264653', '#2a9d8f', '#e9c46a', '#f4a261', '#e76f51', '#264653', '#2a9d8f', '#e9c46a', '#f4a261', '#e76f51', '#264653'];
for(let i=0; i<count; i++) {
const head = document.createElement('div');
head.className = 'h-16 rounded shadow-sm flex flex-col items-center justify-center text-white text-[10px] font-bold transition-all hover:scale-105';
head.style.backgroundColor = colors[i % colors.length];
head.innerText = `H${i+1}`;
// Add "subspace" lines visual
const lines = document.createElement('div');
lines.className = 'w-full px-1 flex flex-col gap-[2px] mt-1 opacity-50';
for(let j=0; j<3; j++) {
const line = document.createElement('div');
line.className = 'h-[2px] bg-white rounded-full w-full';
// Random 50-100% width: decorative only, so non-determinism is fine here.
line.style.width = Math.random() * 50 + 50 + '%';
lines.appendChild(line);
}
head.appendChild(lines);
headsContainer.appendChild(head);
}
}
// Recompute all derived displays whenever either slider moves.
dModelInput.addEventListener('input', updateMath);
headsInput.addEventListener('input', updateMath);
// Init: derive every display (d_k readout, insight text, head grid) from the
// sliders' actual values rather than hard-coding renderVectorVisuals(8) —
// this keeps the page consistent if the HTML default slider values change.
updateMath();
// --- 2. Chart.js Radar Logic ---
// Radar chart comparing a single "generalist" head (flat 50s across all
// features) against specialized multi-head scores (high variance).
// Values are illustrative, not measured from a real model.
const ctx = document.getElementById('specializationChart').getContext('2d');
const radarData = {
labels: ['Grammar', 'Coreference', 'Prepositions', 'Subject-Verb', 'Sentiment', 'Long-Range'],
datasets: [
{
label: 'Single Head (Average)',
data: [50, 50, 50, 50, 50, 50],
fill: true,
backgroundColor: 'rgba(107, 114, 128, 0.2)',
borderColor: 'rgba(107, 114, 128, 1)',
pointBackgroundColor: 'rgba(107, 114, 128, 1)',
pointBorderColor: '#fff',
pointHoverBackgroundColor: '#fff',
pointHoverBorderColor: 'rgba(107, 114, 128, 1)'
},
{
label: 'Multi-Head (Specialized)',
data: [90, 85, 20, 95, 30, 80], // High variance represents specialization
fill: true,
backgroundColor: 'rgba(42, 157, 143, 0.2)',
borderColor: 'rgba(42, 157, 143, 1)',
pointBackgroundColor: 'rgba(42, 157, 143, 1)',
pointBorderColor: '#fff',
pointHoverBackgroundColor: '#fff',
pointHoverBorderColor: 'rgba(42, 157, 143, 1)'
}
]
};
const config = {
type: 'radar',
data: radarData,
options: {
responsive: true,
// Let .chart-wrapper control the canvas height instead of the aspect ratio.
maintainAspectRatio: false,
elements: {
line: { borderWidth: 3 }
},
scales: {
r: {
angleLines: { display: true },
suggestedMin: 0,
suggestedMax: 100,
ticks: { display: false } // Hide numbers for cleaner look
}
},
plugins: {
legend: { display: false } // We use custom DOM buttons for legend/control
}
}
};
// Chart instance; updateRadar() toggles dataset visibility on it.
const myRadarChart = new Chart(ctx, config);
// Toggle between the "single head" and "multi-head" radar views.
// `mode` is 'single' or anything else (treated as 'multi'). Highlights the
// matching selector button and shows exactly one of the two datasets.
function updateRadar(mode) {
    const highlight = ['ring-2', 'ring-teal-500', 'bg-teal-50'];
    const buttons = document.querySelectorAll('#specialization button');
    buttons.forEach(btn => btn.classList.remove(...highlight));
    const showSingle = (mode === 'single');
    // Button 0 corresponds to the single-head view, button 1 to multi-head.
    buttons[showSingle ? 0 : 1].classList.add(...highlight);
    myRadarChart.data.datasets[0].hidden = !showSingle;
    myRadarChart.data.datasets[1].hidden = showSingle;
    myRadarChart.update();
}
// Start in the multi-head view, matching the pre-highlighted button markup.
updateRadar('multi');
// --- 3. Interactive Lab Logic (Winograd Schema) ---
// Data Structure for the sentence
// Two variants of the classic Winograd sentence; `targetIndex` marks the
// ambiguous pronoun "it" within `tokens`.
const sentences = {
tired: {
text: "The animal didn't cross the street because it was too tired.",
tokens: ["The", "animal", "didn't", "cross", "the", "street", "because", "it", "was", "too", "tired."],
targetIndex: 7 // "it"
},
wide: {
text: "The animal didn't cross the street because it was too wide.",
tokens: ["The", "animal", "didn't", "cross", "the", "street", "because", "it", "was", "too", "wide."],
targetIndex: 7 // "it"
}
};
// Attention weights for "it" (targetIndex) across different heads
// Values correspond to indices in the tokens array
// Keyed as attentionMaps[sentenceKey][headIndex]; each row is one weight per
// token (hand-crafted for illustration, not produced by a real model).
const attentionMaps = {
tired: {
0: [0.05, 0.1, 0.1, 0.1, 0.05, 0.1, 0.1, 0.0, 0.2, 0.1, 0.1], // Syntax (looks at neighbors 'was')
1: [0.0, 0.8, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2], // Coreference (looks at 'animal')
2: [0.0, 0.1, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.0, 0.0, 0.8], // Wrong Semantic (looks at 'tired')
3: [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.0, 0.1, 0.1, 0.1] // Diffuse/Punctuation
},
wide: {
0: [0.05, 0.1, 0.1, 0.1, 0.05, 0.1, 0.1, 0.0, 0.2, 0.1, 0.1], // Syntax
1: [0.0, 0.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.8], // Coreference (Failed/Confused)
2: [0.0, 0.1, 0.0, 0.0, 0.0, 0.8, 0.0, 0.0, 0.0, 0.0, 0.1], // Semantic (looks at 'street' because streets are wide)
3: [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.0, 0.1, 0.1, 0.1] // Diffuse
}
};
// Explanation paragraphs shown in the side panel; indexed the same way as
// attentionMaps: headExplanations[sentenceKey][headIndex].
const headExplanations = {
tired: [
"Focuses on immediate grammatical neighbors ('was', 'because'). It establishes local sentence structure.",
"High attention on 'animal'. This head has learned that entities which get 'tired' are usually animate subjects.",
"Low relevance here. This head specializes in physical properties of objects.",
"Broad, unfocused attention. Often used as a 'fallback' or for punctuation."
],
wide: [
"Focuses on immediate grammatical neighbors ('was', 'because'). Structure remains constant.",
"Low confidence. The property 'wide' doesn't strongly correlate with 'animal' for this head.",
"High attention on 'street'. This head connects physical descriptors ('wide') to physical objects ('street').",
"Broad, unfocused attention."
]
};
// Mutable lab state: which sentence variant and which head is selected.
let currentSentenceKey = 'tired';
let currentHeadIndex = 1; // Default to the interesting one
// DOM references for the lab's sentence display, histogram, and explanation.
const container = document.getElementById('sentenceContainer');
const histoContainer = document.getElementById('histogramContainer');
const explainTitle = document.getElementById('explanationTitle');
const explainText = document.getElementById('explanationText');
// Re-render the lab's sentence tokens and attention histogram from the
// current state (currentSentenceKey, currentHeadIndex). Each token gets a
// yellow background scaled by its attention weight; the target pronoun "it"
// is underlined instead of shaded. One histogram bar is built per token.
function renderSentence() {
container.innerHTML = '';
histoContainer.innerHTML = '';
const data = sentences[currentSentenceKey];
const weights = attentionMaps[currentSentenceKey][currentHeadIndex];
// Render Tokens
data.tokens.forEach((token, index) => {
const span = document.createElement('span');
span.innerText = token;
span.className = 'px-1 rounded cursor-default transition-colors duration-300';
// Highlight "it" specially
if(index === data.targetIndex) {
span.classList.add('border-b-2', 'border-teal-500', 'font-bold');
}
// Apply background color based on attention weight
// Opacity is mapped 0-1
if(index !== data.targetIndex) {
const weight = weights[index];
// Skip near-zero weights so unattended tokens stay unshaded.
if (weight > 0.05) {
// Use Golden-Yellow for attention highlight
// The 1.2 multiplier boosts visibility; CSS clamps alpha values > 1.
span.style.backgroundColor = `rgba(233, 196, 106, ${weight * 1.2})`; // accent-2
}
}
container.appendChild(span);
// Render Histogram Bar
const barWrapper = document.createElement('div');
barWrapper.className = 'flex flex-col items-center flex-1 h-full justify-end group relative';
const bar = document.createElement('div');
bar.className = 'w-full bg-teal-500 rounded-t transition-all duration-300';
bar.style.height = `${weights[index] * 100}%`;
// Tooltip for bar
const tip = document.createElement('div');
tip.className = 'absolute -top-8 bg-black text-white text-[10px] px-2 py-1 rounded opacity-0 group-hover:opacity-100 transition';
tip.innerText = `${(weights[index]*100).toFixed(0)}%`;
barWrapper.appendChild(tip);
barWrapper.appendChild(bar);
// Label under bar
const label = document.createElement('div');
label.innerText = token;
label.className = 'text-[8px] mt-1 truncate w-full text-center text-gray-400 hidden sm:block';
barWrapper.appendChild(label);
histoContainer.appendChild(barWrapper);
});
}
// Refresh the explanation panel for the currently selected head: title text,
// title color (matching the head button's accent color), and body text.
function updateExplanation() {
    // Index-aligned with the four head buttons in the selector panel.
    const headColors = ['text-teal-300', 'text-yellow-300', 'text-purple-300', 'text-pink-300'];
    const headTitles = ['Syntax/Grammar', 'Coreference (Animate)', 'Coreference (Object)', 'Punctuation/Diffuse'];
    explainTitle.innerText = headTitles[currentHeadIndex];
    explainTitle.className = 'font-bold mb-2 ' + headColors[currentHeadIndex];
    explainText.innerText = headExplanations[currentSentenceKey][currentHeadIndex];
}
// Select attention head `index` (0-3): update state, move the highlight ring
// to the matching button, and re-render the sentence and explanation.
function setHead(index) {
    currentHeadIndex = index;
    const activeClasses = ['bg-gray-600', 'ring-2', 'ring-teal-500'];
    document.querySelectorAll('.head-btn').forEach(btn => {
        // Reset every button to the inactive look first.
        btn.classList.remove(...activeClasses);
        btn.classList.add('bg-gray-700');
        if (parseInt(btn.dataset.head) === index) {
            // Then promote the selected one to the active look.
            btn.classList.remove('bg-gray-700');
            btn.classList.add(...activeClasses);
        }
    });
    renderSentence();
    updateExplanation();
}
// Switch the lab sentence variant ('tired' | 'wide'), restyle the two toggle
// buttons so the chosen one looks active, and re-render the visualization.
// The Tailwind class strings were previously duplicated verbatim across both
// branches; they are hoisted to named constants so the active/inactive looks
// are defined exactly once.
function changeSentence(type) {
    currentSentenceKey = type;
    const ACTIVE_CLASSES = 'px-3 py-1 text-xs font-bold rounded bg-gray-200 hover:bg-gray-300 transition text-gray-900';
    const INACTIVE_CLASSES = 'px-3 py-1 text-xs font-bold rounded bg-white border border-gray-200 hover:bg-gray-100 transition text-gray-500';
    const btnTired = document.getElementById('btn-tired');
    const btnWide = document.getElementById('btn-wide');
    const tiredSelected = (type === 'tired');
    btnTired.className = tiredSelected ? ACTIVE_CLASSES : INACTIVE_CLASSES;
    btnWide.className = tiredSelected ? INACTIVE_CLASSES : ACTIVE_CLASSES;
    renderSentence();
    updateExplanation();
}
// Init Lab
setHead(1); // Start with Head 2 (Coreference) as it's the most dramatic
</script>
</body>
</html>