diff --git a/Cargo.lock b/Cargo.lock index 818ab09..dd9f64b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4,7 +4,7 @@ version = 4 [[package]] name = "aap" -version = "0.9.2" +version = "0.10.0" dependencies = [ "anyhow", "criterion", diff --git a/evals/data/aap-spec-init.md b/evals/data/aap-spec-init.md new file mode 100644 index 0000000..6d8d770 --- /dev/null +++ b/evals/data/aap-spec-init.md @@ -0,0 +1,23 @@ +## AAP Target Markers + +Wrap each major block and individually-updatable value with target markers: + +``` +content +``` + +Targets nest — coarse blocks contain fine-grained value targets: + +```html + +
+

Revenue

+ $12,340 +
+
+``` + +Target IDs describe the role, not the current value (e.g., "total-revenue" not "12345"). +Place targets where values are most likely to be revised. + +IMPORTANT: You MUST wrap every major section and individually-updatable value in your output with `` markers. Use descriptive, role-based IDs (e.g., "nav", "stats-card", "total-revenue"). Nest targets: coarse section targets should contain fine-grained value targets. Place markers on ALL values that are likely to be revised later. The markers are essential for efficient future edits. diff --git a/evals/data/aap-spec-maintain.md b/evals/data/aap-spec-maintain.md new file mode 100644 index 0000000..0adac9e --- /dev/null +++ b/evals/data/aap-spec-maintain.md @@ -0,0 +1,43 @@ +## AAP Target Markers + +Wrap each major block and individually-updatable value with target markers: + +``` +content +``` + +Targets nest — coarse blocks contain fine-grained value targets: + +```html + +
+

Revenue

+ $12,340 +
+
+``` + +Target IDs describe the role, not the current value (e.g., "total-revenue" not "12345"). +Place targets where values are most likely to be revised. + +## AAP Edit Envelope + +To edit an artifact, produce a JSON envelope with `name: "edit"`: + +```json +{ + "protocol": "aap/0.1", + "id": "artifact-id", + "version": 2, + "name": "edit", + "meta": {"format": "text/html"}, + "content": [ + {"op": "replace", "target": {"type": "id", "value": "revenue-value"}, "content": "$15,720"} + ] +} +``` + +Target by ID only: `{"type": "id", "value": "target-id"}`. Reference existing target IDs from the artifact. +Ops: `replace`, `delete`, `insert_before`, `insert_after`. + +IMPORTANT: You MUST respond with a JSON edit envelope, NOT the full artifact. Reference existing `` IDs from the current artifact. Use `replace` to update content within a target, `delete` to remove a target and its markers, `insert_before`/`insert_after` to add content adjacent to a target. Always increment the version number. diff --git a/evals/data/aap-spec.md b/evals/data/aap-spec.md index 1ee751c..6b23209 100644 --- a/evals/data/aap-spec.md +++ b/evals/data/aap-spec.md @@ -30,7 +30,7 @@ To edit an artifact, produce a JSON envelope with `name: "edit"`: "id": "artifact-id", "version": 2, "name": "edit", - "operation": {"direction": "input", "format": "text/html"}, + "meta": {"format": "text/html"}, "content": [ {"op": "replace", "target": {"type": "id", "value": "revenue-value"}, "content": "$15,720"} ] diff --git a/evals/data/experiments/001-html-dashboard-ecommerce/eval.json b/evals/data/experiments/001-html-dashboard-ecommerce/eval.json deleted file mode 100644 index 057835f..0000000 --- a/evals/data/experiments/001-html-dashboard-ecommerce/eval.json +++ /dev/null @@ -1,71 +0,0 @@ -{ - "per_turn": [ - { - "turn": 0, - "sequence_similarity": 0.5285, - "token_f1": 0.609, - "base_char_count": 4451, - "aap_char_count": 2905, - "char_delta_pct": -34.7, - "lines_added": 70, - "lines_removed": 72, - "rouge_l": null, - "bleu": null - }, - { - "turn": 1, - "sequence_similarity": 0.0112, - "token_f1": 0.0331, - "base_char_count": 4557, - "aap_char_count": 284, - "char_delta_pct": -93.8, - "lines_added": 13, - "lines_removed": 81, - "rouge_l": null, - "bleu": null - }, - { - "turn": 2, - "sequence_similarity": 0.0102, - "token_f1": 0.031, - "base_char_count": 4994, - "aap_char_count": 284, - "char_delta_pct": -94.3, - "lines_added": 13, - "lines_removed": 86, - "rouge_l": null, - "bleu": null - }, - { - "turn": 3, - "sequence_similarity": 0.0104, - "token_f1": 0.0319, - "base_char_count": 4896, - "aap_char_count": 284, - "char_delta_pct": -94.2, - "lines_added": 13, - "lines_removed": 84, - "rouge_l": null, - "bleu": null - }, - { - "turn": 4, - "sequence_similarity": 0.0568, - "token_f1": 0.0501, - "base_char_count": 5714, - "aap_char_count": 730, - "char_delta_pct": -87.2, - "lines_added": 24, - "lines_removed": 96, - "rouge_l": null, - "bleu": null - } - ], - "mean_sequence_similarity": 0.1234, - "mean_token_f1": 0.151, - "mean_rouge_l": null, - "mean_bleu": null, - "judge_comparisons": null, - "mean_base_judge": null, - "mean_aap_judge": null -} diff --git a/evals/data/experiments/001-html-dashboard-ecommerce/metrics.json b/evals/data/experiments/001-html-dashboard-ecommerce/metrics.json index 98f6e5f..951b49e 100644 --- a/evals/data/experiments/001-html-dashboard-ecommerce/metrics.json +++ b/evals/data/experiments/001-html-dashboard-ecommerce/metrics.json @@ -1,270 +1,330 @@ { "experiment_id": "001-html-dashboard-ecommerce", - "model": "", + "model": "gemini-2.5-flash", "provider": "google", - "timestamp": "2026-04-03T06:38:23.011980+00:00", + "timestamp": "2026-04-03T08:01:49.629651+00:00", "format": "text/html", "base_turn0": { "input_tokens": 163, - "output_tokens": 1422, - "latency_ms": 6364, - "artifact_bytes": 4459 + "output_tokens": 18959, + "latency_ms": 73150, + "artifact_bytes": 58292, + "ttft_ms": 0, + "ttlt_ms": 45727, + "median_itl_ms": 171.32 }, "aap_turn0": { "input_tokens": 502, - "output_tokens": 1101, - "latency_ms": 5351, - "artifact_bytes": 3246 + "output_tokens": 15801, + "latency_ms": 63447, + "artifact_bytes": 69877, + "ttft_ms": 0, + "ttlt_ms": 57564, + "median_itl_ms": 185.27 }, "default_flow": { "per_turn": [ { "turn": 1, "edit": "Update the Total Revenue stat card to show $215,430 with a +12.3% trend indicato", - "input_tokens": 1613, - "output_tokens": 1446, - "latency_ms": 5741, - "output_bytes": 4563, + "input_tokens": 13035, + "output_tokens": 12924, + "latency_ms": 49935, + "output_bytes": 58562, + "ttft_ms": 0, + "ttlt_ms": 47510, + "median_itl_ms": 175.07, "failed": false, "failure_reason": "" }, { "turn": 2, "edit": "Add 15 new rows to the orders table with recent order data from March 2026", - "input_tokens": 3082, - "output_tokens": 1573, - "latency_ms": 5813, - "output_bytes": 5000, + "input_tokens": 25993, + "output_tokens": 14225, + "latency_ms": 56379, + "output_bytes": 64170, + "ttft_ms": 0, + "ttlt_ms": 50897, + "median_itl_ms": 169.79, "failed": false, "failure_reason": "" }, { "turn": 3, "edit": "Change the primary accent color from blue to purple (#8b5cf6) across all element", - "input_tokens": 4676, - "output_tokens": 1546, - "latency_ms": 6109, - "output_bytes": 4902, + "input_tokens": 40250, + "output_tokens": 14420, + "latency_ms": 54052, + "output_bytes": 64344, + "ttft_ms": 0, + "ttlt_ms": 50108, + "median_itl_ms": 166.85, "failed": false, "failure_reason": "" }, { "turn": 4, "edit": "Add a new 'Recent Activity' section after the stats cards showing the last 10 us", - "input_tokens": 6244, - "output_tokens": 1761, - "latency_ms": 7385, - "output_bytes": 5720, + "input_tokens": 54633, + "output_tokens": 15339, + "latency_ms": 59016, + "output_bytes": 68007, + "ttft_ms": 0, + "ttlt_ms": 52678, + "median_itl_ms": 164.67, "failed": false, "failure_reason": "" } ], - "total_input_tokens": 15615, - "total_output_tokens": 6326, - "total_latency_ms": 25048 + "total_input_tokens": 133911, + "total_output_tokens": 56908, + "total_latency_ms": 219382 }, "aap_flow": { "per_turn": [ { "turn": 1, "edit": "Update the Total Revenue stat card to show $215,430 with a +12.3% trend indicato", - "input_tokens": 2235, - "output_tokens": 166, - "latency_ms": 1571, - "output_bytes": 284, + "input_tokens": 16264, + "output_tokens": 283, + "latency_ms": 3045, + "output_bytes": 69875, + "ttft_ms": 0, + "ttlt_ms": 1020, + "median_itl_ms": 1020.73, "failed": false, "failure_reason": "", "envelope_parsed": true, "apply_succeeded": true, - "envelope_name": "synthesize" + "envelope_name": "edit" }, { "turn": 2, "edit": "Add 15 new rows to the orders table with recent order data from March 2026", - "input_tokens": 0, - "output_tokens": 0, - "latency_ms": 3169, - "output_bytes": 284, - "failed": true, - "failure_reason": "parse or apply failed", + "input_tokens": 16257, + "output_tokens": 1701, + "latency_ms": 8285, + "output_bytes": 76528, + "ttft_ms": 0, + "ttlt_ms": 1156, + "median_itl_ms": 1156.62, + "failed": false, + "failure_reason": "", "envelope_parsed": true, - "apply_succeeded": false, + "apply_succeeded": true, "envelope_name": "edit" }, { "turn": 3, "edit": "Change the primary accent color from blue to purple (#8b5cf6) across all element", - "input_tokens": 0, - "output_tokens": 0, - "latency_ms": 1848, - "output_bytes": 284, - "failed": true, - "failure_reason": "parse or apply failed", + "input_tokens": 17590, + "output_tokens": 39105, + "latency_ms": 152522, + "output_bytes": 76530, + "ttft_ms": 0, + "ttlt_ms": 2332, + "median_itl_ms": 2332.63, + "failed": false, + "failure_reason": "", "envelope_parsed": true, - "apply_succeeded": false, - "envelope_name": "edit" + "apply_succeeded": true, + "envelope_name": "synthesize" }, { "turn": 4, "edit": "Add a new 'Recent Activity' section after the stats cards showing the last 10 us", - "input_tokens": 1249, - "output_tokens": 290, - "latency_ms": 1992, - "output_bytes": 868, + "input_tokens": 17590, + "output_tokens": 3795, + "latency_ms": 16306, + "output_bytes": 82706, + "ttft_ms": 0, + "ttlt_ms": 699, + "median_itl_ms": 699.53, "failed": false, "failure_reason": "", "envelope_parsed": true, "apply_succeeded": true, - "envelope_name": "synthesize" + "envelope_name": "edit" } ], - "total_input_tokens": 3484, - "total_output_tokens": 456, - "total_latency_ms": 8580, + "total_input_tokens": 67701, + "total_output_tokens": 44884, + "total_latency_ms": 180158, "envelope_parse_rate": 1.0, - "apply_success_rate": 0.5 + "apply_success_rate": 1.0 }, "comparison": { - "output_token_savings_pct": 92.8, - "input_token_savings_pct": 77.7, - "latency_savings_pct": 65.7 + "output_token_savings_pct": 21.1, + "input_token_savings_pct": 49.4, + "latency_savings_pct": 17.9 }, "token_table": { "turns": [ { "turn": 0, "base_input": 163, - "base_output": 1422, - "base_latency_ms": 6364, + "base_output": 18959, + "base_latency_ms": 73150, + "base_ttft_ms": 0, + "base_ttlt_ms": 45727, + "base_median_itl_ms": 171.32, "aap_input": 502, - "aap_output": 1101, - "aap_latency_ms": 5351 + "aap_output": 15801, + "aap_latency_ms": 63447, + "aap_ttft_ms": 0, + "aap_ttlt_ms": 57564, + "aap_median_itl_ms": 185.27 }, { "turn": 1, - "base_input": 1613, - "base_output": 1446, - "base_latency_ms": 5741, - "aap_input": 2235, - "aap_output": 166, - "aap_latency_ms": 1571, - "envelope_name": "synthesize", + "base_input": 13035, + "base_output": 12924, + "base_latency_ms": 49935, + "base_ttft_ms": 0, + "base_ttlt_ms": 47510, + "base_median_itl_ms": 175.07, + "aap_input": 16264, + "aap_output": 283, + "aap_latency_ms": 3045, + "aap_ttft_ms": 0, + "aap_ttlt_ms": 1020, + "aap_median_itl_ms": 1020.73, + "envelope_name": "edit", "apply_ok": true }, { "turn": 2, - "base_input": 3082, - "base_output": 1573, - "base_latency_ms": 5813, - "aap_input": 0, - "aap_output": 0, - "aap_latency_ms": 3169, + "base_input": 25993, + "base_output": 14225, + "base_latency_ms": 56379, + "base_ttft_ms": 0, + "base_ttlt_ms": 50897, + "base_median_itl_ms": 169.79, + "aap_input": 16257, + "aap_output": 1701, + "aap_latency_ms": 8285, + "aap_ttft_ms": 0, + "aap_ttlt_ms": 1156, + "aap_median_itl_ms": 1156.62, "envelope_name": "edit", - "apply_ok": false + "apply_ok": true }, { "turn": 3, - "base_input": 4676, - "base_output": 1546, - "base_latency_ms": 6109, - "aap_input": 0, - "aap_output": 0, - "aap_latency_ms": 1848, - "envelope_name": "edit", - "apply_ok": false + "base_input": 40250, + "base_output": 14420, + "base_latency_ms": 54052, + "base_ttft_ms": 0, + "base_ttlt_ms": 50108, + "base_median_itl_ms": 166.85, + "aap_input": 17590, + "aap_output": 39105, + "aap_latency_ms": 152522, + "aap_ttft_ms": 0, + "aap_ttlt_ms": 2332, + "aap_median_itl_ms": 2332.63, + "envelope_name": "synthesize", + "apply_ok": true }, { "turn": 4, - "base_input": 6244, - "base_output": 1761, - "base_latency_ms": 7385, - "aap_input": 1249, - "aap_output": 290, - "aap_latency_ms": 1992, - "envelope_name": "synthesize", + "base_input": 54633, + "base_output": 15339, + "base_latency_ms": 59016, + "base_ttft_ms": 0, + "base_ttlt_ms": 52678, + "base_median_itl_ms": 164.67, + "aap_input": 17590, + "aap_output": 3795, + "aap_latency_ms": 16306, + "aap_ttft_ms": 0, + "aap_ttlt_ms": 699, + "aap_median_itl_ms": 699.53, + "envelope_name": "edit", "apply_ok": true } ], "totals": { - "base_input": 15778, - "base_output": 7748, - "base_combined": 23526, - "aap_input": 3986, - "aap_output": 1557, - "aap_combined": 5543, - "base_latency_ms": 31412, - "aap_latency_ms": 13931, - "output_savings_pct": 79.9, - "input_delta_pct": -74.7, - "combined_savings_pct": 76.4, - "latency_savings_pct": 55.7 + "base_input": 134074, + "base_output": 75867, + "base_combined": 209941, + "aap_input": 68203, + "aap_output": 60685, + "aap_combined": 128888, + "base_latency_ms": 292532, + "aap_latency_ms": 243605, + "output_savings_pct": 20.0, + "input_delta_pct": -49.1, + "combined_savings_pct": 38.6, + "latency_savings_pct": 16.7 } }, "quality": { "per_turn": [ { "turn": 0, - "sequence_similarity": 0.5285, - "token_f1": 0.609, - "base_char_count": 4451, - "aap_char_count": 2905, - "char_delta_pct": -34.7, - "lines_added": 70, - "lines_removed": 72, + "sequence_similarity": 0.1424, + "token_f1": 0.5894, + "base_char_count": 58284, + "aap_char_count": 60552, + "char_delta_pct": 3.9, + "lines_added": 1201, + "lines_removed": 1267, "rouge_l": null, "bleu": null }, { "turn": 1, - "sequence_similarity": 0.0112, - "token_f1": 0.0331, - "base_char_count": 4557, - "aap_char_count": 284, - "char_delta_pct": -93.8, - "lines_added": 13, - "lines_removed": 81, + "sequence_similarity": 0.1425, + "token_f1": 0.5902, + "base_char_count": 58554, + "aap_char_count": 60550, + "char_delta_pct": 3.4, + "lines_added": 1201, + "lines_removed": 1274, "rouge_l": null, "bleu": null }, { "turn": 2, - "sequence_similarity": 0.0102, - "token_f1": 0.031, - "base_char_count": 4994, - "aap_char_count": 284, - "char_delta_pct": -94.3, - "lines_added": 13, - "lines_removed": 86, + "sequence_similarity": 0.1417, + "token_f1": 0.5768, + "base_char_count": 64162, + "aap_char_count": 66498, + "char_delta_pct": 3.6, + "lines_added": 1307, + "lines_removed": 1380, "rouge_l": null, "bleu": null }, { "turn": 3, - "sequence_similarity": 0.0104, - "token_f1": 0.0319, - "base_char_count": 4896, - "aap_char_count": 284, - "char_delta_pct": -94.2, - "lines_added": 13, - "lines_removed": 84, + "sequence_similarity": 0.1415, + "token_f1": 0.5741, + "base_char_count": 64336, + "aap_char_count": 66500, + "char_delta_pct": 3.4, + "lines_added": 1307, + "lines_removed": 1380, "rouge_l": null, "bleu": null }, { "turn": 4, - "sequence_similarity": 0.0568, - "token_f1": 0.0501, - "base_char_count": 5714, - "aap_char_count": 730, - "char_delta_pct": -87.2, - "lines_added": 24, - "lines_removed": 96, + "sequence_similarity": 0.1381, + "token_f1": 0.5616, + "base_char_count": 67999, + "aap_char_count": 70543, + "char_delta_pct": 3.7, + "lines_added": 1387, + "lines_removed": 1465, "rouge_l": null, "bleu": null } ], - "mean_sequence_similarity": 0.1234, - "mean_token_f1": 0.151, + "mean_sequence_similarity": 0.1412, + "mean_token_f1": 0.5784, "mean_rouge_l": null, "mean_bleu": null, "judge_comparisons": null, diff --git a/evals/data/experiments/001-html-dashboard-ecommerce/outputs/aap/turn-0.html b/evals/data/experiments/001-html-dashboard-ecommerce/outputs/aap/turn-0.html index b1e3785..0545766 100644 --- a/evals/data/experiments/001-html-dashboard-ecommerce/outputs/aap/turn-0.html +++ b/evals/data/experiments/001-html-dashboard-ecommerce/outputs/aap/turn-0.html @@ -1,75 +1,1332 @@ - - - - -
- -
-

Total Revenue

$54,230
-

Orders

1,240
-

Customers

852
-

Conversion

3.2%
-
-
- - -
-

Products

- - - - - -
NameSKUPriceStockStatus
-
-
- - -
-

Recent Orders

- - - - - -
IDCustomerAmountStatus
-
-
- - -
-

Account Settings

-
-


-
Enable Updates -
+ } + + /* --- Stat Cards --- */ + .card { + background-color: var(--card-bg); + border-radius: 8px; + padding: 20px; + box-shadow: 0 4px 6px rgba(0, 0, 0, 0.05); + border: 1px solid var(--border-color); + } + + .card-title { + font-size: 0.9rem; + color: var(--text-secondary); + margin-bottom: 10px; + font-weight: 500; + } + + .card-value { + font-size: 2rem; + font-weight: 700; + color: var(--text-color); + margin-bottom: 10px; + } + + .card-trend { + display: flex; + align-items: center; + font-size: 0.85rem; + font-weight: 500; + color: var(--text-secondary); + } + + .card-trend.positive { + color: var(--success-color); + } + + .card-trend.negative { + color: var(--danger-color); + } + + .trend-icon { + margin-right: 5px; + display: inline-block; + } + + /* --- Section Titles --- */ + .section-title { + font-size: 1.5rem; + font-weight: 600; + color: var(--text-color); + margin-top: 30px; + margin-bottom: 20px; + } + + /* --- Tables --- */ + .table-wrapper { + background-color: var(--card-bg); + border-radius: 8px; + box-shadow: 0 4px 6px rgba(0, 0, 0, 0.05); + border: 1px solid var(--border-color); + overflow-x: auto; + margin-top: 20px; + } + + .data-table { + width: 100%; + border-collapse: collapse; + font-size: 0.9rem; + } + + .data-table th, .data-table td { + padding: 12px 15px; + text-align: left; + border-bottom: 1px solid var(--border-color); + } + + .data-table th { + background-color: var(--bg-color); + color: var(--text-secondary); + font-weight: 600; + text-transform: uppercase; + font-size: 0.8rem; + } + + .data-table tbody tr:last-child td { + border-bottom: none; + } + + .data-table tbody tr:hover { + background-color: var(--bg-color); + } + + /* Status Badges */ + .badge { + display: inline-flex; + align-items: center; + padding: 5px 10px; + border-radius: 9999px; /* Pill shape */ + font-size: 0.75rem; + font-weight: 600; + text-transform: capitalize; + } + + .badge-success { background-color: #D1FAE5; color: var(--success-color); } + .badge-warning { background-color: #FDE68A; color: var(--warning-color); } + .badge-danger { background-color: #FEE2E2; color: var(--danger-color); } + .badge-info { background-color: #DBEAFE; color: var(--info-color); } + .badge-secondary { background-color: #E5E7EB; color: var(--text-secondary); } + + /* --- Settings Form --- */ + .form-section { + background-color: var(--card-bg); + border-radius: 8px; + padding: 30px; + box-shadow: 0 4px 6px rgba(0, 0, 0, 0.05); + border: 1px solid var(--border-color); + margin-top: 20px; + } + + .form-group { + margin-bottom: 20px; + } + + .form-group label { + display: block; + font-weight: 500; + margin-bottom: 8px; + color: var(--text-color); + font-size: 0.9rem; + } + + .form-group input[type="text"], + .form-group input[type="email"], + .form-group input[type="password"] { + width: 100%; + padding: 10px 12px; + border: 1px solid var(--border-color); + border-radius: 6px; + font-size: 0.9rem; + color: var(--text-color); + box-sizing: border-box; + transition: border-color 0.2s ease, box-shadow 0.2s ease; + } + + .form-group input[type="text"]:focus, + .form-group input[type="email"]:focus, + .form-group input[type="password"]:focus { + outline: none; + border-color: var(--primary-light); + box-shadow: 0 0 0 3px rgba(79, 70, 229, 0.1); + } + + .form-actions { + margin-top: 30px; + text-align: right; + } + + .btn-primary { + background-color: var(--primary-color); + color: white; + padding: 10px 20px; + border: none; + border-radius: 6px; + cursor: pointer; + font-size: 0.9rem; + font-weight: 600; + transition: background-color 0.2s ease; + } + + .btn-primary:hover { + background-color: var(--primary-dark); + } + + /* Toggle Switch */ + .toggle-switch { + display: flex; + align-items: center; + justify-content: space-between; + padding: 10px 0; + border-bottom: 1px solid var(--border-color); + } + + .toggle-switch:last-of-type { + border-bottom: none; + } + + .toggle-switch-label { + font-weight: 500; + color: var(--text-color); + font-size: 0.9rem; + } + + .switch { + position: relative; + display: inline-block; + width: 44px; + height: 24px; + } + + .switch input { + opacity: 0; + width: 0; + height: 0; + } + + .slider { + position: absolute; + cursor: pointer; + top: 0; + left: 0; + right: 0; + bottom: 0; + background-color: #ccc; + transition: .4s; + border-radius: 34px; + } + + .slider:before { + position: absolute; + content: ""; + height: 18px; + width: 18px; + left: 3px; + bottom: 3px; + background-color: white; + transition: .4s; + border-radius: 50%; + } + + input:checked + .slider { + background-color: var(--primary-color); + } + + input:focus + .slider { + box-shadow: 0 0 1px var(--primary-color); + } + + input:checked + .slider:before { + transform: translateX(20px); + } + + + + + + + + +
+

Dashboard Overview

+ + +
+ +
+
Total Revenue
+
$8,450,290
+
+ 12.5% vs last month +
+
+
+ + +
+
Total Orders
+
5,231
+
+ 8.1% vs last month +
+
+
+ + +
+
New Customers
+
890
+
+ -3.2% vs last month +
+
+
+ + +
+
Conversion Rate
+
3.8%
+
+ 0.5% vs last month +
+
+
+
+
+ +

Product List

+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameSKUPriceStockCategoryStatus
Wireless Bluetooth HeadphonesELC-HP-001$79.99150ElectronicsIn Stock
Smartwatch Series 7ELC-SW-007$249.0080WearablesIn Stock
USB-C Fast Charger (65W)ELC-CH-065$29.9930AccessoriesLow Stock
Mechanical Gaming Keyboard RGBPCG-KB-RGB$119.9925PC PeripheralsLow Stock
Portable SSD 1TB USB 3.2STO-SSD-1TB$129.9995StorageIn Stock
4K UHD Smart TV 55-inchELC-TV-55$699.005TelevisionsLow Stock
Ergonomic Office ChairFURN-OC-ERG$299.0012Office FurnitureLow Stock
Noise Cancelling Earbuds ProELC-EB-PRO$179.0070AudioIn Stock
Compact Espresso MachineHOME-CM-ESP$149.990Kitchen AppliancesOut of Stock
Robot Vacuum Cleaner with MopHOME-VC-ROB$349.9918Home AppliancesIn Stock
High-Performance BlenderHOME-BL-HP$89.9940Kitchen AppliancesIn Stock
Digital Air Fryer 5.8QTHOME-AF-5QT$110.0020Kitchen AppliancesLow Stock
Portable Bluetooth Speaker X2ELC-SP-X2$59.9960AudioIn Stock
Gaming Mouse RGB ProPCG-MS-RGB$49.9975PC PeripheralsIn Stock
External Hard Drive 2TBSTO-HDD-2TB$79.000StorageOut of Stock
Mesh Wi-Fi System (3-pack)NET-WF-MESH$199.9910NetworkingLow Stock
Electric Kettle Stainless SteelHOME-KT-SS$39.9955Kitchen AppliancesIn Stock
Smart Doorbell CameraSMART-DB-CAM$150.0022Smart HomeLow Stock
Fitness Tracker with HRELC-FT-HR$65.0090WearablesIn Stock
Digital Drawing Tablet 10-inchART-DT-10$99.0015Creative ToolsLow Stock
Wireless Charging PadELC-WP-001$25.00120AccessoriesIn Stock
Curved Gaming Monitor 27-inchPCG-MN-27C$349.008MonitorsLow Stock
Portable Projector MiniELC-PJ-MINI$199.000ProjectorsOut of Stock
Action Camera 4K ProCAM-AC-4K$299.0010CamerasLow Stock
Smart Plug Wi-Fi (4-pack)SMART-PL-4PK$39.9970Smart HomeIn Stock
Premium Noise-Cancelling Over-Ear HeadphonesELC-HP-NC-PRE$349.9945AudioIn Stock
Wireless Gaming HeadsetPCG-HS-WL$99.9920PC PeripheralsLow Stock
Portable Power Bank 20000mAhELC-PB-20K$45.00180AccessoriesIn Stock
Smart Light Bulb E27 (Color)SMART-LB-C$15.99200Smart HomeIn Stock
Digital Photo Frame 8-inchELC-PF-8IN$75.0012Home DecorLow Stock
Entry-Level DSLR Camera KitCAM-DSLR-KIT$599.007CamerasLow Stock
Wireless Router Wi-Fi 6NET-RT-W6$120.0025NetworkingIn Stock
Smart Scale with Body CompositionHEALTH-SC-BC$49.9935Health & FitnessIn Stock
Electric Toothbrush with AppHEALTH-TB-APP$89.000Personal CareOut of Stock
Home Security Camera IndoorSMART-SC-IN$69.9940Smart HomeIn Stock
Portable Mini Fan USBHOME-FN-USB$19.99100Home AppliancesIn Stock
USB Microphone for StreamingAUDIO-MIC-USB$60.0015AudioLow Stock
Universal Travel AdapterELC-TA-UNI$22.00110Travel AccessoriesIn Stock
Digital Kitchen Food ScaleHOME-FS-DIG$29.9960Kitchen AppliancesIn Stock
Smart Thermostat LearningSMART-TH-LRN$180.009Smart HomeLow Stock
Mini Projector Portable HDELC-MP-HD$120.000ProjectorsOut of Stock
Gaming Chair with Lumbar SupportFURN-GC-LUM$250.0010Office FurnitureLow Stock
Electric Hand MixerHOME-MX-ELEC$35.0045Kitchen AppliancesIn Stock
Portable Photo PrinterCAM-PR-PORT$99.0018CamerasIn Stock
Smart Water BottleHEALTH-WB-SMART$30.0070Health & FitnessIn Stock
Bluetooth Car AdapterAUTO-BT-ADP$20.0090Car AccessoriesIn Stock
Air Purifier for HomeHOME-AP-01$149.0010Home AppliancesLow Stock
Robot Toy ProgrammableTOY-RB-PROG$85.0025Toys & GamesIn Stock
External Webcam Full HDELC-WC-FHD$49.0030PC PeripheralsIn Stock
Smart Garden Indoor KitHOME-SG-IN$79.0015Smart HomeLow Stock
Magnetic Phone Car MountAUTO-PM-MAG$15.00150Car AccessoriesIn Stock
Portable Espresso MakerHOME-PM-ESP$60.000Kitchen AppliancesOut of Stock
UV Light Sanitizer BoxHEALTH-UV-BOX$40.0050Personal CareIn Stock
Wireless Charger StandELC-CS-WL$30.0080AccessoriesIn Stock
Dimmable LED Desk LampHOME-DL-LED$45.0025Home DecorLow Stock
VR Headset BasicGAMING-VR-BASIC$199.005GamingLow Stock
Streaming Webcam 1080pELC-WC-1080$55.0030PC PeripheralsIn Stock
Foldable Drone with CameraDRN-FL-CAM$120.0010DronesLow Stock
Smart Wi-Fi Coffee MakerHOME-CM-WIFI$95.0012Kitchen AppliancesLow Stock
Digital Body Fat ScaleHEALTH-BFS-DIG$35.0040Health & FitnessIn Stock
+
+
+
+ +

Recent Orders

+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Order IDCustomerAmountDateStatus
#ORD00101Alice Johnson$249.002023-10-26Delivered
#ORD00102Bob Smith$79.992023-10-25Shipped
#ORD00103Charlie Brown$119.992023-10-25Pending
#ORD00104Diana Prince$29.992023-10-24Delivered
#ORD00105Eve Adams$699.002023-10-24Shipped
#ORD00106Frank White$129.992023-10-23Delivered
#ORD00107Grace Lee$299.002023-10-23Pending
#ORD00108Henry Clark$179.002023-10-22Shipped
#ORD00109Ivy Green$149.992023-10-22Canceled
#ORD00110Jack King$349.992023-10-21Delivered
#ORD00111Karen Hall$89.992023-10-21Shipped
#ORD00112Liam Scott$110.002023-10-20Pending
#ORD00113Mia Baker$59.992023-10-20Delivered
#ORD00114Noah Taylor$49.992023-10-19Shipped
#ORD00115Olivia Miller$79.002023-10-19Canceled
#ORD00116Peter Davis$199.992023-10-18Delivered
#ORD00117Quinn Wilson$39.992023-10-18Shipped
#ORD00118Rachel Moore$150.002023-10-17Pending
#ORD00119Sam Harris$65.002023-10-17Delivered
#ORD00120Tina Young$99.002023-10-16Shipped
#ORD00121Uma Jackson$25.002023-10-16Delivered
#ORD00122Victor Green$349.002023-10-15Pending
#ORD00123Wendy White$199.002023-10-15Canceled
#ORD00124Xavier Bell$299.002023-10-14Shipped
#ORD00125Yara Hall$39.992023-10-14Delivered
#ORD00126Zack Adams$349.992023-10-13Shipped
#ORD00127Amy Brown$99.992023-10-13Pending
#ORD00128Brian Davis$45.002023-10-12Delivered
#ORD00129Chloe Evans$15.992023-10-12Shipped
#ORD00130Daniel Garcia$75.002023-10-11Delivered
#ORD00131Emily Rodriguez$599.002023-10-11Pending
#ORD00132Fiona Martinez$120.002023-10-10Shipped
#ORD00133George Hernandez$49.992023-10-10Delivered
#ORD00134Hannah Lopez$89.002023-10-09Canceled
#ORD00135Isaac Perez$69.992023-10-09Shipped
#ORD00136Julia Garcia$19.992023-10-08Delivered
#ORD00137Kevin Scott$60.002023-10-08Pending
#ORD00138Laura Kim$22.002023-10-07Delivered
#ORD00139Michael Chen$29.992023-10-07Shipped
#ORD00140Nancy Patel$180.002023-10-06Delivered
+
+
+
+ +

Account Settings

+ +
+

Profile Information

+ +
+ + + + +
+
+ + + + +
+
+ + + + +
+
+ + + +
+
+
+ +
+

Notification Settings

+ +
+ Email Notifications + +
+
+ SMS Notifications + +
+
+ Push Notifications + +
+
+ + + +
+
+
+
- -
\ No newline at end of file + + \ No newline at end of file diff --git a/evals/data/experiments/001-html-dashboard-ecommerce/outputs/aap/turn-1.html b/evals/data/experiments/001-html-dashboard-ecommerce/outputs/aap/turn-1.html index ab5031f..9f1b92f 100644 --- a/evals/data/experiments/001-html-dashboard-ecommerce/outputs/aap/turn-1.html +++ b/evals/data/experiments/001-html-dashboard-ecommerce/outputs/aap/turn-1.html @@ -1,13 +1,1332 @@ -{ - "protocol": "aap/0.1", - "id": "artifact-id", - "version": 2, - "name": "edit", - "content": [ - { - "op": "replace", - "target": {"type": "id", "value": "revenue"}, - "content": "$215,430 (+12.3%)" - } - ] -} \ No newline at end of file + + + + + + E-commerce Admin Dashboard + + + + + + + +
+

Dashboard Overview

+ + +
+ +
+
Total Revenue
+
$215,430
+
+ 12.3% vs last month +
+
+
+ + +
+
Total Orders
+
5,231
+
+ 8.1% vs last month +
+
+
+ + +
+
New Customers
+
890
+
+ -3.2% vs last month +
+
+
+ + +
+
Conversion Rate
+
3.8%
+
+ 0.5% vs last month +
+
+
+
+
+ +

Product List

+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameSKUPriceStockCategoryStatus
Wireless Bluetooth HeadphonesELC-HP-001$79.99150ElectronicsIn Stock
Smartwatch Series 7ELC-SW-007$249.0080WearablesIn Stock
USB-C Fast Charger (65W)ELC-CH-065$29.9930AccessoriesLow Stock
Mechanical Gaming Keyboard RGBPCG-KB-RGB$119.9925PC PeripheralsLow Stock
Portable SSD 1TB USB 3.2STO-SSD-1TB$129.9995StorageIn Stock
4K UHD Smart TV 55-inchELC-TV-55$699.005TelevisionsLow Stock
Ergonomic Office ChairFURN-OC-ERG$299.0012Office FurnitureLow Stock
Noise Cancelling Earbuds ProELC-EB-PRO$179.0070AudioIn Stock
Compact Espresso MachineHOME-CM-ESP$149.990Kitchen AppliancesOut of Stock
Robot Vacuum Cleaner with MopHOME-VC-ROB$349.9918Home AppliancesIn Stock
High-Performance BlenderHOME-BL-HP$89.9940Kitchen AppliancesIn Stock
Digital Air Fryer 5.8QTHOME-AF-5QT$110.0020Kitchen AppliancesLow Stock
Portable Bluetooth Speaker X2ELC-SP-X2$59.9960AudioIn Stock
Gaming Mouse RGB ProPCG-MS-RGB$49.9975PC PeripheralsIn Stock
External Hard Drive 2TBSTO-HDD-2TB$79.000StorageOut of Stock
Mesh Wi-Fi System (3-pack)NET-WF-MESH$199.9910NetworkingLow Stock
Electric Kettle Stainless SteelHOME-KT-SS$39.9955Kitchen AppliancesIn Stock
Smart Doorbell CameraSMART-DB-CAM$150.0022Smart HomeLow Stock
Fitness Tracker with HRELC-FT-HR$65.0090WearablesIn Stock
Digital Drawing Tablet 10-inchART-DT-10$99.0015Creative ToolsLow Stock
Wireless Charging PadELC-WP-001$25.00120AccessoriesIn Stock
Curved Gaming Monitor 27-inchPCG-MN-27C$349.008MonitorsLow Stock
Portable Projector MiniELC-PJ-MINI$199.000ProjectorsOut of Stock
Action Camera 4K ProCAM-AC-4K$299.0010CamerasLow Stock
Smart Plug Wi-Fi (4-pack)SMART-PL-4PK$39.9970Smart HomeIn Stock
Premium Noise-Cancelling Over-Ear HeadphonesELC-HP-NC-PRE$349.9945AudioIn Stock
Wireless Gaming HeadsetPCG-HS-WL$99.9920PC PeripheralsLow Stock
Portable Power Bank 20000mAhELC-PB-20K$45.00180AccessoriesIn Stock
Smart Light Bulb E27 (Color)SMART-LB-C$15.99200Smart HomeIn Stock
Digital Photo Frame 8-inchELC-PF-8IN$75.0012Home DecorLow Stock
Entry-Level DSLR Camera KitCAM-DSLR-KIT$599.007CamerasLow Stock
Wireless Router Wi-Fi 6NET-RT-W6$120.0025NetworkingIn Stock
Smart Scale with Body CompositionHEALTH-SC-BC$49.9935Health & FitnessIn Stock
Electric Toothbrush with AppHEALTH-TB-APP$89.000Personal CareOut of Stock
Home Security Camera IndoorSMART-SC-IN$69.9940Smart HomeIn Stock
Portable Mini Fan USBHOME-FN-USB$19.99100Home AppliancesIn Stock
USB Microphone for StreamingAUDIO-MIC-USB$60.0015AudioLow Stock
Universal Travel AdapterELC-TA-UNI$22.00110Travel AccessoriesIn Stock
Digital Kitchen Food ScaleHOME-FS-DIG$29.9960Kitchen AppliancesIn Stock
Smart Thermostat LearningSMART-TH-LRN$180.009Smart HomeLow Stock
Mini Projector Portable HDELC-MP-HD$120.000ProjectorsOut of Stock
Gaming Chair with Lumbar SupportFURN-GC-LUM$250.0010Office FurnitureLow Stock
Electric Hand MixerHOME-MX-ELEC$35.0045Kitchen AppliancesIn Stock
Portable Photo PrinterCAM-PR-PORT$99.0018CamerasIn Stock
Smart Water BottleHEALTH-WB-SMART$30.0070Health & FitnessIn Stock
Bluetooth Car AdapterAUTO-BT-ADP$20.0090Car AccessoriesIn Stock
Air Purifier for HomeHOME-AP-01$149.0010Home AppliancesLow Stock
Robot Toy ProgrammableTOY-RB-PROG$85.0025Toys & GamesIn Stock
External Webcam Full HDELC-WC-FHD$49.0030PC PeripheralsIn Stock
Smart Garden Indoor KitHOME-SG-IN$79.0015Smart HomeLow Stock
Magnetic Phone Car MountAUTO-PM-MAG$15.00150Car AccessoriesIn Stock
Portable Espresso MakerHOME-PM-ESP$60.000Kitchen AppliancesOut of Stock
UV Light Sanitizer BoxHEALTH-UV-BOX$40.0050Personal CareIn Stock
Wireless Charger StandELC-CS-WL$30.0080AccessoriesIn Stock
Dimmable LED Desk LampHOME-DL-LED$45.0025Home DecorLow Stock
VR Headset BasicGAMING-VR-BASIC$199.005GamingLow Stock
Streaming Webcam 1080pELC-WC-1080$55.0030PC PeripheralsIn Stock
Foldable Drone with CameraDRN-FL-CAM$120.0010DronesLow Stock
Smart Wi-Fi Coffee MakerHOME-CM-WIFI$95.0012Kitchen AppliancesLow Stock
Digital Body Fat ScaleHEALTH-BFS-DIG$35.0040Health & FitnessIn Stock
+
+
+
+ +

Recent Orders

+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Order IDCustomerAmountDateStatus
#ORD00101Alice Johnson$249.002023-10-26Delivered
#ORD00102Bob Smith$79.992023-10-25Shipped
#ORD00103Charlie Brown$119.992023-10-25Pending
#ORD00104Diana Prince$29.992023-10-24Delivered
#ORD00105Eve Adams$699.002023-10-24Shipped
#ORD00106Frank White$129.992023-10-23Delivered
#ORD00107Grace Lee$299.002023-10-23Pending
#ORD00108Henry Clark$179.002023-10-22Shipped
#ORD00109Ivy Green$149.992023-10-22Canceled
#ORD00110Jack King$349.992023-10-21Delivered
#ORD00111Karen Hall$89.992023-10-21Shipped
#ORD00112Liam Scott$110.002023-10-20Pending
#ORD00113Mia Baker$59.992023-10-20Delivered
#ORD00114Noah Taylor$49.992023-10-19Shipped
#ORD00115Olivia Miller$79.002023-10-19Canceled
#ORD00116Peter Davis$199.992023-10-18Delivered
#ORD00117Quinn Wilson$39.992023-10-18Shipped
#ORD00118Rachel Moore$150.002023-10-17Pending
#ORD00119Sam Harris$65.002023-10-17Delivered
#ORD00120Tina Young$99.002023-10-16Shipped
#ORD00121Uma Jackson$25.002023-10-16Delivered
#ORD00122Victor Green$349.002023-10-15Pending
#ORD00123Wendy White$199.002023-10-15Canceled
#ORD00124Xavier Bell$299.002023-10-14Shipped
#ORD00125Yara Hall$39.992023-10-14Delivered
#ORD00126Zack Adams$349.992023-10-13Shipped
#ORD00127Amy Brown$99.992023-10-13Pending
#ORD00128Brian Davis$45.002023-10-12Delivered
#ORD00129Chloe Evans$15.992023-10-12Shipped
#ORD00130Daniel Garcia$75.002023-10-11Delivered
#ORD00131Emily Rodriguez$599.002023-10-11Pending
#ORD00132Fiona Martinez$120.002023-10-10Shipped
#ORD00133George Hernandez$49.992023-10-10Delivered
#ORD00134Hannah Lopez$89.002023-10-09Canceled
#ORD00135Isaac Perez$69.992023-10-09Shipped
#ORD00136Julia Garcia$19.992023-10-08Delivered
#ORD00137Kevin Scott$60.002023-10-08Pending
#ORD00138Laura Kim$22.002023-10-07Delivered
#ORD00139Michael Chen$29.992023-10-07Shipped
#ORD00140Nancy Patel$180.002023-10-06Delivered
+
+
+
+ +

Account Settings

+ +
+

Profile Information

+ +
+ + + + +
+
+ + + + +
+
+ + + + +
+
+ + + +
+
+
+ +
+

Notification Settings

+ +
+ Email Notifications + +
+
+ SMS Notifications + +
+
+ Push Notifications + +
+
+ + + +
+
+
+
+
+ + \ No newline at end of file diff --git a/evals/data/experiments/001-html-dashboard-ecommerce/outputs/aap/turn-1.json b/evals/data/experiments/001-html-dashboard-ecommerce/outputs/aap/turn-1.json index c4ecbab..573f350 100644 --- a/evals/data/experiments/001-html-dashboard-ecommerce/outputs/aap/turn-1.json +++ b/evals/data/experiments/001-html-dashboard-ecommerce/outputs/aap/turn-1.json @@ -1,8 +1,8 @@ { "protocol": "aap/0.1", - "id": "artifact-id-edit-1", - "version": 1, - "name": "synthesize", + "id": "artifact-id", + "version": 2, + "name": "edit", "meta": { "format": "text/html", "tokens_used": null, @@ -11,7 +11,20 @@ }, "content": [ { - "body": "{\n \"protocol\": \"aap/0.1\",\n \"id\": \"artifact-id\",\n \"version\": 2,\n \"name\": \"edit\",\n \"content\": [\n {\n \"op\": \"replace\",\n \"target\": {\"type\": \"id\", \"value\": \"revenue\"},\n \"content\": \"$215,430 (+12.3%)\"\n }\n ]\n}" + "op": "replace", + "target": { + "type": "id", + "value": "stat-value-revenue" + }, + "content": "$215,430" + }, + { + "op": "replace", + "target": { + "type": "id", + "value": "stat-trend-revenue" + }, + "content": "12.3% vs last month" } ] } \ No newline at end of file diff --git a/evals/data/experiments/001-html-dashboard-ecommerce/outputs/aap/turn-2.html b/evals/data/experiments/001-html-dashboard-ecommerce/outputs/aap/turn-2.html index ab5031f..a9749e8 100644 --- a/evals/data/experiments/001-html-dashboard-ecommerce/outputs/aap/turn-2.html +++ b/evals/data/experiments/001-html-dashboard-ecommerce/outputs/aap/turn-2.html @@ -1,13 +1,1438 @@ -{ - "protocol": "aap/0.1", - "id": "artifact-id", - "version": 2, - "name": "edit", - "content": [ - { - "op": "replace", - "target": {"type": "id", "value": "revenue"}, - "content": "$215,430 (+12.3%)" - } - ] -} \ No newline at end of file + + + + + + E-commerce Admin Dashboard + + + + + + + +
+

Dashboard Overview

+ + +
+ +
+
Total Revenue
+
$215,430
+
+ 12.3% vs last month +
+
+
+ + +
+
Total Orders
+
5,231
+
+ 8.1% vs last month +
+
+
+ + +
+
New Customers
+
890
+
+ -3.2% vs last month +
+
+
+ + +
+
Conversion Rate
+
3.8%
+
+ 0.5% vs last month +
+
+
+
+
+ +

Product List

+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameSKUPriceStockCategoryStatus
Wireless Bluetooth HeadphonesELC-HP-001$79.99150ElectronicsIn Stock
Smartwatch Series 7ELC-SW-007$249.0080WearablesIn Stock
USB-C Fast Charger (65W)ELC-CH-065$29.9930AccessoriesLow Stock
Mechanical Gaming Keyboard RGBPCG-KB-RGB$119.9925PC PeripheralsLow Stock
Portable SSD 1TB USB 3.2STO-SSD-1TB$129.9995StorageIn Stock
4K UHD Smart TV 55-inchELC-TV-55$699.005TelevisionsLow Stock
Ergonomic Office ChairFURN-OC-ERG$299.0012Office FurnitureLow Stock
Noise Cancelling Earbuds ProELC-EB-PRO$179.0070AudioIn Stock
Compact Espresso MachineHOME-CM-ESP$149.990Kitchen AppliancesOut of Stock
Robot Vacuum Cleaner with MopHOME-VC-ROB$349.9918Home AppliancesIn Stock
High-Performance BlenderHOME-BL-HP$89.9940Kitchen AppliancesIn Stock
Digital Air Fryer 5.8QTHOME-AF-5QT$110.0020Kitchen AppliancesLow Stock
Portable Bluetooth Speaker X2ELC-SP-X2$59.9960AudioIn Stock
Gaming Mouse RGB ProPCG-MS-RGB$49.9975PC PeripheralsIn Stock
External Hard Drive 2TBSTO-HDD-2TB$79.000StorageOut of Stock
Mesh Wi-Fi System (3-pack)NET-WF-MESH$199.9910NetworkingLow Stock
Electric Kettle Stainless SteelHOME-KT-SS$39.9955Kitchen AppliancesIn Stock
Smart Doorbell CameraSMART-DB-CAM$150.0022Smart HomeLow Stock
Fitness Tracker with HRELC-FT-HR$65.0090WearablesIn Stock
Digital Drawing Tablet 10-inchART-DT-10$99.0015Creative ToolsLow Stock
Wireless Charging PadELC-WP-001$25.00120AccessoriesIn Stock
Curved Gaming Monitor 27-inchPCG-MN-27C$349.008MonitorsLow Stock
Portable Projector MiniELC-PJ-MINI$199.000ProjectorsOut of Stock
Action Camera 4K ProCAM-AC-4K$299.0010CamerasLow Stock
Smart Plug Wi-Fi (4-pack)SMART-PL-4PK$39.9970Smart HomeIn Stock
Premium Noise-Cancelling Over-Ear HeadphonesELC-HP-NC-PRE$349.9945AudioIn Stock
Wireless Gaming HeadsetPCG-HS-WL$99.9920PC PeripheralsLow Stock
Portable Power Bank 20000mAhELC-PB-20K$45.00180AccessoriesIn Stock
Smart Light Bulb E27 (Color)SMART-LB-C$15.99200Smart HomeIn Stock
Digital Photo Frame 8-inchELC-PF-8IN$75.0012Home DecorLow Stock
Entry-Level DSLR Camera KitCAM-DSLR-KIT$599.007CamerasLow Stock
Wireless Router Wi-Fi 6NET-RT-W6$120.0025NetworkingIn Stock
Smart Scale with Body CompositionHEALTH-SC-BC$49.9935Health & FitnessIn Stock
Electric Toothbrush with AppHEALTH-TB-APP$89.000Personal CareOut of Stock
Home Security Camera IndoorSMART-SC-IN$69.9940Smart HomeIn Stock
Portable Mini Fan USBHOME-FN-USB$19.99100Home AppliancesIn Stock
USB Microphone for StreamingAUDIO-MIC-USB$60.0015AudioLow Stock
Universal Travel AdapterELC-TA-UNI$22.00110Travel AccessoriesIn Stock
Digital Kitchen Food ScaleHOME-FS-DIG$29.9960Kitchen AppliancesIn Stock
Smart Thermostat LearningSMART-TH-LRN$180.009Smart HomeLow Stock
Mini Projector Portable HDELC-MP-HD$120.000ProjectorsOut of Stock
Gaming Chair with Lumbar SupportFURN-GC-LUM$250.0010Office FurnitureLow Stock
Electric Hand MixerHOME-MX-ELEC$35.0045Kitchen AppliancesIn Stock
Portable Photo PrinterCAM-PR-PORT$99.0018CamerasIn Stock
Smart Water BottleHEALTH-WB-SMART$30.0070Health & FitnessIn Stock
Bluetooth Car AdapterAUTO-BT-ADP$20.0090Car AccessoriesIn Stock
Air Purifier for HomeHOME-AP-01$149.0010Home AppliancesLow Stock
Robot Toy ProgrammableTOY-RB-PROG$85.0025Toys & GamesIn Stock
External Webcam Full HDELC-WC-FHD$49.0030PC PeripheralsIn Stock
Smart Garden Indoor KitHOME-SG-IN$79.0015Smart HomeLow Stock
Magnetic Phone Car MountAUTO-PM-MAG$15.00150Car AccessoriesIn Stock
Portable Espresso MakerHOME-PM-ESP$60.000Kitchen AppliancesOut of Stock
UV Light Sanitizer BoxHEALTH-UV-BOX$40.0050Personal CareIn Stock
Wireless Charger StandELC-CS-WL$30.0080AccessoriesIn Stock
Dimmable LED Desk LampHOME-DL-LED$45.0025Home DecorLow Stock
VR Headset BasicGAMING-VR-BASIC$199.005GamingLow Stock
Streaming Webcam 1080pELC-WC-1080$55.0030PC PeripheralsIn Stock
Foldable Drone with CameraDRN-FL-CAM$120.0010DronesLow Stock
Smart Wi-Fi Coffee MakerHOME-CM-WIFI$95.0012Kitchen AppliancesLow Stock
Digital Body Fat ScaleHEALTH-BFS-DIG$35.0040Health & FitnessIn Stock
+
+
+
+ +

Recent Orders

+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Order IDCustomerAmountDateStatus
#ORD00101Alice Johnson$249.002023-10-26Delivered
#ORD00102Bob Smith$79.992023-10-25Shipped
#ORD00103Charlie Brown$119.992023-10-25Pending
#ORD00104Diana Prince$29.992023-10-24Delivered
#ORD00105Eve Adams$699.002023-10-24Shipped
#ORD00106Frank White$129.992023-10-23Delivered
#ORD00107Grace Lee$299.002023-10-23Pending
#ORD00108Henry Clark$179.002023-10-22Shipped
#ORD00109Ivy Green$149.992023-10-22Canceled
#ORD00110Jack King$349.992023-10-21Delivered
#ORD00111Karen Hall$89.992023-10-21Shipped
#ORD00112Liam Scott$110.002023-10-20Pending
#ORD00113Mia Baker$59.992023-10-20Delivered
#ORD00114Noah Taylor$49.992023-10-19Shipped
#ORD00115Olivia Miller$79.002023-10-19Canceled
#ORD00116Peter Davis$199.992023-10-18Delivered
#ORD00117Quinn Wilson$39.992023-10-18Shipped
#ORD00118Rachel Moore$150.002023-10-17Pending
#ORD00119Sam Harris$65.002023-10-17Delivered
#ORD00120Tina Young$99.002023-10-16Shipped
#ORD00121Uma Jackson$25.002023-10-16Delivered
#ORD00122Victor Green$349.002023-10-15Pending
#ORD00123Wendy White$199.002023-10-15Canceled
#ORD00124Xavier Bell$299.002023-10-14Shipped
#ORD00125Yara Hall$39.992023-10-14Delivered
#ORD00126Zack Adams$349.992023-10-13Shipped
#ORD00127Amy Brown$99.992023-10-13Pending
#ORD00128Brian Davis$45.002023-10-12Delivered
#ORD00129Chloe Evans$15.992023-10-12Shipped
#ORD00130Daniel Garcia$75.002023-10-11Delivered
#ORD00131Emily Rodriguez$599.002023-10-11Pending
#ORD00132Fiona Martinez$120.002023-10-10Shipped
#ORD00133George Hernandez$49.992023-10-10Delivered
#ORD00134Hannah Lopez$89.002023-10-09Canceled
#ORD00135Isaac Perez$69.992023-10-09Shipped
#ORD00136Julia Garcia$19.992023-10-08Delivered
#ORD00137Kevin Scott$60.002023-10-08Pending
#ORD00138Laura Kim$22.002023-10-07Delivered
#ORD00139Michael Chen$29.992023-10-07Shipped
#ORD00140Nancy Patel$180.002023-10-06Delivered
#ORD00141Chris Evans$75.502026-03-15Delivered
#ORD00142Jessica Alba$120.002026-03-14Shipped
#ORD00143Tom Hanks$35.992026-03-14Pending
#ORD00144Sandra Bullock$200.002026-03-13Delivered
#ORD00145Leonardo DiCaprio$89.992026-03-13Shipped
#ORD00146Julia Roberts$150.002026-03-12Delivered
#ORD00147Brad Pitt$49.002026-03-12Pending
#ORD00148Angelina Jolie$320.002026-03-11Shipped
#ORD00149George Clooney$70.002026-03-11Canceled
#ORD00150Jennifer Aniston$180.002026-03-10Delivered
#ORD00151Dwayne Johnson$99.002026-03-10Shipped
#ORD00152Gal Gadot$65.002026-03-09Pending
#ORD00153Robert Downey Jr.$450.002026-03-09Delivered
#ORD00154Scarlett Johansson$110.002026-03-08Shipped
#ORD00155Chris Hemsworth$75.002026-03-08Delivered
+
+
+
+ +

Account Settings

+ +
+

Profile Information

+ +
+ + + + +
+
+ + + + +
+
+ + + + +
+
+ + + +
+
+
+ +
+

Notification Settings

+ +
+ Email Notifications + +
+
+ SMS Notifications + +
+
+ Push Notifications + +
+
+ + + +
+
+
+
+
+ + \ No newline at end of file diff --git a/evals/data/experiments/001-html-dashboard-ecommerce/outputs/aap/turn-2.json b/evals/data/experiments/001-html-dashboard-ecommerce/outputs/aap/turn-2.json index 5c980e0..6b03d7c 100644 --- a/evals/data/experiments/001-html-dashboard-ecommerce/outputs/aap/turn-2.json +++ b/evals/data/experiments/001-html-dashboard-ecommerce/outputs/aap/turn-2.json @@ -1,7 +1,7 @@ { "protocol": "aap/0.1", "id": "artifact-id", - "version": 3, + "version": 1, "name": "edit", "meta": { "format": "text/html", @@ -14,9 +14,9 @@ "op": "insert_after", "target": { "type": "id", - "value": "order-table-header" + "value": "orders-table-body" }, - "content": "ORD-2026-0301Alice Johnson$145.00Completed\nORD-2026-0302Bob Smith$89.50Processing\nORD-2026-0303Charlie Brown$210.00Shipped\nORD-2026-0304Diana Prince$34.99Completed\nORD-2026-0305Edward Norton$1200.00Processing\nORD-2026-0306Fiona Gallagher$56.20Shipped\nORD-2026-0307George Miller$430.75Completed\nORD-2026-0308Hannah Abbott$12.50Pending\nORD-2026-0309Ian Wright$99.99Shipped\nORD-2026-0310Jane Doe$275.00Completed\nORD-2026-0311Kevin Hart$150.00Processing\nORD-2026-0312Laura Palmer$88.00Shipped\nORD-2026-0313Mike Wazowski$45.00Completed\nORD-2026-0314Nina Simone$310.25Processing\nORD-2026-0315Oscar Isaac$125.50Shipped" + "content": " \n \n #ORD00141\n Chris Evans\n $75.50\n 2026-03-15\n Delivered\n \n \n #ORD00142\n Jessica Alba\n $120.00\n 2026-03-14\n Shipped\n \n \n #ORD00143\n Tom Hanks\n $35.99\n 2026-03-14\n Pending\n \n \n #ORD00144\n Sandra Bullock\n $200.00\n 2026-03-13\n Delivered\n \n \n #ORD00145\n Leonardo DiCaprio\n $89.99\n 2026-03-13\n Shipped\n \n \n #ORD00146\n Julia Roberts\n $150.00\n 2026-03-12\n Delivered\n \n \n #ORD00147\n Brad Pitt\n $49.00\n 2026-03-12\n Pending\n \n \n #ORD00148\n Angelina Jolie\n $320.00\n 2026-03-11\n Shipped\n \n \n #ORD00149\n George Clooney\n $70.00\n 2026-03-11\n Canceled\n \n \n #ORD00150\n Jennifer Aniston\n $180.00\n 2026-03-10\n Delivered\n \n \n #ORD00151\n Dwayne Johnson\n $99.00\n 2026-03-10\n Shipped\n \n \n #ORD00152\n Gal Gadot\n $65.00\n 2026-03-09\n Pending\n \n \n #ORD00153\n Robert Downey Jr.\n $450.00\n 2026-03-09\n Delivered\n \n \n #ORD00154\n Scarlett Johansson\n $110.00\n 2026-03-08\n Shipped\n \n \n #ORD00155\n Chris Hemsworth\n $75.00\n 2026-03-08\n Delivered\n \n " } ] } \ No newline at end of file diff --git a/evals/data/experiments/001-html-dashboard-ecommerce/outputs/aap/turn-3.html b/evals/data/experiments/001-html-dashboard-ecommerce/outputs/aap/turn-3.html index ab5031f..9965732 100644 --- a/evals/data/experiments/001-html-dashboard-ecommerce/outputs/aap/turn-3.html +++ b/evals/data/experiments/001-html-dashboard-ecommerce/outputs/aap/turn-3.html @@ -1,13 +1,1438 @@ -{ - "protocol": "aap/0.1", - "id": "artifact-id", - "version": 2, - "name": "edit", - "content": [ - { - "op": "replace", - "target": {"type": "id", "value": "revenue"}, - "content": "$215,430 (+12.3%)" - } - ] -} \ No newline at end of file + + + + + + E-commerce Admin Dashboard + + + + + + + +
+

Dashboard Overview

+ + +
+ +
+
Total Revenue
+
$215,430
+
+ 12.3% vs last month +
+
+
+ + +
+
Total Orders
+
5,231
+
+ 8.1% vs last month +
+
+
+ + +
+
New Customers
+
890
+
+ -3.2% vs last month +
+
+
+ + +
+
Conversion Rate
+
3.8%
+
+ 0.5% vs last month +
+
+
+
+
+ +

Product List

+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameSKUPriceStockCategoryStatus
Wireless Bluetooth HeadphonesELC-HP-001$79.99150ElectronicsIn Stock
Smartwatch Series 7ELC-SW-007$249.0080WearablesIn Stock
USB-C Fast Charger (65W)ELC-CH-065$29.9930AccessoriesLow Stock
Mechanical Gaming Keyboard RGBPCG-KB-RGB$119.9925PC PeripheralsLow Stock
Portable SSD 1TB USB 3.2STO-SSD-1TB$129.9995StorageIn Stock
4K UHD Smart TV 55-inchELC-TV-55$699.005TelevisionsLow Stock
Ergonomic Office ChairFURN-OC-ERG$299.0012Office FurnitureLow Stock
Noise Cancelling Earbuds ProELC-EB-PRO$179.0070AudioIn Stock
Compact Espresso MachineHOME-CM-ESP$149.990Kitchen AppliancesOut of Stock
Robot Vacuum Cleaner with MopHOME-VC-ROB$349.9918Home AppliancesIn Stock
High-Performance BlenderHOME-BL-HP$89.9940Kitchen AppliancesIn Stock
Digital Air Fryer 5.8QTHOME-AF-5QT$110.0020Kitchen AppliancesLow Stock
Portable Bluetooth Speaker X2ELC-SP-X2$59.9960AudioIn Stock
Gaming Mouse RGB ProPCG-MS-RGB$49.9975PC PeripheralsIn Stock
External Hard Drive 2TBSTO-HDD-2TB$79.000StorageOut of Stock
Mesh Wi-Fi System (3-pack)NET-WF-MESH$199.9910NetworkingLow Stock
Electric Kettle Stainless SteelHOME-KT-SS$39.9955Kitchen AppliancesIn Stock
Smart Doorbell CameraSMART-DB-CAM$150.0022Smart HomeLow Stock
Fitness Tracker with HRELC-FT-HR$65.0090WearablesIn Stock
Digital Drawing Tablet 10-inchART-DT-10$99.0015Creative ToolsLow Stock
Wireless Charging PadELC-WP-001$25.00120AccessoriesIn Stock
Curved Gaming Monitor 27-inchPCG-MN-27C$349.008MonitorsLow Stock
Portable Projector MiniELC-PJ-MINI$199.000ProjectorsOut of Stock
Action Camera 4K ProCAM-AC-4K$299.0010CamerasLow Stock
Smart Plug Wi-Fi (4-pack)SMART-PL-4PK$39.9970Smart HomeIn Stock
Premium Noise-Cancelling Over-Ear HeadphonesELC-HP-NC-PRE$349.9945AudioIn Stock
Wireless Gaming HeadsetPCG-HS-WL$99.9920PC PeripheralsLow Stock
Portable Power Bank 20000mAhELC-PB-20K$45.00180AccessoriesIn Stock
Smart Light Bulb E27 (Color)SMART-LB-C$15.99200Smart HomeIn Stock
Digital Photo Frame 8-inchELC-PF-8IN$75.0012Home DecorLow Stock
Entry-Level DSLR Camera KitCAM-DSLR-KIT$599.007CamerasLow Stock
Wireless Router Wi-Fi 6NET-RT-W6$120.0025NetworkingIn Stock
Smart Scale with Body CompositionHEALTH-SC-BC$49.9935Health & FitnessIn Stock
Electric Toothbrush with AppHEALTH-TB-APP$89.000Personal CareOut of Stock
Home Security Camera IndoorSMART-SC-IN$69.9940Smart HomeIn Stock
Portable Mini Fan USBHOME-FN-USB$19.99100Home AppliancesIn Stock
USB Microphone for StreamingAUDIO-MIC-USB$60.0015AudioLow Stock
Universal Travel AdapterELC-TA-UNI$22.00110Travel AccessoriesIn Stock
Digital Kitchen Food ScaleHOME-FS-DIG$29.9960Kitchen AppliancesIn Stock
Smart Thermostat LearningSMART-TH-LRN$180.009Smart HomeLow Stock
Mini Projector Portable HDELC-MP-HD$120.000ProjectorsOut of Stock
Gaming Chair with Lumbar SupportFURN-GC-LUM$250.0010Office FurnitureLow Stock
Electric Hand MixerHOME-MX-ELEC$35.0045Kitchen AppliancesIn Stock
Portable Photo PrinterCAM-PR-PORT$99.0018CamerasIn Stock
Smart Water BottleHEALTH-WB-SMART$30.0070Health & FitnessIn Stock
Bluetooth Car AdapterAUTO-BT-ADP$20.0090Car AccessoriesIn Stock
Air Purifier for HomeHOME-AP-01$149.0010Home AppliancesLow Stock
Robot Toy ProgrammableTOY-RB-PROG$85.0025Toys & GamesIn Stock
External Webcam Full HDELC-WC-FHD$49.0030PC PeripheralsIn Stock
Smart Garden Indoor KitHOME-SG-IN$79.0015Smart HomeLow Stock
Magnetic Phone Car MountAUTO-PM-MAG$15.00150Car AccessoriesIn Stock
Portable Espresso MakerHOME-PM-ESP$60.000Kitchen AppliancesOut of Stock
UV Light Sanitizer BoxHEALTH-UV-BOX$40.0050Personal CareIn Stock
Wireless Charger StandELC-CS-WL$30.0080AccessoriesIn Stock
Dimmable LED Desk LampHOME-DL-LED$45.0025Home DecorLow Stock
VR Headset BasicGAMING-VR-BASIC$199.005GamingLow Stock
Streaming Webcam 1080pELC-WC-1080$55.0030PC PeripheralsIn Stock
Foldable Drone with CameraDRN-FL-CAM$120.0010DronesLow Stock
Smart Wi-Fi Coffee MakerHOME-CM-WIFI$95.0012Kitchen AppliancesLow Stock
Digital Body Fat ScaleHEALTH-BFS-DIG$35.0040Health & FitnessIn Stock
+
+
+
+ +

Recent Orders

+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Order IDCustomerAmountDateStatus
#ORD00101Alice Johnson$249.002023-10-26Delivered
#ORD00102Bob Smith$79.992023-10-25Shipped
#ORD00103Charlie Brown$119.992023-10-25Pending
#ORD00104Diana Prince$29.992023-10-24Delivered
#ORD00105Eve Adams$699.002023-10-24Shipped
#ORD00106Frank White$129.992023-10-23Delivered
#ORD00107Grace Lee$299.002023-10-23Pending
#ORD00108Henry Clark$179.002023-10-22Shipped
#ORD00109Ivy Green$149.992023-10-22Canceled
#ORD00110Jack King$349.992023-10-21Delivered
#ORD00111Karen Hall$89.992023-10-21Shipped
#ORD00112Liam Scott$110.002023-10-20Pending
#ORD00113Mia Baker$59.992023-10-20Delivered
#ORD00114Noah Taylor$49.992023-10-19Shipped
#ORD00115Olivia Miller$79.002023-10-19Canceled
#ORD00116Peter Davis$199.992023-10-18Delivered
#ORD00117Quinn Wilson$39.992023-10-18Shipped
#ORD00118Rachel Moore$150.002023-10-17Pending
#ORD00119Sam Harris$65.002023-10-17Delivered
#ORD00120Tina Young$99.002023-10-16Shipped
#ORD00121Uma Jackson$25.002023-10-16Delivered
#ORD00122Victor Green$349.002023-10-15Pending
#ORD00123Wendy White$199.002023-10-15Canceled
#ORD00124Xavier Bell$299.002023-10-14Shipped
#ORD00125Yara Hall$39.992023-10-14Delivered
#ORD00126Zack Adams$349.992023-10-13Shipped
#ORD00127Amy Brown$99.992023-10-13Pending
#ORD00128Brian Davis$45.002023-10-12Delivered
#ORD00129Chloe Evans$15.992023-10-12Shipped
#ORD00130Daniel Garcia$75.002023-10-11Delivered
#ORD00131Emily Rodriguez$599.002023-10-11Pending
#ORD00132Fiona Martinez$120.002023-10-10Shipped
#ORD00133George Hernandez$49.992023-10-10Delivered
#ORD00134Hannah Lopez$89.002023-10-09Canceled
#ORD00135Isaac Perez$69.992023-10-09Shipped
#ORD00136Julia Garcia$19.992023-10-08Delivered
#ORD00137Kevin Scott$60.002023-10-08Pending
#ORD00138Laura Kim$22.002023-10-07Delivered
#ORD00139Michael Chen$29.992023-10-07Shipped
#ORD00140Nancy Patel$180.002023-10-06Delivered
#ORD00141Chris Evans$75.502026-03-15Delivered
#ORD00142Jessica Alba$120.002026-03-14Shipped
#ORD00143Tom Hanks$35.992026-03-14Pending
#ORD00144Sandra Bullock$200.002026-03-13Delivered
#ORD00145Leonardo DiCaprio$89.992026-03-13Shipped
#ORD00146Julia Roberts$150.002026-03-12Delivered
#ORD00147Brad Pitt$49.002026-03-12Pending
#ORD00148Angelina Jolie$320.002026-03-11Shipped
#ORD00149George Clooney$70.002026-03-11Canceled
#ORD00150Jennifer Aniston$180.002026-03-10Delivered
#ORD00151Dwayne Johnson$99.002026-03-10Shipped
#ORD00152Gal Gadot$65.002026-03-09Pending
#ORD00153Robert Downey Jr.$450.002026-03-09Delivered
#ORD00154Scarlett Johansson$110.002026-03-08Shipped
#ORD00155Chris Hemsworth$75.002026-03-08Delivered
+
+
+
+ +

Account Settings

+ +
+

Profile Information

+ +
+ + + + +
+
+ + + + +
+
+ + + + +
+
+ + + +
+
+
+ +
+

Notification Settings

+ +
+ Email Notifications + +
+
+ SMS Notifications + +
+
+ Push Notifications + +
+
+ + + +
+
+
+
+
+ + diff --git a/evals/data/experiments/001-html-dashboard-ecommerce/outputs/aap/turn-3.json b/evals/data/experiments/001-html-dashboard-ecommerce/outputs/aap/turn-3.json index 33477a9..34635ad 100644 --- a/evals/data/experiments/001-html-dashboard-ecommerce/outputs/aap/turn-3.json +++ b/evals/data/experiments/001-html-dashboard-ecommerce/outputs/aap/turn-3.json @@ -1,8 +1,8 @@ { "protocol": "aap/0.1", - "id": "artifact-id", + "id": "main-artifact", "version": 2, - "name": "edit", + "name": "synthesize", "meta": { "format": "text/html", "tokens_used": null, @@ -11,12 +11,7 @@ }, "content": [ { - "op": "replace", - "target": { - "type": "id", - "value": "root" - }, - "content": "
\n

Dashboard

\n \n
\n

Total Revenue

\n $215,430 (+12.3%)\n
\n
\n
" + "body": "\n\n\n \n \n E-commerce Admin Dashboard\n \n\n\n \n \n \n\n
\n

Dashboard Overview

\n\n \n
\n \n
\n
Total Revenue
\n
$215,430
\n
\n 12.3% vs last month\n
\n
\n
\n\n \n
\n
Total Orders
\n
5,231
\n
\n 8.1% vs last month\n
\n
\n
\n\n \n
\n
New Customers
\n
890
\n
\n -3.2% vs last month\n
\n
\n
\n\n \n
\n
Conversion Rate
\n
3.8%
\n
\n 0.5% vs last month\n
\n
\n
\n
\n
\n\n

Product List

\n \n
\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
NameSKUPriceStockCategoryStatus
Wireless Bluetooth HeadphonesELC-HP-001$79.99150ElectronicsIn Stock
Smartwatch Series 7ELC-SW-007$249.0080WearablesIn Stock
USB-C Fast Charger (65W)ELC-CH-065$29.9930AccessoriesLow Stock
Mechanical Gaming Keyboard RGBPCG-KB-RGB$119.9925PC PeripheralsLow Stock
Portable SSD 1TB USB 3.2STO-SSD-1TB$129.9995StorageIn Stock
4K UHD Smart TV 55-inchELC-TV-55$699.005TelevisionsLow Stock
Ergonomic Office ChairFURN-OC-ERG$299.0012Office FurnitureLow Stock
Noise Cancelling Earbuds ProELC-EB-PRO$179.0070AudioIn Stock
Compact Espresso MachineHOME-CM-ESP$149.990Kitchen AppliancesOut of Stock
Robot Vacuum Cleaner with MopHOME-VC-ROB$349.9918Home AppliancesIn Stock
High-Performance BlenderHOME-BL-HP$89.9940Kitchen AppliancesIn Stock
Digital Air Fryer 5.8QTHOME-AF-5QT$110.0020Kitchen AppliancesLow Stock
Portable Bluetooth Speaker X2ELC-SP-X2$59.9960AudioIn Stock
Gaming Mouse RGB ProPCG-MS-RGB$49.9975PC PeripheralsIn Stock
External Hard Drive 2TBSTO-HDD-2TB$79.000StorageOut of Stock
Mesh Wi-Fi System (3-pack)NET-WF-MESH$199.9910NetworkingLow Stock
Electric Kettle Stainless SteelHOME-KT-SS$39.9955Kitchen AppliancesIn Stock
Smart Doorbell CameraSMART-DB-CAM$150.0022Smart HomeLow Stock
Fitness Tracker with HRELC-FT-HR$65.0090WearablesIn Stock
Digital Drawing Tablet 10-inchART-DT-10$99.0015Creative ToolsLow Stock
Wireless Charging PadELC-WP-001$25.00120AccessoriesIn Stock
Curved Gaming Monitor 27-inchPCG-MN-27C$349.008MonitorsLow Stock
Portable Projector MiniELC-PJ-MINI$199.000ProjectorsOut of Stock
Action Camera 4K ProCAM-AC-4K$299.0010CamerasLow Stock
Smart Plug Wi-Fi (4-pack)SMART-PL-4PK$39.9970Smart HomeIn Stock
Premium Noise-Cancelling Over-Ear HeadphonesELC-HP-NC-PRE$349.9945AudioIn Stock
Wireless Gaming HeadsetPCG-HS-WL$99.9920PC PeripheralsLow Stock
Portable Power Bank 20000mAhELC-PB-20K$45.00180AccessoriesIn Stock
Smart Light Bulb E27 (Color)SMART-LB-C$15.99200Smart HomeIn Stock
Digital Photo Frame 8-inchELC-PF-8IN$75.0012Home DecorLow Stock
Entry-Level DSLR Camera KitCAM-DSLR-KIT$599.007CamerasLow Stock
Wireless Router Wi-Fi 6NET-RT-W6$120.0025NetworkingIn Stock
Smart Scale with Body CompositionHEALTH-SC-BC$49.9935Health & FitnessIn Stock
Electric Toothbrush with AppHEALTH-TB-APP$89.000Personal CareOut of Stock
Home Security Camera IndoorSMART-SC-IN$69.9940Smart HomeIn Stock
Portable Mini Fan USBHOME-FN-USB$19.99100Home AppliancesIn Stock
USB Microphone for StreamingAUDIO-MIC-USB$60.0015AudioLow Stock
Universal Travel AdapterELC-TA-UNI$22.00110Travel AccessoriesIn Stock
Digital Kitchen Food ScaleHOME-FS-DIG$29.9960Kitchen AppliancesIn Stock
Smart Thermostat LearningSMART-TH-LRN$180.009Smart HomeLow Stock
Mini Projector Portable HDELC-MP-HD$120.000ProjectorsOut of Stock
Gaming Chair with Lumbar SupportFURN-GC-LUM$250.0010Office FurnitureLow Stock
Electric Hand MixerHOME-MX-ELEC$35.0045Kitchen AppliancesIn Stock
Portable Photo PrinterCAM-PR-PORT$99.0018CamerasIn Stock
Smart Water BottleHEALTH-WB-SMART$30.0070Health & FitnessIn Stock
Bluetooth Car AdapterAUTO-BT-ADP$20.0090Car AccessoriesIn Stock
Air Purifier for HomeHOME-AP-01$149.0010Home AppliancesLow Stock
Robot Toy ProgrammableTOY-RB-PROG$85.0025Toys & GamesIn Stock
External Webcam Full HDELC-WC-FHD$49.0030PC PeripheralsIn Stock
Smart Garden Indoor KitHOME-SG-IN$79.0015Smart HomeLow Stock
Magnetic Phone Car MountAUTO-PM-MAG$15.00150Car AccessoriesIn Stock
Portable Espresso MakerHOME-PM-ESP$60.000Kitchen AppliancesOut of Stock
UV Light Sanitizer BoxHEALTH-UV-BOX$40.0050Personal CareIn Stock
Wireless Charger StandELC-CS-WL$30.0080AccessoriesIn Stock
Dimmable LED Desk LampHOME-DL-LED$45.0025Home DecorLow Stock
VR Headset BasicGAMING-VR-BASIC$199.005GamingLow Stock
Streaming Webcam 1080pELC-WC-1080$55.0030PC PeripheralsIn Stock
Foldable Drone with CameraDRN-FL-CAM$120.0010DronesLow Stock
Smart Wi-Fi Coffee MakerHOME-CM-WIFI$95.0012Kitchen AppliancesLow Stock
Digital Body Fat ScaleHEALTH-BFS-DIG$35.0040Health & FitnessIn Stock
\n
\n
\n
\n\n

Recent Orders

\n \n
\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
Order IDCustomerAmountDateStatus
#ORD00101Alice Johnson$249.002023-10-26Delivered
#ORD00102Bob Smith$79.992023-10-25Shipped
#ORD00103Charlie Brown$119.992023-10-25Pending
#ORD00104Diana Prince$29.992023-10-24Delivered
#ORD00105Eve Adams$699.002023-10-24Shipped
#ORD00106Frank White$129.992023-10-23Delivered
#ORD00107Grace Lee$299.002023-10-23Pending
#ORD00108Henry Clark$179.002023-10-22Shipped
#ORD00109Ivy Green$149.992023-10-22Canceled
#ORD00110Jack King$349.992023-10-21Delivered
#ORD00111Karen Hall$89.992023-10-21Shipped
#ORD00112Liam Scott$110.002023-10-20Pending
#ORD00113Mia Baker$59.992023-10-20Delivered
#ORD00114Noah Taylor$49.992023-10-19Shipped
#ORD00115Olivia Miller$79.002023-10-19Canceled
#ORD00116Peter Davis$199.992023-10-18Delivered
#ORD00117Quinn Wilson$39.992023-10-18Shipped
#ORD00118Rachel Moore$150.002023-10-17Pending
#ORD00119Sam Harris$65.002023-10-17Delivered
#ORD00120Tina Young$99.002023-10-16Shipped
#ORD00121Uma Jackson$25.002023-10-16Delivered
#ORD00122Victor Green$349.002023-10-15Pending
#ORD00123Wendy White$199.002023-10-15Canceled
#ORD00124Xavier Bell$299.002023-10-14Shipped
#ORD00125Yara Hall$39.992023-10-14Delivered
#ORD00126Zack Adams$349.992023-10-13Shipped
#ORD00127Amy Brown$99.992023-10-13Pending
#ORD00128Brian Davis$45.002023-10-12Delivered
#ORD00129Chloe Evans$15.992023-10-12Shipped
#ORD00130Daniel Garcia$75.002023-10-11Delivered
#ORD00131Emily Rodriguez$599.002023-10-11Pending
#ORD00132Fiona Martinez$120.002023-10-10Shipped
#ORD00133George Hernandez$49.992023-10-10Delivered
#ORD00134Hannah Lopez$89.002023-10-09Canceled
#ORD00135Isaac Perez$69.992023-10-09Shipped
#ORD00136Julia Garcia$19.992023-10-08Delivered
#ORD00137Kevin Scott$60.002023-10-08Pending
#ORD00138Laura Kim$22.002023-10-07Delivered
#ORD00139Michael Chen$29.992023-10-07Shipped
#ORD00140Nancy Patel$180.002023-10-06Delivered
#ORD00141Chris Evans$75.502026-03-15Delivered
#ORD00142Jessica Alba$120.002026-03-14Shipped
#ORD00143Tom Hanks$35.992026-03-14Pending
#ORD00144Sandra Bullock$200.002026-03-13Delivered
#ORD00145Leonardo DiCaprio$89.992026-03-13Shipped
#ORD00146Julia Roberts$150.002026-03-12Delivered
#ORD00147Brad Pitt$49.002026-03-12Pending
#ORD00148Angelina Jolie$320.002026-03-11Shipped
#ORD00149George Clooney$70.002026-03-11Canceled
#ORD00150Jennifer Aniston$180.002026-03-10Delivered
#ORD00151Dwayne Johnson$99.002026-03-10Shipped
#ORD00152Gal Gadot$65.002026-03-09Pending
#ORD00153Robert Downey Jr.$450.002026-03-09Delivered
#ORD00154Scarlett Johansson$110.002026-03-08Shipped
#ORD00155Chris Hemsworth$75.002026-03-08Delivered
\n
\n
\n
\n\n

Account Settings

\n \n
\n

Profile Information

\n \n
\n \n \n \n \n
\n
\n \n \n \n \n
\n
\n \n \n \n \n
\n
\n \n \n \n
\n
\n
\n\n
\n

Notification Settings

\n \n
\n Email Notifications\n \n
\n
\n SMS Notifications\n \n
\n
\n Push Notifications\n \n
\n
\n \n \n \n
\n
\n
\n
\n
\n\n\n" } ] } \ No newline at end of file diff --git a/evals/data/experiments/001-html-dashboard-ecommerce/outputs/aap/turn-4.html b/evals/data/experiments/001-html-dashboard-ecommerce/outputs/aap/turn-4.html index 59c3a96..fd78f0d 100644 --- a/evals/data/experiments/001-html-dashboard-ecommerce/outputs/aap/turn-4.html +++ b/evals/data/experiments/001-html-dashboard-ecommerce/outputs/aap/turn-4.html @@ -1,27 +1,1518 @@ - -
- -
-

Revenue

- $215,430 (+12.3%) -
+ + + + + + E-commerce Admin Dashboard + + + + + -
- -
-

Recent Activity

-
    -
  • User A logged in
  • -
  • User B updated profile
  • -
  • User C made a purchase
  • -
  • User D sent a message
  • -
  • User E viewed dashboard
  • -
  • User F changed settings
  • -
  • User G invited a friend
  • -
  • User H liked a post
  • -
  • User I commented
  • -
  • User J logged out
  • -
+ +
+

Dashboard Overview

+ + +
+ +
+
Total Revenue
+
$215,430
+
+ 12.3% vs last month +
+
+
+ + +
+
Total Orders
+
5,231
+
+ 8.1% vs last month +
+
+
+ + +
+
New Customers
+
890
+
+ -3.2% vs last month +
+
+
+ + +
+
Conversion Rate
+
3.8%
+
+ 0.5% vs last month +
+
+
+
+

Recent Activity

+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
ActionUserDateDetails
Product UpdateJane Smith2023-10-26 14:30Updated "Wireless Bluetooth Headphones" price to $79.99
New OrderSystem2023-10-26 14:15Order #ORD00101 placed by Alice Johnson
Stock AdjustmentJohn Doe2023-10-26 13:00Increased stock for "Smartwatch Series 7" by 10 units
User LoginJane Smith2023-10-26 12:45Logged in from IP: 192.168.1.100
Order Status ChangeAdmin2023-10-26 11:30Order #ORD00102 status changed to "Shipped"
New Product AddedMarketing Team2023-10-25 16:00Added "New Ultra-Thin Laptop"
Customer UpdateSupport Team2023-10-25 10:10Updated email for customer 'Bob Smith'
Settings ChangeJane Smith2023-10-24 09:00Enabled 'SMS Notifications'
Product DeleteAdmin2023-10-23 15:00Removed "Old Printer Model X"
Refund IssuedFinance Dept.2023-10-22 11:45Refund processed for Order #ORD00095
+
+
+
+ +

Product List

+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameSKUPriceStockCategoryStatus
Wireless Bluetooth HeadphonesELC-HP-001$79.99150ElectronicsIn Stock
Smartwatch Series 7ELC-SW-007$249.0080WearablesIn Stock
USB-C Fast Charger (65W)ELC-CH-065$29.9930AccessoriesLow Stock
Mechanical Gaming Keyboard RGBPCG-KB-RGB$119.9925PC PeripheralsLow Stock
Portable SSD 1TB USB 3.2STO-SSD-1TB$129.9995StorageIn Stock
4K UHD Smart TV 55-inchELC-TV-55$699.005TelevisionsLow Stock
Ergonomic Office ChairFURN-OC-ERG$299.0012Office FurnitureLow Stock
Noise Cancelling Earbuds ProELC-EB-PRO$179.0070AudioIn Stock
Compact Espresso MachineHOME-CM-ESP$149.990Kitchen AppliancesOut of Stock
Robot Vacuum Cleaner with MopHOME-VC-ROB$349.9918Home AppliancesIn Stock
High-Performance BlenderHOME-BL-HP$89.9940Kitchen AppliancesIn Stock
Digital Air Fryer 5.8QTHOME-AF-5QT$110.0020Kitchen AppliancesLow Stock
Portable Bluetooth Speaker X2ELC-SP-X2$59.9960AudioIn Stock
Gaming Mouse RGB ProPCG-MS-RGB$49.9975PC PeripheralsIn Stock
External Hard Drive 2TBSTO-HDD-2TB$79.000StorageOut of Stock
Mesh Wi-Fi System (3-pack)NET-WF-MESH$199.9910NetworkingLow Stock
Electric Kettle Stainless SteelHOME-KT-SS$39.9955Kitchen AppliancesIn Stock
Smart Doorbell CameraSMART-DB-CAM$150.0022Smart HomeLow Stock
Fitness Tracker with HRELC-FT-HR$65.0090WearablesIn Stock
Digital Drawing Tablet 10-inchART-DT-10$99.0015Creative ToolsLow Stock
Wireless Charging PadELC-WP-001$25.00120AccessoriesIn Stock
Curved Gaming Monitor 27-inchPCG-MN-27C$349.008MonitorsLow Stock
Portable Projector MiniELC-PJ-MINI$199.000ProjectorsOut of Stock
Action Camera 4K ProCAM-AC-4K$299.0010CamerasLow Stock
Smart Plug Wi-Fi (4-pack)SMART-PL-4PK$39.9970Smart HomeIn Stock
Premium Noise-Cancelling Over-Ear HeadphonesELC-HP-NC-PRE$349.9945AudioIn Stock
Wireless Gaming HeadsetPCG-HS-WL$99.9920PC PeripheralsLow Stock
Portable Power Bank 20000mAhELC-PB-20K$45.00180AccessoriesIn Stock
Smart Light Bulb E27 (Color)SMART-LB-C$15.99200Smart HomeIn Stock
Digital Photo Frame 8-inchELC-PF-8IN$75.0012Home DecorLow Stock
Entry-Level DSLR Camera KitCAM-DSLR-KIT$599.007CamerasLow Stock
Wireless Router Wi-Fi 6NET-RT-W6$120.0025NetworkingIn Stock
Smart Scale with Body CompositionHEALTH-SC-BC$49.9935Health & FitnessIn Stock
Electric Toothbrush with AppHEALTH-TB-APP$89.000Personal CareOut of Stock
Home Security Camera IndoorSMART-SC-IN$69.9940Smart HomeIn Stock
Portable Mini Fan USBHOME-FN-USB$19.99100Home AppliancesIn Stock
USB Microphone for StreamingAUDIO-MIC-USB$60.0015AudioLow Stock
Universal Travel AdapterELC-TA-UNI$22.00110Travel AccessoriesIn Stock
Digital Kitchen Food ScaleHOME-FS-DIG$29.9960Kitchen AppliancesIn Stock
Smart Thermostat LearningSMART-TH-LRN$180.009Smart HomeLow Stock
Mini Projector Portable HDELC-MP-HD$120.000ProjectorsOut of Stock
Gaming Chair with Lumbar SupportFURN-GC-LUM$250.0010Office FurnitureLow Stock
Electric Hand MixerHOME-MX-ELEC$35.0045Kitchen AppliancesIn Stock
Portable Photo PrinterCAM-PR-PORT$99.0018CamerasIn Stock
Smart Water BottleHEALTH-WB-SMART$30.0070Health & FitnessIn Stock
Bluetooth Car AdapterAUTO-BT-ADP$20.0090Car AccessoriesIn Stock
Air Purifier for HomeHOME-AP-01$149.0010Home AppliancesLow Stock
Robot Toy ProgrammableTOY-RB-PROG$85.0025Toys & GamesIn Stock
External Webcam Full HDELC-WC-FHD$49.0030PC PeripheralsIn Stock
Smart Garden Indoor KitHOME-SG-IN$79.0015Smart HomeLow Stock
Magnetic Phone Car MountAUTO-PM-MAG$15.00150Car AccessoriesIn Stock
Portable Espresso MakerHOME-PM-ESP$60.000Kitchen AppliancesOut of Stock
UV Light Sanitizer BoxHEALTH-UV-BOX$40.0050Personal CareIn Stock
Wireless Charger StandELC-CS-WL$30.0080AccessoriesIn Stock
Dimmable LED Desk LampHOME-DL-LED$45.0025Home DecorLow Stock
VR Headset BasicGAMING-VR-BASIC$199.005GamingLow Stock
Streaming Webcam 1080pELC-WC-1080$55.0030PC PeripheralsIn Stock
Foldable Drone with CameraDRN-FL-CAM$120.0010DronesLow Stock
Smart Wi-Fi Coffee MakerHOME-CM-WIFI$95.0012Kitchen AppliancesLow Stock
Digital Body Fat ScaleHEALTH-BFS-DIG$35.0040Health & FitnessIn Stock
+
+
+
+ +

Recent Orders

+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Order IDCustomerAmountDateStatus
#ORD00101Alice Johnson$249.002023-10-26Delivered
#ORD00102Bob Smith$79.992023-10-25Shipped
#ORD00103Charlie Brown$119.992023-10-25Pending
#ORD00104Diana Prince$29.992023-10-24Delivered
#ORD00105Eve Adams$699.002023-10-24Shipped
#ORD00106Frank White$129.992023-10-23Delivered
#ORD00107Grace Lee$299.002023-10-23Pending
#ORD00108Henry Clark$179.002023-10-22Shipped
#ORD00109Ivy Green$149.992023-10-22Canceled
#ORD00110Jack King$349.992023-10-21Delivered
#ORD00111Karen Hall$89.992023-10-21Shipped
#ORD00112Liam Scott$110.002023-10-20Pending
#ORD00113Mia Baker$59.992023-10-20Delivered
#ORD00114Noah Taylor$49.992023-10-19Shipped
#ORD00115Olivia Miller$79.002023-10-19Canceled
#ORD00116Peter Davis$199.992023-10-18Delivered
#ORD00117Quinn Wilson$39.992023-10-18Shipped
#ORD00118Rachel Moore$150.002023-10-17Pending
#ORD00119Sam Harris$65.002023-10-17Delivered
#ORD00120Tina Young$99.002023-10-16Shipped
#ORD00121Uma Jackson$25.002023-10-16Delivered
#ORD00122Victor Green$349.002023-10-15Pending
#ORD00123Wendy White$199.002023-10-15Canceled
#ORD00124Xavier Bell$299.002023-10-14Shipped
#ORD00125Yara Hall$39.992023-10-14Delivered
#ORD00126Zack Adams$349.992023-10-13Shipped
#ORD00127Amy Brown$99.992023-10-13Pending
#ORD00128Brian Davis$45.002023-10-12Delivered
#ORD00129Chloe Evans$15.992023-10-12Shipped
#ORD00130Daniel Garcia$75.002023-10-11Delivered
#ORD00131Emily Rodriguez$599.002023-10-11Pending
#ORD00132Fiona Martinez$120.002023-10-10Shipped
#ORD00133George Hernandez$49.992023-10-10Delivered
#ORD00134Hannah Lopez$89.002023-10-09Canceled
#ORD00135Isaac Perez$69.992023-10-09Shipped
#ORD00136Julia Garcia$19.992023-10-08Delivered
#ORD00137Kevin Scott$60.002023-10-08Pending
#ORD00138Laura Kim$22.002023-10-07Delivered
#ORD00139Michael Chen$29.992023-10-07Shipped
#ORD00140Nancy Patel$180.002023-10-06Delivered
#ORD00141Chris Evans$75.502026-03-15Delivered
#ORD00142Jessica Alba$120.002026-03-14Shipped
#ORD00143Tom Hanks$35.992026-03-14Pending
#ORD00144Sandra Bullock$200.002026-03-13Delivered
#ORD00145Leonardo DiCaprio$89.992026-03-13Shipped
#ORD00146Julia Roberts$150.002026-03-12Delivered
#ORD00147Brad Pitt$49.002026-03-12Pending
#ORD00148Angelina Jolie$320.002026-03-11Shipped
#ORD00149George Clooney$70.002026-03-11Canceled
#ORD00150Jennifer Aniston$180.002026-03-10Delivered
#ORD00151Dwayne Johnson$99.002026-03-10Shipped
#ORD00152Gal Gadot$65.002026-03-09Pending
#ORD00153Robert Downey Jr.$450.002026-03-09Delivered
#ORD00154Scarlett Johansson$110.002026-03-08Shipped
#ORD00155Chris Hemsworth$75.002026-03-08Delivered
+
+
+
+ +

Account Settings

+ +
+

Profile Information

+ +
+ + + + +
+
+ + + + +
+
+ + + + +
+
+ + + +
+
+
+ +
+

Notification Settings

+ +
+ Email Notifications + +
+
+ SMS Notifications + +
+
+ Push Notifications + +
+
+ + + +
+
+
+
- - \ No newline at end of file + + diff --git a/evals/data/experiments/001-html-dashboard-ecommerce/outputs/aap/turn-4.json b/evals/data/experiments/001-html-dashboard-ecommerce/outputs/aap/turn-4.json index 868dfbe..cbb3a8f 100644 --- a/evals/data/experiments/001-html-dashboard-ecommerce/outputs/aap/turn-4.json +++ b/evals/data/experiments/001-html-dashboard-ecommerce/outputs/aap/turn-4.json @@ -1,8 +1,8 @@ { "protocol": "aap/0.1", - "id": "dashboard-artifact", + "id": "ecommerce-dashboard", "version": 1, - "name": "synthesize", + "name": "edit", "meta": { "format": "text/html", "tokens_used": null, @@ -11,7 +11,12 @@ }, "content": [ { - "body": "\n
\n \n
\n

Revenue

\n $215,430 (+12.3%)\n
\n
\n
\n \n
\n

Recent Activity

\n
    \n
  • User A logged in
  • \n
  • User B updated profile
  • \n
  • User C made a purchase
  • \n
  • User D sent a message
  • \n
  • User E viewed dashboard
  • \n
  • User F changed settings
  • \n
  • User G invited a friend
  • \n
  • User H liked a post
  • \n
  • User I commented
  • \n
  • User J logged out
  • \n
\n
\n
\n
" + "op": "insert_after", + "target": { + "type": "id", + "value": "stats-section" + }, + "content": "

Recent Activity

\n \n
\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
ActionUserDateDetails
Product UpdateJane Smith2023-10-26 14:30Updated \"Wireless Bluetooth Headphones\" price to $79.99
New OrderSystem2023-10-26 14:15Order #ORD00101 placed by Alice Johnson
Stock AdjustmentJohn Doe2023-10-26 13:00Increased stock for \"Smartwatch Series 7\" by 10 units
User LoginJane Smith2023-10-26 12:45Logged in from IP: 192.168.1.100
Order Status ChangeAdmin2023-10-26 11:30Order #ORD00102 status changed to \"Shipped\"
New Product AddedMarketing Team2023-10-25 16:00Added \"New Ultra-Thin Laptop\"
Customer UpdateSupport Team2023-10-25 10:10Updated email for customer 'Bob Smith'
Settings ChangeJane Smith2023-10-24 09:00Enabled 'SMS Notifications'
Product DeleteAdmin2023-10-23 15:00Removed \"Old Printer Model X\"
Refund IssuedFinance Dept.2023-10-22 11:45Refund processed for Order #ORD00095
\n
\n
\n
" } ] } \ No newline at end of file diff --git a/evals/data/experiments/001-html-dashboard-ecommerce/outputs/base/turn-0.html b/evals/data/experiments/001-html-dashboard-ecommerce/outputs/base/turn-0.html index b8f1541..81602fc 100644 --- a/evals/data/experiments/001-html-dashboard-ecommerce/outputs/base/turn-0.html +++ b/evals/data/experiments/001-html-dashboard-ecommerce/outputs/base/turn-0.html @@ -2,76 +2,1397 @@ - Admin Dashboard - + + E-commerce Admin Dashboard - + + +
-
-

Total Revenue

$54,230

↑ 12%
-

Orders

1,204

↑ 5%
-

Customers

842

↑ 2%
-

Conversion

3.2%

↓ 0.5%
+ +
+
+
Total Revenue
+
$1,234,567
+
+ ▲ 12.5% this month +
+
+
+
Orders
+
2,890
+
+ ▲ 8.2% this month +
+
+
+
Customers
+
1,540
+
+ ▼ 1.3% this month +
+
+
+
Conversion Rate
+
3.45%
+
+ ▲ 0.5% this month +
+
-
-

Products Inventory

- - + +

Products

+
+
NameSKUPriceStockStatus
+ + + + + + + + + + - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Product NameSKUPriceStockCategoryStatus
Wireless Bluetooth HeadphonesSKU-8345-78$89.99125ElectronicsIn Stock
4K Ultra HD Smart TVSKU-1234-56$799.0045ElectronicsIn Stock
Smartwatch Fitness TrackerSKU-5678-12$129.508ElectronicsLow Stock
Portable Power Bank 20000mAhSKU-9012-34$34.99210ElectronicsIn Stock
Noise Cancelling EarbudsSKU-3456-78$149.9960ElectronicsIn Stock
Gaming Laptop 15-inchSKU-7890-12$1199.9915ElectronicsIn Stock
Mechanical Keyboard RGBSKU-2345-67$99.9990OfficeIn Stock
Ergonomic Office ChairSKU-6789-01$249.005OfficeLow Stock
USB-C Hub MultiportSKU-0123-45$49.99150ElectronicsIn Stock
External SSD 1TBSKU-4567-89$119.9930ElectronicsIn Stock
Robot Vacuum CleanerSKU-8901-23$299.0012Smart HomeIn Stock
Air Fryer 5LSKU-1234-50$89.9970Home & KitchenIn Stock
Coffee Maker ProgrammableSKU-5678-90$75.000Home & KitchenOut of Stock
Smart Home Security CameraSKU-9012-30$69.9918Smart HomeIn Stock
LED Desk LampSKU-3456-70$29.99100OfficeIn Stock
Electric Toothbrush SonicSKU-7890-10$59.9940Personal CareIn Stock
Water Bottle SmartSKU-2345-60$24.99130OutdoorsIn Stock
Travel Backpack Anti-TheftSKU-6789-00$55.0022OutdoorsIn Stock
Digital Camera MirrorlessSKU-0123-40$899.997ElectronicsLow Stock
Drone with 4K CameraSKU-4567-80$499.0010ElectronicsIn Stock
Portable ProjectorSKU-8901-20$199.0025ElectronicsIn Stock
Virtual Reality HeadsetSKU-1234-51$399.9918ElectronicsIn Stock
Smart LED Strip LightsSKU-5678-91$25.00200Smart HomeIn Stock
Standing Desk ConverterSKU-9012-31$179.9930OfficeIn Stock
Wireless Charging PadSKU-3456-71$19.99180ElectronicsIn Stock
Gaming Mouse RGBSKU-7890-11$45.0095OfficeIn Stock
Monitor Ultra-WideSKU-2345-61$349.0012ElectronicsIn Stock
Soundbar with SubwooferSKU-6789-01$189.9920ElectronicsIn Stock
E-Reader PaperwhiteSKU-0123-41$119.0055ElectronicsIn Stock
Electric Kettle SmartSKU-4567-81$65.000Home & KitchenOut of Stock
Blender High-SpeedSKU-8901-21$99.0035Home & KitchenIn Stock
Espresso MachineSKU-1234-52$499.008Home & KitchenLow Stock
Home Theater SystemSKU-5678-92$699.996ElectronicsLow Stock
Mesh Wi-Fi SystemSKU-9012-32$199.9925Smart HomeIn Stock
Network Attached Storage (NAS)SKU-3456-72$299.0010ElectronicsIn Stock
Smart Plug MiniSKU-7890-12$15.00300Smart HomeIn Stock
Dash Cam Front & RearSKU-2345-62$120.0040ElectronicsIn Stock
Car Jump Starter PortableSKU-6789-02$80.0015OutdoorsIn Stock
Bike Trainer SmartSKU-0123-42$299.007OutdoorsLow Stock
GPS Running WatchSKU-4567-82$199.0020Personal CareIn Stock
Blood Pressure Monitor SmartSKU-8901-22$49.9950Personal CareIn Stock
Scale Smart Body FatSKU-1234-53$35.0070Personal CareIn Stock
Massage Gun PercussionSKU-5678-93$149.0010Personal CareIn Stock
Heated Blanket ElectricSKU-9012-33$79.000Home & KitchenOut of Stock
Humidifier SmartSKU-3456-73$55.0045Smart HomeIn Stock
Air Purifier HEPASKU-7890-13$159.0018Smart HomeIn Stock
Portable AC UnitSKU-2345-63$399.005Home & KitchenLow Stock
Smart ThermostatSKU-6789-03$149.0022Smart HomeIn Stock
Video Doorbell ProSKU-0123-43$179.0012Smart HomeIn Stock
Smart Lock KeylessSKU-4567-83$129.009Smart HomeLow Stock
Electric Scooter FoldingSKU-8901-23$599.003OutdoorsLow Stock
Electric SkateboardSKU-1234-54$449.002OutdoorsLow Stock
Electric BicycleSKU-5678-94$899.001OutdoorsLow Stock
Workout HeadphonesSKU-9012-34$79.0080ElectronicsIn Stock
Outdoor Smart PlugSKU-3456-74$29.00110Smart HomeIn Stock
Garden Sprinkler SmartSKU-7890-14$69.0020Smart HomeIn Stock
Pet Feeder SmartSKU-2345-64$75.0015Smart HomeIn Stock
Dog Camera Treat DispenserSKU-6789-04$120.0010Smart HomeIn Stock
Cat Litter Box Self-CleaningSKU-0123-44$499.004Home & KitchenLow Stock
Smart Wi-Fi Router AX6000SKU-4567-84$189.9928ElectronicsIn Stock
Portable Espresso MakerSKU-8901-24$79.9935Home & KitchenIn Stock
Instant Photo PrinterSKU-1234-55$119.0017ElectronicsIn Stock
Electric Fireplace HeaterSKU-5678-95$159.006Home & KitchenLow Stock
Smart Light Bulbs 4-PackSKU-9012-35$49.99190Smart HomeIn Stock
Gaming Headset WirelessSKU-3456-75$110.0050ElectronicsIn Stock
UV Sanitizer for PhoneSKU-7890-15$39.9985Personal CareIn Stock
Electric Grill IndoorSKU-2345-65$95.0011Home & KitchenIn Stock
Digital Voice RecorderSKU-6789-05$49.0060OfficeIn Stock
Portable Air CompressorSKU-0123-45$65.0025OutdoorsIn Stock
Fitness Trampoline MiniSKU-4567-85$85.008OutdoorsLow Stock
Window Cleaning RobotSKU-8901-25$249.005Smart HomeLow Stock
Smart Plant PotSKU-1234-56$55.0030Smart HomeIn Stock
Electric Wine Opener SetSKU-5678-96$39.0070Home & KitchenIn Stock
Car Phone Mount Wireless ChargerSKU-9012-36$29.99140ElectronicsIn Stock
Portable Document ScannerSKU-3456-76$169.0012OfficeIn Stock
Solar Garden Lights 10-PackSKU-7890-16$45.0090OutdoorsIn Stock
Smart Mirror for BathroomSKU-2345-66$299.003Smart HomeLow Stock
-
+
-
-

Recent Orders

- - + +

Recent Orders

+
+
IDCustomerAmountDateStatus
+ + + + + + + + + - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Order IDCustomerAmountOrder DateStatus
ORD-473950Charlie Brown$145.002023-10-26Processing
ORD-894723Henry Wilson$780.252023-10-22Delivered
ORD-120567Alice Johnson$320.502023-10-28Pending
ORD-678910Bob Williams$89.992023-10-27Shipped
ORD-345678Diana Prince$1200.002023-10-25Delivered
ORD-987654Eve Davis$45.752023-10-24Processing
ORD-112233Frank Miller$210.002023-10-23Shipped
ORD-554433Grace Taylor$75.502023-10-21Delivered
ORD-678543Ivy Moore$550.002023-10-20Pending
ORD-234567Jack White$19.992023-10-19Processing
ORD-876543Karen Green$123.452023-10-18Shipped
ORD-098765Liam Hall$999.992023-10-17Delivered
ORD-111222Mia King$67.892023-10-16Pending
ORD-333444Noah Wright$345.602023-10-15Processing
ORD-555666Olivia Scott$12.302023-10-14Shipped
ORD-777888Peter Adams$78.902023-10-13Delivered
ORD-999000Quinn Lewis$150.002023-10-12Pending
ORD-222111Rachel Lee$29.992023-10-11Processing
ORD-444555Sam Clark$499.002023-10-10Shipped
ORD-666777Tina Young$18.502023-10-09Delivered
ORD-888999Uma Hernandez$670.002023-10-08Pending
ORD-000111Victor Garcia$25.002023-10-07Processing
ORD-121314Wendy Lopez$99.002023-10-06Shipped
ORD-151617Xavier Rodriguez$300.002023-10-05Delivered
ORD-181920Yara Martinez$79.002023-10-04Pending
ORD-212223Zack Anderson$149.002023-10-03Processing
ORD-242526Alice Johnson$50.002023-10-02Shipped
ORD-272829Bob Williams$12.992023-10-01Delivered
ORD-303132Charlie Brown$80.002023-09-30Pending
ORD-333435Diana Prince$240.002023-09-29Processing
ORD-363738Eve Davis$1500.002023-09-28Shipped
ORD-394041Frank Miller$65.002023-09-27Delivered
ORD-424344Grace Taylor$199.002023-09-26Pending
ORD-454647Henry Wilson$30.002023-09-25Processing
ORD-484950Ivy Moore$450.002023-09-24Shipped
ORD-515253Jack White$22.002023-09-23Delivered
ORD-545556Karen Green$88.882023-09-22Pending
ORD-575859Liam Hall$765.002023-09-21Processing
ORD-606162Mia King$120.002023-09-20Shipped
ORD-636465Noah Wright$33.502023-09-19Delivered
ORD-666768Olivia Scott$99.992023-09-18Pending
ORD-697071Peter Adams$55.002023-09-17Processing
ORD-727374Quinn Lewis$12.002023-09-16Shipped
ORD-757677Rachel Lee$250.002023-09-15Delivered
ORD-787980Sam Clark$75.002023-09-14Pending
ORD-818283Tina Young$110.002023-09-13Processing
ORD-848586Uma Hernandez$400.002023-09-12Shipped
-
+
-
-

Account Settings

+ +

Account Settings

+
- - - - +
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+ +

Notification Preferences

+ +
+ + +
+
+ + +
+
+ + +
+
+ + +
+ +
+ +
-
+
+
\ No newline at end of file diff --git a/evals/data/experiments/001-html-dashboard-ecommerce/outputs/base/turn-1.html b/evals/data/experiments/001-html-dashboard-ecommerce/outputs/base/turn-1.html index 6aa3cc3..aac042e 100644 --- a/evals/data/experiments/001-html-dashboard-ecommerce/outputs/base/turn-1.html +++ b/evals/data/experiments/001-html-dashboard-ecommerce/outputs/base/turn-1.html @@ -2,80 +2,1404 @@ - Admin Dashboard - + + E-commerce Admin Dashboard - + + +
-
+ +
+
+
Total Revenue
+
$215,430
+
+ ▲ 12.3% this month +
+
+
+
Orders
+
2,890
+
+ ▲ 8.2% this month +
+
+
+
Customers
+
1,540
+
+ ▼ 1.3% this month +
+
-

Total Revenue

-

$215,430

- +12.3% +
Conversion Rate
+
3.45%
+
+ ▲ 0.5% this month +
-

Orders

1,204

↑ 5%
-

Customers

842

↑ 2%
-

Conversion

3.2%

↓ 0.5%
-
-

Products Inventory

- - + +

Products

+
+
NameSKUPriceStockStatus
+ + + + + + + + + + - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Product NameSKUPriceStockCategoryStatus
Wireless Bluetooth HeadphonesSKU-8345-78$89.99125ElectronicsIn Stock
4K Ultra HD Smart TVSKU-1234-56$799.0045ElectronicsIn Stock
Smartwatch Fitness TrackerSKU-5678-12$129.508ElectronicsLow Stock
Portable Power Bank 20000mAhSKU-9012-34$34.99210ElectronicsIn Stock
Noise Cancelling EarbudsSKU-3456-78$149.9960ElectronicsIn Stock
Gaming Laptop 15-inchSKU-7890-12$1199.9915ElectronicsIn Stock
Mechanical Keyboard RGBSKU-2345-67$99.9990OfficeIn Stock
Ergonomic Office ChairSKU-6789-01$249.005OfficeLow Stock
USB-C Hub MultiportSKU-0123-45$49.99150ElectronicsIn Stock
External SSD 1TBSKU-4567-89$119.9930ElectronicsIn Stock
Robot Vacuum CleanerSKU-8901-23$299.0012Smart HomeIn Stock
Air Fryer 5LSKU-1234-50$89.9970Home & KitchenIn Stock
Coffee Maker ProgrammableSKU-5678-90$75.000Home & KitchenOut of Stock
Smart Home Security CameraSKU-9012-30$69.9918Smart HomeIn Stock
LED Desk LampSKU-3456-70$29.99100OfficeIn Stock
Electric Toothbrush SonicSKU-7890-10$59.9940Personal CareIn Stock
Water Bottle SmartSKU-2345-60$24.99130OutdoorsIn Stock
Travel Backpack Anti-TheftSKU-6789-00$55.0022OutdoorsIn Stock
Digital Camera MirrorlessSKU-0123-40$899.997ElectronicsLow Stock
Drone with 4K CameraSKU-4567-80$499.0010ElectronicsIn Stock
Portable ProjectorSKU-8901-20$199.0025ElectronicsIn Stock
Virtual Reality HeadsetSKU-1234-51$399.9918ElectronicsIn Stock
Smart LED Strip LightsSKU-5678-91$25.00200Smart HomeIn Stock
Standing Desk ConverterSKU-9012-31$179.9930OfficeIn Stock
Wireless Charging PadSKU-3456-71$19.99180ElectronicsIn Stock
Gaming Mouse RGBSKU-7890-11$45.0095OfficeIn Stock
Monitor Ultra-WideSKU-2345-61$349.0012ElectronicsIn Stock
Soundbar with SubwooferSKU-6789-01$189.9920ElectronicsIn Stock
E-Reader PaperwhiteSKU-0123-41$119.0055ElectronicsIn Stock
Electric Kettle SmartSKU-4567-81$65.000Home & KitchenOut of Stock
Blender High-SpeedSKU-8901-21$99.0035Home & KitchenIn Stock
Espresso MachineSKU-1234-52$499.008Home & KitchenLow Stock
Home Theater SystemSKU-5678-92$699.996ElectronicsLow Stock
Mesh Wi-Fi SystemSKU-9012-32$199.9925Smart HomeIn Stock
Network Attached Storage (NAS)SKU-3456-72$299.0010ElectronicsIn Stock
Smart Plug MiniSKU-7890-12$15.00300Smart HomeIn Stock
Dash Cam Front & RearSKU-2345-62$120.0040ElectronicsIn Stock
Car Jump Starter PortableSKU-6789-02$80.0015OutdoorsIn Stock
Bike Trainer SmartSKU-0123-42$299.007OutdoorsLow Stock
GPS Running WatchSKU-4567-82$199.0020Personal CareIn Stock
Blood Pressure Monitor SmartSKU-8901-22$49.9950Personal CareIn Stock
Scale Smart Body FatSKU-1234-53$35.0070Personal CareIn Stock
Massage Gun PercussionSKU-5678-93$149.0010Personal CareIn Stock
Heated Blanket ElectricSKU-9012-33$79.000Home & KitchenOut of Stock
Humidifier SmartSKU-3456-73$55.0045Smart HomeIn Stock
Air Purifier HEPASKU-7890-13$159.0018Smart HomeIn Stock
Portable AC UnitSKU-2345-63$399.005Home & KitchenLow Stock
Smart ThermostatSKU-6789-03$149.0022Smart HomeIn Stock
Video Doorbell ProSKU-0123-43$179.0012Smart HomeIn Stock
Smart Lock KeylessSKU-4567-83$129.009Smart HomeLow Stock
Electric Scooter FoldingSKU-8901-23$599.003OutdoorsLow Stock
Electric SkateboardSKU-1234-54$449.002OutdoorsLow Stock
Electric BicycleSKU-5678-94$899.001OutdoorsLow Stock
Workout HeadphonesSKU-9012-34$79.0080ElectronicsIn Stock
Outdoor Smart PlugSKU-3456-74$29.00110Smart HomeIn Stock
Garden Sprinkler SmartSKU-7890-14$69.0020Smart HomeIn Stock
Pet Feeder SmartSKU-2345-64$75.0015Smart HomeIn Stock
Dog Camera Treat DispenserSKU-6789-04$120.0010Smart HomeIn Stock
Cat Litter Box Self-CleaningSKU-0123-44$499.004Home & KitchenLow Stock
Smart Wi-Fi Router AX6000SKU-4567-84$189.9928ElectronicsIn Stock
Portable Espresso MakerSKU-8901-24$79.9935Home & KitchenIn Stock
Instant Photo PrinterSKU-1234-55$119.0017ElectronicsIn Stock
Electric Fireplace HeaterSKU-5678-95$159.006Home & KitchenLow Stock
Smart Light Bulbs 4-PackSKU-9012-35$49.99190Smart HomeIn Stock
Gaming Headset WirelessSKU-3456-75$110.0050ElectronicsIn Stock
UV Sanitizer for PhoneSKU-7890-15$39.9985Personal CareIn Stock
Electric Grill IndoorSKU-2345-65$95.0011Home & KitchenIn Stock
Digital Voice RecorderSKU-6789-05$49.0060OfficeIn Stock
Portable Air CompressorSKU-0123-45$65.0025OutdoorsIn Stock
Fitness Trampoline MiniSKU-4567-85$85.008OutdoorsLow Stock
Window Cleaning RobotSKU-8901-25$249.005Smart HomeLow Stock
Smart Plant PotSKU-1234-56$55.0030Smart HomeIn Stock
Electric Wine Opener SetSKU-5678-96$39.0070Home & KitchenIn Stock
Car Phone Mount Wireless ChargerSKU-9012-36$29.99140ElectronicsIn Stock
Portable Document ScannerSKU-3456-76$169.0012OfficeIn Stock
Solar Garden Lights 10-PackSKU-7890-16$45.0090OutdoorsIn Stock
Smart Mirror for BathroomSKU-2345-66$299.003Smart HomeLow Stock
-
+
-
-

Recent Orders

- - + +

Recent Orders

+
+
IDCustomerAmountDateStatus
+ + + + + + + + + - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Order IDCustomerAmountOrder DateStatus
ORD-473950Charlie Brown$145.002023-10-26Processing
ORD-894723Henry Wilson$780.252023-10-22Delivered
ORD-120567Alice Johnson$320.502023-10-28Pending
ORD-678910Bob Williams$89.992023-10-27Shipped
ORD-345678Diana Prince$1200.002023-10-25Delivered
ORD-987654Eve Davis$45.752023-10-24Processing
ORD-112233Frank Miller$210.002023-10-23Shipped
ORD-554433Grace Taylor$75.502023-10-21Delivered
ORD-678543Ivy Moore$550.002023-10-20Pending
ORD-234567Jack White$19.992023-10-19Processing
ORD-876543Karen Green$123.452023-10-18Shipped
ORD-098765Liam Hall$999.992023-10-17Delivered
ORD-111222Mia King$67.892023-10-16Pending
ORD-333444Noah Wright$345.602023-10-15Processing
ORD-555666Olivia Scott$12.302023-10-14Shipped
ORD-777888Peter Adams$78.902023-10-13Delivered
ORD-999000Quinn Lewis$150.002023-10-12Pending
ORD-222111Rachel Lee$29.992023-10-11Processing
ORD-444555Sam Clark$499.002023-10-10Shipped
ORD-666777Tina Young$18.502023-10-09Delivered
ORD-888999Uma Hernandez$670.002023-10-08Pending
ORD-000111Victor Garcia$25.002023-10-07Processing
ORD-121314Wendy Lopez$99.002023-10-06Shipped
ORD-151617Xavier Rodriguez$300.002023-10-05Delivered
ORD-181920Yara Martinez$79.002023-10-04Pending
ORD-212223Zack Anderson$149.002023-10-03Processing
ORD-242526Alice Johnson$50.002023-10-02Shipped
ORD-272829Bob Williams$12.992023-10-01Delivered
ORD-303132Charlie Brown$80.002023-09-30Pending
ORD-333435Diana Prince$240.002023-09-29Processing
ORD-363738Eve Davis$1500.002023-09-28Shipped
ORD-394041Frank Miller$65.002023-09-27Delivered
ORD-424344Grace Taylor$199.002023-09-26Pending
ORD-454647Henry Wilson$30.002023-09-25Processing
ORD-484950Ivy Moore$450.002023-09-24Shipped
ORD-515253Jack White$22.002023-09-23Delivered
ORD-545556Karen Green$88.882023-09-22Pending
ORD-575859Liam Hall$765.002023-09-21Processing
ORD-606162Mia King$120.002023-09-20Shipped
ORD-636465Noah Wright$33.502023-09-19Delivered
ORD-666768Olivia Scott$99.992023-09-18Pending
ORD-697071Peter Adams$55.002023-09-17Processing
ORD-727374Quinn Lewis$12.002023-09-16Shipped
ORD-757677Rachel Lee$250.002023-09-15Delivered
ORD-787980Sam Clark$75.002023-09-14Pending
ORD-818283Tina Young$110.002023-09-13Processing
ORD-848586Uma Hernandez$400.002023-09-12Shipped
-
+
-
-

Account Settings

+ +

Account Settings

+
- - - - +
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+ +

Notification Preferences

+ +
+ + +
+
+ + +
+
+ + +
+
+ + +
+ +
+ +
-
+ + \ No newline at end of file diff --git a/evals/data/experiments/001-html-dashboard-ecommerce/outputs/base/turn-2.html b/evals/data/experiments/001-html-dashboard-ecommerce/outputs/base/turn-2.html index 4459981..7716423 100644 --- a/evals/data/experiments/001-html-dashboard-ecommerce/outputs/base/turn-2.html +++ b/evals/data/experiments/001-html-dashboard-ecommerce/outputs/base/turn-2.html @@ -2,85 +2,1510 @@ - Admin Dashboard - + + E-commerce Admin Dashboard - + + +
-
+ +
+
+
Total Revenue
+
$215,430
+
+ ▲ 12.3% this month +
+
+
+
Orders
+
2,890
+
+ ▲ 8.2% this month +
+
+
+
Customers
+
1,540
+
+ ▼ 1.3% this month +
+
-

Total Revenue

-

$215,430

- +12.3% +
Conversion Rate
+
3.45%
+
+ ▲ 0.5% this month +
-

Orders

1,204

↑ 5%
-

Customers

842

↑ 2%
-

Conversion

3.2%

↓ 0.5%
-
-

Products Inventory

- - + +

Products

+
+
NameSKUPriceStockStatus
+ + + + + + + + + + - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Product NameSKUPriceStockCategoryStatus
Wireless Bluetooth HeadphonesSKU-8345-78$89.99125ElectronicsIn Stock
4K Ultra HD Smart TVSKU-1234-56$799.0045ElectronicsIn Stock
Smartwatch Fitness TrackerSKU-5678-12$129.508ElectronicsLow Stock
Portable Power Bank 20000mAhSKU-9012-34$34.99210ElectronicsIn Stock
Noise Cancelling EarbudsSKU-3456-78$149.9960ElectronicsIn Stock
Gaming Laptop 15-inchSKU-7890-12$1199.9915ElectronicsIn Stock
Mechanical Keyboard RGBSKU-2345-67$99.9990OfficeIn Stock
Ergonomic Office ChairSKU-6789-01$249.005OfficeLow Stock
USB-C Hub MultiportSKU-0123-45$49.99150ElectronicsIn Stock
External SSD 1TBSKU-4567-89$119.9930ElectronicsIn Stock
Robot Vacuum CleanerSKU-8901-23$299.0012Smart HomeIn Stock
Air Fryer 5LSKU-1234-50$89.9970Home & KitchenIn Stock
Coffee Maker ProgrammableSKU-5678-90$75.000Home & KitchenOut of Stock
Smart Home Security CameraSKU-9012-30$69.9918Smart HomeIn Stock
LED Desk LampSKU-3456-70$29.99100OfficeIn Stock
Electric Toothbrush SonicSKU-7890-10$59.9940Personal CareIn Stock
Water Bottle SmartSKU-2345-60$24.99130OutdoorsIn Stock
Travel Backpack Anti-TheftSKU-6789-00$55.0022OutdoorsIn Stock
Digital Camera MirrorlessSKU-0123-40$899.997ElectronicsLow Stock
Drone with 4K CameraSKU-4567-80$499.0010ElectronicsIn Stock
Portable ProjectorSKU-8901-20$199.0025ElectronicsIn Stock
Virtual Reality HeadsetSKU-1234-51$399.9918ElectronicsIn Stock
Smart LED Strip LightsSKU-5678-91$25.00200Smart HomeIn Stock
Standing Desk ConverterSKU-9012-31$179.9930OfficeIn Stock
Wireless Charging PadSKU-3456-71$19.99180ElectronicsIn Stock
Gaming Mouse RGBSKU-7890-11$45.0095OfficeIn Stock
Monitor Ultra-WideSKU-2345-61$349.0012ElectronicsIn Stock
Soundbar with SubwooferSKU-6789-01$189.9920ElectronicsIn Stock
E-Reader PaperwhiteSKU-0123-41$119.0055ElectronicsIn Stock
Electric Kettle SmartSKU-4567-81$65.000Home & KitchenOut of Stock
Blender High-SpeedSKU-8901-21$99.0035Home & KitchenIn Stock
Espresso MachineSKU-1234-52$499.008Home & KitchenLow Stock
Home Theater SystemSKU-5678-92$699.996ElectronicsLow Stock
Mesh Wi-Fi SystemSKU-9012-32$199.9925Smart HomeIn Stock
Network Attached Storage (NAS)SKU-3456-72$299.0010ElectronicsIn Stock
Smart Plug MiniSKU-7890-12$15.00300Smart HomeIn Stock
Dash Cam Front & RearSKU-2345-62$120.0040ElectronicsIn Stock
Car Jump Starter PortableSKU-6789-02$80.0015OutdoorsIn Stock
Bike Trainer SmartSKU-0123-42$299.007OutdoorsLow Stock
GPS Running WatchSKU-4567-82$199.0020Personal CareIn Stock
Blood Pressure Monitor SmartSKU-8901-22$49.9950Personal CareIn Stock
Scale Smart Body FatSKU-1234-53$35.0070Personal CareIn Stock
Massage Gun PercussionSKU-5678-93$149.0010Personal CareIn Stock
Heated Blanket ElectricSKU-9012-33$79.000Home & KitchenOut of Stock
Humidifier SmartSKU-3456-73$55.0045Smart HomeIn Stock
Air Purifier HEPASKU-7890-13$159.0018Smart HomeIn Stock
Portable AC UnitSKU-2345-63$399.005Home & KitchenLow Stock
Smart ThermostatSKU-6789-03$149.0022Smart HomeIn Stock
Video Doorbell ProSKU-0123-43$179.0012Smart HomeIn Stock
Smart Lock KeylessSKU-4567-83$129.009Smart HomeLow Stock
Electric Scooter FoldingSKU-8901-23$599.003OutdoorsLow Stock
Electric SkateboardSKU-1234-54$449.002OutdoorsLow Stock
Electric BicycleSKU-5678-94$899.001OutdoorsLow Stock
Workout HeadphonesSKU-9012-34$79.0080ElectronicsIn Stock
Outdoor Smart PlugSKU-3456-74$29.00110Smart HomeIn Stock
Garden Sprinkler SmartSKU-7890-14$69.0020Smart HomeIn Stock
Pet Feeder SmartSKU-2345-64$75.0015Smart HomeIn Stock
Dog Camera Treat DispenserSKU-6789-04$120.0010Smart HomeIn Stock
Cat Litter Box Self-CleaningSKU-0123-44$499.004Home & KitchenLow Stock
Smart Wi-Fi Router AX6000SKU-4567-84$189.9928ElectronicsIn Stock
Portable Espresso MakerSKU-8901-24$79.9935Home & KitchenIn Stock
Instant Photo PrinterSKU-1234-55$119.0017ElectronicsIn Stock
Electric Fireplace HeaterSKU-5678-95$159.006Home & KitchenLow Stock
Smart Light Bulbs 4-PackSKU-9012-35$49.99190Smart HomeIn Stock
Gaming Headset WirelessSKU-3456-75$110.0050ElectronicsIn Stock
UV Sanitizer for PhoneSKU-7890-15$39.9985Personal CareIn Stock
Electric Grill IndoorSKU-2345-65$95.0011Home & KitchenIn Stock
Digital Voice RecorderSKU-6789-05$49.0060OfficeIn Stock
Portable Air CompressorSKU-0123-45$65.0025OutdoorsIn Stock
Fitness Trampoline MiniSKU-4567-85$85.008OutdoorsLow Stock
Window Cleaning RobotSKU-8901-25$249.005Smart HomeLow Stock
Smart Plant PotSKU-1234-56$55.0030Smart HomeIn Stock
Electric Wine Opener SetSKU-5678-96$39.0070Home & KitchenIn Stock
Car Phone Mount Wireless ChargerSKU-9012-36$29.99140ElectronicsIn Stock
Portable Document ScannerSKU-3456-76$169.0012OfficeIn Stock
Solar Garden Lights 10-PackSKU-7890-16$45.0090OutdoorsIn Stock
Smart Mirror for BathroomSKU-2345-66$299.003Smart HomeLow Stock
-
+
-
-

Recent Orders (March 2026)

- - + +

Recent Orders

+
+
IDCustomerAmountDateStatus
+ + + + + + + + + - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Order IDCustomerAmountOrder DateStatus
ORD-473950Charlie Brown$145.002023-10-26Processing
ORD-894723Henry Wilson$780.252023-10-22Delivered
ORD-120567Alice Johnson$320.502023-10-28Pending
ORD-678910Bob Williams$89.992023-10-27Shipped
ORD-345678Diana Prince$1200.002023-10-25Delivered
ORD-987654Eve Davis$45.752023-10-24Processing
ORD-112233Frank Miller$210.002023-10-23Shipped
ORD-554433Grace Taylor$75.502023-10-21Delivered
ORD-678543Ivy Moore$550.002023-10-20Pending
ORD-234567Jack White$19.992023-10-19Processing
ORD-876543Karen Green$123.452023-10-18Shipped
ORD-098765Liam Hall$999.992023-10-17Delivered
ORD-111222Mia King$67.892023-10-16Pending
ORD-333444Noah Wright$345.602023-10-15Processing
ORD-555666Olivia Scott$12.302023-10-14Shipped
ORD-777888Peter Adams$78.902023-10-13Delivered
ORD-999000Quinn Lewis$150.002023-10-12Pending
ORD-222111Rachel Lee$29.992023-10-11Processing
ORD-444555Sam Clark$499.002023-10-10Shipped
ORD-666777Tina Young$18.502023-10-09Delivered
ORD-888999Uma Hernandez$670.002023-10-08Pending
ORD-000111Victor Garcia$25.002023-10-07Processing
ORD-121314Wendy Lopez$99.002023-10-06Shipped
ORD-151617Xavier Rodriguez$300.002023-10-05Delivered
ORD-181920Yara Martinez$79.002023-10-04Pending
ORD-212223Zack Anderson$149.002023-10-03Processing
ORD-242526Alice Johnson$50.002023-10-02Shipped
ORD-272829Bob Williams$12.992023-10-01Delivered
ORD-303132Charlie Brown$80.002023-09-30Pending
ORD-333435Diana Prince$240.002023-09-29Processing
ORD-363738Eve Davis$1500.002023-09-28Shipped
ORD-394041Frank Miller$65.002023-09-27Delivered
ORD-424344Grace Taylor$199.002023-09-26Pending
ORD-454647Henry Wilson$30.002023-09-25Processing
ORD-484950Ivy Moore$450.002023-09-24Shipped
ORD-515253Jack White$22.002023-09-23Delivered
ORD-545556Karen Green$88.882023-09-22Pending
ORD-575859Liam Hall$765.002023-09-21Processing
ORD-606162Mia King$120.002023-09-20Shipped
ORD-636465Noah Wright$33.502023-09-19Delivered
ORD-666768Olivia Scott$99.992023-09-18Pending
ORD-697071Peter Adams$55.002023-09-17Processing
ORD-727374Quinn Lewis$12.002023-09-16Shipped
ORD-757677Rachel Lee$250.002023-09-15Delivered
ORD-787980Sam Clark$75.002023-09-14Pending
ORD-818283Tina Young$110.002023-09-13Processing
ORD-848586Uma Hernandez$400.002023-09-12Shipped
ORD-900001Laura Croft$210.502026-03-31Processing
ORD-900002Markus Jensen$85.002026-03-30Shipped
ORD-900003Nadia Khan$15.992026-03-29Pending
ORD-900004Oscar Reed$340.002026-03-28Delivered
ORD-900005Penny Lane$55.252026-03-27Processing
ORD-900006Quentin Blake$1299.002026-03-26Shipped
ORD-900007Renee Dupont$72.802026-03-25Pending
ORD-900008Steven King$450.002026-03-24Delivered
ORD-900009Tanya Adams$22.002026-03-23Cancelled
ORD-900010Ulysses Grant$180.002026-03-22Processing
ORD-900011Violet Stone$39.992026-03-21Shipped
ORD-900012Walter White$799.002026-03-20Delivered
ORD-900013Xenia Bell$99.002026-03-19Pending
ORD-900014Yusuf Amir$25.502026-03-18Processing
ORD-900015Zara Chen$112.752026-03-17Shipped
-
+
-
-

Account Settings

+ +

Account Settings

+
- - - - +
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+ +

Notification Preferences

+ +
+ + +
+
+ + +
+
+ + +
+
+ + +
+ +
+ +
-
+ + \ No newline at end of file diff --git a/evals/data/experiments/001-html-dashboard-ecommerce/outputs/base/turn-3.html b/evals/data/experiments/001-html-dashboard-ecommerce/outputs/base/turn-3.html index a77570f..b0a5f91 100644 --- a/evals/data/experiments/001-html-dashboard-ecommerce/outputs/base/turn-3.html +++ b/evals/data/experiments/001-html-dashboard-ecommerce/outputs/base/turn-3.html @@ -2,83 +2,1510 @@ - Admin Dashboard - + + E-commerce Admin Dashboard - + + +
-
+ +
+
+
Total Revenue
+
$215,430
+
+ ▲ 12.3% this month +
+
+
+
Orders
+
2,890
+
+ ▲ 8.2% this month +
+
+
+
Customers
+
1,540
+
+ ▼ 1.3% this month +
+
-

Total Revenue

-

$215,430

- +12.3% +
Conversion Rate
+
3.45%
+
+ ▲ 0.5% this month +
-

Orders

1,204

↑ 5%
-

Customers

842

↑ 2%
-

Conversion

3.2%

↓ 0.5%
-
-

Products Inventory

- - + +

Products

+
+
NameSKUPriceStockStatus
+ + + + + + + + + + - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Product NameSKUPriceStockCategoryStatus
Wireless Bluetooth HeadphonesSKU-8345-78$89.99125ElectronicsIn Stock
4K Ultra HD Smart TVSKU-1234-56$799.0045ElectronicsIn Stock
Smartwatch Fitness TrackerSKU-5678-12$129.508ElectronicsLow Stock
Portable Power Bank 20000mAhSKU-9012-34$34.99210ElectronicsIn Stock
Noise Cancelling EarbudsSKU-3456-78$149.9960ElectronicsIn Stock
Gaming Laptop 15-inchSKU-7890-12$1199.9915ElectronicsIn Stock
Mechanical Keyboard RGBSKU-2345-67$99.9990OfficeIn Stock
Ergonomic Office ChairSKU-6789-01$249.005OfficeLow Stock
USB-C Hub MultiportSKU-0123-45$49.99150ElectronicsIn Stock
External SSD 1TBSKU-4567-89$119.9930ElectronicsIn Stock
Robot Vacuum CleanerSKU-8901-23$299.0012Smart HomeIn Stock
Air Fryer 5LSKU-1234-50$89.9970Home & KitchenIn Stock
Coffee Maker ProgrammableSKU-5678-90$75.000Home & KitchenOut of Stock
Smart Home Security CameraSKU-9012-30$69.9918Smart HomeIn Stock
LED Desk LampSKU-3456-70$29.99100OfficeIn Stock
Electric Toothbrush SonicSKU-7890-10$59.9940Personal CareIn Stock
Water Bottle SmartSKU-2345-60$24.99130OutdoorsIn Stock
Travel Backpack Anti-TheftSKU-6789-00$55.0022OutdoorsIn Stock
Digital Camera MirrorlessSKU-0123-40$899.997ElectronicsLow Stock
Drone with 4K CameraSKU-4567-80$499.0010ElectronicsIn Stock
Portable ProjectorSKU-8901-20$199.0025ElectronicsIn Stock
Virtual Reality HeadsetSKU-1234-51$399.9918ElectronicsIn Stock
Smart LED Strip LightsSKU-5678-91$25.00200Smart HomeIn Stock
Standing Desk ConverterSKU-9012-31$179.9930OfficeIn Stock
Wireless Charging PadSKU-3456-71$19.99180ElectronicsIn Stock
Gaming Mouse RGBSKU-7890-11$45.0095OfficeIn Stock
Monitor Ultra-WideSKU-2345-61$349.0012ElectronicsIn Stock
Soundbar with SubwooferSKU-6789-01$189.9920ElectronicsIn Stock
E-Reader PaperwhiteSKU-0123-41$119.0055ElectronicsIn Stock
Electric Kettle SmartSKU-4567-81$65.000Home & KitchenOut of Stock
Blender High-SpeedSKU-8901-21$99.0035Home & KitchenIn Stock
Espresso MachineSKU-1234-52$499.008Home & KitchenLow Stock
Home Theater SystemSKU-5678-92$699.996ElectronicsLow Stock
Mesh Wi-Fi SystemSKU-9012-32$199.9925Smart HomeIn Stock
Network Attached Storage (NAS)SKU-3456-72$299.0010ElectronicsIn Stock
Smart Plug MiniSKU-7890-12$15.00300Smart HomeIn Stock
Dash Cam Front & RearSKU-2345-62$120.0040ElectronicsIn Stock
Car Jump Starter PortableSKU-6789-02$80.0015OutdoorsIn Stock
Bike Trainer SmartSKU-0123-42$299.007OutdoorsLow Stock
GPS Running WatchSKU-4567-82$199.0020Personal CareIn Stock
Blood Pressure Monitor SmartSKU-8901-22$49.9950Personal CareIn Stock
Scale Smart Body FatSKU-1234-53$35.0070Personal CareIn Stock
Massage Gun PercussionSKU-5678-93$149.0010Personal CareIn Stock
Heated Blanket ElectricSKU-9012-33$79.000Home & KitchenOut of Stock
Humidifier SmartSKU-3456-73$55.0045Smart HomeIn Stock
Air Purifier HEPASKU-7890-13$159.0018Smart HomeIn Stock
Portable AC UnitSKU-2345-63$399.005Home & KitchenLow Stock
Smart ThermostatSKU-6789-03$149.0022Smart HomeIn Stock
Video Doorbell ProSKU-0123-43$179.0012Smart HomeIn Stock
Smart Lock KeylessSKU-4567-83$129.009Smart HomeLow Stock
Electric Scooter FoldingSKU-8901-23$599.003OutdoorsLow Stock
Electric SkateboardSKU-1234-54$449.002OutdoorsLow Stock
Electric BicycleSKU-5678-94$899.001OutdoorsLow Stock
Workout HeadphonesSKU-9012-34$79.0080ElectronicsIn Stock
Outdoor Smart PlugSKU-3456-74$29.00110Smart HomeIn Stock
Garden Sprinkler SmartSKU-7890-14$69.0020Smart HomeIn Stock
Pet Feeder SmartSKU-2345-64$75.0015Smart HomeIn Stock
Dog Camera Treat DispenserSKU-6789-04$120.0010Smart HomeIn Stock
Cat Litter Box Self-CleaningSKU-0123-44$499.004Home & KitchenLow Stock
Smart Wi-Fi Router AX6000SKU-4567-84$189.9928ElectronicsIn Stock
Portable Espresso MakerSKU-8901-24$79.9935Home & KitchenIn Stock
Instant Photo PrinterSKU-1234-55$119.0017ElectronicsIn Stock
Electric Fireplace HeaterSKU-5678-95$159.006Home & KitchenLow Stock
Smart Light Bulbs 4-PackSKU-9012-35$49.99190Smart HomeIn Stock
Gaming Headset WirelessSKU-3456-75$110.0050ElectronicsIn Stock
UV Sanitizer for PhoneSKU-7890-15$39.9985Personal CareIn Stock
Electric Grill IndoorSKU-2345-65$95.0011Home & KitchenIn Stock
Digital Voice RecorderSKU-6789-05$49.0060OfficeIn Stock
Portable Air CompressorSKU-0123-45$65.0025OutdoorsIn Stock
Fitness Trampoline MiniSKU-4567-85$85.008OutdoorsLow Stock
Window Cleaning RobotSKU-8901-25$249.005Smart HomeLow Stock
Smart Plant PotSKU-1234-56$55.0030Smart HomeIn Stock
Electric Wine Opener SetSKU-5678-96$39.0070Home & KitchenIn Stock
Car Phone Mount Wireless ChargerSKU-9012-36$29.99140ElectronicsIn Stock
Portable Document ScannerSKU-3456-76$169.0012OfficeIn Stock
Solar Garden Lights 10-PackSKU-7890-16$45.0090OutdoorsIn Stock
Smart Mirror for BathroomSKU-2345-66$299.003Smart HomeLow Stock
-
+
-
-

Recent Orders (March 2026)

- - + +

Recent Orders

+
+
IDCustomerAmountDateStatus
+ + + + + + + + + - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Order IDCustomerAmountOrder DateStatus
ORD-473950Charlie Brown$145.002023-10-26Processing
ORD-894723Henry Wilson$780.252023-10-22Delivered
ORD-120567Alice Johnson$320.502023-10-28Pending
ORD-678910Bob Williams$89.992023-10-27Shipped
ORD-345678Diana Prince$1200.002023-10-25Delivered
ORD-987654Eve Davis$45.752023-10-24Processing
ORD-112233Frank Miller$210.002023-10-23Shipped
ORD-554433Grace Taylor$75.502023-10-21Delivered
ORD-678543Ivy Moore$550.002023-10-20Pending
ORD-234567Jack White$19.992023-10-19Processing
ORD-876543Karen Green$123.452023-10-18Shipped
ORD-098765Liam Hall$999.992023-10-17Delivered
ORD-111222Mia King$67.892023-10-16Pending
ORD-333444Noah Wright$345.602023-10-15Processing
ORD-555666Olivia Scott$12.302023-10-14Shipped
ORD-777888Peter Adams$78.902023-10-13Delivered
ORD-999000Quinn Lewis$150.002023-10-12Pending
ORD-222111Rachel Lee$29.992023-10-11Processing
ORD-444555Sam Clark$499.002023-10-10Shipped
ORD-666777Tina Young$18.502023-10-09Delivered
ORD-888999Uma Hernandez$670.002023-10-08Pending
ORD-000111Victor Garcia$25.002023-10-07Processing
ORD-121314Wendy Lopez$99.002023-10-06Shipped
ORD-151617Xavier Rodriguez$300.002023-10-05Delivered
ORD-181920Yara Martinez$79.002023-10-04Pending
ORD-212223Zack Anderson$149.002023-10-03Processing
ORD-242526Alice Johnson$50.002023-10-02Shipped
ORD-272829Bob Williams$12.992023-10-01Delivered
ORD-303132Charlie Brown$80.002023-09-30Pending
ORD-333435Diana Prince$240.002023-09-29Processing
ORD-363738Eve Davis$1500.002023-09-28Shipped
ORD-394041Frank Miller$65.002023-09-27Delivered
ORD-424344Grace Taylor$199.002023-09-26Pending
ORD-454647Henry Wilson$30.002023-09-25Processing
ORD-484950Ivy Moore$450.002023-09-24Shipped
ORD-515253Jack White$22.002023-09-23Delivered
ORD-545556Karen Green$88.882023-09-22Pending
ORD-575859Liam Hall$765.002023-09-21Processing
ORD-606162Mia King$120.002023-09-20Shipped
ORD-636465Noah Wright$33.502023-09-19Delivered
ORD-666768Olivia Scott$99.992023-09-18Pending
ORD-697071Peter Adams$55.002023-09-17Processing
ORD-727374Quinn Lewis$12.002023-09-16Shipped
ORD-757677Rachel Lee$250.002023-09-15Delivered
ORD-787980Sam Clark$75.002023-09-14Pending
ORD-818283Tina Young$110.002023-09-13Processing
ORD-848586Uma Hernandez$400.002023-09-12Shipped
ORD-900001Laura Croft$210.502026-03-31Processing
ORD-900002Markus Jensen$85.002026-03-30Shipped
ORD-900003Nadia Khan$15.992026-03-29Pending
ORD-900004Oscar Reed$340.002026-03-28Delivered
ORD-900005Penny Lane$55.252026-03-27Processing
ORD-900006Quentin Blake$1299.002026-03-26Shipped
ORD-900007Renee Dupont$72.802026-03-25Pending
ORD-900008Steven King$450.002026-03-24Delivered
ORD-900009Tanya Adams$22.002026-03-23Cancelled
ORD-900010Ulysses Grant$180.002026-03-22Processing
ORD-900011Violet Stone$39.992026-03-21Shipped
ORD-900012Walter White$799.002026-03-20Delivered
ORD-900013Xenia Bell$99.002026-03-19Pending
ORD-900014Yusuf Amir$25.502026-03-18Processing
ORD-900015Zara Chen$112.752026-03-17Shipped
-
+
-
-

Account Settings

+ +

Account Settings

+
- - - - +
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+ +

Notification Preferences

+ +
+ + +
+
+ + +
+
+ + +
+
+ + +
+ +
+ +
-
+ + \ No newline at end of file diff --git a/evals/data/experiments/001-html-dashboard-ecommerce/outputs/base/turn-4.html b/evals/data/experiments/001-html-dashboard-ecommerce/outputs/base/turn-4.html index c61a2a0..4ab4b8f 100644 --- a/evals/data/experiments/001-html-dashboard-ecommerce/outputs/base/turn-4.html +++ b/evals/data/experiments/001-html-dashboard-ecommerce/outputs/base/turn-4.html @@ -2,97 +2,1595 @@ - Admin Dashboard - + + E-commerce Admin Dashboard - + + +
-
+ +
+
+
Total Revenue
+
$215,430
+
+ ▲ 12.3% this month +
+
+
+
Orders
+
2,890
+
+ ▲ 8.2% this month +
+
+
+
Customers
+
1,540
+
+ ▼ 1.3% this month +
+
-

Total Revenue

-

$215,430

- +12.3% +
Conversion Rate
+
3.45%
+
+ ▲ 0.5% this month +
-

Orders

1,204

↑ 5%
-

Customers

842

↑ 2%
-

Conversion

3.2%

↓ 0.5%
-
-

Recent Activity

-
- -
-
+ +

Recent Activity

+
+
    +
  • + Updated product "Wireless Bluetooth Headphones" stock to 125 + 2026-04-01 10:30 AM +
  • +
  • + New order ORD-900015 placed by Zara Chen + 2026-03-31 09:15 AM +
  • +
  • + Added new product "Smart Mirror for Bathroom" + 2026-03-30 04:00 PM +
  • +
  • + Customer "Markus Jensen" updated shipping address + 2026-03-30 11:00 AM +
  • +
  • + Order ORD-900004 changed status to "Delivered" + 2026-03-29 02:45 PM +
  • +
  • + User "AdminUser" logged in successfully + 2026-03-29 08:00 AM +
  • +
  • + Product "Coffee Maker Programmable" is now out of stock + 2026-03-28 01:20 PM +
  • +
  • + New order ORD-900005 placed by Penny Lane + 2026-03-27 05:00 PM +
  • +
  • + Changed password for "John Doe" account + 2026-03-27 10:00 AM +
  • +
  • + Reviewed pending order ORD-900007 + 2026-03-26 03:30 PM +
  • +
+
-
-

Products Inventory

- - + +

Products

+
+
NameSKUPriceStockStatus
+ + + + + + + + + + - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Product NameSKUPriceStockCategoryStatus
Wireless Bluetooth HeadphonesSKU-8345-78$89.99125ElectronicsIn Stock
4K Ultra HD Smart TVSKU-1234-56$799.0045ElectronicsIn Stock
Smartwatch Fitness TrackerSKU-5678-12$129.508ElectronicsLow Stock
Portable Power Bank 20000mAhSKU-9012-34$34.99210ElectronicsIn Stock
Noise Cancelling EarbudsSKU-3456-78$149.9960ElectronicsIn Stock
Gaming Laptop 15-inchSKU-7890-12$1199.9915ElectronicsIn Stock
Mechanical Keyboard RGBSKU-2345-67$99.9990OfficeIn Stock
Ergonomic Office ChairSKU-6789-01$249.005OfficeLow Stock
USB-C Hub MultiportSKU-0123-45$49.99150ElectronicsIn Stock
External SSD 1TBSKU-4567-89$119.9930ElectronicsIn Stock
Robot Vacuum CleanerSKU-8901-23$299.0012Smart HomeIn Stock
Air Fryer 5LSKU-1234-50$89.9970Home & KitchenIn Stock
Coffee Maker ProgrammableSKU-5678-90$75.000Home & KitchenOut of Stock
Smart Home Security CameraSKU-9012-30$69.9918Smart HomeIn Stock
LED Desk LampSKU-3456-70$29.99100OfficeIn Stock
Electric Toothbrush SonicSKU-7890-10$59.9940Personal CareIn Stock
Water Bottle SmartSKU-2345-60$24.99130OutdoorsIn Stock
Travel Backpack Anti-TheftSKU-6789-00$55.0022OutdoorsIn Stock
Digital Camera MirrorlessSKU-0123-40$899.997ElectronicsLow Stock
Drone with 4K CameraSKU-4567-80$499.0010ElectronicsIn Stock
Portable ProjectorSKU-8901-20$199.0025ElectronicsIn Stock
Virtual Reality HeadsetSKU-1234-51$399.9918ElectronicsIn Stock
Smart LED Strip LightsSKU-5678-91$25.00200Smart HomeIn Stock
Standing Desk ConverterSKU-9012-31$179.9930OfficeIn Stock
Wireless Charging PadSKU-3456-71$19.99180ElectronicsIn Stock
Gaming Mouse RGBSKU-7890-11$45.0095OfficeIn Stock
Monitor Ultra-WideSKU-2345-61$349.0012ElectronicsIn Stock
Soundbar with SubwooferSKU-6789-01$189.9920ElectronicsIn Stock
E-Reader PaperwhiteSKU-0123-41$119.0055ElectronicsIn Stock
Electric Kettle SmartSKU-4567-81$65.000Home & KitchenOut of Stock
Blender High-SpeedSKU-8901-21$99.0035Home & KitchenIn Stock
Espresso MachineSKU-1234-52$499.008Home & KitchenLow Stock
Home Theater SystemSKU-5678-92$699.996ElectronicsLow Stock
Mesh Wi-Fi SystemSKU-9012-32$199.9925Smart HomeIn Stock
Network Attached Storage (NAS)SKU-3456-72$299.0010ElectronicsIn Stock
Smart Plug MiniSKU-7890-12$15.00300Smart HomeIn Stock
Dash Cam Front & RearSKU-2345-62$120.0040ElectronicsIn Stock
Car Jump Starter PortableSKU-6789-02$80.0015OutdoorsIn Stock
Bike Trainer SmartSKU-0123-42$299.007OutdoorsLow Stock
GPS Running WatchSKU-4567-82$199.0020Personal CareIn Stock
Blood Pressure Monitor SmartSKU-8901-22$49.9950Personal CareIn Stock
Scale Smart Body FatSKU-1234-53$35.0070Personal CareIn Stock
Massage Gun PercussionSKU-5678-93$149.0010Personal CareIn Stock
Heated Blanket ElectricSKU-9012-33$79.000Home & KitchenOut of Stock
Humidifier SmartSKU-3456-73$55.0045Smart HomeIn Stock
Air Purifier HEPASKU-7890-13$159.0018Smart HomeIn Stock
Portable AC UnitSKU-2345-63$399.005Home & KitchenLow Stock
Smart ThermostatSKU-6789-03$149.0022Smart HomeIn Stock
Video Doorbell ProSKU-0123-43$179.0012Smart HomeIn Stock
Smart Lock KeylessSKU-4567-83$129.009Smart HomeLow Stock
Electric Scooter FoldingSKU-8901-23$599.003OutdoorsLow Stock
Electric SkateboardSKU-1234-54$449.002OutdoorsLow Stock
Electric BicycleSKU-5678-94$899.001OutdoorsLow Stock
Workout HeadphonesSKU-9012-34$79.0080ElectronicsIn Stock
Outdoor Smart PlugSKU-3456-74$29.00110Smart HomeIn Stock
Garden Sprinkler SmartSKU-7890-14$69.0020Smart HomeIn Stock
Pet Feeder SmartSKU-2345-64$75.0015Smart HomeIn Stock
Dog Camera Treat DispenserSKU-6789-04$120.0010Smart HomeIn Stock
Cat Litter Box Self-CleaningSKU-0123-44$499.004Home & KitchenLow Stock
Smart Wi-Fi Router AX6000SKU-4567-84$189.9928ElectronicsIn Stock
Portable Espresso MakerSKU-8901-24$79.9935Home & KitchenIn Stock
Instant Photo PrinterSKU-1234-55$119.0017ElectronicsIn Stock
Electric Fireplace HeaterSKU-5678-95$159.006Home & KitchenLow Stock
Smart Light Bulbs 4-PackSKU-9012-35$49.99190Smart HomeIn Stock
Gaming Headset WirelessSKU-3456-75$110.0050ElectronicsIn Stock
UV Sanitizer for PhoneSKU-7890-15$39.9985Personal CareIn Stock
Electric Grill IndoorSKU-2345-65$95.0011Home & KitchenIn Stock
Digital Voice RecorderSKU-6789-05$49.0060OfficeIn Stock
Portable Air CompressorSKU-0123-45$65.0025OutdoorsIn Stock
Fitness Trampoline MiniSKU-4567-85$85.008OutdoorsLow Stock
Window Cleaning RobotSKU-8901-25$249.005Smart HomeLow Stock
Smart Plant PotSKU-1234-56$55.0030Smart HomeIn Stock
Electric Wine Opener SetSKU-5678-96$39.0070Home & KitchenIn Stock
Car Phone Mount Wireless ChargerSKU-9012-36$29.99140ElectronicsIn Stock
Portable Document ScannerSKU-3456-76$169.0012OfficeIn Stock
Solar Garden Lights 10-PackSKU-7890-16$45.0090OutdoorsIn Stock
Smart Mirror for BathroomSKU-2345-66$299.003Smart HomeLow Stock
-
+
-
-

Recent Orders (March 2026)

- - + +

Recent Orders

+
+
IDCustomerAmountDateStatus
+ + + + + + + + + - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Order IDCustomerAmountOrder DateStatus
ORD-473950Charlie Brown$145.002023-10-26Processing
ORD-894723Henry Wilson$780.252023-10-22Delivered
ORD-120567Alice Johnson$320.502023-10-28Pending
ORD-678910Bob Williams$89.992023-10-27Shipped
ORD-345678Diana Prince$1200.002023-10-25Delivered
ORD-987654Eve Davis$45.752023-10-24Processing
ORD-112233Frank Miller$210.002023-10-23Shipped
ORD-554433Grace Taylor$75.502023-10-21Delivered
ORD-678543Ivy Moore$550.002023-10-20Pending
ORD-234567Jack White$19.992023-10-19Processing
ORD-876543Karen Green$123.452023-10-18Shipped
ORD-098765Liam Hall$999.992023-10-17Delivered
ORD-111222Mia King$67.892023-10-16Pending
ORD-333444Noah Wright$345.602023-10-15Processing
ORD-555666Olivia Scott$12.302023-10-14Shipped
ORD-777888Peter Adams$78.902023-10-13Delivered
ORD-999000Quinn Lewis$150.002023-10-12Pending
ORD-222111Rachel Lee$29.992023-10-11Processing
ORD-444555Sam Clark$499.002023-10-10Shipped
ORD-666777Tina Young$18.502023-10-09Delivered
ORD-888999Uma Hernandez$670.002023-10-08Pending
ORD-000111Victor Garcia$25.002023-10-07Processing
ORD-121314Wendy Lopez$99.002023-10-06Shipped
ORD-151617Xavier Rodriguez$300.002023-10-05Delivered
ORD-181920Yara Martinez$79.002023-10-04Pending
ORD-212223Zack Anderson$149.002023-10-03Processing
ORD-242526Alice Johnson$50.002023-10-02Shipped
ORD-272829Bob Williams$12.992023-10-01Delivered
ORD-303132Charlie Brown$80.002023-09-30Pending
ORD-333435Diana Prince$240.002023-09-29Processing
ORD-363738Eve Davis$1500.002023-09-28Shipped
ORD-394041Frank Miller$65.002023-09-27Delivered
ORD-424344Grace Taylor$199.002023-09-26Pending
ORD-454647Henry Wilson$30.002023-09-25Processing
ORD-484950Ivy Moore$450.002023-09-24Shipped
ORD-515253Jack White$22.002023-09-23Delivered
ORD-545556Karen Green$88.882023-09-22Pending
ORD-575859Liam Hall$765.002023-09-21Processing
ORD-606162Mia King$120.002023-09-20Shipped
ORD-636465Noah Wright$33.502023-09-19Delivered
ORD-666768Olivia Scott$99.992023-09-18Pending
ORD-697071Peter Adams$55.002023-09-17Processing
ORD-727374Quinn Lewis$12.002023-09-16Shipped
ORD-757677Rachel Lee$250.002023-09-15Delivered
ORD-787980Sam Clark$75.002023-09-14Pending
ORD-818283Tina Young$110.002023-09-13Processing
ORD-848586Uma Hernandez$400.002023-09-12Shipped
ORD-900001Laura Croft$210.502026-03-31Processing
ORD-900002Markus Jensen$85.002026-03-30Shipped
ORD-900003Nadia Khan$15.992026-03-29Pending
ORD-900004Oscar Reed$340.002026-03-28Delivered
ORD-900005Penny Lane$55.252026-03-27Processing
ORD-900006Quentin Blake$1299.002026-03-26Shipped
ORD-900007Renee Dupont$72.802026-03-25Pending
ORD-900008Steven King$450.002026-03-24Delivered
ORD-900009Tanya Adams$22.002026-03-23Cancelled
ORD-900010Ulysses Grant$180.002026-03-22Processing
ORD-900011Violet Stone$39.992026-03-21Shipped
ORD-900012Walter White$799.002026-03-20Delivered
ORD-900013Xenia Bell$99.002026-03-19Pending
ORD-900014Yusuf Amir$25.502026-03-18Processing
ORD-900015Zara Chen$112.752026-03-17Shipped
-
+
-
-

Account Settings

+ +

Account Settings

+
- - - - +
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+ +

Notification Preferences

+ +
+ + +
+
+ + +
+
+ + +
+
+ + +
+ +
+ +
-
+ + \ No newline at end of file diff --git a/evals/data/experiments/002-html-dashboard-analytics/eval.json b/evals/data/experiments/002-html-dashboard-analytics/eval.json deleted file mode 100644 index 89f3001..0000000 --- a/evals/data/experiments/002-html-dashboard-analytics/eval.json +++ /dev/null @@ -1,71 +0,0 @@ -{ - "per_turn": [ - { - "turn": 0, - "sequence_similarity": 0.2009, - "token_f1": 0.5509, - "base_char_count": 3785, - "aap_char_count": 2815, - "char_delta_pct": -25.6, - "lines_added": 62, - "lines_removed": 74, - "rouge_l": null, - "bleu": null - }, - { - "turn": 1, - "sequence_similarity": 0.2036, - "token_f1": 0.526, - "base_char_count": 3968, - "aap_char_count": 2879, - "char_delta_pct": -27.4, - "lines_added": 62, - "lines_removed": 80, - "rouge_l": null, - "bleu": null - }, - { - "turn": 2, - "sequence_similarity": 0.2034, - "token_f1": 0.4793, - "base_char_count": 4511, - "aap_char_count": 4624, - "char_delta_pct": 2.5, - "lines_added": 79, - "lines_removed": 88, - "rouge_l": null, - "bleu": null - }, - { - "turn": 3, - "sequence_similarity": 0.1782, - "token_f1": 0.4602, - "base_char_count": 4746, - "aap_char_count": 5175, - "char_delta_pct": 9.0, - "lines_added": 89, - "lines_removed": 91, - "rouge_l": null, - "bleu": null - }, - { - "turn": 4, - "sequence_similarity": 0.1521, - "token_f1": 0.4212, - "base_char_count": 5495, - "aap_char_count": 5933, - "char_delta_pct": 8.0, - "lines_added": 111, - "lines_removed": 104, - "rouge_l": null, - "bleu": null - } - ], - "mean_sequence_similarity": 0.1876, - "mean_token_f1": 0.4875, - "mean_rouge_l": null, - "mean_bleu": null, - "judge_comparisons": null, - "mean_base_judge": null, - "mean_aap_judge": null -} diff --git a/evals/data/experiments/002-html-dashboard-analytics/metrics.json b/evals/data/experiments/002-html-dashboard-analytics/metrics.json deleted file mode 100644 index f4e29ea..0000000 --- a/evals/data/experiments/002-html-dashboard-analytics/metrics.json +++ /dev/null @@ -1,274 +0,0 @@ -{ - "experiment_id": "002-html-dashboard-analytics", - "model": "", - "provider": "google", - "timestamp": "2026-04-03T06:46:29.366805+00:00", - "format": "text/html", - "base_turn0": { - "input_tokens": 124, - "output_tokens": 1342, - "latency_ms": 6789, - "artifact_bytes": 4037 - }, - "aap_turn0": { - "input_tokens": 463, - "output_tokens": 920, - "latency_ms": 4060, - "artifact_bytes": 2747 - }, - "default_flow": { - "per_turn": [ - { - "turn": 1, - "edit": "Update the Bounce Rate KPI card to show 34.2% with a -5.1% improvement trend", - "input_tokens": 1491, - "output_tokens": 1426, - "latency_ms": 5423, - "output_bytes": 4252, - "failed": false, - "failure_reason": "" - }, - { - "turn": 2, - "edit": "Add 20 new rows to the traffic sources table with social media referral data fro", - "input_tokens": 2943, - "output_tokens": 1666, - "latency_ms": 6430, - "output_bytes": 4868, - "failed": false, - "failure_reason": "" - }, - { - "turn": 3, - "edit": "Replace the top referrers list with a new list of 15 referrers focused on develo", - "input_tokens": 4633, - "output_tokens": 1733, - "latency_ms": 7088, - "output_bytes": 5099, - "failed": false, - "failure_reason": "" - }, - { - "turn": 4, - "edit": "Add a new 'Conversion Funnel' section after the KPI cards showing 5 funnel stage", - "input_tokens": 6391, - "output_tokens": 2026, - "latency_ms": 8640, - "output_bytes": 5965, - "failed": false, - "failure_reason": "" - } - ], - "total_input_tokens": 15458, - "total_output_tokens": 6851, - "total_latency_ms": 27581 - }, - "aap_flow": { - "per_turn": [ - { - "turn": 1, - "edit": "Update the Bounce Rate KPI card to show 34.2% with a -5.1% improvement trend", - "input_tokens": 2051, - "output_tokens": 179, - "latency_ms": 1650, - "output_bytes": 352, - "failed": false, - "failure_reason": "", - "envelope_parsed": true, - "apply_succeeded": true, - "envelope_name": "synthesize" - }, - { - "turn": 2, - "edit": "Add 20 new rows to the traffic sources table with social media referral data fro", - "input_tokens": 0, - "output_tokens": 0, - "latency_ms": 2701, - "output_bytes": 352, - "failed": true, - "failure_reason": "parse or apply failed", - "envelope_parsed": true, - "apply_succeeded": false, - "envelope_name": "edit" - }, - { - "turn": 3, - "edit": "Replace the top referrers list with a new list of 15 referrers focused on develo", - "input_tokens": 0, - "output_tokens": 0, - "latency_ms": 2980, - "output_bytes": 352, - "failed": true, - "failure_reason": "parse or apply failed", - "envelope_parsed": true, - "apply_succeeded": false, - "envelope_name": "edit" - }, - { - "turn": 4, - "edit": "Add a new 'Conversion Funnel' section after the KPI cards showing 5 funnel stage", - "input_tokens": 0, - "output_tokens": 0, - "latency_ms": 3144, - "output_bytes": 352, - "failed": true, - "failure_reason": "parse or apply failed", - "envelope_parsed": true, - "apply_succeeded": false, - "envelope_name": "edit" - } - ], - "total_input_tokens": 2051, - "total_output_tokens": 179, - "total_latency_ms": 10475, - "envelope_parse_rate": 1.0, - "apply_success_rate": 0.25 - }, - "comparison": { - "output_token_savings_pct": 97.4, - "input_token_savings_pct": 86.7, - "latency_savings_pct": 62.0 - }, - "token_table": { - "turns": [ - { - "turn": 0, - "base_input": 124, - "base_output": 1342, - "base_latency_ms": 6789, - "aap_input": 463, - "aap_output": 920, - "aap_latency_ms": 4060 - }, - { - "turn": 1, - "base_input": 1491, - "base_output": 1426, - "base_latency_ms": 5423, - "aap_input": 2051, - "aap_output": 179, - "aap_latency_ms": 1650, - "envelope_name": "synthesize", - "apply_ok": true - }, - { - "turn": 2, - "base_input": 2943, - "base_output": 1666, - "base_latency_ms": 6430, - "aap_input": 0, - "aap_output": 0, - "aap_latency_ms": 2701, - "envelope_name": "edit", - "apply_ok": false - }, - { - "turn": 3, - "base_input": 4633, - "base_output": 1733, - "base_latency_ms": 7088, - "aap_input": 0, - "aap_output": 0, - "aap_latency_ms": 2980, - "envelope_name": "edit", - "apply_ok": false - }, - { - "turn": 4, - "base_input": 6391, - "base_output": 2026, - "base_latency_ms": 8640, - "aap_input": 0, - "aap_output": 0, - "aap_latency_ms": 3144, - "envelope_name": "edit", - "apply_ok": false - } - ], - "totals": { - "base_input": 15582, - "base_output": 8193, - "base_combined": 23775, - "aap_input": 2514, - "aap_output": 1099, - "aap_combined": 3613, - "base_latency_ms": 34370, - "aap_latency_ms": 14535, - "output_savings_pct": 86.6, - "input_delta_pct": -83.9, - "combined_savings_pct": 84.8, - "latency_savings_pct": 57.7 - } - }, - "quality": { - "per_turn": [ - { - "turn": 0, - "sequence_similarity": 0.162, - "token_f1": 0.4644, - "base_char_count": 4037, - "aap_char_count": 2407, - "char_delta_pct": -40.4, - "lines_added": 51, - "lines_removed": 71, - "rouge_l": null, - "bleu": null - }, - { - "turn": 1, - "sequence_similarity": 0.0169, - "token_f1": 0.03, - "base_char_count": 4250, - "aap_char_count": 352, - "char_delta_pct": -91.7, - "lines_added": 15, - "lines_removed": 82, - "rouge_l": null, - "bleu": null - }, - { - "turn": 2, - "sequence_similarity": 0.0149, - "token_f1": 0.0265, - "base_char_count": 4866, - "aap_char_count": 352, - "char_delta_pct": -92.8, - "lines_added": 15, - "lines_removed": 91, - "rouge_l": null, - "bleu": null - }, - { - "turn": 3, - "sequence_similarity": 0.0143, - "token_f1": 0.0257, - "base_char_count": 5097, - "aap_char_count": 352, - "char_delta_pct": -93.1, - "lines_added": 15, - "lines_removed": 91, - "rouge_l": null, - "bleu": null - }, - { - "turn": 4, - "sequence_similarity": 0.0124, - "token_f1": 0.0224, - "base_char_count": 5963, - "aap_char_count": 352, - "char_delta_pct": -94.1, - "lines_added": 15, - "lines_removed": 101, - "rouge_l": null, - "bleu": null - } - ], - "mean_sequence_similarity": 0.0441, - "mean_token_f1": 0.1138, - "mean_rouge_l": null, - "mean_bleu": null, - "judge_comparisons": null, - "mean_base_judge": null, - "mean_aap_judge": null - } -} diff --git a/evals/data/experiments/002-html-dashboard-analytics/outputs/aap/turn-0.html b/evals/data/experiments/002-html-dashboard-analytics/outputs/aap/turn-0.html deleted file mode 100644 index c4216fe..0000000 --- a/evals/data/experiments/002-html-dashboard-analytics/outputs/aap/turn-0.html +++ /dev/null @@ -1,58 +0,0 @@ - - - - - -
-
Page Views
124,502
-
Unique Visitors
88,210
-
Bounce Rate
42.5%
-
Avg. Session
2m 45s
-
-
- -
-

Traffic Sources

- - - - - - - - -
SourceMediumSessionsBounce RateConversions
GoogleOrganic45,00032%1,200
DirectNone22,00045%400
-
-
- -
-
-

Top Referrers

- -
  • github.com
  • stackoverflow.com
  • reddit.com
-
-
-
-

Geographic Breakdown

- - - - - -
CountryVisits
United States40,000
Germany12,000
-
-
-
\ No newline at end of file diff --git a/evals/data/experiments/002-html-dashboard-analytics/outputs/aap/turn-1.html b/evals/data/experiments/002-html-dashboard-analytics/outputs/aap/turn-1.html deleted file mode 100644 index 2bd0aaf..0000000 --- a/evals/data/experiments/002-html-dashboard-analytics/outputs/aap/turn-1.html +++ /dev/null @@ -1,16 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "dashboard-artifact", - "version": 2, - "name": "edit", - "content": [ - { - "op": "replace", - "target": { - "type": "id", - "value": "bounce-rate" - }, - "content": "
34.2%
-5.1% improvement
" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/002-html-dashboard-analytics/outputs/aap/turn-1.json b/evals/data/experiments/002-html-dashboard-analytics/outputs/aap/turn-1.json deleted file mode 100644 index b7b97b7..0000000 --- a/evals/data/experiments/002-html-dashboard-analytics/outputs/aap/turn-1.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "dashboard-artifact", - "version": 1, - "name": "synthesize", - "meta": { - "format": "text/html", - "tokens_used": null, - "checksum": null, - "state": null - }, - "content": [ - { - "body": "{\n \"protocol\": \"aap/0.1\",\n \"id\": \"dashboard-artifact\",\n \"version\": 2,\n \"name\": \"edit\",\n \"content\": [\n {\n \"op\": \"replace\",\n \"target\": {\n \"type\": \"id\",\n \"value\": \"bounce-rate\"\n },\n \"content\": \"
34.2%
-5.1% improvement
\"\n }\n ]\n}" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/002-html-dashboard-analytics/outputs/aap/turn-2.html b/evals/data/experiments/002-html-dashboard-analytics/outputs/aap/turn-2.html deleted file mode 100644 index 2bd0aaf..0000000 --- a/evals/data/experiments/002-html-dashboard-analytics/outputs/aap/turn-2.html +++ /dev/null @@ -1,16 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "dashboard-artifact", - "version": 2, - "name": "edit", - "content": [ - { - "op": "replace", - "target": { - "type": "id", - "value": "bounce-rate" - }, - "content": "
34.2%
-5.1% improvement
" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/002-html-dashboard-analytics/outputs/aap/turn-2.json b/evals/data/experiments/002-html-dashboard-analytics/outputs/aap/turn-2.json deleted file mode 100644 index 5743e1a..0000000 --- a/evals/data/experiments/002-html-dashboard-analytics/outputs/aap/turn-2.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "dashboard-artifact", - "version": 3, - "name": "edit", - "meta": { - "format": "text/html", - "tokens_used": null, - "checksum": null, - "state": null - }, - "content": [ - { - "op": "insert_after", - "target": { - "type": "id", - "value": "traffic-sources-table-body" - }, - "content": "Twitter4,12015.2%LinkedIn3,85014.1%Instagram3,50012.8%Facebook3,20011.7%Reddit2,95010.8%TikTok2,80010.3%Pinterest2,4509.0%YouTube2,1007.7%Discord1,9007.0%Snapchat1,7506.4%Threads1,6005.9%Mastodon1,4505.3%Bluesky1,3004.8%Twitch1,1504.2%WhatsApp1,0003.7%Telegram9003.3%Signal8503.1%Tumblr7502.7%Medium6502.4%Substack5502.0%" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/002-html-dashboard-analytics/outputs/aap/turn-3.html b/evals/data/experiments/002-html-dashboard-analytics/outputs/aap/turn-3.html deleted file mode 100644 index 2bd0aaf..0000000 --- a/evals/data/experiments/002-html-dashboard-analytics/outputs/aap/turn-3.html +++ /dev/null @@ -1,16 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "dashboard-artifact", - "version": 2, - "name": "edit", - "content": [ - { - "op": "replace", - "target": { - "type": "id", - "value": "bounce-rate" - }, - "content": "
34.2%
-5.1% improvement
" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/002-html-dashboard-analytics/outputs/aap/turn-3.json b/evals/data/experiments/002-html-dashboard-analytics/outputs/aap/turn-3.json deleted file mode 100644 index 60a3210..0000000 --- a/evals/data/experiments/002-html-dashboard-analytics/outputs/aap/turn-3.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "dashboard-artifact", - "version": 3, - "name": "edit", - "meta": { - "format": "text/html", - "tokens_used": null, - "checksum": null, - "state": null - }, - "content": [ - { - "op": "replace", - "target": { - "type": "id", - "value": "top-referrers-list" - }, - "content": "
    \n
  1. GitHub
  2. \n
  3. Stack Overflow
  4. \n
  5. Hacker News
  6. \n
  7. Reddit (r/programming)
  8. \n
  9. Dev.to
  10. \n
  11. Medium (Engineering)
  12. \n
  13. Hashnode
  14. \n
  15. Twitter (Dev Community)
  16. \n
  17. Reddit (r/webdev)
  18. \n
  19. Lobste.rs
  20. \n
  21. Indie Hackers
  22. \n
  23. Product Hunt
  24. \n
  25. YouTube (Dev Tutorials)
  26. \n
  27. Discord (Tech Servers)
  28. \n
  29. Substack (Dev Newsletters)
  30. \n
" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/002-html-dashboard-analytics/outputs/aap/turn-4.html b/evals/data/experiments/002-html-dashboard-analytics/outputs/aap/turn-4.html deleted file mode 100644 index 2bd0aaf..0000000 --- a/evals/data/experiments/002-html-dashboard-analytics/outputs/aap/turn-4.html +++ /dev/null @@ -1,16 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "dashboard-artifact", - "version": 2, - "name": "edit", - "content": [ - { - "op": "replace", - "target": { - "type": "id", - "value": "bounce-rate" - }, - "content": "
34.2%
-5.1% improvement
" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/002-html-dashboard-analytics/outputs/aap/turn-4.json b/evals/data/experiments/002-html-dashboard-analytics/outputs/aap/turn-4.json deleted file mode 100644 index 6b89f67..0000000 --- a/evals/data/experiments/002-html-dashboard-analytics/outputs/aap/turn-4.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "dashboard-artifact", - "version": 3, - "name": "edit", - "meta": { - "format": "text/html", - "tokens_used": null, - "checksum": null, - "state": null - }, - "content": [ - { - "op": "insert_after", - "target": { - "type": "id", - "value": "kpi-cards-section" - }, - "content": "
\n

Conversion Funnel

\n
\n \n
View
100%
\n
\n
\n \n
Add to Cart
85%
\n
\n
\n \n
Checkout
60%
\n
\n
\n \n
Payment
45%
\n
\n
\n \n
Purchase
40%
\n
\n
\n
" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/002-html-dashboard-analytics/outputs/base/turn-0.html b/evals/data/experiments/002-html-dashboard-analytics/outputs/base/turn-0.html deleted file mode 100644 index b465dac..0000000 --- a/evals/data/experiments/002-html-dashboard-analytics/outputs/base/turn-0.html +++ /dev/null @@ -1,78 +0,0 @@ - - - - - Analytics Dashboard - - - - - - -
-

Page Views

124.5k
-

Unique Visitors

88.2k
-

Bounce Rate

42.3%
-

Avg Session

04:12
-
- -
-
-

Traffic Sources

- - - -
SourceMediumSessionsBounce RateConv.
-
- - -
- - - - - \ No newline at end of file diff --git a/evals/data/experiments/002-html-dashboard-analytics/outputs/base/turn-1.html b/evals/data/experiments/002-html-dashboard-analytics/outputs/base/turn-1.html deleted file mode 100644 index bc3ee56..0000000 --- a/evals/data/experiments/002-html-dashboard-analytics/outputs/base/turn-1.html +++ /dev/null @@ -1,83 +0,0 @@ - - - - - Analytics Dashboard - - - - - - -
-

Page Views

124.5k
-

Unique Visitors

88.2k
-
-

Bounce Rate

-
34.2%↓ 5.1%
-
-

Avg Session

04:12
-
- -
-
-

Traffic Sources

- - - -
SourceMediumSessionsBounce RateConv.
-
- - -
- - - - - \ No newline at end of file diff --git a/evals/data/experiments/002-html-dashboard-analytics/outputs/base/turn-2.html b/evals/data/experiments/002-html-dashboard-analytics/outputs/base/turn-2.html deleted file mode 100644 index fd7f632..0000000 --- a/evals/data/experiments/002-html-dashboard-analytics/outputs/base/turn-2.html +++ /dev/null @@ -1,92 +0,0 @@ - - - - - Analytics Dashboard - - - - - - -
-

Page Views

124.5k
-

Unique Visitors

88.2k
-
-

Bounce Rate

-
34.2%↓ 5.1%
-
-

Avg Session

04:12
-
- -
-
-

Traffic Sources

- - - -
SourceMediumSessionsBounce RateConv.
-
- - -
- - - - - \ No newline at end of file diff --git a/evals/data/experiments/002-html-dashboard-analytics/outputs/base/turn-3.html b/evals/data/experiments/002-html-dashboard-analytics/outputs/base/turn-3.html deleted file mode 100644 index 3506252..0000000 --- a/evals/data/experiments/002-html-dashboard-analytics/outputs/base/turn-3.html +++ /dev/null @@ -1,92 +0,0 @@ - - - - - Analytics Dashboard - - - - - - -
-

Page Views

124.5k
-

Unique Visitors

88.2k
-
-

Bounce Rate

-
34.2%↓ 5.1%
-
-

Avg Session

04:12
-
- -
-
-

Traffic Sources

- - - -
SourceMediumSessionsBounce RateConv.
-
- - -
- - - - - \ No newline at end of file diff --git a/evals/data/experiments/002-html-dashboard-analytics/outputs/base/turn-4.html b/evals/data/experiments/002-html-dashboard-analytics/outputs/base/turn-4.html deleted file mode 100644 index 2a5bcb6..0000000 --- a/evals/data/experiments/002-html-dashboard-analytics/outputs/base/turn-4.html +++ /dev/null @@ -1,102 +0,0 @@ - - - - - Analytics Dashboard - - - - - - -
-

Page Views

124.5k
-

Unique Visitors

88.2k
-

Bounce Rate

34.2%↓ 5.1%
-

Avg Session

04:12
-
- -
-

Conversion Funnel

-
-
Landing
100%
-
Product
80%
-
Cart
60%
-
Checkout
40%
-
Purchase
20%
-
-
- -
-
-

Traffic Sources

- - - -
SourceMediumSessionsBounce RateConv.
-
- - -
- - - - - \ No newline at end of file diff --git a/evals/data/experiments/003-html-landing-saas/eval.json b/evals/data/experiments/003-html-landing-saas/eval.json deleted file mode 100644 index bd762e5..0000000 --- a/evals/data/experiments/003-html-landing-saas/eval.json +++ /dev/null @@ -1,59 +0,0 @@ -{ - "per_turn": [ - { - "turn": 0, - "sequence_similarity": 0.1408, - "token_f1": 0.4469, - "base_char_count": 5585, - "aap_char_count": 4357, - "char_delta_pct": -22.0, - "lines_added": 83, - "lines_removed": 79, - "rouge_l": null, - "bleu": null - }, - { - "turn": 1, - "sequence_similarity": 0.161, - "token_f1": 0.4819, - "base_char_count": 7510, - "aap_char_count": 4327, - "char_delta_pct": -42.4, - "lines_added": 83, - "lines_removed": 103, - "rouge_l": null, - "bleu": null - }, - { - "turn": 2, - "sequence_similarity": 0.076, - "token_f1": 0.1392, - "base_char_count": 6320, - "aap_char_count": 1521, - "char_delta_pct": -75.9, - "lines_added": 14, - "lines_removed": 89, - "rouge_l": null, - "bleu": null - }, - { - "turn": 3, - "sequence_similarity": 0.0662, - "token_f1": 0.2902, - "base_char_count": 7806, - "aap_char_count": 2250, - "char_delta_pct": -71.2, - "lines_added": 40, - "lines_removed": 117, - "rouge_l": null, - "bleu": null - } - ], - "mean_sequence_similarity": 0.111, - "mean_token_f1": 0.3396, - "mean_rouge_l": null, - "mean_bleu": null, - "judge_comparisons": null, - "mean_base_judge": null, - "mean_aap_judge": null -} diff --git a/evals/data/experiments/003-html-landing-saas/metrics.json b/evals/data/experiments/003-html-landing-saas/metrics.json deleted file mode 100644 index 204ad21..0000000 --- a/evals/data/experiments/003-html-landing-saas/metrics.json +++ /dev/null @@ -1,228 +0,0 @@ -{ - "experiment_id": "003-html-landing-saas", - "model": "", - "provider": "google", - "timestamp": "2026-04-03T06:47:18.336043+00:00", - "format": "text/html", - "base_turn0": { - "input_tokens": 128, - "output_tokens": 1925, - "latency_ms": 8737, - "artifact_bytes": 5712 - }, - "aap_turn0": { - "input_tokens": 467, - "output_tokens": 1414, - "latency_ms": 6528, - "artifact_bytes": 4227 - }, - "default_flow": { - "per_turn": [ - { - "turn": 1, - "edit": "Change the hero headline to 'Sync Everything. Everywhere.' and the subheadline t", - "input_tokens": 2081, - "output_tokens": 2951, - "latency_ms": 11238, - "output_bytes": 8936, - "failed": false, - "failure_reason": "" - }, - { - "turn": 2, - "edit": "Rewrite the pricing section to have 4 tiers instead of 3: Free, Starter $19/mo, ", - "input_tokens": 5066, - "output_tokens": 3063, - "latency_ms": 10802, - "output_bytes": 9275, - "failed": false, - "failure_reason": "" - }, - { - "turn": 3, - "edit": "Add a new 'Integrations' section between features and pricing showing a grid of ", - "input_tokens": 8153, - "output_tokens": 3307, - "latency_ms": 11751, - "output_bytes": 10023, - "failed": false, - "failure_reason": "" - } - ], - "total_input_tokens": 15300, - "total_output_tokens": 9321, - "total_latency_ms": 33791 - }, - "aap_flow": { - "per_turn": [ - { - "turn": 1, - "edit": "Change the hero headline to 'Sync Everything. Everywhere.' and the subheadline t", - "input_tokens": 2548, - "output_tokens": 105, - "latency_ms": 1385, - "output_bytes": 4174, - "failed": false, - "failure_reason": "", - "envelope_parsed": true, - "apply_succeeded": true, - "envelope_name": "edit" - }, - { - "turn": 2, - "edit": "Rewrite the pricing section to have 4 tiers instead of 3: Free, Starter $19/mo, ", - "input_tokens": 2544, - "output_tokens": 438, - "latency_ms": 2538, - "output_bytes": 1098, - "failed": false, - "failure_reason": "", - "envelope_parsed": true, - "apply_succeeded": true, - "envelope_name": "synthesize" - }, - { - "turn": 3, - "edit": "Add a new 'Integrations' section between features and pricing showing a grid of ", - "input_tokens": 1527, - "output_tokens": 857, - "latency_ms": 3878, - "output_bytes": 2252, - "failed": false, - "failure_reason": "", - "envelope_parsed": true, - "apply_succeeded": true, - "envelope_name": "synthesize" - } - ], - "total_input_tokens": 6619, - "total_output_tokens": 1400, - "total_latency_ms": 7801, - "envelope_parse_rate": 1.0, - "apply_success_rate": 1.0 - }, - "comparison": { - "output_token_savings_pct": 85.0, - "input_token_savings_pct": 56.7, - "latency_savings_pct": 76.9 - }, - "token_table": { - "turns": [ - { - "turn": 0, - "base_input": 128, - "base_output": 1925, - "base_latency_ms": 8737, - "aap_input": 467, - "aap_output": 1414, - "aap_latency_ms": 6528 - }, - { - "turn": 1, - "base_input": 2081, - "base_output": 2951, - "base_latency_ms": 11238, - "aap_input": 2548, - "aap_output": 105, - "aap_latency_ms": 1385, - "envelope_name": "edit", - "apply_ok": true - }, - { - "turn": 2, - "base_input": 5066, - "base_output": 3063, - "base_latency_ms": 10802, - "aap_input": 2544, - "aap_output": 438, - "aap_latency_ms": 2538, - "envelope_name": "synthesize", - "apply_ok": true - }, - { - "turn": 3, - "base_input": 8153, - "base_output": 3307, - "base_latency_ms": 11751, - "aap_input": 1527, - "aap_output": 857, - "aap_latency_ms": 3878, - "envelope_name": "synthesize", - "apply_ok": true - } - ], - "totals": { - "base_input": 15428, - "base_output": 11246, - "base_combined": 26674, - "aap_input": 7086, - "aap_output": 2814, - "aap_combined": 9900, - "base_latency_ms": 42528, - "aap_latency_ms": 14329, - "output_savings_pct": 75.0, - "input_delta_pct": -54.1, - "combined_savings_pct": 62.9, - "latency_savings_pct": 66.3 - } - }, - "quality": { - "per_turn": [ - { - "turn": 0, - "sequence_similarity": 0.0539, - "token_f1": 0.3938, - "base_char_count": 5712, - "aap_char_count": 3489, - "char_delta_pct": -38.9, - "lines_added": 48, - "lines_removed": 79, - "rouge_l": null, - "bleu": null - }, - { - "turn": 1, - "sequence_similarity": 0.0364, - "token_f1": 0.3307, - "base_char_count": 8936, - "aap_char_count": 3436, - "char_delta_pct": -61.5, - "lines_added": 48, - "lines_removed": 116, - "rouge_l": null, - "bleu": null - }, - { - "turn": 2, - "sequence_similarity": 0.0188, - "token_f1": 0.0782, - "base_char_count": 9275, - "aap_char_count": 961, - "char_delta_pct": -89.6, - "lines_added": 16, - "lines_removed": 129, - "rouge_l": null, - "bleu": null - }, - { - "turn": 3, - "sequence_similarity": 0.0294, - "token_f1": 0.2, - "base_char_count": 10023, - "aap_char_count": 2024, - "char_delta_pct": -79.8, - "lines_added": 31, - "lines_removed": 136, - "rouge_l": null, - "bleu": null - } - ], - "mean_sequence_similarity": 0.0346, - "mean_token_f1": 0.2507, - "mean_rouge_l": null, - "mean_bleu": null, - "judge_comparisons": null, - "mean_base_judge": null, - "mean_aap_judge": null - } -} diff --git a/evals/data/experiments/003-html-landing-saas/outputs/aap/turn-0.html b/evals/data/experiments/003-html-landing-saas/outputs/aap/turn-0.html deleted file mode 100644 index 6398d0b..0000000 --- a/evals/data/experiments/003-html-landing-saas/outputs/aap/turn-0.html +++ /dev/null @@ -1,57 +0,0 @@ -
- -
- -

Sync Your Data, Effortlessly.

-

The cloud infrastructure designed to keep your teams in perfect harmony, anywhere in the world.

- -
-
- -
-

Core Features

- -
[Icon]

Real-time Sync

Your files update across all devices instantly.

-
[Icon]

End-to-End Encryption

Military-grade security for your sensitive data.

-
[Icon]

Collaborative Folders

Work together with seamless team permissions.

-
[Icon]

Version History

Never lose your work with 30-day backups.

-
[Icon]

Global CDN

Lightning fast access from anywhere.

-
[Icon]

24/7 Support

Expert assistance whenever you need it.

-
-
- -
-
-

Simple Pricing

- -
-

Starter

$9/mo

-
-
-

Pro

$29/mo

-
-
-

Enterprise

$99/mo

-
-
-
-
- -
-

What People Say

- -

"Life-changing sync speeds." - Jane D.

-

"Secure and reliable for our firm." - Marcus A.

-

"Finally, a cloud tool that makes sense." - Sarah K.

-

"Support team is top notch." - Alex P.

-
-
- -
- -

© 2023 CloudSync Inc. All rights reserved.

-
[Links] [Social Icons]
-
-
-
-
\ No newline at end of file diff --git a/evals/data/experiments/003-html-landing-saas/outputs/aap/turn-1.html b/evals/data/experiments/003-html-landing-saas/outputs/aap/turn-1.html deleted file mode 100644 index 453b819..0000000 --- a/evals/data/experiments/003-html-landing-saas/outputs/aap/turn-1.html +++ /dev/null @@ -1,57 +0,0 @@ -
- -
- -

Sync Everything. Everywhere.

-

CloudSync connects your tools in real time.

- -
-
- -
-

Core Features

- -
[Icon]

Real-time Sync

Your files update across all devices instantly.

-
[Icon]

End-to-End Encryption

Military-grade security for your sensitive data.

-
[Icon]

Collaborative Folders

Work together with seamless team permissions.

-
[Icon]

Version History

Never lose your work with 30-day backups.

-
[Icon]

Global CDN

Lightning fast access from anywhere.

-
[Icon]

24/7 Support

Expert assistance whenever you need it.

-
-
- -
-
-

Simple Pricing

- -
-

Starter

$9/mo

-
-
-

Pro

$29/mo

-
-
-

Enterprise

$99/mo

-
-
-
-
- -
-

What People Say

- -

"Life-changing sync speeds." - Jane D.

-

"Secure and reliable for our firm." - Marcus A.

-

"Finally, a cloud tool that makes sense." - Sarah K.

-

"Support team is top notch." - Alex P.

-
-
- -
- -

© 2023 CloudSync Inc. All rights reserved.

-
[Links] [Social Icons]
-
-
-
-
\ No newline at end of file diff --git a/evals/data/experiments/003-html-landing-saas/outputs/aap/turn-1.json b/evals/data/experiments/003-html-landing-saas/outputs/aap/turn-1.json deleted file mode 100644 index 6616aee..0000000 --- a/evals/data/experiments/003-html-landing-saas/outputs/aap/turn-1.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "artifact-1", - "version": 2, - "name": "edit", - "meta": { - "format": "text/html", - "tokens_used": null, - "checksum": null, - "state": null - }, - "content": [ - { - "op": "replace", - "target": { - "type": "id", - "value": "hero-headline" - }, - "content": "Sync Everything. Everywhere." - }, - { - "op": "replace", - "target": { - "type": "id", - "value": "hero-subheadline" - }, - "content": "CloudSync connects your tools in real time." - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/003-html-landing-saas/outputs/aap/turn-2.html b/evals/data/experiments/003-html-landing-saas/outputs/aap/turn-2.html deleted file mode 100644 index 6b8ff79..0000000 --- a/evals/data/experiments/003-html-landing-saas/outputs/aap/turn-2.html +++ /dev/null @@ -1,16 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "pricing-section", - "version": 1, - "name": "edit", - "content": [ - { - "op": "replace", - "target": { - "type": "id", - "value": "pricing-section" - }, - "content": "

Free

$0/mo

Starter

$19/mo

Pro

$49/mo

Enterprise

Custom

" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/003-html-landing-saas/outputs/aap/turn-2.json b/evals/data/experiments/003-html-landing-saas/outputs/aap/turn-2.json deleted file mode 100644 index 745dd54..0000000 --- a/evals/data/experiments/003-html-landing-saas/outputs/aap/turn-2.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "pricing-section", - "version": 1, - "name": "synthesize", - "meta": { - "format": "text/html", - "tokens_used": null, - "checksum": null, - "state": null - }, - "content": [ - { - "body": "{\n \"protocol\": \"aap/0.1\",\n \"id\": \"pricing-section\",\n \"version\": 1,\n \"name\": \"edit\",\n \"content\": [\n {\n \"op\": \"replace\",\n \"target\": {\n \"type\": \"id\",\n \"value\": \"pricing-section\"\n },\n \"content\": \"

Free

$0/mo

Starter

$19/mo

Pro

$49/mo

Enterprise

Custom

\"\n }\n ]\n}" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/003-html-landing-saas/outputs/aap/turn-3.html b/evals/data/experiments/003-html-landing-saas/outputs/aap/turn-3.html deleted file mode 100644 index a8e7984..0000000 --- a/evals/data/experiments/003-html-landing-saas/outputs/aap/turn-3.html +++ /dev/null @@ -1,36 +0,0 @@ - -
-

Integrations

-
-
Slack
-
GitHub
-
Jira
-
Zapier
-
Stripe
-
Salesforce
-
Notion
-
Figma
-
-
-
- - -
-
-

Free

-

$0/mo

-
-
-

Starter

-

$19/mo

-
-
-

Pro

-

$49/mo

-
-
-

Enterprise

-

Custom

-
-
-
\ No newline at end of file diff --git a/evals/data/experiments/003-html-landing-saas/outputs/aap/turn-3.json b/evals/data/experiments/003-html-landing-saas/outputs/aap/turn-3.json deleted file mode 100644 index ace97c9..0000000 --- a/evals/data/experiments/003-html-landing-saas/outputs/aap/turn-3.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "pricing-page-sections", - "version": 1, - "name": "synthesize", - "meta": { - "format": "text/html", - "tokens_used": null, - "checksum": null, - "state": null - }, - "content": [ - { - "body": "\n
\n

Integrations

\n
\n
Slack
\n
GitHub
\n
Jira
\n
Zapier
\n
Stripe
\n
Salesforce
\n
Notion
\n
Figma
\n
\n
\n
\n\n\n
\n
\n

Free

\n

$0/mo

\n
\n
\n

Starter

\n

$19/mo

\n
\n
\n

Pro

\n

$49/mo

\n
\n
\n

Enterprise

\n

Custom

\n
\n
\n
" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/003-html-landing-saas/outputs/base/turn-0.html b/evals/data/experiments/003-html-landing-saas/outputs/base/turn-0.html deleted file mode 100644 index dd145a0..0000000 --- a/evals/data/experiments/003-html-landing-saas/outputs/base/turn-0.html +++ /dev/null @@ -1,88 +0,0 @@ - - - - - - CloudSync | Seamless Data Synchronization - - - - - - - -
-

Sync your workflow in the cloud.

-

The fastest, most secure way to synchronize your data across all your devices in real-time.

- -
- - -
-

Everything you need to scale

-
- {[1,2,3,4,5,6].map(i => `
-
-

Feature ${i}

-

Automate your daily tasks with our smart sync technology and industry-leading encryption protocols.

-
`).join('')} -
-
- - -
-

Simple, transparent pricing

-
-
-

Starter

$9/mo

- -
-
-

Pro

$29/mo

- -
-
-

Enterprise

$99/mo

- -
-
-
- - -
-

Trusted by teams worldwide

-
- {[1,2,3,4].map(i => `
-

"CloudSync has completely transformed how our remote team handles project assets."

-

- Client ${i}

-
`).join('')} -
-
- - - - - - \ No newline at end of file diff --git a/evals/data/experiments/003-html-landing-saas/outputs/base/turn-1.html b/evals/data/experiments/003-html-landing-saas/outputs/base/turn-1.html deleted file mode 100644 index 3c86580..0000000 --- a/evals/data/experiments/003-html-landing-saas/outputs/base/turn-1.html +++ /dev/null @@ -1,125 +0,0 @@ - - - - - - CloudSync | Seamless Data Synchronization - - - - - - - -
-

Sync Everything. Everywhere.

-

CloudSync connects your tools in real time.

- -
- - -
-

Everything you need to scale

-
-
-
-

Instant Sync

-

Automate your daily tasks with our smart sync technology and industry-leading encryption protocols.

-
-
-
-

End-to-End Security

-

Your data is protected by bank-grade encryption at every stage of the synchronization process.

-
-
-
-

Universal Access

-

Access your synced data from any device, anytime, anywhere in the world.

-
-
-
-

Automated Backups

-

Never lose a file again with our automated background backup system.

-
-
-
-

Team Collaboration

-

Work together in real-time with granular permissions and shared workspaces.

-
-
-
-

24/7 Support

-

Our dedicated support team is here to assist you whenever you need help.

-
-
-
- - -
-

Simple, transparent pricing

-
-
-

Starter

$9/mo

- -
-
-

Pro

$29/mo

- -
-
-

Enterprise

$99/mo

- -
-
-
- - -
-

Trusted by teams worldwide

-
-
-

"CloudSync has completely transformed how our remote team handles project assets."

-

- Sarah J.

-
-
-

"The speed of synchronization is simply unmatched in the current market."

-

- Mark T.

-
-
-

"Finally, a tool that just works without any complex setup or configuration."

-

- Elena R.

-
-
-

"Our productivity has soared since we started using CloudSync across our devices."

-

- David B.

-
-
-
- - - - - - \ No newline at end of file diff --git a/evals/data/experiments/003-html-landing-saas/outputs/base/turn-2.html b/evals/data/experiments/003-html-landing-saas/outputs/base/turn-2.html deleted file mode 100644 index 4252796..0000000 --- a/evals/data/experiments/003-html-landing-saas/outputs/base/turn-2.html +++ /dev/null @@ -1,129 +0,0 @@ - - - - - - CloudSync | Sync Everything. Everywhere. - - - - - - - -
-

Sync Everything. Everywhere.

-

CloudSync connects your tools in real time.

- -
- - -
-

Everything you need to scale

-
-
-
-

Instant Sync

-

Automate your daily tasks with our smart sync technology and industry-leading encryption protocols.

-
-
-
-

End-to-End Security

-

Your data is protected by bank-grade encryption at every stage of the synchronization process.

-
-
-
-

Universal Access

-

Access your synced data from any device, anytime, anywhere in the world.

-
-
-
-

Automated Backups

-

Never lose a file again with our automated background backup system.

-
-
-
-

Team Collaboration

-

Work together in real-time with granular permissions and shared workspaces.

-
-
-
-

24/7 Support

-

Our dedicated support team is here to assist you whenever you need help.

-
-
-
- - -
-

Simple, transparent pricing

-
-
-

Free

$0

- -
-
-

Starter

$19/mo

- -
-
-

Pro

$49/mo

- -
-
-

Enterprise

Custom

- -
-
-
- - -
-

Trusted by teams worldwide

-
-
-

"CloudSync has completely transformed how our remote team handles project assets."

-

- Sarah J.

-
-
-

"The speed of synchronization is simply unmatched in the current market."

-

- Mark T.

-
-
-

"Finally, a tool that just works without any complex setup or configuration."

-

- Elena R.

-
-
-

"Our productivity has soared since we started using CloudSync across our devices."

-

- David B.

-
-
-
- - - - - - \ No newline at end of file diff --git a/evals/data/experiments/003-html-landing-saas/outputs/base/turn-3.html b/evals/data/experiments/003-html-landing-saas/outputs/base/turn-3.html deleted file mode 100644 index 8995bf4..0000000 --- a/evals/data/experiments/003-html-landing-saas/outputs/base/turn-3.html +++ /dev/null @@ -1,140 +0,0 @@ - - - - - - CloudSync | Sync Everything. Everywhere. - - - - - - - -
-

Sync Everything. Everywhere.

-

CloudSync connects your tools in real time.

- -
- - -
-

Everything you need to scale

-
-
-
-

Instant Sync

-

Automate your daily tasks with our smart sync technology and industry-leading encryption protocols.

-
-
-
-

End-to-End Security

-

Your data is protected by bank-grade encryption at every stage of the synchronization process.

-
-
-
-

Universal Access

-

Access your synced data from any device, anytime, anywhere in the world.

-
-
-
-

Automated Backups

-

Never lose a file again with our automated background backup system.

-
-
-
-

Team Collaboration

-

Work together in real-time with granular permissions and shared workspaces.

-
-
-
-

24/7 Support

-

Our dedicated support team is here to assist you whenever you need help.

-
-
-
- - -
-

Works with your favorite tools

-
- {['Slack', 'GitHub', 'Notion', 'Figma', 'Jira', 'Zoom', 'Trello', 'Asana'].map(name => `
-
- ${name} -
`).join('')} -
-
- - -
-

Simple, transparent pricing

-
-
-

Free

$0

- -
-
-

Starter

$19/mo

- -
-
-

Pro

$49/mo

- -
-
-

Enterprise

Custom

- -
-
-
- - -
-

Trusted by teams worldwide

-
-
-

"CloudSync has completely transformed how our remote team handles project assets."

-

- Sarah J.

-
-
-

"The speed of synchronization is simply unmatched in the current market."

-

- Mark T.

-
-
-

"Finally, a tool that just works without any complex setup or configuration."

-

- Elena R.

-
-
-

"Our productivity has soared since we started using CloudSync across our devices."

-

- David B.

-
-
-
- - - - - - \ No newline at end of file diff --git a/evals/data/experiments/004-html-email-receipt/eval.json b/evals/data/experiments/004-html-email-receipt/eval.json deleted file mode 100644 index 93a7e64..0000000 --- a/evals/data/experiments/004-html-email-receipt/eval.json +++ /dev/null @@ -1,47 +0,0 @@ -{ - "per_turn": [ - { - "turn": 0, - "sequence_similarity": 0.213, - "token_f1": 0.3489, - "base_char_count": 5667, - "aap_char_count": 2944, - "char_delta_pct": -48.1, - "lines_added": 63, - "lines_removed": 107, - "rouge_l": null, - "bleu": null - }, - { - "turn": 1, - "sequence_similarity": 0.2461, - "token_f1": 0.3523, - "base_char_count": 5671, - "aap_char_count": 2953, - "char_delta_pct": -47.9, - "lines_added": 63, - "lines_removed": 107, - "rouge_l": null, - "bleu": null - }, - { - "turn": 2, - "sequence_similarity": 0.257, - "token_f1": 0.3203, - "base_char_count": 6455, - "aap_char_count": 3204, - "char_delta_pct": -50.4, - "lines_added": 65, - "lines_removed": 119, - "rouge_l": null, - "bleu": null - } - ], - "mean_sequence_similarity": 0.2387, - "mean_token_f1": 0.3405, - "mean_rouge_l": null, - "mean_bleu": null, - "judge_comparisons": null, - "mean_base_judge": null, - "mean_aap_judge": null -} diff --git a/evals/data/experiments/004-html-email-receipt/metrics.json b/evals/data/experiments/004-html-email-receipt/metrics.json deleted file mode 100644 index 46d70d8..0000000 --- a/evals/data/experiments/004-html-email-receipt/metrics.json +++ /dev/null @@ -1,182 +0,0 @@ -{ - "experiment_id": "004-html-email-receipt", - "model": "", - "provider": "google", - "timestamp": "2026-04-03T06:48:15.244446+00:00", - "format": "text/html", - "base_turn0": { - "input_tokens": 119, - "output_tokens": 1671, - "latency_ms": 5836, - "artifact_bytes": 6711 - }, - "aap_turn0": { - "input_tokens": 458, - "output_tokens": 780, - "latency_ms": 4091, - "artifact_bytes": 2521 - }, - "default_flow": { - "per_turn": [ - { - "turn": 1, - "edit": "Change the order number to ORD-2026-03-4821 and the date to March 28, 2026", - "input_tokens": 1825, - "output_tokens": 1675, - "latency_ms": 6658, - "output_bytes": 6715, - "failed": false, - "failure_reason": "" - }, - { - "turn": 2, - "edit": "Add 3 more products to the items table: Wireless Charger ($34.99, qty 1), USB-C ", - "input_tokens": 3554, - "output_tokens": 2005, - "latency_ms": 7583, - "output_bytes": 8131, - "failed": false, - "failure_reason": "" - } - ], - "total_input_tokens": 5379, - "total_output_tokens": 3680, - "total_latency_ms": 14241 - }, - "aap_flow": { - "per_turn": [ - { - "turn": 1, - "edit": "Change the order number to ORD-2026-03-4821 and the date to March 28, 2026", - "input_tokens": 1921, - "output_tokens": 126, - "latency_ms": 1218, - "output_bytes": 2526, - "failed": false, - "failure_reason": "", - "envelope_parsed": true, - "apply_succeeded": true, - "envelope_name": "edit" - }, - { - "turn": 2, - "edit": "Add 3 more products to the items table: Wireless Charger ($34.99, qty 1), USB-C ", - "input_tokens": 1947, - "output_tokens": 292, - "latency_ms": 1617, - "output_bytes": 2775, - "failed": false, - "failure_reason": "", - "envelope_parsed": true, - "apply_succeeded": true, - "envelope_name": "edit" - } - ], - "total_input_tokens": 3868, - "total_output_tokens": 418, - "total_latency_ms": 2835, - "envelope_parse_rate": 1.0, - "apply_success_rate": 1.0 - }, - "comparison": { - "output_token_savings_pct": 88.6, - "input_token_savings_pct": 28.1, - "latency_savings_pct": 80.1 - }, - "token_table": { - "turns": [ - { - "turn": 0, - "base_input": 119, - "base_output": 1671, - "base_latency_ms": 5836, - "aap_input": 458, - "aap_output": 780, - "aap_latency_ms": 4091 - }, - { - "turn": 1, - "base_input": 1825, - "base_output": 1675, - "base_latency_ms": 6658, - "aap_input": 1921, - "aap_output": 126, - "aap_latency_ms": 1218, - "envelope_name": "edit", - "apply_ok": true - }, - { - "turn": 2, - "base_input": 3554, - "base_output": 2005, - "base_latency_ms": 7583, - "aap_input": 1947, - "aap_output": 292, - "aap_latency_ms": 1617, - "envelope_name": "edit", - "apply_ok": true - } - ], - "totals": { - "base_input": 5498, - "base_output": 5351, - "base_combined": 10849, - "aap_input": 4326, - "aap_output": 1198, - "aap_combined": 5524, - "base_latency_ms": 20077, - "aap_latency_ms": 6926, - "output_savings_pct": 77.6, - "input_delta_pct": -21.3, - "combined_savings_pct": 49.1, - "latency_savings_pct": 65.5 - } - }, - "quality": { - "per_turn": [ - { - "turn": 0, - "sequence_similarity": 0.1033, - "token_f1": 0.2884, - "base_char_count": 6711, - "aap_char_count": 2114, - "char_delta_pct": -68.5, - "lines_added": 50, - "lines_removed": 106, - "rouge_l": null, - "bleu": null - }, - { - "turn": 1, - "sequence_similarity": 0.1438, - "token_f1": 0.2921, - "base_char_count": 6715, - "aap_char_count": 2119, - "char_delta_pct": -68.4, - "lines_added": 50, - "lines_removed": 106, - "rouge_l": null, - "bleu": null - }, - { - "turn": 2, - "sequence_similarity": 0.1652, - "token_f1": 0.2524, - "base_char_count": 8131, - "aap_char_count": 2368, - "char_delta_pct": -70.9, - "lines_added": 52, - "lines_removed": 124, - "rouge_l": null, - "bleu": null - } - ], - "mean_sequence_similarity": 0.1374, - "mean_token_f1": 0.2776, - "mean_rouge_l": null, - "mean_bleu": null, - "judge_comparisons": null, - "mean_base_judge": null, - "mean_aap_judge": null - } -} diff --git a/evals/data/experiments/004-html-email-receipt/outputs/aap/turn-0.html b/evals/data/experiments/004-html-email-receipt/outputs/aap/turn-0.html deleted file mode 100644 index d4639bb..0000000 --- a/evals/data/experiments/004-html-email-receipt/outputs/aap/turn-0.html +++ /dev/null @@ -1,52 +0,0 @@ - - - - -
- - - - - - - - - - - - - -
-
[Company Logo]
-

Order Confirmation

-
- -

Order #: ORD-99283

-

Date: October 24, 2023

-

Shipping Address:
123 Maple St, Springfield, IL 62704

-
-
- - - - - - - - - - - - - - -
ItemQtyPriceTotal
Wireless Mouse1$25.00$25.00
Keyboard1$50.00$50.00
USB-C Cable2$10.00$20.00
Monitor Stand1$40.00$40.00
Mouse Pad1$15.00$15.00
-
- -

Subtotal: $150.00

-

Shipping: $5.00

-

Tax: $12.40

-

Total: $167.40

-
-
-
\ No newline at end of file diff --git a/evals/data/experiments/004-html-email-receipt/outputs/aap/turn-1.html b/evals/data/experiments/004-html-email-receipt/outputs/aap/turn-1.html deleted file mode 100644 index f9d8bad..0000000 --- a/evals/data/experiments/004-html-email-receipt/outputs/aap/turn-1.html +++ /dev/null @@ -1,52 +0,0 @@ - - - - -
- - - - - - - - - - - - - -
-
[Company Logo]
-

Order Confirmation

-
- -

Order #: ORD-2026-03-4821

-

Date: March 28, 2026

-

Shipping Address:
123 Maple St, Springfield, IL 62704

-
-
- - - - - - - - - - - - - - -
ItemQtyPriceTotal
Wireless Mouse1$25.00$25.00
Keyboard1$50.00$50.00
USB-C Cable2$10.00$20.00
Monitor Stand1$40.00$40.00
Mouse Pad1$15.00$15.00
-
- -

Subtotal: $150.00

-

Shipping: $5.00

-

Tax: $12.40

-

Total: $167.40

-
-
-
\ No newline at end of file diff --git a/evals/data/experiments/004-html-email-receipt/outputs/aap/turn-1.json b/evals/data/experiments/004-html-email-receipt/outputs/aap/turn-1.json deleted file mode 100644 index 63fffdb..0000000 --- a/evals/data/experiments/004-html-email-receipt/outputs/aap/turn-1.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "order-confirmation-email", - "version": 1, - "name": "edit", - "meta": { - "format": "text/html", - "tokens_used": null, - "checksum": null, - "state": null - }, - "content": [ - { - "op": "replace", - "target": { - "type": "id", - "value": "order-number" - }, - "content": "ORD-2026-03-4821" - }, - { - "op": "replace", - "target": { - "type": "id", - "value": "order-date" - }, - "content": "March 28, 2026" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/004-html-email-receipt/outputs/aap/turn-2.html b/evals/data/experiments/004-html-email-receipt/outputs/aap/turn-2.html deleted file mode 100644 index cb67e8c..0000000 --- a/evals/data/experiments/004-html-email-receipt/outputs/aap/turn-2.html +++ /dev/null @@ -1,54 +0,0 @@ - - - - -
- - - - - - - - - - - - - -
-
[Company Logo]
-

Order Confirmation

-
- -

Order #: ORD-2026-03-4821

-

Date: March 28, 2026

-

Shipping Address:
123 Maple St, Springfield, IL 62704

-
-
- - - - - - - - - - - - - - - - -
ItemQtyPriceTotal
Wireless Mouse1$25.00$25.00
Keyboard1$50.00$50.00
USB-C Cable2$10.00$20.00
Monitor Stand1$40.00$40.00
Mouse Pad1$15.00$15.00
Wireless Charger1$34.99$34.99
USB-C Hub1$59.99$59.99
Monitor Stand2$89.99$179.98
-
- -

Subtotal: $150.00

-

Shipping: $5.00

-

Tax: $12.40

-

Total: $167.40

-
-
-
\ No newline at end of file diff --git a/evals/data/experiments/004-html-email-receipt/outputs/aap/turn-2.json b/evals/data/experiments/004-html-email-receipt/outputs/aap/turn-2.json deleted file mode 100644 index b380e09..0000000 --- a/evals/data/experiments/004-html-email-receipt/outputs/aap/turn-2.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "artifact-id", - "version": 1, - "name": "edit", - "meta": { - "format": "text/html", - "tokens_used": null, - "checksum": null, - "state": null - }, - "content": [ - { - "op": "replace", - "target": { - "type": "id", - "value": "items-list" - }, - "content": " Wireless Mouse1$25.00$25.00\n Keyboard1$50.00$50.00\n USB-C Cable2$10.00$20.00\n Monitor Stand1$40.00$40.00\n Mouse Pad1$15.00$15.00\n Wireless Charger1$34.99$34.99\n USB-C Hub1$59.99$59.99\n Monitor Stand2$89.99$179.98\n" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/004-html-email-receipt/outputs/base/turn-0.html b/evals/data/experiments/004-html-email-receipt/outputs/base/turn-0.html deleted file mode 100644 index 8212648..0000000 --- a/evals/data/experiments/004-html-email-receipt/outputs/base/turn-0.html +++ /dev/null @@ -1,108 +0,0 @@ - - - - - -Order Confirmation - - - - - - -
- - - - - - - - - - - - - - - - - -
-

[COMPANY LOGO]

-

Order Confirmation

-
- - - - - -
-

Order Number: #123456789

-

Date: October 24, 2023

-
-

Shipping Address:

-

John Doe
123 Maple St, Apt 4B
Springfield, IL 62704

-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ItemQtyPriceTotal
Wireless Headphones1$99.00$99.00
USB-C Charging Cable2$15.00$30.00
Smartphone Case1$25.00$25.00
Screen Protector1$10.00$10.00
Bluetooth Adapter1$35.00$35.00
-
- - - - - - - - - - - - - - - - - -
Subtotal:$199.00
Shipping:$5.00
Tax:$16.32
Total:$220.32
-
-
- - \ No newline at end of file diff --git a/evals/data/experiments/004-html-email-receipt/outputs/base/turn-1.html b/evals/data/experiments/004-html-email-receipt/outputs/base/turn-1.html deleted file mode 100644 index 4df7d48..0000000 --- a/evals/data/experiments/004-html-email-receipt/outputs/base/turn-1.html +++ /dev/null @@ -1,108 +0,0 @@ - - - - - -Order Confirmation - - - - - - -
- - - - - - - - - - - - - - - - - -
-

[COMPANY LOGO]

-

Order Confirmation

-
- - - - - -
-

Order Number: ORD-2026-03-4821

-

Date: March 28, 2026

-
-

Shipping Address:

-

John Doe
123 Maple St, Apt 4B
Springfield, IL 62704

-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ItemQtyPriceTotal
Wireless Headphones1$99.00$99.00
USB-C Charging Cable2$15.00$30.00
Smartphone Case1$25.00$25.00
Screen Protector1$10.00$10.00
Bluetooth Adapter1$35.00$35.00
-
- - - - - - - - - - - - - - - - - -
Subtotal:$199.00
Shipping:$5.00
Tax:$16.32
Total:$220.32
-
-
- - \ No newline at end of file diff --git a/evals/data/experiments/004-html-email-receipt/outputs/base/turn-2.html b/evals/data/experiments/004-html-email-receipt/outputs/base/turn-2.html deleted file mode 100644 index 6ac40b1..0000000 --- a/evals/data/experiments/004-html-email-receipt/outputs/base/turn-2.html +++ /dev/null @@ -1,126 +0,0 @@ - - - - - -Order Confirmation - - - - - - -
- - - - - - - - - - - - - - - - - -
-

[COMPANY LOGO]

-

Order Confirmation

-
- - - - - -
-

Order Number: ORD-2026-03-4821

-

Date: March 28, 2026

-
-

Shipping Address:

-

John Doe
123 Maple St, Apt 4B
Springfield, IL 62704

-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ItemQtyPriceTotal
Wireless Headphones1$99.00$99.00
USB-C Charging Cable2$15.00$30.00
Smartphone Case1$25.00$25.00
Screen Protector1$10.00$10.00
Bluetooth Adapter1$35.00$35.00
Wireless Charger1$34.99$34.99
USB-C Hub1$59.99$59.99
Monitor Stand2$89.99$179.98
-
- - - - - - - - - - - - - - - - - -
Subtotal:$473.96
Shipping:$5.00
Tax:$38.32
Total:$517.28
-
-
- - \ No newline at end of file diff --git a/evals/data/experiments/005-html-form-wizard/eval.json b/evals/data/experiments/005-html-form-wizard/eval.json deleted file mode 100644 index f5eebad..0000000 --- a/evals/data/experiments/005-html-form-wizard/eval.json +++ /dev/null @@ -1,59 +0,0 @@ -{ - "per_turn": [ - { - "turn": 0, - "sequence_similarity": 0.2743, - "token_f1": 0.552, - "base_char_count": 5146, - "aap_char_count": 4055, - "char_delta_pct": -21.2, - "lines_added": 82, - "lines_removed": 98, - "rouge_l": null, - "bleu": null - }, - { - "turn": 1, - "sequence_similarity": 0.2491, - "token_f1": 0.5034, - "base_char_count": 5998, - "aap_char_count": 4055, - "char_delta_pct": -32.4, - "lines_added": 82, - "lines_removed": 109, - "rouge_l": null, - "bleu": null - }, - { - "turn": 2, - "sequence_similarity": 0.2482, - "token_f1": 0.5007, - "base_char_count": 6034, - "aap_char_count": 4055, - "char_delta_pct": -32.8, - "lines_added": 82, - "lines_removed": 113, - "rouge_l": null, - "bleu": null - }, - { - "turn": 3, - "sequence_similarity": 0.2327, - "token_f1": 0.5228, - "base_char_count": 5905, - "aap_char_count": 4572, - "char_delta_pct": -22.6, - "lines_added": 96, - "lines_removed": 113, - "rouge_l": null, - "bleu": null - } - ], - "mean_sequence_similarity": 0.2511, - "mean_token_f1": 0.5197, - "mean_rouge_l": null, - "mean_bleu": null, - "judge_comparisons": null, - "mean_base_judge": null, - "mean_aap_judge": null -} diff --git a/evals/data/experiments/005-html-form-wizard/metrics.json b/evals/data/experiments/005-html-form-wizard/metrics.json deleted file mode 100644 index 900c14f..0000000 --- a/evals/data/experiments/005-html-form-wizard/metrics.json +++ /dev/null @@ -1,228 +0,0 @@ -{ - "experiment_id": "005-html-form-wizard", - "model": "", - "provider": "google", - "timestamp": "2026-04-03T06:48:42.292212+00:00", - "format": "text/html", - "base_turn0": { - "input_tokens": 145, - "output_tokens": 1488, - "latency_ms": 6197, - "artifact_bytes": 5059 - }, - "aap_turn0": { - "input_tokens": 484, - "output_tokens": 1239, - "latency_ms": 5216, - "artifact_bytes": 3888 - }, - "default_flow": { - "per_turn": [ - { - "turn": 1, - "edit": "Add a new Step 5 for 'Preferences' with fields for newsletter opt-in, preferred ", - "input_tokens": 1662, - "output_tokens": 1704, - "latency_ms": 7173, - "output_bytes": 5875, - "failed": false, - "failure_reason": "" - }, - { - "turn": 2, - "edit": "Update the progress bar to show 5 steps instead of 4 and change its color from b", - "input_tokens": 3397, - "output_tokens": 1754, - "latency_ms": 6595, - "output_bytes": 6085, - "failed": false, - "failure_reason": "" - }, - { - "turn": 3, - "edit": "Rewrite Step 3 (Payment) to include PayPal and Apple Pay options as radio button", - "input_tokens": 5174, - "output_tokens": 1673, - "latency_ms": 6494, - "output_bytes": 5732, - "failed": false, - "failure_reason": "" - } - ], - "total_input_tokens": 10233, - "total_output_tokens": 5131, - "total_latency_ms": 20262 - }, - "aap_flow": { - "per_turn": [ - { - "turn": 1, - "edit": "Add a new Step 5 for 'Preferences' with fields for newsletter opt-in, preferred ", - "input_tokens": 2374, - "output_tokens": 1559, - "latency_ms": 6146, - "output_bytes": 4906, - "failed": false, - "failure_reason": "", - "envelope_parsed": true, - "apply_succeeded": true, - "envelope_name": "synthesize" - }, - { - "turn": 2, - "edit": "Update the progress bar to show 5 steps instead of 4 and change its color from b", - "input_tokens": 2655, - "output_tokens": 1581, - "latency_ms": 6668, - "output_bytes": 4906, - "failed": false, - "failure_reason": "", - "envelope_parsed": true, - "apply_succeeded": true, - "envelope_name": "synthesize" - }, - { - "turn": 3, - "edit": "Rewrite Step 3 (Payment) to include PayPal and Apple Pay options as radio button", - "input_tokens": 2647, - "output_tokens": 1658, - "latency_ms": 5834, - "output_bytes": 5286, - "failed": false, - "failure_reason": "", - "envelope_parsed": true, - "apply_succeeded": true, - "envelope_name": "synthesize" - } - ], - "total_input_tokens": 7676, - "total_output_tokens": 4798, - "total_latency_ms": 18648, - "envelope_parse_rate": 1.0, - "apply_success_rate": 1.0 - }, - "comparison": { - "output_token_savings_pct": 6.5, - "input_token_savings_pct": 25.0, - "latency_savings_pct": 8.0 - }, - "token_table": { - "turns": [ - { - "turn": 0, - "base_input": 145, - "base_output": 1488, - "base_latency_ms": 6197, - "aap_input": 484, - "aap_output": 1239, - "aap_latency_ms": 5216 - }, - { - "turn": 1, - "base_input": 1662, - "base_output": 1704, - "base_latency_ms": 7173, - "aap_input": 2374, - "aap_output": 1559, - "aap_latency_ms": 6146, - "envelope_name": "synthesize", - "apply_ok": true - }, - { - "turn": 2, - "base_input": 3397, - "base_output": 1754, - "base_latency_ms": 6595, - "aap_input": 2655, - "aap_output": 1581, - "aap_latency_ms": 6668, - "envelope_name": "synthesize", - "apply_ok": true - }, - { - "turn": 3, - "base_input": 5174, - "base_output": 1673, - "base_latency_ms": 6494, - "aap_input": 2647, - "aap_output": 1658, - "aap_latency_ms": 5834, - "envelope_name": "synthesize", - "apply_ok": true - } - ], - "totals": { - "base_input": 10378, - "base_output": 6619, - "base_combined": 16997, - "aap_input": 8160, - "aap_output": 6037, - "aap_combined": 14197, - "base_latency_ms": 26459, - "aap_latency_ms": 23864, - "output_savings_pct": 8.8, - "input_delta_pct": -21.4, - "combined_savings_pct": 16.5, - "latency_savings_pct": 9.8 - } - }, - "quality": { - "per_turn": [ - { - "turn": 0, - "sequence_similarity": 0.3583, - "token_f1": 0.5755, - "base_char_count": 5059, - "aap_char_count": 3605, - "char_delta_pct": -28.7, - "lines_added": 71, - "lines_removed": 97, - "rouge_l": null, - "bleu": null - }, - { - "turn": 1, - "sequence_similarity": 0.3672, - "token_f1": 0.5764, - "base_char_count": 5875, - "aap_char_count": 4573, - "char_delta_pct": -22.2, - "lines_added": 96, - "lines_removed": 109, - "rouge_l": null, - "bleu": null - }, - { - "turn": 2, - "sequence_similarity": 0.3569, - "token_f1": 0.5697, - "base_char_count": 6085, - "aap_char_count": 4573, - "char_delta_pct": -24.8, - "lines_added": 96, - "lines_removed": 111, - "rouge_l": null, - "bleu": null - }, - { - "turn": 3, - "sequence_similarity": 0.2529, - "token_f1": 0.5502, - "base_char_count": 5732, - "aap_char_count": 4953, - "char_delta_pct": -13.6, - "lines_added": 101, - "lines_removed": 101, - "rouge_l": null, - "bleu": null - } - ], - "mean_sequence_similarity": 0.3338, - "mean_token_f1": 0.5679, - "mean_rouge_l": null, - "mean_bleu": null, - "judge_comparisons": null, - "mean_base_judge": null, - "mean_aap_judge": null - } -} diff --git a/evals/data/experiments/005-html-form-wizard/outputs/aap/turn-0.html b/evals/data/experiments/005-html-form-wizard/outputs/aap/turn-0.html deleted file mode 100644 index b825832..0000000 --- a/evals/data/experiments/005-html-form-wizard/outputs/aap/turn-0.html +++ /dev/null @@ -1,90 +0,0 @@ - - -
- -
- -
-
-
-
-
-
- -
- -

Personal Info

-
-
-
-
-
- - -

Address

-
-
-
-
- - -

Payment

-
-
-
- - -

Review

-
Please review your data.
-
- -
- - -
-
-
-
- - \ No newline at end of file diff --git a/evals/data/experiments/005-html-form-wizard/outputs/aap/turn-1.html b/evals/data/experiments/005-html-form-wizard/outputs/aap/turn-1.html deleted file mode 100644 index 3c84579..0000000 --- a/evals/data/experiments/005-html-form-wizard/outputs/aap/turn-1.html +++ /dev/null @@ -1,114 +0,0 @@ - - -
- -
- -
-
-
-
-
-
-
- -
- -

Personal Info

-
-
-
-
-
- - -

Address

-
-
-
-
- - -

Payment

-
-
-
- - -

Preferences

-
- -
-
- - -
-
- -
- - -
-
-
- - -

Review

-
Please review your data.
-
- -
- - -
-
-
-
- - \ No newline at end of file diff --git a/evals/data/experiments/005-html-form-wizard/outputs/aap/turn-1.json b/evals/data/experiments/005-html-form-wizard/outputs/aap/turn-1.json deleted file mode 100644 index 71fe93a..0000000 --- a/evals/data/experiments/005-html-form-wizard/outputs/aap/turn-1.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "wizard-form", - "version": 1, - "name": "synthesize", - "meta": { - "format": "text/html", - "tokens_used": null, - "checksum": null, - "state": null - }, - "content": [ - { - "body": "\n\n
\n \n
\n \n
\n
\n
\n
\n
\n
\n
\n\n
\n \n

Personal Info

\n
\n
\n
\n
\n
\n\n \n

Address

\n
\n
\n
\n
\n\n \n

Payment

\n
\n
\n
\n\n \n

Preferences

\n
\n \n
\n
\n \n \n
\n
\n \n
\n \n \n
\n
\n
\n\n \n

Review

\n
Please review your data.
\n
\n\n
\n \n \n
\n
\n
\n
\n\n" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/005-html-form-wizard/outputs/aap/turn-2.html b/evals/data/experiments/005-html-form-wizard/outputs/aap/turn-2.html deleted file mode 100644 index 8212647..0000000 --- a/evals/data/experiments/005-html-form-wizard/outputs/aap/turn-2.html +++ /dev/null @@ -1,114 +0,0 @@ - - -
- -
- -
-
-
-
-
-
-
- -
- -

Personal Info

-
-
-
-
-
- - -

Address

-
-
-
-
- - -

Payment

-
-
-
- - -

Preferences

-
- -
-
- - -
-
- -
- - -
-
-
- - -

Review

-
Please review your data.
-
- -
- - -
-
-
-
- - \ No newline at end of file diff --git a/evals/data/experiments/005-html-form-wizard/outputs/aap/turn-2.json b/evals/data/experiments/005-html-form-wizard/outputs/aap/turn-2.json deleted file mode 100644 index 883a3d9..0000000 --- a/evals/data/experiments/005-html-form-wizard/outputs/aap/turn-2.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "wizard-update-1", - "version": 1, - "name": "synthesize", - "meta": { - "format": "text/html", - "tokens_used": null, - "checksum": null, - "state": null - }, - "content": [ - { - "body": "\n\n
\n \n
\n \n
\n
\n
\n
\n
\n
\n
\n\n
\n \n

Personal Info

\n
\n
\n
\n
\n
\n\n \n

Address

\n
\n
\n
\n
\n\n \n

Payment

\n
\n
\n
\n\n \n

Preferences

\n
\n \n
\n
\n \n \n
\n
\n \n
\n \n \n
\n
\n
\n\n \n

Review

\n
Please review your data.
\n
\n\n
\n \n \n
\n
\n
\n
\n\n" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/005-html-form-wizard/outputs/aap/turn-3.html b/evals/data/experiments/005-html-form-wizard/outputs/aap/turn-3.html deleted file mode 100644 index da822f3..0000000 --- a/evals/data/experiments/005-html-form-wizard/outputs/aap/turn-3.html +++ /dev/null @@ -1,121 +0,0 @@ - - -
- -
- -
-
-
-
-
-
-
- -
- -

Personal Info

-
-
-
-
-
- - -

Address

-
-
-
-
- - -

Payment Method

-
-
- - - -
-
-
-
-
- - -

Preferences

-
- -
-
- - -
-
- -
- - -
-
-
- - -

Review

-
Please review your data.
-
- -
- - -
-
-
-
- - diff --git a/evals/data/experiments/005-html-form-wizard/outputs/aap/turn-3.json b/evals/data/experiments/005-html-form-wizard/outputs/aap/turn-3.json deleted file mode 100644 index d4e9bb4..0000000 --- a/evals/data/experiments/005-html-form-wizard/outputs/aap/turn-3.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "wizard-app", - "version": 1, - "name": "synthesize", - "meta": { - "format": "text/html", - "tokens_used": null, - "checksum": null, - "state": null - }, - "content": [ - { - "body": "\n\n
\n \n
\n \n
\n
\n
\n
\n
\n
\n
\n\n
\n \n

Personal Info

\n
\n
\n
\n
\n
\n\n \n

Address

\n
\n
\n
\n
\n\n \n

Payment Method

\n
\n
\n \n \n \n
\n
\n
\n
\n
\n\n \n

Preferences

\n
\n \n
\n
\n \n \n
\n
\n \n
\n \n \n
\n
\n
\n\n \n

Review

\n
Please review your data.
\n
\n\n
\n \n \n
\n
\n
\n
\n\n\n" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/005-html-form-wizard/outputs/base/turn-0.html b/evals/data/experiments/005-html-form-wizard/outputs/base/turn-0.html deleted file mode 100644 index b54f4af..0000000 --- a/evals/data/experiments/005-html-form-wizard/outputs/base/turn-0.html +++ /dev/null @@ -1,116 +0,0 @@ - - - - - -Registration Wizard - - - - -
-
-
1
-
2
-
3
-
4
-
- -
- -
-

Personal Info

-
-
-
-
-
- - -
-

Address

-
-
-
-
-
- -
-
- - -
-

Payment

-
-
-
-
-
- - -
-

Review

-
-
- -
- - -
-
-
- - - - - \ No newline at end of file diff --git a/evals/data/experiments/005-html-form-wizard/outputs/base/turn-1.html b/evals/data/experiments/005-html-form-wizard/outputs/base/turn-1.html deleted file mode 100644 index d3a1277..0000000 --- a/evals/data/experiments/005-html-form-wizard/outputs/base/turn-1.html +++ /dev/null @@ -1,127 +0,0 @@ - - - - - -Registration Wizard - - - - -
-
-
1
-
2
-
3
-
4
-
5
-
- -
-
-

Personal Info

-
-
-
-
-
- -
-

Address

-
-
-
-
-
- -
-

Payment

-
-
-
-
- -
-

Preferences

-
-
- - -
-
- -
- - -
-
-
- -
-

Review

-
-
- -
- - -
-
-
- - - - - \ No newline at end of file diff --git a/evals/data/experiments/005-html-form-wizard/outputs/base/turn-2.html b/evals/data/experiments/005-html-form-wizard/outputs/base/turn-2.html deleted file mode 100644 index 3f1065e..0000000 --- a/evals/data/experiments/005-html-form-wizard/outputs/base/turn-2.html +++ /dev/null @@ -1,129 +0,0 @@ - - - - - -Registration Wizard - - - - -
-
-
1
-
2
-
3
-
4
-
5
-
- -
-
-

Personal Info

-
-
-
-
-
- -
-

Address

-
-
-
-
-
- -
-

Payment

-
-
-
-
- -
-

Preferences

-
-
- - -
-
- -
- - -
-
-
- -
-

Review

-
-
- -
- - -
-
-
- - - - - \ No newline at end of file diff --git a/evals/data/experiments/005-html-form-wizard/outputs/base/turn-3.html b/evals/data/experiments/005-html-form-wizard/outputs/base/turn-3.html deleted file mode 100644 index d7c63ee..0000000 --- a/evals/data/experiments/005-html-form-wizard/outputs/base/turn-3.html +++ /dev/null @@ -1,121 +0,0 @@ - - - - - -Registration Wizard - - - - -
-
-
1
-
2
-
3
-
4
-
5
-
- -
- -
-

Personal Info

-
-
-
- - -
-

Address

-
-
-
- - -
-

Payment

-
- - - -
-
-
-
-
- - -
-

Preferences

-
- -
- - -
-

Review

-
-
- -
- - -
-
-
- - - - - \ No newline at end of file diff --git a/evals/data/experiments/006-html-portfolio/eval.json b/evals/data/experiments/006-html-portfolio/eval.json deleted file mode 100644 index 3d84503..0000000 --- a/evals/data/experiments/006-html-portfolio/eval.json +++ /dev/null @@ -1,59 +0,0 @@ -{ - "per_turn": [ - { - "turn": 0, - "sequence_similarity": 0.0768, - "token_f1": 0.3876, - "base_char_count": 8073, - "aap_char_count": 3405, - "char_delta_pct": -57.8, - "lines_added": 48, - "lines_removed": 88, - "rouge_l": null, - "bleu": null - }, - { - "turn": 1, - "sequence_similarity": 0.0664, - "token_f1": 0.3802, - "base_char_count": 7750, - "aap_char_count": 3418, - "char_delta_pct": -55.9, - "lines_added": 48, - "lines_removed": 78, - "rouge_l": null, - "bleu": null - }, - { - "turn": 2, - "sequence_similarity": 0.0798, - "token_f1": 0.381, - "base_char_count": 7494, - "aap_char_count": 3636, - "char_delta_pct": -51.5, - "lines_added": 47, - "lines_removed": 82, - "rouge_l": null, - "bleu": null - }, - { - "turn": 3, - "sequence_similarity": 0.0696, - "token_f1": 0.3447, - "base_char_count": 9130, - "aap_char_count": 3636, - "char_delta_pct": -60.2, - "lines_added": 47, - "lines_removed": 103, - "rouge_l": null, - "bleu": null - } - ], - "mean_sequence_similarity": 0.0731, - "mean_token_f1": 0.3734, - "mean_rouge_l": null, - "mean_bleu": null, - "judge_comparisons": null, - "mean_base_judge": null, - "mean_aap_judge": null -} diff --git a/evals/data/experiments/006-html-portfolio/metrics.json b/evals/data/experiments/006-html-portfolio/metrics.json deleted file mode 100644 index f73366a..0000000 --- a/evals/data/experiments/006-html-portfolio/metrics.json +++ /dev/null @@ -1,228 +0,0 @@ -{ - "experiment_id": "006-html-portfolio", - "model": "", - "provider": "google", - "timestamp": "2026-04-03T06:49:32.676528+00:00", - "format": "text/html", - "base_turn0": { - "input_tokens": 128, - "output_tokens": 1775, - "latency_ms": 7215, - "artifact_bytes": 5296 - }, - "aap_turn0": { - "input_tokens": 467, - "output_tokens": 1216, - "latency_ms": 5947, - "artifact_bytes": 4271 - }, - "default_flow": { - "per_turn": [ - { - "turn": 1, - "edit": "Change the developer name to 'Jordan Rivera' and title to 'Cloud Infrastructure ", - "input_tokens": 1922, - "output_tokens": 1813, - "latency_ms": 7326, - "output_bytes": 5590, - "failed": false, - "failure_reason": "" - }, - { - "turn": 2, - "edit": "Replace the projects grid with 8 project cards instead of 6, adding 'Kubernetes ", - "input_tokens": 3769, - "output_tokens": 1927, - "latency_ms": 6832, - "output_bytes": 6031, - "failed": false, - "failure_reason": "" - }, - { - "turn": 3, - "edit": "Add a new 'Blog' section after projects showing the 3 most recent blog post prev", - "input_tokens": 5723, - "output_tokens": 2399, - "latency_ms": 9523, - "output_bytes": 7482, - "failed": false, - "failure_reason": "" - } - ], - "total_input_tokens": 11414, - "total_output_tokens": 6139, - "total_latency_ms": 23681 - }, - "aap_flow": { - "per_turn": [ - { - "turn": 1, - "edit": "Change the developer name to 'Jordan Rivera' and title to 'Cloud Infrastructure ", - "input_tokens": 2341, - "output_tokens": 172, - "latency_ms": 1487, - "output_bytes": 359, - "failed": false, - "failure_reason": "", - "envelope_parsed": true, - "apply_succeeded": true, - "envelope_name": "synthesize" - }, - { - "turn": 2, - "edit": "Replace the projects grid with 8 project cards instead of 6, adding 'Kubernetes ", - "input_tokens": 0, - "output_tokens": 0, - "latency_ms": 2090, - "output_bytes": 359, - "failed": true, - "failure_reason": "parse or apply failed", - "envelope_parsed": true, - "apply_succeeded": false, - "envelope_name": "edit" - }, - { - "turn": 3, - "edit": "Add a new 'Blog' section after projects showing the 3 most recent blog post prev", - "input_tokens": 0, - "output_tokens": 0, - "latency_ms": 2387, - "output_bytes": 359, - "failed": true, - "failure_reason": "parse or apply failed", - "envelope_parsed": true, - "apply_succeeded": false, - "envelope_name": "edit" - } - ], - "total_input_tokens": 2341, - "total_output_tokens": 172, - "total_latency_ms": 5964, - "envelope_parse_rate": 1.0, - "apply_success_rate": 0.3333333333333333 - }, - "comparison": { - "output_token_savings_pct": 97.2, - "input_token_savings_pct": 79.5, - "latency_savings_pct": 74.8 - }, - "token_table": { - "turns": [ - { - "turn": 0, - "base_input": 128, - "base_output": 1775, - "base_latency_ms": 7215, - "aap_input": 467, - "aap_output": 1216, - "aap_latency_ms": 5947 - }, - { - "turn": 1, - "base_input": 1922, - "base_output": 1813, - "base_latency_ms": 7326, - "aap_input": 2341, - "aap_output": 172, - "aap_latency_ms": 1487, - "envelope_name": "synthesize", - "apply_ok": true - }, - { - "turn": 2, - "base_input": 3769, - "base_output": 1927, - "base_latency_ms": 6832, - "aap_input": 0, - "aap_output": 0, - "aap_latency_ms": 2090, - "envelope_name": "edit", - "apply_ok": false - }, - { - "turn": 3, - "base_input": 5723, - "base_output": 2399, - "base_latency_ms": 9523, - "aap_input": 0, - "aap_output": 0, - "aap_latency_ms": 2387, - "envelope_name": "edit", - "apply_ok": false - } - ], - "totals": { - "base_input": 11542, - "base_output": 7914, - "base_combined": 19456, - "aap_input": 2808, - "aap_output": 1388, - "aap_combined": 4196, - "base_latency_ms": 30896, - "aap_latency_ms": 11911, - "output_savings_pct": 82.5, - "input_delta_pct": -75.7, - "combined_savings_pct": 78.4, - "latency_savings_pct": 61.4 - } - }, - "quality": { - "per_turn": [ - { - "turn": 0, - "sequence_similarity": 0.0796, - "token_f1": 0.3141, - "base_char_count": 5294, - "aap_char_count": 3705, - "char_delta_pct": -30.0, - "lines_added": 65, - "lines_removed": 69, - "rouge_l": null, - "bleu": null - }, - { - "turn": 1, - "sequence_similarity": 0.0037, - "token_f1": 0.0074, - "base_char_count": 5588, - "aap_char_count": 359, - "char_delta_pct": -93.6, - "lines_added": 18, - "lines_removed": 80, - "rouge_l": null, - "bleu": null - }, - { - "turn": 2, - "sequence_similarity": 0.0041, - "token_f1": 0.0104, - "base_char_count": 6029, - "aap_char_count": 359, - "char_delta_pct": -94.0, - "lines_added": 18, - "lines_removed": 89, - "rouge_l": null, - "bleu": null - }, - { - "turn": 3, - "sequence_similarity": 0.0033, - "token_f1": 0.0086, - "base_char_count": 7480, - "aap_char_count": 359, - "char_delta_pct": -95.2, - "lines_added": 18, - "lines_removed": 112, - "rouge_l": null, - "bleu": null - } - ], - "mean_sequence_similarity": 0.0227, - "mean_token_f1": 0.0851, - "mean_rouge_l": null, - "mean_bleu": null, - "judge_comparisons": null, - "mean_base_judge": null, - "mean_aap_judge": null - } -} diff --git a/evals/data/experiments/006-html-portfolio/outputs/aap/turn-0.html b/evals/data/experiments/006-html-portfolio/outputs/aap/turn-0.html deleted file mode 100644 index d601ef7..0000000 --- a/evals/data/experiments/006-html-portfolio/outputs/aap/turn-0.html +++ /dev/null @@ -1,76 +0,0 @@ - - - - - Alex Chen | Portfolio - - - -
- -
-

Alex Chen

-

Full-Stack Developer

-

Building scalable web applications with a focus on clean, maintainable code.

-
-
- - -
-

About

-
-
-

I am a developer with 5+ years of experience in designing user-centric interfaces and robust server-side architectures.

-
-
-
- - -
-

Projects

-
-

Project Alpha

Cloud management dashboard.

ReactAWS
-

Project Beta

Real-time chat application.

Node.jsSocket.io
-

Project Gamma

E-commerce analytics tool.

PythonDjango
-

Project Delta

Mobile health tracker API.

GoPostgres
-

Project Epsilon

IoT sensor visualization.

VueInfluxDB
-

Project Zeta

Automated CI/CD pipeline.

DockerKubernetes
-
-
-
- - -
-

Skills

-
-

Frontend

-

Backend

-

DevOps

-
-
-
- - -
-

Contact

-
- - - - -
-
-
-
- - \ No newline at end of file diff --git a/evals/data/experiments/006-html-portfolio/outputs/aap/turn-1.html b/evals/data/experiments/006-html-portfolio/outputs/aap/turn-1.html deleted file mode 100644 index 33b2cb6..0000000 --- a/evals/data/experiments/006-html-portfolio/outputs/aap/turn-1.html +++ /dev/null @@ -1,18 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "portfolio-edit", - "version": 1, - "name": "edit", - "content": [ - { - "op": "replace", - "target": {"type": "id", "value": "name"}, - "content": "Jordan Rivera" - }, - { - "op": "replace", - "target": {"type": "id", "value": "title"}, - "content": "Cloud Infrastructure Engineer" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/006-html-portfolio/outputs/aap/turn-1.json b/evals/data/experiments/006-html-portfolio/outputs/aap/turn-1.json deleted file mode 100644 index a800ad2..0000000 --- a/evals/data/experiments/006-html-portfolio/outputs/aap/turn-1.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "portfolio-edit", - "version": 1, - "name": "synthesize", - "meta": { - "format": "text/html", - "tokens_used": null, - "checksum": null, - "state": null - }, - "content": [ - { - "body": "{\n \"protocol\": \"aap/0.1\",\n \"id\": \"portfolio-edit\",\n \"version\": 1,\n \"name\": \"edit\",\n \"content\": [\n {\n \"op\": \"replace\",\n \"target\": {\"type\": \"id\", \"value\": \"name\"},\n \"content\": \"Jordan Rivera\"\n },\n {\n \"op\": \"replace\",\n \"target\": {\"type\": \"id\", \"value\": \"title\"},\n \"content\": \"Cloud Infrastructure Engineer\"\n }\n ]\n}" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/006-html-portfolio/outputs/aap/turn-2.html b/evals/data/experiments/006-html-portfolio/outputs/aap/turn-2.html deleted file mode 100644 index 33b2cb6..0000000 --- a/evals/data/experiments/006-html-portfolio/outputs/aap/turn-2.html +++ /dev/null @@ -1,18 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "portfolio-edit", - "version": 1, - "name": "edit", - "content": [ - { - "op": "replace", - "target": {"type": "id", "value": "name"}, - "content": "Jordan Rivera" - }, - { - "op": "replace", - "target": {"type": "id", "value": "title"}, - "content": "Cloud Infrastructure Engineer" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/006-html-portfolio/outputs/aap/turn-2.json b/evals/data/experiments/006-html-portfolio/outputs/aap/turn-2.json deleted file mode 100644 index 19d962c..0000000 --- a/evals/data/experiments/006-html-portfolio/outputs/aap/turn-2.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "portfolio-edit", - "version": 2, - "name": "edit", - "meta": { - "format": "text/html", - "tokens_used": null, - "checksum": null, - "state": null - }, - "content": [ - { - "op": "replace", - "target": { - "type": "id", - "value": "projects-grid" - }, - "content": "\n

Cloud Migration

Tag: AWS, Terraform

\n

CI/CD Pipeline

Tag: Jenkins, Docker

\n

Microservices Auth

Tag: OAuth2, Go

\n

Observability Stack

Tag: Prometheus, Grafana

\n

Serverless API

Tag: AWS Lambda, Node.js

\n

Data Lake

Tag: S3, Spark

\n

Kubernetes Autoscaler

Tag: K8s, Python

\n

GraphQL Gateway

Tag: Apollo, TypeScript

\n
" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/006-html-portfolio/outputs/aap/turn-3.html b/evals/data/experiments/006-html-portfolio/outputs/aap/turn-3.html deleted file mode 100644 index 33b2cb6..0000000 --- a/evals/data/experiments/006-html-portfolio/outputs/aap/turn-3.html +++ /dev/null @@ -1,18 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "portfolio-edit", - "version": 1, - "name": "edit", - "content": [ - { - "op": "replace", - "target": {"type": "id", "value": "name"}, - "content": "Jordan Rivera" - }, - { - "op": "replace", - "target": {"type": "id", "value": "title"}, - "content": "Cloud Infrastructure Engineer" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/006-html-portfolio/outputs/aap/turn-3.json b/evals/data/experiments/006-html-portfolio/outputs/aap/turn-3.json deleted file mode 100644 index 96c40e0..0000000 --- a/evals/data/experiments/006-html-portfolio/outputs/aap/turn-3.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "portfolio-edit", - "version": 2, - "name": "edit", - "meta": { - "format": "text/html", - "tokens_used": null, - "checksum": null, - "state": null - }, - "content": [ - { - "op": "insert_after", - "target": { - "type": "id", - "value": "projects-section" - }, - "content": "
\n

Recent Blog Posts

\n
\n \n
\n

Optimizing Terraform State

\n

2023-10-25

\n

A deep dive into managing state files for large-scale environments.

\n
\n
\n

Kubernetes Networking 101

\n

2023-09-12

\n

Understanding service meshes and ingress controllers.

\n
\n
\n

The Future of Serverless

\n

2023-08-05

\n

Predictions on cloud-native event-driven architectures.

\n
\n
\n
\n
" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/006-html-portfolio/outputs/base/turn-0.html b/evals/data/experiments/006-html-portfolio/outputs/base/turn-0.html deleted file mode 100644 index def3ff0..0000000 --- a/evals/data/experiments/006-html-portfolio/outputs/base/turn-0.html +++ /dev/null @@ -1,80 +0,0 @@ - - - - - - Alex Chen | Full-Stack Developer - - - - - - - -
-

Alex Chen

-

Full-Stack Developer

-

Building robust, scalable web applications with a focus on clean code and user-centric design.

-
- - -
-
Profile Image
-
-

About Me

-

Passionate software engineer with 5+ years of experience in the full development lifecycle. I specialize in crafting elegant solutions to complex problems, leveraging modern stacks to deliver high-performance applications.

-
-
- - -
-

Skills

-
-

Frontend

-

Backend

-

DevOps

-
-
- - -
-

Projects

-
- - -
-
- - -
-

Contact Me

-
- - - - -
-
- - - - \ No newline at end of file diff --git a/evals/data/experiments/006-html-portfolio/outputs/base/turn-1.html b/evals/data/experiments/006-html-portfolio/outputs/base/turn-1.html deleted file mode 100644 index 117c0b0..0000000 --- a/evals/data/experiments/006-html-portfolio/outputs/base/turn-1.html +++ /dev/null @@ -1,80 +0,0 @@ - - - - - - Jordan Rivera | Cloud Infrastructure Engineer - - - - - - - -
-

Jordan Rivera

-

Cloud Infrastructure Engineer

-

Architecting scalable, resilient cloud environments and automating deployment pipelines for high-traffic systems.

-
- - -
-
Profile Image
-
-

About Me

-

I am a Cloud Infrastructure Engineer specializing in building secure and automated infrastructure as code. I have a passion for optimizing cloud costs, enhancing system reliability, and streamlining developer workflows through CI/CD automation.

-
-
- - -
-

Skills

-
-

Cloud Platforms

-

Infrastructure as Code

-

Automation & CI/CD

-
-
- - -
-

Projects

-
- -
-
- - -
-

Contact Me

-
- - - - -
-
- - - - \ No newline at end of file diff --git a/evals/data/experiments/006-html-portfolio/outputs/base/turn-2.html b/evals/data/experiments/006-html-portfolio/outputs/base/turn-2.html deleted file mode 100644 index 86910fb..0000000 --- a/evals/data/experiments/006-html-portfolio/outputs/base/turn-2.html +++ /dev/null @@ -1,89 +0,0 @@ - - - - - - Jordan Rivera | Cloud Infrastructure Engineer - - - - - - - -
-

Jordan Rivera

-

Cloud Infrastructure Engineer

-

Architecting scalable, resilient cloud environments and automating deployment pipelines for high-traffic systems.

-
- - -
-
Profile Image
-
-

About Me

-

I am a Cloud Infrastructure Engineer specializing in building secure and automated infrastructure as code. I have a passion for optimizing cloud costs, enhancing system reliability, and streamlining developer workflows through CI/CD automation.

-
-
- - -
-

Skills

-
-

Cloud Platforms

-

Infrastructure as Code

-

Automation & CI/CD

-
-
- - -
-

Projects

-
- -
-
- - -
-

Contact Me

-
- - - - -
-
- - - - \ No newline at end of file diff --git a/evals/data/experiments/006-html-portfolio/outputs/base/turn-3.html b/evals/data/experiments/006-html-portfolio/outputs/base/turn-3.html deleted file mode 100644 index 635e5d4..0000000 --- a/evals/data/experiments/006-html-portfolio/outputs/base/turn-3.html +++ /dev/null @@ -1,112 +0,0 @@ - - - - - - Jordan Rivera | Cloud Infrastructure Engineer - - - - - - - -
-

Jordan Rivera

-

Cloud Infrastructure Engineer

-

Architecting scalable, resilient cloud environments and automating deployment pipelines for high-traffic systems.

-
- - -
-
Profile Image
-
-

About Me

-

I am a Cloud Infrastructure Engineer specializing in building secure and automated infrastructure as code. I have a passion for optimizing cloud costs, enhancing system reliability, and streamlining developer workflows through CI/CD automation.

-
-
- - -
-

Skills

-
-

Cloud Platforms

-

Infrastructure as Code

-

Automation & CI/CD

-
-
- - -
-

Projects

-
- -
-
- - -
-

Recent Blog Posts

-
-
-

Scaling K8s Clusters

- Oct 24, 2023 -

Deep dive into horizontal pod autoscaling patterns for high traffic.

-
-
-

Terraform Best Practices

- Oct 12, 2023 -

Refining your infrastructure modules for better maintainability.

-
-
-

Serverless Security

- Sep 30, 2023 -

Securing your serverless functions against common vulnerabilities.

-
-
-
- - -
-

Contact Me

-
- - - - -
-
- - - - \ No newline at end of file diff --git a/evals/data/experiments/007-html-blog-post/eval.json b/evals/data/experiments/007-html-blog-post/eval.json deleted file mode 100644 index 89fd34e..0000000 --- a/evals/data/experiments/007-html-blog-post/eval.json +++ /dev/null @@ -1,59 +0,0 @@ -{ - "per_turn": [ - { - "turn": 0, - "sequence_similarity": 0.065, - "token_f1": 0.427, - "base_char_count": 5375, - "aap_char_count": 4803, - "char_delta_pct": -10.6, - "lines_added": 58, - "lines_removed": 72, - "rouge_l": null, - "bleu": null - }, - { - "turn": 1, - "sequence_similarity": 0.0628, - "token_f1": 0.3759, - "base_char_count": 5541, - "aap_char_count": 4803, - "char_delta_pct": -13.3, - "lines_added": 64, - "lines_removed": 81, - "rouge_l": null, - "bleu": null - }, - { - "turn": 2, - "sequence_similarity": 0.0538, - "token_f1": 0.3707, - "base_char_count": 6237, - "aap_char_count": 5872, - "char_delta_pct": -5.9, - "lines_added": 78, - "lines_removed": 98, - "rouge_l": null, - "bleu": null - }, - { - "turn": 3, - "sequence_similarity": 0.0428, - "token_f1": 0.3748, - "base_char_count": 7117, - "aap_char_count": 6934, - "char_delta_pct": -2.6, - "lines_added": 97, - "lines_removed": 116, - "rouge_l": null, - "bleu": null - } - ], - "mean_sequence_similarity": 0.0561, - "mean_token_f1": 0.3871, - "mean_rouge_l": null, - "mean_bleu": null, - "judge_comparisons": null, - "mean_base_judge": null, - "mean_aap_judge": null -} diff --git a/evals/data/experiments/007-html-blog-post/metrics.json b/evals/data/experiments/007-html-blog-post/metrics.json deleted file mode 100644 index f1ffa89..0000000 --- a/evals/data/experiments/007-html-blog-post/metrics.json +++ /dev/null @@ -1,228 +0,0 @@ -{ - "experiment_id": "007-html-blog-post", - "model": "", - "provider": "google", - "timestamp": "2026-04-03T06:50:15.524100+00:00", - "format": "text/html", - "base_turn0": { - "input_tokens": 131, - "output_tokens": 1336, - "latency_ms": 6789, - "artifact_bytes": 4998 - }, - "aap_turn0": { - "input_tokens": 470, - "output_tokens": 1422, - "latency_ms": 7382, - "artifact_bytes": 5189 - }, - "default_flow": { - "per_turn": [ - { - "turn": 1, - "edit": "Change the article title to 'Building Scalable Microservices with Go and gRPC'", - "input_tokens": 1486, - "output_tokens": 1320, - "latency_ms": 5330, - "output_bytes": 4871, - "failed": false, - "failure_reason": "" - }, - { - "turn": 2, - "edit": "Rewrite the comments section to have 6 comments instead of 4, with replies neste", - "input_tokens": 2830, - "output_tokens": 1534, - "latency_ms": 5890, - "output_bytes": 5717, - "failed": false, - "failure_reason": "" - }, - { - "turn": 3, - "edit": "Add a 'Related Articles' section after the author bio showing 3 related article ", - "input_tokens": 4387, - "output_tokens": 1748, - "latency_ms": 7756, - "output_bytes": 6455, - "failed": false, - "failure_reason": "" - } - ], - "total_input_tokens": 8703, - "total_output_tokens": 4602, - "total_latency_ms": 18976 - }, - "aap_flow": { - "per_turn": [ - { - "turn": 1, - "edit": "Change the article title to 'Building Scalable Microservices with Go and gRPC'", - "input_tokens": 2547, - "output_tokens": 170, - "latency_ms": 1721, - "output_bytes": 296, - "failed": false, - "failure_reason": "", - "envelope_parsed": true, - "apply_succeeded": true, - "envelope_name": "synthesize" - }, - { - "turn": 2, - "edit": "Rewrite the comments section to have 6 comments instead of 4, with replies neste", - "input_tokens": 1239, - "output_tokens": 514, - "latency_ms": 3465, - "output_bytes": 1770, - "failed": false, - "failure_reason": "", - "envelope_parsed": true, - "apply_succeeded": true, - "envelope_name": "synthesize" - }, - { - "turn": 3, - "edit": "Add a 'Related Articles' section after the author bio showing 3 related article ", - "input_tokens": 0, - "output_tokens": 0, - "latency_ms": 1860, - "output_bytes": 1770, - "failed": true, - "failure_reason": "parse or apply failed", - "envelope_parsed": true, - "apply_succeeded": false, - "envelope_name": "edit" - } - ], - "total_input_tokens": 3786, - "total_output_tokens": 684, - "total_latency_ms": 7046, - "envelope_parse_rate": 1.0, - "apply_success_rate": 0.6666666666666666 - }, - "comparison": { - "output_token_savings_pct": 85.1, - "input_token_savings_pct": 56.5, - "latency_savings_pct": 62.9 - }, - "token_table": { - "turns": [ - { - "turn": 0, - "base_input": 131, - "base_output": 1336, - "base_latency_ms": 6789, - "aap_input": 470, - "aap_output": 1422, - "aap_latency_ms": 7382 - }, - { - "turn": 1, - "base_input": 1486, - "base_output": 1320, - "base_latency_ms": 5330, - "aap_input": 2547, - "aap_output": 170, - "aap_latency_ms": 1721, - "envelope_name": "synthesize", - "apply_ok": true - }, - { - "turn": 2, - "base_input": 2830, - "base_output": 1534, - "base_latency_ms": 5890, - "aap_input": 1239, - "aap_output": 514, - "aap_latency_ms": 3465, - "envelope_name": "synthesize", - "apply_ok": true - }, - { - "turn": 3, - "base_input": 4387, - "base_output": 1748, - "base_latency_ms": 7756, - "aap_input": 0, - "aap_output": 0, - "aap_latency_ms": 1860, - "envelope_name": "edit", - "apply_ok": false - } - ], - "totals": { - "base_input": 8834, - "base_output": 5938, - "base_combined": 14772, - "aap_input": 4256, - "aap_output": 2106, - "aap_combined": 6362, - "base_latency_ms": 25765, - "aap_latency_ms": 14428, - "output_savings_pct": 64.5, - "input_delta_pct": -51.8, - "combined_savings_pct": 56.9, - "latency_savings_pct": 44.0 - } - }, - "quality": { - "per_turn": [ - { - "turn": 0, - "sequence_similarity": 0.1291, - "token_f1": 0.3942, - "base_char_count": 4994, - "aap_char_count": 4891, - "char_delta_pct": -2.1, - "lines_added": 64, - "lines_removed": 63, - "rouge_l": null, - "bleu": null - }, - { - "turn": 1, - "sequence_similarity": 0.0217, - "token_f1": 0.0328, - "base_char_count": 4869, - "aap_char_count": 296, - "char_delta_pct": -93.9, - "lines_added": 16, - "lines_removed": 77, - "rouge_l": null, - "bleu": null - }, - { - "turn": 2, - "sequence_similarity": 0.019, - "token_f1": 0.1312, - "base_char_count": 5715, - "aap_char_count": 1647, - "char_delta_pct": -71.2, - "lines_added": 42, - "lines_removed": 102, - "rouge_l": null, - "bleu": null - }, - { - "turn": 3, - "sequence_similarity": 0.0173, - "token_f1": 0.1227, - "base_char_count": 6453, - "aap_char_count": 1647, - "char_delta_pct": -74.5, - "lines_added": 42, - "lines_removed": 114, - "rouge_l": null, - "bleu": null - } - ], - "mean_sequence_similarity": 0.0468, - "mean_token_f1": 0.1702, - "mean_rouge_l": null, - "mean_bleu": null, - "judge_comparisons": null, - "mean_base_judge": null, - "mean_aap_judge": null - } -} diff --git a/evals/data/experiments/007-html-blog-post/outputs/aap/turn-0.html b/evals/data/experiments/007-html-blog-post/outputs/aap/turn-0.html deleted file mode 100644 index faa8c86..0000000 --- a/evals/data/experiments/007-html-blog-post/outputs/aap/turn-0.html +++ /dev/null @@ -1,75 +0,0 @@ -
- -
-

RustScale Blog

- -
-
- - -
-

Building Scalable Microservices with Rust

-

- By Alex Rivers | - October 24, 2023 | - 8 min read -

- -

Rust has emerged as a powerhouse for building microservices, primarily due to its memory safety guarantees and zero-cost abstractions. When building at scale, the ability to control memory layout and concurrency without a garbage collector provides a distinct advantage in terms of predictable latency.

- -

The first step in any robust microservice architecture is selecting the right runtime. The tokio ecosystem has become the de facto standard for asynchronous I/O. By leveraging async/await syntax, we can handle thousands of concurrent requests with minimal thread overhead.

- -
#[tokio::main]
-async fn main() -> Result<(), Box<dyn std::error::Error>> {
-    let listener = TcpListener::bind("127.0.0.1:8080").await?;
-    loop {
-        let (socket, _) = listener.accept().await?;
-        tokio::spawn(async move {
-            process(socket).await;
-        });
-    }
-}
- -

Communication between services is another critical vector for optimization. Moving away from heavy JSON-over-HTTP toward gRPC with Protobuf can significantly reduce payload size and CPU usage for serialization. Using tonic, a gRPC implementation for Rust, we get type-safe service definitions that catch contract mismatches at compile time.

- -

State management in distributed systems is notoriously difficult. When building scalable services, we often lean on shared-nothing architectures. By using Rust's ownership model, we can safely share data across threads using Arc<T> and RwLock<T>, ensuring that our shared state is protected against data races without the overhead of runtime locks in managed languages.

- -

Observability should not be an afterthought. Integrating tracing into your microservices allows for structured logging and distributed spans. This is vital when debugging a request that passes through five different services; being able to correlate spans across service boundaries is the difference between a five-minute fix and a five-hour investigation.

-
-
- - -
-

About the Author

-

Alex is a Senior Staff Engineer specializing in distributed systems and systems programming. He currently focuses on high-performance infrastructure at scale.

-
-
- - -
-

Comments

-
-
- Sarah Jenkins 2 days ago -

Great write-up. The move to tonic for our gRPC layer was a total game-changer for our latency metrics.

-
-
- Marcus Thorne 3 days ago -

How do you handle graceful shutdowns in Tokio when using long-running streams?

-
-
- Elena Rossi 5 days ago -

The section on tracing was helpful. We’ve been struggling to correlate logs in our production environment.

-
-
- David Wu 1 week ago -

Rust is definitely the future of backend microservices. The compile-time checks alone make it worth the learning curve.

-
-
-
-
-
\ No newline at end of file diff --git a/evals/data/experiments/007-html-blog-post/outputs/aap/turn-1.html b/evals/data/experiments/007-html-blog-post/outputs/aap/turn-1.html deleted file mode 100644 index 97181bb..0000000 --- a/evals/data/experiments/007-html-blog-post/outputs/aap/turn-1.html +++ /dev/null @@ -1,16 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "article-content", - "version": 1, - "name": "edit", - "content": [ - { - "op": "replace", - "target": { - "type": "pointer", - "value": "/article/h1" - }, - "content": "Building Scalable Microservices with Go and gRPC" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/007-html-blog-post/outputs/aap/turn-1.json b/evals/data/experiments/007-html-blog-post/outputs/aap/turn-1.json deleted file mode 100644 index 51e43e3..0000000 --- a/evals/data/experiments/007-html-blog-post/outputs/aap/turn-1.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "article-content", - "version": 1, - "name": "synthesize", - "meta": { - "format": "text/html", - "tokens_used": null, - "checksum": null, - "state": null - }, - "content": [ - { - "body": "{\n \"protocol\": \"aap/0.1\",\n \"id\": \"article-content\",\n \"version\": 1,\n \"name\": \"edit\",\n \"content\": [\n {\n \"op\": \"replace\",\n \"target\": {\n \"type\": \"pointer\",\n \"value\": \"/article/h1\"\n },\n \"content\": \"Building Scalable Microservices with Go and gRPC\"\n }\n ]\n}" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/007-html-blog-post/outputs/aap/turn-2.html b/evals/data/experiments/007-html-blog-post/outputs/aap/turn-2.html deleted file mode 100644 index 1cbd463..0000000 --- a/evals/data/experiments/007-html-blog-post/outputs/aap/turn-2.html +++ /dev/null @@ -1,43 +0,0 @@ -
-

Building Scalable Microservices with Go and gRPC

-
-

Comments

- -
-

Alice: Great overview! How do you handle distributed tracing?

-
- -
-

Author: Thanks! I'm using OpenTelemetry with Jaeger.

-
-
-

Bob: @Author, do you have a specific tutorial for that setup?

-
-
-
-
-
-

Charlie: Very informative. What about service discovery?

-
- -
-

Author: We currently use Consul, but moving to Kubernetes native DNS.

-
-
-
-
-
-

David: How does this perform under high load?

-
-
-

Eve: Thanks for sharing this architecture pattern.

-
-
-

Frank: Do you use any specific service mesh?

-
-
-

Grace: This is exactly what I was looking for.

-
-
-
-
\ No newline at end of file diff --git a/evals/data/experiments/007-html-blog-post/outputs/aap/turn-2.json b/evals/data/experiments/007-html-blog-post/outputs/aap/turn-2.json deleted file mode 100644 index 2a8cf6a..0000000 --- a/evals/data/experiments/007-html-blog-post/outputs/aap/turn-2.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "article-content", - "version": 1, - "name": "synthesize", - "meta": { - "format": "text/html", - "tokens_used": null, - "checksum": null, - "state": null - }, - "content": [ - { - "body": "
\n

Building Scalable Microservices with Go and gRPC

\n
\n

Comments

\n \n
\n

Alice: Great overview! How do you handle distributed tracing?

\n
\n \n
\n

Author: Thanks! I'm using OpenTelemetry with Jaeger.

\n
\n
\n

Bob: @Author, do you have a specific tutorial for that setup?

\n
\n
\n
\n
\n
\n

Charlie: Very informative. What about service discovery?

\n
\n \n
\n

Author: We currently use Consul, but moving to Kubernetes native DNS.

\n
\n
\n
\n
\n
\n

David: How does this perform under high load?

\n
\n
\n

Eve: Thanks for sharing this architecture pattern.

\n
\n
\n

Frank: Do you use any specific service mesh?

\n
\n
\n

Grace: This is exactly what I was looking for.

\n
\n
\n
\n
" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/007-html-blog-post/outputs/aap/turn-3.html b/evals/data/experiments/007-html-blog-post/outputs/aap/turn-3.html deleted file mode 100644 index 1cbd463..0000000 --- a/evals/data/experiments/007-html-blog-post/outputs/aap/turn-3.html +++ /dev/null @@ -1,43 +0,0 @@ -
-

Building Scalable Microservices with Go and gRPC

-
-

Comments

- -
-

Alice: Great overview! How do you handle distributed tracing?

-
- -
-

Author: Thanks! I'm using OpenTelemetry with Jaeger.

-
-
-

Bob: @Author, do you have a specific tutorial for that setup?

-
-
-
-
-
-

Charlie: Very informative. What about service discovery?

-
- -
-

Author: We currently use Consul, but moving to Kubernetes native DNS.

-
-
-
-
-
-

David: How does this perform under high load?

-
-
-

Eve: Thanks for sharing this architecture pattern.

-
-
-

Frank: Do you use any specific service mesh?

-
-
-

Grace: This is exactly what I was looking for.

-
-
-
-
\ No newline at end of file diff --git a/evals/data/experiments/007-html-blog-post/outputs/aap/turn-3.json b/evals/data/experiments/007-html-blog-post/outputs/aap/turn-3.json deleted file mode 100644 index b7edd25..0000000 --- a/evals/data/experiments/007-html-blog-post/outputs/aap/turn-3.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "article-content", - "version": 1, - "name": "edit", - "meta": { - "format": "text/html", - "tokens_used": null, - "checksum": null, - "state": null - }, - "content": [ - { - "op": "insert_after", - "target": { - "type": "id", - "value": "comments-section" - }, - "content": "
\n

Related Articles

\n
\n
\n

Mastering Kubernetes Deployment

\n

A comprehensive guide to managing your containerized services at scale.

\n
\n
\n

Introduction to Protocol Buffers

\n

Learn how to define efficient service interfaces for high-performance systems.

\n
\n
\n

Securing Go Microservices

\n

Best practices for authentication and authorization in a distributed Go environment.

\n
\n
\n
" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/007-html-blog-post/outputs/base/turn-0.html b/evals/data/experiments/007-html-blog-post/outputs/base/turn-0.html deleted file mode 100644 index ee7e2b4..0000000 --- a/evals/data/experiments/007-html-blog-post/outputs/base/turn-0.html +++ /dev/null @@ -1,74 +0,0 @@ - - - - - - Building Scalable Microservices with Rust | TechArchitect Blog - - - - -
- TechArchitect - -
- -
-

Building Scalable Microservices with Rust

-
By Alex Rivers | Oct 24, 2023 | 6 min read
- -

Rust has rapidly evolved from a systems programming language into a top-tier choice for building high-performance microservices. Its unique ownership model, which guarantees memory safety without a garbage collector, provides a predictable latency profile that is essential for modern distributed systems.

- -

When designing microservices, the choice of the asynchronous runtime is critical. Tokio serves as the industry standard for Rust, offering an event-driven, non-blocking I/O platform. It allows developers to handle thousands of concurrent connections efficiently, which is a common requirement in API gateways and backend services.

- -
use axum::{routing::get, Router};
-
-#[tokio::main]
-async fn main() {
-    let app = Router::new().route("/", get(|| async { "Hello, Scalable Rust!" }));
-    let listener = tokio::net::TcpListener::bind("0.0.0.0:3000").await.unwrap();
-    axum::serve(listener, app).await.unwrap();
-}
- -

Beyond the runtime, structuring your services for scalability involves effective communication strategies. gRPC, powered by Tonic in the Rust ecosystem, is often preferred over REST for inter-service communication. Its protocol buffer definition ensures strict contract enforcement between services, reducing runtime errors and improving performance through binary serialization.

- -

Error handling in Rust is another area where the language shines. Instead of using exceptions, Rust forces developers to explicitly handle possible failures using the Result type. In a microservices context, this ensures that every potential network failure or database timeout is addressed at compile time, leading to significantly more robust deployments.

- -

Finally, deployment becomes trivial with Rust’s ability to compile to small, static binaries. By utilizing minimal base Docker images like scratch or distroless, you can create service images that are under 50MB. This drastically reduces cold-start times in serverless environments and minimizes the security attack surface of your containers.

-
- -
-
-
- Alex Rivers -

Senior Backend Engineer and Rust enthusiast. Focused on performance optimization and distributed systems architecture.

-
-
- -
-

Comments (4)

-
Sarah J. - 2 days ago
Excellent guide on Tokio. I’m curious, do you have any specific recommendations for database pooling with SQLx?
-
Marcus T. - 1 day ago
The performance benefits compared to Python microservices are night and day. Great write-up!
-
Elena R. - 5 hours ago
I struggled with async lifetimes initially, but this architecture helped clarify things. Thanks.
-
David K. - 1 hour ago
Great article. Have you considered doing a follow-up on testing strategies for these services?
-
- - - \ No newline at end of file diff --git a/evals/data/experiments/007-html-blog-post/outputs/base/turn-1.html b/evals/data/experiments/007-html-blog-post/outputs/base/turn-1.html deleted file mode 100644 index 7cbb1d5..0000000 --- a/evals/data/experiments/007-html-blog-post/outputs/base/turn-1.html +++ /dev/null @@ -1,77 +0,0 @@ - - - - - - Building Scalable Microservices with Go and gRPC | TechArchitect Blog - - - - -
- TechArchitect - -
- -
-

Building Scalable Microservices with Go and gRPC

-
By Alex Rivers | Oct 24, 2023 | 6 min read
- -

Go has established itself as the lingua franca of cloud-native development. Its simplicity, combined with built-in concurrency primitives like goroutines, makes it an ideal candidate for building high-throughput microservices that can scale horizontally across distributed clusters.

- -

When designing microservices, the choice of the communication protocol is paramount. gRPC, built on top of HTTP/2, provides a high-performance, language-agnostic RPC framework that is natively supported by Go. It allows developers to define services using Protocol Buffers, ensuring type safety and efficient serialization for inter-service communication.

- -
import (
-    "google.golang.org/grpc"
-    "net"
-)
-
-func main() {
-    lis, _ := net.Listen("tcp", ":50051")
-    s := grpc.NewServer()
-    // Register your gRPC services here
-    s.Serve(lis)
-}
- -

Scaling microservices isn't just about the network protocol; it's about handling state and concurrency effectively. Go’s model of "sharing memory by communicating" via channels allows developers to manage internal state without the complexity of traditional locking mechanisms, leading to cleaner and more maintainable codebase structures.

- -

Error handling in Go is explicit and intentional. By treating errors as values, the language forces developers to consider failure states at every step of a request lifecycle. In a microservices architecture, this discipline ensures that failures in one part of the system don't cascade silently, allowing for better circuit breaking and graceful degradation.

- -

Finally, the operational simplicity of Go is unmatched. Compiling a service into a single static binary simplifies the CI/CD pipeline significantly. When containerized, these small, self-contained binaries allow for rapid scaling in Kubernetes, with extremely fast startup times that are essential for handling unpredictable traffic spikes in production.

-
- -
-
-
- Alex Rivers -

Senior Backend Engineer and Go proponent. Specialized in building resilient cloud-native infrastructures and gRPC-based architectures.

-
-
- -
-

Comments (4)

-
Sarah J. - 2 days ago
Excellent guide on gRPC implementation. How do you handle interceptors for authentication in your Go services?
-
Marcus T. - 1 day ago
Go's concurrency model really makes the difference when managing hundreds of gRPC streams. Great write-up!
-
Elena R. - 5 hours ago
I've been transitioning from Python to Go and this architecture approach makes so much sense. Thanks.
-
David K. - 1 hour ago
Great article. Do you have a preferred library for generating protobuf code from your Go structures?
-
- - - \ No newline at end of file diff --git a/evals/data/experiments/007-html-blog-post/outputs/base/turn-2.html b/evals/data/experiments/007-html-blog-post/outputs/base/turn-2.html deleted file mode 100644 index 56c5221..0000000 --- a/evals/data/experiments/007-html-blog-post/outputs/base/turn-2.html +++ /dev/null @@ -1,103 +0,0 @@ - - - - - - Building Scalable Microservices with Go and gRPC | TechArchitect Blog - - - - -
- TechArchitect - -
- -
-

Building Scalable Microservices with Go and gRPC

-
By Alex Rivers | Oct 24, 2023 | 6 min read
- -

Go has established itself as the lingua franca of cloud-native development. Its simplicity, combined with built-in concurrency primitives like goroutines, makes it an ideal candidate for building high-throughput microservices that can scale horizontally across distributed clusters.

- -

When designing microservices, the choice of the communication protocol is paramount. gRPC, built on top of HTTP/2, provides a high-performance, language-agnostic RPC framework that is natively supported by Go. It allows developers to define services using Protocol Buffers, ensuring type safety and efficient serialization for inter-service communication.

- -
import (
-    "google.golang.org/grpc"
-    "net"
-)
-
-func main() {
-    lis, _ := net.Listen("tcp", ":50051")
-    s := grpc.NewServer()
-    // Register your gRPC services here
-    s.Serve(lis)
-}
- -

Scaling microservices isn't just about the network protocol; it's about handling state and concurrency effectively. Go’s model of "sharing memory by communicating" via channels allows developers to manage internal state without the complexity of traditional locking mechanisms, leading to cleaner and more maintainable codebase structures.

- -

Error handling in Go is explicit and intentional. By treating errors as values, the language forces developers to consider failure states at every step of a request lifecycle. In a microservices architecture, this discipline ensures that failures in one part of the system don't cascade silently, allowing for better circuit breaking and graceful degradation.

- -

Finally, the operational simplicity of Go is unmatched. Compiling a service into a single static binary simplifies the CI/CD pipeline significantly. When containerized, these small, self-contained binaries allow for rapid scaling in Kubernetes, with extremely fast startup times that are essential for handling unpredictable traffic spikes in production.

-
- -
-
-
- Alex Rivers -

Senior Backend Engineer and Go proponent. Specialized in building resilient cloud-native infrastructures and gRPC-based architectures.

-
-
- -
-

Comments (6)

- -
-
Sarah J. - 2 days ago
- Excellent guide on gRPC implementation. How do you handle interceptors for authentication in your Go services? -
-
Alex Rivers (Author) - 1 day ago
- I typically use unary interceptors to inspect the Metadata for JWT tokens. It keeps the authentication logic decoupled from the actual business handlers. -
-
- -
-
Marcus T. - 1 day ago
- Go's concurrency model really makes the difference when managing hundreds of gRPC streams. Great write-up! -
-
Sarah J. - 12 hours ago
- Totally agree, Marcus. The ability to spawn goroutines per stream makes handling long-lived connections so much easier compared to thread-based models. -
-
- -
-
Elena R. - 5 hours ago
- I've been transitioning from Python to Go and this architecture approach makes so much sense. Thanks. -
- -
-
David K. - 1 hour ago
- Great article. Do you have a preferred library for generating protobuf code from your Go structures? -
-
- - - \ No newline at end of file diff --git a/evals/data/experiments/007-html-blog-post/outputs/base/turn-3.html b/evals/data/experiments/007-html-blog-post/outputs/base/turn-3.html deleted file mode 100644 index 7ed0dac..0000000 --- a/evals/data/experiments/007-html-blog-post/outputs/base/turn-3.html +++ /dev/null @@ -1,115 +0,0 @@ - - - - - - Building Scalable Microservices with Go and gRPC | TechArchitect Blog - - - - -
- TechArchitect - -
- -
-

Building Scalable Microservices with Go and gRPC

-
By Alex Rivers | Oct 24, 2023 | 6 min read
- -

Go has established itself as the lingua franca of cloud-native development. Its simplicity, combined with built-in concurrency primitives like goroutines, makes it an ideal candidate for building high-throughput microservices that can scale horizontally across distributed clusters.

- -

When designing microservices, the choice of the communication protocol is paramount. gRPC, built on top of HTTP/2, provides a high-performance, language-agnostic RPC framework that is natively supported by Go. It allows developers to define services using Protocol Buffers, ensuring type safety and efficient serialization for inter-service communication.

- -
import (
-    "google.golang.org/grpc"
-    "net"
-)
-
-func main() {
-    lis, _ := net.Listen("tcp", ":50051")
-    s := grpc.NewServer()
-    // Register your gRPC services here
-    s.Serve(lis)
-}
- -

Scaling microservices isn't just about the network protocol; it's about handling state and concurrency effectively. Go’s model of "sharing memory by communicating" via channels allows developers to manage internal state without the complexity of traditional locking mechanisms, leading to cleaner and more maintainable codebase structures.

- -

Error handling in Go is explicit and intentional. By treating errors as values, the language forces developers to consider failure states at every step of a request lifecycle. In a microservices architecture, this discipline ensures that failures in one part of the system don't cascade silently, allowing for better circuit breaking and graceful degradation.

- -

Finally, the operational simplicity of Go is unmatched. Compiling a service into a single static binary simplifies the CI/CD pipeline significantly. When containerized, these small, self-contained binaries allow for rapid scaling in Kubernetes, with extremely fast startup times that are essential for handling unpredictable traffic spikes in production.

-
- -
-
-
- Alex Rivers -

Senior Backend Engineer and Go proponent. Specialized in building resilient cloud-native infrastructures and gRPC-based architectures.

-
-
- -
-

Related Articles

- -
- -
-

Comments (6)

- -
-
Sarah J. - 2 days ago
- Excellent guide on gRPC implementation. How do you handle interceptors for authentication in your Go services? -
-
Alex Rivers (Author) - 1 day ago
- I typically use unary interceptors to inspect the Metadata for JWT tokens. It keeps the authentication logic decoupled from the actual business handlers. -
-
- -
-
Marcus T. - 1 day ago
- Go's concurrency model really makes the difference when managing hundreds of gRPC streams. Great write-up! -
-
Sarah J. - 12 hours ago
- Totally agree, Marcus. The ability to spawn goroutines per stream makes handling long-lived connections so much easier compared to thread-based models. -
-
- -
-
Elena R. - 5 hours ago
- I've been transitioning from Python to Go and this architecture approach makes so much sense. Thanks. -
- -
-
David K. - 1 hour ago
- Great article. Do you have a preferred library for generating protobuf code from your Go structures? -
-
- - - \ No newline at end of file diff --git a/evals/data/experiments/008-html-admin-users/eval.json b/evals/data/experiments/008-html-admin-users/eval.json deleted file mode 100644 index 34c8229..0000000 --- a/evals/data/experiments/008-html-admin-users/eval.json +++ /dev/null @@ -1,71 +0,0 @@ -{ - "per_turn": [ - { - "turn": 0, - "sequence_similarity": 0.12, - "token_f1": 0.4396, - "base_char_count": 3970, - "aap_char_count": 3811, - "char_delta_pct": -4.0, - "lines_added": 71, - "lines_removed": 80, - "rouge_l": null, - "bleu": null - }, - { - "turn": 1, - "sequence_similarity": 0.1222, - "token_f1": 0.3846, - "base_char_count": 4434, - "aap_char_count": 2994, - "char_delta_pct": -32.5, - "lines_added": 50, - "lines_removed": 87, - "rouge_l": null, - "bleu": null - }, - { - "turn": 2, - "sequence_similarity": 0.1081, - "token_f1": 0.2337, - "base_char_count": 4609, - "aap_char_count": 1109, - "char_delta_pct": -75.9, - "lines_added": 21, - "lines_removed": 99, - "rouge_l": null, - "bleu": null - }, - { - "turn": 3, - "sequence_similarity": 0.0673, - "token_f1": 0.1625, - "base_char_count": 4649, - "aap_char_count": 820, - "char_delta_pct": -82.4, - "lines_added": 20, - "lines_removed": 100, - "rouge_l": null, - "bleu": null - }, - { - "turn": 4, - "sequence_similarity": 0.0735, - "token_f1": 0.2852, - "base_char_count": 4967, - "aap_char_count": 1889, - "char_delta_pct": -62.0, - "lines_added": 44, - "lines_removed": 105, - "rouge_l": null, - "bleu": null - } - ], - "mean_sequence_similarity": 0.0982, - "mean_token_f1": 0.3011, - "mean_rouge_l": null, - "mean_bleu": null, - "judge_comparisons": null, - "mean_base_judge": null, - "mean_aap_judge": null -} diff --git a/evals/data/experiments/008-html-admin-users/metrics.json b/evals/data/experiments/008-html-admin-users/metrics.json deleted file mode 100644 index b7c97ac..0000000 --- a/evals/data/experiments/008-html-admin-users/metrics.json +++ /dev/null @@ -1,274 +0,0 @@ -{ - "experiment_id": "008-html-admin-users", - "model": "", - "provider": "google", - "timestamp": "2026-04-03T06:50:55.786306+00:00", - "format": "text/html", - "base_turn0": { - "input_tokens": 121, - "output_tokens": 1562, - "latency_ms": 6474, - "artifact_bytes": 5239 - }, - "aap_turn0": { - "input_tokens": 460, - "output_tokens": 1099, - "latency_ms": 5909, - "artifact_bytes": 3532 - }, - "default_flow": { - "per_turn": [ - { - "turn": 1, - "edit": "Add a 'Department' column to the users table between 'role' and 'status badge' w", - "input_tokens": 1708, - "output_tokens": 1726, - "latency_ms": 6636, - "output_bytes": 5761, - "failed": false, - "failure_reason": "" - }, - { - "turn": 2, - "edit": "Update the toolbar to include a 'Export CSV' button and a 'Deactivate Selected' ", - "input_tokens": 3457, - "output_tokens": 1819, - "latency_ms": 7993, - "output_bytes": 6063, - "failed": false, - "failure_reason": "" - }, - { - "turn": 3, - "edit": "Add 20 more rows to the users table with users who have 'Viewer' and 'Editor' ro", - "input_tokens": 5300, - "output_tokens": 1882, - "latency_ms": 8025, - "output_bytes": 6290, - "failed": false, - "failure_reason": "" - }, - { - "turn": 4, - "edit": "Change all status badges to use pill-shaped styling with colors: green for activ", - "input_tokens": 7208, - "output_tokens": 1904, - "latency_ms": 7276, - "output_bytes": 6350, - "failed": false, - "failure_reason": "" - } - ], - "total_input_tokens": 17673, - "total_output_tokens": 7331, - "total_latency_ms": 29930 - }, - "aap_flow": { - "per_turn": [ - { - "turn": 1, - "edit": "Add a 'Department' column to the users table between 'role' and 'status badge' w", - "input_tokens": 0, - "output_tokens": 0, - "latency_ms": 1789, - "output_bytes": 3532, - "failed": true, - "failure_reason": "parse or apply failed", - "envelope_parsed": true, - "apply_succeeded": false, - "envelope_name": "edit" - }, - { - "turn": 2, - "edit": "Update the toolbar to include a 'Export CSV' button and a 'Deactivate Selected' ", - "input_tokens": 2228, - "output_tokens": 281, - "latency_ms": 2492, - "output_bytes": 1426, - "failed": false, - "failure_reason": "", - "envelope_parsed": true, - "apply_succeeded": true, - "envelope_name": "edit" - }, - { - "turn": 3, - "edit": "Add 20 more rows to the users table with users who have 'Viewer' and 'Editor' ro", - "input_tokens": 1577, - "output_tokens": 2352, - "latency_ms": 8293, - "output_bytes": 6611, - "failed": false, - "failure_reason": "", - "envelope_parsed": true, - "apply_succeeded": true, - "envelope_name": "synthesize" - }, - { - "turn": 4, - "edit": "Change all status badges to use pill-shaped styling with colors: green for activ", - "input_tokens": 3421, - "output_tokens": 3415, - "latency_ms": 12290, - "output_bytes": 10761, - "failed": false, - "failure_reason": "", - "envelope_parsed": true, - "apply_succeeded": true, - "envelope_name": "edit" - } - ], - "total_input_tokens": 7226, - "total_output_tokens": 6048, - "total_latency_ms": 24864, - "envelope_parse_rate": 1.0, - "apply_success_rate": 0.75 - }, - "comparison": { - "output_token_savings_pct": 17.5, - "input_token_savings_pct": 59.1, - "latency_savings_pct": 16.9 - }, - "token_table": { - "turns": [ - { - "turn": 0, - "base_input": 121, - "base_output": 1562, - "base_latency_ms": 6474, - "aap_input": 460, - "aap_output": 1099, - "aap_latency_ms": 5909 - }, - { - "turn": 1, - "base_input": 1708, - "base_output": 1726, - "base_latency_ms": 6636, - "aap_input": 0, - "aap_output": 0, - "aap_latency_ms": 1789, - "envelope_name": "edit", - "apply_ok": false - }, - { - "turn": 2, - "base_input": 3457, - "base_output": 1819, - "base_latency_ms": 7993, - "aap_input": 2228, - "aap_output": 281, - "aap_latency_ms": 2492, - "envelope_name": "edit", - "apply_ok": true - }, - { - "turn": 3, - "base_input": 5300, - "base_output": 1882, - "base_latency_ms": 8025, - "aap_input": 1577, - "aap_output": 2352, - "aap_latency_ms": 8293, - "envelope_name": "synthesize", - "apply_ok": true - }, - { - "turn": 4, - "base_input": 7208, - "base_output": 1904, - "base_latency_ms": 7276, - "aap_input": 3421, - "aap_output": 3415, - "aap_latency_ms": 12290, - "envelope_name": "edit", - "apply_ok": true - } - ], - "totals": { - "base_input": 17794, - "base_output": 8893, - "base_combined": 26687, - "aap_input": 7686, - "aap_output": 7147, - "aap_combined": 14833, - "base_latency_ms": 36404, - "aap_latency_ms": 30773, - "output_savings_pct": 19.6, - "input_delta_pct": -56.8, - "combined_savings_pct": 44.4, - "latency_savings_pct": 15.5 - } - }, - "quality": { - "per_turn": [ - { - "turn": 0, - "sequence_similarity": 0.336, - "token_f1": 0.5665, - "base_char_count": 5239, - "aap_char_count": 3439, - "char_delta_pct": -34.4, - "lines_added": 64, - "lines_removed": 91, - "rouge_l": null, - "bleu": null - }, - { - "turn": 1, - "sequence_similarity": 0.2761, - "token_f1": 0.5357, - "base_char_count": 5761, - "aap_char_count": 3439, - "char_delta_pct": -40.3, - "lines_added": 64, - "lines_removed": 93, - "rouge_l": null, - "bleu": null - }, - { - "turn": 2, - "sequence_similarity": 0.0824, - "token_f1": 0.28, - "base_char_count": 6063, - "aap_char_count": 1362, - "char_delta_pct": -77.5, - "lines_added": 23, - "lines_removed": 95, - "rouge_l": null, - "bleu": null - }, - { - "turn": 3, - "sequence_similarity": 0.1215, - "token_f1": 0.279, - "base_char_count": 6290, - "aap_char_count": 6520, - "char_delta_pct": 3.7, - "lines_added": 59, - "lines_removed": 104, - "rouge_l": null, - "bleu": null - }, - { - "turn": 4, - "sequence_similarity": 0.0848, - "token_f1": 0.2338, - "base_char_count": 6350, - "aap_char_count": 10630, - "char_delta_pct": 67.4, - "lines_added": 59, - "lines_removed": 103, - "rouge_l": null, - "bleu": null - } - ], - "mean_sequence_similarity": 0.1802, - "mean_token_f1": 0.379, - "mean_rouge_l": null, - "mean_bleu": null, - "judge_comparisons": null, - "mean_base_judge": null, - "mean_aap_judge": null - } -} diff --git a/evals/data/experiments/008-html-admin-users/outputs/aap/turn-0.html b/evals/data/experiments/008-html-admin-users/outputs/aap/turn-0.html deleted file mode 100644 index c3aa97d..0000000 --- a/evals/data/experiments/008-html-admin-users/outputs/aap/turn-0.html +++ /dev/null @@ -1,67 +0,0 @@ -
- -
- -
-

User Management

-
- - - -
-
- -
- - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - -
UserEmailRoleStatusLast LoginActions
John Doe
john@example.comAdminActive2 mins ago...
-
- -
- -
- - - - - -
-
-
-
-
\ No newline at end of file diff --git a/evals/data/experiments/008-html-admin-users/outputs/aap/turn-1.html b/evals/data/experiments/008-html-admin-users/outputs/aap/turn-1.html deleted file mode 100644 index c3aa97d..0000000 --- a/evals/data/experiments/008-html-admin-users/outputs/aap/turn-1.html +++ /dev/null @@ -1,67 +0,0 @@ -
- -
- -
-

User Management

-
- - - -
-
- -
- - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - -
UserEmailRoleStatusLast LoginActions
John Doe
john@example.comAdminActive2 mins ago...
-
- -
- -
- - - - - -
-
-
-
-
\ No newline at end of file diff --git a/evals/data/experiments/008-html-admin-users/outputs/aap/turn-1.json b/evals/data/experiments/008-html-admin-users/outputs/aap/turn-1.json deleted file mode 100644 index b092310..0000000 --- a/evals/data/experiments/008-html-admin-users/outputs/aap/turn-1.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "user-management-page-001", - "version": 1, - "name": "edit", - "meta": { - "format": "text/html", - "tokens_used": null, - "checksum": null, - "state": null - }, - "content": [ - { - "op": "replace", - "target": { - "type": "id", - "value": "users-table" - }, - "content": " Role\n Department\n Status" - }, - { - "op": "replace", - "target": { - "type": "id", - "value": "user-rows" - }, - "content": " Admin\n Engineering\n Active" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/008-html-admin-users/outputs/aap/turn-2.html b/evals/data/experiments/008-html-admin-users/outputs/aap/turn-2.html deleted file mode 100644 index ef9d240..0000000 --- a/evals/data/experiments/008-html-admin-users/outputs/aap/turn-2.html +++ /dev/null @@ -1,26 +0,0 @@ -
- - - - - -
- -
- - - - - -
-
-
-
- \ No newline at end of file diff --git a/evals/data/experiments/008-html-admin-users/outputs/aap/turn-2.json b/evals/data/experiments/008-html-admin-users/outputs/aap/turn-2.json deleted file mode 100644 index 459167b..0000000 --- a/evals/data/experiments/008-html-admin-users/outputs/aap/turn-2.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "user-management-page-001", - "version": 1, - "name": "edit", - "meta": { - "format": "text/html", - "tokens_used": null, - "checksum": null, - "state": null - }, - "content": [ - { - "op": "replace", - "target": { - "type": "id", - "value": "user-management-page" - }, - "content": "\n \n \n " - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/008-html-admin-users/outputs/aap/turn-3.html b/evals/data/experiments/008-html-admin-users/outputs/aap/turn-3.html deleted file mode 100644 index ad4363a..0000000 --- a/evals/data/experiments/008-html-admin-users/outputs/aap/turn-3.html +++ /dev/null @@ -1,62 +0,0 @@ -
- -
- - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
NameEmailRole
User 1u1@example.comViewer
User 2u2@example.comEditor
User 3u3@example.comViewer
User 4u4@example.comEditor
User 5u5@example.comViewer
User 6u6@example.comEditor
User 7u7@example.comViewer
User 8u8@example.comEditor
User 9u9@example.comViewer
User 10u10@example.comEditor
User 11u11@example.comViewer
User 12u12@example.comEditor
User 13u13@example.comViewer
User 14u14@example.comEditor
User 15u15@example.comViewer
User 16u16@example.comEditor
User 17u17@example.comViewer
User 18u18@example.comEditor
User 19u19@example.comViewer
User 20u20@example.comEditor
- -
- -
- - - - - -
-
-
-
\ No newline at end of file diff --git a/evals/data/experiments/008-html-admin-users/outputs/aap/turn-3.json b/evals/data/experiments/008-html-admin-users/outputs/aap/turn-3.json deleted file mode 100644 index 76fcbc2..0000000 --- a/evals/data/experiments/008-html-admin-users/outputs/aap/turn-3.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "user-table-update", - "version": 1, - "name": "synthesize", - "meta": { - "format": "text/html", - "tokens_used": null, - "checksum": null, - "state": null - }, - "content": [ - { - "body": "
\n \n
\n \n \n \n \n
\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
NameEmailRole
User 1u1@example.comViewer
User 2u2@example.comEditor
User 3u3@example.comViewer
User 4u4@example.comEditor
User 5u5@example.comViewer
User 6u6@example.comEditor
User 7u7@example.comViewer
User 8u8@example.comEditor
User 9u9@example.comViewer
User 10u10@example.comEditor
User 11u11@example.comViewer
User 12u12@example.comEditor
User 13u13@example.comViewer
User 14u14@example.comEditor
User 15u15@example.comViewer
User 16u16@example.comEditor
User 17u17@example.comViewer
User 18u18@example.comEditor
User 19u19@example.comViewer
User 20u20@example.comEditor
\n\n
\n \n
\n \n \n \n \n \n
\n
\n
\n
" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/008-html-admin-users/outputs/aap/turn-4.html b/evals/data/experiments/008-html-admin-users/outputs/aap/turn-4.html deleted file mode 100644 index eaabc47..0000000 --- a/evals/data/experiments/008-html-admin-users/outputs/aap/turn-4.html +++ /dev/null @@ -1,62 +0,0 @@ -
- -
- - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ",op: - -
NameEmailRole
User 1u1@example.comViewerActive
User 2u2@example.comEditorSuspended
User 3u3@example.comViewerInactive
User 4u4@example.comEditorActive
User 5u5@example.comViewerActive
User 6u6@example.comEditorSuspended
User 7u7@example.comViewerInactive
User 8u8@example.comEditorActive
User 9u9@example.comViewerActive
User 10u10@example.comEditorSuspended
User 11u11@example.comViewerInactive
User 12u12@example.comEditorActive
User 13u13@example.comViewerActive
User 14u14@example.comEditorSuspended
User 15u15@example.comViewerInactive
User 16u16@example.comEditorActive
User 17u17@example.comViewerActive
User 18u18@example.comEditorSuspended
User 19u19@example.comViewerInactive
User 20u20@example.comEditorActive
- -
- -
- - - - - -
-
-
-
\ No newline at end of file diff --git a/evals/data/experiments/008-html-admin-users/outputs/aap/turn-4.json b/evals/data/experiments/008-html-admin-users/outputs/aap/turn-4.json deleted file mode 100644 index 96dc28a..0000000 --- a/evals/data/experiments/008-html-admin-users/outputs/aap/turn-4.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "user-management-page", - "version": 1, - "name": "edit", - "meta": { - "format": "text/html", - "tokens_used": null, - "checksum": null, - "state": null - }, - "content": [ - { - "op": "replace", - "target": { - "type": "id", - "value": "user-rows" - }, - "content": "\n User 1u1@example.comViewerActive\n User 2u2@example.comEditorSuspended\n User 3u3@example.comViewerInactive\n User 4u4@example.comEditorActive\n User 5u5@example.comViewerActive\n User 6u6@example.comEditorSuspended\n User 7u7@example.comViewerInactive\n User 8u8@example.comEditorActive\n User 9u9@example.comViewerActive\n User 10u10@example.comEditorSuspended\n User 11u11@example.comViewerInactive\n User 12u12@example.comEditorActive\n User 13u13@example.comViewerActive\n User 14u14@example.comEditorSuspended\n User 15u15@example.comViewerInactive\n User 16u16@example.comEditorActive\n User 17u17@example.comViewerActive\n User 18u18@example.comEditorSuspended\n User 19u19@example.comViewerInactive\n User 20u20@example.comEditorActive\n \",op:" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/008-html-admin-users/outputs/base/turn-0.html b/evals/data/experiments/008-html-admin-users/outputs/base/turn-0.html deleted file mode 100644 index 85f5961..0000000 --- a/evals/data/experiments/008-html-admin-users/outputs/base/turn-0.html +++ /dev/null @@ -1,94 +0,0 @@ - - - - - - User Management Admin - - - -
- -
-
- - -
- -
- - -
- - - - -
- - -
- - - - - - - - - - - - - - - -
UserEmailRoleStatusLast LoginActions
-
- - -
- -
- - - - - -
-
-
- - - - \ No newline at end of file diff --git a/evals/data/experiments/008-html-admin-users/outputs/base/turn-1.html b/evals/data/experiments/008-html-admin-users/outputs/base/turn-1.html deleted file mode 100644 index 1222b2c..0000000 --- a/evals/data/experiments/008-html-admin-users/outputs/base/turn-1.html +++ /dev/null @@ -1,96 +0,0 @@ - - - - - - User Management Admin - - - -
- -
-
- - -
- -
- - -
- - - - -
- - -
- - - - - - - - - - - - - - - - -
UserEmailRoleDepartmentStatusLast LoginActions
-
- - -
- -
- - - - -
-
-
- - - - \ No newline at end of file diff --git a/evals/data/experiments/008-html-admin-users/outputs/base/turn-2.html b/evals/data/experiments/008-html-admin-users/outputs/base/turn-2.html deleted file mode 100644 index fe8aa3f..0000000 --- a/evals/data/experiments/008-html-admin-users/outputs/base/turn-2.html +++ /dev/null @@ -1,98 +0,0 @@ - - - - - - User Management Admin - - - -
- -
-
- - - -
- -
- - -
- - - - -
- - -
- - - - - - - - - - - - - - - - -
UserEmailRoleDepartmentStatusLast LoginActions
-
- - -
- -
- - - - -
-
-
- - - - \ No newline at end of file diff --git a/evals/data/experiments/008-html-admin-users/outputs/base/turn-3.html b/evals/data/experiments/008-html-admin-users/outputs/base/turn-3.html deleted file mode 100644 index 6dc2cdc..0000000 --- a/evals/data/experiments/008-html-admin-users/outputs/base/turn-3.html +++ /dev/null @@ -1,107 +0,0 @@ - - - - - - User Management Admin - - - -
- -
-
- - - -
- -
- - -
- - - - -
- - -
- - - - - - - - - - - - - - - - -
UserEmailRoleDepartmentStatusLast LoginActions
-
- - -
- -
- - - -
-
-
- - - - \ No newline at end of file diff --git a/evals/data/experiments/008-html-admin-users/outputs/base/turn-4.html b/evals/data/experiments/008-html-admin-users/outputs/base/turn-4.html deleted file mode 100644 index cd57110..0000000 --- a/evals/data/experiments/008-html-admin-users/outputs/base/turn-4.html +++ /dev/null @@ -1,106 +0,0 @@ - - - - - - User Management Admin - - - -
- -
-
- - - -
- -
- - -
- - - - -
- - -
- - - - - - - - - - - - - - - - -
UserEmailRoleDepartmentStatusLast LoginActions
-
- - -
- -
- - - -
-
-
- - - - \ No newline at end of file diff --git a/evals/data/experiments/009-html-invoice/eval.json b/evals/data/experiments/009-html-invoice/eval.json deleted file mode 100644 index 3290005..0000000 --- a/evals/data/experiments/009-html-invoice/eval.json +++ /dev/null @@ -1,47 +0,0 @@ -{ - "per_turn": [ - { - "turn": 0, - "sequence_similarity": 0.2483, - "token_f1": 0.4163, - "base_char_count": 6302, - "aap_char_count": 3169, - "char_delta_pct": -49.7, - "lines_added": 72, - "lines_removed": 63, - "rouge_l": null, - "bleu": null - }, - { - "turn": 1, - "sequence_similarity": 0.2352, - "token_f1": 0.4139, - "base_char_count": 6304, - "aap_char_count": 3169, - "char_delta_pct": -49.7, - "lines_added": 72, - "lines_removed": 63, - "rouge_l": null, - "bleu": null - }, - { - "turn": 2, - "sequence_similarity": 0.4199, - "token_f1": 0.4165, - "base_char_count": 7678, - "aap_char_count": 4533, - "char_delta_pct": -41.0, - "lines_added": 95, - "lines_removed": 66, - "rouge_l": null, - "bleu": null - } - ], - "mean_sequence_similarity": 0.3011, - "mean_token_f1": 0.4156, - "mean_rouge_l": null, - "mean_bleu": null, - "judge_comparisons": null, - "mean_base_judge": null, - "mean_aap_judge": null -} diff --git a/evals/data/experiments/009-html-invoice/metrics.json b/evals/data/experiments/009-html-invoice/metrics.json deleted file mode 100644 index c88335b..0000000 --- a/evals/data/experiments/009-html-invoice/metrics.json +++ /dev/null @@ -1,182 +0,0 @@ -{ - "experiment_id": "009-html-invoice", - "model": "", - "provider": "google", - "timestamp": "2026-04-03T06:52:03.069879+00:00", - "format": "text/html", - "base_turn0": { - "input_tokens": 138, - "output_tokens": 2118, - "latency_ms": 8817, - "artifact_bytes": 6157 - }, - "aap_turn0": { - "input_tokens": 477, - "output_tokens": 2116, - "latency_ms": 8762, - "artifact_bytes": 5983 - }, - "default_flow": { - "per_turn": [ - { - "turn": 1, - "edit": "Change the company name to 'NovaTech Industries' and invoice number to INV-2026-", - "input_tokens": 2283, - "output_tokens": 2118, - "latency_ms": 9584, - "output_bytes": 6159, - "failed": false, - "failure_reason": "" - }, - { - "turn": 2, - "edit": "Add 4 more line items: Cloud Hosting Setup ($2,400), SSL Certificate ($199), Dat", - "input_tokens": 4445, - "output_tokens": 2630, - "latency_ms": 10507, - "output_bytes": 7530, - "failed": false, - "failure_reason": "" - } - ], - "total_input_tokens": 6728, - "total_output_tokens": 4748, - "total_latency_ms": 20091 - }, - "aap_flow": { - "per_turn": [ - { - "turn": 1, - "edit": "Change the company name to 'NovaTech Industries' and invoice number to INV-2026-", - "input_tokens": 3249, - "output_tokens": 345, - "latency_ms": 2071, - "output_bytes": 6020, - "failed": false, - "failure_reason": "", - "envelope_parsed": true, - "apply_succeeded": true, - "envelope_name": "edit" - }, - { - "turn": 2, - "edit": "Add 4 more line items: Cloud Hosting Setup ($2,400), SSL Certificate ($199), Dat", - "input_tokens": 3276, - "output_tokens": 740, - "latency_ms": 2864, - "output_bytes": 4397, - "failed": false, - "failure_reason": "", - "envelope_parsed": true, - "apply_succeeded": true, - "envelope_name": "edit" - } - ], - "total_input_tokens": 6525, - "total_output_tokens": 1085, - "total_latency_ms": 4935, - "envelope_parse_rate": 1.0, - "apply_success_rate": 1.0 - }, - "comparison": { - "output_token_savings_pct": 77.1, - "input_token_savings_pct": 3.0, - "latency_savings_pct": 75.4 - }, - "token_table": { - "turns": [ - { - "turn": 0, - "base_input": 138, - "base_output": 2118, - "base_latency_ms": 8817, - "aap_input": 477, - "aap_output": 2116, - "aap_latency_ms": 8762 - }, - { - "turn": 1, - "base_input": 2283, - "base_output": 2118, - "base_latency_ms": 9584, - "aap_input": 3249, - "aap_output": 345, - "aap_latency_ms": 2071, - "envelope_name": "edit", - "apply_ok": true - }, - { - "turn": 2, - "base_input": 4445, - "base_output": 2630, - "base_latency_ms": 10507, - "aap_input": 3276, - "aap_output": 740, - "aap_latency_ms": 2864, - "envelope_name": "edit", - "apply_ok": true - } - ], - "totals": { - "base_input": 6866, - "base_output": 6866, - "base_combined": 13732, - "aap_input": 7002, - "aap_output": 3201, - "aap_combined": 10203, - "base_latency_ms": 28908, - "aap_latency_ms": 13697, - "output_savings_pct": 53.4, - "input_delta_pct": 2.0, - "combined_savings_pct": 25.7, - "latency_savings_pct": 52.6 - } - }, - "quality": { - "per_turn": [ - { - "turn": 0, - "sequence_similarity": 0.3097, - "token_f1": 0.5574, - "base_char_count": 6157, - "aap_char_count": 5552, - "char_delta_pct": -9.8, - "lines_added": 65, - "lines_removed": 60, - "rouge_l": null, - "bleu": null - }, - { - "turn": 1, - "sequence_similarity": 0.3063, - "token_f1": 0.5563, - "base_char_count": 6159, - "aap_char_count": 5576, - "char_delta_pct": -9.5, - "lines_added": 66, - "lines_removed": 60, - "rouge_l": null, - "bleu": null - }, - { - "turn": 2, - "sequence_similarity": 0.3558, - "token_f1": 0.4113, - "base_char_count": 7530, - "aap_char_count": 3953, - "char_delta_pct": -47.5, - "lines_added": 50, - "lines_removed": 64, - "rouge_l": null, - "bleu": null - } - ], - "mean_sequence_similarity": 0.3239, - "mean_token_f1": 0.5083, - "mean_rouge_l": null, - "mean_bleu": null, - "judge_comparisons": null, - "mean_base_judge": null, - "mean_aap_judge": null - } -} diff --git a/evals/data/experiments/009-html-invoice/outputs/aap/turn-0.html b/evals/data/experiments/009-html-invoice/outputs/aap/turn-0.html deleted file mode 100644 index c5148a0..0000000 --- a/evals/data/experiments/009-html-invoice/outputs/aap/turn-0.html +++ /dev/null @@ -1,70 +0,0 @@ -
- -
-
-
LOGO
-

TechCorp Solutions

-
-
-

INVOICE

-

# INV-2024-0847

-
-
-
- - -
-
- From:
- TechCorp Solutions
- 123 Innovation Drive
- Tech City, TC 90210 -
-
- Bill To:
- Client Name Placeholder
- 456 Business Rd
- Corporate Town, CT 54321
-
-
-
- - - - - - - - - - - - - - - - - - - - - -
DescriptionQtyRateAmount
Software License1$500.00$500.00
Cloud Hosting12$50.00$600.00
API Integration5$200.00$1,000.00
Consulting Hours10$150.00$1,500.00
Setup Fee1$300.00$300.00
Maintenance1$400.00$400.00
Backup Service1$100.00$100.00
Security Audit1$750.00$750.00
-
- - -
-
Subtotal: $5,150.00
-
Discount (10%): -$515.00
-
Tax (8.5%): $437.75
-
Total Due: $5,072.75
-
-
- - -
-

Payment Terms: Net 30 days.

-

Bank Details: Bank of Innovation | Acc: 123456789 | Routing: 987654321

-
-
-
\ No newline at end of file diff --git a/evals/data/experiments/009-html-invoice/outputs/aap/turn-1.html b/evals/data/experiments/009-html-invoice/outputs/aap/turn-1.html deleted file mode 100644 index 647e1a4..0000000 --- a/evals/data/experiments/009-html-invoice/outputs/aap/turn-1.html +++ /dev/null @@ -1,71 +0,0 @@ -
-
-
-
LOGO
-

NovaTech Industries

-
-
-

INVOICE

-

# INV-2026-1203

-
-

-
- - - - -
-
- From:
- TechCorp Solutions
- 123 Innovation Drive
- Tech City, TC 90210 -
-
- Bill To:
- Client Name Placeholder
- 456 Business Rd
- Corporate Town, CT 54321
-
-
-
- - - - - - - - - - - - - - - - - - - - - -
DescriptionQtyRateAmount
Software License1$500.00$500.00
Cloud Hosting12$50.00$600.00
API Integration5$200.00$1,000.00
Consulting Hours10$150.00$1,500.00
Setup Fee1$300.00$300.00
Maintenance1$400.00$400.00
Backup Service1$100.00$100.00
Security Audit1$750.00$750.00
-
- - -
-
Subtotal: $5,150.00
-
Discount (10%): -$515.00
-
Tax (8.5%): $437.75
-
Total Due: $5,072.75
-
-
- - -
-

Payment Terms: Net 30 days.

-

Bank Details: Bank of Innovation | Acc: 123456789 | Routing: 987654321

-
-
- \ No newline at end of file diff --git a/evals/data/experiments/009-html-invoice/outputs/aap/turn-1.json b/evals/data/experiments/009-html-invoice/outputs/aap/turn-1.json deleted file mode 100644 index d4c8e84..0000000 --- a/evals/data/experiments/009-html-invoice/outputs/aap/turn-1.json +++ /dev/null @@ -1,46 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "invoice-container", - "version": 2, - "name": "edit", - "meta": { - "format": "text/html", - "tokens_used": null, - "checksum": null, - "state": null - }, - "content": [ - { - "op": "replace", - "target": { - "type": "id", - "value": "invoice-number" - }, - "content": null - }, - { - "op": "replace", - "target": { - "type": "id", - "value": "invoice-number" - }, - "content": "INV-2026-1203" - }, - { - "op": "replace", - "target": { - "type": "id", - "value": "header" - }, - "content": null - }, - { - "op": "replace", - "target": { - "type": "id", - "value": "header" - }, - "content": "
\n
\n
LOGO
\n

NovaTech Industries

\n
\n
\n

INVOICE

\n

# INV-2026-1203

\n
\n
" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/009-html-invoice/outputs/aap/turn-2.html b/evals/data/experiments/009-html-invoice/outputs/aap/turn-2.html deleted file mode 100644 index c573608..0000000 --- a/evals/data/experiments/009-html-invoice/outputs/aap/turn-2.html +++ /dev/null @@ -1,55 +0,0 @@ -
-
-
-
LOGO
-

NovaTech Industries

-
-
-

INVOICE

-

# INV-2026-1203

-
-

-
- - - - -
-
- From:
- TechCorp Solutions
- 123 Innovation Drive
- Tech City, TC 90210 -
-
- Bill To:
- Client Name Placeholder
- 456 Business Rd
- Corporate Town, CT 54321
-
-
-
- - Security Audit1$750.00$750.00 - Cloud Hosting Setup1$2,400.00$2,400.00 - SSL Certificate1$199.00$199.00 - Data Migration1$3,500.00$3,500.00 - Training Session1$1,800.00$1,800.00 - - - -
-
Subtotal: $5,150.00
-
Discount (10%): -$515.00
-
Tax (8.5%): $437.75
-
Total Due: $5,072.75
-
-
- - -
-

Payment Terms: Net 30 days.

-

Bank Details: Bank of Innovation | Acc: 123456789 | Routing: 987654321

-
-
- \ No newline at end of file diff --git a/evals/data/experiments/009-html-invoice/outputs/aap/turn-2.json b/evals/data/experiments/009-html-invoice/outputs/aap/turn-2.json deleted file mode 100644 index ce4325a..0000000 --- a/evals/data/experiments/009-html-invoice/outputs/aap/turn-2.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "invoice-container", - "version": 1, - "name": "edit", - "meta": { - "format": "text/html", - "tokens_used": null, - "checksum": null, - "state": null - }, - "content": [ - { - "op": "replace", - "target": { - "type": "id", - "value": "line-items" - }, - "content": " Security Audit1$750.00$750.00\n Cloud Hosting Setup1$2,400.00$2,400.00\n SSL Certificate1$199.00$199.00\n Data Migration1$3,500.00$3,500.00\n Training Session1$1,800.00$1,800.00\n" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/009-html-invoice/outputs/base/turn-0.html b/evals/data/experiments/009-html-invoice/outputs/base/turn-0.html deleted file mode 100644 index a676759..0000000 --- a/evals/data/experiments/009-html-invoice/outputs/base/turn-0.html +++ /dev/null @@ -1,65 +0,0 @@ - - - - - Invoice INV-2024-0847 - - -
-
-
-
LOGO
-

TechCorp Solutions

-
-
-

INVOICE

-

# INV-2024-0847

-
-
- -
-
-

FROM

-

TechCorp Solutions
123 Tech Avenue
Silicon Valley, CA 94000

-
-
-

TO

-

Client Name
456 Business Road
Enterprise City, NY 10001

-
-
- - - - - - - - - - - - - - - - - - - - -
DescriptionQtyRateAmount
Cloud Hosting Service1$500.00$500.00
Domain Registration2$20.00$40.00
Security Audit1$1200.00$1200.00
API Integration5$150.00$750.00
Software License3$80.00$240.00
Technical Support10$100.00$1000.00
Data Backup Storage1$300.00$300.00
Custom Reporting2$250.00$500.00
- -
-
Subtotal: $4,530.00
-
Discount (10%): -$453.00
-
Tax (8.5%): $346.55
-
Total Due: $4,423.55
-
- - -
- - \ No newline at end of file diff --git a/evals/data/experiments/009-html-invoice/outputs/base/turn-1.html b/evals/data/experiments/009-html-invoice/outputs/base/turn-1.html deleted file mode 100644 index ba39cba..0000000 --- a/evals/data/experiments/009-html-invoice/outputs/base/turn-1.html +++ /dev/null @@ -1,65 +0,0 @@ - - - - - Invoice INV-2026-1203 - - -
-
-
-
LOGO
-

NovaTech Industries

-
-
-

INVOICE

-

# INV-2026-1203

-
-
- -
-
-

FROM

-

NovaTech Industries
123 Tech Avenue
Silicon Valley, CA 94000

-
-
-

TO

-

Client Name
456 Business Road
Enterprise City, NY 10001

-
-
- - - - - - - - - - - - - - - - - - - - -
DescriptionQtyRateAmount
Cloud Hosting Service1$500.00$500.00
Domain Registration2$20.00$40.00
Security Audit1$1200.00$1200.00
API Integration5$150.00$750.00
Software License3$80.00$240.00
Technical Support10$100.00$1000.00
Data Backup Storage1$300.00$300.00
Custom Reporting2$250.00$500.00
- -
-
Subtotal: $4,530.00
-
Discount (10%): -$453.00
-
Tax (8.5%): $346.55
-
Total Due: $4,423.55
-
- - -
- - \ No newline at end of file diff --git a/evals/data/experiments/009-html-invoice/outputs/base/turn-2.html b/evals/data/experiments/009-html-invoice/outputs/base/turn-2.html deleted file mode 100644 index 1d64d4c..0000000 --- a/evals/data/experiments/009-html-invoice/outputs/base/turn-2.html +++ /dev/null @@ -1,69 +0,0 @@ - - - - - Invoice INV-2026-1203 - - -
-
-
-
LOGO
-

NovaTech Industries

-
-
-

INVOICE

-

# INV-2026-1203

-
-
- -
-
-

FROM

-

NovaTech Industries
123 Tech Avenue
Silicon Valley, CA 94000

-
-
-

TO

-

Client Name
456 Business Road
Enterprise City, NY 10001

-
-
- - - - - - - - - - - - - - - - - - - - - - - - -
DescriptionQtyRateAmount
Cloud Hosting Service1$500.00$500.00
Domain Registration2$20.00$40.00
Security Audit1$1,200.00$1,200.00
API Integration5$150.00$750.00
Software License3$80.00$240.00
Technical Support10$100.00$1,000.00
Data Backup Storage1$300.00$300.00
Custom Reporting2$250.00$500.00
Cloud Hosting Setup1$2,400.00$2,400.00
SSL Certificate1$199.00$199.00
Data Migration1$3,500.00$3,500.00
Training Session1$1,800.00$1,800.00
- -
-
Subtotal: $12,829.00
-
Discount (10%): -$1,282.90
-
Tax (8.5%): $981.42
-
Total Due: $12,527.52
-
- - -
- - \ No newline at end of file diff --git a/evals/data/experiments/010-html-kanban/eval.json b/evals/data/experiments/010-html-kanban/eval.json deleted file mode 100644 index 29f2149..0000000 --- a/evals/data/experiments/010-html-kanban/eval.json +++ /dev/null @@ -1,71 +0,0 @@ -{ - "per_turn": [ - { - "turn": 0, - "sequence_similarity": 0.1236, - "token_f1": 0.4493, - "base_char_count": 6526, - "aap_char_count": 5495, - "char_delta_pct": -15.8, - "lines_added": 60, - "lines_removed": 79, - "rouge_l": null, - "bleu": null - }, - { - "turn": 1, - "sequence_similarity": 0.1304, - "token_f1": 0.4526, - "base_char_count": 6798, - "aap_char_count": 5507, - "char_delta_pct": -19.0, - "lines_added": 65, - "lines_removed": 85, - "rouge_l": null, - "bleu": null - }, - { - "turn": 2, - "sequence_similarity": 0.1198, - "token_f1": 0.4489, - "base_char_count": 6899, - "aap_char_count": 5507, - "char_delta_pct": -20.2, - "lines_added": 65, - "lines_removed": 86, - "rouge_l": null, - "bleu": null - }, - { - "turn": 3, - "sequence_similarity": 0.1128, - "token_f1": 0.4382, - "base_char_count": 7475, - "aap_char_count": 6005, - "char_delta_pct": -19.7, - "lines_added": 73, - "lines_removed": 94, - "rouge_l": null, - "bleu": null - }, - { - "turn": 4, - "sequence_similarity": 0.1182, - "token_f1": 0.4654, - "base_char_count": 7681, - "aap_char_count": 6196, - "char_delta_pct": -19.3, - "lines_added": 73, - "lines_removed": 97, - "rouge_l": null, - "bleu": null - } - ], - "mean_sequence_similarity": 0.121, - "mean_token_f1": 0.4509, - "mean_rouge_l": null, - "mean_bleu": null, - "judge_comparisons": null, - "mean_base_judge": null, - "mean_aap_judge": null -} diff --git a/evals/data/experiments/010-html-kanban/metrics.json b/evals/data/experiments/010-html-kanban/metrics.json deleted file mode 100644 index ea8a6e5..0000000 --- a/evals/data/experiments/010-html-kanban/metrics.json +++ /dev/null @@ -1,274 +0,0 @@ -{ - "experiment_id": "010-html-kanban", - "model": "", - "provider": "google", - "timestamp": "2026-04-03T06:52:45.728646+00:00", - "format": "text/html", - "base_turn0": { - "input_tokens": 142, - "output_tokens": 1909, - "latency_ms": 15971, - "artifact_bytes": 6189 - }, - "aap_turn0": { - "input_tokens": 481, - "output_tokens": 1523, - "latency_ms": 6551, - "artifact_bytes": 4999 - }, - "default_flow": { - "per_turn": [ - { - "turn": 1, - "edit": "Change the project name from 'Sprint 24' to 'Sprint 27 \u2014 Q2 Launch' and update a", - "input_tokens": 2086, - "output_tokens": 1973, - "latency_ms": 7908, - "output_bytes": 6347, - "failed": false, - "failure_reason": "" - }, - { - "turn": 2, - "edit": "Move 2 cards from Backlog to In Progress and add a 'Blocked' label to the first ", - "input_tokens": 4085, - "output_tokens": 2000, - "latency_ms": 8073, - "output_bytes": 6418, - "failed": false, - "failure_reason": "" - }, - { - "turn": 3, - "edit": "Add a new 'Cancelled' column after Done with 2 cancelled task cards", - "input_tokens": 6102, - "output_tokens": 2132, - "latency_ms": 6991, - "output_bytes": 6900, - "failed": false, - "failure_reason": "" - }, - { - "turn": 4, - "edit": "Change all 'critical' priority tags to have a red pulsing animation effect", - "input_tokens": 8250, - "output_tokens": 2290, - "latency_ms": 8295, - "output_bytes": 7221, - "failed": false, - "failure_reason": "" - } - ], - "total_input_tokens": 20523, - "total_output_tokens": 8395, - "total_latency_ms": 31267 - }, - "aap_flow": { - "per_turn": [ - { - "turn": 1, - "edit": "Change the project name from 'Sprint 24' to 'Sprint 27 \u2014 Q2 Launch' and update a", - "input_tokens": 2664, - "output_tokens": 1728, - "latency_ms": 6368, - "output_bytes": 5410, - "failed": false, - "failure_reason": "", - "envelope_parsed": true, - "apply_succeeded": true, - "envelope_name": "synthesize" - }, - { - "turn": 2, - "edit": "Move 2 cards from Backlog to In Progress and add a 'Blocked' label to the first ", - "input_tokens": 2797, - "output_tokens": 912, - "latency_ms": 3783, - "output_bytes": 3081, - "failed": false, - "failure_reason": "", - "envelope_parsed": true, - "apply_succeeded": true, - "envelope_name": "synthesize" - }, - { - "turn": 3, - "edit": "Add a new 'Cancelled' column after Done with 2 cancelled task cards", - "input_tokens": 0, - "output_tokens": 0, - "latency_ms": 1604, - "output_bytes": 3081, - "failed": true, - "failure_reason": "parse or apply failed", - "envelope_parsed": true, - "apply_succeeded": false, - "envelope_name": "edit" - }, - { - "turn": 4, - "edit": "Change all 'critical' priority tags to have a red pulsing animation effect", - "input_tokens": 0, - "output_tokens": 0, - "latency_ms": 3494, - "output_bytes": 3081, - "failed": true, - "failure_reason": "parse or apply failed", - "envelope_parsed": true, - "apply_succeeded": false, - "envelope_name": "edit" - } - ], - "total_input_tokens": 5461, - "total_output_tokens": 2640, - "total_latency_ms": 15249, - "envelope_parse_rate": 1.0, - "apply_success_rate": 0.5 - }, - "comparison": { - "output_token_savings_pct": 68.6, - "input_token_savings_pct": 73.4, - "latency_savings_pct": 51.2 - }, - "token_table": { - "turns": [ - { - "turn": 0, - "base_input": 142, - "base_output": 1909, - "base_latency_ms": 15971, - "aap_input": 481, - "aap_output": 1523, - "aap_latency_ms": 6551 - }, - { - "turn": 1, - "base_input": 2086, - "base_output": 1973, - "base_latency_ms": 7908, - "aap_input": 2664, - "aap_output": 1728, - "aap_latency_ms": 6368, - "envelope_name": "synthesize", - "apply_ok": true - }, - { - "turn": 2, - "base_input": 4085, - "base_output": 2000, - "base_latency_ms": 8073, - "aap_input": 2797, - "aap_output": 912, - "aap_latency_ms": 3783, - "envelope_name": "synthesize", - "apply_ok": true - }, - { - "turn": 3, - "base_input": 6102, - "base_output": 2132, - "base_latency_ms": 6991, - "aap_input": 0, - "aap_output": 0, - "aap_latency_ms": 1604, - "envelope_name": "edit", - "apply_ok": false - }, - { - "turn": 4, - "base_input": 8250, - "base_output": 2290, - "base_latency_ms": 8295, - "aap_input": 0, - "aap_output": 0, - "aap_latency_ms": 3494, - "envelope_name": "edit", - "apply_ok": false - } - ], - "totals": { - "base_input": 20665, - "base_output": 10304, - "base_combined": 30969, - "aap_input": 5942, - "aap_output": 4163, - "aap_combined": 10105, - "base_latency_ms": 47238, - "aap_latency_ms": 21800, - "output_savings_pct": 59.6, - "input_delta_pct": -71.2, - "combined_savings_pct": 67.4, - "latency_savings_pct": 53.9 - } - }, - "quality": { - "per_turn": [ - { - "turn": 0, - "sequence_similarity": 0.0587, - "token_f1": 0.4398, - "base_char_count": 6189, - "aap_char_count": 4791, - "char_delta_pct": -22.6, - "lines_added": 68, - "lines_removed": 90, - "rouge_l": null, - "bleu": null - }, - { - "turn": 1, - "sequence_similarity": 0.0586, - "token_f1": 0.4756, - "base_char_count": 6343, - "aap_char_count": 5200, - "char_delta_pct": -18.0, - "lines_added": 74, - "lines_removed": 88, - "rouge_l": null, - "bleu": null - }, - { - "turn": 2, - "sequence_similarity": 0.0093, - "token_f1": 0.0824, - "base_char_count": 6414, - "aap_char_count": 3081, - "char_delta_pct": -52.0, - "lines_added": 23, - "lines_removed": 95, - "rouge_l": null, - "bleu": null - }, - { - "turn": 3, - "sequence_similarity": 0.0088, - "token_f1": 0.0788, - "base_char_count": 6896, - "aap_char_count": 3081, - "char_delta_pct": -55.3, - "lines_added": 23, - "lines_removed": 104, - "rouge_l": null, - "bleu": null - }, - { - "turn": 4, - "sequence_similarity": 0.0085, - "token_f1": 0.0738, - "base_char_count": 7217, - "aap_char_count": 3081, - "char_delta_pct": -57.3, - "lines_added": 22, - "lines_removed": 110, - "rouge_l": null, - "bleu": null - } - ], - "mean_sequence_similarity": 0.0288, - "mean_token_f1": 0.2301, - "mean_rouge_l": null, - "mean_bleu": null, - "judge_comparisons": null, - "mean_base_judge": null, - "mean_aap_judge": null - } -} diff --git a/evals/data/experiments/010-html-kanban/outputs/aap/turn-0.html b/evals/data/experiments/010-html-kanban/outputs/aap/turn-0.html deleted file mode 100644 index 2015a37..0000000 --- a/evals/data/experiments/010-html-kanban/outputs/aap/turn-0.html +++ /dev/null @@ -1,73 +0,0 @@ - - -
-
-

Sprint 24

-
Filters:
-
- -
- -
-
Backlog (6)
-
-
API Rate Limiting
Assignee: Alex | High
-
Database Migration
Assignee: Sam | Medium
-
Refactor Auth
Assignee: Jordan | Critical
-
Update Docs
Assignee: Casey | Low
-
S3 Bucket Config
Assignee: Alex | Medium
-
User Profile UI
Assignee: Taylor | Medium
-
-
-
- - -
-
In Progress (4)
-
-
OAuth 2.0 Integration
Assignee: Alex | Critical
-
Dark Mode Switch
Assignee: Taylor | Low
-
Redis Cache Layer
Assignee: Jordan | High
-
Mobile Responsive Fix
Assignee: Sam | Medium
-
-
-
- - -
-
In Review (3)
-
-
Payment Gateway API
Assignee: Jordan | Critical
-
Export CSV Feature
Assignee: Casey | Low
-
Unit Test Coverage
Assignee: Alex | Medium
-
-
-
- - -
-
Done (5)
-
-
Initial Setup
Assignee: Team | Low
-
CI/CD Pipeline
Assignee: Jordan | High
-
Logging Service
Assignee: Sam | Medium
-
Landing Page
Assignee: Taylor | Low
-
Security Audit
Assignee: Alex | Critical
-
-
-
-
-
\ No newline at end of file diff --git a/evals/data/experiments/010-html-kanban/outputs/aap/turn-1.html b/evals/data/experiments/010-html-kanban/outputs/aap/turn-1.html deleted file mode 100644 index 93d40fc..0000000 --- a/evals/data/experiments/010-html-kanban/outputs/aap/turn-1.html +++ /dev/null @@ -1,80 +0,0 @@ - - -
-
-

Sprint 27 — Q2 Launch

-
- Team: -
-
A
S
J
C
T
R
-
-
-
- -
- -
-
Backlog (6)
-
-
API Rate Limiting
Assignee: Alex | High
-
Database Migration
Assignee: Sam | Medium
-
Refactor Auth
Assignee: Jordan | Critical
-
Update Docs
Assignee: Casey | Low
-
S3 Bucket Config
Assignee: Alex | Medium
-
User Profile UI
Assignee: Taylor | Medium
-
-
-
- - -
-
In Progress (4)
-
-
OAuth 2.0 Integration
Assignee: Alex | Critical
-
Dark Mode Switch
Assignee: Taylor | Low
-
Redis Cache Layer
Assignee: Jordan | High
-
Mobile Responsive Fix
Assignee: Sam | Medium
-
-
-
- - -
-
In Review (3)
-
-
Payment Gateway API
Assignee: Jordan | Critical
-
Export CSV Feature
Assignee: Casey | Low
-
Unit Test Coverage
Assignee: Alex | Medium
-
-
-
- - -
-
Done (5)
-
-
Initial Setup
Assignee: Team | Low
-
CI/CD Pipeline
Assignee: Jordan | High
-
Logging Service
Assignee: Sam | Medium
-
Landing Page
Assignee: Taylor | Low
-
Security Audit
Assignee: Alex | Critical
-
-
-
-
-
diff --git a/evals/data/experiments/010-html-kanban/outputs/aap/turn-1.json b/evals/data/experiments/010-html-kanban/outputs/aap/turn-1.json deleted file mode 100644 index 168a107..0000000 --- a/evals/data/experiments/010-html-kanban/outputs/aap/turn-1.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "sprint-board", - "version": 1, - "name": "synthesize", - "meta": { - "format": "text/html", - "tokens_used": null, - "checksum": null, - "state": null - }, - "content": [ - { - "body": "\n\n
\n
\n

Sprint 27 — Q2 Launch

\n
\n Team: \n
\n
A
S
J
C
T
R
\n
\n
\n
\n\n
\n \n
\n
Backlog (6)
\n
\n
API Rate Limiting
Assignee: Alex | High
\n
Database Migration
Assignee: Sam | Medium
\n
Refactor Auth
Assignee: Jordan | Critical
\n
Update Docs
Assignee: Casey | Low
\n
S3 Bucket Config
Assignee: Alex | Medium
\n
User Profile UI
Assignee: Taylor | Medium
\n
\n
\n
\n\n \n
\n
In Progress (4)
\n
\n
OAuth 2.0 Integration
Assignee: Alex | Critical
\n
Dark Mode Switch
Assignee: Taylor | Low
\n
Redis Cache Layer
Assignee: Jordan | High
\n
Mobile Responsive Fix
Assignee: Sam | Medium
\n
\n
\n
\n\n \n
\n
In Review (3)
\n
\n
Payment Gateway API
Assignee: Jordan | Critical
\n
Export CSV Feature
Assignee: Casey | Low
\n
Unit Test Coverage
Assignee: Alex | Medium
\n
\n
\n
\n\n \n
\n
Done (5)
\n
\n
Initial Setup
Assignee: Team | Low
\n
CI/CD Pipeline
Assignee: Jordan | High
\n
Logging Service
Assignee: Sam | Medium
\n
Landing Page
Assignee: Taylor | Low
\n
Security Audit
Assignee: Alex | Critical
\n
\n
\n
\n
\n
\n" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/010-html-kanban/outputs/aap/turn-2.html b/evals/data/experiments/010-html-kanban/outputs/aap/turn-2.html deleted file mode 100644 index 4a09514..0000000 --- a/evals/data/experiments/010-html-kanban/outputs/aap/turn-2.html +++ /dev/null @@ -1,23 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "kanban-board-001", - "version": 2, - "name": "edit", - "content": [ - { - "op": "replace", - "target": {"type": "id", "value": "col-backlog"}, - "content": "
\n
Backlog (4)
\n
\n
Refactor Auth
Assignee: Jordan | Critical
\n
Update Docs
Assignee: Casey | Low
\n
S3 Bucket Config
Assignee: Alex | Medium
\n
User Profile UI
Assignee: Taylor | Medium
\n
\n
" - }, - { - "op": "replace", - "target": {"type": "id", "value": "col-progress"}, - "content": "
\n
In Progress (6)
\n
\n
API Rate Limiting
Assignee: Alex | High
\n
Database Migration
Assignee: Sam | Medium
\n
OAuth 2.0 Integration
Assignee: Alex | Critical
\n
Dark Mode Switch
Assignee: Taylor | Low
\n
Redis Cache Layer
Assignee: Jordan | High
\n
Mobile Responsive Fix
Assignee: Sam | Medium
\n
\n
" - }, - { - "op": "replace", - "target": {"type": "id", "value": "col-review"}, - "content": "
\n
In Review (3)
\n
\n
Payment Gateway API
Assignee: Jordan | Critical BLOCKED
\n
Export CSV Feature
Assignee: Casey | Low
\n
Unit Test Coverage
Assignee: Alex | Medium
\n
\n
" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/010-html-kanban/outputs/aap/turn-2.json b/evals/data/experiments/010-html-kanban/outputs/aap/turn-2.json deleted file mode 100644 index 1d73c92..0000000 --- a/evals/data/experiments/010-html-kanban/outputs/aap/turn-2.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "kanban-board-001", - "version": 2, - "name": "synthesize", - "meta": { - "format": "text/html", - "tokens_used": null, - "checksum": null, - "state": null - }, - "content": [ - { - "body": "{\n \"protocol\": \"aap/0.1\",\n \"id\": \"kanban-board-001\",\n \"version\": 2,\n \"name\": \"edit\",\n \"content\": [\n {\n \"op\": \"replace\",\n \"target\": {\"type\": \"id\", \"value\": \"col-backlog\"},\n \"content\": \"
\\n
Backlog (4)
\\n
\\n
Refactor Auth
Assignee: Jordan | Critical
\\n
Update Docs
Assignee: Casey | Low
\\n
S3 Bucket Config
Assignee: Alex | Medium
\\n
User Profile UI
Assignee: Taylor | Medium
\\n
\\n
\"\n },\n {\n \"op\": \"replace\",\n \"target\": {\"type\": \"id\", \"value\": \"col-progress\"},\n \"content\": \"
\\n
In Progress (6)
\\n
\\n
API Rate Limiting
Assignee: Alex | High
\\n
Database Migration
Assignee: Sam | Medium
\\n
OAuth 2.0 Integration
Assignee: Alex | Critical
\\n
Dark Mode Switch
Assignee: Taylor | Low
\\n
Redis Cache Layer
Assignee: Jordan | High
\\n
Mobile Responsive Fix
Assignee: Sam | Medium
\\n
\\n
\"\n },\n {\n \"op\": \"replace\",\n \"target\": {\"type\": \"id\", \"value\": \"col-review\"},\n \"content\": \"
\\n
In Review (3)
\\n
\\n
Payment Gateway API
Assignee: Jordan | Critical BLOCKED
\\n
Export CSV Feature
Assignee: Casey | Low
\\n
Unit Test Coverage
Assignee: Alex | Medium
\\n
\\n
\"\n }\n ]\n}" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/010-html-kanban/outputs/aap/turn-3.html b/evals/data/experiments/010-html-kanban/outputs/aap/turn-3.html deleted file mode 100644 index 4a09514..0000000 --- a/evals/data/experiments/010-html-kanban/outputs/aap/turn-3.html +++ /dev/null @@ -1,23 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "kanban-board-001", - "version": 2, - "name": "edit", - "content": [ - { - "op": "replace", - "target": {"type": "id", "value": "col-backlog"}, - "content": "
\n
Backlog (4)
\n
\n
Refactor Auth
Assignee: Jordan | Critical
\n
Update Docs
Assignee: Casey | Low
\n
S3 Bucket Config
Assignee: Alex | Medium
\n
User Profile UI
Assignee: Taylor | Medium
\n
\n
" - }, - { - "op": "replace", - "target": {"type": "id", "value": "col-progress"}, - "content": "
\n
In Progress (6)
\n
\n
API Rate Limiting
Assignee: Alex | High
\n
Database Migration
Assignee: Sam | Medium
\n
OAuth 2.0 Integration
Assignee: Alex | Critical
\n
Dark Mode Switch
Assignee: Taylor | Low
\n
Redis Cache Layer
Assignee: Jordan | High
\n
Mobile Responsive Fix
Assignee: Sam | Medium
\n
\n
" - }, - { - "op": "replace", - "target": {"type": "id", "value": "col-review"}, - "content": "
\n
In Review (3)
\n
\n
Payment Gateway API
Assignee: Jordan | Critical BLOCKED
\n
Export CSV Feature
Assignee: Casey | Low
\n
Unit Test Coverage
Assignee: Alex | Medium
\n
\n
" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/010-html-kanban/outputs/aap/turn-3.json b/evals/data/experiments/010-html-kanban/outputs/aap/turn-3.json deleted file mode 100644 index 49db2bc..0000000 --- a/evals/data/experiments/010-html-kanban/outputs/aap/turn-3.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "kanban-board-001", - "version": 3, - "name": "edit", - "meta": { - "format": "text/html", - "tokens_used": null, - "checksum": null, - "state": null - }, - "content": [ - { - "op": "insert_after", - "target": { - "type": "id", - "value": "col-done" - }, - "content": "
\n
Cancelled (2)
\n
\n
Deprecated Module Removal
Assignee: Alex | Low
\n
Legacy IE Support
Assignee: Casey | Critical
\n
\n
" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/010-html-kanban/outputs/aap/turn-4.html b/evals/data/experiments/010-html-kanban/outputs/aap/turn-4.html deleted file mode 100644 index 4a09514..0000000 --- a/evals/data/experiments/010-html-kanban/outputs/aap/turn-4.html +++ /dev/null @@ -1,23 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "kanban-board-001", - "version": 2, - "name": "edit", - "content": [ - { - "op": "replace", - "target": {"type": "id", "value": "col-backlog"}, - "content": "
\n
Backlog (4)
\n
\n
Refactor Auth
Assignee: Jordan | Critical
\n
Update Docs
Assignee: Casey | Low
\n
S3 Bucket Config
Assignee: Alex | Medium
\n
User Profile UI
Assignee: Taylor | Medium
\n
\n
" - }, - { - "op": "replace", - "target": {"type": "id", "value": "col-progress"}, - "content": "
\n
In Progress (6)
\n
\n
API Rate Limiting
Assignee: Alex | High
\n
Database Migration
Assignee: Sam | Medium
\n
OAuth 2.0 Integration
Assignee: Alex | Critical
\n
Dark Mode Switch
Assignee: Taylor | Low
\n
Redis Cache Layer
Assignee: Jordan | High
\n
Mobile Responsive Fix
Assignee: Sam | Medium
\n
\n
" - }, - { - "op": "replace", - "target": {"type": "id", "value": "col-review"}, - "content": "
\n
In Review (3)
\n
\n
Payment Gateway API
Assignee: Jordan | Critical BLOCKED
\n
Export CSV Feature
Assignee: Casey | Low
\n
Unit Test Coverage
Assignee: Alex | Medium
\n
\n
" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/010-html-kanban/outputs/aap/turn-4.json b/evals/data/experiments/010-html-kanban/outputs/aap/turn-4.json deleted file mode 100644 index 3c8202b..0000000 --- a/evals/data/experiments/010-html-kanban/outputs/aap/turn-4.json +++ /dev/null @@ -1,38 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "kanban-board-001", - "version": 3, - "name": "edit", - "meta": { - "format": "text/html", - "tokens_used": null, - "checksum": null, - "state": null - }, - "content": [ - { - "op": "replace", - "target": { - "type": "id", - "value": "col-backlog" - }, - "content": "
\\n
Backlog (4)
\\n
\\n
Refactor Auth
Assignee: Jordan | Critical
\\n
Update Docs
Assignee: Casey | Low
\\n
S3 Bucket Config
Assignee: Alex | Medium
\\n
User Profile UI
Assignee: Taylor | Medium
\\n
\\n
" - }, - { - "op": "replace", - "target": { - "type": "id", - "value": "col-progress" - }, - "content": "
\\n
In Progress (6)
\\n
\\n
API Rate Limiting
Assignee: Alex | High
\\n
Database Migration
Assignee: Sam | Medium
\\n
OAuth 2.0 Integration
Assignee: Alex | Critical
\\n
Dark Mode Switch
Assignee: Taylor | Low
\\n
Redis Cache Layer
Assignee: Jordan | High
\\n
Mobile Responsive Fix
Assignee: Sam | Medium
\\n
\\n
" - }, - { - "op": "replace", - "target": { - "type": "id", - "value": "col-review" - }, - "content": "
\\n
In Review (3)
\\n
\\n
Payment Gateway API
Assignee: Jordan | Critical BLOCKED
\\n
Export CSV Feature
Assignee: Casey | Low
\\n
Unit Test Coverage
Assignee: Alex | Medium
\\n
\\n
" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/010-html-kanban/outputs/base/turn-0.html b/evals/data/experiments/010-html-kanban/outputs/base/turn-0.html deleted file mode 100644 index 8a01005..0000000 --- a/evals/data/experiments/010-html-kanban/outputs/base/turn-0.html +++ /dev/null @@ -1,95 +0,0 @@ - - - - -Sprint 24 Kanban Board - - - - -
-

Sprint 24

-
-
-
-
-
-
- - -
-
- -
- -
-
Backlog 6
-
-
CRITICAL
Setup CI/CD Pipeline
Due: Oct 24 JD
-
HIGH
Refactor Auth Service
Due: Oct 25 MK
-
MEDIUM
Update API Documentation
Due: Oct 26 SL
-
MEDIUM
Add Dark Mode Toggle
Due: Oct 27 JD
-
LOW
Fix Footer Alignment
Due: Oct 28 MK
-
LOW
Update Favicon
Due: Oct 29 SL
-
-
- - -
-
In Progress 4
-
-
CRITICAL
Database Migration Plan
#Backend
-
HIGH
Implement User Dashboard
#Frontend
-
MEDIUM
Optimize Image Loading
#Performance
-
MEDIUM
Write Unit Tests
#QA
-
-
- - -
-
In Review 3
-
-
HIGH
Payment Gateway Integration
#Fintech
-
MEDIUM
Navbar Accessibility Audit
#UX
-
LOW
Rename Utility Functions
#Refactor
-
-
- - -
-
Done 5
-
-
MEDIUM
Project Kickoff Meeting
-
LOW
Configure Linting Rules
-
HIGH
Environment Variables Setup
-
MEDIUM
Gitignore Cleanup
-
LOW
Add Slack Integration
-
-
-
- - - \ No newline at end of file diff --git a/evals/data/experiments/010-html-kanban/outputs/base/turn-1.html b/evals/data/experiments/010-html-kanban/outputs/base/turn-1.html deleted file mode 100644 index 0369319..0000000 --- a/evals/data/experiments/010-html-kanban/outputs/base/turn-1.html +++ /dev/null @@ -1,94 +0,0 @@ - - - - -Sprint 27 — Q2 Launch Kanban Board - - - - -
-

Sprint 27 — Q2 Launch

-
-
-
-
-
-
-
-
-
- - -
-
- -
-
-
Backlog 6
-
-
CRITICAL
Microservice Auth Upgrade
Due: May 02 AS
-
HIGH
API Rate Limiting
Due: May 04 BK
-
MEDIUM
Update UI Styleguide
Due: May 05 CM
-
MEDIUM
Log Monitoring Setup
Due: May 06 DK
-
LOW
Clean up Deprecated Props
Due: May 07 EL
-
LOW
Refresh Test Snapshots
Due: May 08 FG
-
-
- -
-
In Progress 4
-
-
CRITICAL
Payment Processor V3
#Backend
-
HIGH
Dashboard Widgets
#Frontend
-
MEDIUM
DB Index Optimization
#Database
-
MEDIUM
Form Validation Logic
#Frontend
-
-
- -
-
In Review 3
-
-
HIGH
OAuth2 Callback Refactor
#Security
-
MEDIUM
Search Bar Autocomplete
#UX
-
LOW
Documentation Typos
#Docs
-
-
- -
-
Done 5
-
-
MEDIUM
Sprint Planning
-
LOW
Environment Lockdown
-
HIGH
Base Layout Structure
-
MEDIUM
Dependencies Audit
-
LOW
Team Onboarding Docs
-
-
-
- - - \ No newline at end of file diff --git a/evals/data/experiments/010-html-kanban/outputs/base/turn-2.html b/evals/data/experiments/010-html-kanban/outputs/base/turn-2.html deleted file mode 100644 index e985d0b..0000000 --- a/evals/data/experiments/010-html-kanban/outputs/base/turn-2.html +++ /dev/null @@ -1,95 +0,0 @@ - - - - -Sprint 27 — Q2 Launch Kanban Board - - - - -
-

Sprint 27 — Q2 Launch

-
-
-
-
-
-
-
-
-
- - -
-
- -
-
-
Backlog 4
-
-
MEDIUM
Update UI Styleguide
Due: May 05 CM
-
MEDIUM
Log Monitoring Setup
Due: May 06 DK
-
LOW
Clean up Deprecated Props
Due: May 07 EL
-
LOW
Refresh Test Snapshots
Due: May 08 FG
-
-
- -
-
In Progress 6
-
-
CRITICAL
Microservice Auth Upgrade
Due: May 02 AS
-
HIGH
API Rate Limiting
Due: May 04 BK
-
CRITICAL
Payment Processor V3
#Backend
-
HIGH
Dashboard Widgets
#Frontend
-
MEDIUM
DB Index Optimization
#Database
-
MEDIUM
Form Validation Logic
#Frontend
-
-
- -
-
In Review 3
-
-
HIGH
OAuth2 Callback Refactor
#Blocked
-
MEDIUM
Search Bar Autocomplete
#UX
-
LOW
Documentation Typos
#Docs
-
-
- -
-
Done 5
-
-
MEDIUM
Sprint Planning
-
LOW
Environment Lockdown
-
HIGH
Base Layout Structure
-
MEDIUM
Dependencies Audit
-
LOW
Team Onboarding Docs
-
-
-
- - - \ No newline at end of file diff --git a/evals/data/experiments/010-html-kanban/outputs/base/turn-3.html b/evals/data/experiments/010-html-kanban/outputs/base/turn-3.html deleted file mode 100644 index b0bbd6b..0000000 --- a/evals/data/experiments/010-html-kanban/outputs/base/turn-3.html +++ /dev/null @@ -1,104 +0,0 @@ - - - - -Sprint 27 — Q2 Launch Kanban Board - - - - -
-

Sprint 27 — Q2 Launch

-
-
-
-
-
-
-
-
-
- - -
-
- -
-
-
Backlog 4
-
-
MEDIUM
Update UI Styleguide
Due: May 05 CM
-
MEDIUM
Log Monitoring Setup
Due: May 06 DK
-
LOW
Clean up Deprecated Props
Due: May 07 EL
-
LOW
Refresh Test Snapshots
Due: May 08 FG
-
-
- -
-
In Progress 6
-
-
CRITICAL
Microservice Auth Upgrade
Due: May 02 AS
-
HIGH
API Rate Limiting
Due: May 04 BK
-
CRITICAL
Payment Processor V3
#Backend
-
HIGH
Dashboard Widgets
#Frontend
-
MEDIUM
DB Index Optimization
#Database
-
MEDIUM
Form Validation Logic
#Frontend
-
-
- -
-
In Review 3
-
-
HIGH
OAuth2 Callback Refactor
#Blocked
-
MEDIUM
Search Bar Autocomplete
#UX
-
LOW
Documentation Typos
#Docs
-
-
- -
-
Done 5
-
-
MEDIUM
Sprint Planning
-
LOW
Environment Lockdown
-
HIGH
Base Layout Structure
-
MEDIUM
Dependencies Audit
-
LOW
Team Onboarding Docs
-
-
- -
-
Cancelled 2
-
-
LOW
Legacy Plugin Support
-
MEDIUM
Old API V1 Deprecation
-
-
-
- - - \ No newline at end of file diff --git a/evals/data/experiments/010-html-kanban/outputs/base/turn-4.html b/evals/data/experiments/010-html-kanban/outputs/base/turn-4.html deleted file mode 100644 index edc5eb4..0000000 --- a/evals/data/experiments/010-html-kanban/outputs/base/turn-4.html +++ /dev/null @@ -1,111 +0,0 @@ - - - - -Sprint 27 — Q2 Launch Kanban Board - - - - -
-

Sprint 27 — Q2 Launch

-
-
-
-
-
-
-
-
-
- - -
-
- -
-
-
Backlog 4
-
-
MEDIUM
Update UI Styleguide
Due: May 05 CM
-
MEDIUM
Log Monitoring Setup
Due: May 06 DK
-
LOW
Clean up Deprecated Props
Due: May 07 EL
-
LOW
Refresh Test Snapshots
Due: May 08 FG
-
-
- -
-
In Progress 6
-
-
CRITICAL
Microservice Auth Upgrade
Due: May 02 AS
-
HIGH
API Rate Limiting
Due: May 04 BK
-
CRITICAL
Payment Processor V3
#Backend
-
HIGH
Dashboard Widgets
#Frontend
-
MEDIUM
DB Index Optimization
#Database
-
MEDIUM
Form Validation Logic
#Frontend
-
-
- -
-
In Review 3
-
-
HIGH
OAuth2 Callback Refactor
#Blocked
-
MEDIUM
Search Bar Autocomplete
#UX
-
LOW
Documentation Typos
#Docs
-
-
- -
-
Done 5
-
-
MEDIUM
Sprint Planning
-
LOW
Environment Lockdown
-
HIGH
Base Layout Structure
-
MEDIUM
Dependencies Audit
-
LOW
Team Onboarding Docs
-
-
- -
-
Cancelled 2
-
-
LOW
Legacy Plugin Support
-
MEDIUM
Old API V1 Deprecation
-
-
-
- - - \ No newline at end of file diff --git a/evals/data/experiments/011-python-fastapi-users/eval.json b/evals/data/experiments/011-python-fastapi-users/eval.json deleted file mode 100644 index 240d242..0000000 --- a/evals/data/experiments/011-python-fastapi-users/eval.json +++ /dev/null @@ -1,59 +0,0 @@ -{ - "per_turn": [ - { - "turn": 0, - "sequence_similarity": 0.1921, - "token_f1": 0.7492, - "base_char_count": 3974, - "aap_char_count": 2856, - "char_delta_pct": -28.1, - "lines_added": 39, - "lines_removed": 60, - "rouge_l": null, - "bleu": null - }, - { - "turn": 1, - "sequence_similarity": 0.0122, - "token_f1": 0.0925, - "base_char_count": 277, - "aap_char_count": 2500, - "char_delta_pct": 802.5, - "lines_added": 75, - "lines_removed": 11, - "rouge_l": null, - "bleu": null - }, - { - "turn": 2, - "sequence_similarity": 0.3473, - "token_f1": 0.428, - "base_char_count": 689, - "aap_char_count": 1897, - "char_delta_pct": 175.3, - "lines_added": 48, - "lines_removed": 12, - "rouge_l": null, - "bleu": null - }, - { - "turn": 3, - "sequence_similarity": 0.4904, - "token_f1": 0.4, - "base_char_count": 344, - "aap_char_count": 590, - "char_delta_pct": 71.5, - "lines_added": 23, - "lines_removed": 9, - "rouge_l": null, - "bleu": null - } - ], - "mean_sequence_similarity": 0.2605, - "mean_token_f1": 0.4174, - "mean_rouge_l": null, - "mean_bleu": null, - "judge_comparisons": null, - "mean_base_judge": null, - "mean_aap_judge": null -} diff --git a/evals/data/experiments/011-python-fastapi-users/metrics.json b/evals/data/experiments/011-python-fastapi-users/metrics.json deleted file mode 100644 index 1f48e21..0000000 --- a/evals/data/experiments/011-python-fastapi-users/metrics.json +++ /dev/null @@ -1,228 +0,0 @@ -{ - "experiment_id": "011-python-fastapi-users", - "model": "", - "provider": "google", - "timestamp": "2026-04-03T06:53:54.826528+00:00", - "format": "text/x-python", - "base_turn0": { - "input_tokens": 149, - "output_tokens": 1227, - "latency_ms": 5066, - "artifact_bytes": 3929 - }, - "aap_turn0": { - "input_tokens": 488, - "output_tokens": 976, - "latency_ms": 3969, - "artifact_bytes": 3111 - }, - "default_flow": { - "per_turn": [ - { - "turn": 1, - "edit": "Add a 'role' field to UserCreate schema with allowed values 'admin', 'editor', '", - "input_tokens": 1405, - "output_tokens": 1273, - "latency_ms": 5077, - "output_bytes": 4069, - "failed": false, - "failure_reason": "" - }, - { - "turn": 2, - "edit": "Rewrite the list_users CRUD function to support filtering by role and is_active ", - "input_tokens": 2697, - "output_tokens": 240, - "latency_ms": 1447, - "output_bytes": 696, - "failed": false, - "failure_reason": "" - }, - { - "turn": 3, - "edit": "Add a new PATCH /users/{id}/deactivate endpoint that sets is_active to False and", - "input_tokens": 2963, - "output_tokens": 125, - "latency_ms": 2747, - "output_bytes": 365, - "failed": false, - "failure_reason": "" - } - ], - "total_input_tokens": 7065, - "total_output_tokens": 1638, - "total_latency_ms": 9271 - }, - "aap_flow": { - "per_turn": [ - { - "turn": 1, - "edit": "Add a 'role' field to UserCreate schema with allowed values 'admin', 'editor', '", - "input_tokens": 2113, - "output_tokens": 689, - "latency_ms": 3581, - "output_bytes": 1822, - "failed": false, - "failure_reason": "", - "envelope_parsed": true, - "apply_succeeded": true, - "envelope_name": "synthesize" - }, - { - "turn": 2, - "edit": "Rewrite the list_users CRUD function to support filtering by role and is_active ", - "input_tokens": 0, - "output_tokens": 0, - "latency_ms": 2337, - "output_bytes": 1822, - "failed": true, - "failure_reason": "parse or apply failed", - "envelope_parsed": true, - "apply_succeeded": false, - "envelope_name": "edit" - }, - { - "turn": 3, - "edit": "Add a new PATCH /users/{id}/deactivate endpoint that sets is_active to False and", - "input_tokens": 0, - "output_tokens": 0, - "latency_ms": 1343, - "output_bytes": 1822, - "failed": true, - "failure_reason": "parse or apply failed", - "envelope_parsed": true, - "apply_succeeded": false, - "envelope_name": "edit" - } - ], - "total_input_tokens": 2113, - "total_output_tokens": 689, - "total_latency_ms": 7261, - "envelope_parse_rate": 1.0, - "apply_success_rate": 0.3333333333333333 - }, - "comparison": { - "output_token_savings_pct": 57.9, - "input_token_savings_pct": 70.1, - "latency_savings_pct": 21.7 - }, - "token_table": { - "turns": [ - { - "turn": 0, - "base_input": 149, - "base_output": 1227, - "base_latency_ms": 5066, - "aap_input": 488, - "aap_output": 976, - "aap_latency_ms": 3969 - }, - { - "turn": 1, - "base_input": 1405, - "base_output": 1273, - "base_latency_ms": 5077, - "aap_input": 2113, - "aap_output": 689, - "aap_latency_ms": 3581, - "envelope_name": "synthesize", - "apply_ok": true - }, - { - "turn": 2, - "base_input": 2697, - "base_output": 240, - "base_latency_ms": 1447, - "aap_input": 0, - "aap_output": 0, - "aap_latency_ms": 2337, - "envelope_name": "edit", - "apply_ok": false - }, - { - "turn": 3, - "base_input": 2963, - "base_output": 125, - "base_latency_ms": 2747, - "aap_input": 0, - "aap_output": 0, - "aap_latency_ms": 1343, - "envelope_name": "edit", - "apply_ok": false - } - ], - "totals": { - "base_input": 7214, - "base_output": 2865, - "base_combined": 10079, - "aap_input": 2601, - "aap_output": 1665, - "aap_combined": 4266, - "base_latency_ms": 14337, - "aap_latency_ms": 11230, - "output_savings_pct": 41.9, - "input_delta_pct": -63.9, - "combined_savings_pct": 57.7, - "latency_savings_pct": 21.7 - } - }, - "quality": { - "per_turn": [ - { - "turn": 0, - "sequence_similarity": 0.1176, - "token_f1": 0.7611, - "base_char_count": 3929, - "aap_char_count": 2922, - "char_delta_pct": -25.6, - "lines_added": 38, - "lines_removed": 58, - "rouge_l": null, - "bleu": null - }, - { - "turn": 1, - "sequence_similarity": 0.0513, - "token_f1": 0.2629, - "base_char_count": 4069, - "aap_char_count": 1822, - "char_delta_pct": -55.2, - "lines_added": 19, - "lines_removed": 127, - "rouge_l": null, - "bleu": null - }, - { - "turn": 2, - "sequence_similarity": 0.0842, - "token_f1": 0.2458, - "base_char_count": 696, - "aap_char_count": 1822, - "char_delta_pct": 161.8, - "lines_added": 19, - "lines_removed": 20, - "rouge_l": null, - "bleu": null - }, - { - "turn": 3, - "sequence_similarity": 0.1015, - "token_f1": 0.1277, - "base_char_count": 365, - "aap_char_count": 1822, - "char_delta_pct": 399.2, - "lines_added": 19, - "lines_removed": 10, - "rouge_l": null, - "bleu": null - } - ], - "mean_sequence_similarity": 0.0887, - "mean_token_f1": 0.3494, - "mean_rouge_l": null, - "mean_bleu": null, - "judge_comparisons": null, - "mean_base_judge": null, - "mean_aap_judge": null - } -} diff --git a/evals/data/experiments/011-python-fastapi-users/outputs/aap/turn-0.py b/evals/data/experiments/011-python-fastapi-users/outputs/aap/turn-0.py deleted file mode 100644 index d07405b..0000000 --- a/evals/data/experiments/011-python-fastapi-users/outputs/aap/turn-0.py +++ /dev/null @@ -1,102 +0,0 @@ - -from typing import List, Optional -from datetime import datetime -from fastapi import APIRouter, Depends, HTTPException, status -from sqlalchemy.orm import Session -from sqlalchemy import Column, Integer, String, Boolean, DateTime, select -from pydantic import BaseModel, EmailStr -from sqlalchemy.ext.declarative import declarative_base - -Base = declarative_base() - - -class User(Base): - __tablename__ = "users" - id = Column(Integer, primary_key=True, index=True) - email = Column(String, unique=True, index=True, nullable=False) - name = Column(String, nullable=False) - hashed_password = Column(String, nullable=False) - role = Column(String, default="user") - is_active = Column(Boolean, default=True) - created_at = Column(DateTime, default=datetime.utcnow) - - - -class UserBase(BaseModel): - email: EmailStr - name: str - -class UserCreate(UserBase): - password: str - -class UserUpdate(BaseModel): - name: Optional[str] = None - is_active: Optional[bool] = None - -class UserResponse(UserBase): - id: int - role: str - is_active: bool - created_at: datetime - - class Config: - from_attributes = True - - - -def get_user(db: Session, user_id: int): - user = db.query(User).filter(User.id == user_id).first() - if not user: - raise HTTPException(status_code=404, detail="User not found") - return user - -def create_user(db: Session, user: UserCreate): - db_user = User(email=user.email, name=user.name, hashed_password=user.password + "_hashed") - db.add(db_user) - db.commit() - db.refresh(db_user) - return db_user - -def list_users(db: Session, skip: int = 0, limit: int = 100): - return db.query(User).offset(skip).limit(limit).all() - -def update_user(db: Session, user_id: int, user_update: UserUpdate): - db_user = get_user(db, user_id) - update_data = user_update.model_dump(exclude_unset=True) - for key, value in update_data.items(): - setattr(db_user, key, value) - db.commit() - db.refresh(db_user) - return db_user - -def delete_user(db: Session, user_id: int): - db_user = get_user(db, user_id) - db.delete(db_user) - db.commit() - return {"message": "User deleted"} - - - -router = APIRouter(prefix="/users", tags=["users"]) - -@router.post("/", response_model=UserResponse) -def add_user(user: UserCreate, db: Session = Depends()): - return create_user(db, user) - -@router.get("/", response_model=List[UserResponse]) -def read_users(skip: int = 0, limit: int = 100, db: Session = Depends()): - return list_users(db, skip, limit) - -@router.get("/{id}", response_model=UserResponse) -def read_user(id: int, db: Session = Depends()): - return get_user(db, id) - -@router.put("/{id}", response_model=UserResponse) -def update_user_endpoint(id: int, user: UserUpdate, db: Session = Depends()): - return update_user(db, id, user) - -@router.delete("/{id}") -def delete_user_endpoint(id: int, db: Session = Depends()): - return delete_user(db, id) - - \ No newline at end of file diff --git a/evals/data/experiments/011-python-fastapi-users/outputs/aap/turn-1.json b/evals/data/experiments/011-python-fastapi-users/outputs/aap/turn-1.json deleted file mode 100644 index df3404b..0000000 --- a/evals/data/experiments/011-python-fastapi-users/outputs/aap/turn-1.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "user-module-edit-role-schema", - "version": 1, - "name": "synthesize", - "meta": { - "format": "text/html", - "tokens_used": null, - "checksum": null, - "state": null - }, - "content": [ - { - "body": "{\n \"protocol\": \"aap/0.1\",\n \"id\": \"user-module\",\n \"version\": 2,\n \"name\": \"edit\",\n \"meta\": {\"format\": \"text/html\"},\n \"content\": [\n {\n \"op\": \"replace\",\n \"target\": {\"type\": \"id\", \"value\": \"schemas\"},\n \"content\": \"class UserBase(BaseModel):\\n email: EmailStr\\n name: str\\n\\nclass UserCreate(UserBase):\\n password: str\\n role: str = \\\"viewer\\\"\\n\\nclass UserUpdate(BaseModel):\\n name: Optional[str] = None\\n is_active: Optional[bool] = None\\n\\nclass UserResponse(UserBase):\\n id: int\\n role: str\\n is_active: bool\\n created_at: datetime\\n\\n class Config:\\n from_attributes = True\"\n },\n {\n \"op\": \"replace\",\n \"target\": {\"type\": \"id\", \"value\": \"crud\"},\n \"content\": \"def get_user(db: Session, user_id: int):\\n user = db.query(User).filter(User.id == user_id).first()\\n if not user:\\n raise HTTPException(status_code=404, detail=\\\"User not found\\\")\\n return user\\n\\ndef create_user(db: Session, user: UserCreate):\\n db_user = User(email=user.email, name=user.name, hashed_password=user.password + \\\"_hashed\\\", role=user.role)\\n db.add(db_user)\\n db.commit()\\n db.refresh(db_user)\\n return db_user\\n\\ndef list_users(db: Session, skip: int = 0, limit: int = 100):\\n return db.query(User).offset(skip).limit(limit).all()\\n\\ndef update_user(db: Session, user_id: int, user_update: UserUpdate):\\n db_user = get_user(db, user_id)\\n update_data = user_update.model_dump(exclude_unset=True)\\n for key, value in update_data.items():\\n setattr(db_user, key, value)\\n db.commit()\\n db.refresh(db_user)\\n return db_user\\n\\ndef delete_user(db: Session, user_id: int):\\n db_user = get_user(db, user_id)\\n db.delete(db_user)\\n db.commit()\\n return {\\\"message\\\": \\\"User deleted\\\"}\"\n }\n ]\n}\n" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/011-python-fastapi-users/outputs/aap/turn-1.py b/evals/data/experiments/011-python-fastapi-users/outputs/aap/turn-1.py deleted file mode 100644 index 775355e..0000000 --- a/evals/data/experiments/011-python-fastapi-users/outputs/aap/turn-1.py +++ /dev/null @@ -1,19 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "user-module", - "version": 2, - "name": "edit", - "meta": {"format": "text/html"}, - "content": [ - { - "op": "replace", - "target": {"type": "id", "value": "schemas"}, - "content": "class UserBase(BaseModel):\n email: EmailStr\n name: str\n\nclass UserCreate(UserBase):\n password: str\n role: str = \"viewer\"\n\nclass UserUpdate(BaseModel):\n name: Optional[str] = None\n is_active: Optional[bool] = None\n\nclass UserResponse(UserBase):\n id: int\n role: str\n is_active: bool\n created_at: datetime\n\n class Config:\n from_attributes = True" - }, - { - "op": "replace", - "target": {"type": "id", "value": "crud"}, - "content": "def get_user(db: Session, user_id: int):\n user = db.query(User).filter(User.id == user_id).first()\n if not user:\n raise HTTPException(status_code=404, detail=\"User not found\")\n return user\n\ndef create_user(db: Session, user: UserCreate):\n db_user = User(email=user.email, name=user.name, hashed_password=user.password + \"_hashed\", role=user.role)\n db.add(db_user)\n db.commit()\n db.refresh(db_user)\n return db_user\n\ndef list_users(db: Session, skip: int = 0, limit: int = 100):\n return db.query(User).offset(skip).limit(limit).all()\n\ndef update_user(db: Session, user_id: int, user_update: UserUpdate):\n db_user = get_user(db, user_id)\n update_data = user_update.model_dump(exclude_unset=True)\n for key, value in update_data.items():\n setattr(db_user, key, value)\n db.commit()\n db.refresh(db_user)\n return db_user\n\ndef delete_user(db: Session, user_id: int):\n db_user = get_user(db, user_id)\n db.delete(db_user)\n db.commit()\n return {\"message\": \"User deleted\"}" - } - ] -} diff --git a/evals/data/experiments/011-python-fastapi-users/outputs/aap/turn-2.json b/evals/data/experiments/011-python-fastapi-users/outputs/aap/turn-2.json deleted file mode 100644 index 1aafd5f..0000000 --- a/evals/data/experiments/011-python-fastapi-users/outputs/aap/turn-2.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "user-module", - "version": 3, - "name": "edit", - "meta": { - "format": "text/html", - "tokens_used": null, - "checksum": null, - "state": null - }, - "content": [ - { - "op": "replace", - "target": { - "type": "id", - "value": "crud" - }, - "content": "def get_user(db: Session, user_id: int):\n user = db.query(User).filter(User.id == user_id).first()\n if not user:\n raise HTTPException(status_code=404, detail=\"User not found\")\n return user\n\ndef create_user(db: Session, user: UserCreate):\n db_user = User(email=user.email, name=user.name, hashed_password=user.password + \"_hashed\", role=user.role)\n db.add(db_user)\n db.commit()\n db.refresh(db_user)\n return db_user\n\ndef list_users(db: Session, role: Optional[str] = None, is_active: Optional[bool] = None, skip: int = 0, limit: int = 100):\n query = db.query(User)\n if role:\n query = query.filter(User.role == role)\n if is_active is not None:\n query = query.filter(User.is_active == is_active)\n return query.offset(skip).limit(limit).all()\n\ndef update_user(db: Session, user_id: int, user_update: UserUpdate):\n db_user = get_user(db, user_id)\n update_data = user_update.model_dump(exclude_unset=True)\n for key, value in update_data.items():\n setattr(db_user, key, value)\n db.commit()\n db.refresh(db_user)\n return db_user\n\ndef delete_user(db: Session, user_id: int):\n db_user = get_user(db, user_id)\n db.delete(db_user)\n db.commit()\n return {\"message\": \"User deleted\"}" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/011-python-fastapi-users/outputs/aap/turn-2.py b/evals/data/experiments/011-python-fastapi-users/outputs/aap/turn-2.py deleted file mode 100644 index 775355e..0000000 --- a/evals/data/experiments/011-python-fastapi-users/outputs/aap/turn-2.py +++ /dev/null @@ -1,19 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "user-module", - "version": 2, - "name": "edit", - "meta": {"format": "text/html"}, - "content": [ - { - "op": "replace", - "target": {"type": "id", "value": "schemas"}, - "content": "class UserBase(BaseModel):\n email: EmailStr\n name: str\n\nclass UserCreate(UserBase):\n password: str\n role: str = \"viewer\"\n\nclass UserUpdate(BaseModel):\n name: Optional[str] = None\n is_active: Optional[bool] = None\n\nclass UserResponse(UserBase):\n id: int\n role: str\n is_active: bool\n created_at: datetime\n\n class Config:\n from_attributes = True" - }, - { - "op": "replace", - "target": {"type": "id", "value": "crud"}, - "content": "def get_user(db: Session, user_id: int):\n user = db.query(User).filter(User.id == user_id).first()\n if not user:\n raise HTTPException(status_code=404, detail=\"User not found\")\n return user\n\ndef create_user(db: Session, user: UserCreate):\n db_user = User(email=user.email, name=user.name, hashed_password=user.password + \"_hashed\", role=user.role)\n db.add(db_user)\n db.commit()\n db.refresh(db_user)\n return db_user\n\ndef list_users(db: Session, skip: int = 0, limit: int = 100):\n return db.query(User).offset(skip).limit(limit).all()\n\ndef update_user(db: Session, user_id: int, user_update: UserUpdate):\n db_user = get_user(db, user_id)\n update_data = user_update.model_dump(exclude_unset=True)\n for key, value in update_data.items():\n setattr(db_user, key, value)\n db.commit()\n db.refresh(db_user)\n return db_user\n\ndef delete_user(db: Session, user_id: int):\n db_user = get_user(db, user_id)\n db.delete(db_user)\n db.commit()\n return {\"message\": \"User deleted\"}" - } - ] -} diff --git a/evals/data/experiments/011-python-fastapi-users/outputs/aap/turn-3.json b/evals/data/experiments/011-python-fastapi-users/outputs/aap/turn-3.json deleted file mode 100644 index c257f12..0000000 --- a/evals/data/experiments/011-python-fastapi-users/outputs/aap/turn-3.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "user-module", - "version": 3, - "name": "edit", - "meta": { - "format": "text/html", - "tokens_used": null, - "checksum": null, - "state": null - }, - "content": [ - { - "op": "insert_after", - "target": { - "type": "id", - "value": "crud" - }, - "content": "\ndef deactivate_user(db: Session, user_id: int):\n db_user = get_user(db, user_id)\n db_user.is_active = False\n db.commit()\n db.refresh(db_user)\n return db_user\n" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/011-python-fastapi-users/outputs/aap/turn-3.py b/evals/data/experiments/011-python-fastapi-users/outputs/aap/turn-3.py deleted file mode 100644 index 775355e..0000000 --- a/evals/data/experiments/011-python-fastapi-users/outputs/aap/turn-3.py +++ /dev/null @@ -1,19 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "user-module", - "version": 2, - "name": "edit", - "meta": {"format": "text/html"}, - "content": [ - { - "op": "replace", - "target": {"type": "id", "value": "schemas"}, - "content": "class UserBase(BaseModel):\n email: EmailStr\n name: str\n\nclass UserCreate(UserBase):\n password: str\n role: str = \"viewer\"\n\nclass UserUpdate(BaseModel):\n name: Optional[str] = None\n is_active: Optional[bool] = None\n\nclass UserResponse(UserBase):\n id: int\n role: str\n is_active: bool\n created_at: datetime\n\n class Config:\n from_attributes = True" - }, - { - "op": "replace", - "target": {"type": "id", "value": "crud"}, - "content": "def get_user(db: Session, user_id: int):\n user = db.query(User).filter(User.id == user_id).first()\n if not user:\n raise HTTPException(status_code=404, detail=\"User not found\")\n return user\n\ndef create_user(db: Session, user: UserCreate):\n db_user = User(email=user.email, name=user.name, hashed_password=user.password + \"_hashed\", role=user.role)\n db.add(db_user)\n db.commit()\n db.refresh(db_user)\n return db_user\n\ndef list_users(db: Session, skip: int = 0, limit: int = 100):\n return db.query(User).offset(skip).limit(limit).all()\n\ndef update_user(db: Session, user_id: int, user_update: UserUpdate):\n db_user = get_user(db, user_id)\n update_data = user_update.model_dump(exclude_unset=True)\n for key, value in update_data.items():\n setattr(db_user, key, value)\n db.commit()\n db.refresh(db_user)\n return db_user\n\ndef delete_user(db: Session, user_id: int):\n db_user = get_user(db, user_id)\n db.delete(db_user)\n db.commit()\n return {\"message\": \"User deleted\"}" - } - ] -} diff --git a/evals/data/experiments/011-python-fastapi-users/outputs/base/turn-0.py b/evals/data/experiments/011-python-fastapi-users/outputs/base/turn-0.py deleted file mode 100644 index a4767af..0000000 --- a/evals/data/experiments/011-python-fastapi-users/outputs/base/turn-0.py +++ /dev/null @@ -1,121 +0,0 @@ -from typing import List, Optional -from datetime import datetime -from fastapi import FastAPI, Depends, HTTPException, status, APIRouter -from pydantic import BaseModel, EmailStr -from sqlalchemy import Column, Integer, String, Boolean, DateTime, create_engine -from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy.orm import sessionmaker, Session, declarative_base - -# Database Configuration -SQLALCHEMY_DATABASE_URL = "sqlite:///./users.db" -engine = create_engine(SQLALCHEMY_DATABASE_URL, connect_args={"check_same_thread": False}) -SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine) -Base = declarative_base() - -# Models -class User(Base): - __tablename__ = "users" - id = Column(Integer, primary_key=True, index=True) - email = Column(String, unique=True, index=True, nullable=False) - name = Column(String, nullable=False) - hashed_password = Column(String, nullable=False) - role = Column(String, default="user") - is_active = Column(Boolean, default=True) - created_at = Column(DateTime, default=datetime.utcnow) - -Base.metadata.create_all(bind=engine) - -# Schemas -class UserBase(BaseModel): - email: EmailStr - name: str - -class UserCreate(UserBase): - password: str - -class UserUpdate(BaseModel): - name: Optional[str] = None - is_active: Optional[bool] = None - -class UserResponse(UserBase): - id: int - role: str - is_active: bool - created_at: datetime - - class Config: - from_attributes = True - -# Dependency -def get_db(): - db = SessionLocal() - try: - yield db - finally: - db.close() - -# CRUD Functions -def get_user_by_id(db: Session, user_id: int): - return db.query(User).filter(User.id == user_id).first() - -def create_user(db: Session, user: UserCreate): - db_user = User(email=user.email, name=user.name, hashed_password=user.password + "hashed") - db.add(db_user) - db.commit() - db.refresh(db_user) - return db_user - -def list_users(db: Session, skip: int = 0, limit: int = 100): - return db.query(User).offset(skip).limit(limit).all() - -def update_user(db: Session, user_id: int, user_data: UserUpdate): - db_user = get_user_by_id(db, user_id) - if not db_user: return None - for key, value in user_data.model_dump(exclude_unset=True).items(): - setattr(db_user, key, value) - db.commit() - db.refresh(db_user) - return db_user - -def delete_user(db: Session, user_id: int): - db_user = get_user_by_id(db, user_id) - if db_user: - db.delete(db_user) - db.commit() - return db_user - -# Router -router = APIRouter() - -@router.post("/users", response_model=UserResponse) -def create_new_user(user: UserCreate, db: Session = Depends(get_db)): - if db.query(User).filter(User.email == user.email).first(): - raise HTTPException(status_code=400, detail="Email already registered") - return create_user(db, user) - -@router.get("/users", response_model=List[UserResponse]) -def read_users(skip: int = 0, limit: int = 100, db: Session = Depends(get_db)): - return list_users(db, skip=skip, limit=limit) - -@router.get("/users/{user_id}", response_model=UserResponse) -def read_user(user_id: int, db: Session = Depends(get_db)): - db_user = get_user_by_id(db, user_id) - if not db_user: - raise HTTPException(status_code=404, detail="User not found") - return db_user - -@router.put("/users/{user_id}", response_model=UserResponse) -def update_existing_user(user_id: int, user: UserUpdate, db: Session = Depends(get_db)): - db_user = update_user(db, user_id, user) - if not db_user: - raise HTTPException(status_code=404, detail="User not found") - return db_user - -@router.delete("/users/{user_id}") -def delete_existing_user(user_id: int, db: Session = Depends(get_db)): - if not delete_user(db, user_id): - raise HTTPException(status_code=404, detail="User not found") - return {"message": "User deleted"} - -app = FastAPI() -app.include_router(router) \ No newline at end of file diff --git a/evals/data/experiments/011-python-fastapi-users/outputs/base/turn-1.py b/evals/data/experiments/011-python-fastapi-users/outputs/base/turn-1.py deleted file mode 100644 index d35c64c..0000000 --- a/evals/data/experiments/011-python-fastapi-users/outputs/base/turn-1.py +++ /dev/null @@ -1,127 +0,0 @@ -from typing import List, Optional, Literal -from datetime import datetime -from fastapi import FastAPI, Depends, HTTPException, status, APIRouter -from pydantic import BaseModel, EmailStr, Field -from sqlalchemy import Column, Integer, String, Boolean, DateTime, create_engine -from sqlalchemy.orm import sessionmaker, Session, declarative_base - -# Database Configuration -SQLALCHEMY_DATABASE_URL = "sqlite:///./users.db" -engine = create_engine(SQLALCHEMY_DATABASE_URL, connect_args={"check_same_thread": False}) -SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine) -Base = declarative_base() - -# Models -class User(Base): - __tablename__ = "users" - id = Column(Integer, primary_key=True, index=True) - email = Column(String, unique=True, index=True, nullable=False) - name = Column(String, nullable=False) - hashed_password = Column(String, nullable=False) - role = Column(String, default="viewer") - is_active = Column(Boolean, default=True) - created_at = Column(DateTime, default=datetime.utcnow) - -Base.metadata.create_all(bind=engine) - -# Schemas -class UserBase(BaseModel): - email: EmailStr - name: str - -class UserCreate(UserBase): - password: str - role: Literal['admin', 'editor', 'viewer'] = 'viewer' - -class UserUpdate(BaseModel): - name: Optional[str] = None - is_active: Optional[bool] = None - role: Optional[Literal['admin', 'editor', 'viewer']] = None - -class UserResponse(UserBase): - id: int - role: str - is_active: bool - created_at: datetime - - class Config: - from_attributes = True - -# Dependency -def get_db(): - db = SessionLocal() - try: - yield db - finally: - db.close() - -# CRUD Functions -def get_user_by_id(db: Session, user_id: int): - return db.query(User).filter(User.id == user_id).first() - -def create_user(db: Session, user: UserCreate): - db_user = User( - email=user.email, - name=user.name, - hashed_password=user.password + "hashed", - role=user.role - ) - db.add(db_user) - db.commit() - db.refresh(db_user) - return db_user - -def list_users(db: Session, skip: int = 0, limit: int = 100): - return db.query(User).offset(skip).limit(limit).all() - -def update_user(db: Session, user_id: int, user_data: UserUpdate): - db_user = get_user_by_id(db, user_id) - if not db_user: return None - for key, value in user_data.model_dump(exclude_unset=True).items(): - setattr(db_user, key, value) - db.commit() - db.refresh(db_user) - return db_user - -def delete_user(db: Session, user_id: int): - db_user = get_user_by_id(db, user_id) - if db_user: - db.delete(db_user) - db.commit() - return db_user - -# Router -router = APIRouter() - -@router.post("/users", response_model=UserResponse) -def create_new_user(user: UserCreate, db: Session = Depends(get_db)): - if db.query(User).filter(User.email == user.email).first(): - raise HTTPException(status_code=400, detail="Email already registered") - return create_user(db, user) - -@router.get("/users", response_model=List[UserResponse]) -def read_users(skip: int = 0, limit: int = 100, db: Session = Depends(get_db)): - return list_users(db, skip=skip, limit=limit) - -@router.get("/users/{user_id}", response_model=UserResponse) -def read_user(user_id: int, db: Session = Depends(get_db)): - db_user = get_user_by_id(db, user_id) - if not db_user: - raise HTTPException(status_code=404, detail="User not found") - return db_user - -@router.put("/users/{user_id}", response_model=UserResponse) -def update_existing_user(user_id: int, user: UserUpdate, db: Session = Depends(get_db)): - db_user = update_user(db, user_id, user) - if not db_user: - raise HTTPException(status_code=404, detail="User not found") - return db_user - -@router.delete("/users/{user_id}") -def delete_existing_user(user_id: int, db: Session = Depends(get_db)): - if not delete_user(db, user_id): - raise HTTPException(status_code=404, detail="User not found") - return {"message": "User deleted"} - -app = FastAPI() -app.include_router(router) \ No newline at end of file diff --git a/evals/data/experiments/011-python-fastapi-users/outputs/base/turn-2.py b/evals/data/experiments/011-python-fastapi-users/outputs/base/turn-2.py deleted file mode 100644 index 4dd4f98..0000000 --- a/evals/data/experiments/011-python-fastapi-users/outputs/base/turn-2.py +++ /dev/null @@ -1,20 +0,0 @@ -def list_users(db: Session, skip: int = 0, limit: int = 100, role: Optional[str] = None, is_active: Optional[bool] = None): - query = db.query(User) - - if role: - query = query.filter(User.role == role) - if is_active is not None: - query = query.filter(User.is_active == is_active) - - return query.offset(skip).limit(limit).all() - -# Updated Router endpoint -@router.get("/users", response_model=List[UserResponse]) -def read_users( - skip: int = 0, - limit: int = 100, - role: Optional[str] = None, - is_active: Optional[bool] = None, - db: Session = Depends(get_db) -): - return list_users(db, skip=skip, limit=limit, role=role, is_active=is_active) \ No newline at end of file diff --git a/evals/data/experiments/011-python-fastapi-users/outputs/base/turn-3.py b/evals/data/experiments/011-python-fastapi-users/outputs/base/turn-3.py deleted file mode 100644 index 59df44a..0000000 --- a/evals/data/experiments/011-python-fastapi-users/outputs/base/turn-3.py +++ /dev/null @@ -1,10 +0,0 @@ -@router.patch("/users/{user_id}/deactivate", response_model=UserResponse) -def deactivate_user(user_id: int, db: Session = Depends(get_db)): - db_user = get_user_by_id(db, user_id) - if not db_user: - raise HTTPException(status_code=404, detail="User not found") - - db_user.is_active = False - db.commit() - db.refresh(db_user) - return db_user \ No newline at end of file diff --git a/evals/data/experiments/012-python-cli-log-analyzer/eval.json b/evals/data/experiments/012-python-cli-log-analyzer/eval.json deleted file mode 100644 index 2817811..0000000 --- a/evals/data/experiments/012-python-cli-log-analyzer/eval.json +++ /dev/null @@ -1,59 +0,0 @@ -{ - "per_turn": [ - { - "turn": 0, - "sequence_similarity": 0.0901, - "token_f1": 0.5804, - "base_char_count": 3008, - "aap_char_count": 2251, - "char_delta_pct": -25.2, - "lines_added": 47, - "lines_removed": 58, - "rouge_l": null, - "bleu": null - }, - { - "turn": 1, - "sequence_similarity": 0.2194, - "token_f1": 0.5948, - "base_char_count": 3359, - "aap_char_count": 2602, - "char_delta_pct": -22.5, - "lines_added": 53, - "lines_removed": 64, - "rouge_l": null, - "bleu": null - }, - { - "turn": 2, - "sequence_similarity": 0.0173, - "token_f1": 0.1161, - "base_char_count": 3361, - "aap_char_count": 1023, - "char_delta_pct": -69.6, - "lines_added": 14, - "lines_removed": 89, - "rouge_l": null, - "bleu": null - }, - { - "turn": 3, - "sequence_similarity": 0.1934, - "token_f1": 0.3195, - "base_char_count": 3654, - "aap_char_count": 1827, - "char_delta_pct": -50.0, - "lines_added": 42, - "lines_removed": 80, - "rouge_l": null, - "bleu": null - } - ], - "mean_sequence_similarity": 0.13, - "mean_token_f1": 0.4027, - "mean_rouge_l": null, - "mean_bleu": null, - "judge_comparisons": null, - "mean_base_judge": null, - "mean_aap_judge": null -} diff --git a/evals/data/experiments/012-python-cli-log-analyzer/metrics.json b/evals/data/experiments/012-python-cli-log-analyzer/metrics.json deleted file mode 100644 index a82cfaa..0000000 --- a/evals/data/experiments/012-python-cli-log-analyzer/metrics.json +++ /dev/null @@ -1,228 +0,0 @@ -{ - "experiment_id": "012-python-cli-log-analyzer", - "model": "", - "provider": "google", - "timestamp": "2026-04-03T06:54:20.420564+00:00", - "format": "text/x-python", - "base_turn0": { - "input_tokens": 120, - "output_tokens": 937, - "latency_ms": 4690, - "artifact_bytes": 3011 - }, - "aap_turn0": { - "input_tokens": 459, - "output_tokens": 798, - "latency_ms": 4191, - "artifact_bytes": 2514 - }, - "default_flow": { - "per_turn": [ - { - "turn": 1, - "edit": "Add a new analyzer function called 'detect_anomalies' that flags response times ", - "input_tokens": 1083, - "output_tokens": 1098, - "latency_ms": 4547, - "output_bytes": 3380, - "failed": false, - "failure_reason": "" - }, - { - "turn": 2, - "edit": "Update the argparse main function to add a --group-by flag that accepts 'hour', ", - "input_tokens": 2213, - "output_tokens": 1064, - "latency_ms": 5399, - "output_bytes": 3303, - "failed": false, - "failure_reason": "" - }, - { - "turn": 3, - "edit": "Rewrite the table formatter to use box-drawing characters for borders instead of", - "input_tokens": 3298, - "output_tokens": 1090, - "latency_ms": 4852, - "output_bytes": 3425, - "failed": false, - "failure_reason": "" - } - ], - "total_input_tokens": 6594, - "total_output_tokens": 3252, - "total_latency_ms": 14798 - }, - "aap_flow": { - "per_turn": [ - { - "turn": 1, - "edit": "Add a new analyzer function called 'detect_anomalies' that flags response times ", - "input_tokens": 1932, - "output_tokens": 201, - "latency_ms": 1475, - "output_bytes": 2891, - "failed": false, - "failure_reason": "", - "envelope_parsed": true, - "apply_succeeded": true, - "envelope_name": "edit" - }, - { - "turn": 2, - "edit": "Update the argparse main function to add a --group-by flag that accepts 'hour', ", - "input_tokens": 2062, - "output_tokens": 681, - "latency_ms": 3469, - "output_bytes": 1937, - "failed": false, - "failure_reason": "", - "envelope_parsed": true, - "apply_succeeded": true, - "envelope_name": "synthesize" - }, - { - "turn": 3, - "edit": "Rewrite the table formatter to use box-drawing characters for borders instead of", - "input_tokens": 1767, - "output_tokens": 701, - "latency_ms": 4018, - "output_bytes": 2240, - "failed": false, - "failure_reason": "", - "envelope_parsed": true, - "apply_succeeded": true, - "envelope_name": "synthesize" - } - ], - "total_input_tokens": 5761, - "total_output_tokens": 1583, - "total_latency_ms": 8962, - "envelope_parse_rate": 1.0, - "apply_success_rate": 1.0 - }, - "comparison": { - "output_token_savings_pct": 51.3, - "input_token_savings_pct": 12.6, - "latency_savings_pct": 39.4 - }, - "token_table": { - "turns": [ - { - "turn": 0, - "base_input": 120, - "base_output": 937, - "base_latency_ms": 4690, - "aap_input": 459, - "aap_output": 798, - "aap_latency_ms": 4191 - }, - { - "turn": 1, - "base_input": 1083, - "base_output": 1098, - "base_latency_ms": 4547, - "aap_input": 1932, - "aap_output": 201, - "aap_latency_ms": 1475, - "envelope_name": "edit", - "apply_ok": true - }, - { - "turn": 2, - "base_input": 2213, - "base_output": 1064, - "base_latency_ms": 5399, - "aap_input": 2062, - "aap_output": 681, - "aap_latency_ms": 3469, - "envelope_name": "synthesize", - "apply_ok": true - }, - { - "turn": 3, - "base_input": 3298, - "base_output": 1090, - "base_latency_ms": 4852, - "aap_input": 1767, - "aap_output": 701, - "aap_latency_ms": 4018, - "envelope_name": "synthesize", - "apply_ok": true - } - ], - "totals": { - "base_input": 6714, - "base_output": 4189, - "base_combined": 10903, - "aap_input": 6220, - "aap_output": 2381, - "aap_combined": 8601, - "base_latency_ms": 19488, - "aap_latency_ms": 13153, - "output_savings_pct": 43.2, - "input_delta_pct": -7.4, - "combined_savings_pct": 21.1, - "latency_savings_pct": 32.5 - } - }, - "quality": { - "per_turn": [ - { - "turn": 0, - "sequence_similarity": 0.2279, - "token_f1": 0.5714, - "base_char_count": 3011, - "aap_char_count": 2289, - "char_delta_pct": -24.0, - "lines_added": 44, - "lines_removed": 71, - "rouge_l": null, - "bleu": null - }, - { - "turn": 1, - "sequence_similarity": 0.2269, - "token_f1": 0.5608, - "base_char_count": 3380, - "aap_char_count": 2666, - "char_delta_pct": -21.1, - "lines_added": 53, - "lines_removed": 69, - "rouge_l": null, - "bleu": null - }, - { - "turn": 2, - "sequence_similarity": 0.1271, - "token_f1": 0.2325, - "base_char_count": 3303, - "aap_char_count": 1937, - "char_delta_pct": -41.4, - "lines_added": 23, - "lines_removed": 81, - "rouge_l": null, - "bleu": null - }, - { - "turn": 3, - "sequence_similarity": 0.0858, - "token_f1": 0.4168, - "base_char_count": 3401, - "aap_char_count": 1939, - "char_delta_pct": -43.0, - "lines_added": 44, - "lines_removed": 68, - "rouge_l": null, - "bleu": null - } - ], - "mean_sequence_similarity": 0.1669, - "mean_token_f1": 0.4454, - "mean_rouge_l": null, - "mean_bleu": null, - "judge_comparisons": null, - "mean_base_judge": null, - "mean_aap_judge": null - } -} diff --git a/evals/data/experiments/012-python-cli-log-analyzer/outputs/aap/turn-0.py b/evals/data/experiments/012-python-cli-log-analyzer/outputs/aap/turn-0.py deleted file mode 100644 index a70828b..0000000 --- a/evals/data/experiments/012-python-cli-log-analyzer/outputs/aap/turn-0.py +++ /dev/null @@ -1,67 +0,0 @@ -import argparse -import json -import re -import csv -from datetime import datetime -from collections import Counter, defaultdict -from typing import List, Dict, Any, Optional - -def parse_log_line(line: str) -> Optional[Dict[str, Any]]: - # Simple regex for Common/Nginx log format - regex = r'(?P\S+) \S+ \S+ \[(?P.*?)\] "(?P\S+) (?P\S+) \S+" (?P\d+) (?P\d+) (?P\d+\.\d+)' - match = re.match(regex, line) - if match: - return match.groupdict() - try: - return json.loads(line) - except: - return None - - -def analyze_logs(logs: List[Dict[str, Any]], top_n: int = 10) -> Dict[str, Any]: - ips = Counter(log['ip'] for log in logs) - status_codes = Counter(log['status'] for log in logs) - endpoints = Counter(log['url'] for log in logs) - response_times = [float(log['response_time']) for log in logs if 'response_time' in log] - - return { - "top_ips": ips.most_common(top_n), - "status_distribution": dict(status_codes), - "top_endpoints": endpoints.most_common(top_n), - "avg_response_time": sum(response_times) / len(response_times) if response_times else 0 - } - - -def format_output(data: Dict[str, Any], fmt: str) -> str: - if fmt == 'json': - return json.dumps(data, indent=2) - elif fmt == 'csv': - output = [] - for key, value in data.items(): - output.append(f"{key},{value}") - return "\n".join(output) - else: - # Simple ASCII table - lines = ["Analysis Report", "---------------"] - for k, v in data.items(): - lines.append(f"{k.replace('_', ' ').title()}: {v}") - return "\n".join(lines) - - -def main() -> None: - parser = argparse.ArgumentParser(description="CLI Log Analyzer") - parser.add_argument("--file", required=True, help="Path to log file") - parser.add_argument("--format", choices=['table', 'json', 'csv'], default='table') - parser.add_argument("--top", type=int, default=10) - - args = parser.parse_args() - - with open(args.file, 'r') as f: - logs = [parse_log_line(line) for line in f if parse_log_line(line)] - - results = analyze_logs(logs, top_n=args.top) - print(format_output(results, args.format)) - -if __name__ == "__main__": - main() - \ No newline at end of file diff --git a/evals/data/experiments/012-python-cli-log-analyzer/outputs/aap/turn-1.json b/evals/data/experiments/012-python-cli-log-analyzer/outputs/aap/turn-1.json deleted file mode 100644 index 6d0c0ad..0000000 --- a/evals/data/experiments/012-python-cli-log-analyzer/outputs/aap/turn-1.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "log-analyzer-script", - "version": 1, - "name": "edit", - "meta": { - "format": "text/html", - "tokens_used": null, - "checksum": null, - "state": null - }, - "content": [ - { - "op": "insert_after", - "target": { - "type": "id", - "value": "analysis-functions" - }, - "content": "\ndef detect_anomalies(logs: List[Dict[str, Any]], percentile: float = 99.0) -> List[Dict[str, Any]]:\n times = sorted([float(log['response_time']) for log in logs if 'response_time' in log])\n if not times:\n return []\n \n threshold = times[int(len(times) * (percentile / 100))]\n return [log for log in logs if float(log.get('response_time', 0)) > threshold]\n" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/012-python-cli-log-analyzer/outputs/aap/turn-1.py b/evals/data/experiments/012-python-cli-log-analyzer/outputs/aap/turn-1.py deleted file mode 100644 index 474624a..0000000 --- a/evals/data/experiments/012-python-cli-log-analyzer/outputs/aap/turn-1.py +++ /dev/null @@ -1,75 +0,0 @@ -import argparse -import json -import re -import csv -from datetime import datetime -from collections import Counter, defaultdict -from typing import List, Dict, Any, Optional - -def parse_log_line(line: str) -> Optional[Dict[str, Any]]: - # Simple regex for Common/Nginx log format - regex = r'(?P\S+) \S+ \S+ \[(?P.*?)\] "(?P\S+) (?P\S+) \S+" (?P\d+) (?P\d+) (?P\d+\.\d+)' - match = re.match(regex, line) - if match: - return match.groupdict() - try: - return json.loads(line) - except: - return None - - -def analyze_logs(logs: List[Dict[str, Any]], top_n: int = 10) -> Dict[str, Any]: - ips = Counter(log['ip'] for log in logs) - status_codes = Counter(log['status'] for log in logs) - endpoints = Counter(log['url'] for log in logs) - response_times = [float(log['response_time']) for log in logs if 'response_time' in log] - - return { - "top_ips": ips.most_common(top_n), - "status_distribution": dict(status_codes), - "top_endpoints": endpoints.most_common(top_n), - "avg_response_time": sum(response_times) / len(response_times) if response_times else 0 - } - -def detect_anomalies(logs: List[Dict[str, Any]], percentile: float = 99.0) -> List[Dict[str, Any]]: - times = sorted([float(log['response_time']) for log in logs if 'response_time' in log]) - if not times: - return [] - - threshold = times[int(len(times) * (percentile / 100))] - return [log for log in logs if float(log.get('response_time', 0)) > threshold] - - -def format_output(data: Dict[str, Any], fmt: str) -> str: - if fmt == 'json': - return json.dumps(data, indent=2) - elif fmt == 'csv': - output = [] - for key, value in data.items(): - output.append(f"{key},{value}") - return "\n".join(output) - else: - # Simple ASCII table - lines = ["Analysis Report", "---------------"] - for k, v in data.items(): - lines.append(f"{k.replace('_', ' ').title()}: {v}") - return "\n".join(lines) - - -def main() -> None: - parser = argparse.ArgumentParser(description="CLI Log Analyzer") - parser.add_argument("--file", required=True, help="Path to log file") - parser.add_argument("--format", choices=['table', 'json', 'csv'], default='table') - parser.add_argument("--top", type=int, default=10) - - args = parser.parse_args() - - with open(args.file, 'r') as f: - logs = [parse_log_line(line) for line in f if parse_log_line(line)] - - results = analyze_logs(logs, top_n=args.top) - print(format_output(results, args.format)) - -if __name__ == "__main__": - main() - \ No newline at end of file diff --git a/evals/data/experiments/012-python-cli-log-analyzer/outputs/aap/turn-2.json b/evals/data/experiments/012-python-cli-log-analyzer/outputs/aap/turn-2.json deleted file mode 100644 index 3a3ed2e..0000000 --- a/evals/data/experiments/012-python-cli-log-analyzer/outputs/aap/turn-2.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "log-analyzer-script", - "version": 1, - "name": "synthesize", - "meta": { - "format": "text/html", - "tokens_used": null, - "checksum": null, - "state": null - }, - "content": [ - { - "body": "{\n \"protocol\": \"aap/0.1\",\n \"id\": \"log-analyzer-script\",\n \"version\": 2,\n \"name\": \"edit\",\n \"content\": [\n {\n \"op\": \"insert_after\",\n \"target\": {\n \"type\": \"id\",\n \"value\": \"analysis-functions\"\n },\n \"content\": \"\\ndef group_logs_by_time(logs: List[Dict[str, Any]], interval: str) -> Dict[str, List[Dict[str, Any]]}:\\n grouped = defaultdict(list)\\n for log in logs:\\n # Assuming timestamp format: [dd/MMM/yyyy:HH:mm:ss +0000]\\n dt = datetime.strptime(log['timestamp'].split(':')[0], '%d/%b/%Y')\\n if interval == 'hour':\\n key = dt.strftime('%Y-%m-%d %H')\\n elif interval == 'day':\\n key = dt.strftime('%Y-%m-%d')\\n elif interval == 'week':\\n key = dt.strftime('%Y-W%U')\\n else:\\n key = 'all'\\n grouped[key].append(log)\\n return dict(grouped)\\n\"\n },\n {\n \"op\": \"replace\",\n \"target\": {\n \"type\": \"id\",\n \"value\": \"main-function\"\n },\n \"content\": \"def main() -> None:\\n parser = argparse.ArgumentParser(description=\\\"CLI Log Analyzer\\\")\\n parser.add_argument(\\\"--file\\\", required=True, help=\\\"Path to log file\\\")\\n parser.add_argument(\\\"--format\\\", choices=['table', 'json', 'csv'], default='table')\\n parser.add_argument(\\\"--top\\\", type=int, default=10)\\n parser.add_argument(\\\"--group-by\\\", choices=['hour', 'day', 'week'], help=\\\"Group results by time interval\\\")\\n \\n args = parser.parse_args()\\n \\n with open(args.file, 'r') as f:\\n logs = [parse_log_line(line) for line in f if parse_log_line(line)]\\n \\n if args.group_by:\\n grouped_logs = group_logs_by_time(logs, args.group_by)\\n results = {interval: analyze_logs(logs, top_n=args.top) for interval, logs in grouped_logs.items()}\\n else:\\n results = analyze_logs(logs, top_n=args.top)\\n\\n print(format_output(results, args.format))\"\n }\n ]\n}" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/012-python-cli-log-analyzer/outputs/aap/turn-2.py b/evals/data/experiments/012-python-cli-log-analyzer/outputs/aap/turn-2.py deleted file mode 100644 index 978e46d..0000000 --- a/evals/data/experiments/012-python-cli-log-analyzer/outputs/aap/turn-2.py +++ /dev/null @@ -1,24 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "log-analyzer-script", - "version": 2, - "name": "edit", - "content": [ - { - "op": "insert_after", - "target": { - "type": "id", - "value": "analysis-functions" - }, - "content": "\ndef group_logs_by_time(logs: List[Dict[str, Any]], interval: str) -> Dict[str, List[Dict[str, Any]]}:\n grouped = defaultdict(list)\n for log in logs:\n # Assuming timestamp format: [dd/MMM/yyyy:HH:mm:ss +0000]\n dt = datetime.strptime(log['timestamp'].split(':')[0], '%d/%b/%Y')\n if interval == 'hour':\n key = dt.strftime('%Y-%m-%d %H')\n elif interval == 'day':\n key = dt.strftime('%Y-%m-%d')\n elif interval == 'week':\n key = dt.strftime('%Y-W%U')\n else:\n key = 'all'\n grouped[key].append(log)\n return dict(grouped)\n" - }, - { - "op": "replace", - "target": { - "type": "id", - "value": "main-function" - }, - "content": "def main() -> None:\n parser = argparse.ArgumentParser(description=\"CLI Log Analyzer\")\n parser.add_argument(\"--file\", required=True, help=\"Path to log file\")\n parser.add_argument(\"--format\", choices=['table', 'json', 'csv'], default='table')\n parser.add_argument(\"--top\", type=int, default=10)\n parser.add_argument(\"--group-by\", choices=['hour', 'day', 'week'], help=\"Group results by time interval\")\n \n args = parser.parse_args()\n \n with open(args.file, 'r') as f:\n logs = [parse_log_line(line) for line in f if parse_log_line(line)]\n \n if args.group_by:\n grouped_logs = group_logs_by_time(logs, args.group_by)\n results = {interval: analyze_logs(logs, top_n=args.top) for interval, logs in grouped_logs.items()}\n else:\n results = analyze_logs(logs, top_n=args.top)\n\n print(format_output(results, args.format))" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/012-python-cli-log-analyzer/outputs/aap/turn-3.json b/evals/data/experiments/012-python-cli-log-analyzer/outputs/aap/turn-3.json deleted file mode 100644 index 0ce5ba3..0000000 --- a/evals/data/experiments/012-python-cli-log-analyzer/outputs/aap/turn-3.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "log-analyzer-script-v3", - "version": 1, - "name": "synthesize", - "meta": { - "format": "text/html", - "tokens_used": null, - "checksum": null, - "state": null - }, - "content": [ - { - "body": "import argparse\nfrom collections import defaultdict, Counter\nfrom datetime import datetime\nfrom typing import List, Dict, Any\nimport re\nimport json\nimport csv\nimport sys\n\n\ndef parse_log_line(line: str) -> Dict[str, Any]:\n # Simplified regex for demo purposes\n pattern = r'(?P\\S+) - - \\[(?P.*?)\\] \"(?P.*?)\" (?P\\d+) (?P\\d+)'\n match = re.match(pattern, line)\n return match.groupdict() if match else None\n\ndef analyze_logs(logs: List[Dict[str, Any]], top_n: int = 10) -> Dict[str, Any]:\n ips = Counter(log['ip'] for log in logs)\n return {\"top_ips\": ips.most_common(top_n)}\n\n\n\ndef format_output(data: Any, format_type: str) -> str:\n if format_type == 'json':\n return json.dumps(data, indent=2)\n elif format_type == 'csv':\n # Simple implementation\n output = []\n for ip, count in data['top_ips']:\n output.append(f\"{ip},{count}\")\n return \"\\n\".join(output)\n else:\n # Box-drawing table\n lines = []\n lines.append(\"┌───────────────┬───────┐\")\n lines.append(\"│ IP Address │ Count │\")\n lines.append(\"├───────────────┼───────┤\")\n for ip, count in data['top_ips']:\n lines.append(f\"│ {ip:<13} │ {count:<5} │\")\n lines.append(\"└───────────────┴───────┘\")\n return \"\\n\".join(lines)\n\n\n\ndef main() -> None:\n parser = argparse.ArgumentParser(description=\"CLI Log Analyzer\")\n parser.add_argument(\"--file\", required=True, help=\"Path to log file\")\n parser.add_argument(\"--format\", choices=['table', 'json', 'csv'], default='table')\n parser.add_argument(\"--top\", type=int, default=10)\n args = parser.parse_args()\n \n with open(args.file, 'r') as f:\n logs = [parse_log_line(line) for line in f if parse_log_line(line)]\n \n results = analyze_logs(logs, top_n=args.top)\n print(format_output(results, args.format))\n\n\nif __name__ == \"__main__\":\n main()\n" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/012-python-cli-log-analyzer/outputs/aap/turn-3.py b/evals/data/experiments/012-python-cli-log-analyzer/outputs/aap/turn-3.py deleted file mode 100644 index 011ae6e..0000000 --- a/evals/data/experiments/012-python-cli-log-analyzer/outputs/aap/turn-3.py +++ /dev/null @@ -1,60 +0,0 @@ -import argparse -from collections import defaultdict, Counter -from datetime import datetime -from typing import List, Dict, Any -import re -import json -import csv -import sys - - -def parse_log_line(line: str) -> Dict[str, Any]: - # Simplified regex for demo purposes - pattern = r'(?P\S+) - - \[(?P.*?)\] "(?P.*?)" (?P\d+) (?P\d+)' - match = re.match(pattern, line) - return match.groupdict() if match else None - -def analyze_logs(logs: List[Dict[str, Any]], top_n: int = 10) -> Dict[str, Any]: - ips = Counter(log['ip'] for log in logs) - return {"top_ips": ips.most_common(top_n)} - - - -def format_output(data: Any, format_type: str) -> str: - if format_type == 'json': - return json.dumps(data, indent=2) - elif format_type == 'csv': - # Simple implementation - output = [] - for ip, count in data['top_ips']: - output.append(f"{ip},{count}") - return "\n".join(output) - else: - # Box-drawing table - lines = [] - lines.append("┌───────────────┬───────┐") - lines.append("│ IP Address │ Count │") - lines.append("├───────────────┼───────┤") - for ip, count in data['top_ips']: - lines.append(f"│ {ip:<13} │ {count:<5} │") - lines.append("└───────────────┴───────┘") - return "\n".join(lines) - - - -def main() -> None: - parser = argparse.ArgumentParser(description="CLI Log Analyzer") - parser.add_argument("--file", required=True, help="Path to log file") - parser.add_argument("--format", choices=['table', 'json', 'csv'], default='table') - parser.add_argument("--top", type=int, default=10) - args = parser.parse_args() - - with open(args.file, 'r') as f: - logs = [parse_log_line(line) for line in f if parse_log_line(line)] - - results = analyze_logs(logs, top_n=args.top) - print(format_output(results, args.format)) - - -if __name__ == "__main__": - main() diff --git a/evals/data/experiments/012-python-cli-log-analyzer/outputs/base/turn-0.py b/evals/data/experiments/012-python-cli-log-analyzer/outputs/base/turn-0.py deleted file mode 100644 index eebdb8a..0000000 --- a/evals/data/experiments/012-python-cli-log-analyzer/outputs/base/turn-0.py +++ /dev/null @@ -1,93 +0,0 @@ -import argparse -import csv -import json -import re -import sys -from collections import Counter -from datetime import datetime -from typing import List, Dict, Any, Optional - -import statistics - -# Regex patterns for common log formats -APACHE_REGEX = r'(?P\S+) \S+ \S+ \[(?P.*?)\] "(?P\S+) (?P\S+) \S+" (?P\d+) (?P\S+)(?: (?P\d+))?' -JSON_REGEX = r'\{.*\}' - -def parse_line(line: str) -> Optional[Dict[str, Any]]: - line = line.strip() - if not line: - return None - - # Try JSON - if line.startswith('{'): - try: - return json.loads(line) - except json.JSONDecodeError: - pass - - # Try Apache/Nginx - match = re.match(APACHE_REGEX, line) - if match: - data = match.groupdict() - data['status'] = int(data['status']) - data['response_time'] = int(data['response_time']) if data.get('response_time') else 0 - return data - - return None - -def analyze_logs(logs: List[Dict[str, Any]], limit: int = 10) -> Dict[str, Any]: - ips = [l['ip'] for l in logs if 'ip' in l] - statuses = [l['status'] for l in logs if 'status' in l] - times = [l['response_time'] for l in logs if 'response_time' in l] - paths = [l['path'] for l in logs if 'path' in l] - - return { - "top_ips": Counter(ips).most_common(limit), - "status_distribution": dict(Counter(statuses)), - "top_endpoints": Counter(paths).most_common(limit), - "response_time_stats": { - "p50": statistics.median(times) if times else 0, - "p95": sorted(times)[int(len(times)*0.95)] if times else 0 - } if times else {} - } - -def format_table(data: Dict[str, Any]) -> str: - lines = ["--- Analysis Report ---"] - for key, value in data.items(): - lines.append(f"\n{key.replace('_', ' ').upper()}:") - if isinstance(value, list): - for item in value: - lines.append(f" {item[0]}: {item[1]}") - elif isinstance(value, dict): - for k, v in value.items(): - lines.append(f" {k}: {v}") - return "\n".join(lines) - -def main() -> None: - parser = argparse.ArgumentParser(description="Log Analyzer Tool") - parser.add_argument("file", help="Path to log file") - parser.add_argument("--format", choices=["table", "json", "csv"], default="table") - parser.add_argument("--limit", type=int, default=10) - - args = parser.parse_args() - - logs = [] - with open(args.file, 'r') as f: - for line in f: - parsed = parse_line(line) - if parsed: - logs.append(parsed) - - results = analyze_logs(logs, args.limit) - - if args.format == "json": - print(json.dumps(results, indent=2)) - elif args.format == "csv": - writer = csv.writer(sys.stdout) - for k, v in results.items(): - writer.writerow([k, str(v)]) - else: - print(format_table(results)) - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/evals/data/experiments/012-python-cli-log-analyzer/outputs/base/turn-1.py b/evals/data/experiments/012-python-cli-log-analyzer/outputs/base/turn-1.py deleted file mode 100644 index e885ead..0000000 --- a/evals/data/experiments/012-python-cli-log-analyzer/outputs/base/turn-1.py +++ /dev/null @@ -1,90 +0,0 @@ -import argparse -import csv -import json -import re -import sys -from collections import Counter -from typing import List, Dict, Any, Optional -import statistics - -APACHE_REGEX = r'(?P\S+) \S+ \S+ \[(?P.*?)\] "(?P\S+) (?P\S+) \S+" (?P\d+) (?P\S+)(?: (?P\d+))?' - -def parse_line(line: str) -> Optional[Dict[str, Any]]: - line = line.strip() - if not line: return None - if line.startswith('{'): - try: return json.loads(line) - except json.JSONDecodeError: pass - match = re.match(APACHE_REGEX, line) - if match: - data = match.groupdict() - data['status'] = int(data['status']) - data['response_time'] = int(data['response_time']) if data.get('response_time') else 0 - return data - return None - -def detect_anomalies(logs: List[Dict[str, Any]]) -> List[Dict[str, Any]]: - times = [l['response_time'] for l in logs if 'response_time' in l and l['response_time'] > 0] - if not times: return [] - - threshold = sorted(times)[int(len(times) * 0.99)] - return [l for l in logs if l.get('response_time', 0) > threshold] - -def analyze_logs(logs: List[Dict[str, Any]], limit: int = 10) -> Dict[str, Any]: - ips = [l['ip'] for l in logs if 'ip' in l] - statuses = [l['status'] for l in logs if 'status' in l] - times = [l['response_time'] for l in logs if 'response_time' in l] - paths = [l['path'] for l in logs if 'path' in l] - - return { - "top_ips": Counter(ips).most_common(limit), - "status_distribution": dict(Counter(statuses)), - "top_endpoints": Counter(paths).most_common(limit), - "response_time_stats": { - "p50": statistics.median(times) if times else 0, - "p95": sorted(times)[int(len(times)*0.95)] if times else 0 - } if times else {}, - "anomalies": detect_anomalies(logs) - } - -def format_table(data: Dict[str, Any]) -> str: - lines = ["--- Analysis Report ---"] - for key, value in data.items(): - if key == "anomalies": continue - lines.append(f"\n{key.replace('_', ' ').upper()}:") - if isinstance(value, list): - for item in value: lines.append(f" {item[0]}: {item[1]}") - elif isinstance(value, dict): - for k, v in value.items(): lines.append(f" {k}: {v}") - - lines.append("\nANOMALIES (P99+ Latency):") - for a in data["anomalies"][:5]: - lines.append(f" Path: {a.get('path')} | Time: {a.get('response_time')}ms") - return "\n".join(lines) - -def main() -> None: - parser = argparse.ArgumentParser(description="Log Analyzer Tool") - parser.add_argument("file", help="Path to log file") - parser.add_argument("--format", choices=["table", "json", "csv"], default="table") - parser.add_argument("--limit", type=int, default=10) - args = parser.parse_args() - - logs = [] - with open(args.file, 'r') as f: - for line in f: - parsed = parse_line(line) - if parsed: logs.append(parsed) - - results = analyze_logs(logs, args.limit) - - if args.format == "json": - print(json.dumps(results, indent=2)) - elif args.format == "csv": - writer = csv.writer(sys.stdout) - for k, v in results.items(): - if k != "anomalies": writer.writerow([k, str(v)]) - else: - print(format_table(results)) - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/evals/data/experiments/012-python-cli-log-analyzer/outputs/base/turn-2.py b/evals/data/experiments/012-python-cli-log-analyzer/outputs/base/turn-2.py deleted file mode 100644 index 82c16ae..0000000 --- a/evals/data/experiments/012-python-cli-log-analyzer/outputs/base/turn-2.py +++ /dev/null @@ -1,82 +0,0 @@ -import argparse -import csv -import json -import re -import sys -from collections import Counter, defaultdict -from datetime import datetime -from typing import List, Dict, Any, Optional -import statistics - -APACHE_REGEX = r'(?P\S+) \S+ \S+ \[(?P.*?)\] "(?P\S+) (?P\S+) \S+" (?P\d+) (?P\S+)(?: (?P\d+))?' - -def parse_line(line: str) -> Optional[Dict[str, Any]]: - line = line.strip() - if not line: return None - if line.startswith('{'): - try: return json.loads(line) - except json.JSONDecodeError: pass - match = re.match(APACHE_REGEX, line) - if match: - data = match.groupdict() - data['status'] = int(data['status']) - data['response_time'] = int(data['response_time']) if data.get('response_time') else 0 - # Simple date parse for Apache: 10/Oct/2000:13:55:36 - ts_str = data['timestamp'].split(':')[0] - try: - data['dt'] = datetime.strptime(data['timestamp'].split(' ')[0], '%d/%b/%Y:%H:%M:%S') - except: - data['dt'] = datetime.now() - return data - return None - -def group_by_time(logs: List[Dict[str, Any]], interval: str) -> Dict[str, int]: - groups = defaultdict(int) - for log in logs: - dt = log.get('dt', datetime.now()) - if interval == 'hour': key = dt.strftime('%Y-%m-%d %H:00') - elif interval == 'day': key = dt.strftime('%Y-%m-%d') - elif interval == 'week': key = dt.strftime('%Y-W%U') - groups[key] += 1 - return dict(groups) - -def detect_anomalies(logs: List[Dict[str, Any]]) -> List[Dict[str, Any]]: - times = [l['response_time'] for l in logs if 'response_time' in l and l['response_time'] > 0] - if not times: return [] - threshold = sorted(times)[int(len(times) * 0.99)] - return [l for l in logs if l.get('response_time', 0) > threshold] - -def analyze_logs(logs: List[Dict[str, Any]], limit: int = 10, group_interval: Optional[str] = None) -> Dict[str, Any]: - res = { - "top_ips": Counter([l['ip'] for l in logs if 'ip' in l]).most_common(limit), - "status_distribution": dict(Counter([l['status'] for l in logs if 'status' in l])), - "anomalies": detect_anomalies(logs) - } - if group_interval: - res["time_grouping"] = group_by_time(logs, group_interval) - return res - -def main() -> None: - parser = argparse.ArgumentParser(description="Log Analyzer Tool") - parser.add_argument("file", help="Path to log file") - parser.add_argument("--format", choices=["table", "json", "csv"], default="table") - parser.add_argument("--limit", type=int, default=10) - parser.add_argument("--group-by", choices=["hour", "day", "week"], help="Time-based grouping") - args = parser.parse_args() - - logs = [p for line in open(args.file) if (p := parse_line(line))] - results = analyze_logs(logs, args.limit, args.group_by) - - if args.format == "json": - print(json.dumps(results, indent=2, default=str)) - elif args.format == "csv": - writer = csv.writer(sys.stdout) - for k, v in results.items(): writer.writerow([k, str(v)]) - else: - print("--- Analysis Report ---") - for k, v in results.items(): - print(f"\n{k.replace('_', ' ').upper()}:") - print(v) - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/evals/data/experiments/012-python-cli-log-analyzer/outputs/base/turn-3.py b/evals/data/experiments/012-python-cli-log-analyzer/outputs/base/turn-3.py deleted file mode 100644 index c70f66a..0000000 --- a/evals/data/experiments/012-python-cli-log-analyzer/outputs/base/turn-3.py +++ /dev/null @@ -1,84 +0,0 @@ -import argparse -import csv -import json -import re -import sys -from collections import Counter, defaultdict -from datetime import datetime -from typing import List, Dict, Any, Optional - -APACHE_REGEX = r'(?P\S+) \S+ \S+ \[(?P.*?)\] "(?P\S+) (?P\S+) \S+" (?P\d+) (?P\S+)(?: (?P\d+))?' - -def parse_line(line: str) -> Optional[Dict[str, Any]]: - line = line.strip() - if not line: return None - if line.startswith('{'): - try: return json.loads(line) - except json.JSONDecodeError: pass - match = re.match(APACHE_REGEX, line) - if match: - data = match.groupdict() - data['status'] = int(data['status']) - data['response_time'] = int(data['response_time']) if data.get('response_time') else 0 - try: - data['dt'] = datetime.strptime(data['timestamp'].split(' ')[0], '%d/%b/%Y:%H:%M:%S') - except: - data['dt'] = datetime.now() - return data - return None - -def format_table(data: Dict[str, Any]) -> str: - output = [] - for section, content in data.items(): - output.append(f"┌─ {section.replace('_', ' ').upper()} ─" + "─" * 20) - if isinstance(content, dict): - for k, v in content.items(): - output.append(f"│ {str(k):<15} │ {str(v):<10} │") - elif isinstance(content, list): - for item in content: - label = item[0] if isinstance(item, (list, tuple)) else str(item) - val = item[1] if isinstance(item, (list, tuple)) else "" - output.append(f"│ {str(label):<15} │ {str(val):<10} │") - output.append("└" + "─" * 30) - return "\n".join(output) - -def group_by_time(logs: List[Dict[str, Any]], interval: str) -> Dict[str, int]: - groups = defaultdict(int) - for log in logs: - dt = log.get('dt', datetime.now()) - if interval == 'hour': key = dt.strftime('%Y-%m-%d %H:00') - elif interval == 'day': key = dt.strftime('%Y-%m-%d') - elif interval == 'week': key = dt.strftime('%Y-W%U') - groups[key] += 1 - return dict(sorted(groups.items())) - -def analyze_logs(logs: List[Dict[str, Any]], limit: int = 10, group_interval: Optional[str] = None) -> Dict[str, Any]: - res = { - "top_ips": Counter([l['ip'] for l in logs if 'ip' in l]).most_common(limit), - "status_distribution": dict(Counter([l['status'] for l in logs if 'status' in l])), - } - if group_interval: - res["time_grouping"] = group_by_time(logs, group_interval) - return res - -def main() -> None: - parser = argparse.ArgumentParser(description="Log Analyzer Tool") - parser.add_argument("file", help="Path to log file") - parser.add_argument("--format", choices=["table", "json", "csv"], default="table") - parser.add_argument("--limit", type=int, default=10) - parser.add_argument("--group-by", choices=["hour", "day", "week"], help="Time-based grouping") - args = parser.parse_args() - - logs = [p for line in open(args.file) if (p := parse_line(line))] - results = analyze_logs(logs, args.limit, args.group_by) - - if args.format == "json": - print(json.dumps(results, indent=2, default=str)) - elif args.format == "csv": - writer = csv.writer(sys.stdout) - for k, v in results.items(): writer.writerow([k, str(v)]) - else: - print(format_table(results)) - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/evals/data/experiments/013-python-data-pipeline/eval.json b/evals/data/experiments/013-python-data-pipeline/eval.json deleted file mode 100644 index e552675..0000000 --- a/evals/data/experiments/013-python-data-pipeline/eval.json +++ /dev/null @@ -1,59 +0,0 @@ -{ - "per_turn": [ - { - "turn": 0, - "sequence_similarity": 0.201, - "token_f1": 0.4361, - "base_char_count": 3023, - "aap_char_count": 2360, - "char_delta_pct": -21.9, - "lines_added": 73, - "lines_removed": 79, - "rouge_l": null, - "bleu": null - }, - { - "turn": 1, - "sequence_similarity": 0.0478, - "token_f1": 0.16, - "base_char_count": 784, - "aap_char_count": 2018, - "char_delta_pct": 157.4, - "lines_added": 75, - "lines_removed": 14, - "rouge_l": null, - "bleu": null - }, - { - "turn": 2, - "sequence_similarity": 0.1093, - "token_f1": 0.2143, - "base_char_count": 885, - "aap_char_count": 2171, - "char_delta_pct": 145.3, - "lines_added": 77, - "lines_removed": 19, - "rouge_l": null, - "bleu": null - }, - { - "turn": 3, - "sequence_similarity": 0.1229, - "token_f1": 0.1732, - "base_char_count": 974, - "aap_char_count": 2574, - "char_delta_pct": 164.3, - "lines_added": 87, - "lines_removed": 17, - "rouge_l": null, - "bleu": null - } - ], - "mean_sequence_similarity": 0.1202, - "mean_token_f1": 0.2459, - "mean_rouge_l": null, - "mean_bleu": null, - "judge_comparisons": null, - "mean_base_judge": null, - "mean_aap_judge": null -} diff --git a/evals/data/experiments/013-python-data-pipeline/metrics.json b/evals/data/experiments/013-python-data-pipeline/metrics.json deleted file mode 100644 index 25a6563..0000000 --- a/evals/data/experiments/013-python-data-pipeline/metrics.json +++ /dev/null @@ -1,228 +0,0 @@ -{ - "experiment_id": "013-python-data-pipeline", - "model": "", - "provider": "google", - "timestamp": "2026-04-03T06:54:53.116719+00:00", - "format": "text/x-python", - "base_turn0": { - "input_tokens": 121, - "output_tokens": 799, - "latency_ms": 4036, - "artifact_bytes": 2624 - }, - "aap_turn0": { - "input_tokens": 460, - "output_tokens": 634, - "latency_ms": 6196, - "artifact_bytes": 2133 - }, - "default_flow": { - "per_turn": [ - { - "turn": 1, - "edit": "Add a new transformation step that calculates a 'customer_lifetime_value' metric", - "input_tokens": 944, - "output_tokens": 276, - "latency_ms": 2693, - "output_bytes": 850, - "failed": false, - "failure_reason": "" - }, - { - "turn": 2, - "edit": "Update the validation section to add a check that rejects rows where the sale da", - "input_tokens": 1241, - "output_tokens": 235, - "latency_ms": 1733, - "output_bytes": 859, - "failed": false, - "failure_reason": "" - }, - { - "turn": 3, - "edit": "Rewrite the loading section to also output a summary CSV with one row per region", - "input_tokens": 1504, - "output_tokens": 269, - "latency_ms": 1884, - "output_bytes": 876, - "failed": false, - "failure_reason": "" - } - ], - "total_input_tokens": 3689, - "total_output_tokens": 780, - "total_latency_ms": 6310 - }, - "aap_flow": { - "per_turn": [ - { - "turn": 1, - "edit": "Add a new transformation step that calculates a 'customer_lifetime_value' metric", - "input_tokens": 1766, - "output_tokens": 732, - "latency_ms": 3183, - "output_bytes": 2325, - "failed": false, - "failure_reason": "", - "envelope_parsed": true, - "apply_succeeded": true, - "envelope_name": "synthesize" - }, - { - "turn": 2, - "edit": "Update the validation section to add a check that rejects rows where the sale da", - "input_tokens": 1817, - "output_tokens": 148, - "latency_ms": 1318, - "output_bytes": 2441, - "failed": false, - "failure_reason": "", - "envelope_parsed": true, - "apply_succeeded": true, - "envelope_name": "edit" - }, - { - "turn": 3, - "edit": "Rewrite the loading section to also output a summary CSV with one row per region", - "input_tokens": 1855, - "output_tokens": 843, - "latency_ms": 8543, - "output_bytes": 2720, - "failed": false, - "failure_reason": "", - "envelope_parsed": true, - "apply_succeeded": true, - "envelope_name": "synthesize" - } - ], - "total_input_tokens": 5438, - "total_output_tokens": 1723, - "total_latency_ms": 13044, - "envelope_parse_rate": 1.0, - "apply_success_rate": 1.0 - }, - "comparison": { - "output_token_savings_pct": -120.9, - "input_token_savings_pct": -47.4, - "latency_savings_pct": -106.7 - }, - "token_table": { - "turns": [ - { - "turn": 0, - "base_input": 121, - "base_output": 799, - "base_latency_ms": 4036, - "aap_input": 460, - "aap_output": 634, - "aap_latency_ms": 6196 - }, - { - "turn": 1, - "base_input": 944, - "base_output": 276, - "base_latency_ms": 2693, - "aap_input": 1766, - "aap_output": 732, - "aap_latency_ms": 3183, - "envelope_name": "synthesize", - "apply_ok": true - }, - { - "turn": 2, - "base_input": 1241, - "base_output": 235, - "base_latency_ms": 1733, - "aap_input": 1817, - "aap_output": 148, - "aap_latency_ms": 1318, - "envelope_name": "edit", - "apply_ok": true - }, - { - "turn": 3, - "base_input": 1504, - "base_output": 269, - "base_latency_ms": 1884, - "aap_input": 1855, - "aap_output": 843, - "aap_latency_ms": 8543, - "envelope_name": "synthesize", - "apply_ok": true - } - ], - "totals": { - "base_input": 3810, - "base_output": 1579, - "base_combined": 5389, - "aap_input": 5898, - "aap_output": 2357, - "aap_combined": 8255, - "base_latency_ms": 10346, - "aap_latency_ms": 19240, - "output_savings_pct": -49.3, - "input_delta_pct": 54.8, - "combined_savings_pct": -53.2, - "latency_savings_pct": -86.0 - } - }, - "quality": { - "per_turn": [ - { - "turn": 0, - "sequence_similarity": 0.2489, - "token_f1": 0.4872, - "base_char_count": 2624, - "aap_char_count": 1852, - "char_delta_pct": -29.4, - "lines_added": 46, - "lines_removed": 76, - "rouge_l": null, - "bleu": null - }, - { - "turn": 1, - "sequence_similarity": 0.1672, - "token_f1": 0.3592, - "base_char_count": 850, - "aap_char_count": 2044, - "char_delta_pct": 140.5, - "lines_added": 68, - "lines_removed": 23, - "rouge_l": null, - "bleu": null - }, - { - "turn": 2, - "sequence_similarity": 0.0874, - "token_f1": 0.2231, - "base_char_count": 859, - "aap_char_count": 2160, - "char_delta_pct": 151.5, - "lines_added": 65, - "lines_removed": 20, - "rouge_l": null, - "bleu": null - }, - { - "turn": 3, - "sequence_similarity": 0.1086, - "token_f1": 0.1647, - "base_char_count": 876, - "aap_char_count": 2439, - "char_delta_pct": 178.4, - "lines_added": 76, - "lines_removed": 25, - "rouge_l": null, - "bleu": null - } - ], - "mean_sequence_similarity": 0.153, - "mean_token_f1": 0.3085, - "mean_rouge_l": null, - "mean_bleu": null, - "judge_comparisons": null, - "mean_base_judge": null, - "mean_aap_judge": null - } -} diff --git a/evals/data/experiments/013-python-data-pipeline/outputs/aap/turn-0.py b/evals/data/experiments/013-python-data-pipeline/outputs/aap/turn-0.py deleted file mode 100644 index d7a7810..0000000 --- a/evals/data/experiments/013-python-data-pipeline/outputs/aap/turn-0.py +++ /dev/null @@ -1,64 +0,0 @@ -import pandas as pd -import numpy as np -from dataclasses import dataclass -from typing import List -from pathlib import Path -import json - - -@dataclass -class ETLConfig: - input_path: str - output_dir: str - required_columns: List[str] = None - date_col: str = "order_date" - - - -class SalesETLPipeline: - def __init__(self, config: ETLConfig): - self.config = config - Path(config.output_dir).mkdir(parents=True, exist_ok=True) - - def extract(self, file_path: str) -> pd.DataFrame: - - df = pd.read_csv(file_path, encoding='utf-8-sig', parse_dates=[self.config.date_col]) - return df - - - def validate(self, df: pd.DataFrame): - - if df.isnull().any().any(): - raise ValueError("Null values detected in dataset") - if df.duplicated().any(): - raise ValueError("Duplicate rows detected") - - - def transform(self, df: pd.DataFrame) -> pd.DataFrame: - - # Clean column names - df.columns = [c.lower().replace(' ', '_') for c in df.columns] - - # Derived metrics - df['profit_margin'] = (df['revenue'] - df['cost']) / df['revenue'] - - # Categorize - df['product_tier'] = pd.cut(df['revenue'], bins=[0, 100, 500, np.inf], labels=['Low', 'Mid', 'High']) - - # Aggregate - summary = df.groupby('region').agg({'revenue': 'sum', 'profit_margin': 'mean'}) - return df, summary - - - def load(self, df: pd.DataFrame, summary: pd.DataFrame): - - df.to_parquet(f"{self.config.output_dir}/processed_sales.parquet") - summary.to_json(f"{self.config.output_dir}/regional_summary.json") - - - def run(self, file_path: str): - df = self.extract(file_path) - self.validate(df) - df, summary = self.transform(df) - self.load(df, summary) - \ No newline at end of file diff --git a/evals/data/experiments/013-python-data-pipeline/outputs/aap/turn-1.json b/evals/data/experiments/013-python-data-pipeline/outputs/aap/turn-1.json deleted file mode 100644 index e242382..0000000 --- a/evals/data/experiments/013-python-data-pipeline/outputs/aap/turn-1.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "sales-etl-pipeline", - "version": 1, - "name": "synthesize", - "meta": { - "format": "text/html", - "tokens_used": null, - "checksum": null, - "state": null - }, - "content": [ - { - "body": "import pandas as pd\nimport numpy as np\nfrom dataclasses import dataclass\nfrom typing import List\nfrom pathlib import Path\nimport json\n\n\n@dataclass\nclass ETLConfig:\n input_path: str\n output_dir: str\n required_columns: List[str] = None\n date_col: str = \"order_date\"\n\n\n\nclass SalesETLPipeline:\n def __init__(self, config: ETLConfig):\n self.config = config\n Path(config.output_dir).mkdir(parents=True, exist_ok=True)\n\n def extract(self, file_path: str) -> pd.DataFrame:\n \n df = pd.read_csv(file_path, encoding='utf-8-sig', parse_dates=[self.config.date_col])\n return df\n \n\n def validate(self, df: pd.DataFrame):\n \n if df.isnull().any().any():\n raise ValueError(\"Null values detected in dataset\")\n if df.duplicated().any():\n raise ValueError(\"Duplicate rows detected\")\n \n\n def transform(self, df: pd.DataFrame) -> pd.DataFrame:\n \n # Clean column names\n df.columns = [c.lower().replace(' ', '_') for c in df.columns]\n \n # Derived metrics\n df['profit_margin'] = (df['revenue'] - df['cost']) / df['revenue']\n \n # Categorize\n df['product_tier'] = pd.cut(df['revenue'], bins=[0, 100, 500, np.inf], labels=['Low', 'Mid', 'High'])\n \n # Customer Lifetime Value\n clv = df.groupby('customer_id')['revenue'].sum().rename('customer_lifetime_value')\n df = df.merge(clv, on='customer_id', how='left')\n \n # Aggregate\n summary = df.groupby('region').agg({'revenue': 'sum', 'profit_margin': 'mean'})\n return df, summary\n \n\n def load(self, df: pd.DataFrame, summary: pd.DataFrame):\n \n df.to_parquet(f\"{self.config.output_dir}/processed_sales.parquet\")\n summary.to_json(f\"{self.config.output_dir}/regional_summary.json\")\n \n\n def run(self, file_path: str):\n df = self.extract(file_path)\n self.validate(df)\n df, summary = self.transform(df)\n self.load(df, summary)\n\n" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/013-python-data-pipeline/outputs/aap/turn-1.py b/evals/data/experiments/013-python-data-pipeline/outputs/aap/turn-1.py deleted file mode 100644 index 66d7f7e..0000000 --- a/evals/data/experiments/013-python-data-pipeline/outputs/aap/turn-1.py +++ /dev/null @@ -1,68 +0,0 @@ -import pandas as pd -import numpy as np -from dataclasses import dataclass -from typing import List -from pathlib import Path -import json - - -@dataclass -class ETLConfig: - input_path: str - output_dir: str - required_columns: List[str] = None - date_col: str = "order_date" - - - -class SalesETLPipeline: - def __init__(self, config: ETLConfig): - self.config = config - Path(config.output_dir).mkdir(parents=True, exist_ok=True) - - def extract(self, file_path: str) -> pd.DataFrame: - - df = pd.read_csv(file_path, encoding='utf-8-sig', parse_dates=[self.config.date_col]) - return df - - - def validate(self, df: pd.DataFrame): - - if df.isnull().any().any(): - raise ValueError("Null values detected in dataset") - if df.duplicated().any(): - raise ValueError("Duplicate rows detected") - - - def transform(self, df: pd.DataFrame) -> pd.DataFrame: - - # Clean column names - df.columns = [c.lower().replace(' ', '_') for c in df.columns] - - # Derived metrics - df['profit_margin'] = (df['revenue'] - df['cost']) / df['revenue'] - - # Categorize - df['product_tier'] = pd.cut(df['revenue'], bins=[0, 100, 500, np.inf], labels=['Low', 'Mid', 'High']) - - # Customer Lifetime Value - clv = df.groupby('customer_id')['revenue'].sum().rename('customer_lifetime_value') - df = df.merge(clv, on='customer_id', how='left') - - # Aggregate - summary = df.groupby('region').agg({'revenue': 'sum', 'profit_margin': 'mean'}) - return df, summary - - - def load(self, df: pd.DataFrame, summary: pd.DataFrame): - - df.to_parquet(f"{self.config.output_dir}/processed_sales.parquet") - summary.to_json(f"{self.config.output_dir}/regional_summary.json") - - - def run(self, file_path: str): - df = self.extract(file_path) - self.validate(df) - df, summary = self.transform(df) - self.load(df, summary) - diff --git a/evals/data/experiments/013-python-data-pipeline/outputs/aap/turn-2.json b/evals/data/experiments/013-python-data-pipeline/outputs/aap/turn-2.json deleted file mode 100644 index 107483c..0000000 --- a/evals/data/experiments/013-python-data-pipeline/outputs/aap/turn-2.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "pipeline-engine-v1", - "version": 1, - "name": "edit", - "meta": { - "format": "text/html", - "tokens_used": null, - "checksum": null, - "state": null - }, - "content": [ - { - "op": "replace", - "target": { - "type": "id", - "value": "validation-logic" - }, - "content": " if df.isnull().any().any():\n raise ValueError(\"Null values detected in dataset\")\n if df.duplicated().any():\n raise ValueError(\"Duplicate rows detected\")\n if (df[self.config.date_col] > pd.Timestamp.now()).any():\n raise ValueError(\"Future sale dates detected\")\n" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/013-python-data-pipeline/outputs/aap/turn-2.py b/evals/data/experiments/013-python-data-pipeline/outputs/aap/turn-2.py deleted file mode 100644 index e1e8c08..0000000 --- a/evals/data/experiments/013-python-data-pipeline/outputs/aap/turn-2.py +++ /dev/null @@ -1,69 +0,0 @@ -import pandas as pd -import numpy as np -from dataclasses import dataclass -from typing import List -from pathlib import Path -import json - - -@dataclass -class ETLConfig: - input_path: str - output_dir: str - required_columns: List[str] = None - date_col: str = "order_date" - - - -class SalesETLPipeline: - def __init__(self, config: ETLConfig): - self.config = config - Path(config.output_dir).mkdir(parents=True, exist_ok=True) - - def extract(self, file_path: str) -> pd.DataFrame: - - df = pd.read_csv(file_path, encoding='utf-8-sig', parse_dates=[self.config.date_col]) - return df - - - def validate(self, df: pd.DataFrame): - if df.isnull().any().any(): - raise ValueError("Null values detected in dataset") - if df.duplicated().any(): - raise ValueError("Duplicate rows detected") - if (df[self.config.date_col] > pd.Timestamp.now()).any(): - raise ValueError("Future sale dates detected") - - - def transform(self, df: pd.DataFrame) -> pd.DataFrame: - - # Clean column names - df.columns = [c.lower().replace(' ', '_') for c in df.columns] - - # Derived metrics - df['profit_margin'] = (df['revenue'] - df['cost']) / df['revenue'] - - # Categorize - df['product_tier'] = pd.cut(df['revenue'], bins=[0, 100, 500, np.inf], labels=['Low', 'Mid', 'High']) - - # Customer Lifetime Value - clv = df.groupby('customer_id')['revenue'].sum().rename('customer_lifetime_value') - df = df.merge(clv, on='customer_id', how='left') - - # Aggregate - summary = df.groupby('region').agg({'revenue': 'sum', 'profit_margin': 'mean'}) - return df, summary - - - def load(self, df: pd.DataFrame, summary: pd.DataFrame): - - df.to_parquet(f"{self.config.output_dir}/processed_sales.parquet") - summary.to_json(f"{self.config.output_dir}/regional_summary.json") - - - def run(self, file_path: str): - df = self.extract(file_path) - self.validate(df) - df, summary = self.transform(df) - self.load(df, summary) - diff --git a/evals/data/experiments/013-python-data-pipeline/outputs/aap/turn-3.json b/evals/data/experiments/013-python-data-pipeline/outputs/aap/turn-3.json deleted file mode 100644 index aa7c2a9..0000000 --- a/evals/data/experiments/013-python-data-pipeline/outputs/aap/turn-3.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "sales-pipeline-system", - "version": 1, - "name": "synthesize", - "meta": { - "format": "text/html", - "tokens_used": null, - "checksum": null, - "state": null - }, - "content": [ - { - "body": "import pandas as pd\nimport numpy as np\nfrom dataclasses import dataclass\nfrom typing import List\nfrom pathlib import Path\nimport json\n\n\n@dataclass\nclass ETLConfig:\n input_path: str\n output_dir: str\n required_columns: List[str] = None\n date_col: str = \"order_date\"\n\n\n\nclass SalesETLPipeline:\n def __init__(self, config: ETLConfig):\n self.config = config\n Path(config.output_dir).mkdir(parents=True, exist_ok=True)\n\n def extract(self, file_path: str) -> pd.DataFrame:\n \n df = pd.read_csv(file_path, encoding='utf-8-sig', parse_dates=[self.config.date_col])\n return df\n \n\n def validate(self, df: pd.DataFrame):\n if df.isnull().any().any():\n raise ValueError(\"Null values detected in dataset\")\n if df.duplicated().any():\n raise ValueError(\"Duplicate rows detected\")\n if (df[self.config.date_col] > pd.Timestamp.now()).any():\n raise ValueError(\"Future sale dates detected\")\n\n\n def transform(self, df: pd.DataFrame) -> pd.DataFrame:\n \n # Clean column names\n df.columns = [c.lower().replace(' ', '_') for c in df.columns]\n \n # Derived metrics\n df['profit_margin'] = (df['revenue'] - df['cost']) / df['revenue']\n \n # Categorize\n df['product_tier'] = pd.cut(df['revenue'], bins=[0, 100, 500, np.inf], labels=['Low', 'Mid', 'High'])\n \n # Customer Lifetime Value\n clv = df.groupby('customer_id')['revenue'].sum().rename('customer_lifetime_value')\n df = df.merge(clv, on='customer_id', how='left')\n \n # Aggregate\n summary = df.groupby('region').agg({\n 'revenue': 'sum', \n 'order_id': 'count',\n 'profit_margin': 'mean'\n }).rename(columns={'order_id': 'order_count'})\n summary['average_order_value'] = summary['revenue'] / summary['order_count']\n \n return df, summary\n \n\n def load(self, df: pd.DataFrame, summary: pd.DataFrame):\n \n df.to_parquet(f\"{self.config.output_dir}/processed_sales.parquet\")\n summary.to_json(f\"{self.config.output_dir}/regional_summary.json\")\n summary.to_csv(f\"{self.config.output_dir}/regional_summary.csv\")\n \n\n def run(self, file_path: str):\n df = self.extract(file_path)\n self.validate(df)\n df, summary = self.transform(df)\n self.load(df, summary)\n\n" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/013-python-data-pipeline/outputs/aap/turn-3.py b/evals/data/experiments/013-python-data-pipeline/outputs/aap/turn-3.py deleted file mode 100644 index f10447f..0000000 --- a/evals/data/experiments/013-python-data-pipeline/outputs/aap/turn-3.py +++ /dev/null @@ -1,76 +0,0 @@ -import pandas as pd -import numpy as np -from dataclasses import dataclass -from typing import List -from pathlib import Path -import json - - -@dataclass -class ETLConfig: - input_path: str - output_dir: str - required_columns: List[str] = None - date_col: str = "order_date" - - - -class SalesETLPipeline: - def __init__(self, config: ETLConfig): - self.config = config - Path(config.output_dir).mkdir(parents=True, exist_ok=True) - - def extract(self, file_path: str) -> pd.DataFrame: - - df = pd.read_csv(file_path, encoding='utf-8-sig', parse_dates=[self.config.date_col]) - return df - - - def validate(self, df: pd.DataFrame): - if df.isnull().any().any(): - raise ValueError("Null values detected in dataset") - if df.duplicated().any(): - raise ValueError("Duplicate rows detected") - if (df[self.config.date_col] > pd.Timestamp.now()).any(): - raise ValueError("Future sale dates detected") - - - def transform(self, df: pd.DataFrame) -> pd.DataFrame: - - # Clean column names - df.columns = [c.lower().replace(' ', '_') for c in df.columns] - - # Derived metrics - df['profit_margin'] = (df['revenue'] - df['cost']) / df['revenue'] - - # Categorize - df['product_tier'] = pd.cut(df['revenue'], bins=[0, 100, 500, np.inf], labels=['Low', 'Mid', 'High']) - - # Customer Lifetime Value - clv = df.groupby('customer_id')['revenue'].sum().rename('customer_lifetime_value') - df = df.merge(clv, on='customer_id', how='left') - - # Aggregate - summary = df.groupby('region').agg({ - 'revenue': 'sum', - 'order_id': 'count', - 'profit_margin': 'mean' - }).rename(columns={'order_id': 'order_count'}) - summary['average_order_value'] = summary['revenue'] / summary['order_count'] - - return df, summary - - - def load(self, df: pd.DataFrame, summary: pd.DataFrame): - - df.to_parquet(f"{self.config.output_dir}/processed_sales.parquet") - summary.to_json(f"{self.config.output_dir}/regional_summary.json") - summary.to_csv(f"{self.config.output_dir}/regional_summary.csv") - - - def run(self, file_path: str): - df = self.extract(file_path) - self.validate(df) - df, summary = self.transform(df) - self.load(df, summary) - diff --git a/evals/data/experiments/013-python-data-pipeline/outputs/base/turn-0.py b/evals/data/experiments/013-python-data-pipeline/outputs/base/turn-0.py deleted file mode 100644 index 36199de..0000000 --- a/evals/data/experiments/013-python-data-pipeline/outputs/base/turn-0.py +++ /dev/null @@ -1,93 +0,0 @@ -import pandas as pd -import numpy as np -import json -from dataclasses import dataclass -from typing import List -from pathlib import Path - -@dataclass -class PipelineConfig: - input_path: str - output_dir: str - date_cols: List[str] - required_cols: List[str] - -def extract(config: PipelineConfig) -> pd.DataFrame: - df = pd.read_csv( - config.input_path, - encoding='utf-8-sig', - parse_dates=config.date_cols - ) - return df - -def validate(df: pd.DataFrame, config: PipelineConfig): - # Check required columns - missing = [c for c in config.required_cols if c not in df.columns] - if missing: raise ValueError(f"Missing columns: {missing}") - - # Null checks - if df[config.required_cols].isnull().any().any(): - print("Warning: Nulls detected in critical columns") - - # Range check - if (df['sales'] < 0).any(): - raise ValueError("Negative sales detected") - - # Duplicate check - if df.duplicated().any(): - df.drop_duplicates(inplace=True) - - return df - -def transform(df: pd.DataFrame) -> pd.DataFrame: - # Clean column names - df.columns = [c.lower().replace(' ', '_') for c in df.columns] - - # Derived metrics - df['profit'] = df['revenue'] - df['cost'] - df['profit_margin'] = df['profit'] / df['revenue'] - - # Categorization - df['product_tier'] = pd.cut( - df['revenue'], - bins=[0, 1000, 5000, np.inf], - labels=['Entry', 'Mid', 'Premium'] - ) - - # YoY Growth (Requires sorted data) - df = df.sort_values(['region', 'date']) - df['yoy_growth'] = df.groupby('region')['revenue'].pct_change(periods=1) - - return df - -def load(df: pd.DataFrame, output_dir: str): - path = Path(output_dir) - path.mkdir(parents=True, exist_ok=True) - - # Parquet - df.to_parquet(path / "sales_processed.parquet") - - # JSON Summary - summary = { - "total_revenue": float(df['revenue'].sum()), - "avg_margin": float(df['profit_margin'].mean()), - "region_sales": df.groupby('region')['revenue'].sum().to_dict() - } - with open(path / "summary.json", 'w') as f: - json.dump(summary, f, indent=4) - -def run_pipeline(config: PipelineConfig): - df = extract(config) - df = validate(df, config) - df = transform(df) - load(df, config.output_dir) - print("Pipeline completed successfully.") - -if __name__ == "__main__": - config = PipelineConfig( - input_path="sales_data.csv", - output_dir="./output", - date_cols=["date"], - required_cols=["date", "region", "revenue", "cost"] - ) - # run_pipeline(config) \ No newline at end of file diff --git a/evals/data/experiments/013-python-data-pipeline/outputs/base/turn-1.py b/evals/data/experiments/013-python-data-pipeline/outputs/base/turn-1.py deleted file mode 100644 index 7a6a367..0000000 --- a/evals/data/experiments/013-python-data-pipeline/outputs/base/turn-1.py +++ /dev/null @@ -1,23 +0,0 @@ -def transform(df: pd.DataFrame) -> pd.DataFrame: - # Existing cleaning and metrics - df.columns = [c.lower().replace(' ', '_') for c in df.columns] - df['profit'] = df['revenue'] - df['cost'] - df['profit_margin'] = df['profit'] / df['revenue'] - - # New: Customer Lifetime Value (CLV) - # Group by customer_id to calculate total spend across all history - clv_map = df.groupby('customer_id')['revenue'].sum().rename('customer_lifetime_value') - df = df.merge(clv_map, on='customer_id', how='left') - - # Categorization - df['product_tier'] = pd.cut( - df['revenue'], - bins=[0, 1000, 5000, np.inf], - labels=['Entry', 'Mid', 'Premium'] - ) - - # YoY Growth - df = df.sort_values(['region', 'date']) - df['yoy_growth'] = df.groupby('region')['revenue'].pct_change(periods=1) - - return df \ No newline at end of file diff --git a/evals/data/experiments/013-python-data-pipeline/outputs/base/turn-2.py b/evals/data/experiments/013-python-data-pipeline/outputs/base/turn-2.py deleted file mode 100644 index 110d0ed..0000000 --- a/evals/data/experiments/013-python-data-pipeline/outputs/base/turn-2.py +++ /dev/null @@ -1,24 +0,0 @@ -def validate(df: pd.DataFrame, config: PipelineConfig): - # Check required columns - missing = [c for c in config.required_cols if c not in df.columns] - if missing: raise ValueError(f"Missing columns: {missing}") - - # Null checks - if df[config.required_cols].isnull().any().any(): - print("Warning: Nulls detected in critical columns") - - # Range check: Sales cannot be negative - if (df['sales'] < 0).any(): - raise ValueError("Negative sales detected") - - # Future date check - today = pd.Timestamp.now() - if (df['date'] > today).any(): - future_rows = df[df['date'] > today].shape[0] - raise ValueError(f"Validation failed: {future_rows} rows have a future date.") - - # Duplicate check - if df.duplicated().any(): - df.drop_duplicates(inplace=True) - - return df \ No newline at end of file diff --git a/evals/data/experiments/013-python-data-pipeline/outputs/base/turn-3.py b/evals/data/experiments/013-python-data-pipeline/outputs/base/turn-3.py deleted file mode 100644 index e66d145..0000000 --- a/evals/data/experiments/013-python-data-pipeline/outputs/base/turn-3.py +++ /dev/null @@ -1,25 +0,0 @@ -def load(df: pd.DataFrame, output_dir: str): - path = Path(output_dir) - path.mkdir(parents=True, exist_ok=True) - - # Parquet export - df.to_parquet(path / "sales_processed.parquet") - - # Regional Summary CSV - # Aggregating total revenue, order count (rows), and average order value - regional_summary = df.groupby('region').agg( - total_revenue=('revenue', 'sum'), - order_count=('revenue', 'count'), - avg_order_value=('revenue', 'mean') - ).reset_index() - - regional_summary.to_csv(path / "regional_summary.csv", index=False) - - # JSON Summary (General metrics) - summary = { - "total_revenue": float(df['revenue'].sum()), - "avg_margin": float(df['profit_margin'].mean()), - "total_orders": int(len(df)) - } - with open(path / "summary.json", 'w') as f: - json.dump(summary, f, indent=4) \ No newline at end of file diff --git a/evals/data/experiments/014-python-pytest-suite/metrics.json b/evals/data/experiments/014-python-pytest-suite/metrics.json deleted file mode 100644 index 08e0987..0000000 --- a/evals/data/experiments/014-python-pytest-suite/metrics.json +++ /dev/null @@ -1,228 +0,0 @@ -{ - "experiment_id": "014-python-pytest-suite", - "model": "", - "provider": "google", - "timestamp": "2026-04-03T06:55:22.761764+00:00", - "format": "text/x-python", - "base_turn0": { - "input_tokens": 128, - "output_tokens": 1065, - "latency_ms": 5045, - "artifact_bytes": 3643 - }, - "aap_turn0": { - "input_tokens": 447, - "output_tokens": 1118, - "latency_ms": 12549, - "artifact_bytes": 4935 - }, - "default_flow": { - "per_turn": [ - { - "turn": 1, - "edit": "Add a new fixture 'sample_superadmin' with elevated permissions and update the p", - "input_tokens": 1221, - "output_tokens": 386, - "latency_ms": 2991, - "output_bytes": 1461, - "failed": false, - "failure_reason": "" - }, - { - "turn": 2, - "edit": "Add 3 new parametrized test cases to test-login for multi-factor authentication ", - "input_tokens": 1639, - "output_tokens": 357, - "latency_ms": 2725, - "output_bytes": 1167, - "failed": false, - "failure_reason": "" - }, - { - "turn": 3, - "edit": "Rewrite the test-registration section to include tests for OAuth signup via Goog", - "input_tokens": 2015, - "output_tokens": 572, - "latency_ms": 3211, - "output_bytes": 2029, - "failed": false, - "failure_reason": "" - } - ], - "total_input_tokens": 4875, - "total_output_tokens": 1315, - "total_latency_ms": 8927 - }, - "aap_flow": { - "per_turn": [ - { - "turn": 1, - "edit": "Add a new fixture 'sample_superadmin' with elevated permissions and update the p", - "input_tokens": 2595, - "output_tokens": 1718, - "latency_ms": 6584, - "output_bytes": 5636, - "failed": false, - "failure_reason": "", - "envelope_parsed": true, - "apply_succeeded": true, - "envelope_name": "synthesize" - }, - { - "turn": 2, - "edit": "Add 3 new parametrized test cases to test-login for multi-factor authentication ", - "input_tokens": 2803, - "output_tokens": 1860, - "latency_ms": 16228, - "output_bytes": 5997, - "failed": false, - "failure_reason": "", - "envelope_parsed": true, - "apply_succeeded": true, - "envelope_name": "synthesize" - }, - { - "turn": 3, - "edit": "Rewrite the test-registration section to include tests for OAuth signup via Goog", - "input_tokens": 0, - "output_tokens": 0, - "latency_ms": 2271, - "output_bytes": 5997, - "failed": true, - "failure_reason": "parse or apply failed", - "envelope_parsed": true, - "apply_succeeded": false, - "envelope_name": "edit" - } - ], - "total_input_tokens": 5398, - "total_output_tokens": 3578, - "total_latency_ms": 25083, - "envelope_parse_rate": 1.0, - "apply_success_rate": 0.6666666666666666 - }, - "comparison": { - "output_token_savings_pct": -172.1, - "input_token_savings_pct": -10.7, - "latency_savings_pct": -181.0 - }, - "token_table": { - "turns": [ - { - "turn": 0, - "base_input": 128, - "base_output": 1065, - "base_latency_ms": 5045, - "aap_input": 447, - "aap_output": 1118, - "aap_latency_ms": 12549 - }, - { - "turn": 1, - "base_input": 1221, - "base_output": 386, - "base_latency_ms": 2991, - "aap_input": 2595, - "aap_output": 1718, - "aap_latency_ms": 6584, - "envelope_name": "synthesize", - "apply_ok": true - }, - { - "turn": 2, - "base_input": 1639, - "base_output": 357, - "base_latency_ms": 2725, - "aap_input": 2803, - "aap_output": 1860, - "aap_latency_ms": 16228, - "envelope_name": "synthesize", - "apply_ok": true - }, - { - "turn": 3, - "base_input": 2015, - "base_output": 572, - "base_latency_ms": 3211, - "aap_input": 0, - "aap_output": 0, - "aap_latency_ms": 2271, - "envelope_name": "edit", - "apply_ok": false - } - ], - "totals": { - "base_input": 5003, - "base_output": 2380, - "base_combined": 7383, - "aap_input": 5845, - "aap_output": 4696, - "aap_combined": 10541, - "base_latency_ms": 13972, - "aap_latency_ms": 37632, - "output_savings_pct": -97.3, - "input_delta_pct": 16.8, - "combined_savings_pct": -42.8, - "latency_savings_pct": -169.3 - } - }, - "quality": { - "per_turn": [ - { - "turn": 0, - "sequence_similarity": 0.0553, - "token_f1": 0.2807, - "base_char_count": 3643, - "aap_char_count": 4935, - "char_delta_pct": 35.5, - "lines_added": 113, - "lines_removed": 90, - "rouge_l": null, - "bleu": null - }, - { - "turn": 1, - "sequence_similarity": 0.0377, - "token_f1": 0.1526, - "base_char_count": 1461, - "aap_char_count": 5597, - "char_delta_pct": 283.1, - "lines_added": 138, - "lines_removed": 26, - "rouge_l": null, - "bleu": null - }, - { - "turn": 2, - "sequence_similarity": 0.0508, - "token_f1": 0.1154, - "base_char_count": 1167, - "aap_char_count": 5958, - "char_delta_pct": 410.5, - "lines_added": 153, - "lines_removed": 29, - "rouge_l": null, - "bleu": null - }, - { - "turn": 3, - "sequence_similarity": 0.0258, - "token_f1": 0.1598, - "base_char_count": 2029, - "aap_char_count": 5958, - "char_delta_pct": 193.6, - "lines_added": 151, - "lines_removed": 42, - "rouge_l": null, - "bleu": null - } - ], - "mean_sequence_similarity": 0.0424, - "mean_token_f1": 0.1771, - "mean_rouge_l": null, - "mean_bleu": null, - "judge_comparisons": null, - "mean_base_judge": null, - "mean_aap_judge": null - } -} diff --git a/evals/data/experiments/014-python-pytest-suite/outputs/aap/turn-0.py b/evals/data/experiments/014-python-pytest-suite/outputs/aap/turn-0.py deleted file mode 100644 index b99cec1..0000000 --- a/evals/data/experiments/014-python-pytest-suite/outputs/aap/turn-0.py +++ /dev/null @@ -1,127 +0,0 @@ -import pytest -from flask import Flask -from yourapp import create_app, db -from yourapp.models import User - -@pytest.fixture(scope='module') -def test_client(): - app = create_app('testing') - with app.test_client() as client: - with app.app_context(): - db.create_all() - yield client - with app.app_context(): - db.drop_all() - -@pytest.fixture(scope='module') -def sample_users(): - user1 = User(email='admin@example.com', password='StrongPass123!', role='admin', active=True) - user2 = User(email='user@example.com', password='StrongPass123!', role='user', active=True) - user3 = User(email='inactive@example.com', password='StrongPass123!', role='user', active=False) - return user1, user2, user3 - -@pytest.fixture(scope='module') -def auth_headers(sample_users): - user1, user2, _ = sample_users - headers = {} - headers['Authorization'] = f"Bearer {user1.get_token()}" - return headers - -def test_valid_signup(test_client): - response = test_client.post('/register', json={ - 'email': 'newuser@example.com', - 'password': 'NewStrongPass123!' - }) - assert response.status_code == 201, "Should create a new user successfully." - -def test_duplicate_email(test_client, sample_users): - user1, _, _ = sample_users - test_client.post('/register', json={ - 'email': user1.email, - 'password': 'SomePassword123!' - }) - response = test_client.post('/register', json={ - 'email': user1.email, - 'password': 'AnotherPassword123!' - }) - assert response.status_code == 400, "Should not allow duplicate email registration." - -@pytest.mark.parametrize("email, password, expected_status", [ - ('invalidemail', 'ValidPass123!', 400), - ('valid@example.com', 'weak', 400), - ('', 'ValidPass123!', 400), - ('valid@example.com', '', 400), -]) -def test_registration_validation(test_client, email, password, expected_status): - response = test_client.post('/register', json={ - 'email': email, - 'password': password - }) - assert response.status_code == expected_status, f"Expected status {expected_status} for email: {email}, password: {password}" - -def test_valid_login(test_client, sample_users): - user1, _, _ = sample_users - response = test_client.post('/login', json={ - 'email': user1.email, - 'password': 'StrongPass123!' - }) - assert response.status_code == 200, "Should log in successfully with correct credentials." - -def test_wrong_password(test_client, sample_users): - user1, _, _ = sample_users - response = test_client.post('/login', json={ - 'email': user1.email, - 'password': 'WrongPassword!' - }) - assert response.status_code == 401, "Should return unauthorized for incorrect password." - -def test_inactive_user_login(test_client, sample_users): - _, _, user3 = sample_users - response = test_client.post('/login', json={ - 'email': user3.email, - 'password': 'StrongPass123!' - }) - assert response.status_code == 403, "Should return forbidden for inactive users." - -def test_rate_limiting(test_client): - for _ in range(6): # Assuming limit is 5 attempts - response = test_client.post('/login', json={ - 'email': 'wrong@example.com', - 'password': 'WrongPassword!' - }) - assert response.status_code == 429, "Should be rate limited after too many requests." - -def test_token_refresh(test_client, sample_users): - user1, _, _ = sample_users - login_response = test_client.post('/login', json={ - 'email': user1.email, - 'password': 'StrongPass123!' - }) - token = login_response.json['token'] - - response = test_client.post('/token/refresh', headers={'Authorization': f'Bearer {token}'}) - assert response.status_code == 200, "Should successfully refresh token." - -def test_admin_only_endpoint(test_client, auth_headers): - response = test_client.get('/admin-only', headers=auth_headers) - assert response.status_code == 200, "Admins should have access to admin-only endpoints." - -def test_resource_ownership(test_client, auth_headers, sample_users): - user1, user2, _ = sample_users - - # Assuming we've created resources here - response = test_client.get(f'/resource/{user2.id}', headers=auth_headers) - assert response.status_code == 403, "Users should not access resources owned by others." - -def test_role_escalation_prevention(test_client, sample_users): - _, user2, _ = sample_users - response = test_client.post('/promote', json={'email': user2.email}, headers=auth_headers) - assert response.status_code == 403, "Regular users should not be allowed to promote roles." - -@pytest.mark.parametrize("endpoint", [ - '/admin-only', - '/resource/1' -]) -def test_access_control(test_client, endpoint, auth_headers): - response = test_client.get(endpoint, headers=auth_headers) - assert response.status_code == 200, f"User should have access to {endpoint}." \ No newline at end of file diff --git a/evals/data/experiments/014-python-pytest-suite/outputs/aap/turn-1.json b/evals/data/experiments/014-python-pytest-suite/outputs/aap/turn-1.json deleted file mode 100644 index f66f078..0000000 --- a/evals/data/experiments/014-python-pytest-suite/outputs/aap/turn-1.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "user_tests_v2", - "version": 1, - "name": "synthesize", - "meta": { - "format": "text/html", - "tokens_used": null, - "checksum": null, - "state": null - }, - "content": [ - { - "body": "import pytest\nfrom flask import Flask\nfrom yourapp import create_app, db\nfrom yourapp.models import User\n\n\n@pytest.fixture(scope='module')\ndef test_client():\n app = create_app('testing')\n with app.test_client() as client:\n with app.app_context():\n db.create_all()\n yield client\n with app.app_context():\n db.drop_all()\n\n@pytest.fixture(scope='module')\ndef sample_users():\n user1 = User(email='admin@example.com', password='StrongPass123!', role='admin', active=True)\n user2 = User(email='user@example.com', password='StrongPass123!', role='user', active=True)\n user3 = User(email='inactive@example.com', password='StrongPass123!', role='user', active=False)\n return user1, user2, user3\n\n@pytest.fixture(scope='module')\ndef sample_superadmin():\n return User(email='super@example.com', password='StrongPass123!', role='superadmin', active=True)\n\n@pytest.fixture(scope='module')\ndef auth_headers(sample_users):\n user1, user2, _ = sample_users\n headers = {}\n headers['Authorization'] = f\"Bearer {user1.get_token()}\"\n return headers\n\n@pytest.fixture(scope='module')\ndef superadmin_headers(sample_superadmin):\n headers = {}\n headers['Authorization'] = f\"Bearer {sample_superadmin.get_token()}\"\n return headers\n\n\ndef test_valid_signup(test_client):\n response = test_client.post('/register', json={\n 'email': 'newuser@example.com',\n 'password': 'NewStrongPass123!'\n })\n assert response.status_code == 201, \"Should create a new user successfully.\"\n\ndef test_duplicate_email(test_client, sample_users):\n user1, _, _ = sample_users\n test_client.post('/register', json={\n 'email': user1.email,\n 'password': 'SomePassword123!'\n })\n response = test_client.post('/register', json={\n 'email': user1.email,\n 'password': 'AnotherPassword123!'\n })\n assert response.status_code == 400, \"Should not allow duplicate email registration.\"\n\n@pytest.mark.parametrize(\"email, password, expected_status\", [\n ('invalidemail', 'ValidPass123!', 400),\n ('valid@example.com', 'weak', 400),\n ('', 'ValidPass123!', 400),\n ('valid@example.com', '', 400),\n])\ndef test_registration_validation(test_client, email, password, expected_status):\n response = test_client.post('/register', json={\n 'email': email,\n 'password': password\n })\n assert response.status_code == expected_status, f\"Expected status {expected_status} for email: {email}, password: {password}\"\n\ndef test_valid_login(test_client, sample_users):\n user1, _, _ = sample_users\n response = test_client.post('/login', json={\n 'email': user1.email,\n 'password': 'StrongPass123!'\n })\n assert response.status_code == 200, \"Should log in successfully with correct credentials.\"\n\ndef test_wrong_password(test_client, sample_users):\n user1, _, _ = sample_users\n response = test_client.post('/login', json={\n 'email': user1.email,\n 'password': 'WrongPassword!'\n })\n assert response.status_code == 401, \"Should return unauthorized for incorrect password.\"\n\ndef test_inactive_user_login(test_client, sample_users):\n _, _, user3 = sample_users\n response = test_client.post('/login', json={\n 'email': user3.email,\n 'password': 'StrongPass123!'\n })\n assert response.status_code == 403, \"Should return forbidden for inactive users.\"\n\ndef test_rate_limiting(test_client):\n for _ in range(6): # Assuming limit is 5 attempts\n response = test_client.post('/login', json={\n 'email': 'wrong@example.com',\n 'password': 'WrongPassword!'\n })\n assert response.status_code == 429, \"Should be rate limited after too many requests.\"\n\ndef test_token_refresh(test_client, sample_users):\n user1, _, _ = sample_users\n login_response = test_client.post('/login', json={\n 'email': user1.email,\n 'password': 'StrongPass123!'\n })\n token = login_response.json['token']\n\n response = test_client.post('/token/refresh', headers={'Authorization': f'Bearer {token}'})\n assert response.status_code == 200, \"Should successfully refresh token.\"\n\ndef test_admin_only_endpoint(test_client, auth_headers):\n response = test_client.get('/admin-only', headers=auth_headers)\n assert response.status_code == 200, \"Admins should have access to admin-only endpoints.\"\n\ndef test_superadmin_access_all(test_client, superadmin_headers):\n for endpoint in ['/admin-only', '/resource/1', '/super-secret']:\n response = test_client.get(endpoint, headers=superadmin_headers)\n assert response.status_code == 200, f\"Superadmin should have access to {endpoint}.\"\n\ndef test_resource_ownership(test_client, auth_headers, sample_users):\n user1, user2, _ = sample_users\n \n # Assuming we've created resources here\n response = test_client.get(f'/resource/{user2.id}', headers=auth_headers)\n assert response.status_code == 403, \"Users should not access resources owned by others.\"\n\ndef test_role_escalation_prevention(test_client, sample_users, auth_headers):\n _, user2, _ = sample_users\n response = test_client.post('/promote', json={'email': user2.email}, headers=auth_headers)\n assert response.status_code == 403, \"Regular users should not be allowed to promote roles.\"\n\n@pytest.mark.parametrize(\"endpoint\", [\n '/admin-only',\n '/resource/1'\n])\ndef test_access_control(test_client, endpoint, auth_headers):\n response = test_client.get(endpoint, headers=auth_headers)\n assert response.status_code == 200, f\"User should have access to {endpoint}.\"\n" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/014-python-pytest-suite/outputs/aap/turn-1.py b/evals/data/experiments/014-python-pytest-suite/outputs/aap/turn-1.py deleted file mode 100644 index 2072bc6..0000000 --- a/evals/data/experiments/014-python-pytest-suite/outputs/aap/turn-1.py +++ /dev/null @@ -1,144 +0,0 @@ -import pytest -from flask import Flask -from yourapp import create_app, db -from yourapp.models import User - - -@pytest.fixture(scope='module') -def test_client(): - app = create_app('testing') - with app.test_client() as client: - with app.app_context(): - db.create_all() - yield client - with app.app_context(): - db.drop_all() - -@pytest.fixture(scope='module') -def sample_users(): - user1 = User(email='admin@example.com', password='StrongPass123!', role='admin', active=True) - user2 = User(email='user@example.com', password='StrongPass123!', role='user', active=True) - user3 = User(email='inactive@example.com', password='StrongPass123!', role='user', active=False) - return user1, user2, user3 - -@pytest.fixture(scope='module') -def sample_superadmin(): - return User(email='super@example.com', password='StrongPass123!', role='superadmin', active=True) - -@pytest.fixture(scope='module') -def auth_headers(sample_users): - user1, user2, _ = sample_users - headers = {} - headers['Authorization'] = f"Bearer {user1.get_token()}" - return headers - -@pytest.fixture(scope='module') -def superadmin_headers(sample_superadmin): - headers = {} - headers['Authorization'] = f"Bearer {sample_superadmin.get_token()}" - return headers - - -def test_valid_signup(test_client): - response = test_client.post('/register', json={ - 'email': 'newuser@example.com', - 'password': 'NewStrongPass123!' - }) - assert response.status_code == 201, "Should create a new user successfully." - -def test_duplicate_email(test_client, sample_users): - user1, _, _ = sample_users - test_client.post('/register', json={ - 'email': user1.email, - 'password': 'SomePassword123!' - }) - response = test_client.post('/register', json={ - 'email': user1.email, - 'password': 'AnotherPassword123!' - }) - assert response.status_code == 400, "Should not allow duplicate email registration." - -@pytest.mark.parametrize("email, password, expected_status", [ - ('invalidemail', 'ValidPass123!', 400), - ('valid@example.com', 'weak', 400), - ('', 'ValidPass123!', 400), - ('valid@example.com', '', 400), -]) -def test_registration_validation(test_client, email, password, expected_status): - response = test_client.post('/register', json={ - 'email': email, - 'password': password - }) - assert response.status_code == expected_status, f"Expected status {expected_status} for email: {email}, password: {password}" - -def test_valid_login(test_client, sample_users): - user1, _, _ = sample_users - response = test_client.post('/login', json={ - 'email': user1.email, - 'password': 'StrongPass123!' - }) - assert response.status_code == 200, "Should log in successfully with correct credentials." - -def test_wrong_password(test_client, sample_users): - user1, _, _ = sample_users - response = test_client.post('/login', json={ - 'email': user1.email, - 'password': 'WrongPassword!' - }) - assert response.status_code == 401, "Should return unauthorized for incorrect password." - -def test_inactive_user_login(test_client, sample_users): - _, _, user3 = sample_users - response = test_client.post('/login', json={ - 'email': user3.email, - 'password': 'StrongPass123!' - }) - assert response.status_code == 403, "Should return forbidden for inactive users." - -def test_rate_limiting(test_client): - for _ in range(6): # Assuming limit is 5 attempts - response = test_client.post('/login', json={ - 'email': 'wrong@example.com', - 'password': 'WrongPassword!' - }) - assert response.status_code == 429, "Should be rate limited after too many requests." - -def test_token_refresh(test_client, sample_users): - user1, _, _ = sample_users - login_response = test_client.post('/login', json={ - 'email': user1.email, - 'password': 'StrongPass123!' - }) - token = login_response.json['token'] - - response = test_client.post('/token/refresh', headers={'Authorization': f'Bearer {token}'}) - assert response.status_code == 200, "Should successfully refresh token." - -def test_admin_only_endpoint(test_client, auth_headers): - response = test_client.get('/admin-only', headers=auth_headers) - assert response.status_code == 200, "Admins should have access to admin-only endpoints." - -def test_superadmin_access_all(test_client, superadmin_headers): - for endpoint in ['/admin-only', '/resource/1', '/super-secret']: - response = test_client.get(endpoint, headers=superadmin_headers) - assert response.status_code == 200, f"Superadmin should have access to {endpoint}." - -def test_resource_ownership(test_client, auth_headers, sample_users): - user1, user2, _ = sample_users - - # Assuming we've created resources here - response = test_client.get(f'/resource/{user2.id}', headers=auth_headers) - assert response.status_code == 403, "Users should not access resources owned by others." - -def test_role_escalation_prevention(test_client, sample_users, auth_headers): - _, user2, _ = sample_users - response = test_client.post('/promote', json={'email': user2.email}, headers=auth_headers) - assert response.status_code == 403, "Regular users should not be allowed to promote roles." - -@pytest.mark.parametrize("endpoint", [ - '/admin-only', - '/resource/1' -]) -def test_access_control(test_client, endpoint, auth_headers): - response = test_client.get(endpoint, headers=auth_headers) - assert response.status_code == 200, f"User should have access to {endpoint}." diff --git a/evals/data/experiments/014-python-pytest-suite/outputs/aap/turn-2.json b/evals/data/experiments/014-python-pytest-suite/outputs/aap/turn-2.json deleted file mode 100644 index b72c0ea..0000000 --- a/evals/data/experiments/014-python-pytest-suite/outputs/aap/turn-2.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "test_api_mfa_login_implementation", - "version": 1, - "name": "synthesize", - "meta": { - "format": "text/html", - "tokens_used": null, - "checksum": null, - "state": null - }, - "content": [ - { - "body": "import pytest\nfrom flask import Flask\nfrom yourapp import create_app, db\nfrom yourapp.models import User\n\n\n@pytest.fixture(scope='module')\ndef test_client():\n app = create_app('testing')\n with app.test_client() as client:\n with app.app_context():\n db.create_all()\n yield client\n with app.app_context():\n db.drop_all()\n\n@pytest.fixture(scope='module')\ndef sample_users():\n user1 = User(email='admin@example.com', password='StrongPass123!', role='admin', active=True)\n user2 = User(email='user@example.com', password='StrongPass123!', role='user', active=True)\n user3 = User(email='inactive@example.com', password='StrongPass123!', role='user', active=False)\n return user1, user2, user3\n\n@pytest.fixture(scope='module')\ndef sample_superadmin():\n return User(email='super@example.com', password='StrongPass123!', role='superadmin', active=True)\n\n@pytest.fixture(scope='module')\ndef auth_headers(sample_users):\n user1, user2, _ = sample_users\n headers = {}\n headers['Authorization'] = f\"Bearer {user1.get_token()}\"\n return headers\n\n@pytest.fixture(scope='module')\ndef superadmin_headers(sample_superadmin):\n headers = {}\n headers['Authorization'] = f\"Bearer {sample_superadmin.get_token()}\"\n return headers\n\n\ndef test_valid_signup(test_client):\n response = test_client.post('/register', json={\n 'email': 'newuser@example.com',\n 'password': 'NewStrongPass123!'\n })\n assert response.status_code == 201, \"Should create a new user successfully.\"\n\ndef test_duplicate_email(test_client, sample_users):\n user1, _, _ = sample_users\n test_client.post('/register', json={\n 'email': user1.email,\n 'password': 'SomePassword123!'\n })\n response = test_client.post('/register', json={\n 'email': user1.email,\n 'password': 'AnotherPassword123!'\n })\n assert response.status_code == 400, \"Should not allow duplicate email registration.\"\n\n@pytest.mark.parametrize(\"email, password, expected_status\", [\n ('invalidemail', 'ValidPass123!', 400),\n ('valid@example.com', 'weak', 400),\n ('', 'ValidPass123!', 400),\n ('valid@example.com', '', 400),\n])\ndef test_registration_validation(test_client, email, password, expected_status):\n response = test_client.post('/register', json={\n 'email': email,\n 'password': password\n })\n assert response.status_code == expected_status, f\"Expected status {expected_status} for email: {email}, password: {password}\"\n\ndef test_valid_login(test_client, sample_users):\n user1, _, _ = sample_users\n response = test_client.post('/login', json={\n 'email': user1.email,\n 'password': 'StrongPass123!'\n })\n assert response.status_code == 200, \"Should log in successfully with correct credentials.\"\n\n@pytest.mark.parametrize(\"mfa_code, expected_status\", [\n ('123456', 200),\n ('000000', 401),\n ('expired', 401),\n])\ndef test_mfa_login(test_client, mfa_code, expected_status):\n response = test_client.post('/login/mfa', json={\n 'email': 'user@example.com',\n 'mfa_code': mfa_code\n })\n assert response.status_code == expected_status\n\ndef test_wrong_password(test_client, sample_users):\n user1, _, _ = sample_users\n response = test_client.post('/login', json={\n 'email': user1.email,\n 'password': 'WrongPassword!'\n })\n assert response.status_code == 401, \"Should return unauthorized for incorrect password.\"\n\ndef test_inactive_user_login(test_client, sample_users):\n _, _, user3 = sample_users\n response = test_client.post('/login', json={\n 'email': user3.email,\n 'password': 'StrongPass123!'\n })\n assert response.status_code == 403, \"Should return forbidden for inactive users.\"\n\ndef test_rate_limiting(test_client):\n for _ in range(6): # Assuming limit is 5 attempts\n response = test_client.post('/login', json={\n 'email': 'wrong@example.com',\n 'password': 'WrongPassword!'\n })\n assert response.status_code == 429, \"Should be rate limited after too many requests.\"\n\ndef test_token_refresh(test_client, sample_users):\n user1, _, _ = sample_users\n login_response = test_client.post('/login', json={\n 'email': user1.email,\n 'password': 'StrongPass123!'\n })\n token = login_response.json['token']\n\n response = test_client.post('/token/refresh', headers={'Authorization': f'Bearer {token}'})\n assert response.status_code == 200, \"Should successfully refresh token.\"\n\ndef test_admin_only_endpoint(test_client, auth_headers):\n response = test_client.get('/admin-only', headers=auth_headers)\n assert response.status_code == 200, \"Admins should have access to admin-only endpoints.\"\n\ndef test_superadmin_access_all(test_client, superadmin_headers):\n for endpoint in ['/admin-only', '/resource/1', '/super-secret']:\n response = test_client.get(endpoint, headers=superadmin_headers)\n assert response.status_code == 200, f\"Superadmin should have access to {endpoint}.\"\n\ndef test_resource_ownership(test_client, auth_headers, sample_users):\n user1, user2, _ = sample_users\n \n # Assuming we've created resources here\n response = test_client.get(f'/resource/{user2.id}', headers=auth_headers)\n assert response.status_code == 403, \"Users should not access resources owned by others.\"\n\ndef test_role_escalation_prevention(test_client, sample_users, auth_headers):\n _, user2, _ = sample_users\n response = test_client.post('/promote', json={'email': user2.email}, headers=auth_headers)\n assert response.status_code == 403, \"Regular users should not be allowed to promote roles.\"\n\n@pytest.mark.parametrize(\"endpoint\", [\n '/admin-only',\n '/resource/1'\n])\ndef test_access_control(test_client, endpoint, auth_headers):\n response = test_client.get(endpoint, headers=auth_headers)\n assert response.status_code == 200, f\"User should have access to {endpoint}.\"\n" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/014-python-pytest-suite/outputs/aap/turn-2.py b/evals/data/experiments/014-python-pytest-suite/outputs/aap/turn-2.py deleted file mode 100644 index 5e32386..0000000 --- a/evals/data/experiments/014-python-pytest-suite/outputs/aap/turn-2.py +++ /dev/null @@ -1,156 +0,0 @@ -import pytest -from flask import Flask -from yourapp import create_app, db -from yourapp.models import User - - -@pytest.fixture(scope='module') -def test_client(): - app = create_app('testing') - with app.test_client() as client: - with app.app_context(): - db.create_all() - yield client - with app.app_context(): - db.drop_all() - -@pytest.fixture(scope='module') -def sample_users(): - user1 = User(email='admin@example.com', password='StrongPass123!', role='admin', active=True) - user2 = User(email='user@example.com', password='StrongPass123!', role='user', active=True) - user3 = User(email='inactive@example.com', password='StrongPass123!', role='user', active=False) - return user1, user2, user3 - -@pytest.fixture(scope='module') -def sample_superadmin(): - return User(email='super@example.com', password='StrongPass123!', role='superadmin', active=True) - -@pytest.fixture(scope='module') -def auth_headers(sample_users): - user1, user2, _ = sample_users - headers = {} - headers['Authorization'] = f"Bearer {user1.get_token()}" - return headers - -@pytest.fixture(scope='module') -def superadmin_headers(sample_superadmin): - headers = {} - headers['Authorization'] = f"Bearer {sample_superadmin.get_token()}" - return headers - - -def test_valid_signup(test_client): - response = test_client.post('/register', json={ - 'email': 'newuser@example.com', - 'password': 'NewStrongPass123!' - }) - assert response.status_code == 201, "Should create a new user successfully." - -def test_duplicate_email(test_client, sample_users): - user1, _, _ = sample_users - test_client.post('/register', json={ - 'email': user1.email, - 'password': 'SomePassword123!' - }) - response = test_client.post('/register', json={ - 'email': user1.email, - 'password': 'AnotherPassword123!' - }) - assert response.status_code == 400, "Should not allow duplicate email registration." - -@pytest.mark.parametrize("email, password, expected_status", [ - ('invalidemail', 'ValidPass123!', 400), - ('valid@example.com', 'weak', 400), - ('', 'ValidPass123!', 400), - ('valid@example.com', '', 400), -]) -def test_registration_validation(test_client, email, password, expected_status): - response = test_client.post('/register', json={ - 'email': email, - 'password': password - }) - assert response.status_code == expected_status, f"Expected status {expected_status} for email: {email}, password: {password}" - -def test_valid_login(test_client, sample_users): - user1, _, _ = sample_users - response = test_client.post('/login', json={ - 'email': user1.email, - 'password': 'StrongPass123!' - }) - assert response.status_code == 200, "Should log in successfully with correct credentials." - -@pytest.mark.parametrize("mfa_code, expected_status", [ - ('123456', 200), - ('000000', 401), - ('expired', 401), -]) -def test_mfa_login(test_client, mfa_code, expected_status): - response = test_client.post('/login/mfa', json={ - 'email': 'user@example.com', - 'mfa_code': mfa_code - }) - assert response.status_code == expected_status - -def test_wrong_password(test_client, sample_users): - user1, _, _ = sample_users - response = test_client.post('/login', json={ - 'email': user1.email, - 'password': 'WrongPassword!' - }) - assert response.status_code == 401, "Should return unauthorized for incorrect password." - -def test_inactive_user_login(test_client, sample_users): - _, _, user3 = sample_users - response = test_client.post('/login', json={ - 'email': user3.email, - 'password': 'StrongPass123!' - }) - assert response.status_code == 403, "Should return forbidden for inactive users." - -def test_rate_limiting(test_client): - for _ in range(6): # Assuming limit is 5 attempts - response = test_client.post('/login', json={ - 'email': 'wrong@example.com', - 'password': 'WrongPassword!' - }) - assert response.status_code == 429, "Should be rate limited after too many requests." - -def test_token_refresh(test_client, sample_users): - user1, _, _ = sample_users - login_response = test_client.post('/login', json={ - 'email': user1.email, - 'password': 'StrongPass123!' - }) - token = login_response.json['token'] - - response = test_client.post('/token/refresh', headers={'Authorization': f'Bearer {token}'}) - assert response.status_code == 200, "Should successfully refresh token." - -def test_admin_only_endpoint(test_client, auth_headers): - response = test_client.get('/admin-only', headers=auth_headers) - assert response.status_code == 200, "Admins should have access to admin-only endpoints." - -def test_superadmin_access_all(test_client, superadmin_headers): - for endpoint in ['/admin-only', '/resource/1', '/super-secret']: - response = test_client.get(endpoint, headers=superadmin_headers) - assert response.status_code == 200, f"Superadmin should have access to {endpoint}." - -def test_resource_ownership(test_client, auth_headers, sample_users): - user1, user2, _ = sample_users - - # Assuming we've created resources here - response = test_client.get(f'/resource/{user2.id}', headers=auth_headers) - assert response.status_code == 403, "Users should not access resources owned by others." - -def test_role_escalation_prevention(test_client, sample_users, auth_headers): - _, user2, _ = sample_users - response = test_client.post('/promote', json={'email': user2.email}, headers=auth_headers) - assert response.status_code == 403, "Regular users should not be allowed to promote roles." - -@pytest.mark.parametrize("endpoint", [ - '/admin-only', - '/resource/1' -]) -def test_access_control(test_client, endpoint, auth_headers): - response = test_client.get(endpoint, headers=auth_headers) - assert response.status_code == 200, f"User should have access to {endpoint}." diff --git a/evals/data/experiments/014-python-pytest-suite/outputs/aap/turn-3.json b/evals/data/experiments/014-python-pytest-suite/outputs/aap/turn-3.json deleted file mode 100644 index 8a7701a..0000000 --- a/evals/data/experiments/014-python-pytest-suite/outputs/aap/turn-3.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "test_suite_id", - "version": 1, - "name": "edit", - "meta": { - "format": "text/html", - "tokens_used": null, - "checksum": null, - "state": null - }, - "content": [ - { - "op": "replace", - "target": { - "type": "id", - "value": "test_valid_signup" - }, - "content": "def test_valid_signup(test_client):\n response = test_client.post('/register', json={\n 'email': 'newuser@example.com',\n 'password': 'NewStrongPass123!'\n })\n assert response.status_code == 201, \"Should create a new user successfully.\"\n\ndef test_oauth_signup_google(test_client):\n response = test_client.post('/oauth/google', json={'token': 'fake-google-token'})\n assert response.status_code == 200, \"Should register/login via Google OAuth.\"\n\ndef test_oauth_signup_github(test_client):\n response = test_client.post('/oauth/github', json={'token': 'fake-github-token'})\n assert response.status_code == 200, \"Should register/login via GitHub OAuth.\"\n" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/014-python-pytest-suite/outputs/aap/turn-3.py b/evals/data/experiments/014-python-pytest-suite/outputs/aap/turn-3.py deleted file mode 100644 index 5e32386..0000000 --- a/evals/data/experiments/014-python-pytest-suite/outputs/aap/turn-3.py +++ /dev/null @@ -1,156 +0,0 @@ -import pytest -from flask import Flask -from yourapp import create_app, db -from yourapp.models import User - - -@pytest.fixture(scope='module') -def test_client(): - app = create_app('testing') - with app.test_client() as client: - with app.app_context(): - db.create_all() - yield client - with app.app_context(): - db.drop_all() - -@pytest.fixture(scope='module') -def sample_users(): - user1 = User(email='admin@example.com', password='StrongPass123!', role='admin', active=True) - user2 = User(email='user@example.com', password='StrongPass123!', role='user', active=True) - user3 = User(email='inactive@example.com', password='StrongPass123!', role='user', active=False) - return user1, user2, user3 - -@pytest.fixture(scope='module') -def sample_superadmin(): - return User(email='super@example.com', password='StrongPass123!', role='superadmin', active=True) - -@pytest.fixture(scope='module') -def auth_headers(sample_users): - user1, user2, _ = sample_users - headers = {} - headers['Authorization'] = f"Bearer {user1.get_token()}" - return headers - -@pytest.fixture(scope='module') -def superadmin_headers(sample_superadmin): - headers = {} - headers['Authorization'] = f"Bearer {sample_superadmin.get_token()}" - return headers - - -def test_valid_signup(test_client): - response = test_client.post('/register', json={ - 'email': 'newuser@example.com', - 'password': 'NewStrongPass123!' - }) - assert response.status_code == 201, "Should create a new user successfully." - -def test_duplicate_email(test_client, sample_users): - user1, _, _ = sample_users - test_client.post('/register', json={ - 'email': user1.email, - 'password': 'SomePassword123!' - }) - response = test_client.post('/register', json={ - 'email': user1.email, - 'password': 'AnotherPassword123!' - }) - assert response.status_code == 400, "Should not allow duplicate email registration." - -@pytest.mark.parametrize("email, password, expected_status", [ - ('invalidemail', 'ValidPass123!', 400), - ('valid@example.com', 'weak', 400), - ('', 'ValidPass123!', 400), - ('valid@example.com', '', 400), -]) -def test_registration_validation(test_client, email, password, expected_status): - response = test_client.post('/register', json={ - 'email': email, - 'password': password - }) - assert response.status_code == expected_status, f"Expected status {expected_status} for email: {email}, password: {password}" - -def test_valid_login(test_client, sample_users): - user1, _, _ = sample_users - response = test_client.post('/login', json={ - 'email': user1.email, - 'password': 'StrongPass123!' - }) - assert response.status_code == 200, "Should log in successfully with correct credentials." - -@pytest.mark.parametrize("mfa_code, expected_status", [ - ('123456', 200), - ('000000', 401), - ('expired', 401), -]) -def test_mfa_login(test_client, mfa_code, expected_status): - response = test_client.post('/login/mfa', json={ - 'email': 'user@example.com', - 'mfa_code': mfa_code - }) - assert response.status_code == expected_status - -def test_wrong_password(test_client, sample_users): - user1, _, _ = sample_users - response = test_client.post('/login', json={ - 'email': user1.email, - 'password': 'WrongPassword!' - }) - assert response.status_code == 401, "Should return unauthorized for incorrect password." - -def test_inactive_user_login(test_client, sample_users): - _, _, user3 = sample_users - response = test_client.post('/login', json={ - 'email': user3.email, - 'password': 'StrongPass123!' - }) - assert response.status_code == 403, "Should return forbidden for inactive users." - -def test_rate_limiting(test_client): - for _ in range(6): # Assuming limit is 5 attempts - response = test_client.post('/login', json={ - 'email': 'wrong@example.com', - 'password': 'WrongPassword!' - }) - assert response.status_code == 429, "Should be rate limited after too many requests." - -def test_token_refresh(test_client, sample_users): - user1, _, _ = sample_users - login_response = test_client.post('/login', json={ - 'email': user1.email, - 'password': 'StrongPass123!' - }) - token = login_response.json['token'] - - response = test_client.post('/token/refresh', headers={'Authorization': f'Bearer {token}'}) - assert response.status_code == 200, "Should successfully refresh token." - -def test_admin_only_endpoint(test_client, auth_headers): - response = test_client.get('/admin-only', headers=auth_headers) - assert response.status_code == 200, "Admins should have access to admin-only endpoints." - -def test_superadmin_access_all(test_client, superadmin_headers): - for endpoint in ['/admin-only', '/resource/1', '/super-secret']: - response = test_client.get(endpoint, headers=superadmin_headers) - assert response.status_code == 200, f"Superadmin should have access to {endpoint}." - -def test_resource_ownership(test_client, auth_headers, sample_users): - user1, user2, _ = sample_users - - # Assuming we've created resources here - response = test_client.get(f'/resource/{user2.id}', headers=auth_headers) - assert response.status_code == 403, "Users should not access resources owned by others." - -def test_role_escalation_prevention(test_client, sample_users, auth_headers): - _, user2, _ = sample_users - response = test_client.post('/promote', json={'email': user2.email}, headers=auth_headers) - assert response.status_code == 403, "Regular users should not be allowed to promote roles." - -@pytest.mark.parametrize("endpoint", [ - '/admin-only', - '/resource/1' -]) -def test_access_control(test_client, endpoint, auth_headers): - response = test_client.get(endpoint, headers=auth_headers) - assert response.status_code == 200, f"User should have access to {endpoint}." diff --git a/evals/data/experiments/014-python-pytest-suite/outputs/base/turn-0.py b/evals/data/experiments/014-python-pytest-suite/outputs/base/turn-0.py deleted file mode 100644 index 93e06fa..0000000 --- a/evals/data/experiments/014-python-pytest-suite/outputs/base/turn-0.py +++ /dev/null @@ -1,104 +0,0 @@ -import pytest -import jwt -import datetime -from unittest.mock import patch - -# --- Fixtures --- - -@pytest.fixture -def test_db(): - db = {"users": {}} - return db - -@pytest.fixture -def test_client(test_db): - class Client: - def __init__(self, db): self.db = db - def post(self, endpoint, data): - if endpoint == "/register": - if data.get("email") in self.db["users"]: return {"status": 400, "msg": "Duplicate"} - if len(data.get("password", "")) < 8: return {"status": 400, "msg": "Weak password"} - self.db["users"][data["email"]] = data - return {"status": 201} - return {"status": 404} - return Client(test_db) - -@pytest.fixture -def sample_users(test_db): - users = { - "admin": {"email": "admin@test.com", "role": "admin", "active": True}, - "regular": {"email": "user@test.com", "role": "user", "active": True}, - "inactive": {"email": "old@test.com", "role": "user", "active": False} - } - test_db["users"].update(users) - return users - -@pytest.fixture -def auth_headers(sample_users): - token = jwt.encode({"email": sample_users["regular"]["email"]}, "secret", algorithm="HS256") - return {"Authorization": f"Bearer {token}"} - -# --- Registration Tests --- - -@pytest.mark.parametrize("email, password, expected_status", [ - ("valid@test.com", "securePassword123", 201), - ("dup@test.com", "securePassword123", 201), - ("bad-email", "short", 400), -]) -def test_registration_edge_cases(test_client, email, password, expected_status): - response = test_client.post("/register", {"email": email, "password": password}) - assert response["status"] == expected_status, f"Failed for {email}" - -def test_registration_missing_fields(test_client): - response = test_client.post("/register", {}) - assert response["status"] == 400, "Registration should fail without fields" - -# --- Login Tests --- - -def test_login_valid(sample_users): - # Mocking authentication service - assert True - -@pytest.mark.parametrize("scenario, password, active, expected", [ - ("wrong_pass", "wrong", True, 401), - ("inactive_user", "password123", False, 403), -]) -def test_login_failures(scenario, password, active, expected): - # Logic to simulate login attempt against db state - status = 401 if scenario == "wrong_pass" else 403 - assert status == expected - -def test_token_refresh(auth_headers): - # Verify JWT decoding and expiration handling - token = auth_headers["Authorization"].split(" ")[1] - payload = jwt.decode(token, "secret", algorithms=["HS256"]) - assert "email" in payload - -# --- Permission Tests --- - -def test_admin_only_endpoint_access(sample_users, auth_headers): - # Simulate attempt by regular user to access admin resource - is_admin = False - assert is_admin is False, "Regular user accessed admin endpoint" - -def test_resource_ownership(test_db, auth_headers): - # Test that user A cannot edit user B's resource - owner_id = "user1" - request_id = "user2" - assert owner_id != request_id, "Users should not edit other users resources" - -def test_role_escalation_prevention(): - # Test that update profile payload ignores 'role' field - payload = {"email": "u@t.com", "role": "admin"} - assert "role" not in ["email"], "Role field should be sanitized from input" - -# --- Security/Rate Limit --- - -@patch('time.sleep', return_value=None) -def test_login_rate_limiting(mock_sleep): - attempts = 0 - for _ in range(6): - attempts += 1 - assert attempts > 5 - # Verify that the 6th attempt triggers a rate limit flag - assert True, "Rate limit should have been hit" \ No newline at end of file diff --git a/evals/data/experiments/014-python-pytest-suite/outputs/base/turn-1.py b/evals/data/experiments/014-python-pytest-suite/outputs/base/turn-1.py deleted file mode 100644 index 5ad9613..0000000 --- a/evals/data/experiments/014-python-pytest-suite/outputs/base/turn-1.py +++ /dev/null @@ -1,32 +0,0 @@ -@pytest.fixture -def sample_superadmin(test_db): - superadmin = {"email": "root@test.com", "role": "superadmin", "active": True} - test_db["users"][superadmin["email"]] = superadmin - return superadmin - -@pytest.fixture -def superadmin_headers(sample_superadmin): - token = jwt.encode({"email": sample_superadmin["email"], "role": "superadmin"}, "secret", algorithm="HS256") - return {"Authorization": f"Bearer {token}"} - -def test_superadmin_access_all(sample_superadmin, superadmin_headers): - # Simulate an endpoint security check logic - user_role = "superadmin" - required_role = "admin" - - # Assert that superadmin satisfies any admin-level check - has_permission = (user_role == "superadmin") or (user_role == required_role) - assert has_permission is True, "Superadmin should have access to admin endpoints" - -def test_regular_user_cannot_access_superadmin_resource(auth_headers): - # Verify regular user blocked from sensitive configuration endpoints - user_role = "user" - resource_required = "superadmin" - - is_authorized = (user_role == resource_required) - assert is_authorized is False, "Regular users must not access superadmin resources" - -def test_superadmin_elevated_actions(superadmin_headers): - # Test that superadmin can perform actions like user deletion or role modification - can_delete_user = True - assert can_delete_user is True, "Superadmin should be able to perform administrative deletions" \ No newline at end of file diff --git a/evals/data/experiments/014-python-pytest-suite/outputs/base/turn-2.py b/evals/data/experiments/014-python-pytest-suite/outputs/base/turn-2.py deleted file mode 100644 index b8678dd..0000000 --- a/evals/data/experiments/014-python-pytest-suite/outputs/base/turn-2.py +++ /dev/null @@ -1,32 +0,0 @@ -@pytest.mark.parametrize("mfa_code, mfa_status, expected_status", [ - ("123456", "valid", 200), - ("123456", "expired", 401), - ("000000", "invalid", 401), -]) -def test_login_mfa_scenarios(mfa_code, mfa_status, expected_status): - """ - Simulates MFA verification logic: - - 'valid': code matches and is current - - 'expired': code matches but timestamp is too old - - 'invalid': code does not match expected secret - """ - # Logic simulation - if mfa_status == "valid": - actual_status = 200 - elif mfa_status == "expired": - actual_status = 401 - else: # invalid - actual_status = 401 - - assert actual_status == expected_status, f"MFA test failed for scenario: {mfa_status}" - -def test_login_mfa_missing_code_for_enabled_user(): - """Ensure that users with MFA enabled cannot login with password only.""" - mfa_enabled = True - provided_mfa = None - - # Assert that missing MFA code results in rejection - if mfa_enabled and not provided_mfa: - assert True, "System correctly blocked login without MFA code" - else: - pytest.fail("System allowed login without MFA for MFA-enabled user") \ No newline at end of file diff --git a/evals/data/experiments/014-python-pytest-suite/outputs/base/turn-3.py b/evals/data/experiments/014-python-pytest-suite/outputs/base/turn-3.py deleted file mode 100644 index 56ff411..0000000 --- a/evals/data/experiments/014-python-pytest-suite/outputs/base/turn-3.py +++ /dev/null @@ -1,47 +0,0 @@ -@pytest.mark.parametrize("provider, oauth_token, expected_status", [ - ("google", "valid_google_token", 201), - ("github", "valid_github_token", 201), - ("google", "malformed_token", 400), - ("github", "expired_token", 401), - ("invalid_provider", "token", 422), -]) -def test_oauth_registration(test_client, provider, oauth_token, expected_status): - """ - Verifies that the registration endpoint correctly handles OAuth providers. - """ - # Mocking external provider verification - def verify_oauth(p, t): - if p not in ["google", "github"]: return 422 - if t == "valid_google_token" or t == "valid_github_token": return 201 - if t == "expired_token": return 401 - return 400 - - response_status = verify_oauth(provider, oauth_token) - assert response_status == expected_status, f"OAuth {provider} registration failed with {oauth_token}" - -def test_oauth_user_duplicate_email(test_client, sample_users): - """ - Ensure that signing up via OAuth with an email already present in - the system correctly handles the collision (e.g., merging accounts). - """ - existing_email = sample_users["regular"]["email"] - payload = { - "email": existing_email, - "provider": "google", - "oauth_id": "google_123" - } - - # Check if system identifies account collision - response = test_client.post("/register/oauth", payload) - - # In a real system, this might return 200 (linked) or 400 (conflict) - # Here we assert it does not allow duplicate account creation - assert response.get("status") != 201, "Should not create new account for existing email" - -def test_oauth_missing_provider_data(test_client): - """Verify that incomplete OAuth payloads are rejected.""" - payload = {"email": "test@example.com"} # Missing provider and oauth_id - response = test_client.post("/register/oauth", payload) - - # Assuming the API requires these fields - assert response.get("status") in [400, 422], "Should reject incomplete OAuth registration" \ No newline at end of file diff --git a/evals/data/experiments/015-python-dataclasses-models/eval.json b/evals/data/experiments/015-python-dataclasses-models/eval.json deleted file mode 100644 index ee488f4..0000000 --- a/evals/data/experiments/015-python-dataclasses-models/eval.json +++ /dev/null @@ -1,47 +0,0 @@ -{ - "per_turn": [ - { - "turn": 0, - "sequence_similarity": 0.2173, - "token_f1": 0.6571, - "base_char_count": 2619, - "aap_char_count": 1763, - "char_delta_pct": -32.7, - "lines_added": 53, - "lines_removed": 77, - "rouge_l": null, - "bleu": null - }, - { - "turn": 1, - "sequence_similarity": 0.0661, - "token_f1": 0.1935, - "base_char_count": 1245, - "aap_char_count": 1963, - "char_delta_pct": 57.7, - "lines_added": 97, - "lines_removed": 30, - "rouge_l": null, - "bleu": null - }, - { - "turn": 2, - "sequence_similarity": 0.1169, - "token_f1": 0.1633, - "base_char_count": 569, - "aap_char_count": 1963, - "char_delta_pct": 245.0, - "lines_added": 98, - "lines_removed": 12, - "rouge_l": null, - "bleu": null - } - ], - "mean_sequence_similarity": 0.1334, - "mean_token_f1": 0.338, - "mean_rouge_l": null, - "mean_bleu": null, - "judge_comparisons": null, - "mean_base_judge": null, - "mean_aap_judge": null -} diff --git a/evals/data/experiments/015-python-dataclasses-models/metrics.json b/evals/data/experiments/015-python-dataclasses-models/metrics.json deleted file mode 100644 index b3cd819..0000000 --- a/evals/data/experiments/015-python-dataclasses-models/metrics.json +++ /dev/null @@ -1,182 +0,0 @@ -{ - "experiment_id": "015-python-dataclasses-models", - "model": "", - "provider": "google", - "timestamp": "2026-04-03T06:56:14.428103+00:00", - "format": "text/x-python", - "base_turn0": { - "input_tokens": 111, - "output_tokens": 646, - "latency_ms": 4255, - "artifact_bytes": 2209 - }, - "aap_turn0": { - "input_tokens": 450, - "output_tokens": 596, - "latency_ms": 5529, - "artifact_bytes": 1974 - }, - "default_flow": { - "per_turn": [ - { - "turn": 1, - "edit": "Add a new 'Milestone' entity with fields: id, name, target_date, status, project", - "input_tokens": 791, - "output_tokens": 228, - "latency_ms": 2349, - "output_bytes": 834, - "failed": false, - "failure_reason": "" - }, - { - "turn": 2, - "edit": "Update the Priority enum to include a 'URGENT' level above 'HIGH' and add a colo", - "input_tokens": 1049, - "output_tokens": 171, - "latency_ms": 1703, - "output_bytes": 561, - "failed": false, - "failure_reason": "" - } - ], - "total_input_tokens": 1840, - "total_output_tokens": 399, - "total_latency_ms": 4052 - }, - "aap_flow": { - "per_turn": [ - { - "turn": 1, - "edit": "Add a new 'Milestone' entity with fields: id, name, target_date, status, project", - "input_tokens": 1738, - "output_tokens": 214, - "latency_ms": 1665, - "output_bytes": 467, - "failed": false, - "failure_reason": "", - "envelope_parsed": true, - "apply_succeeded": true, - "envelope_name": "synthesize" - }, - { - "turn": 2, - "edit": "Update the Priority enum to include a 'URGENT' level above 'HIGH' and add a colo", - "input_tokens": 1306, - "output_tokens": 371, - "latency_ms": 2591, - "output_bytes": 974, - "failed": false, - "failure_reason": "", - "envelope_parsed": true, - "apply_succeeded": true, - "envelope_name": "synthesize" - } - ], - "total_input_tokens": 3044, - "total_output_tokens": 585, - "total_latency_ms": 4256, - "envelope_parse_rate": 1.0, - "apply_success_rate": 1.0 - }, - "comparison": { - "output_token_savings_pct": -46.6, - "input_token_savings_pct": -65.4, - "latency_savings_pct": -5.0 - }, - "token_table": { - "turns": [ - { - "turn": 0, - "base_input": 111, - "base_output": 646, - "base_latency_ms": 4255, - "aap_input": 450, - "aap_output": 596, - "aap_latency_ms": 5529 - }, - { - "turn": 1, - "base_input": 791, - "base_output": 228, - "base_latency_ms": 2349, - "aap_input": 1738, - "aap_output": 214, - "aap_latency_ms": 1665, - "envelope_name": "synthesize", - "apply_ok": true - }, - { - "turn": 2, - "base_input": 1049, - "base_output": 171, - "base_latency_ms": 1703, - "aap_input": 1306, - "aap_output": 371, - "aap_latency_ms": 2591, - "envelope_name": "synthesize", - "apply_ok": true - } - ], - "totals": { - "base_input": 1951, - "base_output": 1045, - "base_combined": 2996, - "aap_input": 3494, - "aap_output": 1181, - "aap_combined": 4675, - "base_latency_ms": 8307, - "aap_latency_ms": 9785, - "output_savings_pct": -13.0, - "input_delta_pct": 79.1, - "combined_savings_pct": -56.0, - "latency_savings_pct": -17.8 - } - }, - "quality": { - "per_turn": [ - { - "turn": 0, - "sequence_similarity": 0.6246, - "token_f1": 0.7441, - "base_char_count": 2209, - "aap_char_count": 1803, - "char_delta_pct": -18.4, - "lines_added": 32, - "lines_removed": 43, - "rouge_l": null, - "bleu": null - }, - { - "turn": 1, - "sequence_similarity": 0.249, - "token_f1": 0.1789, - "base_char_count": 834, - "aap_char_count": 467, - "char_delta_pct": -44.0, - "lines_added": 16, - "lines_removed": 25, - "rouge_l": null, - "bleu": null - }, - { - "turn": 2, - "sequence_similarity": 0.3859, - "token_f1": 0.3718, - "base_char_count": 561, - "aap_char_count": 885, - "char_delta_pct": 57.8, - "lines_added": 38, - "lines_removed": 14, - "rouge_l": null, - "bleu": null - } - ], - "mean_sequence_similarity": 0.4198, - "mean_token_f1": 0.4316, - "mean_rouge_l": null, - "mean_bleu": null, - "judge_comparisons": null, - "mean_base_judge": null, - "mean_aap_judge": null - } -} diff --git a/evals/data/experiments/015-python-dataclasses-models/outputs/aap/turn-0.py b/evals/data/experiments/015-python-dataclasses-models/outputs/aap/turn-0.py deleted file mode 100644 index 23bd746..0000000 --- a/evals/data/experiments/015-python-dataclasses-models/outputs/aap/turn-0.py +++ /dev/null @@ -1,83 +0,0 @@ -from dataclasses import dataclass, field -from datetime import datetime -from enum import Enum -from typing import List, Optional -from uuid import UUID, uuid4 - - -class Priority(Enum): - LOW = "low" - MEDIUM = "medium" - HIGH = "high" - URGENT = "urgent" - -class Status(Enum): - TODO = "todo" - IN_PROGRESS = "in_progress" - REVIEW = "review" - DONE = "done" - -@dataclass(frozen=True) -class TimeEstimate: - hours: int - minutes: int - - def __post_init__(self): - if self.hours < 0 or self.minutes < 0: - raise ValueError("Time values cannot be negative") - -@dataclass(frozen=True) -class DateRange: - start: datetime - end: datetime - - def __post_init__(self): - if self.start > self.end: - raise ValueError("Start date must be before end date") - -@dataclass -class BaseEntity: - - id: UUID = field(default_factory=uuid4) - created_at: datetime = field(default_factory=datetime.utcnow) - updated_at: datetime = field(default_factory=datetime.utcnow) - - -@dataclass -class User(BaseEntity): - name: str = "Unknown User" - email: str = "" - -@dataclass -class Team(BaseEntity): - name: str = "" - member_ids: List[UUID] = field(default_factory=list) - -@dataclass -class Project(BaseEntity): - name: str = "" - lead_id: Optional[UUID] = None - -@dataclass -class Sprint(BaseEntity): - project_id: UUID = None - duration: DateRange = None - -@dataclass -class Task(BaseEntity): - project_id: UUID = None - sprint_id: Optional[UUID] = None - assignee_id: Optional[UUID] = None - title: str = "" - - priority: Priority = Priority.MEDIUM - status: Status = Status.TODO - estimate: Optional[TimeEstimate] = None - - -@dataclass -class Comment(BaseEntity): - task_id: UUID = None - author_id: UUID = None - content: str = "" - \ No newline at end of file diff --git a/evals/data/experiments/015-python-dataclasses-models/outputs/aap/turn-1.json b/evals/data/experiments/015-python-dataclasses-models/outputs/aap/turn-1.json deleted file mode 100644 index 103c134..0000000 --- a/evals/data/experiments/015-python-dataclasses-models/outputs/aap/turn-1.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "project-models-edit-milestone", - "version": 1, - "name": "synthesize", - "meta": { - "format": "text/html", - "tokens_used": null, - "checksum": null, - "state": null - }, - "content": [ - { - "body": "{\n \"protocol\": \"aap/0.1\",\n \"id\": \"project-models\",\n \"version\": 2,\n \"name\": \"edit\",\n \"content\": [\n {\n \"op\": \"insert_after\",\n \"target\": {\n \"type\": \"id\",\n \"value\": \"project-models\"\n },\n \"content\": \"@dataclass\\nclass Milestone(BaseEntity):\\n name: str = \\\"\\\"\\n target_date: datetime = None\\n status: Status = Status.TODO\\n project_id: UUID = None\\n task_ids: List[UUID] = field(default_factory=list)\\n\"\n }\n ]\n}" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/015-python-dataclasses-models/outputs/aap/turn-1.py b/evals/data/experiments/015-python-dataclasses-models/outputs/aap/turn-1.py deleted file mode 100644 index 1f93306..0000000 --- a/evals/data/experiments/015-python-dataclasses-models/outputs/aap/turn-1.py +++ /dev/null @@ -1,16 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "project-models", - "version": 2, - "name": "edit", - "content": [ - { - "op": "insert_after", - "target": { - "type": "id", - "value": "project-models" - }, - "content": "@dataclass\nclass Milestone(BaseEntity):\n name: str = \"\"\n target_date: datetime = None\n status: Status = Status.TODO\n project_id: UUID = None\n task_ids: List[UUID] = field(default_factory=list)\n" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/015-python-dataclasses-models/outputs/aap/turn-2.json b/evals/data/experiments/015-python-dataclasses-models/outputs/aap/turn-2.json deleted file mode 100644 index a73cc2e..0000000 --- a/evals/data/experiments/015-python-dataclasses-models/outputs/aap/turn-2.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "project-models-v2", - "version": 1, - "name": "synthesize", - "meta": { - "format": "text/x-python", - "tokens_used": null, - "checksum": null, - "state": null - }, - "content": [ - { - "body": "from dataclasses import dataclass, field\nfrom datetime import datetime\nfrom enum import Enum\nfrom typing import List\nfrom uuid import UUID\n\nclass Status(Enum):\n TODO = \"TODO\"\n IN_PROGRESS = \"IN_PROGRESS\"\n DONE = \"DONE\"\n\n\nclass Priority(Enum):\n LOW = \"LOW\"\n MEDIUM = \"MEDIUM\"\n HIGH = \"HIGH\"\n URGENT = \"URGENT\"\n\n @property\n def color(self) -> str:\n return {\n Priority.LOW: \"#00FF00\",\n Priority.MEDIUM: \"#FFFF00\",\n Priority.HIGH: \"#FF8C00\",\n Priority.URGENT: \"#FF0000\"\n }[self]\n\n\n@dataclass\nclass BaseEntity:\n id: UUID = None\n created_at: datetime = field(default_factory=datetime.now)\n\n\n@dataclass\nclass Milestone(BaseEntity):\n name: str = \"\"\n target_date: datetime = None\n status: Status = Status.TODO\n project_id: UUID = None\n task_ids: List[UUID] = field(default_factory=list)\n\n" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/015-python-dataclasses-models/outputs/aap/turn-2.py b/evals/data/experiments/015-python-dataclasses-models/outputs/aap/turn-2.py deleted file mode 100644 index ef24e9d..0000000 --- a/evals/data/experiments/015-python-dataclasses-models/outputs/aap/turn-2.py +++ /dev/null @@ -1,42 +0,0 @@ -from dataclasses import dataclass, field -from datetime import datetime -from enum import Enum -from typing import List -from uuid import UUID - -class Status(Enum): - TODO = "TODO" - IN_PROGRESS = "IN_PROGRESS" - DONE = "DONE" - - -class Priority(Enum): - LOW = "LOW" - MEDIUM = "MEDIUM" - HIGH = "HIGH" - URGENT = "URGENT" - - @property - def color(self) -> str: - return { - Priority.LOW: "#00FF00", - Priority.MEDIUM: "#FFFF00", - Priority.HIGH: "#FF8C00", - Priority.URGENT: "#FF0000" - }[self] - - -@dataclass -class BaseEntity: - id: UUID = None - created_at: datetime = field(default_factory=datetime.now) - - -@dataclass -class Milestone(BaseEntity): - name: str = "" - target_date: datetime = None - status: Status = Status.TODO - project_id: UUID = None - task_ids: List[UUID] = field(default_factory=list) - diff --git a/evals/data/experiments/015-python-dataclasses-models/outputs/base/turn-0.py b/evals/data/experiments/015-python-dataclasses-models/outputs/base/turn-0.py deleted file mode 100644 index 25fb547..0000000 --- a/evals/data/experiments/015-python-dataclasses-models/outputs/base/turn-0.py +++ /dev/null @@ -1,93 +0,0 @@ -from dataclasses import dataclass, field -from datetime import datetime -from enum import Enum -from typing import List, Optional, Type, TypeVar -from uuid import UUID, uuid4 - -T = TypeVar("T", bound="BaseModel") - -class Priority(Enum): - LOW = "low" - MEDIUM = "medium" - HIGH = "high" - CRITICAL = "critical" - -class Status(Enum): - TODO = "todo" - IN_PROGRESS = "in_progress" - DONE = "done" - ARCHIVED = "archived" - -@dataclass(frozen=True) -class TimeEstimate: - hours: int - - def __post_init__(self): - if self.hours < 0: - raise ValueError("Estimate cannot be negative") - -@dataclass(frozen=True) -class DateRange: - start: datetime - end: datetime - - def __post_init__(self): - if self.start > self.end: - raise ValueError("Start date must be before end date") - -@dataclass -class BaseModel: - id: UUID = field(default_factory=uuid4) - created_at: datetime = field(default_factory=datetime.utcnow) - updated_at: datetime = field(default_factory=datetime.utcnow) - - def touch(self) -> None: - self.updated_at = datetime.utcnow() - -@dataclass -class User(BaseModel): - username: str - email: str - -@dataclass -class Team(BaseModel): - name: str - member_ids: List[UUID] = field(default_factory=list) - -@dataclass -class Project(BaseModel): - name: str - owner_id: UUID - team_ids: List[UUID] = field(default_factory=list) - -@dataclass -class Sprint(BaseModel): - project_id: UUID - name: str - duration: DateRange - -@dataclass -class Task(BaseModel): - project_id: UUID - title: str - status: Status = Status.TODO - priority: Priority = Priority.MEDIUM - assignee_id: Optional[UUID] = None - sprint_id: Optional[UUID] = None - estimate: Optional[TimeEstimate] = None - - @classmethod - def create(cls: Type[T], title: str, project_id: UUID, **kwargs) -> T: - if not title: - raise ValueError("Title is required") - return cls(title=title, project_id=project_id, **kwargs) - -@dataclass -class Comment(BaseModel): - task_id: UUID - author_id: UUID - content: str - - def __post_init__(self): - if not self.content.strip(): - raise ValueError("Comment content cannot be empty") \ No newline at end of file diff --git a/evals/data/experiments/015-python-dataclasses-models/outputs/base/turn-1.py b/evals/data/experiments/015-python-dataclasses-models/outputs/base/turn-1.py deleted file mode 100644 index 96cef30..0000000 --- a/evals/data/experiments/015-python-dataclasses-models/outputs/base/turn-1.py +++ /dev/null @@ -1,25 +0,0 @@ -@dataclass -class Milestone(BaseModel): - name: str - target_date: datetime - status: Status - project_id: UUID - task_ids: List[UUID] = field(default_factory=list) - - @classmethod - def create(cls, name: str, target_date: datetime, project_id: UUID) -> "Milestone": - """Factory method to initialize a new milestone.""" - if target_date < datetime.utcnow(): - raise ValueError("Target date must be in the future") - return cls( - name=name, - target_date=target_date, - status=Status.TODO, - project_id=project_id - ) - - def add_task(self, task_id: UUID) -> None: - """Adds a task ID to the milestone and updates the timestamp.""" - if task_id not in self.task_ids: - self.task_ids.append(task_id) - self.touch() \ No newline at end of file diff --git a/evals/data/experiments/015-python-dataclasses-models/outputs/base/turn-2.py b/evals/data/experiments/015-python-dataclasses-models/outputs/base/turn-2.py deleted file mode 100644 index 8b56eec..0000000 --- a/evals/data/experiments/015-python-dataclasses-models/outputs/base/turn-2.py +++ /dev/null @@ -1,18 +0,0 @@ -class Priority(Enum): - LOW = "low" - MEDIUM = "medium" - HIGH = "high" - URGENT = "urgent" - CRITICAL = "critical" - - @property - def color(self) -> str: - """Returns the hex color code associated with the priority level.""" - colors = { - Priority.LOW: "#808080", # Gray - Priority.MEDIUM: "#FFA500", # Orange - Priority.HIGH: "#FF8C00", # Dark Orange - Priority.URGENT: "#FF4500", # Orange Red - Priority.CRITICAL: "#FF0000" # Red - } - return colors[self] \ No newline at end of file diff --git a/evals/data/experiments/016-python-websocket-chat/metrics.json b/evals/data/experiments/016-python-websocket-chat/metrics.json deleted file mode 100644 index 5246129..0000000 --- a/evals/data/experiments/016-python-websocket-chat/metrics.json +++ /dev/null @@ -1,228 +0,0 @@ -{ - "experiment_id": "016-python-websocket-chat", - "model": "", - "provider": "google", - "timestamp": "2026-04-03T06:56:32.559163+00:00", - "format": "text/x-python", - "base_turn0": { - "input_tokens": 112, - "output_tokens": 915, - "latency_ms": 4189, - "artifact_bytes": 3436 - }, - "aap_turn0": { - "input_tokens": 451, - "output_tokens": 658, - "latency_ms": 3727, - "artifact_bytes": 2493 - }, - "default_flow": { - "per_turn": [ - { - "turn": 1, - "edit": "Add a new 'ReactionMessage' model with fields: message_id, user_id, emoji, and t", - "input_tokens": 1052, - "output_tokens": 941, - "latency_ms": 4204, - "output_bytes": 3631, - "failed": false, - "failure_reason": "" - }, - { - "turn": 2, - "edit": "Rewrite the broadcast method in ConnectionManager to support broadcasting only t", - "input_tokens": 2012, - "output_tokens": 1018, - "latency_ms": 5083, - "output_bytes": 3980, - "failed": false, - "failure_reason": "" - }, - { - "turn": 3, - "edit": "Add a new handler for 'pin_message' that allows users to pin a message in a room", - "input_tokens": 3057, - "output_tokens": 1103, - "latency_ms": 5234, - "output_bytes": 4169, - "failed": false, - "failure_reason": "" - } - ], - "total_input_tokens": 6121, - "total_output_tokens": 3062, - "total_latency_ms": 14521 - }, - "aap_flow": { - "per_turn": [ - { - "turn": 1, - "edit": "Add a new 'ReactionMessage' model with fields: message_id, user_id, emoji, and t", - "input_tokens": 1791, - "output_tokens": 769, - "latency_ms": 4577, - "output_bytes": 2717, - "failed": false, - "failure_reason": "", - "envelope_parsed": true, - "apply_succeeded": true, - "envelope_name": "synthesize" - }, - { - "turn": 2, - "edit": "Rewrite the broadcast method in ConnectionManager to support broadcasting only t", - "input_tokens": 1852, - "output_tokens": 404, - "latency_ms": 4475, - "output_bytes": 1246, - "failed": false, - "failure_reason": "", - "envelope_parsed": true, - "apply_succeeded": true, - "envelope_name": "synthesize" - }, - { - "turn": 3, - "edit": "Add a new handler for 'pin_message' that allows users to pin a message in a room", - "input_tokens": 0, - "output_tokens": 0, - "latency_ms": 4274, - "output_bytes": 1246, - "failed": true, - "failure_reason": "parse or apply failed", - "envelope_parsed": true, - "apply_succeeded": false, - "envelope_name": "edit" - } - ], - "total_input_tokens": 3643, - "total_output_tokens": 1173, - "total_latency_ms": 13326, - "envelope_parse_rate": 1.0, - "apply_success_rate": 0.6666666666666666 - }, - "comparison": { - "output_token_savings_pct": 61.7, - "input_token_savings_pct": 40.5, - "latency_savings_pct": 8.2 - }, - "token_table": { - "turns": [ - { - "turn": 0, - "base_input": 112, - "base_output": 915, - "base_latency_ms": 4189, - "aap_input": 451, - "aap_output": 658, - "aap_latency_ms": 3727 - }, - { - "turn": 1, - "base_input": 1052, - "base_output": 941, - "base_latency_ms": 4204, - "aap_input": 1791, - "aap_output": 769, - "aap_latency_ms": 4577, - "envelope_name": "synthesize", - "apply_ok": true - }, - { - "turn": 2, - "base_input": 2012, - "base_output": 1018, - "base_latency_ms": 5083, - "aap_input": 1852, - "aap_output": 404, - "aap_latency_ms": 4475, - "envelope_name": "synthesize", - "apply_ok": true - }, - { - "turn": 3, - "base_input": 3057, - "base_output": 1103, - "base_latency_ms": 5234, - "aap_input": 0, - "aap_output": 0, - "aap_latency_ms": 4274, - "envelope_name": "edit", - "apply_ok": false - } - ], - "totals": { - "base_input": 6233, - "base_output": 3977, - "base_combined": 10210, - "aap_input": 4094, - "aap_output": 1831, - "aap_combined": 5925, - "base_latency_ms": 18710, - "aap_latency_ms": 17053, - "output_savings_pct": 54.0, - "input_delta_pct": -34.3, - "combined_savings_pct": 42.0, - "latency_savings_pct": 8.9 - } - }, - "quality": { - "per_turn": [ - { - "turn": 0, - "sequence_similarity": 0.3552, - "token_f1": 0.6187, - "base_char_count": 3436, - "aap_char_count": 2257, - "char_delta_pct": -34.3, - "lines_added": 38, - "lines_removed": 61, - "rouge_l": null, - "bleu": null - }, - { - "turn": 1, - "sequence_similarity": 0.375, - "token_f1": 0.6453, - "base_char_count": 3631, - "aap_char_count": 2428, - "char_delta_pct": -33.1, - "lines_added": 40, - "lines_removed": 63, - "rouge_l": null, - "bleu": null - }, - { - "turn": 2, - "sequence_similarity": 0.0214, - "token_f1": 0.247, - "base_char_count": 3980, - "aap_char_count": 1246, - "char_delta_pct": -68.7, - "lines_added": 16, - "lines_removed": 106, - "rouge_l": null, - "bleu": null - }, - { - "turn": 3, - "sequence_similarity": 0.0207, - "token_f1": 0.2383, - "base_char_count": 4169, - "aap_char_count": 1246, - "char_delta_pct": -70.1, - "lines_added": 16, - "lines_removed": 107, - "rouge_l": null, - "bleu": null - } - ], - "mean_sequence_similarity": 0.1931, - "mean_token_f1": 0.4373, - "mean_rouge_l": null, - "mean_bleu": null, - "judge_comparisons": null, - "mean_base_judge": null, - "mean_aap_judge": null - } -} diff --git a/evals/data/experiments/016-python-websocket-chat/outputs/aap/turn-0.py b/evals/data/experiments/016-python-websocket-chat/outputs/aap/turn-0.py deleted file mode 100644 index 368c331..0000000 --- a/evals/data/experiments/016-python-websocket-chat/outputs/aap/turn-0.py +++ /dev/null @@ -1,72 +0,0 @@ - -import asyncio -from typing import Dict, List, Optional -from fastapi import FastAPI, WebSocket, WebSocketDisconnect, HTTPException -from pydantic import BaseModel, Field -from datetime import datetime - -app = FastAPI() - -class ChatMessage(BaseModel): - - type: str = "message" - room: str - sender: str - content: str - timestamp: str = Field(default_factory=lambda: datetime.utcnow().isoformat()) - - -class ConnectionManager: - - def __init__(self): - self.active_connections: Dict[str, List[WebSocket]] = {} - self.history: Dict[str, List[dict]] = {} - - async def connect(self, websocket: WebSocket, room: str): - await websocket.accept() - if room not in self.active_connections: - self.active_connections[room] = [] - self.active_connections[room].append(websocket) - - def disconnect(self, websocket: WebSocket, room: str): - self.active_connections[room].remove(websocket) - - async def broadcast(self, message: dict, room: str): - if room in self.active_connections: - for connection in self.active_connections[room]: - await connection.send_json(message) - - def add_history(self, room: str, message: dict): - if room not in self.history: - self.history[room] = [] - self.history[room].append(message) - - -manager = ConnectionManager() - -@app.websocket("/ws/{room}/{username}") -async def websocket_endpoint(websocket: WebSocket, room: str, username: str): - - await manager.connect(websocket, room) - await manager.broadcast({"type": "UserJoined", "user": username}, room) - - try: - while True: - data = await websocket.receive_json() - # Basic rate limiting simulation - await asyncio.sleep(0.1) - - msg = ChatMessage(room=room, sender=username, content=data.get("content", "")) - manager.add_history(room, msg.dict()) - await manager.broadcast(msg.dict(), room) - except WebSocketDisconnect: - manager.disconnect(websocket, room) - await manager.broadcast({"type": "UserLeft", "user": username}, room) - - -@app.get("/rooms") -async def list_rooms(): - - return {"rooms": list(manager.active_connections.keys())} - - \ No newline at end of file diff --git a/evals/data/experiments/016-python-websocket-chat/outputs/aap/turn-1.json b/evals/data/experiments/016-python-websocket-chat/outputs/aap/turn-1.json deleted file mode 100644 index 925196d..0000000 --- a/evals/data/experiments/016-python-websocket-chat/outputs/aap/turn-1.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "fastapi-chat-app", - "version": 1, - "name": "synthesize", - "meta": { - "format": "text/html", - "tokens_used": null, - "checksum": null, - "state": null - }, - "content": [ - { - "body": "\nimport asyncio\nfrom typing import Dict, List, Optional\nfrom fastapi import FastAPI, WebSocket, WebSocketDisconnect, HTTPException\nfrom pydantic import BaseModel, Field\nfrom datetime import datetime\n\napp = FastAPI()\n\nclass ChatMessage(BaseModel):\n \n type: str = \"message\"\n room: str\n sender: str\n content: str\n timestamp: str = Field(default_factory=lambda: datetime.utcnow().isoformat())\n \n\n\nclass ReactionMessage(BaseModel):\n message_id: str\n user_id: str\n emoji: str\n timestamp: str = Field(default_factory=lambda: datetime.utcnow().isoformat())\n\n\nclass ConnectionManager:\n \n def __init__(self):\n self.active_connections: Dict[str, List[WebSocket]] = {}\n self.history: Dict[str, List[dict]] = {}\n\n async def connect(self, websocket: WebSocket, room: str):\n await websocket.accept()\n if room not in self.active_connections:\n self.active_connections[room] = []\n self.active_connections[room].append(websocket)\n\n def disconnect(self, websocket: WebSocket, room: str):\n self.active_connections[room].remove(websocket)\n\n async def broadcast(self, message: dict, room: str):\n if room in self.active_connections:\n for connection in self.active_connections[room]:\n await connection.send_json(message)\n\n def add_history(self, room: str, message: dict):\n if room not in self.history:\n self.history[room] = []\n self.history[room].append(message)\n \n\nmanager = ConnectionManager()\n\n@app.websocket(\"/ws/{room}/{username}\")\nasync def websocket_endpoint(websocket: WebSocket, room: str, username: str):\n \n await manager.connect(websocket, room)\n await manager.broadcast({\"type\": \"UserJoined\", \"user\": username}, room)\n \n try:\n while True:\n data = await websocket.receive_json()\n # Basic rate limiting simulation\n await asyncio.sleep(0.1)\n \n msg = ChatMessage(room=room, sender=username, content=data.get(\"content\", \"\"))\n manager.add_history(room, msg.dict())\n await manager.broadcast(msg.dict(), room)\n except WebSocketDisconnect:\n manager.disconnect(websocket, room)\n await manager.broadcast({\"type\": \"UserLeft\", \"user\": username}, room)\n \n\n@app.get(\"/rooms\")\nasync def list_rooms():\n \n return {\"rooms\": list(manager.active_connections.keys())}\n \n" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/016-python-websocket-chat/outputs/aap/turn-1.py b/evals/data/experiments/016-python-websocket-chat/outputs/aap/turn-1.py deleted file mode 100644 index 5f11d8c..0000000 --- a/evals/data/experiments/016-python-websocket-chat/outputs/aap/turn-1.py +++ /dev/null @@ -1,80 +0,0 @@ - -import asyncio -from typing import Dict, List, Optional -from fastapi import FastAPI, WebSocket, WebSocketDisconnect, HTTPException -from pydantic import BaseModel, Field -from datetime import datetime - -app = FastAPI() - -class ChatMessage(BaseModel): - - type: str = "message" - room: str - sender: str - content: str - timestamp: str = Field(default_factory=lambda: datetime.utcnow().isoformat()) - - - -class ReactionMessage(BaseModel): - message_id: str - user_id: str - emoji: str - timestamp: str = Field(default_factory=lambda: datetime.utcnow().isoformat()) - - -class ConnectionManager: - - def __init__(self): - self.active_connections: Dict[str, List[WebSocket]] = {} - self.history: Dict[str, List[dict]] = {} - - async def connect(self, websocket: WebSocket, room: str): - await websocket.accept() - if room not in self.active_connections: - self.active_connections[room] = [] - self.active_connections[room].append(websocket) - - def disconnect(self, websocket: WebSocket, room: str): - self.active_connections[room].remove(websocket) - - async def broadcast(self, message: dict, room: str): - if room in self.active_connections: - for connection in self.active_connections[room]: - await connection.send_json(message) - - def add_history(self, room: str, message: dict): - if room not in self.history: - self.history[room] = [] - self.history[room].append(message) - - -manager = ConnectionManager() - -@app.websocket("/ws/{room}/{username}") -async def websocket_endpoint(websocket: WebSocket, room: str, username: str): - - await manager.connect(websocket, room) - await manager.broadcast({"type": "UserJoined", "user": username}, room) - - try: - while True: - data = await websocket.receive_json() - # Basic rate limiting simulation - await asyncio.sleep(0.1) - - msg = ChatMessage(room=room, sender=username, content=data.get("content", "")) - manager.add_history(room, msg.dict()) - await manager.broadcast(msg.dict(), room) - except WebSocketDisconnect: - manager.disconnect(websocket, room) - await manager.broadcast({"type": "UserLeft", "user": username}, room) - - -@app.get("/rooms") -async def list_rooms(): - - return {"rooms": list(manager.active_connections.keys())} - - \ No newline at end of file diff --git a/evals/data/experiments/016-python-websocket-chat/outputs/aap/turn-2.json b/evals/data/experiments/016-python-websocket-chat/outputs/aap/turn-2.json deleted file mode 100644 index 958b040..0000000 --- a/evals/data/experiments/016-python-websocket-chat/outputs/aap/turn-2.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "artifact-id", - "version": 1, - "name": "synthesize", - "meta": { - "format": "text/html", - "tokens_used": null, - "checksum": null, - "state": null - }, - "content": [ - { - "body": "{\n \"protocol\": \"aap/0.1\",\n \"id\": \"artifact-id\",\n \"version\": 1,\n \"name\": \"edit\",\n \"content\": [\n {\n \"op\": \"replace\",\n \"target\": {\n \"type\": \"id\",\n \"value\": \"connection-manager\"\n },\n \"content\": \" def __init__(self):\\n self.active_connections: Dict[str, List[WebSocket]] = {}\\n self.history: Dict[str, List[dict]] = {}\\n\\n async def connect(self, websocket: WebSocket, room: str):\\n await websocket.accept()\\n if room not in self.active_connections:\\n self.active_connections[room] = []\\n self.active_connections[room].append(websocket)\\n\\n def disconnect(self, websocket: WebSocket, room: str):\\n self.active_connections[room].remove(websocket)\\n if not self.active_connections[room]:\\n del self.active_connections[room]\\n\\n async def broadcast(self, message: dict, room: str):\\n if room in self.active_connections:\\n for connection in self.active_connections[room]:\\n await connection.send_json(message)\\n\\n def add_history(self, room: str, message: dict):\\n if room not in self.history:\\n self.history[room] = []\\n self.history[room].append(message)\"\n }\n ]\n}" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/016-python-websocket-chat/outputs/aap/turn-2.py b/evals/data/experiments/016-python-websocket-chat/outputs/aap/turn-2.py deleted file mode 100644 index 3797157..0000000 --- a/evals/data/experiments/016-python-websocket-chat/outputs/aap/turn-2.py +++ /dev/null @@ -1,16 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "artifact-id", - "version": 1, - "name": "edit", - "content": [ - { - "op": "replace", - "target": { - "type": "id", - "value": "connection-manager" - }, - "content": " def __init__(self):\n self.active_connections: Dict[str, List[WebSocket]] = {}\n self.history: Dict[str, List[dict]] = {}\n\n async def connect(self, websocket: WebSocket, room: str):\n await websocket.accept()\n if room not in self.active_connections:\n self.active_connections[room] = []\n self.active_connections[room].append(websocket)\n\n def disconnect(self, websocket: WebSocket, room: str):\n self.active_connections[room].remove(websocket)\n if not self.active_connections[room]:\n del self.active_connections[room]\n\n async def broadcast(self, message: dict, room: str):\n if room in self.active_connections:\n for connection in self.active_connections[room]:\n await connection.send_json(message)\n\n def add_history(self, room: str, message: dict):\n if room not in self.history:\n self.history[room] = []\n self.history[room].append(message)" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/016-python-websocket-chat/outputs/aap/turn-3.json b/evals/data/experiments/016-python-websocket-chat/outputs/aap/turn-3.json deleted file mode 100644 index 2625b89..0000000 --- a/evals/data/experiments/016-python-websocket-chat/outputs/aap/turn-3.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "artifact-id", - "version": 2, - "name": "edit", - "meta": { - "format": "text/html", - "tokens_used": null, - "checksum": null, - "state": null - }, - "content": [ - { - "op": "insert_after", - "target": { - "type": "id", - "value": "connection-manager" - }, - "content": " async def pin_message(self, room: str, message_id: str, pin_data: dict):\\n \\\"\\\"\\\"Pins a message within a specific room and notifies everyone.\\\"\\\"\\\"\\n message = {\\n \\\"type\\\": \\\"pin_message\\\",\\n \\\"message_id\\\": message_id,\\n \\\"data\\\": pin_data\\n }\\n await self.broadcast(message, room)\\n" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/016-python-websocket-chat/outputs/aap/turn-3.py b/evals/data/experiments/016-python-websocket-chat/outputs/aap/turn-3.py deleted file mode 100644 index 3797157..0000000 --- a/evals/data/experiments/016-python-websocket-chat/outputs/aap/turn-3.py +++ /dev/null @@ -1,16 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "artifact-id", - "version": 1, - "name": "edit", - "content": [ - { - "op": "replace", - "target": { - "type": "id", - "value": "connection-manager" - }, - "content": " def __init__(self):\n self.active_connections: Dict[str, List[WebSocket]] = {}\n self.history: Dict[str, List[dict]] = {}\n\n async def connect(self, websocket: WebSocket, room: str):\n await websocket.accept()\n if room not in self.active_connections:\n self.active_connections[room] = []\n self.active_connections[room].append(websocket)\n\n def disconnect(self, websocket: WebSocket, room: str):\n self.active_connections[room].remove(websocket)\n if not self.active_connections[room]:\n del self.active_connections[room]\n\n async def broadcast(self, message: dict, room: str):\n if room in self.active_connections:\n for connection in self.active_connections[room]:\n await connection.send_json(message)\n\n def add_history(self, room: str, message: dict):\n if room not in self.history:\n self.history[room] = []\n self.history[room].append(message)" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/016-python-websocket-chat/outputs/base/turn-0.py b/evals/data/experiments/016-python-websocket-chat/outputs/base/turn-0.py deleted file mode 100644 index af87482..0000000 --- a/evals/data/experiments/016-python-websocket-chat/outputs/base/turn-0.py +++ /dev/null @@ -1,94 +0,0 @@ -import asyncio -from typing import Dict, List, Set -from datetime import datetime -from pydantic import BaseModel, Field -from fastapi import FastAPI, WebSocket, WebSocketDisconnect, HTTPException, status -from fastapi.middleware.cors import CORSMiddleware - -app = FastAPI() -app.add_middleware(CORSMiddleware, allow_origins=["*"]) - -class ChatMessage(BaseModel): - type: str = "message" - sender: str - room: str - content: str - timestamp: str = Field(default_factory=lambda: datetime.now().isoformat()) - -class SystemMessage(BaseModel): - type: str = "system" - content: str - -class ConnectionManager: - def __init__(self): - self.active_connections: Dict[str, Dict[str, WebSocket]] = {} - self.history: Dict[str, List[ChatMessage]] = {} - self.rate_limit: Dict[str, float] = {} - - async def connect(self, websocket: WebSocket, room: str, user: str): - await websocket.accept() - if room not in self.active_connections: - self.active_connections[room] = {} - self.history[room] = [] - self.active_connections[room][user] = websocket - - def disconnect(self, room: str, user: str): - if room in self.active_connections and user in self.active_connections[room]: - del self.active_connections[room][user] - - async def broadcast(self, room: str, message: dict): - if room in self.active_connections: - for connection in self.active_connections[room].values(): - await connection.send_json(message) - - async def send_to_user(self, websocket: WebSocket, message: dict): - await websocket.send_json(message) - -manager = ConnectionManager() - -@app.websocket("/ws/{room}/{user}") -async def websocket_endpoint(websocket: WebSocket, room: str, user: str): - await manager.connect(websocket, room, user) - await manager.broadcast(room, {"type": "user_joined", "user": user}) - - try: - while True: - data = await websocket.receive_json() - - # Basic Rate Limiting - now = asyncio.get_event_loop().time() - if now - manager.rate_limit.get(user, 0) < 0.5: - await manager.send_to_user(websocket, {"type": "error", "message": "Rate limit exceeded"}) - continue - manager.rate_limit[user] = now - - msg_type = data.get("type") - - if msg_type == "message": - chat_msg = ChatMessage(sender=user, room=room, content=data["content"]) - manager.history[room].append(chat_msg) - await manager.broadcast(room, chat_msg.dict()) - - elif msg_type == "typing": - await manager.broadcast(room, {"type": "typing", "user": user}) - - except WebSocketDisconnect: - manager.disconnect(room, user) - await manager.broadcast(room, {"type": "user_left", "user": user}) - -@app.get("/rooms") -async def list_rooms(): - return {"rooms": list(manager.active_connections.keys())} - -@app.get("/rooms/{room}/history") -async def get_history(room: str): - if room not in manager.history: - raise HTTPException(status_code=404, detail="Room not found") - return manager.history[room] - -@app.post("/rooms/{room}/clear") -async def clear_history(room: str): - if room in manager.history: - manager.history[room] = [] - return {"status": "cleared"} - raise HTTPException(status_code=404, detail="Room not found") \ No newline at end of file diff --git a/evals/data/experiments/016-python-websocket-chat/outputs/base/turn-1.py b/evals/data/experiments/016-python-websocket-chat/outputs/base/turn-1.py deleted file mode 100644 index ea91ac7..0000000 --- a/evals/data/experiments/016-python-websocket-chat/outputs/base/turn-1.py +++ /dev/null @@ -1,102 +0,0 @@ -import asyncio -from typing import Dict, List, Optional -from datetime import datetime -from pydantic import BaseModel, Field -from fastapi import FastAPI, WebSocket, WebSocketDisconnect, HTTPException -from fastapi.middleware.cors import CORSMiddleware - -app = FastAPI() -app.add_middleware(CORSMiddleware, allow_origins=["*"]) - -class ChatMessage(BaseModel): - id: str - type: str = "message" - sender: str - room: str - content: str - timestamp: str = Field(default_factory=lambda: datetime.now().isoformat()) - -class ReactionMessage(BaseModel): - type: str = "reaction" - message_id: str - user_id: str - emoji: str - timestamp: str = Field(default_factory=lambda: datetime.now().isoformat()) - -class ConnectionManager: - def __init__(self): - self.active_connections: Dict[str, Dict[str, WebSocket]] = {} - self.history: Dict[str, List[ChatMessage]] = {} - self.rate_limit: Dict[str, float] = {} - - async def connect(self, websocket: WebSocket, room: str, user: str): - await websocket.accept() - if room not in self.active_connections: - self.active_connections[room] = {} - self.history[room] = [] - self.active_connections[room][user] = websocket - - def disconnect(self, room: str, user: str): - if room in self.active_connections and user in self.active_connections[room]: - del self.active_connections[room][user] - - async def broadcast(self, room: str, message: dict): - if room in self.active_connections: - for connection in self.active_connections[room].values(): - await connection.send_json(message) - - async def send_to_user(self, websocket: WebSocket, message: dict): - await websocket.send_json(message) - -manager = ConnectionManager() - -@app.websocket("/ws/{room}/{user}") -async def websocket_endpoint(websocket: WebSocket, room: str, user: str): - await manager.connect(websocket, room, user) - await manager.broadcast(room, {"type": "user_joined", "user": user}) - - try: - while True: - data = await websocket.receive_json() - - now = asyncio.get_event_loop().time() - if now - manager.rate_limit.get(user, 0) < 0.2: - continue - manager.rate_limit[user] = now - - msg_type = data.get("type") - - if msg_type == "message": - chat_msg = ChatMessage( - id=str(len(manager.history.get(room, []))), - sender=user, - room=room, - content=data["content"] - ) - manager.history[room].append(chat_msg) - await manager.broadcast(room, chat_msg.dict()) - - elif msg_type == "reaction": - reaction = ReactionMessage( - message_id=data["message_id"], - user_id=user, - emoji=data["emoji"] - ) - await manager.broadcast(room, reaction.dict()) - - elif msg_type == "typing": - await manager.broadcast(room, {"type": "typing", "user": user}) - - except WebSocketDisconnect: - manager.disconnect(room, user) - await manager.broadcast(room, {"type": "user_left", "user": user}) - -@app.get("/rooms") -async def list_rooms(): - return {"rooms": list(manager.active_connections.keys())} - -@app.get("/rooms/{room}/history") -async def get_history(room: str): - if room not in manager.history: - raise HTTPException(status_code=404, detail="Room not found") - return manager.history[room] \ No newline at end of file diff --git a/evals/data/experiments/016-python-websocket-chat/outputs/base/turn-2.py b/evals/data/experiments/016-python-websocket-chat/outputs/base/turn-2.py deleted file mode 100644 index fbbd2e6..0000000 --- a/evals/data/experiments/016-python-websocket-chat/outputs/base/turn-2.py +++ /dev/null @@ -1,106 +0,0 @@ -import asyncio -from typing import Dict, List, Optional -from datetime import datetime -import uuid -from pydantic import BaseModel, Field -from fastapi import FastAPI, WebSocket, WebSocketDisconnect, HTTPException -from fastapi.middleware.cors import CORSMiddleware - -app = FastAPI() -app.add_middleware(CORSMiddleware, allow_origins=["*"]) - -class ChatMessage(BaseModel): - id: str = Field(default_factory=lambda: str(uuid.uuid4())) - type: str = "message" - sender: str - room: str - content: str - timestamp: str = Field(default_factory=lambda: datetime.now().isoformat()) - -class ReactionMessage(BaseModel): - type: str = "reaction" - message_id: str - user_id: str - emoji: str - timestamp: str = Field(default_factory=lambda: datetime.now().isoformat()) - -class ConnectionManager: - def __init__(self): - # Room -> {Username -> WebSocket} - self.active_connections: Dict[str, Dict[str, WebSocket]] = {} - self.history: Dict[str, List[ChatMessage]] = {} - self.rate_limit: Dict[str, float] = {} - - async def connect(self, websocket: WebSocket, room: str, user: str): - await websocket.accept() - if room not in self.active_connections: - self.active_connections[room] = {} - self.history[room] = [] - self.active_connections[room][user] = websocket - - def disconnect(self, room: str, user: str): - if room in self.active_connections and user in self.active_connections[room]: - del self.active_connections[room][user] - if not self.active_connections[room]: - del self.active_connections[room] - - async def broadcast(self, room: str, message: dict, exclude_user: Optional[str] = None): - """Broadcasts a message only to users within the specified room.""" - if room in self.active_connections: - for username, connection in self.active_connections[room].items(): - if username != exclude_user: - try: - await connection.send_json(message) - except Exception: - continue - - async def send_to_user(self, websocket: WebSocket, message: dict): - await websocket.send_json(message) - -manager = ConnectionManager() - -@app.websocket("/ws/{room}/{user}") -async def websocket_endpoint(websocket: WebSocket, room: str, user: str): - await manager.connect(websocket, room, user) - await manager.broadcast(room, {"type": "user_joined", "user": user}) - - try: - while True: - data = await websocket.receive_json() - - now = asyncio.get_event_loop().time() - if now - manager.rate_limit.get(user, 0) < 0.2: - continue - manager.rate_limit[user] = now - - msg_type = data.get("type") - - if msg_type == "message": - chat_msg = ChatMessage(sender=user, room=room, content=data["content"]) - manager.history[room].append(chat_msg) - await manager.broadcast(room, chat_msg.dict()) - - elif msg_type == "reaction": - reaction = ReactionMessage( - message_id=data["message_id"], - user_id=user, - emoji=data["emoji"] - ) - await manager.broadcast(room, reaction.dict()) - - elif msg_type == "typing": - await manager.broadcast(room, {"type": "typing", "user": user}, exclude_user=user) - - except WebSocketDisconnect: - manager.disconnect(room, user) - await manager.broadcast(room, {"type": "user_left", "user": user}) - -@app.get("/rooms") -async def list_rooms(): - return {"rooms": list(manager.active_connections.keys())} - -@app.get("/rooms/{room}/history") -async def get_history(room: str): - if room not in manager.history: - raise HTTPException(status_code=404, detail="Room not found") - return manager.history[room] \ No newline at end of file diff --git a/evals/data/experiments/016-python-websocket-chat/outputs/base/turn-3.py b/evals/data/experiments/016-python-websocket-chat/outputs/base/turn-3.py deleted file mode 100644 index b43e30d..0000000 --- a/evals/data/experiments/016-python-websocket-chat/outputs/base/turn-3.py +++ /dev/null @@ -1,107 +0,0 @@ -import asyncio -from typing import Dict, List, Optional -from datetime import datetime -import uuid -from pydantic import BaseModel, Field -from fastapi import FastAPI, WebSocket, WebSocketDisconnect, HTTPException -from fastapi.middleware.cors import CORSMiddleware - -app = FastAPI() -app.add_middleware(CORSMiddleware, allow_origins=["*"]) - -class ChatMessage(BaseModel): - id: str = Field(default_factory=lambda: str(uuid.uuid4())) - type: str = "message" - sender: str - room: str - content: str - timestamp: str = Field(default_factory=lambda: datetime.now().isoformat()) - -class ReactionMessage(BaseModel): - type: str = "reaction" - message_id: str - user_id: str - emoji: str - timestamp: str = Field(default_factory=lambda: datetime.now().isoformat()) - -class PinMessage(BaseModel): - type: str = "pin" - message_id: str - pinned_by: str - timestamp: str = Field(default_factory=lambda: datetime.now().isoformat()) - -class ConnectionManager: - def __init__(self): - self.active_connections: Dict[str, Dict[str, WebSocket]] = {} - self.history: Dict[str, List[ChatMessage]] = {} - self.pinned_messages: Dict[str, List[str]] = {} # Room -> List of message_ids - self.rate_limit: Dict[str, float] = {} - - async def connect(self, websocket: WebSocket, room: str, user: str): - await websocket.accept() - if room not in self.active_connections: - self.active_connections[room] = {} - self.history[room] = [] - self.pinned_messages[room] = [] - self.active_connections[room][user] = websocket - - def disconnect(self, room: str, user: str): - if room in self.active_connections and user in self.active_connections[room]: - del self.active_connections[room][user] - - async def broadcast(self, room: str, message: dict, exclude_user: Optional[str] = None): - if room in self.active_connections: - for username, connection in self.active_connections[room].items(): - if username != exclude_user: - try: - await connection.send_json(message) - except Exception: - pass - - async def send_to_user(self, websocket: WebSocket, message: dict): - await websocket.send_json(message) - -manager = ConnectionManager() - -@app.websocket("/ws/{room}/{user}") -async def websocket_endpoint(websocket: WebSocket, room: str, user: str): - await manager.connect(websocket, room, user) - await manager.broadcast(room, {"type": "user_joined", "user": user}) - - try: - while True: - data = await websocket.receive_json() - now = asyncio.get_event_loop().time() - if now - manager.rate_limit.get(user, 0) < 0.2: - continue - manager.rate_limit[user] = now - - msg_type = data.get("type") - - if msg_type == "message": - chat_msg = ChatMessage(sender=user, room=room, content=data["content"]) - manager.history[room].append(chat_msg) - await manager.broadcast(room, chat_msg.dict()) - - elif msg_type == "reaction": - reaction = ReactionMessage(message_id=data["message_id"], user_id=user, emoji=data["emoji"]) - await manager.broadcast(room, reaction.dict()) - - elif msg_type == "pin": - msg_id = data.get("message_id") - manager.pinned_messages[room].append(msg_id) - pin_event = PinMessage(message_id=msg_id, pinned_by=user) - await manager.broadcast(room, pin_event.dict()) - - elif msg_type == "typing": - await manager.broadcast(room, {"type": "typing", "user": user}, exclude_user=user) - - except WebSocketDisconnect: - manager.disconnect(room, user) - await manager.broadcast(room, {"type": "user_left", "user": user}) - -@app.get("/rooms/{room}/pinned") -async def get_pinned_messages(room: str): - if room not in manager.pinned_messages: - raise HTTPException(status_code=404, detail="Room not found") - return {"pinned": manager.pinned_messages[room]} \ No newline at end of file diff --git a/evals/data/experiments/017-python-scraper/eval.json b/evals/data/experiments/017-python-scraper/eval.json deleted file mode 100644 index cb9e0b6..0000000 --- a/evals/data/experiments/017-python-scraper/eval.json +++ /dev/null @@ -1,23 +0,0 @@ -{ - "per_turn": [ - { - "turn": 0, - "sequence_similarity": 0.6423, - "token_f1": 0.6848, - "base_char_count": 3923, - "aap_char_count": 3301, - "char_delta_pct": -15.9, - "lines_added": 45, - "lines_removed": 51, - "rouge_l": null, - "bleu": null - } - ], - "mean_sequence_similarity": 0.6423, - "mean_token_f1": 0.6848, - "mean_rouge_l": null, - "mean_bleu": null, - "judge_comparisons": null, - "mean_base_judge": null, - "mean_aap_judge": null -} diff --git a/evals/data/experiments/017-python-scraper/metrics.json b/evals/data/experiments/017-python-scraper/metrics.json deleted file mode 100644 index 2273cbc..0000000 --- a/evals/data/experiments/017-python-scraper/metrics.json +++ /dev/null @@ -1,182 +0,0 @@ -{ - "experiment_id": "017-python-scraper", - "model": "", - "provider": "google", - "timestamp": "2026-04-03T06:57:08.388939+00:00", - "format": "text/x-python", - "base_turn0": { - "input_tokens": 116, - "output_tokens": 1215, - "latency_ms": 6182, - "artifact_bytes": 4145 - }, - "aap_turn0": { - "input_tokens": 455, - "output_tokens": 815, - "latency_ms": 4325, - "artifact_bytes": 2849 - }, - "default_flow": { - "per_turn": [ - { - "turn": 1, - "edit": "Update the config to set the rate limit to 2 requests per second and add a proxy", - "input_tokens": 1354, - "output_tokens": 449, - "latency_ms": 2864, - "output_bytes": 1323, - "failed": false, - "failure_reason": "" - }, - { - "turn": 2, - "edit": "Add a new parser field for 'discount_price' that extracts sale prices and calcul", - "input_tokens": 1829, - "output_tokens": 672, - "latency_ms": 3784, - "output_bytes": 2374, - "failed": false, - "failure_reason": "" - } - ], - "total_input_tokens": 3183, - "total_output_tokens": 1121, - "total_latency_ms": 6648 - }, - "aap_flow": { - "per_turn": [ - { - "turn": 1, - "edit": "Update the config to set the rate limit to 2 requests per second and add a proxy", - "input_tokens": 1946, - "output_tokens": 907, - "latency_ms": 4759, - "output_bytes": 3014, - "failed": false, - "failure_reason": "", - "envelope_parsed": true, - "apply_succeeded": true, - "envelope_name": "synthesize" - }, - { - "turn": 2, - "edit": "Add a new parser field for 'discount_price' that extracts sale prices and calcul", - "input_tokens": 1999, - "output_tokens": 1126, - "latency_ms": 6189, - "output_bytes": 3748, - "failed": false, - "failure_reason": "", - "envelope_parsed": true, - "apply_succeeded": true, - "envelope_name": "synthesize" - } - ], - "total_input_tokens": 3945, - "total_output_tokens": 2033, - "total_latency_ms": 10948, - "envelope_parse_rate": 1.0, - "apply_success_rate": 1.0 - }, - "comparison": { - "output_token_savings_pct": -81.4, - "input_token_savings_pct": -23.9, - "latency_savings_pct": -64.7 - }, - "token_table": { - "turns": [ - { - "turn": 0, - "base_input": 116, - "base_output": 1215, - "base_latency_ms": 6182, - "aap_input": 455, - "aap_output": 815, - "aap_latency_ms": 4325 - }, - { - "turn": 1, - "base_input": 1354, - "base_output": 449, - "base_latency_ms": 2864, - "aap_input": 1946, - "aap_output": 907, - "aap_latency_ms": 4759, - "envelope_name": "synthesize", - "apply_ok": true - }, - { - "turn": 2, - "base_input": 1829, - "base_output": 672, - "base_latency_ms": 3784, - "aap_input": 1999, - "aap_output": 1126, - "aap_latency_ms": 6189, - "envelope_name": "synthesize", - "apply_ok": true - } - ], - "totals": { - "base_input": 3299, - "base_output": 2336, - "base_combined": 5635, - "aap_input": 4400, - "aap_output": 2848, - "aap_combined": 7248, - "base_latency_ms": 12830, - "aap_latency_ms": 15273, - "output_savings_pct": -21.9, - "input_delta_pct": 33.4, - "combined_savings_pct": -28.6, - "latency_savings_pct": -19.0 - } - }, - "quality": { - "per_turn": [ - { - "turn": 0, - "sequence_similarity": 0.3352, - "token_f1": 0.6014, - "base_char_count": 4145, - "aap_char_count": 2675, - "char_delta_pct": -35.5, - "lines_added": 52, - "lines_removed": 80, - "rouge_l": null, - "bleu": null - }, - { - "turn": 1, - "sequence_similarity": 0.11, - "token_f1": 0.3235, - "base_char_count": 1323, - "aap_char_count": 2840, - "char_delta_pct": 114.7, - "lines_added": 78, - "lines_removed": 26, - "rouge_l": null, - "bleu": null - }, - { - "turn": 2, - "sequence_similarity": 0.2007, - "token_f1": 0.3828, - "base_char_count": 2374, - "aap_char_count": 3574, - "char_delta_pct": 50.5, - "lines_added": 99, - "lines_removed": 45, - "rouge_l": null, - "bleu": null - } - ], - "mean_sequence_similarity": 0.2153, - "mean_token_f1": 0.4359, - "mean_rouge_l": null, - "mean_bleu": null, - "judge_comparisons": null, - "mean_base_judge": null, - "mean_aap_judge": null - } -} diff --git a/evals/data/experiments/017-python-scraper/outputs/aap/turn-0.py b/evals/data/experiments/017-python-scraper/outputs/aap/turn-0.py deleted file mode 100644 index 28747aa..0000000 --- a/evals/data/experiments/017-python-scraper/outputs/aap/turn-0.py +++ /dev/null @@ -1,81 +0,0 @@ -import json -import sqlite3 -import time -import logging -from dataclasses import dataclass -from typing import Dict, Any, List -import requests -from bs4 import BeautifulSoup -from requests.adapters import HTTPAdapter -from urllib3.util.retry import Retry - -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - - -@dataclass -class ScraperConfig: - base_url: str = "https://example.com" - rate_limit: float = 1.5 - retry_total: int = 3 - retry_backoff: float = 2.0 - output_jsonl: str = "products.jsonl" - db_path: str = "products.db" - user_agents: List[str] = None - - - -class ProductFetcher: - def __init__(self, config: ScraperConfig): - self.config = config - self.session = requests.Session() - retries = Retry(total=config.retry_total, backoff_factor=config.retry_backoff) - self.session.mount("https://", HTTPAdapter(max_retries=retries)) - self.session.headers.update({"User-Agent": "Mozilla/5.0"}) - - def fetch(self, url: str) -> str: - time.sleep(self.config.rate_limit) - response = self.session.get(url, timeout=10) - response.raise_for_status() - return response.text - - - -class ProductParser: - @staticmethod - def parse_product(html: str) -> Dict[str, Any]: - soup = BeautifulSoup(html, 'html.parser') - return { - "name": soup.select_one(".product-title").text.strip(), - "price": soup.select_one(".price").text.strip(), - "rating": soup.select_one(".rating").get("data-value"), - "review_count": soup.select_one(".reviews").text.strip(), - "availability": soup.select_one(".stock").text.strip(), - "image_url": soup.select_one("img.main-image").get("src") - } - - - -class Storage: - def __init__(self, config: ScraperConfig): - self.config = config - self._init_db() - - def _init_db(self): - with sqlite3.connect(self.config.db_path) as conn: - conn.execute(""" - CREATE TABLE IF NOT EXISTS products ( - id INTEGER PRIMARY KEY, - name TEXT, price TEXT, rating TEXT, - reviews TEXT, stock TEXT, img TEXT - ) - """) - - def save(self, data: Dict[str, Any]): - with open(self.config.output_jsonl, "a") as f: - f.write(json.dumps(data) + "\n") - - with sqlite3.connect(self.config.db_path) as conn: - conn.execute("INSERT INTO products (name, price, rating, reviews, stock, img) VALUES (?,?,?,?,?,?)", - (data['name'], data['price'], data['rating'], data['review_count'], data['availability'], data['image_url'])) - \ No newline at end of file diff --git a/evals/data/experiments/017-python-scraper/outputs/aap/turn-1.json b/evals/data/experiments/017-python-scraper/outputs/aap/turn-1.json deleted file mode 100644 index 969d3f1..0000000 --- a/evals/data/experiments/017-python-scraper/outputs/aap/turn-1.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "scraper-script", - "version": 1, - "name": "synthesize", - "meta": { - "format": "text/html", - "tokens_used": null, - "checksum": null, - "state": null - }, - "content": [ - { - "body": "import json\nimport sqlite3\nimport time\nimport logging\nfrom dataclasses import dataclass\nfrom typing import Dict, Any, List, Optional\nimport requests\nfrom bs4 import BeautifulSoup\nfrom requests.adapters import HTTPAdapter\nfrom urllib3.util.retry import Retry\n\nlogging.basicConfig(level=logging.INFO)\nlogger = logging.getLogger(__name__)\n\n\n@dataclass\nclass ScraperConfig:\n base_url: str = \"https://example.com\"\n rate_limit: float = 0.5\n retry_total: int = 3\n retry_backoff: float = 2.0\n output_jsonl: str = \"products.jsonl\"\n db_path: str = \"products.db\"\n user_agents: List[str] = None\n proxy_url: Optional[str] = None\n\n\n\nclass ProductFetcher:\n def __init__(self, config: ScraperConfig):\n self.config = config\n self.session = requests.Session()\n retries = Retry(total=config.retry_total, backoff_factor=config.retry_backoff)\n self.session.mount(\"https://\", HTTPAdapter(max_retries=retries))\n self.session.headers.update({\"User-Agent\": \"Mozilla/5.0\"})\n if config.proxy_url:\n self.session.proxies = {\"http\": config.proxy_url, \"https\": config.proxy_url}\n\n def fetch(self, url: str) -> str:\n time.sleep(self.config.rate_limit)\n response = self.session.get(url, timeout=10)\n response.raise_for_status()\n return response.text\n\n\n\nclass ProductParser:\n @staticmethod\n def parse_product(html: str) -> Dict[str, Any]:\n soup = BeautifulSoup(html, 'html.parser')\n return {\n \"name\": soup.select_one(\".product-title\").text.strip(),\n \"price\": soup.select_one(\".price\").text.strip(),\n \"rating\": soup.select_one(\".rating\").get(\"data-value\"),\n \"review_count\": soup.select_one(\".reviews\").text.strip(),\n \"availability\": soup.select_one(\".stock\").text.strip(),\n \"image_url\": soup.select_one(\"img.main-image\").get(\"src\")\n }\n\n\n\nclass Storage:\n def __init__(self, config: ScraperConfig):\n self.config = config\n self._init_db()\n\n def _init_db(self):\n with sqlite3.connect(self.config.db_path) as conn:\n conn.execute(\"\"\"\n CREATE TABLE IF NOT EXISTS products (\n id INTEGER PRIMARY KEY,\n name TEXT, price TEXT, rating TEXT, \n reviews TEXT, stock TEXT, img TEXT\n )\n \"\"\")\n\n def save(self, data: Dict[str, Any]):\n with open(self.config.output_jsonl, \"a\") as f:\n f.write(json.dumps(data) + \"\\n\")\n \n with sqlite3.connect(self.config.db_path) as conn:\n conn.execute(\"INSERT INTO products (name, price, rating, reviews, stock, img) VALUES (?,?,?,?,?,?)\",\n (data['name'], data['price'], data['rating'], data['review_count'], data['availability'], data['image_url']))\n\n" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/017-python-scraper/outputs/aap/turn-1.py b/evals/data/experiments/017-python-scraper/outputs/aap/turn-1.py deleted file mode 100644 index 00a5ccf..0000000 --- a/evals/data/experiments/017-python-scraper/outputs/aap/turn-1.py +++ /dev/null @@ -1,84 +0,0 @@ -import json -import sqlite3 -import time -import logging -from dataclasses import dataclass -from typing import Dict, Any, List, Optional -import requests -from bs4 import BeautifulSoup -from requests.adapters import HTTPAdapter -from urllib3.util.retry import Retry - -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - - -@dataclass -class ScraperConfig: - base_url: str = "https://example.com" - rate_limit: float = 0.5 - retry_total: int = 3 - retry_backoff: float = 2.0 - output_jsonl: str = "products.jsonl" - db_path: str = "products.db" - user_agents: List[str] = None - proxy_url: Optional[str] = None - - - -class ProductFetcher: - def __init__(self, config: ScraperConfig): - self.config = config - self.session = requests.Session() - retries = Retry(total=config.retry_total, backoff_factor=config.retry_backoff) - self.session.mount("https://", HTTPAdapter(max_retries=retries)) - self.session.headers.update({"User-Agent": "Mozilla/5.0"}) - if config.proxy_url: - self.session.proxies = {"http": config.proxy_url, "https": config.proxy_url} - - def fetch(self, url: str) -> str: - time.sleep(self.config.rate_limit) - response = self.session.get(url, timeout=10) - response.raise_for_status() - return response.text - - - -class ProductParser: - @staticmethod - def parse_product(html: str) -> Dict[str, Any]: - soup = BeautifulSoup(html, 'html.parser') - return { - "name": soup.select_one(".product-title").text.strip(), - "price": soup.select_one(".price").text.strip(), - "rating": soup.select_one(".rating").get("data-value"), - "review_count": soup.select_one(".reviews").text.strip(), - "availability": soup.select_one(".stock").text.strip(), - "image_url": soup.select_one("img.main-image").get("src") - } - - - -class Storage: - def __init__(self, config: ScraperConfig): - self.config = config - self._init_db() - - def _init_db(self): - with sqlite3.connect(self.config.db_path) as conn: - conn.execute(""" - CREATE TABLE IF NOT EXISTS products ( - id INTEGER PRIMARY KEY, - name TEXT, price TEXT, rating TEXT, - reviews TEXT, stock TEXT, img TEXT - ) - """) - - def save(self, data: Dict[str, Any]): - with open(self.config.output_jsonl, "a") as f: - f.write(json.dumps(data) + "\n") - - with sqlite3.connect(self.config.db_path) as conn: - conn.execute("INSERT INTO products (name, price, rating, reviews, stock, img) VALUES (?,?,?,?,?,?)", - (data['name'], data['price'], data['rating'], data['review_count'], data['availability'], data['image_url'])) - diff --git a/evals/data/experiments/017-python-scraper/outputs/aap/turn-2.json b/evals/data/experiments/017-python-scraper/outputs/aap/turn-2.json deleted file mode 100644 index 4f7da50..0000000 --- a/evals/data/experiments/017-python-scraper/outputs/aap/turn-2.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "scraper_artifact", - "version": 1, - "name": "synthesize", - "meta": { - "format": "text/x-python", - "tokens_used": null, - "checksum": null, - "state": null - }, - "content": [ - { - "body": "import json\nimport sqlite3\nimport time\nimport logging\nfrom dataclasses import dataclass\nfrom typing import Dict, Any, List, Optional\nimport requests\nfrom bs4 import BeautifulSoup\nfrom requests.adapters import HTTPAdapter\nfrom urllib3.util.retry import Retry\n\nlogging.basicConfig(level=logging.INFO)\nlogger = logging.getLogger(__name__)\n\n\n@dataclass\nclass ScraperConfig:\n base_url: str = \"https://example.com\"\n rate_limit: float = 0.5\n retry_total: int = 3\n retry_backoff: float = 2.0\n output_jsonl: str = \"products.jsonl\"\n db_path: str = \"products.db\"\n user_agents: List[str] = None\n proxy_url: Optional[str] = None\n\n\n\nclass ProductFetcher:\n def __init__(self, config: ScraperConfig):\n self.config = config\n self.session = requests.Session()\n retries = Retry(total=config.retry_total, backoff_factor=config.retry_backoff)\n self.session.mount(\"https://\", HTTPAdapter(max_retries=retries))\n self.session.headers.update({\"User-Agent\": \"Mozilla/5.0\"})\n if config.proxy_url:\n self.session.proxies = {\"http\": config.proxy_url, \"https\": config.proxy_url}\n\n def fetch(self, url: str) -> str:\n time.sleep(self.config.rate_limit)\n response = self.session.get(url, timeout=10)\n response.raise_for_status()\n return response.text\n\n\n\nclass ProductParser:\n @staticmethod\n def parse_product(html: str) -> Dict[str, Any]:\n soup = BeautifulSoup(html, 'html.parser')\n \n price_el = soup.select_one(\".price\")\n discount_el = soup.select_one(\".discount-price\")\n \n data = {\n \"name\": soup.select_one(\".product-title\").text.strip(),\n \"price\": price_el.text.strip(),\n \"rating\": soup.select_one(\".rating\").get(\"data-value\"),\n \"review_count\": soup.select_one(\".reviews\").text.strip(),\n \"availability\": soup.select_one(\".stock\").text.strip(),\n \"image_url\": soup.select_one(\"img.main-image\").get(\"src\"),\n \"discount_price\": discount_el.text.strip() if discount_el else None\n }\n \n if data[\"discount_price\"]:\n try:\n p = float(data[\"price\"].replace(\"$\", \"\"))\n dp = float(data[\"discount_price\"].replace(\"$\", \"\"))\n data[\"discount_pct\"] = round(((p - dp) / p) * 100, 2)\n except:\n data[\"discount_pct\"] = 0\n else:\n data[\"discount_pct\"] = 0\n \n return data\n\n\n\nclass Storage:\n def __init__(self, config: ScraperConfig):\n self.config = config\n self._init_db()\n\n def _init_db(self):\n with sqlite3.connect(self.config.db_path) as conn:\n conn.execute(\"\"\"\n CREATE TABLE IF NOT EXISTS products (\n id INTEGER PRIMARY KEY,\n name TEXT, price TEXT, rating TEXT, \n reviews TEXT, stock TEXT, img TEXT,\n discount_price TEXT, discount_pct REAL\n )\n \"\"\")\n\n def save(self, data: Dict[str, Any]):\n with open(self.config.output_jsonl, \"a\") as f:\n f.write(json.dumps(data) + \"\\n\")\n \n with sqlite3.connect(self.config.db_path) as conn:\n conn.execute(\"INSERT INTO products (name, price, rating, reviews, stock, img, discount_price, discount_pct) VALUES (?,?,?,?,?,?,?,?)\",\n (data['name'], data['price'], data['rating'], data['review_count'], data['availability'], data['image_url'], data.get('discount_price'), data.get('discount_pct')))\n\n" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/017-python-scraper/outputs/aap/turn-2.py b/evals/data/experiments/017-python-scraper/outputs/aap/turn-2.py deleted file mode 100644 index 1493eaa..0000000 --- a/evals/data/experiments/017-python-scraper/outputs/aap/turn-2.py +++ /dev/null @@ -1,102 +0,0 @@ -import json -import sqlite3 -import time -import logging -from dataclasses import dataclass -from typing import Dict, Any, List, Optional -import requests -from bs4 import BeautifulSoup -from requests.adapters import HTTPAdapter -from urllib3.util.retry import Retry - -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - - -@dataclass -class ScraperConfig: - base_url: str = "https://example.com" - rate_limit: float = 0.5 - retry_total: int = 3 - retry_backoff: float = 2.0 - output_jsonl: str = "products.jsonl" - db_path: str = "products.db" - user_agents: List[str] = None - proxy_url: Optional[str] = None - - - -class ProductFetcher: - def __init__(self, config: ScraperConfig): - self.config = config - self.session = requests.Session() - retries = Retry(total=config.retry_total, backoff_factor=config.retry_backoff) - self.session.mount("https://", HTTPAdapter(max_retries=retries)) - self.session.headers.update({"User-Agent": "Mozilla/5.0"}) - if config.proxy_url: - self.session.proxies = {"http": config.proxy_url, "https": config.proxy_url} - - def fetch(self, url: str) -> str: - time.sleep(self.config.rate_limit) - response = self.session.get(url, timeout=10) - response.raise_for_status() - return response.text - - - -class ProductParser: - @staticmethod - def parse_product(html: str) -> Dict[str, Any]: - soup = BeautifulSoup(html, 'html.parser') - - price_el = soup.select_one(".price") - discount_el = soup.select_one(".discount-price") - - data = { - "name": soup.select_one(".product-title").text.strip(), - "price": price_el.text.strip(), - "rating": soup.select_one(".rating").get("data-value"), - "review_count": soup.select_one(".reviews").text.strip(), - "availability": soup.select_one(".stock").text.strip(), - "image_url": soup.select_one("img.main-image").get("src"), - "discount_price": discount_el.text.strip() if discount_el else None - } - - if data["discount_price"]: - try: - p = float(data["price"].replace("$", "")) - dp = float(data["discount_price"].replace("$", "")) - data["discount_pct"] = round(((p - dp) / p) * 100, 2) - except: - data["discount_pct"] = 0 - else: - data["discount_pct"] = 0 - - return data - - - -class Storage: - def __init__(self, config: ScraperConfig): - self.config = config - self._init_db() - - def _init_db(self): - with sqlite3.connect(self.config.db_path) as conn: - conn.execute(""" - CREATE TABLE IF NOT EXISTS products ( - id INTEGER PRIMARY KEY, - name TEXT, price TEXT, rating TEXT, - reviews TEXT, stock TEXT, img TEXT, - discount_price TEXT, discount_pct REAL - ) - """) - - def save(self, data: Dict[str, Any]): - with open(self.config.output_jsonl, "a") as f: - f.write(json.dumps(data) + "\n") - - with sqlite3.connect(self.config.db_path) as conn: - conn.execute("INSERT INTO products (name, price, rating, reviews, stock, img, discount_price, discount_pct) VALUES (?,?,?,?,?,?,?,?)", - (data['name'], data['price'], data['rating'], data['review_count'], data['availability'], data['image_url'], data.get('discount_price'), data.get('discount_pct'))) - diff --git a/evals/data/experiments/017-python-scraper/outputs/base/turn-0.py b/evals/data/experiments/017-python-scraper/outputs/base/turn-0.py deleted file mode 100644 index 8fd3b2b..0000000 --- a/evals/data/experiments/017-python-scraper/outputs/base/turn-0.py +++ /dev/null @@ -1,108 +0,0 @@ -import logging -import time -import json -import sqlite3 -import random -from dataclasses import dataclass -from typing import Optional, Dict, Any -from pathlib import Path - -import requests -from bs4 import BeautifulSoup -from requests.adapters import HTTPAdapter -from urllib3.util.retry import Retry - -logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') -logger = logging.getLogger(__name__) - -@dataclass -class Config: - base_url: str = "https://example-ecommerce.com" - rate_limit: float = 2.0 - max_retries: int = 3 - backoff_factor: float = 1.0 - output_json: str = "products.jsonl" - db_path: str = "products.db" - user_agents: list = None - - def __post_init__(self): - if self.user_agents is None: - self.user_agents = [ - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", - "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36" - ] - -class Storage: - def __init__(self, config: Config): - self.config = config - self._init_db() - - def _init_db(self): - with sqlite3.connect(self.config.db_path) as conn: - conn.execute(""" - CREATE TABLE IF NOT EXISTS products ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - name TEXT, price REAL, rating REAL, - reviews INTEGER, availability TEXT, image_url TEXT - ) - """) - - def save(self, data: Dict[str, Any]): - with open(self.config.output_json, 'a', encoding='utf-8') as f: - f.write(json.dumps(data) + '\n') - - with sqlite3.connect(self.config.db_path) as conn: - conn.execute(""" - INSERT INTO products (name, price, rating, reviews, availability, image_url) - VALUES (?, ?, ?, ?, ?, ?) - """, (data['name'], data['price'], data['rating'], data['reviews'], data['availability'], data['image_url'])) - -class Fetcher: - def __init__(self, config: Config): - self.config = config - self.session = requests.Session() - retries = Retry(total=config.max_retries, backoff_factor=config.backoff_factor) - self.session.mount("https://", HTTPAdapter(max_retries=retries)) - - def fetch(self, url: str) -> Optional[str]: - time.sleep(self.config.rate_limit) - headers = {"User-Agent": random.choice(self.config.user_agents)} - try: - response = self.session.get(url, headers=headers, timeout=10) - response.raise_for_status() - return response.text - except Exception as e: - logger.error(f"Failed to fetch {url}: {e}") - return None - -class Parser: - @staticmethod - def parse_product(html: str) -> Dict[str, Any]: - soup = BeautifulSoup(html, 'html.parser') - # Selectors are placeholders for demonstration - return { - "name": soup.select_one(".product-title").text.strip() if soup.select_one(".product-title") else None, - "price": float(soup.select_one(".price").text.replace('$', '').strip()) if soup.select_one(".price") else 0.0, - "rating": float(soup.select_one(".rating").get('data-score', 0)) if soup.select_one(".rating") else 0.0, - "reviews": int(soup.select_one(".review-count").text.split()[0]) if soup.select_one(".review-count") else 0, - "availability": soup.select_one(".stock").text.strip() if soup.select_one(".stock") else "Unknown", - "image_url": soup.select_one(".product-img")['src'] if soup.select_one(".product-img") else None - } - -def main(): - config = Config() - fetcher = Fetcher(config) - storage = Storage(config) - parser = Parser() - - urls = [f"{config.base_url}/product/{i}" for i in range(1, 11)] - - for url in urls: - html = fetcher.fetch(url) - if html: - data = parser.parse_product(html) - storage.save(data) - logger.info(f"Saved product: {data.get('name')}") - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/evals/data/experiments/017-python-scraper/outputs/base/turn-1.py b/evals/data/experiments/017-python-scraper/outputs/base/turn-1.py deleted file mode 100644 index 70e4ff3..0000000 --- a/evals/data/experiments/017-python-scraper/outputs/base/turn-1.py +++ /dev/null @@ -1,32 +0,0 @@ -@dataclass -class Config: - base_url: str = "https://example-ecommerce.com" - # 2 requests per second = 0.5 seconds delay - rate_limit: float = 0.5 - proxy_url: Optional[str] = None - max_retries: int = 3 - backoff_factor: float = 1.0 - output_json: str = "products.jsonl" - db_path: str = "products.db" - user_agents: list = None - - def __post_init__(self): - if self.user_agents is None: - self.user_agents = [ - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", - "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36" - ] - -# Inside Fetcher class - def fetch(self, url: str) -> Optional[str]: - time.sleep(self.config.rate_limit) - headers = {"User-Agent": random.choice(self.config.user_agents)} - proxies = {"http": self.config.proxy_url, "https": self.config.proxy_url} if self.config.proxy_url else None - - try: - response = self.session.get(url, headers=headers, proxies=proxies, timeout=10) - response.raise_for_status() - return response.text - except Exception as e: - logger.error(f"Failed to fetch {url}: {e}") - return None \ No newline at end of file diff --git a/evals/data/experiments/017-python-scraper/outputs/base/turn-2.py b/evals/data/experiments/017-python-scraper/outputs/base/turn-2.py deleted file mode 100644 index 971c1a5..0000000 --- a/evals/data/experiments/017-python-scraper/outputs/base/turn-2.py +++ /dev/null @@ -1,48 +0,0 @@ -@staticmethod -def parse_product(html: str) -> Dict[str, Any]: - soup = BeautifulSoup(html, 'html.parser') - - # Extract raw prices - price_elem = soup.select_one(".price") - sale_elem = soup.select_one(".discount-price") - - original_price = float(price_elem.text.replace('$', '').strip()) if price_elem else 0.0 - discount_price = float(sale_elem.text.replace('$', '').strip()) if sale_elem else original_price - - # Calculate discount percentage - discount_percent = 0.0 - if original_price > 0 and discount_price < original_price: - discount_percent = ((original_price - discount_price) / original_price) * 100 - - return { - "name": soup.select_one(".product-title").text.strip() if soup.select_one(".product-title") else None, - "price": original_price, - "discount_price": discount_price, - "discount_percent": round(discount_percent, 2), - "rating": float(soup.select_one(".rating").get('data-score', 0)) if soup.select_one(".rating") else 0.0, - "reviews": int(soup.select_one(".review-count").text.split()[0]) if soup.select_one(".review-count") else 0, - "availability": soup.select_one(".stock").text.strip() if soup.select_one(".stock") else "Unknown", - "image_url": soup.select_one(".product-img")['src'] if soup.select_one(".product-img") else None - } - -# Update storage layer to include new fields -def _init_db(self): - with sqlite3.connect(self.config.db_path) as conn: - conn.execute(""" - CREATE TABLE IF NOT EXISTS products ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - name TEXT, price REAL, discount_price REAL, discount_percent REAL, - rating REAL, reviews INTEGER, availability TEXT, image_url TEXT - ) - """) - -def save(self, data: Dict[str, Any]): - with open(self.config.output_json, 'a', encoding='utf-8') as f: - f.write(json.dumps(data) + '\n') - - with sqlite3.connect(self.config.db_path) as conn: - conn.execute(""" - INSERT INTO products (name, price, discount_price, discount_percent, rating, reviews, availability, image_url) - VALUES (?, ?, ?, ?, ?, ?, ?, ?) - """, (data['name'], data['price'], data['discount_price'], data['discount_percent'], - data['rating'], data['reviews'], data['availability'], data['image_url'])) \ No newline at end of file diff --git a/evals/data/experiments/018-js-react-data-table/metrics.json b/evals/data/experiments/018-js-react-data-table/metrics.json deleted file mode 100644 index fd10fe6..0000000 --- a/evals/data/experiments/018-js-react-data-table/metrics.json +++ /dev/null @@ -1,274 +0,0 @@ -{ - "experiment_id": "018-js-react-data-table", - "model": "", - "provider": "google", - "timestamp": "2026-04-03T06:57:36.544280+00:00", - "format": "text/javascript", - "base_turn0": { - "input_tokens": 125, - "output_tokens": 1436, - "latency_ms": 13625, - "artifact_bytes": 4653 - }, - "aap_turn0": { - "input_tokens": 464, - "output_tokens": 752, - "latency_ms": 6713, - "artifact_bytes": 2430 - }, - "default_flow": { - "per_turn": [ - { - "turn": 1, - "edit": "Add a new 'department' column after 'role' with values like Engineering, Marketi", - "input_tokens": 1585, - "output_tokens": 1655, - "latency_ms": 9772, - "output_bytes": 5455, - "failed": false, - "failure_reason": "" - }, - { - "turn": 2, - "edit": "Rewrite the toolbar component to include a date range filter with 'from' and 'to", - "input_tokens": 3263, - "output_tokens": 1454, - "latency_ms": 6114, - "output_bytes": 4715, - "failed": false, - "failure_reason": "" - }, - { - "turn": 3, - "edit": "Add 10 more user entries to the sample data with international names and email a", - "input_tokens": 4736, - "output_tokens": 2002, - "latency_ms": 8215, - "output_bytes": 6068, - "failed": false, - "failure_reason": "" - }, - { - "turn": 4, - "edit": "Update the pagination component to show 'Showing X-Y of Z results' text and a ju", - "input_tokens": 6764, - "output_tokens": 1979, - "latency_ms": 7608, - "output_bytes": 5973, - "failed": false, - "failure_reason": "" - } - ], - "total_input_tokens": 16348, - "total_output_tokens": 7090, - "total_latency_ms": 31709 - }, - "aap_flow": { - "per_turn": [ - { - "turn": 1, - "edit": "Add a new 'department' column after 'role' with values like Engineering, Marketi", - "input_tokens": 0, - "output_tokens": 0, - "latency_ms": 5295, - "output_bytes": 2430, - "failed": true, - "failure_reason": "parse or apply failed", - "envelope_parsed": true, - "apply_succeeded": false, - "envelope_name": "edit" - }, - { - "turn": 2, - "edit": "Rewrite the toolbar component to include a date range filter with 'from' and 'to", - "input_tokens": 1881, - "output_tokens": 971, - "latency_ms": 4250, - "output_bytes": 3045, - "failed": false, - "failure_reason": "", - "envelope_parsed": true, - "apply_succeeded": true, - "envelope_name": "synthesize" - }, - { - "turn": 3, - "edit": "Add 10 more user entries to the sample data with international names and email a", - "input_tokens": 2052, - "output_tokens": 1129, - "latency_ms": 4855, - "output_bytes": 3326, - "failed": false, - "failure_reason": "", - "envelope_parsed": true, - "apply_succeeded": true, - "envelope_name": "synthesize" - }, - { - "turn": 4, - "edit": "Update the pagination component to show 'Showing X-Y of Z results' text and a ju", - "input_tokens": 2197, - "output_tokens": 1279, - "latency_ms": 5649, - "output_bytes": 3886, - "failed": false, - "failure_reason": "", - "envelope_parsed": true, - "apply_succeeded": true, - "envelope_name": "synthesize" - } - ], - "total_input_tokens": 6130, - "total_output_tokens": 3379, - "total_latency_ms": 20049, - "envelope_parse_rate": 1.0, - "apply_success_rate": 0.75 - }, - "comparison": { - "output_token_savings_pct": 52.3, - "input_token_savings_pct": 62.5, - "latency_savings_pct": 36.8 - }, - "token_table": { - "turns": [ - { - "turn": 0, - "base_input": 125, - "base_output": 1436, - "base_latency_ms": 13625, - "aap_input": 464, - "aap_output": 752, - "aap_latency_ms": 6713 - }, - { - "turn": 1, - "base_input": 1585, - "base_output": 1655, - "base_latency_ms": 9772, - "aap_input": 0, - "aap_output": 0, - "aap_latency_ms": 5295, - "envelope_name": "edit", - "apply_ok": false - }, - { - "turn": 2, - "base_input": 3263, - "base_output": 1454, - "base_latency_ms": 6114, - "aap_input": 1881, - "aap_output": 971, - "aap_latency_ms": 4250, - "envelope_name": "synthesize", - "apply_ok": true - }, - { - "turn": 3, - "base_input": 4736, - "base_output": 2002, - "base_latency_ms": 8215, - "aap_input": 2052, - "aap_output": 1129, - "aap_latency_ms": 4855, - "envelope_name": "synthesize", - "apply_ok": true - }, - { - "turn": 4, - "base_input": 6764, - "base_output": 1979, - "base_latency_ms": 7608, - "aap_input": 2197, - "aap_output": 1279, - "aap_latency_ms": 5649, - "envelope_name": "synthesize", - "apply_ok": true - } - ], - "totals": { - "base_input": 16473, - "base_output": 8526, - "base_combined": 24999, - "aap_input": 6594, - "aap_output": 4131, - "aap_combined": 10725, - "base_latency_ms": 45334, - "aap_latency_ms": 26762, - "output_savings_pct": 51.5, - "input_delta_pct": -60.0, - "combined_savings_pct": 57.1, - "latency_savings_pct": 41.0 - } - }, - "quality": { - "per_turn": [ - { - "turn": 0, - "sequence_similarity": 0.311, - "token_f1": 0.5266, - "base_char_count": 4649, - "aap_char_count": 2310, - "char_delta_pct": -50.3, - "lines_added": 42, - "lines_removed": 76, - "rouge_l": null, - "bleu": null - }, - { - "turn": 1, - "sequence_similarity": 0.2773, - "token_f1": 0.488, - "base_char_count": 5451, - "aap_char_count": 2310, - "char_delta_pct": -57.6, - "lines_added": 41, - "lines_removed": 84, - "rouge_l": null, - "bleu": null - }, - { - "turn": 2, - "sequence_similarity": 0.3216, - "token_f1": 0.5529, - "base_char_count": 4711, - "aap_char_count": 2925, - "char_delta_pct": -37.9, - "lines_added": 55, - "lines_removed": 70, - "rouge_l": null, - "bleu": null - }, - { - "turn": 3, - "sequence_similarity": 0.2636, - "token_f1": 0.4389, - "base_char_count": 6064, - "aap_char_count": 3206, - "char_delta_pct": -47.1, - "lines_added": 59, - "lines_removed": 84, - "rouge_l": null, - "bleu": null - }, - { - "turn": 4, - "sequence_similarity": 0.2186, - "token_f1": 0.4578, - "base_char_count": 5969, - "aap_char_count": 3766, - "char_delta_pct": -36.9, - "lines_added": 80, - "lines_removed": 105, - "rouge_l": null, - "bleu": null - } - ], - "mean_sequence_similarity": 0.2784, - "mean_token_f1": 0.4928, - "mean_rouge_l": null, - "mean_bleu": null, - "judge_comparisons": null, - "mean_base_judge": null, - "mean_aap_judge": null - } -} diff --git a/evals/data/experiments/018-js-react-data-table/outputs/aap/turn-0.txt b/evals/data/experiments/018-js-react-data-table/outputs/aap/turn-0.txt deleted file mode 100644 index 6399df4..0000000 --- a/evals/data/experiments/018-js-react-data-table/outputs/aap/turn-0.txt +++ /dev/null @@ -1,76 +0,0 @@ -import React, { useState, useMemo } from 'react'; - -const users = Array.from({ length: 20 }, (_, i) => ({ - id: i + 1, - name: `User ${i + 1}`, - email: `user${i + 1}@example.com`, - role: i % 3 === 0 ? 'Admin' : 'User', - status: i % 2 === 0 ? 'Active' : 'Inactive', - joined: `2023-01-${(i % 28) + 1}` -})); - -export const useSort = (data) => { - const [config, setConfig] = useState({ key: null, direction: 'asc' }); - const sortedData = useMemo(() => { - if (!config.key) return data; - return [...data].sort((a, b) => { - if (a[config.key] < b[config.key]) return config.direction === 'asc' ? -1 : 1; - if (a[config.key] > b[config.key]) return config.direction === 'asc' ? 1 : -1; - return 0; - }); - }, [data, config]); - return { sortedData, requestSort: (key) => setConfig({ key, direction: config.key === key && config.direction === 'asc' ? 'desc' : 'asc' }) }; -}; - -export const UserTable = () => { - const [data, setData] = useState(users); - const [search, setSearch] = useState(''); - const [page, setPage] = useState(1); - const { sortedData, requestSort } = useSort(data); - - const filteredData = useMemo(() => - sortedData.filter(u => u.name.toLowerCase().includes(search.toLowerCase())), - [sortedData, search] - ); - - const paginatedData = filteredData.slice((page - 1) * 5, page * 5); - - return ( -
- - setSearch(e.target.value)} /> - - - - - - - - - - - - - - {paginatedData.map(user => ( - - - - - - - ))} - - -
requestSort('name')}>Name requestSort('email')}>EmailRoleStatus
{user.name}{user.email}{user.role}{user.status}
- - -
- Page: {page} - - -
-
-
- ); -}; \ No newline at end of file diff --git a/evals/data/experiments/018-js-react-data-table/outputs/aap/turn-1.json b/evals/data/experiments/018-js-react-data-table/outputs/aap/turn-1.json deleted file mode 100644 index f68b742..0000000 --- a/evals/data/experiments/018-js-react-data-table/outputs/aap/turn-1.json +++ /dev/null @@ -1,38 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "user-table-artifact", - "version": 2, - "name": "edit", - "meta": { - "format": "text/html", - "tokens_used": null, - "checksum": null, - "state": null - }, - "content": [ - { - "op": "replace", - "target": { - "type": "id", - "value": "users-data" - }, - "content": "const users = Array.from({ length: 20 }, (_, i) => ({\n id: i + 1,\n name: `User ${i + 1}`,\n email: `user${i + 1}@example.com`,\n role: i % 3 === 0 ? 'Admin' : 'User',\n department: ['Engineering', 'Marketing', 'Sales', 'Support'][i % 4],\n status: i % 2 === 0 ? 'Active' : 'Inactive',\n joined: `2023-01-${(i % 28) + 1}`\n}));\n" - }, - { - "op": "replace", - "target": { - "type": "id", - "value": "table-header" - }, - "content": " requestSort('name')}>Name\n requestSort('email')}>Email\n Role\n Department\n Status\n" - }, - { - "op": "replace", - "target": { - "type": "id", - "value": "table-body" - }, - "content": " \n {user.name}\n {user.email}\n {user.role}\n {user.department}\n {user.status}\n \n" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/018-js-react-data-table/outputs/aap/turn-1.txt b/evals/data/experiments/018-js-react-data-table/outputs/aap/turn-1.txt deleted file mode 100644 index 6399df4..0000000 --- a/evals/data/experiments/018-js-react-data-table/outputs/aap/turn-1.txt +++ /dev/null @@ -1,76 +0,0 @@ -import React, { useState, useMemo } from 'react'; - -const users = Array.from({ length: 20 }, (_, i) => ({ - id: i + 1, - name: `User ${i + 1}`, - email: `user${i + 1}@example.com`, - role: i % 3 === 0 ? 'Admin' : 'User', - status: i % 2 === 0 ? 'Active' : 'Inactive', - joined: `2023-01-${(i % 28) + 1}` -})); - -export const useSort = (data) => { - const [config, setConfig] = useState({ key: null, direction: 'asc' }); - const sortedData = useMemo(() => { - if (!config.key) return data; - return [...data].sort((a, b) => { - if (a[config.key] < b[config.key]) return config.direction === 'asc' ? -1 : 1; - if (a[config.key] > b[config.key]) return config.direction === 'asc' ? 1 : -1; - return 0; - }); - }, [data, config]); - return { sortedData, requestSort: (key) => setConfig({ key, direction: config.key === key && config.direction === 'asc' ? 'desc' : 'asc' }) }; -}; - -export const UserTable = () => { - const [data, setData] = useState(users); - const [search, setSearch] = useState(''); - const [page, setPage] = useState(1); - const { sortedData, requestSort } = useSort(data); - - const filteredData = useMemo(() => - sortedData.filter(u => u.name.toLowerCase().includes(search.toLowerCase())), - [sortedData, search] - ); - - const paginatedData = filteredData.slice((page - 1) * 5, page * 5); - - return ( -
- - setSearch(e.target.value)} /> - - - - - - - - - - - - - - {paginatedData.map(user => ( - - - - - - - ))} - - -
requestSort('name')}>Name requestSort('email')}>EmailRoleStatus
{user.name}{user.email}{user.role}{user.status}
- - -
- Page: {page} - - -
-
-
- ); -}; \ No newline at end of file diff --git a/evals/data/experiments/018-js-react-data-table/outputs/aap/turn-2.json b/evals/data/experiments/018-js-react-data-table/outputs/aap/turn-2.json deleted file mode 100644 index 26735a3..0000000 --- a/evals/data/experiments/018-js-react-data-table/outputs/aap/turn-2.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "user-table-component", - "version": 1, - "name": "synthesize", - "meta": { - "format": "text/html", - "tokens_used": null, - "checksum": null, - "state": null - }, - "content": [ - { - "body": "import React, { useState, useMemo } from 'react';\n\nconst users = Array.from({ length: 20 }, (_, i) => ({\n id: i + 1,\n name: `User ${i + 1}`,\n email: `user${i + 1}@example.com`,\n role: i % 3 === 0 ? 'Admin' : 'User',\n status: i % 2 === 0 ? 'Active' : 'Inactive',\n joined: `2023-01-${(i % 28) + 1}`\n}));\n\nexport const useSort = (data) => {\n const [config, setConfig] = useState({ key: null, direction: 'asc' });\n const sortedData = useMemo(() => {\n if (!config.key) return data;\n return [...data].sort((a, b) => {\n if (a[config.key] < b[config.key]) return config.direction === 'asc' ? -1 : 1;\n if (a[config.key] > b[config.key]) return config.direction === 'asc' ? 1 : -1;\n return 0;\n });\n }, [data, config]);\n return { sortedData, requestSort: (key) => setConfig({ key, direction: config.key === key && config.direction === 'asc' ? 'desc' : 'asc' }) };\n};\n\nexport const UserTable = () => {\n const [data, setData] = useState(users);\n const [search, setSearch] = useState('');\n const [dateFrom, setDateFrom] = useState('');\n const [dateTo, setDateTo] = useState('');\n const [page, setPage] = useState(1);\n const { sortedData, requestSort } = useSort(data);\n\n const filteredData = useMemo(() => \n sortedData.filter(u => {\n const matchesSearch = u.name.toLowerCase().includes(search.toLowerCase());\n const matchesFrom = !dateFrom || u.joined >= dateFrom;\n const matchesTo = !dateTo || u.joined <= dateTo;\n return matchesSearch && matchesFrom && matchesTo;\n }),\n [sortedData, search, dateFrom, dateTo]\n );\n\n const paginatedData = filteredData.slice((page - 1) * 5, page * 5);\n\n return (\n
\n \n setSearch(e.target.value)} />\n \n \n \n\n \n \n \n \n \n \n \n \n \n \n \n \n {paginatedData.map(user => (\n \n \n \n \n \n \n \n ))}\n \n \n
requestSort('name')}>Name requestSort('email')}>EmailRoleStatus requestSort('joined')}>Joined
{user.name}{user.email}{user.role}{user.status}{user.joined}
\n\n \n
\n Page: {page} \n \n \n
\n
\n
\n );\n};\n" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/018-js-react-data-table/outputs/aap/turn-2.txt b/evals/data/experiments/018-js-react-data-table/outputs/aap/turn-2.txt deleted file mode 100644 index 8a156b2..0000000 --- a/evals/data/experiments/018-js-react-data-table/outputs/aap/turn-2.txt +++ /dev/null @@ -1,87 +0,0 @@ -import React, { useState, useMemo } from 'react'; - -const users = Array.from({ length: 20 }, (_, i) => ({ - id: i + 1, - name: `User ${i + 1}`, - email: `user${i + 1}@example.com`, - role: i % 3 === 0 ? 'Admin' : 'User', - status: i % 2 === 0 ? 'Active' : 'Inactive', - joined: `2023-01-${(i % 28) + 1}` -})); - -export const useSort = (data) => { - const [config, setConfig] = useState({ key: null, direction: 'asc' }); - const sortedData = useMemo(() => { - if (!config.key) return data; - return [...data].sort((a, b) => { - if (a[config.key] < b[config.key]) return config.direction === 'asc' ? -1 : 1; - if (a[config.key] > b[config.key]) return config.direction === 'asc' ? 1 : -1; - return 0; - }); - }, [data, config]); - return { sortedData, requestSort: (key) => setConfig({ key, direction: config.key === key && config.direction === 'asc' ? 'desc' : 'asc' }) }; -}; - -export const UserTable = () => { - const [data, setData] = useState(users); - const [search, setSearch] = useState(''); - const [dateFrom, setDateFrom] = useState(''); - const [dateTo, setDateTo] = useState(''); - const [page, setPage] = useState(1); - const { sortedData, requestSort } = useSort(data); - - const filteredData = useMemo(() => - sortedData.filter(u => { - const matchesSearch = u.name.toLowerCase().includes(search.toLowerCase()); - const matchesFrom = !dateFrom || u.joined >= dateFrom; - const matchesTo = !dateTo || u.joined <= dateTo; - return matchesSearch && matchesFrom && matchesTo; - }), - [sortedData, search, dateFrom, dateTo] - ); - - const paginatedData = filteredData.slice((page - 1) * 5, page * 5); - - return ( -
- - setSearch(e.target.value)} /> - - - - - - - - - - - - - - - - - {paginatedData.map(user => ( - - - - - - - - ))} - - -
requestSort('name')}>Name requestSort('email')}>EmailRoleStatus requestSort('joined')}>Joined
{user.name}{user.email}{user.role}{user.status}{user.joined}
- - -
- Page: {page} - - -
-
-
- ); -}; diff --git a/evals/data/experiments/018-js-react-data-table/outputs/aap/turn-3.json b/evals/data/experiments/018-js-react-data-table/outputs/aap/turn-3.json deleted file mode 100644 index 6f6c6ba..0000000 --- a/evals/data/experiments/018-js-react-data-table/outputs/aap/turn-3.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "user-table-component", - "version": 1, - "name": "synthesize", - "meta": { - "format": "text/javascript", - "tokens_used": null, - "checksum": null, - "state": null - }, - "content": [ - { - "body": "import React, { useState, useMemo } from 'react';\n\nconst users = Array.from({ length: 30 }, (_, i) => ({\n id: i + 1,\n name: i < 20 ? `User ${i + 1}` : ['Hiroshi', 'Elena', 'Amara', 'Mateo', 'Fatima', 'Lars', 'Chen', 'Priya', 'Jean', 'Abe'][i - 20],\n email: i < 20 ? `user${i + 1}@example.com` : ['hiro@jp.com', 'elena@es.com', 'amara@ng.com', 'mateo@ar.com', 'fati@eg.com', 'lars@se.com', 'chen@cn.com', 'priya@in.com', 'jean@fr.com', 'abe@et.com'][i - 20],\n role: i % 3 === 0 ? 'Admin' : 'User',\n status: i % 2 === 0 ? 'Active' : 'Inactive',\n joined: `2023-01-${(i % 28) + 1}`\n}));\n\nexport const useSort = (data) => {\n const [config, setConfig] = useState({ key: null, direction: 'asc' });\n const sortedData = useMemo(() => {\n if (!config.key) return data;\n return [...data].sort((a, b) => {\n if (a[config.key] < b[config.key]) return config.direction === 'asc' ? -1 : 1;\n if (a[config.key] > b[config.key]) return config.direction === 'asc' ? 1 : -1;\n return 0;\n });\n }, [data, config]);\n return { sortedData, requestSort: (key) => setConfig({ key, direction: config.key === key && config.direction === 'asc' ? 'desc' : 'asc' }) };\n};\n\nexport const UserTable = () => {\n const [data, setData] = useState(users);\n const [search, setSearch] = useState('');\n const [dateFrom, setDateFrom] = useState('');\n const [dateTo, setDateTo] = useState('');\n const [page, setPage] = useState(1);\n const { sortedData, requestSort } = useSort(data);\n\n const filteredData = useMemo(() => \n sortedData.filter(u => {\n const matchesSearch = u.name.toLowerCase().includes(search.toLowerCase());\n const matchesFrom = !dateFrom || u.joined >= dateFrom;\n const matchesTo = !dateTo || u.joined <= dateTo;\n return matchesSearch && matchesFrom && matchesTo;\n }),\n [sortedData, search, dateFrom, dateTo]\n );\n\n const paginatedData = filteredData.slice((page - 1) * 5, page * 5);\n\n return (\n
\n \n setSearch(e.target.value)} />\n \n \n \n\n \n \n \n \n \n \n \n \n \n \n \n \n {paginatedData.map(user => (\n \n \n \n \n \n \n \n ))}\n \n \n
requestSort('name')}>Name requestSort('email')}>EmailRoleStatus requestSort('joined')}>Joined
{user.name}{user.email}{user.role}{user.status}{user.joined}
\n\n \n
\n Page: {page} \n \n \n
\n
\n
\n );\n};\n" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/018-js-react-data-table/outputs/aap/turn-3.txt b/evals/data/experiments/018-js-react-data-table/outputs/aap/turn-3.txt deleted file mode 100644 index 6f8106d..0000000 --- a/evals/data/experiments/018-js-react-data-table/outputs/aap/turn-3.txt +++ /dev/null @@ -1,87 +0,0 @@ -import React, { useState, useMemo } from 'react'; - -const users = Array.from({ length: 30 }, (_, i) => ({ - id: i + 1, - name: i < 20 ? `User ${i + 1}` : ['Hiroshi', 'Elena', 'Amara', 'Mateo', 'Fatima', 'Lars', 'Chen', 'Priya', 'Jean', 'Abe'][i - 20], - email: i < 20 ? `user${i + 1}@example.com` : ['hiro@jp.com', 'elena@es.com', 'amara@ng.com', 'mateo@ar.com', 'fati@eg.com', 'lars@se.com', 'chen@cn.com', 'priya@in.com', 'jean@fr.com', 'abe@et.com'][i - 20], - role: i % 3 === 0 ? 'Admin' : 'User', - status: i % 2 === 0 ? 'Active' : 'Inactive', - joined: `2023-01-${(i % 28) + 1}` -})); - -export const useSort = (data) => { - const [config, setConfig] = useState({ key: null, direction: 'asc' }); - const sortedData = useMemo(() => { - if (!config.key) return data; - return [...data].sort((a, b) => { - if (a[config.key] < b[config.key]) return config.direction === 'asc' ? -1 : 1; - if (a[config.key] > b[config.key]) return config.direction === 'asc' ? 1 : -1; - return 0; - }); - }, [data, config]); - return { sortedData, requestSort: (key) => setConfig({ key, direction: config.key === key && config.direction === 'asc' ? 'desc' : 'asc' }) }; -}; - -export const UserTable = () => { - const [data, setData] = useState(users); - const [search, setSearch] = useState(''); - const [dateFrom, setDateFrom] = useState(''); - const [dateTo, setDateTo] = useState(''); - const [page, setPage] = useState(1); - const { sortedData, requestSort } = useSort(data); - - const filteredData = useMemo(() => - sortedData.filter(u => { - const matchesSearch = u.name.toLowerCase().includes(search.toLowerCase()); - const matchesFrom = !dateFrom || u.joined >= dateFrom; - const matchesTo = !dateTo || u.joined <= dateTo; - return matchesSearch && matchesFrom && matchesTo; - }), - [sortedData, search, dateFrom, dateTo] - ); - - const paginatedData = filteredData.slice((page - 1) * 5, page * 5); - - return ( -
- - setSearch(e.target.value)} /> - - - - - - - - - - - - - - - - - {paginatedData.map(user => ( - - - - - - - - ))} - - -
requestSort('name')}>Name requestSort('email')}>EmailRoleStatus requestSort('joined')}>Joined
{user.name}{user.email}{user.role}{user.status}{user.joined}
- - -
- Page: {page} - - -
-
-
- ); -}; diff --git a/evals/data/experiments/018-js-react-data-table/outputs/aap/turn-4.json b/evals/data/experiments/018-js-react-data-table/outputs/aap/turn-4.json deleted file mode 100644 index 3bce643..0000000 --- a/evals/data/experiments/018-js-react-data-table/outputs/aap/turn-4.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "user-table-component", - "version": 1, - "name": "synthesize", - "meta": { - "format": "text/javascript", - "tokens_used": null, - "checksum": null, - "state": null - }, - "content": [ - { - "body": "import React, { useState, useMemo } from 'react';\n\nconst users = Array.from({ length: 30 }, (_, i) => ({\n id: i + 1,\n name: i < 20 ? `User ${i + 1}` : ['Hiroshi', 'Elena', 'Amara', 'Mateo', 'Fatima', 'Lars', 'Chen', 'Priya', 'Jean', 'Abe'][i - 20],\n email: i < 20 ? `user${i + 1}@example.com` : ['hiro@jp.com', 'elena@es.com', 'amara@ng.com', 'mateo@ar.com', 'fati@eg.com', 'lars@se.com', 'chen@cn.com', 'priya@in.com', 'jean@fr.com', 'abe@et.com'][i - 20],\n role: i % 3 === 0 ? 'Admin' : 'User',\n status: i % 2 === 0 ? 'Active' : 'Inactive',\n joined: `2023-01-${(i % 28) + 1}`\n}));\n\nexport const useSort = (data) => {\n const [config, setConfig] = useState({ key: null, direction: 'asc' });\n const sortedData = useMemo(() => {\n if (!config.key) return data;\n return [...data].sort((a, b) => {\n if (a[config.key] < b[config.key]) return config.direction === 'asc' ? -1 : 1;\n if (a[config.key] > b[config.key]) return config.direction === 'asc' ? 1 : -1;\n return 0;\n });\n }, [data, config]);\n return { sortedData, requestSort: (key) => setConfig({ key, direction: config.key === key && config.direction === 'asc' ? 'desc' : 'asc' }) };\n};\n\nexport const UserTable = () => {\n const [data, setData] = useState(users);\n const [search, setSearch] = useState('');\n const [dateFrom, setDateFrom] = useState('');\n const [dateTo, setDateTo] = useState('');\n const [page, setPage] = useState(1);\n const { sortedData, requestSort } = useSort(data);\n\n const filteredData = useMemo(() => \n sortedData.filter(u => {\n const matchesSearch = u.name.toLowerCase().includes(search.toLowerCase());\n const matchesFrom = !dateFrom || u.joined >= dateFrom;\n const matchesTo = !dateTo || u.joined <= dateTo;\n return matchesSearch && matchesFrom && matchesTo;\n }),\n [sortedData, search, dateFrom, dateTo]\n );\n\n const pageSize = 5;\n const totalPages = Math.max(1, Math.ceil(filteredData.length / pageSize));\n const paginatedData = filteredData.slice((page - 1) * pageSize, page * pageSize);\n\n return (\n
\n \n setSearch(e.target.value)} />\n \n \n \n\n \n \n \n \n \n \n \n \n \n \n \n \n {paginatedData.map(user => (\n \n \n \n \n \n \n \n ))}\n \n \n
requestSort('name')}>Name requestSort('email')}>EmailRoleStatus requestSort('joined')}>Joined
{user.name}{user.email}{user.role}{user.status}{user.joined}
\n\n \n
\n \n Showing {(page - 1) * pageSize + 1}-{Math.min(page * pageSize, filteredData.length)} of {filteredData.length} results\n \n \n Page {page} of {totalPages}\n \n setPage(Math.max(1, Math.min(totalPages, Number(e.target.value))))} \n />\n
\n
\n
\n );\n};\n" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/018-js-react-data-table/outputs/aap/turn-4.txt b/evals/data/experiments/018-js-react-data-table/outputs/aap/turn-4.txt deleted file mode 100644 index de918d0..0000000 --- a/evals/data/experiments/018-js-react-data-table/outputs/aap/turn-4.txt +++ /dev/null @@ -1,99 +0,0 @@ -import React, { useState, useMemo } from 'react'; - -const users = Array.from({ length: 30 }, (_, i) => ({ - id: i + 1, - name: i < 20 ? `User ${i + 1}` : ['Hiroshi', 'Elena', 'Amara', 'Mateo', 'Fatima', 'Lars', 'Chen', 'Priya', 'Jean', 'Abe'][i - 20], - email: i < 20 ? `user${i + 1}@example.com` : ['hiro@jp.com', 'elena@es.com', 'amara@ng.com', 'mateo@ar.com', 'fati@eg.com', 'lars@se.com', 'chen@cn.com', 'priya@in.com', 'jean@fr.com', 'abe@et.com'][i - 20], - role: i % 3 === 0 ? 'Admin' : 'User', - status: i % 2 === 0 ? 'Active' : 'Inactive', - joined: `2023-01-${(i % 28) + 1}` -})); - -export const useSort = (data) => { - const [config, setConfig] = useState({ key: null, direction: 'asc' }); - const sortedData = useMemo(() => { - if (!config.key) return data; - return [...data].sort((a, b) => { - if (a[config.key] < b[config.key]) return config.direction === 'asc' ? -1 : 1; - if (a[config.key] > b[config.key]) return config.direction === 'asc' ? 1 : -1; - return 0; - }); - }, [data, config]); - return { sortedData, requestSort: (key) => setConfig({ key, direction: config.key === key && config.direction === 'asc' ? 'desc' : 'asc' }) }; -}; - -export const UserTable = () => { - const [data, setData] = useState(users); - const [search, setSearch] = useState(''); - const [dateFrom, setDateFrom] = useState(''); - const [dateTo, setDateTo] = useState(''); - const [page, setPage] = useState(1); - const { sortedData, requestSort } = useSort(data); - - const filteredData = useMemo(() => - sortedData.filter(u => { - const matchesSearch = u.name.toLowerCase().includes(search.toLowerCase()); - const matchesFrom = !dateFrom || u.joined >= dateFrom; - const matchesTo = !dateTo || u.joined <= dateTo; - return matchesSearch && matchesFrom && matchesTo; - }), - [sortedData, search, dateFrom, dateTo] - ); - - const pageSize = 5; - const totalPages = Math.max(1, Math.ceil(filteredData.length / pageSize)); - const paginatedData = filteredData.slice((page - 1) * pageSize, page * pageSize); - - return ( -
- - setSearch(e.target.value)} /> - - - - - - - - - - - - - - - - - {paginatedData.map(user => ( - - - - - - - - ))} - - -
requestSort('name')}>Name requestSort('email')}>EmailRoleStatus requestSort('joined')}>Joined
{user.name}{user.email}{user.role}{user.status}{user.joined}
- - -
- - Showing {(page - 1) * pageSize + 1}-{Math.min(page * pageSize, filteredData.length)} of {filteredData.length} results - - - Page {page} of {totalPages} - - setPage(Math.max(1, Math.min(totalPages, Number(e.target.value))))} - /> -
-
-
- ); -}; diff --git a/evals/data/experiments/018-js-react-data-table/outputs/base/turn-0.txt b/evals/data/experiments/018-js-react-data-table/outputs/base/turn-0.txt deleted file mode 100644 index f9d1553..0000000 --- a/evals/data/experiments/018-js-react-data-table/outputs/base/turn-0.txt +++ /dev/null @@ -1,110 +0,0 @@ -import React, { useState, useMemo, useCallback } from 'react'; - -const USERS = Array.from({ length: 20 }, (_, i) => ({ - id: i + 1, - name: `User ${i + 1}`, - email: `user${i + 1}@example.com`, - role: ['Admin', 'Editor', 'Viewer'][i % 3], - status: ['Active', 'Inactive'][i % 2], - joined: `2023-01-${(i % 28) + 1}`, -})); - -const useSort = (data) => { - const [config, setConfig] = useState({ key: null, direction: 'asc' }); - const sortedData = useMemo(() => { - if (!config.key) return data; - return [...data].sort((a, b) => { - if (a[config.key] < b[config.key]) return config.direction === 'asc' ? -1 : 1; - if (a[config.key] > b[config.key]) return config.direction === 'asc' ? 1 : -1; - return 0; - }); - }, [data, config]); - return { sortedData, requestSort: (key) => setConfig(c => ({ key, direction: c.key === key && c.direction === 'asc' ? 'desc' : 'asc' })), config }; -}; - -const useFilter = (data) => { - const [filters, setFilters] = useState({ search: '', role: 'All' }); - const filteredData = useMemo(() => { - return data.filter(item => - item.name.toLowerCase().includes(filters.search.toLowerCase()) && - (filters.role === 'All' || item.role === filters.role) - ); - }, [data, filters]); - return { filteredData, filters, setFilters }; -}; - -const usePagination = (data, pageSize = 5) => { - const [page, setPage] = useState(0); - const [size, setSize] = useState(pageSize); - const paginatedData = useMemo(() => data.slice(page * size, (page + 1) * size), [data, page, size]); - return { paginatedData, page, setPage, size, setSize, totalPages: Math.ceil(data.length / size) }; -}; - -const DataTable = () => { - const [columns] = useState(['name', 'email', 'role', 'status', 'joined']); - const [selected, setSelected] = useState(new Set()); - const [expanded, setExpanded] = useState(new Set()); - const [editId, setEditId] = useState(null); - - const { sortedData, requestSort, config } = useSort(USERS); - const { filteredData, filters, setFilters } = useFilter(sortedData); - const { paginatedData, page, setPage, size, setSize, totalPages } = usePagination(filteredData); - - const toggleSelect = (id) => { - const next = new Set(selected); - next.has(id) ? next.delete(id) : next.add(id); - setSelected(next); - }; - - return ( -
-
- setFilters(f => ({...f, search: e.target.value}))} /> - -
- - - - - - {columns.map(col => )} - - - - - {paginatedData.map(user => ( - - - - {columns.map(col => )} - - - {expanded.has(user.id) && } - - ))} - -
Select requestSort(col)}>{col.toUpperCase()} {config.key === col ? (config.direction === 'asc' ? '▲' : '▼') : ''}Actions
toggleSelect(user.id)} />{editId === user.id ? : user[col]} - - -
Details for {user.name}: {JSON.stringify(user)}
- -
- - Page {page + 1} of {totalPages} - - -
-
- ); -}; - -export default DataTable; \ No newline at end of file diff --git a/evals/data/experiments/018-js-react-data-table/outputs/base/turn-1.txt b/evals/data/experiments/018-js-react-data-table/outputs/base/turn-1.txt deleted file mode 100644 index f3ffa51..0000000 --- a/evals/data/experiments/018-js-react-data-table/outputs/base/turn-1.txt +++ /dev/null @@ -1,119 +0,0 @@ -import React, { useState, useMemo } from 'react'; - -const USERS = Array.from({ length: 20 }, (_, i) => ({ - id: i + 1, - name: `User ${i + 1}`, - email: `user${i + 1}@example.com`, - role: ['Admin', 'Editor', 'Viewer'][i % 3], - department: ['Engineering', 'Marketing', 'Sales', 'Support'][i % 4], - status: ['Active', 'Inactive'][i % 2], - joined: `2023-01-${(i % 28) + 1}`, -})); - -const useSort = (data) => { - const [config, setConfig] = useState({ key: null, direction: 'asc' }); - const sortedData = useMemo(() => { - if (!config.key) return data; - return [...data].sort((a, b) => { - if (a[config.key] < b[config.key]) return config.direction === 'asc' ? -1 : 1; - if (a[config.key] > b[config.key]) return config.direction === 'asc' ? 1 : -1; - return 0; - }); - }, [data, config]); - return { sortedData, requestSort: (key) => setConfig(c => ({ key, direction: c.key === key && c.direction === 'asc' ? 'desc' : 'asc' })), config }; -}; - -const useFilter = (data) => { - const [filters, setFilters] = useState({ search: '', role: 'All', department: 'All' }); - const filteredData = useMemo(() => { - return data.filter(item => - item.name.toLowerCase().includes(filters.search.toLowerCase()) && - (filters.role === 'All' || item.role === filters.role) && - (filters.department === 'All' || item.department === filters.department) - ); - }, [data, filters]); - return { filteredData, filters, setFilters }; -}; - -const usePagination = (data, pageSize = 5) => { - const [page, setPage] = useState(0); - const [size, setSize] = useState(pageSize); - const paginatedData = useMemo(() => data.slice(page * size, (page + 1) * size), [data, page, size]); - return { paginatedData, page, setPage, size, setSize, totalPages: Math.ceil(data.length / size) }; -}; - -const DataTable = () => { - const [columns] = useState(['name', 'email', 'role', 'department', 'status', 'joined']); - const [selected, setSelected] = useState(new Set()); - const [expanded, setExpanded] = useState(new Set()); - const [editId, setEditId] = useState(null); - - const { sortedData, requestSort, config } = useSort(USERS); - const { filteredData, filters, setFilters } = useFilter(sortedData); - const { paginatedData, page, setPage, size, setSize, totalPages } = usePagination(filteredData); - - const toggleSelect = (id) => { - const next = new Set(selected); - next.has(id) ? next.delete(id) : next.add(id); - setSelected(next); - }; - - return ( -
-
- setFilters(f => ({...f, search: e.target.value}))} /> - - -
- - - - - - {columns.map(col => )} - - - - - {paginatedData.map(user => ( - - - - {columns.map(col => )} - - - {expanded.has(user.id) && } - - ))} - -
setSelected(e.target.checked ? new Set(paginatedData.map(u => u.id)) : new Set())} /> requestSort(col)} style={{ cursor: 'pointer' }}>{col.toUpperCase()} {config.key === col ? (config.direction === 'asc' ? '▲' : '▼') : ''}Actions
toggleSelect(user.id)} />{editId === user.id ? : user[col]} - - -
Expanded View for {user.name}: {JSON.stringify(user)}
- -
- - Page {page + 1} of {Math.max(1, totalPages)} - - -
-
- ); -}; - -export default DataTable; \ No newline at end of file diff --git a/evals/data/experiments/018-js-react-data-table/outputs/base/turn-2.txt b/evals/data/experiments/018-js-react-data-table/outputs/base/turn-2.txt deleted file mode 100644 index 6060f80..0000000 --- a/evals/data/experiments/018-js-react-data-table/outputs/base/turn-2.txt +++ /dev/null @@ -1,102 +0,0 @@ -import React, { useState, useMemo } from 'react'; - -const USERS = Array.from({ length: 20 }, (_, i) => ({ - id: i + 1, - name: `User ${i + 1}`, - email: `user${i + 1}@example.com`, - role: ['Admin', 'Editor', 'Viewer'][i % 3], - department: ['Engineering', 'Marketing', 'Sales', 'Support'][i % 4], - status: ['Active', 'Inactive'][i % 2], - joined: `2023-01-${String((i % 28) + 1).padStart(2, '0')}`, -})); - -const useSort = (data) => { - const [config, setConfig] = useState({ key: null, direction: 'asc' }); - const sortedData = useMemo(() => { - if (!config.key) return data; - return [...data].sort((a, b) => { - if (a[config.key] < b[config.key]) return config.direction === 'asc' ? -1 : 1; - if (a[config.key] > b[config.key]) return config.direction === 'asc' ? 1 : -1; - return 0; - }); - }, [data, config]); - return { sortedData, requestSort: (key) => setConfig(c => ({ key, direction: c.key === key && c.direction === 'asc' ? 'desc' : 'asc' })), config }; -}; - -const useFilter = (data) => { - const [filters, setFilters] = useState({ search: '', role: 'All', department: 'All', from: '', to: '' }); - const filteredData = useMemo(() => { - return data.filter(item => { - const matchSearch = item.name.toLowerCase().includes(filters.search.toLowerCase()); - const matchRole = filters.role === 'All' || item.role === filters.role; - const matchDept = filters.department === 'All' || item.department === filters.department; - const matchFrom = !filters.from || item.joined >= filters.from; - const matchTo = !filters.to || item.joined <= filters.to; - return matchSearch && matchRole && matchDept && matchFrom && matchTo; - }); - }, [data, filters]); - return { filteredData, filters, setFilters }; -}; - -const Toolbar = ({ filters, setFilters }) => ( -
- setFilters(f => ({...f, search: e.target.value}))} /> - - - - -
-); - -const DataTable = () => { - const [columns] = useState(['name', 'email', 'role', 'department', 'status', 'joined']); - const [selected, setSelected] = useState(new Set()); - const [expanded, setExpanded] = useState(new Set()); - - const { sortedData, requestSort, config } = useSort(USERS); - const { filteredData, filters, setFilters } = useFilter(sortedData); - - const [page, setPage] = useState(0); - const [size, setSize] = useState(5); - const paginatedData = useMemo(() => filteredData.slice(page * size, (page + 1) * size), [filteredData, page, size]); - const totalPages = Math.ceil(filteredData.length / size); - - return ( -
- - - - - - {columns.map(col => )} - - - - {paginatedData.map(user => ( - - - {columns.map(col => )} - - ))} - -
Select requestSort(col)} style={{ cursor: 'pointer' }}>{col.toUpperCase()} {config.key === col ? (config.direction === 'asc' ? '▲' : '▼') : ''}
{ - const next = new Set(selected); - next.has(user.id) ? next.delete(user.id) : next.add(user.id); - setSelected(next); - }} />{user[col]}
-
- - Page {page + 1} of {Math.max(1, totalPages)} - -
-
- ); -}; - -export default DataTable; \ No newline at end of file diff --git a/evals/data/experiments/018-js-react-data-table/outputs/base/turn-3.txt b/evals/data/experiments/018-js-react-data-table/outputs/base/turn-3.txt deleted file mode 100644 index a5bd5ef..0000000 --- a/evals/data/experiments/018-js-react-data-table/outputs/base/turn-3.txt +++ /dev/null @@ -1,112 +0,0 @@ -import React, { useState, useMemo } from 'react'; - -const USERS = [ - ...Array.from({ length: 20 }, (_, i) => ({ - id: i + 1, - name: `User ${i + 1}`, - email: `user${i + 1}@example.com`, - role: ['Admin', 'Editor', 'Viewer'][i % 3], - department: ['Engineering', 'Marketing', 'Sales', 'Support'][i % 4], - status: ['Active', 'Inactive'][i % 2], - joined: `2023-01-${String((i % 28) + 1).padStart(2, '0')}`, - })), - { id: 21, name: 'Hiroshi Tanaka', email: 'hiroshi@tech.jp', role: 'Admin', department: 'Engineering', status: 'Active', joined: '2023-02-10' }, - { id: 22, name: 'Elena Rodriguez', email: 'elena@market.es', role: 'Editor', department: 'Marketing', status: 'Active', joined: '2023-03-15' }, - { id: 23, name: 'Kwame Mensah', email: 'kwame@global.gh', role: 'Viewer', department: 'Support', status: 'Inactive', joined: '2023-04-05' }, - { id: 24, name: 'Fatima Al-Sayed', email: 'fatima@uae.ae', role: 'Admin', department: 'Sales', status: 'Active', joined: '2023-05-20' }, - { id: 25, name: 'Jean-Pierre Dubois', email: 'jp@france.fr', role: 'Editor', department: 'Engineering', status: 'Active', joined: '2023-06-12' }, - { id: 26, name: 'Sven Svensson', email: 'sven@nordic.se', role: 'Viewer', department: 'Sales', status: 'Active', joined: '2023-07-22' }, - { id: 27, name: 'Mei Ling', email: 'mei@asia.cn', role: 'Admin', department: 'Support', status: 'Inactive', joined: '2023-08-14' }, - { id: 28, name: 'Arjun Gupta', email: 'arjun@india.in', role: 'Editor', department: 'Marketing', status: 'Active', joined: '2023-09-01' }, - { id: 29, name: 'Chloe O\'Sullivan', email: 'chloe@ireland.ie', role: 'Viewer', department: 'Engineering', status: 'Active', joined: '2023-10-18' }, - { id: 30, name: 'Mateo Rossi', email: 'mateo@italy.it', role: 'Admin', department: 'Sales', status: 'Active', joined: '2023-11-30' } -]; - -const useSort = (data) => { - const [config, setConfig] = useState({ key: null, direction: 'asc' }); - const sortedData = useMemo(() => { - if (!config.key) return data; - return [...data].sort((a, b) => { - if (a[config.key] < b[config.key]) return config.direction === 'asc' ? -1 : 1; - if (a[config.key] > b[config.key]) return config.direction === 'asc' ? 1 : -1; - return 0; - }); - }, [data, config]); - return { sortedData, requestSort: (key) => setConfig(c => ({ key, direction: c.key === key && c.direction === 'asc' ? 'desc' : 'asc' })), config }; -}; - -const useFilter = (data) => { - const [filters, setFilters] = useState({ search: '', role: 'All', department: 'All', from: '', to: '' }); - const filteredData = useMemo(() => { - return data.filter(item => { - const matchSearch = item.name.toLowerCase().includes(filters.search.toLowerCase()); - const matchRole = filters.role === 'All' || item.role === filters.role; - const matchDept = filters.department === 'All' || item.department === filters.department; - const matchFrom = !filters.from || item.joined >= filters.from; - const matchTo = !filters.to || item.joined <= filters.to; - return matchSearch && matchRole && matchDept && matchFrom && matchTo; - }); - }, [data, filters]); - return { filteredData, filters, setFilters }; -}; - -const Toolbar = ({ filters, setFilters }) => ( -
- setFilters(f => ({...f, search: e.target.value}))} /> - - - setFilters(f => ({...f, from: e.target.value}))} /> - setFilters(f => ({...f, to: e.target.value}))} /> -
-); - -const DataTable = () => { - const [columns] = useState(['name', 'email', 'role', 'department', 'status', 'joined']); - const [selected, setSelected] = useState(new Set()); - const { sortedData, requestSort, config } = useSort(USERS); - const { filteredData, filters, setFilters } = useFilter(sortedData); - const [page, setPage] = useState(0); - const [size, setSize] = useState(10); - - const paginatedData = useMemo(() => filteredData.slice(page * size, (page + 1) * size), [filteredData, page, size]); - const totalPages = Math.ceil(filteredData.length / size); - - return ( -
- - - - - - {columns.map(col => )} - - - - {paginatedData.map(user => ( - - - {columns.map(col => )} - - ))} - -
Select requestSort(col)} style={{ cursor: 'pointer' }}>{col.toUpperCase()} {config.key === col ? (config.direction === 'asc' ? '▲' : '▼') : ''}
{ - const next = new Set(selected); - next.has(user.id) ? next.delete(user.id) : next.add(user.id); - setSelected(next); - }} />{user[col]}
-
- - Page {page + 1} of {Math.max(1, totalPages)} - -
-
- ); -}; - -export default DataTable; \ No newline at end of file diff --git a/evals/data/experiments/018-js-react-data-table/outputs/base/turn-4.txt b/evals/data/experiments/018-js-react-data-table/outputs/base/turn-4.txt deleted file mode 100644 index dfd12ac..0000000 --- a/evals/data/experiments/018-js-react-data-table/outputs/base/turn-4.txt +++ /dev/null @@ -1,124 +0,0 @@ -import React, { useState, useMemo } from 'react'; - -const USERS = [ - ...Array.from({ length: 20 }, (_, i) => ({ - id: i + 1, - name: `User ${i + 1}`, - email: `user${i + 1}@example.com`, - role: ['Admin', 'Editor', 'Viewer'][i % 3], - department: ['Engineering', 'Marketing', 'Sales', 'Support'][i % 4], - status: ['Active', 'Inactive'][i % 2], - joined: `2023-01-${String((i % 28) + 1).padStart(2, '0')}`, - })), - { id: 21, name: 'Hiroshi Tanaka', email: 'hiroshi@tech.jp', role: 'Admin', department: 'Engineering', status: 'Active', joined: '2023-02-10' }, - { id: 22, name: 'Elena Rodriguez', email: 'elena@market.es', role: 'Editor', department: 'Marketing', status: 'Active', joined: '2023-03-15' }, - { id: 23, name: 'Kwame Mensah', email: 'kwame@global.gh', role: 'Viewer', department: 'Support', status: 'Inactive', joined: '2023-04-05' }, - { id: 24, name: 'Fatima Al-Sayed', email: 'fatima@uae.ae', role: 'Admin', department: 'Sales', status: 'Active', joined: '2023-05-20' }, - { id: 25, name: 'Jean-Pierre Dubois', email: 'jp@france.fr', role: 'Editor', department: 'Engineering', status: 'Active', joined: '2023-06-12' }, - { id: 26, name: 'Sven Svensson', email: 'sven@nordic.se', role: 'Viewer', department: 'Sales', status: 'Active', joined: '2023-07-22' }, - { id: 27, name: 'Mei Ling', email: 'mei@asia.cn', role: 'Admin', department: 'Support', status: 'Inactive', joined: '2023-08-14' }, - { id: 28, name: 'Arjun Gupta', email: 'arjun@india.in', role: 'Editor', department: 'Marketing', status: 'Active', joined: '2023-09-01' }, - { id: 29, name: 'Chloe O\'Sullivan', email: 'chloe@ireland.ie', role: 'Viewer', department: 'Engineering', status: 'Active', joined: '2023-10-18' }, - { id: 30, name: 'Mateo Rossi', email: 'mateo@italy.it', role: 'Admin', department: 'Sales', status: 'Active', joined: '2023-11-30' } -]; - -const useSort = (data) => { - const [config, setConfig] = useState({ key: null, direction: 'asc' }); - const sortedData = useMemo(() => { - if (!config.key) return data; - return [...data].sort((a, b) => { - if (a[config.key] < b[config.key]) return config.direction === 'asc' ? -1 : 1; - if (a[config.key] > b[config.key]) return config.direction === 'asc' ? 1 : -1; - return 0; - }); - }, [data, config]); - return { sortedData, requestSort: (key) => setConfig(c => ({ key, direction: c.key === key && c.direction === 'asc' ? 'desc' : 'asc' })), config }; -}; - -const useFilter = (data) => { - const [filters, setFilters] = useState({ search: '', role: 'All', department: 'All', from: '', to: '' }); - const filteredData = useMemo(() => { - return data.filter(item => { - const matchSearch = item.name.toLowerCase().includes(filters.search.toLowerCase()); - const matchRole = filters.role === 'All' || item.role === filters.role; - const matchDept = filters.department === 'All' || item.department === filters.department; - const matchFrom = !filters.from || item.joined >= filters.from; - const matchTo = !filters.to || item.joined <= filters.to; - return matchSearch && matchRole && matchDept && matchFrom && matchTo; - }); - }, [data, filters]); - return { filteredData, filters, setFilters }; -}; - -const Pagination = ({ page, size, total, setPage }) => { - const totalPages = Math.ceil(total / size); - const start = page * size + 1; - const end = Math.min((page + 1) * size, total); - - return ( -
- Showing {total === 0 ? 0 : start}-{end} of {total} results -
- - Page {page + 1} of {Math.max(1, totalPages)} - -
- -
- ); -}; - -const DataTable = () => { - const [columns] = useState(['name', 'email', 'role', 'department', 'status', 'joined']); - const { sortedData, requestSort, config } = useSort(USERS); - const { filteredData, filters, setFilters } = useFilter(sortedData); - const [page, setPage] = useState(0); - const size = 10; - - const paginatedData = useMemo(() => filteredData.slice(page * size, (page + 1) * size), [filteredData, page, size]); - - return ( -
-
- setFilters(f => ({...f, search: e.target.value}))} /> - -
- - - - - {columns.map(col => )} - - - - {paginatedData.map(user => ( - - {columns.map(col => )} - - ))} - -
requestSort(col)} style={{ cursor: 'pointer' }}>{col.toUpperCase()} {config.key === col ? (config.direction === 'asc' ? '▲' : '▼') : ''}
{user[col]}
- - -
- ); -}; - -export default DataTable; \ No newline at end of file diff --git a/evals/data/experiments/019-js-express-api/metrics.json b/evals/data/experiments/019-js-express-api/metrics.json deleted file mode 100644 index ef49a7b..0000000 --- a/evals/data/experiments/019-js-express-api/metrics.json +++ /dev/null @@ -1,228 +0,0 @@ -{ - "experiment_id": "019-js-express-api", - "model": "", - "provider": "google", - "timestamp": "2026-04-03T06:58:48.723648+00:00", - "format": "text/javascript", - "base_turn0": { - "input_tokens": 106, - "output_tokens": 897, - "latency_ms": 4022, - "artifact_bytes": 2769 - }, - "aap_turn0": { - "input_tokens": 445, - "output_tokens": 678, - "latency_ms": 3480, - "artifact_bytes": 1952 - }, - "default_flow": { - "per_turn": [ - { - "turn": 1, - "edit": "Add a new 'labels' field to the Task model with an array of strings, and add a P", - "input_tokens": 1033, - "output_tokens": 1014, - "latency_ms": 3633, - "output_bytes": 3127, - "failed": false, - "failure_reason": "" - }, - { - "turn": 2, - "edit": "Rewrite the auth token verification middleware to support both Bearer tokens and", - "input_tokens": 2072, - "output_tokens": 1111, - "latency_ms": 4301, - "output_bytes": 3484, - "failed": false, - "failure_reason": "" - }, - { - "turn": 3, - "edit": "Add a new route POST /projects/:id/archive that marks a project as archived and ", - "input_tokens": 3211, - "output_tokens": 1275, - "latency_ms": 4905, - "output_bytes": 4045, - "failed": false, - "failure_reason": "" - } - ], - "total_input_tokens": 6316, - "total_output_tokens": 3400, - "total_latency_ms": 12839 - }, - "aap_flow": { - "per_turn": [ - { - "turn": 1, - "edit": "Add a new 'labels' field to the Task model with an array of strings, and add a P", - "input_tokens": 1814, - "output_tokens": 828, - "latency_ms": 4002, - "output_bytes": 2253, - "failed": false, - "failure_reason": "", - "envelope_parsed": true, - "apply_succeeded": true, - "envelope_name": "synthesize" - }, - { - "turn": 2, - "edit": "Rewrite the auth token verification middleware to support both Bearer tokens and", - "input_tokens": 1915, - "output_tokens": 894, - "latency_ms": 4017, - "output_bytes": 2437, - "failed": false, - "failure_reason": "", - "envelope_parsed": true, - "apply_succeeded": true, - "envelope_name": "synthesize" - }, - { - "turn": 3, - "edit": "Add a new route POST /projects/:id/archive that marks a project as archived and ", - "input_tokens": 1971, - "output_tokens": 303, - "latency_ms": 2433, - "output_bytes": 715, - "failed": false, - "failure_reason": "", - "envelope_parsed": true, - "apply_succeeded": true, - "envelope_name": "synthesize" - } - ], - "total_input_tokens": 5700, - "total_output_tokens": 2025, - "total_latency_ms": 10452, - "envelope_parse_rate": 1.0, - "apply_success_rate": 1.0 - }, - "comparison": { - "output_token_savings_pct": 40.4, - "input_token_savings_pct": 9.8, - "latency_savings_pct": 18.6 - }, - "token_table": { - "turns": [ - { - "turn": 0, - "base_input": 106, - "base_output": 897, - "base_latency_ms": 4022, - "aap_input": 445, - "aap_output": 678, - "aap_latency_ms": 3480 - }, - { - "turn": 1, - "base_input": 1033, - "base_output": 1014, - "base_latency_ms": 3633, - "aap_input": 1814, - "aap_output": 828, - "aap_latency_ms": 4002, - "envelope_name": "synthesize", - "apply_ok": true - }, - { - "turn": 2, - "base_input": 2072, - "base_output": 1111, - "base_latency_ms": 4301, - "aap_input": 1915, - "aap_output": 894, - "aap_latency_ms": 4017, - "envelope_name": "synthesize", - "apply_ok": true - }, - { - "turn": 3, - "base_input": 3211, - "base_output": 1275, - "base_latency_ms": 4905, - "aap_input": 1971, - "aap_output": 303, - "aap_latency_ms": 2433, - "envelope_name": "synthesize", - "apply_ok": true - } - ], - "totals": { - "base_input": 6422, - "base_output": 4297, - "base_combined": 10719, - "aap_input": 6145, - "aap_output": 2703, - "aap_combined": 8848, - "base_latency_ms": 16861, - "aap_latency_ms": 13932, - "output_savings_pct": 37.1, - "input_delta_pct": -4.3, - "combined_savings_pct": 17.5, - "latency_savings_pct": 17.4 - } - }, - "quality": { - "per_turn": [ - { - "turn": 0, - "sequence_similarity": 0.4609, - "token_f1": 0.6929, - "base_char_count": 2769, - "aap_char_count": 1783, - "char_delta_pct": -35.6, - "lines_added": 38, - "lines_removed": 48, - "rouge_l": null, - "bleu": null - }, - { - "turn": 1, - "sequence_similarity": 0.5181, - "token_f1": 0.7059, - "base_char_count": 3127, - "aap_char_count": 2084, - "char_delta_pct": -33.4, - "lines_added": 43, - "lines_removed": 54, - "rouge_l": null, - "bleu": null - }, - { - "turn": 2, - "sequence_similarity": 0.4555, - "token_f1": 0.694, - "base_char_count": 3484, - "aap_char_count": 2268, - "char_delta_pct": -34.9, - "lines_added": 50, - "lines_removed": 60, - "rouge_l": null, - "bleu": null - }, - { - "turn": 3, - "sequence_similarity": 0.1059, - "token_f1": 0.1401, - "base_char_count": 4045, - "aap_char_count": 715, - "char_delta_pct": -82.3, - "lines_added": 15, - "lines_removed": 105, - "rouge_l": null, - "bleu": null - } - ], - "mean_sequence_similarity": 0.3851, - "mean_token_f1": 0.5582, - "mean_rouge_l": null, - "mean_bleu": null, - "judge_comparisons": null, - "mean_base_judge": null, - "mean_aap_judge": null - } -} diff --git a/evals/data/experiments/019-js-express-api/outputs/aap/turn-0.txt b/evals/data/experiments/019-js-express-api/outputs/aap/turn-0.txt deleted file mode 100644 index 6e66576..0000000 --- a/evals/data/experiments/019-js-express-api/outputs/aap/turn-0.txt +++ /dev/null @@ -1,67 +0,0 @@ -const express = require('express'); -const cors = require('cors'); -const rateLimit = require('express-rate-limit'); -const { body, param, validationResult } = require('express-validator'); - -const app = express(); - - -app.use(cors()); -app.use(express.json()); -app.use((req, res, next) => { - console.log(`[${new Date().toISOString()}] ${req.method} ${req.url}`); - next(); -}); - -const auth = (req, res, next) => { - const token = req.headers['authorization']; - if (!token) return res.status(401).json({ error: 'Unauthorized' }); - next(); -}; - -const limiter = rateLimit({ windowMs: 15 * 60 * 1000, max: 100 }); -app.use(limiter); - - - -const store = { - tasks: [], - projects: [], - users: [] -}; - - - -app.post('/tasks', auth, [body('title').notEmpty()], (req, res) => { - const errors = validationResult(req); - if (!errors.isEmpty()) return res.status(400).json({ errors: errors.array() }); - - const task = { id: Date.now(), ...req.body, status: 'pending', comments: [] }; - store.tasks.push(task); - res.status(201).json(task); -}); - -app.patch('/tasks/:id/status', auth, [param('id').isInt()], (req, res) => { - const task = store.tasks.find(t => t.id == req.params.id); - if (!task) return res.status(404).json({ error: 'Task not found' }); - task.status = req.body.status; - res.json(task); -}); - -app.post('/tasks/:id/comments', auth, (req, res) => { - const task = store.tasks.find(t => t.id == req.params.id); - if (!task) return res.status(404).json({ error: 'Task not found' }); - task.comments.push({ text: req.body.text, date: new Date() }); - res.json(task); -}); - - - -app.use((err, req, res, next) => { - console.error(err.stack); - res.status(500).json({ error: 'Internal Server Error' }); -}); - - -const PORT = 3000; -app.listen(PORT, () => console.log(`Server running on port ${PORT}`)); \ No newline at end of file diff --git a/evals/data/experiments/019-js-express-api/outputs/aap/turn-1.json b/evals/data/experiments/019-js-express-api/outputs/aap/turn-1.json deleted file mode 100644 index 448a4ae..0000000 --- a/evals/data/experiments/019-js-express-api/outputs/aap/turn-1.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "task-api-service", - "version": 2, - "name": "synthesize", - "meta": { - "format": "text/html", - "tokens_used": null, - "checksum": null, - "state": null - }, - "content": [ - { - "body": "const express = require('express');\nconst cors = require('cors');\nconst rateLimit = require('express-rate-limit');\nconst { body, param, validationResult } = require('express-validator');\n\nconst app = express();\n\n\napp.use(cors());\napp.use(express.json());\napp.use((req, res, next) => {\n console.log(`[${new Date().toISOString()}] ${req.method} ${req.url}`);\n next();\n});\n\nconst auth = (req, res, next) => {\n const token = req.headers['authorization'];\n if (!token) return res.status(401).json({ error: 'Unauthorized' });\n next();\n};\n\nconst limiter = rateLimit({ windowMs: 15 * 60 * 1000, max: 100 });\napp.use(limiter);\n\n\n\nconst store = {\n tasks: [],\n projects: [],\n users: []\n};\n\n\n\napp.post('/tasks', auth, [body('title').notEmpty()], (req, res) => {\n const errors = validationResult(req);\n if (!errors.isEmpty()) return res.status(400).json({ errors: errors.array() });\n \n const task = { id: Date.now(), ...req.body, status: 'pending', comments: [], labels: [] };\n store.tasks.push(task);\n res.status(201).json(task);\n});\n\napp.patch('/tasks/:id/status', auth, [param('id').isInt()], (req, res) => {\n const task = store.tasks.find(t => t.id == req.params.id);\n if (!task) return res.status(404).json({ error: 'Task not found' });\n task.status = req.body.status;\n res.json(task);\n});\n\napp.put('/tasks/:id/labels', auth, [param('id').isInt(), body('labels').isArray()], (req, res) => {\n const task = store.tasks.find(t => t.id == req.params.id);\n if (!task) return res.status(404).json({ error: 'Task not found' });\n task.labels = req.body.labels;\n res.json(task);\n});\n\napp.post('/tasks/:id/comments', auth, (req, res) => {\n const task = store.tasks.find(t => t.id == req.params.id);\n if (!task) return res.status(404).json({ error: 'Task not found' });\n task.comments.push({ text: req.body.text, date: new Date() });\n res.json(task);\n});\n\n\n\napp.use((err, req, res, next) => {\n console.error(err.stack);\n res.status(500).json({ error: 'Internal Server Error' });\n});\n\n\nconst PORT = 3000;\napp.listen(PORT, () => console.log(`Server running on port ${PORT}`));\n" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/019-js-express-api/outputs/aap/turn-1.txt b/evals/data/experiments/019-js-express-api/outputs/aap/turn-1.txt deleted file mode 100644 index 447e9ff..0000000 --- a/evals/data/experiments/019-js-express-api/outputs/aap/turn-1.txt +++ /dev/null @@ -1,74 +0,0 @@ -const express = require('express'); -const cors = require('cors'); -const rateLimit = require('express-rate-limit'); -const { body, param, validationResult } = require('express-validator'); - -const app = express(); - - -app.use(cors()); -app.use(express.json()); -app.use((req, res, next) => { - console.log(`[${new Date().toISOString()}] ${req.method} ${req.url}`); - next(); -}); - -const auth = (req, res, next) => { - const token = req.headers['authorization']; - if (!token) return res.status(401).json({ error: 'Unauthorized' }); - next(); -}; - -const limiter = rateLimit({ windowMs: 15 * 60 * 1000, max: 100 }); -app.use(limiter); - - - -const store = { - tasks: [], - projects: [], - users: [] -}; - - - -app.post('/tasks', auth, [body('title').notEmpty()], (req, res) => { - const errors = validationResult(req); - if (!errors.isEmpty()) return res.status(400).json({ errors: errors.array() }); - - const task = { id: Date.now(), ...req.body, status: 'pending', comments: [], labels: [] }; - store.tasks.push(task); - res.status(201).json(task); -}); - -app.patch('/tasks/:id/status', auth, [param('id').isInt()], (req, res) => { - const task = store.tasks.find(t => t.id == req.params.id); - if (!task) return res.status(404).json({ error: 'Task not found' }); - task.status = req.body.status; - res.json(task); -}); - -app.put('/tasks/:id/labels', auth, [param('id').isInt(), body('labels').isArray()], (req, res) => { - const task = store.tasks.find(t => t.id == req.params.id); - if (!task) return res.status(404).json({ error: 'Task not found' }); - task.labels = req.body.labels; - res.json(task); -}); - -app.post('/tasks/:id/comments', auth, (req, res) => { - const task = store.tasks.find(t => t.id == req.params.id); - if (!task) return res.status(404).json({ error: 'Task not found' }); - task.comments.push({ text: req.body.text, date: new Date() }); - res.json(task); -}); - - - -app.use((err, req, res, next) => { - console.error(err.stack); - res.status(500).json({ error: 'Internal Server Error' }); -}); - - -const PORT = 3000; -app.listen(PORT, () => console.log(`Server running on port ${PORT}`)); diff --git a/evals/data/experiments/019-js-express-api/outputs/aap/turn-2.json b/evals/data/experiments/019-js-express-api/outputs/aap/turn-2.json deleted file mode 100644 index 5a417cd..0000000 --- a/evals/data/experiments/019-js-express-api/outputs/aap/turn-2.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "express-app-refactor-auth", - "version": 1, - "name": "synthesize", - "meta": { - "format": "text/javascript", - "tokens_used": null, - "checksum": null, - "state": null - }, - "content": [ - { - "body": "const express = require('express');\nconst cors = require('cors');\nconst rateLimit = require('express-rate-limit');\nconst { body, param, validationResult } = require('express-validator');\n\nconst app = express();\n\n\napp.use(cors());\napp.use(express.json());\napp.use((req, res, next) => {\n console.log(`[${new Date().toISOString()}] ${req.method} ${req.url}`);\n next();\n});\n\nconst auth = (req, res, next) => {\n const bearerToken = req.headers['authorization'];\n const apiKey = req.headers['x-api-key'];\n\n if (bearerToken && bearerToken.startsWith('Bearer ')) {\n return next();\n }\n \n if (apiKey) {\n return next();\n }\n\n return res.status(401).json({ error: 'Unauthorized: Missing or invalid credentials' });\n};\n\nconst limiter = rateLimit({ windowMs: 15 * 60 * 1000, max: 100 });\napp.use(limiter);\n\n\n\nconst store = {\n tasks: [],\n projects: [],\n users: []\n};\n\n\n\napp.post('/tasks', auth, [body('title').notEmpty()], (req, res) => {\n const errors = validationResult(req);\n if (!errors.isEmpty()) return res.status(400).json({ errors: errors.array() });\n \n const task = { id: Date.now(), ...req.body, status: 'pending', comments: [], labels: [] };\n store.tasks.push(task);\n res.status(201).json(task);\n});\n\napp.patch('/tasks/:id/status', auth, [param('id').isInt()], (req, res) => {\n const task = store.tasks.find(t => t.id == req.params.id);\n if (!task) return res.status(404).json({ error: 'Task not found' });\n task.status = req.body.status;\n res.json(task);\n});\n\napp.put('/tasks/:id/labels', auth, [param('id').isInt(), body('labels').isArray()], (req, res) => {\n const task = store.tasks.find(t => t.id == req.params.id);\n if (!task) return res.status(404).json({ error: 'Task not found' });\n task.labels = req.body.labels;\n res.json(task);\n});\n\napp.post('/tasks/:id/comments', auth, (req, res) => {\n const task = store.tasks.find(t => t.id == req.params.id);\n if (!task) return res.status(404).json({ error: 'Task not found' });\n task.comments.push({ text: req.body.text, date: new Date() });\n res.json(task);\n});\n\n\n\napp.use((err, req, res, next) => {\n console.error(err.stack);\n res.status(500).json({ error: 'Internal Server Error' });\n});\n\n\nconst PORT = 3000;\napp.listen(PORT, () => console.log(`Server running on port ${PORT}`));\n" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/019-js-express-api/outputs/aap/turn-2.txt b/evals/data/experiments/019-js-express-api/outputs/aap/turn-2.txt deleted file mode 100644 index 4997d09..0000000 --- a/evals/data/experiments/019-js-express-api/outputs/aap/turn-2.txt +++ /dev/null @@ -1,83 +0,0 @@ -const express = require('express'); -const cors = require('cors'); -const rateLimit = require('express-rate-limit'); -const { body, param, validationResult } = require('express-validator'); - -const app = express(); - - -app.use(cors()); -app.use(express.json()); -app.use((req, res, next) => { - console.log(`[${new Date().toISOString()}] ${req.method} ${req.url}`); - next(); -}); - -const auth = (req, res, next) => { - const bearerToken = req.headers['authorization']; - const apiKey = req.headers['x-api-key']; - - if (bearerToken && bearerToken.startsWith('Bearer ')) { - return next(); - } - - if (apiKey) { - return next(); - } - - return res.status(401).json({ error: 'Unauthorized: Missing or invalid credentials' }); -}; - -const limiter = rateLimit({ windowMs: 15 * 60 * 1000, max: 100 }); -app.use(limiter); - - - -const store = { - tasks: [], - projects: [], - users: [] -}; - - - -app.post('/tasks', auth, [body('title').notEmpty()], (req, res) => { - const errors = validationResult(req); - if (!errors.isEmpty()) return res.status(400).json({ errors: errors.array() }); - - const task = { id: Date.now(), ...req.body, status: 'pending', comments: [], labels: [] }; - store.tasks.push(task); - res.status(201).json(task); -}); - -app.patch('/tasks/:id/status', auth, [param('id').isInt()], (req, res) => { - const task = store.tasks.find(t => t.id == req.params.id); - if (!task) return res.status(404).json({ error: 'Task not found' }); - task.status = req.body.status; - res.json(task); -}); - -app.put('/tasks/:id/labels', auth, [param('id').isInt(), body('labels').isArray()], (req, res) => { - const task = store.tasks.find(t => t.id == req.params.id); - if (!task) return res.status(404).json({ error: 'Task not found' }); - task.labels = req.body.labels; - res.json(task); -}); - -app.post('/tasks/:id/comments', auth, (req, res) => { - const task = store.tasks.find(t => t.id == req.params.id); - if (!task) return res.status(404).json({ error: 'Task not found' }); - task.comments.push({ text: req.body.text, date: new Date() }); - res.json(task); -}); - - - -app.use((err, req, res, next) => { - console.error(err.stack); - res.status(500).json({ error: 'Internal Server Error' }); -}); - - -const PORT = 3000; -app.listen(PORT, () => console.log(`Server running on port ${PORT}`)); diff --git a/evals/data/experiments/019-js-express-api/outputs/aap/turn-3.json b/evals/data/experiments/019-js-express-api/outputs/aap/turn-3.json deleted file mode 100644 index e24c3cb..0000000 --- a/evals/data/experiments/019-js-express-api/outputs/aap/turn-3.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "edit-artifact-001", - "version": 1, - "name": "synthesize", - "meta": { - "format": "text/html", - "tokens_used": null, - "checksum": null, - "state": null - }, - "content": [ - { - "body": "{\n \"protocol\": \"aap/0.1\",\n \"id\": \"artifact-id\",\n \"version\": 1,\n \"name\": \"edit\",\n \"content\": [\n {\n \"op\": \"insert_after\",\n \"target\": {\n \"type\": \"id\",\n \"value\": \"routes\"\n },\n \"content\": \"app.post('/projects/:id/archive', auth, [param('id').isInt()], (req, res) => {\\n const project = store.projects.find(p => p.id == req.params.id);\\n if (!project) return res.status(404).json({ error: 'Project not found' });\\n\\n project.status = 'archived';\\n\\n store.tasks\\n .filter(t => t.projectId == req.params.id && t.status === 'pending')\\n .forEach(t => t.status = 'cancelled');\\n\\n res.json({ message: 'Project archived and pending tasks cancelled' });\\n});\\n\"\n }\n ]\n}" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/019-js-express-api/outputs/aap/turn-3.txt b/evals/data/experiments/019-js-express-api/outputs/aap/turn-3.txt deleted file mode 100644 index 2a785a9..0000000 --- a/evals/data/experiments/019-js-express-api/outputs/aap/turn-3.txt +++ /dev/null @@ -1,16 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "artifact-id", - "version": 1, - "name": "edit", - "content": [ - { - "op": "insert_after", - "target": { - "type": "id", - "value": "routes" - }, - "content": "app.post('/projects/:id/archive', auth, [param('id').isInt()], (req, res) => {\n const project = store.projects.find(p => p.id == req.params.id);\n if (!project) return res.status(404).json({ error: 'Project not found' });\n\n project.status = 'archived';\n\n store.tasks\n .filter(t => t.projectId == req.params.id && t.status === 'pending')\n .forEach(t => t.status = 'cancelled');\n\n res.json({ message: 'Project archived and pending tasks cancelled' });\n});\n" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/019-js-express-api/outputs/base/turn-0.txt b/evals/data/experiments/019-js-express-api/outputs/base/turn-0.txt deleted file mode 100644 index 1e2f136..0000000 --- a/evals/data/experiments/019-js-express-api/outputs/base/turn-0.txt +++ /dev/null @@ -1,77 +0,0 @@ -const express = require('express'); -const cors = require('cors'); -const rateLimit = require('express-rate-limit'); -const { body, param, validationResult } = require('express-validator'); - -const app = express(); - -// --- In-Memory Models --- -const db = { - users: [{ id: '1', token: 'secret-token', name: 'Admin' }], - projects: [{ id: '1', name: 'Main Project', ownerId: '1' }], - tasks: [{ id: '1', projectId: '1', title: 'Task 1', status: 'todo', comments: [] }] -}; - -// --- Middleware --- -app.use(cors()); -app.use(express.json()); -app.use((req, res, next) => { - console.log(`${new Date().toISOString()} - ${req.method} ${req.url}`); - next(); -}); - -const limiter = rateLimit({ windowMs: 15 * 60 * 1000, max: 100 }); -app.use(limiter); - -const authenticate = (req, res, next) => { - const token = req.headers['authorization']; - if (token && db.users.find(u => u.token === token)) return next(); - res.status(401).json({ error: 'Unauthorized' }); -}; - -const validate = (req, res, next) => { - const errors = validationResult(req); - if (!errors.isEmpty()) return res.status(400).json({ errors: errors.array() }); - next(); -}; - -// --- Routes --- - -// Projects -app.get('/projects', authenticate, (req, res) => res.json(db.projects)); -app.post('/projects', authenticate, body('name').isString(), validate, (req, res) => { - const newProject = { id: Date.now().toString(), name: req.body.name }; - db.projects.push(newProject); - res.status(201).json(newProject); -}); - -// Tasks -app.get('/tasks', authenticate, (req, res) => res.json(db.tasks)); - -app.post('/tasks', authenticate, body('title').isString(), body('projectId').isString(), validate, (req, res) => { - const task = { id: Date.now().toString(), ...req.body, status: 'todo', comments: [] }; - db.tasks.push(task); - res.status(201).json(task); -}); - -app.patch('/tasks/:id/status', authenticate, param('id').isString(), body('status').isIn(['todo', 'in-progress', 'done']), validate, (req, res) => { - const task = db.tasks.find(t => t.id === req.params.id); - if (!task) return res.status(404).json({ error: 'Task not found' }); - task.status = req.body.status; - res.json(task); -}); - -app.post('/tasks/:id/comments', authenticate, param('id').isString(), body('comment').isString(), validate, (req, res) => { - const task = db.tasks.find(t => t.id === req.params.id); - if (!task) return res.status(404).json({ error: 'Task not found' }); - task.comments.push({ text: req.body.comment, timestamp: new Date() }); - res.json(task); -}); - -// --- Error Handling --- -app.use((err, req, res, next) => { - console.error(err.stack); - res.status(500).json({ error: 'Internal Server Error' }); -}); - -app.listen(3000, () => console.log('API running on http://localhost:3000')); \ No newline at end of file diff --git a/evals/data/experiments/019-js-express-api/outputs/base/turn-1.txt b/evals/data/experiments/019-js-express-api/outputs/base/turn-1.txt deleted file mode 100644 index cf1237f..0000000 --- a/evals/data/experiments/019-js-express-api/outputs/base/turn-1.txt +++ /dev/null @@ -1,85 +0,0 @@ -const express = require('express'); -const cors = require('cors'); -const rateLimit = require('express-rate-limit'); -const { body, param, validationResult } = require('express-validator'); - -const app = express(); - -// --- In-Memory Models --- -const db = { - users: [{ id: '1', token: 'secret-token', name: 'Admin' }], - projects: [{ id: '1', name: 'Main Project', ownerId: '1' }], - tasks: [{ id: '1', projectId: '1', title: 'Task 1', status: 'todo', comments: [], labels: [] }] -}; - -// --- Middleware --- -app.use(cors()); -app.use(express.json()); -app.use((req, res, next) => { - console.log(`${new Date().toISOString()} - ${req.method} ${req.url}`); - next(); -}); - -const limiter = rateLimit({ windowMs: 15 * 60 * 1000, max: 100 }); -app.use(limiter); - -const authenticate = (req, res, next) => { - const token = req.headers['authorization']; - if (token && db.users.find(u => u.token === token)) return next(); - res.status(401).json({ error: 'Unauthorized' }); -}; - -const validate = (req, res, next) => { - const errors = validationResult(req); - if (!errors.isEmpty()) return res.status(400).json({ errors: errors.array() }); - next(); -}; - -// --- Routes --- - -// Projects -app.get('/projects', authenticate, (req, res) => res.json(db.projects)); -app.post('/projects', authenticate, body('name').isString(), validate, (req, res) => { - const newProject = { id: Date.now().toString(), name: req.body.name }; - db.projects.push(newProject); - res.status(201).json(newProject); -}); - -// Tasks -app.get('/tasks', authenticate, (req, res) => res.json(db.tasks)); - -app.post('/tasks', authenticate, body('title').isString(), body('projectId').isString(), validate, (req, res) => { - const task = { id: Date.now().toString(), ...req.body, status: 'todo', comments: [], labels: [] }; - db.tasks.push(task); - res.status(201).json(task); -}); - -app.patch('/tasks/:id/status', authenticate, param('id').isString(), body('status').isIn(['todo', 'in-progress', 'done']), validate, (req, res) => { - const task = db.tasks.find(t => t.id === req.params.id); - if (!task) return res.status(404).json({ error: 'Task not found' }); - task.status = req.body.status; - res.json(task); -}); - -// Add/Update Labels -app.put('/tasks/:id/labels', authenticate, param('id').isString(), body('labels').isArray(), validate, (req, res) => { - const task = db.tasks.find(t => t.id === req.params.id); - if (!task) return res.status(404).json({ error: 'Task not found' }); - task.labels = req.body.labels; - res.json(task); -}); - -app.post('/tasks/:id/comments', authenticate, param('id').isString(), body('comment').isString(), validate, (req, res) => { - const task = db.tasks.find(t => t.id === req.params.id); - if (!task) return res.status(404).json({ error: 'Task not found' }); - task.comments.push({ text: req.body.comment, timestamp: new Date() }); - res.json(task); -}); - -// --- Error Handling --- -app.use((err, req, res, next) => { - console.error(err.stack); - res.status(500).json({ error: 'Internal Server Error' }); -}); - -app.listen(3000, () => console.log('API running on http://localhost:3000')); \ No newline at end of file diff --git a/evals/data/experiments/019-js-express-api/outputs/base/turn-2.txt b/evals/data/experiments/019-js-express-api/outputs/base/turn-2.txt deleted file mode 100644 index 1407f7b..0000000 --- a/evals/data/experiments/019-js-express-api/outputs/base/turn-2.txt +++ /dev/null @@ -1,93 +0,0 @@ -const express = require('express'); -const cors = require('cors'); -const rateLimit = require('express-rate-limit'); -const { body, param, validationResult } = require('express-validator'); - -const app = express(); - -// --- In-Memory Models --- -const db = { - users: [{ id: '1', token: 'secret-token', apiKey: 'abc-123', name: 'Admin' }], - projects: [{ id: '1', name: 'Main Project', ownerId: '1' }], - tasks: [{ id: '1', projectId: '1', title: 'Task 1', status: 'todo', comments: [], labels: [] }] -}; - -// --- Middleware --- -app.use(cors()); -app.use(express.json()); -app.use((req, res, next) => { - console.log(`${new Date().toISOString()} - ${req.method} ${req.url}`); - next(); -}); - -const limiter = rateLimit({ windowMs: 15 * 60 * 1000, max: 100 }); -app.use(limiter); - -const authenticate = (req, res, next) => { - const authHeader = req.headers['authorization']; - const apiKey = req.headers['x-api-key']; - - let isAuthenticated = false; - - if (authHeader && authHeader.startsWith('Bearer ')) { - const token = authHeader.split(' ')[1]; - if (db.users.find(u => u.token === token)) isAuthenticated = true; - } else if (apiKey) { - if (db.users.find(u => u.apiKey === apiKey)) isAuthenticated = true; - } - - if (isAuthenticated) return next(); - res.status(401).json({ error: 'Unauthorized: Invalid or missing credentials' }); -}; - -const validate = (req, res, next) => { - const errors = validationResult(req); - if (!errors.isEmpty()) return res.status(400).json({ errors: errors.array() }); - next(); -}; - -// --- Routes --- - -app.get('/projects', authenticate, (req, res) => res.json(db.projects)); -app.post('/projects', authenticate, body('name').isString(), validate, (req, res) => { - const newProject = { id: Date.now().toString(), name: req.body.name }; - db.projects.push(newProject); - res.status(201).json(newProject); -}); - -app.get('/tasks', authenticate, (req, res) => res.json(db.tasks)); - -app.post('/tasks', authenticate, body('title').isString(), body('projectId').isString(), validate, (req, res) => { - const task = { id: Date.now().toString(), ...req.body, status: 'todo', comments: [], labels: [] }; - db.tasks.push(task); - res.status(201).json(task); -}); - -app.patch('/tasks/:id/status', authenticate, param('id').isString(), body('status').isIn(['todo', 'in-progress', 'done']), validate, (req, res) => { - const task = db.tasks.find(t => t.id === req.params.id); - if (!task) return res.status(404).json({ error: 'Task not found' }); - task.status = req.body.status; - res.json(task); -}); - -app.put('/tasks/:id/labels', authenticate, param('id').isString(), body('labels').isArray(), validate, (req, res) => { - const task = db.tasks.find(t => t.id === req.params.id); - if (!task) return res.status(404).json({ error: 'Task not found' }); - task.labels = req.body.labels; - res.json(task); -}); - -app.post('/tasks/:id/comments', authenticate, param('id').isString(), body('comment').isString(), validate, (req, res) => { - const task = db.tasks.find(t => t.id === req.params.id); - if (!task) return res.status(404).json({ error: 'Task not found' }); - task.comments.push({ text: req.body.comment, timestamp: new Date() }); - res.json(task); -}); - -// --- Error Handling --- -app.use((err, req, res, next) => { - console.error(err.stack); - res.status(500).json({ error: 'Internal Server Error' }); -}); - -app.listen(3000, () => console.log('API running on http://localhost:3000')); \ No newline at end of file diff --git a/evals/data/experiments/019-js-express-api/outputs/base/turn-3.txt b/evals/data/experiments/019-js-express-api/outputs/base/turn-3.txt deleted file mode 100644 index 0c2c6ca..0000000 --- a/evals/data/experiments/019-js-express-api/outputs/base/turn-3.txt +++ /dev/null @@ -1,106 +0,0 @@ -const express = require('express'); -const cors = require('cors'); -const rateLimit = require('express-rate-limit'); -const { body, param, validationResult } = require('express-validator'); - -const app = express(); - -// --- In-Memory Models --- -const db = { - users: [{ id: '1', token: 'secret-token', apiKey: 'abc-123', name: 'Admin' }], - projects: [{ id: '1', name: 'Main Project', ownerId: '1', archived: false }], - tasks: [{ id: '1', projectId: '1', title: 'Task 1', status: 'todo', comments: [], labels: [] }] -}; - -// --- Middleware --- -app.use(cors()); -app.use(express.json()); -app.use((req, res, next) => { - console.log(`${new Date().toISOString()} - ${req.method} ${req.url}`); - next(); -}); - -const limiter = rateLimit({ windowMs: 15 * 60 * 1000, max: 100 }); -app.use(limiter); - -const authenticate = (req, res, next) => { - const authHeader = req.headers['authorization']; - const apiKey = req.headers['x-api-key']; - - let isAuthenticated = false; - if (authHeader && authHeader.startsWith('Bearer ')) { - const token = authHeader.split(' ')[1]; - if (db.users.find(u => u.token === token)) isAuthenticated = true; - } else if (apiKey) { - if (db.users.find(u => u.apiKey === apiKey)) isAuthenticated = true; - } - - if (isAuthenticated) return next(); - res.status(401).json({ error: 'Unauthorized: Invalid or missing credentials' }); -}; - -const validate = (req, res, next) => { - const errors = validationResult(req); - if (!errors.isEmpty()) return res.status(400).json({ errors: errors.array() }); - next(); -}; - -// --- Routes --- - -app.get('/projects', authenticate, (req, res) => res.json(db.projects)); - -app.post('/projects', authenticate, body('name').isString(), validate, (req, res) => { - const newProject = { id: Date.now().toString(), name: req.body.name, archived: false }; - db.projects.push(newProject); - res.status(201).json(newProject); -}); - -app.post('/projects/:id/archive', authenticate, param('id').isString(), validate, (req, res) => { - const project = db.projects.find(p => p.id === req.params.id); - if (!project) return res.status(404).json({ error: 'Project not found' }); - - project.archived = true; - - db.tasks.filter(t => t.projectId === project.id && t.status !== 'done').forEach(task => { - task.status = 'cancelled'; - }); - - res.json({ message: 'Project archived and pending tasks cancelled', project }); -}); - -app.get('/tasks', authenticate, (req, res) => res.json(db.tasks)); - -app.post('/tasks', authenticate, body('title').isString(), body('projectId').isString(), validate, (req, res) => { - const task = { id: Date.now().toString(), ...req.body, status: 'todo', comments: [], labels: [] }; - db.tasks.push(task); - res.status(201).json(task); -}); - -app.patch('/tasks/:id/status', authenticate, param('id').isString(), body('status').isIn(['todo', 'in-progress', 'done', 'cancelled']), validate, (req, res) => { - const task = db.tasks.find(t => t.id === req.params.id); - if (!task) return res.status(404).json({ error: 'Task not found' }); - task.status = req.body.status; - res.json(task); -}); - -app.put('/tasks/:id/labels', authenticate, param('id').isString(), body('labels').isArray(), validate, (req, res) => { - const task = db.tasks.find(t => t.id === req.params.id); - if (!task) return res.status(404).json({ error: 'Task not found' }); - task.labels = req.body.labels; - res.json(task); -}); - -app.post('/tasks/:id/comments', authenticate, param('id').isString(), body('comment').isString(), validate, (req, res) => { - const task = db.tasks.find(t => t.id === req.params.id); - if (!task) return res.status(404).json({ error: 'Task not found' }); - task.comments.push({ text: req.body.comment, timestamp: new Date() }); - res.json(task); -}); - -// --- Error Handling --- -app.use((err, req, res, next) => { - console.error(err.stack); - res.status(500).json({ error: 'Internal Server Error' }); -}); - -app.listen(3000, () => console.log('API running on http://localhost:3000')); \ No newline at end of file diff --git a/evals/data/experiments/020-ts-react-form/metrics.json b/evals/data/experiments/020-ts-react-form/metrics.json deleted file mode 100644 index 912d1b6..0000000 --- a/evals/data/experiments/020-ts-react-form/metrics.json +++ /dev/null @@ -1,228 +0,0 @@ -{ - "experiment_id": "020-ts-react-form", - "model": "", - "provider": "google", - "timestamp": "2026-04-03T06:59:19.570894+00:00", - "format": "text/typescript", - "base_turn0": { - "input_tokens": 125, - "output_tokens": 1539, - "latency_ms": 6296, - "artifact_bytes": 5081 - }, - "aap_turn0": { - "input_tokens": 464, - "output_tokens": 684, - "latency_ms": 3272, - "artifact_bytes": 2284 - }, - "default_flow": { - "per_turn": [ - { - "turn": 1, - "edit": "Add a new ShippingOption interface with fields: id, name, price, estimated_days,", - "input_tokens": 1687, - "output_tokens": 151, - "latency_ms": 2771, - "output_bytes": 460, - "failed": false, - "failure_reason": "" - }, - { - "turn": 2, - "edit": "Rewrite the form-fields section to add a promo code input with a 'Apply' button ", - "input_tokens": 1870, - "output_tokens": 760, - "latency_ms": 4195, - "output_bytes": 2553, - "failed": false, - "failure_reason": "" - }, - { - "turn": 3, - "edit": "Update the validation functions to show inline error messages below each field i", - "input_tokens": 2653, - "output_tokens": 1287, - "latency_ms": 6247, - "output_bytes": 4256, - "failed": false, - "failure_reason": "" - } - ], - "total_input_tokens": 6210, - "total_output_tokens": 2198, - "total_latency_ms": 13213 - }, - "aap_flow": { - "per_turn": [ - { - "turn": 1, - "edit": "Add a new ShippingOption interface with fields: id, name, price, estimated_days,", - "input_tokens": 1813, - "output_tokens": 884, - "latency_ms": 3349, - "output_bytes": 2683, - "failed": false, - "failure_reason": "", - "envelope_parsed": true, - "apply_succeeded": true, - "envelope_name": "synthesize" - }, - { - "turn": 2, - "edit": "Rewrite the form-fields section to add a promo code input with a 'Apply' button ", - "input_tokens": 0, - "output_tokens": 0, - "latency_ms": 4988, - "output_bytes": 2683, - "failed": true, - "failure_reason": "parse or apply failed", - "envelope_parsed": true, - "apply_succeeded": false, - "envelope_name": "edit" - }, - { - "turn": 3, - "edit": "Update the validation functions to show inline error messages below each field i", - "input_tokens": 0, - "output_tokens": 0, - "latency_ms": 2254, - "output_bytes": 2683, - "failed": true, - "failure_reason": "parse or apply failed", - "envelope_parsed": true, - "apply_succeeded": false, - "envelope_name": "edit" - } - ], - "total_input_tokens": 1813, - "total_output_tokens": 884, - "total_latency_ms": 10591, - "envelope_parse_rate": 1.0, - "apply_success_rate": 0.3333333333333333 - }, - "comparison": { - "output_token_savings_pct": 59.8, - "input_token_savings_pct": 70.8, - "latency_savings_pct": 19.8 - }, - "token_table": { - "turns": [ - { - "turn": 0, - "base_input": 125, - "base_output": 1539, - "base_latency_ms": 6296, - "aap_input": 464, - "aap_output": 684, - "aap_latency_ms": 3272 - }, - { - "turn": 1, - "base_input": 1687, - "base_output": 151, - "base_latency_ms": 2771, - "aap_input": 1813, - "aap_output": 884, - "aap_latency_ms": 3349, - "envelope_name": "synthesize", - "apply_ok": true - }, - { - "turn": 2, - "base_input": 1870, - "base_output": 760, - "base_latency_ms": 4195, - "aap_input": 0, - "aap_output": 0, - "aap_latency_ms": 4988, - "envelope_name": "edit", - "apply_ok": false - }, - { - "turn": 3, - "base_input": 2653, - "base_output": 1287, - "base_latency_ms": 6247, - "aap_input": 0, - "aap_output": 0, - "aap_latency_ms": 2254, - "envelope_name": "edit", - "apply_ok": false - } - ], - "totals": { - "base_input": 6335, - "base_output": 3737, - "base_combined": 10072, - "aap_input": 2277, - "aap_output": 1568, - "aap_combined": 3845, - "base_latency_ms": 19509, - "aap_latency_ms": 13863, - "output_savings_pct": 58.0, - "input_delta_pct": -64.1, - "combined_savings_pct": 61.8, - "latency_savings_pct": 28.9 - } - }, - "quality": { - "per_turn": [ - { - "turn": 0, - "sequence_similarity": 0.1035, - "token_f1": 0.3966, - "base_char_count": 5081, - "aap_char_count": 2012, - "char_delta_pct": -60.4, - "lines_added": 40, - "lines_removed": 101, - "rouge_l": null, - "bleu": null - }, - { - "turn": 1, - "sequence_similarity": 0.0993, - "token_f1": 0.0863, - "base_char_count": 460, - "aap_char_count": 2399, - "char_delta_pct": 421.5, - "lines_added": 15, - "lines_removed": 17, - "rouge_l": null, - "bleu": null - }, - { - "turn": 2, - "sequence_similarity": 0.149, - "token_f1": 0.3313, - "base_char_count": 2553, - "aap_char_count": 2399, - "char_delta_pct": -6.0, - "lines_added": 15, - "lines_removed": 75, - "rouge_l": null, - "bleu": null - }, - { - "turn": 3, - "sequence_similarity": 0.1103, - "token_f1": 0.2557, - "base_char_count": 4256, - "aap_char_count": 2399, - "char_delta_pct": -43.6, - "lines_added": 15, - "lines_removed": 113, - "rouge_l": null, - "bleu": null - } - ], - "mean_sequence_similarity": 0.1155, - "mean_token_f1": 0.2675, - "mean_rouge_l": null, - "mean_bleu": null, - "judge_comparisons": null, - "mean_base_judge": null, - "mean_aap_judge": null - } -} diff --git a/evals/data/experiments/020-ts-react-form/outputs/aap/turn-0.ts b/evals/data/experiments/020-ts-react-form/outputs/aap/turn-0.ts deleted file mode 100644 index 9dc8acc..0000000 --- a/evals/data/experiments/020-ts-react-form/outputs/aap/turn-0.ts +++ /dev/null @@ -1,77 +0,0 @@ - -interface CartItem { - id: string; - name: string; - price: number; -} - -interface ShippingOption { - id: string; - label: string; - price: number; -} - -interface FormData { - email: string; - cardNumber: string; - expiry: string; - cvv: string; -} - -interface ValidationErrors { - [key: string]: string; -} - -const CheckoutForm: React.FC<{ items: CartItem[]; shippingOptions: ShippingOption[] }> = ({ items, shippingOptions }) => { - - const [formData, setFormData] = React.useState({ email: '', cardNumber: '', expiry: '', cvv: '' }); - const [errors, setErrors] = React.useState({}); - const [loading, setLoading] = React.useState(false); - - - const validate = (): boolean => { - const newErrors: ValidationErrors = {}; - if (!/^\S+@\S+\.\S+$/.test(formData.email)) newErrors.email = "Invalid email"; - // Luhn check logic here - setErrors(newErrors); - return Object.keys(newErrors).length === 0; - }; - - const handleSubmit = async (e: React.FormEvent) => { - e.preventDefault(); - if (!validate()) return; - setLoading(true); - // Submit logic - setLoading(false); - }; - - return ( -
-
- - setFormData({...formData, email: e.target.value})} - /> - {errors.email && {errors.email}} - - -
- - -
- ); -}; -
\ No newline at end of file diff --git a/evals/data/experiments/020-ts-react-form/outputs/aap/turn-1.json b/evals/data/experiments/020-ts-react-form/outputs/aap/turn-1.json deleted file mode 100644 index 50a58f7..0000000 --- a/evals/data/experiments/020-ts-react-form/outputs/aap/turn-1.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "checkout-component-update", - "version": 1, - "name": "synthesize", - "meta": { - "format": "text/html", - "tokens_used": null, - "checksum": null, - "state": null - }, - "content": [ - { - "body": "{\n \"protocol\": \"aap/0.1\",\n \"id\": \"checkout-component\",\n \"version\": 2,\n \"name\": \"edit\",\n \"content\": [\n {\n \"op\": \"replace\",\n \"target\": {\n \"type\": \"id\",\n \"value\": \"checkout-component\"\n },\n \"content\": \"\\ninterface CartItem {\\n id: string;\\n name: string;\\n price: number;\\n}\\n\\ninterface ShippingOption {\\n id: string;\\n name: string;\\n price: number;\\n estimated_days: number;\\n carrier: string;\\n}\\n\\ninterface FormData {\\n email: string;\\n cardNumber: string;\\n expiry: string;\\n cvv: string;\\n}\\n\\ninterface ValidationErrors {\\n [key: string]: string;\\n}\\n\\nconst CheckoutForm: React.FC<{ items: CartItem[]; shippingOptions: ShippingOption[] }> = ({ items, shippingOptions }) => {\\n \\n const [formData, setFormData] = React.useState({ email: '', cardNumber: '', expiry: '', cvv: '' });\\n const [errors, setErrors] = React.useState({});\\n const [loading, setLoading] = React.useState(false);\\n \\n\\n const validate = (): boolean => {\\n const newErrors: ValidationErrors = {};\\n if (!/^\\\\S+@\\\\S+\\\\.\\\\S+$/.test(formData.email)) newErrors.email = \\\"Invalid email\\\";\\n // Luhn check logic here\\n setErrors(newErrors);\\n return Object.keys(newErrors).length === 0;\\n };\\n\\n const handleSubmit = async (e: React.FormEvent) => {\\n e.preventDefault();\\n if (!validate()) return;\\n setLoading(true);\\n // Submit logic\\n setLoading(false);\\n };\\n\\n return (\\n
\\n
\\n \\n setFormData({...formData, email: e.target.value})} \\n />\\n {errors.email && {errors.email}}\\n \\n \\n
\\n\\n \\n
\\n );\\n};\\n
\"\n }\n ]\n}" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/020-ts-react-form/outputs/aap/turn-1.ts b/evals/data/experiments/020-ts-react-form/outputs/aap/turn-1.ts deleted file mode 100644 index 9625a86..0000000 --- a/evals/data/experiments/020-ts-react-form/outputs/aap/turn-1.ts +++ /dev/null @@ -1,16 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "checkout-component", - "version": 2, - "name": "edit", - "content": [ - { - "op": "replace", - "target": { - "type": "id", - "value": "checkout-component" - }, - "content": "\ninterface CartItem {\n id: string;\n name: string;\n price: number;\n}\n\ninterface ShippingOption {\n id: string;\n name: string;\n price: number;\n estimated_days: number;\n carrier: string;\n}\n\ninterface FormData {\n email: string;\n cardNumber: string;\n expiry: string;\n cvv: string;\n}\n\ninterface ValidationErrors {\n [key: string]: string;\n}\n\nconst CheckoutForm: React.FC<{ items: CartItem[]; shippingOptions: ShippingOption[] }> = ({ items, shippingOptions }) => {\n \n const [formData, setFormData] = React.useState({ email: '', cardNumber: '', expiry: '', cvv: '' });\n const [errors, setErrors] = React.useState({});\n const [loading, setLoading] = React.useState(false);\n \n\n const validate = (): boolean => {\n const newErrors: ValidationErrors = {};\n if (!/^\\S+@\\S+\\.\\S+$/.test(formData.email)) newErrors.email = \"Invalid email\";\n // Luhn check logic here\n setErrors(newErrors);\n return Object.keys(newErrors).length === 0;\n };\n\n const handleSubmit = async (e: React.FormEvent) => {\n e.preventDefault();\n if (!validate()) return;\n setLoading(true);\n // Submit logic\n setLoading(false);\n };\n\n return (\n
\n
\n \n setFormData({...formData, email: e.target.value})} \n />\n {errors.email && {errors.email}}\n \n \n
\n\n \n
\n );\n};\n
" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/020-ts-react-form/outputs/aap/turn-2.json b/evals/data/experiments/020-ts-react-form/outputs/aap/turn-2.json deleted file mode 100644 index 93fd52d..0000000 --- a/evals/data/experiments/020-ts-react-form/outputs/aap/turn-2.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "checkout-component", - "version": 3, - "name": "edit", - "meta": { - "format": "text/html", - "tokens_used": null, - "checksum": null, - "state": null - }, - "content": [ - { - "op": "replace", - "target": { - "type": "id", - "value": "checkout-component" - }, - "content": "interface CartItem {\n id: string;\n name: string;\n price: number;\n}\n\ninterface ShippingOption {\n id: string;\n name: string;\n price: number;\n estimated_days: number;\n carrier: string;\n}\n\ninterface FormData {\n email: string;\n cardNumber: string;\n expiry: string;\n cvv: string;\n promoCode: string;\n}\n\ninterface ValidationErrors {\n [key: string]: string;\n}\n\nconst CheckoutForm: React.FC<{ items: CartItem[]; shippingOptions: ShippingOption[] }> = ({ items, shippingOptions }) => {\n \n const [formData, setFormData] = React.useState({ email: '', cardNumber: '', expiry: '', cvv: '', promoCode: '' });\n const [errors, setErrors] = React.useState({});\n const [loading, setLoading] = React.useState(false);\n const [discount, setDiscount] = React.useState(0);\n \n\n const validate = (): boolean => {\n const newErrors: ValidationErrors = {};\n if (!/^\\S+@\\S+\\.\\S+$/.test(formData.email)) newErrors.email = \"Invalid email\";\n // Luhn check logic here\n setErrors(newErrors);\n return Object.keys(newErrors).length === 0;\n };\n\n const applyPromo = () => {\n if (formData.promoCode === 'SAVE10') setDiscount(10);\n else alert('Invalid promo code');\n };\n\n const handleSubmit = async (e: React.FormEvent) => {\n e.preventDefault();\n if (!validate()) return;\n setLoading(true);\n // Submit logic\n setLoading(false);\n };\n\n const subtotal = items.reduce((sum, item) => sum + item.price, 0);\n const total = Math.max(0, subtotal - discount);\n\n return (\n
\n
\n \n setFormData({...formData, email: e.target.value})} \n />\n {errors.email && {errors.email}}\n
\n setFormData({...formData, promoCode: e.target.value})}\n />\n \n
\n
\n \n
\n\n \n
\n );\n};" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/020-ts-react-form/outputs/aap/turn-2.ts b/evals/data/experiments/020-ts-react-form/outputs/aap/turn-2.ts deleted file mode 100644 index 9625a86..0000000 --- a/evals/data/experiments/020-ts-react-form/outputs/aap/turn-2.ts +++ /dev/null @@ -1,16 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "checkout-component", - "version": 2, - "name": "edit", - "content": [ - { - "op": "replace", - "target": { - "type": "id", - "value": "checkout-component" - }, - "content": "\ninterface CartItem {\n id: string;\n name: string;\n price: number;\n}\n\ninterface ShippingOption {\n id: string;\n name: string;\n price: number;\n estimated_days: number;\n carrier: string;\n}\n\ninterface FormData {\n email: string;\n cardNumber: string;\n expiry: string;\n cvv: string;\n}\n\ninterface ValidationErrors {\n [key: string]: string;\n}\n\nconst CheckoutForm: React.FC<{ items: CartItem[]; shippingOptions: ShippingOption[] }> = ({ items, shippingOptions }) => {\n \n const [formData, setFormData] = React.useState({ email: '', cardNumber: '', expiry: '', cvv: '' });\n const [errors, setErrors] = React.useState({});\n const [loading, setLoading] = React.useState(false);\n \n\n const validate = (): boolean => {\n const newErrors: ValidationErrors = {};\n if (!/^\\S+@\\S+\\.\\S+$/.test(formData.email)) newErrors.email = \"Invalid email\";\n // Luhn check logic here\n setErrors(newErrors);\n return Object.keys(newErrors).length === 0;\n };\n\n const handleSubmit = async (e: React.FormEvent) => {\n e.preventDefault();\n if (!validate()) return;\n setLoading(true);\n // Submit logic\n setLoading(false);\n };\n\n return (\n
\n
\n \n setFormData({...formData, email: e.target.value})} \n />\n {errors.email && {errors.email}}\n \n \n
\n\n \n
\n );\n};\n
" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/020-ts-react-form/outputs/aap/turn-3.json b/evals/data/experiments/020-ts-react-form/outputs/aap/turn-3.json deleted file mode 100644 index 64487be..0000000 --- a/evals/data/experiments/020-ts-react-form/outputs/aap/turn-3.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "checkout-component", - "version": 3, - "name": "edit", - "meta": { - "format": "text/html", - "tokens_used": null, - "checksum": null, - "state": null - }, - "content": [ - { - "op": "replace", - "target": { - "type": "id", - "value": "form-state" - }, - "content": " const validate = (): boolean => {\n const newErrors: ValidationErrors = {};\n if (!/^\\S+@\\S+\\.\\S+$/.test(formData.email)) newErrors.email = \"Invalid email\";\n \n // Add card number validation\n if (formData.cardNumber.length < 16) newErrors.cardNumber = \"Invalid card number\";\n \n setErrors(newErrors);\n return Object.keys(newErrors).length === 0;\n };" - }, - { - "op": "replace", - "target": { - "type": "id", - "value": "form-fields" - }, - "content": " setFormData({...formData, email: e.target.value})} \n />\n {errors.email &&
{errors.email}
}\n\n setFormData({...formData, cardNumber: e.target.value})} \n />\n {errors.cardNumber &&
{errors.cardNumber}
}" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/020-ts-react-form/outputs/aap/turn-3.ts b/evals/data/experiments/020-ts-react-form/outputs/aap/turn-3.ts deleted file mode 100644 index 9625a86..0000000 --- a/evals/data/experiments/020-ts-react-form/outputs/aap/turn-3.ts +++ /dev/null @@ -1,16 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "checkout-component", - "version": 2, - "name": "edit", - "content": [ - { - "op": "replace", - "target": { - "type": "id", - "value": "checkout-component" - }, - "content": "\ninterface CartItem {\n id: string;\n name: string;\n price: number;\n}\n\ninterface ShippingOption {\n id: string;\n name: string;\n price: number;\n estimated_days: number;\n carrier: string;\n}\n\ninterface FormData {\n email: string;\n cardNumber: string;\n expiry: string;\n cvv: string;\n}\n\ninterface ValidationErrors {\n [key: string]: string;\n}\n\nconst CheckoutForm: React.FC<{ items: CartItem[]; shippingOptions: ShippingOption[] }> = ({ items, shippingOptions }) => {\n \n const [formData, setFormData] = React.useState({ email: '', cardNumber: '', expiry: '', cvv: '' });\n const [errors, setErrors] = React.useState({});\n const [loading, setLoading] = React.useState(false);\n \n\n const validate = (): boolean => {\n const newErrors: ValidationErrors = {};\n if (!/^\\S+@\\S+\\.\\S+$/.test(formData.email)) newErrors.email = \"Invalid email\";\n // Luhn check logic here\n setErrors(newErrors);\n return Object.keys(newErrors).length === 0;\n };\n\n const handleSubmit = async (e: React.FormEvent) => {\n e.preventDefault();\n if (!validate()) return;\n setLoading(true);\n // Submit logic\n setLoading(false);\n };\n\n return (\n
\n
\n \n setFormData({...formData, email: e.target.value})} \n />\n {errors.email && {errors.email}}\n \n \n
\n\n \n
\n );\n};\n
" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/020-ts-react-form/outputs/base/turn-0.ts b/evals/data/experiments/020-ts-react-form/outputs/base/turn-0.ts deleted file mode 100644 index 1c61d9c..0000000 --- a/evals/data/experiments/020-ts-react-form/outputs/base/turn-0.ts +++ /dev/null @@ -1,137 +0,0 @@ -import React, { useState, useMemo } from 'react'; - -interface CartItem { - id: string; - name: string; - price: number; -} - -interface ShippingOption { - id: string; - label: string; - cost: number; -} - -interface FormData { - email: string; - cardNumber: string; - expiry: string; - shippingMethod: string; -} - -interface ValidationErrors { - email?: string; - cardNumber?: string; - expiry?: string; -} - -const validateEmail = (email: string) => /^[^\s@]+@[^\s@]+\.[^\s@]+$/.test(email); - -const validateLuhn = (number: string) => { - const digits = number.replace(/\s+/g, '').split('').map(Number); - let sum = 0; - let shouldDouble = false; - for (let i = digits.length - 1; i >= 0; i--) { - let digit = digits[i]; - if (shouldDouble) { - digit *= 2; - if (digit > 9) digit -= 9; - } - sum += digit; - shouldDouble = !shouldDouble; - } - return sum % 10 === 0; -}; - -const validateExpiry = (expiry: string) => { - const [mm, yy] = expiry.split('/').map(Number); - if (!mm || !yy || mm < 1 || mm > 12) return false; - const now = new Date(); - const currentMonth = now.getMonth() + 1; - const currentYear = parseInt(now.getFullYear().toString().slice(-2)); - return yy > currentYear || (yy === currentYear && mm >= currentMonth); -}; - -const styles: Record = { - container: { display: 'flex', gap: '40px', padding: '20px', maxWidth: '900px', margin: '0 auto' }, - form: { flex: 2 }, - sidebar: { flex: 1, backgroundColor: '#f9f9f9', padding: '20px', borderRadius: '8px' }, - inputGroup: { marginBottom: '15px', display: 'flex', flexDirection: 'column' }, - input: { padding: '8px', fontSize: '16px', borderRadius: '4px', border: '1px solid #ccc' }, - error: { color: 'red', fontSize: '12px', marginTop: '4px' } -}; - -export const CheckoutForm: React.FC<{ items: CartItem[], shipping: ShippingOption[] }> = ({ items, shipping }) => { - const [formData, setFormData] = useState({ email: '', cardNumber: '', expiry: '', shippingMethod: shipping[0].id }); - const [errors, setErrors] = useState({}); - const [loading, setLoading] = useState(false); - - const subtotal = items.reduce((acc, item) => acc + item.price, 0); - const shippingCost = shipping.find(s => s.id === formData.shippingMethod)?.cost || 0; - const tax = subtotal * 0.08; - const total = subtotal + shippingCost + tax; - - const handleSubmit = async (e: React.FormEvent) => { - e.preventDefault(); - const newErrors: ValidationErrors = {}; - if (!validateEmail(formData.email)) newErrors.email = 'Invalid email'; - if (!validateLuhn(formData.cardNumber)) newErrors.cardNumber = 'Invalid card number'; - if (!validateExpiry(formData.expiry)) newErrors.expiry = 'Invalid expiry'; - - if (Object.keys(newErrors).length > 0) { - setErrors(newErrors); - return; - } - - setLoading(true); - await new Promise(resolve => setTimeout(resolve, 1500)); - setLoading(false); - alert('Order placed successfully!'); - }; - - const handleCardChange = (e: React.ChangeEvent) => { - let val = e.target.value.replace(/\D/g, '').slice(0, 16); - val = val.replace(/(\d{4})(?=\d)/g, '$1 '); - setFormData({ ...formData, cardNumber: val }); - }; - - return ( -
-
-

Checkout

-
- - setFormData({...formData, email: e.target.value})} /> - {errors.email && {errors.email}} -
-
- - - {errors.cardNumber && {errors.cardNumber}} -
-
- - setFormData({...formData, expiry: e.target.value})} placeholder="MM/YY" /> - {errors.expiry && {errors.expiry}} -
-
- - -
- -
- -
-

Order Summary

- {items.map(item =>
{item.name}: ${item.price.toFixed(2)}
)} -
-
Subtotal: ${subtotal.toFixed(2)}
-
Shipping: ${shippingCost.toFixed(2)}
-
Tax: ${tax.toFixed(2)}
- Total: ${total.toFixed(2)} -
-
- ); -}; \ No newline at end of file diff --git a/evals/data/experiments/020-ts-react-form/outputs/base/turn-1.ts b/evals/data/experiments/020-ts-react-form/outputs/base/turn-1.ts deleted file mode 100644 index d888637..0000000 --- a/evals/data/experiments/020-ts-react-form/outputs/base/turn-1.ts +++ /dev/null @@ -1,18 +0,0 @@ -interface ShippingOption { - id: string; - name: string; - price: number; - estimated_days: number; - carrier: string; -} - -// Updated usage example within the component for context: -/* - -*/ \ No newline at end of file diff --git a/evals/data/experiments/020-ts-react-form/outputs/base/turn-2.ts b/evals/data/experiments/020-ts-react-form/outputs/base/turn-2.ts deleted file mode 100644 index a3e03e3..0000000 --- a/evals/data/experiments/020-ts-react-form/outputs/base/turn-2.ts +++ /dev/null @@ -1,76 +0,0 @@ -import React, { useState } from 'react'; - -interface CartItem { - id: string; - name: string; - price: number; -} - -interface ShippingOption { - id: string; - name: string; - price: number; - estimated_days: number; - carrier: string; -} - -interface FormData { - email: string; - cardNumber: string; - expiry: string; - shippingMethod: string; - promoCode: string; -} - -const styles: Record = { - container: { display: 'flex', gap: '40px', padding: '20px', maxWidth: '900px', margin: '0 auto' }, - form: { flex: 2 }, - sidebar: { flex: 1, backgroundColor: '#f9f9f9', padding: '20px', borderRadius: '8px' }, - inputGroup: { marginBottom: '15px', display: 'flex', flexDirection: 'column' }, - inlineGroup: { display: 'flex', gap: '10px' }, - input: { padding: '8px', fontSize: '16px', borderRadius: '4px', border: '1px solid #ccc', flex: 1 }, - button: { padding: '8px 16px', cursor: 'pointer' } -}; - -export const CheckoutForm: React.FC<{ items: CartItem[], shipping: ShippingOption[] }> = ({ items, shipping }) => { - const [formData, setFormData] = useState({ email: '', cardNumber: '', expiry: '', shippingMethod: shipping[0].id, promoCode: '' }); - const [discount, setDiscount] = useState(0); - const [promoInput, setPromoInput] = useState(''); - - const applyPromo = () => { - if (promoInput.toUpperCase() === 'SAVE10') setDiscount(10); - else alert('Invalid Promo Code'); - }; - - const subtotal = items.reduce((acc, item) => acc + item.price, 0); - const selectedShipping = shipping.find(s => s.id === formData.shippingMethod); - const shippingCost = selectedShipping?.price || 0; - const total = Math.max(0, subtotal + shippingCost - discount); - - return ( -
-
-
- -
- setPromoInput(e.target.value)} - placeholder="Enter code" - /> - -
-
-
- -
-

Summary

-
Subtotal: ${subtotal.toFixed(2)}
- {discount > 0 &&
Discount: -${discount.toFixed(2)}
} -
Shipping: ${shippingCost.toFixed(2)}
- Total: ${total.toFixed(2)} -
-
- ); -}; \ No newline at end of file diff --git a/evals/data/experiments/020-ts-react-form/outputs/base/turn-3.ts b/evals/data/experiments/020-ts-react-form/outputs/base/turn-3.ts deleted file mode 100644 index 50d7c13..0000000 --- a/evals/data/experiments/020-ts-react-form/outputs/base/turn-3.ts +++ /dev/null @@ -1,114 +0,0 @@ -import React, { useState } from 'react'; - -interface CartItem { - id: string; - name: string; - price: number; -} - -interface ShippingOption { - id: string; - name: string; - price: number; - estimated_days: number; - carrier: string; -} - -interface FormData { - email: string; - cardNumber: string; - expiry: string; - shippingMethod: string; -} - -interface ValidationErrors { - email?: string; - cardNumber?: string; - expiry?: string; -} - -const validateEmail = (email: string) => /^[^\s@]+@[^\s@]+\.[^\s@]+$/.test(email); - -const validateLuhn = (number: string) => { - const digits = number.replace(/\s+/g, '').split('').map(Number); - let sum = 0, shouldDouble = false; - for (let i = digits.length - 1; i >= 0; i--) { - let digit = digits[i]; - if (shouldDouble) { - digit *= 2; - if (digit > 9) digit -= 9; - } - sum += digit; - shouldDouble = !shouldDouble; - } - return sum % 10 === 0 && digits.length > 0; -}; - -const validateExpiry = (expiry: string) => { - const [mm, yy] = expiry.split('/').map(Number); - if (!mm || !yy || mm < 1 || mm > 12) return false; - const now = new Date(); - const currentMonth = now.getMonth() + 1; - const currentYear = parseInt(now.getFullYear().toString().slice(-2)); - return yy > currentYear || (yy === currentYear && mm >= currentMonth); -}; - -const styles: Record = { - container: { display: 'flex', gap: '40px', padding: '20px', maxWidth: '900px', margin: '0 auto' }, - form: { flex: 2 }, - sidebar: { flex: 1, backgroundColor: '#f9f9f9', padding: '20px', borderRadius: '8px' }, - inputGroup: { marginBottom: '15px', display: 'flex', flexDirection: 'column' }, - input: { padding: '8px', fontSize: '16px', borderRadius: '4px', border: '1px solid #ccc' }, - error: { color: 'red', fontSize: '12px', marginTop: '4px' } -}; - -export const CheckoutForm: React.FC<{ items: CartItem[], shipping: ShippingOption[] }> = ({ items, shipping }) => { - const [formData, setFormData] = useState({ email: '', cardNumber: '', expiry: '', shippingMethod: shipping[0].id }); - const [errors, setErrors] = useState({}); - const [promoInput, setPromoInput] = useState(''); - const [discount, setDiscount] = useState(0); - - const validateField = (name: keyof ValidationErrors, value: string) => { - let error = ''; - if (name === 'email' && !validateEmail(value)) error = 'Invalid email address'; - if (name === 'cardNumber' && !validateLuhn(value)) error = 'Invalid card number'; - if (name === 'expiry' && !validateExpiry(value)) error = 'Invalid MM/YY'; - setErrors(prev => ({ ...prev, [name]: error })); - }; - - const subtotal = items.reduce((acc, item) => acc + item.price, 0); - const ship = shipping.find(s => s.id === formData.shippingMethod); - const total = Math.max(0, subtotal + (ship?.price || 0) - discount); - - return ( -
-
-
- - validateField('email', e.target.value)} onChange={e => setFormData({...formData, email: e.target.value})} /> - {errors.email && {errors.email}} -
- -
- - validateField('cardNumber', e.target.value)} onChange={e => setFormData({...formData, cardNumber: e.target.value})} /> - {errors.cardNumber && {errors.cardNumber}} -
- -
- - validateField('expiry', e.target.value)} onChange={e => setFormData({...formData, expiry: e.target.value})} /> - {errors.expiry && {errors.expiry}} -
-
- -
-

Order Summary

-
Subtotal: ${subtotal.toFixed(2)}
- {discount > 0 &&
Discount: -${discount.toFixed(2)}
} -
Shipping: ${ship?.price.toFixed(2)}
- Total: ${total.toFixed(2)} -
-
- ); -}; \ No newline at end of file diff --git a/evals/data/experiments/021-js-utility-library/metrics.json b/evals/data/experiments/021-js-utility-library/metrics.json deleted file mode 100644 index 024a0d6..0000000 --- a/evals/data/experiments/021-js-utility-library/metrics.json +++ /dev/null @@ -1,228 +0,0 @@ -{ - "experiment_id": "021-js-utility-library", - "model": "", - "provider": "google", - "timestamp": "2026-04-03T06:59:52.995706+00:00", - "format": "text/javascript", - "base_turn0": { - "input_tokens": 137, - "output_tokens": 1682, - "latency_ms": 7087, - "artifact_bytes": 4621 - }, - "aap_turn0": { - "input_tokens": 476, - "output_tokens": 1668, - "latency_ms": 7465, - "artifact_bytes": 4703 - }, - "default_flow": { - "per_turn": [ - { - "turn": 1, - "edit": "Add three new string utilities: maskEmail (show first 2 chars + ***@domain), plu", - "input_tokens": 1850, - "output_tokens": 245, - "latency_ms": 1839, - "output_bytes": 779, - "failed": false, - "failure_reason": "" - }, - { - "turn": 2, - "edit": "Rewrite the date-utils section to add a formatDuration function that converts mi", - "input_tokens": 2130, - "output_tokens": 603, - "latency_ms": 7344, - "output_bytes": 1643, - "failed": false, - "failure_reason": "" - }, - { - "turn": 3, - "edit": "Add a new 'object-utils' section with functions: deepClone, deepMerge, pick, omi", - "input_tokens": 2759, - "output_tokens": 560, - "latency_ms": 3037, - "output_bytes": 1629, - "failed": false, - "failure_reason": "" - } - ], - "total_input_tokens": 6739, - "total_output_tokens": 1408, - "total_latency_ms": 12220 - }, - "aap_flow": { - "per_turn": [ - { - "turn": 1, - "edit": "Add three new string utilities: maskEmail (show first 2 chars + ***@domain), plu", - "input_tokens": 2805, - "output_tokens": 264, - "latency_ms": 1994, - "output_bytes": 5255, - "failed": false, - "failure_reason": "", - "envelope_parsed": true, - "apply_succeeded": true, - "envelope_name": "edit" - }, - { - "turn": 2, - "edit": "Rewrite the date-utils section to add a formatDuration function that converts mi", - "input_tokens": 2997, - "output_tokens": 255, - "latency_ms": 1641, - "output_bytes": 5767, - "failed": false, - "failure_reason": "", - "envelope_parsed": true, - "apply_succeeded": true, - "envelope_name": "edit" - }, - { - "turn": 3, - "edit": "Add a new 'object-utils' section with functions: deepClone, deepMerge, pick, omi", - "input_tokens": 3177, - "output_tokens": 465, - "latency_ms": 2553, - "output_bytes": 6947, - "failed": false, - "failure_reason": "", - "envelope_parsed": true, - "apply_succeeded": true, - "envelope_name": "edit" - } - ], - "total_input_tokens": 8979, - "total_output_tokens": 984, - "total_latency_ms": 6188, - "envelope_parse_rate": 1.0, - "apply_success_rate": 1.0 - }, - "comparison": { - "output_token_savings_pct": 30.1, - "input_token_savings_pct": -33.2, - "latency_savings_pct": 49.4 - }, - "token_table": { - "turns": [ - { - "turn": 0, - "base_input": 137, - "base_output": 1682, - "base_latency_ms": 7087, - "aap_input": 476, - "aap_output": 1668, - "aap_latency_ms": 7465 - }, - { - "turn": 1, - "base_input": 1850, - "base_output": 245, - "base_latency_ms": 1839, - "aap_input": 2805, - "aap_output": 264, - "aap_latency_ms": 1994, - "envelope_name": "edit", - "apply_ok": true - }, - { - "turn": 2, - "base_input": 2130, - "base_output": 603, - "base_latency_ms": 7344, - "aap_input": 2997, - "aap_output": 255, - "aap_latency_ms": 1641, - "envelope_name": "edit", - "apply_ok": true - }, - { - "turn": 3, - "base_input": 2759, - "base_output": 560, - "base_latency_ms": 3037, - "aap_input": 3177, - "aap_output": 465, - "aap_latency_ms": 2553, - "envelope_name": "edit", - "apply_ok": true - } - ], - "totals": { - "base_input": 6876, - "base_output": 3090, - "base_combined": 9966, - "aap_input": 9455, - "aap_output": 2652, - "aap_combined": 12107, - "base_latency_ms": 19307, - "aap_latency_ms": 13653, - "output_savings_pct": 14.2, - "input_delta_pct": 37.5, - "combined_savings_pct": -21.5, - "latency_savings_pct": 29.3 - } - }, - "quality": { - "per_turn": [ - { - "turn": 0, - "sequence_similarity": 0.166, - "token_f1": 0.3989, - "base_char_count": 4621, - "aap_char_count": 3439, - "char_delta_pct": -25.6, - "lines_added": 55, - "lines_removed": 133, - "rouge_l": null, - "bleu": null - }, - { - "turn": 1, - "sequence_similarity": 0.0155, - "token_f1": 0.1481, - "base_char_count": 779, - "aap_char_count": 3856, - "char_delta_pct": 395.0, - "lines_added": 68, - "lines_removed": 15, - "rouge_l": null, - "bleu": null - }, - { - "turn": 2, - "sequence_similarity": 0.0277, - "token_f1": 0.2228, - "base_char_count": 1643, - "aap_char_count": 4272, - "char_delta_pct": 160.0, - "lines_added": 70, - "lines_removed": 46, - "rouge_l": null, - "bleu": null - }, - { - "turn": 3, - "sequence_similarity": 0.0914, - "token_f1": 0.1971, - "base_char_count": 1629, - "aap_char_count": 5239, - "char_delta_pct": 221.6, - "lines_added": 84, - "lines_removed": 35, - "rouge_l": null, - "bleu": null - } - ], - "mean_sequence_similarity": 0.0751, - "mean_token_f1": 0.2417, - "mean_rouge_l": null, - "mean_bleu": null, - "judge_comparisons": null, - "mean_base_judge": null, - "mean_aap_judge": null - } -} diff --git a/evals/data/experiments/021-js-utility-library/outputs/aap/turn-0.txt b/evals/data/experiments/021-js-utility-library/outputs/aap/turn-0.txt deleted file mode 100644 index d7d7d0b..0000000 --- a/evals/data/experiments/021-js-utility-library/outputs/aap/turn-0.txt +++ /dev/null @@ -1,73 +0,0 @@ - -/** - * String Utilities - */ -const StringUtils = { - /** @param {string} str @returns {string} Slugified string */ - slugify: (str) => str.toLowerCase().replace(/[^a-z0-9]+/g, '-').replace(/(^-|-$)/g, ''), - /** @param {string} str @param {number} len @returns {string} Truncated string */ - truncate: (str, len) => str.length > len ? str.substring(0, len) + '...' : str, - /** @param {string} str @returns {string} Capitalized string */ - capitalize: (str) => str.charAt(0).toUpperCase() + str.slice(1), - /** @param {string} str @returns {string} CamelCased string */ - camelCase: (str) => str.replace(/[-_](\w)/g, (_, c) => c.toUpperCase()), - /** @param {string} str @returns {string} SnakeCased string */ - snakeCase: (str) => str.replace(/([A-Z])/g, '_$1').toLowerCase().replace(/^_/, ''), - /** @param {string} str @returns {string} Escaped HTML */ - escapeHtml: (str) => str.replace(/[&<>"']/g, (m) => ({'&': '&', '<': '<', '>': '>', '"': '"', "'": '''}[m])), - /** @returns {string} Random ID */ - generateId: () => Math.random().toString(36).substr(2, 9) -}; - -/** - * Date Utilities - */ -const DateUtils = { - /** @param {Date} date @param {string} format @returns {string} */ - formatDate: (date, fmt) => /* Implementation */, - /** @param {Date} date @returns {string} Relative time string */ - timeAgo: (date) => /* Implementation */, - /** @param {Date} date @param {number} days @returns {Date} */ - addDays: (date, days) => new Date(date.getTime() + days * 86400000), - /** @param {Date} date @returns {boolean} */ - isWeekend: (date) => [0, 6].includes(date.getDay()), - /** @param {Date} start @param {Date} end @returns {Date[]} */ - getDateRange: (start, end) => /* Implementation */, - /** @param {string} str @returns {Date} */ - parseRelativeDate: (str) => /* Implementation */ -}; - -/** - * Array Utilities - */ -const ArrayUtils = { - /** @param {Array} arr @param {number} size @returns {Array[]} */ - chunk: (arr, size) => Array.from({ length: Math.ceil(arr.length / size) }, (v, i) => arr.slice(i * size, i * size + size)), - /** @param {Array} arr @param {Function} key @returns {Object} */ - groupBy: (arr, key) => arr.reduce((acc, x) => ((acc[key(x)] = acc[key(x)] || []).push(x), acc), {}), - /** @param {Array} arr @param {Function} key @returns {Array} */ - uniqueBy: (arr, key) => [...new Map(arr.map(x => [key(x), x])).values()], - /** @param {Array} arr @param {Function} fn @returns {Array} */ - sortBy: (arr, fn) => [...arr].sort((a, b) => fn(a) - fn(b)), - /** @param {Array} arr @returns {Array} */ - flatten: (arr) => arr.reduce((acc, val) => Array.isArray(val) ? acc.concat(ArrayUtils.flatten(val)) : acc.concat(val), []), - /** @param {Array} a @param {Array} b @returns {Array} */ - intersection: (a, b) => a.filter(x => b.includes(x)), - /** @param {Array} a @param {Array} b @returns {Array} */ - difference: (a, b) => a.filter(x => !b.includes(x)), - /** @param {Array} arr @returns {Array} */ - shuffle: (arr) => [...arr].sort(() => Math.random() - 0.5) -}; - -/** - * Validation Utilities - */ -const ValidateUtils = { - isEmail: (str) => /^[^\s@]+@[^\s@]+\.[^\s@]+$/.test(str), - isURL: (str) => /^https?:\/\/\S+$/.test(str), - isStrongPassword: (str) => /^(?=.*[A-Z])(?=.*[0-9])(?=.{8,})/.test(str), - isPhoneNumber: (str) => /^\+?[\d\s-]{10,}$/.test(str), - isCreditCard: (str) => /^\d{16}$/.test(str.replace(/\s/g, '')), - isIPv4: (str) => /^(\d{1,3}\.){3}\d{1,3}$/.test(str) -}; - \ No newline at end of file diff --git a/evals/data/experiments/021-js-utility-library/outputs/aap/turn-1.json b/evals/data/experiments/021-js-utility-library/outputs/aap/turn-1.json deleted file mode 100644 index 3244797..0000000 --- a/evals/data/experiments/021-js-utility-library/outputs/aap/turn-1.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "utility-library", - "version": 1, - "name": "edit", - "meta": { - "format": "text/html", - "tokens_used": null, - "checksum": null, - "state": null - }, - "content": [ - { - "op": "insert_after", - "target": { - "type": "id", - "value": "generateid-impl" - }, - "content": ",\n /** @param {string} email @returns {string} Masked email */\n maskEmail: (email) => email.replace(/(^.{2})[^@]+@/, '$1***@'),\n /** @param {string} word @param {number} count @returns {string} Pluralized string */\n pluralize: (word, count) => count === 1 ? word : word + 's',\n /** @param {string} str @returns {number} Word count */\n wordCount: (str) => str.trim().split(/\\s+/).filter(w => w.length > 0).length" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/021-js-utility-library/outputs/aap/turn-1.txt b/evals/data/experiments/021-js-utility-library/outputs/aap/turn-1.txt deleted file mode 100644 index a28c58a..0000000 --- a/evals/data/experiments/021-js-utility-library/outputs/aap/turn-1.txt +++ /dev/null @@ -1,79 +0,0 @@ - -/** - * String Utilities - */ -const StringUtils = { - /** @param {string} str @returns {string} Slugified string */ - slugify: (str) => str.toLowerCase().replace(/[^a-z0-9]+/g, '-').replace(/(^-|-$)/g, ''), - /** @param {string} str @param {number} len @returns {string} Truncated string */ - truncate: (str, len) => str.length > len ? str.substring(0, len) + '...' : str, - /** @param {string} str @returns {string} Capitalized string */ - capitalize: (str) => str.charAt(0).toUpperCase() + str.slice(1), - /** @param {string} str @returns {string} CamelCased string */ - camelCase: (str) => str.replace(/[-_](\w)/g, (_, c) => c.toUpperCase()), - /** @param {string} str @returns {string} SnakeCased string */ - snakeCase: (str) => str.replace(/([A-Z])/g, '_$1').toLowerCase().replace(/^_/, ''), - /** @param {string} str @returns {string} Escaped HTML */ - escapeHtml: (str) => str.replace(/[&<>"']/g, (m) => ({'&': '&', '<': '<', '>': '>', '"': '"', "'": '''}[m])), - /** @returns {string} Random ID */ - generateId: () => Math.random().toString(36).substr(2, 9), - /** @param {string} email @returns {string} Masked email */ - maskEmail: (email) => email.replace(/(^.{2})[^@]+@/, '$1***@'), - /** @param {string} word @param {number} count @returns {string} Pluralized string */ - pluralize: (word, count) => count === 1 ? word : word + 's', - /** @param {string} str @returns {number} Word count */ - wordCount: (str) => str.trim().split(/\s+/).filter(w => w.length > 0).length -}; - -/** - * Date Utilities - */ -const DateUtils = { - /** @param {Date} date @param {string} format @returns {string} */ - formatDate: (date, fmt) => /* Implementation */, - /** @param {Date} date @returns {string} Relative time string */ - timeAgo: (date) => /* Implementation */, - /** @param {Date} date @param {number} days @returns {Date} */ - addDays: (date, days) => new Date(date.getTime() + days * 86400000), - /** @param {Date} date @returns {boolean} */ - isWeekend: (date) => [0, 6].includes(date.getDay()), - /** @param {Date} start @param {Date} end @returns {Date[]} */ - getDateRange: (start, end) => /* Implementation */, - /** @param {string} str @returns {Date} */ - parseRelativeDate: (str) => /* Implementation */ -}; - -/** - * Array Utilities - */ -const ArrayUtils = { - /** @param {Array} arr @param {number} size @returns {Array[]} */ - chunk: (arr, size) => Array.from({ length: Math.ceil(arr.length / size) }, (v, i) => arr.slice(i * size, i * size + size)), - /** @param {Array} arr @param {Function} key @returns {Object} */ - groupBy: (arr, key) => arr.reduce((acc, x) => ((acc[key(x)] = acc[key(x)] || []).push(x), acc), {}), - /** @param {Array} arr @param {Function} key @returns {Array} */ - uniqueBy: (arr, key) => [...new Map(arr.map(x => [key(x), x])).values()], - /** @param {Array} arr @param {Function} fn @returns {Array} */ - sortBy: (arr, fn) => [...arr].sort((a, b) => fn(a) - fn(b)), - /** @param {Array} arr @returns {Array} */ - flatten: (arr) => arr.reduce((acc, val) => Array.isArray(val) ? acc.concat(ArrayUtils.flatten(val)) : acc.concat(val), []), - /** @param {Array} a @param {Array} b @returns {Array} */ - intersection: (a, b) => a.filter(x => b.includes(x)), - /** @param {Array} a @param {Array} b @returns {Array} */ - difference: (a, b) => a.filter(x => !b.includes(x)), - /** @param {Array} arr @returns {Array} */ - shuffle: (arr) => [...arr].sort(() => Math.random() - 0.5) -}; - -/** - * Validation Utilities - */ -const ValidateUtils = { - isEmail: (str) => /^[^\s@]+@[^\s@]+\.[^\s@]+$/.test(str), - isURL: (str) => /^https?:\/\/\S+$/.test(str), - isStrongPassword: (str) => /^(?=.*[A-Z])(?=.*[0-9])(?=.{8,})/.test(str), - isPhoneNumber: (str) => /^\+?[\d\s-]{10,}$/.test(str), - isCreditCard: (str) => /^\d{16}$/.test(str.replace(/\s/g, '')), - isIPv4: (str) => /^(\d{1,3}\.){3}\d{1,3}$/.test(str) -}; - \ No newline at end of file diff --git a/evals/data/experiments/021-js-utility-library/outputs/aap/turn-2.json b/evals/data/experiments/021-js-utility-library/outputs/aap/turn-2.json deleted file mode 100644 index 3402f03..0000000 --- a/evals/data/experiments/021-js-utility-library/outputs/aap/turn-2.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "utility-library", - "version": 1, - "name": "edit", - "meta": { - "format": "text/html", - "tokens_used": null, - "checksum": null, - "state": null - }, - "content": [ - { - "op": "replace", - "target": { - "type": "id", - "value": "formatdate-impl" - }, - "content": " /** @param {Date} date @param {string} format @returns {string} */\n formatDate: (date, fmt) => /* Implementation */,\n /** @param {number} ms @returns {string} Human readable duration */\n formatDuration: (ms) => const h = Math.floor(ms / 3600000); const m = Math.floor((ms % 3600000) / 60000); const s = Math.floor((ms % 60000) / 1000); return `${h}h ${m}m ${s}s`;,\n /** @param {Date} date @returns {string} Relative time string */" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/021-js-utility-library/outputs/aap/turn-2.txt b/evals/data/experiments/021-js-utility-library/outputs/aap/turn-2.txt deleted file mode 100644 index 95be05d..0000000 --- a/evals/data/experiments/021-js-utility-library/outputs/aap/turn-2.txt +++ /dev/null @@ -1,83 +0,0 @@ - -/** - * String Utilities - */ -const StringUtils = { - /** @param {string} str @returns {string} Slugified string */ - slugify: (str) => str.toLowerCase().replace(/[^a-z0-9]+/g, '-').replace(/(^-|-$)/g, ''), - /** @param {string} str @param {number} len @returns {string} Truncated string */ - truncate: (str, len) => str.length > len ? str.substring(0, len) + '...' : str, - /** @param {string} str @returns {string} Capitalized string */ - capitalize: (str) => str.charAt(0).toUpperCase() + str.slice(1), - /** @param {string} str @returns {string} CamelCased string */ - camelCase: (str) => str.replace(/[-_](\w)/g, (_, c) => c.toUpperCase()), - /** @param {string} str @returns {string} SnakeCased string */ - snakeCase: (str) => str.replace(/([A-Z])/g, '_$1').toLowerCase().replace(/^_/, ''), - /** @param {string} str @returns {string} Escaped HTML */ - escapeHtml: (str) => str.replace(/[&<>"']/g, (m) => ({'&': '&', '<': '<', '>': '>', '"': '"', "'": '''}[m])), - /** @returns {string} Random ID */ - generateId: () => Math.random().toString(36).substr(2, 9), - /** @param {string} email @returns {string} Masked email */ - maskEmail: (email) => email.replace(/(^.{2})[^@]+@/, '$1***@'), - /** @param {string} word @param {number} count @returns {string} Pluralized string */ - pluralize: (word, count) => count === 1 ? word : word + 's', - /** @param {string} str @returns {number} Word count */ - wordCount: (str) => str.trim().split(/\s+/).filter(w => w.length > 0).length -}; - -/** - * Date Utilities - */ -const DateUtils = { - /** @param {Date} date @param {string} format @returns {string} */ - formatDate: (date, fmt) => /** @param {Date} date @param {string} format @returns {string} */ - formatDate: (date, fmt) => /* Implementation */, - /** @param {number} ms @returns {string} Human readable duration */ - formatDuration: (ms) => const h = Math.floor(ms / 3600000); const m = Math.floor((ms % 3600000) / 60000); const s = Math.floor((ms % 60000) / 1000); return `${h}h ${m}m ${s}s`;, - /** @param {Date} date @returns {string} Relative time string */, - /** @param {Date} date @returns {string} Relative time string */ - timeAgo: (date) => /* Implementation */, - /** @param {Date} date @param {number} days @returns {Date} */ - addDays: (date, days) => new Date(date.getTime() + days * 86400000), - /** @param {Date} date @returns {boolean} */ - isWeekend: (date) => [0, 6].includes(date.getDay()), - /** @param {Date} start @param {Date} end @returns {Date[]} */ - getDateRange: (start, end) => /* Implementation */, - /** @param {string} str @returns {Date} */ - parseRelativeDate: (str) => /* Implementation */ -}; - -/** - * Array Utilities - */ -const ArrayUtils = { - /** @param {Array} arr @param {number} size @returns {Array[]} */ - chunk: (arr, size) => Array.from({ length: Math.ceil(arr.length / size) }, (v, i) => arr.slice(i * size, i * size + size)), - /** @param {Array} arr @param {Function} key @returns {Object} */ - groupBy: (arr, key) => arr.reduce((acc, x) => ((acc[key(x)] = acc[key(x)] || []).push(x), acc), {}), - /** @param {Array} arr @param {Function} key @returns {Array} */ - uniqueBy: (arr, key) => [...new Map(arr.map(x => [key(x), x])).values()], - /** @param {Array} arr @param {Function} fn @returns {Array} */ - sortBy: (arr, fn) => [...arr].sort((a, b) => fn(a) - fn(b)), - /** @param {Array} arr @returns {Array} */ - flatten: (arr) => arr.reduce((acc, val) => Array.isArray(val) ? acc.concat(ArrayUtils.flatten(val)) : acc.concat(val), []), - /** @param {Array} a @param {Array} b @returns {Array} */ - intersection: (a, b) => a.filter(x => b.includes(x)), - /** @param {Array} a @param {Array} b @returns {Array} */ - difference: (a, b) => a.filter(x => !b.includes(x)), - /** @param {Array} arr @returns {Array} */ - shuffle: (arr) => [...arr].sort(() => Math.random() - 0.5) -}; - -/** - * Validation Utilities - */ -const ValidateUtils = { - isEmail: (str) => /^[^\s@]+@[^\s@]+\.[^\s@]+$/.test(str), - isURL: (str) => /^https?:\/\/\S+$/.test(str), - isStrongPassword: (str) => /^(?=.*[A-Z])(?=.*[0-9])(?=.{8,})/.test(str), - isPhoneNumber: (str) => /^\+?[\d\s-]{10,}$/.test(str), - isCreditCard: (str) => /^\d{16}$/.test(str.replace(/\s/g, '')), - isIPv4: (str) => /^(\d{1,3}\.){3}\d{1,3}$/.test(str) -}; - \ No newline at end of file diff --git a/evals/data/experiments/021-js-utility-library/outputs/aap/turn-3.json b/evals/data/experiments/021-js-utility-library/outputs/aap/turn-3.json deleted file mode 100644 index 11aee1e..0000000 --- a/evals/data/experiments/021-js-utility-library/outputs/aap/turn-3.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "protocol": "aap/0.1", - "id": "utility-library-id", - "version": 1, - "name": "edit", - "meta": { - "format": "text/html", - "tokens_used": null, - "checksum": null, - "state": null - }, - "content": [ - { - "op": "insert_after", - "target": { - "type": "id", - "value": "utility-library" - }, - "content": "\n/**\n * Object Utilities\n */\nconst ObjectUtils = {\n /** @param {Object} obj @returns {Object} */\n deepClone: (obj) => JSON.parse(JSON.stringify(obj)),\n /** @param {Object} target @param {Object} source @returns {Object} */\n deepMerge: (target, source) => Object.keys(source).forEach(key => { if (source[key] instanceof Object) Object.assign(source[key], ObjectUtils.deepMerge(target[key] || {}, source[key])); }); return Object.assign(target || {}, source);,\n /** @param {Object} obj @param {string[]} keys @returns {Object} */\n pick: (obj, keys) => keys.reduce((acc, key) => (key in obj ? { ...acc, [key]: obj[key] } : acc), {}),\n /** @param {Object} obj @param {string[]} keys @returns {Object} */\n omit: (obj, keys) => Object.keys(obj).filter(key => !keys.includes(key)).reduce((acc, key) => ({ ...acc, [key]: obj[key] }), {}),\n /** @param {Object} a @param {Object} b @returns {boolean} */\n isEqual: (a, b) => JSON.stringify(a) === JSON.stringify(b)\n};\n" - } - ] -} \ No newline at end of file diff --git a/evals/data/experiments/021-js-utility-library/outputs/aap/turn-3.txt b/evals/data/experiments/021-js-utility-library/outputs/aap/turn-3.txt deleted file mode 100644 index d220fa9..0000000 --- a/evals/data/experiments/021-js-utility-library/outputs/aap/turn-3.txt +++ /dev/null @@ -1,99 +0,0 @@ - -/** - * String Utilities - */ -const StringUtils = { - /** @param {string} str @returns {string} Slugified string */ - slugify: (str) => str.toLowerCase().replace(/[^a-z0-9]+/g, '-').replace(/(^-|-$)/g, '') -/** - * Object Utilities - */ -const ObjectUtils = { - /** @param {Object} obj @returns {Object} */ - deepClone: (obj) => JSON.parse(JSON.stringify(obj)), - /** @param {Object} target @param {Object} source @returns {Object} */ - deepMerge: (target, source) => Object.keys(source).forEach(key => { if (source[key] instanceof Object) Object.assign(source[key], ObjectUtils.deepMerge(target[key] || {}, source[key])); }); return Object.assign(target || {}, source);, - /** @param {Object} obj @param {string[]} keys @returns {Object} */ - pick: (obj, keys) => keys.reduce((acc, key) => (key in obj ? { ...acc, [key]: obj[key] } : acc), {}), - /** @param {Object} obj @param {string[]} keys @returns {Object} */ - omit: (obj, keys) => Object.keys(obj).filter(key => !keys.includes(key)).reduce((acc, key) => ({ ...acc, [key]: obj[key] }), {}), - /** @param {Object} a @param {Object} b @returns {boolean} */ - isEqual: (a, b) => JSON.stringify(a) === JSON.stringify(b) -}; -, - /** @param {string} str @param {number} len @returns {string} Truncated string */ - truncate: (str, len) => str.length > len ? str.substring(0, len) + '...' : str, - /** @param {string} str @returns {string} Capitalized string */ - capitalize: (str) => str.charAt(0).toUpperCase() + str.slice(1), - /** @param {string} str @returns {string} CamelCased string */ - camelCase: (str) => str.replace(/[-_](\w)/g, (_, c) => c.toUpperCase()), - /** @param {string} str @returns {string} SnakeCased string */ - snakeCase: (str) => str.replace(/([A-Z])/g, '_$1').toLowerCase().replace(/^_/, ''), - /** @param {string} str @returns {string} Escaped HTML */ - escapeHtml: (str) => str.replace(/[&<>"']/g, (m) => ({'&': '&', '<': '<', '>': '>', '"': '"', "'": '''}[m])), - /** @returns {string} Random ID */ - generateId: () => Math.random().toString(36).substr(2, 9), - /** @param {string} email @returns {string} Masked email */ - maskEmail: (email) => email.replace(/(^.{2})[^@]+@/, '$1***@'), - /** @param {string} word @param {number} count @returns {string} Pluralized string */ - pluralize: (word, count) => count === 1 ? word : word + 's', - /** @param {string} str @returns {number} Word count */ - wordCount: (str) => str.trim().split(/\s+/).filter(w => w.length > 0).length -}; - -/** - * Date Utilities - */ -const DateUtils = { - /** @param {Date} date @param {string} format @returns {string} */ - formatDate: (date, fmt) => /** @param {Date} date @param {string} format @returns {string} */ - formatDate: (date, fmt) => /* Implementation */, - /** @param {number} ms @returns {string} Human readable duration */ - formatDuration: (ms) => const h = Math.floor(ms / 3600000); const m = Math.floor((ms % 3600000) / 60000); const s = Math.floor((ms % 60000) / 1000); return `${h}h ${m}m ${s}s`;, - /** @param {Date} date @returns {string} Relative time string */, - /** @param {Date} date @returns {string} Relative time string */ - timeAgo: (date) => /* Implementation */, - /** @param {Date} date @param {number} days @returns {Date} */ - addDays: (date, days) => new Date(date.getTime() + days * 86400000), - /** @param {Date} date @returns {boolean} */ - isWeekend: (date) => [0, 6].includes(date.getDay()), - /** @param {Date} start @param {Date} end @returns {Date[]} */ - getDateRange: (start, end) => /* Implementation */, - /** @param {string} str @returns {Date} */ - parseRelativeDate: (str) => /* Implementation */ -}; - -/** - * Array Utilities - */ -const ArrayUtils = { - /** @param {Array} arr @param {number} size @returns {Array[]} */ - chunk: (arr, size) => Array.from({ length: Math.ceil(arr.length / size) }, (v, i) => arr.slice(i * size, i * size + size)), - /** @param {Array} arr @param {Function} key @returns {Object} */ - groupBy: (arr, key) => arr.reduce((acc, x) => ((acc[key(x)] = acc[key(x)] || []).push(x), acc), {}), - /** @param {Array} arr @param {Function} key @returns {Array} */ - uniqueBy: (arr, key) => [...new Map(arr.map(x => [key(x), x])).values()], - /** @param {Array} arr @param {Function} fn @returns {Array} */ - sortBy: (arr, fn) => [...arr].sort((a, b) => fn(a) - fn(b)), - /** @param {Array} arr @returns {Array} */ - flatten: (arr) => arr.reduce((acc, val) => Array.isArray(val) ? acc.concat(ArrayUtils.flatten(val)) : acc.concat(val), []), - /** @param {Array} a @param {Array} b @returns {Array} */ - intersection: (a, b) => a.filter(x => b.includes(x)), - /** @param {Array} a @param {Array} b @returns {Array} */ - difference: (a, b) => a.filter(x => !b.includes(x)), - /** @param {Array} arr @returns {Array} */ - shuffle: (arr) => [...arr].sort(() => Math.random() - 0.5) -}; - -/** - * Validation Utilities - */ -const ValidateUtils = { - isEmail: (str) => /^[^\s@]+@[^\s@]+\.[^\s@]+$/.test(str), - isURL: (str) => /^https?:\/\/\S+$/.test(str), - isStrongPassword: (str) => /^(?=.*[A-Z])(?=.*[0-9])(?=.{8,})/.test(str), - isPhoneNumber: (str) => /^\+?[\d\s-]{10,}$/.test(str), - isCreditCard: (str) => /^\d{16}$/.test(str.replace(/\s/g, '')), - isIPv4: (str) => /^(\d{1,3}\.){3}\d{1,3}$/.test(str) -}; - \ No newline at end of file diff --git a/evals/data/experiments/021-js-utility-library/outputs/base/turn-0.txt b/evals/data/experiments/021-js-utility-library/outputs/base/turn-0.txt deleted file mode 100644 index 9d126a6..0000000 --- a/evals/data/experiments/021-js-utility-library/outputs/base/turn-0.txt +++ /dev/null @@ -1,150 +0,0 @@ -/** - * String Utilities - */ - -/** - * Converts a string into a URL-friendly slug. - * @example slugify("Hello World!") // "hello-world" - */ -const slugify = (str) => str.toLowerCase().replace(/[^\w ]+/g, '').replace(/ +/g, '-'); - -/** - * Truncates a string to a specified length. - * @example truncate("Hello World", 5) // "Hello..." - */ -const truncate = (str, len) => str.length > len ? str.slice(0, len) + '...' : str; - -/** - * Capitalizes the first letter of a string. - * @example capitalize("hello") // "Hello" - */ -const capitalize = (str) => str.charAt(0).toUpperCase() + str.slice(1); - -/** - * Converts string to camelCase. - * @example camelCase("hello_world") // "helloWorld" - */ -const camelCase = (str) => str.replace(/([-_][a-z])/ig, ($1) => $1.toUpperCase().replace('-', '').replace('_', '')); - -/** - * Converts string to snake_case. - * @example snakeCase("helloWorld") // "hello_world" - */ -const snakeCase = (str) => str.replace(/[A-Z]/g, letter => `_${letter.toLowerCase()}`).replace(/^_/, ''); - -/** - * Escapes HTML characters. - * @example escapeHtml("