Skip to content

Commit 9cd822b

Browse files
committed
添加语义基准报告的 JSON 和 Markdown 文件,并更新基准页面以支持新报告的加载和展示
1 parent be6b6d2 commit 9cd822b

4 files changed

Lines changed: 272 additions & 43 deletions

File tree

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
{
2+
"model": "deepseek-chat",
3+
"repeats": 1,
4+
"overall_pairwise": {
5+
"left_id": "zcp_client_to_native_zcp",
6+
"right_id": "mcp_client_to_zcp_mcp_surface",
7+
"left_avg_total_tokens": 8027.864864864865,
8+
"right_avg_total_tokens": 30723.702702702703,
9+
"right_minus_left": 22695.83783783784,
10+
"right_div_left": 3.8271325215213228
11+
},
12+
"overall_summary": [
13+
{
14+
"backend_id": "zcp_client_to_native_zcp",
15+
"answer_accuracy": 1.0,
16+
"workbook_accuracy": 0.972972972972973,
17+
"tool_compliance": 1.0,
18+
"avg_total_tokens": 8027.864864864865,
19+
"avg_turns": 2.081081081081081,
20+
"avg_tool_calls": 1.0810810810810811
21+
},
22+
{
23+
"backend_id": "mcp_client_to_zcp_mcp_surface",
24+
"answer_accuracy": 0.972972972972973,
25+
"workbook_accuracy": 0.918918918918919,
26+
"tool_compliance": 0.7297297297297297,
27+
"avg_total_tokens": 30723.702702702703,
28+
"avg_turns": 3.945945945945946,
29+
"avg_tool_calls": 3.0
30+
}
31+
],
32+
"tier_summary": [
33+
{
34+
"tier": "A",
35+
"zcp_avg_total_tokens": 15979.375,
36+
"mcp_avg_total_tokens": 17613.1875,
37+
"ratio": 1.102245081550436,
38+
"zcp_answer_accuracy": 1.0,
39+
"zcp_workbook_accuracy": 0.9375,
40+
"zcp_tool_compliance": 1.0
41+
},
42+
{
43+
"tier": "B",
44+
"zcp_avg_total_tokens": 1826.625,
45+
"mcp_avg_total_tokens": 29239.375,
46+
"ratio": 16.007322247314036,
47+
"zcp_answer_accuracy": 1.0,
48+
"zcp_workbook_accuracy": 1.0,
49+
"zcp_tool_compliance": 1.0
50+
},
51+
{
52+
"tier": "C",
53+
"zcp_avg_total_tokens": 2091.1428571428573,
54+
"mcp_avg_total_tokens": 72113.85714285714,
55+
"ratio": 34.485380516464,
56+
"zcp_answer_accuracy": 1.0,
57+
"zcp_workbook_accuracy": 1.0,
58+
"zcp_tool_compliance": 1.0
59+
},
60+
{
61+
"tier": "D",
62+
"zcp_avg_total_tokens": 2018.3333333333333,
63+
"mcp_avg_total_tokens": 19375.666666666668,
64+
"ratio": 9.599834847233693,
65+
"zcp_answer_accuracy": 1.0,
66+
"zcp_workbook_accuracy": 1.0,
67+
"zcp_tool_compliance": 1.0
68+
}
69+
]
70+
}
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# Semantic Benchmark Summary v5
2+
3+
Model: `deepseek-chat`
4+
Repeats: `1`
5+
6+
## Overall
7+
8+
- `zcp_client_to_native_zcp`: `8027.9`
9+
- `mcp_client_to_zcp_mcp_surface`: `30723.7`
10+
- native ZCP advantage: `3.83x` (MCP surface average total tokens divided by native ZCP average total tokens)
11+
12+
## Tier Results
13+
14+
| Tier | Native ZCP Avg Total | MCP Surface Avg Total | Ratio | Native Quality |
15+
| --- | ---: | ---: | ---: | --- |
16+
| `A` | 15979.4 | 17613.2 | `1.10x` | `100.0 / 93.8 / 100.0` |
17+
| `B` | 1826.6 | 29239.4 | `16.01x` | `100.0 / 100.0 / 100.0` |
18+
| `C` | 2091.1 | 72113.9 | `34.49x` | `100.0 / 100.0 / 100.0` |
19+
| `D` | 2018.3 | 19375.7 | `9.60x` | `100.0 / 100.0 / 100.0` |
20+
21+
`Native Quality` is `answer accuracy / workbook accuracy / tool compliance` for native ZCP, each shown as a percentage.

docs/web/app/_components/benchmarks-page-view.tsx

Lines changed: 134 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,42 @@ type BenchmarkReport = {
3030
cases: CaseRow[];
3131
};
3232

33-
async function loadReport(): Promise<BenchmarkReport | null> {
33+
/**
 * One entry of `SemanticReport.overall_summary`: aggregate results for a single backend.
 *
 * The benchmarks page uses `backend_id` as both the row key and the row label,
 * formats `answer_accuracy`, `workbook_accuracy` and `tool_compliance` with
 * `percent()` (so they are treated as fractions, e.g. `1.0` renders as `100.0%`),
 * and prints the three `avg_*` fields with one decimal place.
 */
type SemanticOverallRow = {
34+
backend_id: string;
35+
answer_accuracy: number;
36+
workbook_accuracy: number;
37+
tool_compliance: number;
38+
avg_total_tokens: number;
39+
avg_turns: number;
40+
avg_tool_calls: number;
41+
};
42+
43+
/**
 * One entry of `SemanticReport.tier_summary`: per-tier average token totals for
 * the `zcp_*` and `mcp_*` sides, plus quality scores for the `zcp_*` side only.
 *
 * The page uses `tier` as the row key, prints both token averages with one
 * decimal place, renders `ratio` with two decimals and an `x` suffix, and
 * formats the three `zcp_*` accuracy/compliance fields with `percent()`.
 *
 * NOTE(review): `ratio` is presumably `mcp_avg_total_tokens / zcp_avg_total_tokens`
 * — confirm against the report generator.
 */
type SemanticTierRow = {
44+
tier: string;
45+
zcp_avg_total_tokens: number;
46+
mcp_avg_total_tokens: number;
47+
ratio: number;
48+
zcp_answer_accuracy: number;
49+
zcp_workbook_accuracy: number;
50+
zcp_tool_compliance: number;
51+
};
52+
53+
/**
 * Shape that `loadSemanticReport()` returns for `semantic_benchmark_summary.json`.
 *
 * - `model` / `repeats`: shown in the snapshot paragraph of the benchmarks page.
 * - `overall_pairwise`: head-to-head token comparison of the two backends named by
 *   `left_id` and `right_id`; the page shows `right_div_left` as an `x` ratio and
 *   `right_minus_left` as the token delta.
 * - `overall_summary`: rows of the per-backend summary table.
 * - `tier_summary`: rows of the per-tier comparison table.
 */
type SemanticReport = {
54+
model: string;
55+
repeats: number;
56+
overall_pairwise: {
57+
left_id: string;
58+
right_id: string;
59+
left_avg_total_tokens: number;
60+
right_avg_total_tokens: number;
61+
right_minus_left: number;
62+
right_div_left: number;
63+
};
64+
overall_summary: SemanticOverallRow[];
65+
tier_summary: SemanticTierRow[];
66+
};
67+
68+
async function loadCompactReport(): Promise<BenchmarkReport | null> {
3469
try {
3570
const reportPath = path.join(process.cwd(), "..", "..", "benchmark_reports", "zcp_mcp_tool_call_benchmark.json");
3671
const content = await readFile(reportPath, "utf-8");
@@ -40,13 +75,31 @@ async function loadReport(): Promise<BenchmarkReport | null> {
4075
}
4176
}
4277

78+
/** Returns true when `value` is a non-null object that is not an array. */
function isJsonObject(value: unknown): value is Record<string, unknown> {
  return typeof value === "object" && value !== null && !Array.isArray(value);
}

/** Returns true when every key in `keys` holds a value whose `typeof` is `kind`. */
function hasTypedFields(
  record: Record<string, unknown>,
  kind: "string" | "number",
  keys: readonly string[],
): boolean {
  return keys.every((key) => typeof record[key] === kind);
}

/**
 * Runtime guard for the semantic benchmark summary.
 *
 * Checks every field declared on `SemanticReport` so the page can call
 * `.toFixed()` / `.map()` on the result without re-checking.
 */
function isSemanticReport(value: unknown): value is SemanticReport {
  if (!isJsonObject(value)) {
    return false;
  }

  const {
    model,
    repeats,
    overall_pairwise: pairwise,
    overall_summary: overallRows,
    tier_summary: tierRows,
  } = value;

  if (typeof model !== "string" || typeof repeats !== "number") {
    return false;
  }

  if (
    !isJsonObject(pairwise) ||
    !hasTypedFields(pairwise, "string", ["left_id", "right_id"]) ||
    !hasTypedFields(pairwise, "number", [
      "left_avg_total_tokens",
      "right_avg_total_tokens",
      "right_minus_left",
      "right_div_left",
    ])
  ) {
    return false;
  }

  if (!Array.isArray(overallRows) || !Array.isArray(tierRows)) {
    return false;
  }

  const overallOk = overallRows.every(
    (row: unknown) =>
      isJsonObject(row) &&
      hasTypedFields(row, "string", ["backend_id"]) &&
      hasTypedFields(row, "number", [
        "answer_accuracy",
        "workbook_accuracy",
        "tool_compliance",
        "avg_total_tokens",
        "avg_turns",
        "avg_tool_calls",
      ]),
  );

  const tiersOk = tierRows.every(
    (row: unknown) =>
      isJsonObject(row) &&
      hasTypedFields(row, "string", ["tier"]) &&
      hasTypedFields(row, "number", [
        "zcp_avg_total_tokens",
        "mcp_avg_total_tokens",
        "ratio",
        "zcp_answer_accuracy",
        "zcp_workbook_accuracy",
        "zcp_tool_compliance",
      ]),
  );

  return overallOk && tiersOk;
}

/**
 * Loads the semantic benchmark summary from
 * `<cwd>/../../benchmark_reports/full_semantic_compare_v5/semantic_benchmark_summary.json`.
 *
 * @returns The parsed report, or `null` when the file cannot be read, is not
 * valid JSON, or does not have the `SemanticReport` shape. Returning `null`
 * for a shape mismatch keeps the page from throwing while rendering a
 * structurally different file.
 */
async function loadSemanticReport(): Promise<SemanticReport | null> {
  try {
    const reportPath = path.join(
      process.cwd(),
      "..",
      "..",
      "benchmark_reports",
      "full_semantic_compare_v5",
      "semantic_benchmark_summary.json",
    );
    const content = await readFile(reportPath, "utf-8");
    const parsed: unknown = JSON.parse(content);
    return isSemanticReport(parsed) ? parsed : null;
  } catch {
    // Best-effort by design: a missing or unreadable report just hides the section.
    return null;
  }
}
94+
4395
/**
 * Formats a fraction as a percentage string with one decimal place.
 *
 * @param value - Fraction to format; `1` renders as `"100.0%"`.
 * @returns `value` scaled by 100, fixed to one decimal place, followed by `%`.
 */
function percent(value: number): string {
  const scaled = value * 100;
  return scaled.toFixed(1) + "%";
}
4698

4799
export async function BenchmarksPageView({ locale }: { locale: Locale }) {
48100
const copy = benchmarksCopy[locale];
49-
const report = await loadReport();
101+
const compactReport = await loadCompactReport();
102+
const semanticReport = await loadSemanticReport();
50103

51104
return (
52105
<div className="site-shell article">
@@ -72,30 +125,96 @@ export async function BenchmarksPageView({ locale }: { locale: Locale }) {
72125
<p>{copy.summary}</p>
73126
</div>
74127
<p>{copy.description}</p>
128+
<h2 id="snapshot">{copy.snapshotTitle}</h2>
129+
<p>
130+
{copy.modelLabel}: <code>{semanticReport?.model ?? compactReport?.model ?? "unknown"}</code>.{" "}
131+
{copy.repeatsLabel}: <code>{semanticReport?.repeats ?? compactReport?.repeats ?? "unknown"}</code>.
132+
</p>
75133

76-
{!report ? (
134+
{semanticReport ? (
77135
<>
78-
<h2 id="snapshot">{copy.missingTitle}</h2>
79-
<p>{copy.missingBody}</p>
136+
<h2 id="semantic">{copy.semanticTitle}</h2>
137+
<h3>{copy.semanticHeadlineTitle}</h3>
138+
<p>
139+
<code>{semanticReport.overall_pairwise.left_id}</code> vs{" "}
140+
<code>{semanticReport.overall_pairwise.right_id}</code>: {copy.ratioLabel}{" "}
141+
<strong>{semanticReport.overall_pairwise.right_div_left.toFixed(2)}x</strong>. Token delta:{" "}
142+
<code>{semanticReport.overall_pairwise.right_minus_left.toFixed(1)}</code>.
143+
</p>
144+
<p>
145+
{copy.artifactLabel}:{" "}
146+
<code>benchmark_reports/full_semantic_compare_v5/semantic_benchmark_summary.json</code>
147+
</p>
148+
149+
<h3>{copy.semanticOverallTitle}</h3>
150+
<table>
151+
<thead>
152+
<tr>
153+
{copy.semanticOverallHeaders.map((header) => (
154+
<th key={header}>{header}</th>
155+
))}
156+
</tr>
157+
</thead>
158+
<tbody>
159+
{semanticReport.overall_summary.map((row) => (
160+
<tr key={row.backend_id}>
161+
<td>{row.backend_id}</td>
162+
<td>{percent(row.answer_accuracy)}</td>
163+
<td>{percent(row.workbook_accuracy)}</td>
164+
<td>{percent(row.tool_compliance)}</td>
165+
<td>{row.avg_total_tokens.toFixed(1)}</td>
166+
<td>{row.avg_turns.toFixed(1)}</td>
167+
<td>{row.avg_tool_calls.toFixed(1)}</td>
168+
</tr>
169+
))}
170+
</tbody>
171+
</table>
172+
173+
<h3>{copy.semanticTierTitle}</h3>
174+
<table>
175+
<thead>
176+
<tr>
177+
{copy.semanticTierHeaders.map((header) => (
178+
<th key={header}>{header}</th>
179+
))}
180+
</tr>
181+
</thead>
182+
<tbody>
183+
{semanticReport.tier_summary.map((row) => (
184+
<tr key={row.tier}>
185+
<td>{row.tier}</td>
186+
<td>{row.zcp_avg_total_tokens.toFixed(1)}</td>
187+
<td>{row.mcp_avg_total_tokens.toFixed(1)}</td>
188+
<td>{row.ratio.toFixed(2)}x</td>
189+
<td>
190+
{percent(row.zcp_answer_accuracy)} / {percent(row.zcp_workbook_accuracy)} /{" "}
191+
{percent(row.zcp_tool_compliance)}
192+
</td>
193+
</tr>
194+
))}
195+
</tbody>
196+
</table>
80197
</>
81-
) : (
198+
) : null}
199+
200+
{compactReport ? (
82201
<>
83-
<h2 id="snapshot">{copy.snapshotTitle}</h2>
202+
<h2 id="compact">{copy.compactTitle}</h2>
84203
<p>
85-
{copy.modelLabel}: <code>{report.model}</code>. {copy.repeatsLabel}: <code>{report.repeats}</code>.
204+
{copy.artifactLabel}: <code>benchmark_reports/zcp_mcp_tool_call_benchmark.json</code>
86205
</p>
87206

88-
<h2 id="summary">{copy.summaryTitle}</h2>
207+
<h3>{copy.compactSummaryTitle}</h3>
89208
<table>
90209
<thead>
91210
<tr>
92-
{copy.summaryHeaders.map((header) => (
211+
{copy.compactSummaryHeaders.map((header) => (
93212
<th key={header}>{header}</th>
94213
))}
95214
</tr>
96215
</thead>
97216
<tbody>
98-
{report.summary.map((row) => (
217+
{compactReport.summary.map((row) => (
99218
<tr key={row.protocol}>
100219
<td>{row.protocol}</td>
101220
<td>{row.runs}</td>
@@ -109,17 +228,17 @@ export async function BenchmarksPageView({ locale }: { locale: Locale }) {
109228
</tbody>
110229
</table>
111230

112-
<h2 id="cases">{copy.casesTitle}</h2>
231+
<h3>{copy.compactCasesTitle}</h3>
113232
<table>
114233
<thead>
115234
<tr>
116-
{copy.caseHeaders.map((header) => (
235+
{copy.compactCaseHeaders.map((header) => (
117236
<th key={header}>{header}</th>
118237
))}
119238
</tr>
120239
</thead>
121240
<tbody>
122-
{report.cases.map((row) => (
241+
{compactReport.cases.map((row) => (
123242
<tr key={row.case_id}>
124243
<td>{row.case_id}</td>
125244
<td>{row.zcp_avg_total_tokens.toFixed(1)}</td>
@@ -131,7 +250,7 @@ export async function BenchmarksPageView({ locale }: { locale: Locale }) {
131250
</tbody>
132251
</table>
133252
</>
134-
)}
253+
) : null}
135254
</main>
136255
</div>
137256
);

0 commit comments

Comments
 (0)