添加语义基准报告的 JSON 和 Markdown 文件，并更新基准页面以支持新报告的加载和展示

jiayuqi7813 · jiayuqi7813 · commit 9cd822ba7f71 · 2026-03-19T11:27:40.000+08:00
diff --git a/benchmark_reports/full_semantic_compare_v5/semantic_benchmark_summary.json b/benchmark_reports/full_semantic_compare_v5/semantic_benchmark_summary.json
@@ -0,0 +1,70 @@
+{
+  "model": "deepseek-chat",
+  "repeats": 1,
+  "overall_pairwise": {
+    "left_id": "zcp_client_to_native_zcp",
+    "right_id": "mcp_client_to_zcp_mcp_surface",
+    "left_avg_total_tokens": 8027.864864864865,
+    "right_avg_total_tokens": 30723.702702702703,
+    "right_minus_left": 22695.83783783784,
+    "right_div_left": 3.8271325215213228
+  },
+  "overall_summary": [
+    {
+      "backend_id": "zcp_client_to_native_zcp",
+      "answer_accuracy": 1.0,
+      "workbook_accuracy": 0.972972972972973,
+      "tool_compliance": 1.0,
+      "avg_total_tokens": 8027.864864864865,
+      "avg_turns": 2.081081081081081,
+      "avg_tool_calls": 1.0810810810810811
+    },
+    {
+      "backend_id": "mcp_client_to_zcp_mcp_surface",
+      "answer_accuracy": 0.972972972972973,
+      "workbook_accuracy": 0.918918918918919,
+      "tool_compliance": 0.7297297297297297,
+      "avg_total_tokens": 30723.702702702703,
+      "avg_turns": 3.945945945945946,
+      "avg_tool_calls": 3.0
+    }
+  ],
+  "tier_summary": [
+    {
+      "tier": "A",
+      "zcp_avg_total_tokens": 15979.375,
+      "mcp_avg_total_tokens": 17613.1875,
+      "ratio": 1.102245081550436,
+      "zcp_answer_accuracy": 1.0,
+      "zcp_workbook_accuracy": 0.9375,
+      "zcp_tool_compliance": 1.0
+    },
+    {
+      "tier": "B",
+      "zcp_avg_total_tokens": 1826.625,
+      "mcp_avg_total_tokens": 29239.375,
+      "ratio": 16.007322247314036,
+      "zcp_answer_accuracy": 1.0,
+      "zcp_workbook_accuracy": 1.0,
+      "zcp_tool_compliance": 1.0
+    },
+    {
+      "tier": "C",
+      "zcp_avg_total_tokens": 2091.1428571428573,
+      "mcp_avg_total_tokens": 72113.85714285714,
+      "ratio": 34.485380516464,
+      "zcp_answer_accuracy": 1.0,
+      "zcp_workbook_accuracy": 1.0,
+      "zcp_tool_compliance": 1.0
+    },
+    {
+      "tier": "D",
+      "zcp_avg_total_tokens": 2018.3333333333333,
+      "mcp_avg_total_tokens": 19375.666666666668,
+      "ratio": 9.599834847233693,
+      "zcp_answer_accuracy": 1.0,
+      "zcp_workbook_accuracy": 1.0,
+      "zcp_tool_compliance": 1.0
+    }
+  ]
+}
diff --git a/benchmark_reports/full_semantic_compare_v5/semantic_benchmark_summary.md b/benchmark_reports/full_semantic_compare_v5/semantic_benchmark_summary.md
@@ -0,0 +1,21 @@
+# Semantic Benchmark Summary v5
+
+Model: `deepseek-chat`  
+Repeats: `1`
+
+## Overall
+
+- `zcp_client_to_native_zcp`: `8027.9` average total tokens
+- `mcp_client_to_zcp_mcp_surface`: `30723.7` average total tokens
+- native ZCP advantage: `3.83x` fewer tokens (MCP surface ÷ native ZCP)
+
+## Tier Results
+
+| Tier | Native ZCP Avg Total | MCP Surface Avg Total | Ratio | Native Quality |
+| --- | ---: | ---: | ---: | --- |
+| `A` | 15979.4 | 17613.2 | `1.10x` | `100.0 / 93.8 / 100.0` |
+| `B` | 1826.6 | 29239.4 | `16.01x` | `100.0 / 100.0 / 100.0` |
+| `C` | 2091.1 | 72113.9 | `34.49x` | `100.0 / 100.0 / 100.0` |
+| `D` | 2018.3 | 19375.7 | `9.60x` | `100.0 / 100.0 / 100.0` |
+
+`Ratio` is MCP Surface ÷ Native ZCP average total tokens. `Native Quality` is `answer accuracy / workbook accuracy / tool compliance`, in percent.
diff --git a/docs/web/app/_components/benchmarks-page-view.tsx b/docs/web/app/_components/benchmarks-page-view.tsx
@@ -30,7 +30,42 @@ type BenchmarkReport = {
   cases: CaseRow[];
 };
 
-async function loadReport(): Promise<BenchmarkReport | null> {
+type SemanticOverallRow = { // one entry of "overall_summary" in the semantic benchmark summary JSON
+  backend_id: string; // backend identifier; also used as the React key of the table row
+  answer_accuracy: number; // 0-1 fraction; the page renders it through percent()
+  workbook_accuracy: number; // 0-1 fraction; the page renders it through percent()
+  tool_compliance: number; // 0-1 fraction; the page renders it through percent()
+  avg_total_tokens: number; // rendered with one decimal place
+  avg_turns: number; // rendered with one decimal place
+  avg_tool_calls: number; // rendered with one decimal place
+};
+
+type SemanticTierRow = { // one entry of "tier_summary" in the semantic benchmark summary JSON
+  tier: string; // tier label ("A".."D" in the v5 report); also used as the React key of the table row
+  zcp_avg_total_tokens: number; // rendered with one decimal place
+  mcp_avg_total_tokens: number; // rendered with one decimal place
+  ratio: number; // equals mcp_avg_total_tokens / zcp_avg_total_tokens in the v5 report data
+  zcp_answer_accuracy: number; // 0-1 fraction; first value of the combined quality cell
+  zcp_workbook_accuracy: number; // 0-1 fraction; second value of the combined quality cell
+  zcp_tool_compliance: number; // 0-1 fraction; third value of the combined quality cell
+};
+
+type SemanticReport = { // shape the page expects from semantic_benchmark_summary.json
+  model: string; // model name shown in the snapshot paragraph
+  repeats: number; // repeat count shown in the snapshot paragraph
+  overall_pairwise: { // head-to-head token comparison of two backends
+    left_id: string; // backend id of the left side
+    right_id: string; // backend id of the right side
+    left_avg_total_tokens: number;
+    right_avg_total_tokens: number;
+    right_minus_left: number; // right_avg_total_tokens - left_avg_total_tokens in the v5 report data
+    right_div_left: number; // right_avg_total_tokens / left_avg_total_tokens in the v5 report data
+  };
+  overall_summary: SemanticOverallRow[]; // one row per backend
+  tier_summary: SemanticTierRow[]; // one row per tier
+};
+
+async function loadCompactReport(): Promise<BenchmarkReport | null> {
   try {
     const reportPath = path.join(process.cwd(), "..", "..", "benchmark_reports", "zcp_mcp_tool_call_benchmark.json");
     const content = await readFile(reportPath, "utf-8");
@@ -40,13 +75,31 @@ async function loadReport(): Promise<BenchmarkReport | null> {
   }
 }
 
+async function loadSemanticReport(): Promise<SemanticReport | null> { // reads the v5 semantic summary JSON; resolves to null on any failure
+  try {
+    const reportPath = path.join(
+      process.cwd(), // path is resolved relative to the process working directory, two levels up
+      "..",
+      "..",
+      "benchmark_reports",
+      "full_semantic_compare_v5",
+      "semantic_benchmark_summary.json",
+    );
+    const content = await readFile(reportPath, "utf-8");
+    return JSON.parse(content) as SemanticReport; // NOTE(review): unchecked cast - the parsed JSON shape is not validated at runtime
+  } catch {
+    return null; // read and parse errors are swallowed; the page omits the semantic section when this is null
+  }
+}
+
 function percent(value: number): string {
   return `${(value * 100).toFixed(1)}%`;
 }
 
 export async function BenchmarksPageView({ locale }: { locale: Locale }) {
   const copy = benchmarksCopy[locale];
-  const report = await loadReport();
+  const compactReport = await loadCompactReport();
+  const semanticReport = await loadSemanticReport();
 
   return (
     <div className="site-shell article">
@@ -72,30 +125,96 @@ export async function BenchmarksPageView({ locale }: { locale: Locale }) {
           <p>{copy.summary}</p>
         </div>
         <p>{copy.description}</p>
+        <h2 id="snapshot">{copy.snapshotTitle}</h2>
+        <p>
+          {copy.modelLabel}: <code>{semanticReport?.model ?? compactReport?.model ?? "unknown"}</code>.{" "}
+          {copy.repeatsLabel}: <code>{semanticReport?.repeats ?? compactReport?.repeats ?? "unknown"}</code>.
+        </p>
 
-        {!report ? (
+        {semanticReport ? (
           <>
-            <h2 id="snapshot">{copy.missingTitle}</h2>
-            <p>{copy.missingBody}</p>
+            <h2 id="semantic">{copy.semanticTitle}</h2>
+            <h3>{copy.semanticHeadlineTitle}</h3>
+            <p>
+              <code>{semanticReport.overall_pairwise.left_id}</code> vs{" "}
+              <code>{semanticReport.overall_pairwise.right_id}</code>: {copy.ratioLabel}{" "}
+              <strong>{semanticReport.overall_pairwise.right_div_left.toFixed(2)}x</strong>. Token delta:{" "}
+              <code>{semanticReport.overall_pairwise.right_minus_left.toFixed(1)}</code>.
+            </p>
+            <p>
+              {copy.artifactLabel}:{" "}
+              <code>benchmark_reports/full_semantic_compare_v5/semantic_benchmark_summary.json</code>
+            </p>
+
+            <h3>{copy.semanticOverallTitle}</h3>
+            <table>
+              <thead>
+                <tr>
+                  {copy.semanticOverallHeaders.map((header) => (
+                    <th key={header}>{header}</th>
+                  ))}
+                </tr>
+              </thead>
+              <tbody>
+                {semanticReport.overall_summary.map((row) => (
+                  <tr key={row.backend_id}>
+                    <td>{row.backend_id}</td>
+                    <td>{percent(row.answer_accuracy)}</td>
+                    <td>{percent(row.workbook_accuracy)}</td>
+                    <td>{percent(row.tool_compliance)}</td>
+                    <td>{row.avg_total_tokens.toFixed(1)}</td>
+                    <td>{row.avg_turns.toFixed(1)}</td>
+                    <td>{row.avg_tool_calls.toFixed(1)}</td>
+                  </tr>
+                ))}
+              </tbody>
+            </table>
+
+            <h3>{copy.semanticTierTitle}</h3>
+            <table>
+              <thead>
+                <tr>
+                  {copy.semanticTierHeaders.map((header) => (
+                    <th key={header}>{header}</th>
+                  ))}
+                </tr>
+              </thead>
+              <tbody>
+                {semanticReport.tier_summary.map((row) => (
+                  <tr key={row.tier}>
+                    <td>{row.tier}</td>
+                    <td>{row.zcp_avg_total_tokens.toFixed(1)}</td>
+                    <td>{row.mcp_avg_total_tokens.toFixed(1)}</td>
+                    <td>{row.ratio.toFixed(2)}x</td>
+                    <td>
+                      {percent(row.zcp_answer_accuracy)} / {percent(row.zcp_workbook_accuracy)} /{" "}
+                      {percent(row.zcp_tool_compliance)}
+                    </td>
+                  </tr>
+                ))}
+              </tbody>
+            </table>
           </>
-        ) : (
+        ) : null}
+
+        {compactReport ? (
           <>
-            <h2 id="snapshot">{copy.snapshotTitle}</h2>
+            <h2 id="compact">{copy.compactTitle}</h2>
             <p>
-              {copy.modelLabel}: <code>{report.model}</code>. {copy.repeatsLabel}: <code>{report.repeats}</code>.
+              {copy.artifactLabel}: <code>benchmark_reports/zcp_mcp_tool_call_benchmark.json</code>
             </p>
 
-            <h2 id="summary">{copy.summaryTitle}</h2>
+            <h3>{copy.compactSummaryTitle}</h3>
             <table>
               <thead>
                 <tr>
-                  {copy.summaryHeaders.map((header) => (
+                  {copy.compactSummaryHeaders.map((header) => (
                     <th key={header}>{header}</th>
                   ))}
                 </tr>
               </thead>
               <tbody>
-                {report.summary.map((row) => (
+                {compactReport.summary.map((row) => (
                   <tr key={row.protocol}>
                     <td>{row.protocol}</td>
                     <td>{row.runs}</td>
@@ -109,17 +228,17 @@ export async function BenchmarksPageView({ locale }: { locale: Locale }) {
               </tbody>
             </table>
 
-            <h2 id="cases">{copy.casesTitle}</h2>
+            <h3>{copy.compactCasesTitle}</h3>
             <table>
               <thead>
                 <tr>
-                  {copy.caseHeaders.map((header) => (
+                  {copy.compactCaseHeaders.map((header) => (
                     <th key={header}>{header}</th>
                   ))}
                 </tr>
               </thead>
               <tbody>
-                {report.cases.map((row) => (
+                {compactReport.cases.map((row) => (
                   <tr key={row.case_id}>
                     <td>{row.case_id}</td>
                     <td>{row.zcp_avg_total_tokens.toFixed(1)}</td>
@@ -131,7 +250,7 @@ export async function BenchmarksPageView({ locale }: { locale: Locale }) {
               </tbody>
             </table>
           </>
-        )}
+        ) : null}
       </main>
     </div>
   );
diff --git a/docs/web/app/lib/site-copy.ts b/docs/web/app/lib/site-copy.ts