@@ -30,7 +30,42 @@ type BenchmarkReport = {
3030 cases : CaseRow [ ] ;
3131} ;
3232
33- async function loadReport ( ) : Promise < BenchmarkReport | null > {
/**
 * One entry of the semantic report's `overall_summary` array.
 *
 * Each entry is rendered as a single table row keyed by `backend_id`.
 */
type SemanticOverallRow = {
  /** Backend identifier; shown in the first column and used as the row key. */
  backend_id: string;
  /** Rendered through `percent()` — presumably a 0–1 fraction (TODO confirm). */
  answer_accuracy: number;
  /** Rendered through `percent()` — presumably a 0–1 fraction (TODO confirm). */
  workbook_accuracy: number;
  /** Rendered through `percent()` — presumably a 0–1 fraction (TODO confirm). */
  tool_compliance: number;
  /** Displayed with one decimal place. */
  avg_total_tokens: number;
  /** Displayed with one decimal place. */
  avg_turns: number;
  /** Displayed with one decimal place. */
  avg_tool_calls: number;
};
42+
/**
 * One entry of the semantic report's `tier_summary` array.
 *
 * Each entry is rendered as a single table row keyed by `tier`.
 */
type SemanticTierRow = {
  /** Tier label; shown in the first column and used as the row key. */
  tier: string;
  /** Displayed with one decimal place. */
  zcp_avg_total_tokens: number;
  /** Displayed with one decimal place. */
  mcp_avg_total_tokens: number;
  /**
   * Displayed with two decimals and an `x` suffix.
   * NOTE(review): presumably a ratio of the two token averages above — confirm
   * the direction against the report generator.
   */
  ratio: number;
  /** Rendered through `percent()` — presumably a 0–1 fraction (TODO confirm). */
  zcp_answer_accuracy: number;
  /** Rendered through `percent()` — presumably a 0–1 fraction (TODO confirm). */
  zcp_workbook_accuracy: number;
  /** Rendered through `percent()` — presumably a 0–1 fraction (TODO confirm). */
  zcp_tool_compliance: number;
};
52+
/**
 * Shape of the semantic benchmark summary JSON consumed by the benchmarks page.
 */
type SemanticReport = {
  /** Model name; shown in the snapshot paragraph. */
  model: string;
  /** Repeat count; shown in the snapshot paragraph. */
  repeats: number;
  /** Headline comparison between two backends ("left" vs "right"). */
  overall_pairwise: {
    /** Identifier of the left-hand backend. */
    left_id: string;
    /** Identifier of the right-hand backend. */
    right_id: string;
    left_avg_total_tokens: number;
    right_avg_total_tokens: number;
    /**
     * Shown as the "Token delta" with one decimal place.
     * NOTE(review): by name, right average minus left average — confirm.
     */
    right_minus_left: number;
    /**
     * Shown as the headline ratio with two decimals and an `x` suffix.
     * NOTE(review): by name, right average divided by left average — confirm.
     */
    right_div_left: number;
  };
  /** Rows of the overall comparison table. */
  overall_summary: SemanticOverallRow[];
  /** Rows of the per-tier comparison table. */
  tier_summary: SemanticTierRow[];
};
67+
68+ async function loadCompactReport ( ) : Promise < BenchmarkReport | null > {
3469 try {
3570 const reportPath = path . join ( process . cwd ( ) , ".." , ".." , "benchmark_reports" , "zcp_mcp_tool_call_benchmark.json" ) ;
3671 const content = await readFile ( reportPath , "utf-8" ) ;
@@ -40,13 +75,31 @@ async function loadReport(): Promise<BenchmarkReport | null> {
4075 }
4176}
4277
/**
 * Checks that `value` is a non-null object whose listed keys hold values of
 * the expected primitive type.
 */
function hasSemanticFields(
  value: unknown,
  stringKeys: readonly string[],
  numberKeys: readonly string[],
): boolean {
  if (typeof value !== "object" || value === null) {
    return false;
  }
  const record = value as Record<string, unknown>;
  return (
    stringKeys.every((key) => typeof record[key] === "string") &&
    numberKeys.every((key) => typeof record[key] === "number")
  );
}

/**
 * Runtime guard for the semantic report JSON: verifies every field declared
 * on `SemanticReport`, including each row of the two summary arrays.
 */
function isSemanticReport(value: unknown): value is SemanticReport {
  if (!hasSemanticFields(value, ["model"], ["repeats"])) {
    return false;
  }
  const report = value as Record<string, unknown>;
  const pairwiseOk = hasSemanticFields(
    report.overall_pairwise,
    ["left_id", "right_id"],
    ["left_avg_total_tokens", "right_avg_total_tokens", "right_minus_left", "right_div_left"],
  );
  const overallOk =
    Array.isArray(report.overall_summary) &&
    report.overall_summary.every((row) =>
      hasSemanticFields(
        row,
        ["backend_id"],
        [
          "answer_accuracy",
          "workbook_accuracy",
          "tool_compliance",
          "avg_total_tokens",
          "avg_turns",
          "avg_tool_calls",
        ],
      ),
    );
  const tiersOk =
    Array.isArray(report.tier_summary) &&
    report.tier_summary.every((row) =>
      hasSemanticFields(
        row,
        ["tier"],
        [
          "zcp_avg_total_tokens",
          "mcp_avg_total_tokens",
          "ratio",
          "zcp_answer_accuracy",
          "zcp_workbook_accuracy",
          "zcp_tool_compliance",
        ],
      ),
    );
  return pairwiseOk && overallOk && tiersOk;
}

/**
 * Loads the semantic benchmark summary from disk.
 *
 * Reads `benchmark_reports/full_semantic_compare_v5/semantic_benchmark_summary.json`,
 * resolved two directories above `process.cwd()`, and parses it as JSON.
 *
 * @returns The parsed report, or `null` when the file cannot be read, is not
 *   valid JSON, or does not match the `SemanticReport` shape.
 */
async function loadSemanticReport(): Promise<SemanticReport | null> {
  try {
    const reportPath = path.join(
      process.cwd(),
      "..",
      "..",
      "benchmark_reports",
      "full_semantic_compare_v5",
      "semantic_benchmark_summary.json",
    );
    const content = await readFile(reportPath, "utf-8");
    // Parse to `unknown` and validate: the page calls `.toFixed()` on nested
    // numeric fields, so a malformed artifact would otherwise throw while rendering.
    const parsed: unknown = JSON.parse(content);
    return isSemanticReport(parsed) ? parsed : null;
  } catch {
    // Deliberate best effort: a missing or unreadable artifact yields `null`
    // so the caller can omit the section instead of failing.
    return null;
  }
}
94+
/**
 * Formats a fraction as a percentage string with one decimal place.
 *
 * @param value - Number to format; it is multiplied by 100 before rendering.
 * @returns The scaled value in fixed-point notation followed by `%`.
 */
function percent(value: number): string {
  const scaled = value * 100;
  return scaled.toFixed(1) + "%";
}
4698
4799export async function BenchmarksPageView ( { locale } : { locale : Locale } ) {
48100 const copy = benchmarksCopy [ locale ] ;
49- const report = await loadReport ( ) ;
101+ const compactReport = await loadCompactReport ( ) ;
102+ const semanticReport = await loadSemanticReport ( ) ;
50103
51104 return (
52105 < div className = "site-shell article" >
@@ -72,30 +125,96 @@ export async function BenchmarksPageView({ locale }: { locale: Locale }) {
72125 < p > { copy . summary } </ p >
73126 </ div >
74127 < p > { copy . description } </ p >
128+ < h2 id = "snapshot" > { copy . snapshotTitle } </ h2 >
129+ < p >
130+ { copy . modelLabel } : < code > { semanticReport ?. model ?? compactReport ?. model ?? "unknown" } </ code > .{ " " }
131+ { copy . repeatsLabel } : < code > { semanticReport ?. repeats ?? compactReport ?. repeats ?? "unknown" } </ code > .
132+ </ p >
75133
76- { ! report ? (
134+ { semanticReport ? (
77135 < >
78- < h2 id = "snapshot" > { copy . missingTitle } </ h2 >
79- < p > { copy . missingBody } </ p >
136+ < h2 id = "semantic" > { copy . semanticTitle } </ h2 >
137+ < h3 > { copy . semanticHeadlineTitle } </ h3 >
138+ < p >
139+ < code > { semanticReport . overall_pairwise . left_id } </ code > vs{ " " }
140+ < code > { semanticReport . overall_pairwise . right_id } </ code > : { copy . ratioLabel } { " " }
141+ < strong > { semanticReport . overall_pairwise . right_div_left . toFixed ( 2 ) } x</ strong > . Token delta:{ " " }
142+ < code > { semanticReport . overall_pairwise . right_minus_left . toFixed ( 1 ) } </ code > .
143+ </ p >
144+ < p >
145+ { copy . artifactLabel } :{ " " }
146+ < code > benchmark_reports/full_semantic_compare_v5/semantic_benchmark_summary.json</ code >
147+ </ p >
148+
149+ < h3 > { copy . semanticOverallTitle } </ h3 >
150+ < table >
151+ < thead >
152+ < tr >
153+ { copy . semanticOverallHeaders . map ( ( header ) => (
154+ < th key = { header } > { header } </ th >
155+ ) ) }
156+ </ tr >
157+ </ thead >
158+ < tbody >
159+ { semanticReport . overall_summary . map ( ( row ) => (
160+ < tr key = { row . backend_id } >
161+ < td > { row . backend_id } </ td >
162+ < td > { percent ( row . answer_accuracy ) } </ td >
163+ < td > { percent ( row . workbook_accuracy ) } </ td >
164+ < td > { percent ( row . tool_compliance ) } </ td >
165+ < td > { row . avg_total_tokens . toFixed ( 1 ) } </ td >
166+ < td > { row . avg_turns . toFixed ( 1 ) } </ td >
167+ < td > { row . avg_tool_calls . toFixed ( 1 ) } </ td >
168+ </ tr >
169+ ) ) }
170+ </ tbody >
171+ </ table >
172+
173+ < h3 > { copy . semanticTierTitle } </ h3 >
174+ < table >
175+ < thead >
176+ < tr >
177+ { copy . semanticTierHeaders . map ( ( header ) => (
178+ < th key = { header } > { header } </ th >
179+ ) ) }
180+ </ tr >
181+ </ thead >
182+ < tbody >
183+ { semanticReport . tier_summary . map ( ( row ) => (
184+ < tr key = { row . tier } >
185+ < td > { row . tier } </ td >
186+ < td > { row . zcp_avg_total_tokens . toFixed ( 1 ) } </ td >
187+ < td > { row . mcp_avg_total_tokens . toFixed ( 1 ) } </ td >
188+ < td > { row . ratio . toFixed ( 2 ) } x</ td >
189+ < td >
190+ { percent ( row . zcp_answer_accuracy ) } / { percent ( row . zcp_workbook_accuracy ) } /{ " " }
191+ { percent ( row . zcp_tool_compliance ) }
192+ </ td >
193+ </ tr >
194+ ) ) }
195+ </ tbody >
196+ </ table >
80197 </ >
81- ) : (
198+ ) : null }
199+
200+ { compactReport ? (
82201 < >
83- < h2 id = "snapshot " > { copy . snapshotTitle } </ h2 >
202+ < h2 id = "compact " > { copy . compactTitle } </ h2 >
84203 < p >
85- { copy . modelLabel } : < code > { report . model } </ code > . { copy . repeatsLabel } : < code > { report . repeats } < /code > .
204+ { copy . artifactLabel } : < code > benchmark_reports/zcp_mcp_tool_call_benchmark.json < /code >
86205 </ p >
87206
88- < h2 id = "summary" > { copy . summaryTitle } </ h2 >
207+ < h3 > { copy . compactSummaryTitle } </ h3 >
89208 < table >
90209 < thead >
91210 < tr >
92- { copy . summaryHeaders . map ( ( header ) => (
211+ { copy . compactSummaryHeaders . map ( ( header ) => (
93212 < th key = { header } > { header } </ th >
94213 ) ) }
95214 </ tr >
96215 </ thead >
97216 < tbody >
98- { report . summary . map ( ( row ) => (
217+ { compactReport . summary . map ( ( row ) => (
99218 < tr key = { row . protocol } >
100219 < td > { row . protocol } </ td >
101220 < td > { row . runs } </ td >
@@ -109,17 +228,17 @@ export async function BenchmarksPageView({ locale }: { locale: Locale }) {
109228 </ tbody >
110229 </ table >
111230
112- < h2 id = "cases" > { copy . casesTitle } </ h2 >
231+ < h3 > { copy . compactCasesTitle } </ h3 >
113232 < table >
114233 < thead >
115234 < tr >
116- { copy . caseHeaders . map ( ( header ) => (
235+ { copy . compactCaseHeaders . map ( ( header ) => (
117236 < th key = { header } > { header } </ th >
118237 ) ) }
119238 </ tr >
120239 </ thead >
121240 < tbody >
122- { report . cases . map ( ( row ) => (
241+ { compactReport . cases . map ( ( row ) => (
123242 < tr key = { row . case_id } >
124243 < td > { row . case_id } </ td >
125244 < td > { row . zcp_avg_total_tokens . toFixed ( 1 ) } </ td >
@@ -131,7 +250,7 @@ export async function BenchmarksPageView({ locale }: { locale: Locale }) {
131250 </ tbody >
132251 </ table >
133252 </ >
134- ) }
253+ ) : null }
135254 </ main >
136255 </ div >
137256 ) ;
0 commit comments