Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions dashboard/pages/communication.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@
plot_metric_vs_size,
plot_comparison_matrix,
create_gauge_chart,
create_summary_table,
plot_timeseries_auto,
create_summary_table_infer,
create_summary_table_comm,
)

init_page("通信测试分析 | InfiniMetrics", "🔗")
Expand Down Expand Up @@ -293,7 +293,7 @@ def main():
f"{run['operation']} ({run['device_used']} GPUs) - 配置详情"
):
# Create summary table
summary_df = create_summary_table(run.get("data", {}))
summary_df = create_summary_table_comm(run.get("data", {}))
st.dataframe(
summary_df,
use_container_width=True,
Expand Down
38 changes: 31 additions & 7 deletions dashboard/pages/operator.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from utils.visualizations import (
create_summary_table_ops,
plot_timeseries_auto,
render_operator_performance_charts,
)

init_page("算子测试分析 | InfiniMetrics", "⚡")
Expand Down Expand Up @@ -37,6 +38,10 @@ def main():
only_success = st.checkbox("仅显示成功测试", value=True)
y_log = st.checkbox("Y轴对数刻度(可选)", value=False)

st.markdown("---")
st.markdown("### 📊 图表选项")
show_performance_charts = st.checkbox("显示性能仪表盘", value=True)

filtered = [r for r in ops_runs if (not only_success or r.get("success"))]

st.caption(f"找到 {len(filtered)} 个算子测试")
Expand All @@ -63,35 +68,54 @@ def main():
ri["data"] = data
selected_runs.append(ri)

tab1, tab2 = st.tabs(["📌 概览", "📈 曲线/原始数据"])
tab1, tab2, tab3 = st.tabs(["📈 性能图表", "📌 概览", "📊 原始数据"])

with tab1:
# Use the new performance chart function
render_operator_performance_charts(
selected_runs, y_log, show_performance_charts
)

with tab2:
for run in selected_runs:
with st.expander(f"{run.get('run_id')} - 概览"):
st.dataframe(
create_summary_table_ops(run["data"]),
use_container_width=True,
hide_index=True,
)
st.markdown("**config**")
st.markdown("**完整配置**")
st.json(run["data"].get("config", {}))

with tab2:
# If operators have timeseries CSVs, automatically plot them
env = run["data"].get("environment", {})
if env:
st.markdown("**环境信息**")
try:
acc = env["cluster"][0]["machine"]["accelerators"][0]
st.write(f"- 加速卡: {acc.get('model', 'Unknown')}")
st.write(f"- 显存: {acc.get('memory_gb_per_card', '?')} GB")
st.write(f"- CUDA版本: {acc.get('cuda', 'Unknown')}")
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

这个地方改成加速卡版本是不是好一点?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TODO (后续 PR): 将环境信息中的 "cuda" 字段改为统一的 "runtime" 结构:{"type": "cuda", "version": "13.0"}

except Exception:
st.json(env)

with tab3:
# Original data
for run in selected_runs:
with st.expander(f"{run.get('run_id')} - metrics"):
with st.expander(f"{run.get('run_id')} - 原始数据"):
for m in run["data"].get("metrics", []):
df = m.get("data")
if df is not None and len(df.columns) >= 2:
st.markdown(f"**{m.get('name', 'metric')}**")
fig = plot_timeseries_auto(
df, title=m.get("name", "metric"), y_log_scale=y_log
)
st.plotly_chart(fig, use_container_width=True)
else:
# scalar
if m.get("type") == "scalar":
st.write(
f"- {m.get('name')}: {m.get('value')} {m.get('unit','')}"
st.metric(
label=m.get("name", ""),
value=f"{m.get('value', '')} {m.get('unit', '')}",
)


Expand Down
2 changes: 1 addition & 1 deletion dashboard/pages/training.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
load_selected_runs,
create_training_summary,
)
from utils.training_plots import (
from utils.visualizations import (
render_performance_curves,
render_throughput_comparison,
render_data_tables,
Expand Down
66 changes: 56 additions & 10 deletions dashboard/utils/visualizations/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,37 +4,83 @@
This package provides visualization utilities organized by test type:
- base: Common/legacy visualization functions
- hardware: Hardware test visualizations (memory sweep, cache bandwidth)
- (future) communication: Communication test visualizations
- (future) inference: Inference test visualizations
- (future) operator: Operator test visualizations
- communication: Communication test visualizations
- inference: Inference test visualizations
- operator: Operator test visualizations
- training: Training test visualizations
- summary_tables: Summary tables for different test types
"""

# Base functions (common)
from .base import (
plot_metric_vs_size,
plot_comparison_matrix,
create_summary_table,
create_gauge_chart,
plot_timeseries_auto,
)

# Communication functions
from .communication import (
plot_metric_vs_size,
plot_comparison_matrix,
)

# Inference functions
from .inference import (
render_inference_metrics,
render_memory_gauge,
)

# Summary tables
from .summary_tables import (
create_summary_table_comm,
create_summary_table_infer,
create_summary_table_ops,
)

# Hardware functions
from .hardware import (
create_summary_table_hw,
plot_hw_mem_sweep,
plot_hw_cache,
)

# Operator functions
from .operator import (
extract_operator_metrics,
render_operator_performance_charts,
)

# Training functions
from .training import (
render_performance_curves,
render_throughput_comparison,
render_data_tables,
render_config_details,
)

__all__ = [
# Base (common/legacy)
"plot_metric_vs_size",
"plot_comparison_matrix",
"create_summary_table",
# Base
"create_gauge_chart",
"plot_timeseries_auto",
# Communication
"plot_metric_vs_size",
"plot_comparison_matrix",
# Inference
"render_inference_metrics",
"render_memory_gauge",
# Summary tables
"create_summary_table_comm",
"create_summary_table_infer",
"create_summary_table_ops",
# Hardware
"create_summary_table_hw",
"plot_hw_mem_sweep",
"plot_hw_cache",
# Operator
"extract_operator_metrics",
"render_operator_performance_charts",
# Training
"render_performance_curves",
"render_throughput_comparison",
"render_data_tables",
"render_config_details",
]
Loading
Loading