Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions docs/trackers/performance-benchmark-runs.md
Original file line number Diff line number Diff line change
Expand Up @@ -461,3 +461,12 @@ Outputs:
- Bulk read: `results_dev_perf_dashboard/bulk_read_multi/perf/README.md`
- Bulk write: `results_dev_perf_dashboard/bulk_write_multi/perf/README.md`
- Per-cell fast: `results_dev_perf_dashboard/per_cell_fast/perf/README.md`


## Canonical run-comparison procedure

1. Run benchmark with stable knobs (`--warmup`, `--iters`, fixed adapter set) and commit code before each run.
2. Review `perf/results.json` metadata and confirm same profile, CPU, core count, memory, Python, and adapter build info.
3. Compare `perf/matrix.csv` columns `read_tail_ratio`/`write_tail_ratio` and `regression_status`. Treat `confidence_note=high` as noisy and rerun.
4. For regression gates, compare against `results/perf/history.jsonl` median of recent samples (last 5, min 3).
5. Investigate any `regressed:<pct>%` row above the configured threshold, such as `regressed:12.3%`; require two consecutive confirmations before escalating.
132 changes: 124 additions & 8 deletions src/excelbench/perf/renderer.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,13 @@ def render_perf_markdown(results: PerfResults, path: Path) -> None:
lines.append(f"*Profile: {data['metadata']['profile']}*")
lines.append(f"*Platform: {data['metadata']['platform']}*")
lines.append(f"*Python: {data['metadata']['python']}*")
run_env = data["metadata"].get("run_environment") or {}
if run_env:
lines.append(f"*CPU: {run_env.get('cpu_model') or 'unknown'}*")
lines.append(
f"*Cores: {run_env.get('core_count') or 'unknown'} | "
f"Memory MB: {run_env.get('memory_total_mb') or 'unknown'}*"
)
if data["metadata"].get("commit"):
lines.append(f"*Commit: {data['metadata']['commit']}*")
cfg = data["metadata"].get("config", {})
Expand All @@ -93,6 +100,10 @@ def render_perf_markdown(results: PerfResults, path: Path) -> None:
"These numbers measure only the library under test. "
"Write timings do NOT include oracle verification."
)
lines.append(
"Confidence note: treat deltas under ~5% as noise unless "
"stable across multiple runs."
)
lines.append("")

workload_features = _collect_workload_features(libs, features, lookup)
Expand All @@ -107,10 +118,10 @@ def render_perf_markdown(results: PerfResults, path: Path) -> None:
for lib in libs:
caps = set(data["libraries"][lib].get("capabilities", []))
if "read" in caps:
header += f" {lib} (R p50 ms) |"
header += f" {lib} (R p50/p95 ms) |"
sep += "--------------|"
if "write" in caps:
header += f" {lib} (W p50 ms) |"
header += f" {lib} (W p50/p95 ms) |"
sep += "--------------|"

tier_map: dict[int, list[str]] = {0: [], 1: [], 2: []}
Expand All @@ -133,9 +144,9 @@ def render_perf_markdown(results: PerfResults, path: Path) -> None:
entry = lookup.get((feat, lib))
perf = entry.get("perf") if entry else None
if "read" in caps:
row += f" {_fmt_p50_ms(perf, 'read')} |"
row += f" {_fmt_p50_p95_ms(perf, 'read')} |"
if "write" in caps:
row += f" {_fmt_p50_ms(perf, 'write')} |"
row += f" {_fmt_p50_p95_ms(perf, 'write')} |"
lines.append(row)
lines.append("")

Expand Down Expand Up @@ -303,7 +314,7 @@ def _fmt_rate(rate: float) -> str:
return f"{rate:.2f}"


def _fmt_p50_ms(perf: dict[str, Any] | None, op: str) -> str:
def _fmt_p50_p95_ms(perf: dict[str, Any] | None, op: str) -> str:
if not perf or not isinstance(perf, dict):
return "—"
op_data = perf.get(op)
Expand All @@ -316,16 +327,37 @@ def _fmt_p50_ms(perf: dict[str, Any] | None, op: str) -> str:
if p50 is None:
return "—"
try:
return f"{float(p50):.2f}"
p95 = wall.get("p95")
p95_txt = f"/{float(p95):.2f}" if p95 is not None else ""
return f"{float(p50):.2f}{p95_txt}"
except (TypeError, ValueError):
return "—"


def render_perf_csv(results: PerfResults, path: Path) -> None:
data = perf_results_to_json_dict(results)
history_path = path.parent / "history.jsonl"
history_entries = _load_matching_history_entries(data, history_path)
header_columns = [
"library",
"feature",
"read_p50_wall_ms",
"read_p95_wall_ms",
"read_op_count",
"read_op_unit",
"read_p50_units_per_sec",
"write_p50_wall_ms",
"write_p95_wall_ms",
"write_op_count",
"write_op_unit",
"write_p50_units_per_sec",
"read_tail_ratio",
"write_tail_ratio",
"confidence_note",
"regression_status",
]
lines = [
"library,feature,read_p50_wall_ms,read_p95_wall_ms,read_op_count,read_op_unit,read_p50_units_per_sec,"
"write_p50_wall_ms,write_p95_wall_ms,write_op_count,write_op_unit,write_p50_units_per_sec",
",".join(header_columns),
]
Comment thread
wolfiesch marked this conversation as resolved.
for r in data["results"]:
perf = r.get("perf") or {}
Expand All @@ -350,6 +382,9 @@ def _rate(count: Any, p50_ms: Any) -> str:
def _f(v: Any) -> str:
return "" if v is None else str(v)

read_tail_ratio = _tail_ratio(read_wall)
write_tail_ratio = _tail_ratio(write_wall)
reg_status = _regression_status(history_entries, r)
lines.append(
",".join(
[
Expand All @@ -365,6 +400,17 @@ def _f(v: Any) -> str:
_f(write_count),
_f(write_unit),
_rate(write_count, write_wall.get("p50")),
_f(read_tail_ratio),
_f(write_tail_ratio),
_f(
"high"
if (
(read_tail_ratio or 0) > 0.20
or (write_tail_ratio or 0) > 0.20
)
else "ok"
),
reg_status,
]
)
)
Expand Down Expand Up @@ -400,3 +446,73 @@ def append_perf_history(results: PerfResults, history_path: Path) -> None:

with open(history_path, "a") as f:
f.write(json.dumps(entry) + "\n")


def _tail_ratio(wall: dict[str, Any]) -> float | None:
p50 = wall.get("p50")
p95 = wall.get("p95")
try:
if p50 is None or p95 is None:
return None
p50_float = float(p50)
if p50_float == 0:
return None
return round(max(float(p95) - p50_float, 0.0) / p50_float, 4)
except (TypeError, ValueError, ZeroDivisionError):
return None


def _load_matching_history_entries(
data: dict[str, Any],
history_path: Path,
) -> list[dict[str, Any]]:
if not history_path.exists():
return []
metadata = data.get("metadata", {})
current_profile = metadata.get("profile")
current_config = metadata.get("config")
entries: list[dict[str, Any]] = []
for line in history_path.read_text().splitlines():
try:
entry = json.loads(line)
if entry.get("profile") != current_profile or entry.get("config") != current_config:
continue
entries.append(entry)
except json.JSONDecodeError:
continue
return entries


def _regression_status(
history_entries: list[dict[str, Any]],
row: dict[str, Any],
*,
threshold_pct: float = 10.0,
) -> str:
if not history_entries:
return "no_history"
vals: list[float] = []
for entry in history_entries:
try:
sample = entry.get("p50_wall_ms", {}).get(row["library"], {}).get(row["feature"], {})
rv = sample.get("read_p50")
if rv is not None:
vals.append(float(rv))
Comment thread
wolfiesch marked this conversation as resolved.
except (TypeError, ValueError, KeyError):
continue
if len(vals) < 3:
return "insufficient_history"
baseline = sorted(vals[-5:])[len(vals[-5:]) // 2]
cur = ((row.get("perf") or {}).get("read") or {}).get("wall_ms", {}).get("p50")
try:
curf = float(cur)
except (TypeError, ValueError):
return "n/a"
if baseline <= 0:
return "n/a"
delta = ((curf - baseline) / baseline) * 100.0
if delta > threshold_pct:
return f"regressed:{delta:.1f}%"
if delta < -threshold_pct:
return f"improved:{delta:.1f}%"
return f"stable:{delta:.1f}%"
36 changes: 33 additions & 3 deletions src/excelbench/perf/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@

from __future__ import annotations

import os
import platform
from collections.abc import Callable
from dataclasses import asdict, dataclass
from datetime import UTC, date, datetime, timedelta
Expand Down Expand Up @@ -67,6 +69,13 @@ class PerfFeatureResult:
notes: str | None = None


@dataclass(frozen=True)
class PerfRunEnvironment:
cpu_model: str | None
core_count: int | None
memory_total_mb: float | None


@dataclass(frozen=True)
class PerfMetadata:
benchmark_version: str
Expand All @@ -77,6 +86,7 @@ class PerfMetadata:
python: str
commit: str | None
config: PerfConfig
run_environment: PerfRunEnvironment


@dataclass(frozen=True)
Expand All @@ -101,7 +111,6 @@ def run_perf(
breakdown: bool = False,
memory_mode: MemoryMode = "getrusage",
) -> PerfResults:
import platform as _platform

from excelbench.generator.generate import load_manifest
from excelbench.harness.adapters import get_all_adapters
Expand Down Expand Up @@ -137,16 +146,17 @@ def run_perf(
benchmark_version=BENCHMARK_VERSION,
run_date=datetime.now(UTC),
excel_version=manifest.excel_version,
platform=f"{_platform.system()}-{_platform.machine()}",
platform=f"{platform.system()}-{platform.machine()}",
profile=profile,
python=_platform.python_version(),
python=platform.python_version(),
commit=_get_git_commit(),
config=PerfConfig(
warmup=warmup,
iters=iters,
iteration_policy=iteration_policy_normalized,
breakdown=breakdown,
),
run_environment=_collect_run_environment(),
)

libraries = {a.name: _library_info_dict(a.info) for a in adapters}
Expand Down Expand Up @@ -238,6 +248,7 @@ def perf_results_to_json_dict(results: PerfResults) -> dict[str, Any]:
"python": results.metadata.python,
"commit": results.metadata.commit,
"config": asdict(results.metadata.config),
"run_environment": asdict(results.metadata.run_environment),
},
"libraries": results.libraries,
"results": [_feature_result_to_dict(r) for r in results.results],
Expand Down Expand Up @@ -1704,3 +1715,22 @@ def run_one_iteration(
)

raise ValueError(f"kind must be 'read' or 'write'; got {kind!r}")


def _collect_run_environment() -> PerfRunEnvironment:
cpu_model = platform.processor() or None
if not cpu_model:
cpu_model = os.environ.get("PROCESSOR_IDENTIFIER")
core_count = os.cpu_count()
mem_mb = None
try:
pages = os.sysconf("SC_PHYS_PAGES")
page_size = os.sysconf("SC_PAGE_SIZE")
mem_mb = float(pages * page_size) / (1024.0 * 1024.0)
except (AttributeError, OSError, ValueError):
mem_mb = None
return PerfRunEnvironment(
cpu_model=cpu_model,
core_count=core_count,
memory_total_mb=round(mem_mb, 2) if mem_mb is not None else None,
)
16 changes: 13 additions & 3 deletions tests/test_perf_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
PerfMetadata,
PerfOpResult,
PerfResults,
PerfRunEnvironment,
PerfStats,
)

Expand Down Expand Up @@ -91,6 +92,8 @@ def test_perf_command_writes_outputs(tmp_path: Path) -> None:
assert data["metadata"]["config"]["iters"] == 1
assert data["metadata"]["config"]["iteration_policy"] == "fixed"
assert "openpyxl" in data["libraries"]
assert "run_environment" in data["metadata"]
assert "cpu_model" in data["metadata"]["run_environment"]
Comment thread
wolfiesch marked this conversation as resolved.


def test_perf_markdown_header_matches_all_rendered_cells(tmp_path: Path) -> None:
Expand All @@ -110,6 +113,11 @@ def test_perf_markdown_header_matches_all_rendered_cells(tmp_path: Path) -> None
iteration_policy="fixed",
breakdown=False,
),
run_environment=PerfRunEnvironment(
cpu_model=None,
core_count=None,
memory_total_mb=None,
),
),
libraries={
"openpyxl": {
Expand Down Expand Up @@ -158,8 +166,10 @@ def test_perf_markdown_header_matches_all_rendered_cells(tmp_path: Path) -> None

markdown = readme.read_text()
assert (
"| Feature | openpyxl (R p50 ms) | openpyxl (W p50 ms) | "
"python-calamine (R p50 ms) |"
"| Feature | openpyxl (R p50/p95 ms) | openpyxl (W p50/p95 ms) | "
"python-calamine (R p50/p95 ms) |"
) in markdown
assert "| cell_values | 1.00 | 2.00 | 0.50 |" in markdown
assert "| cell_values | 1.00/1.00 | 2.00/2.00 | 0.50/0.50 |" in markdown
assert markdown.count("**Tier 0") == 1
assert "Confidence note:" in markdown
assert "p50/p95" in markdown
Comment thread
wolfiesch marked this conversation as resolved.
42 changes: 42 additions & 0 deletions tests/test_perf_data_shape.py
Original file line number Diff line number Diff line change
Expand Up @@ -622,3 +622,45 @@ def test_shape_fixtures_stale_corrupt_manifest_with_needs_1m(tmp_path: Path) ->

# Corrupt manifest with needs_1m=True triggers regeneration.
assert _shape_fixtures_stale(manifest, generator, needs_1m=True) is True


def test_perf_csv_includes_regression_status(tmp_path: Path) -> None:
from excelbench.perf.renderer import render_perf_csv
from excelbench.perf.runner import (
PerfConfig,
PerfFeatureResult,
PerfMetadata,
PerfOpResult,
PerfResults,
PerfRunEnvironment,
PerfStats,
)

stats = PerfStats(min=1, p50=1, p95=2)
res = PerfResults(
metadata=PerfMetadata(
benchmark_version="x",
run_date=datetime.now(UTC),
excel_version="x",
platform="x",
profile="xlsx",
python="3",
commit=None,
config=PerfConfig(warmup=0, iters=1, iteration_policy="fixed", breakdown=False),
run_environment=PerfRunEnvironment(cpu_model=None, core_count=1, memory_total_mb=None),
),
libraries={"openpyxl": {"capabilities": ["read"]}},
results=[
PerfFeatureResult(
feature="f",
library="openpyxl",
workload_size="tiny",
perf={"read": PerfOpResult(wall_ms=stats, cpu_ms=stats), "write": None},
)
],
)
out = tmp_path / "m.csv"
render_perf_csv(res, out)
txt = out.read_text()
assert "regression_status" in txt
assert "confidence_note" in txt
Loading
Loading