diff --git a/docs/trackers/performance-benchmark-runs.md b/docs/trackers/performance-benchmark-runs.md index 5793dfd..ac0fa87 100644 --- a/docs/trackers/performance-benchmark-runs.md +++ b/docs/trackers/performance-benchmark-runs.md @@ -461,3 +461,12 @@ Outputs: - Bulk read: `results_dev_perf_dashboard/bulk_read_multi/perf/README.md` - Bulk write: `results_dev_perf_dashboard/bulk_write_multi/perf/README.md` - Per-cell fast: `results_dev_perf_dashboard/per_cell_fast/perf/README.md` + + +## Canonical run-comparison procedure + +1. Run benchmark with stable knobs (`--warmup`, `--iters`, fixed adapter set) and commit code before each run. +2. Review `perf/results.json` metadata and confirm same profile, CPU, core count, memory, Python, and adapter build info. +3. Compare `perf/matrix.csv` columns `read_tail_ratio`/`write_tail_ratio` and `regression_status`. Treat `confidence_note=high` as noisy and rerun. +4. For regression gates, compare against `results/perf/history.jsonl` median of recent samples (last 5, min 3). +5. Investigate any `regressed:%` row above the configured threshold, such as `regressed:12.3%`; require two consecutive confirmations before escalating. diff --git a/src/excelbench/perf/renderer.py b/src/excelbench/perf/renderer.py index 3b08fcd..a69ce60 100644 --- a/src/excelbench/perf/renderer.py +++ b/src/excelbench/perf/renderer.py @@ -74,6 +74,13 @@ def render_perf_markdown(results: PerfResults, path: Path) -> None: lines.append(f"*Profile: {data['metadata']['profile']}*") lines.append(f"*Platform: {data['metadata']['platform']}*") lines.append(f"*Python: {data['metadata']['python']}*") + run_env = data["metadata"].get("run_environment") or {} + if run_env: + lines.append(f"*CPU: {run_env.get('cpu_model') or 'unknown'}*") + lines.append( + f"*Cores: {run_env.get('core_count') or 'unknown'} | " + f"Memory MB: {run_env.get('memory_total_mb') or 'unknown'}*" + ) if data["metadata"].get("commit"): lines.append(f"*Commit: {data['metadata']['commit']}*") cfg = data["metadata"].get("config", {}) @@ -93,6 +100,10 @@ def render_perf_markdown(results: PerfResults, path: Path) -> None: "These numbers measure only the library under test. " "Write timings do NOT include oracle verification." ) + lines.append( + "Confidence note: treat deltas under ~5% as noise unless " + "stable across multiple runs." + ) lines.append("") workload_features = _collect_workload_features(libs, features, lookup) @@ -107,10 +118,10 @@ def render_perf_markdown(results: PerfResults, path: Path) -> None: for lib in libs: caps = set(data["libraries"][lib].get("capabilities", [])) if "read" in caps: - header += f" {lib} (R p50 ms) |" + header += f" {lib} (R p50/p95 ms) |" sep += "--------------|" if "write" in caps: - header += f" {lib} (W p50 ms) |" + header += f" {lib} (W p50/p95 ms) |" sep += "--------------|" tier_map: dict[int, list[str]] = {0: [], 1: [], 2: []} @@ -133,9 +144,9 @@ def render_perf_markdown(results: PerfResults, path: Path) -> None: entry = lookup.get((feat, lib)) perf = entry.get("perf") if entry else None if "read" in caps: - row += f" {_fmt_p50_ms(perf, 'read')} |" + row += f" {_fmt_p50_p95_ms(perf, 'read')} |" if "write" in caps: - row += f" {_fmt_p50_ms(perf, 'write')} |" + row += f" {_fmt_p50_p95_ms(perf, 'write')} |" lines.append(row) lines.append("") @@ -303,7 +314,7 @@ def _fmt_rate(rate: float) -> str: return f"{rate:.2f}" -def _fmt_p50_ms(perf: dict[str, Any] | None, op: str) -> str: +def _fmt_p50_p95_ms(perf: dict[str, Any] | None, op: str) -> str: if not perf or not isinstance(perf, dict): return "—" op_data = perf.get(op) @@ -316,16 +327,37 @@ def _fmt_p50_ms(perf: dict[str, Any] | None, op: str) -> str: if p50 is None: return "—" try: - return f"{float(p50):.2f}" + p95 = wall.get("p95") + p95_txt = f"/{float(p95):.2f}" if p95 is not None else "" + return f"{float(p50):.2f}{p95_txt}" except (TypeError, ValueError): return "—" def render_perf_csv(results: PerfResults, path: Path) -> None: data = perf_results_to_json_dict(results) + history_path = path.parent / "history.jsonl" + history_entries = _load_matching_history_entries(data, history_path) + header_columns = [ + "library", + "feature", + "read_p50_wall_ms", + "read_p95_wall_ms", + "read_op_count", + "read_op_unit", + "read_p50_units_per_sec", + "write_p50_wall_ms", + "write_p95_wall_ms", + "write_op_count", + "write_op_unit", + "write_p50_units_per_sec", + "read_tail_ratio", + "write_tail_ratio", + "confidence_note", + "regression_status", + ] lines = [ - "library,feature,read_p50_wall_ms,read_p95_wall_ms,read_op_count,read_op_unit,read_p50_units_per_sec," - "write_p50_wall_ms,write_p95_wall_ms,write_op_count,write_op_unit,write_p50_units_per_sec", + ",".join(header_columns), ] for r in data["results"]: perf = r.get("perf") or {} @@ -350,6 +382,9 @@ def _rate(count: Any, p50_ms: Any) -> str: def _f(v: Any) -> str: return "" if v is None else str(v) + read_tail_ratio = _tail_ratio(read_wall) + write_tail_ratio = _tail_ratio(write_wall) + reg_status = _regression_status(history_entries, r) lines.append( ",".join( [ @@ -365,6 +400,17 @@ def _f(v: Any) -> str: _f(write_count), _f(write_unit), _rate(write_count, write_wall.get("p50")), + _f(read_tail_ratio), + _f(write_tail_ratio), + _f( + "high" + if ( + (read_tail_ratio or 0) > 0.20 + or (write_tail_ratio or 0) > 0.20 + ) + else "ok" + ), + reg_status, ] ) ) @@ -400,3 +446,73 @@ def append_perf_history(results: PerfResults, history_path: Path) -> None: with open(history_path, "a") as f: f.write(json.dumps(entry) + "\n") + + +def _tail_ratio(wall: dict[str, Any]) -> float | None: + p50 = wall.get("p50") + p95 = wall.get("p95") + try: + if p50 is None or p95 is None: + return None + p50_float = float(p50) + if p50_float == 0: + return None + return round(max(float(p95) - p50_float, 0.0) / p50_float, 4) + except (TypeError, ValueError, ZeroDivisionError): + return None + + +def _load_matching_history_entries( + data: dict[str, Any], + history_path: Path, +) -> list[dict[str, Any]]: + if not history_path.exists(): + return [] + metadata = data.get("metadata", {}) + current_profile = metadata.get("profile") + current_config = metadata.get("config") + entries: list[dict[str, Any]] = [] + for line in history_path.read_text().splitlines(): + try: + entry = json.loads(line) + if entry.get("profile") != current_profile or entry.get("config") != current_config: + continue + entries.append(entry) + except json.JSONDecodeError: + continue + return entries + + +def _regression_status( + history_entries: list[dict[str, Any]], + row: dict[str, Any], + *, + threshold_pct: float = 10.0, +) -> str: + if not history_entries: + return "no_history" + vals: list[float] = [] + for entry in history_entries: + try: + sample = entry.get("p50_wall_ms", {}).get(row["library"], {}).get(row["feature"], {}) + rv = sample.get("read_p50") + if rv is not None: + vals.append(float(rv)) + except (TypeError, ValueError, KeyError): + continue + if len(vals) < 3: + return "insufficient_history" + baseline = sorted(vals[-5:])[len(vals[-5:]) // 2] + cur = ((row.get("perf") or {}).get("read") or {}).get("wall_ms", {}).get("p50") + try: + curf = float(cur) + except (TypeError, ValueError): + return "n/a" + if baseline <= 0: + return "n/a" + delta = ((curf - baseline) / baseline) * 100.0 + if delta > threshold_pct: + return f"regressed:{delta:.1f}%" + if delta < -threshold_pct: + return f"improved:{delta:.1f}%" + return f"stable:{delta:.1f}%" diff --git a/src/excelbench/perf/runner.py b/src/excelbench/perf/runner.py index c7e49e6..1a8cbd6 100644 --- a/src/excelbench/perf/runner.py +++ b/src/excelbench/perf/runner.py @@ -8,6 +8,8 @@ from __future__ import annotations +import os +import platform from collections.abc import Callable from dataclasses import asdict, dataclass from datetime import UTC, date, datetime, timedelta @@ -67,6 +69,13 @@ class PerfFeatureResult: notes: str | None = None +@dataclass(frozen=True) +class PerfRunEnvironment: + cpu_model: str | None + core_count: int | None + memory_total_mb: float | None + + @dataclass(frozen=True) class PerfMetadata: benchmark_version: str @@ -77,6 +86,7 @@ class PerfMetadata: python: str commit: str | None config: PerfConfig + run_environment: PerfRunEnvironment @dataclass(frozen=True) @@ -101,7 +111,6 @@ def run_perf( breakdown: bool = False, memory_mode: MemoryMode = "getrusage", ) -> PerfResults: - import platform as _platform from excelbench.generator.generate import load_manifest from excelbench.harness.adapters import get_all_adapters @@ -137,9 +146,9 @@ def run_perf( benchmark_version=BENCHMARK_VERSION, run_date=datetime.now(UTC), excel_version=manifest.excel_version, - platform=f"{_platform.system()}-{_platform.machine()}", + platform=f"{platform.system()}-{platform.machine()}", profile=profile, - python=_platform.python_version(), + python=platform.python_version(), commit=_get_git_commit(), config=PerfConfig( warmup=warmup, @@ -147,6 +156,7 @@ def run_perf( iteration_policy=iteration_policy_normalized, breakdown=breakdown, ), + run_environment=_collect_run_environment(), ) libraries = {a.name: _library_info_dict(a.info) for a in adapters} @@ -238,6 +248,7 @@ def perf_results_to_json_dict(results: PerfResults) -> dict[str, Any]: "python": results.metadata.python, "commit": results.metadata.commit, "config": asdict(results.metadata.config), + "run_environment": asdict(results.metadata.run_environment), }, "libraries": results.libraries, "results": [_feature_result_to_dict(r) for r in results.results], @@ -1704,3 +1715,22 @@ def run_one_iteration( ) raise ValueError(f"kind must be 'read' or 'write'; got {kind!r}") + + +def _collect_run_environment() -> PerfRunEnvironment: + cpu_model = platform.processor() or None + if not cpu_model: + cpu_model = os.environ.get("PROCESSOR_IDENTIFIER") + core_count = os.cpu_count() + mem_mb = None + try: + pages = os.sysconf("SC_PHYS_PAGES") + page_size = os.sysconf("SC_PAGE_SIZE") + mem_mb = float(pages * page_size) / (1024.0 * 1024.0) + except (AttributeError, OSError, ValueError): + mem_mb = None + return PerfRunEnvironment( + cpu_model=cpu_model, + core_count=core_count, + memory_total_mb=round(mem_mb, 2) if mem_mb is not None else None, + ) diff --git a/tests/test_perf_cli.py b/tests/test_perf_cli.py index f3a0c6f..dde718f 100644 --- a/tests/test_perf_cli.py +++ b/tests/test_perf_cli.py @@ -16,6 +16,7 @@ PerfMetadata, PerfOpResult, PerfResults, + PerfRunEnvironment, PerfStats, ) @@ -91,6 +92,8 @@ def test_perf_command_writes_outputs(tmp_path: Path) -> None: assert data["metadata"]["config"]["iters"] == 1 assert data["metadata"]["config"]["iteration_policy"] == "fixed" assert "openpyxl" in data["libraries"] + assert "run_environment" in data["metadata"] + assert "cpu_model" in data["metadata"]["run_environment"] def test_perf_markdown_header_matches_all_rendered_cells(tmp_path: Path) -> None: @@ -110,6 +113,11 @@ def test_perf_markdown_header_matches_all_rendered_cells(tmp_path: Path) -> None iteration_policy="fixed", breakdown=False, ), + run_environment=PerfRunEnvironment( + cpu_model=None, + core_count=None, + memory_total_mb=None, + ), ), libraries={ "openpyxl": { @@ -158,8 +166,10 @@ def test_perf_markdown_header_matches_all_rendered_cells(tmp_path: Path) -> None markdown = readme.read_text() assert ( - "| Feature | openpyxl (R p50 ms) | openpyxl (W p50 ms) | " - "python-calamine (R p50 ms) |" + "| Feature | openpyxl (R p50/p95 ms) | openpyxl (W p50/p95 ms) | " + "python-calamine (R p50/p95 ms) |" ) in markdown - assert "| cell_values | 1.00 | 2.00 | 0.50 |" in markdown + assert "| cell_values | 1.00/1.00 | 2.00/2.00 | 0.50/0.50 |" in markdown assert markdown.count("**Tier 0") == 1 + assert "Confidence note:" in markdown + assert "p50/p95" in markdown diff --git a/tests/test_perf_data_shape.py b/tests/test_perf_data_shape.py index 31eaf8f..e0b188c 100644 --- a/tests/test_perf_data_shape.py +++ b/tests/test_perf_data_shape.py @@ -622,3 +622,45 @@ def test_shape_fixtures_stale_corrupt_manifest_with_needs_1m(tmp_path: Path) -> # Corrupt manifest with needs_1m=True triggers regeneration. assert _shape_fixtures_stale(manifest, generator, needs_1m=True) is True + + +def test_perf_csv_includes_regression_status(tmp_path: Path) -> None: + from excelbench.perf.renderer import render_perf_csv + from excelbench.perf.runner import ( + PerfConfig, + PerfFeatureResult, + PerfMetadata, + PerfOpResult, + PerfResults, + PerfRunEnvironment, + PerfStats, + ) + + stats = PerfStats(min=1, p50=1, p95=2) + res = PerfResults( + metadata=PerfMetadata( + benchmark_version="x", + run_date=datetime.now(UTC), + excel_version="x", + platform="x", + profile="xlsx", + python="3", + commit=None, + config=PerfConfig(warmup=0, iters=1, iteration_policy="fixed", breakdown=False), + run_environment=PerfRunEnvironment(cpu_model=None, core_count=1, memory_total_mb=None), + ), + libraries={"openpyxl": {"capabilities": ["read"]}}, + results=[ + PerfFeatureResult( + feature="f", + library="openpyxl", + workload_size="tiny", + perf={"read": PerfOpResult(wall_ms=stats, cpu_ms=stats), "write": None}, + ) + ], + ) + out = tmp_path / "m.csv" + render_perf_csv(res, out) + txt = out.read_text() + assert "regression_status" in txt + assert "confidence_note" in txt diff --git a/tests/test_perf_workloads.py b/tests/test_perf_workloads.py index 0d53467..a86ef6a 100644 --- a/tests/test_perf_workloads.py +++ b/tests/test_perf_workloads.py @@ -652,3 +652,55 @@ def test_perf_workload_standardizes_size_and_phase_attribution(tmp_path: Path) - assert read_phase is not None and read_phase["parse"] > 0 assert write_phase is not None and write_phase["write"] > 0 assert write_phase["verify"] == 0.0 + + +def test_perf_metadata_includes_run_environment(tmp_path: Path) -> None: + suite = tmp_path / "suite" + suite.mkdir(parents=True, exist_ok=True) + wb = Workbook() + ws = wb.active + assert ws is not None + ws.title = "S1" + ws["A1"] = 1 + (suite / "tier0").mkdir(parents=True, exist_ok=True) + wb.save(suite / "tier0" / "00_one.xlsx") + manifest = Manifest( + generated_at=datetime.now(UTC), + excel_version="test", + generator_version="test", + file_format="xlsx", + files=[ + BenchFile( + path="tier0/00_one.xlsx", + feature="cell_values", + tier=0, + file_format="xlsx", + test_cases=[ + BenchCase( + id="x", + label="x", + row=1, + expected={ + "workload": { + "scenario": "cell_values", + "op": "cell_value", + "sheet": "S1", + "range": "A1:A1", + } + }, + importance=Importance.BASIC, + ) + ], + ) + ], + ) + write_manifest(manifest, suite / "manifest.json") + results = run_perf( + suite, + adapters=[OpenpyxlAdapter()], + warmup=0, + iters=1, + breakdown=False, + ) + env = results.metadata.run_environment + assert env.core_count is None or env.core_count > 0