diff --git a/.github/dashboard/atomesh_mocker_index.html b/.github/dashboard/atomesh_mocker_index.html new file mode 100644 index 0000000000..c3dfea79b0 --- /dev/null +++ b/.github/dashboard/atomesh_mocker_index.html @@ -0,0 +1,285 @@ + + + + + + ATOMesh Mocker Benchmark Dashboard + + + +

Mocker Benchmark

Loading...
+ +
+
+
+

Detailed Performance Data

+
+ +
+

Atomesh Standalone Accuracy (GSM8K)

+
+
#!/usr/bin/env bash
#
# Single-cell Atomesh mocker benchmark (pd-chat scenario).
#
# Starts the mocker's virtual workers and an Atomesh router in PD
# (prefill/decode) mode on loopback ports, drives `benchmark-request`
# against the router for DURATION, then parses the last aggregate metrics
# line of the benchmark log into:
#   ${RESULT_DIR}/${BENCHMARK_NAME}.json                   full result payload
#   ${RESULT_DIR}/${BENCHMARK_NAME}-benchmark-action.json  benchmark-action entries
#   ${RESULT_DIR}/${BENCHMARK_NAME}.md                     markdown summary
#
# Configuration is taken from the environment (defaults below). Exit status 2
# means invalid configuration or missing release binaries.
set -euo pipefail

SCENARIO="${SCENARIO:-pd-chat}"
BENCHMARK_NAME="${BENCHMARK_NAME:-${SCENARIO}}"
DURATION="${DURATION:-20s}"
KILL_AFTER="${KILL_AFTER:-300s}"
PRODUCER_THREADS="${PRODUCER_THREADS:-1}"
CONSUMER_THREADS="${CONSUMER_THREADS:-8}"
PREFILL_WORKERS="${PREFILL_WORKERS:-1}"
DECODE_WORKERS="${DECODE_WORKERS:-1}"
POLICY="${POLICY:-round_robin}"
RESULT_DIR="${RESULT_DIR:-atomesh-mocker-results}"
# Seconds to wait after each shutdown signal before escalating (INT -> TERM -> KILL).
STOP_GRACE_SECONDS="${STOP_GRACE_SECONDS:-10}"

REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
MESH_DIR="${REPO_ROOT}/atom/mesh"
MOCKER_DIR="${MESH_DIR}/mocker"
MOCKER_TARGET_DIR="${MOCKER_DIR}/target/mocker"
MESH_TARGET_DIR="${MOCKER_DIR}/target/mesh"
ATOMESH_BIN="${MESH_TARGET_DIR}/release/atomesh"
MOCKER_BIN="${MOCKER_TARGET_DIR}/release/atomesh-mocker"
LOG_DIR="${RESULT_DIR}/logs/${BENCHMARK_NAME}"
FIXTURE="${MOCKER_DIR}/fixtures/http_pd_chat.json"
ROUTER_MODE="pd"
WORKERS=$((PREFILL_WORKERS + DECODE_WORKERS))

mkdir -p "${RESULT_DIR}" "${LOG_DIR}"

if [[ "${SCENARIO}" != "pd-chat" ]]; then
  echo "Unsupported SCENARIO=${SCENARIO}; this benchmark script only runs pd-chat" >&2
  exit 2
fi

if (( PREFILL_WORKERS < 1 || DECODE_WORKERS < 1 )); then
  echo "PREFILL_WORKERS and DECODE_WORKERS must both be >= 1" >&2
  exit 2
fi

# Print "<router_port> <worker_base_port> <prometheus_port>".
#
# $1 (optional, default ${WORKERS}) is the number of consecutive ports that
# must be free starting at the worker base port: the workers are addressed as
# base, base+1, ... base+N-1, so probing only the base is not enough. All
# probe sockets are held open until every port is chosen, which guarantees the
# three results are distinct. The ports are released before the servers start,
# so a small race with other processes on the host remains.
pick_ports() {
  python3 - "${1:-${WORKERS}}" <<'PY'
import socket
import sys

worker_count = int(sys.argv[1])
MAX_ATTEMPTS = 50


def bind_loopback(port=0):
    sock = socket.socket()
    try:
        sock.bind(("127.0.0.1", port))
    except OSError:
        sock.close()
        raise
    return sock


def reserve_range(count):
    """Hold `count` consecutive loopback ports; return (sockets, base_port)."""
    for _ in range(MAX_ATTEMPTS):
        held = [bind_loopback()]
        base = held[0].getsockname()[1]
        try:
            if base + count - 1 > 65535:
                raise OSError("range exceeds the port space")
            for offset in range(1, count):
                held.append(bind_loopback(base + offset))
        except OSError:
            for sock in held:
                sock.close()
            continue
        return held, base
    raise SystemExit(f"could not reserve {count} consecutive free ports")


router = bind_loopback()
workers, worker_base = reserve_range(worker_count)
prometheus = bind_loopback()
print(router.getsockname()[1], worker_base, prometheus.getsockname()[1])
for sock in [router, prometheus, *workers]:
    sock.close()
PY
}

# Poll URL ($1) until it answers successfully, for at most ~20s.
# $2 is a human-readable name for error messages. $3 (optional) is the PID of
# the process expected to serve the URL; when given, the wait fails as soon as
# that process has exited instead of polling until the timeout.
wait_http() {
  local url="$1"
  local name="$2"
  local pid="${3:-}"
  for _ in $(seq 1 100); do
    if curl -fsS "${url}" >/dev/null 2>&1; then
      return 0
    fi
    if [[ -n "${pid}" ]] && ! kill -0 "${pid}" 2>/dev/null; then
      echo "${name} exited before becoming ready at ${url}" >&2
      return 1
    fi
    sleep 0.2
  done
  echo "${name} did not become ready at ${url}" >&2
  return 1
}

# Wait for PID ($1) to exit, escalating to TERM and then KILL when it is still
# alive after STOP_GRACE_SECONDS. Background jobs of a non-interactive shell
# start with SIGINT ignored, so a child that installs no handler of its own
# would never react to the initial INT and an unbounded `wait` would hang.
await_exit() {
  local pid="$1"
  local ticks=$((STOP_GRACE_SECONDS * 5))
  local sig
  for sig in TERM KILL; do
    for _ in $(seq 1 "${ticks}"); do
      kill -0 "${pid}" 2>/dev/null || break 2
      sleep 0.2
    done
    kill "-${sig}" "${pid}" 2>/dev/null || break
  done
  wait "${pid}" 2>/dev/null || true
}

# EXIT trap: stop the router and the virtual workers, then exit with the
# status the script was already exiting with.
cleanup() {
  local status=$?
  local pid
  # Ask both processes to stop first so they shut down in parallel.
  for pid in "${ROUTER_PID:-}" "${WORKER_PID:-}"; do
    if [[ -n "${pid}" ]]; then
      kill -INT "${pid}" 2>/dev/null || true
    fi
  done
  for pid in "${ROUTER_PID:-}" "${WORKER_PID:-}"; do
    if [[ -n "${pid}" ]]; then
      await_exit "${pid}" || true
    fi
  done
  exit "${status}"
}
trap cleanup EXIT

if ! read -r ROUTER_PORT WORKER_BASE_PORT PROMETHEUS_PORT < <(pick_ports) \
  || [[ -z "${PROMETHEUS_PORT:-}" ]]; then
  echo "Failed to pick free ports for the router, workers and Prometheus" >&2
  exit 1
fi

if [[ ! -x "${MOCKER_BIN}" || ! -x "${ATOMESH_BIN}" ]]; then
  echo "Missing release binaries. Build them before running this benchmark script." >&2
  echo "  MOCKER_BIN=${MOCKER_BIN}" >&2
  echo "  ATOMESH_BIN=${ATOMESH_BIN}" >&2
  exit 2
fi

echo "=== Starting virtual workers for ${BENCHMARK_NAME} (${PREFILL_WORKERS}P${DECODE_WORKERS}D) ==="
"${MOCKER_BIN}" virtual-workers \
  --ip 127.0.0.1 \
  --base-port "${WORKER_BASE_PORT}" \
  --workers "${WORKERS}" \
  "${FIXTURE}" \
  > "${LOG_DIR}/virtual-workers.log" 2>&1 &
WORKER_PID=$!
for index in $(seq 0 $((WORKERS - 1))); do
  wait_http "http://127.0.0.1:$((WORKER_BASE_PORT + index))/health" "virtual worker ${index}" "${WORKER_PID}"
done

echo "=== Starting Atomesh router (${ROUTER_MODE}) ==="
COMMON_ROUTER_ARGS=(
  launch
  --host 127.0.0.1
  --port "${ROUTER_PORT}"
  --policy "${POLICY}"
  --worker-startup-timeout-secs 10
  --worker-startup-check-interval 1
  --request-timeout-secs 30
  --disable-retries
  --disable-circuit-breaker
  --health-check-interval-secs 300
  --prometheus-port "${PROMETHEUS_PORT}"
  --log-level warn
)

# The first PREFILL_WORKERS ports are prefill workers, the rest are decode workers.
pd_worker_args=(--pd-disaggregation)
for index in $(seq 0 $((PREFILL_WORKERS - 1))); do
  pd_worker_args+=(--prefill "http://127.0.0.1:$((WORKER_BASE_PORT + index))")
done
for index in $(seq 0 $((DECODE_WORKERS - 1))); do
  pd_worker_args+=(--decode "http://127.0.0.1:$((WORKER_BASE_PORT + PREFILL_WORKERS + index))")
done

"${ATOMESH_BIN}" "${COMMON_ROUTER_ARGS[@]}" \
  "${pd_worker_args[@]}" \
  --prefill-policy "${POLICY}" \
  --decode-policy "${POLICY}" \
  > "${LOG_DIR}/atomesh.log" 2>&1 &
ROUTER_PID=$!
wait_http "http://127.0.0.1:${ROUTER_PORT}/health" "Atomesh router" "${ROUTER_PID}"

echo "=== Running request benchmark ${BENCHMARK_NAME} for ${DURATION} ==="
BENCH_LOG="${LOG_DIR}/benchmark-request.log"
# The benchmark is stopped by `timeout`, so a non-zero status is expected here.
set +e
timeout --signal=INT --kill-after="${KILL_AFTER}" "${DURATION}" \
  "${MOCKER_BIN}" benchmark-request \
  --base-url "http://127.0.0.1:${ROUTER_PORT}" \
  --producer-threads "${PRODUCER_THREADS}" \
  --consumer-threads "${CONSUMER_THREADS}" \
  "${FIXTURE}" \
  > "${BENCH_LOG}" 2>&1
bench_status=$?
set -e

# 124 = stopped by timeout, 130 = terminated by SIGINT; both are normal ends.
if [[ "${bench_status}" -ne 0 && "${bench_status}" -ne 124 && "${bench_status}" -ne 130 ]]; then
  echo "benchmark-request failed with status ${bench_status}" >&2
  exit "${bench_status}"
fi

echo "=== Parsing benchmark metrics ==="
RESULT_JSON="${RESULT_DIR}/${BENCHMARK_NAME}.json"
ACTION_JSON="${RESULT_DIR}/${BENCHMARK_NAME}-benchmark-action.json"
SUMMARY_MD="${RESULT_DIR}/${BENCHMARK_NAME}.md"

python3 - <<'PY' \
  "${BENCH_LOG}" "${RESULT_JSON}" "${ACTION_JSON}" "${SUMMARY_MD}" \
  "${SCENARIO}" "${FIXTURE}" "${ROUTER_MODE}" "${DURATION}" \
  "${PRODUCER_THREADS}" "${CONSUMER_THREADS}" "${WORKERS}" "${POLICY}" \
  "${BENCHMARK_NAME}" "${PREFILL_WORKERS}" "${DECODE_WORKERS}"
# timezone.utc instead of datetime.UTC: the latter only exists on Python >= 3.11.
from datetime import datetime, timezone
import json
import os
import re
import sys
from pathlib import Path

(
    bench_log,
    result_json,
    action_json,
    summary_md,
    scenario,
    fixture,
    router_mode,
    duration,
    producer_threads,
    consumer_threads,
    workers,
    policy,
    benchmark_name,
    prefill_workers,
    decode_workers,
) = sys.argv[1:]

text = Path(bench_log).read_text(encoding="utf-8", errors="replace")
metric_lines = [
    line for line in text.splitlines()
    if re.match(r"^all\s+\d+\s+\d+\s+\d+\s+", line)
]
if not metric_lines:
    print(text)
    raise SystemExit("No aggregate metrics line found in benchmark log")

# Only the last aggregate line is used. Columns 8, 10 and 12 are read below,
# so fail with a clear message instead of an IndexError on a short line.
fields = metric_lines[-1].split()
if len(fields) < 13:
    print(text)
    raise SystemExit(
        f"Aggregate metrics line has {len(fields)} fields; expected at least 13"
    )
total = int(fields[1])
success = int(fields[2])
failed = int(fields[3])
avg_ms = float(fields[4])
p99_ms = float(fields[5])
p999_ms = float(fields[6])
one_second_qps = float(fields[8])
one_minute_qps = float(fields[10])
five_minute_qps = float(fields[12])

# DURATION is an integer with an optional s/m/h suffix; any other form leaves
# duration_seconds as None and falls back to the one-minute QPS below.
seconds_match = re.match(r"^(\d+)([smh]?)$", duration)
duration_seconds = None
if seconds_match:
    value = int(seconds_match.group(1))
    unit = seconds_match.group(2) or "s"
    duration_seconds = value * {"s": 1, "m": 60, "h": 3600}[unit]

request_throughput = (
    success / duration_seconds
    if duration_seconds and duration_seconds > 0
    else one_minute_qps
)

payload = {
    "date": datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S"),
    "benchmark_backend": "Atomesh-Mocker",
    "dashboard_backend": "Atomesh-Mocker",
    "benchmark_model_name": benchmark_name,
    "benchmark_name": benchmark_name,
    "scenario": scenario,
    "fixture": str(Path(fixture).name),
    "router_mode": router_mode,
    "connection_mode": "http",
    "policy": policy,
    "producer_threads": int(producer_threads),
    "consumer_threads": int(consumer_threads),
    "workers": int(workers),
    "prefill_workers": int(prefill_workers),
    "decode_workers": int(decode_workers),
    "duration_seconds": duration_seconds,
    "completed": success,
    "failed": failed,
    "request_throughput": request_throughput,
    "output_throughput": request_throughput,
    "total_token_throughput": request_throughput,
    "avg_latency_ms": avg_ms,
    "mean_ttft_ms": avg_ms,
    "mean_tpot_ms": avg_ms,
    "p99_latency_ms": p99_ms,
    "p999_latency_ms": p999_ms,
    "one_second_qps": one_second_qps,
    "one_minute_qps": one_minute_qps,
    "five_minute_qps": five_minute_qps,
    "total": total,
}
Path(result_json).write_text(json.dumps(payload, indent=2), encoding="utf-8")

run_url = ""
server_url = os.environ.get("GITHUB_SERVER_URL", "https://github.com")
repository = os.environ.get("GITHUB_REPOSITORY")
run_id = os.environ.get("GITHUB_RUN_ID")
if repository and run_id:
    run_url = f"{server_url}/{repository}/actions/runs/{run_id}"

extra_parts = [
    f"cell={benchmark_name}",
    f"router={router_mode}",
    f"policy={policy}",
    f"workers={workers}",
    f"prefill={prefill_workers}",
    f"decode={decode_workers}",
    f"producers={producer_threads}",
    f"consumers={consumer_threads}",
    f"duration_seconds={duration_seconds}",
    f"request_number={success}",
]
if run_url:
    extra_parts.append(f"Run: {run_url}")
extra = " ".join(extra_parts)

entries = []
for metric_name, unit, value in [
    ("request throughput", "req/s", request_throughput),
    ("avg latency", "ms", avg_ms),
    ("p99 latency", "ms", p99_ms),
    ("p999 latency", "ms", p999_ms),
    ("failed requests", "count", failed),
]:
    entries.append(
        {
            "name": f"Atomesh-Mocker::{benchmark_name} {metric_name}",
            "unit": unit,
            "value": round(float(value), 2),
            "extra": extra,
        }
    )
Path(action_json).write_text(json.dumps(entries, indent=2), encoding="utf-8")

summary = f"""### Atomesh Mocker Benchmark: {benchmark_name}

| Metric | Value |
| --- | ---: |
| scenario | {scenario} |
| router mode | {router_mode} |
| workers | {workers} |
| prefill/decode workers | {prefill_workers}/{decode_workers} |
| producer/consumer threads | {producer_threads}/{consumer_threads} |
| completed | {success} |
| failed | {failed} |
| request throughput | {request_throughput:.2f} req/s |
| avg latency | {avg_ms:.3f} ms |
| p99 latency | {p99_ms:.3f} ms |
| p999 latency | {p999_ms:.3f} ms |
"""
Path(summary_md).write_text(summary, encoding="utf-8")
print(summary)
PY

echo "Result JSON: ${RESULT_JSON}"
+import sys +from pathlib import Path + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Run Atomesh mocker benchmark cells and summarize results." + ) + parser.add_argument( + "--cells-json", + default=os.environ.get("CELLS_JSON", "[]"), + help="JSON array of benchmark cells. Defaults to CELLS_JSON.", + ) + parser.add_argument( + "--result-dir", + default=os.environ.get("RESULT_DIR", "atomesh-mocker-results"), + help="Directory where per-cell results and summary are written.", + ) + parser.add_argument( + "--benchmark-script", + default=".github/scripts/atomesh_mocker_benchmark.sh", + help="Single-cell benchmark script to invoke.", + ) + return parser.parse_args() + + +def run_cells(cells: list[dict], result_dir: Path, benchmark_script: str) -> int: + result_dir.mkdir(parents=True, exist_ok=True) + + for index, cell in enumerate(cells, start=1): + print( + f"=== Running benchmark cell {index}/{len(cells)}: {cell['display']} ===", + flush=True, + ) + env = os.environ.copy() + env.update( + { + "BENCHMARK_NAME": cell["id"], + "SCENARIO": cell["scenario"], + "DURATION": cell["duration"], + "PREFILL_WORKERS": str(cell["prefill_workers"]), + "DECODE_WORKERS": str(cell["decode_workers"]), + "PRODUCER_THREADS": str(cell["producer_threads"]), + "CONSUMER_THREADS": str(cell["consumer_threads"]), + "RESULT_DIR": str(result_dir), + } + ) + try: + subprocess.run([benchmark_script], check=True, env=env) + except subprocess.CalledProcessError as error: + print( + f"Benchmark cell {cell['id']} failed with status {error.returncode}", + file=sys.stderr, + ) + return error.returncode + + return 0 + + +def collect_rows(result_dir: Path) -> list[tuple]: + rows = [] + for path in sorted(result_dir.glob("pd-chat-*.json")): + if path.name.endswith("-benchmark-action.json"): + continue + payload = json.loads(path.read_text(encoding="utf-8")) + rows.append( + ( + payload["prefill_workers"], + payload["decode_workers"], + 
payload["consumer_threads"], + payload["duration_seconds"], + payload["completed"], + payload["failed"], + payload["request_throughput"], + payload["avg_latency_ms"], + payload["p99_latency_ms"], + payload["p999_latency_ms"], + ) + ) + rows.sort(key=lambda row: (row[0], row[1], row[2])) + return rows + + +def write_summary(result_dir: Path) -> str: + rows = collect_rows(result_dir) + lines = [ + "### Atomesh Mocker Benchmark Summary", + "", + "| Topology | Concurrency | Duration (s) | Completed | Failed | Throughput (req/s) | Avg Latency (ms) | P99 (ms) | P999 (ms) |", + "| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: |", + ] + for ( + prefill, + decode, + consumers, + duration, + completed, + failed, + throughput, + avg, + p99, + p999, + ) in rows: + lines.append( + f"| {prefill}P{decode}D | {consumers} | {duration} | {completed} | {failed} | " + f"{throughput:.2f} | {avg:.3f} | {p99:.3f} | {p999:.3f} |" + ) + + summary = "\n".join(lines) + "\n" + (result_dir / "benchmark-summary.md").write_text(summary, encoding="utf-8") + return summary + + +def main() -> int: + args = parse_args() + cells = json.loads(args.cells_json) + result_dir = Path(args.result_dir) + + exit_code = run_cells(cells, result_dir, args.benchmark_script) + summary = write_summary(result_dir) + print(summary) + return exit_code + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/.github/workflows/atom-sglang-test.yaml b/.github/workflows/atom-sglang-test.yaml index 14a4640b40..f13025e252 100644 --- a/.github/workflows/atom-sglang-test.yaml +++ b/.github/workflows/atom-sglang-test.yaml @@ -8,6 +8,10 @@ on: - '**/*.md' - 'docs/**' - 'atom/plugin/vllm/**' + - 'atom/mesh/**' + - '.github/workflows/atomesh-*.yaml' + - '.github/scripts/atomesh_*.sh' + - '.github/dashboard/atomesh_*.html' - '.github/workflows/atom-vllm-*.yaml' - '.github/benchmark/oot_models_accuracy.json' - 'LICENSE' diff --git a/.github/workflows/atom-test.yaml b/.github/workflows/atom-test.yaml 
index 708ab30a91..b7fc03dcf1 100644 --- a/.github/workflows/atom-test.yaml +++ b/.github/workflows/atom-test.yaml @@ -10,6 +10,10 @@ on: - '**/*.md' - 'docs/**' - 'atom/plugin/**' + - 'atom/mesh/**' + - '.github/workflows/atomesh-*.yaml' + - '.github/scripts/atomesh_*.sh' + - '.github/dashboard/atomesh_*.html' - '.github/workflows/atom-vllm-*.yaml' - '.github/workflows/atom-sglang-*.yaml' - '.github/benchmark/oot_models_accuracy.json' diff --git a/.github/workflows/atom-vllm-test.yaml b/.github/workflows/atom-vllm-test.yaml index 5d33a27215..1542e4d49d 100644 --- a/.github/workflows/atom-vllm-test.yaml +++ b/.github/workflows/atom-vllm-test.yaml @@ -8,6 +8,10 @@ on: - '**/*.md' - 'docs/**' - 'atom/plugin/sglang/**' + - 'atom/mesh/**' + - '.github/workflows/atomesh-*.yaml' + - '.github/scripts/atomesh_*.sh' + - '.github/dashboard/atomesh_*.html' - '.github/workflows/atom-sglang-*.yaml' - '.github/benchmark/sglang_models_accuracy.json' - 'LICENSE' diff --git a/.github/workflows/atomesh-accuracy-validation.yaml b/.github/workflows/atomesh-accuracy-validation.yaml new file mode 100644 index 0000000000..27ad7205dd --- /dev/null +++ b/.github/workflows/atomesh-accuracy-validation.yaml @@ -0,0 +1,800 @@ +name: Atomesh Accuracy Validation + +on: + push: + branches: [main] + paths: + - 'atom/mesh/**' + - '.github/workflows/atomesh-accuracy-validation.yaml' + - '.github/scripts/accuracy_to_dashboard.py' + - '.github/benchmark/models_accuracy.json' + pull_request: + branches: [main] # Triggers on PRs targeting `main` + types: [opened, synchronize, reopened, ready_for_review] + paths: + - 'atom/mesh/**' + - '.github/workflows/atomesh-accuracy-validation.yaml' + - '.github/scripts/accuracy_to_dashboard.py' + - '.github/benchmark/models_accuracy.json' + schedule: + # Nightly at 00:00 Beijing time (16:00 UTC) + - cron: '0 16 * * *' + workflow_dispatch: + inputs: + aiter_branch: + description: 'ROCm/aiter branch to build inside the CI image' + required: false + default: 'main' + 
type: string + atom_base_image: + description: 'Docker image used as the ATOM test base image' + required: false + default: 'rocm/atom-dev:latest' + type: string + +concurrency: + # Keep scheduled main runs from blocking push-triggered validation. + group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }} + cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} + +env: + ATOM_BASE_IMAGE: ${{ github.event_name == 'workflow_dispatch' && inputs.atom_base_image || 'rocm/atom-dev:latest' }} + ATOM_PYTHON_TAG: "cp312" + GITHUB_REPO_URL: ${{ github.event.pull_request.head.repo.clone_url || 'https://github.com/ROCm/ATOM.git' }} + GITHUB_COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.event.head_commit.id || github.sha }} + # workflow_dispatch: inputs.aiter_branch; otherwise main (matches previous default-branch shallow clone) + AITER_GIT_REF: ${{ github.event_name == 'workflow_dispatch' && inputs.aiter_branch || 'main' }} + +jobs: + check-signal: + if: ${{ !github.event.pull_request || github.event.pull_request.draft == false }} + name: Check Pre Checkin Signal + runs-on: ubuntu-latest + permissions: + actions: read + contents: read + steps: + - name: Checkout ATOM repo + if: ${{ github.event_name != 'workflow_dispatch' }} + uses: actions/checkout@v6 + + - name: Wait for Pre Checkin workflow + if: ${{ github.event_name != 'workflow_dispatch' }} + run: bash ./.github/scripts/check_signal.sh + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GITHUB_SHA: ${{ github.sha }} + + download_aiter_wheel: + if: ${{ needs.check-signal.result == 'success' && (!github.event.pull_request || github.event.pull_request.draft == false) }} + needs: [check-signal] + name: Download aiter wheel + runs-on: ubuntu-latest + steps: + - name: Prefer latest main aiter wheel manifest and fallback to artifact + run: | + set -euo pipefail + echo "=== Trying latest main aiter wheel manifest from S3 first ===" + + 
S3_MAIN_MANIFEST_URL="https://rocm.frameworks-nightlies.amd.com/whl-staging/gfx942-gfx950/main/latest.json" + API_URL="https://api.github.com" + AUTH_HEADER="Authorization: token ${{ secrets.GITHUB_TOKEN }}" + AITER_TEST_WORKFLOW_ID=179476100 + + ARTIFACT_ID="" + ARTIFACT_NAME="" + ARTIFACT_RUN_ID="" + ARTIFACT_RUN_SHA="" + ARTIFACT_RUN_CREATED_AT="" + + resolve_download_url() { + python3 -c 'import sys + from urllib.parse import quote, unquote, urlsplit, urlunsplit + parts = urlsplit(sys.argv[1]) + encoded_path = "/".join(quote(unquote(segment), safe="") for segment in parts.path.split("/")) + print(urlunsplit((parts.scheme, parts.netloc, encoded_path, parts.query, parts.fragment)))' "$1" + } + + find_latest_artifact() { + local runs_json artifact_json run_id python_artifact_suffix + + if [ -n "$ARTIFACT_ID" ] && [ "$ARTIFACT_ID" != "null" ]; then + return 0 + fi + + python_artifact_suffix="py${ATOM_PYTHON_TAG#cp}" + python_artifact_suffix="${python_artifact_suffix:0:3}.${python_artifact_suffix:3}" + + echo "=== Finding latest aiter-whl-* artifact for ${python_artifact_suffix} from ROCm/aiter ===" + runs_json=$(curl -fsSL -H "$AUTH_HEADER" \ + "$API_URL/repos/ROCm/aiter/actions/workflows/$AITER_TEST_WORKFLOW_ID/runs?per_page=100&branch=main&event=push") + + for run_id in $(echo "$runs_json" | jq -r '.workflow_runs[].id'); do + artifact_json=$(curl -fsSL -H "$AUTH_HEADER" \ + "$API_URL/repos/ROCm/aiter/actions/runs/$run_id/artifacts" \ + | jq --arg artifact_suffix "-${python_artifact_suffix}" '[.artifacts[] | select(.name | startswith("aiter-whl-") and endswith($artifact_suffix)) | select(.expired == false)] | sort_by(.created_at) | last') + + if [ "$artifact_json" != "null" ] && [ -n "$artifact_json" ]; then + ARTIFACT_ID=$(echo "$artifact_json" | jq -r '.id') + ARTIFACT_NAME=$(echo "$artifact_json" | jq -r '.name') + ARTIFACT_RUN_ID="$run_id" + ARTIFACT_RUN_SHA=$(echo "$runs_json" | jq -r --arg run_id "$run_id" '.workflow_runs[] | select((.id | tostring) == 
$run_id) | .head_sha') + ARTIFACT_RUN_CREATED_AT=$(echo "$runs_json" | jq -r --arg run_id "$run_id" '.workflow_runs[] | select((.id | tostring) == $run_id) | .created_at') + echo "Found artifact in run $ARTIFACT_RUN_ID: $ARTIFACT_NAME (ID: $ARTIFACT_ID, SHA: $ARTIFACT_RUN_SHA)" + return 0 + fi + done + + return 1 + } + + download_from_s3_manifest() { + local manifest_file manifest_fetch_url manifest_branch manifest_timestamp manifest_commit wheel_name wheel_url resolved_wheel_url + + mkdir -p aiter-whl + rm -f aiter-whl/amd_aiter*.whl + + manifest_file=$(mktemp) + trap 'rm -f "$manifest_file"' RETURN + manifest_fetch_url="${S3_MAIN_MANIFEST_URL}?ts=$(date +%s)" + curl -fsSL -H "Cache-Control: no-cache" "$manifest_fetch_url" -o "$manifest_file" || return 1 + + manifest_branch=$(jq -r '.branch // empty' "$manifest_file") + manifest_timestamp=$(jq -r '.timestamp // empty' "$manifest_file") + manifest_commit=$(jq -r '.commit // empty' "$manifest_file") + + wheel_name=$(jq -r ".wheels.${ATOM_PYTHON_TAG}.wheel_name // empty" "$manifest_file") + wheel_url=$(jq -r ".wheels.${ATOM_PYTHON_TAG}.wheel_url // empty" "$manifest_file") + if [ -n "$wheel_name" ] && [ -n "$wheel_url" ]; then + echo "Selected ${ATOM_PYTHON_TAG} wheel from versioned manifest" + else + wheel_name=$(jq -r '.wheel_name // empty' "$manifest_file") + wheel_url=$(jq -r '.wheel_url // empty' "$manifest_file") + echo "Versioned manifest not available, using top-level wheel fields" + fi + + if [ "$manifest_branch" != "main" ] || [ -z "$manifest_timestamp" ] || [ -z "$manifest_commit" ] || [ -z "$wheel_name" ] || [ -z "$wheel_url" ]; then + echo "Invalid latest main wheel manifest" + return 1 + fi + + if [[ "$wheel_name" == *cp* ]] && [[ "$wheel_name" != *${ATOM_PYTHON_TAG}* ]]; then + echo "WARNING: wheel $wheel_name does not match target Python ${ATOM_PYTHON_TAG}" + return 1 + fi + + if find_latest_artifact; then + if [ -n "$ARTIFACT_RUN_SHA" ] && [ "$manifest_commit" != "$ARTIFACT_RUN_SHA" ]; then + if [ -n 
"$ARTIFACT_RUN_CREATED_AT" ] && [[ "$manifest_timestamp" < "$ARTIFACT_RUN_CREATED_AT" ]]; then + echo "Manifest commit $manifest_commit is older than latest artifact run $ARTIFACT_RUN_ID ($ARTIFACT_RUN_SHA); treating manifest as stale" + return 1 + fi + echo "Manifest commit $manifest_commit differs from latest artifact run $ARTIFACT_RUN_ID ($ARTIFACT_RUN_SHA), but manifest timestamp is not older" + fi + else + echo "No GitHub fallback artifact found while checking manifest freshness" + fi + + resolved_wheel_url=$(resolve_download_url "$wheel_url") + + echo "Selected latest main wheel manifest: $S3_MAIN_MANIFEST_URL" + echo "Manifest timestamp: $manifest_timestamp" + echo "Manifest commit: $manifest_commit" + echo "Manifest wheel: $wheel_name" + echo "Downloading manifest-selected wheel: $resolved_wheel_url" + curl -fsSL "$resolved_wheel_url" -o "aiter-whl/$wheel_name" || return 1 + echo "Downloaded wheel from manifest: aiter-whl/$wheel_name" + + rm -f "$manifest_file" + trap - RETURN + } + + download_from_artifact() { + local fallback_wheel fallback_wheel_name + + echo "=== Falling back to latest ${ATOM_PYTHON_TAG} aiter-whl-* artifact from ROCm/aiter ===" + find_latest_artifact || { + echo "ERROR: No ${ATOM_PYTHON_TAG} aiter-whl-* artifact found in recent Aiter Test runs" + return 1 + } + + mkdir -p aiter-whl + rm -f aiter-whl/amd_aiter*.whl + curl -fsSL -H "$AUTH_HEADER" \ + "$API_URL/repos/ROCm/aiter/actions/artifacts/$ARTIFACT_ID/zip" \ + -o aiter-whl.zip + unzip -o aiter-whl.zip -d aiter-whl + rm -f aiter-whl.zip + + fallback_wheel=$(ls -t aiter-whl/amd_aiter*.whl 2>/dev/null | head -1) + fallback_wheel_name=$(basename "${fallback_wheel:-}") + if [ -z "$fallback_wheel" ] || [[ "$fallback_wheel_name" != *${ATOM_PYTHON_TAG}* ]]; then + echo "ERROR: artifact fallback did not produce a ${ATOM_PYTHON_TAG} wheel" + ls -la aiter-whl/ || true + return 1 + fi + echo "Downloaded artifact-selected wheel: $fallback_wheel" + } + + if download_from_s3_manifest; then + echo 
"Using wheel from S3 main manifest" + else + echo "Main wheel manifest download failed, falling back to GitHub artifact" + download_from_artifact + fi + + AITER_WHL=$(ls -t aiter-whl/amd_aiter*.whl 2>/dev/null | head -1) + if [ -z "$AITER_WHL" ]; then + echo "ERROR: No amd_aiter wheel available after S3/artifact attempts" + ls -la aiter-whl/ || true + exit 1 + fi + if [[ "$(basename "$AITER_WHL")" != *${ATOM_PYTHON_TAG}* ]]; then + echo "ERROR: selected wheel $AITER_WHL does not match target Python ${ATOM_PYTHON_TAG}" + exit 1 + fi + + echo "Selected wheel: $AITER_WHL" + + - name: Upload aiter wheel + uses: actions/upload-artifact@v4 + with: + name: aiter-whl + path: aiter-whl/amd_aiter*.whl + retention-days: 7 + + load-test-models: + name: Load test model configs + runs-on: ubuntu-latest + outputs: + models_json: ${{ steps.load.outputs.models_json }} + steps: + - uses: actions/checkout@v6 + - id: load + env: + EVENT_NAME: ${{ github.event_name }} + run: | + python3 << 'PY' + import json, os + event = os.environ["EVENT_NAME"] + # Atomesh standalone validates a small representative subset only. + # Keep this whitelist local; full ATOM accuracy owns test_level. 
+ level_map = {"schedule": "nightly", "workflow_dispatch": "nightly", "push": "main"} + current = level_map.get(event, "pr") + allowed = {"pr": {"pr"}, "main": {"pr", "main"}, "nightly": {"pr", "main", "nightly"}}[current] + models = json.load(open(".github/benchmark/models_accuracy.json", encoding="utf-8")) + atomesh_levels = { + "Meta-Llama-3-8B-Instruct": "pr", + "DeepSeek-R1-0528": "main", + "DeepSeek-V4-Pro MTP": "nightly", + "gpt-oss-120b": "nightly", + } + filtered = [m for m in models if atomesh_levels.get(m["model_name"], "skip") in allowed] + with open(os.environ["GITHUB_OUTPUT"], "a") as f: + f.write(f"models_json={json.dumps(filtered)}\n") + print(f"Event={event} level={current}: {len(filtered)}/{len(models)} models") + print(f"{'Model':<45} {'Atomesh':<10} {'ATOM':<10} {'Runner'}") + print("-" * 80) + for m in models: + enabled = "✓" if m in filtered else "·" + print( + f" {enabled} {m['model_name']:<43} " + f"{atomesh_levels.get(m['model_name'],'skip'):<10} " + f"{m.get('test_level','?'):<10} {m['runner']}" + ) + PY + + atomesh-test: + needs: [download_aiter_wheel, load-test-models] + name: Accuracy + strategy: + fail-fast: false + matrix: + include: ${{ fromJson(needs.load-test-models.outputs.models_json) }} + if: ${{ !github.event.pull_request || github.event.pull_request.draft == false }} + runs-on: ${{ matrix.runner }} + + env: + CONTAINER_NAME: atomesh_test_${{ strategy.job-index }} + USE_ATOMESH_ENTRYPOINTS: 1 + ATOM_SERVER_PORT: 8000 + + steps: + - name: Kill all Docker containers and clean up workspace + if: matrix.runner == 'atom-mi355-8gpu.predownload' || matrix.runner == 'linux-atom-do-mi350x-8' + run: | + echo "=== Cleaning up containers on $(hostname) ===" + containers=$(docker ps -q) + if [ -n "$containers" ]; then + docker kill $containers || true + fi + docker run --rm -v "${GITHUB_WORKSPACE:-$PWD}":/workspace -w /workspace --privileged rocm/pytorch:latest bash -lc "ls -la /workspace/ && find /workspace -mindepth 1 -delete" || true + + 
- name: Show Docker containers + if: matrix.runner == 'atom-mi355-8gpu.predownload' || matrix.runner == 'linux-atom-do-mi350x-8' + run: docker ps -a + + - name: Show ROCm memory usage + if: matrix.runner == 'atom-mi355-8gpu.predownload' || matrix.runner == 'linux-atom-do-mi350x-8' + run: rocm-smi --showmemuse + + - name: Show ROCm GPU processes + if: matrix.runner == 'atom-mi355-8gpu.predownload' || matrix.runner == 'linux-atom-do-mi350x-8' + run: rocm-smi --showpidgpus + + - name: Checkout ATOM repo + uses: actions/checkout@v6 + + - name: Docker Login + if: ${{ !github.event.pull_request.head.repo.fork }} + run: | + echo "${{ secrets.DOCKER_PASSWORD }}" | docker login -u ${{ secrets.DOCKER_USERNAME }} --password-stdin + + - name: Resolve immutable native dashboard image + if: ${{ github.ref == 'refs/heads/main' && (github.event_name == 'push' || github.event_name == 'schedule') }} + env: + DOCKER_USERNAME: ${{ secrets.DOCKER_USERNAME }} + DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }} + run: | + set -euo pipefail + if RESOLUTION_JSON="$( + python3 .github/scripts/resolve_atom_image.py \ + --repository rocm/atom-dev \ + --reference-tag latest \ + --image-family native + )"; then + RESOLVED_ATOM_IMAGE="$( + RESOLUTION_JSON="${RESOLUTION_JSON}" python3 - <<'PY' + import json + import os + + resolution = json.loads(os.environ["RESOLUTION_JSON"]) + print(resolution["resolved_image"]) + PY + )" + echo "Resolved native dashboard image: ${RESOLVED_ATOM_IMAGE}" + else + echo "::error::Failed to resolve ${ATOM_BASE_IMAGE} to an immutable reference for dashboard-uploading native runs." 
+ exit 1 + fi + echo "RESOLVED_ATOM_BASE_IMAGE=${RESOLVED_ATOM_IMAGE}" >> "$GITHUB_ENV" + echo "ATOM_DASHBOARD_DOCKER_IMAGE=${RESOLVED_ATOM_IMAGE}" >> "$GITHUB_ENV" + + - name: Pull immutable native dashboard image + if: ${{ github.ref == 'refs/heads/main' && (github.event_name == 'push' || github.event_name == 'schedule') }} + run: | + echo "Pulling immutable native dashboard image: ${RESOLVED_ATOM_BASE_IMAGE}" + docker pull "${RESOLVED_ATOM_BASE_IMAGE}" + + - name: Generate Dockerfile for forked repo + if: ${{ github.event.pull_request.head.repo.fork }} + run: | + cat < Dockerfile.mod + FROM ${{ env.ATOM_BASE_IMAGE }} + RUN pip install -U lm-eval[api] + RUN pip show lm-eval || true + RUN pip install hf_transfer + RUN pip show hf_transfer || true + RUN echo "=== Aiter version BEFORE uninstall ===" && pip show amd-aiter || true + RUN pip uninstall -y amd-aiter + RUN pip install --upgrade "pybind11>=3.0.1" + RUN pip show pybind11 + RUN rm -rf /app/aiter-test + RUN git clone --depth 1 -b ${{ env.AITER_GIT_REF }} https://github.com/ROCm/aiter.git /app/aiter-test && \\ + cd /app/aiter-test && \\ + git submodule sync && git submodule update --init --recursive && \\ + MAX_JOBS=64 PREBUILD_KERNELS=0 GPU_ARCHS=gfx950 python3 setup.py develop + RUN echo "=== Aiter version AFTER installation ===" && pip show amd-aiter || true + + RUN echo "=== ATOM version BEFORE uninstall ===" && pip show atom || true + RUN pip uninstall -y atom + RUN rm -rf /app/ATOM + ARG RUST_VERSION="1.94.0" + RUN if ! command -v cargo >/dev/null 2>&1; then \\ + echo "=== Installing Rust toolchain for atomesh build ===" && \\ + apt-get update && \\ + apt --fix-broken install -y && \\ + apt-get install -y --no-install-recommends curl build-essential pkg-config libssl-dev protobuf-compiler libprotobuf-dev && \\ + rm -rf /var/lib/apt/lists/* && \\ + curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs \\ + | sh -s -- -y --default-toolchain "\${RUST_VERSION}" --profile minimal && \\ + . 
"\$HOME/.cargo/env" && \\ + rustc --version && cargo --version; \\ + fi + ENV PATH="/root/.cargo/bin:\$PATH" + RUN git clone ${{ env.GITHUB_REPO_URL }} /app/ATOM && \\ + cd /app/ATOM && \\ + git checkout ${{ env.GITHUB_COMMIT_SHA }} && \\ + ATOM_MESH_BUILD=1 python -m pip install -e . + + RUN echo "=== ATOM version AFTER installation ===" && pip show atom || true + EOF + + - name: Download aiter wheel + uses: actions/download-artifact@v4 + with: + name: aiter-whl + path: /tmp/aiter-whl + + - name: Set HF token for predownload runner + if: matrix.runner == 'atom-mi355-8gpu.predownload' || matrix.runner == 'linux-atom-do-mi350x-8' + run: echo "HF_TOKEN=${HF_TOKEN:-${{ secrets.AMD_HF_TOKEN }}}" >> "$GITHUB_ENV" + + - name: Start CI container + run: | + echo "Clean up containers..." + (docker ps -aq -f name="^${CONTAINER_NAME}$" | xargs -r docker stop) || true + (docker ps -aq -f name="^${CONTAINER_NAME}$" | xargs -r docker rm) || true + + if [ -f "/etc/podinfo/gha-render-devices" ]; then + DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices) + else + DEVICE_FLAG="--device /dev/dri" + fi + + if [ -d "/models" ]; then + MODEL_MOUNT="-v /models:/models" + else + echo "Warning: /models directory not found on runner; skipping /models mount and disabling model pre-download optimization." 
+ MODEL_MOUNT="" + fi + + # Write env_vars via env block (avoids expression injection) + printenv MODEL_ENV_VARS | grep -v '^$' > /tmp/env_file.txt || true + + IMAGE_TAG="${RESOLVED_ATOM_BASE_IMAGE:-$ATOM_BASE_IMAGE}" + echo "Starting container with image: $IMAGE_TAG" + echo "Model-specific environment variables:" + cat /tmp/env_file.txt + + PULL_FLAG="" + if [ -n "${RESOLVED_ATOM_BASE_IMAGE:-}" ]; then + PULL_FLAG="" + elif [ "${{ matrix.runner }}" = "atom-mi355-8gpu.predownload" ] || [ "${{ matrix.runner }}" = "linux-atom-do-mi350x-8" ]; then + PULL_FLAG="--pull always" + fi + + docker run -dt $PULL_FLAG --device=/dev/kfd $DEVICE_FLAG \ + -v "${GITHUB_WORKSPACE:-$PWD}":/workspace \ + $MODEL_MOUNT \ + -w /workspace \ + --ipc=host --group-add video \ + --shm-size=16G \ + --privileged \ + --cap-add=SYS_PTRACE \ + -e HF_TOKEN="${HF_TOKEN:-}" \ + -e ATOM_DOCKER_IMAGE="${ATOM_DASHBOARD_DOCKER_IMAGE:-}" \ + -e USE_ATOMESH_ENTRYPOINTS="${USE_ATOMESH_ENTRYPOINTS}" \ + -e ATOM_SERVER_PORT="${ATOM_SERVER_PORT}" \ + --env-file /tmp/env_file.txt \ + --security-opt seccomp=unconfined \ + --ulimit memlock=-1 \ + --ulimit stack=67108864 \ + -e ATOM_DISABLE_MMAP=true \ + -v "${{ github.workspace }}:/workspace" \ + -w /workspace \ + --name "$CONTAINER_NAME" \ + $IMAGE_TAG + + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + MODEL_ENV_VARS: ${{ matrix.env_vars }} + + - name: Check shm size + run: | + docker exec "$CONTAINER_NAME" df -h /dev/shm + + - name: Collect GPU info (inside container) + id: gpu-info + run: bash .github/scripts/collect_gpu_info.sh "$CONTAINER_NAME" docker "${{ matrix.runner }}" + + - name: Install aiter from wheel + run: | + AITER_WHL=$(ls -t /tmp/aiter-whl/amd_aiter*.whl 2>/dev/null | head -1) + if [ -z "$AITER_WHL" ]; then + echo "ERROR: No amd_aiter wheel found" + ls -la /tmp/aiter-whl/ + exit 1 + fi + + echo "=== Copying wheel into container ===" + WHL_NAME=$(basename "$AITER_WHL") + docker cp "$AITER_WHL" "$CONTAINER_NAME:/tmp/$WHL_NAME" + + docker 
exec "$CONTAINER_NAME" bash -lc " + set -euo pipefail + echo '=== Uninstalling existing amd-aiter ===' + pip uninstall -y amd-aiter || true + + echo '=== Installing amd-aiter from wheel ===' + pip install /tmp/$WHL_NAME + + echo '=== Installed amd-aiter version ===' + pip show amd-aiter + " + + - name: Install ATOM and dependencies + run: | + docker exec "$CONTAINER_NAME" bash -lc " + set -euo pipefail + pip install --timeout 60 --retries 10 -U 'lm-eval[api]' + pip install --timeout 60 --retries 10 hf_transfer + pip install --timeout 60 --retries 10 --upgrade 'pybind11>=3.0.1' + if ! command -v cargo >/dev/null 2>&1; then + echo '=== Installing Rust toolchain for atomesh build ===' + RUST_VERSION='1.94.0' + apt-get update + apt --fix-broken install -y + apt-get install -y --no-install-recommends curl build-essential pkg-config libssl-dev protobuf-compiler libprotobuf-dev + rm -rf /var/lib/apt/lists/* + curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs \ + | sh -s -- -y --default-toolchain \"\$RUST_VERSION\" --profile minimal + . \"\$HOME/.cargo/env\" + fi + export PATH=\"\$HOME/.cargo/bin:\$PATH\" + rustc --version + cargo --version + + echo '=== Installing ATOM ===' + cd /workspace + git config --global --add safe.directory /workspace + ATOM_MESH_BUILD=1 python -m pip install -e . + + echo '=== Installed package versions ===' + pip show amd-aiter | grep -E '^(Name|Version):' + pip show atom | grep -E '^(Name|Version):' + pip show triton | grep -E '^(Name|Version):' + pip show torch | grep -E '^(Name|Version):' + " + + - name: Download models + timeout-minutes: 150 + run: | + set -euo pipefail + if [ -d "/models" ]; then + model_dir="/models/${{ matrix.model_path }}" + echo "/models directory found, checking cache and lock-protected download for ${model_dir}" + if ! 
docker exec \ + -e HF_TOKEN="${HF_TOKEN:-}" \ + -e MODEL_ID="${{ matrix.model_path }}" \ + -e TARGET_DIR="${model_dir}" \ + -e MODEL_DOWNLOAD_TIMEOUT="${MODEL_DOWNLOAD_TIMEOUT}" \ + -e MODEL_LOCK_WAIT_SECONDS="${MODEL_LOCK_WAIT_SECONDS}" \ + -e MODEL_LOCK_POLL_INTERVAL="${MODEL_LOCK_POLL_INTERVAL}" \ + -e MODEL_PROGRESS_INTERVAL="${MODEL_PROGRESS_INTERVAL}" \ + "$CONTAINER_NAME" bash -lc 'bash /workspace/.github/scripts/download_model_with_lock.sh "$MODEL_ID" "$TARGET_DIR"'; then + echo "Model download failed for '${{ matrix.model_path }}'. Aborting." + exit 1 + fi + else + echo "/models directory not found, skipping model download" + fi + env: + MODEL_DOWNLOAD_TIMEOUT: "30m" + MODEL_LOCK_WAIT_SECONDS: "1800" + MODEL_LOCK_POLL_INTERVAL: "30" + MODEL_PROGRESS_INTERVAL: "60" + + - name: Run ATOM simple inference + # Skip simple inference; accuracy test already validates correctness + if: false + timeout-minutes: 30 + run: | + # Run the inference and capture output + set -euo pipefail + + echo "" + echo "========== Running test ==========" + + if [ -d "/models" ]; then + model_path="/models/${{ matrix.model_path }}" + else + model_path="${{ matrix.model_path }}" + fi + echo "Model path: $model_path" + ls -la $model_path || true + # Print debug logs + echo "========= Runner debug logs ===============" + ps aux + rocm-smi --showmemuse + rocm-smi --showpids + docker ps -a + echo "========= End runner debug logs ===============" + docker exec "$CONTAINER_NAME" bash -lc " + set -euo pipefail + python3 -m atom.examples.simple_inference \ + --model \"$model_path\" \ + ${{ matrix.extraArgs }} \ + --temperature 0 \ + | grep -E '^Prompt: |^Completion:' + " > atom_test_output.txt + + echo "" + echo "========== Showing test output below ==========" + cat atom_test_output.txt + + - name: Compare output with golden outputs + if: false + timeout-minutes: 30 + # TODO: skip for all test until it's fixed + run: | + echo "========== Comparing output with golden outputs ==========" + if 
! diff -u -B -w --strip-trailing-cr \ + atom_test_output.txt \ + ".github/workflows/golden_outputs/${{ matrix.model_name }}_golden_output.txt"; then + echo "Failed: Output does not match golden outputs." + exit 1 + else + echo "Success: Output matches golden outputs." + fi + + - name: Run ATOM accuracy test + timeout-minutes: 30 + env: + MODEL_EXTRA_ARGS: ${{ matrix.extraArgs }} + CLIENT_COMMAND: ${{ matrix.client_command || '' }} + run: | + set -euo pipefail + echo "" + echo "========== Launching ATOM server ==========" + if [ -d "/models" ]; then + model_path="/models/${{ matrix.model_path }}" + else + model_path="${{ matrix.model_path }}" + fi + # Pipe via stdin so container bash parses shell quoting in extraArgs + # (e.g. single-quoted JSON in --default-chat-template-kwargs) naturally. + echo ".github/scripts/atom_test.sh launch $model_path $MODEL_EXTRA_ARGS" | \ + docker exec -i "$CONTAINER_NAME" bash -l + echo "" + echo "========== Running accuracy test ==========" + docker exec \ + -e CLIENT_COMMAND="${CLIENT_COMMAND}" \ + -e GPU_NAME="${{ steps.gpu-info.outputs.gpu_name }}" \ + -e GPU_VRAM_GB="${{ steps.gpu-info.outputs.gpu_vram_gb }}" \ + -e ROCM_VERSION="${{ steps.gpu-info.outputs.rocm_version }}" \ + "$CONTAINER_NAME" bash -lc " + .github/scripts/atom_test.sh accuracy $model_path + " 2>&1 | tee atom_accuracy_output.txt + + - name: Dump server log + if: always() + run: | + docker exec "$CONTAINER_NAME" cat /tmp/atom_server.log 2>/dev/null || true + + - name: Dump client log + if: always() + run: | + docker exec "$CONTAINER_NAME" cat /tmp/atom_client.log 2>/dev/null || true + + - name: Check accuracy test results + if: success() + env: + MODEL_NAME: ${{ matrix.model_name }} + run: | + result_file=$(ls -1t accuracy_test_results/*.json 2>/dev/null | head -n 1) + if [ -z "$result_file" ] || [ ! 
-f "$result_file" ]; then + echo "ERROR: No results JSON file found in accuracy_test_results/" + exit 2 + else + echo "RESULT_FILE: $result_file" + fi + flexible_extract_value=$(jq '.results.gsm8k["exact_match,flexible-extract"]' "$result_file") + echo "Flexible extract value: $flexible_extract_value" + + # Read threshold from models_accuracy.json (via env var to avoid shell injection) + threshold=$(python3 -c " + import json, os + models = json.load(open('.github/benchmark/models_accuracy.json', encoding='utf-8')) + name = os.environ['MODEL_NAME'] + t = next((m.get('accuracy_threshold', 0) for m in models if m['model_name'] == name), 0) + print(t) + ") + echo "Accuracy test threshold: $threshold" + + # jq prints "null" when the metric is missing; awk would compare that as a string and report a pass, so any non-numeric score counts as a failure. + result=$(awk -v val="$flexible_extract_value" -v threshold="$threshold" 'BEGIN {print (val !~ /^-?[0-9]+(\.[0-9]+)?([eE][-+]?[0-9]+)?$/ || val + 0 < threshold + 0) ? 1 : 0}') + if [ "$result" -eq 1 ]; then + echo "Accuracy test failed: $flexible_extract_value < $threshold" + exit 1 + else + echo "Accuracy test passed: $flexible_extract_value >= $threshold" + fi + + - name: Collect Test Summary + if: success() + env: + MODEL_NAME: ${{ matrix.model_name }} + run: | + # Read threshold and score for summary + threshold=$(python3 -c " + import json, os + models = json.load(open('.github/benchmark/models_accuracy.json', encoding='utf-8')) + name = os.environ['MODEL_NAME'] + print(next((m.get('accuracy_threshold', 0) for m in models if m['model_name'] == name), 0)) + ") + result_file=$(ls -1t accuracy_test_results/*.json 2>/dev/null | head -n 1) + score=$(jq '.results.gsm8k["exact_match,flexible-extract"]' "$result_file" 2>/dev/null || echo "N/A") + + echo "Accuracy Test Summary for ${{ matrix.model_name }} (threshold: ${threshold}, score: ${score}):" >> "$GITHUB_STEP_SUMMARY" + awk '/\|Tasks\|Version\|/,/^$/ { if (NF > 0) print }' atom_accuracy_output.txt >> "$GITHUB_STEP_SUMMARY" + + - name: Upload output + if: always() + uses: actions/upload-artifact@v7 + with: + name: ${{ matrix.model_name }}_atom_test_output.txt + path:
atom_test_output.txt + + - name: Upload accuracy results + if: always() + uses: actions/upload-artifact@v7 + with: + name: accuracy-${{ matrix.model_name }} + path: accuracy_test_results/*.json + if-no-files-found: ignore + + - name: Clean Up + if: always() + run: | + # TODO: run a separate container for cleanup of the workspace due to permission issue to remove some pyc files under __pycache__ whose owners are root. + # We should use non-root user to run the test to avoid this issue. + set -x + echo "========== Cleaning up workspace ==========" + # Quote the runner label so an empty or multi-word value cannot turn the [[ ]] test into a syntax error and skip the cleanup below. + if [[ "${{ matrix.runner }}" == "atom-mi355-8gpu.predownload" ]]; then + docker run --rm -v "${GITHUB_WORKSPACE:-$PWD}":/workspace -w /workspace --privileged rocm/pytorch:latest bash -lc "ls -la /workspace/ && find /workspace -mindepth 1 -delete" || true + fi + docker stop "$CONTAINER_NAME" || true + docker rm "$CONTAINER_NAME" || true + # Remove the pre-built image to free disk space on the runner + docker rmi "rocm/atom-dev:pre-build-${{ env.GITHUB_COMMIT_SHA }}" || true + + # ---------- Publish Atomesh accuracy data for the mocker benchmark dashboard ---------- + publish-atomesh-accuracy-data: + name: Publish Atomesh accuracy data + needs: [atomesh-test] + if: always() && github.ref == 'refs/heads/main' && (github.event_name == 'push' || github.event_name == 'schedule') + runs-on: ubuntu-latest + permissions: + actions: read + contents: write + steps: + - uses: actions/checkout@v6 + + - uses: actions/setup-python@v6 + with: + python-version: '3.12' + + - name: Download accuracy artifacts + uses: actions/download-artifact@v8 + with: + path: /tmp/accuracy-results + pattern: accuracy-* + + - name: List downloaded artifacts + run: | + echo "=== Downloaded accuracy artifacts ===" + find /tmp/accuracy-results -type f -name '*.json' | head -20 || echo "No JSON files found" + + - name: Transform accuracy results for mocker dashboard data + run: | + python3 .github/scripts/accuracy_to_dashboard.py \ + /tmp/accuracy-results \ +
--output accuracy-benchmark-input.json \ + --models .github/benchmark/models_accuracy.json \ + --backend ATOMesh \ + --run-url "https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" + echo "=== Generated entries ===" + cat accuracy-benchmark-input.json + + - name: Store Atomesh accuracy data + if: hashFiles('accuracy-benchmark-input.json') != '' + uses: benchmark-action/github-action-benchmark@v1 + with: + tool: customBiggerIsBetter + output-file-path: accuracy-benchmark-input.json + gh-pages-branch: gh-pages + benchmark-data-dir-path: atomesh-accuracy-dashboard + auto-push: true + max-items-in-chart: 300 + github-token: ${{ secrets.GITHUB_TOKEN }} + diff --git a/.github/workflows/atomesh-mocker-benchmark.yaml b/.github/workflows/atomesh-mocker-benchmark.yaml new file mode 100644 index 0000000000..826599a8b3 --- /dev/null +++ b/.github/workflows/atomesh-mocker-benchmark.yaml @@ -0,0 +1,288 @@ +name: Atomesh Mocker Benchmark + +on: + push: + branches: [main] + paths: + - 'atom/mesh/**' + - '.github/scripts/atomesh_mocker_benchmark.sh' + - '.github/workflows/atomesh-mocker-benchmark.yaml' + - '.github/dashboard/atomesh_mocker_index.html' + - 'docs/assets/atomesh_logo.png' + pull_request: + branches: [main] + types: [opened, synchronize, reopened, ready_for_review] + paths: + - 'atom/mesh/**' + - '.github/scripts/atomesh_mocker_benchmark.sh' + - '.github/workflows/atomesh-mocker-benchmark.yaml' + - '.github/dashboard/atomesh_mocker_index.html' + - 'docs/assets/atomesh_logo.png' + schedule: + # Nightly at 02:00 Beijing time (18:00 UTC) + - cron: '0 18 * * *' + workflow_dispatch: + inputs: + suite: + description: 'Benchmark suite: smoke runs 1P1D/c=1 sanity check; full runs 1P1D, 2P1D, 3P1D across c=1,2,4,8,16' + required: false + default: 'full' + type: choice + options: + - smoke + - full + publish_dashboard: + description: 'Publish workflow_dispatch results to the benchmark dashboard' + required: false + default: false + type: boolean + 
+concurrency: + group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }} + cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} + +permissions: + actions: read + contents: write + +jobs: + run_atomesh_test_harness: + if: ${{ !github.event.pull_request || github.event.pull_request.draft == false }} + name: run_atomesh_test_harness + runs-on: ubuntu-latest + steps: + - name: Checkout ATOM repo + uses: actions/checkout@v6 + + - name: Set up build environment + run: | + sudo apt-get update + sudo apt-get install -y protobuf-compiler + + - name: Cache cargo build output + uses: actions/cache@v4 + with: + path: | + ~/.cargo/registry + ~/.cargo/git + atom/mesh/mocker/target + key: atomesh-mocker-cargo-${{ runner.os }}-${{ hashFiles('atom/mesh/Cargo.lock', 'atom/mesh/Cargo.toml', 'atom/mesh/mocker/Cargo.toml') }} + restore-keys: | + atomesh-mocker-cargo-${{ runner.os }}- + + - name: Run Atomesh test harness + run: | + set -euo pipefail + cargo test \ + --manifest-path atom/mesh/mocker/Cargo.toml \ + --target-dir atom/mesh/mocker/target/mocker \ + --release \ + test_atomesh_harness + + run_atomesh_mocker_benchmark: + if: ${{ !github.event.pull_request || github.event.pull_request.draft == false }} + name: run_atomesh_mocker_benchmark + needs: [run_atomesh_test_harness] + runs-on: ubuntu-latest + timeout-minutes: 75 + steps: + - name: Checkout ATOM repo + uses: actions/checkout@v6 + + - name: Build benchmark matrix + env: + SUITE: ${{ github.event_name == 'workflow_dispatch' && inputs.suite || 'full' }} + run: | + python3 <<'PY' + import json + import os + + suite = os.environ.get("SUITE", "full") + if suite == "smoke": + duration = "30s" + consumer_threads = [1] + topologies = [(1, 1)] + elif suite == "full": + duration = "3m" + consumer_threads = [1, 2, 4, 8, 16] + topologies = [(1, 1), (2, 1), (3, 1)] + else: + raise SystemExit(f"Unsupported suite={suite}") + + cells = [] + + def add_pd(duration, prefill, decode, consumers): + cells.append({ + "id": 
f"pd-chat-{prefill}p{decode}d-conc{consumers}", + "display": f"pd-chat {prefill}P{decode}D CONC{consumers}", + "scenario": "pd-chat", + "duration": duration, + "prefill_workers": prefill, + "decode_workers": decode, + "producer_threads": 1, + "consumer_threads": consumers, + }) + + for prefill, decode in topologies: + for consumers in consumer_threads: + add_pd(duration, prefill, decode, consumers) + + cells_json = json.dumps(cells) + with open(os.environ["GITHUB_ENV"], "a", encoding="utf-8") as env: + env.write(f"CELLS_JSON={cells_json}\n") + + print(f"Generated {len(cells)} benchmark cells for suite={suite}") + for cell in cells: + print( + f" {cell['id']}: scenario={cell['scenario']} duration={cell['duration']} " + f"P/D={cell['prefill_workers']}/{cell['decode_workers']} " + f"producer/consumer={cell['producer_threads']}/{cell['consumer_threads']}" + ) + PY + + - name: Cache Rust build artifacts + uses: actions/cache@v4 + with: + path: | + ~/.cargo/registry + ~/.cargo/git + atom/mesh/mocker/target + key: atomesh-mocker-cargo-${{ runner.os }}-${{ hashFiles('atom/mesh/Cargo.lock', 'atom/mesh/Cargo.toml', 'atom/mesh/mocker/Cargo.toml') }} + restore-keys: | + atomesh-mocker-cargo-${{ runner.os }}- + + - name: Build Atomesh + run: | + set -euo pipefail + sudo apt-get update + sudo apt-get install -y protobuf-compiler + cargo build \ + --manifest-path atom/mesh/mocker/Cargo.toml \ + --target-dir atom/mesh/mocker/target/mocker \ + --release + cargo build \ + --manifest-path atom/mesh/Cargo.toml \ + --target-dir atom/mesh/mocker/target/mesh \ + --release + + - name: Run mocker benchmark + env: + RESULT_DIR: atomesh-mocker-results + run: | + set -euo pipefail + chmod +x .github/scripts/atomesh_mocker_benchmark.sh + python3 .github/scripts/atomesh_mocker_benchmark_summary.py + + - name: Dump mocker benchmark-request log + if: always() + run: | + set -euo pipefail + shopt -s nullglob + logs=(atomesh-mocker-results/logs/*/benchmark-request.log) + if [ "${#logs[@]}" -eq 0 ]; 
then + echo "No Atomesh mocker benchmark-request logs found." + exit 0 + fi + + for log in "${logs[@]}"; do + cell="$(basename "$(dirname "$log")")" + echo "::group::benchmark-request ${cell}" + cat "$log" + echo "::endgroup::" + done + + - name: Summarize mocker benchmark result + if: always() + run: | + set -euo pipefail + if [ -f "atomesh-mocker-results/benchmark-summary.md" ]; then + cat "atomesh-mocker-results/benchmark-summary.md" + cat "atomesh-mocker-results/benchmark-summary.md" >> "$GITHUB_STEP_SUMMARY" + else + echo "No Atomesh mocker benchmark summary was generated." >> "$GITHUB_STEP_SUMMARY" + fi + + - name: Upload benchmark result + if: always() + uses: actions/upload-artifact@v7 + with: + name: atomesh-mocker-benchmark-results + path: | + atomesh-mocker-results/*.json + atomesh-mocker-results/*.md + atomesh-mocker-results/logs/ + if-no-files-found: ignore + + dashboard: + name: Update Mocker Benchmark Dashboard + needs: [run_atomesh_mocker_benchmark] + if: >- + !cancelled() + && needs.run_atomesh_mocker_benchmark.result == 'success' + && ( + github.event_name == 'schedule' + || github.event_name == 'push' + || (github.event_name == 'workflow_dispatch' && inputs.publish_dashboard) + ) + runs-on: ubuntu-latest + steps: + - name: Checkout ATOM repo + uses: actions/checkout@v6 + + - name: Download benchmark artifacts + uses: actions/download-artifact@v8 + with: + pattern: atomesh-mocker-benchmark-* + merge-multiple: true + path: atomesh-mocker-results + + - name: Build benchmark-action input + run: | + set -euo pipefail + python3 - <<'PY' + import json + from pathlib import Path + + entries = [] + for path in sorted(Path("atomesh-mocker-results").glob("*-benchmark-action.json")): + entries.extend(json.loads(path.read_text(encoding="utf-8"))) + + Path("atomesh-mocker-dashboard-input.json").write_text( + json.dumps(entries, indent=2), + encoding="utf-8", + ) + print(f"Generated {len(entries)} dashboard entries") + PY + + - name: Store benchmark result to 
dashboard + if: hashFiles('atomesh-mocker-dashboard-input.json') != '' + uses: benchmark-action/github-action-benchmark@v1 + with: + tool: customBiggerIsBetter + output-file-path: atomesh-mocker-dashboard-input.json + gh-pages-branch: gh-pages + benchmark-data-dir-path: atomesh-mocker-dashboard + auto-push: false + max-items-in-chart: 300 + github-token: ${{ secrets.GITHUB_TOKEN }} + + - name: Deploy mocker benchmark dashboard to gh-pages + if: hashFiles('atomesh-mocker-dashboard-input.json') != '' + run: | + set -euo pipefail + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + DASHBOARD_TEMPLATE=$(mktemp) + LOGO_ASSET=$(mktemp) + cp .github/dashboard/atomesh_mocker_index.html "$DASHBOARD_TEMPLATE" + cp docs/assets/atomesh_logo.png "$LOGO_ASSET" + CURRENT_SHA=$(git rev-parse HEAD) + git fetch origin gh-pages + git checkout gh-pages + mkdir -p atomesh-mocker-dashboard + cp "$DASHBOARD_TEMPLATE" atomesh-mocker-dashboard/index.html + cp "$LOGO_ASSET" atomesh-mocker-dashboard/atomesh_logo.png + git add atomesh-mocker-dashboard/ + git diff --cached --quiet || git commit -m "Update Atomesh mocker benchmark dashboard" + git push origin gh-pages + git checkout "$CURRENT_SHA" diff --git a/atom/mesh/mocker/fixtures/grpc_pd_generate.json b/atom/mesh/mocker/fixtures/grpc_pd_generate.json index 83b2c94b8b..4278a47dcf 100644 --- a/atom/mesh/mocker/fixtures/grpc_pd_generate.json +++ b/atom/mesh/mocker/fixtures/grpc_pd_generate.json @@ -1,6 +1,6 @@ { "name": "grpc_pd_generate", - "model": "test-model", + "model": "hf-internal-testing/llama-tokenizer", "endpoint": "/generate", "route": { "worker_kind": "prefill_decode", @@ -8,7 +8,7 @@ "backend": "sglang" }, "request": { - "model": "test-model", + "model": "hf-internal-testing/llama-tokenizer", "text": "Hello world", "stream": false }, diff --git a/atom/mesh/mocker/fixtures/grpc_regular_generate.json b/atom/mesh/mocker/fixtures/grpc_regular_generate.json index 
191f8ea4d9..6b485eed2a 100644 --- a/atom/mesh/mocker/fixtures/grpc_regular_generate.json +++ b/atom/mesh/mocker/fixtures/grpc_regular_generate.json @@ -1,6 +1,6 @@ { "name": "grpc_regular_generate", - "model": "test-model", + "model": "hf-internal-testing/llama-tokenizer", "endpoint": "/generate", "route": { "worker_kind": "regular", @@ -8,7 +8,7 @@ "backend": "sglang" }, "request": { - "model": "test-model", + "model": "hf-internal-testing/llama-tokenizer", "text": "Hello world", "stream": false }, diff --git a/atom/mesh/mocker/fixtures/grpc_regular_generate_vllm.json b/atom/mesh/mocker/fixtures/grpc_regular_generate_vllm.json index c94e4e683b..816b84438c 100644 --- a/atom/mesh/mocker/fixtures/grpc_regular_generate_vllm.json +++ b/atom/mesh/mocker/fixtures/grpc_regular_generate_vllm.json @@ -1,6 +1,6 @@ { "name": "grpc_regular_generate_vllm", - "model": "test-model", + "model": "hf-internal-testing/llama-tokenizer", "endpoint": "/generate", "route": { "worker_kind": "regular", @@ -8,7 +8,7 @@ "backend": "vllm" }, "request": { - "model": "test-model", + "model": "hf-internal-testing/llama-tokenizer", "text": "Hello world", "stream": false }, diff --git a/atom/mesh/mocker/fixtures/http_pd_chat.json b/atom/mesh/mocker/fixtures/http_pd_chat.json index cccfb97cbe..e548f179cf 100644 --- a/atom/mesh/mocker/fixtures/http_pd_chat.json +++ b/atom/mesh/mocker/fixtures/http_pd_chat.json @@ -1,6 +1,6 @@ { "name": "http_pd_chat", - "model": "test-model", + "model": "hf-internal-testing/llama-tokenizer", "endpoint": "/v1/chat/completions", "route": { "worker_kind": "prefill_decode", @@ -20,7 +20,7 @@ "body": { "id": "chatcmpl-pd-test", "object": "chat.completion", - "model": "test-model", + "model": "hf-internal-testing/llama-tokenizer", "choices": [ { "index": 0, diff --git a/atom/mesh/mocker/fixtures/http_regular_chat.json b/atom/mesh/mocker/fixtures/http_regular_chat.json index 45bd74e7db..30d96dea56 100644 --- a/atom/mesh/mocker/fixtures/http_regular_chat.json +++ 
b/atom/mesh/mocker/fixtures/http_regular_chat.json @@ -1,6 +1,6 @@ { "name": "http_regular_chat", - "model": "test-model", + "model": "hf-internal-testing/llama-tokenizer", "endpoint": "/v1/chat/completions", "route": { "worker_kind": "regular", @@ -20,7 +20,7 @@ "body": { "id": "chatcmpl-test", "object": "chat.completion", - "model": "test-model", + "model": "hf-internal-testing/llama-tokenizer", "choices": [ { "index": 0, diff --git a/atom/mesh/mocker/fixtures/http_regular_chat_streaming.json b/atom/mesh/mocker/fixtures/http_regular_chat_streaming.json index 23800f2b46..f998d2c4c8 100644 --- a/atom/mesh/mocker/fixtures/http_regular_chat_streaming.json +++ b/atom/mesh/mocker/fixtures/http_regular_chat_streaming.json @@ -1,6 +1,6 @@ { "name": "http_regular_chat_streaming", - "model": "test-model", + "model": "hf-internal-testing/llama-tokenizer", "endpoint": "/v1/chat/completions", "route": { "worker_kind": "regular", diff --git a/atom/mesh/mocker/fixtures/http_regular_completion.json b/atom/mesh/mocker/fixtures/http_regular_completion.json index af462fe9ad..b47adcc1b7 100644 --- a/atom/mesh/mocker/fixtures/http_regular_completion.json +++ b/atom/mesh/mocker/fixtures/http_regular_completion.json @@ -1,6 +1,6 @@ { "name": "http_regular_completion", - "model": "test-model", + "model": "hf-internal-testing/llama-tokenizer", "endpoint": "/v1/completions", "route": { "worker_kind": "regular", @@ -16,7 +16,7 @@ "body": { "id": "cmpl-test", "object": "text_completion", - "model": "test-model", + "model": "hf-internal-testing/llama-tokenizer", "choices": [ { "index": 0, diff --git a/atom/mesh/mocker/fixtures/http_regular_generate.json b/atom/mesh/mocker/fixtures/http_regular_generate.json index 4238567ce0..108ebab95c 100644 --- a/atom/mesh/mocker/fixtures/http_regular_generate.json +++ b/atom/mesh/mocker/fixtures/http_regular_generate.json @@ -1,6 +1,6 @@ { "name": "http_regular_generate", - "model": "test-model", + "model": "hf-internal-testing/llama-tokenizer", 
"endpoint": "/generate", "route": { "worker_kind": "regular", diff --git a/docs/assets/atomesh_logo.png b/docs/assets/atomesh_logo.png new file mode 100644 index 0000000000..14ae5d89fb Binary files /dev/null and b/docs/assets/atomesh_logo.png differ