diff --git a/.github/file-filter.yml b/.github/file-filter.yml
index a2910c89af..b095532c6e 100644
--- a/.github/file-filter.yml
+++ b/.github/file-filter.yml
@@ -25,7 +25,6 @@ yml: &yml
   - '.github/workflows/phoenix/**'
   - '.github/workflows/frontier/**'
   - '.github/workflows/frontier_amd/**'
-  - '.github/workflows/bench.yml'
   - '.github/workflows/test.yml'
   - '.github/workflows/formatting.yml'
diff --git a/.github/scripts/frontier_bench_config.sh b/.github/scripts/frontier_bench_config.sh
new file mode 100755
index 0000000000..0ebbe2a290
--- /dev/null
+++ b/.github/scripts/frontier_bench_config.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+# Run a single benchmark on a Frontier compute node (build already done on login node).
+# Usage: frontier_bench_config.sh <cluster> <device> <interface>
+# Runs inside a SLURM allocation on an ssh'd compute node.
+
+set -e
+set -x
+
+cluster=$1; device=$2; interface=$3
+
+flag="f"; [ "$cluster" = "frontier_amd" ] && flag="famd"
+mode="g"; [ "$device" = "cpu" ] && mode="c"
+
+. ./mfc.sh load -c "$flag" -m "$mode"
+
+# Benchmark
+job_slug="bench-${device}-${interface}"
+n_ranks=12
+device_opts=""
+if [ "$device" = "gpu" ]; then
+    gpus=$(rocm-smi --showid | awk '{print $1}' | grep -Eo '[0-9]+' | uniq | tr '\n' ' ')
+    n_ranks=$(echo "$gpus" | wc -w)
+    if [ "$n_ranks" -lt 1 ] || [ "$n_ranks" -gt 16 ]; then
+        echo "ERROR: Unexpected GPU count ($n_ranks). Expected 1-16 for Frontier MI250X."
+        echo "rocm-smi output:"
+        rocm-smi --showid
+        exit 1
+    fi
+    echo "Detected $n_ranks GPUs: $gpus"
+    gpu_ids=$(echo "$gpus" | tr ' ' '\n' | tr '\n' ' ' | sed 's/ $//')
+    device_opts="--gpu"
+    [ "$interface" = "acc" ] && device_opts+=" acc"
+    [ "$interface" = "omp" ] && device_opts+=" mp"
+    device_opts+=" -g $gpu_ids"
+fi
+
+if [ "$device" = "gpu" ]; then
+    ./mfc.sh bench --mem 12 -j $n_ranks -o "$job_slug.yaml" -- -c "$cluster" $device_opts -n $n_ranks
+else
+    ./mfc.sh bench --mem 1 -j $(nproc) -o "$job_slug.yaml" -- -c "$cluster" $device_opts -n $n_ranks
+fi
diff --git a/.github/scripts/frontier_bench_post.sh b/.github/scripts/frontier_bench_post.sh
new file mode 100755
index 0000000000..2faebdff2e
--- /dev/null
+++ b/.github/scripts/frontier_bench_post.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+# Post-process all Frontier benchmark results after the SLURM job completes.
+# Runs bench_diff for each config, comparing master vs PR YAML outputs.
+
+set -euo pipefail
+
+# Benchmark configs: cluster device interface flag
+bench_configs=(
+    "frontier:gpu:acc:f"
+    "frontier:gpu:omp:f"
+    "frontier_amd:gpu:omp:famd"
+)
+
+for cfg in "${bench_configs[@]}"; do
+    IFS=':' read -r cluster device interface flag <<< "$cfg"
+    pr_yaml="pr-${cluster}-${device}-${interface}/bench-${device}-${interface}.yaml"
+    master_yaml="master-${cluster}-${device}-${interface}/bench-${device}-${interface}.yaml"
+
+    echo "=========================================="
+    echo "bench_diff: $cluster $device $interface"
+    echo "  PR: $pr_yaml"
+    echo "  Master: $master_yaml"
+    echo "=========================================="
+
+    if [ ! -f "$pr_yaml" ]; then
+        echo "ERROR: PR YAML not found: $pr_yaml"
+        exit 1
+    fi
+    if [ ! -f "$master_yaml" ]; then
+        echo "ERROR: Master YAML not found: $master_yaml"
+        exit 1
+    fi
+
+    (cd pr && . ./mfc.sh load -c "$flag" -m g && ./mfc.sh bench_diff "../$master_yaml" "../$pr_yaml")
+done
diff --git a/.github/scripts/frontier_test_config.sh b/.github/scripts/frontier_test_config.sh
new file mode 100755
index 0000000000..8200174bee
--- /dev/null
+++ b/.github/scripts/frontier_test_config.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+# Run a single test on a Frontier compute node (build already done on login node).
+# Usage: frontier_test_config.sh <cluster> <device> <interface>
+# Runs inside a SLURM allocation on an ssh'd compute node.
+
+set -e
+set -x
+
+cluster=$1; device=$2; interface=$3
+
+flag="f"; [ "$cluster" = "frontier_amd" ] && flag="famd"
+mode="g"; [ "$device" = "cpu" ] && mode="c"
+
+. ./mfc.sh load -c "$flag" -m "$mode"
+
+# Device options
+device_opts=""
+if [ "$device" = "gpu" ]; then
+    device_opts="--gpu"
+    [ "$interface" = "acc" ] && device_opts+=" acc"
+    [ "$interface" = "omp" ] && device_opts+=" mp"
+fi
+
+rdma=""
+[ "$cluster" = "frontier" ] && [ "$device" = "gpu" ] && rdma="--rdma-mpi"
+
+# Test
+if [ "$device" = "gpu" ]; then
+    gpus=$(rocm-smi --showid | awk '{print $1}' | grep -Eo '[0-9]+' | uniq | tr '\n' ' ')
+    ngpus=$(echo "$gpus" | wc -w)
+    if [ "$ngpus" -lt 1 ] || [ "$ngpus" -gt 16 ]; then
+        echo "ERROR: Unexpected GPU count ($ngpus). Expected 1-16 for Frontier MI250X."
+        echo "rocm-smi output:"
+        rocm-smi --showid
+        exit 1
+    fi
+    echo "Detected $ngpus GPUs: $gpus"
+    ./mfc.sh test -v -a $rdma --max-attempts 3 -j $ngpus $device_opts -- -c "$cluster"
+else
+    ./mfc.sh test -v -a --max-attempts 3 -j 32 --no-gpu -- -c "$cluster"
+fi
diff --git a/.github/scripts/run_frontier_all_benchmarks.sh b/.github/scripts/run_frontier_all_benchmarks.sh
new file mode 100755
index 0000000000..58dc49b0e3
--- /dev/null
+++ b/.github/scripts/run_frontier_all_benchmarks.sh
@@ -0,0 +1,233 @@
+#!/bin/bash
+# Orchestrate all Frontier benchmark configs in one multi-node SLURM allocation.
+# 1. Builds all configs on the login node (PR and master, in parallel)
+# 2. Submits a single SLURM job running benchmarks in parallel via ssh
+
+set -euo pipefail
+
+# Ignore SIGHUP to survive login node session drops
+trap '' HUP
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+# SLURM parameters
+SLURM_ACCOUNT="ENG160"
+SLURM_PARTITION="extended"
+SLURM_WALLTIME="05:59:00"
+CONFIG_TIMEOUT=7200 # 120 min per config
+
+# Benchmark configs: version cluster device interface
+# 6 total: 3 configs x 2 versions (PR + master)
+configs=(
+    "pr frontier gpu acc"
+    "pr frontier gpu omp"
+    "pr frontier_amd gpu omp"
+    "master frontier gpu acc"
+    "master frontier gpu omp"
+    "master frontier_amd gpu omp"
+)
+num_nodes=${#configs[@]}
+
+echo "=========================================="
+echo "Frontier consolidated benchmarks: $num_nodes configs on $num_nodes nodes"
+echo "=========================================="
+
+# Write config file for sbatch to read (single source of truth)
+config_file="frontier-bench-configs.txt"
+printf '%s\n' "${configs[@]}" > "$config_file"
+
+# --- Phase 1: Create per-config source copies ---
+for cfg in "${configs[@]}"; do
+    read -r version cluster device interface <<< "$cfg"
+    dir="${version}-${cluster}-${device}-${interface}"
+    echo "Creating source copy: $dir from $version/"
+    rm -rf "$dir"
+    cp -al "$version" "$dir" 2>/dev/null || cp -r "$version" "$dir"
+done
+
+# --- Phase 2: Build all configs on login node in parallel ---
+echo ""
+echo "=========================================="
+echo "Starting parallel builds (${num_nodes} configs)..."
+echo "==========================================" + +build_pids=() +for cfg in "${configs[@]}"; do + read -r version cluster device interface <<< "$cfg" + dir="${version}-${cluster}-${device}-${interface}" + log="build-${version}-${cluster}-${device}-${interface}.log" + echo " Starting: $version $cluster $device $interface" + ( + cd "$dir" + bash .github/workflows/${cluster}/build.sh "$device" "$interface" bench + ) > "$log" 2>&1 & + build_pids+=($!) +done + +# Periodic heartbeat while builds run +( + while true; do + sleep 120 + alive=0 + for pid in "${build_pids[@]}"; do + kill -0 "$pid" 2>/dev/null && alive=$((alive + 1)) + done + [ "$alive" -eq 0 ] && break + echo "--- Build heartbeat ($(date +%H:%M:%S)): $alive/${#build_pids[@]} running ---" + for i in "${!configs[@]}"; do + read -r version cluster device interface <<< "${configs[$i]}" + log="build-${version}-${cluster}-${device}-${interface}.log" + if kill -0 "${build_pids[$i]}" 2>/dev/null; then + size=$(stat -c%s "$log" 2>/dev/null || echo 0) + last=$(tail -n 1 "$log" 2>/dev/null | head -c 120 || echo "") + echo " $version $cluster $device $interface: running (${size} bytes) $last" + fi + done + done +) & +heartbeat_pid=$! + +# Wait for all builds and report results +build_failed=0 +build_exits=() +for i in "${!build_pids[@]}"; do + read -r version cluster device interface <<< "${configs[$i]}" + if wait "${build_pids[$i]}"; then + build_exits+=(0) + else + code=$? + build_exits+=($code) + build_failed=1 + fi +done + +# Stop heartbeat +kill "$heartbeat_pid" 2>/dev/null || true; wait "$heartbeat_pid" 2>/dev/null || true + +# Print build logs: passed builds collapsed, failed builds in full +for i in "${!configs[@]}"; do + read -r version cluster device interface <<< "${configs[$i]}" + log="build-${version}-${cluster}-${device}-${interface}.log" + if [ "${build_exits[$i]}" -eq 0 ]; then + echo "::group::Build PASSED: $version $cluster $device $interface" + cat "$log" + echo "::endgroup::" + else + echo "==========================================" + echo "Build FAILED: $version $cluster $device $interface (exit ${build_exits[$i]})" + echo "==========================================" + cat "$log" + fi +done + +# Abort on failure +if [ "$build_failed" -ne 0 ]; then + echo "" + echo "==========================================" + echo "Build failures detected — see logs above." + echo "==========================================" + exit 1 +fi + +echo "" +echo "==========================================" +echo "All builds complete. Submitting ${num_nodes}-node SLURM job..." +echo "==========================================" + +# --- Phase 3: Submit one sbatch job with N nodes --- +output_file="bench-frontier-all.out" + +submit_output=$(sbatch </dev/null + done + wait +} +trap cleanup EXIT + +for i in "\${!configs[@]}"; do + read -r version cluster device interface <<< "\${configs[\$i]}" + node="\${nodes[\$i]}" + dir="\${version}-\${cluster}-\${device}-\${interface}" + outfile="\${dir}/bench-\${device}-\${interface}.out" + + echo "[\$node] Starting bench: \$version \$cluster \$device \$interface in \$dir" + + timeout $CONFIG_TIMEOUT ssh -q -o StrictHostKeyChecking=no "\$node" \ + "cd \$SLURM_SUBMIT_DIR/\$dir && bash .github/scripts/frontier_bench_config.sh \$cluster \$device \$interface" \ + > "\$outfile" 2>&1 & + pids+=(\$!) +done + +echo "All bench configs launched, waiting for completion..." 
+ +# Wait for all and collect exit codes +overall_exit=0 +for i in "\${!pids[@]}"; do + read -r version cluster device interface <<< "\${configs[\$i]}" + pid=\${pids[\$i]} + if wait "\$pid"; then + echo "PASSED: \$version \$cluster \$device \$interface (PID \$pid)" + else + code=\$? + echo "FAILED: \$version \$cluster \$device \$interface (PID \$pid, exit code \$code)" + overall_exit=1 + fi +done + +# Print summary +echo "" +echo "==========================================" +echo "Benchmark summary:" +for cfg in "\${configs[@]}"; do + read -r version cluster device interface <<< "\$cfg" + dir="\${version}-\${cluster}-\${device}-\${interface}" + yaml="\${dir}/bench-\${device}-\${interface}.yaml" + if [ -f "\$yaml" ]; then + echo " \$version \$cluster \$device \$interface: OK (\$(stat -c%s "\$yaml" 2>/dev/null) bytes)" + else + echo " \$version \$cluster \$device \$interface: MISSING YAML" + fi +done +echo "==========================================" + +exit \$overall_exit +OUTER +) + +job_id=$(echo "$submit_output" | awk '/Submitted batch job/ {print $4}') +if [ -z "$job_id" ]; then + echo "ERROR: Failed to submit job. sbatch output:" + echo "$submit_output" + exit 1 +fi + +echo "Submitted batch job $job_id ($num_nodes nodes)" + +# Monitor the job +bash "$SCRIPT_DIR/monitor_slurm_job.sh" "$job_id" "$output_file" diff --git a/.github/scripts/run_frontier_all_tests.sh b/.github/scripts/run_frontier_all_tests.sh new file mode 100755 index 0000000000..d97c7d12d0 --- /dev/null +++ b/.github/scripts/run_frontier_all_tests.sh @@ -0,0 +1,237 @@ +#!/bin/bash +# Orchestrate all Frontier test configs in one multi-node SLURM allocation. +# 1. Builds all configs on the login node (in parallel, different modules each) +# 2. Submits a single SLURM job running tests in parallel via ssh + +set -euo pipefail + +# Ignore SIGHUP to survive login node session drops +trap '' HUP + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# SLURM parameters +SLURM_ACCOUNT="ENG160" +SLURM_PARTITION="extended" +SLURM_WALLTIME="05:59:00" +CONFIG_TIMEOUT=5400 # 90 min per config + +# Config table: cluster device interface +configs=( + "frontier gpu acc" + "frontier gpu omp" + "frontier cpu none" + "frontier_amd gpu omp" + "frontier_amd cpu none" +) +num_nodes=${#configs[@]} + +echo "==========================================" +echo "Frontier consolidated tests: $num_nodes configs" +echo "==========================================" + +# Write config file for sbatch to read (single source of truth) +config_file="frontier-test-configs.txt" +printf '%s\n' "${configs[@]}" > "$config_file" + +# --- Phase 1: Create per-config source copies --- +# Build exclude list to prevent copying into self +excludes="" +for cfg in "${configs[@]}"; do + read -r cluster device interface <<< "$cfg" + excludes+=" --exclude=test-${cluster}-${device}-${interface}" +done + +for cfg in "${configs[@]}"; do + read -r cluster device interface <<< "$cfg" + dir="test-${cluster}-${device}-${interface}" + echo "Creating source copy: $dir" + rm -rf "$dir" + rsync -a --link-dest="$(pwd)" $excludes ./ "$dir/" +done + +# --- Phase 2: Build all configs on login node in parallel --- +echo "" +echo "==========================================" +echo "Starting parallel builds (${num_nodes} configs)..." 
+echo "==========================================" + +build_pids=() +for cfg in "${configs[@]}"; do + read -r cluster device interface <<< "$cfg" + dir="test-${cluster}-${device}-${interface}" + log="build-${cluster}-${device}-${interface}.log" + echo " Starting: $cluster $device $interface" + ( + cd "$dir" + bash .github/workflows/${cluster}/build.sh "$device" "$interface" + ) > "$log" 2>&1 & + build_pids+=($!) +done + +# Periodic heartbeat while builds run +( + while true; do + sleep 120 + alive=0 + for pid in "${build_pids[@]}"; do + kill -0 "$pid" 2>/dev/null && alive=$((alive + 1)) + done + [ "$alive" -eq 0 ] && break + echo "--- Build heartbeat ($(date +%H:%M:%S)): $alive/${#build_pids[@]} running ---" + for i in "${!configs[@]}"; do + read -r cluster device interface <<< "${configs[$i]}" + log="build-${cluster}-${device}-${interface}.log" + if kill -0 "${build_pids[$i]}" 2>/dev/null; then + size=$(stat -c%s "$log" 2>/dev/null || echo 0) + last=$(tail -n 1 "$log" 2>/dev/null | head -c 120 || echo "") + echo " $cluster $device $interface: running (${size} bytes) $last" + fi + done + done +) & +heartbeat_pid=$! + +# Wait for all builds and report results +build_failed=0 +build_exits=() +for i in "${!build_pids[@]}"; do + read -r cluster device interface <<< "${configs[$i]}" + if wait "${build_pids[$i]}"; then + build_exits+=(0) + else + code=$? + build_exits+=($code) + build_failed=1 + fi +done + +# Stop heartbeat +kill "$heartbeat_pid" 2>/dev/null || true; wait "$heartbeat_pid" 2>/dev/null || true + +# Print build logs: passed builds collapsed, failed builds in full +for i in "${!configs[@]}"; do + read -r cluster device interface <<< "${configs[$i]}" + log="build-${cluster}-${device}-${interface}.log" + if [ "${build_exits[$i]}" -eq 0 ]; then + echo "::group::Build PASSED: $cluster $device $interface" + cat "$log" + echo "::endgroup::" + else + echo "==========================================" + echo "Build FAILED: $cluster $device $interface (exit ${build_exits[$i]})" + echo "==========================================" + cat "$log" + fi +done + +# Abort on failure +if [ "$build_failed" -ne 0 ]; then + echo "" + echo "==========================================" + echo "Build failures detected — see logs above." + echo "==========================================" + exit 1 +fi + +echo "" +echo "==========================================" +echo "All builds complete. Submitting ${num_nodes}-node SLURM job..." +echo "==========================================" + +# --- Phase 3: Submit one sbatch job with N nodes --- +output_file="test-frontier-all.out" + +submit_output=$(sbatch </dev/null + done + wait +} +trap cleanup EXIT + +for i in "\${!configs[@]}"; do + read -r cluster device interface <<< "\${configs[\$i]}" + node="\${nodes[\$i]}" + dir="test-\${cluster}-\${device}-\${interface}" + outfile="test-\${cluster}-\${device}-\${interface}.out" + + echo "[\$node] Starting test: \$cluster \$device \$interface in \$dir" + + timeout $CONFIG_TIMEOUT ssh -q -o StrictHostKeyChecking=no "\$node" \ + "cd \$SLURM_SUBMIT_DIR/\$dir && bash .github/scripts/frontier_test_config.sh \$cluster \$device \$interface" \ + > "\$outfile" 2>&1 & + pids+=(\$!) +done + +echo "All test configs launched, waiting for completion..." + +# Wait for all and collect exit codes +overall_exit=0 +for i in "\${!pids[@]}"; do + read -r cluster device interface <<< "\${configs[\$i]}" + pid=\${pids[\$i]} + if wait "\$pid"; then + echo "PASSED: \$cluster \$device \$interface (PID \$pid)" + else + code=\$? 
+ echo "FAILED: \$cluster \$device \$interface (PID \$pid, exit code \$code)" + overall_exit=1 + fi +done + +# Print summary +echo "" +echo "==========================================" +echo "Test summary:" +for cfg in "\${configs[@]}"; do + read -r cluster device interface <<< "\$cfg" + outfile="test-\${cluster}-\${device}-\${interface}.out" + if [ -f "\$outfile" ]; then + echo " \$cluster \$device \$interface: \$(tail -n 1 "\$outfile")" + else + echo " \$cluster \$device \$interface: NO OUTPUT FILE" + fi +done +echo "==========================================" + +exit \$overall_exit +OUTER +) + +job_id=$(echo "$submit_output" | awk '/Submitted batch job/ {print $4}') +if [ -z "$job_id" ]; then + echo "ERROR: Failed to submit job. sbatch output:" + echo "$submit_output" + exit 1 +fi + +echo "Submitted batch job $job_id ($num_nodes nodes)" + +# Monitor the job +bash "$SCRIPT_DIR/monitor_slurm_job.sh" "$job_id" "$output_file" diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml deleted file mode 100644 index 6279f5f578..0000000000 --- a/.github/workflows/bench.yml +++ /dev/null @@ -1,187 +0,0 @@ -name: 'Benchmark' - -on: - # Trigger when Test Suite completes (no polling needed) - workflow_run: - workflows: ["Test Suite"] - types: [completed] - workflow_dispatch: - -concurrency: - group: ${{ github.workflow }}-${{ github.event.workflow_run.head_branch || github.ref }} - cancel-in-progress: true - -jobs: - file-changes: - name: Detect File Changes - # Only run if Test Suite passed (or manual dispatch) - if: github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success' - runs-on: 'ubuntu-latest' - outputs: - checkall: ${{ steps.changes.outputs.checkall }} - pr_number: ${{ steps.pr-info.outputs.pr_number }} - pr_approved: ${{ steps.pr-info.outputs.approved }} - pr_author: ${{ steps.pr-info.outputs.author }} - steps: - - name: Clone - uses: actions/checkout@v4 - with: - ref: ${{ github.event.workflow_run.head_sha || github.sha }} - - - name: Detect Changes - uses: dorny/paths-filter@v3 - id: changes - with: - filters: ".github/file-filter.yml" - - - name: Get PR Info - id: pr-info - env: - GH_TOKEN: ${{ github.token }} - run: | - if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then - echo "pr_number=" >> $GITHUB_OUTPUT - echo "approved=true" >> $GITHUB_OUTPUT - echo "author=${{ github.actor }}" >> $GITHUB_OUTPUT - else - # Get PR number from workflow_run - PR_NUMBER="${{ github.event.workflow_run.pull_requests[0].number }}" - if [ -n "$PR_NUMBER" ]; then - echo "pr_number=$PR_NUMBER" >> $GITHUB_OUTPUT - - # Fetch actual PR author from API (workflow_run.actor is the re-runner, not PR author) - PR_AUTHOR=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER --jq '.user.login') - echo "author=$PR_AUTHOR" >> $GITHUB_OUTPUT - - # Check if PR is approved - APPROVED=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER/reviews \ - --jq '[.[] | select(.state == "APPROVED")] | length') - if [ "$APPROVED" -gt 0 ]; then - echo "approved=true" >> $GITHUB_OUTPUT - else - echo "approved=false" >> $GITHUB_OUTPUT - fi - else - echo "pr_number=" >> $GITHUB_OUTPUT - echo "approved=false" >> $GITHUB_OUTPUT - echo "author=" >> $GITHUB_OUTPUT - fi - fi - - self: - name: "${{ matrix.name }} (${{ matrix.device }}${{ matrix.interface != 'none' && format('-{0}', matrix.interface) || '' }})" - if: > - github.repository == 'MFlowCode/MFC' && - needs.file-changes.outputs.checkall == 'true' && - ( - github.event_name == 'workflow_dispatch' || - 
needs.file-changes.outputs.pr_approved == 'true' || - needs.file-changes.outputs.pr_author == 'sbryngelson' || - needs.file-changes.outputs.pr_author == 'wilfonba' - ) - needs: [file-changes] - strategy: - fail-fast: false - matrix: - include: - - cluster: phoenix - name: Georgia Tech | Phoenix (NVHPC) - group: phoenix - labels: gt - flag: p - device: cpu - interface: none - build_script: "" - - cluster: phoenix - name: Georgia Tech | Phoenix (NVHPC) - group: phoenix - labels: gt - flag: p - device: gpu - interface: acc - build_script: "" - - cluster: phoenix - name: Georgia Tech | Phoenix (NVHPC) - group: phoenix - labels: gt - flag: p - device: gpu - interface: omp - build_script: "" - - cluster: frontier - name: Oak Ridge | Frontier (CCE) - group: phoenix - labels: frontier - flag: f - device: gpu - interface: acc - build_script: "bash .github/workflows/frontier/build.sh gpu acc bench" - - cluster: frontier - name: Oak Ridge | Frontier (CCE) - group: phoenix - labels: frontier - flag: f - device: gpu - interface: omp - build_script: "bash .github/workflows/frontier/build.sh gpu omp bench" - - cluster: frontier_amd - name: Oak Ridge | Frontier (AMD) - group: phoenix - labels: frontier - flag: famd - device: gpu - interface: omp - build_script: "bash .github/workflows/frontier_amd/build.sh gpu omp bench" - runs-on: - group: ${{ matrix.group }} - labels: ${{ matrix.labels }} - timeout-minutes: 480 - env: - ACTIONS_RUNNER_FORCE_ACTIONS_NODE_VERSION: node16 - ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true - steps: - - name: Clone - PR - uses: actions/checkout@v4 - with: - ref: ${{ github.event.workflow_run.head_sha || github.sha }} - path: pr - - - name: Clone - Master - uses: actions/checkout@v4 - with: - repository: MFlowCode/MFC - ref: master - path: master - - - name: Setup & Build - if: matrix.build_script != '' - run: | - (cd pr && ${{ matrix.build_script }}) & - (cd master && ${{ matrix.build_script }}) & - wait %1 && wait %2 - - - name: Bench (Master v. PR) - run: bash pr/.github/scripts/run_parallel_benchmarks.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }} - - - name: Generate & Post Comment - run: | - (cd pr && . 
./mfc.sh load -c ${{ matrix.flag }} -m g) - (cd pr && ./mfc.sh bench_diff ../master/bench-${{ matrix.device }}-${{ matrix.interface }}.yaml ../pr/bench-${{ matrix.device }}-${{ matrix.interface }}.yaml) - - - name: Print Logs - if: always() - run: | - cat pr/bench-${{ matrix.device }}-${{ matrix.interface }}.* 2>/dev/null || true - cat master/bench-${{ matrix.device }}-${{ matrix.interface }}.* 2>/dev/null || true - - # All other runners (non-Phoenix) just run without special env - - name: Archive Logs (Frontier) - if: always() && matrix.cluster != 'phoenix' - uses: actions/upload-artifact@v4 - with: - name: ${{ matrix.cluster }}-${{ matrix.device }}-${{ matrix.interface }} - path: | - pr/bench-${{ matrix.device }}-${{ matrix.interface }}.* - pr/build/benchmarks/* - master/bench-${{ matrix.device }}-${{ matrix.interface }}.* - master/build/benchmarks/* diff --git a/.github/workflows/phoenix/submit-bench.sh b/.github/workflows/phoenix/submit-bench.sh index 7ae85e66fe..fc28b3046b 100644 --- a/.github/workflows/phoenix/submit-bench.sh +++ b/.github/workflows/phoenix/submit-bench.sh @@ -20,9 +20,8 @@ sbatch_cpu_opts="\ " sbatch_gpu_opts="\ -#SBATCH -CL40S -#SBATCH --ntasks-per-node=4 # Number of cores per node required -#SBATCH -G2\ +#SBATCH --gres=gpu:H200:2 +#SBATCH --ntasks-per-node=8 # Number of cores per node required\ " if [ "$2" = "cpu" ]; then diff --git a/.github/workflows/phoenix/submit.sh b/.github/workflows/phoenix/submit.sh index 06a03e465a..5747c839f0 100755 --- a/.github/workflows/phoenix/submit.sh +++ b/.github/workflows/phoenix/submit.sh @@ -23,9 +23,8 @@ sbatch_cpu_opts="\ " sbatch_gpu_opts="\ -#SBATCH -p gpu-v100,gpu-a100,gpu-h100,gpu-l40s -#SBATCH --ntasks-per-node=4 # Number of cores per node required -#SBATCH -G2\ +#SBATCH --gres=gpu:H200:2 +#SBATCH --ntasks-per-node=8 # Number of cores per node required\ " if [ "$2" = "cpu" ]; then diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 0be51076ec..96fd2d70ff 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -165,33 +165,12 @@ jobs: cluster_name: 'Georgia Tech | Phoenix' device: 'cpu' interface: 'none' - # Frontier (ORNL) — build on login node, test via SLURM + # Frontier (ORNL) — all configs consolidated into one 5-node SLURM job - runner: 'frontier' - cluster: 'frontier' + cluster: 'frontier_all' cluster_name: 'Oak Ridge | Frontier' - device: 'gpu' - interface: 'acc' - - runner: 'frontier' - cluster: 'frontier' - cluster_name: 'Oak Ridge | Frontier' - device: 'gpu' - interface: 'omp' - - runner: 'frontier' - cluster: 'frontier' - cluster_name: 'Oak Ridge | Frontier' - device: 'cpu' - interface: 'none' - # Frontier AMD — build on login node, test via SLURM - - runner: 'frontier' - cluster: 'frontier_amd' - cluster_name: 'Oak Ridge | Frontier (AMD)' - device: 'gpu' - interface: 'omp' - - runner: 'frontier' - cluster: 'frontier_amd' - cluster_name: 'Oak Ridge | Frontier (AMD)' - device: 'cpu' - interface: 'none' + device: 'all' + interface: 'configs' runs-on: group: phoenix labels: ${{ matrix.runner }} @@ -204,19 +183,138 @@ jobs: uses: actions/checkout@v4 - name: Build - if: matrix.cluster != 'phoenix' - run: bash .github/workflows/${{ matrix.cluster }}/build.sh ${{ matrix.device }} ${{ matrix.interface }} + if: matrix.cluster != 'phoenix' && matrix.cluster != 'frontier_all' + run: bash .github/workflows/${{ matrix.cluster }}/build.sh ${{ matrix.device }} ${{ matrix.interface }} - name: Test - run: bash .github/workflows/${{ matrix.cluster }}/submit.sh .github/workflows/${{ 
matrix.cluster }}/test.sh ${{ matrix.device }} ${{ matrix.interface }} + if: matrix.cluster != 'frontier_all' + run: bash .github/workflows/${{ matrix.cluster }}/submit.sh .github/workflows/${{ matrix.cluster }}/test.sh ${{ matrix.device }} ${{ matrix.interface }} + + - name: Build & Test (Frontier All Configs) + if: matrix.cluster == 'frontier_all' + run: bash .github/scripts/run_frontier_all_tests.sh - name: Print Logs - if: always() - run: cat test-${{ matrix.device }}-${{ matrix.interface }}.out + if: always() + run: | + cat test-*.out 2>/dev/null || true - name: Archive Logs uses: actions/upload-artifact@v4 - if: matrix.cluster != 'phoenix' + if: always() && matrix.cluster != 'phoenix' with: - name: logs-${{ strategy.job-index }}-${{ matrix.device }}-${{ matrix.interface }} - path: test-${{ matrix.device }}-${{ matrix.interface }}.out + name: logs-${{ matrix.cluster }} + path: test-*.out + + bench: + name: "Benchmark | ${{ matrix.name }} (${{ matrix.device }}${{ matrix.interface != 'none' && format('-{0}', matrix.interface) || '' }})" + if: github.repository == 'MFlowCode/MFC' && needs.file-changes.outputs.checkall == 'true' + needs: [lint-gate, file-changes] + continue-on-error: false + timeout-minutes: 480 + strategy: + fail-fast: true + matrix: + include: + - cluster: phoenix + name: Georgia Tech | Phoenix (NVHPC) + group: phoenix + labels: gt + flag: p + device: cpu + interface: none + build_script: "" + - cluster: phoenix + name: Georgia Tech | Phoenix (NVHPC) + group: phoenix + labels: gt + flag: p + device: gpu + interface: acc + build_script: "" + - cluster: phoenix + name: Georgia Tech | Phoenix (NVHPC) + group: phoenix + labels: gt + flag: p + device: gpu + interface: omp + build_script: "" + # Frontier — all configs consolidated into one 6-node SLURM job + - cluster: frontier_all + name: Oak Ridge | Frontier + group: phoenix + labels: frontier + device: all + interface: configs + build_script: "" + runs-on: + group: ${{ matrix.group }} + labels: ${{ matrix.labels }} + env: + ACTIONS_RUNNER_FORCE_ACTIONS_NODE_VERSION: node16 + ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true + steps: + - name: Clone - PR + uses: actions/checkout@v4 + with: + path: pr + + - name: Clone - Master + uses: actions/checkout@v4 + with: + repository: MFlowCode/MFC + ref: master + path: master + + - name: Setup & Build + if: matrix.build_script != '' && matrix.cluster != 'frontier_all' + run: | + (cd pr && ${{ matrix.build_script }}) & + (cd master && ${{ matrix.build_script }}) & + wait %1 && wait %2 + + - name: Bench (Master v. PR) + if: matrix.cluster != 'frontier_all' + run: bash pr/.github/scripts/run_parallel_benchmarks.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }} + + - name: Bench All Configs (Frontier) + if: matrix.cluster == 'frontier_all' + run: bash pr/.github/scripts/run_frontier_all_benchmarks.sh + + - name: Generate & Post Comment + if: matrix.cluster != 'frontier_all' + run: | + (cd pr && . 
./mfc.sh load -c ${{ matrix.flag }} -m g) + (cd pr && ./mfc.sh bench_diff ../master/bench-${{ matrix.device }}-${{ matrix.interface }}.yaml ../pr/bench-${{ matrix.device }}-${{ matrix.interface }}.yaml) + + - name: Generate & Post Comments (Frontier) + if: matrix.cluster == 'frontier_all' + run: bash pr/.github/scripts/frontier_bench_post.sh + + - name: Print Logs + if: always() + run: | + cat pr/bench-*.* master/bench-*.* 2>/dev/null || true + cat pr-*/bench-*.* master-*/bench-*.* 2>/dev/null || true + + - name: Archive Logs + if: always() && matrix.cluster != 'phoenix' + uses: actions/upload-artifact@v4 + with: + name: bench-${{ matrix.cluster }} + path: | + pr*/bench-*.* + master*/bench-*.* + + cancel-on-test-failure: + name: Cancel on Test Failure + needs: [self] + if: failure() + runs-on: ubuntu-latest + steps: + - name: Cancel Workflow Run + run: gh run cancel ${{ github.run_id }} --repo ${{ github.repository }} + env: + GH_TOKEN: ${{ github.token }} +
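
Note: both orchestration scripts end by delegating to .github/scripts/monitor_slurm_job.sh "$job_id" "$output_file", a helper that already exists in the repository and is not part of this diff. For reference while reviewing, here is a minimal sketch of what such a monitor could look like, assuming only the two arguments visible in those calls (hypothetical; the real helper may poll, stream, and report results differently):

#!/bin/bash
# Hypothetical sketch of a SLURM job monitor: wait for the job to leave the
# queue, print its output file, and exit non-zero unless it completed cleanly.
# Assumes two positional arguments, as in the calls above: <job_id> <output_file>.
set -euo pipefail

job_id=$1; output_file=$2

# Poll until squeue no longer lists the job (i.e., it is no longer pending or running).
while [ -n "$(squeue -h -j "$job_id" 2>/dev/null)" ]; do
    sleep 60
done

# Surface whatever the job wrote so it appears in the CI log.
[ -f "$output_file" ] && cat "$output_file"

# Derive pass/fail from the accounting record; the final test sets the exit status.
state=$(sacct -j "$job_id" -n -X -o State 2>/dev/null | head -n 1 | tr -d ' ')
[ "$state" = "COMPLETED" ]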