MFlowCode · sbryngelson · Feb 9, 2026 · Feb 4, 2026 · Feb 4, 2026 · Feb 4, 2026
@@ -1,87 +1,85 @@
 name: 'Benchmark'
 
 on:
-  pull_request:
-  pull_request_review:
-    types: [submitted]
+  # Trigger when Test Suite completes (no polling needed)
+  workflow_run:
+    workflows: ["Test Suite"]
+    types: [completed]
   workflow_dispatch:
 
 concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
+  group: ${{ github.workflow }}-${{ github.event.workflow_run.head_branch || github.ref }}
   cancel-in-progress: true
 
 jobs:
   file-changes:
     name: Detect File Changes
+    # Only run if Test Suite passed (or manual dispatch)
+    if: github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success'
     runs-on: 'ubuntu-latest'
     outputs:
       checkall: ${{ steps.changes.outputs.checkall }}
+      pr_number: ${{ steps.pr-info.outputs.pr_number }}
+      pr_approved: ${{ steps.pr-info.outputs.approved }}
+      pr_author: ${{ steps.pr-info.outputs.author }}
     steps:
       - name: Clone
         uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event.workflow_run.head_sha || github.sha }}
 
       - name: Detect Changes
         uses: dorny/paths-filter@v3
         id: changes
         with:
           filters: ".github/file-filter.yml"
 
-  wait-for-tests:
-    name: Wait for Test Suite
-    runs-on: ubuntu-latest
-    steps:
-      - name: Wait for Test Suite to Pass
+      - name: Get PR Info
+        id: pr-info
         env:
           GH_TOKEN: ${{ github.token }}
         run: |
-          echo "Waiting for Test Suite workflow to complete..."
-          SHA="${{ github.event.pull_request.head.sha || github.sha }}"
-
-          # Poll every 60 seconds for up to 3 hours
-          for i in $(seq 1 180); do
-            # Get the Test Suite workflow runs for this commit
-            STATUS=$(gh api repos/${{ github.repository }}/commits/$SHA/check-runs \
-              --jq '.check_runs[] | select(.name == "Lint Gate") | .conclusion' | head -1)
-
-            if [ "$STATUS" = "success" ]; then
-              echo "Lint Gate passed. Checking test jobs..."
-
-              # Check if any Github test jobs failed
-              FAILED=$(gh api repos/${{ github.repository }}/commits/$SHA/check-runs \
-                --jq '[.check_runs[] | select(.name | startswith("Github")) | select(.conclusion == "failure")] | length')
-
-              if [ "$FAILED" != "0" ]; then
-                echo "::error::Test Suite has failing jobs. Benchmarks will not run."
-                exit 1
+          if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
+            echo "pr_number=" >> $GITHUB_OUTPUT
+            echo "approved=true" >> $GITHUB_OUTPUT
+            echo "author=${{ github.actor }}" >> $GITHUB_OUTPUT
+          else
+            # Get PR number from workflow_run
+            PR_NUMBER="${{ github.event.workflow_run.pull_requests[0].number }}"
+            if [ -n "$PR_NUMBER" ]; then
+              echo "pr_number=$PR_NUMBER" >> $GITHUB_OUTPUT
+
+              # Fetch actual PR author from API (workflow_run.actor is the re-runner, not PR author)
+              PR_AUTHOR=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER --jq '.user.login')
+              echo "author=$PR_AUTHOR" >> $GITHUB_OUTPUT
+
+              # Check if PR is approved
+              APPROVED=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER/reviews \
+                --jq '[.[] | select(.state == "APPROVED")] | length')
+              if [ "$APPROVED" -gt 0 ]; then
+                echo "approved=true" >> $GITHUB_OUTPUT
+              else
+                echo "approved=false" >> $GITHUB_OUTPUT
               fi
-
-              # Check if Github tests are still running
-              PENDING=$(gh api repos/${{ github.repository }}/commits/$SHA/check-runs \
-                --jq '[.check_runs[] | select(.name | startswith("Github")) | select(.conclusion == null)] | length')
-
-              if [ "$PENDING" = "0" ]; then
-                echo "All Test Suite jobs completed successfully!"
-                exit 0
-              fi
-
-              echo "Tests still running ($PENDING pending)..."
-            elif [ "$STATUS" = "failure" ]; then
-              echo "::error::Lint Gate failed. Benchmarks will not run."
-              exit 1
             else
-              echo "Lint Gate status: ${STATUS:-pending}..."
+              echo "pr_number=" >> $GITHUB_OUTPUT
+              echo "approved=false" >> $GITHUB_OUTPUT
+              echo "author=" >> $GITHUB_OUTPUT
             fi
-
-            sleep 60
-          done
-
-          echo "::error::Timeout waiting for Test Suite to complete."
-          exit 1
+          fi
 
   self:
     name: "${{ matrix.name }} (${{ matrix.device }}${{ matrix.interface != 'none' && format('-{0}', matrix.interface) || '' }})"
-    if: ${{ github.repository=='MFlowCode/MFC' && needs.file-changes.outputs.checkall=='true' && ((github.event_name=='pull_request_review' && github.event.review.state=='approved') || (github.event_name=='pull_request' && (github.event.pull_request.user.login=='sbryngelson' || github.event.pull_request.user.login=='wilfonba'))) }}
-    needs: [file-changes, wait-for-tests]
+    if: >
+      github.repository == 'MFlowCode/MFC' &&
+      needs.file-changes.outputs.checkall == 'true' &&
+      (
+        github.event_name == 'workflow_dispatch' ||
+        needs.file-changes.outputs.pr_approved == 'true' ||
+        needs.file-changes.outputs.pr_author == 'sbryngelson' ||
+        needs.file-changes.outputs.pr_author == 'wilfonba'
+      )
+    needs: [file-changes]
     strategy:
       fail-fast: false
       matrix:
@@ -145,6 +143,7 @@ jobs:
       - name: Clone - PR
         uses: actions/checkout@v4
         with:
+          ref: ${{ github.event.workflow_run.head_sha || github.sha }}
           path: pr
 
       - name: Clone - Master

@@ -1,6 +1,10 @@
 name: Cleanliness
 
-on: [push, pull_request, workflow_dispatch]
+on:
+  push:
+    branches: [master]
+  pull_request:
+  workflow_dispatch:
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}

@@ -1,6 +1,10 @@
 name: Coverage Check
 
-on: [push, pull_request, workflow_dispatch]
+on:
+  push:
+    branches: [master]
+  pull_request:
+  workflow_dispatch:
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}
@@ -39,10 +43,10 @@ jobs:
                 libfftw3-dev libhdf5-dev libblas-dev liblapack-dev
 
       - name: Build
-        run: /bin/bash mfc.sh build -j $(nproc) --gcov
+        run: /bin/bash mfc.sh build -v -j $(nproc) --gcov
 
       - name: Test
-        run: /bin/bash mfc.sh test -a -j $(nproc)
+        run: /bin/bash mfc.sh test -v -a -j $(nproc)
 
       - name: Upload coverage reports to Codecov
         uses: codecov/codecov-action@v4

@@ -5,6 +5,7 @@ on:
     - cron: '0 0 * * *'  # This runs every day at midnight UTC
   workflow_dispatch:
   push:
+    branches: [master]
   pull_request:
 
 jobs:

@@ -1,6 +1,10 @@
 name: Pretty
 
-on: [push, pull_request, workflow_dispatch]
+on:
+  push:
+    branches: [master]
+  pull_request:
+  workflow_dispatch:
 
 jobs:
   docs:

@@ -1,5 +1,8 @@
 #!/bin/bash
 
+# Ignore SIGHUP to survive login node session drops
+trap '' HUP
+
 job_device=$1
 job_interface=$2
 run_bench=$3
@@ -15,12 +18,37 @@ fi
 
 . ./mfc.sh load -c f -m g
 
-if [ "$run_bench" == "bench" ]; then
-    for dir in benchmarks/*/; do
-        dirname=$(basename "$dir")
-        ./mfc.sh run "$dir/case.py" --case-optimization -j 8 --dry-run $build_opts
-    done
-else
-    ./mfc.sh test -a --dry-run --rdma-mpi -j 8 $build_opts
-fi
+# Run the dry-run build up to $max_attempts times; exit 0 on the first success.
+max_attempts=3
+attempt=1
+while [ "$attempt" -le "$max_attempts" ]; do
+    echo "Build attempt $attempt of $max_attempts..."
+    build_cmd_ok=true
+    if [ "$run_bench" == "bench" ]; then
+        # Stop at the first benchmark case whose command fails.
+        for dir in benchmarks/*/; do
+            if ! ./mfc.sh run -v "$dir/case.py" --case-optimization -j 8 --dry-run $build_opts; then
+                build_cmd_ok=false
+                break
+            fi
+        done
+    elif ! ./mfc.sh test -v -a --dry-run --rdma-mpi -j 8 $build_opts; then
+        build_cmd_ok=false
+    fi
+
+    if [ "$build_cmd_ok" = true ]; then
+        echo "Build succeeded on attempt $attempt."
+        exit 0
+    fi
+
+    # Clean and pause before retrying; skipped after the final attempt.
+    if [ "$attempt" -lt "$max_attempts" ]; then
+        echo "Build failed on attempt $attempt. Cleaning and retrying in 30s..."
+        ./mfc.sh clean
+        sleep 30
+    fi
+    attempt=$((attempt + 1))
+done
 
+echo "Build failed after $max_attempts attempts."
+exit 1
@@ -29,7 +29,7 @@ job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2-$3"
 sbatch <<EOT
 #!/bin/bash
 #SBATCH -JMFC-$job_slug            # Job name
-#SBATCH -A CFD154                  # charge account
+#SBATCH -A ENG160                  # charge account
 #SBATCH -N 1                       # Number of nodes required
 $sbatch_device_opts
 #SBATCH -t 05:59:00                # Duration of the job (Ex: 15 mins)

@@ -2,6 +2,9 @@
 
 set -e
 
+# Ignore SIGHUP to survive login node session drops
+trap '' HUP
+
 usage() {
     echo "Usage: $0 [script.sh] [cpu|gpu]"
 }
@@ -26,17 +29,17 @@ fi
 
 
 job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2-$3"
+output_file="$job_slug.out"
 
-sbatch <<EOT
+submit_output=$(sbatch <<EOT
 #!/bin/bash
 #SBATCH -J MFC-$job_slug            # Job name
-#SBATCH -A CFD154                  # charge account
+#SBATCH -A ENG160                  # charge account
 #SBATCH -N 1                       # Number of nodes required
 $sbatch_device_opts
 #SBATCH -t 05:59:00                # Duration of the job (Ex: 15 mins)
-#SBATCH -o$job_slug.out            # Combined output and error messages file
+#SBATCH -o$output_file             # Combined output and error messages file
 #SBATCH -p extended                # Extended partition for shorter queues
-#SBATCH -W                         # Do not exit until the submitted job terminates.
 
 set -e
 set -x
@@ -53,4 +56,19 @@ job_interface="$3"
 $sbatch_script_contents
 
 EOT
+)
+
+# Take the first run of digits in sbatch's output as the job id. Ending the
+# pipeline in head keeps a non-matching grep from tripping `set -e` (absent pipefail).
+job_id=$(echo "$submit_output" | grep -oE '[0-9]+' | head -n1)
+if [ -z "$job_id" ]; then
+    echo "ERROR: Failed to submit job. sbatch output:"
+    echo "$submit_output"
+    exit 1
+fi
+
+echo "Submitted batch job $job_id"
 
+# Use resilient monitoring instead of sbatch -W
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+bash "$SCRIPT_DIR/../../scripts/monitor_slurm_job.sh" "$job_id" "$output_file"
@@ -14,7 +14,7 @@ if [ "$job_device" = "gpu" ]; then
 fi
 
 if [ "$job_device" = "gpu" ]; then
-    ./mfc.sh test -a --rdma-mpi --max-attempts 3 -j $ngpus $device_opts -- -c frontier
+    ./mfc.sh test -v -a --rdma-mpi --max-attempts 3 -j $ngpus $device_opts -- -c frontier
 else
-    ./mfc.sh test -a --max-attempts 3 -j 32 --no-gpu -- -c frontier
+    ./mfc.sh test -v -a --max-attempts 3 -j 32 --no-gpu -- -c frontier
 fi
@@ -1,5 +1,8 @@
 #!/bin/bash
 
+# Ignore SIGHUP to survive login node session drops
+trap '' HUP
+
 job_device=$1
 job_interface=$2
 run_bench=$3
@@ -15,12 +18,37 @@ fi
 
 . ./mfc.sh load -c famd -m g
 
-if [ "$run_bench" == "bench" ]; then
-    for dir in benchmarks/*/; do
-        dirname=$(basename "$dir")
-        ./mfc.sh run "$dir/case.py" --case-optimization -j 8 --dry-run $build_opts
-    done
-else
-    ./mfc.sh test -a --dry-run -j 8 $build_opts
-fi
+# Run the dry-run build up to $max_attempts times; exit 0 on the first success.
+max_attempts=3
+attempt=1
+while [ "$attempt" -le "$max_attempts" ]; do
+    echo "Build attempt $attempt of $max_attempts..."
+    build_cmd_ok=true
+    if [ "$run_bench" == "bench" ]; then
+        # Stop at the first benchmark case whose command fails.
+        for dir in benchmarks/*/; do
+            if ! ./mfc.sh run -v "$dir/case.py" --case-optimization -j 8 --dry-run $build_opts; then
+                build_cmd_ok=false
+                break
+            fi
+        done
+    elif ! ./mfc.sh test -v -a --dry-run -j 8 $build_opts; then
+        build_cmd_ok=false
+    fi
+
+    if [ "$build_cmd_ok" = true ]; then
+        echo "Build succeeded on attempt $attempt."
+        exit 0
+    fi
+
+    # Clean and pause before retrying; skipped after the final attempt.
+    if [ "$attempt" -lt "$max_attempts" ]; then
+        echo "Build failed on attempt $attempt. Cleaning and retrying in 30s..."
+        ./mfc.sh clean
+        sleep 30
+    fi
+    attempt=$((attempt + 1))
+done
 
+echo "Build failed after $max_attempts attempts."
+exit 1
@@ -29,7 +29,7 @@ job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2-$3"
 sbatch <<EOT
 #!/bin/bash
 #SBATCH -JMFC-$job_slug            # Job name
-#SBATCH -A CFD154                  # charge account
+#SBATCH -A ENG160                  # charge account
 #SBATCH -N 1                       # Number of nodes required
 $sbatch_device_opts
 #SBATCH -t 05:59:00                # Duration of the job (Ex: 15 mins)