Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
52 commits
Select commit Hold shift + click to select a range
7496ab7
Add CI lint gate and local precheck command
sbryngelson Feb 4, 2026
26f7e17
Fix precheck.sh portability and usability issues
sbryngelson Feb 4, 2026
4b17fa2
Gate benchmarks on Test Suite completion
sbryngelson Feb 4, 2026
1b211a5
Auto-install git pre-commit hook for precheck
sbryngelson Feb 4, 2026
9304137
Use dynamic CPU count in pre-commit hook
sbryngelson Feb 4, 2026
77ac8cc
Show CPU count in pre-commit hook output
sbryngelson Feb 4, 2026
a7cf1d2
Merge upstream master, resolve mfc.sh conflict
sbryngelson Feb 5, 2026
4b55c30
Add precheck command to CLI and autocomplete
sbryngelson Feb 5, 2026
e017cf0
Auto-update installed shell completions on regeneration
sbryngelson Feb 5, 2026
780138b
Show source command when completions auto-update
sbryngelson Feb 5, 2026
06f0425
Always check installed completions on every run
sbryngelson Feb 5, 2026
f65ed30
Prevent directory completion fallback in shell completions
sbryngelson Feb 5, 2026
d23addc
Auto-install completions and fix bash completion options
sbryngelson Feb 5, 2026
33cbd0e
Auto-install completions from mfc.sh with shell rc setup
sbryngelson Feb 5, 2026
1ba4666
Clarify verbose, debug, and debug-log flag documentation
sbryngelson Feb 5, 2026
adfcb7f
Cap pre-commit hook parallelism at 12 jobs
sbryngelson Feb 5, 2026
02ef3de
Fix completion auto-install to check for files, not just directory
sbryngelson Feb 5, 2026
ff8d499
Merge MFlowCode/MFC master into triage
sbryngelson Feb 5, 2026
fc5e2a5
Auto-update outdated shell completions
sbryngelson Feb 5, 2026
9c15c8a
Auto-activate completions when mfc.sh is sourced
sbryngelson Feb 5, 2026
52d0ddd
Use workflow_run for benchmarks and extract completion logic
sbryngelson Feb 5, 2026
8e3f405
Suppress verbose package list from uv install by default
sbryngelson Feb 5, 2026
989a79d
Compact splash screen from 45 to 20 lines
sbryngelson Feb 5, 2026
e913f7c
Pass arguments to python.sh for verbose flag support
sbryngelson Feb 5, 2026
6972e6e
Handle ./mfc.sh -v without command (show help, not error)
sbryngelson Feb 5, 2026
cf49bcc
Fix splash screen to use COMMANDS as single source of truth
sbryngelson Feb 5, 2026
72d8a38
Address AI reviewer feedback on PR #1124
sbryngelson Feb 5, 2026
88d8736
Show full build error output instead of truncating to 40 lines
sbryngelson Feb 5, 2026
c96edf2
Fix troubleshooting tips to suggest --debug instead of --debug-log
sbryngelson Feb 5, 2026
b44360f
Fix flags-before-command detection and shell detection
sbryngelson Feb 5, 2026
f24c62b
Raise minimum Python version to 3.12 (pyrometheus requires it)
sbryngelson Feb 5, 2026
e0bdc88
Lower minimum Python to 3.10 by pinning pyrometheus to pre-3.12 commit
sbryngelson Feb 5, 2026
f9fa919
Add -v flag to all CI build/run/test/bench commands
sbryngelson Feb 5, 2026
0ec406b
Add -v flag to coverage and GitHub runner CI commands
sbryngelson Feb 5, 2026
1e863ae
Revert pyrometheus pin to track git HEAD
sbryngelson Feb 5, 2026
73e0de3
Remove -v flag from HPC CI scripts to fix build stalls
sbryngelson Feb 8, 2026
b76f0a3
Fix shell detection for zsh users sourcing under bash
sbryngelson Feb 8, 2026
13fcc83
Limit push-triggered CI to master branch only
sbryngelson Feb 8, 2026
6b0faef
Fix shell detection to not use $SHELL fallback when in bash
sbryngelson Feb 8, 2026
2651c08
Change Frontier SLURM account from CFD154 to ENG160
sbryngelson Feb 8, 2026
18548ab
Remove -v flag from GitHub/coverage CI and pin pyrometheus
sbryngelson Feb 8, 2026
32c4fa4
Harden CI against transient Frontier/Phoenix failures
sbryngelson Feb 8, 2026
cd1b771
Add SIGHUP protection to Frontier build scripts
sbryngelson Feb 8, 2026
4353e1e
Restore -v flag to all CI build and test commands
sbryngelson Feb 9, 2026
40a12e1
Remove -v from GitHub-hosted CI, keep for self-hosted runners
sbryngelson Feb 9, 2026
0fdd27e
Fix pipe deadlock in -v build mode and restore -v everywhere
sbryngelson Feb 9, 2026
8321eac
Pass --test-all to Build step so post_process builds during dry-run
sbryngelson Feb 9, 2026
f213175
Refactor test.yml: generic self-hosted steps via matrix.cluster
sbryngelson Feb 9, 2026
3470c7a
Unpin pyrometheus to track git HEAD
sbryngelson Feb 9, 2026
d169d08
Clean build directory between retry attempts
sbryngelson Feb 9, 2026
9734b2d
Add build retry logic with clean to Phoenix test script
sbryngelson Feb 9, 2026
3fefb70
Add build retry with clean to Phoenix benchmark script
sbryngelson Feb 9, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
101 changes: 50 additions & 51 deletions .github/workflows/bench.yml
Original file line number Diff line number Diff line change
@@ -1,87 +1,85 @@
name: 'Benchmark'

on:
pull_request:
pull_request_review:
types: [submitted]
# Trigger when Test Suite completes (no polling needed)
workflow_run:
workflows: ["Test Suite"]
types: [completed]
workflow_dispatch:

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
group: ${{ github.workflow }}-${{ github.event.workflow_run.head_branch || github.ref }}
cancel-in-progress: true

jobs:
file-changes:
name: Detect File Changes
# Only run if Test Suite passed (or manual dispatch)
if: github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success'
runs-on: 'ubuntu-latest'
outputs:
checkall: ${{ steps.changes.outputs.checkall }}
pr_number: ${{ steps.pr-info.outputs.pr_number }}
pr_approved: ${{ steps.pr-info.outputs.approved }}
pr_author: ${{ steps.pr-info.outputs.author }}
steps:
- name: Clone
uses: actions/checkout@v4
with:
ref: ${{ github.event.workflow_run.head_sha || github.sha }}

- name: Detect Changes
uses: dorny/paths-filter@v3
id: changes
with:
filters: ".github/file-filter.yml"

wait-for-tests:
name: Wait for Test Suite
runs-on: ubuntu-latest
steps:
- name: Wait for Test Suite to Pass
- name: Get PR Info
id: pr-info
env:
GH_TOKEN: ${{ github.token }}
run: |
echo "Waiting for Test Suite workflow to complete..."
SHA="${{ github.event.pull_request.head.sha || github.sha }}"

# Poll every 60 seconds for up to 3 hours
for i in $(seq 1 180); do
# Get the Test Suite workflow runs for this commit
STATUS=$(gh api repos/${{ github.repository }}/commits/$SHA/check-runs \
--jq '.check_runs[] | select(.name == "Lint Gate") | .conclusion' | head -1)

if [ "$STATUS" = "success" ]; then
echo "Lint Gate passed. Checking test jobs..."

# Check if any Github test jobs failed
FAILED=$(gh api repos/${{ github.repository }}/commits/$SHA/check-runs \
--jq '[.check_runs[] | select(.name | startswith("Github")) | select(.conclusion == "failure")] | length')

if [ "$FAILED" != "0" ]; then
echo "::error::Test Suite has failing jobs. Benchmarks will not run."
exit 1
if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
echo "pr_number=" >> $GITHUB_OUTPUT
echo "approved=true" >> $GITHUB_OUTPUT
echo "author=${{ github.actor }}" >> $GITHUB_OUTPUT
else
# Get PR number from workflow_run
PR_NUMBER="${{ github.event.workflow_run.pull_requests[0].number }}"
if [ -n "$PR_NUMBER" ]; then
echo "pr_number=$PR_NUMBER" >> $GITHUB_OUTPUT

# Fetch actual PR author from API (workflow_run.actor is the re-runner, not PR author)
PR_AUTHOR=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER --jq '.user.login')
echo "author=$PR_AUTHOR" >> $GITHUB_OUTPUT

# Check if PR is approved
APPROVED=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER/reviews \
--jq '[.[] | select(.state == "APPROVED")] | length')
if [ "$APPROVED" -gt 0 ]; then
echo "approved=true" >> $GITHUB_OUTPUT
else
echo "approved=false" >> $GITHUB_OUTPUT
fi

# Check if Github tests are still running
PENDING=$(gh api repos/${{ github.repository }}/commits/$SHA/check-runs \
--jq '[.check_runs[] | select(.name | startswith("Github")) | select(.conclusion == null)] | length')

if [ "$PENDING" = "0" ]; then
echo "All Test Suite jobs completed successfully!"
exit 0
fi

echo "Tests still running ($PENDING pending)..."
elif [ "$STATUS" = "failure" ]; then
echo "::error::Lint Gate failed. Benchmarks will not run."
exit 1
else
echo "Lint Gate status: ${STATUS:-pending}..."
echo "pr_number=" >> $GITHUB_OUTPUT
echo "approved=false" >> $GITHUB_OUTPUT
echo "author=" >> $GITHUB_OUTPUT
fi

sleep 60
done

echo "::error::Timeout waiting for Test Suite to complete."
exit 1
fi

self:
name: "${{ matrix.name }} (${{ matrix.device }}${{ matrix.interface != 'none' && format('-{0}', matrix.interface) || '' }})"
if: ${{ github.repository=='MFlowCode/MFC' && needs.file-changes.outputs.checkall=='true' && ((github.event_name=='pull_request_review' && github.event.review.state=='approved') || (github.event_name=='pull_request' && (github.event.pull_request.user.login=='sbryngelson' || github.event.pull_request.user.login=='wilfonba'))) }}
needs: [file-changes, wait-for-tests]
if: >
github.repository == 'MFlowCode/MFC' &&
needs.file-changes.outputs.checkall == 'true' &&
(
github.event_name == 'workflow_dispatch' ||
needs.file-changes.outputs.pr_approved == 'true' ||
needs.file-changes.outputs.pr_author == 'sbryngelson' ||
needs.file-changes.outputs.pr_author == 'wilfonba'
)
needs: [file-changes]
strategy:
fail-fast: false
matrix:
Expand Down Expand Up @@ -145,6 +143,7 @@ jobs:
- name: Clone - PR
uses: actions/checkout@v4
with:
ref: ${{ github.event.workflow_run.head_sha || github.sha }}
path: pr

- name: Clone - Master
Expand Down
6 changes: 5 additions & 1 deletion .github/workflows/cleanliness.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
name: Cleanliness

on: [push, pull_request, workflow_dispatch]
on:
push:
branches: [master]
pull_request:
workflow_dispatch:

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
Expand Down
10 changes: 7 additions & 3 deletions .github/workflows/coverage.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
name: Coverage Check

on: [push, pull_request, workflow_dispatch]
on:
push:
branches: [master]
pull_request:
workflow_dispatch:

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
Expand Down Expand Up @@ -39,10 +43,10 @@ jobs:
libfftw3-dev libhdf5-dev libblas-dev liblapack-dev

- name: Build
run: /bin/bash mfc.sh build -j $(nproc) --gcov
run: /bin/bash mfc.sh build -v -j $(nproc) --gcov

- name: Test
run: /bin/bash mfc.sh test -a -j $(nproc)
run: /bin/bash mfc.sh test -v -a -j $(nproc)

- name: Upload coverage reports to Codecov
uses: codecov/codecov-action@v4
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/docs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ on:
- cron: '0 0 * * *' # This runs every day at midnight UTC
workflow_dispatch:
push:
branches: [master]
pull_request:

jobs:
Expand Down
6 changes: 5 additions & 1 deletion .github/workflows/formatting.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
name: Pretty

on: [push, pull_request, workflow_dispatch]
on:
push:
branches: [master]
pull_request:
workflow_dispatch:

jobs:
docs:
Expand Down
46 changes: 38 additions & 8 deletions .github/workflows/frontier/build.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
#!/bin/bash

# Ignore SIGHUP to survive login node session drops
trap '' HUP

job_device=$1
job_interface=$2
run_bench=$3
Expand All @@ -15,12 +18,39 @@ fi

. ./mfc.sh load -c f -m g

if [ "$run_bench" == "bench" ]; then
for dir in benchmarks/*/; do
dirname=$(basename "$dir")
./mfc.sh run "$dir/case.py" --case-optimization -j 8 --dry-run $build_opts
done
else
./mfc.sh test -a --dry-run --rdma-mpi -j 8 $build_opts
fi
# Run the mfc.sh dry-run step up to max_attempts times. After a failed attempt
# (except the last) the tree is cleaned with `./mfc.sh clean` and the script
# waits 30 seconds before retrying.
# Exits 0 as soon as one attempt succeeds, 1 if every attempt fails.
max_attempts=3
attempt=1
while [ "$attempt" -le "$max_attempts" ]; do
    echo "Build attempt $attempt of $max_attempts..."

    # Assume success; any failing mfc.sh invocation below flips this to false.
    build_cmd_ok=true
    if [ "$run_bench" == "bench" ]; then
        # Dry-run every benchmark case; stop at the first failure so the whole
        # attempt can be retried from a clean state.
        for dir in benchmarks/*/; do
            # $build_opts is left unquoted as in the original; presumably it may
            # be empty or hold several flags - TODO confirm where it is set.
            if ! ./mfc.sh run -v "$dir/case.py" --case-optimization -j 8 --dry-run $build_opts; then
                build_cmd_ok=false
                break
            fi
        done
    elif ! ./mfc.sh test -v -a --dry-run --rdma-mpi -j 8 $build_opts; then
        build_cmd_ok=false
    fi

    if [ "$build_cmd_ok" = true ]; then
        echo "Build succeeded on attempt $attempt."
        exit 0
    fi

    # Only clean and wait when another attempt will actually follow.
    if [ "$attempt" -lt "$max_attempts" ]; then
        echo "Build failed on attempt $attempt. Cleaning and retrying in 30s..."
        ./mfc.sh clean
        sleep 30
    fi
    attempt=$((attempt + 1))
done

echo "Build failed after $max_attempts attempts."
exit 1
2 changes: 1 addition & 1 deletion .github/workflows/frontier/submit-bench.sh
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2-$3"
sbatch <<EOT
#!/bin/bash
#SBATCH -JMFC-$job_slug # Job name
#SBATCH -A CFD154 # charge account
#SBATCH -A ENG160 # charge account
#SBATCH -N 1 # Number of nodes required
$sbatch_device_opts
#SBATCH -t 05:59:00 # Duration of the job (Ex: 15 mins)
Expand Down
24 changes: 20 additions & 4 deletions .github/workflows/frontier/submit.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@

set -e

# Ignore SIGHUP to survive login node session drops
trap '' HUP

usage() {
echo "Usage: $0 [script.sh] [cpu|gpu]"
}
Expand All @@ -26,17 +29,17 @@ fi


job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2-$3"
output_file="$job_slug.out"

sbatch <<EOT
submit_output=$(sbatch <<EOT
#!/bin/bash
#SBATCH -J MFC-$job_slug # Job name
#SBATCH -A CFD154 # charge account
#SBATCH -A ENG160 # charge account
#SBATCH -N 1 # Number of nodes required
$sbatch_device_opts
#SBATCH -t 05:59:00 # Duration of the job (Ex: 15 mins)
#SBATCH -o$job_slug.out # Combined output and error messages file
#SBATCH -o$output_file # Combined output and error messages file
#SBATCH -p extended # Extended partition for shorter queues
#SBATCH -W # Do not exit until the submitted job terminates.
set -e
set -x
Expand All @@ -53,4 +56,17 @@ job_interface="$3"
$sbatch_script_contents
EOT
)

# Keep only the first integer in the sbatch output so job_id is always a single
# token, even if the message contains more than one number.
# The trailing `|| true` matters because this script runs under `set -e`: a grep
# that matches nothing exits 1, which would otherwise abort the script right
# here, before the empty-job_id check below can print the sbatch output.
job_id=$(echo "$submit_output" | grep -oE '[0-9]+' | head -n1 || true)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggestion: The job ID extraction uses grep -oE '[0-9]+' directly on the sbatch output, which returns every numeric substring on separate lines; if the output ever contains more than one number, job_id will contain multiple IDs separated by newlines, causing the monitoring script to receive an invalid job identifier and fail to track the job correctly. Limiting the extraction to the first numeric match avoids this and ensures job_id is a single valid token. [logic error]

Severity Level: Major ⚠️
- ❌ Array jobs monitored with malformed, multi-token SLURM job IDs.
- ⚠️ Frontier submit wrapper unreliable for scripts using --array.
- ⚠️ Downstream monitoring scripts may misreport job status.
Suggested change
job_id=$(echo "$submit_output" | grep -oE '[0-9]+')
job_id=$(echo "$submit_output" | grep -oE '[0-9]+' | head -n1)
Steps of Reproduction ✅
1. In the repo root, create a SLURM batch script `array_job.sh` that defines an array job,
e.g.:

   - `#SBATCH --array=0-3`

   This script will later be passed as `$1` to `.github/workflows/frontier/submit.sh`.

2. On a SLURM system, run the wrapper:

   - `.github/workflows/frontier/submit.sh array_job.sh cpu`

   This executes the logic in `.github/workflows/frontier/submit.sh` including the
   `sbatch` submission block and captures `sbatch` output into `submit_output`.

3. SLURM prints a message like `Submitted batch job 123456_0` (typical for array jobs) to
stdout. The script at `.github/workflows/frontier/submit.sh:31-58` captures this into
`submit_output`, then the block at lines 61-66 executes:

   - `job_id=$(echo "$submit_output" | grep -oE '[0-9]+')`

   `grep -oE '[0-9]+'` returns two lines: `123456` and `0`, so `job_id` contains a
   newline-separated list of two numeric tokens.

4. The monitoring code at `.github/workflows/frontier/submit.sh:71-72` runs:

   - `bash "$SCRIPT_DIR/../../scripts/monitor_slurm_job.sh" "$job_id" "$output_file"`

   Because `"$job_id"` contains a newline, `monitor_slurm_job.sh` receives an invalid job
   identifier (either as a value with embedded newline or as multiple numeric tokens,
   depending on its parsing). This causes incorrect monitoring behavior for the submitted
   job array (e.g., failing SLURM commands like `squeue -j` or tracking the wrong job),
   demonstrating that extracting all numeric substrings from `submit_output` produces an
   invalid job ID when multiple numbers are present.
Prompt for AI Agent 🤖
This is a comment left during a code review.

**Path:** .github/workflows/frontier/submit.sh
**Line:** 61:61
**Comment:**
	*Logic Error: The job ID extraction uses `grep -oE '[0-9]+'` directly on the `sbatch` output, which returns every numeric substring on separate lines; if the output ever contains more than one number, `job_id` will contain multiple IDs separated by newlines, causing the monitoring script to receive an invalid job identifier and fail to track the job correctly. Limiting the extraction to the first numeric match avoids this and ensures `job_id` is a single valid token.

Validate the correctness of the flagged issue. If it is correct, how can I resolve it? If you propose a fix, implement it and please keep it concise.
👍 | 👎

# An empty job_id means no numeric job ID could be extracted from the sbatch
# output, so there is nothing to monitor: show what sbatch printed and stop.
if [ -z "$job_id" ]; then
    # Diagnostics go to stderr; printf is used for the captured text so its
    # content can never be interpreted as echo options.
    echo "ERROR: Failed to submit job. sbatch output:" >&2
    printf '%s\n' "$submit_output" >&2
    exit 1
fi

echo "Submitted batch job $job_id"

# Use resilient monitoring instead of sbatch -W
# The monitor script is located relative to this script's own directory, so the
# call does not depend on the caller's working directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
bash "$SCRIPT_DIR/../../scripts/monitor_slurm_job.sh" "$job_id" "$output_file"
4 changes: 2 additions & 2 deletions .github/workflows/frontier/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ if [ "$job_device" = "gpu" ]; then
fi

if [ "$job_device" = "gpu" ]; then
./mfc.sh test -a --rdma-mpi --max-attempts 3 -j $ngpus $device_opts -- -c frontier
./mfc.sh test -v -a --rdma-mpi --max-attempts 3 -j $ngpus $device_opts -- -c frontier
else
./mfc.sh test -a --max-attempts 3 -j 32 --no-gpu -- -c frontier
./mfc.sh test -v -a --max-attempts 3 -j 32 --no-gpu -- -c frontier
fi
46 changes: 38 additions & 8 deletions .github/workflows/frontier_amd/build.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
#!/bin/bash

# Ignore SIGHUP to survive login node session drops
trap '' HUP

job_device=$1
job_interface=$2
run_bench=$3
Expand All @@ -15,12 +18,39 @@ fi

. ./mfc.sh load -c famd -m g

if [ "$run_bench" == "bench" ]; then
for dir in benchmarks/*/; do
dirname=$(basename "$dir")
./mfc.sh run "$dir/case.py" --case-optimization -j 8 --dry-run $build_opts
done
else
./mfc.sh test -a --dry-run -j 8 $build_opts
fi
# Run the mfc.sh dry-run step up to max_attempts times. After a failed attempt
# (except the last) the tree is cleaned with `./mfc.sh clean` and the script
# waits 30 seconds before retrying.
# Exits 0 as soon as one attempt succeeds, 1 if every attempt fails.
max_attempts=3
attempt=1
while [ "$attempt" -le "$max_attempts" ]; do
    echo "Build attempt $attempt of $max_attempts..."

    # Assume success; any failing mfc.sh invocation below flips this to false.
    build_cmd_ok=true
    if [ "$run_bench" == "bench" ]; then
        # Dry-run every benchmark case; stop at the first failure so the whole
        # attempt can be retried from a clean state.
        for dir in benchmarks/*/; do
            # $build_opts is left unquoted as in the original; presumably it may
            # be empty or hold several flags - TODO confirm where it is set.
            if ! ./mfc.sh run -v "$dir/case.py" --case-optimization -j 8 --dry-run $build_opts; then
                build_cmd_ok=false
                break
            fi
        done
    elif ! ./mfc.sh test -v -a --dry-run -j 8 $build_opts; then
        build_cmd_ok=false
    fi

    if [ "$build_cmd_ok" = true ]; then
        echo "Build succeeded on attempt $attempt."
        exit 0
    fi

    # Only clean and wait when another attempt will actually follow.
    if [ "$attempt" -lt "$max_attempts" ]; then
        echo "Build failed on attempt $attempt. Cleaning and retrying in 30s..."
        ./mfc.sh clean
        sleep 30
    fi
    attempt=$((attempt + 1))
done

echo "Build failed after $max_attempts attempts."
exit 1
2 changes: 1 addition & 1 deletion .github/workflows/frontier_amd/submit-bench.sh
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2-$3"
sbatch <<EOT
#!/bin/bash
#SBATCH -JMFC-$job_slug # Job name
#SBATCH -A CFD154 # charge account
#SBATCH -A ENG160 # charge account
#SBATCH -N 1 # Number of nodes required
$sbatch_device_opts
#SBATCH -t 05:59:00 # Duration of the job (Ex: 15 mins)
Expand Down
Loading
Loading