feat(cli): add evaluate subcommand for automata ground-truth evaluation #934
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: Performance Benchmarking

on:
  workflow_dispatch:
    inputs:
      iterations:
        description: 'Number of benchmark iterations'
        required: false
        default: '1000'
        type: string
      baseline_ref:
        description: 'Git reference for baseline comparison (branch/tag/commit)'
        required: false
        default: 'main'
        type: string
  pull_request:
    paths:
      - 'crates/terraphim_*/src/**'
      - 'terraphim_server/src/**'
      - 'scripts/run-performance-benchmarks.sh'
      - '.github/workflows/performance-benchmarking.yml'
  push:
    branches: [main, develop]
    paths:
      - 'crates/terraphim_*/src/**'
      - 'terraphim_server/src/**'
      - 'scripts/run-performance-benchmarks.sh'

env:
  CARGO_TERM_COLOR: always
  RUST_BACKTRACE: 1

# Least-privilege default token. `pull-requests: write` is required for the
# PR-comment step (issues.createComment on a pull request); `issues: write`
# alone does not grant commenting on PRs. Jobs that push commits elevate
# `contents` at the job level instead of widening it here.
permissions:
  contents: read
  issues: write
  pull-requests: write
jobs:
  performance-benchmarks:
    name: Performance Benchmarks
    runs-on: ubuntu-latest
    timeout-minutes: 30
    # Map step outputs up to the job so downstream jobs can read
    # needs.performance-benchmarks.outputs.* — without this mapping those
    # references are silently empty.
    outputs:
      slo-compliance: ${{ steps.performance-gates.outputs.slo-compliance }}
      performance-gates-passed: ${{ steps.performance-gates.outputs.performance-gates-passed }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          fetch-depth: 0  # Fetch full history for baseline comparison

      - name: Set up Rust
        uses: dtolnay/rust-toolchain@stable

      - name: Cache Rust dependencies
        uses: actions/cache@v4
        with:
          path: |
            ~/.cargo/registry
            ~/.cargo/git
            target
          key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}
          restore-keys: |
            ${{ runner.os }}-cargo-

      - name: Install system dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y curl jq bc wrk

      - name: Download baseline (if comparing)
        if: github.event.inputs.baseline_ref || github.event_name == 'pull_request'
        run: |
          if [ "${{ github.event.inputs.baseline_ref }}" != "" ]; then
            BASELINE_REF="${{ github.event.inputs.baseline_ref }}"
          elif [ "${{ github.event_name }}" == "pull_request" ]; then
            BASELINE_REF="${{ github.event.pull_request.base.ref }}"
          else
            BASELINE_REF="main"
          fi
          echo "Downloading baseline from ref: $BASELINE_REF"
          # Download baseline results from previous run
          # This assumes you have baseline results stored as artifacts or in a separate repo
          # For now, create an empty but schema-valid baseline if none exists
          mkdir -p benchmark-results
          cat <<'EOF' > benchmark-results/baseline.json
          {
            "timestamp": "2024-01-01T00:00:00Z",
            "config": {
              "iterations": 1000,
              "warmup_iterations": 100,
              "concurrent_users": [1, 5, 10, 25, 50],
              "data_scales": [1000, 10000, 100000, 1000000],
              "slos": {
                "max_startup_time_ms": 5000,
                "max_api_response_time_ms": 500,
                "max_search_time_ms": 1000,
                "max_indexing_time_per_doc_ms": 50,
                "max_memory_mb": 1024,
                "max_cpu_idle_percent": 5.0,
                "max_cpu_load_percent": 80.0,
                "min_rps": 10.0,
                "max_concurrent_users": 100,
                "max_data_scale": 1000000
              },
              "monitoring_interval_ms": 1000,
              "enable_profiling": false
            },
            "results": {},
            "slo_compliance": {
              "overall_compliance": 100.0,
              "violations": [],
              "critical_violations": []
            },
            "system_info": {
              "os": "unknown",
              "os_version": "unknown",
              "cpu_model": "unknown",
              "cpu_cores": 0,
              "total_memory_mb": 0,
              "available_memory_mb": 0,
              "rust_version": "unknown",
              "terraphim_version": "unknown"
            },
            "trends": null
          }
          EOF

      - name: Start Terraphim server
        run: |
          # Build and start the server in background
          cargo build --release --package terraphim_server
          ./target/release/terraphim_server &
          SERVER_PID=$!
          # Record the PID immediately so the cleanup step can kill the
          # process even if the health check below fails.
          echo $SERVER_PID > server.pid
          # Wait for the server to report healthy; fail the job explicitly
          # if it never does, instead of letting later steps fail obscurely.
          STARTED=false
          for i in {1..30}; do
            if curl -s http://localhost:3000/health > /dev/null; then
              echo "Server started successfully"
              STARTED=true
              break
            fi
            sleep 2
          done
          if [ "$STARTED" != "true" ]; then
            echo "Server failed to become healthy within 60 seconds" >&2
            exit 1
          fi

      - name: Run performance benchmarks
        run: |
          # Set environment variables
          export TERRAPHIM_BENCH_ITERATIONS="${{ github.event.inputs.iterations || '1000' }}"
          export TERRAPHIM_SERVER_URL="http://localhost:3000"
          # Make script executable
          chmod +x scripts/run-performance-benchmarks.sh
          # Run benchmarks
          ./scripts/run-performance-benchmarks.sh --verbose

      - name: Stop Terraphim server
        if: always()
        run: |
          if [ -f server.pid ]; then
            kill $(cat server.pid) || true
            rm server.pid
          fi

      - name: Upload benchmark results
        uses: actions/upload-artifact@v4
        if: always()
        with:
          name: benchmark-results-${{ github.run_id }}
          path: benchmark-results/
          retention-days: 30

      - name: Generate performance report
        if: always()
        run: |
          # Create a summary for GitHub Actions
          REPORT_FILE=$(find benchmark-results -name "benchmark_report.md" | head -1)
          if [ -n "$REPORT_FILE" ] && [ -f "$REPORT_FILE" ]; then
            echo "## Performance Benchmark Report" >> $GITHUB_STEP_SUMMARY
            echo "" >> $GITHUB_STEP_SUMMARY
            cat "$REPORT_FILE" >> $GITHUB_STEP_SUMMARY
          fi

      - name: Check performance gates
        # The id is required so the job-level `outputs:` mapping above can
        # reference this step's $GITHUB_OUTPUT writes.
        id: performance-gates
        run: |
          # Check if benchmark results meet performance requirements
          RESULTS_FILE=$(find benchmark-results -name "benchmark_results.json" | head -1)
          if [ -n "$RESULTS_FILE" ] && [ -f "$RESULTS_FILE" ]; then
            # Extract SLO compliance percentage
            SLO_COMPLIANCE=$(jq -r '.slo_compliance.overall_compliance // 0' "$RESULTS_FILE")
            echo "SLO Compliance: ${SLO_COMPLIANCE}%"
            # Set output for other jobs
            echo "slo-compliance=${SLO_COMPLIANCE}" >> $GITHUB_OUTPUT
            # Check critical violations
            CRITICAL_VIOLATIONS=$(jq -r '.slo_compliance.critical_violations | length' "$RESULTS_FILE")
            if [ "$CRITICAL_VIOLATIONS" -gt 0 ]; then
              echo "❌ Critical performance violations detected!"
              jq -r '.slo_compliance.critical_violations[] | "🚨 \(.metric): \(.actual_value) (threshold: \(.threshold_value))"' "$RESULTS_FILE"
              echo "performance-gates-passed=false" >> $GITHUB_OUTPUT
              exit 1
            else
              echo "✅ All performance gates passed"
              echo "performance-gates-passed=true" >> $GITHUB_OUTPUT
            fi
          else
            echo "No benchmark results found"
            echo "slo-compliance=0" >> $GITHUB_OUTPUT
            echo "performance-gates-passed=false" >> $GITHUB_OUTPUT
            exit 1
          fi

      - name: Comment on PR (if applicable)
        if: github.event_name == 'pull_request' && always()
        continue-on-error: true
        # v7 is the current major of github-script; @v9 does not exist and
        # the step would fail to resolve the action.
        uses: actions/github-script@v7
        with:
          script: |
            const fs = require('fs');
            const { execSync } = require('child_process');
            // Find the benchmark report using shell (glob npm module not available in runner)
            let reportPath;
            try {
              reportPath = execSync('find benchmark-results -name "benchmark_report.md" | head -1', { encoding: 'utf8' }).trim();
            } catch (e) {
              reportPath = null;
            }
            if (reportPath && fs.existsSync(reportPath)) {
              const report = fs.readFileSync(reportPath, 'utf8');
              // Extract key metrics for comment
              const sloMatch = report.match(/SLO Compliance: (\d+\.?\d*)%/);
              const sloCompliance = sloMatch ? sloMatch[1] : 'N/A';
              const comment = [
                "## Performance Benchmark Results",
                "",
                `**SLO Compliance:** ${sloCompliance}%`,
                "",
                "### Key Findings:",
                report.includes('violations')
                  ? 'Some performance thresholds were not met'
                  : 'All performance requirements satisfied',
                "",
                `[View full report](https://github.com/terraphim/terraphim-ai/actions/runs/${process.env.GITHUB_RUN_ID})`,
              ].join("\n");
              github.rest.issues.createComment({
                issue_number: context.issue.number,
                owner: context.repo.owner,
                repo: context.repo.repo,
                body: comment
              });
            }
| performance-regression-check: | |
| name: Performance Regression Check | |
| runs-on: ubuntu-latest | |
| needs: performance-benchmarks | |
| if: always() && needs.performance-benchmarks.result == 'success' | |
| steps: | |
| - name: Check for regressions | |
| run: | | |
| # Compare current results with baseline | |
| # This is a simplified check - in practice you'd want more sophisticated analysis | |
| if [ "${{ needs.performance-benchmarks.outputs.performance-gates-passed }}" == "false" ]; then | |
| echo "Performance regression detected!" | |
| exit 1 | |
| else | |
| echo "No performance regressions detected" | |
| fi | |
| update-baseline: | |
| name: Update Performance Baseline | |
| runs-on: ubuntu-latest | |
| needs: [performance-benchmarks, performance-regression-check] | |
| if: github.ref == 'refs/heads/main' && needs.performance-regression-check.result == 'success' | |
| steps: | |
| - name: Checkout code | |
| uses: actions/checkout@v4 | |
| - name: Download benchmark results | |
| uses: actions/download-artifact@v4 | |
| with: | |
| name: benchmark-results-${{ github.run_id }} | |
| - name: Update baseline | |
| run: | | |
| # Copy latest results as new baseline | |
| RESULTS_FILE=$(find benchmark-results -name "benchmark_results.json" | head -1) | |
| if [ -n "$RESULTS_FILE" ] && [ -f "$RESULTS_FILE" ]; then | |
| cp "$RESULTS_FILE" "benchmark-results/baseline.json" | |
| echo "Updated performance baseline" | |
| fi | |
| - name: Build headline snapshot for terraphim.ai | |
| env: | |
| GITHUB_SHA: ${{ github.sha }} | |
| WORKFLOW_RUN_URL: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }} | |
| run: | | |
| # Emits benchmark-results/snapshot.json consumed by terraphim.ai's | |
| # content/data/benchmarks.json (see Gitea zestic-ai/terraphim-ai #574). | |
| # Script is idempotent; falls back to static values when Criterion | |
| # parameter labels do not match. | |
| bash scripts/update-benchmark-snapshot.sh | |
| - name: Commit baseline + snapshot update | |
| run: | | |
| git config --global user.name 'github-actions[bot]' | |
| git config --global user.email 'github-actions[bot]@users.noreply.github.com' | |
| git add benchmark-results/baseline.json benchmark-results/snapshot.json | |
| git commit -m "chore: update performance baseline and snapshot from run ${{ github.run_id }}" || echo "No changes to commit" | |
| git push origin main |