diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..535692e
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,43 @@
+# Quality gate for pull requests and pushes to main: clean install, typecheck, tests.
+name: PR Quality Gate
+
+on:
+  pull_request:
+  push:
+    branches:
+      - main
+
+# Least privilege: restrict the workflow token to read-only repository access.
+permissions:
+  contents: read
+
+# One active run per workflow and ref. Superseded pull request runs are
+# cancelled, but runs on main finish so every pushed commit gets a result.
+concurrency:
+  group: ci-${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: ${{ github.event_name == 'pull_request' }}
+
+jobs:
+  validate:
+    name: Node 24 | npm ci + typecheck + test
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Setup Node.js
+        uses: actions/setup-node@v4
+        with:
+          node-version: "24"
+          cache: npm
+
+      # The README names `npm ci` as the canonical clean-install path.
+      - name: Install dependencies
+        run: npm ci
+
+      - name: Typecheck
+        run: npm run typecheck
+
+      - name: Run tests
+        run: npm test
diff --git a/README.md b/README.md
index cbe031c..15b49bd 100644
--- a/README.md
+++ b/README.md
@@ -55,6 +55,13 @@ Evaluate `SWE`, `TB2`, `Tau`, and `SAE` through a unified `kbench` CLI plus benc
 `adapter generate` is part of the current runtime path for bootstrapping dynamic `custom-adapter` integrations.
 Today this generator is heuristic and repository-inspection-based; this repo does not yet ship a built-in remote LLM adapter generator.
 
+## Environment Requirements
+
+- Node.js `24.x` is the recommended local and CI runtime for this repository
+- npm `11.x` is the expected package manager line
+- `npm ci` is the canonical clean-install path and is enforced by the PR quality gate
+- Benchmark workflows still install their own extra runtime dependencies such as Python/Harbor/tau-bench as needed
+
 ## Quick Start
 
 This repository is primarily intended to run benchmarks through GitHub Actions.
diff --git a/README.zh-CN.md b/README.zh-CN.md index 7c9cd6a..0f66865 100644 --- a/README.zh-CN.md +++ b/README.zh-CN.md @@ -55,6 +55,13 @@ `adapter generate` 已经是当前仓库里动态 `custom-adapter` bootstrap 的一部分。 但当前实现仍然是基于本地仓库检查和启发式推断,不是“仓库内置远程 LLM 动态生成 adapter”。 +## 环境要求 + +- 推荐使用 Node.js `24.x` 作为本地与 CI 运行时 +- 预期使用 npm `11.x` +- `npm ci` 是标准的干净安装路径,并已作为 PR 质量门禁的一部分 +- benchmark workflow 仍会按需安装额外运行时依赖,例如 Python、Harbor、tau-bench + ## Quick Start 这个仓库的主要用法仍然是通过 GitHub Actions 发起评测。 diff --git a/src/benchmark/sae/runner.ts b/src/benchmark/sae/runner.ts index ab7cef0..3f585cd 100644 --- a/src/benchmark/sae/runner.ts +++ b/src/benchmark/sae/runner.ts @@ -234,6 +234,17 @@ function mergeSummary(summary: SummaryResult, benchmarkResult: Record(modelName: string, baseUrl: string | undefined, fn: () => Promise): Promise { if (!baseUrl) { return fn(); @@ -254,6 +265,8 @@ function withTemporaryBaseUrl(modelName: string, baseUrl: string | undefined, } export async function runSaeBenchmark(config: SaeBenchmarkConfig): Promise { + const saeTimeoutMs = normalizePositiveMs(config.saeTimeoutMs, 'saeTimeoutMs', 30 * 60 * 1000); + const saePollIntervalMs = normalizePositiveMs(config.saePollIntervalMs, 'saePollIntervalMs'); const layout = createRunLayout(config.runDir, config.runId); const artifactDir = path.join(layout.runDir, 'artifacts', 'sae'); const metadata: RunMetadata = { @@ -270,8 +283,8 @@ export async function runSaeBenchmark(config: SaeBenchmarkConfig): Promise, name: string): number | undefined { + const raw = values.get(name); + if (raw === undefined) { + return undefined; + } + + const parsed = Number(raw); + if (!Number.isFinite(parsed) || !Number.isInteger(parsed) || parsed <= 0) { + throw new Error(`Invalid --${name}. 
Expected a positive integer.`); + } + return parsed; +} + function parseRunArgs(argv: string[]): RunCliArgs { const values = parseFlags(argv); const benchmark = values.get('benchmark') as BenchmarkId | undefined; @@ -702,7 +715,7 @@ function parseRunArgs(argv: string[]): RunCliArgs { const runId = values.get('run-id') || nowId('run'); const instanceId = values.get('instance-id') || `${benchmark}-instance`; const runDir = path.resolve(values.get('run-dir') || path.join(process.cwd(), '.kbench', 'runs', runId)); - const explicitTimeoutMs = values.get('timeout-ms') ? Number(values.get('timeout-ms')) : undefined; + const explicitTimeoutMs = parsePositiveIntegerFlag(values, 'timeout-ms'); const configModeValue = values.get('config-mode'); if (configModeValue && configModeValue !== 'inherit' && configModeValue !== 'isolated') { throw new Error('Invalid --config-mode. Expected one of: inherit, isolated.'); @@ -770,7 +783,7 @@ function parseBenchmarkRunArgs(argv: string[]): BenchmarkRunCliArgs { storeDirProvided: values.has('store-dir'), workDir: values.get('workdir') ? path.resolve(values.get('workdir') as string) : undefined, storeDir: values.get('store-dir') ? path.resolve(values.get('store-dir') as string) : undefined, - timeoutMs: values.get('sae-timeout-ms') ? Number(values.get('sae-timeout-ms')) : undefined, + timeoutMs: parsePositiveIntegerFlag(values, 'sae-timeout-ms'), saeApiBase: values.get('sae-api-base') || 'https://www.kaggle.com/api/v1', saeAgentIdFile: values.get('sae-agent-id-file') || '~/.kaggle-agent-id', saeApiKeyFile: values.get('sae-api-key-file') || '~/.kaggle-agent-api-key', @@ -779,7 +792,7 @@ function parseBenchmarkRunArgs(argv: string[]): BenchmarkRunCliArgs { saeAgentDescription: values.get('sae-agent-description'), saeAgentVersion: values.get('sae-agent-version') || '1.0', saeAgentType: values.get('sae-agent-type') || harness, - saePollIntervalMs: values.get('sae-poll-interval-ms') ? 
Number(values.get('sae-poll-interval-ms')) : 2000, + saePollIntervalMs: parsePositiveIntegerFlag(values, 'sae-poll-interval-ms') ?? 2000, }; } diff --git a/src/harness/drivers/cli/runtime.ts b/src/harness/drivers/cli/runtime.ts index a86ef08..692f501 100644 --- a/src/harness/drivers/cli/runtime.ts +++ b/src/harness/drivers/cli/runtime.ts @@ -93,9 +93,6 @@ export function extractPatchSinceBaseline(baseline: GitPatchBaseline): string | if ((baseline.beforeDiff || '') === afterDiff) { return undefined; } - if (baseline.beforeDiff && baseline.beforeDiff.trim()) { - return undefined; - } return afterDiff; } diff --git a/src/harness/sdk/validate.ts b/src/harness/sdk/validate.ts index ebb50c8..2846790 100644 --- a/src/harness/sdk/validate.ts +++ b/src/harness/sdk/validate.ts @@ -284,7 +284,7 @@ export async function validateAdapter(adapterPath: string): Promise check.ok || check.warnings.length > 0), + ok: loaded.schema.ok && entryValidation.ok && executionChecks.every((check) => check.ok), adapterPath: loaded.adapterPath, manifestPath: loaded.manifestPath, entryPath: loaded.manifest ? 
loaded.entryPath : undefined, diff --git a/test/benchmark/sae-runner.test.ts b/test/benchmark/sae-runner.test.ts index 97d2cef..c879bf6 100644 --- a/test/benchmark/sae-runner.test.ts +++ b/test/benchmark/sae-runner.test.ts @@ -1,6 +1,9 @@ +import os from 'node:os'; +import path from 'node:path'; + import { describe, expect, it } from 'vitest'; -import { parseBoolean } from '../../src/benchmark/sae/runner.js'; +import { parseBoolean, runSaeBenchmark } from '../../src/benchmark/sae/runner.js'; describe('SAE runner helpers', () => { it('parses common truthy values', () => { @@ -21,4 +24,20 @@ describe('SAE runner helpers', () => { expect(parseBoolean(undefined, true)).toBe(true); expect(parseBoolean('maybe', false)).toBe(false); }); + + it('rejects invalid polling configuration before making network requests', async () => { + await expect(runSaeBenchmark({ + runId: 'sae-invalid-config', + runDir: path.join(os.tmpdir(), 'kbench-sae-invalid-config'), + harness: 'kode-agent-sdk', + modelName: 'openai/gpt-4.1-mini', + saeApiBase: 'https://www.kaggle.com/api/v1', + saeAgentIdFile: '~/.kaggle-agent-id', + saeApiKeyFile: '~/.kaggle-agent-api-key', + saeRegisterIfMissing: false, + saeAgentVersion: '1.0', + saeAgentType: 'kode-agent-sdk', + saePollIntervalMs: Number.NaN, + })).rejects.toThrow('saePollIntervalMs must be a positive finite number of milliseconds.'); + }); }); diff --git a/test/cli/kbench-cli.test.ts b/test/cli/kbench-cli.test.ts index 043c680..62c5b95 100644 --- a/test/cli/kbench-cli.test.ts +++ b/test/cli/kbench-cli.test.ts @@ -99,4 +99,59 @@ describe('kbench CLI', () => { expect(payload.benchmarkError.message).toContain(missingIdFile); expect(payload.benchmarkError.message).toContain(missingKeyFile); }); + + it('rejects invalid sae-timeout-ms values before running the benchmark', async () => { + const result = await runKbench([ + 'benchmark', + 'run', + '--benchmark', + 'sae', + '--harness', + 'kode-agent-sdk', + '--model-name', + 'openai/gpt-4.1-mini', + 
'--sae-timeout-ms', + 'NaN', + ]); + + expect(result.status).toBe(1); + expect(result.stderr).toContain('Invalid --sae-timeout-ms. Expected a positive integer.'); + }); + + it('rejects invalid sae-poll-interval-ms values before running the benchmark', async () => { + const result = await runKbench([ + 'benchmark', + 'run', + '--benchmark', + 'sae', + '--harness', + 'kode-agent-sdk', + '--model-name', + 'openai/gpt-4.1-mini', + '--sae-poll-interval-ms', + '0', + ]); + + expect(result.status).toBe(1); + expect(result.stderr).toContain('Invalid --sae-poll-interval-ms. Expected a positive integer.'); + }); + + it('rejects invalid timeout-ms values before running a single instance', async () => { + const result = await runKbench([ + 'run', + '--benchmark', + 'swe', + '--harness', + 'kode-agent-sdk', + '--model-name', + 'openai/gpt-4.1-mini', + '--instruction', + 'Fix the bug', + '--timeout-ms', + 'NaN', + ]); + + expect(result.status).toBe(1); + expect(result.stderr).toContain('Invalid --timeout-ms. 
Expected a positive integer.'); + }); }); diff --git a/test/harness/cli-runtime.test.ts b/test/harness/cli-runtime.test.ts new file mode 100644 index 0000000..935b80e --- /dev/null +++ b/test/harness/cli-runtime.test.ts @@ -0,0 +1,45 @@ +import { execFileSync } from 'node:child_process'; +import fs from 'node:fs/promises'; +import os from 'node:os'; +import path from 'node:path'; + +import { afterEach, describe, expect, it } from 'vitest'; + +import { captureGitPatchBaseline, extractPatchSinceBaseline } from '../../src/harness/drivers/cli/runtime.js'; + +const tempDirs: string[] = []; + +afterEach(async () => { + await Promise.all(tempDirs.splice(0).map((dir) => fs.rm(dir, { recursive: true, force: true }))); +}); + +async function makeGitRepo(): Promise { + const dir = await fs.mkdtemp(path.join(os.tmpdir(), 'kbench-cli-runtime-')); + tempDirs.push(dir); + + execFileSync('git', ['init'], { cwd: dir, stdio: 'ignore' }); + execFileSync('git', ['config', 'user.email', 'kbench@example.com'], { cwd: dir, stdio: 'ignore' }); + execFileSync('git', ['config', 'user.name', 'kbench'], { cwd: dir, stdio: 'ignore' }); + + await fs.writeFile(path.join(dir, 'tracked.txt'), 'base\n', 'utf-8'); + execFileSync('git', ['add', 'tracked.txt'], { cwd: dir, stdio: 'ignore' }); + execFileSync('git', ['commit', '-m', 'init'], { cwd: dir, stdio: 'ignore' }); + + return dir; +} + +describe('CLI runtime patch capture', () => { + it('captures the current diff even when the worktree was already dirty before execution', async () => { + const repoDir = await makeGitRepo(); + + await fs.writeFile(path.join(repoDir, 'tracked.txt'), 'base\nbefore\n', 'utf-8'); + const baseline = captureGitPatchBaseline(repoDir); + + await fs.writeFile(path.join(repoDir, 'tracked.txt'), 'base\nbefore\nafter\n', 'utf-8'); + const patch = extractPatchSinceBaseline(baseline); + + expect(patch).toBeDefined(); + expect(patch).toContain('tracked.txt'); + expect(patch).toContain('after'); + }); +}); diff --git 
a/test/sdk/adapter.test.ts b/test/sdk/adapter.test.ts index b3cf22c..2795d63 100644 --- a/test/sdk/adapter.test.ts +++ b/test/sdk/adapter.test.ts @@ -60,4 +60,54 @@ describe('adapter scaffolding', () => { expect(report.executionChecks[0]?.mode).toBe('task'); expect(report.executionChecks[0]?.output?.status).toBe('ok'); }); + + it('does not mark adapter validation as ok when execution has both errors and warnings', async () => { + const root = await makeTempRoot(); + const adapterDir = path.join(root, 'warning-error-adapter'); + + await fs.mkdir(adapterDir, { recursive: true }); + await fs.writeFile( + path.join(adapterDir, 'adapter.manifest.json'), + `${JSON.stringify({ + schemaVersion: 'kbench.adapter/v1', + id: 'warning-error-adapter', + kind: 'node', + entry: './runner.mjs', + version: '0.1.0', + supportedBenchmarks: ['swe'], + capabilities: { + runModes: ['task'], + machineReadableStdout: true, + supportsPatchOutput: false, + supportsTrajectory: false, + supportsToolCallTrace: false, + supportsResume: false, + supportsImages: false, + supportsSandboxBridge: false, + supportsPromptTemplate: false, + }, + }, null, 2)}\n`, + 'utf-8' + ); + await fs.writeFile( + path.join(adapterDir, 'runner.mjs'), + `#!/usr/bin/env node +process.stdout.write(JSON.stringify({ + ok: true, + status: 'bad-status', + elapsedMs: 1, + patch: 'diff --git a/a b/a\\n', +})); +`, + { encoding: 'utf-8', mode: 0o755 } + ); + + const report = await validateAdapter(adapterDir); + + expect(report.ok).toBe(false); + expect(report.executionChecks).toHaveLength(1); + expect(report.executionChecks[0]?.ok).toBe(false); + expect(report.executionChecks[0]?.errors.some((error) => error.includes('valid "status"'))).toBe(true); + expect(report.executionChecks[0]?.warnings.some((warning) => warning.includes('returned a patch'))).toBe(true); + }); });