diff --git a/.changeset/0000-per-eval-threshold.md b/.changeset/0000-per-eval-threshold.md new file mode 100644 index 00000000..33ec7ae2 --- /dev/null +++ b/.changeset/0000-per-eval-threshold.md @@ -0,0 +1,5 @@ +--- +"evalite": minor +--- + +Added per-eval `scoreThreshold` option that overrides the global threshold (from config or `--threshold` CLI flag). This allows different evals to have different quality requirements. diff --git a/packages/evalite/src/evalite.ts b/packages/evalite/src/evalite.ts index 4329f402..0e34f995 100644 --- a/packages/evalite/src/evalite.ts +++ b/packages/evalite/src/evalite.ts @@ -236,6 +236,12 @@ function registerEvalite( : evalName; return describeFn(fullEvalName, async () => { + const configTrialCount = inject("trialCount"); + const trialCount = opts.trialCount ?? configTrialCount ?? 1; + + const configScoreThreshold = inject("scoreThreshold"); + const scoreThreshold = opts.scoreThreshold ?? configScoreThreshold; + const datasetResult = await datasetPromise; if (!datasetResult.success) { @@ -251,6 +257,7 @@ function registerEvalite( variantName: vitestOpts.variantName, variantGroup: vitestOpts.variantGroup, trialIndex: undefined, + scoreThreshold: scoreThreshold, duration: 0, expected: null, input: null, @@ -276,10 +283,6 @@ function registerEvalite( ? dataset.filter((d) => d.only === true) : dataset; - // Get trialCount from opts or config (opts wins) - const configTrialCount = inject("trialCount"); - const trialCount = opts.trialCount ?? configTrialCount ?? 1; - // Expand dataset with trials const expandedDataset: Array<{ input: TInput; @@ -329,6 +332,7 @@ function registerEvalite( variantGroup: vitestOpts.variantGroup, status: "running", trialIndex: data.trialIndex, + scoreThreshold: scoreThreshold, }, }) ); @@ -433,6 +437,7 @@ function registerEvalite( variantName: vitestOpts.variantName, variantGroup: vitestOpts.variantGroup, trialIndex: data.trialIndex, + scoreThreshold: scoreThreshold, }, }) ); @@ -469,6 +474,7 @@ function registerEvalite( variantName: vitestOpts.variantName, variantGroup: vitestOpts.variantGroup, trialIndex: data.trialIndex, + scoreThreshold: scoreThreshold, }, }) ); diff --git a/packages/evalite/src/reporter.ts b/packages/evalite/src/reporter.ts index dab277a7..8c46a17c 100644 --- a/packages/evalite/src/reporter.ts +++ b/packages/evalite/src/reporter.ts @@ -81,6 +81,7 @@ export default class EvaliteReporter implements Reporter { errors, failedDueToThreshold, scoreThreshold: this.opts.scoreThreshold, + failedThresholds: this.runner.getFailedThresholds(), }); } @@ -371,6 +372,7 @@ export default class EvaliteReporter implements Reporter { variantName: data.initialEval.variantName, variantGroup: data.initialEval.variantGroup, trialIndex: data.initialEval.trialIndex, + scoreThreshold: data.initialEval.scoreThreshold, }, }); } diff --git a/packages/evalite/src/reporter/EvaliteRunner.ts b/packages/evalite/src/reporter/EvaliteRunner.ts index 93b12833..a4310d04 100644 --- a/packages/evalite/src/reporter/EvaliteRunner.ts +++ b/packages/evalite/src/reporter/EvaliteRunner.ts @@ -8,6 +8,12 @@ export interface EvaliteRunnerOptions { scoreThreshold: number | undefined; } +export interface FailedThresholdInfo { + suiteName: string; + score: number | null; + threshold: number; +} + export class EvaliteRunner { private opts: EvaliteRunnerOptions; private state: Evalite.ServerState = { @@ -16,6 +22,7 @@ export class EvaliteRunner { cacheHitsByScorer: {}, }; private didLastRunFailThreshold: "yes" | "no" | "unknown" = "unknown"; + private failedThresholds: FailedThresholdInfo[] = []; private collectedResults: Map = new Map(); private eventQueue: Promise = Promise.resolve(); @@ -31,6 +38,10 @@ export class EvaliteRunner { return this.didLastRunFailThreshold; } + getFailedThresholds(): FailedThresholdInfo[] { + return this.failedThresholds; + } + getAllScores(): Evalite.Score[] { return Array.from(this.collectedResults.values()).flatMap( (_eval) => _eval.scores @@ -49,6 +60,46 @@ export class EvaliteRunner { .flatMap((_eval) => _eval.scores); } + getSuiteResults(): Array<{ + suiteName: string; + averageScore: number | null; + threshold: number | undefined; + }> { + const suiteMap = new Map< + string, + { scores: number[]; threshold: number | undefined } + >(); + + for (const _eval of this.collectedResults.values()) { + const existing = suiteMap.get(_eval.suiteName); + const evalScores = _eval.scores.map((s) => s.score ?? 0); + + if (existing) { + existing.scores.push(...evalScores); + if ( + existing.threshold === undefined && + _eval.scoreThreshold !== undefined + ) { + existing.threshold = _eval.scoreThreshold; + } + } else { + suiteMap.set(_eval.suiteName, { + scores: evalScores, + threshold: _eval.scoreThreshold, + }); + } + } + + return Array.from(suiteMap.entries()).map(([suiteName, data]) => ({ + suiteName, + averageScore: + data.scores.length === 0 + ? null + : data.scores.reduce((a, b) => a + b, 0) / data.scores.length, + threshold: data.threshold, + })); + } + handleTestSummary(data: { failedTasksCount: number; averageScore: number | null; @@ -58,16 +109,44 @@ export class EvaliteRunner { this.opts.modifyExitCode(1); } - // Handle threshold checking - if (typeof this.opts.scoreThreshold === "number") { - if ( - data.averageScore === null || - data.averageScore * 100 < this.opts.scoreThreshold - ) { + const suiteResults = this.getSuiteResults(); + const globalThreshold = this.opts.scoreThreshold; + + const hasAnyThreshold = + typeof globalThreshold === "number" || + suiteResults.some((s) => typeof s.threshold === "number"); + + this.failedThresholds = []; + + if (hasAnyThreshold) { + let anyFailed = false; + let anyPassed = false; + + for (const suite of suiteResults) { + const threshold = suite.threshold ?? globalThreshold; + + if (typeof threshold === "number") { + const passed = + suite.averageScore !== null && + suite.averageScore * 100 >= threshold; + + if (passed) { + anyPassed = true; + } else { + anyFailed = true; + this.failedThresholds.push({ + suiteName: suite.suiteName, + score: suite.averageScore, + threshold: threshold, + }); + } + } + } + + if (anyFailed) { this.opts.modifyExitCode(1); this.didLastRunFailThreshold = "yes"; - } else { - // Only set exit code to 0 if there are no failed tasks + } else if (anyPassed) { if (data.failedTasksCount === 0) { this.opts.modifyExitCode(0); } diff --git a/packages/evalite/src/reporter/rendering.ts b/packages/evalite/src/reporter/rendering.ts index 3d614b32..81ca8a30 100644 --- a/packages/evalite/src/reporter/rendering.ts +++ b/packages/evalite/src/reporter/rendering.ts @@ -6,6 +6,7 @@ import type { RunnerTestFile } from "vitest"; import type { Evalite } from "../types.js"; import { average, EvaliteFile } from "../utils.js"; import type { TestModule } from "vitest/node"; +import type { FailedThresholdInfo } from "./EvaliteRunner.js"; export function withLabel( color: "red" | "green" | "blue" | "cyan", @@ -54,6 +55,7 @@ export function renderWatcherStart( errors: unknown[]; failedDueToThreshold: boolean; scoreThreshold: number | undefined; + failedThresholds: FailedThresholdInfo[]; } ) { logger.log(""); @@ -91,9 +93,26 @@ export function renderWatcherStart( withLabel( "red", "FAIL", - `${opts.scoreThreshold}% threshold not met. Watching for file changes...` + "Score threshold not met. Watching for file changes..." ) ); + + if (opts.failedThresholds.length > 0) { + for (const failed of opts.failedThresholds) { + const scoreDisplay = + failed.score === null + ? c.red("no score") + : c.red(`${Math.round(failed.score * 100)}%`); + logger.log( + BADGE_PADDING + + c.dim("- ") + + failed.suiteName + + c.dim(": ") + + scoreDisplay + + c.dim(` < ${failed.threshold}%`) + ); + } + } } else { logger.log(withLabel("green", "PASS", "Waiting for file changes...")); } diff --git a/packages/evalite/src/run-evalite.ts b/packages/evalite/src/run-evalite.ts index ec0ef78f..f5793976 100644 --- a/packages/evalite/src/run-evalite.ts +++ b/packages/evalite/src/run-evalite.ts @@ -22,6 +22,11 @@ declare module "vitest" { * non-serializable functions (like storage factory). */ trialCount: number | undefined; + /** + * Global score threshold (0-100). + * Per-eval thresholds override this when set. + */ + scoreThreshold: number | undefined; /** * Port number where the evalite server is running. * Used by cache and other features that need to communicate with the server. @@ -349,6 +354,7 @@ export const runEvalite = async (opts: { vitest.provide("cwd", cwd); vitest.provide("trialCount", config?.trialCount); + vitest.provide("scoreThreshold", scoreThreshold); vitest.provide("serverPort", actualServerPort); vitest.provide("cacheDebug", opts.cacheDebug ?? false); vitest.provide("cacheEnabled", cacheEnabled); diff --git a/packages/evalite/src/types.ts b/packages/evalite/src/types.ts index 2de53593..bd0f37d4 100644 --- a/packages/evalite/src/types.ts +++ b/packages/evalite/src/types.ts @@ -188,6 +188,7 @@ export declare namespace Evalite { variantName: string | undefined; variantGroup: string | undefined; trialIndex: number | undefined; + scoreThreshold: number | undefined; } export type EvalStatus = "success" | "fail" | "running"; @@ -211,6 +212,7 @@ export declare namespace Evalite { variantName: string | undefined; variantGroup: string | undefined; trialIndex: number | undefined; + scoreThreshold: number | undefined; /** * Technically, input and expected are known at the start * of the suite. But because they may be files, they @@ -313,6 +315,20 @@ export declare namespace Evalite { * ``` */ trialCount?: number; + /** + * Minimum average score threshold (0-100) for this eval. + * If the average score falls below this threshold, the process will exit with code 1. + * Overrides the global scoreThreshold when set. + * @example + * ```ts + * evalite("My Eval", { + * data: [...], + * task: ..., + * scoreThreshold: 80 // This eval requires 80% + * }) + * ``` + */ + scoreThreshold?: number; }; export type ScorerOpts = {