Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .changeset/0000-per-eval-threshold.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"evalite": minor
---

Added per-eval `scoreThreshold` option that overrides the global threshold (from config or `--threshold` CLI flag). This allows different evals to have different quality requirements.
14 changes: 10 additions & 4 deletions packages/evalite/src/evalite.ts
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,12 @@ function registerEvalite<TInput, TOutput, TExpected>(
: evalName;

return describeFn(fullEvalName, async () => {
const configTrialCount = inject("trialCount");
const trialCount = opts.trialCount ?? configTrialCount ?? 1;

const configScoreThreshold = inject("scoreThreshold");
const scoreThreshold = opts.scoreThreshold ?? configScoreThreshold;

const datasetResult = await datasetPromise;

if (!datasetResult.success) {
Expand All @@ -251,6 +257,7 @@ function registerEvalite<TInput, TOutput, TExpected>(
variantName: vitestOpts.variantName,
variantGroup: vitestOpts.variantGroup,
trialIndex: undefined,
scoreThreshold: scoreThreshold,
duration: 0,
expected: null,
input: null,
Expand All @@ -276,10 +283,6 @@ function registerEvalite<TInput, TOutput, TExpected>(
? dataset.filter((d) => d.only === true)
: dataset;

// Get trialCount from opts or config (opts wins)
const configTrialCount = inject("trialCount");
const trialCount = opts.trialCount ?? configTrialCount ?? 1;

// Expand dataset with trials
const expandedDataset: Array<{
input: TInput;
Expand Down Expand Up @@ -329,6 +332,7 @@ function registerEvalite<TInput, TOutput, TExpected>(
variantGroup: vitestOpts.variantGroup,
status: "running",
trialIndex: data.trialIndex,
scoreThreshold: scoreThreshold,
},
})
);
Expand Down Expand Up @@ -433,6 +437,7 @@ function registerEvalite<TInput, TOutput, TExpected>(
variantName: vitestOpts.variantName,
variantGroup: vitestOpts.variantGroup,
trialIndex: data.trialIndex,
scoreThreshold: scoreThreshold,
},
})
);
Expand Down Expand Up @@ -469,6 +474,7 @@ function registerEvalite<TInput, TOutput, TExpected>(
variantName: vitestOpts.variantName,
variantGroup: vitestOpts.variantGroup,
trialIndex: data.trialIndex,
scoreThreshold: scoreThreshold,
},
})
);
Expand Down
2 changes: 2 additions & 0 deletions packages/evalite/src/reporter.ts
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ export default class EvaliteReporter implements Reporter {
errors,
failedDueToThreshold,
scoreThreshold: this.opts.scoreThreshold,
failedThresholds: this.runner.getFailedThresholds(),
});
}

Expand Down Expand Up @@ -371,6 +372,7 @@ export default class EvaliteReporter implements Reporter {
variantName: data.initialEval.variantName,
variantGroup: data.initialEval.variantGroup,
trialIndex: data.initialEval.trialIndex,
scoreThreshold: data.initialEval.scoreThreshold,
},
});
}
Expand Down
95 changes: 87 additions & 8 deletions packages/evalite/src/reporter/EvaliteRunner.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,12 @@ export interface EvaliteRunnerOptions {
scoreThreshold: number | undefined;
}

/**
 * Describes one suite whose average score did not meet its score threshold.
 */
export interface FailedThresholdInfo {
/** Name of the suite that failed its threshold. */
suiteName: string;
/**
 * The suite's average score as a fraction (it is multiplied by 100 before
 * being compared against or displayed next to the threshold), or `null`
 * when the suite produced no scores.
 */
score: number | null;
/** The threshold the suite was checked against, as a percentage (0-100). */
threshold: number;
}

export class EvaliteRunner {
private opts: EvaliteRunnerOptions;
private state: Evalite.ServerState = {
Expand All @@ -16,6 +22,7 @@ export class EvaliteRunner {
cacheHitsByScorer: {},
};
private didLastRunFailThreshold: "yes" | "no" | "unknown" = "unknown";
private failedThresholds: FailedThresholdInfo[] = [];
private collectedResults: Map<string, Evalite.Eval> = new Map();
private eventQueue: Promise<void> = Promise.resolve();

Expand All @@ -31,6 +38,10 @@ export class EvaliteRunner {
return this.didLastRunFailThreshold;
}

/**
 * Returns the suites recorded as having failed their score threshold.
 *
 * The list is reset to empty and repopulated each time thresholds are
 * checked, so it reflects only the most recent check. The internal array is
 * returned directly, not a copy.
 */
getFailedThresholds(): FailedThresholdInfo[] {
return this.failedThresholds;
}

getAllScores(): Evalite.Score[] {
return Array.from(this.collectedResults.values()).flatMap(
(_eval) => _eval.scores
Expand All @@ -49,6 +60,46 @@ export class EvaliteRunner {
.flatMap((_eval) => _eval.scores);
}

/**
 * Groups the collected eval results by suite name and reports, for each
 * suite, the mean of its scores together with the suite's score threshold.
 *
 * - A score whose value is null/undefined is counted as 0.
 * - A suite with no scores at all reports `averageScore: null`.
 * - The threshold is the first defined `scoreThreshold` seen among the
 *   suite's evals, or `undefined` when none of them set one.
 *
 * Suites appear in the order in which they were first encountered.
 */
getSuiteResults(): Array<{
  suiteName: string;
  averageScore: number | null;
  threshold: number | undefined;
}> {
  type SuiteBucket = { scores: number[]; threshold: number | undefined };
  const buckets = new Map<string, SuiteBucket>();

  for (const result of this.collectedResults.values()) {
    let bucket = buckets.get(result.suiteName);

    if (bucket === undefined) {
      // First eval seen for this suite: start a bucket with its threshold.
      bucket = { scores: [], threshold: result.scoreThreshold };
      buckets.set(result.suiteName, bucket);
    } else if (
      bucket.threshold === undefined &&
      result.scoreThreshold !== undefined
    ) {
      // Keep the first defined threshold; later ones never overwrite it.
      bucket.threshold = result.scoreThreshold;
    }

    for (const entry of result.scores) {
      bucket.scores.push(entry.score ?? 0);
    }
  }

  const summaries: Array<{
    suiteName: string;
    averageScore: number | null;
    threshold: number | undefined;
  }> = [];

  for (const [suiteName, bucket] of buckets) {
    let total = 0;
    for (const value of bucket.scores) {
      total += value;
    }

    summaries.push({
      suiteName,
      averageScore:
        bucket.scores.length === 0 ? null : total / bucket.scores.length,
      threshold: bucket.threshold,
    });
  }

  return summaries;
}

handleTestSummary(data: {
failedTasksCount: number;
averageScore: number | null;
Expand All @@ -58,16 +109,44 @@ export class EvaliteRunner {
this.opts.modifyExitCode(1);
}

// Handle threshold checking
if (typeof this.opts.scoreThreshold === "number") {
if (
data.averageScore === null ||
data.averageScore * 100 < this.opts.scoreThreshold
) {
const suiteResults = this.getSuiteResults();
const globalThreshold = this.opts.scoreThreshold;

const hasAnyThreshold =
typeof globalThreshold === "number" ||
suiteResults.some((s) => typeof s.threshold === "number");

this.failedThresholds = [];

if (hasAnyThreshold) {
let anyFailed = false;
let anyPassed = false;

for (const suite of suiteResults) {
const threshold = suite.threshold ?? globalThreshold;

if (typeof threshold === "number") {
const passed =
suite.averageScore !== null &&
suite.averageScore * 100 >= threshold;

if (passed) {
anyPassed = true;
} else {
anyFailed = true;
this.failedThresholds.push({
suiteName: suite.suiteName,
score: suite.averageScore,
threshold: threshold,
});
}
}
}

if (anyFailed) {
this.opts.modifyExitCode(1);
this.didLastRunFailThreshold = "yes";
} else {
// Only set exit code to 0 if there are no failed tasks
} else if (anyPassed) {
if (data.failedTasksCount === 0) {
this.opts.modifyExitCode(0);
}
Expand Down
21 changes: 20 additions & 1 deletion packages/evalite/src/reporter/rendering.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import type { RunnerTestFile } from "vitest";
import type { Evalite } from "../types.js";
import { average, EvaliteFile } from "../utils.js";
import type { TestModule } from "vitest/node";
import type { FailedThresholdInfo } from "./EvaliteRunner.js";

export function withLabel(
color: "red" | "green" | "blue" | "cyan",
Expand Down Expand Up @@ -54,6 +55,7 @@ export function renderWatcherStart(
errors: unknown[];
failedDueToThreshold: boolean;
scoreThreshold: number | undefined;
failedThresholds: FailedThresholdInfo[];
}
) {
logger.log("");
Expand Down Expand Up @@ -91,9 +93,26 @@ export function renderWatcherStart(
withLabel(
"red",
"FAIL",
`${opts.scoreThreshold}% threshold not met. Watching for file changes...`
"Score threshold not met. Watching for file changes..."
)
);

if (opts.failedThresholds.length > 0) {
for (const failed of opts.failedThresholds) {
const scoreDisplay =
failed.score === null
? c.red("no score")
: c.red(`${Math.round(failed.score * 100)}%`);
logger.log(
BADGE_PADDING +
c.dim("- ") +
failed.suiteName +
c.dim(": ") +
scoreDisplay +
c.dim(` < ${failed.threshold}%`)
);
}
}
} else {
logger.log(withLabel("green", "PASS", "Waiting for file changes..."));
}
Expand Down
6 changes: 6 additions & 0 deletions packages/evalite/src/run-evalite.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,11 @@ declare module "vitest" {
* non-serializable functions (like storage factory).
*/
trialCount: number | undefined;
/**
* Global score threshold (0-100).
* Per-eval thresholds override this when set.
*/
scoreThreshold: number | undefined;
/**
* Port number where the evalite server is running.
* Used by cache and other features that need to communicate with the server.
Expand Down Expand Up @@ -349,6 +354,7 @@ export const runEvalite = async (opts: {

vitest.provide("cwd", cwd);
vitest.provide("trialCount", config?.trialCount);
vitest.provide("scoreThreshold", scoreThreshold);
vitest.provide("serverPort", actualServerPort);
vitest.provide("cacheDebug", opts.cacheDebug ?? false);
vitest.provide("cacheEnabled", cacheEnabled);
Expand Down
16 changes: 16 additions & 0 deletions packages/evalite/src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,7 @@ export declare namespace Evalite {
variantName: string | undefined;
variantGroup: string | undefined;
trialIndex: number | undefined;
scoreThreshold: number | undefined;
}

export type EvalStatus = "success" | "fail" | "running";
Expand All @@ -211,6 +212,7 @@ export declare namespace Evalite {
variantName: string | undefined;
variantGroup: string | undefined;
trialIndex: number | undefined;
scoreThreshold: number | undefined;
/**
* Technically, input and expected are known at the start
* of the suite. But because they may be files, they
Expand Down Expand Up @@ -313,6 +315,20 @@ export declare namespace Evalite {
* ```
*/
trialCount?: number;
/**
* Minimum average score threshold (0-100) for this eval.
* If the average score falls below this threshold, the process will exit with code 1.
* Overrides the global scoreThreshold when set.
* @example
* ```ts
* evalite("My Eval", {
* data: [...],
* task: ...,
* scoreThreshold: 80 // This eval requires 80%
* })
* ```
*/
scoreThreshold?: number;
};

export type ScorerOpts<TInput, TOutput, TExpected> = {
Expand Down