Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
aac64bb
feat: dataset evaluators
mikeldking Sep 25, 2025
11c843e
ci: add ci for version-13
mikeldking Sep 25, 2025
394ddad
feat: dataset evaluators
mikeldking Sep 25, 2025
addd08c
fix: drop support for python 3.9 (#9818)
RogerHYang Oct 7, 2025
bd302ed
feat(evaluators): db migration for evaluator tables (#9960)
RogerHYang Oct 29, 2025
9092f2c
feat(evaluators): mutations for playground evaluator selector (#10042)
axiomofjoy Oct 29, 2025
b7b2af0
feat: Create evaluator mutations with optional dataset_id (#10065)
cephalization Oct 30, 2025
92c90d5
fix: ensure fields of polymorphic evaluator orm types are eagerly loa…
axiomofjoy Oct 30, 2025
0a3e5a3
feat: Evaluators creation page (#10054)
cephalization Oct 30, 2025
5fa3506
fix(evaluators): persist choices (#10076)
axiomofjoy Oct 31, 2025
2013b79
feat: Collect all json path segments when flattening example keys (#1…
cephalization Oct 31, 2025
a8ae458
feat(evaluators): add evaluator select (#10063)
yfrigui2 Oct 31, 2025
855d680
feat: Reorganize new evaluator form (#10081)
cephalization Nov 3, 2025
c642e79
chore: Dummy playground evaluation payloads (#10102)
anticorrelator Nov 4, 2025
7147b15
feat: Add examples route with examples table (#10123)
cephalization Nov 5, 2025
44df74a
feat: Add optional description field to new evaluator creation (#10132)
cephalization Nov 5, 2025
97c3cbe
feat: Improve rendering of dataset evals on playground (#10136)
cephalization Nov 5, 2025
b6a58e6
feat(evaluators): assign evaluator to dataset UI (#10135)
yfrigui2 Nov 5, 2025
9c22f28
feat: add metadata to evaluator db table (#10139)
RogerHYang Nov 5, 2025
88edc13
fix(evaluators): return annotation name in output config resolver (#1…
axiomofjoy Nov 6, 2025
51cb5e8
feat(evaluators): evaluators update and delete mutations (#10128)
axiomofjoy Nov 6, 2025
573004f
feat(evaluators): add annotation name to eval menu (#10156)
yfrigui2 Nov 6, 2025
cc9addd
feat: Add evaluators table to dataset evaluators page (#10157)
cephalization Nov 7, 2025
9d63e13
fix: Fix import error on evaluator page (#10185)
cephalization Nov 7, 2025
8f14f74
feat(evaluators): load in a default template for the evaluator that i…
mikeldking Nov 7, 2025
4fb7b3e
fix: eslint errors
mikeldking Nov 8, 2025
7a1eff4
ci: add ci for 12 (#10196)
mikeldking Nov 8, 2025
f19783b
feat: persist tools with eval (#10220)
cephalization Nov 12, 2025
baa233e
fix(evaluators): add validation for llm evaluator prompts (#10193)
axiomofjoy Nov 13, 2025
8f7ce68
feat: Refactor evaluator form for usage in create and edit workflows …
cephalization Nov 14, 2025
3ab2a58
feat: playground eval select updates (#10163)
yfrigui2 Nov 17, 2025
7f74cce
only include dataset-specific evaluators in playground eval select (#…
yfrigui2 Nov 17, 2025
5108469
feat: add eval outputs to playground (#10263)
yfrigui2 Nov 19, 2025
6f4dab3
feat: Create distinct slideovers for evaluator use cases (#10303)
cephalization Nov 19, 2025
c011874
fix(evaluators): clean up evaluators rebase
axiomofjoy Nov 20, 2025
8b50c03
feat: Implement builtin evaluators (#10308)
anticorrelator Nov 20, 2025
5b21e6e
feat: enable deleting dataset evaluator (#10354)
yfrigui2 Nov 21, 2025
905764c
fix: fix evaluator config dialog layout (#10366)
yfrigui2 Nov 24, 2025
6da124a
feat: Add input mapping support to built-in evaluators (#10355)
cephalization Nov 25, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions .github/workflows/playwright.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,16 @@ permissions:

on:
push:
branches: [main, feat/annotations]
branches:
- main
- version-13
paths:
- "src/**"
- "app/**"
pull_request:
branches: [main, feat/annotations]
branches:
- main
- version-13
paths:
- "src/**"
- "app/**"
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/python-CI.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ on:
push:
branches:
- main
- feat/version-12
- version-13
pull_request:
paths:
- "**/*.py"
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/typescript-CI.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ on:
push:
branches:
- main
- feat/version-12
- version-13
pull_request:
paths:
- "app/**"
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/typescript-packages-CI.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ on:
push:
branches:
- main
- feat/version-12
- version-13
pull_request:
paths:
- "js/**"
Expand Down
200 changes: 200 additions & 0 deletions app/schema.graphql
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,12 @@ interface ApiKey {
expiresAt: DateTime
}

input AssignEvaluatorToDatasetInput {
datasetId: ID!
evaluatorId: ID!
inputMapping: EvaluatorInputMappingInput = null
}

enum AuthMethod {
LOCAL
OAUTH2
Expand Down Expand Up @@ -174,6 +180,21 @@ type BoundedFloatInvocationParameter implements InvocationParameterBase {
maxValue: Float!
}

type BuiltInEvaluator implements Evaluator & Node {
"""The Globally Unique ID of this object"""
id: ID!
name: Identifier!
description: String
metadata: JSON!
kind: EvaluatorKind!
createdAt: DateTime!
updatedAt: DateTime!
inputSchema: JSON
isAssignedToDataset(datasetId: ID = null): Boolean!
datasetInputMapping(datasetId: ID = null): EvaluatorInputMapping
user: User
}

enum CanonicalParameterName {
TEMPERATURE
MAX_COMPLETION_TOKENS
Expand Down Expand Up @@ -227,6 +248,7 @@ input ChatCompletionInput {
template: PromptTemplateOptions
promptName: Identifier = null
repetitions: Int!
evaluators: [PlaygroundEvaluatorInput!]! = []
}

input ChatCompletionMessageInput {
Expand Down Expand Up @@ -270,6 +292,7 @@ input ChatCompletionOverDatasetInput {
experimentDescription: String = null
experimentMetadata: JSON = {}
promptName: Identifier = null
evaluators: [PlaygroundEvaluatorInput!]! = []
}

type ChatCompletionOverDatasetMutationExamplePayload {
Expand All @@ -292,6 +315,7 @@ type ChatCompletionRepetition {
toolCalls: [ChatCompletionToolCall!]!
span: Span
errorMessage: String
evaluations: [ExperimentRunAnnotation!]!
}

type ChatCompletionSubscriptionError implements ChatCompletionSubscriptionPayload {
Expand Down Expand Up @@ -377,6 +401,26 @@ input ClusterInput {
id: ID
}

type CodeEvaluator implements Evaluator & Node {
"""The Globally Unique ID of this object"""
id: ID!
name: Identifier!
description: String
metadata: JSON!
kind: EvaluatorKind!
createdAt: DateTime!
updatedAt: DateTime!
inputSchema: JSON
isAssignedToDataset(datasetId: ID = null): Boolean!
datasetInputMapping(datasetId: ID = null): EvaluatorInputMapping
user: User
}

type CodeEvaluatorMutationPayload {
evaluator: CodeEvaluator!
query: Query!
}

union ContentPart = TextContentPart | ToolCallContentPart | ToolResultContentPart

input ContentPartInput @oneOf {
Expand Down Expand Up @@ -440,6 +484,12 @@ input CreateChatPromptVersionInput {
tags: [SetPromptVersionTagInput!] = null
}

input CreateCodeEvaluatorInput {
datasetId: ID
name: Identifier!
description: String
}

input CreateDatasetInput {
name: String!
description: String
Expand Down Expand Up @@ -473,6 +523,14 @@ input CreateDatasetSplitWithExamplesInput {
exampleIds: [ID!]!
}

input CreateLLMEvaluatorInput {
datasetId: ID
name: Identifier!
description: String
promptVersion: ChatPromptVersionInput!
outputConfig: CategoricalAnnotationConfigInput!
}

input CreateModelMutationInput {
name: String!
provider: String = null
Expand Down Expand Up @@ -624,6 +682,7 @@ type Dataset implements Node {
experiments(first: Int = 50, last: Int, after: String, before: String, filterCondition: String, filterIds: [ID!]): ExperimentConnection!
experimentAnnotationSummaries: [DatasetExperimentAnnotationSummary!]!
labels: [DatasetLabel!]!
evaluators(first: Int = 50, last: Int, after: String, before: String, sort: EvaluatorSort, filter: EvaluatorFilter): EvaluatorConnection!
lastUpdatedAt: DateTime
}

Expand Down Expand Up @@ -892,6 +951,15 @@ type DeleteDatasetSplitsMutationPayload {
query: Query!
}

input DeleteEvaluatorsInput {
evaluatorIds: [ID!]!
}

type DeleteEvaluatorsPayload {
evaluatorIds: [ID!]!
query: Query!
}

input DeleteExperimentsInput {
experimentIds: [ID!]!
}
Expand Down Expand Up @@ -1231,6 +1299,88 @@ input EvalResultKey {
attr: EvalAttr!
}

type EvaluationChunk implements ChatCompletionSubscriptionPayload {
datasetExampleId: ID
repetitionNumber: Int
experimentRunEvaluation: ExperimentRunAnnotation
spanEvaluation: SpanAnnotation
}

interface Evaluator implements Node {
"""The Globally Unique ID of this object"""
id: ID!
name: Identifier!
description: String
metadata: JSON!
kind: EvaluatorKind!
createdAt: DateTime!
updatedAt: DateTime!
inputSchema: JSON
isAssignedToDataset(datasetId: ID = null): Boolean!
datasetInputMapping(datasetId: ID = null): EvaluatorInputMapping
}

enum EvaluatorColumn {
name
kind
createdAt
updatedAt
}

"""A connection to a list of items."""
type EvaluatorConnection {
"""Pagination data for this connection"""
pageInfo: PageInfo!

"""Contains the nodes in this connection"""
edges: [EvaluatorEdge!]!
}

"""An edge in a connection."""
type EvaluatorEdge {
"""A cursor for use in pagination"""
cursor: String!

"""The item at the end of the edge"""
node: Evaluator!
}

"""The filter key and value for evaluator connections"""
input EvaluatorFilter {
col: EvaluatorFilterColumn!
value: String!
}

enum EvaluatorFilterColumn {
name
}

type EvaluatorInputMapping {
literalMapping: JSON!
pathMapping: JSON!
}

input EvaluatorInputMappingInput {
literalMapping: JSON! = {}
pathMapping: JSON! = {}
}

enum EvaluatorKind {
LLM
CODE
}

type EvaluatorMutationPayload {
evaluator: Evaluator!
query: Query!
}

"""The sort key and direction for evaluator connections"""
input EvaluatorSort {
col: EvaluatorColumn!
dir: SortDir!
}

type Event {
id: ID!
eventMetadata: EventMetadata!
Expand Down Expand Up @@ -1763,6 +1913,30 @@ type JSONInvocationParameter implements InvocationParameterBase {
defaultValue: JSON
}

type LLMEvaluator implements Evaluator & Node {
"""The Globally Unique ID of this object"""
id: ID!
name: Identifier!
description: String
metadata: JSON!
kind: EvaluatorKind!
createdAt: DateTime!
updatedAt: DateTime!
inputSchema: JSON
isAssignedToDataset(datasetId: ID = null): Boolean!
datasetInputMapping(datasetId: ID = null): EvaluatorInputMapping
outputConfig: CategoricalAnnotationConfig!
prompt: Prompt!
promptVersionTag: PromptVersionTag
user: User
promptVersion: PromptVersion!
}

type LLMEvaluatorMutationPayload {
evaluator: LLMEvaluator!
query: Query!
}

type LabelFraction {
label: String!
fraction: Float!
Expand Down Expand Up @@ -1825,6 +1999,12 @@ type Mutation {
deleteDatasetSplits(input: DeleteDatasetSplitInput!): DeleteDatasetSplitsMutationPayload!
setDatasetExampleSplits(input: SetDatasetExampleSplitsInput!): SetDatasetExampleSplitsMutationPayload!
createDatasetSplitWithExamples(input: CreateDatasetSplitWithExamplesInput!): DatasetSplitMutationPayloadWithExamples!
createCodeEvaluator(input: CreateCodeEvaluatorInput!): CodeEvaluatorMutationPayload!
createLlmEvaluator(input: CreateLLMEvaluatorInput!): LLMEvaluatorMutationPayload!
updateLlmEvaluator(input: UpdateLLMEvaluatorInput!): LLMEvaluatorMutationPayload!
deleteEvaluators(input: DeleteEvaluatorsInput!): DeleteEvaluatorsPayload!
assignEvaluatorToDataset(input: AssignEvaluatorToDatasetInput!): EvaluatorMutationPayload!
unassignEvaluatorFromDataset(input: UnassignEvaluatorFromDatasetInput!): EvaluatorMutationPayload!
deleteExperiments(input: DeleteExperimentsInput!): ExperimentMutationPayload!

"""
Expand Down Expand Up @@ -1990,6 +2170,11 @@ type PerformanceTimeSeries implements TimeSeries {
data: [TimeSeriesDataPoint!]!
}

input PlaygroundEvaluatorInput {
id: ID!
inputMapping: EvaluatorInputMappingInput! = {}
}

type PlaygroundModel implements ModelInterface {
name: String!
providerKey: GenerativeProviderKey!
Expand Down Expand Up @@ -2497,6 +2682,8 @@ type Query {
promptLabels(first: Int = 50, last: Int, after: String, before: String): PromptLabelConnection!
datasetLabels(first: Int = 50, last: Int, after: String, before: String): DatasetLabelConnection!
datasetSplits(first: Int = 50, last: Int, after: String, before: String): DatasetSplitConnection!
builtInEvaluators: [BuiltInEvaluator!]!
evaluators(first: Int = 50, last: Int, after: String, before: String, sort: EvaluatorSort, filter: EvaluatorFilter): EvaluatorConnection!
annotationConfigs(first: Int = 50, last: Int = null, after: String = null, before: String = null): AnnotationConfigConnection!
clusters(clusters: [ClusterInput!]!): [Cluster!]!
hdbscanClustering(
Expand Down Expand Up @@ -3263,6 +3450,11 @@ type UMAPPoints {
contextRetrievals: [Retrieval!]!
}

input UnassignEvaluatorFromDatasetInput {
datasetId: ID!
evaluatorId: ID!
}

input UnsetPromptLabelsInput {
promptId: ID!
promptLabelIds: [ID!]!
Expand All @@ -3289,6 +3481,14 @@ input UpdateAnnotationInput {
source: AnnotationSource! = APP
}

input UpdateLLMEvaluatorInput {
evaluatorId: ID!
name: Identifier!
description: String = null
promptVersion: ChatPromptVersionInput!
outputConfig: CategoricalAnnotationConfigInput!
}

input UpdateModelMutationInput {
id: ID!
name: String!
Expand Down
Loading
Loading