Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
111 changes: 82 additions & 29 deletions apps/web-evals/src/app/runs/[id]/run.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,7 @@ function formatLogContent(log: string): React.ReactNode[] {

export function Run({ run }: { run: Run }) {
const runStatus = useRunStatus(run)
const { tasks, tokenUsage, usageUpdatedAt, heartbeat, runners } = runStatus
const { tasks, tokenUsage, toolUsage, usageUpdatedAt, heartbeat, runners } = runStatus

const [selectedTask, setSelectedTask] = useState<Task | null>(null)
const [taskLog, setTaskLog] = useState<string | null>(null)
Expand Down Expand Up @@ -336,37 +336,70 @@ export function Run({ run }: { run: Run }) {
)

const taskMetrics: Record<number, TaskMetrics> = useMemo(() => {
// Reference usageUpdatedAt to trigger recomputation when Map contents change
void usageUpdatedAt
const metrics: Record<number, TaskMetrics> = {}

tasks?.forEach((task) => {
const usage = tokenUsage.get(task.id)

if (task.finishedAt && task.taskMetrics) {
metrics[task.id] = task.taskMetrics
} else if (usage) {
const streamingUsage = tokenUsage.get(task.id)
const dbMetrics = task.taskMetrics

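// For finished tasks, prefer DB values but fall back to streaming values.
// This handles race conditions during timeout where the DB might not have the latest data.
// NOTE(review): a finished task whose DB metrics have zero tokensIn, tokensOut and cost, and
// which has no streaming entry, ends up with no metrics entry at all, even if a DB row exists.
// The `task.finishedAt && task.taskMetrics` variant shown above would have used that DB row.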
if (task.finishedAt) {
// Check if DB metrics have meaningful values (not just default/empty)
const dbHasData = dbMetrics && (dbMetrics.tokensIn > 0 || dbMetrics.tokensOut > 0 || dbMetrics.cost > 0)
if (dbHasData) {
metrics[task.id] = dbMetrics
} else if (streamingUsage) {
// Fall back to streaming values if DB is empty/stale
metrics[task.id] = {
tokensIn: streamingUsage.totalTokensIn,
tokensOut: streamingUsage.totalTokensOut,
tokensContext: streamingUsage.contextTokens,
duration: streamingUsage.duration ?? 0,
cost: streamingUsage.totalCost,
}
}
} else if (streamingUsage) {
// For running tasks, use streaming values
metrics[task.id] = {
tokensIn: usage.totalTokensIn,
tokensOut: usage.totalTokensOut,
tokensContext: usage.contextTokens,
duration: usage.duration ?? 0,
cost: usage.totalCost,
tokensIn: streamingUsage.totalTokensIn,
tokensOut: streamingUsage.totalTokensOut,
tokensContext: streamingUsage.contextTokens,
duration: streamingUsage.duration ?? 0,
cost: streamingUsage.totalCost,
}
}
})

return metrics
// eslint-disable-next-line react-hooks/exhaustive-deps
}, [tasks, tokenUsage, usageUpdatedAt])

// Collect all unique tool names from all tasks and sort by total attempts
const toolColumns = useMemo<ToolName[]>(() => {
// Reference usageUpdatedAt to trigger recomputation when Map contents change
void usageUpdatedAt
if (!tasks) return []

const toolTotals = new Map<ToolName, number>()

for (const task of tasks) {
if (task.taskMetrics?.toolUsage) {
for (const [toolName, usage] of Object.entries(task.taskMetrics.toolUsage)) {
// Get both DB and streaming values
const dbToolUsage = task.taskMetrics?.toolUsage
const streamingToolUsage = toolUsage.get(task.id)

// For finished tasks, prefer DB values but fall back to streaming values
// For running tasks, use streaming values
// This handles race conditions during timeout where DB might not have latest data
const taskToolUsage = task.finishedAt
? dbToolUsage && Object.keys(dbToolUsage).length > 0
? dbToolUsage
: streamingToolUsage
: streamingToolUsage

if (taskToolUsage) {
for (const [toolName, usage] of Object.entries(taskToolUsage)) {
const tool = toolName as ToolName
const current = toolTotals.get(tool) ?? 0
toolTotals.set(tool, current + usage.attempts)
Expand All @@ -378,10 +411,13 @@ export function Run({ run }: { run: Run }) {
return Array.from(toolTotals.entries())
.sort((a, b) => b[1] - a[1])
.map(([name]): ToolName => name)
}, [tasks])
// toolUsage ref is stable; usageUpdatedAt triggers recomputation when Map contents change
}, [tasks, toolUsage, usageUpdatedAt])

// Compute aggregate stats
const stats = useMemo(() => {
// Reference usageUpdatedAt to trigger recomputation when Map contents change
void usageUpdatedAt
if (!tasks) return null

const passed = tasks.filter((t) => t.passed === true).length
Expand All @@ -393,8 +429,8 @@ export function Run({ run }: { run: Run }) {
let totalCost = 0
let totalDuration = 0

// Aggregate tool usage from completed tasks
const toolUsage: ToolUsage = {}
// Aggregate tool usage from all tasks (both finished and running)
const toolUsageAggregate: ToolUsage = {}

for (const task of tasks) {
const metrics = taskMetrics[task.id]
Expand All @@ -405,15 +441,24 @@ export function Run({ run }: { run: Run }) {
totalDuration += metrics.duration
}

// Aggregate tool usage from finished tasks with taskMetrics
if (task.finishedAt && task.taskMetrics?.toolUsage) {
for (const [key, usage] of Object.entries(task.taskMetrics.toolUsage)) {
// Aggregate tool usage: prefer DB values for finished tasks, fall back to streaming values
// This handles race conditions during timeout where DB might not have latest data
const dbToolUsage = task.taskMetrics?.toolUsage
const streamingToolUsage = toolUsage.get(task.id)
const taskToolUsage = task.finishedAt
? dbToolUsage && Object.keys(dbToolUsage).length > 0
? dbToolUsage
: streamingToolUsage
: streamingToolUsage

if (taskToolUsage) {
for (const [key, usage] of Object.entries(taskToolUsage)) {
const tool = key as keyof ToolUsage
if (!toolUsage[tool]) {
toolUsage[tool] = { attempts: 0, failures: 0 }
if (!toolUsageAggregate[tool]) {
toolUsageAggregate[tool] = { attempts: 0, failures: 0 }
}
toolUsage[tool].attempts += usage.attempts
toolUsage[tool].failures += usage.failures
toolUsageAggregate[tool].attempts += usage.attempts
toolUsageAggregate[tool].failures += usage.failures
}
}
}
Expand All @@ -427,13 +472,15 @@ export function Run({ run }: { run: Run }) {
totalTokensOut,
totalCost,
totalDuration,
toolUsage,
toolUsage: toolUsageAggregate,
}
// eslint-disable-next-line react-hooks/exhaustive-deps
}, [tasks, taskMetrics, tokenUsage, usageUpdatedAt])
// Map refs are stable; usageUpdatedAt triggers recomputation when Map contents change
}, [tasks, taskMetrics, toolUsage, usageUpdatedAt])

// Calculate elapsed time (wall-clock time from run creation to completion or now)
const elapsedTime = useMemo(() => {
// Reference usageUpdatedAt to trigger recomputation for live elapsed time updates
void usageUpdatedAt
if (!tasks || tasks.length === 0) return null

const startTime = new Date(run.createdAt).getTime()
Expand All @@ -452,7 +499,6 @@ export function Run({ run }: { run: Run }) {

// If still running, use current time
return Date.now() - startTime
// eslint-disable-next-line react-hooks/exhaustive-deps
}, [tasks, run.createdAt, run.taskMetricsId, usageUpdatedAt])

return (
Expand Down Expand Up @@ -655,7 +701,14 @@ export function Run({ run }: { run: Run }) {
{formatTokens(taskMetrics[task.id]!.tokensContext)}
</TableCell>
{toolColumns.map((toolName) => {
const usage = task.taskMetrics?.toolUsage?.[toolName]
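// Use DB values for finished tasks, but fall back to streaming values
// if the DB value for this tool is missing (handles a race condition during timeout).
// NOTE(review): this fallback is per tool, whereas toolColumns and stats fall back to
// streaming values only when the task's whole DB toolUsage map is empty. A cell can
// therefore show streaming data that the aggregate totals do not count.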
const dbUsage = task.taskMetrics?.toolUsage?.[toolName]
const streamingUsage = toolUsage.get(task.id)?.[toolName]
const usage = task.finishedAt
? (dbUsage ?? streamingUsage)
: streamingUsage

const successRate =
usage && usage.attempts > 0
? ((usage.attempts - usage.failures) / usage.attempts) * 100
Expand Down
11 changes: 10 additions & 1 deletion apps/web-evals/src/hooks/use-run-status.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import { useState, useCallback, useRef } from "react"
import { useQuery, keepPreviousData } from "@tanstack/react-query"

import { type TokenUsage, RooCodeEventName, taskEventSchema } from "@roo-code/types"
import { type TokenUsage, type ToolUsage, RooCodeEventName, taskEventSchema } from "@roo-code/types"
import type { Run, Task, TaskMetrics } from "@roo-code/evals"

import { getHeartbeat } from "@/actions/heartbeat"
Expand All @@ -15,6 +15,7 @@ export type RunStatus = {
runners: string[] | undefined
tasks: (Task & { taskMetrics: TaskMetrics | null })[] | undefined
tokenUsage: Map<number, TokenUsage & { duration?: number }>
toolUsage: Map<number, ToolUsage>
usageUpdatedAt: number | undefined
}

Expand All @@ -23,6 +24,7 @@ export const useRunStatus = (run: Run): RunStatus => {
const [usageUpdatedAt, setUsageUpdatedAt] = useState<number>()

const tokenUsage = useRef<Map<number, TokenUsage & { duration?: number }>>(new Map())
const toolUsage = useRef<Map<number, ToolUsage>>(new Map())
const startTimes = useRef<Map<number, number>>(new Map())

const { data: heartbeat } = useQuery({
Expand Down Expand Up @@ -78,6 +80,12 @@ export const useRunStatus = (run: Run): RunStatus => {
const startTime = startTimes.current.get(taskId)
const duration = startTime ? Date.now() - startTime : undefined
tokenUsage.current.set(taskId, { ...payload[1], duration })

// Track tool usage from streaming updates
if (payload[2]) {
toolUsage.current.set(taskId, payload[2])
}

setUsageUpdatedAt(Date.now())
break
}
Expand All @@ -96,6 +104,7 @@ export const useRunStatus = (run: Run): RunStatus => {
runners,
tasks,
tokenUsage: tokenUsage.current,
toolUsage: toolUsage.current,
usageUpdatedAt,
}
}
30 changes: 25 additions & 5 deletions packages/evals/src/cli/runTask.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import {
RooCodeEventName,
IpcMessageType,
EVALS_SETTINGS,
type ToolUsage,
} from "@roo-code/types"
import { IpcClient } from "@roo-code/ipc"

Expand Down Expand Up @@ -277,6 +278,8 @@ export const runTask = async ({ run, task, publish, logger, jobToken }: RunTaskO
let taskMetricsId: number | undefined
let rooTaskId: string | undefined
let isClientDisconnected = false
// Track accumulated tool usage across task instances (handles rehydration after abort)
const accumulatedToolUsage: ToolUsage = {}

const ignoreEvents: Record<"broadcast" | "log", RooCodeEventName[]> = {
broadcast: [RooCodeEventName.Message],
Expand Down Expand Up @@ -373,6 +376,27 @@ export const runTask = async ({ run, task, publish, logger, jobToken }: RunTaskO
const { totalCost, totalTokensIn, totalTokensOut, contextTokens, totalCacheWrites, totalCacheReads } =
payload[1]

// For both TaskTokenUsageUpdated and TaskCompleted: toolUsage is payload[2]
const incomingToolUsage: ToolUsage = payload[2] ?? {}

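// Merge incoming tool usage with accumulated data using a MAX strategy.
// This handles the case where a task is rehydrated after abort:
// - Empty rehydrated data won't overwrite existing: max(5, 0) = 5
// - Legitimate restart with additional work is captured: max(5, 8) = 8
// Each task instance tracks its own cumulative values, so we take the max
// to preserve the highest values seen across all instances.
// NOTE(review): this assumes a rehydrated instance resumes from the previous cumulative
// counts. If an instance instead restarts from zero and does less work than before
// (5 attempts, then 3 more after the restart), max(5, 3) = 5 under-reports the total.
// TODO confirm which behavior the task provides.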
for (const [toolName, usage] of Object.entries(incomingToolUsage)) {
const existing = accumulatedToolUsage[toolName as keyof ToolUsage]
if (existing) {
accumulatedToolUsage[toolName as keyof ToolUsage] = {
attempts: Math.max(existing.attempts, usage.attempts),
failures: Math.max(existing.failures, usage.failures),
}
} else {
accumulatedToolUsage[toolName as keyof ToolUsage] = { ...usage }
}
}

await updateTaskMetrics(taskMetricsId, {
cost: totalCost,
tokensIn: totalTokensIn,
Expand All @@ -381,14 +405,10 @@ export const runTask = async ({ run, task, publish, logger, jobToken }: RunTaskO
duration,
cacheWrites: totalCacheWrites ?? 0,
cacheReads: totalCacheReads ?? 0,
toolUsage: accumulatedToolUsage,
})
}

if (eventName === RooCodeEventName.TaskCompleted && taskMetricsId) {
const toolUsage = payload[2]
await updateTaskMetrics(taskMetricsId, { toolUsage })
}

if (eventName === RooCodeEventName.TaskAborted) {
taskAbortedAt = Date.now()
}
Expand Down
2 changes: 1 addition & 1 deletion packages/types/src/events.ts
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ export const rooCodeEventsSchema = z.object({
[RooCodeEventName.TaskUserMessage]: z.tuple([z.string()]),

[RooCodeEventName.TaskToolFailed]: z.tuple([z.string(), toolNamesSchema, z.string()]),
[RooCodeEventName.TaskTokenUsageUpdated]: z.tuple([z.string(), tokenUsageSchema]),
[RooCodeEventName.TaskTokenUsageUpdated]: z.tuple([z.string(), tokenUsageSchema, toolUsageSchema]),

[RooCodeEventName.ModeChanged]: z.tuple([z.string()]),
[RooCodeEventName.ProviderProfileChanged]: z.tuple([z.object({ name: z.string(), provider: z.string() })]),
Expand Down
4 changes: 2 additions & 2 deletions packages/types/src/task.ts
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ export type TaskProviderEvents = {

[RooCodeEventName.TaskUserMessage]: [taskId: string]

[RooCodeEventName.TaskTokenUsageUpdated]: [taskId: string, tokenUsage: TokenUsage]
[RooCodeEventName.TaskTokenUsageUpdated]: [taskId: string, tokenUsage: TokenUsage, toolUsage: ToolUsage]

[RooCodeEventName.ModeChanged]: [mode: string]
[RooCodeEventName.ProviderProfileChanged]: [config: { name: string; provider?: string }]
Expand Down Expand Up @@ -159,5 +159,5 @@ export type TaskEvents = {

// Task Analytics
[RooCodeEventName.TaskToolFailed]: [taskId: string, tool: ToolName, error: string]
[RooCodeEventName.TaskTokenUsageUpdated]: [taskId: string, tokenUsage: TokenUsage]
[RooCodeEventName.TaskTokenUsageUpdated]: [taskId: string, tokenUsage: TokenUsage, toolUsage: ToolUsage]
}
2 changes: 2 additions & 0 deletions src/__tests__/nested-delegation-resume.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,7 @@ describe("Nested delegation resume (A → B → C)", () => {
clineMessages: [],
userMessageContent: [],
consecutiveMistakeCount: 0,
emitFinalTokenUsageUpdate: vi.fn(),
} as unknown as Task

const blockC = {
Expand Down Expand Up @@ -223,6 +224,7 @@ describe("Nested delegation resume (A → B → C)", () => {
clineMessages: [],
userMessageContent: [],
consecutiveMistakeCount: 0,
emitFinalTokenUsageUpdate: vi.fn(),
} as unknown as Task

const blockB = {
Expand Down
Loading
Loading