diff --git a/packages/data-designer-engine/src/data_designer/engine/resources/agent_rollout/atif.py b/packages/data-designer-engine/src/data_designer/engine/resources/agent_rollout/atif.py index 70b2e063c..602fab655 100644 --- a/packages/data-designer-engine/src/data_designer/engine/resources/agent_rollout/atif.py +++ b/packages/data-designer-engine/src/data_designer/engine/resources/agent_rollout/atif.py @@ -15,6 +15,7 @@ from data_designer.engine.resources.agent_rollout.utils import ( build_message, coerce_optional_str, + min_max_timestamps, require_string, stringify_json_value, ) @@ -157,6 +158,7 @@ def parse_file( project_path = coerce_optional_str(agent_extra.get("project_path")) or cwd git_branch = coerce_optional_str(agent_extra.get("git_branch")) + started_at, ended_at = min_max_timestamps(timestamps) return [ NormalizedAgentRolloutRecord( trace_id=session_id, @@ -168,8 +170,8 @@ def parse_file( cwd=cwd, project_path=project_path, git_branch=git_branch, - started_at=min(timestamps) if timestamps else None, - ended_at=max(timestamps) if timestamps else None, + started_at=started_at, + ended_at=ended_at, messages=messages, source_meta=source_meta, ) diff --git a/packages/data-designer-engine/src/data_designer/engine/resources/agent_rollout/claude_code.py b/packages/data-designer-engine/src/data_designer/engine/resources/agent_rollout/claude_code.py index 90835f3bb..4557aaa63 100644 --- a/packages/data-designer-engine/src/data_designer/engine/resources/agent_rollout/claude_code.py +++ b/packages/data-designer-engine/src/data_designer/engine/resources/agent_rollout/claude_code.py @@ -16,6 +16,7 @@ build_message, coerce_optional_str, load_jsonl_rows, + min_max_timestamps, require_string, stringify_json_value, stringify_text_value, @@ -86,6 +87,7 @@ def parse_file( elif record_type == "user": messages.extend(normalize_claude_user_messages(raw_record)) + started_at, ended_at = min_max_timestamps(timestamps) session_key = session_id or file_path.stem index_entry = 
session_index.get(session_key, {}) project_path = coerce_optional_str(index_entry.get("projectPath")) or cwd @@ -112,8 +114,8 @@ def parse_file( cwd=cwd, project_path=project_path, git_branch=git_branch, - started_at=min(timestamps) if timestamps else None, - ended_at=max(timestamps) if timestamps else None, + started_at=started_at, + ended_at=ended_at, messages=messages, source_meta=source_meta, ) diff --git a/packages/data-designer-engine/src/data_designer/engine/resources/agent_rollout/codex.py b/packages/data-designer-engine/src/data_designer/engine/resources/agent_rollout/codex.py index d8f34fe03..c38d73304 100644 --- a/packages/data-designer-engine/src/data_designer/engine/resources/agent_rollout/codex.py +++ b/packages/data-designer-engine/src/data_designer/engine/resources/agent_rollout/codex.py @@ -14,6 +14,7 @@ build_message, coerce_optional_str, load_jsonl_rows, + min_max_timestamps, require_string, stringify_json_value, stringify_text_value, @@ -143,6 +144,7 @@ def parse_file( if pending_reasoning: source_meta["unattached_reasoning"] = list(pending_reasoning) + earliest, latest = min_max_timestamps(timestamps) return [ NormalizedAgentRolloutRecord( trace_id=session_id, @@ -154,9 +156,8 @@ def parse_file( cwd=coerce_optional_str(session_meta.get("cwd")), project_path=coerce_optional_str(session_meta.get("cwd")), git_branch=coerce_optional_str(session_meta.get("git_branch")), - started_at=coerce_optional_str(session_meta.get("timestamp")) - or (min(timestamps) if timestamps else None), - ended_at=max(timestamps) if timestamps else None, + started_at=coerce_optional_str(session_meta.get("timestamp")) or earliest, + ended_at=latest, messages=messages, source_meta=source_meta, ) diff --git a/packages/data-designer-engine/src/data_designer/engine/resources/agent_rollout/utils.py b/packages/data-designer-engine/src/data_designer/engine/resources/agent_rollout/utils.py index a799afdaf..fe4877a16 100644 --- 
def min_max_timestamps(timestamps: list[str]) -> tuple[str | None, str | None]:
    """Return the chronologically earliest and latest timestamps.

    Values are parsed as ISO 8601 before comparison so that mixed UTC offsets
    and precisions order correctly (e.g. ``2025-01-01T00:30:00+01:00`` is
    earlier than ``2025-01-01T00:00:00Z``). Naive timestamps are treated as
    UTC. Unparseable values are skipped. The winning entries are returned in
    their original string form, and ``(None, None)`` is returned when no
    entry could be parsed. When several entries denote the same instant, the
    first one encountered wins on both ends.
    """
    earliest: tuple[datetime, str] | None = None
    latest: tuple[datetime, str] | None = None
    for original in timestamps:
        instant = parse_iso8601(original)
        if instant is None:
            continue
        # Strict comparisons keep the first entry on ties, which is the same
        # tie-breaking that the built-in min()/max() apply.
        if earliest is None or instant < earliest[0]:
            earliest = (instant, original)
        if latest is None or instant > latest[0]:
            latest = (instant, original)
    if earliest is None or latest is None:
        return None, None
    return earliest[1], latest[1]


def parse_iso8601(value: str) -> datetime | None:
    """Parse an ISO 8601 timestamp, treating naive values as UTC.

    Only a *trailing* ``Z``/``z`` UTC designator is rewritten to ``+00:00``
    before parsing, so the result does not depend on the interpreter's own
    handling of the designator (``datetime.fromisoformat`` only gained ``Z``
    support in Python 3.11) and no other ``Z`` in the text is touched.

    Returns ``None`` for values that cannot be parsed -- including entries
    that are not strings at all -- so callers can silently skip malformed
    entries.
    """
    if not isinstance(value, str):
        # Raw rollout data may carry null/numeric timestamps; skip them like
        # any other malformed entry instead of raising AttributeError.
        return None
    text = value
    if text.endswith(("Z", "z")):
        text = f"{text[:-1]}+00:00"
    try:
        parsed = datetime.fromisoformat(text)
    except ValueError:
        return None
    if parsed.tzinfo is None:
        # Naive timestamps are assumed to be UTC so they can be compared
        # against timezone-aware ones.
        return parsed.replace(tzinfo=timezone.utc)
    return parsed