Gracker · Gracker · May 8, 2026
diff --git a/backend/src/scripts/__tests__/verifyEnterpriseMultiTenantWindows.test.ts b/backend/src/scripts/__tests__/verifyEnterpriseMultiTenantWindows.test.ts
@@ -31,7 +31,7 @@ describe('verifyEnterpriseMultiTenantWindows script', () => {
     await fs.rm(tempRoot, { recursive: true, force: true });
   });
 
-  test('covers current D1/D2/D4/D5/D6/D7/D8 isolation invariants without invoking a real provider', async () => {
+  test('covers current D1/D2/D4/D5/D6/D7/D8/D9 isolation invariants without invoking a real provider', async () => {
     const tracePath = path.join(tempRoot, 'fixture.pftrace');
     const uploadRoot = path.join(tempRoot, 'uploads');
     const outputPath = path.join(tempRoot, 'report.json');
@@ -53,6 +53,7 @@ describe('verifyEnterpriseMultiTenantWindows script', () => {
       D6: true,
       D7: true,
       D8: true,
+      D9: true,
     });
     expect(Object.values(report.scenarios.D1.checks)).toEqual(expect.arrayContaining([true]));
     expect(Object.values(report.scenarios.D1.checks).every(Boolean)).toBe(true);
@@ -62,6 +63,7 @@ describe('verifyEnterpriseMultiTenantWindows script', () => {
     expect(Object.values(report.scenarios.D6.checks).every(Boolean)).toBe(true);
     expect(Object.values(report.scenarios.D7.checks).every(Boolean)).toBe(true);
     expect(Object.values(report.scenarios.D8.checks).every(Boolean)).toBe(true);
+    expect(Object.values(report.scenarios.D9.checks).every(Boolean)).toBe(true);
     expect(report.coverageLimitations.join('\n')).toContain('TraceProcessorLease');
     expect(report.scenarios.D6.details).toEqual(expect.objectContaining({
       reportUrl: expect.stringMatching(/^\/api\/reports\//),
@@ -71,6 +73,12 @@ describe('verifyEnterpriseMultiTenantWindows script', () => {
       providerSnapshotChanged: true,
       sdkSessionReusable: false,
     }));
+    expect(report.scenarios.D9.details).toEqual(expect.objectContaining({
+      recoveredPreviousStatuses: expect.arrayContaining([
+        expect.arrayContaining([expect.stringMatching(/^run-/), 'pending']),
+        expect.arrayContaining([expect.stringMatching(/^run-/), 'running']),
+      ]),
+    }));
     expect(process.env.UPLOAD_DIR).toBe(previousUploadDir);
 
     const written = JSON.parse(await fs.readFile(outputPath, 'utf8')) as EnterpriseWindowRegressionReport;
@@ -79,6 +87,6 @@ describe('verifyEnterpriseMultiTenantWindows script', () => {
 
     const traceFiles = (await fs.readdir(path.join(uploadRoot, 'traces')))
       .filter(file => file.endsWith('.trace'));
-    expect(traceFiles).toHaveLength(11);
+    expect(traceFiles).toHaveLength(12);
   });
 });
diff --git a/backend/src/scripts/verifyEnterpriseMultiTenantWindows.ts b/backend/src/scripts/verifyEnterpriseMultiTenantWindows.ts
@@ -20,7 +20,7 @@ import { TraceProcessorLeaseStore } from '../services/traceProcessorLeaseStore';
 import { ownerFieldsFromContext } from '../services/resourceOwnership';
 import type { RequestContext } from '../middleware/auth';
 
-type ScenarioName = 'D1' | 'D2' | 'D4' | 'D5' | 'D6' | 'D7' | 'D8';
+type ScenarioName = 'D1' | 'D2' | 'D4' | 'D5' | 'D6' | 'D7' | 'D8' | 'D9';
 
 interface VerifyOptions {
   tracePath?: string;
@@ -1182,6 +1182,185 @@ async function scenarioD8(
   };
 }
 
+async function scenarioD9(
+  tracePath: string,
+  userAWindow1: WindowContext,
+): Promise<ScenarioReport> {
+  const restartDbPath = path.join(path.dirname(getTracesDir()), `d9-restart-${crypto.randomUUID()}.sqlite`);
+  let restartDb = new Database(restartDbPath);
+  try {
+    applyEnterpriseMinimalSchema(restartDb);
+    const upload = await simulateUpload(restartDb, userAWindow1, tracePath, 'd9-backend-restart.pftrace');
+    const pendingRun = createAnalysisRun(restartDb, userAWindow1, upload, 'pending');
+    const runningRun = createAnalysisRun(restartDb, userAWindow1, upload, 'running');
+    const completedRun = createAnalysisRun(restartDb, userAWindow1, upload, 'completed');
+    const recoveryAt = 1_777_200_100_000;
+    const recoveryErrorJson = JSON.stringify({
+      message: 'backend restart during D9 regression',
+      source: 'backend_startup_recovery',
+    });
+
+    appendAgentEvent(restartDb, userAWindow1, pendingRun.runId, 1, 'progress', {
+      type: 'progress',
+      data: { phase: 'queued-before-restart' },
+    });
+    appendAgentEvent(restartDb, userAWindow1, runningRun.runId, 1, 'progress', {
+      type: 'progress',
+      data: { phase: 'running-before-restart' },
+    });
+    appendAgentEvent(restartDb, userAWindow1, runningRun.runId, 2, 'thought', {
+      type: 'thought',
+      data: { summary: 'partial thought before restart' },
+    });
+    appendAgentEvent(restartDb, userAWindow1, completedRun.runId, 1, 'conclusion', {
+      type: 'conclusion',
+      data: { summary: 'completed before restart' },
+    });
+    appendAgentEvent(restartDb, userAWindow1, completedRun.runId, 2, 'analysis_completed', {
+      type: 'analysis_completed',
+      data: { reportUrl: '/api/reports/d9-terminal-report' },
+    });
+    restartDb.prepare(`
+      UPDATE analysis_runs
+      SET completed_at = ?, updated_at = ?
+      WHERE id = ?
+    `).run(recoveryAt - 1_000, recoveryAt - 1_000, completedRun.runId);
+    restartDb.prepare(`
+      UPDATE analysis_sessions
+      SET status = 'completed', updated_at = ?
+      WHERE id = ?
+    `).run(recoveryAt - 1_000, completedRun.sessionId);
+
+    restartDb.close();
+    restartDb = new Database(restartDbPath);
+    applyEnterpriseMinimalSchema(restartDb);
+
+    const interrupted = restartDb.prepare<unknown[], {
+      id: string;
+      tenant_id: string;
+      workspace_id: string;
+      session_id: string;
+      status: string;
+    }>(`
+      SELECT id, tenant_id, workspace_id, session_id, status
+      FROM analysis_runs
+      WHERE status IN ('pending', 'running', 'awaiting_user')
+      ORDER BY updated_at ASC, started_at ASC, id ASC
+    `).all();
+    const recoverInterrupted = restartDb.transaction(() => {
+      for (const run of interrupted) {
+        restartDb.prepare(`
+          UPDATE analysis_runs
+          SET status = 'failed',
+              heartbeat_at = ?,
+              updated_at = ?,
+              completed_at = COALESCE(completed_at, ?),
+              error_json = COALESCE(error_json, ?)
+          WHERE tenant_id = ?
+            AND workspace_id = ?
+            AND id = ?
+            AND status = ?
+        `).run(
+          recoveryAt,
+          recoveryAt,
+          recoveryAt,
+          recoveryErrorJson,
+          run.tenant_id,
+          run.workspace_id,
+          run.id,
+          run.status,
+        );
+        restartDb.prepare(`
+          UPDATE analysis_sessions
+          SET status = 'failed',
+              updated_at = ?
+          WHERE tenant_id = ?
+            AND workspace_id = ?
+            AND id = ?
+            AND status IN ('pending', 'running', 'awaiting_user')
+        `).run(recoveryAt, run.tenant_id, run.workspace_id, run.session_id);
+      }
+    });
+    recoverInterrupted();
+
+    const lifecycleRows = restartDb.prepare<unknown[], {
+      id: string;
+      status: string;
+      completed_at: number | null;
+      error_json: string | null;
+    }>(`
+      SELECT id, status, completed_at, error_json
+      FROM analysis_runs
+      WHERE id IN (?, ?, ?)
+      ORDER BY id ASC
+    `).all(pendingRun.runId, runningRun.runId, completedRun.runId);
+    const lifecycleById = Object.fromEntries(lifecycleRows.map(row => [row.id, row]));
+    const pendingEvents = listAgentEventsAfter(restartDb, userAWindow1, pendingRun.runId, 0);
+    const runningEvents = listAgentEventsAfter(restartDb, userAWindow1, runningRun.runId, 0);
+    const completedEvents = listAgentEventsAfter(restartDb, userAWindow1, completedRun.runId, 0);
+    const traceMetadata = await readTraceMetadataForContext(upload.traceId, userAWindow1.context);
+    const traceAssetRow = restartDb.prepare<unknown[], { id: string; local_path: string; status: string }>(`
+      SELECT id, local_path, status
+      FROM trace_assets
+      WHERE tenant_id = ? AND workspace_id = ? AND id = ?
+    `).get(userAWindow1.context.tenantId, userAWindow1.context.workspaceId, upload.traceId);
+    const sessionStatuses = restartDb.prepare<unknown[], { id: string; status: string }>(`
+      SELECT id, status
+      FROM analysis_sessions
+      WHERE id IN (?, ?, ?)
+      ORDER BY id ASC
+    `).all(pendingRun.sessionId, runningRun.sessionId, completedRun.sessionId);
+    const sessionStatusById = Object.fromEntries(sessionStatuses.map(row => [row.id, row.status]));
+    const recoveredPreviousStatuses = interrupted.map(run => [run.id, run.status]);
+
+    const checks = {
+      fileBackedDbReopensWithTraceAsset: traceAssetRow?.id === upload.traceId
+        && traceAssetRow.status === 'ready'
+        && fs.existsSync(upload.localPath),
+      traceMetadataReadableAfterRestart: traceMetadata?.id === upload.traceId
+        && traceMetadata.tenantId === userAWindow1.context.tenantId
+        && traceMetadata.workspaceId === userAWindow1.context.workspaceId,
+      startupRecoveryFindsOnlyNonterminalRuns: recoveredPreviousStatuses.length === 2
+        && recoveredPreviousStatuses.some(([id, status]) => id === pendingRun.runId && status === 'pending')
+        && recoveredPreviousStatuses.some(([id, status]) => id === runningRun.runId && status === 'running')
+        && !recoveredPreviousStatuses.some(([id]) => id === completedRun.runId),
+      pendingAndRunningRunsBecomeFailed: lifecycleById[pendingRun.runId]?.status === 'failed'
+        && lifecycleById[runningRun.runId]?.status === 'failed'
+        && lifecycleById[pendingRun.runId]?.completed_at === recoveryAt
+        && lifecycleById[runningRun.runId]?.completed_at === recoveryAt,
+      terminalRunRemainsCompleted: lifecycleById[completedRun.runId]?.status === 'completed'
+        && lifecycleById[completedRun.runId]?.completed_at === recoveryAt - 1_000,
+      persistedEventsReplayAfterRestart: pendingEvents.map(event => event.cursor).join(',') === '1'
+        && runningEvents.map(event => event.cursor).join(',') === '1,2'
+        && completedEvents.map(event => `${event.cursor}:${event.eventType}`).join(',') === '1:conclusion,2:analysis_completed',
+      failedSessionsMarkedWithoutTouchingTerminalSession: sessionStatusById[pendingRun.sessionId] === 'failed'
+        && sessionStatusById[runningRun.sessionId] === 'failed'
+        && sessionStatusById[completedRun.sessionId] === 'completed',
+      recoveryErrorIsAuditable: lifecycleById[pendingRun.runId]?.error_json === recoveryErrorJson
+        && lifecycleById[runningRun.runId]?.error_json === recoveryErrorJson
+        && lifecycleById[completedRun.runId]?.error_json === null,
+    };
+
+    return {
+      checks,
+      details: {
+        traceId: upload.traceId,
+        restartDbPath,
+        pendingRun,
+        runningRun,
+        completedRun,
+        recoveredPreviousStatuses,
+        pendingEvents,
+        runningEvents,
+        completedEvents,
+        sessionStatusById,
+      },
+    };
+  } finally {
+    restartDb.close();
+  }
+}
+
 function allChecksPassed(report: EnterpriseWindowRegressionReport): boolean {
   return Object.values(report.scenarios).every(scenario =>
     Object.values(scenario.checks).every(Boolean),
@@ -1248,6 +1427,7 @@ export async function runEnterpriseWindowRegression(
       D6: await scenarioD6(db, tracePath, windows.userAWindow1, windows.userCWindow1),
       D7: await scenarioD7(db, tracePath, windows.userAWindow1),
       D8: await scenarioD8(db, tracePath, windows.userAWindow1),
+      D9: await scenarioD9(tracePath, windows.userAWindow1),
     };
     const report: EnterpriseWindowRegressionReport = {
       timestamp: new Date().toISOString(),
@@ -1260,6 +1440,7 @@ export async function runEnterpriseWindowRegression(
         D6: scenarioPassed(scenarios.D6),
         D7: scenarioPassed(scenarios.D7),
         D8: scenarioPassed(scenarios.D8),
+        D9: scenarioPassed(scenarios.D9),
       },
       uploadRoot,
       tracePath,
@@ -1272,7 +1453,8 @@ export async function runEnterpriseWindowRegression(
         'D6 covers the persisted AgentEvent replay contract after a conclusion cursor; the live stream route path is covered by agentRoutesRbac tests.',
         'D7 covers running run, active lease, report_generation holder, and draining rejection invariants; actual route blocking is covered by enterpriseTraceMetadataRoutes tests.',
         'D8 covers the DB ProviderSnapshot pin/hash-mismatch invariant in the enterprise window regression; AgentAnalyzeSessionService tests cover actual in-memory and persisted SDK session non-reuse.',
-        'Production backend proxy and queue behavior remain future §0.7 D1/D2/D4/D5/D6/D7/D8 final-acceptance work against a live browser and trace_processor_shell.',
+        'D9 covers file-backed DB close/reopen recovery for trace metadata, run states, and AgentEvent replay; enterpriseRestartPersistence tests cover the route-level restart recovery path.',
+        'Production backend proxy and queue behavior remain future §0.7 D1/D2/D4/D5/D6/D7/D8/D9 final-acceptance work against a live browser and trace_processor_shell.',
       ],
     };
     report.passed = allChecksPassed(report);

diff --git a/docs/features/enterprise-multi-tenant/README.md b/docs/features/enterprise-multi-tenant/README.md
@@ -113,7 +113,7 @@
 - [x] D6 SSE 在 conclusion 后、analysis_completed 前断开 → AgentEvent replay 能补回 reportUrl
 - [x] D7 手动 cleanup / delete → running run / active lease / 正在生成的 report 被 draining 保护
 - [x] D8 Provider 配置在 session 中途变更 → resume 校验 ProviderSnapshot hash，不复用错误 SDK session
-- [ ] D9 后端进程重启 → pending/running/terminal run 状态、events、trace metadata 可恢复或转 failed
+- [x] D9 后端进程重启 → pending/running/terminal run 状态、events、trace metadata 可恢复或转 failed
 - [ ] D10 机器内存接近上限 → admission 拒绝新 lease，不通过 OOM 杀已有窗口
 
 ### 0.8 §19 总验收