From 9e7723f9a59bafac183a898c4ceacc6c33a46828 Mon Sep 17 00:00:00 2001 From: Danijel Martinek Date: Thu, 14 May 2026 11:48:32 +0200 Subject: [PATCH] fix(scripts): remove broken session-resume from dispatch loop Sandcastle rejects `resumeSession` when `maxIterations > 1` with "Resume applies to iteration 1 only; multi-iteration resume semantics are not supported." Since a TDD slice needs the full 30-iteration budget, the session-resume path we shipped in d5c0120 is dead infrastructure that breaks dispatch mid-run. Rip it out cleanly: - runOneSlice drops the resumeSession param + the context-exhaustion safety net + sessionId/usage return fields - executeDispatch drops the currentStory/currentSession bookkeeping and the token-reset threshold - helpers totalInputTokens + isContextExhaustedError go (only used by the resume path) - SANDCASTLE_SESSION_TOKEN_RESET removed from .env.example Net: -153 lines. Each slice is again an independent sandcastle session; token cost per slice goes up (each implementer re-discovers context) but the multi-iteration TDD shape works. A different cross-slice context-passing mechanism (e.g. a story-level context summary injected into each task spec) is left as future work. --- .env.example | 11 --- scripts/work/dispatch.mjs | 164 +++----------------------------------- 2 files changed, 11 insertions(+), 164 deletions(-) diff --git a/.env.example b/.env.example index 2acf48b..9341a9d 100644 --- a/.env.example +++ b/.env.example @@ -88,14 +88,3 @@ CMS_URL=http://localhost:3001 # notes printed. Bump for tricky slices; lower for fast-feedback iteration. # # SANDCASTLE_MAX_ATTEMPTS=3 - -# Session-resume token threshold. The orchestrator passes the prior -# implementer's session ID into the next slice's run() via sandcastle's -# `resumeSession` — the agent walks into task 2 already knowing where -# helpers live, what the prior diff looked like, which gates passed. -# When the prior iteration's total input tokens (input + cacheRead + -# cacheCreation) crosses this threshold the orchestrator drops the -# session and starts the next task fresh, avoiding mid-slice context -# exhaustion. Default 140000 ≈ 70% of Sonnet 4.6's 200k window. -# -# SANDCASTLE_SESSION_TOKEN_RESET=140000 diff --git a/scripts/work/dispatch.mjs b/scripts/work/dispatch.mjs index b10f56b..d5fe647 100644 --- a/scripts/work/dispatch.mjs +++ b/scripts/work/dispatch.mjs @@ -323,60 +323,19 @@ function explainSandcastleError(stage, e) { console.error(" See docs/guides/runbook.md → 'Using Sandcastle' for setup."); } -/** - * Sum of all input-token classes counted against the model's context - * window for one iteration. `cacheReadInputTokens` and - * `cacheCreationInputTokens` are cheap dollar-wise but still occupy the - * window, so they all count for the reset-threshold check. - */ -function totalInputTokens(usage) { - if (!usage) return 0; - return ( - (usage.inputTokens ?? 0) + - (usage.cacheCreationInputTokens ?? 0) + - (usage.cacheReadInputTokens ?? 0) - ); -} - -/** - * Detect Claude / sandcastle errors that indicate the agent's input - * exceeded the model's context window. The orchestrator handles this by - * dropping the resumed session and retrying once with a fresh session. - */ -function isContextExhaustedError(e) { - const msg = String(e?.message ?? e ?? ""); - return ( - /prompt is too long/i.test(msg) || - /context_length_exceeded/i.test(msg) || - /context window/i.test(msg) || - /too many tokens/i.test(msg) - ); -} - /** * Run one slice end-to-end: implementer + reviewer, with a fix-up cycle on - * reject (capped at maxAttempts). The implementer is invoked with - * `resumeSession` so its prior context (file reads, helper signatures, - * gate output) carries forward; the reviewer always runs fresh so each - * approve/reject decision is independent. + * reject (capped at maxAttempts). Each slice is an independent sandcastle + * session — sandcastle's `resumeSession` is incompatible with the + * multi-iteration budgets a TDD slice requires (applies to iteration 1 only). * - * Returns: - * { outcome, attempts, sessionId, usage, ... } - * where sessionId / usage are from the implementer's LAST iteration of - * its LAST attempt (used by the caller for the next slice's resume + - * threshold check). Outcome variants: + * Outcome variants: * "approved" (implJson, reviewJson) * "rejected-final" (lastRejectNotes) * "blocked" (implJson) * "error" (reason) */ -async function runOneSlice({ - sandcastleRoot, - sandbox, - agent, - next, - resumeSession, -}) { +async function runOneSlice({ sandcastleRoot, sandbox, agent, next }) { const maxAttempts = Number(process.env.SANDCASTLE_MAX_ATTEMPTS ?? 3); const implementerPrompt = path.join(SANDCASTLE_DIR, "implementer.prompt.md"); const reviewerPrompt = path.join(SANDCASTLE_DIR, "reviewer.prompt.md"); @@ -384,13 +343,6 @@ async function runOneSlice({ let rejectionNotes = null; let lastRejectNotes = null; let attempts = 0; - // Across retries within this slice, the implementer resumes from the - // most recent session — so attempt N sees attempt N-1's reasoning + the - // reviewer's rejection notes. Caller's resumeSession (from the prior - // slice) seeds the first attempt; null = fresh session. - let currentSession = resumeSession ?? null; - let lastUsage = null; - let contextResetUsedThisSlice = false; while (attempts < maxAttempts) { attempts++; @@ -414,55 +366,21 @@ async function runOneSlice({ // it, sandcastle re-invokes the model up to maxIterations even // when the work is already done. completionSignal: "COMPLETE", - // Resume from the prior slice's (or prior attempt's) session so - // the implementer doesn't re-explore the repo for every task in - // the same story. - resumeSession: currentSession ?? undefined, }); } catch (e) { - // Context-exhaustion safety net: drop the resumed session and retry - // the same attempt fresh, exactly once per slice. - if ( - isContextExhaustedError(e) && - currentSession && - !contextResetUsedThisSlice - ) { - console.log( - "↺ Context window exhausted; dropping resumed session and retrying fresh.", - ); - currentSession = null; - contextResetUsedThisSlice = true; - attempts--; // not counted against SANDCASTLE_MAX_ATTEMPTS - continue; - } explainSandcastleError("Implementer", e); - return { - outcome: "error", - attempts, - reason: e.message, - sessionId: currentSession, - usage: lastUsage, - }; + return { outcome: "error", attempts, reason: e.message }; } console.log( `Implementer returned. Branch: ${implResult.branch}, Commits: ${implResult.commits.length}`, ); - const finalIter = implResult.iterations[implResult.iterations.length - 1]; - currentSession = finalIter?.sessionId ?? currentSession; - lastUsage = finalIter?.usage ?? lastUsage; const implJson = parseAgentJson(implResult.stdout); if ( implJson?.status === "blocked" || implJson?.status === "needs-clarification" ) { - return { - outcome: "blocked", - attempts, - implJson, - sessionId: currentSession, - usage: lastUsage, - }; + return { outcome: "blocked", attempts, implJson }; } let diff = ""; @@ -489,30 +407,15 @@ async function runOneSlice({ maxIterations: Number(process.env.SANDCASTLE_REVIEWER_ITERATIONS ?? 10), // See implementer comment above. completionSignal: "COMPLETE", - // Reviewer always runs fresh — each approve/reject decision should - // be independent of prior tasks to keep the gate honest. }); } catch (e) { explainSandcastleError("Reviewer", e); - return { - outcome: "error", - attempts, - reason: e.message, - sessionId: currentSession, - usage: lastUsage, - }; + return { outcome: "error", attempts, reason: e.message }; } const reviewJson = parseAgentJson(reviewResult.stdout); if (reviewJson?.decision === "approve") { - return { - outcome: "approved", - attempts, - implJson, - reviewJson, - sessionId: currentSession, - usage: lastUsage, - }; + return { outcome: "approved", attempts, implJson, reviewJson }; } if (reviewJson?.decision === "reject") { rejectionNotes = @@ -527,17 +430,9 @@ async function runOneSlice({ outcome: "error", attempts, reason: `reviewer returned no parseable decision; stdout:\n${reviewResult.stdout}`, - sessionId: currentSession, - usage: lastUsage, }; } - return { - outcome: "rejected-final", - attempts, - lastRejectNotes, - sessionId: currentSession, - usage: lastUsage, - }; + return { outcome: "rejected-final", attempts, lastRejectNotes }; } /** @@ -641,17 +536,7 @@ async function executeDispatch({ maxTasks }) { const sandbox = dockerProvider(dockerOpts); const agent = sandcastleRoot.claudeCode("claude-sonnet-4-6", agentOpts); - // Session is carried forward across slices WITHIN the same story so the - // implementer doesn't re-explore the repo each task. Reset on story - // boundaries (different repo area, different context) and when the prior - // session's input-token usage crosses the threshold (avoids hitting the - // model's context-window limit mid-slice). - const tokenResetThreshold = Number( - process.env.SANDCASTLE_SESSION_TOKEN_RESET ?? 140000, - ); let approved = 0; - let currentStory = null; - let currentSession = null; while (true) { if (maxTasks !== null && approved >= maxTasks) { console.log(`\nHit --max-tasks=${maxTasks} cap; stopping.`); @@ -662,40 +547,13 @@ async function executeDispatch({ maxTasks }) { console.log("\nNo more ready tasks. Dispatch loop complete."); break; } - if (next.story !== currentStory) { - if (currentStory !== null && currentSession) { - console.log( - `\n(Story boundary — resetting implementer session from ${currentStory} → ${next.story})`, - ); - } - currentSession = null; - currentStory = next.story; - } console.log( `\n--- Slice ${approved + 1}: ${next.epic} / ${next.story} ---`, ); console.log(` Bullet: ${next.bulletLine.trim()}`); - if (currentSession) { - console.log(` Resuming session: ${currentSession.slice(0, 12)}…`); - } - const result = await runOneSlice({ - sandcastleRoot, - sandbox, - agent, - next, - resumeSession: currentSession, - }); + const result = await runOneSlice({ sandcastleRoot, sandbox, agent, next }); if (result.outcome === "approved") { - const usedTokens = totalInputTokens(result.usage); - if (usedTokens > tokenResetThreshold) { - console.log( - `(Session at ${usedTokens} input tokens > threshold ${tokenResetThreshold} — resetting before next task)`, - ); - currentSession = null; - } else { - currentSession = result.sessionId ?? null; - } applyApprovedState(next); approved++; continue;