From d5c01209eae35f36b2ad9b4e997418e84e8e91f1 Mon Sep 17 00:00:00 2001 From: Danijel Martinek Date: Wed, 13 May 2026 20:13:30 +0200 Subject: [PATCH] feat(work): resume implementer session across same-story slices MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wires sandcastle's native `resumeSession` into the dispatch loop so the implementer walks into task N already knowing what task N-1 discovered — repo layout, helper signatures, gate output, prior diff. No scratchpad / no hand-curated context file; the agent's own Claude Code conversation log is the carrier. Three guardrails keep it bounded: - Story boundary reset. `currentSession` is dropped whenever findNextTask returns a different story id. New domain ≈ new context — keeps story 03 from inheriting story 02's residue. - Token-threshold reset. After each approved slice, sum the implementer's last-iteration usage (inputTokens + cacheCreationInputTokens + cacheReadInputTokens — caching saves dollars but doesn't free window space). If the sum exceeds SANDCASTLE_SESSION_TOKEN_RESET (default 140000 ≈ 70% of Sonnet 4.6's 200k), drop the session before the next task. Configurable via env. - Context-exhausted safety net. If the model rejects with "prompt is too long" / "context_length_exceeded" / similar, the retry loop drops the session and re-runs the attempt fresh, at most once per slice and only when a resumed session was in use. The fresh re-run doesn't count against SANDCASTLE_MAX_ATTEMPTS (different failure mode). Reviewer always runs fresh — each approve/reject decision should be independent of prior tasks to keep the gate honest. Within a single slice's reject-fixup retries, the implementer also carries forward across attempts (so attempt 2 sees attempt 1's reasoning + the reviewer notes), but that's per-slice cumulative, not cross-slice. runOneSlice now also returns { sessionId, usage } with every outcome so executeDispatch can make the carry-or-reset decision per slice. 
--- .env.example | 11 +++ scripts/work/dispatch.mjs | 175 +++++++++++++++++++++++++++++++++++--- 2 files changed, 172 insertions(+), 14 deletions(-) diff --git a/.env.example b/.env.example index 9341a9d..2acf48b 100644 --- a/.env.example +++ b/.env.example @@ -88,3 +88,14 @@ CMS_URL=http://localhost:3001 # notes printed. Bump for tricky slices; lower for fast-feedback iteration. # # SANDCASTLE_MAX_ATTEMPTS=3 + +# Session-resume token threshold. The orchestrator passes the prior +# implementer's session ID into the next slice's run() via sandcastle's +# `resumeSession` — the agent walks into task 2 already knowing where +# helpers live, what the prior diff looked like, which gates passed. +# When the prior iteration's total input tokens (input + cacheRead + +# cacheCreation) crosses this threshold the orchestrator drops the +# session and starts the next task fresh, avoiding mid-slice context +# exhaustion. Default 140000 ≈ 70% of Sonnet 4.6's 200k window. +# +# SANDCASTLE_SESSION_TOKEN_RESET=140000 diff --git a/scripts/work/dispatch.mjs b/scripts/work/dispatch.mjs index d532d91..b10f56b 100644 --- a/scripts/work/dispatch.mjs +++ b/scripts/work/dispatch.mjs @@ -324,14 +324,59 @@ function explainSandcastleError(stage, e) { } /** - * Run one slice end-to-end: implementer + reviewer, with a fix-up cycle on - * reject (capped at maxAttempts). Returns: - * { outcome: "approved", attempts, implJson, reviewJson } - * { outcome: "rejected-final", attempts, lastRejectNotes } - * { outcome: "blocked", attempts, implJson } - * { outcome: "error", attempts, reason } + * Sum of all input-token classes counted against the model's context + * window for one iteration. `cacheReadInputTokens` and + * `cacheCreationInputTokens` are cheap dollar-wise but still occupy the + * window, so they all count for the reset-threshold check. 
*/ -async function runOneSlice({ sandcastleRoot, sandbox, agent, next }) { +function totalInputTokens(usage) { + if (!usage) return 0; + return ( + (usage.inputTokens ?? 0) + + (usage.cacheCreationInputTokens ?? 0) + + (usage.cacheReadInputTokens ?? 0) + ); +} + +/** + * Detect Claude / sandcastle errors that indicate the agent's input + * exceeded the model's context window. The orchestrator handles this by + * dropping the resumed session and retrying once with a fresh session. + */ +function isContextExhaustedError(e) { + const msg = String(e?.message ?? e ?? ""); + return ( + /prompt is too long/i.test(msg) || + /context_length_exceeded/i.test(msg) || + /context window/i.test(msg) || + /too many tokens/i.test(msg) + ); +} + +/** + * Run one slice end-to-end: implementer + reviewer, with a fix-up cycle on + * reject (capped at maxAttempts). The implementer is invoked with + * `resumeSession` so its prior context (file reads, helper signatures, + * gate output) carries forward; the reviewer always runs fresh so each + * approve/reject decision is independent. + * + * Returns: + * { outcome, attempts, sessionId, usage, ... } + * where sessionId / usage are from the implementer's LAST iteration of + * its LAST attempt (used by the caller for the next slice's resume + + * threshold check). Outcome variants: + * "approved" (implJson, reviewJson) + * "rejected-final" (lastRejectNotes) + * "blocked" (implJson) + * "error" (reason) + */ +async function runOneSlice({ + sandcastleRoot, + sandbox, + agent, + next, + resumeSession, +}) { const maxAttempts = Number(process.env.SANDCASTLE_MAX_ATTEMPTS ?? 
3); const implementerPrompt = path.join(SANDCASTLE_DIR, "implementer.prompt.md"); const reviewerPrompt = path.join(SANDCASTLE_DIR, "reviewer.prompt.md"); @@ -339,6 +384,14 @@ async function runOneSlice({ sandcastleRoot, sandbox, agent, next }) { let rejectionNotes = null; let lastRejectNotes = null; let attempts = 0; + // Across retries within this slice, the implementer resumes from the + // most recent session — so attempt N sees attempt N-1's reasoning + the + // reviewer's rejection notes. Caller's resumeSession (from the prior + // slice) seeds the first attempt; null = fresh session. + let currentSession = resumeSession ?? null; + let lastUsage = null; + let contextResetUsedThisSlice = false; + while (attempts < maxAttempts) { attempts++; const taskSpec = buildTaskSpec(next, rejectionNotes); @@ -361,20 +414,55 @@ async function runOneSlice({ sandcastleRoot, sandbox, agent, next }) { // it, sandcastle re-invokes the model up to maxIterations even // when the work is already done. completionSignal: "COMPLETE", + // Resume from the prior slice's (or prior attempt's) session so + // the implementer doesn't re-explore the repo for every task in + // the same story. + resumeSession: currentSession ?? undefined, }); } catch (e) { + // Context-exhaustion safety net: drop the resumed session and retry + // the same attempt fresh, exactly once per slice. + if ( + isContextExhaustedError(e) && + currentSession && + !contextResetUsedThisSlice + ) { + console.log( + "↺ Context window exhausted; dropping resumed session and retrying fresh.", + ); + currentSession = null; + contextResetUsedThisSlice = true; + attempts--; // not counted against SANDCASTLE_MAX_ATTEMPTS + continue; + } explainSandcastleError("Implementer", e); - return { outcome: "error", attempts, reason: e.message }; + return { + outcome: "error", + attempts, + reason: e.message, + sessionId: currentSession, + usage: lastUsage, + }; } console.log( `Implementer returned. 
Branch: ${implResult.branch}, Commits: ${implResult.commits.length}`, ); + const finalIter = implResult.iterations[implResult.iterations.length - 1]; + currentSession = finalIter?.sessionId ?? currentSession; + lastUsage = finalIter?.usage ?? lastUsage; + const implJson = parseAgentJson(implResult.stdout); if ( implJson?.status === "blocked" || implJson?.status === "needs-clarification" ) { - return { outcome: "blocked", attempts, implJson }; + return { + outcome: "blocked", + attempts, + implJson, + sessionId: currentSession, + usage: lastUsage, + }; } let diff = ""; @@ -401,15 +489,30 @@ async function runOneSlice({ sandcastleRoot, sandbox, agent, next }) { maxIterations: Number(process.env.SANDCASTLE_REVIEWER_ITERATIONS ?? 10), // See implementer comment above. completionSignal: "COMPLETE", + // Reviewer always runs fresh — each approve/reject decision should + // be independent of prior tasks to keep the gate honest. }); } catch (e) { explainSandcastleError("Reviewer", e); - return { outcome: "error", attempts, reason: e.message }; + return { + outcome: "error", + attempts, + reason: e.message, + sessionId: currentSession, + usage: lastUsage, + }; } const reviewJson = parseAgentJson(reviewResult.stdout); if (reviewJson?.decision === "approve") { - return { outcome: "approved", attempts, implJson, reviewJson }; + return { + outcome: "approved", + attempts, + implJson, + reviewJson, + sessionId: currentSession, + usage: lastUsage, + }; } if (reviewJson?.decision === "reject") { rejectionNotes = @@ -420,14 +523,21 @@ async function runOneSlice({ sandcastleRoot, sandbox, agent, next }) { ); continue; } - // Reviewer produced unparseable / non-decision output — treat as error. 
return { outcome: "error", attempts, reason: `reviewer returned no parseable decision; stdout:\n${reviewResult.stdout}`, + sessionId: currentSession, + usage: lastUsage, }; } - return { outcome: "rejected-final", attempts, lastRejectNotes }; + return { + outcome: "rejected-final", + attempts, + lastRejectNotes, + sessionId: currentSession, + usage: lastUsage, + }; } /** @@ -531,7 +641,17 @@ async function executeDispatch({ maxTasks }) { const sandbox = dockerProvider(dockerOpts); const agent = sandcastleRoot.claudeCode("claude-sonnet-4-6", agentOpts); + // Session is carried forward across slices WITHIN the same story so the + // implementer doesn't re-explore the repo each task. Reset on story + // boundaries (different repo area, different context) and when the prior + // session's input-token usage crosses the threshold (avoids hitting the + // model's context-window limit mid-slice). + const tokenResetThreshold = Number( + process.env.SANDCASTLE_SESSION_TOKEN_RESET ?? 140000, + ); let approved = 0; + let currentStory = null; + let currentSession = null; while (true) { if (maxTasks !== null && approved >= maxTasks) { console.log(`\nHit --max-tasks=${maxTasks} cap; stopping.`); @@ -542,13 +662,40 @@ async function executeDispatch({ maxTasks }) { console.log("\nNo more ready tasks. 
Dispatch loop complete."); break; } + if (next.story !== currentStory) { + if (currentStory !== null && currentSession) { + console.log( + `\n(Story boundary — resetting implementer session from ${currentStory} → ${next.story})`, + ); + } + currentSession = null; + currentStory = next.story; + } console.log( `\n--- Slice ${approved + 1}: ${next.epic} / ${next.story} ---`, ); console.log(` Bullet: ${next.bulletLine.trim()}`); + if (currentSession) { + console.log(` Resuming session: ${currentSession.slice(0, 12)}…`); + } - const result = await runOneSlice({ sandcastleRoot, sandbox, agent, next }); + const result = await runOneSlice({ + sandcastleRoot, + sandbox, + agent, + next, + resumeSession: currentSession, + }); if (result.outcome === "approved") { + const usedTokens = totalInputTokens(result.usage); + if (usedTokens > tokenResetThreshold) { + console.log( + `(Session at ${usedTokens} input tokens > threshold ${tokenResetThreshold} — resetting before next task)`, + ); + currentSession = null; + } else { + currentSession = result.sessionId ?? null; + } applyApprovedState(next); approved++; continue;