From d5c01209eae35f36b2ad9b4e997418e84e8e91f1 Mon Sep 17 00:00:00 2001
From: Danijel Martinek <danijel@fraqtal.xyz>
Date: Wed, 13 May 2026 20:13:30 +0200
Subject: [PATCH] feat(work): resume implementer session across same-story
 slices
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Wires sandcastle's native `resumeSession` into the dispatch loop so
the implementer walks into task N already knowing what task N-1
discovered — repo layout, helper signatures, gate output, prior diff.
No scratchpad / no hand-curated context file; the agent's own Claude
Code conversation log is the carrier.

Three guardrails keep it bounded:

- Story boundary reset. `currentSession` is dropped whenever
  findNextTask returns a different story id. New domain ≈ new
  context — keeps story 03 from inheriting story 02's residue.
- Token-threshold reset. After each approved slice, sum the
  implementer's last-iteration usage (inputTokens +
  cacheCreationInputTokens + cacheReadInputTokens — caching saves
  dollars but doesn't free window space). If above
  SANDCASTLE_SESSION_TOKEN_RESET (default 140000 ≈ 70% of Sonnet
  4.6's 200k), drop the session before the next task. Configurable
  via env.
- Context-exhausted safety net. If the model rejects a resumed
  session with "prompt is too long" / "context_length_exceeded" /
  similar, the retry loop drops the session and re-runs the same
  attempt fresh, at most once per slice. That re-run doesn't count
  against SANDCASTLE_MAX_ATTEMPTS (different failure mode).

Reviewer always runs fresh — each approve/reject decision should be
independent of prior tasks to keep the gate honest. Within a single
slice's reject-fixup retries, the implementer also carries forward
across attempts (so attempt 2 sees attempt 1's reasoning + the
reviewer notes), but that's per-slice cumulative, not cross-slice.

runOneSlice now also returns sessionId and usage with every outcome
so executeDispatch can make the carry-or-reset decision per slice.
---
 .env.example              |  11 +++
 scripts/work/dispatch.mjs | 175 +++++++++++++++++++++++++++++++++++---
 2 files changed, 172 insertions(+), 14 deletions(-)

diff --git a/.env.example b/.env.example
index 9341a9d..2acf48b 100644
--- a/.env.example
+++ b/.env.example
@@ -88,3 +88,14 @@ CMS_URL=http://localhost:3001
 # notes printed. Bump for tricky slices; lower for fast-feedback iteration.
 #
 # SANDCASTLE_MAX_ATTEMPTS=3
+
+# Session-resume token threshold. The orchestrator passes the prior
+# implementer's session ID into the next slice's run() via sandcastle's
+# `resumeSession` — the agent walks into task 2 already knowing where
+# helpers live, what the prior diff looked like, which gates passed.
+# When the prior iteration's total input tokens (input + cacheRead +
+# cacheCreation) exceed this threshold, the orchestrator drops the
+# session and starts the next task fresh, avoiding mid-slice context
+# exhaustion. Default 140000 ≈ 70% of Sonnet 4.6's 200k window.
+#
+# SANDCASTLE_SESSION_TOKEN_RESET=140000
diff --git a/scripts/work/dispatch.mjs b/scripts/work/dispatch.mjs
index d532d91..b10f56b 100644
--- a/scripts/work/dispatch.mjs
+++ b/scripts/work/dispatch.mjs
@@ -324,14 +324,59 @@ function explainSandcastleError(stage, e) {
 }
 
 /**
- * Run one slice end-to-end: implementer + reviewer, with a fix-up cycle on
- * reject (capped at maxAttempts). Returns:
- *   { outcome: "approved",        attempts, implJson, reviewJson }
- *   { outcome: "rejected-final",  attempts, lastRejectNotes }
- *   { outcome: "blocked",         attempts, implJson }
- *   { outcome: "error",           attempts, reason }
+ * Sum of all input-token classes counted against the model's context
+ * window for one iteration. `cacheReadInputTokens` and
+ * `cacheCreationInputTokens` are cheap dollar-wise but still occupy the
+ * window, so they all count for the reset-threshold check.
  */
-async function runOneSlice({ sandcastleRoot, sandbox, agent, next }) {
+function totalInputTokens(usage) {
+  if (!usage) return 0; // missing usage (null/undefined) counts as zero tokens
+  return (
+    (usage.inputTokens ?? 0) + // each absent field defaults to 0
+    (usage.cacheCreationInputTokens ?? 0) +
+    (usage.cacheReadInputTokens ?? 0)
+  );
+}
+
+/**
+ * Detect Claude / sandcastle errors that indicate the agent's input
+ * exceeded the model's context window. The orchestrator handles this by
+ * dropping the resumed session and retrying once with a fresh session.
+ */
+function isContextExhaustedError(e) {
+  const msg = String(e?.message ?? e ?? ""); // e.message, else e itself, else ""
+  return ( // every pattern is case-insensitive and may match anywhere in msg
+    /prompt is too long/i.test(msg) ||
+    /context_length_exceeded/i.test(msg) ||
+    /context window/i.test(msg) ||
+    /too many tokens/i.test(msg)
+  );
+}
+
+/**
+ * Run one slice end-to-end: implementer + reviewer, with a fix-up cycle on
+ * reject (capped at maxAttempts). The implementer is invoked with
+ * `resumeSession` so its prior context (file reads, helper signatures,
+ * gate output) carries forward; the reviewer always runs fresh so each
+ * approve/reject decision is independent.
+ *
+ * Returns:
+ *   { outcome, attempts, sessionId, usage, ... }
+ * where sessionId / usage are from the implementer's LAST iteration of
+ * its LAST attempt (used by the caller for the next slice's resume +
+ * threshold check). Outcome variants:
+ *   "approved"       (implJson, reviewJson)
+ *   "rejected-final" (lastRejectNotes)
+ *   "blocked"        (implJson)
+ *   "error"          (reason)
+ */
+async function runOneSlice({
+  sandcastleRoot,
+  sandbox,
+  agent,
+  next,
+  resumeSession,
+}) {
   const maxAttempts = Number(process.env.SANDCASTLE_MAX_ATTEMPTS ?? 3);
   const implementerPrompt = path.join(SANDCASTLE_DIR, "implementer.prompt.md");
   const reviewerPrompt = path.join(SANDCASTLE_DIR, "reviewer.prompt.md");
@@ -339,6 +384,14 @@ async function runOneSlice({ sandcastleRoot, sandbox, agent, next }) {
   let rejectionNotes = null;
   let lastRejectNotes = null;
   let attempts = 0;
+  // Across retries within this slice, the implementer resumes from the
+  // most recent session — so attempt N sees attempt N-1's reasoning + the
+  // reviewer's rejection notes. Caller's resumeSession (from the prior
+  // slice) seeds the first attempt; null = fresh session.
+  let currentSession = resumeSession ?? null;
+  let lastUsage = null;
+  let contextResetUsedThisSlice = false;
+
   while (attempts < maxAttempts) {
     attempts++;
     const taskSpec = buildTaskSpec(next, rejectionNotes);
@@ -361,20 +414,55 @@ async function runOneSlice({ sandcastleRoot, sandbox, agent, next }) {
         // it, sandcastle re-invokes the model up to maxIterations even
         // when the work is already done.
         completionSignal: "<promise>COMPLETE</promise>",
+        // Resume from the prior slice's (or prior attempt's) session so
+        // the implementer doesn't re-explore the repo for every task in
+        // the same story.
+        resumeSession: currentSession ?? undefined,
       });
     } catch (e) {
+      // Context-exhaustion safety net: drop the resumed session and retry
+      // the same attempt fresh, exactly once per slice.
+      if (
+        isContextExhaustedError(e) &&
+        currentSession &&
+        !contextResetUsedThisSlice
+      ) {
+        console.log(
+          "↺ Context window exhausted; dropping resumed session and retrying fresh.",
+        );
+        currentSession = null;
+        contextResetUsedThisSlice = true;
+        attempts--; // not counted against SANDCASTLE_MAX_ATTEMPTS
+        continue;
+      }
       explainSandcastleError("Implementer", e);
-      return { outcome: "error", attempts, reason: e.message };
+      return {
+        outcome: "error",
+        attempts,
+        reason: e.message,
+        sessionId: currentSession,
+        usage: lastUsage,
+      };
     }
     console.log(
       `Implementer returned. Branch: ${implResult.branch}, Commits: ${implResult.commits.length}`,
     );
+    const finalIter = implResult.iterations[implResult.iterations.length - 1];
+    currentSession = finalIter?.sessionId ?? currentSession;
+    lastUsage = finalIter?.usage ?? lastUsage;
+
     const implJson = parseAgentJson(implResult.stdout);
     if (
       implJson?.status === "blocked" ||
       implJson?.status === "needs-clarification"
     ) {
-      return { outcome: "blocked", attempts, implJson };
+      return {
+        outcome: "blocked",
+        attempts,
+        implJson,
+        sessionId: currentSession,
+        usage: lastUsage,
+      };
     }
 
     let diff = "";
@@ -401,15 +489,30 @@ async function runOneSlice({ sandcastleRoot, sandbox, agent, next }) {
         maxIterations: Number(process.env.SANDCASTLE_REVIEWER_ITERATIONS ?? 10),
         // See implementer comment above.
         completionSignal: "<promise>COMPLETE</promise>",
+        // Reviewer always runs fresh — each approve/reject decision should
+        // be independent of prior tasks to keep the gate honest.
       });
     } catch (e) {
       explainSandcastleError("Reviewer", e);
-      return { outcome: "error", attempts, reason: e.message };
+      return {
+        outcome: "error",
+        attempts,
+        reason: e.message,
+        sessionId: currentSession,
+        usage: lastUsage,
+      };
     }
     const reviewJson = parseAgentJson(reviewResult.stdout);
 
     if (reviewJson?.decision === "approve") {
-      return { outcome: "approved", attempts, implJson, reviewJson };
+      return {
+        outcome: "approved",
+        attempts,
+        implJson,
+        reviewJson,
+        sessionId: currentSession,
+        usage: lastUsage,
+      };
     }
     if (reviewJson?.decision === "reject") {
       rejectionNotes =
@@ -420,14 +523,21 @@ async function runOneSlice({ sandcastleRoot, sandbox, agent, next }) {
       );
       continue;
     }
-    // Reviewer produced unparseable / non-decision output — treat as error.
     return {
       outcome: "error",
       attempts,
       reason: `reviewer returned no parseable decision; stdout:\n${reviewResult.stdout}`,
+      sessionId: currentSession,
+      usage: lastUsage,
     };
   }
-  return { outcome: "rejected-final", attempts, lastRejectNotes };
+  return {
+    outcome: "rejected-final",
+    attempts,
+    lastRejectNotes,
+    sessionId: currentSession,
+    usage: lastUsage,
+  };
 }
 
 /**
@@ -531,7 +641,17 @@ async function executeDispatch({ maxTasks }) {
   const sandbox = dockerProvider(dockerOpts);
   const agent = sandcastleRoot.claudeCode("claude-sonnet-4-6", agentOpts);
 
+  // Session is carried forward across slices WITHIN the same story so the
+  // implementer doesn't re-explore the repo each task. Reset on story
+  // boundaries (different repo area, different context) and when the prior
+  // session's input-token usage crosses the threshold (avoids hitting the
+  // model's context-window limit mid-slice).
+  const tokenResetThreshold = Number(
+    process.env.SANDCASTLE_SESSION_TOKEN_RESET ?? 140000,
+  );
   let approved = 0;
+  let currentStory = null;
+  let currentSession = null;
   while (true) {
     if (maxTasks !== null && approved >= maxTasks) {
       console.log(`\nHit --max-tasks=${maxTasks} cap; stopping.`);
@@ -542,13 +662,40 @@ async function executeDispatch({ maxTasks }) {
       console.log("\nNo more ready tasks. Dispatch loop complete.");
       break;
     }
+    if (next.story !== currentStory) {
+      if (currentStory !== null && currentSession) {
+        console.log(
+          `\n(Story boundary — resetting implementer session from ${currentStory} → ${next.story})`,
+        );
+      }
+      currentSession = null;
+      currentStory = next.story;
+    }
     console.log(
       `\n--- Slice ${approved + 1}: ${next.epic} / ${next.story} ---`,
     );
     console.log(`    Bullet: ${next.bulletLine.trim()}`);
+    if (currentSession) {
+      console.log(`    Resuming session: ${currentSession.slice(0, 12)}…`);
+    }
 
-    const result = await runOneSlice({ sandcastleRoot, sandbox, agent, next });
+    const result = await runOneSlice({
+      sandcastleRoot,
+      sandbox,
+      agent,
+      next,
+      resumeSession: currentSession,
+    });
     if (result.outcome === "approved") {
+      const usedTokens = totalInputTokens(result.usage);
+      if (usedTokens > tokenResetThreshold) {
+        console.log(
+          `(Session at ${usedTokens} input tokens > threshold ${tokenResetThreshold} — resetting before next task)`,
+        );
+        currentSession = null;
+      } else {
+        currentSession = result.sessionId ?? null;
+      }
       applyApprovedState(next);
       approved++;
       continue;