feat: enhance workflow-tune skill with test requirements and analysis improvements

2026-03-21 19:08:17 +08:00 · 2026-03-20 15:11:32 +08:00
parent d843112094
commit 9c49a32cd9
3 changed files with 102 additions and 27 deletions
--- a/.claude/skills/workflow-tune/phases/03-step-analyze.md
+++ b/.claude/skills/workflow-tune/phases/03-step-analyze.md
@@ -103,7 +103,30 @@ const depthInstructions = {
  deep: 'Provide exhaustive assessment. Cover: execution quality, output completeness and correctness, artifact quality and structure, step-to-step handoff integrity, error handling, performance signals, architecture implications, edge cases.'
 };

-const analysisPrompt = `PURPOSE: Analyze the output of workflow step "${step.name}" (step ${stepIdx + 1}/${state.steps.length}) to assess quality, identify issues, and evaluate handoff readiness for the next step.
+// ★ Build test requirements section (the evaluation baseline)
+const testReqs = step.test_requirements;
+let testReqSection = '';
+if (testReqs) {
+  testReqSection = `
+TEST REQUIREMENTS (Acceptance Criteria — use these as the PRIMARY evaluation baseline):
+  Pass Criteria: ${testReqs.pass_criteria}
+  Expected Outputs: ${(testReqs.expected_outputs || []).join(', ') || 'not specified'}
+  Content Signals (patterns that indicate success): ${(testReqs.content_signals || []).join(', ') || 'not specified'}
+  Quality Thresholds: ${(testReqs.quality_thresholds || []).join(', ') || 'not specified'}
+  Fail Signals (patterns that indicate failure): ${(testReqs.fail_signals || []).join(', ') || 'not specified'}
+  Handoff Contract (what next step needs): ${testReqs.handoff_contract || 'not specified'}
+
+IMPORTANT: Score quality_score based on how well the actual output matches these test requirements.
+  - 90-100: All pass_criteria met, all expected_outputs present, content_signals found, no fail_signals
+  - 70-89: Most criteria met, minor gaps
+  - 50-69: Partial match, significant gaps
+  - 0-49: Fail — fail_signals present or pass_criteria not met`;
+} else {
+  testReqSection = `
+NOTE: No pre-generated test requirements for this step. Evaluate based on general quality signals and workflow context.`;
+}
+
+const analysisPrompt = `PURPOSE: Evaluate workflow step "${step.name}" (step ${stepIdx + 1}/${state.steps.length}) against its acceptance criteria. Judge whether the command execution met the pre-defined test requirements.

 WORKFLOW CONTEXT:
 Name: ${state.workflow_name}
@@ -116,6 +139,7 @@ Name: ${step.name}
 Type: ${step.type}
 Command: ${step.command}
 ${step.success_criteria ? `Success Criteria: ${step.success_criteria}` : ''}
+${testReqSection}

 EXECUTION RESULT:
 ${execSummary}
@@ -129,15 +153,25 @@ ANALYSIS DEPTH: ${state.analysis_depth}
 ${depthInstructions[state.analysis_depth]}

 TASK:
-1. Assess step execution quality (did it succeed? complete output?)
-2. Evaluate artifact quality (content correctness, completeness, format)
-3. Check handoff readiness (can the next step consume this output?)
-4. Identify issues, risks, or optimization opportunities
-5. Rate overall step quality 0-100
+1. **Requirement Matching**: Compare actual output against test requirements (pass_criteria, expected_outputs, content_signals)
+2. **Fail Signal Detection**: Check for any fail_signals in the output
+3. **Handoff Contract Verification**: Does the output satisfy handoff_contract for the next step?
+4. **Gap Analysis**: What's missing between actual output and requirements?
+5. **Quality Score**: Rate 0-100 based on requirement fulfillment (NOT general quality)

 EXPECTED OUTPUT (strict JSON, no markdown):
 {
  "quality_score": <0-100>,
+  "requirement_match": {
+    "pass": <true|false>,
+    "criteria_met": ["<which pass_criteria were satisfied>"],
+    "criteria_missed": ["<which pass_criteria were NOT satisfied>"],
+    "expected_outputs_found": ["<expected files that exist>"],
+    "expected_outputs_missing": ["<expected files that are absent>"],
+    "content_signals_found": ["<success patterns detected in output>"],
+    "content_signals_missing": ["<success patterns NOT found>"],
+    "fail_signals_detected": ["<failure patterns found, if any>"]
+  },
  "execution_assessment": {
    "success": <true|false>,
    "completeness": "<complete|partial|failed>",
@@ -151,6 +185,7 @@ EXPECTED OUTPUT (strict JSON, no markdown):
  },
  "handoff_assessment": {
    "ready": <true|false>,
+    "contract_satisfied": <true|false|null>,
    "next_step_compatible": <true|false|null>,
    "handoff_notes": "<what next step should know>"
  },
@@ -163,7 +198,7 @@ EXPECTED OUTPUT (strict JSON, no markdown):
  "step_summary": "<1-2 sentence summary for process log>"
 }

-CONSTRAINTS: Be specific, reference artifact content where possible, output ONLY JSON`;
+CONSTRAINTS: Be specific, reference artifact content where possible, score against requirements not general quality, output ONLY JSON`;
 ```

 ### Step 3.4: Execute via ccw cli Gemini with Resume
@@ -231,11 +266,36 @@ if (jsonMatch) {
 }

 // Write step analysis file
+const reqMatch = analysis.requirement_match;
+const reqMatchSection = reqMatch ? `
+## Requirement Match — ${reqMatch.pass ? 'PASS ✓' : 'FAIL ✗'}
+
+### Criteria Met
+${(reqMatch.criteria_met || []).map(c => `- ✓ ${c}`).join('\n') || '- None'}
+
+### Criteria Missed
+${(reqMatch.criteria_missed || []).map(c => `- ✗ ${c}`).join('\n') || '- None'}
+
+### Expected Outputs
+- Found: ${(reqMatch.expected_outputs_found || []).join(', ') || 'None'}
+- Missing: ${(reqMatch.expected_outputs_missing || []).join(', ') || 'None'}
+
+### Content Signals
+- Detected: ${(reqMatch.content_signals_found || []).join(', ') || 'None'}
+- Missing: ${(reqMatch.content_signals_missing || []).join(', ') || 'None'}
+
+### Fail Signals
+${(reqMatch.fail_signals_detected || []).length > 0
+  ? (reqMatch.fail_signals_detected || []).map(f => `- ⚠ ${f}`).join('\n')
+  : '- None detected'}
+` : '';
+
 const stepAnalysisReport = `# Step ${stepIdx + 1} Analysis: ${step.name}

 **Quality Score**: ${analysis.quality_score}/100
+**Requirement Match**: ${reqMatch ? (reqMatch.pass ? 'PASS' : 'FAIL') : 'N/A (no test requirements)'}
 **Date**: ${new Date().toISOString()}
-
+${reqMatchSection}
 ## Execution
 - Success: ${analysis.execution_assessment?.success}
 - Completeness: ${analysis.execution_assessment?.completeness}
@@ -249,6 +309,7 @@ const stepAnalysisReport = `# Step ${stepIdx + 1} Analysis: ${step.name}

 ## Handoff Readiness
 - Ready: ${analysis.handoff_assessment?.ready}
+- Contract Satisfied: ${analysis.handoff_assessment?.contract_satisfied}
 - Next Step Compatible: ${analysis.handoff_assessment?.next_step_compatible}
 - Notes: ${analysis.handoff_assessment?.handoff_notes}

@@ -262,14 +323,16 @@ ${(analysis.optimization_opportunities || []).map(o => `- [${o.impact}] ${o.area
 Write(`${stepDir}/step-${stepIdx + 1}-analysis.md`, stepAnalysisReport);

 // Append to process log
+const reqPassStr = reqMatch ? (reqMatch.pass ? 'PASS' : 'FAIL') : 'N/A';
 const processLogEntry = `
-## Step ${stepIdx + 1}: ${step.name} — Score: ${analysis.quality_score}/100
+## Step ${stepIdx + 1}: ${step.name} — Score: ${analysis.quality_score}/100 | Req: ${reqPassStr}

 **Command**: \`${step.command}\`
 **Result**: ${analysis.execution_assessment?.completeness || 'unknown'} | ${analysis.artifact_assessment?.count || 0} artifacts
+**Requirement Match**: ${reqPassStr}${reqMatch ? ` — Met: ${(reqMatch.criteria_met || []).length}, Missed: ${(reqMatch.criteria_missed || []).length}, Fail Signals: ${(reqMatch.fail_signals_detected || []).length}` : ''}
 **Summary**: ${analysis.step_summary || 'No summary'}
 **Issues**: ${(analysis.issues || []).filter(i => i.severity === 'high').map(i => i.description).join('; ') || 'None critical'}
-**Handoff**: ${analysis.handoff_assessment?.handoff_notes || 'Ready'}
+**Handoff**: ${analysis.handoff_assessment?.contract_satisfied ? 'Contract satisfied' : analysis.handoff_assessment?.handoff_notes || 'Ready'}

 ---
 `;
@@ -281,8 +344,13 @@ Write(`${state.work_dir}/process-log.md`, currentLog + processLogEntry);
 // Update state
 state.steps[stepIdx].analysis = {
  quality_score: analysis.quality_score,
+  requirement_pass: reqMatch?.pass ?? null,
+  criteria_met_count: (reqMatch?.criteria_met || []).length,
+  criteria_missed_count: (reqMatch?.criteria_missed || []).length,
+  fail_signals_count: (reqMatch?.fail_signals_detected || []).length,
  key_outputs: analysis.artifact_assessment?.key_outputs || [],
  handoff_notes: analysis.handoff_assessment?.handoff_notes || '',
+  contract_satisfied: analysis.handoff_assessment?.contract_satisfied ?? null,
  issue_count: (analysis.issues || []).length,
  high_issues: (analysis.issues || []).filter(i => i.severity === 'high').length,
  optimization_count: (analysis.optimization_opportunities || []).length,