From 9c49a32cd9db21667ade778e96d5823f6f80494f Mon Sep 17 00:00:00 2001 From: catlog22 Date: Fri, 20 Mar 2026 15:11:32 +0800 Subject: [PATCH] feat: enhance workflow-tune skill with test requirements and analysis improvements --- .claude/skills/workflow-tune/SKILL.md | 27 +++--- .../workflow-tune/phases/03-step-analyze.md | 88 ++++++++++++++++--- .../phases/05-optimize-report.md | 14 +-- 3 files changed, 102 insertions(+), 27 deletions(-) diff --git a/.claude/skills/workflow-tune/SKILL.md b/.claude/skills/workflow-tune/SKILL.md index 0d2f27c1..f08d550a 100644 --- a/.claude/skills/workflow-tune/SKILL.md +++ b/.claude/skills/workflow-tune/SKILL.md @@ -35,12 +35,13 @@ Phase 1 Detail: ## Key Design Principles -1. **Step-by-Step Execution**: Each workflow step executes independently, artifacts inspected before proceeding -2. **Resume-Based Analysis**: Uses ccw cli `--resume` to maintain analysis context across steps -3. **Process Documentation**: Running `process-log.md` accumulates observations per step -4. **Two-Tool Pipeline**: Claude/target tool (execute) + Gemini (analyze) = complementary perspectives -5. **Pure Orchestrator**: SKILL.md coordinates only — execution detail in phase files -6. **Progressive Phase Loading**: Phase docs read only when that phase executes +1. **Test-First Evaluation**: Auto-generate per-step acceptance criteria before execution — judge results against concrete requirements, not vague quality +2. **Step-by-Step Execution**: Each workflow step executes independently, artifacts inspected before proceeding +3. **Resume-Based Analysis**: Uses ccw cli `--resume` to maintain analysis context across steps +4. **Process Documentation**: Running `process-log.md` accumulates observations per step +5. **Two-Tool Pipeline**: Claude/target tool (execute) + Gemini (analyze) = complementary perspectives +6. **Pure Orchestrator**: SKILL.md coordinates only — execution detail in phase files +7. 
**Progressive Phase Loading**: Phase docs read only when that phase executes ## Interactive Preference Collection @@ -280,11 +281,12 @@ Read and execute: `Ref: phases/01-setup.md` - Parse workflow steps from input (Format 1-3: direct parse, Format 4: semantic decomposition) - Generate Command Document (formatted execution plan) - **User Confirmation**: Display plan, wait for confirm/edit/cancel +- **Generate Test Requirements**: Auto-create per-step acceptance criteria via Gemini (expected outputs, content signals, quality thresholds, pass/fail criteria, handoff contracts) - Create workspace at `.workflow/.scratchpad/workflow-tune-{ts}/` -- Initialize workflow-state.json +- Initialize workflow-state.json (with test_requirements per step) - Create process-log.md template -Output: `workDir`, `steps[]`, `workflowContext`, `commandDoc`, initialized state +Output: `workDir`, `steps[]` (with test_requirements), `workflowContext`, `commandDoc`, initialized state ### Step Loop (Phase 2 + Phase 3, per step) @@ -331,9 +333,10 @@ Read and execute: `Ref: phases/02-step-execute.md` Read and execute: `Ref: phases/03-step-analyze.md` - Inspect step artifacts (file list, content summary, quality signals) -- Build analysis prompt with step context + previous steps' process log +- **Compare actual output against test requirements** (pass_criteria, content_signals, fail_signals, handoff_contract) +- Build analysis prompt with step context + test requirements + previous process log - Execute: `ccw cli --tool gemini --mode analysis [--resume sessionId]` -- Parse analysis → write step-{N}-analysis.md +- Parse analysis → write step-{N}-analysis.md (with requirement match: PASS/FAIL) - Append findings to process-log.md - Return analysis session ID for resume chain @@ -380,7 +383,9 @@ Phase 1: Setup ↓ User Confirmation (Execute / Edit / Cancel) ↓ (Execute confirmed) - ↓ workDir, steps[], workflow-state.json, process-log.md + ↓ + Generate Test Requirements (Gemini) → per-step acceptance 
criteria + ↓ workDir, steps[] (with test_requirements), workflow-state.json, process-log.md ↓ ┌─→ Phase 2: Execute Step N (ccw cli / Skill) │ ↓ step-N/ artifacts diff --git a/.claude/skills/workflow-tune/phases/03-step-analyze.md b/.claude/skills/workflow-tune/phases/03-step-analyze.md index 48c65da2..1cf67e0c 100644 --- a/.claude/skills/workflow-tune/phases/03-step-analyze.md +++ b/.claude/skills/workflow-tune/phases/03-step-analyze.md @@ -103,7 +103,30 @@ const depthInstructions = { deep: 'Provide exhaustive assessment. Cover: execution quality, output completeness and correctness, artifact quality and structure, step-to-step handoff integrity, error handling, performance signals, architecture implications, edge cases.' }; -const analysisPrompt = `PURPOSE: Analyze the output of workflow step "${step.name}" (step ${stepIdx + 1}/${state.steps.length}) to assess quality, identify issues, and evaluate handoff readiness for the next step. +// ★ Build test requirements section (the evaluation baseline) +const testReqs = step.test_requirements; +let testReqSection = ''; +if (testReqs) { + testReqSection = ` +TEST REQUIREMENTS (Acceptance Criteria — use these as the PRIMARY evaluation baseline): + Pass Criteria: ${testReqs.pass_criteria} + Expected Outputs: ${(testReqs.expected_outputs || []).join(', ') || 'not specified'} + Content Signals (patterns that indicate success): ${(testReqs.content_signals || []).join(', ') || 'not specified'} + Quality Thresholds: ${(testReqs.quality_thresholds || []).join(', ') || 'not specified'} + Fail Signals (patterns that indicate failure): ${(testReqs.fail_signals || []).join(', ') || 'not specified'} + Handoff Contract (what next step needs): ${testReqs.handoff_contract || 'not specified'} + +IMPORTANT: Score quality_score based on how well the actual output matches these test requirements. 
+ - 90-100: All pass_criteria met, all expected_outputs present, content_signals found, no fail_signals + - 70-89: Most criteria met, minor gaps + - 50-69: Partial match, significant gaps + - 0-49: Fail — fail_signals present or pass_criteria not met`; +} else { + testReqSection = ` +NOTE: No pre-generated test requirements for this step. Evaluate based on general quality signals and workflow context.`; +} + +const analysisPrompt = `PURPOSE: Evaluate workflow step "${step.name}" (step ${stepIdx + 1}/${state.steps.length}) against its acceptance criteria. Judge whether the command execution met the pre-defined test requirements. WORKFLOW CONTEXT: Name: ${state.workflow_name} @@ -116,6 +139,7 @@ Name: ${step.name} Type: ${step.type} Command: ${step.command} ${step.success_criteria ? `Success Criteria: ${step.success_criteria}` : ''} +${testReqSection} EXECUTION RESULT: ${execSummary} @@ -129,15 +153,25 @@ ANALYSIS DEPTH: ${state.analysis_depth} ${depthInstructions[state.analysis_depth]} TASK: -1. Assess step execution quality (did it succeed? complete output?) -2. Evaluate artifact quality (content correctness, completeness, format) -3. Check handoff readiness (can the next step consume this output?) -4. Identify issues, risks, or optimization opportunities -5. Rate overall step quality 0-100 +1. **Requirement Matching**: Compare actual output against test requirements (pass_criteria, expected_outputs, content_signals) +2. **Fail Signal Detection**: Check for any fail_signals in the output +3. **Handoff Contract Verification**: Does the output satisfy handoff_contract for the next step? +4. **Gap Analysis**: What's missing between actual output and requirements? +5. 
**Quality Score**: Rate 0-100 based on requirement fulfillment (NOT general quality) EXPECTED OUTPUT (strict JSON, no markdown): { "quality_score": <0-100>, + "requirement_match": { + "pass": <true|false>, + "criteria_met": ["<pass criterion that was met>"], + "criteria_missed": ["<pass criterion that was not met>"], + "expected_outputs_found": ["<expected output that is present>"], + "expected_outputs_missing": ["<expected output that is absent>"], + "content_signals_found": ["<content signal detected in the output>"], + "content_signals_missing": ["<content signal not detected in the output>"], + "fail_signals_detected": ["<fail signal detected in the output>"] + }, "execution_assessment": { "success": <true|false>, "completeness": "<completeness summary>", @@ -151,6 +185,7 @@ EXPECTED OUTPUT (strict JSON, no markdown): }, "handoff_assessment": { "ready": <true|false>, + "contract_satisfied": <true|false>, "next_step_compatible": <true|false>, "handoff_notes": "<notes for the next step>" }, @@ -163,7 +198,7 @@ EXPECTED OUTPUT (strict JSON, no markdown): "step_summary": "<1-2 sentence summary for process log>" } -CONSTRAINTS: Be specific, reference artifact content where possible, output ONLY JSON`; +CONSTRAINTS: Be specific, reference artifact content where possible, score against requirements not general quality, output ONLY JSON`; ``` ### Step 3.4: Execute via ccw cli Gemini with Resume @@ -231,11 +266,36 @@ if (jsonMatch) { } // Write step analysis file +const reqMatch = analysis.requirement_match; +const reqMatchSection = reqMatch ? ` +## Requirement Match — ${reqMatch.pass ? 'PASS ✓' : 'FAIL ✗'} + +### Criteria Met +${(reqMatch.criteria_met || []).map(c => `- ✓ ${c}`).join('\n') || '- None'} + +### Criteria Missed +${(reqMatch.criteria_missed || []).map(c => `- ✗ ${c}`).join('\n') || '- None'} + +### Expected Outputs +- Found: ${(reqMatch.expected_outputs_found || []).join(', ') || 'None'} +- Missing: ${(reqMatch.expected_outputs_missing || []).join(', ') || 'None'} + +### Content Signals +- Detected: ${(reqMatch.content_signals_found || []).join(', ') || 'None'} +- Missing: ${(reqMatch.content_signals_missing || []).join(', ') || 'None'} + +### Fail Signals +${(reqMatch.fail_signals_detected || []).length > 0 + ? 
(reqMatch.fail_signals_detected || []).map(f => `- ⚠ ${f}`).join('\n') + : '- None detected'} +` : ''; + const stepAnalysisReport = `# Step ${stepIdx + 1} Analysis: ${step.name} **Quality Score**: ${analysis.quality_score}/100 +**Requirement Match**: ${reqMatch ? (reqMatch.pass ? 'PASS' : 'FAIL') : 'N/A (no test requirements)'} **Date**: ${new Date().toISOString()} - +${reqMatchSection} ## Execution - Success: ${analysis.execution_assessment?.success} - Completeness: ${analysis.execution_assessment?.completeness} @@ -249,6 +309,7 @@ const stepAnalysisReport = `# Step ${stepIdx + 1} Analysis: ${step.name} ## Handoff Readiness - Ready: ${analysis.handoff_assessment?.ready} +- Contract Satisfied: ${analysis.handoff_assessment?.contract_satisfied} - Next Step Compatible: ${analysis.handoff_assessment?.next_step_compatible} - Notes: ${analysis.handoff_assessment?.handoff_notes} @@ -262,14 +323,16 @@ ${(analysis.optimization_opportunities || []).map(o => `- [${o.impact}] ${o.area Write(`${stepDir}/step-${stepIdx + 1}-analysis.md`, stepAnalysisReport); // Append to process log +const reqPassStr = reqMatch ? (reqMatch.pass ? 'PASS' : 'FAIL') : 'N/A'; const processLogEntry = ` -## Step ${stepIdx + 1}: ${step.name} — Score: ${analysis.quality_score}/100 +## Step ${stepIdx + 1}: ${step.name} — Score: ${analysis.quality_score}/100 | Req: ${reqPassStr} **Command**: \`${step.command}\` **Result**: ${analysis.execution_assessment?.completeness || 'unknown'} | ${analysis.artifact_assessment?.count || 0} artifacts +**Requirement Match**: ${reqPassStr}${reqMatch ? 
` — Met: ${(reqMatch.criteria_met || []).length}, Missed: ${(reqMatch.criteria_missed || []).length}, Fail Signals: ${(reqMatch.fail_signals_detected || []).length}` : ''} **Summary**: ${analysis.step_summary || 'No summary'} **Issues**: ${(analysis.issues || []).filter(i => i.severity === 'high').map(i => i.description).join('; ') || 'None critical'} -**Handoff**: ${analysis.handoff_assessment?.handoff_notes || 'Ready'} +**Handoff**: ${analysis.handoff_assessment?.contract_satisfied ? 'Contract satisfied' : analysis.handoff_assessment?.handoff_notes || 'Ready'} --- `; @@ -281,8 +344,13 @@ Write(`${state.work_dir}/process-log.md`, currentLog + processLogEntry); // Update state state.steps[stepIdx].analysis = { quality_score: analysis.quality_score, + requirement_pass: reqMatch?.pass ?? null, + criteria_met_count: (reqMatch?.criteria_met || []).length, + criteria_missed_count: (reqMatch?.criteria_missed || []).length, + fail_signals_count: (reqMatch?.fail_signals_detected || []).length, key_outputs: analysis.artifact_assessment?.key_outputs || [], handoff_notes: analysis.handoff_assessment?.handoff_notes || '', + contract_satisfied: analysis.handoff_assessment?.contract_satisfied ?? 
null, issue_count: (analysis.issues || []).length, high_issues: (analysis.issues || []).filter(i => i.severity === 'high').length, optimization_count: (analysis.optimization_opportunities || []).length, diff --git a/.claude/skills/workflow-tune/phases/05-optimize-report.md b/.claude/skills/workflow-tune/phases/05-optimize-report.md index 63d8eb34..fa367448 100644 --- a/.claude/skills/workflow-tune/phases/05-optimize-report.md +++ b/.claude/skills/workflow-tune/phases/05-optimize-report.md @@ -40,10 +40,12 @@ const minStep = state.steps.reduce((min, s) => const totalIssues = state.steps.reduce((sum, s) => sum + (s.analysis?.issue_count || 0), 0); const totalHighIssues = state.steps.reduce((sum, s) => sum + (s.analysis?.high_issues || 0), 0); -// Step quality table -const stepTable = state.steps.map((s, i) => - `| ${i + 1} | ${s.name} | ${s.type} | ${s.execution?.success ? 'OK' : 'FAIL'} | ${s.analysis?.quality_score || '-'} | ${s.analysis?.issue_count || 0} | ${s.analysis?.high_issues || 0} |` -).join('\n'); +// Step quality table (with requirement match) +const stepTable = state.steps.map((s, i) => { + const reqPass = s.analysis?.requirement_pass; + const reqStr = reqPass === true ? 'PASS' : reqPass === false ? 'FAIL' : '-'; + return `| ${i + 1} | ${s.name} | ${s.type} | ${s.execution?.success ? 'OK' : 'FAIL'} | ${reqStr} | ${s.analysis?.quality_score || '-'} | ${s.analysis?.issue_count || 0} | ${s.analysis?.high_issues || 0} |`; +}).join('\n'); // Collect all improvements (workflow-level + per-step) const allImprovements = []; @@ -97,8 +99,8 @@ const report = `# Workflow Tune — Final Report ## Step Quality Matrix -| # | Step | Type | Exec | Quality | Issues | High | -|---|------|------|------|---------|--------|------| +| # | Step | Type | Exec | Req Match | Quality | Issues | High | +|---|------|------|------|-----------|---------|--------|------| ${stepTable} ## Workflow Flow Assessment