feat: enhance workflow-tune skill with test requirements and analysis improvements

This commit is contained in:
catlog22
2026-03-20 15:11:32 +08:00
parent d843112094
commit 9c49a32cd9
3 changed files with 102 additions and 27 deletions

View File

@@ -103,7 +103,30 @@ const depthInstructions = {
deep: 'Provide exhaustive assessment. Cover: execution quality, output completeness and correctness, artifact quality and structure, step-to-step handoff integrity, error handling, performance signals, architecture implications, edge cases.'
};
const analysisPrompt = `PURPOSE: Analyze the output of workflow step "${step.name}" (step ${stepIdx + 1}/${state.steps.length}) to assess quality, identify issues, and evaluate handoff readiness for the next step.
// ★ Build test requirements section (the evaluation baseline)
/**
 * Render the TEST REQUIREMENTS section that is embedded in the analysis prompt.
 *
 * Every field falls back to 'not specified' when it is missing, so the prompt
 * never contains the literal text "undefined".
 *
 * @param {object|null|undefined} reqs - the step's test_requirements, if any
 * @returns {string} prompt section (starts with a newline); a short NOTE when reqs is absent
 */
function buildTestReqSection(reqs) {
  if (!reqs) {
    return `
NOTE: No pre-generated test requirements for this step. Evaluate based on general quality signals and workflow context.`;
  }
  // A missing list and a list that joins to an empty string both render as 'not specified'
  const list = (items) => (items || []).join(', ') || 'not specified';
  return `
TEST REQUIREMENTS (Acceptance Criteria — use these as the PRIMARY evaluation baseline):
Pass Criteria: ${reqs.pass_criteria || 'not specified'}
Expected Outputs: ${list(reqs.expected_outputs)}
Content Signals (patterns that indicate success): ${list(reqs.content_signals)}
Quality Thresholds: ${list(reqs.quality_thresholds)}
Fail Signals (patterns that indicate failure): ${list(reqs.fail_signals)}
Handoff Contract (what next step needs): ${reqs.handoff_contract || 'not specified'}
IMPORTANT: Score quality_score based on how well the actual output matches these test requirements.
- 90-100: All pass_criteria met, all expected_outputs present, content_signals found, no fail_signals
- 70-89: Most criteria met, minor gaps
- 50-69: Partial match, significant gaps
- 0-49: Fail — fail_signals present or pass_criteria not met`;
}
const testReqs = step.test_requirements;
const testReqSection = buildTestReqSection(testReqs);
const analysisPrompt = `PURPOSE: Evaluate workflow step "${step.name}" (step ${stepIdx + 1}/${state.steps.length}) against its acceptance criteria. Judge whether the command execution met the pre-defined test requirements.
WORKFLOW CONTEXT:
Name: ${state.workflow_name}
@@ -116,6 +139,7 @@ Name: ${step.name}
Type: ${step.type}
Command: ${step.command}
${step.success_criteria ? `Success Criteria: ${step.success_criteria}` : ''}
${testReqSection}
EXECUTION RESULT:
${execSummary}
@@ -129,15 +153,25 @@ ANALYSIS DEPTH: ${state.analysis_depth}
${depthInstructions[state.analysis_depth]}
TASK:
1. Assess step execution quality (did it succeed? complete output?)
2. Evaluate artifact quality (content correctness, completeness, format)
3. Check handoff readiness (can the next step consume this output?)
4. Identify issues, risks, or optimization opportunities
5. Rate overall step quality 0-100
1. **Requirement Matching**: Compare actual output against test requirements (pass_criteria, expected_outputs, content_signals)
2. **Fail Signal Detection**: Check for any fail_signals in the output
3. **Handoff Contract Verification**: Does the output satisfy handoff_contract for the next step?
4. **Gap Analysis**: What's missing between actual output and requirements?
5. **Quality Score**: Rate 0-100 based on requirement fulfillment (NOT general quality)
EXPECTED OUTPUT (strict JSON, no markdown):
{
"quality_score": <0-100>,
"requirement_match": {
"pass": <true|false>,
"criteria_met": ["<which pass_criteria were satisfied>"],
"criteria_missed": ["<which pass_criteria were NOT satisfied>"],
"expected_outputs_found": ["<expected files that exist>"],
"expected_outputs_missing": ["<expected files that are absent>"],
"content_signals_found": ["<success patterns detected in output>"],
"content_signals_missing": ["<success patterns NOT found>"],
"fail_signals_detected": ["<failure patterns found, if any>"]
},
"execution_assessment": {
"success": <true|false>,
"completeness": "<complete|partial|failed>",
@@ -151,6 +185,7 @@ EXPECTED OUTPUT (strict JSON, no markdown):
},
"handoff_assessment": {
"ready": <true|false>,
"contract_satisfied": <true|false|null>,
"next_step_compatible": <true|false|null>,
"handoff_notes": "<what next step should know>"
},
@@ -163,7 +198,7 @@ EXPECTED OUTPUT (strict JSON, no markdown):
"step_summary": "<1-2 sentence summary for process log>"
}
CONSTRAINTS: Be specific, reference artifact content where possible, output ONLY JSON`;
CONSTRAINTS: Be specific, reference artifact content where possible, score against requirements not general quality, output ONLY JSON`;
```
### Step 3.4: Execute via ccw cli Gemini with Resume
@@ -231,11 +266,36 @@ if (jsonMatch) {
}
// Write step analysis file
const reqMatch = analysis.requirement_match;
// Renders the "Requirement Match" markdown section of the step report from a requirement_match object.
const renderReqMatch = (rm) => {
  // Bullet list with a leading marker; yields '' for an empty or missing list
  const bullets = (entries, marker) => (entries || []).map(entry => `- ${marker} ${entry}`).join('\n');
  // Comma-separated list; 'None' for an empty or missing list
  const inline = (entries) => (entries || []).join(', ') || 'None';
  const verdict = rm.pass ? 'PASS ✓' : 'FAIL ✗';
  const met = bullets(rm.criteria_met, '✓') || '- None';
  const missed = bullets(rm.criteria_missed, '✗') || '- None';
  const failSignals = rm.fail_signals_detected || [];
  const failLines = failSignals.length > 0 ? bullets(failSignals, '⚠') : '- None detected';
  return `
## Requirement Match — ${verdict}
### Criteria Met
${met}
### Criteria Missed
${missed}
### Expected Outputs
- Found: ${inline(rm.expected_outputs_found)}
- Missing: ${inline(rm.expected_outputs_missing)}
### Content Signals
- Detected: ${inline(rm.content_signals_found)}
- Missing: ${inline(rm.content_signals_missing)}
### Fail Signals
${failLines}
`;
};
// The section is omitted entirely when the analysis carries no requirement_match
const reqMatchSection = reqMatch ? renderReqMatch(reqMatch) : '';
const stepAnalysisReport = `# Step ${stepIdx + 1} Analysis: ${step.name}
**Quality Score**: ${analysis.quality_score}/100
**Requirement Match**: ${reqMatch ? (reqMatch.pass ? 'PASS' : 'FAIL') : 'N/A (no test requirements)'}
**Date**: ${new Date().toISOString()}
${reqMatchSection}
## Execution
- Success: ${analysis.execution_assessment?.success}
- Completeness: ${analysis.execution_assessment?.completeness}
@@ -249,6 +309,7 @@ const stepAnalysisReport = `# Step ${stepIdx + 1} Analysis: ${step.name}
## Handoff Readiness
- Ready: ${analysis.handoff_assessment?.ready}
- Contract Satisfied: ${analysis.handoff_assessment?.contract_satisfied}
- Next Step Compatible: ${analysis.handoff_assessment?.next_step_compatible}
- Notes: ${analysis.handoff_assessment?.handoff_notes}
@@ -262,14 +323,16 @@ ${(analysis.optimization_opportunities || []).map(o => `- [${o.impact}] ${o.area
Write(`${stepDir}/step-${stepIdx + 1}-analysis.md`, stepAnalysisReport);
// Append to process log
// Three-way requirement verdict for the log: 'PASS'/'FAIL' from reqMatch.pass, 'N/A' when reqMatch is absent.
const reqPassStr = reqMatch ? (reqMatch.pass ? 'PASS' : 'FAIL') : 'N/A';
// Markdown entry for this step in the process log: score, command, result, requirement counts,
// summary, high-severity issues and handoff status.
// NOTE(review): this view shows two "## Step" heading lines and two "**Handoff**" lines — presumably the
// pre- and post-change variants from the diff; confirm only one of each exists in the real file.
// NOTE(review): when contract_satisfied is truthy the Handoff line prints 'Contract satisfied' and
// handoff_notes is not logged; when it is false with no notes it still prints 'Ready' — confirm intended.
const processLogEntry = `
## Step ${stepIdx + 1}: ${step.name} — Score: ${analysis.quality_score}/100
## Step ${stepIdx + 1}: ${step.name} — Score: ${analysis.quality_score}/100 | Req: ${reqPassStr}
**Command**: \`${step.command}\`
**Result**: ${analysis.execution_assessment?.completeness || 'unknown'} | ${analysis.artifact_assessment?.count || 0} artifacts
**Requirement Match**: ${reqPassStr}${reqMatch ? ` — Met: ${(reqMatch.criteria_met || []).length}, Missed: ${(reqMatch.criteria_missed || []).length}, Fail Signals: ${(reqMatch.fail_signals_detected || []).length}` : ''}
**Summary**: ${analysis.step_summary || 'No summary'}
**Issues**: ${(analysis.issues || []).filter(i => i.severity === 'high').map(i => i.description).join('; ') || 'None critical'}
**Handoff**: ${analysis.handoff_assessment?.handoff_notes || 'Ready'}
**Handoff**: ${analysis.handoff_assessment?.contract_satisfied ? 'Contract satisfied' : analysis.handoff_assessment?.handoff_notes || 'Ready'}
---
`;
@@ -281,8 +344,13 @@ Write(`${state.work_dir}/process-log.md`, currentLog + processLogEntry);
// Update state
state.steps[stepIdx].analysis = {
quality_score: analysis.quality_score,
requirement_pass: reqMatch?.pass ?? null,
criteria_met_count: (reqMatch?.criteria_met || []).length,
criteria_missed_count: (reqMatch?.criteria_missed || []).length,
fail_signals_count: (reqMatch?.fail_signals_detected || []).length,
key_outputs: analysis.artifact_assessment?.key_outputs || [],
handoff_notes: analysis.handoff_assessment?.handoff_notes || '',
contract_satisfied: analysis.handoff_assessment?.contract_satisfied ?? null,
issue_count: (analysis.issues || []).length,
high_issues: (analysis.issues || []).filter(i => i.severity === 'high').length,
optimization_count: (analysis.optimization_opportunities || []).length,

View File

@@ -40,10 +40,12 @@ const minStep = state.steps.reduce((min, s) =>
// Sum one numeric analysis field across all steps; a step without analysis (or with a falsy value) adds 0.
const sumOverSteps = (pick) => state.steps.reduce((acc, entry) => acc + (pick(entry.analysis) || 0), 0);
// Workflow-wide totals: every reported issue, and the high-severity subset
const totalIssues = sumOverSteps((a) => a?.issue_count);
const totalHighIssues = sumOverSteps((a) => a?.high_issues);
// Step quality table
const stepTable = state.steps.map((s, i) =>
`| ${i + 1} | ${s.name} | ${s.type} | ${s.execution?.success ? 'OK' : 'FAIL'} | ${s.analysis?.quality_score || '-'} | ${s.analysis?.issue_count || 0} | ${s.analysis?.high_issues || 0} |`
).join('\n');
// Step quality table (with requirement match)
// One markdown row per step: index, name, type, execution status, requirement verdict,
// quality score, issue count and high-severity issue count.
const stepTable = state.steps.map((s, i) => {
  // Three-way verdict: only an explicit true/false counts; null/undefined (no verdict recorded) shows '-'
  const reqPass = s.analysis?.requirement_pass;
  const reqStr = reqPass === true ? 'PASS' : reqPass === false ? 'FAIL' : '-';
  // ?? rather than ||: a quality score of 0 is a legitimate result and must be displayed, not replaced by '-'
  const score = s.analysis?.quality_score ?? '-';
  return `| ${i + 1} | ${s.name} | ${s.type} | ${s.execution?.success ? 'OK' : 'FAIL'} | ${reqStr} | ${score} | ${s.analysis?.issue_count || 0} | ${s.analysis?.high_issues || 0} |`;
}).join('\n');
// Collect all improvements (workflow-level + per-step)
const allImprovements = [];
@@ -97,8 +99,8 @@ const report = `# Workflow Tune — Final Report
## Step Quality Matrix
| # | Step | Type | Exec | Quality | Issues | High |
|---|------|------|------|---------|--------|------|
| # | Step | Type | Exec | Req Match | Quality | Issues | High |
|---|------|------|------|-----------|---------|--------|------|
${stepTable}
## Workflow Flow Assessment