Files
Claude-Code-Workflow/ccw/src/tools/spec-keyword-extractor.ts
catlog22 ffe79d28e2 refactor(spec): update folder structure and dimensions
- Change folder from .workflow/ to .ccw/
- Reduce dimensions from 4 to 2: specs, personal
- Remove changelog and roadmap dimensions
- Update help text and examples

Folder structure:
- .ccw/specs/ - Project rules and conventions
- .ccw/personal/ - Personal preferences (supports global ~/.ccw/personal/)
- .ccw/.spec-index/ - Index cache

Keyword categories for workflow stages:
- exploration - Code exploration, analysis, debugging
- planning - Task planning, requirements
- execution - Implementation, testing, deployment
2026-02-26 23:13:00 +08:00

209 lines
6.3 KiB
TypeScript

/**
* Spec Keyword Extractor
*
* Extracts keywords from user prompt text for matching against
* spec document YAML frontmatter keywords.
*
* Supports:
* - English word tokenization (split by spaces/punctuation, remove stop words)
* - Chinese character segment extraction (CJK boundary splitting)
*/
/**
 * English stop words that are discarded during keyword extraction.
 *
 * Every entry is a lowercase, high-frequency word that says little about
 * which spec a prompt relates to. Lookups are expected to use lowercased
 * tokens. The set is assembled from per-category lists purely for
 * readability; membership is all that matters to callers.
 */
export const STOP_WORDS = new Set<string>(
  new Array<string>().concat(
    // Articles
    ['a', 'an', 'the'],
    // Personal pronouns
    ['i', 'me', 'my', 'we', 'our', 'you', 'your', 'he', 'she', 'it', 'they', 'them'],
    // Demonstratives and interrogative pronouns
    ['this', 'that', 'these', 'those', 'what', 'which', 'who', 'whom'],
    // Prepositions
    ['in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'from', 'as', 'into'],
    ['about', 'between', 'through', 'after', 'before', 'above', 'below'],
    // Conjunctions
    ['and', 'or', 'but', 'if', 'then', 'else', 'when', 'while', 'so', 'because'],
    // Forms of "be"
    ['is', 'am', 'are', 'was', 'were', 'be', 'been', 'being'],
    // Forms of "have" / "do"
    ['has', 'have', 'had', 'do', 'does', 'did'],
    // Modal verbs
    ['will', 'would', 'shall', 'should', 'may', 'might', 'can', 'could', 'must'],
    // Generic verbs that are too broad to be useful for matching
    ['get', 'got', 'make', 'made', 'let', 'go', 'going', 'come', 'take', 'give'],
    // Adverbs and answer words
    ['not', 'no', 'yes', 'also', 'just', 'only', 'very', 'too', 'now', 'here'],
    // Question adverbs and quantifiers
    ['there', 'how', 'why', 'where', 'all', 'each', 'every', 'both', 'some'],
    ['any', 'most', 'more', 'less', 'much', 'many', 'few', 'other', 'such'],
    // Request / filler vocabulary common in prompts
    ['please', 'need', 'want', 'like', 'know', 'think', 'see', 'use', 'using'],
    ['way', 'thing', 'something', 'anything', 'nothing'],
  ),
);
/**
 * Detects whether a string contains at least one Han ideograph.
 *
 * Ranges covered:
 * - U+4E00–U+9FFF  CJK Unified Ideographs
 * - U+3400–U+4DBF  CJK Unified Ideographs Extension A
 * - U+F900–U+FAFF  CJK Compatibility Ideographs
 *
 * Note: despite the "CJK" name, Japanese kana and Korean Hangul lie outside
 * these ranges and are NOT matched. The regex has no `g` flag, so `.test()`
 * carries no `lastIndex` state between calls.
 */
const CJK_REGEX = /[\u4e00-\u9fff\u3400-\u4dbf\uf900-\ufaff]/;
/**
 * Matches every maximal run of consecutive Han ideographs (same ranges as
 * CJK_REGEX). The `g` flag makes `String.prototype.match` return all runs
 * and `String.prototype.replace` substitute every run.
 */
const CJK_SEGMENT_REGEX = /[\u4e00-\u9fff\u3400-\u4dbf\uf900-\ufaff]+/g;
/**
 * Delimiter pattern used to split text into English word tokens.
 *
 * A delimiter is one or more of: whitespace, ASCII punctuation/symbols, or
 * the full-width punctuation marks U+3001, U+3002, U+FF0C, U+FF1B, U+FF1A,
 * U+FF01 and U+FF1F. Because `_` and `-` are delimiters, identifiers such as
 * `snake_case` or `kebab-case` are broken into their component words.
 */
const WORD_SPLIT_REGEX = /[\s,;:!?.()\[\]{}<>"'`~@#$%^&*+=|\\/_\-\u3001\u3002\uff0c\uff1b\uff1a\uff01\uff1f]+/;
/**
 * Minimum length an English token must have to be kept; with a value of 2,
 * single-character tokens are dropped.
 */
const MIN_WORD_LENGTH = 2;
/**
 * Build the list of matchable keywords for a user prompt.
 *
 * English handling: the text is tokenized on whitespace/punctuation,
 * lowercased, and stripped of stop words, too-short tokens and pure numbers.
 *
 * Chinese handling: each contiguous run of CJK ideographs is kept as-is, and
 * runs longer than two characters additionally contribute every overlapping
 * two-character window, since spec keywords are usually short compounds.
 *
 * @param text - The user prompt text to extract keywords from
 * @returns Unique keywords, English ones (lowercase) first, followed by CJK
 *          ones (unchanged); an empty array for empty or non-string input
 */
export function extractKeywords(text: string): string[] {
  // Guard against empty input and against non-string values slipping in at runtime.
  if (typeof text !== 'string' || text.length === 0) {
    return [];
  }

  // A Set removes duplicates while keeping first-seen order.
  const merged = new Set<string>([
    ...extractEnglishKeywords(text),
    ...extractCjkKeywords(text),
  ]);
  return Array.from(merged);
}
/**
 * Collect the English keyword tokens found in a piece of text.
 *
 * @param text - Input text
 * @returns Unique lowercase tokens in order of first appearance, excluding
 *          stop words, tokens shorter than MIN_WORD_LENGTH, and tokens made
 *          up solely of digits
 */
function extractEnglishKeywords(text: string): string[] {
  const digitsOnly = /^\d+$/;
  const seen = new Set<string>();

  // Blank out CJK runs first so they act as separators instead of ending up
  // glued to neighbouring English words.
  const pieces = text.replace(CJK_SEGMENT_REGEX, ' ').split(WORD_SPLIT_REGEX);

  for (const piece of pieces) {
    const word = piece.toLowerCase().trim();
    if (word.length < MIN_WORD_LENGTH) continue;
    if (STOP_WORDS.has(word)) continue;
    if (digitsOnly.test(word)) continue;
    // Set insertion keeps the first occurrence and ignores repeats.
    seen.add(word);
  }

  return Array.from(seen);
}
/**
 * Collect the CJK keyword candidates found in a piece of text.
 *
 * Each contiguous run of CJK ideographs is emitted whole. Runs longer than
 * two characters also emit every overlapping two-character window, because
 * two characters is a common length for Chinese keywords.
 *
 * @param text - Input text
 * @returns Unique CJK segments and bigrams in order of first appearance;
 *          an empty array when the text contains no CJK ideographs
 */
function extractCjkKeywords(text: string): string[] {
  // Cheap presence check before collecting every run.
  const runs = CJK_REGEX.test(text) ? text.match(CJK_SEGMENT_REGEX) : null;
  if (runs === null) {
    return [];
  }

  const found = new Set<string>();
  runs.forEach(run => {
    // The complete run is always a candidate.
    found.add(run);
    if (run.length <= 2) {
      return;
    }
    // Slide a two-character window across the run.
    for (let start = 0; start + 2 <= run.length; start++) {
      found.add(run.slice(start, start + 2));
    }
  });

  return Array.from(found);
}
/**
 * Check whether a keyword matches any entry in a keyword list.
 *
 * Matching is case-insensitive and substring-based in both directions: the
 * keyword matches a target when either string contains the other (which
 * includes the case where they are equal). The same rule applies to CJK
 * text, for which lowercasing has no effect.
 *
 * Empty or whitespace-only strings never match. Without this guard an empty
 * target would match every keyword (and an empty keyword every target),
 * because `String.prototype.includes('')` is always true.
 *
 * @param keyword - The keyword to check
 * @param targetKeywords - The target keyword list from spec frontmatter
 * @returns true if the keyword matches at least one non-empty target
 */
export function keywordMatches(keyword: string, targetKeywords: string[]): boolean {
  const needle = keyword.trim().toLowerCase();
  if (needle.length === 0) {
    return false;
  }

  return targetKeywords.some(target => {
    const candidate = target.trim().toLowerCase();
    // Skip blank entries so they cannot act as a wildcard.
    if (candidate.length === 0) {
      return false;
    }
    // Containment in either direction; equality is the trivial case.
    return candidate.includes(needle) || needle.includes(candidate);
  });
}
/**
 * Score how well a prompt's keywords line up with a spec's keywords.
 *
 * The score is the count of extracted keywords for which `keywordMatches`
 * reports a hit against the spec keyword list; larger means a better match.
 *
 * @param extractedKeywords - Keywords extracted from user prompt
 * @param specKeywords - Keywords from spec YAML frontmatter
 * @returns Number of matching keywords (0 = no match, or either list empty)
 */
export function calculateMatchScore(
  extractedKeywords: string[],
  specKeywords: string[]
): number {
  // Nothing can match when either side has no keywords.
  if (!(extractedKeywords.length && specKeywords.length)) {
    return 0;
  }

  return extractedKeywords.reduce(
    (hits, candidate) => (keywordMatches(candidate, specKeywords) ? hits + 1 : hits),
    0
  );
}