Claude-Code-Workflow/ccw/src/core/pattern-detector.ts

/**
* Pattern Detector - Detects recurring content patterns across sessions
*
* Uses vector clustering (cosine similarity > 0.85) to group semantically similar
* chunks into patterns. Patterns appearing in N >= 3 distinct sessions are flagged
* as candidates. High-confidence patterns (confidence >= 0.8) are solidified into
* CoreMemory and .claude/skills/*.md files.
*/
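// Example usage (a hypothetical sketch; assumes a project where
// isUnifiedEmbedderAvailable() returns true, so the vector index is active):
//
//   const detector = new PatternDetector('/path/to/project');
//   const result = await detector.detectPatterns();
//   console.log(
//     `analyzed=${result.chunksAnalyzed} patterns=${result.patterns.length}`,
//     `solidified=${result.solidified.length} in ${result.elapsedMs}ms`
//   );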
import { CoreMemoryStore, getCoreMemoryStore } from './core-memory-store.js';
import { UnifiedVectorIndex, isUnifiedEmbedderAvailable } from './unified-vector-index.js';
import type { VectorSearchMatch } from './unified-vector-index.js';
import { existsSync, mkdirSync, writeFileSync } from 'fs';
import { join } from 'path';
// -- Constants --
/** Minimum cosine similarity to group chunks into the same pattern */
const PATTERN_SIMILARITY_THRESHOLD = 0.85;
/** Minimum number of distinct sessions a pattern must appear in */
const MIN_SESSION_FREQUENCY = 3;
/** Confidence threshold for auto-solidification */
const SOLIDIFY_CONFIDENCE_THRESHOLD = 0.8;
/** Maximum number of chunks to analyze per detection run */
const MAX_CHUNKS_TO_ANALYZE = 200;
/** Top-K neighbors to search per chunk during clustering */
const NEIGHBOR_TOP_K = 15;
// -- Types --
export interface DetectedPattern {
/** Unique pattern identifier */
id: string;
/** Human-readable pattern name derived from content */
name: string;
/** Representative content snippet */
representative: string;
/** Source IDs (sessions) where this pattern appears */
sourceIds: string[];
/** Number of distinct sessions */
sessionCount: number;
/** Average similarity score within the pattern group */
avgSimilarity: number;
/** Confidence score (0-1), based on frequency and similarity */
confidence: number;
/** Category of the chunks in this pattern */
category: string;
}
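// Illustrative DetectedPattern (hypothetical values, shapes matching the
// interface above):
//   {
//     id: 'PAT-1718000000000-a1b2',       // timestamp + random suffix
//     name: 'retry-backoff-network',      // derived bigram + extra word
//     representative: 'Use exponential backoff when...', // first 500 chars
//     sourceIds: ['sess-a', 'sess-b', 'sess-c'],
//     sessionCount: 3,
//     avgSimilarity: 0.912,
//     confidence: 0.667,                  // 0.912*0.6 + (3/10)*0.4, rounded
//     category: 'workflow',
//   }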
export interface PatternDetectionResult {
/** All detected patterns */
patterns: DetectedPattern[];
/** Number of chunks analyzed */
chunksAnalyzed: number;
/** Patterns that were solidified (written to CoreMemory + skills) */
solidified: string[];
/** Elapsed time in ms */
elapsedMs: number;
}
export interface SolidifyResult {
memoryId: string;
skillPath: string | null;
}
// -- PatternDetector --
export class PatternDetector {
private projectPath: string;
private coreMemoryStore: CoreMemoryStore;
private vectorIndex: UnifiedVectorIndex | null = null;
constructor(projectPath: string) {
this.projectPath = projectPath;
this.coreMemoryStore = getCoreMemoryStore(projectPath);
if (isUnifiedEmbedderAvailable()) {
this.vectorIndex = new UnifiedVectorIndex(projectPath);
}
}
/**
* Detect recurring patterns across sessions by vector clustering.
*
* Algorithm:
* 1. Get representative chunks from the unified vector index (via broad search queries)
* 2. For each chunk, search HNSW for nearest neighbors (cosine > PATTERN_SIMILARITY_THRESHOLD)
* 3. Group chunks with high mutual similarity into pattern clusters
* 4. Count distinct source_ids per cluster (session frequency)
* 5. Patterns with sessionCount >= MIN_SESSION_FREQUENCY become candidates
*
* @returns Detection result with candidate patterns
*/
async detectPatterns(): Promise<PatternDetectionResult> {
const startTime = Date.now();
const result: PatternDetectionResult = {
patterns: [],
chunksAnalyzed: 0,
solidified: [],
elapsedMs: 0,
};
if (!this.vectorIndex) {
result.elapsedMs = Date.now() - startTime;
return result;
}
// Step 1: Gather chunks from the vector store via broad category searches
const allChunks = await this.gatherChunksForAnalysis();
result.chunksAnalyzed = allChunks.length;
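// Fewer chunks than MIN_SESSION_FREQUENCY can never span enough sessions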
if (allChunks.length < MIN_SESSION_FREQUENCY) {
result.elapsedMs = Date.now() - startTime;
return result;
}
// Step 2: Cluster chunks by vector similarity
const patternGroups = await this.clusterChunksByVector(allChunks);
// Step 3: Filter by session frequency and build DetectedPattern objects
for (const group of patternGroups) {
const uniqueSources = new Set(group.map(c => c.source_id));
if (uniqueSources.size < MIN_SESSION_FREQUENCY) continue;
const avgSim = group.reduce((sum, c) => sum + c.score, 0) / group.length;
// Confidence: combines frequency (normalized) and avg similarity
const frequencyScore = Math.min(uniqueSources.size / 10, 1.0);
const confidence = avgSim * 0.6 + frequencyScore * 0.4;
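// Worked example: avgSim = 0.9 over 4 sessions gives
// frequencyScore = min(4/10, 1) = 0.4 and
// confidence = 0.9*0.6 + 0.4*0.4 = 0.70 (below the 0.8 solidify bar);
// 10+ sessions at avgSim 0.85 yields 0.85*0.6 + 1.0*0.4 = 0.91.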
const representative = group[0]; // Seed chunk of the cluster
const patternName = this.derivePatternName(group);
const patternId = `PAT-${Date.now()}-${Math.random().toString(36).substring(2, 6)}`;
result.patterns.push({
id: patternId,
name: patternName,
representative: representative.content.substring(0, 500),
sourceIds: Array.from(uniqueSources),
sessionCount: uniqueSources.size,
avgSimilarity: Math.round(avgSim * 1000) / 1000,
confidence: Math.round(confidence * 1000) / 1000,
category: representative.category || 'unknown',
});
}
// Sort by confidence descending
result.patterns.sort((a, b) => b.confidence - a.confidence);
// Step 4: Auto-solidify high-confidence patterns (failures are logged and skipped)
for (const pattern of result.patterns) {
if (pattern.confidence >= SOLIDIFY_CONFIDENCE_THRESHOLD) {
try {
await this.solidifyPattern(pattern);
result.solidified.push(pattern.id);
} catch (err) {
console.warn(
`[PatternDetector] Failed to solidify pattern ${pattern.id}:`,
(err as Error).message
);
}
}
}
result.elapsedMs = Date.now() - startTime;
return result;
}
/**
* Gather a representative set of chunks for pattern analysis.
* Uses broad search queries across categories to collect diverse chunks.
*/
private async gatherChunksForAnalysis(): Promise<VectorSearchMatch[]> {
if (!this.vectorIndex) return [];
const allChunks: VectorSearchMatch[] = [];
const seenContent = new Set<string>();
// Search across common categories with broad queries
const broadQueries = [
'implementation pattern',
'configuration setup',
'error handling',
'testing approach',
'workflow process',
];
const categories = ['core_memory', 'cli_history', 'workflow'] as const;
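// Per-search budget: 5 queries x 3 categories means each call requests
// ceil(MAX_CHUNKS_TO_ANALYZE / 15) = ceil(200 / 15) = 14 matches.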
for (const category of categories) {
for (const query of broadQueries) {
if (allChunks.length >= MAX_CHUNKS_TO_ANALYZE) break;
try {
const result = await this.vectorIndex.search(query, {
topK: Math.ceil(MAX_CHUNKS_TO_ANALYZE / (broadQueries.length * categories.length)),
minScore: 0.1,
category,
});
if (result.success) {
for (const match of result.matches) {
// Deduplicate by content prefix (first 100 chars)
const contentKey = match.content.substring(0, 100);
if (!seenContent.has(contentKey)) {
seenContent.add(contentKey);
allChunks.push(match);
}
if (allChunks.length >= MAX_CHUNKS_TO_ANALYZE) break;
}
}
} catch {
// Search failed for this query/category, continue
}
}
}
return allChunks;
}
/**
* Cluster chunks by vector similarity using HNSW neighbor search.
*
* For each unprocessed chunk, search for its nearest neighbors.
* Chunks with cosine similarity > PATTERN_SIMILARITY_THRESHOLD are grouped together.
* Uses a greedy single-pass grouping with a visited set, so each chunk joins
* at most one group.
*/
private async clusterChunksByVector(
chunks: VectorSearchMatch[]
): Promise<VectorSearchMatch[][]> {
if (!this.vectorIndex) return [];
const groups: VectorSearchMatch[][] = [];
const processed = new Set<number>();
for (let i = 0; i < chunks.length; i++) {
if (processed.has(i)) continue;
const seedChunk = chunks[i];
const group: VectorSearchMatch[] = [seedChunk];
processed.add(i);
// Search for neighbors of this chunk's content
try {
const neighbors = await this.vectorIndex.search(seedChunk.content, {
topK: NEIGHBOR_TOP_K,
minScore: PATTERN_SIMILARITY_THRESHOLD,
});
if (neighbors.success) {
for (const neighbor of neighbors.matches) {
// Skip self-matches
if (neighbor.content === seedChunk.content) continue;
// Find this neighbor in our chunk list
for (let j = 0; j < chunks.length; j++) {
if (processed.has(j)) continue;
if (
chunks[j].source_id === neighbor.source_id &&
chunks[j].chunk_index === neighbor.chunk_index
) {
group.push({ ...chunks[j], score: neighbor.score });
processed.add(j);
break;
}
}
// Also include neighbors not in our original list
if (neighbor.source_id && neighbor.source_id !== seedChunk.source_id) {
// Check if already in group by source_id
const alreadyInGroup = group.some(
g => g.source_id === neighbor.source_id && g.chunk_index === neighbor.chunk_index
);
if (!alreadyInGroup) {
group.push(neighbor);
}
}
}
}
} catch {
// HNSW search failed, skip this chunk's neighborhood
}
// Only keep groups with chunks from multiple sources
const uniqueSources = new Set(group.map(c => c.source_id));
if (uniqueSources.size >= 2) {
groups.push(group);
}
}
return groups;
}
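// Hypothetical trace: with chunks [A1, B1, C1, A2] where A1/B1/C1 are
// near-duplicates (cosine > 0.85) from sessions A, B, and C, seed A1 pulls
// B1 and C1 into one group (3 distinct sources, kept), while A2 seeds a
// single-source group that the uniqueSources.size >= 2 filter drops.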
/**
* Derive a human-readable pattern name from a group of similar chunks.
* Extracts common keywords/phrases from the representative content.
*/
private derivePatternName(group: VectorSearchMatch[]): string {
// Extended stopwords including generic tech terms
const stopwords = new Set([
'the', 'and', 'for', 'that', 'this', 'with', 'from', 'have', 'will',
'are', 'was', 'were', 'been', 'what', 'when', 'where', 'which',
'there', 'their', 'they', 'them', 'then', 'than', 'into', 'some',
'code', 'file', 'function', 'class', 'import', 'export', 'const',
'async', 'await', 'return', 'type', 'interface', 'string', 'number',
'true', 'false', 'null', 'undefined', 'object', 'array', 'value',
'data', 'result', 'error', 'name', 'path', 'index', 'item', 'list',
'should', 'would', 'could', 'does', 'make', 'like', 'just', 'also',
'used', 'using', 'each', 'other', 'more', 'only', 'need', 'very',
]);
const isSignificant = (w: string) => w.length >= 4 && !stopwords.has(w);
// Count word and bigram frequency across all chunks
const wordFreq = new Map<string, number>();
const bigramFreq = new Map<string, number>();
for (const chunk of group) {
const words = chunk.content.toLowerCase().split(/[\s\W]+/).filter(isSignificant);
const uniqueWords = new Set(words);
for (const word of uniqueWords) {
wordFreq.set(word, (wordFreq.get(word) || 0) + 1);
}
// Extract bigrams from consecutive significant words
for (let i = 0; i < words.length - 1; i++) {
const bigram = `${words[i]}-${words[i + 1]}`;
bigramFreq.set(bigram, (bigramFreq.get(bigram) || 0) + 1);
}
}
// Prefer bigrams that appear in multiple chunks
const topBigrams = Array.from(bigramFreq.entries())
.filter(([, count]) => count >= 2)
.sort((a, b) => b[1] - a[1]);
if (topBigrams.length > 0) {
// Use top bigram, optionally append a distinguishing single word
const name = topBigrams[0][0];
const bigramWords = new Set(name.split('-'));
const extra = Array.from(wordFreq.entries())
.filter(([w, count]) => count >= 2 && !bigramWords.has(w))
.sort((a, b) => b[1] - a[1]);
if (extra.length > 0) {
const candidate = `${name}-${extra[0][0]}`;
return candidate.length <= 50 ? candidate : name;
}
return name;
}
// Fallback to top single words
const topWords = Array.from(wordFreq.entries())
.sort((a, b) => b[1] - a[1])
.slice(0, 3)
.map(([w]) => w);
if (topWords.length >= 2) {
const name = topWords.join('-');
return name.length <= 50 ? name : topWords.slice(0, 2).join('-');
} else if (topWords.length === 1) {
return topWords[0];
}
return 'unnamed-pattern';
}
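// Worked example for derivePatternName (hypothetical chunks): if all three
// chunks contain the phrase "retry backoff" and two also mention "network",
// the bigram "retry-backoff" wins (count 3) and "network" (count 2) is
// appended, producing the pattern name "retry-backoff-network".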
/**
* Solidify a detected pattern by writing it to CoreMemory and skills/*.md.
*
* Creates:
* 1. A CoreMemory entry with the pattern content and metadata
* 2. A skills/{pattern_slug}.md file with the pattern documentation
*
* Skill-file write failures are logged and suppressed; CoreMemory write
* failures propagate to the caller, which catches and logs them.
*
* @param pattern - The detected pattern to solidify
* @returns Result with memory ID and skill file path
*/
async solidifyPattern(pattern: DetectedPattern): Promise<SolidifyResult> {
// 1. Create CoreMemory entry
const memoryContent = this.buildPatternMemoryContent(pattern);
const memory = this.coreMemoryStore.upsertMemory({
content: memoryContent,
summary: `Detected pattern: ${pattern.name} (${pattern.sessionCount} sessions, confidence: ${pattern.confidence})`,
metadata: JSON.stringify({
type: 'detected_pattern',
pattern_id: pattern.id,
pattern_name: pattern.name,
session_count: pattern.sessionCount,
confidence: pattern.confidence,
source_ids: pattern.sourceIds,
detected_at: new Date().toISOString(),
}),
});
// 2. Write skills file
let skillPath: string | null = null;
try {
const slug = pattern.name
.toLowerCase()
.replace(/[^a-z0-9]+/g, '-')
.replace(/^-|-$/g, '')
.substring(0, 50);
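// e.g. a name like "Retry/Backoff (network)" slugs to "retry-backoff-network",
// written to .claude/skills/retry-backoff-network.md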
const skillsDir = join(this.projectPath, '.claude', 'skills');
if (!existsSync(skillsDir)) {
mkdirSync(skillsDir, { recursive: true });
}
skillPath = join(skillsDir, `${slug}.md`);
const skillContent = this.buildSkillContent(pattern);
writeFileSync(skillPath, skillContent, 'utf-8');
} catch (err) {
console.warn(
`[PatternDetector] Failed to write skill file for ${pattern.name}:`,
(err as Error).message
);
skillPath = null;
}
console.log(
`[PatternDetector] Solidified pattern '${pattern.name}' -> memory=${memory.id}, skill=${skillPath || 'none'}`
);
return { memoryId: memory.id, skillPath };
}
/**
* Build CoreMemory content for a detected pattern.
*/
private buildPatternMemoryContent(pattern: DetectedPattern): string {
const lines: string[] = [
`# Detected Pattern: ${pattern.name}`,
'',
`**Confidence**: ${pattern.confidence}`,
`**Sessions**: ${pattern.sessionCount} (${pattern.sourceIds.join(', ')})`,
`**Category**: ${pattern.category}`,
`**Avg Similarity**: ${pattern.avgSimilarity}`,
'',
'## Representative Content',
'',
pattern.representative,
'',
'## Usage',
'',
'This pattern was automatically detected across multiple sessions.',
'It represents a recurring approach or concept in this project.',
];
return lines.join('\n');
}
/**
* Build skill file content for a detected pattern.
*/
private buildSkillContent(pattern: DetectedPattern): string {
const lines: string[] = [
`# ${pattern.name}`,
'',
`> Auto-detected pattern (confidence: ${pattern.confidence}, sessions: ${pattern.sessionCount})`,
'',
'## Description',
'',
pattern.representative,
'',
'## Context',
'',
`This pattern was detected across ${pattern.sessionCount} sessions:`,
...pattern.sourceIds.map(id => `- ${id}`),
'',
'## When to Apply',
'',
'Apply this pattern when working on similar tasks or encountering related concepts.',
'',
'---',
`*Auto-generated by PatternDetector on ${new Date().toISOString()}*`,
];
return lines.join('\n');
}
}