Mirror of https://github.com/catlog22/Claude-Code-Workflow.git (synced 2026-03-14 17:41:22 +08:00)
486 lines
16 KiB
TypeScript
/**
 * Pattern Detector - Detects recurring content patterns across sessions
 *
 * Uses vector clustering (cosine similarity > 0.85) to group semantically similar
 * chunks into patterns. Patterns appearing in N >= 3 distinct sessions are flagged
 * as candidates. High-confidence patterns (>= 0.8) are solidified into CoreMemory
 * and skills/*.md files.
 */

import { CoreMemoryStore, getCoreMemoryStore } from './core-memory-store.js';
import { UnifiedVectorIndex, isUnifiedEmbedderAvailable } from './unified-vector-index.js';
import type { VectorSearchMatch } from './unified-vector-index.js';
import { existsSync, mkdirSync, writeFileSync } from 'fs';
import { join } from 'path';
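
// Usage sketch (illustrative, not part of this module's exports): assumes a
// project whose unified vector index has already been populated. Hypothetical
// caller code:
//
//   const detector = new PatternDetector('/path/to/project');
//   const result = await detector.detectPatterns();
//   console.log(`${result.patterns.length} patterns, ${result.solidified.length} solidified`);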

// -- Constants --

/** Minimum cosine similarity to group chunks into the same pattern */
const PATTERN_SIMILARITY_THRESHOLD = 0.85;

/** Minimum number of distinct sessions a pattern must appear in */
const MIN_SESSION_FREQUENCY = 3;

/** Confidence threshold for auto-solidification */
const SOLIDIFY_CONFIDENCE_THRESHOLD = 0.8;

/** Maximum number of chunks to analyze per detection run */
const MAX_CHUNKS_TO_ANALYZE = 200;

/** Top-K neighbors to search per chunk during clustering */
const NEIGHBOR_TOP_K = 15;

// -- Types --

export interface DetectedPattern {
  /** Unique pattern identifier */
  id: string;
  /** Human-readable pattern name derived from content */
  name: string;
  /** Representative content snippet */
  representative: string;
  /** Source IDs (sessions) where this pattern appears */
  sourceIds: string[];
  /** Number of distinct sessions */
  sessionCount: number;
  /** Average similarity score within the pattern group */
  avgSimilarity: number;
  /** Confidence score (0-1), based on frequency and similarity */
  confidence: number;
  /** Category of the chunks in this pattern */
  category: string;
}

export interface PatternDetectionResult {
  /** All detected patterns */
  patterns: DetectedPattern[];
  /** Number of chunks analyzed */
  chunksAnalyzed: number;
  /** Patterns that were solidified (written to CoreMemory + skills) */
  solidified: string[];
  /** Elapsed time in ms */
  elapsedMs: number;
}

export interface SolidifyResult {
  memoryId: string;
  skillPath: string | null;
}

// -- PatternDetector --

export class PatternDetector {
  private projectPath: string;
  private coreMemoryStore: CoreMemoryStore;
  private vectorIndex: UnifiedVectorIndex | null = null;

  constructor(projectPath: string) {
    this.projectPath = projectPath;
    this.coreMemoryStore = getCoreMemoryStore(projectPath);

    if (isUnifiedEmbedderAvailable()) {
      this.vectorIndex = new UnifiedVectorIndex(projectPath);
    }
  }

  /**
   * Detect recurring patterns across sessions by vector clustering.
   *
   * Algorithm:
   * 1. Get representative chunks from VectorStore (via search with broad queries)
   * 2. For each chunk, search HNSW for nearest neighbors (cosine > PATTERN_SIMILARITY_THRESHOLD)
   * 3. Group chunks with high mutual similarity into pattern clusters
   * 4. Count distinct source_ids per cluster (session frequency)
   * 5. Patterns with sessionCount >= MIN_SESSION_FREQUENCY become candidates
   *
   * @returns Detection result with candidate patterns
   */
  async detectPatterns(): Promise<PatternDetectionResult> {
    const startTime = Date.now();
    const result: PatternDetectionResult = {
      patterns: [],
      chunksAnalyzed: 0,
      solidified: [],
      elapsedMs: 0,
    };

    if (!this.vectorIndex) {
      result.elapsedMs = Date.now() - startTime;
      return result;
    }

    // Step 1: Gather chunks from the vector store via broad category searches
    const allChunks = await this.gatherChunksForAnalysis();
    result.chunksAnalyzed = allChunks.length;

    if (allChunks.length < MIN_SESSION_FREQUENCY) {
      result.elapsedMs = Date.now() - startTime;
      return result;
    }

    // Step 2: Cluster chunks by vector similarity
    const patternGroups = await this.clusterChunksByVector(allChunks);

    // Step 3: Filter by session frequency and build DetectedPattern objects
    for (const group of patternGroups) {
      const uniqueSources = new Set(group.map(c => c.source_id));
      if (uniqueSources.size < MIN_SESSION_FREQUENCY) continue;

      const avgSim = group.reduce((sum, c) => sum + c.score, 0) / group.length;

      // Confidence: combines frequency (normalized) and avg similarity
      const frequencyScore = Math.min(uniqueSources.size / 10, 1.0);
      const confidence = avgSim * 0.6 + frequencyScore * 0.4;
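      // Worked example (illustrative numbers): avgSim = 0.9 across 5 sessions
      // gives frequencyScore = 0.5, so confidence = 0.9 * 0.6 + 0.5 * 0.4 = 0.74.
      // Crossing the 0.8 solidification threshold needs roughly avgSim >= 0.95
      // with 7+ sessions (0.95 * 0.6 + 0.7 * 0.4 = 0.85).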

      const representative = group[0]; // Seed chunk of this cluster
      const patternName = this.derivePatternName(group);
      const patternId = `PAT-${Date.now()}-${Math.random().toString(36).substring(2, 6)}`;

      result.patterns.push({
        id: patternId,
        name: patternName,
        representative: representative.content.substring(0, 500),
        sourceIds: Array.from(uniqueSources),
        sessionCount: uniqueSources.size,
        avgSimilarity: Math.round(avgSim * 1000) / 1000,
        confidence: Math.round(confidence * 1000) / 1000,
        category: representative.category || 'unknown',
      });
    }

    // Sort by confidence descending
    result.patterns.sort((a, b) => b.confidence - a.confidence);

    // Step 4: Auto-solidify high-confidence patterns (errors are logged, not propagated)
    for (const pattern of result.patterns) {
      if (pattern.confidence >= SOLIDIFY_CONFIDENCE_THRESHOLD) {
        try {
          await this.solidifyPattern(pattern);
          result.solidified.push(pattern.id);
        } catch (err) {
          console.warn(
            `[PatternDetector] Failed to solidify pattern ${pattern.id}:`,
            (err as Error).message
          );
        }
      }
    }

    result.elapsedMs = Date.now() - startTime;
    return result;
  }
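
  // Illustrative detectPatterns() result (hypothetical values):
  //   {
  //     patterns: [{ id: 'PAT-...', name: 'retry-backoff', sessionCount: 7,
  //                  avgSimilarity: 0.91, confidence: 0.826, ... }],
  //     chunksAnalyzed: 180,
  //     solidified: ['PAT-...'],
  //     elapsedMs: 412,
  //   }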

  /**
   * Gather a representative set of chunks for pattern analysis.
   * Uses broad search queries across categories to collect diverse chunks.
   */
  private async gatherChunksForAnalysis(): Promise<VectorSearchMatch[]> {
    if (!this.vectorIndex) return [];

    const allChunks: VectorSearchMatch[] = [];
    const seenContent = new Set<string>();

    // Search across common categories with broad queries
    const broadQueries = [
      'implementation pattern',
      'configuration setup',
      'error handling',
      'testing approach',
      'workflow process',
    ];

    const categories = ['core_memory', 'cli_history', 'workflow'] as const;

    for (const category of categories) {
      for (const query of broadQueries) {
        if (allChunks.length >= MAX_CHUNKS_TO_ANALYZE) break;

        try {
          const result = await this.vectorIndex.search(query, {
            topK: Math.ceil(MAX_CHUNKS_TO_ANALYZE / (broadQueries.length * categories.length)),
            minScore: 0.1,
            category,
          });

          if (result.success) {
            for (const match of result.matches) {
              // Deduplicate by content prefix (first 100 chars)
              const contentKey = match.content.substring(0, 100);
              if (!seenContent.has(contentKey)) {
                seenContent.add(contentKey);
                allChunks.push(match);
              }
              if (allChunks.length >= MAX_CHUNKS_TO_ANALYZE) break;
            }
          }
        } catch {
          // Search failed for this query/category, continue
        }
      }
    }

    return allChunks;
  }

  /**
   * Cluster chunks by vector similarity using HNSW neighbor search.
   *
   * For each unprocessed chunk, search for its nearest neighbors.
   * Chunks with cosine similarity > PATTERN_SIMILARITY_THRESHOLD are grouped together.
   * Uses a union-find-like approach via visited tracking.
   */
  private async clusterChunksByVector(
    chunks: VectorSearchMatch[]
  ): Promise<VectorSearchMatch[][]> {
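    // Greedy single-pass clustering: each unprocessed chunk seeds a group and
    // absorbs its above-threshold neighbors, so the cost is one HNSW search
    // per seed chunk (at most n searches for n chunks).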
    if (!this.vectorIndex) return [];

    const groups: VectorSearchMatch[][] = [];
    const processed = new Set<number>();

    for (let i = 0; i < chunks.length; i++) {
      if (processed.has(i)) continue;

      const seedChunk = chunks[i];
      const group: VectorSearchMatch[] = [seedChunk];
      processed.add(i);

      // Search for neighbors of this chunk's content
      try {
        const neighbors = await this.vectorIndex.search(seedChunk.content, {
          topK: NEIGHBOR_TOP_K,
          minScore: PATTERN_SIMILARITY_THRESHOLD,
        });

        if (neighbors.success) {
          for (const neighbor of neighbors.matches) {
            // Skip self-matches
            if (neighbor.content === seedChunk.content) continue;

            // Find this neighbor in our chunk list
            for (let j = 0; j < chunks.length; j++) {
              if (processed.has(j)) continue;
              if (
                chunks[j].source_id === neighbor.source_id &&
                chunks[j].chunk_index === neighbor.chunk_index
              ) {
                group.push({ ...chunks[j], score: neighbor.score });
                processed.add(j);
                break;
              }
            }

            // Also include neighbors not in our original list
            if (neighbor.source_id && neighbor.source_id !== seedChunk.source_id) {
              // Check if already in group by source_id
              const alreadyInGroup = group.some(
                g => g.source_id === neighbor.source_id && g.chunk_index === neighbor.chunk_index
              );
              if (!alreadyInGroup) {
                group.push(neighbor);
              }
            }
          }
        }
      } catch {
        // HNSW search failed, skip this chunk's neighborhood
      }

      // Only keep groups with chunks from multiple sources
      const uniqueSources = new Set(group.map(c => c.source_id));
      if (uniqueSources.size >= 2) {
        groups.push(group);
      }
    }

    return groups;
  }

  /**
   * Derive a human-readable pattern name from a group of similar chunks.
   * Extracts common keywords/phrases from the representative content.
   */
  private derivePatternName(group: VectorSearchMatch[]): string {
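    // Worked example (hypothetical content): if several chunks repeatedly
    // contain "retry backoff", the bigram "retry-backoff" wins; a frequent
    // additional word such as "network" may be appended, yielding
    // "retry-backoff-network".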
    // Extended stopwords including generic tech terms
    const stopwords = new Set([
      'the', 'and', 'for', 'that', 'this', 'with', 'from', 'have', 'will',
      'are', 'was', 'were', 'been', 'what', 'when', 'where', 'which',
      'there', 'their', 'they', 'them', 'then', 'than', 'into', 'some',
      'code', 'file', 'function', 'class', 'import', 'export', 'const',
      'async', 'await', 'return', 'type', 'interface', 'string', 'number',
      'true', 'false', 'null', 'undefined', 'object', 'array', 'value',
      'data', 'result', 'error', 'name', 'path', 'index', 'item', 'list',
      'should', 'would', 'could', 'does', 'make', 'like', 'just', 'also',
      'used', 'using', 'each', 'other', 'more', 'only', 'need', 'very',
    ]);

    const isSignificant = (w: string) => w.length >= 4 && !stopwords.has(w);

    // Count word and bigram frequency across all chunks
    const wordFreq = new Map<string, number>();
    const bigramFreq = new Map<string, number>();

    for (const chunk of group) {
      const words = chunk.content.toLowerCase().split(/[\s\W]+/).filter(isSignificant);
      const uniqueWords = new Set(words);
      for (const word of uniqueWords) {
        wordFreq.set(word, (wordFreq.get(word) || 0) + 1);
      }

      // Extract bigrams from consecutive significant words
      for (let i = 0; i < words.length - 1; i++) {
        const bigram = `${words[i]}-${words[i + 1]}`;
        bigramFreq.set(bigram, (bigramFreq.get(bigram) || 0) + 1);
      }
    }

    // Prefer bigrams that recur (total count >= 2 across the group)
    const topBigrams = Array.from(bigramFreq.entries())
      .filter(([, count]) => count >= 2)
      .sort((a, b) => b[1] - a[1]);

    if (topBigrams.length > 0) {
      // Use top bigram, optionally append a distinguishing single word
      const name = topBigrams[0][0];
      const bigramWords = new Set(name.split('-'));
      const extra = Array.from(wordFreq.entries())
        .filter(([w, count]) => count >= 2 && !bigramWords.has(w))
        .sort((a, b) => b[1] - a[1]);
      if (extra.length > 0) {
        const candidate = `${name}-${extra[0][0]}`;
        return candidate.length <= 50 ? candidate : name;
      }
      return name;
    }

    // Fallback to top single words
    const topWords = Array.from(wordFreq.entries())
      .sort((a, b) => b[1] - a[1])
      .slice(0, 3)
      .map(([w]) => w);

    if (topWords.length >= 2) {
      const name = topWords.join('-');
      return name.length <= 50 ? name : topWords.slice(0, 2).join('-');
    } else if (topWords.length === 1) {
      return topWords[0];
    }

    return 'unnamed-pattern';
  }

  /**
   * Solidify a detected pattern by writing it to CoreMemory and skills/*.md.
   *
   * Creates:
   * 1. A CoreMemory entry with the pattern content and metadata
   * 2. A skills/{pattern_slug}.md file with the pattern documentation
   *
   * Skill-file write errors are logged and swallowed; CoreMemory failures
   * propagate to the caller (which logs them without aborting detection).
   *
   * @param pattern - The detected pattern to solidify
   * @returns Result with memory ID and skill file path
   */
  async solidifyPattern(pattern: DetectedPattern): Promise<SolidifyResult> {
    // 1. Create CoreMemory entry
    const memoryContent = this.buildPatternMemoryContent(pattern);
    const memory = this.coreMemoryStore.upsertMemory({
      content: memoryContent,
      summary: `Detected pattern: ${pattern.name} (${pattern.sessionCount} sessions, confidence: ${pattern.confidence})`,
      metadata: JSON.stringify({
        type: 'detected_pattern',
        pattern_id: pattern.id,
        pattern_name: pattern.name,
        session_count: pattern.sessionCount,
        confidence: pattern.confidence,
        source_ids: pattern.sourceIds,
        detected_at: new Date().toISOString(),
      }),
    });

    // 2. Write skills file
    let skillPath: string | null = null;
    try {
      const slug = pattern.name
        .toLowerCase()
        .replace(/[^a-z0-9]+/g, '-')
        .replace(/^-|-$/g, '')
        .substring(0, 50);
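      // e.g. a hypothetical pattern name "Retry/Backoff Strategy!" slugifies
      // to "retry-backoff-strategy".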

      const skillsDir = join(this.projectPath, '.claude', 'skills');
      if (!existsSync(skillsDir)) {
        mkdirSync(skillsDir, { recursive: true });
      }

      skillPath = join(skillsDir, `${slug}.md`);
      const skillContent = this.buildSkillContent(pattern);
      writeFileSync(skillPath, skillContent, 'utf-8');
    } catch (err) {
      console.warn(
        `[PatternDetector] Failed to write skill file for ${pattern.name}:`,
        (err as Error).message
      );
      skillPath = null;
    }

    console.log(
      `[PatternDetector] Solidified pattern '${pattern.name}' -> memory=${memory.id}, skill=${skillPath || 'none'}`
    );

    return { memoryId: memory.id, skillPath };
  }

  /**
   * Build CoreMemory content for a detected pattern.
   */
  private buildPatternMemoryContent(pattern: DetectedPattern): string {
    const lines: string[] = [
      `# Detected Pattern: ${pattern.name}`,
      '',
      `**Confidence**: ${pattern.confidence}`,
      `**Sessions**: ${pattern.sessionCount} (${pattern.sourceIds.join(', ')})`,
      `**Category**: ${pattern.category}`,
      `**Avg Similarity**: ${pattern.avgSimilarity}`,
      '',
      '## Representative Content',
      '',
      pattern.representative,
      '',
      '## Usage',
      '',
      'This pattern was automatically detected across multiple sessions.',
      'It represents a recurring approach or concept in this project.',
    ];

    return lines.join('\n');
  }

  /**
   * Build skill file content for a detected pattern.
   */
  private buildSkillContent(pattern: DetectedPattern): string {
    const lines: string[] = [
      `# ${pattern.name}`,
      '',
      `> Auto-detected pattern (confidence: ${pattern.confidence}, sessions: ${pattern.sessionCount})`,
      '',
      '## Description',
      '',
      pattern.representative,
      '',
      '## Context',
      '',
      `This pattern was detected across ${pattern.sessionCount} sessions:`,
      ...pattern.sourceIds.map(id => `- ${id}`),
      '',
      '## When to Apply',
      '',
      'Apply this pattern when working on similar tasks or encountering related concepts.',
      '',
      '---',
      `*Auto-generated by PatternDetector on ${new Date().toISOString()}*`,
    ];

    return lines.join('\n');
  }
}