class MemoryEmbedder:
    """Generate and search embeddings for memory chunks.

    Wraps a SQLite connection to a database that contains a ``memory_chunks``
    table, plus an embedder obtained from ``get_embedder(profile="code")``.
    Embeddings are stored in the ``embedding`` column as raw float32 bytes.

    Instances can be used as context managers; the connection is closed on
    exit.
    """

    EMBEDDING_DIM = 768  # jina-embeddings-v2-base-code dimension

    def __init__(self, db_path: str):
        """Open the database at *db_path* and load the embedder.

        Raises:
            FileNotFoundError: If *db_path* does not exist.
        """
        self.db_path = Path(db_path)
        if not self.db_path.exists():
            raise FileNotFoundError(f"Database not found: {db_path}")

        self.conn = sqlite3.connect(str(self.db_path))
        self.conn.row_factory = sqlite3.Row

        # Initialize embedder (uses cached singleton). Do not leak the
        # connection if the embedder cannot be created.
        try:
            self.embedder = get_embedder(profile="code")
        except Exception:
            self.conn.close()
            raise

    def __enter__(self) -> "MemoryEmbedder":
        """Return the instance itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """Close the database connection when leaving a ``with`` block."""
        self.close()

    def close(self):
        """Close database connection."""
        if self.conn is not None:
            self.conn.close()

    def embed_chunks(
        self,
        source_id: Optional[str] = None,
        batch_size: int = 8,
        force: bool = False
    ) -> Dict[str, Any]:
        """
        Generate embeddings for unembedded chunks.

        Args:
            source_id: Only process chunks from this source
            batch_size: Number of chunks to process in each batch
                (values below 1 are treated as 1)
            force: Re-embed chunks that already have embeddings

        Returns:
            Result dict with success, chunks_processed, chunks_failed, elapsed_time
        """
        start_time = time.time()
        step = max(1, batch_size)

        # Build query: optionally restrict to pending chunks and/or one source.
        conditions = []
        params = []
        if not force:
            conditions.append("embedding IS NULL")
        if source_id:
            conditions.append("source_id = ?")
            params.append(source_id)

        query = "SELECT id, content FROM memory_chunks"
        if conditions:
            query += " WHERE " + " AND ".join(conditions)
        query += " ORDER BY id"

        # Materialize the work list before writing: the batches below UPDATE
        # and COMMIT the same table on the same connection, and SQLite does
        # not define what a still-running SELECT sees in that situation.
        rows = self.conn.execute(query, params).fetchall()

        chunks_processed = 0
        chunks_failed = 0
        for start in range(0, len(rows), step):
            window = rows[start:start + step]
            batch_ids = [row[0] for row in window]
            batch = [row[1] for row in window]
            processed, failed = self._process_batch(batch, batch_ids)
            chunks_processed += processed
            chunks_failed += failed

        elapsed_time = time.time() - start_time

        return {
            "success": chunks_failed == 0,
            "chunks_processed": chunks_processed,
            "chunks_failed": chunks_failed,
            "elapsed_time": round(elapsed_time, 2)
        }

    def _process_batch(self, texts: List[str], ids: List[int]) -> Tuple[int, int]:
        """Embed *texts* and store each vector on the chunk with the matching id.

        Returns:
            ``(processed, failed)`` counts. If the embedder or the commit
            fails, the batch is rolled back and reported as ``(0, len(ids))``.
        """
        try:
            # Generate embeddings for batch
            embeddings = list(self.embedder.embed(texts))

            processed = 0
            # zip() below stops at the shorter sequence; chunks for which the
            # embedder returned nothing must be counted as failures.
            failed = max(0, len(ids) - len(embeddings))

            # Update database
            cursor = self.conn.cursor()
            for chunk_id, embedding in zip(ids, embeddings):
                try:
                    # Convert to numpy array and store as bytes
                    emb_bytes = np.asarray(embedding, dtype=np.float32).tobytes()
                    cursor.execute(
                        "UPDATE memory_chunks SET embedding = ? WHERE id = ?",
                        (emb_bytes, chunk_id)
                    )
                    processed += 1
                except Exception as e:
                    print(f"Error updating chunk {chunk_id}: {e}", file=sys.stderr)
                    failed += 1

            self.conn.commit()
            return processed, failed

        except Exception as e:
            print(f"Error processing batch: {e}", file=sys.stderr)
            # Discard partial writes so the database matches the reported
            # "0 processed" outcome.
            self.conn.rollback()
            return 0, len(ids)

    def search(
        self,
        query: str,
        top_k: int = 10,
        min_score: float = 0.3,
        source_type: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Perform semantic search on memory chunks.

        Stored embeddings that cannot be compared with the query vector
        (undecodable blob, different dimension, zero norm) are skipped and a
        single warning is written to stderr.

        Args:
            query: Search query text
            top_k: Number of results to return (values below 0 are treated as 0)
            min_score: Minimum similarity score (0-1)
            source_type: Filter by source type (core_memory, workflow, cli_history)

        Returns:
            Result dict with success and matches list
        """
        try:
            # Generate query embedding; its norm is loop-invariant.
            query_array = np.asarray(self.embedder.embed_single(query), dtype=np.float32)
            query_norm = float(np.linalg.norm(query_array))
            if query_norm == 0.0:
                # Cosine similarity is undefined for a zero vector.
                return {"success": True, "matches": []}

            # Build database query
            sql = """
                SELECT id, source_id, source_type, chunk_index, content, embedding
                FROM memory_chunks
                WHERE embedding IS NOT NULL
            """
            params = []

            if source_type:
                sql += " AND source_type = ?"
                params.append(source_type)

            cursor = self.conn.cursor()
            cursor.execute(sql, params)

            # Calculate similarities
            matches = []
            skipped = 0
            for row in cursor:
                # Load embedding from bytes
                try:
                    emb_array = np.frombuffer(row["embedding"], dtype=np.float32)
                except (TypeError, ValueError):
                    skipped += 1
                    continue

                if emb_array.shape != query_array.shape:
                    skipped += 1
                    continue

                emb_norm = float(np.linalg.norm(emb_array))
                if emb_norm == 0.0:
                    skipped += 1
                    continue

                # Cosine similarity
                score = float(np.dot(query_array, emb_array) / (query_norm * emb_norm))

                if score >= min_score:
                    matches.append({
                        "source_id": row["source_id"],
                        "source_type": row["source_type"],
                        "chunk_index": row["chunk_index"],
                        "content": row["content"],
                        "score": round(score, 4),
                        "restore_command": self._get_restore_command(
                            row["source_id"],
                            row["source_type"]
                        )
                    })

            if skipped:
                print(
                    f"Warning: skipped {skipped} chunk(s) with unusable embeddings",
                    file=sys.stderr
                )

            # Sort by score and limit
            matches.sort(key=lambda x: x["score"], reverse=True)
            matches = matches[:max(0, top_k)]

            return {
                "success": True,
                "matches": matches
            }

        except Exception as e:
            return {
                "success": False,
                "error": str(e),
                "matches": []
            }

    def _get_restore_command(self, source_id: str, source_type: str) -> str:
        """Generate restore command for a source."""
        if source_type in ("core_memory", "cli_history"):
            return f"ccw memory export {source_id}"
        elif source_type == "workflow":
            return f"ccw session resume {source_id}"
        else:
            return f"# Unknown source type: {source_type}"

    def get_status(self) -> Dict[str, Any]:
        """Get embedding status statistics.

        Returns:
            Dict with total_chunks, embedded_chunks, pending_chunks and a
            by_type mapping of source type to total/embedded/pending counts.
        """
        cursor = self.conn.cursor()

        # Total chunks
        cursor.execute("SELECT COUNT(*) as count FROM memory_chunks")
        total_chunks = cursor.fetchone()["count"]

        # Embedded chunks
        cursor.execute("SELECT COUNT(*) as count FROM memory_chunks WHERE embedding IS NOT NULL")
        embedded_chunks = cursor.fetchone()["count"]

        # By type
        cursor.execute("""
            SELECT
                source_type,
                COUNT(*) as total,
                SUM(CASE WHEN embedding IS NOT NULL THEN 1 ELSE 0 END) as embedded
            FROM memory_chunks
            GROUP BY source_type
        """)

        by_type = {}
        for row in cursor:
            by_type[row["source_type"]] = {
                "total": row["total"],
                "embedded": row["embedded"],
                "pending": row["total"] - row["embedded"]
            }

        return {
            "total_chunks": total_chunks,
            "embedded_chunks": embedded_chunks,
            "pending_chunks": total_chunks - embedded_chunks,
            "by_type": by_type
        }
def main():
    """Main entry point.

    Parses the command line (``embed``, ``search`` or ``status``, each taking
    the database path as first positional argument), runs the command and
    prints its result as JSON on stdout. On an exception a JSON error object
    is printed to stderr. The process exits with status 1 on an exception or
    when the result reports ``success`` as false.
    """
    parser = argparse.ArgumentParser(
        description="Memory Embedder - Bridge CCW to CodexLens semantic search"
    )

    subparsers = parser.add_subparsers(dest="command", help="Command to execute")
    subparsers.required = True

    # Embed command
    embed_parser = subparsers.add_parser("embed", help="Generate embeddings for chunks")
    embed_parser.add_argument("db_path", help="Path to SQLite database")
    embed_parser.add_argument("--source-id", help="Only process chunks from this source")
    embed_parser.add_argument("--batch-size", type=int, default=8, help="Batch size (default: 8)")
    embed_parser.add_argument("--force", action="store_true", help="Re-embed existing chunks")

    # Search command
    search_parser = subparsers.add_parser("search", help="Semantic search")
    search_parser.add_argument("db_path", help="Path to SQLite database")
    search_parser.add_argument("query", help="Search query")
    search_parser.add_argument("--top-k", type=int, default=10, help="Number of results (default: 10)")
    search_parser.add_argument("--min-score", type=float, default=0.3, help="Minimum score (default: 0.3)")
    search_parser.add_argument("--type", dest="source_type", help="Filter by source type")

    # Status command
    status_parser = subparsers.add_parser("status", help="Get embedding status")
    status_parser.add_argument("db_path", help="Path to SQLite database")

    args = parser.parse_args()

    try:
        embedder = MemoryEmbedder(args.db_path)
        try:
            if args.command == "embed":
                result = embedder.embed_chunks(
                    source_id=args.source_id,
                    batch_size=args.batch_size,
                    force=args.force
                )
            elif args.command == "search":
                result = embedder.search(
                    query=args.query,
                    top_k=args.top_k,
                    min_score=args.min_score,
                    source_type=args.source_type
                )
            else:  # "status" is the only remaining subcommand argparse accepts
                result = embedder.get_status()
        finally:
            # Release the connection even when the command raises.
            embedder.close()

        print(json.dumps(result, indent=2))

        # Exit with error code if operation failed
        # (SystemExit is not an Exception, so it is not caught below)
        if "success" in result and not result["success"]:
            sys.exit(1)

    except Exception as e:
        print(json.dumps({
            "success": False,
            "error": str(e)
        }, indent=2), file=sys.stderr)
        sys.exit(1)
/**
 * Chunk and prepare memories for embedding.
 *
 * Walks the non-archived core memories and the CLI history conversations of the
 * project, splits their text with `coreMemoryStore.chunkContent` and stores the
 * pieces as rows in the memory chunk table.
 *
 * @param projectPath Project whose stores are read and written
 * @param sourceId When given, only the memory/execution with this ID is processed
 * @param force Delete and recreate chunks for sources that are already chunked
 * @returns Number of chunks created by this call
 */
async function chunkMemoriesForEmbedding(projectPath: string, sourceId?: string, force?: boolean): Promise<number> {
  const coreMemoryStore = getCoreMemoryStore(projectPath);
  let chunksCreated = 0;

  // Decide whether a source must be (re)chunked; clears its old chunks when forcing.
  const needsChunking = (id: string): boolean => {
    const existingChunks = coreMemoryStore.getChunks(id);
    if (existingChunks.length === 0) return true;
    if (!force) return false;
    coreMemoryStore.deleteChunks(id);
    return true;
  };

  // Split content and insert all pieces in one transaction, so a failure cannot
  // leave a partially chunked source that later runs would skip as "already chunked".
  const storeChunks = (id: string, sourceType: 'core_memory' | 'cli_history', content: string): number => {
    const createdAt = new Date().toISOString();
    const pieces = coreMemoryStore.chunkContent(content, id, sourceType);
    coreMemoryStore.insertChunksBatch(
      pieces.map((piece, index) => ({
        source_id: id,
        source_type: sourceType,
        chunk_index: index,
        content: piece,
        created_at: createdAt
      }))
    );
    return pieces.length;
  };

  // 1. Chunk core memories
  const memories = coreMemoryStore.getMemories({ archived: false, limit: 1000 });
  for (const memory of memories) {
    if (sourceId && memory.id !== sourceId) continue;
    if (!needsChunking(memory.id)) continue;

    chunksCreated += storeChunks(memory.id, 'core_memory', memory.content);
  }

  // 2. Chunk CLI history
  try {
    const cliHistoryStore = new CliHistoryStore(projectPath);
    const history = cliHistoryStore.getHistory({ limit: 500 });

    for (const exec of history.executions) {
      if (sourceId && exec.id !== sourceId) continue;
      if (!needsChunking(exec.id)) continue;

      // Get conversation content
      const conversation = cliHistoryStore.getConversation(exec.id);
      if (!conversation || !conversation.turns || conversation.turns.length === 0) continue;

      // Create content from turns (output is capped at 500 characters per turn)
      const content = conversation.turns
        .map((t: any) => `Prompt: ${t.prompt}\nOutput: ${(t.stdout || '').substring(0, 500)}`)
        .join('\n---\n');

      chunksCreated += storeChunks(exec.id, 'cli_history', content);
    }
  } catch {
    // Deliberate best-effort: CLI history might not exist, continue
  }

  return chunksCreated;
}
/**
 * Generate embeddings for memory chunks.
 *
 * Validates the options, chunks memories that have not been chunked yet, then
 * asks the embedder bridge to embed the chunks and prints a per-type summary.
 * Exits the process with status 1 on any failure.
 *
 * @param options CLI options: `id` (restrict to one source), `force`, `batchSize`
 */
async function embedAction(options: EmbedCommandOptions): Promise<void> {
  const { id, force, batchSize } = options;

  try {
    // Reject a malformed batch size before touching the database
    const parsedBatchSize = batchSize ? parseInt(batchSize, 10) : 8;
    if (!Number.isInteger(parsedBatchSize) || parsedBatchSize < 1) {
      console.error(chalk.red(`\nError: Invalid --batch-size "${batchSize}" (expected a positive integer)\n`));
      process.exit(1);
    }

    // Check embedder availability
    if (!isEmbedderAvailable()) {
      console.error(chalk.red('\nError: Memory embedder not available'));
      console.error(chalk.gray('Ensure CodexLens venv exists at ~/.codexlens/venv\n'));
      process.exit(1);
    }

    const projectPath = getProjectPath();
    const paths = StoragePaths.project(projectPath);
    const dbPath = join(paths.root, 'core-memory', 'core_memory.db');

    if (!existsSync(dbPath)) {
      console.error(chalk.red('\nError: Core memory database not found'));
      console.error(chalk.gray('Create memories first using "ccw core-memory import"\n'));
      process.exit(1);
    }

    // Step 1: Chunk memories first
    console.log(chalk.cyan('Chunking memories...'));
    const chunksCreated = await chunkMemoriesForEmbedding(projectPath, id, force);
    if (chunksCreated > 0) {
      console.log(chalk.green(` Created ${chunksCreated} new chunks`));
    }

    // Step 2: Generate embeddings
    console.log(chalk.cyan('Generating embeddings...'));

    const embedOptions: EmbedOptions = {
      sourceId: id,
      force: force || false,
      batchSize: parsedBatchSize
    };

    const result = await generateEmbeddings(dbPath, embedOptions);

    if (!result.success) {
      // The embed result may carry no error text when only some chunks failed
      const reason = result.error || 'Embedding failed for one or more chunks';
      console.error(chalk.red(`\nError: ${reason}\n`));
      process.exit(1);
    }

    console.log(chalk.green(`\n✓ Processed ${result.chunks_processed} chunks in ${result.elapsed_time.toFixed(1)}s`));

    // Get status to show breakdown by type
    const status = await getEmbeddingStatus(dbPath);
    if (status.success && Object.keys(status.by_type).length > 0) {
      for (const [type, stats] of Object.entries(status.by_type)) {
        if (stats.total > 0) {
          console.log(chalk.white(` - ${type}: ${stats.embedded} chunks`));
        }
      }
    }
    console.log();

  } catch (error) {
    console.error(chalk.red(`\nError: ${(error as Error).message}\n`));
    process.exit(1);
  }
}
/**
 * Search memories using semantic search.
 *
 * Prints the matches either as JSON (`--json`) or as a numbered, human-readable
 * list with a content preview and a restore command. Exits the process with
 * status 1 when the query is missing, an option is malformed, the embedder or
 * database is unavailable, or the search fails.
 *
 * @param query Search text (required)
 * @param options CLI options: `topK`, `minScore`, `type` (source type filter), `json`
 */
async function searchEmbedAction(query: string | undefined, options: SearchCommandOptions): Promise<void> {
  if (!query) {
    console.error(chalk.red('Error: Search query is required'));
    console.error(chalk.gray('Usage: ccw memory search "<query>"'));
    process.exit(1);
  }

  const { topK = '10', type, minScore = '0.5', json } = options;

  // Report a failure in the format the caller asked for, then exit.
  const fail = (message: string): never => {
    if (json) {
      console.log(JSON.stringify({ error: message }, null, 2));
    } else {
      console.error(chalk.red(`\nError: ${message}\n`));
    }
    process.exit(1);
  };

  try {
    // Validate numeric options so NaN is never forwarded to the embedder
    const parsedTopK = parseInt(topK, 10);
    if (!Number.isInteger(parsedTopK) || parsedTopK < 1) {
      fail(`Invalid --top-k "${topK}" (expected a positive integer)`);
    }
    const parsedMinScore = parseFloat(minScore);
    if (!Number.isFinite(parsedMinScore)) {
      fail(`Invalid --min-score "${minScore}" (expected a number)`);
    }

    // Check embedder availability
    if (!isEmbedderAvailable()) {
      console.error(chalk.red('\nError: Memory embedder not available'));
      console.error(chalk.gray('Ensure CodexLens venv exists at ~/.codexlens/venv\n'));
      process.exit(1);
    }

    const projectPath = getProjectPath();
    const paths = StoragePaths.project(projectPath);
    const dbPath = join(paths.root, 'core-memory', 'core_memory.db');

    if (!existsSync(dbPath)) {
      console.error(chalk.red('\nError: Core memory database not found'));
      console.error(chalk.gray('Create memories first using "ccw core-memory import"\n'));
      process.exit(1);
    }

    const searchOptions: EmbedSearchOptions = {
      topK: parsedTopK,
      minScore: parsedMinScore,
      sourceType: type
    };

    const result = await searchMemories(dbPath, query, searchOptions);

    if (!result.success) {
      fail(String(result.error));
    }

    if (json) {
      const output = result.matches.map(m => ({
        sourceId: m.source_id,
        sourceType: m.source_type,
        score: m.score,
        content: m.content,
        restoreCommand: m.restore_command
      }));
      console.log(JSON.stringify(output, null, 2));
      return;
    }

    console.log(chalk.bold.cyan(`\nFound ${result.matches.length} matches for "${query}":\n`));

    if (result.matches.length === 0) {
      console.log(chalk.yellow('No results found. Try:'));
      console.log(chalk.gray(' - Using different keywords'));
      console.log(chalk.gray(' - Lowering --min-score threshold'));
      console.log(chalk.gray(' - Running "ccw memory embed" to generate embeddings\n'));
      return;
    }

    for (let i = 0; i < result.matches.length; i++) {
      const match = result.matches[i];
      // Keep each result on one screen line: show at most 80 characters of content
      const preview = match.content.length > 80
        ? match.content.substring(0, 80) + '...'
        : match.content;

      console.log(chalk.bold.white(`${i + 1}. [${match.score.toFixed(2)}] ${match.source_id}`) + chalk.gray(` (${match.source_type})`));
      console.log(chalk.white(` "${preview}"`));
      console.log(chalk.cyan(` → ${match.restore_command}`));
      console.log();
    }

  } catch (error) {
    fail((error as Error).message);
  }
}
/**
 * Show embedding status.
 *
 * Prints the chunk totals reported by the embedder bridge, overall and per
 * source type, either as JSON (`--json`) or as formatted text. Exits the
 * process with status 1 when the embedder or database is unavailable or the
 * status query fails.
 *
 * @param options CLI options: `json`
 */
async function embedStatusAction(options: EmbedStatusOptions): Promise<void> {
  const { json } = options;

  // Whole-number percentage, defined as 0 when there is nothing to count
  const percentOf = (part: number, whole: number): number =>
    whole > 0 ? Math.round((part / whole) * 100) : 0;

  try {
    // The status query runs through the embedder, so it must be present
    if (!isEmbedderAvailable()) {
      console.error(chalk.red('\nError: Memory embedder not available'));
      console.error(chalk.gray('Ensure CodexLens venv exists at ~/.codexlens/venv\n'));
      process.exit(1);
    }

    const storage = StoragePaths.project(getProjectPath());
    const dbPath = join(storage.root, 'core-memory', 'core_memory.db');

    if (!existsSync(dbPath)) {
      console.error(chalk.red('\nError: Core memory database not found'));
      console.error(chalk.gray('Create memories first using "ccw core-memory import"\n'));
      process.exit(1);
    }

    const status = await getEmbeddingStatus(dbPath);

    if (!status.success) {
      console.error(chalk.red(`\nError: ${status.error}\n`));
      process.exit(1);
    }

    if (json) {
      console.log(JSON.stringify(status, null, 2));
      return;
    }

    const embeddedPercent = percentOf(status.embedded_chunks, status.total_chunks);

    console.log(chalk.bold.cyan('\nEmbedding Status:'));
    console.log(chalk.white(` Total chunks: ${status.total_chunks}`));
    console.log(chalk.white(` Embedded: ${status.embedded_chunks} (${embeddedPercent}%)`));
    console.log(chalk.white(` Pending: ${status.pending_chunks}`));

    const perType = Object.entries(status.by_type);
    if (perType.length > 0) {
      console.log(chalk.bold.white('\nBy Type:'));
      perType.forEach(([type, stats]) => {
        const typePercent = percentOf(stats.embedded, stats.total);
        console.log(chalk.cyan(` ${type}: `) + chalk.white(`${stats.embedded}/${stats.total} (${typePercent}%)`));
      });
    }
    console.log();

  } catch (error) {
    const message = (error as Error).message;
    if (json) {
      console.log(JSON.stringify({ error: message }, null, 2));
    } else {
      console.error(chalk.red(`\nError: ${message}\n`));
    }
    process.exit(1);
  }
}
[args] : []); @@ -663,7 +994,12 @@ export async function memoryCommand( break; case 'search': - await searchAction(argsArray[0], options as SearchOptions); + // Check if this is semantic search (has --top-k or --min-score) or prompt history search + if ('topK' in options || 'minScore' in options) { + await searchEmbedAction(argsArray[0], options as SearchCommandOptions); + } else { + await searchAction(argsArray[0], options as SearchOptions); + } break; case 'suggest': @@ -674,6 +1010,14 @@ export async function memoryCommand( await pruneAction(options as PruneOptions); break; + case 'embed': + await embedAction(options as EmbedCommandOptions); + break; + + case 'embed-status': + await embedStatusAction(options as EmbedStatusOptions); + break; + default: console.log(chalk.bold.cyan('\n CCW Memory Module\n')); console.log(' Context tracking and prompt optimization.\n'); @@ -681,9 +1025,11 @@ export async function memoryCommand( console.log(chalk.gray(' track Track entity access (used by hooks)')); console.log(chalk.gray(' import Import Claude Code history')); console.log(chalk.gray(' stats Show hotspot statistics')); - console.log(chalk.gray(' search Search through prompt history')); + console.log(chalk.gray(' search Search through prompt history (semantic or FTS)')); console.log(chalk.gray(' suggest Get optimization suggestions')); console.log(chalk.gray(' prune Clean up old data')); + console.log(chalk.gray(' embed Generate embeddings for semantic search')); + console.log(chalk.gray(' embed-status Show embedding generation status')); console.log(); console.log(' Track Options:'); console.log(chalk.gray(' --type Entity type: file, module, topic')); @@ -701,10 +1047,24 @@ export async function memoryCommand( console.log(chalk.gray(' --sort Sort by: heat, reads, writes (default: heat)')); console.log(chalk.gray(' --json Output as JSON')); console.log(); - console.log(' Search Options:'); + console.log(' Search Options (Prompt History):'); console.log(chalk.gray(' 
--limit Number of results (default: 20)')); console.log(chalk.gray(' --json Output as JSON')); console.log(); + console.log(' Search Options (Semantic - requires embeddings):'); + console.log(chalk.gray(' --top-k Number of results (default: 10)')); + console.log(chalk.gray(' --min-score Minimum similarity score (default: 0.5)')); + console.log(chalk.gray(' --type Filter: core_memory, workflow, cli_history')); + console.log(chalk.gray(' --json Output as JSON')); + console.log(); + console.log(' Embed Options:'); + console.log(chalk.gray(' --id Specific memory/session ID to embed')); + console.log(chalk.gray(' --force Force re-embed all chunks')); + console.log(chalk.gray(' --batch-size Batch size for embedding (default: 8)')); + console.log(); + console.log(' Embed Status Options:'); + console.log(chalk.gray(' --json Output as JSON')); + console.log(); console.log(' Suggest Options:'); console.log(chalk.gray(' --context Current task context (optional)')); console.log(chalk.gray(' --limit Number of suggestions (default: 5)')); @@ -718,7 +1078,11 @@ export async function memoryCommand( console.log(chalk.gray(' ccw memory track --type file --action read --value "src/auth.ts"')); console.log(chalk.gray(' ccw memory import --source history --project "my-app"')); console.log(chalk.gray(' ccw memory stats --type file --sort heat --limit 10')); - console.log(chalk.gray(' ccw memory search "authentication patterns"')); + console.log(chalk.gray(' ccw memory search "authentication patterns" # FTS search')); + console.log(chalk.gray(' ccw memory embed # Generate all embeddings')); + console.log(chalk.gray(' ccw memory embed --id CMEM-xxx # Embed specific memory')); + console.log(chalk.gray(' ccw memory embed-status # Check embedding status')); + console.log(chalk.gray(' ccw memory search "auth patterns" --top-k 5 # Semantic search')); console.log(chalk.gray(' ccw memory suggest --context "implementing JWT auth"')); console.log(chalk.gray(' ccw memory prune --older-than 60d 
--dry-run')); console.log(); diff --git a/ccw/src/core/core-memory-store.ts b/ccw/src/core/core-memory-store.ts index fdc279e0..914f4f9a 100644 --- a/ccw/src/core/core-memory-store.ts +++ b/ccw/src/core/core-memory-store.ts @@ -60,6 +60,17 @@ export interface SessionMetadataCache { access_count: number; } +export interface MemoryChunk { + id?: number; + source_id: string; + source_type: 'core_memory' | 'workflow' | 'cli_history'; + chunk_index: number; + content: string; + embedding?: Buffer; + metadata?: string; + created_at: string; +} + /** * Core Memory Store using SQLite */ @@ -152,6 +163,19 @@ export class CoreMemoryStore { access_count INTEGER DEFAULT 0 ); + -- Memory chunks table for embeddings + CREATE TABLE IF NOT EXISTS memory_chunks ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + source_id TEXT NOT NULL, + source_type TEXT NOT NULL, + chunk_index INTEGER NOT NULL, + content TEXT NOT NULL, + embedding BLOB, + metadata TEXT, + created_at TEXT NOT NULL, + UNIQUE(source_id, chunk_index) + ); + -- Indexes for efficient queries CREATE INDEX IF NOT EXISTS idx_memories_created ON memories(created_at DESC); CREATE INDEX IF NOT EXISTS idx_memories_updated ON memories(updated_at DESC); @@ -160,6 +184,8 @@ export class CoreMemoryStore { CREATE INDEX IF NOT EXISTS idx_cluster_members_cluster ON cluster_members(cluster_id); CREATE INDEX IF NOT EXISTS idx_cluster_members_session ON cluster_members(session_id); CREATE INDEX IF NOT EXISTS idx_session_metadata_type ON session_metadata_cache(session_type); + CREATE INDEX IF NOT EXISTS idx_memory_chunks_source ON memory_chunks(source_id, source_type); + CREATE INDEX IF NOT EXISTS idx_memory_chunks_embedded ON memory_chunks(embedding IS NOT NULL); `); } @@ -815,6 +841,243 @@ ${memory.content} })); } + // ============================================================================ + // Memory Chunks CRUD Operations + // ============================================================================ + + /** + * Chunk content into 
smaller pieces for embedding
+   * @param content Content to chunk
+   * @param sourceId Source identifier (e.g., memory ID)
+   * @param sourceType Type of source
+   * @returns Array of chunk content strings
+   */
+  chunkContent(content: string, sourceId: string, sourceType: string): string[] {
+    const CHUNK_SIZE = 1500;
+    const OVERLAP = 200;
+    const chunks: string[] = [];
+
+    // Split by paragraph boundaries first
+    const paragraphs = content.split(/\n\n+/);
+    let currentChunk = '';
+
+    for (const paragraph of paragraphs) {
+      // If adding this paragraph would exceed chunk size
+      if (currentChunk.length + paragraph.length > CHUNK_SIZE && currentChunk.length > 0) {
+        // Save current chunk
+        chunks.push(currentChunk.trim());
+
+        // Start new chunk with overlap
+        const overlapText = currentChunk.slice(-OVERLAP);
+        currentChunk = overlapText + '\n\n' + paragraph;
+      } else {
+        // Add paragraph to current chunk
+        currentChunk += (currentChunk ? '\n\n' : '') + paragraph;
+      }
+    }
+
+    // Add remaining chunk
+    if (currentChunk.trim()) {
+      chunks.push(currentChunk.trim());
+    }
+
+    // If no paragraphs or chunks are still too large, split by sentences
+    const finalChunks: string[] = [];
+    for (const chunk of chunks) {
+      if (chunk.length <= CHUNK_SIZE) {
+        finalChunks.push(chunk);
+      } else {
+        // Split by sentence boundaries; split() consumes the ". " separator, so it is restored between sentences only
+        const sentences = chunk.split(/\. +/);
+        let sentenceChunk = '';
+
+        for (let i = 0; i < sentences.length; i++) {
+          const sentenceWithPeriod = sentences[i] + (i < sentences.length - 1 ? '. ' : ' ');
+          if (sentenceChunk.length + sentenceWithPeriod.length > CHUNK_SIZE && sentenceChunk.length > 0) {
+            finalChunks.push(sentenceChunk.trim());
+            const overlapText = sentenceChunk.slice(-OVERLAP);
+            sentenceChunk = overlapText + sentenceWithPeriod;
+          } else {
+            sentenceChunk += sentenceWithPeriod;
+          }
+        }
+
+        if (sentenceChunk.trim()) {
+          finalChunks.push(sentenceChunk.trim());
+        }
+      }
+    }
+
+    return finalChunks.length > 0 ? finalChunks : [content];
+  }
+
+  /**
+   * Insert a single chunk
+   */
+  insertChunk(chunk: Omit): number {
+    const now = new Date().toISOString();
+
+    const stmt = this.db.prepare(`
+      INSERT INTO memory_chunks (source_id, source_type, chunk_index, content, embedding, metadata, created_at)
+      VALUES (?, ?, ?, ?, ?, ?, ?)
+    `);
+
+    const result = stmt.run(
+      chunk.source_id,
+      chunk.source_type,
+      chunk.chunk_index,
+      chunk.content,
+      chunk.embedding || null,
+      chunk.metadata || null,
+      chunk.created_at || now
+    );
+
+    return result.lastInsertRowid as number;
+  }
+
+  /**
+   * Insert multiple chunks in a batch
+   */
+  insertChunksBatch(chunks: Omit[]): void {
+    const now = new Date().toISOString();
+    const insert = this.db.prepare(`
+      INSERT INTO memory_chunks (source_id, source_type, chunk_index, content, embedding, metadata, created_at)
+      VALUES (?, ?, ?, ?, ?, ?, ?)
+    `);
+
+    const transaction = this.db.transaction((chunks: Omit[]) => {
+      for (const chunk of chunks) {
+        insert.run(
+          chunk.source_id,
+          chunk.source_type,
+          chunk.chunk_index,
+          chunk.content,
+          chunk.embedding || null,
+          chunk.metadata || null,
+          chunk.created_at || now
+        );
+      }
+    });
+
+    transaction(chunks);
+  }
+
+  /**
+   * Get all chunks for a source
+   */
+  getChunks(sourceId: string): MemoryChunk[] {
+    const stmt = this.db.prepare(`
+      SELECT * FROM memory_chunks
+      WHERE source_id = ?
+      ORDER BY chunk_index ASC
+    `);
+
+    const rows = stmt.all(sourceId) as any[];
+    return rows.map(row => ({
+      id: row.id,
+      source_id: row.source_id,
+      source_type: row.source_type,
+      chunk_index: row.chunk_index,
+      content: row.content,
+      embedding: row.embedding,
+      metadata: row.metadata,
+      created_at: row.created_at
+    }));
+  }
+
+  /**
+   * Get chunks by source type
+   */
+  getChunksByType(sourceType: string): MemoryChunk[] {
+    const stmt = this.db.prepare(`
+      SELECT * FROM memory_chunks
+      WHERE source_type = ?
+      ORDER BY source_id, chunk_index ASC
+    `);
+
+    const rows = stmt.all(sourceType) as any[];
+    return rows.map(row => ({
+      id: row.id,
+      source_id: row.source_id,
+      source_type: row.source_type,
+      chunk_index: row.chunk_index,
+      content: row.content,
+      embedding: row.embedding,
+      metadata: row.metadata,
+      created_at: row.created_at
+    }));
+  }
+
+  /**
+   * Get chunks without embeddings
+   */
+  getUnembeddedChunks(limit?: number): MemoryChunk[] {
+    const query = `
+      SELECT * FROM memory_chunks
+      WHERE embedding IS NULL
+      ORDER BY created_at ASC
+      ${limit ? 'LIMIT ?' : ''}
+    `;
+
+    const stmt = this.db.prepare(query);
+    const rows = (limit ? stmt.all(limit) : stmt.all()) as any[];
+
+    return rows.map(row => ({
+      id: row.id,
+      source_id: row.source_id,
+      source_type: row.source_type,
+      chunk_index: row.chunk_index,
+      content: row.content,
+      embedding: row.embedding,
+      metadata: row.metadata,
+      created_at: row.created_at
+    }));
+  }
+
+  /**
+   * Update embedding for a chunk
+   */
+  updateChunkEmbedding(chunkId: number, embedding: Buffer): void {
+    const stmt = this.db.prepare(`
+      UPDATE memory_chunks
+      SET embedding = ?
+      WHERE id = ?
+    `);
+
+    stmt.run(embedding, chunkId);
+  }
+
+  /**
+   * Update embeddings for multiple chunks in a batch
+   */
+  updateChunkEmbeddingsBatch(updates: { id: number; embedding: Buffer }[]): void {
+    const update = this.db.prepare(`
+      UPDATE memory_chunks
+      SET embedding = ?
+      WHERE id = ?
+    `);
+
+    const transaction = this.db.transaction((updates: { id: number; embedding: Buffer }[]) => {
+      for (const { id, embedding } of updates) {
+        update.run(embedding, id);
+      }
+    });
+
+    transaction(updates);
+  }
+
+  /**
+   * Delete all chunks for a source
+   */
+  deleteChunks(sourceId: string): void {
+    const stmt = this.db.prepare(`
+      DELETE FROM memory_chunks
+      WHERE source_id = ?
+    `);
+
+    stmt.run(sourceId);
+  }
+
   /**
    * Close database connection
    */
diff --git a/ccw/src/core/memory-embedder-bridge.ts b/ccw/src/core/memory-embedder-bridge.ts
new file mode 100644
index 00000000..66a7931b
--- /dev/null
+++ b/ccw/src/core/memory-embedder-bridge.ts
@@ -0,0 +1,262 @@
+/**
+ * Memory Embedder Bridge - TypeScript interface to Python memory embedder
+ *
+ * This module provides a TypeScript bridge to the Python memory_embedder.py script,
+ * which generates and searches embeddings for memory chunks using CodexLens's embedder.
+ *
+ * Features:
+ * - Reuses CodexLens venv at ~/.codexlens/venv
+ * - JSON protocol communication
+ * - Three commands: embed, search, status
+ * - Automatic availability checking
+ */
+
+import { spawn } from 'child_process';
+import { join, dirname } from 'path';
+import { homedir } from 'os';
+import { existsSync } from 'fs';
+import { fileURLToPath } from 'url';
+
+// Get directory of this module
+const __filename = fileURLToPath(import.meta.url);
+const __dirname = dirname(__filename);
+
+// Venv paths (reuse CodexLens venv)
+const CODEXLENS_VENV = join(homedir(), '.codexlens', 'venv');
+const VENV_PYTHON =
+  process.platform === 'win32'
+    ? join(CODEXLENS_VENV, 'Scripts', 'python.exe')
+    : join(CODEXLENS_VENV, 'bin', 'python');
+
+// Script path
+const EMBEDDER_SCRIPT = join(__dirname, '..', '..', 'scripts', 'memory_embedder.py');
+
+// Types
+export interface EmbedResult {
+  success: boolean;
+  chunks_processed: number;
+  chunks_failed: number;
+  elapsed_time: number;
+  error?: string;
+}
+
+export interface SearchMatch {
+  source_id: string;
+  source_type: 'core_memory' | 'workflow' | 'cli_history';
+  chunk_index: number;
+  content: string;
+  score: number;
+  restore_command: string;
+}
+
+export interface SearchResult {
+  success: boolean;
+  matches: SearchMatch[];
+  query?: string;
+  elapsed_time?: number;
+  error?: string;
+}
+
+export interface EmbeddingStatus {
+  success?: boolean;
+  total_chunks: number;
+  embedded_chunks: number;
+  pending_chunks: number;
+  by_type: Record;
+  error?: string;
+}
+
+export interface EmbedOptions {
+  sourceId?: string;
+  batchSize?: number;
+  force?: boolean;
+}
+
+export interface SearchOptions {
+  topK?: number;
+  minScore?: number;
+  sourceType?: 'core_memory' | 'workflow' | 'cli_history';
+}
+
+/**
+ * Check if embedder is available (venv and script exist)
+ * @returns True if embedder is available
+ */
+export function isEmbedderAvailable(): boolean {
+  // Check venv python exists
+  if (!existsSync(VENV_PYTHON)) {
+    return false;
+  }
+
+  // Check script exists
+  if (!existsSync(EMBEDDER_SCRIPT)) {
+    return false;
+  }
+
+  return true;
+}
+
+/**
+ * Run Python script with arguments
+ * @param args - Command line arguments
+ * @param timeout - Timeout in milliseconds
+ * @returns JSON output from script
+ */
+function runPython(args: string[], timeout: number = 300000): Promise<string> {
+  return new Promise((resolve, reject) => {
+    // Check availability
+    if (!isEmbedderAvailable()) {
+      reject(
+        new Error(
+          'Memory embedder not available. Ensure CodexLens venv exists at ~/.codexlens/venv'
+        )
+      );
+      return;
+    }
+
+    // Spawn Python process; when `timeout` elapses Node kills it and reports the signal on 'close'
+    const child = spawn(VENV_PYTHON, [EMBEDDER_SCRIPT, ...args], {
+      stdio: ['ignore', 'pipe', 'pipe'],
+      timeout,
+    });
+
+    let stdout = '';
+    let stderr = '';
+
+    child.stdout.on('data', (data) => {
+      stdout += data.toString();
+    });
+
+    child.stderr.on('data', (data) => {
+      stderr += data.toString();
+    });
+
+    child.on('close', (code, signal) => {
+      if (code === 0) {
+        resolve(stdout.trim());
+      } else {
+        reject(new Error(`Python script ${signal ? `timed out or was killed (${signal})` : `failed (exit code ${code})`}: ${stderr || stdout}`));
+      }
+    });
+
+    child.on('error', (err) => {
+      if ((err as NodeJS.ErrnoException).code === 'ETIMEDOUT') {
+        reject(new Error('Python script timed out'));
+      } else {
+        reject(new Error(`Failed to spawn Python: ${err.message}`));
+      }
+    });
+  });
+}
+
+/**
+ * Generate embeddings for memory chunks
+ * @param dbPath - Path to SQLite database
+ * @param options - Embedding options
+ * @returns Embedding result
+ */
+export async function generateEmbeddings(
+  dbPath: string,
+  options: EmbedOptions = {}
+): Promise<EmbedResult> {
+  const { sourceId, batchSize = 8, force = false } = options;
+
+  // Build arguments
+  const args = ['embed', dbPath];
+
+  if (sourceId) {
+    args.push('--source-id', sourceId);
+  }
+
+  if (batchSize !== 8) {
+    args.push('--batch-size', batchSize.toString());
+  }
+
+  if (force) {
+    args.push('--force');
+  }
+
+  try {
+    // Default timeout: 5 minutes
+    const output = await runPython(args, 300000);
+    const result = JSON.parse(output) as EmbedResult;
+    return result;
+  } catch (err) {
+    return {
+      success: false,
+      chunks_processed: 0,
+      chunks_failed: 0,
+      elapsed_time: 0,
+      error: (err as Error).message,
+    };
+  }
+}
+
+/**
+ * Search memory chunks using semantic search
+ * @param dbPath - Path to SQLite database
+ * @param query - Search query text
+ * @param options - Search options
+ * @returns Search results
+ */
+export async function searchMemories(
+  dbPath: string,
+  query: string,
+  options: SearchOptions = {}
+): Promise<SearchResult> {
+  const { topK = 10, minScore = 0.3, sourceType } = options;
+
+  // Build arguments
+  const args = ['search', dbPath, query];
+
+  if (topK !== 10) {
+    args.push('--top-k', topK.toString());
+  }
+
+  if (minScore !== 0.3) {
+    args.push('--min-score', minScore.toString());
+  }
+
+  if (sourceType) {
+    args.push('--type', sourceType);
+  }
+
+  try {
+    // Default timeout: 30 seconds
+    const output = await runPython(args, 30000);
+    const result = JSON.parse(output) as SearchResult;
+    return result;
+  } catch (err) {
+    return {
+      success: false,
+      matches: [],
+      error: (err as Error).message,
+    };
+  }
+}
+
+/**
+ * Get embedding status statistics
+ * @param dbPath - Path to SQLite database
+ * @returns Embedding status
+ */
+export async function getEmbeddingStatus(dbPath: string): Promise<EmbeddingStatus> {
+  // Build arguments
+  const args = ['status', dbPath];
+
+  try {
+    // Default timeout: 30 seconds
+    const output = await runPython(args, 30000);
+    const result = JSON.parse(output) as EmbeddingStatus;
+    return { ...result, success: true };
+  } catch (err) {
+    return {
+      success: false,
+      total_chunks: 0,
+      embedded_chunks: 0,
+      pending_chunks: 0,
+      by_type: {},
+      error: (err as Error).message,
+    };
+  }
+}
diff --git a/ccw/src/core/session-clustering-service.ts b/ccw/src/core/session-clustering-service.ts
index 91fad46f..e2c60c29 100644
--- a/ccw/src/core/session-clustering-service.ts
+++ b/ccw/src/core/session-clustering-service.ts
@@ -11,9 +11,10 @@ import { join } from 'path';
 
 // Clustering dimension weights
 const WEIGHTS = {
-  fileOverlap: 0.3,
-  temporalProximity: 0.2,
-  semanticSimilarity: 0.3,
+  fileOverlap: 0.2,
+  temporalProximity: 0.15,
+  keywordSimilarity: 0.15,
+  vectorSimilarity: 0.3,
   intentAlignment: 0.2,
 };
 
@@ -219,13 +220,15 @@ export class SessionClusteringService {
   calculateRelevance(session1: SessionMetadataCache, session2: SessionMetadataCache): number {
     const fileScore = this.calculateFileOverlap(session1, session2);
     const temporalScore = this.calculateTemporalProximity(session1, session2);
-    const semanticScore = this.calculateSemanticSimilarity(session1, session2);
+    const keywordScore = this.calculateSemanticSimilarity(session1, session2);
+    const vectorScore = this.calculateVectorSimilarity(session1, session2);
     const intentScore = this.calculateIntentAlignment(session1, session2);
 
     return (
       fileScore * WEIGHTS.fileOverlap +
       temporalScore * WEIGHTS.temporalProximity +
-      semanticScore * WEIGHTS.semanticSimilarity +
+      keywordScore * WEIGHTS.keywordSimilarity +
+      vectorScore * WEIGHTS.vectorSimilarity +
       intentScore * WEIGHTS.intentAlignment
     );
   }
@@ -301,6 +304,98 @@ export class SessionClusteringService {
     return intersection.size / union.size;
   }
 
+  /**
+   * Calculate vector similarity using pre-computed embeddings from memory_chunks
+   * Returns average cosine similarity of chunk embeddings
+   */
+  private calculateVectorSimilarity(s1: SessionMetadataCache, s2: SessionMetadataCache): number {
+    const embedding1 = this.getSessionEmbedding(s1.session_id);
+    const embedding2 = this.getSessionEmbedding(s2.session_id);
+
+    // Graceful fallback if no embeddings available
+    if (!embedding1 || !embedding2) {
+      return 0;
+    }
+
+    return this.cosineSimilarity(embedding1, embedding2);
+  }
+
+  /**
+   * Get session embedding by averaging all chunk embeddings
+   */
+  private getSessionEmbedding(sessionId: string): number[] | null {
+    const chunks = this.coreMemoryStore.getChunks(sessionId);
+
+    if (chunks.length === 0) {
+      return null;
+    }
+
+    // Filter chunks that have embeddings
+    const embeddedChunks = chunks.filter(chunk => chunk.embedding && chunk.embedding.length > 0);
+
+    if (embeddedChunks.length === 0) {
+      return null;
+    }
+
+    // Convert Buffer embeddings to number arrays and calculate average
+    const embeddings = embeddedChunks.map(chunk => {
+      // Copy into a fresh ArrayBuffer first: a Float32Array view throws if the Buffer's byteOffset is not 4-byte aligned
+      const buffer = chunk.embedding!;
+      const float32Array = new Float32Array(new Uint8Array(buffer).buffer, 0, Math.floor(buffer.byteLength / 4));
+      return Array.from(float32Array);
+    });
+
+    // Check all embeddings have same dimension
+    const dimension = embeddings[0].length;
+    if (!embeddings.every(emb => emb.length === dimension)) {
+      console.warn(`[VectorSimilarity] Inconsistent embedding dimensions for session ${sessionId}`);
+      return null;
+    }
+
+    // Calculate average embedding
+    const avgEmbedding = new Array(dimension).fill(0);
+    for (const embedding of embeddings) {
+      for (let i = 0; i < dimension; i++) {
+        avgEmbedding[i] += embedding[i];
+      }
+    }
+
+    for (let i = 0; i < dimension; i++) {
+      avgEmbedding[i] /= embeddings.length;
+    }
+
+    return avgEmbedding;
+  }
+
+  /**
+   * Calculate cosine similarity between two vectors
+   */
+  private cosineSimilarity(a: number[], b: number[]): number {
+    if (a.length !== b.length) {
+      console.warn('[VectorSimilarity] Vector dimension mismatch');
+      return 0;
+    }
+
+    let dotProduct = 0;
+    let normA = 0;
+    let normB = 0;
+
+    for (let i = 0; i < a.length; i++) {
+      dotProduct += a[i] * b[i];
+      normA += a[i] * a[i];
+      normB += b[i] * b[i];
+    }
+
+    normA = Math.sqrt(normA);
+    normB = Math.sqrt(normB);
+
+    if (normA === 0 || normB === 0) {
+      return 0;
+    }
+
+    return dotProduct / (normA * normB);
+  }
+
   /**
    * Find the most relevant existing cluster for a set of session IDs
    * Returns the cluster with highest session overlap
diff --git a/ccw/src/tools/core-memory.ts b/ccw/src/tools/core-memory.ts
index 27ab4c84..caa8dfaf 100644
--- a/ccw/src/tools/core-memory.ts
+++ b/ccw/src/tools/core-memory.ts
@@ -1,14 +1,17 @@
 /**
  * Core Memory Tool - MCP tool for core memory management
- * Operations: list, import, export, summary
+ * Operations: list, import, export, summary, embed, search, embed_status
  */
 
 import { z } from 'zod';
 import type { ToolSchema, ToolResult } from '../types/tool.js';
 import { getCoreMemoryStore } from '../core/core-memory-store.js';
+import * as MemoryEmbedder from '../core/memory-embedder-bridge.js';
+import { StoragePaths } from '../config/storage-paths.js';
+import { join } from 'path';
 
 // Zod schemas
-const OperationEnum = z.enum(['list', 'import', 'export', 'summary']);
+const OperationEnum = z.enum(['list', 'import', 'export', 'summary', 'embed', 'search', 'embed_status']);
 
 const ParamsSchema = z.object({
   operation: OperationEnum,
@@ -16,6 +19,15 @@ const ParamsSchema = z.object({
   id: z.string().optional(),
   tool: z.enum(['gemini', 'qwen']).optional().default('gemini'),
   limit: z.number().optional().default(100),
+  // Search parameters
+  query: z.string().optional(),
+  top_k: z.number().optional().default(10),
+  min_score: z.number().optional().default(0.3),
+  source_type: z.enum(['core_memory', 'workflow', 'cli_history']).optional(),
+  // Embed parameters
+  source_id: z.string().optional(),
+  batch_size: z.number().optional().default(8),
+  force: z.boolean().optional().default(false),
 });
 
 type Params = z.infer;
@@ -53,7 +65,36 @@ interface SummaryResult {
   summary: string;
 }
 
-type OperationResult = ListResult | ImportResult | ExportResult | SummaryResult;
+interface EmbedResult {
+  operation: 'embed';
+  chunks_processed: number;
+  chunks_failed: number;
+  elapsed_time: number;
+  message: string;
+}
+
+interface SearchResult {
+  operation: 'search';
+  query: string;
+  matches: Array<{
+    source_id: string;
+    source_type: string;
+    score: number;
+    excerpt: string;
+    restore_command: string;
+  }>;
+  total_matches: number;
+}
+
+interface EmbedStatusResult {
+  operation: 'embed_status';
+  total_chunks: number;
+  embedded_chunks: number;
+  pending_chunks: number;
+  by_type: Record;
+}
+
+type OperationResult = ListResult | ImportResult | ExportResult | SummaryResult | EmbedResult | SearchResult | EmbedStatusResult;
 
 /**
  * Get project path from current working directory
@@ -62,6 +103,15 @@ function getProjectPath(): string {
   return process.cwd();
 }
 
+/**
+ * Get database path for current project
+ */
+function getDatabasePath(): string {
+  const projectPath = getProjectPath();
+  const paths = StoragePaths.project(projectPath);
+  return join(paths.root, 'core-memory', 'core_memory.db');
+}
+
 /**
  * Operation: list
  * List all memories
@@ -153,6 +203,92 @@ async function executeSummary(params: Params): Promise {
   };
 }
 
+/**
+ * Operation: embed
+ * Generate embeddings for memory chunks
+ */
+async function executeEmbed(params: Params): Promise<EmbedResult> {
+  const { source_id, batch_size = 8, force = false } = params;
+  const dbPath = getDatabasePath();
+
+  const result = await MemoryEmbedder.generateEmbeddings(dbPath, {
+    sourceId: source_id,
+    batchSize: batch_size,
+    force,
+  });
+
+  if (!result.success) {
+    throw new Error(result.error || 'Embedding generation failed');
+  }
+
+  return {
+    operation: 'embed',
+    chunks_processed: result.chunks_processed,
+    chunks_failed: result.chunks_failed,
+    elapsed_time: result.elapsed_time,
+    message: `Successfully processed ${result.chunks_processed} chunks in ${result.elapsed_time.toFixed(2)}s`,
+  };
+}
+
+/**
+ * Operation: search
+ * Search memory chunks using semantic search
+ */
+async function executeSearch(params: Params): Promise<SearchResult> {
+  const { query, top_k = 10, min_score = 0.3, source_type } = params;
+
+  if (!query) {
+    throw new Error('Parameter "query" is required for search operation');
+  }
+
+  const dbPath = getDatabasePath();
+
+  const result = await MemoryEmbedder.searchMemories(dbPath, query, {
+    topK: top_k,
+    minScore: min_score,
+    sourceType: source_type,
+  });
+
+  if (!result.success) {
+    throw new Error(result.error || 'Search failed');
+  }
+
+  return {
+    operation: 'search',
+    query,
+    matches: result.matches.map((match) => ({
+      source_id: match.source_id,
+      source_type: match.source_type,
+      score: match.score,
+      excerpt: match.content.substring(0, 200) + (match.content.length > 200 ? '...' : ''),
+      restore_command: match.restore_command,
+    })),
+    total_matches: result.matches.length,
+  };
+}
+
+/**
+ * Operation: embed_status
+ * Get embedding status statistics
+ */
+async function executeEmbedStatus(params: Params): Promise<EmbedStatusResult> {
+  const dbPath = getDatabasePath();
+
+  const result = await MemoryEmbedder.getEmbeddingStatus(dbPath);
+
+  if (!result.success) {
+    throw new Error(result.error || 'Failed to get embedding status');
+  }
+
+  return {
+    operation: 'embed_status',
+    total_chunks: result.total_chunks,
+    embedded_chunks: result.embedded_chunks,
+    pending_chunks: result.pending_chunks,
+    by_type: result.by_type,
+  };
+}
+
 /**
  * Route to appropriate operation handler
  */
@@ -168,9 +304,15 @@ async function execute(params: Params): Promise {
       return executeExport(params);
     case 'summary':
       return executeSummary(params);
+    case 'embed':
+      return executeEmbed(params);
+    case 'search':
+      return executeSearch(params);
+    case 'embed_status':
+      return executeEmbedStatus(params);
     default:
       throw new Error(
-        `Unknown operation: ${operation}. Valid operations: list, import, export, summary`
+        `Unknown operation: ${operation}. Valid operations: list, import, export, summary, embed, search, embed_status`
       );
   }
 }
@@ -185,6 +327,9 @@ Usage:
   core_memory(operation="import", text="important context") # Import text as new memory
   core_memory(operation="export", id="CMEM-xxx") # Export memory as plain text
   core_memory(operation="summary", id="CMEM-xxx") # Generate AI summary
+  core_memory(operation="embed", source_id="CMEM-xxx") # Generate embeddings for memory
+  core_memory(operation="search", query="authentication") # Search memories semantically
+  core_memory(operation="embed_status") # Check embedding status
 
 Memory IDs use format: CMEM-YYYYMMDD-HHMMSS`,
   inputSchema: {
@@ -192,7 +337,7 @@ Memory IDs use format: CMEM-YYYYMMDD-HHMMSS`,
     properties: {
       operation: {
         type: 'string',
-        enum: ['list', 'import', 'export', 'summary'],
+        enum: ['list', 'import', 'export', 'summary', 'embed', 'search', 'embed_status'],
         description: 'Operation to perform',
       },
       text: {
@@ -212,6 +357,35 @@ Memory IDs use format: CMEM-YYYYMMDD-HHMMSS`,
         type: 'number',
         description: 'Max number of memories to list (default: 100)',
       },
+      query: {
+        type: 'string',
+        description: 'Search query text (required for search operation)',
+      },
+      top_k: {
+        type: 'number',
+        description: 'Number of search results to return (default: 10)',
+      },
+      min_score: {
+        type: 'number',
+        description: 'Minimum similarity score threshold (default: 0.3)',
+      },
+      source_type: {
+        type: 'string',
+        enum: ['core_memory', 'workflow', 'cli_history'],
+        description: 'Filter search by source type',
+      },
+      source_id: {
+        type: 'string',
+        description: 'Source ID to embed (optional for embed operation)',
+      },
+      batch_size: {
+        type: 'number',
+        description: 'Batch size for embedding generation (default: 8)',
+      },
+      force: {
+        type: 'boolean',
+        description: 'Force re-embedding even if embeddings exist (default: false)',
+      },
     },
     required: ['operation'],
   },