/**
 * Unified Vector Index - TypeScript bridge to unified_memory_embedder.py
 *
 * Provides HNSW-backed vector indexing and search for all memory content
 * (core_memory, cli_history, workflow, entity, pattern) via CodexLens VectorStore.
 *
 * Features:
 * - JSON stdin/stdout protocol to Python embedder
 * - Content chunking (paragraph -> sentence splitting, CHUNK_SIZE=1500, OVERLAP=200)
 * - Batch embedding via CodexLens EmbedderFactory
 * - HNSW approximate nearest neighbor search (sub-10ms for 1000 chunks)
 * - Category-based filtering
 */

import { spawn } from 'child_process';
import { join, dirname } from 'path';
import { existsSync } from 'fs';
import { fileURLToPath } from 'url';
import { getCodexLensPython } from '../utils/codexlens-path.js';
import { StoragePaths, ensureStorageDir } from '../config/storage-paths.js';

// Get directory of this module
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

// Venv python path (reuse CodexLens venv)
const VENV_PYTHON = getCodexLensPython();

// Script path
const EMBEDDER_SCRIPT = join(__dirname, '..', '..', 'scripts', 'unified_memory_embedder.py');

// Chunking constants (match existing core-memory-store.ts)
const CHUNK_SIZE = 1500;
const OVERLAP = 200;

// =============================================================================
// Types
// =============================================================================

/** Valid source types for vector content */
export type SourceType = 'core_memory' | 'workflow' | 'cli_history';

/** Valid category values for vector filtering */
export type VectorCategory = 'core_memory' | 'cli_history' | 'workflow' | 'entity' | 'pattern';

/** Metadata attached to each chunk in the vector store */
export interface ChunkMetadata {
  /** Source identifier (e.g., memory ID, session ID) */
  source_id: string;
  /** Source type */
  source_type: SourceType;
  /** Category for filtering */
  category: VectorCategory;
  /** Chunk index within the source */
  chunk_index?: number;
  /** Additional metadata */
  [key: string]: unknown;
}

/** A chunk to be embedded and indexed */
export interface VectorChunk {
  /** Text content */
  content: string;
  /** Source identifier */
  source_id: string;
  /** Source type */
  source_type: SourceType;
  /** Category for filtering */
  category: VectorCategory;
  /** Chunk index */
  chunk_index: number;
  /** Additional metadata (free-form; same value space as ChunkMetadata's index signature) */
  metadata?: Record<string, unknown>;
}

/** Result of an embed operation */
export interface EmbedResult {
  success: boolean;
  chunks_processed: number;
  chunks_failed: number;
  elapsed_time: number;
  error?: string;
}

/** A single search match */
export interface VectorSearchMatch {
  content: string;
  score: number;
  source_id: string;
  source_type: string;
  chunk_index: number;
  category: string;
  /** Free-form metadata stored alongside the chunk */
  metadata: Record<string, unknown>;
}

/** Result of a search operation */
export interface VectorSearchResult {
  success: boolean;
  matches: VectorSearchMatch[];
  elapsed_time?: number;
  total_searched?: number;
  error?: string;
}

/** Search options */
export interface VectorSearchOptions {
  topK?: number;
  minScore?: number;
  category?: VectorCategory;
}

/** Index status information */
export interface VectorIndexStatus {
  success: boolean;
  total_chunks: number;
  hnsw_available: boolean;
  hnsw_count: number;
  dimension: number;
  // NOTE(review): presumably per-category chunk counts — confirm against the Python embedder's status payload.
  categories?: Record<string, number>;
  model_config?: {
    backend: string;
    profile: string;
    dimension: number;
    max_tokens: number;
  };
  error?: string;
}

/** Reindex result */
export interface ReindexResult {
  success: boolean;
  hnsw_count?: number;
  elapsed_time?: number;
  error?: string;
}

// =============================================================================
// Python Bridge
// =============================================================================

/**
 * Check if the unified embedder is available (venv and script exist)
 *
 * @returns true only when both the venv Python path and the embedder script exist on disk
 */
export function isUnifiedEmbedderAvailable(): boolean {
  return existsSync(VENV_PYTHON) && existsSync(EMBEDDER_SCRIPT);
}

/**
 * Run Python script with JSON stdin/stdout
protocol.
 *
 * Spawns VENV_PYTHON on EMBEDDER_SCRIPT, writes the request as a single JSON
 * document to the child's stdin, and parses the child's stdout as JSON once
 * the process has closed with exit code 0.
 *
 * The returned promise settles exactly once. It rejects when:
 * - the embedder is not available (venv or script missing),
 * - the request cannot be serialized to JSON,
 * - the process cannot be spawned,
 * - the process does not finish within `timeout` (the child is then killed),
 * - the process exits with a non-zero code or produces no stdout,
 * - stdout is not valid JSON.
 *
 * @param request - JSON request object to send via stdin
 * @param timeout - Timeout in milliseconds (default: 5 minutes); values <= 0 disable the timeout
 * @returns Parsed JSON response
 */
function runPython<T>(request: Record<string, unknown>, timeout: number = 300000): Promise<T> {
  return new Promise<T>((resolve, reject) => {
    if (!isUnifiedEmbedderAvailable()) {
      reject(
        new Error(
          'Unified embedder not available. Ensure CodexLens venv exists at ~/.codexlens/venv'
        )
      );
      return;
    }

    // Serialize before spawning: if this throws, the executor turns it into a
    // rejection and no child process is left waiting on stdin.
    const jsonInput = JSON.stringify(request);

    const child = spawn(VENV_PYTHON, [EMBEDDER_SCRIPT], {
      stdio: ['pipe', 'pipe', 'pipe'],
    });

    let settled = false;
    let timer: ReturnType<typeof setTimeout> | undefined;

    // Run `action` (a resolve/reject call) only for the first outcome we observe.
    const settle = (action: () => void): void => {
      if (settled) {
        return;
      }
      settled = true;
      clearTimeout(timer);
      action();
    };

    // Asynchronous spawn() does not raise ETIMEDOUT for its `timeout` option; it
    // only kills the child, which then surfaces as a null exit code. Use an
    // explicit timer so a timeout is reported as such.
    if (timeout > 0) {
      // setTimeout delays above 2^31-1 ms overflow and fire immediately.
      const delay = Math.min(timeout, 2147483647);
      timer = setTimeout(() => {
        child.kill();
        settle(() => reject(new Error('Python script timed out')));
      }, delay);
    }

    // Decode at the stream level so multi-byte UTF-8 sequences that straddle
    // two data chunks are not corrupted.
    child.stdout.setEncoding('utf8');
    child.stderr.setEncoding('utf8');

    let stdout = '';
    let stderr = '';

    child.stdout.on('data', (text: string) => {
      stdout += text;
    });

    child.stderr.on('data', (text: string) => {
      stderr += text;
    });

    child.on('close', (code) => {
      settle(() => {
        const output = stdout.trim();
        if (code !== 0 || !output) {
          reject(new Error(`Python script failed (exit code ${code}): ${stderr || stdout}`));
          return;
        }

        let parsed: T;
        try {
          parsed = JSON.parse(output) as T;
        } catch {
          reject(new Error(`Failed to parse Python output: ${stdout.substring(0, 500)}`));
          return;
        }
        resolve(parsed);
      });
    });

    child.on('error', (err) => {
      settle(() => {
        if ((err as NodeJS.ErrnoException).code === 'ETIMEDOUT') {
          reject(new Error('Python script timed out'));
        } else {
          reject(new Error(`Failed to spawn Python: ${err.message}`));
        }
      });
    });

    // If the child exits before consuming stdin, the write fails with EPIPE.
    // Without a listener that would be an uncaught exception; the actual
    // failure is reported through the 'close' / 'error' handlers above.
    child.stdin.on('error', () => {
      /* intentionally ignored */
    });

    // Write JSON request to stdin and close
    child.stdin.end(jsonInput);
  });
}

// =============================================================================
// Content Chunking
// =============================================================================

/**
 * Chunk content into smaller pieces for embedding.
 * Uses paragraph-first, sentence-fallback strategy with overlap.
 *
 * Matches the chunking logic in core-memory-store.ts:
 * - CHUNK_SIZE = 1500 characters
 * - OVERLAP = 200 characters
 * - Split by paragraph boundaries (\n\n) first
 * - Fall back to sentence boundaries (. 
) for oversized paragraphs
 *
 * Beyond the strategy listed above, this implementation also:
 * - keeps each sentence's own terminating period instead of re-appending
 *   ". " to every fragment (which duplicated or invented a trailing period),
 * - never emits an empty chunk for whitespace-only paragraphs,
 * - falls back to fixed-size character windows (with OVERLAP) when a piece
 *   still exceeds CHUNK_SIZE because it contains no sentence boundary.
 *
 * Content that contains no non-whitespace text is returned unchanged as a
 * single-element array.
 *
 * @param content - Text content to chunk
 * @returns Array of chunk strings
 */
export function chunkContent(content: string): string[] {
  // Phase 1: greedily pack paragraphs into chunks of at most CHUNK_SIZE.
  const paragraphChunks: string[] = [];
  let currentChunk = '';

  for (const paragraph of content.split(/\n\n+/)) {
    const wouldOverflow = currentChunk.length + paragraph.length > CHUNK_SIZE;

    if (currentChunk.length > 0 && wouldOverflow) {
      const finished = currentChunk.trim();
      if (finished) {
        paragraphChunks.push(finished);
      }
      // Start the next chunk with the tail of the previous one as overlap.
      currentChunk = currentChunk.slice(-OVERLAP) + '\n\n' + paragraph;
    } else {
      currentChunk += (currentChunk ? '\n\n' : '') + paragraph;
    }
  }

  const lastChunk = currentChunk.trim();
  if (lastChunk) {
    paragraphChunks.push(lastChunk);
  }

  // Phase 2: anything still too large is split by sentences, then by length.
  const finalChunks: string[] = [];
  for (const chunk of paragraphChunks) {
    if (chunk.length <= CHUNK_SIZE) {
      finalChunks.push(chunk);
      continue;
    }

    for (const piece of splitIntoSentenceChunks(chunk)) {
      if (piece.length <= CHUNK_SIZE) {
        finalChunks.push(piece);
      } else {
        finalChunks.push(...splitByLength(piece));
      }
    }
  }

  return finalChunks.length > 0 ? finalChunks : [content];
}

/**
 * Pack the sentences of `text` into pieces of roughly CHUNK_SIZE characters,
 * carrying the last OVERLAP characters of each piece into the next one.
 * A piece can still exceed CHUNK_SIZE when a single sentence is very long.
 */
function splitIntoSentenceChunks(text: string): string[] {
  const pieces: string[] = [];
  let current = '';

  // The lookbehind splits on the spaces *after* a period, so every sentence
  // keeps its own punctuation and the final fragment is left untouched.
  for (const sentence of text.split(/(?<=\.) +/)) {
    if (current.length > 0 && current.length + 1 + sentence.length > CHUNK_SIZE) {
      pieces.push(current);
      current = current.slice(-OVERLAP) + ' ' + sentence;
    } else {
      current = current ? current + ' ' + sentence : sentence;
    }
  }

  if (current) {
    pieces.push(current);
  }

  return pieces.map((piece) => piece.trim()).filter((piece) => piece.length > 0);
}

/**
 * Split `text` into windows of at most CHUNK_SIZE characters, where each
 * window after the first starts OVERLAP characters before the previous end.
 */
function splitByLength(text: string): string[] {
  const windows: string[] = [];
  let start = 0;

  while (start < text.length) {
    const end = Math.min(start + CHUNK_SIZE, text.length);
    const window = text.slice(start, end).trim();
    if (window) {
      windows.push(window);
    }
    if (end >= text.length) {
      break;
    }
    // CHUNK_SIZE > OVERLAP, so this always advances.
    start = end - OVERLAP;
  }

  return windows;
}

// =============================================================================
// UnifiedVectorIndex Class
// =============================================================================

/**
 * Unified vector index backed by CodexLens VectorStore (HNSW).
 *
 * Provides content chunking, embedding, storage, and search for all
 * memory content types through a single interface.
*/
export class UnifiedVectorIndex {
  /** Directory of the vector store, passed to the Python embedder as `store_path`. */
  private readonly storePath: string;

  /**
   * Create a UnifiedVectorIndex for a project.
   *
   * @param projectPath - Project root path (used to resolve storage location)
   */
  constructor(projectPath: string) {
    const paths = StoragePaths.project(projectPath);
    this.storePath = paths.unifiedVectors.root;
    ensureStorageDir(this.storePath);
  }

  /**
   * Turn a caught value into a message without assuming it is an Error.
   */
  private static describeError(err: unknown): string {
    return err instanceof Error ? err.message : String(err);
  }

  /**
   * Index content by chunking, embedding, and storing in VectorStore.
   *
   * Blank content is a successful no-op. Bridge failures are reported in the
   * returned result (`success: false`, every chunk counted as failed) rather
   * than thrown.
   *
   * @param content - Text content to index
   * @param metadata - Metadata for all chunks (source_id, source_type, category)
   * @returns Embed result
   */
  async indexContent(content: string, metadata: ChunkMetadata): Promise<EmbedResult> {
    if (!content.trim()) {
      return {
        success: true,
        chunks_processed: 0,
        chunks_failed: 0,
        elapsed_time: 0,
      };
    }

    // Chunk content
    const textChunks = chunkContent(content);

    // When the caller supplies a chunk_index it is used as the offset of the first chunk.
    const baseIndex = metadata.chunk_index ?? 0;

    // Build chunk objects for Python
    const chunks: VectorChunk[] = textChunks.map((text, index) => ({
      content: text,
      source_id: metadata.source_id,
      source_type: metadata.source_type,
      category: metadata.category,
      chunk_index: baseIndex + index,
      metadata: { ...metadata },
    }));

    try {
      return await runPython<EmbedResult>({
        operation: 'embed',
        store_path: this.storePath,
        chunks,
        batch_size: 8,
      });
    } catch (err) {
      return {
        success: false,
        chunks_processed: 0,
        chunks_failed: textChunks.length,
        elapsed_time: 0,
        error: UnifiedVectorIndex.describeError(err),
      };
    }
  }

  /**
   * Search the vector index using semantic similarity.
   *
   * A blank query returns an empty successful result without starting the
   * Python process, mirroring how indexContent treats blank content.
   *
   * @param query - Natural language search query
   * @param options - Search options (topK, minScore, category)
   * @returns Search results sorted by relevance
   */
  async search(query: string, options: VectorSearchOptions = {}): Promise<VectorSearchResult> {
    const { topK = 10, minScore = 0.3, category } = options;

    if (!query.trim()) {
      return { success: true, matches: [] };
    }

    try {
      return await runPython<VectorSearchResult>({
        operation: 'search',
        store_path: this.storePath,
        query,
        top_k: topK,
        min_score: minScore,
        category: category ?? null,
      });
    } catch (err) {
      return {
        success: false,
        matches: [],
        error: UnifiedVectorIndex.describeError(err),
      };
    }
  }

  /**
   * Search the vector index using a pre-computed embedding vector.
   * Bypasses text embedding, directly querying HNSW with a raw vector.
   *
   * @param vector - Pre-computed embedding vector (array of floats)
   * @param options - Search options (topK, minScore, category)
   * @returns Search results sorted by relevance
   */
  async searchByVector(
    vector: number[],
    options: VectorSearchOptions = {}
  ): Promise<VectorSearchResult> {
    const { topK = 10, minScore = 0.3, category } = options;

    // JSON.stringify serializes NaN/Infinity as null, so a malformed vector
    // would reach Python silently altered; reject it here instead.
    if (vector.length === 0 || !vector.every((component) => Number.isFinite(component))) {
      return {
        success: false,
        matches: [],
        error: 'Invalid query vector: expected a non-empty array of finite numbers',
      };
    }

    try {
      return await runPython<VectorSearchResult>({
        operation: 'search_by_vector',
        store_path: this.storePath,
        vector,
        top_k: topK,
        min_score: minScore,
        category: category ?? null,
      });
    } catch (err) {
      return {
        success: false,
        matches: [],
        error: UnifiedVectorIndex.describeError(err),
      };
    }
  }

  /**
   * Rebuild the HNSW index from scratch.
   *
   * @returns Reindex result
   */
  async reindexAll(): Promise<ReindexResult> {
    try {
      return await runPython<ReindexResult>({
        operation: 'reindex',
        store_path: this.storePath,
      });
    } catch (err) {
      return {
        success: false,
        error: UnifiedVectorIndex.describeError(err),
      };
    }
  }

  /**
   * Get the current status of the vector index.
   *
   * @returns Index status including chunk counts, HNSW availability, dimension
   */
  async getStatus(): Promise<VectorIndexStatus> {
    try {
      return await runPython<VectorIndexStatus>({
        operation: 'status',
        store_path: this.storePath,
      });
    } catch (err) {
      return {
        success: false,
        total_chunks: 0,
        hnsw_available: false,
        hnsw_count: 0,
        dimension: 0,
        error: UnifiedVectorIndex.describeError(err),
      };
    }
  }
}