Files
Claude-Code-Workflow/ccw/src/core/unified-vector-index.ts
catlog22 48a6a1f2aa Add comprehensive tests for ast-grep and tree-sitter relationship extraction
- Introduced test suite for AstGrepPythonProcessor covering pattern definitions, parsing, and relationship extraction.
- Added comparison tests between tree-sitter and ast-grep for consistency in relationship extraction.
- Implemented tests for ast-grep binding module to verify functionality and availability.
- Ensured tests cover various scenarios including inheritance, function calls, and imports.
2026-02-15 21:14:14 +08:00

475 lines
13 KiB
TypeScript

/**
* Unified Vector Index - TypeScript bridge to unified_memory_embedder.py
*
* Provides HNSW-backed vector indexing and search for all memory content
* (core_memory, cli_history, workflow, entity, pattern) via CodexLens VectorStore.
*
* Features:
* - JSON stdin/stdout protocol to Python embedder
* - Content chunking (paragraph -> sentence splitting, CHUNK_SIZE=1500, OVERLAP=200)
* - Batch embedding via CodexLens EmbedderFactory
* - HNSW approximate nearest neighbor search (sub-10ms for 1000 chunks)
* - Category-based filtering
*/
import { spawn } from 'child_process';
import { join, dirname } from 'path';
import { existsSync } from 'fs';
import { fileURLToPath } from 'url';
import { getCodexLensPython } from '../utils/codexlens-path.js';
import { StoragePaths, ensureStorageDir } from '../config/storage-paths.js';
// Get directory of this module
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);
// Venv python path (reuse CodexLens venv)
const VENV_PYTHON = getCodexLensPython();
// Script path
const EMBEDDER_SCRIPT = join(__dirname, '..', '..', 'scripts', 'unified_memory_embedder.py');
// Chunking constants (match existing core-memory-store.ts)
const CHUNK_SIZE = 1500;
const OVERLAP = 200;
// =============================================================================
// Types
// =============================================================================
/** Valid source types for vector content */
export type SourceType = 'core_memory' | 'workflow' | 'cli_history';
/** Valid category values for vector filtering */
export type VectorCategory = 'core_memory' | 'cli_history' | 'workflow' | 'entity' | 'pattern';
/** Metadata attached to each chunk in the vector store */
export interface ChunkMetadata {
/** Source identifier (e.g., memory ID, session ID) */
source_id: string;
/** Source type */
source_type: SourceType;
/** Category for filtering */
category: VectorCategory;
/** Chunk index within the source */
chunk_index?: number;
/** Additional metadata */
[key: string]: unknown;
}
/** A chunk to be embedded and indexed */
export interface VectorChunk {
/** Text content */
content: string;
/** Source identifier */
source_id: string;
/** Source type */
source_type: SourceType;
/** Category for filtering */
category: VectorCategory;
/** Chunk index */
chunk_index: number;
/** Additional metadata */
metadata?: Record<string, unknown>;
}
/** Result of an embed operation */
export interface EmbedResult {
success: boolean;
chunks_processed: number;
chunks_failed: number;
elapsed_time: number;
error?: string;
}
/** A single search match */
export interface VectorSearchMatch {
content: string;
score: number;
source_id: string;
source_type: string;
chunk_index: number;
category: string;
metadata: Record<string, unknown>;
}
/** Result of a search operation */
export interface VectorSearchResult {
success: boolean;
matches: VectorSearchMatch[];
elapsed_time?: number;
total_searched?: number;
error?: string;
}
/** Search options */
export interface VectorSearchOptions {
topK?: number;
minScore?: number;
category?: VectorCategory;
}
/** Index status information */
export interface VectorIndexStatus {
success: boolean;
total_chunks: number;
hnsw_available: boolean;
hnsw_count: number;
dimension: number;
categories?: Record<string, number>;
model_config?: {
backend: string;
profile: string;
dimension: number;
max_tokens: number;
};
error?: string;
}
/** Reindex result */
export interface ReindexResult {
success: boolean;
hnsw_count?: number;
elapsed_time?: number;
error?: string;
}
// =============================================================================
// Python Bridge
// =============================================================================
/**
* Check if the unified embedder is available (venv and script exist)
*/
export function isUnifiedEmbedderAvailable(): boolean {
if (!existsSync(VENV_PYTHON)) {
return false;
}
if (!existsSync(EMBEDDER_SCRIPT)) {
return false;
}
return true;
}
/**
* Run Python script with JSON stdin/stdout protocol.
*
* @param request - JSON request object to send via stdin
* @param timeout - Timeout in milliseconds (default: 5 minutes)
* @returns Parsed JSON response
*/
function runPython<T>(request: Record<string, unknown>, timeout: number = 300000): Promise<T> {
return new Promise((resolve, reject) => {
if (!isUnifiedEmbedderAvailable()) {
reject(
new Error(
'Unified embedder not available. Ensure CodexLens venv exists at ~/.codexlens/venv'
)
);
return;
}
const child = spawn(VENV_PYTHON, [EMBEDDER_SCRIPT], {
stdio: ['pipe', 'pipe', 'pipe'],
timeout,
});
let stdout = '';
let stderr = '';
child.stdout.on('data', (data) => {
stdout += data.toString();
});
child.stderr.on('data', (data) => {
stderr += data.toString();
});
child.on('close', (code) => {
if (code === 0 && stdout.trim()) {
try {
resolve(JSON.parse(stdout.trim()) as T);
} catch {
reject(new Error(`Failed to parse Python output: ${stdout.substring(0, 500)}`));
}
} else {
reject(new Error(`Python script failed (exit code ${code}): ${stderr || stdout}`));
}
});
child.on('error', (err) => {
if ((err as NodeJS.ErrnoException).code === 'ETIMEDOUT') {
reject(new Error('Python script timed out'));
} else {
reject(new Error(`Failed to spawn Python: ${err.message}`));
}
});
// Write JSON request to stdin and close
const jsonInput = JSON.stringify(request);
child.stdin.write(jsonInput);
child.stdin.end();
});
}
// =============================================================================
// Content Chunking
// =============================================================================
/**
* Chunk content into smaller pieces for embedding.
* Uses paragraph-first, sentence-fallback strategy with overlap.
*
* Matches the chunking logic in core-memory-store.ts:
* - CHUNK_SIZE = 1500 characters
* - OVERLAP = 200 characters
* - Split by paragraph boundaries (\n\n) first
* - Fall back to sentence boundaries (. ) for oversized paragraphs
*
* @param content - Text content to chunk
* @returns Array of chunk strings
*/
export function chunkContent(content: string): string[] {
const chunks: string[] = [];
// Split by paragraph boundaries first
const paragraphs = content.split(/\n\n+/);
let currentChunk = '';
for (const paragraph of paragraphs) {
// If adding this paragraph would exceed chunk size
if (currentChunk.length + paragraph.length > CHUNK_SIZE && currentChunk.length > 0) {
chunks.push(currentChunk.trim());
// Start new chunk with overlap
const overlapText = currentChunk.slice(-OVERLAP);
currentChunk = overlapText + '\n\n' + paragraph;
} else {
currentChunk += (currentChunk ? '\n\n' : '') + paragraph;
}
}
// Add remaining chunk
if (currentChunk.trim()) {
chunks.push(currentChunk.trim());
}
// If chunks are still too large, split by sentences
const finalChunks: string[] = [];
for (const chunk of chunks) {
if (chunk.length <= CHUNK_SIZE) {
finalChunks.push(chunk);
} else {
// Split by sentence boundaries
const sentences = chunk.split(/\. +/);
let sentenceChunk = '';
for (const sentence of sentences) {
const sentenceWithPeriod = sentence + '. ';
if (
sentenceChunk.length + sentenceWithPeriod.length > CHUNK_SIZE &&
sentenceChunk.length > 0
) {
finalChunks.push(sentenceChunk.trim());
const overlapText = sentenceChunk.slice(-OVERLAP);
sentenceChunk = overlapText + sentenceWithPeriod;
} else {
sentenceChunk += sentenceWithPeriod;
}
}
if (sentenceChunk.trim()) {
finalChunks.push(sentenceChunk.trim());
}
}
}
return finalChunks.length > 0 ? finalChunks : [content];
}
// =============================================================================
// UnifiedVectorIndex Class
// =============================================================================
/**
* Unified vector index backed by CodexLens VectorStore (HNSW).
*
* Provides content chunking, embedding, storage, and search for all
* memory content types through a single interface.
*/
export class UnifiedVectorIndex {
private storePath: string;
/**
* Create a UnifiedVectorIndex for a project.
*
* @param projectPath - Project root path (used to resolve storage location)
*/
constructor(projectPath: string) {
const paths = StoragePaths.project(projectPath);
this.storePath = paths.unifiedVectors.root;
ensureStorageDir(this.storePath);
}
/**
* Index content by chunking, embedding, and storing in VectorStore.
*
* @param content - Text content to index
* @param metadata - Metadata for all chunks (source_id, source_type, category)
* @returns Embed result
*/
async indexContent(
content: string,
metadata: ChunkMetadata
): Promise<EmbedResult> {
if (!content.trim()) {
return {
success: true,
chunks_processed: 0,
chunks_failed: 0,
elapsed_time: 0,
};
}
// Chunk content
const textChunks = chunkContent(content);
// Build chunk objects for Python
const chunks: VectorChunk[] = textChunks.map((text, index) => ({
content: text,
source_id: metadata.source_id,
source_type: metadata.source_type,
category: metadata.category,
chunk_index: metadata.chunk_index != null ? metadata.chunk_index + index : index,
metadata: { ...metadata },
}));
try {
const result = await runPython<EmbedResult>({
operation: 'embed',
store_path: this.storePath,
chunks,
batch_size: 8,
});
return result;
} catch (err) {
return {
success: false,
chunks_processed: 0,
chunks_failed: textChunks.length,
elapsed_time: 0,
error: (err as Error).message,
};
}
}
/**
* Search the vector index using semantic similarity.
*
* @param query - Natural language search query
* @param options - Search options (topK, minScore, category)
* @returns Search results sorted by relevance
*/
async search(
query: string,
options: VectorSearchOptions = {}
): Promise<VectorSearchResult> {
const { topK = 10, minScore = 0.3, category } = options;
try {
const result = await runPython<VectorSearchResult>({
operation: 'search',
store_path: this.storePath,
query,
top_k: topK,
min_score: minScore,
category: category || null,
});
return result;
} catch (err) {
return {
success: false,
matches: [],
error: (err as Error).message,
};
}
}
/**
* Search the vector index using a pre-computed embedding vector.
* Bypasses text embedding, directly querying HNSW with a raw vector.
*
* @param vector - Pre-computed embedding vector (array of floats)
* @param options - Search options (topK, minScore, category)
* @returns Search results sorted by relevance
*/
async searchByVector(
vector: number[],
options: VectorSearchOptions = {}
): Promise<VectorSearchResult> {
const { topK = 10, minScore = 0.3, category } = options;
try {
const result = await runPython<VectorSearchResult>({
operation: 'search_by_vector',
store_path: this.storePath,
vector,
top_k: topK,
min_score: minScore,
category: category || null,
});
return result;
} catch (err) {
return {
success: false,
matches: [],
error: (err as Error).message,
};
}
}
/**
* Rebuild the HNSW index from scratch.
*
* @returns Reindex result
*/
async reindexAll(): Promise<ReindexResult> {
try {
const result = await runPython<ReindexResult>({
operation: 'reindex',
store_path: this.storePath,
});
return result;
} catch (err) {
return {
success: false,
error: (err as Error).message,
};
}
}
/**
* Get the current status of the vector index.
*
* @returns Index status including chunk counts, HNSW availability, dimension
*/
async getStatus(): Promise<VectorIndexStatus> {
try {
const result = await runPython<VectorIndexStatus>({
operation: 'status',
store_path: this.storePath,
});
return result;
} catch (err) {
return {
success: false,
total_chunks: 0,
hnsw_available: false,
hnsw_count: 0,
dimension: 0,
error: (err as Error).message,
};
}
}
}