mirror of
https://github.com/catlog22/Claude-Code-Workflow.git
synced 2026-03-02 15:23:19 +08:00
- Introduced test suite for AstGrepPythonProcessor covering pattern definitions, parsing, and relationship extraction. - Added comparison tests between tree-sitter and ast-grep for consistency in relationship extraction. - Implemented tests for ast-grep binding module to verify functionality and availability. - Ensured tests cover various scenarios including inheritance, function calls, and imports.
475 lines
13 KiB
TypeScript
475 lines
13 KiB
TypeScript
/**
 * Unified Vector Index - TypeScript bridge to unified_memory_embedder.py
 *
 * Provides HNSW-backed vector indexing and search for all memory content
 * (core_memory, cli_history, workflow, entity, pattern) via CodexLens VectorStore.
 *
 * Features:
 * - JSON stdin/stdout protocol to the Python embedder
 * - Content chunking (paragraph -> sentence splitting, CHUNK_SIZE=1500, OVERLAP=200)
 * - Batch embedding via CodexLens EmbedderFactory
 * - HNSW approximate nearest neighbor search (sub-10ms for 1000 chunks)
 * - Category-based filtering
 */
|
|
|
|
import { spawn } from 'child_process';
|
|
import { join, dirname } from 'path';
|
|
import { existsSync } from 'fs';
|
|
import { fileURLToPath } from 'url';
|
|
import { getCodexLensPython } from '../utils/codexlens-path.js';
|
|
import { StoragePaths, ensureStorageDir } from '../config/storage-paths.js';
|
|
|
|
// ESM modules have no built-in __filename/__dirname, so derive them from import.meta.url
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

// Python interpreter path, as reported by the CodexLens path helper (the CodexLens venv is reused)
const VENV_PYTHON = getCodexLensPython();

// Embedder script path: two directories above this module, then scripts/
const BASE_DIR = join(__dirname, '..', '..');
const EMBEDDER_SCRIPT = join(BASE_DIR, 'scripts', 'unified_memory_embedder.py');
|
|
|
|
// Chunking parameters, measured in characters (match existing core-memory-store.ts)
const CHUNK_SIZE = 1_500;
const OVERLAP = 200;
|
|
|
|
// =============================================================================
|
|
// Types
|
|
// =============================================================================
|
|
|
|
/** Kinds of source that vector content may originate from. */
export type SourceType =
  | 'core_memory'
  | 'workflow'
  | 'cli_history';
|
|
|
|
/** Category labels accepted when filtering vector content. */
export type VectorCategory =
  | 'core_memory'
  | 'cli_history'
  | 'workflow'
  | 'entity'
  | 'pattern';
|
|
|
|
/**
 * Metadata describing the source of indexed content.
 *
 * `UnifiedVectorIndex.indexContent` copies the whole object (including any
 * extra keys) into each chunk's `metadata` field.
 */
export interface ChunkMetadata {
  /** Identifier of the originating item (e.g., a memory ID or session ID). */
  source_id: string;

  /** Kind of source the content came from. */
  source_type: SourceType;

  /** Category label used for filtering. */
  category: VectorCategory;

  /** Optional starting chunk index; `indexContent` adds each chunk's position to it. */
  chunk_index?: number;

  /** Any further caller-defined metadata keys. */
  [key: string]: unknown;
}
|
|
|
|
/** One piece of text, plus its provenance, sent to the embedder in an `embed` request. */
export interface VectorChunk {
  /** The chunk's text. */
  content: string;

  /** Identifier of the originating item. */
  source_id: string;

  /** Kind of source the chunk came from. */
  source_type: SourceType;

  /** Category label used for filtering. */
  category: VectorCategory;

  /** Index of this chunk. */
  chunk_index: number;

  /** Optional extra metadata carried with the chunk. */
  metadata?: { [key: string]: unknown };
}
|
|
|
|
/**
 * Outcome of an embed operation.
 *
 * Normally this is the embedder's JSON response as-is. When the Python call
 * fails, `indexContent` builds it locally with `success: false`, every chunk
 * counted in `chunks_failed`, and `error` set to the failure message.
 */
export interface EmbedResult {
  success: boolean;
  chunks_processed: number;
  chunks_failed: number;
  elapsed_time: number;
  /** Failure message, when one is available. */
  error?: string;
}
|
|
|
|
/** One hit returned by a vector search. */
export interface VectorSearchMatch {
  /** Text of the matched chunk. */
  content: string;
  /** Score assigned to the match. */
  score: number;
  source_id: string;
  source_type: string;
  chunk_index: number;
  category: string;
  /** Metadata returned alongside the match. */
  metadata: { [key: string]: unknown };
}
|
|
|
|
/**
 * Outcome of a search operation.
 *
 * When the Python call fails, the search methods return `success: false`,
 * an empty `matches` array, and `error` set to the failure message.
 */
export interface VectorSearchResult {
  success: boolean;
  matches: Array<VectorSearchMatch>;
  elapsed_time?: number;
  total_searched?: number;
  /** Failure message, when one is available. */
  error?: string;
}
|
|
|
|
/** Optional tuning knobs accepted by the search methods. */
export interface VectorSearchOptions {
  /** Sent to the embedder as `top_k`; the search methods default it to 10. */
  topK?: number;
  /** Sent to the embedder as `min_score`; the search methods default it to 0.3. */
  minScore?: number;
  /** Category filter; sent as `null` when omitted. */
  category?: VectorCategory;
}
|
|
|
|
/**
 * Status report for the vector index.
 *
 * When the Python call fails, `getStatus` returns `success: false` with all
 * counts and the dimension set to 0, `hnsw_available: false`, and `error` set.
 */
export interface VectorIndexStatus {
  success: boolean;
  total_chunks: number;
  hnsw_available: boolean;
  hnsw_count: number;
  dimension: number;
  /** Numeric value per category name, when reported. */
  categories?: { [category: string]: number };
  /** Embedding model configuration, when reported. */
  model_config?: {
    backend: string;
    profile: string;
    dimension: number;
    max_tokens: number;
  };
  /** Failure message, when one is available. */
  error?: string;
}
|
|
|
|
/**
 * Outcome of a reindex operation.
 *
 * When the Python call fails, `reindexAll` returns only `success: false`
 * and `error`.
 */
export interface ReindexResult {
  success: boolean;
  hnsw_count?: number;
  elapsed_time?: number;
  /** Failure message, when one is available. */
  error?: string;
}
|
|
|
|
// =============================================================================
|
|
// Python Bridge
|
|
// =============================================================================
|
|
|
|
/**
 * Report whether the unified embedder can be launched.
 *
 * @returns `true` only when both the venv Python path and the embedder
 *          script path exist on disk
 */
export function isUnifiedEmbedderAvailable(): boolean {
  // Short-circuits: the script path is only checked when the interpreter exists
  return existsSync(VENV_PYTHON) && existsSync(EMBEDDER_SCRIPT);
}
|
|
|
|
/**
 * Run the Python embedder script with a JSON stdin/stdout protocol.
 *
 * The request is serialized to JSON and written to the child's stdin; once the
 * child closes, its entire (trimmed) stdout is parsed as one JSON document.
 *
 * @param request - JSON-serializable request object to send via stdin
 * @param timeout - Maximum run time in milliseconds (default: 5 minutes)
 * @returns Parsed JSON response
 * @throws The returned promise rejects with an `Error` when the embedder is
 *         unavailable, the request cannot be serialized, the process cannot be
 *         spawned, the process is killed by the timeout, it exits non-zero or
 *         with empty stdout, or its stdout is not valid JSON.
 */
function runPython<T>(request: Record<string, unknown>, timeout: number = 300000): Promise<T> {
  return new Promise((resolve, reject) => {
    if (!isUnifiedEmbedderAvailable()) {
      reject(
        new Error(
          'Unified embedder not available. Ensure CodexLens venv exists at ~/.codexlens/venv'
        )
      );
      return;
    }

    // Serialize BEFORE spawning: a non-serializable request (e.g. circular
    // reference) must fail fast instead of leaving a child waiting on stdin.
    let jsonInput: string;
    try {
      jsonInput = JSON.stringify(request);
    } catch (err) {
      const reason = err instanceof Error ? err.message : String(err);
      reject(new Error(`Failed to serialize request: ${reason}`));
      return;
    }

    const child = spawn(VENV_PYTHON, [EMBEDDER_SCRIPT], {
      stdio: ['pipe', 'pipe', 'pipe'],
      timeout,
    });

    let stdout = '';
    let stderr = '';

    // Decode at the stream level so a multi-byte UTF-8 character that is split
    // across two chunks is reassembled instead of being corrupted.
    child.stdout.setEncoding('utf8');
    child.stderr.setEncoding('utf8');

    child.stdout.on('data', (data: string) => {
      stdout += data;
    });

    child.stderr.on('data', (data: string) => {
      stderr += data;
    });

    child.on('close', (code, signal) => {
      // With the async spawn `timeout` option Node kills the child itself; no
      // 'error' event is emitted. This function never calls kill(), so
      // `child.killed` here means the timeout fired.
      if (child.killed) {
        reject(new Error(`Python script timed out after ${timeout}ms`));
        return;
      }

      const output = stdout.trim();
      if (code === 0 && output) {
        try {
          resolve(JSON.parse(output) as T);
        } catch {
          reject(new Error(`Failed to parse Python output: ${stdout.substring(0, 500)}`));
        }
        return;
      }

      const how = signal ? `signal ${signal}` : `exit code ${code}`;
      reject(new Error(`Python script failed (${how}): ${stderr || stdout}`));
    });

    child.on('error', (err) => {
      // Kept for backward compatibility with the original message mapping
      if ((err as NodeJS.ErrnoException).code === 'ETIMEDOUT') {
        reject(new Error('Python script timed out'));
      } else {
        reject(new Error(`Failed to spawn Python: ${err.message}`));
      }
    });

    // If the child exits before consuming stdin, the write fails (e.g. EPIPE)
    // and the stream emits 'error'. Without a listener that would be an
    // unhandled error event. The failure itself is reported by the 'close' /
    // 'error' handlers above, which carry the exit status and stderr.
    child.stdin.on('error', () => {
      /* reported via child 'close' / 'error' */
    });

    // Write JSON request to stdin and close
    child.stdin.end(jsonInput);
  });
}
|
|
|
|
// =============================================================================
|
|
// Content Chunking
|
|
// =============================================================================
|
|
|
|
/**
 * Chunk content into smaller pieces for embedding.
 * Uses a paragraph-first, sentence-fallback strategy with overlap.
 *
 * Strategy:
 * - Split on paragraph boundaries (\n\n) and pack paragraphs into chunks of
 *   at most `chunkSize` characters, seeding each new chunk with the last
 *   `overlap` characters of the previous one.
 * - Any chunk still longer than `chunkSize` is re-split on sentence
 *   boundaries (". "), again with overlap. The separator is restored between
 *   sentences only; the final sentence is emitted exactly as written, so no
 *   period is invented or doubled at the end.
 * - A single sentence longer than `chunkSize` is not split further.
 *
 * @param content - Text content to chunk
 * @param chunkSize - Target maximum chunk length in characters (default: CHUNK_SIZE)
 * @param overlap - Characters carried over from the previous chunk (default: OVERLAP)
 * @returns Array of chunk strings; `[content]` when nothing non-blank was produced
 */
export function chunkContent(
  content: string,
  chunkSize: number = CHUNK_SIZE,
  overlap: number = OVERLAP
): string[] {
  // slice(-0) would return the whole string, so treat non-positive overlap as "none"
  const tail = (text: string): string => (overlap > 0 ? text.slice(-overlap) : '');

  const chunks: string[] = [];

  // Pass 1: pack paragraphs
  const paragraphs = content.split(/\n\n+/);
  let currentChunk = '';

  for (const paragraph of paragraphs) {
    const wouldOverflow = currentChunk.length + paragraph.length > chunkSize;
    if (wouldOverflow && currentChunk.length > 0) {
      chunks.push(currentChunk.trim());
      // Start the next chunk with overlap from the one just emitted
      currentChunk = tail(currentChunk) + '\n\n' + paragraph;
    } else {
      currentChunk += (currentChunk ? '\n\n' : '') + paragraph;
    }
  }

  if (currentChunk.trim()) {
    chunks.push(currentChunk.trim());
  }

  // Pass 2: sentence fallback for chunks that are still too large
  const finalChunks: string[] = [];
  for (const chunk of chunks) {
    if (chunk.length <= chunkSize) {
      finalChunks.push(chunk);
      continue;
    }

    const sentences = chunk.split(/\. +/);
    const lastIndex = sentences.length - 1;
    let sentenceChunk = '';

    sentences.forEach((sentence, i) => {
      // split() consumed the ". " separators; restore one after every sentence
      // except the last, which keeps whatever ending the source text had.
      const piece = i < lastIndex ? sentence + '. ' : sentence;

      if (sentenceChunk.length + piece.length > chunkSize && sentenceChunk.length > 0) {
        finalChunks.push(sentenceChunk.trim());
        sentenceChunk = tail(sentenceChunk) + piece;
      } else {
        sentenceChunk += piece;
      }
    });

    if (sentenceChunk.trim()) {
      finalChunks.push(sentenceChunk.trim());
    }
  }

  return finalChunks.length > 0 ? finalChunks : [content];
}
|
|
|
|
// =============================================================================
|
|
// UnifiedVectorIndex Class
|
|
// =============================================================================
|
|
|
|
/**
 * Single entry point for chunking, embedding, storing, and searching memory
 * content, backed by the CodexLens VectorStore (HNSW) through the Python
 * embedder script.
 *
 * Every public method converts a failed Python call into a result object with
 * `success: false` and an `error` message rather than throwing.
 */
export class UnifiedVectorIndex {
  private storePath: string;

  /**
   * Create a UnifiedVectorIndex for a project.
   *
   * @param projectPath - Project root path, used to resolve the storage
   *                      location; the storage directory is ensured here
   */
  constructor(projectPath: string) {
    const { unifiedVectors } = StoragePaths.project(projectPath);
    this.storePath = unifiedVectors.root;
    ensureStorageDir(this.storePath);
  }

  /**
   * Chunk the given content, then send the chunks to the embedder for storage.
   *
   * @param content - Text content to index; blank content is a successful no-op
   * @param metadata - Metadata applied to every chunk (source_id, source_type, category)
   * @returns Embed result
   */
  async indexContent(content: string, metadata: ChunkMetadata): Promise<EmbedResult> {
    // Nothing to embed: report success without invoking Python
    if (content.trim() === '') {
      return { success: true, chunks_processed: 0, chunks_failed: 0, elapsed_time: 0 };
    }

    const pieces = chunkContent(content);

    // Chunk indices continue from metadata.chunk_index when the caller supplied one
    const firstIndex = metadata.chunk_index ?? 0;
    const chunks: VectorChunk[] = [];
    pieces.forEach((text, offset) => {
      chunks.push({
        content: text,
        source_id: metadata.source_id,
        source_type: metadata.source_type,
        category: metadata.category,
        chunk_index: firstIndex + offset,
        metadata: { ...metadata },
      });
    });

    const request = {
      operation: 'embed',
      store_path: this.storePath,
      chunks,
      batch_size: 8,
    };

    try {
      return await runPython<EmbedResult>(request);
    } catch (err) {
      const failure: EmbedResult = {
        success: false,
        chunks_processed: 0,
        chunks_failed: pieces.length,
        elapsed_time: 0,
        error: (err as Error).message,
      };
      return failure;
    }
  }

  /**
   * Run a semantic-similarity search for a text query.
   *
   * @param query - Natural language search query
   * @param options - Search options (topK defaults to 10, minScore to 0.3, category optional)
   * @returns Search results as returned by the embedder, or a failure result
   */
  async search(query: string, options: VectorSearchOptions = {}): Promise<VectorSearchResult> {
    const limit = options.topK === undefined ? 10 : options.topK;
    const threshold = options.minScore === undefined ? 0.3 : options.minScore;
    const filter = options.category ? options.category : null;

    const request = {
      operation: 'search',
      store_path: this.storePath,
      query,
      top_k: limit,
      min_score: threshold,
      category: filter,
    };

    try {
      return await runPython<VectorSearchResult>(request);
    } catch (err) {
      const failure: VectorSearchResult = {
        success: false,
        matches: [],
        error: (err as Error).message,
      };
      return failure;
    }
  }

  /**
   * Run a search with a pre-computed embedding vector instead of a text query.
   *
   * @param vector - Pre-computed embedding vector (array of floats)
   * @param options - Search options (topK defaults to 10, minScore to 0.3, category optional)
   * @returns Search results as returned by the embedder, or a failure result
   */
  async searchByVector(
    vector: number[],
    options: VectorSearchOptions = {}
  ): Promise<VectorSearchResult> {
    const limit = options.topK === undefined ? 10 : options.topK;
    const threshold = options.minScore === undefined ? 0.3 : options.minScore;
    const filter = options.category ? options.category : null;

    const request = {
      operation: 'search_by_vector',
      store_path: this.storePath,
      vector,
      top_k: limit,
      min_score: threshold,
      category: filter,
    };

    try {
      return await runPython<VectorSearchResult>(request);
    } catch (err) {
      const failure: VectorSearchResult = {
        success: false,
        matches: [],
        error: (err as Error).message,
      };
      return failure;
    }
  }

  /**
   * Ask the embedder to rebuild the HNSW index (`reindex` operation).
   *
   * @returns Reindex result, or a failure result when the Python call fails
   */
  async reindexAll(): Promise<ReindexResult> {
    const request = { operation: 'reindex', store_path: this.storePath };

    try {
      return await runPython<ReindexResult>(request);
    } catch (err) {
      const failure: ReindexResult = {
        success: false,
        error: (err as Error).message,
      };
      return failure;
    }
  }

  /**
   * Query the embedder for the current state of the index (`status` operation).
   *
   * @returns Index status; on failure all counts are 0 and `hnsw_available` is false
   */
  async getStatus(): Promise<VectorIndexStatus> {
    const request = { operation: 'status', store_path: this.storePath };

    try {
      return await runPython<VectorIndexStatus>(request);
    } catch (err) {
      const failure: VectorIndexStatus = {
        success: false,
        total_chunks: 0,
        hnsw_available: false,
        hnsw_count: 0,
        dimension: 0,
        error: (err as Error).message,
      };
      return failure;
    }
  }
}
|