Files
Claude-Code-Workflow/ccw/src/core/memory-embedder-bridge.ts
catlog22 99ee4e7d36 feat: unified task.json schema migration and multi-module updates
- Create task-schema.json (JSON Schema draft-07) with 10 field blocks fusing
  Unified JSONL, 6-field Task JSON, and Solution Schema advantages
- Migrate unified-execute-with-file from JSONL to .task/*.json directory scanning
- Migrate 3 producers (lite-plan, plan-converter, collaborative-plan) to
  .task/*.json multi-file output
- Add review-cycle Phase 7.5 export-to-tasks (FIX-*.json) and issue-resolve
  --export-tasks option
- Add schema compatibility annotations to action-planning-agent, workflow-plan,
  and tdd-plan
- Add spec-generator skill phases and templates
- Add memory v2 pipeline (consolidation, extraction, job scheduler, embedder)
- Add secret-redactor utility and core-memory enhancements
- Add codex-lens accuracy benchmarks and staged env config overrides
2026-02-11 17:40:56 +08:00

371 lines
9.5 KiB
TypeScript

/**
* Memory Embedder Bridge - TypeScript interface to Python memory embedder
*
* This module provides a TypeScript bridge to the Python memory_embedder.py script,
* which generates and searches embeddings for memory chunks using CodexLens's embedder.
*
* Features:
* - Reuses CodexLens venv at ~/.codexlens/venv
* - JSON protocol communication
* - Three commands: embed, search, status
* - Automatic availability checking
* - Stage1 output embedding for V2 pipeline
*/
import { spawn } from 'child_process';
import { join, dirname } from 'path';
import { existsSync } from 'fs';
import { fileURLToPath } from 'url';
import { getCodexLensPython } from '../utils/codexlens-path.js';
import { getCoreMemoryStore } from './core-memory-store.js';
import type { Stage1Output } from './core-memory-store.js';
import { StoragePaths } from '../config/storage-paths.js';
// Get directory of this module
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);
// Venv paths (reuse CodexLens venv)
const VENV_PYTHON = getCodexLensPython();
// Script path
const EMBEDDER_SCRIPT = join(__dirname, '..', '..', 'scripts', 'memory_embedder.py');
// Types
export interface EmbedResult {
success: boolean;
chunks_processed: number;
chunks_failed: number;
elapsed_time: number;
error?: string;
}
export interface SearchMatch {
source_id: string;
source_type: 'core_memory' | 'workflow' | 'cli_history';
chunk_index: number;
content: string;
score: number;
restore_command: string;
}
export interface SearchResult {
success: boolean;
matches: SearchMatch[];
query?: string;
elapsed_time?: number;
error?: string;
}
export interface EmbeddingStatus {
success?: boolean;
total_chunks: number;
embedded_chunks: number;
pending_chunks: number;
by_type: Record<string, { total: number; embedded: number; pending: number }>;
error?: string;
}
export interface EmbedOptions {
sourceId?: string;
batchSize?: number;
force?: boolean;
}
export interface SearchOptions {
topK?: number;
minScore?: number;
sourceType?: 'core_memory' | 'workflow' | 'cli_history';
}
/**
* Check if embedder is available (venv and script exist)
* @returns True if embedder is available
*/
export function isEmbedderAvailable(): boolean {
// Check venv python exists
if (!existsSync(VENV_PYTHON)) {
return false;
}
// Check script exists
if (!existsSync(EMBEDDER_SCRIPT)) {
return false;
}
return true;
}
/**
* Run Python script with arguments
* @param args - Command line arguments
* @param timeout - Timeout in milliseconds
* @returns JSON output from script
*/
function runPython(args: string[], timeout: number = 300000): Promise<string> {
return new Promise((resolve, reject) => {
// Check availability
if (!isEmbedderAvailable()) {
reject(
new Error(
'Memory embedder not available. Ensure CodexLens venv exists at ~/.codexlens/venv'
)
);
return;
}
// Spawn Python process
const child = spawn(VENV_PYTHON, [EMBEDDER_SCRIPT, ...args], {
stdio: ['ignore', 'pipe', 'pipe'],
timeout,
});
let stdout = '';
let stderr = '';
child.stdout.on('data', (data) => {
stdout += data.toString();
});
child.stderr.on('data', (data) => {
stderr += data.toString();
});
child.on('close', (code) => {
if (code === 0) {
resolve(stdout.trim());
} else {
reject(new Error(`Python script failed (exit code ${code}): ${stderr || stdout}`));
}
});
child.on('error', (err) => {
if ((err as NodeJS.ErrnoException).code === 'ETIMEDOUT') {
reject(new Error('Python script timed out'));
} else {
reject(new Error(`Failed to spawn Python: ${err.message}`));
}
});
});
}
/**
* Generate embeddings for memory chunks
* @param dbPath - Path to SQLite database
* @param options - Embedding options
* @returns Embedding result
*/
export async function generateEmbeddings(
dbPath: string,
options: EmbedOptions = {}
): Promise<EmbedResult> {
const { sourceId, batchSize = 8, force = false } = options;
// Build arguments
const args = ['embed', dbPath];
if (sourceId) {
args.push('--source-id', sourceId);
}
if (batchSize !== 8) {
args.push('--batch-size', batchSize.toString());
}
if (force) {
args.push('--force');
}
try {
// Default timeout: 5 minutes
const output = await runPython(args, 300000);
const result = JSON.parse(output) as EmbedResult;
return result;
} catch (err) {
return {
success: false,
chunks_processed: 0,
chunks_failed: 0,
elapsed_time: 0,
error: (err as Error).message,
};
}
}
/**
* Search memory chunks using semantic search
* @param dbPath - Path to SQLite database
* @param query - Search query text
* @param options - Search options
* @returns Search results
*/
export async function searchMemories(
dbPath: string,
query: string,
options: SearchOptions = {}
): Promise<SearchResult> {
const { topK = 10, minScore = 0.3, sourceType } = options;
// Build arguments
const args = ['search', dbPath, query];
if (topK !== 10) {
args.push('--top-k', topK.toString());
}
if (minScore !== 0.3) {
args.push('--min-score', minScore.toString());
}
if (sourceType) {
args.push('--type', sourceType);
}
try {
// Default timeout: 30 seconds
const output = await runPython(args, 30000);
const result = JSON.parse(output) as SearchResult;
return result;
} catch (err) {
return {
success: false,
matches: [],
error: (err as Error).message,
};
}
}
/**
* Get embedding status statistics
* @param dbPath - Path to SQLite database
* @returns Embedding status
*/
export async function getEmbeddingStatus(dbPath: string): Promise<EmbeddingStatus> {
// Build arguments
const args = ['status', dbPath];
try {
// Default timeout: 30 seconds
const output = await runPython(args, 30000);
const result = JSON.parse(output) as EmbeddingStatus;
return { ...result, success: true };
} catch (err) {
return {
success: false,
total_chunks: 0,
embedded_chunks: 0,
pending_chunks: 0,
by_type: {},
error: (err as Error).message,
};
}
}
// ============================================================================
// Memory V2: Stage1 Output Embedding
// ============================================================================
/** Result of stage1 embedding operation */
export interface Stage1EmbedResult {
success: boolean;
chunksCreated: number;
chunksEmbedded: number;
error?: string;
}
/**
* Chunk and embed stage1_outputs (raw_memory + rollout_summary) for semantic search.
*
* Reads all stage1_outputs from the DB, chunks their raw_memory and rollout_summary
* content, inserts chunks into memory_chunks with source_type='cli_history' and
* metadata indicating the V2 origin, then triggers embedding generation.
*
* Uses source_id format: "s1:{thread_id}" to differentiate from regular cli_history chunks.
*
* @param projectPath - Project root path
* @param force - Force re-chunking even if chunks exist
* @returns Embedding result
*/
export async function embedStage1Outputs(
projectPath: string,
force: boolean = false
): Promise<Stage1EmbedResult> {
try {
const store = getCoreMemoryStore(projectPath);
const stage1Outputs = store.listStage1Outputs();
if (stage1Outputs.length === 0) {
return { success: true, chunksCreated: 0, chunksEmbedded: 0 };
}
let totalChunksCreated = 0;
for (const output of stage1Outputs) {
const sourceId = `s1:${output.thread_id}`;
// Check if already chunked
const existingChunks = store.getChunks(sourceId);
if (existingChunks.length > 0 && !force) continue;
// Delete old chunks if force
if (force && existingChunks.length > 0) {
store.deleteChunks(sourceId);
}
// Combine raw_memory and rollout_summary for richer semantic content
const combinedContent = [
output.rollout_summary ? `## Summary\n${output.rollout_summary}` : '',
output.raw_memory ? `## Raw Memory\n${output.raw_memory}` : '',
].filter(Boolean).join('\n\n');
if (!combinedContent.trim()) continue;
// Chunk using the store's built-in chunking
const chunks = store.chunkContent(combinedContent, sourceId, 'cli_history');
// Insert chunks with V2 metadata
for (let i = 0; i < chunks.length; i++) {
store.insertChunk({
source_id: sourceId,
source_type: 'cli_history',
chunk_index: i,
content: chunks[i],
metadata: JSON.stringify({
v2_source: 'stage1_output',
thread_id: output.thread_id,
generated_at: output.generated_at,
}),
created_at: new Date().toISOString(),
});
totalChunksCreated++;
}
}
// If we created chunks, generate embeddings
let chunksEmbedded = 0;
if (totalChunksCreated > 0) {
const paths = StoragePaths.project(projectPath);
const dbPath = join(paths.root, 'core-memory', 'core_memory.db');
const embedResult = await generateEmbeddings(dbPath, { force: false });
if (embedResult.success) {
chunksEmbedded = embedResult.chunks_processed;
}
}
return {
success: true,
chunksCreated: totalChunksCreated,
chunksEmbedded,
};
} catch (err) {
return {
success: false,
chunksCreated: 0,
chunksEmbedded: 0,
error: (err as Error).message,
};
}
}