Implement SPLADE sparse encoder and associated database migrations

- Added `splade_encoder.py` for ONNX-optimized SPLADE encoding, including methods for encoding text and batch processing.
- Created `SPLADE_IMPLEMENTATION.md` to document the SPLADE encoder's functionality, design patterns, and integration points.
- Introduced migration script `migration_009_add_splade.py` to add SPLADE metadata and posting list tables to the database.
- Developed `splade_index.py` for managing the SPLADE inverted index, supporting efficient sparse vector retrieval.
- Added verification script `verify_watcher.py` to test FileWatcher event filtering and debouncing functionality.
This commit is contained in:
catlog22
2026-01-01 17:41:22 +08:00
parent 520f2d26f2
commit 5bb01755bc
16 changed files with 3122 additions and 2792 deletions

View File

@@ -17,6 +17,16 @@ import {
isIndexingInProgress
} from '../../tools/codex-lens.js';
import type { ProgressInfo, GpuMode } from '../../tools/codex-lens.js';
import { loadLiteLLMApiConfig } from '../../config/litellm-api-config-manager.js';
// File watcher state. Kept at module scope so it persists across requests.
// `watcherProcess` holds the spawned watcher child process, or null when no watcher is running.
let watcherProcess: any = null;
// Counters and metadata reported by the watcher status endpoint.
let watcherStats: {
  running: boolean;
  root_path: string;
  events_processed: number;
  start_time: Date | null;
} = {
  running: false,
  root_path: '',
  events_processed: 0,
  start_time: null
};
export interface RouteContext {
pathname: string;
@@ -1052,5 +1062,478 @@ export async function handleCodexLensRoutes(ctx: RouteContext): Promise<boolean>
return true;
}
// ============================================================
// RERANKER CONFIGURATION ENDPOINTS
// ============================================================
// API: Get Reranker Configuration
// Responds with built-in defaults, overlaid with LiteLLM endpoint names (for the UI dropdown)
// and, when CodexLens is installed, with the values reported by `codexlens config --json`.
if (pathname === '/api/codexlens/reranker/config' && req.method === 'GET') {
  try {
    const venvStatus = await checkVenvStatus();

    // Default reranker config
    const rerankerConfig = {
      backend: 'onnx',
      model_name: 'cross-encoder/ms-marco-MiniLM-L-6-v2',
      api_provider: 'siliconflow',
      // The key lives in this process's environment (the POST handler writes it there),
      // so report it regardless of whether the CodexLens config could be read.
      api_key_set: Boolean(process.env.RERANKER_API_KEY),
      available_backends: ['onnx', 'api', 'litellm', 'legacy'],
      api_providers: ['siliconflow', 'cohere', 'jina'],
      litellm_endpoints: [] as string[],
      config_source: 'default'
    };

    // Load LiteLLM endpoints for dropdown
    try {
      const litellmConfig = loadLiteLLMApiConfig(initialPath);
      if (litellmConfig.endpoints && Array.isArray(litellmConfig.endpoints)) {
        rerankerConfig.litellm_endpoints = litellmConfig.endpoints
          .map((ep: any) => ep.alias || ep.name || ep.baseUrl)
          .filter(Boolean);
      }
    } catch (e) {
      // Best effort: LiteLLM config not available, continue with empty endpoints
    }

    // If CodexLens is installed, try to get actual config
    if (venvStatus.ready) {
      try {
        const result = await executeCodexLens(['config', '--json']);
        if (result.success) {
          const config = extractJSON(result.output);
          const stored = config?.success ? config.result : null;
          if (stored) {
            // Map config values; only non-empty values override the defaults
            if (stored.reranker_backend) {
              rerankerConfig.backend = stored.reranker_backend;
              rerankerConfig.config_source = 'codexlens';
            }
            if (stored.reranker_model) {
              rerankerConfig.model_name = stored.reranker_model;
            }
            if (stored.reranker_api_provider) {
              rerankerConfig.api_provider = stored.reranker_api_provider;
            }
          }
        }
      } catch (e) {
        // Fall back to the defaults above; the failure is logged, not surfaced to the client
        console.error('[CodexLens] Failed to get reranker config:', e);
      }
    }

    res.writeHead(200, { 'Content-Type': 'application/json' });
    res.end(JSON.stringify({ success: true, ...rerankerConfig }));
  } catch (err) {
    res.writeHead(500, { 'Content-Type': 'application/json' });
    res.end(JSON.stringify({
      success: false,
      error: err instanceof Error ? err.message : String(err)
    }));
  }
  return true;
}
// API: Set Reranker Configuration
// Accepts any subset of { backend, model_name, api_provider, api_key, litellm_endpoint }.
// Each provided (truthy) setting is written with `codexlens config set <key> <value> --json`;
// the API key is kept only in this process's environment.
// Returns 400 for invalid input and 500 when any `config set` call fails.
if (pathname === '/api/codexlens/reranker/config' && req.method === 'POST') {
  handlePostRequest(req, res, async (body) => {
    const { backend, model_name, api_provider, api_key, litellm_endpoint } = body;

    // Validate backend
    const validBackends = ['onnx', 'api', 'litellm', 'legacy'];
    if (backend && !validBackends.includes(backend)) {
      return { success: false, error: `Invalid backend: ${backend}. Valid options: ${validBackends.join(', ')}`, status: 400 };
    }

    // Validate api_provider
    const validProviders = ['siliconflow', 'cohere', 'jina'];
    if (api_provider && !validProviders.includes(api_provider)) {
      return { success: false, error: `Invalid api_provider: ${api_provider}. Valid options: ${validProviders.join(', ')}`, status: 400 };
    }

    // Free-form fields end up as CLI arguments / environment values, so they must be strings
    const freeFormFields: Array<[string, unknown]> = [
      ['model_name', model_name],
      ['litellm_endpoint', litellm_endpoint],
      ['api_key', api_key]
    ];
    for (const [field, value] of freeFormFields) {
      if (value && typeof value !== 'string') {
        return { success: false, error: `Invalid ${field}: expected a string`, status: 400 };
      }
    }

    try {
      // [field name reported to the client, CodexLens config key, requested value]
      const settings: Array<[string, string, string | undefined]> = [
        ['backend', 'reranker_backend', backend],
        ['model_name', 'reranker_model', model_name],
        ['api_provider', 'reranker_api_provider', api_provider],
        ['litellm_endpoint', 'reranker_litellm_endpoint', litellm_endpoint]
      ];

      const updates: string[] = [];
      const failed: string[] = [];
      for (const [field, key, value] of settings) {
        if (!value) continue; // omitted or empty: leave the stored value untouched
        const result = await executeCodexLens(['config', 'set', key, value, '--json']);
        (result.success ? updates : failed).push(field);
      }

      // Handle API key - write to .env file or environment
      if (api_key) {
        // For security, we store in process.env for the current session
        // In production, this should be written to a secure .env file
        process.env.RERANKER_API_KEY = api_key;
        updates.push('api_key');
      }

      // Do not report success when a `config set` call failed
      if (failed.length > 0) {
        return {
          success: false,
          error: `Failed to update: ${failed.join(', ')}`,
          updated_fields: updates,
          failed_fields: failed,
          status: 500
        };
      }

      return {
        success: true,
        message: `Updated: ${updates.join(', ')}`,
        updated_fields: updates
      };
    } catch (err) {
      return { success: false, error: err instanceof Error ? err.message : String(err), status: 500 };
    }
  });
  return true;
}
// ============================================================
// FILE WATCHER CONTROL ENDPOINTS
// ============================================================
// API: Get File Watcher Status
// Serialises the in-memory watcher state. Uptime is derived from `start_time` at request
// time and is 0 when no start time is recorded.
if (pathname === '/api/codexlens/watch/status') {
  res.writeHead(200, { 'Content-Type': 'application/json' });

  const startedAt = watcherStats.start_time;
  let startTimeIso: string | null = null;
  let uptimeSeconds = 0;
  if (startedAt) {
    startTimeIso = startedAt.toISOString();
    uptimeSeconds = Math.floor((Date.now() - startedAt.getTime()) / 1000);
  }

  const statusPayload = {
    success: true,
    running: watcherStats.running,
    root_path: watcherStats.root_path,
    events_processed: watcherStats.events_processed,
    start_time: startTimeIso,
    uptime_seconds: uptimeSeconds
  };
  res.end(JSON.stringify(statusPayload));
  return true;
}
// API: Start File Watcher
// Body: { path?: string, debounce_ms?: number } - `path` defaults to `initialPath`,
// `debounce_ms` defaults to 1000.
// Spawns `codexlens watch <path> --debounce <ms>` as a child process, records it in the
// module-level watcher state and broadcasts CODEXLENS_WATCHER_STATUS.
// Returns 400 when a watcher is already running, the input is invalid, or CodexLens is
// not installed; 500 on unexpected errors.
if (pathname === '/api/codexlens/watch/start' && req.method === 'POST') {
  handlePostRequest(req, res, async (body) => {
    const { path: watchPath, debounce_ms = 1000 } = body;
    const targetPath = watchPath || initialPath;

    if (watcherStats.running) {
      return { success: false, error: 'Watcher already running', status: 400 };
    }

    // The value becomes a CLI argument: accept a number or a numeric string, nothing else
    const debounceMs = typeof debounce_ms === 'string' && debounce_ms.trim() !== ''
      ? Number(debounce_ms)
      : debounce_ms;
    if (typeof debounceMs !== 'number' || !Number.isFinite(debounceMs) || debounceMs < 0) {
      return { success: false, error: 'Invalid debounce_ms: expected a non-negative number', status: 400 };
    }

    try {
      const { spawn } = await import('child_process');
      const { existsSync, statSync } = await import('fs');

      // Validate path exists and is a directory
      if (!existsSync(targetPath)) {
        return { success: false, error: `Path does not exist: ${targetPath}`, status: 400 };
      }
      const pathStat = statSync(targetPath);
      if (!pathStat.isDirectory()) {
        return { success: false, error: `Path is not a directory: ${targetPath}`, status: 400 };
      }

      // Make sure CodexLens is installed before trying to launch it
      const venvStatus = await checkVenvStatus();
      if (!venvStatus.ready) {
        return { success: false, error: 'CodexLens not installed', status: 400 };
      }

      // Spawn watch process (no shell: true for security)
      // Without a shell the executable name must be explicit, hence the .exe suffix on Windows
      const isWindows = process.platform === 'win32';
      const codexlensCmd = isWindows ? 'codexlens.exe' : 'codexlens';
      const args = ['watch', targetPath, '--debounce', String(debounceMs)];

      const child = spawn(codexlensCmd, args, {
        cwd: targetPath,
        stdio: ['ignore', 'pipe', 'pipe'],
        env: { ...process.env }
      });
      watcherProcess = child;

      watcherStats = {
        running: true,
        root_path: targetPath,
        events_processed: 0,
        start_time: new Date()
      };

      // Clear the module state only while it still refers to this child, so a late event
      // from an old process cannot clobber a watcher that was started afterwards.
      const markStopped = (): boolean => {
        if (watcherProcess !== child) return false;
        watcherStats.running = false;
        watcherProcess = null;
        return true;
      };

      // Handle process output for event counting
      if (child.stdout) {
        child.stdout.on('data', (data: Buffer) => {
          if (watcherProcess !== child) return;
          const output = data.toString();
          // Add up the N of every "Processed N event(s)" message in this chunk
          const pattern = /Processed (\d+) events?/g;
          let match: RegExpExecArray | null;
          while ((match = pattern.exec(output)) !== null) {
            watcherStats.events_processed += Number(match[1]);
          }
        });
      }

      // Drain stderr: a piped stream that is never read can fill up and stall the child
      if (child.stderr) {
        child.stderr.on('data', (data: Buffer) => {
          console.error(`[CodexLens] Watcher stderr: ${data.toString().trimEnd()}`);
        });
      }

      // Spawn failures (e.g. executable not on PATH) are reported asynchronously via 'error'.
      // Without a listener the event would be thrown as an uncaught exception, and since
      // 'exit' may not fire in that case the state would stay "running".
      child.on('error', (spawnErr: Error) => {
        console.error('[CodexLens] Watcher process error:', spawnErr.message);
        if (markStopped()) {
          broadcastToClients({
            type: 'CODEXLENS_WATCHER_STATUS',
            payload: { running: false }
          });
        }
      });

      // Handle process exit
      child.on('exit', (code: number | null) => {
        markStopped();
        console.log(`[CodexLens] Watcher exited with code ${code}`);
      });

      // Broadcast watcher started
      broadcastToClients({
        type: 'CODEXLENS_WATCHER_STATUS',
        payload: { running: true, path: targetPath }
      });

      return {
        success: true,
        message: 'Watcher started',
        path: targetPath,
        pid: child.pid
      };
    } catch (err) {
      return { success: false, error: err instanceof Error ? err.message : String(err), status: 500 };
    }
  });
  return true;
}
// API: Stop File Watcher
// Sends SIGTERM to the watcher, waits 500 ms, then sends SIGKILL if the process has not
// exited. Resets the module-level watcher state, broadcasts CODEXLENS_WATCHER_STATUS and
// returns the final event count and uptime.
// Returns 400 when no watcher is running, 500 on unexpected errors.
if (pathname === '/api/codexlens/watch/stop' && req.method === 'POST') {
  handlePostRequest(req, res, async () => {
    if (!watcherStats.running || !watcherProcess) {
      return { success: false, error: 'Watcher not running', status: 400 };
    }

    try {
      // Keep a local handle: the module variable may be cleared by the child's exit
      // handler while we are waiting below.
      const child = watcherProcess;

      // Send SIGTERM to gracefully stop the watcher
      child.kill('SIGTERM');

      // Wait a moment for graceful shutdown
      await new Promise(resolve => setTimeout(resolve, 500));

      // Force kill if still running. `child.killed` only says that a signal was delivered,
      // not that the process ended, so check exitCode/signalCode instead - both stay null
      // until the process has actually terminated.
      const hasExited = child.exitCode !== null || child.signalCode !== null;
      if (!hasExited) {
        child.kill('SIGKILL');
      }

      const finalStats = {
        events_processed: watcherStats.events_processed,
        uptime_seconds: watcherStats.start_time
          ? Math.floor((Date.now() - watcherStats.start_time.getTime()) / 1000)
          : 0
      };

      watcherStats = {
        running: false,
        root_path: '',
        events_processed: 0,
        start_time: null
      };
      watcherProcess = null;

      // Broadcast watcher stopped
      broadcastToClients({
        type: 'CODEXLENS_WATCHER_STATUS',
        payload: { running: false }
      });

      return {
        success: true,
        message: 'Watcher stopped',
        ...finalStats
      };
    } catch (err) {
      return { success: false, error: err instanceof Error ? err.message : String(err), status: 500 };
    }
  });
  return true;
}
// ============================================================
// SPLADE ENDPOINTS
// ============================================================
// API: SPLADE Status - Check if SPLADE is available and installed
// Always answers with HTTP 200; the outcome is carried by `available` / `installed`
// (which are always equal) and `error` (null when available).
if (pathname === '/api/codexlens/splade/status') {
  const spladeModel = 'naver/splade-cocondenser-ensembledistil';
  const sendStatus = (available: boolean, error: string | null): void => {
    res.writeHead(200, { 'Content-Type': 'application/json' });
    res.end(JSON.stringify({
      available,
      installed: available,
      model: spladeModel,
      error
    }));
  };

  try {
    // Check if CodexLens is installed first
    const venvStatus = await checkVenvStatus();
    if (!venvStatus.ready) {
      sendStatus(false, 'CodexLens not installed');
      return true;
    }

    // Check SPLADE availability using Python check
    const result = await executeCodexLens(['python', '-c',
      'from codexlens.semantic.splade_encoder import check_splade_available; ok, err = check_splade_available(); print("OK" if ok else err)'
    ]);

    // The script prints exactly "OK" on success, otherwise the error text. Compare the
    // last non-empty line for equality: a substring test would also accept any error
    // message that merely contains the letters "OK".
    const output = (result.output ?? '').trim();
    const lines = output.split(/\r?\n/).map((line: string) => line.trim()).filter(Boolean);
    const available = lines.length > 0 && lines[lines.length - 1] === 'OK';

    sendStatus(
      available,
      available ? null : (output || result.error || 'SPLADE availability check failed')
    );
  } catch (err) {
    sendStatus(false, err instanceof Error ? err.message : String(err));
  }
  return true;
}
// API: SPLADE Install - Install SPLADE dependencies
// Body: { gpu?: boolean } - selects the `codex-lens[splade-gpu]` extra instead of
// `codex-lens[splade]`.
// Runs `pip install <package>` with a 10 minute timeout and returns pip's stdout on
// success, or the error message and stderr with status 500 on failure.
if (pathname === '/api/codexlens/splade/install' && req.method === 'POST') {
  handlePostRequest(req, res, async (body) => {
    try {
      const gpu = Boolean(body?.gpu);
      const packageName = gpu ? 'codex-lens[splade-gpu]' : 'codex-lens[splade]';

      // Load execFile through dynamic import: `require` is not defined in ES modules
      const { execFile } = await import('child_process');
      const { promisify } = await import('util');
      const execFileAsync = promisify(execFile);

      // NOTE(review): this runs whichever `pip` is first on PATH - confirm that it targets
      // the same Python environment CodexLens runs in.
      const result = await execFileAsync('pip', ['install', packageName], {
        timeout: 600000, // 10 minutes
        maxBuffer: 10 * 1024 * 1024 // pip can be chatty; avoid failing on output size alone
      });

      return {
        success: true,
        message: `SPLADE installed successfully (${gpu ? 'GPU' : 'CPU'} mode)`,
        output: result.stdout
      };
    } catch (err) {
      // execFile rejections carry the captured stderr on the error object
      const stderr = typeof err === 'object' && err !== null
        ? (err as { stderr?: unknown }).stderr
        : undefined;
      return {
        success: false,
        error: err instanceof Error ? err.message : String(err),
        stderr,
        status: 500
      };
    }
  });
  return true;
}
// API: SPLADE Index Status - Check if SPLADE index exists for a project
// Query: ?path=<project directory>. Inspects <path>/.codexlens/_index.db through a small
// Python script run via executeCodexLens.
// Responds 400 when `path` is missing; every other outcome is HTTP 200 with
// { exists, chunks?, postings?, model?, error? }.
if (pathname === '/api/codexlens/splade/index-status') {
  const sendJson = (status: number, payload: Record<string, unknown>): void => {
    res.writeHead(status, { 'Content-Type': 'application/json' });
    res.end(JSON.stringify(payload));
  };

  try {
    const projectPath = url.searchParams.get('path');
    if (!projectPath) {
      sendJson(400, { success: false, error: 'Missing path parameter' });
      return true;
    }

    // Check if CodexLens is installed first
    const venvStatus = await checkVenvStatus();
    if (!venvStatus.ready) {
      sendJson(200, { exists: false, error: 'CodexLens not installed' });
      return true;
    }

    const { join } = await import('path');
    const indexDb = join(projectPath, '.codexlens', '_index.db');

    // The path comes from the query string, so it must never be pasted into the script
    // unescaped. JSON.stringify yields a double-quoted literal whose escapes (\" \\ \n
    // \uXXXX ...) are also valid in a Python string literal, so quotes or backslashes in
    // the path cannot break out of the string.
    const indexDbLiteral = JSON.stringify(indexDb);

    // Use Python to check SPLADE index status.
    // The script prints one of: OK|<chunks>|<postings>|<model>, NO_INDEX, ERROR|<message>
    const pythonCode = [
      'from codexlens.storage.splade_index import SpladeIndex',
      'from pathlib import Path',
      'try:',
      `    idx = SpladeIndex(Path(${indexDbLiteral}))`,
      '    if idx.has_index():',
      '        stats = idx.get_stats()',
      '        meta = idx.get_metadata()',
      `        model = meta.get('model_name', '') if meta else ''`,
      `        print(f"OK|{stats['unique_chunks']}|{stats['total_postings']}|{model}")`,
      '    else:',
      '        print("NO_INDEX")',
      'except Exception as e:',
      '    print(f"ERROR|{str(e)}")'
    ].join('\n');

    const result = await executeCodexLens(['python', '-c', pythonCode]);
    // Trim before matching so surrounding whitespace cannot hide the marker
    const output = (result.output ?? '').trim();

    if (output.startsWith('OK|')) {
      const parts = output.split('|');
      sendJson(200, {
        exists: true,
        chunks: parseInt(parts[1], 10),
        postings: parseInt(parts[2], 10),
        // Re-join so a model name containing '|' is not truncated
        model: parts.slice(3).join('|')
      });
    } else if (output.startsWith('ERROR|')) {
      sendJson(200, { exists: false, error: output.substring('ERROR|'.length).trim() });
    } else {
      // NO_INDEX or unrecognised output
      sendJson(200, { exists: false });
    }
  } catch (err) {
    sendJson(200, { exists: false, error: err instanceof Error ? err.message : String(err) });
  }
  return true;
}
// API: SPLADE Index Rebuild - Rebuild SPLADE index for a project
// Body: { path: string }. Runs `codexlens splade-index <path> --rebuild` with the project
// directory as cwd and a 30 minute timeout. Responds 400 when `path` is missing and 500
// when the command fails or throws.
if (pathname === '/api/codexlens/splade/rebuild' && req.method === 'POST') {
  handlePostRequest(req, res, async (body) => {
    const projectPath = body.path;
    if (!projectPath) {
      return { success: false, error: 'Missing path parameter', status: 400 };
    }

    try {
      const rebuildArgs = ['splade-index', projectPath, '--rebuild'];
      const rebuildOptions = {
        cwd: projectPath,
        timeout: 1800000 // 30 minutes for large codebases
      };
      const rebuild = await executeCodexLens(rebuildArgs, rebuildOptions);

      // Failure path first; success falls through below
      if (!rebuild.success) {
        return {
          success: false,
          error: rebuild.error || 'Failed to rebuild SPLADE index',
          output: rebuild.output,
          status: 500
        };
      }

      return {
        success: true,
        message: 'SPLADE index rebuilt successfully',
        output: rebuild.output
      };
    } catch (err) {
      return { success: false, error: err.message, status: 500 };
    }
  });
  return true;
}
return false;
}