feat: Enhance CodexLens search functionality with new parameters and result handling

- Added search limit, content length, and extra files input fields in the CodexLens manager UI.
- Updated API request parameters to include new fields: max_content_length and extra_files_count.
- Refactored smart-search.ts to support new parameters with default values.
- Implemented result splitting logic to return both full content and additional file paths.
- Updated CLI commands to remove worker limits and allow dynamic scaling based on endpoint count.
- Introduced EmbeddingPoolConfig for improved embedding management and auto-discovery of providers.
- Enhanced search engines to utilize new parameters for fuzzy and exact searches.
- Added support for embedding single texts in the LiteLLM embedder.
This commit is contained in:
catlog22
2025-12-25 16:16:44 +08:00
parent a1413dd1b3
commit ebcbb11cb2
6 changed files with 188 additions and 63 deletions

View File

@@ -586,6 +586,8 @@ export async function handleCodexLensRoutes(ctx: RouteContext): Promise<boolean>
const query = url.searchParams.get('query') || '';
const limit = parseInt(url.searchParams.get('limit') || '20', 10);
const mode = url.searchParams.get('mode') || 'exact'; // exact, fuzzy, hybrid, vector
const maxContentLength = parseInt(url.searchParams.get('max_content_length') || '200', 10);
const extraFilesCount = parseInt(url.searchParams.get('extra_files_count') || '10', 10);
const projectPath = url.searchParams.get('path') || initialPath;
if (!query) {
@@ -595,15 +597,46 @@ export async function handleCodexLensRoutes(ctx: RouteContext): Promise<boolean>
}
try {
const args = ['search', query, '--path', projectPath, '--limit', limit.toString(), '--mode', mode, '--json'];
// Request more results to support split (full content + extra files)
const totalToFetch = limit + extraFilesCount;
const args = ['search', query, '--path', projectPath, '--limit', totalToFetch.toString(), '--mode', mode, '--json'];
const result = await executeCodexLens(args, { cwd: projectPath });
if (result.success) {
try {
const parsed = extractJSON(result.output);
const allResults = parsed.result?.results || [];
// Truncate content and split results
const truncateContent = (content: string | null | undefined): string => {
if (!content) return '';
if (content.length <= maxContentLength) return content;
return content.slice(0, maxContentLength) + '...';
};
// Split results: first N with full content, rest as file paths only
const resultsWithContent = allResults.slice(0, limit).map((r: any) => ({
...r,
content: truncateContent(r.content || r.excerpt),
excerpt: truncateContent(r.excerpt || r.content),
}));
const extraResults = allResults.slice(limit, limit + extraFilesCount);
const extraFiles = [...new Set(extraResults.map((r: any) => r.path || r.file))];
res.writeHead(200, { 'Content-Type': 'application/json' });
res.end(JSON.stringify({ success: true, ...parsed.result }));
res.end(JSON.stringify({
success: true,
results: resultsWithContent,
extra_files: extraFiles.length > 0 ? extraFiles : undefined,
metadata: {
total: allResults.length,
limit,
max_content_length: maxContentLength,
extra_files_count: extraFilesCount,
},
}));
} catch {
res.writeHead(200, { 'Content-Type': 'application/json' });
res.end(JSON.stringify({ success: true, results: [], output: result.output }));

View File

@@ -36,10 +36,12 @@ const ParamsSchema = z.object({
path: z.string().optional(),
paths: z.array(z.string()).default([]),
contextLines: z.number().default(0),
maxResults: z.number().default(20), // Increased default
maxResults: z.number().default(5), // Default 5 with full content
includeHidden: z.boolean().default(false),
languages: z.array(z.string()).optional(),
limit: z.number().default(20), // Increased default
limit: z.number().default(5), // Default 5 with full content
extraFilesCount: z.number().default(10), // Additional file-only results
maxContentLength: z.number().default(200), // Max content length for truncation (50-2000)
offset: z.number().default(0), // NEW: Pagination offset (start_index)
enrich: z.boolean().default(false),
// Search modifiers for ripgrep mode
@@ -268,6 +270,7 @@ interface SearchMetadata {
interface SearchResult {
success: boolean;
results?: ExactMatch[] | SemanticMatch[] | GraphMatch[] | FileMatch[] | unknown;
extra_files?: string[]; // Additional file paths without content
output?: string;
metadata?: SearchMetadata;
error?: string;
@@ -301,6 +304,42 @@ function stripAnsi(str: string): string {
return str.replace(/\x1b\[[0-9;]*m/g, '');
}
/** Default maximum content length to return (avoid excessive output) */
const DEFAULT_MAX_CONTENT_LENGTH = 200;
/**
 * Truncate content to a maximum length, appending '...' when text was cut.
 *
 * Length is measured in UTF-16 code units (the same unit as `String.length`).
 * A cut is never made between the two halves of a surrogate pair, so the
 * returned string never ends in a lone surrogate before the ellipsis.
 *
 * @param content - The content to truncate; null, undefined and '' yield ''
 * @param maxLength - Maximum length (default: 200). NaN or negative values
 *   are treated as invalid and replaced by the default instead of being
 *   passed to `slice`, where a negative end index counts from the end.
 * @returns The original content if it fits, otherwise the truncated prefix
 *   followed by '...'
 */
function truncateContent(content: string | null | undefined, maxLength: number = DEFAULT_MAX_CONTENT_LENGTH): string {
  if (!content) return '';

  // Guard against invalid limits (e.g. NaN from a failed numeric parse).
  const limit = Number.isNaN(maxLength) || maxLength < 0 ? DEFAULT_MAX_CONTENT_LENGTH : maxLength;
  if (content.length <= limit) return content;

  // content.length > limit here, so `cut` is always a valid index into content.
  let cut = Math.floor(limit);
  if (cut > 0) {
    const lastKept = content.charCodeAt(cut - 1);
    const firstDropped = content.charCodeAt(cut);
    const splitsPair =
      lastKept >= 0xd800 && lastKept <= 0xdbff && firstDropped >= 0xdc00 && firstDropped <= 0xdfff;
    // Drop the dangling high surrogate rather than emit half a character.
    if (splitsPair) cut -= 1;
  }

  return content.slice(0, cut) + '...';
}
/**
 * Split results into full content results and extra file-only results.
 * Generic function supporting any match type that carries a 'file' property.
 *
 * The first `fullContentLimit` entries are returned unchanged as `results`.
 * The following `extraFilesCount` entries are reduced to their file paths,
 * in order of first appearance, skipping:
 *   - paths already present in `results` (they are not "additional" files),
 *   - duplicate paths within the extra window,
 *   - missing or empty paths.
 *
 * NaN or negative counts are treated as 0 so they are never handed to
 * `Array.prototype.slice`, where a negative index counts from the end.
 *
 * @param allResults - All search results (must have 'file' property)
 * @param fullContentLimit - Number of results with full content (default: 5)
 * @param extraFilesCount - Number of additional file-only results (default: 10)
 * @returns The leading results plus the deduplicated extra file paths
 */
function splitResultsWithExtraFiles<T extends { file: string }>(
  allResults: T[],
  fullContentLimit: number = 5,
  extraFilesCount: number = 10
): { results: T[]; extra_files: string[] } {
  const contentCount = Number.isNaN(fullContentLimit) || fullContentLimit < 0 ? 0 : fullContentLimit;
  const extraCount = Number.isNaN(extraFilesCount) || extraFilesCount < 0 ? 0 : extraFilesCount;

  // First N results keep their content
  const results = allResults.slice(0, contentCount);

  // Next M results contribute file paths only
  const seen = new Set<string>(results.map((r) => r.file));
  const extra_files: string[] = [];
  for (const candidate of allResults.slice(contentCount, contentCount + extraCount)) {
    const file = candidate.file;
    if (!file || seen.has(file)) continue;
    seen.add(file);
    extra_files.push(file);
  }

  return { results, extra_files };
}
/**
* Check if CodexLens index exists for current directory
* @param path - Directory path to check
@@ -714,7 +753,7 @@ async function executeAutoMode(params: Params): Promise<SearchResult> {
* Supports tokenized multi-word queries with OR matching and result ranking
*/
async function executeRipgrepMode(params: Params): Promise<SearchResult> {
const { query, paths = [], contextLines = 0, maxResults = 10, includeHidden = false, path = '.', regex = true, caseSensitive = true, tokenize = true } = params;
const { query, paths = [], contextLines = 0, maxResults = 5, extraFilesCount = 10, maxContentLength = 200, includeHidden = false, path = '.', regex = true, caseSensitive = true, tokenize = true } = params;
if (!query) {
return {
@@ -726,6 +765,9 @@ async function executeRipgrepMode(params: Params): Promise<SearchResult> {
// Check if ripgrep is available
const hasRipgrep = checkToolAvailability('rg');
// Calculate total to fetch for split (full content + extra files)
const totalToFetch = maxResults + extraFilesCount;
// If ripgrep not available, fall back to CodexLens exact mode
if (!hasRipgrep) {
const readyStatus = await ensureCodexLensReady();
@@ -737,7 +779,7 @@ async function executeRipgrepMode(params: Params): Promise<SearchResult> {
}
// Use CodexLens exact mode as fallback
const args = ['search', query, '--limit', maxResults.toString(), '--mode', 'exact', '--json'];
const args = ['search', query, '--limit', totalToFetch.toString(), '--mode', 'exact', '--json'];
const result = await executeCodexLens(args, { cwd: path });
if (!result.success) {
@@ -754,23 +796,27 @@ async function executeRipgrepMode(params: Params): Promise<SearchResult> {
}
// Parse results
let results: SemanticMatch[] = [];
let allResults: SemanticMatch[] = [];
try {
const parsed = JSON.parse(stripAnsi(result.output || '{}'));
const data = parsed.result?.results || parsed.results || parsed;
results = (Array.isArray(data) ? data : []).map((item: any) => ({
allResults = (Array.isArray(data) ? data : []).map((item: any) => ({
file: item.path || item.file,
score: item.score || 0,
content: item.excerpt || item.content || '',
content: truncateContent(item.content || item.excerpt, maxContentLength),
symbol: item.symbol || null,
}));
} catch {
// Keep empty results
}
// Split results: first N with full content, rest as file paths only
const { results, extra_files } = splitResultsWithExtraFiles(allResults, maxResults, extraFilesCount);
return {
success: true,
results,
extra_files: extra_files.length > 0 ? extra_files : undefined,
metadata: {
mode: 'ripgrep',
backend: 'codexlens-fallback',
@@ -781,12 +827,12 @@ async function executeRipgrepMode(params: Params): Promise<SearchResult> {
};
}
// Use ripgrep
// Use ripgrep - request more results to support split
const { command, args, tokens } = buildRipgrepCommand({
query,
paths: paths.length > 0 ? paths : [path],
contextLines,
maxResults,
maxResults: totalToFetch, // Fetch more to support split
includeHidden,
regex,
caseSensitive,
@@ -812,14 +858,14 @@ async function executeRipgrepMode(params: Params): Promise<SearchResult> {
});
child.on('close', (code) => {
const results: ExactMatch[] = [];
const allResults: ExactMatch[] = [];
const lines = stdout.split('\n').filter((line) => line.trim());
// Limit total results to prevent memory overflow (--max-count only limits per-file)
const effectiveLimit = maxResults > 0 ? maxResults : 500;
const effectiveLimit = totalToFetch > 0 ? totalToFetch : 500;
for (const line of lines) {
// Stop collecting if we've reached the limit
if (results.length >= effectiveLimit) {
if (allResults.length >= effectiveLimit) {
resultLimitReached = true;
break;
}
@@ -837,7 +883,7 @@ async function executeRipgrepMode(params: Params): Promise<SearchResult> {
: 1,
content: item.data.lines.text.trim(),
};
results.push(match);
allResults.push(match);
}
} catch {
continue;
@@ -850,9 +896,12 @@ async function executeRipgrepMode(params: Params): Promise<SearchResult> {
// Apply token-based scoring and sorting for multi-word queries
// Results matching more tokens are ranked higher (exact matches first)
const scoredResults = tokens.length > 1 ? scoreByTokenMatch(results, tokens) : results;
const scoredResults = tokens.length > 1 ? scoreByTokenMatch(allResults, tokens) : allResults;
if (code === 0 || code === 1 || (isWindowsDeviceError && scoredResults.length > 0)) {
// Split results: first N with full content, rest as file paths only
const { results, extra_files } = splitResultsWithExtraFiles(scoredResults, maxResults, extraFilesCount);
// Build warning message for various conditions
const warnings: string[] = [];
if (resultLimitReached) {
@@ -864,18 +913,19 @@ async function executeRipgrepMode(params: Params): Promise<SearchResult> {
resolve({
success: true,
results: scoredResults,
results,
extra_files: extra_files.length > 0 ? extra_files : undefined,
metadata: {
mode: 'ripgrep',
backend: 'ripgrep',
count: scoredResults.length,
count: results.length,
query,
tokens: tokens.length > 1 ? tokens : undefined, // Include tokens in metadata for debugging
tokenized: tokens.length > 1,
...(warnings.length > 0 && { warning: warnings.join('; ') }),
},
});
} else if (isWindowsDeviceError && results.length === 0) {
} else if (isWindowsDeviceError && allResults.length === 0) {
// Windows device error but no results - might be the only issue
resolve({
success: true,
@@ -912,7 +962,7 @@ async function executeRipgrepMode(params: Params): Promise<SearchResult> {
* Requires index
*/
async function executeCodexLensExactMode(params: Params): Promise<SearchResult> {
const { query, path = '.', maxResults = 10, enrich = false } = params;
const { query, path = '.', maxResults = 5, extraFilesCount = 10, maxContentLength = 200, enrich = false } = params;
if (!query) {
return {
@@ -933,7 +983,9 @@ async function executeCodexLensExactMode(params: Params): Promise<SearchResult>
// Check index status
const indexStatus = await checkIndexStatus(path);
const args = ['search', query, '--limit', maxResults.toString(), '--mode', 'exact', '--json'];
// Request more results to support split (full content + extra files)
const totalToFetch = maxResults + extraFilesCount;
const args = ['search', query, '--limit', totalToFetch.toString(), '--mode', 'exact', '--json'];
if (enrich) {
args.push('--enrich');
}
@@ -954,14 +1006,14 @@ async function executeCodexLensExactMode(params: Params): Promise<SearchResult>
}
// Parse results
let results: SemanticMatch[] = [];
let allResults: SemanticMatch[] = [];
try {
const parsed = JSON.parse(stripAnsi(result.output || '{}'));
const data = parsed.result?.results || parsed.results || parsed;
results = (Array.isArray(data) ? data : []).map((item: any) => ({
allResults = (Array.isArray(data) ? data : []).map((item: any) => ({
file: item.path || item.file,
score: item.score || 0,
content: item.excerpt || item.content || '',
content: truncateContent(item.content || item.excerpt, maxContentLength),
symbol: item.symbol || null,
}));
} catch {
@@ -969,8 +1021,8 @@ async function executeCodexLensExactMode(params: Params): Promise<SearchResult>
}
// Fallback to fuzzy mode if exact returns no results
if (results.length === 0) {
const fuzzyArgs = ['search', query, '--limit', maxResults.toString(), '--mode', 'fuzzy', '--json'];
if (allResults.length === 0) {
const fuzzyArgs = ['search', query, '--limit', totalToFetch.toString(), '--mode', 'fuzzy', '--json'];
if (enrich) {
fuzzyArgs.push('--enrich');
}
@@ -980,20 +1032,23 @@ async function executeCodexLensExactMode(params: Params): Promise<SearchResult>
try {
const parsed = JSON.parse(stripAnsi(fuzzyResult.output || '{}'));
const data = parsed.result?.results || parsed.results || parsed;
results = (Array.isArray(data) ? data : []).map((item: any) => ({
allResults = (Array.isArray(data) ? data : []).map((item: any) => ({
file: item.path || item.file,
score: item.score || 0,
content: item.excerpt || item.content || '',
content: truncateContent(item.content || item.excerpt, maxContentLength),
symbol: item.symbol || null,
}));
} catch {
// Keep empty results
}
if (results.length > 0) {
if (allResults.length > 0) {
// Split results: first N with full content, rest as file paths only
const { results, extra_files } = splitResultsWithExtraFiles(allResults, maxResults, extraFilesCount);
return {
success: true,
results,
extra_files: extra_files.length > 0 ? extra_files : undefined,
metadata: {
mode: 'exact',
backend: 'codexlens',
@@ -1008,9 +1063,13 @@ async function executeCodexLensExactMode(params: Params): Promise<SearchResult>
}
}
// Split results: first N with full content, rest as file paths only
const { results, extra_files } = splitResultsWithExtraFiles(allResults, maxResults, extraFilesCount);
return {
success: true,
results,
extra_files: extra_files.length > 0 ? extra_files : undefined,
metadata: {
mode: 'exact',
backend: 'codexlens',
@@ -1027,7 +1086,7 @@ async function executeCodexLensExactMode(params: Params): Promise<SearchResult>
* Requires index with embeddings
*/
async function executeHybridMode(params: Params): Promise<SearchResult> {
const { query, path = '.', maxResults = 10, enrich = false } = params;
const { query, path = '.', maxResults = 5, extraFilesCount = 10, maxContentLength = 200, enrich = false } = params;
if (!query) {
return {
@@ -1048,7 +1107,9 @@ async function executeHybridMode(params: Params): Promise<SearchResult> {
// Check index status
const indexStatus = await checkIndexStatus(path);
const args = ['search', query, '--limit', maxResults.toString(), '--mode', 'hybrid', '--json'];
// Request more results to support split (full content + extra files)
const totalToFetch = maxResults + extraFilesCount;
const args = ['search', query, '--limit', totalToFetch.toString(), '--mode', 'hybrid', '--json'];
if (enrich) {
args.push('--enrich');
}
@@ -1069,14 +1130,14 @@ async function executeHybridMode(params: Params): Promise<SearchResult> {
}
// Parse results
let results: SemanticMatch[] = [];
let allResults: SemanticMatch[] = [];
let baselineInfo: { score: number; count: number } | null = null;
let initialCount = 0;
try {
const parsed = JSON.parse(stripAnsi(result.output || '{}'));
const data = parsed.result?.results || parsed.results || parsed;
results = (Array.isArray(data) ? data : []).map((item: any) => {
allResults = (Array.isArray(data) ? data : []).map((item: any) => {
const rawScore = item.score || 0;
// Hybrid mode returns distance scores (lower is better).
// Convert to similarity scores (higher is better) for consistency.
@@ -1085,27 +1146,27 @@ async function executeHybridMode(params: Params): Promise<SearchResult> {
return {
file: item.path || item.file,
score: similarityScore,
content: item.excerpt || item.content || '',
content: truncateContent(item.content || item.excerpt, maxContentLength),
symbol: item.symbol || null,
};
});
initialCount = results.length;
initialCount = allResults.length;
// Post-processing pipeline to improve semantic search quality
// 0. Filter dominant baseline scores (hot spot detection)
const baselineResult = filterDominantBaselineScores(results);
results = baselineResult.filteredResults;
const baselineResult = filterDominantBaselineScores(allResults);
allResults = baselineResult.filteredResults;
baselineInfo = baselineResult.baselineInfo;
// 1. Filter noisy files (coverage, node_modules, etc.)
results = filterNoisyFiles(results);
allResults = filterNoisyFiles(allResults);
// 2. Boost results containing query keywords
results = applyKeywordBoosting(results, query);
allResults = applyKeywordBoosting(allResults, query);
// 3. Enforce score diversity (penalize identical scores)
results = enforceScoreDiversity(results);
allResults = enforceScoreDiversity(allResults);
// 4. Re-sort by adjusted scores
results.sort((a, b) => b.score - a.score);
allResults.sort((a, b) => b.score - a.score);
} catch {
return {
success: true,
@@ -1121,15 +1182,19 @@ async function executeHybridMode(params: Params): Promise<SearchResult> {
};
}
// Split results: first N with full content, rest as file paths only
const { results, extra_files } = splitResultsWithExtraFiles(allResults, maxResults, extraFilesCount);
// Build metadata with baseline info if detected
let note = 'Hybrid mode uses RRF fusion (exact + fuzzy + vector) for best results';
if (baselineInfo) {
note += ` | Filtered ${initialCount - results.length} hot-spot results with baseline score ~${baselineInfo.score.toFixed(4)}`;
note += ` | Filtered ${initialCount - allResults.length} hot-spot results with baseline score ~${baselineInfo.score.toFixed(4)}`;
}
return {
success: true,
results,
extra_files: extra_files.length > 0 ? extra_files : undefined,
metadata: {
mode: 'hybrid',
backend: 'codexlens',
@@ -1540,7 +1605,7 @@ export const schema: ToolSchema = {
mode: {
type: 'string',
enum: SEARCH_MODES,
description: 'Search mode: auto (default), hybrid (best quality), exact (CodexLens FTS), ripgrep (fast, no index), priority (fallback: hybrid->exact->ripgrep)',
description: 'Search mode: auto, hybrid (best quality), exact (CodexLens FTS), ripgrep (fast, no index), priority (fallback chain)',
default: 'auto',
},
output_mode: {
@@ -1576,6 +1641,16 @@ export const schema: ToolSchema = {
description: 'Alias for maxResults (default: 20)',
default: 20,
},
extraFilesCount: {
type: 'number',
description: 'Number of additional file-only results (paths without content)',
default: 10,
},
maxContentLength: {
type: 'number',
description: 'Maximum content length for truncation (50-2000)',
default: 200,
},
offset: {
type: 'number',
description: 'Pagination offset - skip first N results (default: 0)',