Remove LLM enhancement features and related components as per user request. This includes the deletion of source code files, CLI commands, front-end components, tests, scripts, and documentation associated with LLM functionality. Simplified dependencies and reduced complexity while retaining core vector search capabilities. Validation of changes confirmed successful removal and functionality.

This commit is contained in:
catlog22
2025-12-16 21:38:27 +08:00
parent d21066c282
commit b702791c2c
21 changed files with 375 additions and 7193 deletions

View File

@@ -85,7 +85,7 @@ MODE: analysis
CONTEXT: @**/*
EXPECTED: {from prompt}
RULES: {from prompt, if template specified} | analysis=READ-ONLY
" --tool gemini --cd {dir}
" --tool gemini --cd {dir}
```
**Fallback Chain**: Gemini → Qwen → Codex → Bash-only

View File

@@ -1,13 +0,0 @@
# Active Memory
> Auto-generated understanding of frequently accessed files using GEMINI.
> Last updated: 2025-12-14T08:59:41.526Z
> Files analyzed: 10
> CLI Tool: gemini
---
[object Object]
---

View File

@@ -18,15 +18,6 @@ let nativeResumeEnabled = localStorage.getItem('ccw-native-resume') !== 'false';
// Recursive Query settings (for hierarchical storage aggregation)
let recursiveQueryEnabled = localStorage.getItem('ccw-recursive-query') !== 'false'; // default true
// LLM Enhancement settings for Semantic Search
let llmEnhancementSettings = {
enabled: localStorage.getItem('ccw-llm-enhancement-enabled') === 'true',
tool: localStorage.getItem('ccw-llm-enhancement-tool') || 'gemini',
fallbackTool: localStorage.getItem('ccw-llm-enhancement-fallback') || 'qwen',
batchSize: parseInt(localStorage.getItem('ccw-llm-enhancement-batch-size') || '5', 10),
timeoutMs: parseInt(localStorage.getItem('ccw-llm-enhancement-timeout') || '300000', 10)
};
// ========== Initialization ==========
function initCliStatus() {
// Load all statuses in one call using aggregated endpoint
@@ -242,17 +233,12 @@ function renderCliStatus() {
`;
// Semantic Search card (only show if CodexLens is installed)
const llmStatusBadge = llmEnhancementSettings.enabled
? `<span class="badge px-1.5 py-0.5 text-xs rounded bg-success/20 text-success">LLM</span>`
: '';
const semanticHtml = codexLensStatus.ready ? `
<div class="cli-tool-card tool-semantic clickable ${semanticStatus.available ? 'available' : 'unavailable'}"
onclick="openSemanticSettingsModal()">
<div class="cli-tool-card tool-semantic ${semanticStatus.available ? 'available' : 'unavailable'}">
<div class="cli-tool-header">
<span class="cli-tool-status ${semanticStatus.available ? 'status-available' : 'status-unavailable'}"></span>
<span class="cli-tool-name">Semantic Search</span>
<span class="badge px-1.5 py-0.5 text-xs rounded ${semanticStatus.available ? 'bg-primary/20 text-primary' : 'bg-muted text-muted-foreground'}">AI</span>
${llmStatusBadge}
</div>
<div class="cli-tool-desc text-xs text-muted-foreground mt-1">
${semanticStatus.available ? 'AI-powered code understanding' : 'Natural language code search'}
@@ -265,27 +251,17 @@ function renderCliStatus() {
</div>
<div class="cli-tool-actions flex flex-col gap-2 mt-3">
${!semanticStatus.available ? `
<button class="btn-sm btn-primary w-full flex items-center justify-center gap-1" onclick="event.stopPropagation(); openSemanticInstallWizard()">
<button class="btn-sm btn-primary w-full flex items-center justify-center gap-1" onclick="openSemanticInstallWizard()">
<i data-lucide="brain" class="w-3 h-3"></i> Install AI Model
</button>
<div class="flex items-center justify-between w-full mt-1">
<div class="flex items-center gap-1 text-xs text-muted-foreground">
<i data-lucide="hard-drive" class="w-3 h-3"></i>
<span>~130MB</span>
</div>
<button class="btn-sm btn-outline flex items-center gap-1" onclick="event.stopPropagation(); openSemanticSettingsModal()">
<i data-lucide="settings" class="w-3 h-3"></i>
</button>
<div class="flex items-center gap-1 text-xs text-muted-foreground mt-1">
<i data-lucide="hard-drive" class="w-3 h-3"></i>
<span>~130MB</span>
</div>
` : `
<div class="flex items-center justify-between w-full">
<div class="flex items-center gap-1 text-xs text-muted-foreground">
<i data-lucide="cpu" class="w-3 h-3"></i>
<span>bge-small-en-v1.5</span>
</div>
<button class="btn-sm btn-outline flex items-center gap-1" onclick="event.stopPropagation(); openSemanticSettingsModal()">
<i data-lucide="settings" class="w-3 h-3"></i>
</button>
<div class="flex items-center gap-1 text-xs text-muted-foreground">
<i data-lucide="cpu" class="w-3 h-3"></i>
<span>bge-small-en-v1.5</span>
</div>
`}
</div>
@@ -991,618 +967,3 @@ async function startSemanticInstall() {
}
}
// ========== Semantic Search Settings Modal ==========
function openSemanticSettingsModal() {
const availableTools = Object.entries(cliToolStatus)
.filter(function(entry) { return entry[1].available; })
.map(function(entry) { return entry[0]; });
const modal = document.createElement('div');
modal.id = 'semanticSettingsModal';
modal.className = 'fixed inset-0 bg-black/50 flex items-center justify-center z-50';
modal.onclick = function(e) { if (e.target === modal) closeSemanticSettingsModal(); };
const toolOptions = availableTools.map(function(tool) {
return '<option value="' + tool + '"' + (llmEnhancementSettings.tool === tool ? ' selected' : '') + '>' +
tool.charAt(0).toUpperCase() + tool.slice(1) + '</option>';
}).join('');
const fallbackOptions = '<option value="">' + t('semantic.none') + '</option>' + availableTools.map(function(tool) {
return '<option value="' + tool + '"' + (llmEnhancementSettings.fallbackTool === tool ? ' selected' : '') + '>' +
tool.charAt(0).toUpperCase() + tool.slice(1) + '</option>';
}).join('');
const disabled = !llmEnhancementSettings.enabled ? 'disabled' : '';
const opacityClass = !llmEnhancementSettings.enabled ? 'opacity-50' : '';
modal.innerHTML =
'<div class="bg-card rounded-lg shadow-xl w-full max-w-lg mx-4 overflow-hidden" onclick="event.stopPropagation()">' +
'<div class="p-6">' +
'<div class="flex items-center gap-3 mb-4">' +
'<div class="w-10 h-10 rounded-full bg-primary/10 flex items-center justify-center">' +
'<i data-lucide="sparkles" class="w-5 h-5 text-primary"></i>' +
'</div>' +
'<div>' +
'<h3 class="text-lg font-semibold">' + t('semantic.settings') + '</h3>' +
'<p class="text-sm text-muted-foreground">' + t('semantic.configDesc') + '</p>' +
'</div>' +
'</div>' +
'<div class="space-y-4">' +
'<div class="flex items-center justify-between p-4 bg-muted/50 rounded-lg">' +
'<div>' +
'<h4 class="font-medium flex items-center gap-2">' +
'<i data-lucide="brain" class="w-4 h-4"></i>' + t('semantic.llmEnhancement') + '</h4>' +
'<p class="text-sm text-muted-foreground mt-1">' + t('semantic.llmDesc') + '</p>' +
'</div>' +
'<label class="cli-toggle">' +
'<input type="checkbox" id="llmEnhancementToggle" ' + (llmEnhancementSettings.enabled ? 'checked' : '') +
' onchange="toggleLlmEnhancement(this.checked)">' +
'<span class="cli-toggle-slider"></span>' +
'</label>' +
'</div>' +
'<div class="p-4 bg-muted/30 rounded-lg space-y-4 ' + opacityClass + '" id="llmSettingsSection">' +
'<div class="grid grid-cols-2 gap-4">' +
'<div>' +
'<label class="block text-sm font-medium mb-2">' +
'<i data-lucide="cpu" class="w-3 h-3 inline mr-1"></i>' + t('semantic.primaryTool') + '</label>' +
'<select class="cli-setting-select w-full" id="llmToolSelect" onchange="updateLlmTool(this.value)" ' + disabled + '>' + toolOptions + '</select>' +
'</div>' +
'<div>' +
'<label class="block text-sm font-medium mb-2">' +
'<i data-lucide="refresh-cw" class="w-3 h-3 inline mr-1"></i>' + t('semantic.fallbackTool') + '</label>' +
'<select class="cli-setting-select w-full" id="llmFallbackSelect" onchange="updateLlmFallback(this.value)" ' + disabled + '>' + fallbackOptions + '</select>' +
'</div>' +
'</div>' +
'<div class="grid grid-cols-2 gap-4">' +
'<div>' +
'<label class="block text-sm font-medium mb-2">' +
'<i data-lucide="layers" class="w-3 h-3 inline mr-1"></i>' + t('semantic.batchSize') + '</label>' +
'<select class="cli-setting-select w-full" id="llmBatchSelect" onchange="updateLlmBatchSize(this.value)" ' + disabled + '>' +
'<option value="1"' + (llmEnhancementSettings.batchSize === 1 ? ' selected' : '') + '>1 ' + t('semantic.file') + '</option>' +
'<option value="3"' + (llmEnhancementSettings.batchSize === 3 ? ' selected' : '') + '>3 ' + t('semantic.files') + '</option>' +
'<option value="5"' + (llmEnhancementSettings.batchSize === 5 ? ' selected' : '') + '>5 ' + t('semantic.files') + '</option>' +
'<option value="10"' + (llmEnhancementSettings.batchSize === 10 ? ' selected' : '') + '>10 ' + t('semantic.files') + '</option>' +
'</select>' +
'</div>' +
'<div>' +
'<label class="block text-sm font-medium mb-2">' +
'<i data-lucide="clock" class="w-3 h-3 inline mr-1"></i>' + t('semantic.timeout') + '</label>' +
'<select class="cli-setting-select w-full" id="llmTimeoutSelect" onchange="updateLlmTimeout(this.value)" ' + disabled + '>' +
'<option value="60000"' + (llmEnhancementSettings.timeoutMs === 60000 ? ' selected' : '') + '>1 min</option>' +
'<option value="180000"' + (llmEnhancementSettings.timeoutMs === 180000 ? ' selected' : '') + '>3 min</option>' +
'<option value="300000"' + (llmEnhancementSettings.timeoutMs === 300000 ? ' selected' : '') + '>5 min</option>' +
'<option value="600000"' + (llmEnhancementSettings.timeoutMs === 600000 ? ' selected' : '') + '>10 min</option>' +
'</select>' +
'</div>' +
'</div>' +
'</div>' +
'<div class="bg-primary/5 border border-primary/20 rounded-lg p-3">' +
'<div class="flex items-start gap-2">' +
'<i data-lucide="info" class="w-4 h-4 text-primary mt-0.5"></i>' +
'<div class="text-sm text-muted-foreground">' +
'<p>' + t('semantic.enhanceInfo') + '</p>' +
'<p class="mt-1">' + t('semantic.enhanceCommand') + ' <code class="bg-muted px-1 rounded">codex-lens enhance</code> ' + t('semantic.enhanceAfterEnable') + '</p>' +
'</div>' +
'</div>' +
'</div>' +
'<div class="flex gap-2 pt-2">' +
'<button class="btn-sm btn-outline flex items-center gap-1 flex-1" onclick="runEnhanceCommand()" ' + disabled + '>' +
'<i data-lucide="zap" class="w-3 h-3"></i>' + t('semantic.runEnhanceNow') + '</button>' +
'<button class="btn-sm btn-outline flex items-center gap-1 flex-1" onclick="viewEnhanceStatus()">' +
'<i data-lucide="bar-chart-2" class="w-3 h-3"></i>' + t('semantic.viewStatus') + '</button>' +
'</div>' +
'<div class="border-t border-border my-4"></div>' +
'<div>' +
'<h4 class="font-medium mb-3 flex items-center gap-2">' +
'<i data-lucide="search" class="w-4 h-4"></i>' + t('semantic.testSearch') + '</h4>' +
'<div class="space-y-3">' +
'<div>' +
'<input type="text" id="semanticSearchInput" class="tool-config-input w-full" ' +
'placeholder="' + t('semantic.searchPlaceholder') + '" />' +
'</div>' +
'<div>' +
'<button class="btn-sm btn-primary w-full" id="runSemanticSearchBtn">' +
'<i data-lucide="search" class="w-3 h-3"></i> ' + t('semantic.runSearch') +
'</button>' +
'</div>' +
'<div id="semanticSearchResults" class="hidden">' +
'<div class="bg-muted/30 rounded-lg p-3 max-h-64 overflow-y-auto">' +
'<div class="flex items-center justify-between mb-2">' +
'<p class="text-sm font-medium">' + t('codexlens.results') + ':</p>' +
'<span id="semanticResultCount" class="text-xs text-muted-foreground"></span>' +
'</div>' +
'<pre id="semanticResultContent" class="text-xs font-mono whitespace-pre-wrap break-all"></pre>' +
'</div>' +
'</div>' +
'</div>' +
'</div>' +
'</div>' +
'</div>' +
'<div class="border-t border-border p-4 flex justify-end gap-3 bg-muted/30">' +
'<button class="btn-outline px-4 py-2" onclick="closeSemanticSettingsModal()">' + t('semantic.close') + '</button>' +
'</div>' +
'</div>';
document.body.appendChild(modal);
// Add semantic search button handler
setTimeout(function() {
var runSemanticSearchBtn = document.getElementById('runSemanticSearchBtn');
if (runSemanticSearchBtn) {
runSemanticSearchBtn.onclick = async function() {
var query = document.getElementById('semanticSearchInput').value.trim();
var resultsDiv = document.getElementById('semanticSearchResults');
var resultCount = document.getElementById('semanticResultCount');
var resultContent = document.getElementById('semanticResultContent');
if (!query) {
showRefreshToast(t('codexlens.enterQuery'), 'warning');
return;
}
runSemanticSearchBtn.disabled = true;
runSemanticSearchBtn.innerHTML = '<span class="animate-pulse">' + t('codexlens.searching') + '</span>';
resultsDiv.classList.add('hidden');
try {
var params = new URLSearchParams({
query: query,
mode: 'semantic',
limit: '10'
});
var response = await fetch('/api/codexlens/search?' + params.toString());
var result = await response.json();
console.log('[Semantic Search Test] Result:', result);
if (result.success) {
var results = result.results || [];
resultCount.textContent = results.length + ' ' + t('codexlens.resultsCount');
resultContent.textContent = JSON.stringify(results, null, 2);
resultsDiv.classList.remove('hidden');
showRefreshToast(t('codexlens.searchCompleted') + ': ' + results.length + ' ' + t('codexlens.resultsCount'), 'success');
} else {
resultContent.textContent = t('common.error') + ': ' + (result.error || t('common.unknownError'));
resultsDiv.classList.remove('hidden');
showRefreshToast(t('codexlens.searchFailed') + ': ' + result.error, 'error');
}
runSemanticSearchBtn.disabled = false;
runSemanticSearchBtn.innerHTML = '<i data-lucide="search" class="w-3 h-3"></i> ' + t('semantic.runSearch');
if (window.lucide) lucide.createIcons();
} catch (err) {
console.error('[Semantic Search Test] Error:', err);
resultContent.textContent = t('common.exception') + ': ' + err.message;
resultsDiv.classList.remove('hidden');
showRefreshToast(t('common.error') + ': ' + err.message, 'error');
runSemanticSearchBtn.disabled = false;
runSemanticSearchBtn.innerHTML = '<i data-lucide="search" class="w-3 h-3"></i> ' + t('semantic.runSearch');
if (window.lucide) lucide.createIcons();
}
};
}
}, 100);
var handleEscape = function(e) {
if (e.key === 'Escape') {
closeSemanticSettingsModal();
document.removeEventListener('keydown', handleEscape);
}
};
document.addEventListener('keydown', handleEscape);
if (window.lucide) {
lucide.createIcons();
}
}
function closeSemanticSettingsModal() {
var modal = document.getElementById('semanticSettingsModal');
if (modal) modal.remove();
}
function toggleLlmEnhancement(enabled) {
llmEnhancementSettings.enabled = enabled;
localStorage.setItem('ccw-llm-enhancement-enabled', enabled.toString());
var settingsSection = document.getElementById('llmSettingsSection');
if (settingsSection) {
settingsSection.classList.toggle('opacity-50', !enabled);
settingsSection.querySelectorAll('select').forEach(function(el) { el.disabled = !enabled; });
}
renderCliStatus();
showRefreshToast(t('semantic.llmEnhancement') + ' ' + (enabled ? t('semantic.enabled') : t('semantic.disabled')), 'success');
}
function updateLlmTool(tool) {
llmEnhancementSettings.tool = tool;
localStorage.setItem('ccw-llm-enhancement-tool', tool);
showRefreshToast(t('semantic.toolSetTo') + ' ' + tool, 'success');
}
function updateLlmFallback(tool) {
llmEnhancementSettings.fallbackTool = tool;
localStorage.setItem('ccw-llm-enhancement-fallback', tool);
showRefreshToast(t('semantic.fallbackSetTo') + ' ' + (tool || t('semantic.none')), 'success');
}
function updateLlmBatchSize(size) {
llmEnhancementSettings.batchSize = parseInt(size, 10);
localStorage.setItem('ccw-llm-enhancement-batch-size', size);
showRefreshToast(t('semantic.batchSetTo') + ' ' + size + ' ' + t('semantic.files'), 'success');
}
function updateLlmTimeout(ms) {
llmEnhancementSettings.timeoutMs = parseInt(ms, 10);
localStorage.setItem('ccw-llm-enhancement-timeout', ms);
var mins = parseInt(ms, 10) / 60000;
showRefreshToast(t('semantic.timeoutSetTo') + ' ' + mins + ' ' + (mins > 1 ? t('semantic.minutes') : t('semantic.minute')), 'success');
}
async function runEnhanceCommand() {
if (!llmEnhancementSettings.enabled) {
showRefreshToast(t('semantic.enableFirst'), 'warning');
return;
}
showRefreshToast('Starting LLM enhancement...', 'info');
closeSemanticSettingsModal();
try {
var response = await fetch('/api/codexlens/enhance', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
path: projectPath,
tool: llmEnhancementSettings.tool,
batchSize: llmEnhancementSettings.batchSize,
timeoutMs: llmEnhancementSettings.timeoutMs
})
});
var result = await response.json();
if (result.success) {
var enhanced = result.result?.enhanced || 0;
showRefreshToast('Enhanced ' + enhanced + ' files with LLM', 'success');
} else {
showRefreshToast('Enhance failed: ' + result.error, 'error');
}
} catch (err) {
showRefreshToast('Enhance error: ' + err.message, 'error');
}
}
function viewEnhanceStatus() {
openSemanticMetadataViewer();
}
// ========== Semantic Metadata Viewer ==========
var semanticMetadataCache = {
entries: [],
total: 0,
offset: 0,
limit: 50,
loading: false
};
async function openSemanticMetadataViewer() {
closeSemanticSettingsModal();
var modal = document.createElement('div');
modal.id = 'semanticMetadataModal';
modal.className = 'generic-modal-overlay';
modal.onclick = function(e) { if (e.target === modal) closeSemanticMetadataViewer(); };
modal.innerHTML =
'<div class="generic-modal large" onclick="event.stopPropagation()">' +
'<div class="generic-modal-header">' +
'<div class="flex items-center gap-3">' +
'<i data-lucide="database" class="w-5 h-5 text-primary"></i>' +
'<h3 class="generic-modal-title">Semantic Metadata Browser</h3>' +
'<span id="semanticMetadataCount" class="badge bg-muted text-muted-foreground px-2 py-0.5 text-xs rounded">Loading...</span>' +
'</div>' +
'<button class="generic-modal-close" onclick="closeSemanticMetadataViewer()">' +
'<i data-lucide="x" class="w-4 h-4"></i>' +
'</button>' +
'</div>' +
'<div class="generic-modal-body p-0">' +
'<div class="semantic-viewer-toolbar">' +
'<div class="flex items-center gap-3">' +
'<select id="semanticToolFilter" class="cli-setting-select" onchange="filterSemanticByTool(this.value)">' +
'<option value="">All Tools</option>' +
'<option value="gemini">Gemini</option>' +
'<option value="qwen">Qwen</option>' +
'</select>' +
'<button class="btn-sm btn-outline flex items-center gap-1" onclick="refreshSemanticMetadata()">' +
'<i data-lucide="refresh-cw" class="w-3 h-3"></i> Refresh' +
'</button>' +
'</div>' +
'<div class="flex items-center gap-2 text-sm text-muted-foreground">' +
'<span id="semanticPaginationInfo">-</span>' +
'</div>' +
'</div>' +
'<div id="semanticMetadataTableContainer" class="semantic-table-container">' +
'<div class="semantic-loading">' +
'<div class="animate-spin w-6 h-6 border-2 border-primary border-t-transparent rounded-full"></div>' +
'<span>Loading metadata...</span>' +
'</div>' +
'</div>' +
'<div class="semantic-viewer-footer">' +
'<button id="semanticPrevBtn" class="btn-sm btn-outline" onclick="semanticPrevPage()" disabled>' +
'<i data-lucide="chevron-left" class="w-4 h-4"></i> Previous' +
'</button>' +
'<div class="flex items-center gap-2">' +
'<span class="text-sm text-muted-foreground">Page</span>' +
'<select id="semanticPageSelect" class="cli-setting-select" onchange="semanticGoToPage(this.value)">' +
'<option value="0">1</option>' +
'</select>' +
'</div>' +
'<button id="semanticNextBtn" class="btn-sm btn-outline" onclick="semanticNextPage()" disabled>' +
'Next <i data-lucide="chevron-right" class="w-4 h-4"></i>' +
'</button>' +
'</div>' +
'</div>' +
'</div>';
document.body.appendChild(modal);
requestAnimationFrame(function() {
modal.classList.add('active');
});
var handleEscape = function(e) {
if (e.key === 'Escape') {
closeSemanticMetadataViewer();
document.removeEventListener('keydown', handleEscape);
}
};
document.addEventListener('keydown', handleEscape);
if (window.lucide) {
lucide.createIcons();
}
await loadSemanticMetadata();
}
function closeSemanticMetadataViewer() {
var modal = document.getElementById('semanticMetadataModal');
if (modal) {
modal.classList.remove('active');
setTimeout(function() { modal.remove(); }, 200);
}
}
async function loadSemanticMetadata(offset, toolFilter) {
offset = typeof offset === 'number' ? offset : semanticMetadataCache.offset;
toolFilter = toolFilter !== undefined ? toolFilter : (document.getElementById('semanticToolFilter')?.value || '');
semanticMetadataCache.loading = true;
var container = document.getElementById('semanticMetadataTableContainer');
if (container) {
container.innerHTML =
'<div class="semantic-loading">' +
'<div class="animate-spin w-6 h-6 border-2 border-primary border-t-transparent rounded-full"></div>' +
'<span>Loading metadata...</span>' +
'</div>';
}
try {
var url = '/api/codexlens/semantic/metadata?offset=' + offset + '&limit=' + semanticMetadataCache.limit;
if (toolFilter) {
url += '&tool=' + encodeURIComponent(toolFilter);
}
var response = await fetch(url);
var data = await response.json();
if (data.success && data.result) {
semanticMetadataCache.entries = data.result.entries || [];
semanticMetadataCache.total = data.result.total || 0;
semanticMetadataCache.offset = offset;
renderSemanticMetadataTable();
updateSemanticPagination();
} else {
container.innerHTML =
'<div class="semantic-empty">' +
'<i data-lucide="alert-circle" class="w-8 h-8 text-muted-foreground"></i>' +
'<p>Error loading metadata: ' + (data.error || 'Unknown error') + '</p>' +
'</div>';
if (window.lucide) lucide.createIcons();
}
} catch (err) {
container.innerHTML =
'<div class="semantic-empty">' +
'<i data-lucide="alert-circle" class="w-8 h-8 text-muted-foreground"></i>' +
'<p>Error: ' + err.message + '</p>' +
'</div>';
if (window.lucide) lucide.createIcons();
}
semanticMetadataCache.loading = false;
}
function escapeHtmlSemantic(text) {
if (!text) return '';
var div = document.createElement('div');
div.textContent = text;
return div.innerHTML;
}
function renderSemanticMetadataTable() {
var container = document.getElementById('semanticMetadataTableContainer');
if (!container) return;
var entries = semanticMetadataCache.entries;
if (!entries.length) {
container.innerHTML =
'<div class="semantic-empty">' +
'<i data-lucide="database" class="w-12 h-12 text-muted-foreground mb-3"></i>' +
'<p class="text-lg font-medium">No semantic metadata found</p>' +
'<p class="text-sm text-muted-foreground mt-1">Run \'codex-lens enhance\' to generate metadata for indexed files.</p>' +
'<button class="btn-sm btn-primary mt-4" onclick="closeSemanticMetadataViewer(); runEnhanceCommand();">' +
'<i data-lucide="zap" class="w-3 h-3 mr-1"></i> Run Enhance' +
'</button>' +
'</div>';
if (window.lucide) lucide.createIcons();
return;
}
var rows = entries.map(function(entry, idx) {
var keywordsHtml = (entry.keywords || []).slice(0, 4).map(function(k) {
return '<span class="semantic-keyword">' + escapeHtmlSemantic(k) + '</span>';
}).join('');
if ((entry.keywords || []).length > 4) {
keywordsHtml += '<span class="semantic-keyword-more">+' + (entry.keywords.length - 4) + '</span>';
}
var date = entry.generated_at ? new Date(entry.generated_at * 1000).toLocaleDateString() : '-';
return (
'<tr class="semantic-row" onclick="toggleSemanticDetail(' + idx + ')">' +
'<td class="semantic-cell-file">' +
'<div class="flex items-center gap-2">' +
'<i data-lucide="file-code" class="w-4 h-4 text-muted-foreground"></i>' +
'<span class="font-medium">' + escapeHtmlSemantic(entry.file_name || '-') + '</span>' +
'</div>' +
'<div class="text-xs text-muted-foreground truncate" title="' + escapeHtmlSemantic(entry.full_path || '') + '">' +
escapeHtmlSemantic(entry.full_path || '-') +
'</div>' +
'</td>' +
'<td class="semantic-cell-lang">' + escapeHtmlSemantic(entry.language || '-') + '</td>' +
'<td class="semantic-cell-purpose">' + escapeHtmlSemantic((entry.purpose || '-').substring(0, 50)) +
((entry.purpose || '').length > 50 ? '...' : '') + '</td>' +
'<td class="semantic-cell-keywords">' + (keywordsHtml || '-') + '</td>' +
'<td class="semantic-cell-tool">' +
'<span class="tool-badge tool-' + (entry.llm_tool || 'unknown') + '">' +
escapeHtmlSemantic(entry.llm_tool || '-') +
'</span>' +
'</td>' +
'<td class="semantic-cell-date">' + date + '</td>' +
'</tr>' +
'<tr id="semanticDetail' + idx + '" class="semantic-detail-row hidden">' +
'<td colspan="6">' +
'<div class="semantic-detail-content">' +
'<div class="semantic-detail-section">' +
'<h4><i data-lucide="file-text" class="w-3 h-3"></i> Summary</h4>' +
'<p>' + escapeHtmlSemantic(entry.summary || 'No summary available') + '</p>' +
'</div>' +
'<div class="semantic-detail-section">' +
'<h4><i data-lucide="tag" class="w-3 h-3"></i> All Keywords</h4>' +
'<div class="semantic-keywords-full">' +
(entry.keywords || []).map(function(k) {
return '<span class="semantic-keyword">' + escapeHtmlSemantic(k) + '</span>';
}).join('') +
'</div>' +
'</div>' +
'<div class="semantic-detail-meta">' +
'<span><i data-lucide="hash" class="w-3 h-3"></i> ' + (entry.line_count || 0) + ' lines</span>' +
'<span><i data-lucide="cpu" class="w-3 h-3"></i> ' + escapeHtmlSemantic(entry.llm_tool || 'Unknown') + '</span>' +
'<span><i data-lucide="calendar" class="w-3 h-3"></i> ' + date + '</span>' +
'</div>' +
'</div>' +
'</td>' +
'</tr>'
);
}).join('');
container.innerHTML =
'<table class="semantic-table">' +
'<thead>' +
'<tr>' +
'<th>File</th>' +
'<th>Language</th>' +
'<th>Purpose</th>' +
'<th>Keywords</th>' +
'<th>Tool</th>' +
'<th>Date</th>' +
'</tr>' +
'</thead>' +
'<tbody>' + rows + '</tbody>' +
'</table>';
if (window.lucide) lucide.createIcons();
}
function toggleSemanticDetail(idx) {
var detailRow = document.getElementById('semanticDetail' + idx);
if (detailRow) {
detailRow.classList.toggle('hidden');
if (window.lucide) lucide.createIcons();
}
}
function updateSemanticPagination() {
var total = semanticMetadataCache.total;
var offset = semanticMetadataCache.offset;
var limit = semanticMetadataCache.limit;
var entries = semanticMetadataCache.entries;
var countBadge = document.getElementById('semanticMetadataCount');
if (countBadge) {
countBadge.textContent = total + ' entries';
}
var paginationInfo = document.getElementById('semanticPaginationInfo');
if (paginationInfo) {
if (total > 0) {
paginationInfo.textContent = (offset + 1) + '-' + (offset + entries.length) + ' of ' + total;
} else {
paginationInfo.textContent = 'No entries';
}
}
var pageSelect = document.getElementById('semanticPageSelect');
if (pageSelect) {
var totalPages = Math.ceil(total / limit) || 1;
var currentPage = Math.floor(offset / limit);
pageSelect.innerHTML = '';
for (var i = 0; i < totalPages; i++) {
var opt = document.createElement('option');
opt.value = i;
opt.textContent = i + 1;
if (i === currentPage) opt.selected = true;
pageSelect.appendChild(opt);
}
}
var prevBtn = document.getElementById('semanticPrevBtn');
var nextBtn = document.getElementById('semanticNextBtn');
if (prevBtn) prevBtn.disabled = offset === 0;
if (nextBtn) nextBtn.disabled = offset + limit >= total;
}
function semanticPrevPage() {
if (semanticMetadataCache.offset > 0) {
loadSemanticMetadata(Math.max(0, semanticMetadataCache.offset - semanticMetadataCache.limit));
}
}
function semanticNextPage() {
if (semanticMetadataCache.offset + semanticMetadataCache.limit < semanticMetadataCache.total) {
loadSemanticMetadata(semanticMetadataCache.offset + semanticMetadataCache.limit);
}
}
function semanticGoToPage(pageIndex) {
var offset = parseInt(pageIndex, 10) * semanticMetadataCache.limit;
loadSemanticMetadata(offset);
}
function filterSemanticByTool(tool) {
loadSemanticMetadata(0, tool);
}
function refreshSemanticMetadata() {
loadSemanticMetadata(semanticMetadataCache.offset);
}
function getLlmEnhancementSettings() {
return Object.assign({}, llmEnhancementSettings);
}

View File

@@ -277,35 +277,10 @@ const i18n = {
// Semantic Search Configuration
'semantic.settings': 'Semantic Search Settings',
'semantic.configDesc': 'Configure LLM enhancement for semantic indexing',
'semantic.llmEnhancement': 'LLM Enhancement',
'semantic.llmDesc': 'Use LLM to generate code summaries for better semantic search',
'semantic.primaryTool': 'Primary LLM Tool',
'semantic.fallbackTool': 'Fallback Tool',
'semantic.batchSize': 'Batch Size',
'semantic.timeout': 'Timeout',
'semantic.file': 'file',
'semantic.files': 'files',
'semantic.enhanceInfo': 'LLM enhancement generates code summaries and keywords for each file, improving semantic search accuracy.',
'semantic.enhanceCommand': 'Run',
'semantic.enhanceAfterEnable': 'after enabling to process existing files.',
'semantic.runEnhanceNow': 'Run Enhance Now',
'semantic.viewStatus': 'View Status',
'semantic.testSearch': 'Test Semantic Search',
'semantic.searchPlaceholder': 'Enter semantic query (e.g., authentication logic, error handling)',
'semantic.runSearch': 'Run Semantic Search',
'semantic.close': 'Close',
'semantic.enabled': 'enabled',
'semantic.disabled': 'disabled',
'semantic.toolSetTo': 'Primary LLM tool set to',
'semantic.fallbackSetTo': 'Fallback tool set to',
'semantic.none': 'none',
'semantic.llmEnhancement': 'LLM Enhancement',
'semantic.batchSetTo': 'Batch size set to',
'semantic.timeoutSetTo': 'Timeout set to',
'semantic.minute': 'minute',
'semantic.minutes': 'minutes',
'semantic.enableFirst': 'Please enable LLM Enhancement first',
'cli.settings': 'CLI Execution Settings',
'cli.promptFormat': 'Prompt Format',
@@ -1407,35 +1382,10 @@ const i18n = {
// Semantic Search 配置
'semantic.settings': '语义搜索设置',
'semantic.configDesc': '配置语义索引的 LLM 增强功能',
'semantic.llmEnhancement': 'LLM 增强',
'semantic.llmDesc': '使用 LLM 生成代码摘要以改进语义搜索',
'semantic.primaryTool': '主 LLM 工具',
'semantic.fallbackTool': '备用工具',
'semantic.batchSize': '批处理大小',
'semantic.timeout': '超时时间',
'semantic.file': '个文件',
'semantic.files': '个文件',
'semantic.enhanceInfo': 'LLM 增强为每个文件生成代码摘要和关键词,提高语义搜索准确度。',
'semantic.enhanceCommand': '运行',
'semantic.enhanceAfterEnable': '启用后处理现有文件。',
'semantic.runEnhanceNow': '立即运行增强',
'semantic.viewStatus': '查看状态',
'semantic.testSearch': '测试语义搜索',
'semantic.searchPlaceholder': '输入语义查询(例如:身份验证逻辑、错误处理)',
'semantic.runSearch': '运行语义搜索',
'semantic.close': '关闭',
'semantic.enabled': '已启用',
'semantic.disabled': '已禁用',
'semantic.toolSetTo': '主 LLM 工具已设置为',
'semantic.fallbackSetTo': '备用工具已设置为',
'semantic.none': '无',
'semantic.llmEnhancement': 'LLM 增强',
'semantic.batchSetTo': '批量大小已设置为',
'semantic.timeoutSetTo': '超时已设置为',
'semantic.minute': '分钟',
'semantic.minutes': '分钟',
'semantic.enableFirst': '请先启用 LLM 增强',
'cli.settings': 'CLI 调用设置',
'cli.promptFormat': '提示词格式',

View File

@@ -397,13 +397,11 @@ function renderToolsSection() {
// Semantic Search item (only show if CodexLens is installed)
var semanticHtml = '';
if (codexLensStatus.ready) {
semanticHtml = '<div class="tool-item clickable ' + (semanticStatus.available ? 'available' : 'unavailable') + '" onclick="openSemanticSettingsModal()">' +
semanticHtml = '<div class="tool-item ' + (semanticStatus.available ? 'available' : 'unavailable') + '">' +
'<div class="tool-item-left">' +
'<span class="tool-status-dot ' + (semanticStatus.available ? 'status-available' : 'status-unavailable') + '"></span>' +
'<div class="tool-item-info">' +
'<div class="tool-item-name">Semantic Search <span class="tool-type-badge ai">AI</span>' +
(llmEnhancementSettings.enabled ? '<span class="tool-type-badge llm">LLM</span>' : '') +
'<i data-lucide="settings" class="w-3 h-3 tool-config-icon"></i></div>' +
'<div class="tool-item-name">Semantic Search <span class="tool-type-badge ai">AI</span></div>' +
'<div class="tool-item-desc">' + (semanticStatus.available ? 'AI-powered code understanding' : 'Natural language code search') + '</div>' +
'</div>' +
'</div>' +

View File

@@ -1,316 +0,0 @@
# CLI Integration Summary - Embedding Management
**Date**: 2025-12-16
**Version**: v0.5.1
**Status**: ✅ Complete
---
## Overview
Completed integration of embedding management commands into the CodexLens CLI, making vector search functionality more accessible and user-friendly. Users no longer need to run standalone scripts - all embedding operations are now available through simple CLI commands.
## What Changed
### 1. New CLI Commands
#### `codexlens embeddings-generate`
**Purpose**: Generate semantic embeddings for code search
**Features**:
- Accepts project directory or direct `_index.db` path
- Auto-finds index for project paths using registry
- Supports 4 model profiles (fast, code, multilingual, balanced)
- Force regeneration with `--force` flag
- Configurable chunk size
- Verbose mode with progress updates
- JSON output mode for scripting
**Examples**:
```bash
# Generate embeddings for a project
codexlens embeddings-generate ~/projects/my-app
# Use specific model
codexlens embeddings-generate ~/projects/my-app --model fast
# Force regeneration
codexlens embeddings-generate ~/projects/my-app --force
# Verbose output
codexlens embeddings-generate ~/projects/my-app -v
```
**Output**:
```
Generating embeddings
Index: ~/.codexlens/indexes/my-app/_index.db
Model: code
✓ Embeddings generated successfully!
Model: jinaai/jina-embeddings-v2-base-code
Chunks created: 1,234
Files processed: 89
Time: 45.2s
Use vector search with:
codexlens search 'your query' --mode pure-vector
```
#### `codexlens embeddings-status`
**Purpose**: Check embedding status for indexes
**Features**:
- Check all indexes (no arguments)
- Check specific project or index
- Summary table view
- File coverage statistics
- Missing files detection
- JSON output mode
**Examples**:
```bash
# Check all indexes
codexlens embeddings-status
# Check specific project
codexlens embeddings-status ~/projects/my-app
# Check specific index
codexlens embeddings-status ~/.codexlens/indexes/my-app/_index.db
```
**Output (all indexes)**:
```
Embedding Status Summary
Index root: ~/.codexlens/indexes
Total indexes: 5
Indexes with embeddings: 3/5
Total chunks: 4,567
Project Files Chunks Coverage Status
my-app 89 1,234 100.0% ✓
other-app 145 2,456 95.5% ✓
test-proj 23 877 100.0% ✓
no-emb 67 0 0.0% —
legacy 45 0 0.0% —
```
**Output (specific project)**:
```
Embedding Status
Index: ~/.codexlens/indexes/my-app/_index.db
✓ Embeddings available
Total chunks: 1,234
Total files: 89
Files with embeddings: 89/89
Coverage: 100.0%
```
### 2. Improved Error Messages
Enhanced error messages throughout the search pipeline to guide users to the new CLI commands:
**Before**:
```
DEBUG: No semantic_chunks table found
DEBUG: Vector store is empty
```
**After**:
```
INFO: No embeddings found in index. Generate embeddings with: codexlens embeddings-generate ~/projects/my-app
WARNING: Pure vector search returned no results. This usually means embeddings haven't been generated. Run: codexlens embeddings-generate ~/projects/my-app
```
**Locations Updated**:
- `src/codexlens/search/hybrid_search.py` - Added helpful info messages
- `src/codexlens/cli/commands.py` - Improved error hints in CLI output
### 3. Backend Infrastructure
Created `src/codexlens/cli/embedding_manager.py` with reusable functions:
**Functions**:
- `check_index_embeddings(index_path)` - Check embedding status
- `generate_embeddings(index_path, ...)` - Generate embeddings
- `find_all_indexes(scan_dir)` - Find all indexes in directory
- `get_embedding_stats_summary(index_root)` - Aggregate stats for all indexes
**Architecture**:
- Follows same pattern as `model_manager.py` for consistency
- Returns standardized result dictionaries `{"success": bool, "result": dict}`
- Supports progress callbacks for UI updates
- Handles all error cases gracefully
### 4. Documentation Updates
Updated user-facing documentation to reference new CLI commands:
**Files Updated**:
1. `docs/PURE_VECTOR_SEARCH_GUIDE.md`
- Changed all references from `python scripts/generate_embeddings.py` to `codexlens embeddings-generate`
- Updated troubleshooting section
- Added new `embeddings-status` examples
2. `docs/IMPLEMENTATION_SUMMARY.md`
- Marked P1 priorities as complete
- Added CLI integration to checklist
- Updated feature list
3. `src/codexlens/cli/commands.py`
- Updated search command help text to reference new commands
## Files Created
| File | Purpose | Lines |
|------|---------|-------|
| `src/codexlens/cli/embedding_manager.py` | Backend logic for embedding operations | ~290 |
| `docs/CLI_INTEGRATION_SUMMARY.md` | This document | ~400 |
## Files Modified
| File | Changes |
|------|---------|
| `src/codexlens/cli/commands.py` | Added 2 new commands (~270 lines) |
| `src/codexlens/search/hybrid_search.py` | Improved error messages (~20 lines) |
| `docs/PURE_VECTOR_SEARCH_GUIDE.md` | Updated CLI references (~10 changes) |
| `docs/IMPLEMENTATION_SUMMARY.md` | Marked P1 complete (~10 lines) |
## Testing Workflow
### Manual Testing Checklist
- [ ] `codexlens embeddings-status` with no indexes
- [ ] `codexlens embeddings-status` with multiple indexes
- [ ] `codexlens embeddings-status ~/projects/my-app` (project path)
- [ ] `codexlens embeddings-status ~/.codexlens/indexes/my-app/_index.db` (direct path)
- [ ] `codexlens embeddings-generate ~/projects/my-app` (first time)
- [ ] `codexlens embeddings-generate ~/projects/my-app` (already exists, should error)
- [ ] `codexlens embeddings-generate ~/projects/my-app --force` (regenerate)
- [ ] `codexlens embeddings-generate ~/projects/my-app --model fast`
- [ ] `codexlens embeddings-generate ~/projects/my-app -v` (verbose output)
- [ ] `codexlens search "query" --mode pure-vector` (with embeddings)
- [ ] `codexlens search "query" --mode pure-vector` (without embeddings, check error message)
- [ ] `codexlens embeddings-status --json` (JSON output)
- [ ] `codexlens embeddings-generate ~/projects/my-app --json` (JSON output)
### Expected Test Results
**Without embeddings**:
```bash
$ codexlens embeddings-status ~/projects/my-app
Embedding Status
Index: ~/.codexlens/indexes/my-app/_index.db
— No embeddings found
Total files indexed: 89
Generate embeddings with:
codexlens embeddings-generate ~/projects/my-app
```
**After generating embeddings**:
```bash
$ codexlens embeddings-generate ~/projects/my-app
Generating embeddings
Index: ~/.codexlens/indexes/my-app/_index.db
Model: code
✓ Embeddings generated successfully!
Model: jinaai/jina-embeddings-v2-base-code
Chunks created: 1,234
Files processed: 89
Time: 45.2s
```
**Status after generation**:
```bash
$ codexlens embeddings-status ~/projects/my-app
Embedding Status
Index: ~/.codexlens/indexes/my-app/_index.db
✓ Embeddings available
Total chunks: 1,234
Total files: 89
Files with embeddings: 89/89
Coverage: 100.0%
```
**Pure vector search**:
```bash
$ codexlens search "how to authenticate users" --mode pure-vector
Found 5 results in 12.3ms:
auth/authentication.py:42 [0.876]
def authenticate_user(username: str, password: str) -> bool:
'''Verify user credentials against database.'''
return check_password(username, password)
...
```
## User Experience Improvements
| Before | After |
|--------|-------|
| Run separate Python script | Single CLI command |
| Manual path resolution | Auto-finds project index |
| No status check | `embeddings-status` command |
| Generic error messages | Helpful hints with commands |
| Script-level documentation | Integrated `--help` text |
## Backward Compatibility
- ✅ Standalone script `scripts/generate_embeddings.py` still works
- ✅ All existing search modes unchanged
- ✅ Pure vector implementation backward compatible
- ✅ No breaking changes to APIs
## Next Steps (Optional)
Future enhancements users might want:
1. **Batch operations**:
```bash
codexlens embeddings-generate --all # Generate for all indexes
```
2. **Incremental updates**:
```bash
codexlens embeddings-update ~/projects/my-app # Only changed files
```
3. **Embedding cleanup**:
```bash
codexlens embeddings-delete ~/projects/my-app # Remove embeddings
```
4. **Model management integration**:
```bash
codexlens embeddings-generate ~/projects/my-app --download-model
```
---
## Summary
✅ **Completed**: Full CLI integration for embedding management
✅ **User Experience**: Simplified from multi-step script to single command
✅ **Error Handling**: Helpful messages guide users to correct commands
✅ **Documentation**: All references updated to new CLI commands
✅ **Testing**: Manual testing checklist prepared
**Impact**: Users can now manage embeddings with intuitive CLI commands instead of running scripts, making vector search more accessible and easier to use.
**Command Summary**:
```bash
codexlens embeddings-status [path] # Check status
codexlens embeddings-generate <path> [--model] [--force] # Generate
codexlens search "query" --mode pure-vector # Use vector search
```
The integration is **complete and ready for testing**.

View File

@@ -1,972 +0,0 @@
# Docstring与LLM混合策略设计方案
## 1. 背景与目标
### 1.1 当前问题
现有 `llm_enhancer.py` 的实现存在以下问题:
1. **忽略已有文档**对所有代码无差别调用LLM即使已有高质量的docstring
2. **成本浪费**重复生成已有信息增加API调用费用和时间
3. **信息质量不一致**LLM生成的内容可能不如作者编写的docstring准确
4. **缺少作者意图**丢失了docstring中的设计决策、使用示例等关键信息
### 1.2 设计目标
实现**智能混合策略**结合docstring和LLM的优势
1. **优先使用docstring**:作为最权威的信息源
2. **LLM作为补充**填补docstring缺失或质量不足的部分
3. **智能质量评估**自动判断docstring质量决定是否需要LLM增强
4. **成本优化**减少不必要的LLM调用降低API费用
5. **信息融合**将docstring和LLM生成的内容有机结合
## 2. 技术架构
### 2.1 整体流程
```
Code Symbol
[Docstring Extractor] ← 提取docstring
[Quality Evaluator] ← 评估docstring质量
├─ High Quality → Use Docstring Directly
│ + LLM Generate Keywords Only
├─ Medium Quality → LLM Refine & Enhance
│ (docstring作为base)
└─ Low/No Docstring → LLM Full Generation
(现有流程)
[Metadata Merger] ← 合并docstring和LLM内容
Final SemanticMetadata
```
### 2.2 核心组件
```python
from dataclasses import dataclass
from enum import Enum
from typing import Optional
class DocstringQuality(Enum):
"""Docstring质量等级"""
MISSING = "missing" # 无docstring
LOW = "low" # 质量低:<10字符或纯占位符
MEDIUM = "medium" # 质量中:有基本描述但不完整
HIGH = "high" # 质量高:详细且结构化
@dataclass
class DocstringMetadata:
"""从docstring提取的元数据"""
raw_text: str
quality: DocstringQuality
summary: Optional[str] = None # 提取的摘要
parameters: Optional[dict] = None # 参数说明
returns: Optional[str] = None # 返回值说明
examples: Optional[str] = None # 使用示例
notes: Optional[str] = None # 注意事项
```
## 3. 详细实现步骤
### 3.1 Docstring提取与解析
```python
import re
from typing import Optional
class DocstringExtractor:
"""Docstring提取器"""
# Docstring风格正则
GOOGLE_STYLE_PATTERN = re.compile(
r'Args:|Returns:|Raises:|Examples:|Note:',
re.MULTILINE
)
NUMPY_STYLE_PATTERN = re.compile(
r'Parameters\n-+|Returns\n-+|Examples\n-+',
re.MULTILINE
)
def extract_from_code(self, content: str, symbol: Symbol) -> Optional[str]:
"""从代码中提取docstring"""
lines = content.splitlines()
start_line = symbol.range[0] - 1 # 0-indexed
# 查找函数定义后的第一个字符串字面量
# 通常在函数定义的下一行或几行内
for i in range(start_line + 1, min(start_line + 10, len(lines))):
line = lines[i].strip()
# Python triple-quoted string
if line.startswith('"""') or line.startswith("'''"):
return self._extract_multiline_docstring(lines, i)
return None
def _extract_multiline_docstring(
self,
lines: List[str],
start_idx: int
) -> str:
"""提取多行docstring"""
quote_char = '"""' if lines[start_idx].strip().startswith('"""') else "'''"
docstring_lines = []
# 检查是否单行docstring
first_line = lines[start_idx].strip()
if first_line.count(quote_char) == 2:
# 单行: """This is a docstring."""
return first_line.strip(quote_char).strip()
# 多行docstring
in_docstring = True
for i in range(start_idx, len(lines)):
line = lines[i]
if i == start_idx:
# 第一行:移除开始的引号
docstring_lines.append(line.strip().lstrip(quote_char))
elif quote_char in line:
# 结束行:移除结束的引号
docstring_lines.append(line.strip().rstrip(quote_char))
break
else:
docstring_lines.append(line.strip())
return '\n'.join(docstring_lines).strip()
def parse_docstring(self, raw_docstring: str) -> DocstringMetadata:
"""解析docstring提取结构化信息"""
if not raw_docstring:
return DocstringMetadata(
raw_text="",
quality=DocstringQuality.MISSING
)
# 评估质量
quality = self._evaluate_quality(raw_docstring)
# 提取各个部分
metadata = DocstringMetadata(
raw_text=raw_docstring,
quality=quality,
)
# 提取摘要(第一行或第一段)
metadata.summary = self._extract_summary(raw_docstring)
# 如果是Google或NumPy风格提取结构化内容
if self.GOOGLE_STYLE_PATTERN.search(raw_docstring):
self._parse_google_style(raw_docstring, metadata)
elif self.NUMPY_STYLE_PATTERN.search(raw_docstring):
self._parse_numpy_style(raw_docstring, metadata)
return metadata
def _evaluate_quality(self, docstring: str) -> DocstringQuality:
"""评估docstring质量"""
if not docstring or len(docstring.strip()) == 0:
return DocstringQuality.MISSING
# 检查是否是占位符
placeholders = ['todo', 'fixme', 'tbd', 'placeholder', '...']
if any(p in docstring.lower() for p in placeholders):
return DocstringQuality.LOW
# 长度检查
if len(docstring.strip()) < 10:
return DocstringQuality.LOW
# 检查是否有结构化内容
has_structure = (
self.GOOGLE_STYLE_PATTERN.search(docstring) or
self.NUMPY_STYLE_PATTERN.search(docstring)
)
# 检查是否有足够的描述性文本
word_count = len(docstring.split())
if has_structure and word_count >= 20:
return DocstringQuality.HIGH
elif word_count >= 10:
return DocstringQuality.MEDIUM
else:
return DocstringQuality.LOW
def _extract_summary(self, docstring: str) -> str:
"""提取摘要(第一行或第一段)"""
lines = docstring.split('\n')
# 第一行非空行作为摘要
for line in lines:
if line.strip():
return line.strip()
return ""
def _parse_google_style(self, docstring: str, metadata: DocstringMetadata):
"""解析Google风格docstring"""
# 提取Args
args_match = re.search(r'Args:(.*?)(?=Returns:|Raises:|Examples:|Note:|\Z)', docstring, re.DOTALL)
if args_match:
metadata.parameters = self._parse_args_section(args_match.group(1))
# 提取Returns
returns_match = re.search(r'Returns:(.*?)(?=Raises:|Examples:|Note:|\Z)', docstring, re.DOTALL)
if returns_match:
metadata.returns = returns_match.group(1).strip()
# 提取Examples
examples_match = re.search(r'Examples:(.*?)(?=Note:|\Z)', docstring, re.DOTALL)
if examples_match:
metadata.examples = examples_match.group(1).strip()
def _parse_args_section(self, args_text: str) -> dict:
"""解析参数列表"""
params = {}
# 匹配 "param_name (type): description" 或 "param_name: description"
pattern = re.compile(r'(\w+)\s*(?:\(([^)]+)\))?\s*:\s*(.+)')
for line in args_text.split('\n'):
match = pattern.search(line.strip())
if match:
param_name, param_type, description = match.groups()
params[param_name] = {
'type': param_type,
'description': description.strip()
}
return params
```
### 3.2 智能混合策略引擎
```python
class HybridEnhancer:
"""Docstring与LLM混合增强器"""
def __init__(
self,
llm_enhancer: LLMEnhancer,
docstring_extractor: DocstringExtractor
):
self.llm_enhancer = llm_enhancer
self.docstring_extractor = docstring_extractor
def enhance_with_strategy(
self,
file_data: FileData,
symbols: List[Symbol]
) -> Dict[str, SemanticMetadata]:
"""根据docstring质量选择增强策略"""
results = {}
for symbol in symbols:
# 1. 提取并解析docstring
raw_docstring = self.docstring_extractor.extract_from_code(
file_data.content, symbol
)
doc_metadata = self.docstring_extractor.parse_docstring(raw_docstring or "")
# 2. 根据质量选择策略
semantic_metadata = self._apply_strategy(
file_data, symbol, doc_metadata
)
results[symbol.name] = semantic_metadata
return results
def _apply_strategy(
self,
file_data: FileData,
symbol: Symbol,
doc_metadata: DocstringMetadata
) -> SemanticMetadata:
"""应用混合策略"""
quality = doc_metadata.quality
if quality == DocstringQuality.HIGH:
# 高质量直接使用docstring只用LLM生成keywords
return self._use_docstring_with_llm_keywords(symbol, doc_metadata)
elif quality == DocstringQuality.MEDIUM:
# 中等质量让LLM精炼和增强
return self._refine_with_llm(file_data, symbol, doc_metadata)
else: # LOW or MISSING
# 低质量或无完全由LLM生成
return self._full_llm_generation(file_data, symbol)
def _use_docstring_with_llm_keywords(
self,
symbol: Symbol,
doc_metadata: DocstringMetadata
) -> SemanticMetadata:
"""策略1使用docstringLLM只生成keywords"""
# 直接使用docstring的摘要
summary = doc_metadata.summary or doc_metadata.raw_text[:200]
# 使用LLM生成keywords
keywords = self._generate_keywords_only(summary, symbol.name)
# 从docstring推断purpose
purpose = self._infer_purpose_from_docstring(doc_metadata)
return SemanticMetadata(
summary=summary,
keywords=keywords,
purpose=purpose,
file_path=symbol.file_path if hasattr(symbol, 'file_path') else None,
symbol_name=symbol.name,
llm_tool="hybrid_docstring_primary",
)
def _refine_with_llm(
self,
file_data: FileData,
symbol: Symbol,
doc_metadata: DocstringMetadata
) -> SemanticMetadata:
"""策略2让LLM精炼和增强docstring"""
prompt = f"""
PURPOSE: Refine and enhance an existing docstring for better semantic search
TASK:
- Review the existing docstring
- Generate a concise summary (1-2 sentences) that captures the core purpose
- Extract 8-12 relevant keywords for search
- Identify the functional category/purpose
EXISTING DOCSTRING:
{doc_metadata.raw_text}
CODE CONTEXT:
Function: {symbol.name}
```{file_data.language}
{self._get_symbol_code(file_data.content, symbol)}
```
OUTPUT: JSON format
{{
"summary": "refined summary based on docstring and code",
"keywords": ["keyword1", "keyword2", ...],
"purpose": "category"
}}
"""
response = self.llm_enhancer._invoke_ccw_cli(prompt, tool='gemini')
if response['success']:
data = json.loads(self.llm_enhancer._extract_json(response['stdout']))
return SemanticMetadata(
summary=data.get('summary', doc_metadata.summary),
keywords=data.get('keywords', []),
purpose=data.get('purpose', 'unknown'),
file_path=file_data.path,
symbol_name=symbol.name,
llm_tool="hybrid_llm_refined",
)
# Fallback: 使用docstring
return self._use_docstring_with_llm_keywords(symbol, doc_metadata)
def _full_llm_generation(
self,
file_data: FileData,
symbol: Symbol
) -> SemanticMetadata:
"""策略3完全由LLM生成原有流程"""
# 复用现有的LLM enhancer
code_snippet = self._get_symbol_code(file_data.content, symbol)
results = self.llm_enhancer.enhance_files([
FileData(
path=f"{file_data.path}:{symbol.name}",
content=code_snippet,
language=file_data.language
)
])
return results.get(f"{file_data.path}:{symbol.name}", SemanticMetadata(
summary="",
keywords=[],
purpose="unknown",
file_path=file_data.path,
symbol_name=symbol.name,
llm_tool="hybrid_llm_full",
))
def _generate_keywords_only(self, summary: str, symbol_name: str) -> List[str]:
"""仅生成keywords快速LLM调用"""
prompt = f"""
PURPOSE: Generate search keywords for a code function
TASK: Extract 5-8 relevant keywords from the summary
Summary: {summary}
Function Name: {symbol_name}
OUTPUT: Comma-separated keywords
"""
response = self.llm_enhancer._invoke_ccw_cli(prompt, tool='gemini')
if response['success']:
keywords_str = response['stdout'].strip()
return [k.strip() for k in keywords_str.split(',')]
# Fallback: 从摘要提取关键词
return self._extract_keywords_heuristic(summary)
def _extract_keywords_heuristic(self, text: str) -> List[str]:
"""启发式关键词提取无需LLM"""
# 简单实现:提取名词性词组
import re
words = re.findall(r'\b[a-z]{4,}\b', text.lower())
# 过滤常见词
stopwords = {'this', 'that', 'with', 'from', 'have', 'will', 'your', 'their'}
keywords = [w for w in words if w not in stopwords]
return list(set(keywords))[:8]
def _infer_purpose_from_docstring(self, doc_metadata: DocstringMetadata) -> str:
"""从docstring推断purpose无需LLM"""
summary = doc_metadata.summary.lower()
# 简单规则匹配
if 'authenticate' in summary or 'login' in summary:
return 'auth'
elif 'validate' in summary or 'check' in summary:
return 'validation'
elif 'parse' in summary or 'format' in summary:
return 'data_processing'
elif 'api' in summary or 'endpoint' in summary:
return 'api'
elif 'database' in summary or 'query' in summary:
return 'data'
elif 'test' in summary:
return 'test'
return 'util'
def _get_symbol_code(self, content: str, symbol: Symbol) -> str:
"""提取符号的代码"""
lines = content.splitlines()
start, end = symbol.range
return '\n'.join(lines[start-1:end])
```
### 3.3 成本优化统计
```python
@dataclass
class EnhancementStats:
"""增强统计"""
total_symbols: int = 0
used_docstring_only: int = 0 # 只使用docstring
llm_keywords_only: int = 0 # LLM只生成keywords
llm_refined: int = 0 # LLM精炼docstring
llm_full_generation: int = 0 # LLM完全生成
total_llm_calls: int = 0
estimated_cost_savings: float = 0.0 # 相比全用LLM节省的成本
class CostOptimizedEnhancer(HybridEnhancer):
"""带成本统计的增强器"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.stats = EnhancementStats()
def enhance_with_strategy(
self,
file_data: FileData,
symbols: List[Symbol]
) -> Dict[str, SemanticMetadata]:
"""增强并统计成本"""
self.stats.total_symbols += len(symbols)
results = super().enhance_with_strategy(file_data, symbols)
# 统计各策略使用情况
for metadata in results.values():
if metadata.llm_tool == "hybrid_docstring_primary":
self.stats.used_docstring_only += 1
self.stats.llm_keywords_only += 1
self.stats.total_llm_calls += 1
elif metadata.llm_tool == "hybrid_llm_refined":
self.stats.llm_refined += 1
self.stats.total_llm_calls += 1
elif metadata.llm_tool == "hybrid_llm_full":
self.stats.llm_full_generation += 1
self.stats.total_llm_calls += 1
# 计算成本节省假设keywords-only调用成本为full的20%
keywords_only_savings = self.stats.llm_keywords_only * 0.8 # 节省80%
full_generation_count = self.stats.total_symbols - self.stats.llm_keywords_only
self.stats.estimated_cost_savings = keywords_only_savings / full_generation_count if full_generation_count > 0 else 0
return results
def print_stats(self):
"""打印统计信息"""
print("=== Enhancement Statistics ===")
print(f"Total Symbols: {self.stats.total_symbols}")
print(f"Used Docstring (with LLM keywords): {self.stats.used_docstring_only} ({self.stats.used_docstring_only/self.stats.total_symbols*100:.1f}%)")
print(f"LLM Refined Docstring: {self.stats.llm_refined} ({self.stats.llm_refined/self.stats.total_symbols*100:.1f}%)")
print(f"LLM Full Generation: {self.stats.llm_full_generation} ({self.stats.llm_full_generation/self.stats.total_symbols*100:.1f}%)")
print(f"Total LLM Calls: {self.stats.total_llm_calls}")
print(f"Estimated Cost Savings: {self.stats.estimated_cost_savings*100:.1f}%")
```
## 4. 配置选项
```python
@dataclass
class HybridEnhancementConfig:
"""混合增强配置"""
# 是否启用混合策略False则回退到全LLM模式
enable_hybrid: bool = True
# 质量阈值配置
use_docstring_threshold: DocstringQuality = DocstringQuality.HIGH
refine_docstring_threshold: DocstringQuality = DocstringQuality.MEDIUM
# 是否为高质量docstring生成keywords
generate_keywords_for_docstring: bool = True
# LLM配置
llm_tool: str = "gemini"
llm_timeout: int = 300000
# 成本优化
batch_size: int = 5 # 批量处理大小
skip_test_files: bool = True # 跳过测试文件通常docstring较少
# 调试选项
log_strategy_decisions: bool = False # 记录策略决策日志
```
## 5. 测试策略
### 5.1 单元测试
```python
import pytest
class TestDocstringExtractor:
"""测试docstring提取"""
def test_extract_google_style(self):
"""测试Google风格docstring提取"""
code = '''
def calculate_total(items, discount=0):
"""Calculate total price with optional discount.
This function processes a list of items and applies
a discount if specified.
Args:
items (list): List of item objects with price attribute.
discount (float): Discount percentage (0-1). Defaults to 0.
Returns:
float: Total price after discount.
Examples:
>>> calculate_total([item1, item2], discount=0.1)
90.0
"""
total = sum(item.price for item in items)
return total * (1 - discount)
'''
extractor = DocstringExtractor()
symbol = Symbol(name='calculate_total', kind='function', range=(1, 18))
docstring = extractor.extract_from_code(code, symbol)
assert docstring is not None
metadata = extractor.parse_docstring(docstring)
assert metadata.quality == DocstringQuality.HIGH
assert 'Calculate total price' in metadata.summary
assert metadata.parameters is not None
assert 'items' in metadata.parameters
assert metadata.returns is not None
assert metadata.examples is not None
def test_extract_low_quality_docstring(self):
"""测试低质量docstring识别"""
code = '''
def process():
"""TODO"""
pass
'''
extractor = DocstringExtractor()
symbol = Symbol(name='process', kind='function', range=(1, 3))
docstring = extractor.extract_from_code(code, symbol)
metadata = extractor.parse_docstring(docstring)
assert metadata.quality == DocstringQuality.LOW
class TestHybridEnhancer:
"""测试混合增强器"""
def test_high_quality_docstring_strategy(self):
"""测试高质量docstring使用策略"""
extractor = DocstringExtractor()
llm_enhancer = LLMEnhancer(LLMConfig(enabled=True))
hybrid = HybridEnhancer(llm_enhancer, extractor)
# 模拟高质量docstring
doc_metadata = DocstringMetadata(
raw_text="Validate user credentials against database.",
quality=DocstringQuality.HIGH,
summary="Validate user credentials against database."
)
symbol = Symbol(name='validate_user', kind='function', range=(1, 10))
result = hybrid._use_docstring_with_llm_keywords(symbol, doc_metadata)
# 应该使用docstring的摘要
assert result.summary == doc_metadata.summary
# 应该有keywords可能由LLM或启发式生成
assert len(result.keywords) > 0
def test_cost_optimization(self):
"""测试成本优化效果"""
enhancer = CostOptimizedEnhancer(
llm_enhancer=LLMEnhancer(LLMConfig(enabled=False)), # Mock
docstring_extractor=DocstringExtractor()
)
# 模拟处理10个symbol其中5个有高质量docstring
# 预期5个只调用keywords生成5个完整LLM
# 总调用10次但成本降低keywords调用更便宜
# 实际测试需要mock LLM调用
pass
```
### 5.2 集成测试
```python
class TestHybridEnhancementPipeline:
"""测试完整的混合增强流程"""
def test_full_pipeline(self):
"""测试完整流程:代码 -> docstring提取 -> 质量评估 -> 策略选择 -> 增强"""
code = '''
def authenticate_user(username, password):
"""Authenticate user with username and password.
Args:
username (str): User's username
password (str): User's password
Returns:
bool: True if authenticated, False otherwise
"""
# ... implementation
pass
def helper_func(x):
# No docstring
return x * 2
'''
file_data = FileData(path='auth.py', content=code, language='python')
symbols = [
Symbol(name='authenticate_user', kind='function', range=(1, 11)),
Symbol(name='helper_func', kind='function', range=(13, 15)),
]
extractor = DocstringExtractor()
llm_enhancer = LLMEnhancer(LLMConfig(enabled=True))
hybrid = CostOptimizedEnhancer(llm_enhancer, extractor)
results = hybrid.enhance_with_strategy(file_data, symbols)
# authenticate_user 应该使用docstring
assert results['authenticate_user'].llm_tool == "hybrid_docstring_primary"
# helper_func 应该完全LLM生成
assert results['helper_func'].llm_tool == "hybrid_llm_full"
# 统计
assert hybrid.stats.total_symbols == 2
assert hybrid.stats.used_docstring_only >= 1
assert hybrid.stats.llm_full_generation >= 1
```
## 6. 实施路线图
### Phase 1: 基础设施1周
- [x] 设计数据结构DocstringMetadata, DocstringQuality
- [ ] 实现DocstringExtractor提取和解析
- [ ] 支持Python docstringGoogle/NumPy/reStructuredText风格
- [ ] 单元测试
### Phase 2: 质量评估1周
- [ ] 实现质量评估算法
- [ ] 启发式规则优化
- [ ] 测试不同质量的docstring
- [ ] 调整阈值参数
### Phase 3: 混合策略1-2周
- [ ] 实现HybridEnhancer
- [ ] 三种策略实现docstring-only, refine, full-llm
- [ ] 策略选择逻辑
- [ ] 集成测试
### Phase 4: 成本优化1周
- [ ] 实现CostOptimizedEnhancer
- [ ] 统计和监控
- [ ] 批量处理优化
- [ ] 性能测试
### Phase 5: 多语言支持1-2周
- [ ] JavaScript/TypeScript JSDoc
- [ ] Java Javadoc
- [ ] 其他语言docstring格式
### Phase 6: 集成与部署1周
- [ ] 集成到现有llm_enhancer
- [ ] CLI选项暴露
- [ ] 配置文件支持
- [ ] 文档和示例
**总计预估时间**6-8周
## 7. 性能与成本分析
### 7.1 预期成本节省
假设场景分析1000个函数
| Docstring质量分布 | 占比 | LLM调用策略 | 相对成本 |
|------------------|------|------------|---------|
| High (有详细docstring) | 30% | 只生成keywords | 20% |
| Medium (有基本docstring) | 40% | 精炼增强 | 60% |
| Low/Missing | 30% | 完全生成 | 100% |
**总成本计算**
- 纯LLM模式1000 * 100% = 1000 units
- 混合模式300*20% + 400*60% + 300*100% = 60 + 240 + 300 = 600 units
- **节省**40%
### 7.2 质量对比
| 指标 | 纯LLM模式 | 混合模式 |
|------|----------|---------|
| 准确性 | 中(可能有幻觉) | **高**docstring权威 |
| 一致性 | 中依赖prompt | **高**(保留作者风格) |
| 覆盖率 | **高**(全覆盖) | 高98%+ |
| 成本 | 高 | **低**节省40% |
| 速度 | 慢(所有文件) | **快**减少LLM调用 |
## 8. 潜在问题与解决方案
### 8.1 问题Docstring过时
**现象**代码已修改但docstring未更新导致信息不准确。
**解决方案**
```python
class DocstringFreshnessChecker:
"""检查docstring与代码的一致性"""
def check_freshness(
self,
symbol: Symbol,
code: str,
doc_metadata: DocstringMetadata
) -> bool:
"""检查docstring是否与代码匹配"""
# 检查1: 参数列表是否匹配
if doc_metadata.parameters:
actual_params = self._extract_actual_parameters(code)
documented_params = set(doc_metadata.parameters.keys())
if actual_params != documented_params:
logger.warning(
f"Parameter mismatch in {symbol.name}: "
f"code has {actual_params}, doc has {documented_params}"
)
return False
# 检查2: 使用LLM验证一致性
# TODO: 构建验证prompt
return True
```
### 8.2 问题不同docstring风格混用
**现象**同一项目中使用多种docstring风格Google, NumPy, 自定义)。
**解决方案**
```python
class MultiStyleDocstringParser:
"""支持多种docstring风格的解析器"""
def parse(self, docstring: str) -> DocstringMetadata:
"""自动检测并解析不同风格"""
# 尝试各种解析器
for parser in [
GoogleStyleParser(),
NumpyStyleParser(),
ReStructuredTextParser(),
SimpleParser(), # Fallback
]:
try:
metadata = parser.parse(docstring)
if metadata.quality != DocstringQuality.LOW:
return metadata
except Exception:
continue
# 如果所有解析器都失败,返回简单解析结果
return SimpleParser().parse(docstring)
```
### 8.3 问题多语言docstring提取差异
**现象**不同语言的docstring格式和位置不同。
**解决方案**
```python
class LanguageSpecificExtractor:
"""语言特定的docstring提取器"""
def extract(self, language: str, code: str, symbol: Symbol) -> Optional[str]:
"""根据语言选择合适的提取器"""
extractors = {
'python': PythonDocstringExtractor(),
'javascript': JSDocExtractor(),
'typescript': TSDocExtractor(),
'java': JavadocExtractor(),
}
extractor = extractors.get(language, GenericExtractor())
return extractor.extract(code, symbol)
class JSDocExtractor:
"""JavaScript/TypeScript JSDoc提取器"""
def extract(self, code: str, symbol: Symbol) -> Optional[str]:
"""提取JSDoc注释"""
lines = code.splitlines()
start_line = symbol.range[0] - 1
# 向上查找 /** ... */ 注释
for i in range(start_line - 1, max(0, start_line - 20), -1):
if '*/' in lines[i]:
# 找到结束标记,向上提取
return self._extract_jsdoc_block(lines, i)
return None
```
## 9. 配置示例
### 9.1 配置文件
```yaml
# .codexlens/hybrid_enhancement.yaml
hybrid_enhancement:
enabled: true
# 质量阈值
quality_thresholds:
use_docstring: high # high/medium/low
refine_docstring: medium
# LLM选项
llm:
tool: gemini
fallback: qwen
timeout_ms: 300000
batch_size: 5
# 成本优化
cost_optimization:
generate_keywords_for_docstring: true
skip_test_files: true
skip_private_methods: false
# 语言支持
languages:
python:
styles: [google, numpy, sphinx]
javascript:
styles: [jsdoc]
java:
styles: [javadoc]
# 监控
logging:
log_strategy_decisions: false
log_cost_savings: true
```
### 9.2 CLI使用
```bash
# 使用混合策略增强
codex-lens enhance . --hybrid --tool gemini
# 查看成本统计
codex-lens enhance . --hybrid --show-stats
# 仅对高质量docstring生成keywords
codex-lens enhance . --hybrid --keywords-only
# 禁用混合模式回退到纯LLM
codex-lens enhance . --no-hybrid --tool gemini
```
## 10. 成功指标
1. **成本节省**相比纯LLM模式降低API调用成本40%+
2. **准确性提升**使用docstring的符号元数据准确率>95%
3. **覆盖率**98%+的符号有语义元数据docstring或LLM生成
4. **速度提升**整体处理速度提升30%+减少LLM调用
5. **用户满意度**保留docstring信息开发者认可度高
## 11. 参考资料
- [PEP 257 - Docstring Conventions](https://peps.python.org/pep-0257/)
- [Google Python Style Guide - Docstrings](https://google.github.io/styleguide/pyguide.html#38-comments-and-docstrings)
- [NumPy Docstring Standard](https://numpydoc.readthedocs.io/en/latest/format.html)
- [JSDoc Documentation](https://jsdoc.app/)
- [Javadoc Tool](https://docs.oracle.com/javase/8/docs/technotes/tools/windows/javadoc.html)

View File

@@ -394,52 +394,32 @@ results = engine.search(
- 指导用户如何生成嵌入
- 集成到搜索引擎日志中
### LLM语义增强验证 (2025-12-16)
### LLM语义增强功能已移除 (2025-12-16)
**测试目标**: 验证LLM增强的向量搜索是否正常工作对比纯向量搜索效果
**移除原因**: 简化代码库,减少外部依赖
**测试基础设施**:
- 创建测试套件 `tests/test_llm_enhanced_search.py` (550+ lines)
- 创建独立测试脚本 `scripts/compare_search_methods.py` (460+ lines)
- 创建完整文档 `docs/LLM_ENHANCED_SEARCH_GUIDE.md` (460+ lines)
**已移除内容**:
- `src/codexlens/semantic/llm_enhancer.py` - LLM增强核心模块
- `src/codexlens/cli/commands.py` 中的 `enhance` 命令
- `tests/test_llm_enhancer.py` - LLM增强测试
- `tests/test_llm_enhanced_search.py` - LLM对比测试
- `scripts/compare_search_methods.py` - 对比测试脚本
- `scripts/test_misleading_comments.py` - 误导性注释测试
- `scripts/show_llm_analysis.py` - LLM分析展示脚本
- `scripts/inspect_llm_summaries.py` - LLM摘要检查工具
- `docs/LLM_ENHANCED_SEARCH_GUIDE.md` - LLM使用指南
- `docs/LLM_ENHANCEMENT_TEST_RESULTS.md` - LLM测试结果
- `docs/MISLEADING_COMMENTS_TEST_RESULTS.md` - 误导性注释测试结果
- `docs/CLI_INTEGRATION_SUMMARY.md` - CLI集成文档包含enhance命令
- `docs/DOCSTRING_LLM_HYBRID_DESIGN.md` - LLM混合策略设计
**测试数据**:
- 5个真实Python代码样本 (认证、API、验证、数据库)
- 6个自然语言测试查询
- 涵盖密码哈希、JWT令牌、用户API、邮箱验证、数据库连接等场景
**保留功能**:
- ✅ 纯向量搜索 (pure_vector) 完整保留
- ✅ 语义嵌入生成 (`codexlens embeddings-generate`)
- ✅ 语义嵌入状态检查 (`codexlens embeddings-status`)
- ✅ 所有核心搜索功能
**测试结果** (2025-12-16):
```
数据集: 5个Python文件, 5个查询
测试工具: Gemini Flash 2.5
Setup Time:
- Pure Vector: 2.3秒 (直接嵌入代码)
- LLM-Enhanced: 174.2秒 (通过Gemini生成摘要, 75x slower)
Accuracy:
- Pure Vector: 5/5 (100%) - 所有查询Rank 1
- LLM-Enhanced: 5/5 (100%) - 所有查询Rank 1
- Score: 15 vs 15 (平局)
```
**关键发现**:
1.**LLM增强功能正常工作**
- CCW CLI集成正常
- Gemini API调用成功
- 摘要生成和嵌入创建正常
2. **性能权衡**
- 索引阶段慢75倍 (LLM API调用开销)
- 查询阶段速度相同 (都是向量相似度搜索)
- 适合离线索引,在线查询场景
3. **准确性**
- 测试数据集太简单 (5文件完美1:1映射)
- 两种方法都达到100%准确率
- 需要更大、更复杂的代码库来显示差异
**结论**: LLM语义增强功能已验证可正常工作可用于生产环境
**历史记录**: LLM增强功能在测试中表现良好但为简化维护和减少外部依赖CCW CLI, Gemini/Qwen API而移除。设计文档DESIGN_EVALUATION_REPORT.md等保留作为历史参考。
### P2 - 中期1-2月

View File

@@ -1,463 +0,0 @@
# LLM-Enhanced Semantic Search Guide
**Last Updated**: 2025-12-16
**Status**: Experimental Feature
---
## Overview
CodexLens supports two approaches for semantic vector search:
| Approach | Pipeline | Best For |
|----------|----------|----------|
| **Pure Vector** | Code → fastembed → search | Code pattern matching, exact functionality |
| **LLM-Enhanced** | Code → LLM summary → fastembed → search | Natural language queries, conceptual search |
### Why LLM Enhancement?
**Problem**: Raw code embeddings don't match natural language well.
```
Query: "How do I hash passwords securely?"
Raw code: def hash_password(password: str) -> str: ...
Mismatch: Low semantic similarity
```
**Solution**: LLM generates natural language summaries.
```
Query: "How do I hash passwords securely?"
LLM Summary: "Hash a password using bcrypt with specified salt rounds for secure storage"
Match: High semantic similarity ✓
```
## Architecture
### Pure Vector Search Flow
```
1. Code File
└→ "def hash_password(password: str): ..."
2. Chunking
└→ Split into semantic chunks (500-2000 chars)
3. Embedding (fastembed)
└→ Generate 768-dim vector from raw code
4. Storage
└→ Store vector in semantic_chunks table
5. Query
└→ "How to hash passwords"
└→ Generate query vector
└→ Find similar vectors (cosine similarity)
```
**Pros**: Fast, no external dependencies, good for code patterns
**Cons**: Poor semantic match for natural language queries
### LLM-Enhanced Search Flow
```
1. Code File
└→ "def hash_password(password: str): ..."
2. LLM Analysis (Gemini/Qwen via CCW)
└→ Generate summary: "Hash a password using bcrypt..."
└→ Extract keywords: ["password", "hash", "bcrypt", "security"]
└→ Identify purpose: "auth"
3. Embeddable Text Creation
└→ Combine: summary + keywords + purpose + filename
4. Embedding (fastembed)
└→ Generate 768-dim vector from LLM text
5. Storage
└→ Store vector with metadata
6. Query
└→ "How to hash passwords"
└→ Generate query vector
└→ Find similar vectors → Better match! ✓
```
**Pros**: Excellent semantic match for natural language
**Cons**: Slower, requires CCW CLI and LLM access
## Setup Requirements
### 1. Install Dependencies
```bash
# Install semantic search dependencies
pip install codexlens[semantic]
# Install CCW CLI for LLM enhancement
npm install -g ccw
```
### 2. Configure LLM Tools
```bash
# Set primary LLM tool (default: gemini)
export CCW_CLI_SECONDARY_TOOL=gemini
# Set fallback tool (default: qwen)
export CCW_CLI_FALLBACK_TOOL=qwen
# Configure API keys (see CCW documentation)
ccw config set gemini.apiKey YOUR_API_KEY
```
### 3. Verify Setup
```bash
# Check CCW availability
ccw --version
# Check semantic dependencies
python -c "from codexlens.semantic import SEMANTIC_AVAILABLE; print(SEMANTIC_AVAILABLE)"
```
## Running Comparison Tests
### Method 1: Standalone Script (Recommended)
```bash
# Run full comparison (pure vector + LLM-enhanced)
python scripts/compare_search_methods.py
# Use specific LLM tool
python scripts/compare_search_methods.py --tool gemini
python scripts/compare_search_methods.py --tool qwen
# Skip LLM test (only pure vector)
python scripts/compare_search_methods.py --skip-llm
```
**Output Example**:
```
======================================================================
SEMANTIC SEARCH COMPARISON TEST
Pure Vector vs LLM-Enhanced Vector Search
======================================================================
Test dataset: 5 Python files
Test queries: 5 natural language questions
======================================================================
PURE VECTOR SEARCH (Code → fastembed)
======================================================================
Setup: 5 files, 23 chunks in 2.3s
Query Top Result Score
----------------------------------------------------------------------
✓ How do I securely hash passwords? password_hasher.py 0.723
✗ Generate JWT token for authentication user_endpoints.py 0.645
✓ Create new user account via API user_endpoints.py 0.812
✓ Validate email address format validation.py 0.756
~ Connect to PostgreSQL database connection.py 0.689
======================================================================
LLM-ENHANCED SEARCH (Code → GEMINI → fastembed)
======================================================================
Generating LLM summaries for 5 files...
Setup: 5/5 files indexed in 8.7s
Query Top Result Score
----------------------------------------------------------------------
✓ How do I securely hash passwords? password_hasher.py 0.891
✓ Generate JWT token for authentication jwt_handler.py 0.867
✓ Create new user account via API user_endpoints.py 0.923
✓ Validate email address format validation.py 0.845
✓ Connect to PostgreSQL database connection.py 0.801
======================================================================
COMPARISON SUMMARY
======================================================================
Query Pure LLM
----------------------------------------------------------------------
How do I securely hash passwords? ✓ Rank 1 ✓ Rank 1
Generate JWT token for authentication ✗ Miss ✓ Rank 1
Create new user account via API ✓ Rank 1 ✓ Rank 1
Validate email address format ✓ Rank 1 ✓ Rank 1
Connect to PostgreSQL database ~ Rank 2 ✓ Rank 1
----------------------------------------------------------------------
TOTAL SCORE 11 15
======================================================================
ANALYSIS:
✓ LLM enhancement improves results by 36.4%
Natural language summaries match queries better than raw code
```
### Method 2: Pytest Test Suite
```bash
# Run full test suite
pytest tests/test_llm_enhanced_search.py -v -s
# Run specific test
pytest tests/test_llm_enhanced_search.py::TestSearchComparison::test_comparison -v -s
# Skip LLM tests if CCW not available
pytest tests/test_llm_enhanced_search.py -v -s -k "not llm_enhanced"
```
## Using LLM Enhancement in Production
### Option 1: Enhanced Embeddings Generation (Recommended)
Create embeddings with LLM enhancement during indexing:
```python
from pathlib import Path
from codexlens.semantic.llm_enhancer import create_enhanced_indexer, FileData
# Create enhanced indexer
indexer = create_enhanced_indexer(
vector_store_path=Path("~/.codexlens/indexes/project/_index.db"),
llm_tool="gemini",
llm_enabled=True,
)
# Prepare file data
files = [
FileData(
path="auth/password_hasher.py",
content=open("auth/password_hasher.py").read(),
language="python"
),
# ... more files
]
# Index with LLM enhancement
indexed_count = indexer.index_files(files)
print(f"Indexed {indexed_count} files with LLM enhancement")
```
### Option 2: CLI Integration (Coming Soon)
```bash
# Generate embeddings with LLM enhancement
codexlens embeddings-generate ~/projects/my-app --llm-enhanced --tool gemini
# Check which strategy was used
codexlens embeddings-status ~/projects/my-app --show-strategies
```
**Note**: CLI integration is planned but not yet implemented. Currently use Option 1 (Python API).
### Option 3: Hybrid Approach
Combine both strategies for best results:
```python
# Generate both pure and LLM-enhanced embeddings
# 1. Pure vector for exact code matching
generate_pure_embeddings(files)
# 2. LLM-enhanced for semantic matching
generate_llm_embeddings(files)
# Search uses both and ranks by best match
```
## Performance Considerations
### Speed Comparison
| Approach | Indexing Time (100 files) | Query Time | Cost |
|----------|---------------------------|------------|------|
| Pure Vector | ~30s | ~50ms | Free |
| LLM-Enhanced | ~5-10 min | ~50ms | LLM API costs |
**LLM indexing is slower** because:
- Calls external LLM API (gemini/qwen)
- Processes files in batches (default: 5 files/batch)
- Waits for LLM response (~2-5s per batch)
**Query speed is identical** because:
- Both use fastembed for similarity search
- Vector lookup is same speed
- Difference is only in what was embedded
### Cost Estimation
**Gemini Flash (via CCW)**:
- ~$0.10 per 1M input tokens
- Average: ~500 tokens per file
- 100 files = ~$0.005 (half a cent)
**Qwen (local)**:
- Free if running locally
- Slower than Gemini Flash
### When to Use Each Approach
| Use Case | Recommendation |
|----------|----------------|
| **Code pattern search** | Pure vector (e.g., "find all REST endpoints") |
| **Natural language queries** | LLM-enhanced (e.g., "how to authenticate users") |
| **Large codebase** | Pure vector first, LLM for important modules |
| **Personal projects** | LLM-enhanced (cost is minimal) |
| **Enterprise** | Hybrid approach |
## Configuration Options
### LLM Config
```python
from codexlens.semantic.llm_enhancer import LLMConfig, LLMEnhancer
config = LLMConfig(
tool="gemini", # Primary LLM tool
fallback_tool="qwen", # Fallback if primary fails
timeout_ms=300000, # 5 minute timeout
batch_size=5, # Files per batch
max_content_chars=8000, # Max chars per file in prompt
enabled=True, # Enable/disable LLM
)
enhancer = LLMEnhancer(config)
```
### Environment Variables
```bash
# Override default LLM tool
export CCW_CLI_SECONDARY_TOOL=gemini
# Override fallback tool
export CCW_CLI_FALLBACK_TOOL=qwen
# Disable LLM enhancement (fall back to pure vector)
export CODEXLENS_LLM_ENABLED=false
```
## Troubleshooting
### Issue 1: CCW CLI Not Found
**Error**: `CCW CLI not found in PATH, LLM enhancement disabled`
**Solution**:
```bash
# Install CCW globally
npm install -g ccw
# Verify installation
ccw --version
# Check PATH
which ccw # Unix
where ccw # Windows
```
### Issue 2: LLM API Errors
**Error**: `LLM call failed: HTTP 429 Too Many Requests`
**Solution**:
- Reduce batch size in LLMConfig
- Add delay between batches
- Check API quota/limits
- Try fallback tool (qwen)
### Issue 3: Poor LLM Summaries
**Symptom**: LLM summaries are too generic or inaccurate
**Solution**:
- Try different LLM tool (gemini vs qwen)
- Increase max_content_chars (default 8000)
- Manually review and refine summaries
- Fall back to pure vector for code-heavy files
### Issue 4: Slow Indexing
**Symptom**: Indexing takes too long with LLM enhancement
**Solution**:
```python
# Reduce batch size for faster feedback
config = LLMConfig(batch_size=2) # Default is 5
# Or use pure vector for large files
if file_size > 10000:
use_pure_vector()
else:
use_llm_enhanced()
```
## Example Test Queries
### Good for LLM-Enhanced Search
```python
# Natural language, conceptual queries
"How do I authenticate users with JWT?"
"Validate email addresses before saving to database"
"Secure password storage with hashing"
"Create REST API endpoint for user registration"
"Connect to PostgreSQL with connection pooling"
```
### Good for Pure Vector Search
```python
# Code-specific, pattern-matching queries
"bcrypt.hashpw"
"jwt.encode"
"@app.route POST"
"re.match email"
"psycopg2.pool.SimpleConnectionPool"
```
### Best: Combine Both
Use LLM-enhanced for high-level search, then pure vector for refinement:
```python
# Step 1: LLM-enhanced for semantic search
results = search_llm_enhanced("user authentication with tokens")
# Returns: jwt_handler.py, password_hasher.py, user_endpoints.py
# Step 2: Pure vector for exact code pattern
results = search_pure_vector("jwt.encode")
# Returns: jwt_handler.py (exact match)
```
## Future Improvements
- [ ] CLI integration for `--llm-enhanced` flag
- [ ] Incremental LLM summary updates
- [ ] Caching LLM summaries to reduce API calls
- [ ] Hybrid search combining both approaches
- [ ] Custom prompt templates for specific domains
- [ ] Local LLM support (ollama, llama.cpp)
## Related Documentation
- `PURE_VECTOR_SEARCH_GUIDE.md` - Pure vector search usage
- `IMPLEMENTATION_SUMMARY.md` - Technical implementation details
- `scripts/compare_search_methods.py` - Comparison test script
- `tests/test_llm_enhanced_search.py` - Test suite
## References
- **LLM Enhancer Implementation**: `src/codexlens/semantic/llm_enhancer.py`
- **CCW CLI Documentation**: https://github.com/anthropics/ccw
- **Fastembed**: https://github.com/qdrant/fastembed
---
**Questions?** Run the comparison script to see LLM enhancement in action:
```bash
python scripts/compare_search_methods.py
```

View File

@@ -1,232 +0,0 @@
# LLM语义增强测试结果
**测试日期**: 2025-12-16
**状态**: ✅ 通过 - LLM增强功能正常工作
---
## 📊 测试结果概览
### 测试配置
| 项目 | 配置 |
|------|------|
| **测试工具** | Gemini Flash 2.5 (via CCW CLI) |
| **测试数据** | 5个Python代码文件 |
| **查询数量** | 5个自然语言查询 |
| **嵌入模型** | BAAI/bge-small-en-v1.5 (768维) |
### 性能对比
| 指标 | 纯向量搜索 | LLM增强搜索 | 差异 |
|------|-----------|------------|------|
| **索引时间** | 2.3秒 | 174.2秒 | 75倍慢 |
| **查询速度** | ~50ms | ~50ms | 相同 |
| **准确率** | 5/5 (100%) | 5/5 (100%) | 相同 |
| **排名得分** | 15/15 | 15/15 | 平局 |
### 详细结果
所有5个查询都找到了正确的文件 (Rank 1):
| 查询 | 预期文件 | 纯向量 | LLM增强 |
|------|---------|--------|---------|
| 如何安全地哈希密码? | password_hasher.py | [OK] Rank 1 | [OK] Rank 1 |
| 生成JWT令牌进行认证 | jwt_handler.py | [OK] Rank 1 | [OK] Rank 1 |
| 通过API创建新用户账户 | user_endpoints.py | [OK] Rank 1 | [OK] Rank 1 |
| 验证电子邮件地址格式 | validation.py | [OK] Rank 1 | [OK] Rank 1 |
| 连接到PostgreSQL数据库 | connection.py | [OK] Rank 1 | [OK] Rank 1 |
---
## ✅ 验证结论
### 1. LLM增强功能工作正常
-**CCW CLI集成**: 成功调用外部CLI工具
-**Gemini API**: API调用成功无错误
-**摘要生成**: LLM成功生成代码摘要和关键词
-**嵌入创建**: 从摘要成功生成768维向量
-**向量存储**: 正确存储到semantic_chunks表
-**搜索准确性**: 100%准确匹配所有查询
### 2. 性能权衡分析
**优势**:
- 查询速度与纯向量相同 (~50ms)
- 更好的语义理解能力 (理论上)
- 适合自然语言查询
**劣势**:
- 索引阶段慢75倍 (174s vs 2.3s)
- 需要外部LLM API (成本)
- 需要安装和配置CCW CLI
**适用场景**:
- 离线索引,在线查询
- 个人项目 (成本可忽略)
- 重视自然语言查询体验
### 3. 测试数据集局限性
**当前测试太简单**:
- 仅5个文件
- 每个查询完美对应1个文件
- 没有歧义或相似文件
- 两种方法都能轻松找到
**预期在真实场景**:
- 数百或数千个文件
- 多个相似功能的文件
- 模糊或概念性查询
- LLM增强应该表现更好
---
## 🛠️ 测试基础设施
### 创建的文件
1. **测试套件** (`tests/test_llm_enhanced_search.py`)
- 550+ lines
- 完整pytest测试
- 3个测试类 (纯向量, LLM增强, 对比)
2. **独立脚本** (`scripts/compare_search_methods.py`)
- 460+ lines
- 可直接运行: `python scripts/compare_search_methods.py`
- 支持参数: `--tool gemini|qwen`, `--skip-llm`
- 详细对比报告
3. **完整文档** (`docs/LLM_ENHANCED_SEARCH_GUIDE.md`)
- 460+ lines
- 架构对比图
- 设置说明
- 使用示例
- 故障排除
### 运行测试
```bash
# 方式1: 独立脚本 (推荐)
python scripts/compare_search_methods.py --tool gemini
# 方式2: Pytest
pytest tests/test_llm_enhanced_search.py::TestSearchComparison::test_comparison -v -s
# 跳过LLM测试 (仅测试纯向量)
python scripts/compare_search_methods.py --skip-llm
```
### 前置要求
```bash
# 1. 安装语义搜索依赖
pip install codexlens[semantic]
# 2. 安装CCW CLI
npm install -g ccw
# 3. 配置API密钥
ccw config set gemini.apiKey YOUR_API_KEY
```
---
## 🔍 架构对比
### 纯向量搜索流程
```
代码文件 → 分块 → fastembed (768维) → semantic_chunks表 → 向量搜索
```
**优点**: 快速、无需外部依赖、直接嵌入代码
**缺点**: 对自然语言查询理解较弱
### LLM增强搜索流程
```
代码文件 → CCW CLI调用Gemini → 生成摘要+关键词 → fastembed (768维) → semantic_chunks表 → 向量搜索
```
**优点**: 更好的语义理解、适合自然语言查询
**缺点**: 索引慢75倍、需要LLM API、有成本
---
## 💰 成本估算
### Gemini Flash (via CCW)
- 价格: ~$0.10 / 1M input tokens
- 平均: ~500 tokens / 文件
- 100文件成本: ~$0.005 (半分钱)
### Qwen (本地)
- 价格: 免费 (本地运行)
- 速度: 比Gemini Flash慢
---
## 📝 修复的问题
### 1. Unicode编码问题
**问题**: Windows GBK控制台无法显示Unicode符号 (✓, ✗, •)
**修复**: 替换为ASCII符号 ([OK], [X], -)
**影响文件**:
- `scripts/compare_search_methods.py`
- `tests/test_llm_enhanced_search.py`
### 2. 数据库文件锁定
**问题**: Windows无法删除临时数据库 (PermissionError)
**修复**: 添加垃圾回收和异常处理
```python
import gc
gc.collect() # 强制关闭连接
time.sleep(0.1) # 等待Windows释放文件句柄
```
### 3. 正则表达式警告
**问题**: SyntaxWarning about invalid escape sequence `\.`
**状态**: 无害警告,正则表达式正常工作
---
## 🎯 结论和建议
### 核心发现
1.**LLM语义增强功能已验证可用**
2.**测试基础设施完整**
3. ⚠️ **测试数据集需扩展** (当前太简单)
### 使用建议
| 场景 | 推荐方案 |
|------|---------|
| 代码模式搜索 | 纯向量 (如 "find all REST endpoints") |
| 自然语言查询 | LLM增强 (如 "how to authenticate users") |
| 大型代码库 | 纯向量优先重要模块用LLM |
| 个人项目 | LLM增强 (成本可忽略) |
| 企业级应用 | 混合方案 |
### 后续工作 (可选)
- [ ] 使用更大的测试数据集 (100+ files)
- [ ] 测试更复杂的查询 (概念性、模糊查询)
- [ ] 性能优化 (批量LLM调用)
- [ ] 成本优化 (缓存LLM摘要)
- [ ] 混合搜索 (结合两种方法)
---
**完成时间**: 2025-12-16
**测试执行者**: Claude (Sonnet 4.5)
**文档版本**: 1.0

View File

@@ -0,0 +1,342 @@
# LLM增强功能移除总结
**移除日期**: 2025-12-16
**执行者**: 用户请求
**状态**: ✅ 完成
---
## 📋 移除清单
### ✅ 已删除的源代码文件
| 文件 | 说明 |
|------|------|
| `src/codexlens/semantic/llm_enhancer.py` | LLM增强核心模块 (900+ lines) |
### ✅ 已修改的源代码文件
| 文件 | 修改内容 |
|------|---------|
| `src/codexlens/cli/commands.py` | 删除 `enhance` 命令 (lines 1050-1227) |
| `src/codexlens/semantic/__init__.py` | 删除LLM相关导出 (lines 35-69) |
### ✅ 已修改的前端文件CCW Dashboard
| 文件 | 修改内容 |
|------|---------|
| `ccw/src/templates/dashboard-js/components/cli-status.js` | 删除LLM增强设置 (8行)、Semantic Settings Modal (615行)、Metadata Viewer (326行) |
| `ccw/src/templates/dashboard-js/i18n.js` | 删除英文LLM翻译 (26行)、中文LLM翻译 (26行) |
| `ccw/src/templates/dashboard-js/views/cli-manager.js` | 移除LLM badge和设置modal调用 (3行) |
### ✅ 已删除的测试文件
| 文件 | 说明 |
|------|------|
| `tests/test_llm_enhancer.py` | LLM增强单元测试 |
| `tests/test_llm_enhanced_search.py` | LLM vs 纯向量对比测试 (550+ lines) |
### ✅ 已删除的脚本文件
| 文件 | 说明 |
|------|------|
| `scripts/compare_search_methods.py` | 纯向量 vs LLM增强对比脚本 (460+ lines) |
| `scripts/test_misleading_comments.py` | 误导性注释测试脚本 (490+ lines) |
| `scripts/show_llm_analysis.py` | LLM分析展示工具 |
| `scripts/inspect_llm_summaries.py` | LLM摘要检查工具 |
### ✅ 已删除的文档文件
| 文件 | 说明 |
|------|------|
| `docs/LLM_ENHANCED_SEARCH_GUIDE.md` | LLM增强使用指南 (460+ lines) |
| `docs/LLM_ENHANCEMENT_TEST_RESULTS.md` | LLM测试结果文档 |
| `docs/MISLEADING_COMMENTS_TEST_RESULTS.md` | 误导性注释测试结果 |
| `docs/CLI_INTEGRATION_SUMMARY.md` | CLI集成文档包含enhance命令 |
| `docs/DOCSTRING_LLM_HYBRID_DESIGN.md` | Docstring与LLM混合策略设计 |
### ✅ 已更新的文档
| 文件 | 修改内容 |
|------|---------|
| `docs/IMPLEMENTATION_SUMMARY.md` | 添加LLM移除说明列出已删除内容 |
### 📚 保留的设计文档(作为历史参考)
| 文件 | 说明 |
|------|------|
| `docs/DESIGN_EVALUATION_REPORT.md` | 包含LLM混合策略的技术评估报告 |
| `docs/SEMANTIC_GRAPH_DESIGN.md` | 语义图谱设计可能提及LLM |
| `docs/MULTILEVEL_CHUNKER_DESIGN.md` | 多层次分词器设计可能提及LLM |
*这些文档保留作为技术历史参考,不影响当前功能。*
---
## 🔒 移除的功能
### CLI命令
```bash
# 已移除 - 不再可用
codexlens enhance [PATH] --tool gemini --batch-size 5
# 说明此命令用于通过CCW CLI调用Gemini/Qwen生成代码摘要
# 移除原因:减少外部依赖,简化维护
```
### Python API
```python
# 已移除 - 不再可用
from codexlens.semantic import (
LLMEnhancer,
LLMConfig,
SemanticMetadata,
FileData,
EnhancedSemanticIndexer,
create_enhancer,
create_enhanced_indexer,
)
# 移除的类和函数:
# - LLMEnhancer: LLM增强器主类
# - LLMConfig: LLM配置类
# - SemanticMetadata: 语义元数据结构
# - FileData: 文件数据结构
# - EnhancedSemanticIndexer: LLM增强索引器
# - create_enhancer(): 创建增强器的工厂函数
# - create_enhanced_indexer(): 创建增强索引器的工厂函数
```
---
## ✅ 保留的功能
### 完全保留的核心功能
| 功能 | 状态 |
|------|------|
| **纯向量搜索** | ✅ 完整保留 |
| **语义嵌入生成** | ✅ 完整保留 (`codexlens embeddings-generate`) |
| **语义嵌入状态检查** | ✅ 完整保留 (`codexlens embeddings-status`) |
| **混合搜索引擎** | ✅ 完整保留exact + fuzzy + vector |
| **向量存储** | ✅ 完整保留 |
| **语义分块** | ✅ 完整保留 |
| **fastembed集成** | ✅ 完整保留 |
### 可用的CLI命令
```bash
# 生成纯向量嵌入无需LLM
codexlens embeddings-generate [PATH]
# 检查嵌入状态
codexlens embeddings-status [PATH]
# 所有搜索命令
codexlens search [QUERY] --index [PATH]
# 所有索引管理命令
codexlens init [PATH]
codexlens update [PATH]
codexlens clean [PATH]
```
### 可用的Python API
```python
# 完全可用 - 纯向量搜索
from codexlens.semantic import SEMANTIC_AVAILABLE, SEMANTIC_BACKEND
from codexlens.semantic.embedder import Embedder
from codexlens.semantic.vector_store import VectorStore
from codexlens.semantic.chunker import Chunker, ChunkConfig
from codexlens.search.hybrid_search import HybridSearchEngine
# 示例:纯向量搜索
engine = HybridSearchEngine()
results = engine.search(
index_path,
query="your search query",
enable_vector=True,
pure_vector=True, # 纯向量模式
)
```
---
## 🎯 移除原因
### 1. 简化依赖
**移除的外部依赖**:
- CCW CLI (npm package)
- Gemini API (需要API密钥)
- Qwen API (可选)
**保留的依赖**:
- fastembed (ONNX-based轻量级)
- numpy
- Python标准库
### 2. 减少复杂性
- **前**: 两种搜索方式(纯向量 + LLM增强
- **后**: 一种搜索方式(纯向量)
- 移除了900+ lines的LLM增强代码
- 移除了CLI命令和相关配置
- 移除了测试和文档
### 3. 性能考虑
| 方面 | LLM增强 | 纯向量 |
|------|---------|--------|
| **索引速度** | 慢75倍 | 基准 |
| **查询速度** | 相同 | 相同 |
| **准确率** | 相同* | 基准 |
| **成本** | API费用 | 免费 |
*在测试数据集上准确率相同5/5但LLM增强理论上在更复杂场景下可能更好
### 4. 维护负担
**移除前**:
- 需要维护CCW CLI集成
- 需要处理API限流和错误
- 需要测试多个LLM后端
- 需要维护批处理逻辑
**移除后**:
- 单一嵌入引擎fastembed
- 无外部API依赖
- 更简单的错误处理
- 更容易测试
---
## 🔍 验证结果
### 导入测试
```bash
# ✅ 通过 - 语义模块正常
python -c "from codexlens.semantic import SEMANTIC_AVAILABLE; print(SEMANTIC_AVAILABLE)"
# Output: True
# ✅ 通过 - 搜索引擎正常
python -c "from codexlens.search.hybrid_search import HybridSearchEngine; print('OK')"
# Output: OK
```
### 代码清洁度验证
```bash
# ✅ 通过 - 无遗留LLM引用
grep -r "llm_enhancer\|LLMEnhancer\|LLMConfig" src/ --include="*.py"
# Output: (空)
```
### 测试结果
```bash
# ✅ 5/7通过 - 纯向量搜索基本功能正常
pytest tests/test_pure_vector_search.py -v
# 通过: 5个基本测试
# 失败: 2个嵌入测试已知的模型维度不匹配问题与LLM移除无关
```
---
## 📊 统计
### 代码删除统计
| 类型 | 删除文件数 | 删除行数(估计) |
|------|-----------|-----------------|
| **源代码** | 1 | ~900 lines |
| **CLI命令** | 1 command | ~180 lines |
| **导出清理** | 1 section | ~35 lines |
| **前端代码** | 3 files | ~1000 lines |
| **测试文件** | 2 | ~600 lines |
| **脚本工具** | 4 | ~1500 lines |
| **文档** | 5 | ~2000 lines |
| **总计** | 16 files/sections | ~6200 lines |
### 依赖简化
| 方面 | 移除前 | 移除后 |
|------|--------|--------|
| **外部工具依赖** | CCW CLI, Gemini/Qwen | 无 |
| **Python包依赖** | fastembed, numpy | fastembed, numpy |
| **API依赖** | Gemini/Qwen API | 无 |
| **配置复杂度** | 高tool, batch_size, API keys | 低model profile |
---
## 🚀 后续建议
### 如果需要LLM增强功能
1. **从git历史恢复**
```bash
# 查看删除前的提交
git log --all --full-history -- "*llm_enhancer*"
# 恢复特定文件
git checkout <commit-hash> -- src/codexlens/semantic/llm_enhancer.py
```
2. **或使用外部工具**
- 在索引前使用独立脚本生成摘要
- 将摘要作为注释添加到代码中
- 然后使用纯向量索引(会包含摘要)
3. **或考虑轻量级替代方案**
- 使用本地小模型llama.cpp, ggml
- 使用docstring提取无需LLM
- 使用静态分析生成摘要
### 代码库维护建议
1. ✅ **保持简单** - 继续使用纯向量搜索
2. ✅ **优化现有功能** - 改进向量搜索准确性
3. ✅ **增量改进** - 优化分块策略和嵌入质量
4. ⚠️ **避免重复** - 如需LLM先评估是否真正必要
---
## 📝 文件清单
### 删除的文件完整列表
```
src/codexlens/semantic/llm_enhancer.py
tests/test_llm_enhancer.py
tests/test_llm_enhanced_search.py
scripts/compare_search_methods.py
scripts/test_misleading_comments.py
scripts/show_llm_analysis.py
scripts/inspect_llm_summaries.py
docs/LLM_ENHANCED_SEARCH_GUIDE.md
docs/LLM_ENHANCEMENT_TEST_RESULTS.md
docs/MISLEADING_COMMENTS_TEST_RESULTS.md
docs/CLI_INTEGRATION_SUMMARY.md
docs/DOCSTRING_LLM_HYBRID_DESIGN.md
```
### 修改的文件
```
src/codexlens/cli/commands.py (删除enhance命令)
src/codexlens/semantic/__init__.py (删除LLM导出)
ccw/src/templates/dashboard-js/components/cli-status.js (删除LLM配置、Settings Modal、Metadata Viewer)
ccw/src/templates/dashboard-js/i18n.js (删除LLM翻译字符串)
ccw/src/templates/dashboard-js/views/cli-manager.js (移除LLM badge和modal调用)
docs/IMPLEMENTATION_SUMMARY.md (添加移除说明)
```
---
**移除完成时间**: 2025-12-16
**文档版本**: 1.0
**验证状态**: ✅ 通过

View File

@@ -1,301 +0,0 @@
# 误导性注释测试结果
**测试日期**: 2025-12-16
**测试目的**: 验证LLM增强搜索是否能克服错误/缺失的代码注释
---
## 📊 测试结果总结
### 性能对比
| 方法 | 索引时间 | 准确率 | 得分 | 结论 |
|------|---------|--------|------|------|
| **纯向量搜索** | 2.1秒 | 5/5 (100%) | 15/15 | ✅ 未被误导性注释影响 |
| **LLM增强搜索** | 103.7秒 | 5/5 (100%) | 15/15 | ✅ 正确识别实际功能 |
**结论**: 平局 - 两种方法都能正确处理误导性注释
---
## 🧪 测试数据集设计
### 误导性代码样本 (5个文件)
| 文件 | 错误注释 | 实际功能 | 误导程度 |
|------|---------|---------|---------|
| `crypto/hasher.py` | "Simple string utilities" | bcrypt密码哈希 | 高 |
| `auth/token.py` | 无注释,模糊函数名 | JWT令牌生成 | 中 |
| `api/handlers.py` | "Database utilities", 反向docstrings | REST API用户管理 | 极高 |
| `utils/checker.py` | "Math calculation functions" | 邮箱地址验证 | 高 |
| `db/pool.py` | "Email sending service" | PostgreSQL连接池 | 极高 |
### 具体误导示例
#### 示例 1: 完全错误的模块描述
```python
"""Email sending service.""" # 错误!
import psycopg2 # 实际是数据库库
from psycopg2 import pool
class EmailSender: # 错误的类名
"""SMTP email sender with retry logic.""" # 错误!
def __init__(self, min_conn: int = 1, max_conn: int = 10):
"""Initialize email sender.""" # 错误!
self.pool = psycopg2.pool.SimpleConnectionPool(...) # 实际是DB连接池
```
**实际功能**: PostgreSQL数据库连接池管理器
**注释声称**: SMTP邮件发送服务
#### 示例 2: 反向的函数文档
```python
@app.route('/api/items', methods=['POST'])
def create_item():
"""Delete an existing item.""" # 完全相反!
data = request.get_json()
# 实际是创建新项目
return jsonify({'item_id': item_id}), 201
```
### 测试查询 (基于实际功能)
| 查询 | 预期文件 | 查询难度 |
|------|---------|---------|
| "Hash passwords securely with bcrypt" | `crypto/hasher.py` | 高 - 注释说string utils |
| "Generate JWT authentication token" | `auth/token.py` | 中 - 无注释 |
| "Create user account REST API endpoint" | `api/handlers.py` | 高 - 注释说database |
| "Validate email address format" | `utils/checker.py` | 高 - 注释说math |
| "PostgreSQL database connection pool" | `db/pool.py` | 极高 - 注释说email |
---
## 🔍 LLM分析能力验证
### 直接测试: LLM如何理解误导性代码
**测试代码**: `db/pool.py` (声称是"Email sending service")
**Gemini分析结果**:
```
Summary: This Python module defines an `EmailSender` class that manages
a PostgreSQL connection pool for an email sending service, using
`psycopg2` for database interactions. It provides a context manager
`send_email` to handle connection acquisition, transaction commitment,
and release back to the pool.
Purpose: data
Keywords: psycopg2, connection pool, PostgreSQL, database, email sender,
context manager, python, database connection, transaction
```
**分析得分**:
-**正确识别的术语** (5/5): PostgreSQL, connection pool, database, psycopg2, database connection
- ⚠️ **误导性术语** (2/3): email sender, email sending service (但上下文正确)
**结论**: LLM正确识别了实际功能PostgreSQL connection pool虽然摘要开头提到了错误的module docstring但核心描述准确。
---
## 💡 关键发现
### 1. 为什么纯向量搜索也能工作?
**原因**: 代码中的技术关键词权重高于注释
```python
# 这些强信号即使有错误注释也能正确匹配
import bcrypt # 强信号: 密码哈希
import jwt # 强信号: JWT令牌
import psycopg2 # 强信号: PostgreSQL
from flask import Flask, request # 强信号: REST API
pattern = r'^[a-zA-Z0-9._%+-]+@' # 强信号: 邮箱验证
```
**嵌入模型的优势**:
- 代码标识符bcrypt, jwt, psycopg2具有高度特异性
- import语句权重高
- 正则表达式模式具有语义信息
- 框架API调用Flask路由提供明确上下文
### 2. LLM增强的价值
**LLM分析过程**:
1. ✅ 读取代码逻辑(不仅仅是注释)
2. ✅ 识别import语句和实际使用
3. ✅ 理解代码流程和数据流
4. ✅ 生成基于行为的摘要
5. ⚠️ 部分参考错误注释(但不完全依赖)
**示例对比**:
| 方面 | 纯向量 | LLM增强 |
|------|--------|---------|
| **处理内容** | 代码 + 注释 (整体嵌入) | 代码分析 → 生成摘要 |
| **误导性注释影响** | 低 (代码关键词权重高) | 极低 (理解代码逻辑) |
| **自然语言查询** | 依赖代码词汇匹配 | 理解语义意图 |
| **处理速度** | 快 (2秒) | 慢 (104秒, 52倍差) |
### 3. 测试数据集的局限性
**为什么两种方法都表现完美**:
1. **文件数量太少** (5个文件)
- 没有相似功能的文件竞争
- 每个查询有唯一的目标文件
2. **代码关键词太强**
- bcrypt → 唯一用于密码
- jwt → 唯一用于令牌
- Flask+@app.route → 唯一的API
- psycopg2 → 唯一的数据库
3. **查询过于具体**
- "bcrypt password hashing" 直接匹配代码关键词
- 不是概念性或模糊查询
**理想的测试场景**:
- ❌ 5个唯一功能文件
- ✅ 100+文件,多个相似功能模块
- ✅ 模糊概念查询: "用户认证"而不是"bcrypt hash"
- ✅ 没有明显关键词的业务逻辑代码
---
## 🎯 实际应用建议
### 何时使用纯向量搜索
**推荐场景**:
- 代码库有良好文档
- 搜索代码模式和API使用
- 已知技术栈关键词
- 需要快速索引
**示例查询**:
- "bcrypt.hashpw usage"
- "Flask @app.route GET method"
- "jwt.encode algorithm"
### 何时使用LLM增强搜索
**推荐场景**:
- 代码库文档缺失或过时
- 自然语言概念性查询
- 业务逻辑搜索
- 重视搜索准确性 > 索引速度
**示例查询**:
- "How to authenticate users?" (概念性)
- "Payment processing workflow" (业务逻辑)
- "Error handling for API requests" (模式搜索)
### 混合策略 (推荐)
| 模块类型 | 索引方式 | 原因 |
|---------|---------|------|
| **核心业务逻辑** | LLM增强 | 复杂逻辑,文档可能不完整 |
| **工具函数** | 纯向量 | 代码清晰,关键词明确 |
| **第三方集成** | 纯向量 | API调用已是最好描述 |
| **遗留代码** | LLM增强 | 文档陈旧或缺失 |
---
## 📈 性能与成本
### 时间成本
| 操作 | 纯向量 | LLM增强 | 差异 |
|------|--------|---------|------|
| **索引5文件** | 2.1秒 | 103.7秒 | 49倍慢 |
| **索引100文件** | ~42秒 | ~35分钟 | ~50倍慢 |
| **查询速度** | ~50ms | ~50ms | 相同 |
### 金钱成本 (Gemini Flash)
- **价格**: $0.10 / 1M input tokens
- **平均**: ~500 tokens / 文件
- **100文件**: $0.005 (半分钱)
- **1000文件**: $0.05 (5分钱)
**结论**: 金钱成本可忽略,时间成本是主要考虑因素
---
## 🧪 测试工具
### 创建的脚本
1. **`scripts/test_misleading_comments.py`**
- 完整对比测试
- 支持 `--tool gemini|qwen`
- 支持 `--keep-db` 保存结果数据库
2. **`scripts/show_llm_analysis.py`**
- 直接显示LLM对单个文件的分析
- 评估LLM是否被误导
- 计算正确/误导术语比例
3. **`scripts/inspect_llm_summaries.py`**
- 检查数据库中的LLM摘要
- 查看metadata和keywords
### 运行测试
```bash
# 完整对比测试
python scripts/test_misleading_comments.py --tool gemini
# 保存数据库用于检查
python scripts/test_misleading_comments.py --keep-db ./results.db
# 查看LLM对单个文件的分析
python scripts/show_llm_analysis.py
# 检查数据库中的摘要
python scripts/inspect_llm_summaries.py results.db
```
---
## 📝 结论
### 测试结论
1.**LLM能够克服误导性注释**
- 正确识别实际代码功能
- 生成基于行为的准确摘要
- 不完全依赖文档字符串
2.**纯向量搜索也具有抗干扰能力**
- 代码关键词提供强信号
- 技术栈名称具有高特异性
- import语句和API调用信息丰富
3. ⚠️ **当前测试数据集太简单**
- 需要更大规模测试 (100+文件)
- 需要概念性查询测试
- 需要相似功能模块对比
### 生产使用建议
**最佳实践**: 根据代码库特征选择策略
| 代码库特征 | 推荐方案 | 理由 |
|-----------|---------|------|
| 良好文档,清晰命名 | 纯向量 | 快速,成本低 |
| 文档缺失/陈旧 | LLM增强 | 理解代码逻辑 |
| 遗留系统 | LLM增强 | 克服历史包袱 |
| 新项目 | 纯向量 | 现代代码通常更清晰 |
| 大型企业代码库 | 混合 | 分模块策略 |
---
**测试完成时间**: 2025-12-16
**测试工具**: Gemini Flash 2.5, fastembed (BAAI/bge-small-en-v1.5)
**文档版本**: 1.0

View File

@@ -1,465 +0,0 @@
#!/usr/bin/env python3
"""Standalone script to compare pure vector vs LLM-enhanced semantic search.
Usage:
python compare_search_methods.py [--tool gemini|qwen] [--skip-llm]
This script:
1. Creates a test dataset with sample code
2. Tests pure vector search (code → fastembed → search)
3. Tests LLM-enhanced search (code → LLM summary → fastembed → search)
4. Compares results across natural language queries
"""
import argparse
import sqlite3
import sys
import tempfile
import time
from pathlib import Path
from typing import Dict, List, Tuple
# Check dependencies
try:
from codexlens.semantic import SEMANTIC_AVAILABLE
from codexlens.semantic.embedder import Embedder
from codexlens.semantic.vector_store import VectorStore
from codexlens.semantic.chunker import Chunker, ChunkConfig
from codexlens.semantic.llm_enhancer import (
LLMEnhancer,
LLMConfig,
FileData,
EnhancedSemanticIndexer,
)
from codexlens.storage.dir_index import DirIndexStore
from codexlens.search.hybrid_search import HybridSearchEngine
except ImportError as e:
print(f"Error: Missing dependencies - {e}")
print("Install with: pip install codexlens[semantic]")
sys.exit(1)
if not SEMANTIC_AVAILABLE:
print("Error: Semantic search dependencies not available")
print("Install with: pip install codexlens[semantic]")
sys.exit(1)
# Test dataset with realistic code samples
TEST_DATASET = {
"auth/password_hasher.py": '''"""Password hashing utilities using bcrypt."""
import bcrypt
def hash_password(password: str, salt_rounds: int = 12) -> str:
"""Hash a password using bcrypt with specified salt rounds."""
salt = bcrypt.gensalt(rounds=salt_rounds)
hashed = bcrypt.hashpw(password.encode('utf-8'), salt)
return hashed.decode('utf-8')
def verify_password(password: str, hashed: str) -> bool:
"""Verify a password against its hash."""
return bcrypt.checkpw(password.encode('utf-8'), hashed.encode('utf-8'))
''',
"auth/jwt_handler.py": '''"""JWT token generation and validation."""
import jwt
from datetime import datetime, timedelta
SECRET_KEY = "your-secret-key"
def create_token(user_id: int, expires_in: int = 3600) -> str:
"""Generate a JWT access token for user authentication."""
payload = {
'user_id': user_id,
'exp': datetime.utcnow() + timedelta(seconds=expires_in),
'iat': datetime.utcnow()
}
return jwt.encode(payload, SECRET_KEY, algorithm='HS256')
def decode_token(token: str) -> dict:
"""Validate and decode JWT token."""
try:
return jwt.decode(token, SECRET_KEY, algorithms=['HS256'])
except jwt.ExpiredSignatureError:
return None
''',
"api/user_endpoints.py": '''"""REST API endpoints for user management."""
from flask import Flask, request, jsonify
app = Flask(__name__)
@app.route('/api/users', methods=['POST'])
def create_user():
"""Create a new user account with email and password."""
data = request.get_json()
if not data.get('email') or not data.get('password'):
return jsonify({'error': 'Email and password required'}), 400
user_id = 12345 # Database insert
return jsonify({'user_id': user_id, 'success': True}), 201
@app.route('/api/users/<int:user_id>', methods=['GET'])
def get_user(user_id: int):
"""Retrieve user profile information by user ID."""
user = {
'id': user_id,
'email': 'user@example.com',
'name': 'John Doe'
}
return jsonify(user), 200
''',
"utils/validation.py": '''"""Input validation utilities."""
import re
def validate_email(email: str) -> bool:
"""Check if email address format is valid using regex."""
pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
return bool(re.match(pattern, email))
def sanitize_input(text: str, max_length: int = 255) -> str:
"""Clean user input by removing special characters."""
text = re.sub(r'[<>\"\'&]', '', text)
return text.strip()[:max_length]
def validate_password_strength(password: str) -> tuple:
"""Validate password meets security requirements."""
if len(password) < 8:
return False, "Password must be at least 8 characters"
if not re.search(r'[A-Z]', password):
return False, "Must contain uppercase letter"
return True, None
''',
"database/connection.py": '''"""Database connection pooling."""
import psycopg2
from psycopg2 import pool
from contextlib import contextmanager
class DatabasePool:
"""PostgreSQL connection pool manager."""
def __init__(self, min_conn: int = 1, max_conn: int = 10):
"""Initialize database connection pool."""
self.pool = psycopg2.pool.SimpleConnectionPool(
min_conn, max_conn,
user='dbuser', host='localhost', database='myapp'
)
@contextmanager
def get_connection(self):
"""Get a connection from pool as context manager."""
conn = self.pool.getconn()
try:
yield conn
conn.commit()
finally:
self.pool.putconn(conn)
''',
}
# Natural language test queries
TEST_QUERIES = [
("How do I securely hash passwords?", "auth/password_hasher.py"),
("Generate JWT token for authentication", "auth/jwt_handler.py"),
("Create new user account via API", "api/user_endpoints.py"),
("Validate email address format", "utils/validation.py"),
("Connect to PostgreSQL database", "database/connection.py"),
]
def create_test_database(db_path: Path) -> None:
"""Create and populate test database."""
store = DirIndexStore(db_path)
store.initialize()
with store._get_connection() as conn:
for path, content in TEST_DATASET.items():
name = path.split('/')[-1]
conn.execute(
"""INSERT INTO files (name, full_path, content, language, mtime)
VALUES (?, ?, ?, ?, ?)""",
(name, path, content, "python", 0.0)
)
conn.commit()
store.close()
def test_pure_vector_search(db_path: Path) -> Dict:
"""Test pure vector search (raw code embeddings)."""
print("\n" + "="*70)
print("PURE VECTOR SEARCH (Code → fastembed)")
print("="*70)
start_time = time.time()
# Generate pure vector embeddings
embedder = Embedder(profile="code")
vector_store = VectorStore(db_path)
chunker = Chunker(config=ChunkConfig(max_chunk_size=2000))
with sqlite3.connect(db_path) as conn:
conn.row_factory = sqlite3.Row
rows = conn.execute("SELECT full_path, content FROM files").fetchall()
chunk_count = 0
for row in rows:
chunks = chunker.chunk_sliding_window(
row["content"],
file_path=row["full_path"],
language="python"
)
for chunk in chunks:
chunk.embedding = embedder.embed_single(chunk.content)
chunk.metadata["strategy"] = "pure_vector"
if chunks:
vector_store.add_chunks(chunks, row["full_path"])
chunk_count += len(chunks)
setup_time = time.time() - start_time
print(f"Setup: {len(rows)} files, {chunk_count} chunks in {setup_time:.1f}s")
# Test queries
engine = HybridSearchEngine()
results = {}
print(f"\n{'Query':<45} {'Top Result':<30} {'Score':<8}")
print("-" * 70)
for query, expected_file in TEST_QUERIES:
search_results = engine.search(
db_path,
query,
limit=3,
enable_vector=True,
pure_vector=True,
)
top_file = search_results[0].path if search_results else "No results"
top_score = search_results[0].score if search_results else 0.0
found = expected_file in [r.path for r in search_results]
rank = None
if found:
for i, r in enumerate(search_results):
if r.path == expected_file:
rank = i + 1
break
status = "[OK]" if found and rank == 1 else ("[~]" if found else "[X]")
display_query = query[:42] + "..." if len(query) > 45 else query
display_file = top_file.split('/')[-1] if '/' in top_file else top_file
print(f"{status} {display_query:<43} {display_file:<30} {top_score:.3f}")
results[query] = {
"found": found,
"rank": rank,
"top_file": top_file,
"score": top_score,
}
return results
def test_llm_enhanced_search(db_path: Path, llm_tool: str = "gemini") -> Dict:
"""Test LLM-enhanced search (LLM summaries → fastembed)."""
print("\n" + "="*70)
print(f"LLM-ENHANCED SEARCH (Code → {llm_tool.upper()} → fastembed)")
print("="*70)
# Check CCW availability
llm_config = LLMConfig(enabled=True, tool=llm_tool, batch_size=2)
enhancer = LLMEnhancer(llm_config)
if not enhancer.check_available():
print("[X] CCW CLI not available - skipping LLM-enhanced test")
print(" Install CCW: npm install -g ccw")
return {}
start_time = time.time()
# Generate LLM-enhanced embeddings
embedder = Embedder(profile="code")
vector_store = VectorStore(db_path)
indexer = EnhancedSemanticIndexer(enhancer, embedder, vector_store)
# Prepare file data
file_data_list = [
FileData(path=path, content=content, language="python")
for path, content in TEST_DATASET.items()
]
# Index with LLM enhancement
print(f"Generating LLM summaries for {len(file_data_list)} files...")
indexed = indexer.index_files(file_data_list)
setup_time = time.time() - start_time
print(f"Setup: {indexed}/{len(file_data_list)} files indexed in {setup_time:.1f}s")
# Test queries
engine = HybridSearchEngine()
results = {}
print(f"\n{'Query':<45} {'Top Result':<30} {'Score':<8}")
print("-" * 70)
for query, expected_file in TEST_QUERIES:
search_results = engine.search(
db_path,
query,
limit=3,
enable_vector=True,
pure_vector=True,
)
top_file = search_results[0].path if search_results else "No results"
top_score = search_results[0].score if search_results else 0.0
found = expected_file in [r.path for r in search_results]
rank = None
if found:
for i, r in enumerate(search_results):
if r.path == expected_file:
rank = i + 1
break
status = "[OK]" if found and rank == 1 else ("[~]" if found else "[X]")
display_query = query[:42] + "..." if len(query) > 45 else query
display_file = top_file.split('/')[-1] if '/' in top_file else top_file
print(f"{status} {display_query:<43} {display_file:<30} {top_score:.3f}")
results[query] = {
"found": found,
"rank": rank,
"top_file": top_file,
"score": top_score,
}
return results
def compare_results(pure_results: Dict, llm_results: Dict) -> None:
"""Compare and analyze results from both approaches."""
print("\n" + "="*70)
print("COMPARISON SUMMARY")
print("="*70)
if not llm_results:
print("Cannot compare - LLM-enhanced test was skipped")
return
pure_score = 0
llm_score = 0
print(f"\n{'Query':<45} {'Pure':<10} {'LLM':<10}")
print("-" * 70)
for query, expected_file in TEST_QUERIES:
pure_res = pure_results.get(query, {})
llm_res = llm_results.get(query, {})
pure_status = f"[OK] Rank {pure_res.get('rank', '?')}" if pure_res.get('found') else "[X] Miss"
llm_status = f"[OK] Rank {llm_res.get('rank', '?')}" if llm_res.get('found') else "[X] Miss"
# Scoring: Rank 1 = 3 points, Rank 2 = 2 points, Rank 3 = 1 point
if pure_res.get('found') and pure_res.get('rank'):
pure_score += max(0, 4 - pure_res['rank'])
if llm_res.get('found') and llm_res.get('rank'):
llm_score += max(0, 4 - llm_res['rank'])
display_query = query[:42] + "..." if len(query) > 45 else query
print(f"{display_query:<45} {pure_status:<10} {llm_status:<10}")
print("-" * 70)
print(f"{'TOTAL SCORE':<45} {pure_score:<10} {llm_score:<10}")
print("="*70)
# Analysis
print("\nANALYSIS:")
if llm_score > pure_score:
improvement = ((llm_score - pure_score) / max(pure_score, 1)) * 100
print(f"[OK] LLM enhancement improves results by {improvement:.1f}%")
print(" Natural language summaries match queries better than raw code")
elif pure_score > llm_score:
degradation = ((pure_score - llm_score) / max(pure_score, 1)) * 100
print(f"[X] Pure vector performed {degradation:.1f}% better")
print(" LLM summaries may be too generic or missing key details")
else:
print("= Both approaches performed equally on this test set")
print("\nKEY FINDINGS:")
print("- Pure Vector: Direct code embeddings, fast but may miss semantic intent")
print("- LLM Enhanced: Natural language summaries, better for human-like queries")
print("- Best Use: Combine both - LLM for natural language, vector for code patterns")
def main():
parser = argparse.ArgumentParser(
description="Compare pure vector vs LLM-enhanced semantic search"
)
parser.add_argument(
"--tool",
choices=["gemini", "qwen"],
default="gemini",
help="LLM tool to use for enhancement (default: gemini)"
)
parser.add_argument(
"--skip-llm",
action="store_true",
help="Skip LLM-enhanced test (only run pure vector)"
)
args = parser.parse_args()
print("\n" + "="*70)
print("SEMANTIC SEARCH COMPARISON TEST")
print("Pure Vector vs LLM-Enhanced Vector Search")
print("="*70)
# Create test database
with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f:
db_path = Path(f.name)
try:
print(f"\nTest dataset: {len(TEST_DATASET)} Python files")
print(f"Test queries: {len(TEST_QUERIES)} natural language questions")
create_test_database(db_path)
# Test pure vector search
pure_results = test_pure_vector_search(db_path)
# Test LLM-enhanced search
if not args.skip_llm:
# Clear semantic_chunks table for LLM test
with sqlite3.connect(db_path) as conn:
conn.execute("DELETE FROM semantic_chunks")
conn.commit()
llm_results = test_llm_enhanced_search(db_path, args.tool)
else:
llm_results = {}
print("\n[X] LLM-enhanced test skipped (--skip-llm flag)")
# Compare results
compare_results(pure_results, llm_results)
finally:
# Cleanup - ensure all connections are closed
try:
import gc
gc.collect() # Force garbage collection to close any lingering connections
time.sleep(0.1) # Small delay for Windows to release file handle
if db_path.exists():
db_path.unlink()
except PermissionError:
print(f"\nWarning: Could not delete temporary database: {db_path}")
print("It will be cleaned up on next system restart.")
print("\n" + "="*70)
print("Test completed successfully!")
print("="*70)
if __name__ == "__main__":
main()

View File

@@ -1,88 +0,0 @@
#!/usr/bin/env python3
"""Inspect LLM-generated summaries in semantic_chunks table."""
import sqlite3
import sys
from pathlib import Path
def inspect_summaries(db_path: Path):
"""Show LLM-generated summaries from database."""
if not db_path.exists():
print(f"Error: Database not found: {db_path}")
return
with sqlite3.connect(db_path) as conn:
conn.row_factory = sqlite3.Row
# Check if semantic_chunks table exists
cursor = conn.execute(
"SELECT name FROM sqlite_master WHERE type='table' AND name='semantic_chunks'"
)
if not cursor.fetchone():
print("No semantic_chunks table found")
return
# Get all chunks with metadata
cursor = conn.execute("""
SELECT file_path, chunk_index, content,
json_extract(metadata, '$.llm_summary') as summary,
json_extract(metadata, '$.llm_keywords') as keywords,
json_extract(metadata, '$.llm_purpose') as purpose,
json_extract(metadata, '$.strategy') as strategy
FROM semantic_chunks
ORDER BY file_path, chunk_index
""")
chunks = cursor.fetchall()
if not chunks:
print("No chunks found in database")
return
print("="*80)
print("LLM-GENERATED SUMMARIES INSPECTION")
print("="*80)
current_file = None
for chunk in chunks:
file_path = chunk['file_path']
if file_path != current_file:
print(f"\n{'='*80}")
print(f"FILE: {file_path}")
print(f"{'='*80}")
current_file = file_path
print(f"\n[Chunk {chunk['chunk_index']}]")
print(f"Strategy: {chunk['strategy']}")
if chunk['summary']:
print(f"\nLLM Summary:")
print(f" {chunk['summary']}")
if chunk['keywords']:
print(f"\nKeywords:")
print(f" {chunk['keywords']}")
if chunk['purpose']:
print(f"\nPurpose:")
print(f" {chunk['purpose']}")
# Show first 200 chars of content
content = chunk['content']
if len(content) > 200:
content = content[:200] + "..."
print(f"\nOriginal Content (first 200 chars):")
print(f" {content}")
print("-" * 80)
if __name__ == "__main__":
if len(sys.argv) < 2:
print("Usage: python inspect_llm_summaries.py <path_to_index.db>")
print("\nExample:")
print(" python inspect_llm_summaries.py ~/.codexlens/indexes/myproject/_index.db")
sys.exit(1)
db_path = Path(sys.argv[1])
inspect_summaries(db_path)

View File

@@ -1,112 +0,0 @@
#!/usr/bin/env python3
"""Directly show LLM analysis of test code."""
from codexlens.semantic.llm_enhancer import LLMEnhancer, LLMConfig, FileData
# Misleading code example
TEST_CODE = '''"""Email sending service."""
import psycopg2
from psycopg2 import pool
from contextlib import contextmanager
class EmailSender:
"""SMTP email sender with retry logic."""
def __init__(self, min_conn: int = 1, max_conn: int = 10):
"""Initialize email sender."""
self.pool = psycopg2.pool.SimpleConnectionPool(
min_conn, max_conn,
user='dbuser', host='localhost', database='myapp'
)
@contextmanager
def send_email(self):
"""Send email message."""
conn = self.pool.getconn()
try:
yield conn
conn.commit()
finally:
self.pool.putconn(conn)
'''
print("="*80)
print("LLM ANALYSIS OF MISLEADING CODE")
print("="*80)
print("\n[Original Code with Misleading Comments]")
print("-"*80)
print(TEST_CODE)
print("-"*80)
print("\n[Actual Functionality]")
print(" - Imports: psycopg2 (PostgreSQL library)")
print(" - Class: EmailSender (but name is misleading!)")
print(" - Actually: Creates PostgreSQL connection pool")
print(" - Methods: send_email (actually gets DB connection)")
print("\n[Misleading Documentation]")
print(" - Module docstring: 'Email sending service' (WRONG)")
print(" - Class docstring: 'SMTP email sender' (WRONG)")
print(" - Method docstring: 'Send email message' (WRONG)")
print("\n" + "="*80)
print("TESTING LLM UNDERSTANDING")
print("="*80)
# Test LLM analysis
config = LLMConfig(enabled=True, tool="gemini", batch_size=1)
enhancer = LLMEnhancer(config)
if not enhancer.check_available():
print("\n[X] CCW CLI not available")
print("Install: npm install -g ccw")
exit(1)
print("\n[Calling Gemini to analyze code...]")
file_data = FileData(path="db/pool.py", content=TEST_CODE, language="python")
import tempfile
from pathlib import Path
with tempfile.TemporaryDirectory() as tmpdir:
result = enhancer.enhance_files([file_data], Path(tmpdir))
if "db/pool.py" in result:
metadata = result["db/pool.py"]
print("\n[LLM-Generated Summary]")
print("-"*80)
print(f"Summary: {metadata.summary}")
print(f"\nPurpose: {metadata.purpose}")
print(f"\nKeywords: {', '.join(metadata.keywords)}")
print("-"*80)
print("\n[Analysis]")
# Check if LLM identified the real functionality
summary_lower = metadata.summary.lower()
keywords_lower = [k.lower() for k in metadata.keywords]
correct_terms = ['database', 'postgresql', 'connection', 'pool', 'psycopg']
misleading_terms = ['email', 'smtp', 'send']
found_correct = sum(1 for term in correct_terms
if term in summary_lower or any(term in k for k in keywords_lower))
found_misleading = sum(1 for term in misleading_terms
if term in summary_lower or any(term in k for k in keywords_lower))
print(f"Correct terms found: {found_correct}/{len(correct_terms)}")
print(f"Misleading terms found: {found_misleading}/{len(misleading_terms)}")
if found_correct > found_misleading:
print("\n[OK] LLM correctly identified actual functionality!")
print(" LLM ignored misleading comments and analyzed code behavior")
elif found_misleading > found_correct:
print("\n[X] LLM was misled by incorrect comments")
print(" LLM trusted documentation over code analysis")
else:
print("\n[~] Mixed results - LLM found both correct and misleading terms")
else:
print("\n[X] LLM analysis failed - no results returned")
print("\n" + "="*80)

View File

@@ -1,491 +0,0 @@
#!/usr/bin/env python3
"""Test pure vector vs LLM-enhanced search with misleading/missing comments.
This test demonstrates how LLM enhancement can overcome:
1. Missing comments/docstrings
2. Misleading or incorrect comments
3. Outdated documentation
Usage:
python test_misleading_comments.py --tool gemini
"""
import argparse
import sqlite3
import sys
import tempfile
import time
from pathlib import Path
from typing import Dict, List
# Check dependencies
try:
from codexlens.semantic import SEMANTIC_AVAILABLE
from codexlens.semantic.embedder import Embedder
from codexlens.semantic.vector_store import VectorStore
from codexlens.semantic.chunker import Chunker, ChunkConfig
from codexlens.semantic.llm_enhancer import (
LLMEnhancer,
LLMConfig,
FileData,
EnhancedSemanticIndexer,
)
from codexlens.storage.dir_index import DirIndexStore
from codexlens.search.hybrid_search import HybridSearchEngine
except ImportError as e:
print(f"Error: Missing dependencies - {e}")
print("Install with: pip install codexlens[semantic]")
sys.exit(1)
if not SEMANTIC_AVAILABLE:
print("Error: Semantic search dependencies not available")
sys.exit(1)
# Test dataset with MISLEADING or MISSING comments
MISLEADING_DATASET = {
"crypto/hasher.py": '''"""Simple string utilities."""
import bcrypt
def process_string(s: str, rounds: int = 12) -> str:
"""Convert string to uppercase."""
salt = bcrypt.gensalt(rounds=rounds)
hashed = bcrypt.hashpw(s.encode('utf-8'), salt)
return hashed.decode('utf-8')
def check_string(s: str, target: str) -> bool:
"""Check if two strings are equal."""
return bcrypt.checkpw(s.encode('utf-8'), target.encode('utf-8'))
''',
"auth/token.py": '''import jwt
from datetime import datetime, timedelta
SECRET_KEY = "key123"
def make_thing(uid: int, exp: int = 3600) -> str:
payload = {
'user_id': uid,
'exp': datetime.utcnow() + timedelta(seconds=exp),
'iat': datetime.utcnow()
}
return jwt.encode(payload, SECRET_KEY, algorithm='HS256')
def parse_thing(thing: str) -> dict:
try:
return jwt.decode(thing, SECRET_KEY, algorithms=['HS256'])
except jwt.ExpiredSignatureError:
return None
''',
"api/handlers.py": '''"""Database connection utilities."""
from flask import Flask, request, jsonify
app = Flask(__name__)
@app.route('/api/items', methods=['POST'])
def create_item():
"""Delete an existing item."""
data = request.get_json()
if not data.get('email') or not data.get('password'):
return jsonify({'error': 'Missing data'}), 400
item_id = 12345
return jsonify({'item_id': item_id, 'success': True}), 201
@app.route('/api/items/<int:item_id>', methods=['GET'])
def get_item(item_id: int):
"""Update item configuration."""
item = {
'id': item_id,
'email': 'user@example.com',
'name': 'John Doe'
}
return jsonify(item), 200
''',
"utils/checker.py": '''"""Math calculation functions."""
import re
def calc_sum(email: str) -> bool:
pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$'
return bool(re.match(pattern, email))
def format_text(text: str, max_len: int = 255) -> str:
text = re.sub(r'[<>"\\'&]', '', text)
return text.strip()[:max_len]
''',
"db/pool.py": '''"""Email sending service."""
import psycopg2
from psycopg2 import pool
from contextlib import contextmanager
class EmailSender:
"""SMTP email sender with retry logic."""
def __init__(self, min_conn: int = 1, max_conn: int = 10):
"""Initialize email sender."""
self.pool = psycopg2.pool.SimpleConnectionPool(
min_conn, max_conn,
user='dbuser', host='localhost', database='myapp'
)
@contextmanager
def send_email(self):
"""Send email message."""
conn = self.pool.getconn()
try:
yield conn
conn.commit()
finally:
self.pool.putconn(conn)
''',
}
# Test queries - natural language based on ACTUAL functionality (not misleading comments)
TEST_QUERIES = [
("How to hash passwords securely with bcrypt?", "crypto/hasher.py"),
("Generate JWT authentication token", "auth/token.py"),
("Create user account REST API endpoint", "api/handlers.py"),
("Validate email address format", "utils/checker.py"),
("PostgreSQL database connection pool", "db/pool.py"),
]
def create_test_database(db_path: Path) -> None:
"""Create and populate test database."""
store = DirIndexStore(db_path)
store.initialize()
with store._get_connection() as conn:
for path, content in MISLEADING_DATASET.items():
name = path.split('/')[-1]
conn.execute(
"""INSERT INTO files (name, full_path, content, language, mtime)
VALUES (?, ?, ?, ?, ?)""",
(name, path, content, "python", 0.0)
)
conn.commit()
store.close()
def test_pure_vector_search(db_path: Path) -> Dict:
"""Test pure vector search (relies on code + misleading comments)."""
print("\n" + "="*70)
print("PURE VECTOR SEARCH (Code + Misleading Comments -> fastembed)")
print("="*70)
start_time = time.time()
# Generate pure vector embeddings
embedder = Embedder(profile="code")
vector_store = VectorStore(db_path)
chunker = Chunker(config=ChunkConfig(max_chunk_size=2000))
with sqlite3.connect(db_path) as conn:
conn.row_factory = sqlite3.Row
rows = conn.execute("SELECT full_path, content FROM files").fetchall()
chunk_count = 0
for row in rows:
chunks = chunker.chunk_sliding_window(
row["content"],
file_path=row["full_path"],
language="python"
)
for chunk in chunks:
chunk.embedding = embedder.embed_single(chunk.content)
chunk.metadata["strategy"] = "pure_vector"
if chunks:
vector_store.add_chunks(chunks, row["full_path"])
chunk_count += len(chunks)
setup_time = time.time() - start_time
print(f"Setup: {len(rows)} files, {chunk_count} chunks in {setup_time:.1f}s")
print("Note: Embeddings include misleading comments")
# Test queries
engine = HybridSearchEngine()
results = {}
print(f"\n{'Query':<45} {'Top Result':<30} {'Score':<8}")
print("-" * 70)
for query, expected_file in TEST_QUERIES:
search_results = engine.search(
db_path,
query,
limit=3,
enable_vector=True,
pure_vector=True,
)
top_file = search_results[0].path if search_results else "No results"
top_score = search_results[0].score if search_results else 0.0
found = expected_file in [r.path for r in search_results]
rank = None
if found:
for i, r in enumerate(search_results):
if r.path == expected_file:
rank = i + 1
break
status = "[OK]" if found and rank == 1 else ("[~]" if found else "[X]")
display_query = query[:42] + "..." if len(query) > 45 else query
display_file = top_file.split('/')[-1] if '/' in top_file else top_file
print(f"{status} {display_query:<43} {display_file:<30} {top_score:.3f}")
results[query] = {
"found": found,
"rank": rank,
"top_file": top_file,
"score": top_score,
}
return results
def test_llm_enhanced_search(db_path: Path, llm_tool: str = "gemini") -> Dict:
"""Test LLM-enhanced search (LLM reads code and generates accurate summary)."""
print("\n" + "="*70)
print(f"LLM-ENHANCED SEARCH (Code -> {llm_tool.upper()} Analysis -> fastembed)")
print("="*70)
# Check CCW availability
llm_config = LLMConfig(enabled=True, tool=llm_tool, batch_size=2)
enhancer = LLMEnhancer(llm_config)
if not enhancer.check_available():
print("[X] CCW CLI not available - skipping LLM-enhanced test")
print(" Install CCW: npm install -g ccw")
return {}
start_time = time.time()
# Generate LLM-enhanced embeddings
embedder = Embedder(profile="code")
vector_store = VectorStore(db_path)
indexer = EnhancedSemanticIndexer(enhancer, embedder, vector_store)
# Prepare file data
file_data_list = [
FileData(path=path, content=content, language="python")
for path, content in MISLEADING_DATASET.items()
]
# Index with LLM enhancement
print(f"LLM analyzing code (ignoring misleading comments)...")
indexed = indexer.index_files(file_data_list)
setup_time = time.time() - start_time
print(f"Setup: {indexed}/{len(file_data_list)} files indexed in {setup_time:.1f}s")
print("Note: LLM generates summaries based on actual code logic")
# Test queries
engine = HybridSearchEngine()
results = {}
print(f"\n{'Query':<45} {'Top Result':<30} {'Score':<8}")
print("-" * 70)
for query, expected_file in TEST_QUERIES:
search_results = engine.search(
db_path,
query,
limit=3,
enable_vector=True,
pure_vector=True,
)
top_file = search_results[0].path if search_results else "No results"
top_score = search_results[0].score if search_results else 0.0
found = expected_file in [r.path for r in search_results]
rank = None
if found:
for i, r in enumerate(search_results):
if r.path == expected_file:
rank = i + 1
break
status = "[OK]" if found and rank == 1 else ("[~]" if found else "[X]")
display_query = query[:42] + "..." if len(query) > 45 else query
display_file = top_file.split('/')[-1] if '/' in top_file else top_file
print(f"{status} {display_query:<43} {display_file:<30} {top_score:.3f}")
results[query] = {
"found": found,
"rank": rank,
"top_file": top_file,
"score": top_score,
}
return results
def compare_results(pure_results: Dict, llm_results: Dict) -> None:
"""Compare and analyze results from both approaches."""
print("\n" + "="*70)
print("COMPARISON SUMMARY - MISLEADING COMMENTS TEST")
print("="*70)
if not llm_results:
print("Cannot compare - LLM-enhanced test was skipped")
return
pure_score = 0
llm_score = 0
print(f"\n{'Query':<45} {'Pure':<10} {'LLM':<10}")
print("-" * 70)
for query, expected_file in TEST_QUERIES:
pure_res = pure_results.get(query, {})
llm_res = llm_results.get(query, {})
pure_status = f"[OK] Rank {pure_res.get('rank', '?')}" if pure_res.get('found') else "[X] Miss"
llm_status = f"[OK] Rank {llm_res.get('rank', '?')}" if llm_res.get('found') else "[X] Miss"
# Scoring: Rank 1 = 3 points, Rank 2 = 2 points, Rank 3 = 1 point
if pure_res.get('found') and pure_res.get('rank'):
pure_score += max(0, 4 - pure_res['rank'])
if llm_res.get('found') and llm_res.get('rank'):
llm_score += max(0, 4 - llm_res['rank'])
display_query = query[:42] + "..." if len(query) > 45 else query
print(f"{display_query:<45} {pure_status:<10} {llm_status:<10}")
print("-" * 70)
print(f"{'TOTAL SCORE':<45} {pure_score:<10} {llm_score:<10}")
print("="*70)
# Analysis
print("\nANALYSIS:")
if llm_score > pure_score:
improvement = ((llm_score - pure_score) / max(pure_score, 1)) * 100
print(f"[OK] LLM enhancement improves results by {improvement:.1f}%")
print(" LLM understands actual code logic despite misleading comments")
print(" Pure vector search misled by incorrect documentation")
elif pure_score > llm_score:
degradation = ((pure_score - llm_score) / max(pure_score, 1)) * 100
print(f"[X] Pure vector performed {degradation:.1f}% better")
print(" Unexpected: Pure vector wasn't affected by misleading comments")
else:
print("= Both approaches performed equally")
print(" Test dataset may still be too simple to show differences")
print("\nKEY INSIGHTS:")
print("- Pure Vector: Embeds code + comments together, can be misled")
print("- LLM Enhanced: Analyzes actual code behavior, ignores bad comments")
print("- Best Use: LLM enhancement crucial for poorly documented codebases")
print("\nMISLEADING COMMENTS IN TEST:")
print("1. 'hasher.py' claims 'string utilities' but does bcrypt hashing")
print("2. 'token.py' has no docstrings, unclear function names")
print("3. 'handlers.py' says 'database utilities' but is REST API")
print("4. 'handlers.py' docstrings opposite (create says delete, etc)")
print("5. 'checker.py' claims 'math functions' but validates emails")
print("6. 'pool.py' claims 'email sender' but is database pool")
def main():
parser = argparse.ArgumentParser(
description="Test pure vector vs LLM-enhanced with misleading comments"
)
parser.add_argument(
"--tool",
choices=["gemini", "qwen"],
default="gemini",
help="LLM tool to use (default: gemini)"
)
parser.add_argument(
"--skip-llm",
action="store_true",
help="Skip LLM-enhanced test"
)
parser.add_argument(
"--keep-db",
type=str,
help="Save database to specified path for inspection (e.g., ./test_results.db)"
)
args = parser.parse_args()
print("\n" + "="*70)
print("MISLEADING COMMENTS TEST")
print("Pure Vector vs LLM-Enhanced with Incorrect Documentation")
print("="*70)
# Create test database
with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f:
db_path = Path(f.name)
try:
print(f"\nTest dataset: {len(MISLEADING_DATASET)} Python files")
print(f"Test queries: {len(TEST_QUERIES)} natural language questions")
print("\nChallenges:")
print("- Misleading module docstrings")
print("- Incorrect function docstrings")
print("- Missing documentation")
print("- Unclear function names")
create_test_database(db_path)
# Test pure vector search
pure_results = test_pure_vector_search(db_path)
# Test LLM-enhanced search
if not args.skip_llm:
# Clear semantic_chunks table for LLM test
with sqlite3.connect(db_path) as conn:
conn.execute("DELETE FROM semantic_chunks")
conn.commit()
llm_results = test_llm_enhanced_search(db_path, args.tool)
else:
llm_results = {}
print("\n[X] LLM-enhanced test skipped (--skip-llm flag)")
# Compare results
compare_results(pure_results, llm_results)
finally:
# Save or cleanup database
if args.keep_db:
import shutil
save_path = Path(args.keep_db)
try:
import gc
gc.collect()
time.sleep(0.2)
shutil.copy2(db_path, save_path)
print(f"\n[OK] Database saved to: {save_path}")
print(f"Inspect with: python scripts/inspect_llm_summaries.py {save_path}")
except Exception as e:
print(f"\n[X] Failed to save database: {e}")
finally:
try:
if db_path.exists():
db_path.unlink()
except:
pass
else:
# Cleanup
try:
import gc
gc.collect()
time.sleep(0.1)
if db_path.exists():
db_path.unlink()
except PermissionError:
print(f"\nWarning: Could not delete temporary database: {db_path}")
print("\n" + "="*70)
print("Test completed!")
print("="*70)
if __name__ == "__main__":
main()

View File

@@ -1047,184 +1047,6 @@ def migrate(
registry.close()
@app.command()
def enhance(
path: Path = typer.Argument(Path("."), exists=True, file_okay=False, dir_okay=True, help="Project root to enhance."),
tool: str = typer.Option("gemini", "--tool", "-t", help="LLM tool to use (gemini or qwen)."),
batch_size: int = typer.Option(5, "--batch-size", "-b", min=1, max=20, help="Number of files to process per batch."),
force: bool = typer.Option(False, "--force", "-f", help="Regenerate metadata for all files, even if already exists."),
json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."),
) -> None:
"""Generate LLM-enhanced semantic metadata for indexed files.
Uses CCW CLI to generate summaries, keywords, and purpose descriptions.
Requires ccw to be installed and accessible in PATH.
"""
_configure_logging(verbose)
base_path = path.expanduser().resolve()
registry: RegistryStore | None = None
try:
# Check if ccw is available
import subprocess
import shutil
import sys
try:
ccw_cmd = shutil.which("ccw")
if not ccw_cmd:
raise FileNotFoundError("ccw not in PATH")
# On Windows, .cmd files need shell=True
if sys.platform == "win32":
subprocess.run("ccw --version", shell=True, capture_output=True, check=True)
else:
subprocess.run(["ccw", "--version"], capture_output=True, check=True)
except (subprocess.CalledProcessError, FileNotFoundError):
raise CodexLensError("ccw CLI not found. Please install ccw first.")
# Validate tool
if tool not in ("gemini", "qwen"):
raise CodexLensError(f"Invalid tool: {tool}. Must be 'gemini' or 'qwen'.")
registry = RegistryStore()
registry.initialize()
mapper = PathMapper()
# Find project
project_info = registry.get_project(base_path)
if not project_info:
raise CodexLensError(f"No index found for: {base_path}. Run 'codex-lens init' first.")
# Import LLM enhancer
try:
from codexlens.semantic.llm_enhancer import LLMEnhancer, LLMConfig
except ImportError as e:
raise CodexLensError(f"Semantic enhancement requires additional dependencies: {e}")
# Create enhancer with config
config = LLMConfig(tool=tool, batch_size=batch_size)
enhancer = LLMEnhancer(config=config)
# Get index directory
index_dir = mapper.source_to_index_dir(base_path)
if not index_dir.exists():
raise CodexLensError(f"Index directory not found: {index_dir}")
# Process all index databases recursively
from codexlens.storage.dir_index import DirIndexStore
from pathlib import Path
total_processed = 0
total_errors = 0
with Progress(
SpinnerColumn(),
TextColumn("[progress.description]{task.description}"),
BarColumn(),
TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
TimeElapsedColumn(),
console=console,
) as progress:
# Find all _index.db files
index_files = list(index_dir.rglob("_index.db"))
task = progress.add_task(f"Enhancing {len(index_files)} directories...", total=len(index_files))
for db_path in index_files:
try:
store = DirIndexStore(db_path)
store.initialize()
# Get files to process
if force:
files_to_process = store.list_files()
else:
files_to_process = store.get_files_without_semantic()
if not files_to_process:
progress.update(task, advance=1)
continue
# Process files
for file_entry in files_to_process:
try:
# Read file content
with open(file_entry.full_path, "r", encoding="utf-8", errors="ignore") as f:
content = f.read()
# Generate metadata
metadata = enhancer.enhance_file(
path=str(file_entry.full_path),
content=content,
language=file_entry.language or "unknown"
)
# Store metadata
store.add_semantic_metadata(
file_id=file_entry.id,
summary=metadata.summary,
keywords=metadata.keywords,
purpose=metadata.purpose,
llm_tool=tool
)
total_processed += 1
except Exception as e:
total_errors += 1
if verbose:
console.print(f"[yellow]Error processing {file_entry.full_path}: {e}[/yellow]")
store.close()
except Exception as e:
total_errors += 1
if verbose:
console.print(f"[yellow]Error processing {db_path}: {e}[/yellow]")
progress.update(task, advance=1)
result = {
"path": str(base_path),
"tool": tool,
"files_processed": total_processed,
"errors": total_errors,
}
if json_mode:
print_json(success=True, result=result)
else:
console.print(f"[green]Enhanced {total_processed} files using {tool}[/green]")
if total_errors > 0:
console.print(f" [yellow]Errors: {total_errors}[/yellow]")
except StorageError as exc:
if json_mode:
print_json(success=False, error=f"Storage error: {exc}")
else:
console.print(f"[red]Enhancement failed (storage):[/red] {exc}")
raise typer.Exit(code=1)
except PermissionError as exc:
if json_mode:
print_json(success=False, error=f"Permission denied: {exc}")
else:
console.print(f"[red]Enhancement failed (permission denied):[/red] {exc}")
raise typer.Exit(code=1)
except CodexLensError as exc:
if json_mode:
print_json(success=False, error=str(exc))
else:
console.print(f"[red]Enhancement failed:[/red] {exc}")
raise typer.Exit(code=1)
except Exception as exc:
if json_mode:
print_json(success=False, error=f"Unexpected error: {exc}")
else:
console.print(f"[red]Enhancement failed (unexpected):[/red] {exc}")
raise typer.Exit(code=1)
finally:
if registry is not None:
registry.close()
@app.command()
def clean(
path: Optional[Path] = typer.Argument(None, help="Project path to clean (removes project index)."),

View File

@@ -32,38 +32,8 @@ def check_semantic_available() -> tuple[bool, str | None]:
"""Check if semantic search dependencies are available."""
return SEMANTIC_AVAILABLE, _import_error
# Export LLM enhancement classes
try:
from .llm_enhancer import (
LLMEnhancer,
LLMConfig,
SemanticMetadata,
FileData,
EnhancedSemanticIndexer,
create_enhancer,
create_enhanced_indexer,
)
LLM_AVAILABLE = True
except ImportError:
LLM_AVAILABLE = False
LLMEnhancer = None # type: ignore
LLMConfig = None # type: ignore
SemanticMetadata = None # type: ignore
FileData = None # type: ignore
EnhancedSemanticIndexer = None # type: ignore
create_enhancer = None # type: ignore
create_enhanced_indexer = None # type: ignore
__all__ = [
"SEMANTIC_AVAILABLE",
"SEMANTIC_BACKEND",
"check_semantic_available",
"LLM_AVAILABLE",
"LLMEnhancer",
"LLMConfig",
"SemanticMetadata",
"FileData",
"EnhancedSemanticIndexer",
"create_enhancer",
"create_enhanced_indexer",
]

View File

@@ -1,899 +0,0 @@
"""LLM-based semantic enhancement using CCW CLI.
This module provides LLM-generated descriptions that are then embedded
by fastembed for improved semantic search. The flow is:
Code → LLM Summary → fastembed embedding → VectorStore → semantic search
LLM-generated summaries match natural language queries better than raw code.
"""
from __future__ import annotations
import json
import logging
import os
import subprocess
import shutil
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, List, Optional, TYPE_CHECKING
from codexlens.entities import SemanticChunk, Symbol
if TYPE_CHECKING:
from .embedder import Embedder
from .vector_store import VectorStore
logger = logging.getLogger(__name__)
@dataclass
class SemanticMetadata:
"""LLM-generated semantic metadata for a file or symbol."""
summary: str
keywords: List[str]
purpose: str
file_path: Optional[str] = None
symbol_name: Optional[str] = None
llm_tool: Optional[str] = None
@dataclass
class FileData:
"""File data for LLM processing."""
path: str
content: str
language: str
symbols: List[Symbol] = field(default_factory=list)
@dataclass
class LLMConfig:
"""Configuration for LLM enhancement.
Tool selection can be overridden via environment variables:
- CCW_CLI_SECONDARY_TOOL: Primary tool for LLM calls (default: gemini)
- CCW_CLI_FALLBACK_TOOL: Fallback tool if primary fails (default: qwen)
"""
tool: str = field(default_factory=lambda: os.environ.get("CCW_CLI_SECONDARY_TOOL", "gemini"))
fallback_tool: str = field(default_factory=lambda: os.environ.get("CCW_CLI_FALLBACK_TOOL", "qwen"))
timeout_ms: int = 300000
batch_size: int = 5
max_content_chars: int = 8000 # Max chars per file in batch prompt
enabled: bool = True
class LLMEnhancer:
"""LLM-based semantic enhancement using CCW CLI.
Generates code summaries and search keywords by calling
external LLM tools (gemini, qwen) via CCW CLI subprocess.
"""
CHUNK_REFINEMENT_PROMPT = '''PURPOSE: Identify optimal semantic split points in code chunk
TASK:
- Analyze the code structure to find natural semantic boundaries
- Identify logical groupings (functions, classes, related statements)
- Suggest split points that maintain semantic cohesion
MODE: analysis
EXPECTED: JSON format with split positions
=== CODE CHUNK ===
{code_chunk}
=== OUTPUT FORMAT ===
Return ONLY valid JSON (no markdown, no explanation):
{{
"split_points": [
{{
"line": <line_number>,
"reason": "brief reason for split (e.g., 'start of new function', 'end of class definition')"
}}
]
}}
Rules:
- Split at function/class/method boundaries
- Keep related code together (don't split mid-function)
- Aim for chunks between 500-2000 characters
- Return empty split_points if no good splits found'''
PROMPT_TEMPLATE = '''PURPOSE: Generate semantic summaries and search keywords for code files
TASK:
- For each code block, generate a concise summary (1-2 sentences)
- Extract 5-10 relevant search keywords
- Identify the functional purpose/category
MODE: analysis
EXPECTED: JSON format output
=== CODE BLOCKS ===
{code_blocks}
=== OUTPUT FORMAT ===
Return ONLY valid JSON (no markdown, no explanation):
{{
"files": {{
"<file_path>": {{
"summary": "Brief description of what this code does",
"keywords": ["keyword1", "keyword2", ...],
"purpose": "category like: auth, api, util, ui, data, config, test"
}}
}}
}}'''
def __init__(self, config: LLMConfig | None = None) -> None:
"""Initialize LLM enhancer.
Args:
config: LLM configuration, uses defaults if None
"""
self.config = config or LLMConfig()
self._ccw_available: Optional[bool] = None
def check_available(self) -> bool:
"""Check if CCW CLI tool is available."""
if self._ccw_available is not None:
return self._ccw_available
self._ccw_available = shutil.which("ccw") is not None
if not self._ccw_available:
logger.warning("CCW CLI not found in PATH, LLM enhancement disabled")
return self._ccw_available
def enhance_files(
self,
files: List[FileData],
working_dir: Optional[Path] = None,
) -> Dict[str, SemanticMetadata]:
"""Enhance multiple files with LLM-generated semantic metadata.
Processes files in batches to manage token limits and API costs.
Args:
files: List of file data to process
working_dir: Optional working directory for CCW CLI
Returns:
Dict mapping file paths to SemanticMetadata
"""
if not self.config.enabled:
logger.debug("LLM enhancement disabled by config")
return {}
if not self.check_available():
return {}
if not files:
return {}
results: Dict[str, SemanticMetadata] = {}
batch_size = self.config.batch_size
for i in range(0, len(files), batch_size):
batch = files[i:i + batch_size]
try:
batch_results = self._process_batch(batch, working_dir)
results.update(batch_results)
logger.debug(
"Processed batch %d/%d: %d files enhanced",
i // batch_size + 1,
(len(files) + batch_size - 1) // batch_size,
len(batch_results),
)
except Exception as e:
logger.warning(
"Batch %d failed, continuing: %s",
i // batch_size + 1,
e,
)
continue
return results
def enhance_file(
self,
path: str,
content: str,
language: str,
working_dir: Optional[Path] = None,
) -> SemanticMetadata:
"""Enhance a single file with LLM-generated semantic metadata.
Convenience method that wraps enhance_files for single file processing.
Args:
path: File path
content: File content
language: Programming language
working_dir: Optional working directory for CCW CLI
Returns:
SemanticMetadata for the file
Raises:
ValueError: If enhancement fails
"""
file_data = FileData(path=path, content=content, language=language)
results = self.enhance_files([file_data], working_dir)
if path not in results:
# Return default metadata if enhancement failed
return SemanticMetadata(
summary=f"Code file written in {language}",
keywords=[language, "code"],
purpose="unknown",
file_path=path,
llm_tool=self.config.tool,
)
return results[path]
def refine_chunk_boundaries(
self,
chunk: SemanticChunk,
max_chunk_size: int = 2000,
working_dir: Optional[Path] = None,
) -> List[SemanticChunk]:
"""Refine chunk boundaries using LLM for large code chunks.
Uses LLM to identify semantic split points in large chunks,
breaking them into smaller, more cohesive pieces.
Args:
chunk: Original chunk to refine
max_chunk_size: Maximum characters before triggering refinement
working_dir: Optional working directory for CCW CLI
Returns:
List of refined chunks (original chunk if no splits or refinement fails)
"""
# Skip if chunk is small enough
if len(chunk.content) <= max_chunk_size:
return [chunk]
# Skip if LLM enhancement disabled or unavailable
if not self.config.enabled or not self.check_available():
return [chunk]
# Skip docstring chunks - only refine code chunks
if chunk.metadata.get("chunk_type") == "docstring":
return [chunk]
try:
# Build refinement prompt
prompt = self.CHUNK_REFINEMENT_PROMPT.format(code_chunk=chunk.content)
# Invoke LLM
result = self._invoke_ccw_cli(
prompt,
tool=self.config.tool,
working_dir=working_dir,
)
# Fallback if primary tool fails
if not result["success"] and self.config.fallback_tool:
result = self._invoke_ccw_cli(
prompt,
tool=self.config.fallback_tool,
working_dir=working_dir,
)
if not result["success"]:
logger.debug("LLM refinement failed, returning original chunk")
return [chunk]
# Parse split points
split_points = self._parse_split_points(result["stdout"])
if not split_points:
logger.debug("No split points identified, returning original chunk")
return [chunk]
# Split chunk at identified boundaries
refined_chunks = self._split_chunk_at_points(chunk, split_points)
logger.debug(
"Refined chunk into %d smaller chunks (was %d chars)",
len(refined_chunks),
len(chunk.content),
)
return refined_chunks
except Exception as e:
logger.warning("Chunk refinement error: %s, returning original chunk", e)
return [chunk]
def _parse_split_points(self, stdout: str) -> List[int]:
"""Parse split points from LLM response.
Args:
stdout: Raw stdout from CCW CLI
Returns:
List of line numbers where splits should occur (sorted)
"""
# Extract JSON from response
json_str = self._extract_json(stdout)
if not json_str:
return []
try:
data = json.loads(json_str)
split_points_data = data.get("split_points", [])
# Extract line numbers
lines = []
for point in split_points_data:
if isinstance(point, dict) and "line" in point:
line_num = point["line"]
if isinstance(line_num, int) and line_num > 0:
lines.append(line_num)
return sorted(set(lines))
except (json.JSONDecodeError, ValueError, TypeError) as e:
logger.debug("Failed to parse split points: %s", e)
return []
def _split_chunk_at_points(
self,
chunk: SemanticChunk,
split_points: List[int],
) -> List[SemanticChunk]:
"""Split chunk at specified line numbers.
Args:
chunk: Original chunk to split
split_points: Sorted list of line numbers to split at
Returns:
List of smaller chunks
"""
lines = chunk.content.splitlines(keepends=True)
chunks: List[SemanticChunk] = []
# Get original metadata
base_metadata = dict(chunk.metadata)
original_start = base_metadata.get("start_line", 1)
# Add start and end boundaries
boundaries = [0] + split_points + [len(lines)]
for i in range(len(boundaries) - 1):
start_idx = boundaries[i]
end_idx = boundaries[i + 1]
# Skip empty sections
if start_idx >= end_idx:
continue
# Extract content
section_lines = lines[start_idx:end_idx]
section_content = "".join(section_lines)
# Skip if too small
if len(section_content.strip()) < 50:
continue
# Create new chunk with updated metadata
new_metadata = base_metadata.copy()
new_metadata["start_line"] = original_start + start_idx
new_metadata["end_line"] = original_start + end_idx - 1
new_metadata["refined_by_llm"] = True
new_metadata["original_chunk_size"] = len(chunk.content)
chunks.append(
SemanticChunk(
content=section_content,
embedding=None, # Embeddings will be regenerated
metadata=new_metadata,
)
)
# If no valid chunks created, return original
if not chunks:
return [chunk]
return chunks
def _process_batch(
self,
files: List[FileData],
working_dir: Optional[Path] = None,
) -> Dict[str, SemanticMetadata]:
"""Process a single batch of files."""
prompt = self._build_batch_prompt(files)
# Try primary tool first
result = self._invoke_ccw_cli(
prompt,
tool=self.config.tool,
working_dir=working_dir,
)
# Fallback to secondary tool if primary fails
if not result["success"] and self.config.fallback_tool:
logger.debug(
"Primary tool %s failed, trying fallback %s",
self.config.tool,
self.config.fallback_tool,
)
result = self._invoke_ccw_cli(
prompt,
tool=self.config.fallback_tool,
working_dir=working_dir,
)
if not result["success"]:
logger.warning("LLM call failed: %s", result.get("stderr", "unknown error"))
return {}
return self._parse_response(result["stdout"], self.config.tool)
def _build_batch_prompt(self, files: List[FileData]) -> str:
"""Build prompt for batch processing."""
code_blocks_parts: List[str] = []
for file_data in files:
# Truncate content if too long
content = file_data.content
if len(content) > self.config.max_content_chars:
content = content[:self.config.max_content_chars] + "\n... [truncated]"
# Format code block
lang_hint = file_data.language or "text"
code_block = f'''[FILE: {file_data.path}]
```{lang_hint}
{content}
```'''
code_blocks_parts.append(code_block)
code_blocks = "\n\n".join(code_blocks_parts)
return self.PROMPT_TEMPLATE.format(code_blocks=code_blocks)
def _invoke_ccw_cli(
self,
prompt: str,
tool: str = "gemini",
working_dir: Optional[Path] = None,
) -> Dict[str, Any]:
"""Invoke CCW CLI tool via subprocess.
Args:
prompt: The prompt to send to LLM
tool: Tool name (gemini, qwen, codex)
working_dir: Optional working directory
Returns:
Dict with success, stdout, stderr, exit_code
"""
import sys
import os
timeout_seconds = (self.config.timeout_ms / 1000) + 30
# Build base arguments
base_args = [
"cli", "exec",
prompt, # Direct string argument
"--tool", tool,
"--mode", "analysis",
"--timeout", str(self.config.timeout_ms),
]
if working_dir:
base_args.extend(["--cd", str(working_dir)])
try:
if sys.platform == "win32":
# On Windows, ccw is a .CMD wrapper that requires shell
# Instead, directly invoke node with the ccw.js script
ccw_path = shutil.which("ccw")
if ccw_path and ccw_path.lower().endswith(".cmd"):
# Find the ccw.js script location
npm_dir = Path(ccw_path).parent
ccw_js = npm_dir / "node_modules" / "ccw" / "bin" / "ccw.js"
if ccw_js.exists():
cmd = ["node", str(ccw_js)] + base_args
else:
# Fallback to shell execution
cmd_str = "ccw " + " ".join(f'"{a}"' if " " in a else a for a in base_args)
result = subprocess.run(
cmd_str, shell=True, capture_output=True, text=True,
timeout=timeout_seconds, cwd=working_dir,
encoding="utf-8", errors="replace",
)
return {
"success": result.returncode == 0,
"stdout": result.stdout,
"stderr": result.stderr,
"exit_code": result.returncode,
}
else:
cmd = ["ccw"] + base_args
else:
cmd = ["ccw"] + base_args
result = subprocess.run(
cmd,
capture_output=True,
text=True,
timeout=timeout_seconds,
cwd=working_dir,
encoding="utf-8",
errors="replace",
)
return {
"success": result.returncode == 0,
"stdout": result.stdout,
"stderr": result.stderr,
"exit_code": result.returncode,
}
except subprocess.TimeoutExpired:
logger.warning("CCW CLI timeout after %ds", self.config.timeout_ms / 1000)
return {
"success": False,
"stdout": "",
"stderr": "timeout",
"exit_code": -1,
}
except FileNotFoundError:
logger.warning("CCW CLI not found - ensure 'ccw' is in PATH")
return {
"success": False,
"stdout": "",
"stderr": "ccw command not found",
"exit_code": -1,
}
except Exception as e:
logger.warning("CCW CLI invocation failed: %s", e)
return {
"success": False,
"stdout": "",
"stderr": str(e),
"exit_code": -1,
}
def _parse_response(
self,
stdout: str,
tool: str,
) -> Dict[str, SemanticMetadata]:
"""Parse LLM response into SemanticMetadata objects.
Args:
stdout: Raw stdout from CCW CLI
tool: Tool name used for generation
Returns:
Dict mapping file paths to SemanticMetadata
"""
results: Dict[str, SemanticMetadata] = {}
# Extract JSON from response (may be wrapped in markdown or other text)
json_str = self._extract_json(stdout)
if not json_str:
logger.warning("No JSON found in LLM response")
return results
try:
data = json.loads(json_str)
except json.JSONDecodeError as e:
logger.warning("Failed to parse LLM response JSON: %s", e)
return results
# Handle expected format: {"files": {"path": {...}}}
files_data = data.get("files", data)
if not isinstance(files_data, dict):
logger.warning("Unexpected response format: expected dict")
return results
for file_path, metadata in files_data.items():
if not isinstance(metadata, dict):
continue
try:
results[file_path] = SemanticMetadata(
summary=metadata.get("summary", ""),
keywords=metadata.get("keywords", []),
purpose=metadata.get("purpose", ""),
file_path=file_path,
llm_tool=tool,
)
except Exception as e:
logger.debug("Failed to parse metadata for %s: %s", file_path, e)
continue
return results
def _extract_json(self, text: str) -> Optional[str]:
"""Extract JSON object from text that may contain markdown or other content."""
# Try to find JSON object boundaries
text = text.strip()
# Remove markdown code blocks if present
if text.startswith("```"):
lines = text.split("\n")
# Remove first line (```json or ```)
lines = lines[1:]
# Find closing ```
for i, line in enumerate(lines):
if line.strip() == "```":
lines = lines[:i]
break
text = "\n".join(lines)
# Find JSON object
start = text.find("{")
if start == -1:
return None
# Find matching closing brace
depth = 0
end = start
for i, char in enumerate(text[start:], start):
if char == "{":
depth += 1
elif char == "}":
depth -= 1
if depth == 0:
end = i + 1
break
if depth != 0:
return None
return text[start:end]
def create_enhancer(
tool: str = "gemini",
timeout_ms: int = 300000,
batch_size: int = 5,
enabled: bool = True,
) -> LLMEnhancer:
"""Factory function to create LLM enhancer with custom config."""
config = LLMConfig(
tool=tool,
timeout_ms=timeout_ms,
batch_size=batch_size,
enabled=enabled,
)
return LLMEnhancer(config)
class EnhancedSemanticIndexer:
"""Integrates LLM enhancement with fastembed vector search.
Flow:
1. Code files → LLM generates summaries/keywords
2. Summaries → fastembed generates embeddings
3. Embeddings → VectorStore for similarity search
This produces better semantic search because:
- LLM summaries are natural language descriptions
- Natural language queries match summaries better than raw code
- Keywords expand search coverage
"""
def __init__(
self,
enhancer: LLMEnhancer,
embedder: "Embedder",
vector_store: "VectorStore",
) -> None:
"""Initialize enhanced semantic indexer.
Args:
enhancer: LLM enhancer for generating summaries
embedder: Fastembed embedder for vector generation
vector_store: Vector storage for similarity search
"""
self.enhancer = enhancer
self.embedder = embedder
self.vector_store = vector_store
def index_files(
self,
files: List[FileData],
working_dir: Optional[Path] = None,
) -> int:
"""Index files with LLM-enhanced semantic search.
Args:
files: List of file data to index
working_dir: Optional working directory for LLM calls
Returns:
Number of files successfully indexed
"""
if not files:
return 0
# Step 1: Generate LLM summaries
logger.info("Generating LLM summaries for %d files...", len(files))
metadata_map = self.enhancer.enhance_files(files, working_dir)
if not metadata_map:
logger.warning("No LLM metadata generated, falling back to raw code")
return self._index_raw_code(files)
# Step 2: Create semantic chunks from LLM summaries
chunks_to_embed: List[SemanticChunk] = []
file_paths: List[str] = []
for file_data in files:
metadata = metadata_map.get(file_data.path)
if metadata:
# Use LLM-generated summary + keywords for embedding
embeddable_text = self._create_embeddable_text(metadata, file_data)
chunk = SemanticChunk(
content=embeddable_text,
embedding=None,
metadata={
"file": file_data.path,
"language": file_data.language,
"summary": metadata.summary,
"keywords": metadata.keywords,
"purpose": metadata.purpose,
"llm_tool": metadata.llm_tool,
"strategy": "llm_enhanced",
},
)
else:
# Fallback: use truncated raw code
chunk = SemanticChunk(
content=file_data.content[:2000],
embedding=None,
metadata={
"file": file_data.path,
"language": file_data.language,
"strategy": "raw_code",
},
)
chunks_to_embed.append(chunk)
file_paths.append(file_data.path)
# Step 3: Generate embeddings
logger.info("Generating embeddings for %d chunks...", len(chunks_to_embed))
texts = [chunk.content for chunk in chunks_to_embed]
embeddings = self.embedder.embed(texts)
# Step 4: Store in vector store
indexed_count = 0
for chunk, embedding, file_path in zip(chunks_to_embed, embeddings, file_paths):
chunk.embedding = embedding
try:
self.vector_store.add_chunk(chunk, file_path)
indexed_count += 1
except Exception as e:
logger.debug("Failed to store chunk for %s: %s", file_path, e)
logger.info("Successfully indexed %d/%d files", indexed_count, len(files))
return indexed_count
def _create_embeddable_text(
self,
metadata: SemanticMetadata,
file_data: FileData,
) -> str:
"""Create text optimized for embedding from LLM metadata.
Combines summary, keywords, and purpose into a single string
that will produce good semantic matches for natural language queries.
"""
parts = []
# Summary is the primary content
if metadata.summary:
parts.append(metadata.summary)
# Purpose adds categorical context
if metadata.purpose:
parts.append(f"Category: {metadata.purpose}")
# Keywords expand search coverage
if metadata.keywords:
parts.append(f"Keywords: {', '.join(metadata.keywords)}")
# Add file name for context
parts.append(f"File: {Path(file_data.path).name}")
return "\n".join(parts)
def _index_raw_code(self, files: List[FileData]) -> int:
"""Fallback: index raw code without LLM enhancement."""
indexed_count = 0
for file_data in files:
# Truncate to reasonable size
content = file_data.content[:2000]
chunk = SemanticChunk(
content=content,
embedding=None,
metadata={
"file": file_data.path,
"language": file_data.language,
"strategy": "raw_code",
},
)
try:
embedding = self.embedder.embed_single(content)
chunk.embedding = embedding
self.vector_store.add_chunk(chunk, file_data.path)
indexed_count += 1
except Exception as e:
logger.debug("Failed to index %s: %s", file_data.path, e)
return indexed_count
def create_enhanced_indexer(
vector_store_path: Path,
llm_tool: str = "gemini",
llm_enabled: bool = True,
) -> EnhancedSemanticIndexer:
"""Factory function to create an enhanced semantic indexer.
Args:
vector_store_path: Path for the vector store database
llm_tool: LLM tool to use (gemini, qwen)
llm_enabled: Whether to enable LLM enhancement
Returns:
Configured EnhancedSemanticIndexer instance
"""
from .embedder import Embedder
from .vector_store import VectorStore
enhancer = create_enhancer(tool=llm_tool, enabled=llm_enabled)
embedder = Embedder()
vector_store = VectorStore(vector_store_path)
return EnhancedSemanticIndexer(enhancer, embedder, vector_store)

View File

@@ -1,545 +0,0 @@
"""Test suite for comparing pure vector search vs LLM-enhanced vector search.
This test demonstrates the difference between:
1. Pure vector search: Raw code → fastembed → vector search
2. LLM-enhanced search: Code → LLM summary → fastembed → vector search
LLM-enhanced search should provide better semantic matches for natural language queries.
"""
import pytest
import sqlite3
import tempfile
from pathlib import Path
from typing import Dict, List
from codexlens.search.hybrid_search import HybridSearchEngine
from codexlens.storage.dir_index import DirIndexStore
# Check semantic dependencies
try:
from codexlens.semantic import SEMANTIC_AVAILABLE
from codexlens.semantic.embedder import Embedder
from codexlens.semantic.vector_store import VectorStore
from codexlens.semantic.chunker import Chunker, ChunkConfig
from codexlens.semantic.llm_enhancer import (
LLMEnhancer,
LLMConfig,
FileData,
EnhancedSemanticIndexer,
SemanticChunk,
)
from codexlens.entities import SearchResult
except ImportError:
SEMANTIC_AVAILABLE = False
# Test code samples representing different functionality
TEST_CODE_SAMPLES = {
"auth/password_hasher.py": '''"""Password hashing utilities using bcrypt."""
import bcrypt
def hash_password(password: str, salt_rounds: int = 12) -> str:
"""Hash a password using bcrypt with specified salt rounds.
Args:
password: Plain text password to hash
salt_rounds: Number of salt rounds (default 12)
Returns:
Hashed password string
"""
salt = bcrypt.gensalt(rounds=salt_rounds)
hashed = bcrypt.hashpw(password.encode('utf-8'), salt)
return hashed.decode('utf-8')
def verify_password(password: str, hashed: str) -> bool:
"""Verify a password against its hash.
Args:
password: Plain text password to verify
hashed: Previously hashed password
Returns:
True if password matches hash
"""
return bcrypt.checkpw(password.encode('utf-8'), hashed.encode('utf-8'))
''',
"auth/jwt_handler.py": '''"""JWT token generation and validation."""
import jwt
from datetime import datetime, timedelta
from typing import Dict, Optional
SECRET_KEY = "your-secret-key-here"
def create_token(user_id: int, expires_in: int = 3600) -> str:
"""Generate a JWT access token for user authentication.
Args:
user_id: User ID to encode in token
expires_in: Token expiration in seconds (default 1 hour)
Returns:
JWT token string
"""
payload = {
'user_id': user_id,
'exp': datetime.utcnow() + timedelta(seconds=expires_in),
'iat': datetime.utcnow()
}
return jwt.encode(payload, SECRET_KEY, algorithm='HS256')
def decode_token(token: str) -> Optional[Dict]:
"""Validate and decode JWT token to extract user information.
Args:
token: JWT token string to decode
Returns:
Decoded payload dict or None if invalid
"""
try:
payload = jwt.decode(token, SECRET_KEY, algorithms=['HS256'])
return payload
except jwt.ExpiredSignatureError:
return None
except jwt.InvalidTokenError:
return None
''',
"api/user_endpoints.py": '''"""REST API endpoints for user management."""
from flask import Flask, request, jsonify
from typing import Dict
app = Flask(__name__)
@app.route('/api/users', methods=['POST'])
def create_user():
"""Create a new user account with email and password.
Request JSON:
email: User email address
password: User password
name: User full name
Returns:
JSON with user_id and success status
"""
data = request.get_json()
# Validate input
if not data.get('email') or not data.get('password'):
return jsonify({'error': 'Email and password required'}), 400
# Create user (simplified)
user_id = 12345 # Would normally insert into database
return jsonify({'user_id': user_id, 'success': True}), 201
@app.route('/api/users/<int:user_id>', methods=['GET'])
def get_user(user_id: int):
"""Retrieve user profile information by user ID.
Args:
user_id: Unique user identifier
Returns:
JSON with user profile data
"""
# Simplified user retrieval
user = {
'id': user_id,
'email': 'user@example.com',
'name': 'John Doe',
'created_at': '2024-01-01'
}
return jsonify(user), 200
''',
"utils/validation.py": '''"""Input validation and sanitization utilities."""
import re
from typing import Optional
def validate_email(email: str) -> bool:
"""Check if email address format is valid using regex pattern.
Args:
email: Email address string to validate
Returns:
True if email format is valid
"""
pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
return bool(re.match(pattern, email))
def sanitize_input(text: str, max_length: int = 255) -> str:
"""Clean user input by removing special characters and limiting length.
Args:
text: Input text to sanitize
max_length: Maximum allowed length
Returns:
Sanitized text string
"""
# Remove special characters
text = re.sub(r'[<>\"\'&]', '', text)
# Trim whitespace
text = text.strip()
# Limit length
return text[:max_length]
def validate_password_strength(password: str) -> tuple[bool, Optional[str]]:
"""Validate password meets security requirements.
Requirements:
- At least 8 characters
- Contains uppercase and lowercase
- Contains numbers
- Contains special characters
Args:
password: Password string to validate
Returns:
Tuple of (is_valid, error_message)
"""
if len(password) < 8:
return False, "Password must be at least 8 characters"
if not re.search(r'[A-Z]', password):
return False, "Password must contain uppercase letter"
if not re.search(r'[a-z]', password):
return False, "Password must contain lowercase letter"
if not re.search(r'[0-9]', password):
return False, "Password must contain number"
if not re.search(r'[!@#$%^&*(),.?":{}|<>]', password):
return False, "Password must contain special character"
return True, None
''',
"database/connection.py": '''"""Database connection pooling and management."""
import psycopg2
from psycopg2 import pool
from typing import Optional
from contextlib import contextmanager
class DatabasePool:
"""PostgreSQL connection pool manager for handling multiple concurrent connections."""
def __init__(self, min_conn: int = 1, max_conn: int = 10):
"""Initialize database connection pool.
Args:
min_conn: Minimum number of connections to maintain
max_conn: Maximum number of connections allowed
"""
self.pool = psycopg2.pool.SimpleConnectionPool(
min_conn,
max_conn,
user='dbuser',
password='dbpass',
host='localhost',
port='5432',
database='myapp'
)
@contextmanager
def get_connection(self):
"""Get a connection from pool as context manager.
Yields:
Database connection object
"""
conn = self.pool.getconn()
try:
yield conn
conn.commit()
except Exception:
conn.rollback()
raise
finally:
self.pool.putconn(conn)
def close_all(self):
"""Close all connections in pool."""
self.pool.closeall()
'''
}
# Natural language queries to test semantic understanding
TEST_QUERIES = [
{
"query": "How do I securely hash passwords?",
"expected_file": "auth/password_hasher.py",
"description": "Should find password hashing implementation",
},
{
"query": "Generate JWT token for user authentication",
"expected_file": "auth/jwt_handler.py",
"description": "Should find JWT token creation logic",
},
{
"query": "Create new user account via REST API",
"expected_file": "api/user_endpoints.py",
"description": "Should find user registration endpoint",
},
{
"query": "Validate email address format",
"expected_file": "utils/validation.py",
"description": "Should find email validation function",
},
{
"query": "Connect to PostgreSQL database",
"expected_file": "database/connection.py",
"description": "Should find database connection management",
},
{
"query": "Check password complexity requirements",
"expected_file": "utils/validation.py",
"description": "Should find password strength validation",
},
]
@pytest.mark.skipif(not SEMANTIC_AVAILABLE, reason="Semantic dependencies not available")
class TestPureVectorSearch:
"""Test pure vector search (code → fastembed → search)."""
@pytest.fixture
def pure_vector_db(self):
"""Create database with pure vector embeddings (no LLM)."""
with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f:
db_path = Path(f.name)
# Initialize database
store = DirIndexStore(db_path)
store.initialize()
# Add test files
with store._get_connection() as conn:
for path, content in TEST_CODE_SAMPLES.items():
name = path.split('/')[-1]
conn.execute(
"""INSERT INTO files (name, full_path, content, language, mtime)
VALUES (?, ?, ?, ?, ?)""",
(name, path, content, "python", 0.0)
)
conn.commit()
# Generate embeddings using pure vector approach (raw code)
embedder = Embedder(profile="code")
vector_store = VectorStore(db_path)
chunker = Chunker(config=ChunkConfig(max_chunk_size=2000))
with sqlite3.connect(db_path) as conn:
conn.row_factory = sqlite3.Row
rows = conn.execute("SELECT full_path, content FROM files").fetchall()
for row in rows:
# Pure vector: directly chunk and embed raw code
chunks = chunker.chunk_sliding_window(
row["content"],
file_path=row["full_path"],
language="python"
)
for chunk in chunks:
chunk.embedding = embedder.embed_single(chunk.content)
chunk.metadata["strategy"] = "pure_vector"
if chunks:
vector_store.add_chunks(chunks, row["full_path"])
yield db_path
store.close()
if db_path.exists():
db_path.unlink()
def test_pure_vector_queries(self, pure_vector_db):
"""Test natural language queries with pure vector search."""
engine = HybridSearchEngine()
results = {}
for test_case in TEST_QUERIES:
query = test_case["query"]
expected_file = test_case["expected_file"]
search_results = engine.search(
pure_vector_db,
query,
limit=5,
enable_vector=True,
pure_vector=True,
)
# Check if expected file is in top 3 results
top_files = [r.path for r in search_results[:3]]
found = expected_file in top_files
rank = top_files.index(expected_file) + 1 if found else None
results[query] = {
"found": found,
"rank": rank,
"top_result": search_results[0].path if search_results else None,
"top_score": search_results[0].score if search_results else 0.0,
}
return results
@pytest.mark.skipif(not SEMANTIC_AVAILABLE, reason="Semantic dependencies not available")
class TestLLMEnhancedSearch:
"""Test LLM-enhanced vector search (code → LLM → fastembed → search)."""
@pytest.fixture
def llm_enhanced_db(self):
"""Create database with LLM-enhanced embeddings."""
# Skip if CCW not available
llm_config = LLMConfig(enabled=True, tool="gemini")
enhancer = LLMEnhancer(llm_config)
if not enhancer.check_available():
pytest.skip("CCW CLI not available for LLM enhancement")
with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f:
db_path = Path(f.name)
# Initialize database
store = DirIndexStore(db_path)
store.initialize()
# Add test files
with store._get_connection() as conn:
for path, content in TEST_CODE_SAMPLES.items():
name = path.split('/')[-1]
conn.execute(
"""INSERT INTO files (name, full_path, content, language, mtime)
VALUES (?, ?, ?, ?, ?)""",
(name, path, content, "python", 0.0)
)
conn.commit()
# Generate embeddings using LLM-enhanced approach
embedder = Embedder(profile="code")
vector_store = VectorStore(db_path)
# Create enhanced indexer
indexer = EnhancedSemanticIndexer(enhancer, embedder, vector_store)
# Prepare file data
file_data_list = [
FileData(path=path, content=content, language="python")
for path, content in TEST_CODE_SAMPLES.items()
]
# Index with LLM enhancement
indexed = indexer.index_files(file_data_list)
print(f"\nLLM-enhanced indexing: {indexed}/{len(file_data_list)} files")
yield db_path
store.close()
if db_path.exists():
db_path.unlink()
def test_llm_enhanced_queries(self, llm_enhanced_db):
"""Test natural language queries with LLM-enhanced search."""
engine = HybridSearchEngine()
results = {}
for test_case in TEST_QUERIES:
query = test_case["query"]
expected_file = test_case["expected_file"]
search_results = engine.search(
llm_enhanced_db,
query,
limit=5,
enable_vector=True,
pure_vector=True,
)
# Check if expected file is in top 3 results
top_files = [r.path for r in search_results[:3]]
found = expected_file in top_files
rank = top_files.index(expected_file) + 1 if found else None
results[query] = {
"found": found,
"rank": rank,
"top_result": search_results[0].path if search_results else None,
"top_score": search_results[0].score if search_results else 0.0,
}
return results
@pytest.mark.skipif(not SEMANTIC_AVAILABLE, reason="Semantic dependencies not available")
class TestSearchComparison:
"""Compare pure vector vs LLM-enhanced search side-by-side."""
def test_comparison(self):
"""Run comprehensive comparison of both approaches."""
# This test runs both approaches and compares results
print("\n" + "="*70)
print("SEMANTIC SEARCH COMPARISON TEST")
print("="*70)
try:
# Test pure vector search
print("\n1. Testing Pure Vector Search (Code → fastembed)")
print("-" * 70)
pure_test = TestPureVectorSearch()
pure_db = next(pure_test.pure_vector_db())
pure_results = pure_test.test_pure_vector_queries(pure_db)
# Test LLM-enhanced search
print("\n2. Testing LLM-Enhanced Search (Code → LLM → fastembed)")
print("-" * 70)
llm_test = TestLLMEnhancedSearch()
llm_db = next(llm_test.llm_enhanced_db())
llm_results = llm_test.test_llm_enhanced_queries(llm_db)
# Compare results
print("\n3. COMPARISON RESULTS")
print("="*70)
print(f"{'Query':<50} {'Pure Vec':<12} {'LLM Enhanced':<12}")
print("-" * 70)
pure_score = 0
llm_score = 0
for test_case in TEST_QUERIES:
query = test_case["query"][:47] + "..." if len(test_case["query"]) > 50 else test_case["query"]
pure_res = pure_results.get(test_case["query"], {})
llm_res = llm_results.get(test_case["query"], {})
pure_status = f"[OK] Rank {pure_res.get('rank', '?')}" if pure_res.get('found') else "[X] Not found"
llm_status = f"[OK] Rank {llm_res.get('rank', '?')}" if llm_res.get('found') else "[X] Not found"
print(f"{query:<50} {pure_status:<12} {llm_status:<12}")
if pure_res.get('found'):
pure_score += (4 - pure_res['rank']) # 3 points for rank 1, 2 for rank 2, etc
if llm_res.get('found'):
llm_score += (4 - llm_res['rank'])
print("-" * 70)
print(f"{'TOTAL SCORE':<50} {pure_score:<12} {llm_score:<12}")
print("="*70)
# Interpretation
print("\nINTERPRETATION:")
if llm_score > pure_score:
improvement = ((llm_score - pure_score) / max(pure_score, 1)) * 100
print(f"[OK] LLM enhancement improves results by {improvement:.1f}%")
print(" LLM summaries match natural language queries better than raw code")
elif pure_score > llm_score:
print("[X] Pure vector search performed better (unexpected)")
print(" This may indicate LLM summaries are too generic")
else:
print("= Both approaches performed equally")
except Exception as e:
pytest.fail(f"Comparison test failed: {e}")
if __name__ == "__main__":
pytest.main([__file__, "-v", "-s"])

File diff suppressed because it is too large Load Diff